#include <algorithm>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <iterator>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#include <boost/filesystem/path.hpp>
#include <boost/filesystem/fstream.hpp>
#include <boost/filesystem/convenience.hpp>
#include <boost/filesystem/operations.hpp>
#include <boost/timer.hpp>

#include <outilex/DELAcorresp.h>
#include <outilex/dic_entry.h>
#include <outilex/dic_indexer.h>
#include <outilex/tokenization.h>

#include <outilex/usage.h>


using namespace std;
using namespace boost;

namespace fs = boost::filesystem;


// One-line command synopsis of the tool.
// NOTE(review): presumably this is the text printed by usage()/bad_args() from
// <outilex/usage.h>; the declaration there dictates the exact type, so it is
// left as a plain `const char *` -- confirm before tightening it.
const char * USAGE_DESCRIPTION = "usage: dela-index <dela> -c <delacorresp> [-r <ratio>][-o <index>]\n";


/* Return values of read_dic_entry():
 *        -1 on end of file
 *         0 if the entry could not be loaded
 *         1 on success
 */
namespace {
// File-local memory of every problem already reported by read_dic_entry():
// unknown POS names, "<pos>+<code>" keys for unknown syntactic codes and
// "<pos>:<char>" keys for unknown inflectional codes. It exists so that each
// diagnostic is written to stderr only once per run.
std::set<std::string> unknown_codes;
}
int read_dic_entry(istream & is, dic_entry & res, DELAcorresp & corresp) {


  string form, lemma, cat, synt, flex;
  if (! getDELAline(is, form, lemma, cat, synt, flex)) { return -1; }

  const DELAcorresp::POS_inf & posinf = corresp[cat];

  if (posinf.name.empty()) {
  
    if (unknown_codes.find(cat) == unknown_codes.end()) {
      cerr << "erreur: unknown POS: " << cat << "\n";
      unknown_codes.insert(cat);
    }
    return 0;
  }
  
  /* tokenize form */

#if 0
  vector<string> toks;
  tokenize(form, toks);

  assert(! toks.empty());

  cerr << "form : " << form << endl;
  cerr << "  tok : " << toks[0] << endl;
  for (int i = 1; i < toks.size(); ++i) {
    cerr << "  tok : " << toks[i] << endl;
    toks[0] += " " + toks[i];
  }
#endif

  dic_entry entry;
  entry.lemma = lemma;
  entry.pos = posinf.name;

  inflected main_inflect;
  main_inflect.form = form;


  // create +attr=val pair from code
 
#warning "c'est trop complique ! (DELAcorresp etc.)"

  vector<string> codes;
  stringtok(synt, "+", back_inserter(codes));

  for (int i = 0; i < codes.size(); ++i) {

    DELAcorresp::synt_code_map::const_iterator it = posinf.SYNTs.find(codes[i]);
    
    if (it == posinf.SYNTs.end()) {
      string unknow = entry.pos + "+" + codes[i];
      if (unknown_codes.find(unknow) == unknown_codes.end()) {
        cerr << "warning: unknow code " << codes[i] << " (in POS " << entry.pos << ")\n";
        unknown_codes.insert(unknow);
      }
      continue;
    }
    const vector<DELAcorresp::feat_def> & v = it->second;
    
    for (int j = 0; j < v.size(); ++j) {
      if (v[j].type == DELAcorresp::LEMMA_FEAT) { // assign feat to lemma
        entry.feats += "+" + v[j].attr + "=" + v[j].val;
      } else { // inflected feat
        main_inflect.feats += "+" + v[j].attr + "=" + v[j].val;
      }
    }
  }


  // inflexionnal codes ...

  codes.clear();
  stringtok(flex, ":", back_inserter(codes));

  if (codes.size() == 0) { // no inflex feats
    entry.inflecteds.push_back(main_inflect);
    res.swap(entry);
    return 1;
  }

  for (int i = 0; i < codes.size(); ++i) { // construct codes.size() inflected
    
    inflected inflect;
    // herits flex attr from main inflect ...
    inflect.form = main_inflect.form;
    inflect.feats = main_inflect.feats;

    string & flexs = codes[i];
  
    for (int j = 0; j < flexs.size(); ++j) {

      DELAcorresp::flex_code_map::const_iterator it = posinf.FLEXs.find(flexs[j]);
      
      if (it == posinf.FLEXs.end()) {

        string unknow = entry.pos + ":"; unknow.push_back(flexs[j]);

        if (unknown_codes.find(unknow) == unknown_codes.end()) {
          cerr << "warning: unknow inflex code " << flexs[j] << " (in POS " << entry.pos << ")\n";
          unknown_codes.insert(unknow);
        }
        continue;
      }

      const vector<DELAcorresp::feat_def> & v = it->second;

      for (int k = 0; k < v.size(); ++k) {
        inflect.feats += "+" + v[k].attr + "=" + v[k].val;
      }
    }
    entry.inflecteds.push_back(inflect);
  }
  res.swap(entry);
  return 1;
}


int main(int argc, char ** argv) try {

  fs::path delapath, opath, corresppath;

  {
    char * text = getenv("DELAFCORRESP");
    if (text) { corresppath = fs::path(text); }
  }

  bool validate = false;

  double ratio = 5. / 3.;
  int next = 500000;

  argv++, argc--;

  if (argc == 0) { usage(); }

  while (argc) {

    string arg = *argv;

    if (**argv == '-') {

      if (arg == "-validate") {

        validate = true;

      } else if (arg == "-h" || arg == "-help" || arg == "-?") {

        usage();

      } else if (arg == "-corresp" || arg == "-c") {

        argv++, argc--;
        if (argc == 0) { bad_args(); }
        corresppath = *argv;

      } else if (arg == "-r") {

        argv++, argc--;
        if (argc == 0) { bad_args(); }
        ratio = atof(*argv);

      } else { bad_args(); }

    } else { delapath = arg; }

    argv++, argc--;
  }

  if (delapath.empty() || corresppath.empty()) { bad_args(); }

  if (opath.empty()) {
    opath = fs::change_extension(delapath, ".idx");
  }

  if (delapath == opath) { // avoid deleting dic
    cerr
      << "error: dela dic and index point to the same file name (" << delapath.string() << ")\n";  
    exit(1);
  }

  fs::ifstream dela(delapath);
 
  dic_indexer idx;

  DELAcorresp corresp(corresppath);

  dic_entry e;

  boost::timer tmr, t;
  int nbflex = 0, nbline = 0;

  int ret;
  while ((ret = read_dic_entry(dela, e, corresp)) != -1) {

    if (ret == 0) { continue; }

    idx.add_entry(e);

    if (idx.size() > next) {
      cout << "minimize() : nb states = " << idx.size() << " ...\n" << flush;
      idx.minimize();
      next = std::max(next, (int) ((double) idx.size() * ratio));
      cout << "ok : nb states = " << idx.size() << ", next = " << next << "\n"; 
    }

    ++nbline;
    nbflex += e.inflecteds.size();

    if ((nbline % 100000) == 0) {
      cout << "\n" << nbline << " lines (" << t.elapsed() << " s) ...\n\n";
      t.restart();
    }
  }

  idx.minimize();

  idx.write(opath);

  cout << "\ndone. " << nbline << " DELA lines. " << nbflex << " inflected forms. "
    << idx.size() << " states. " << tmr.elapsed() << "s.\n"
    << "resulting index in " << opath.string() << ". ";

  boost::intmax_t idxsize = fs::file_size(opath);

  if (idxsize < (1024*1024)) {
    cout << "(" << idxsize << " bytes)\n";
  } else {
    cout << "(" << (idxsize/(1024*1024)) << " Mb)\n"; 
  }

} catch (exception & e) {

  cerr << "fatal error : " << e.what() << "\n"; exit(1);

} catch (...) { cerr << "caught an OVNI?\n"; exit(1); }

