#include <cassert>
#include <cctype>
#include <cstdlib>
#include <iostream>
#include <list>
#include <map>
#include <set>
#include <stdexcept>
#include <string>
#include <vector>

//#include <boost/lexical_cast.hpp>
#include <boost/filesystem/path.hpp>
#include <boost/filesystem/fstream.hpp>
#include <boost/filesystem/convenience.hpp>
#include <boost/function_output_iterator.hpp>
#include <boost/scoped_ptr.hpp>

#include <outilex/unicode.h>
#include <outilex/ulookup.h>

#include <outilex/xml.h>
#include <outilex/xmlReader.h>

#include <outilex/token.h>

#include <outilex/bin_text_fsa.h>
#include <outilex/sentence_fsa.h>
#include <outilex/lexic.h>

#include <outilex/serialize.h>
#include <outilex/null_output_iterator.h>

using namespace std;
using namespace boost;

namespace fs = boost::filesystem;


namespace {

// Name the program was invoked under; main() stores argv[0] here before
// parsing the command line.
char * progname;

// Prints the command-line synopsis on standard output, then terminates the
// process with exit status 0. Never returns.
void usage() {
  cout << "usage: " << progname;
  cout << " -dic <dic1> [<prio1>] [-dic <dic2> [<prio2>] ...]"
          " [-imaj][-icase][-imark][-l <lingdef>][-o <out> ] <tokfile>\n";
  exit(0);
}



// Overwrites the integer stored at byte offset 4 of the file `p` with `size`.
// main() uses it to record the final sentence count once the whole text has
// been written.
//
// p    : path of an existing file to patch in place
// size : value written with write_int() at offset 4
//
// Throws runtime_error if the file cannot be opened or the write fails, so a
// stale count is never left behind silently.
void bin_fsa_set_size(const fs::path & p, int size) {

  // The file is patched at a fixed byte offset, so open it in binary mode to
  // keep both the offset and the written bytes free of newline translation.
  fs::fstream os(p, ios_base::in | ios_base::out | ios_base::binary);
  if (! os) {
    throw runtime_error("cannot open '" + p.string() + "' to update its size");
  }

  os.seekp(4, ios_base::beg);
  write_int(os, size);

  // flush explicitly so that a failed write is detected here rather than
  // being lost in the destructor
  os.flush();
  if (! os) {
    throw runtime_error("cannot write size into '" + p.string() + "'");
  }
}

struct dico_pos_t {

  dico::position pos;
  int priority;

  dico_pos_t(const dico::position & p, int prio) : pos(p), priority(prio) {}

  bool operator<(const dico_pos_t & b) const {
    if (priority != b.priority) { // put highest priority first
      return  priority > b.priority;
    }
    return pos < b.pos;
  }
};

typedef set<dico_pos_t> dico_pos_set;

// Function object that stores every dictionary position it is called with
// into a dico_pos_set, tagging each one with a fixed priority. It is wrapped
// with make_function_output_iterator() in main() to collect positions
// reported by ulookup().
struct dico_pos_inserter {

  dico_pos_set & pos_set; // destination set (not owned)
  int priority;           // priority attached to every inserted position

  dico_pos_inserter(dico_pos_set & target, int level)
    : pos_set(target), priority(level) {}

  void operator()(const dico::position & pos) const {
    const dico_pos_t tagged(pos, priority);
    pos_set.insert(tagged);
  }
};

// Function object that turns a dictionary entry into a transition of a
// sentence automaton: the entry is converted to a lexical_mask, the mask is
// registered in the lexic, and a transition from -> to carrying the returned
// label is added to the automaton.
struct txtfsa_trans_inserter {

  sentence_fsa & fsa;    // automaton receiving the transitions (not owned)
  mutable_lexic & lexic; // lexic in which the masks are registered (not owned)
  int from, to;          // source and destination states of the transitions

  txtfsa_trans_inserter(sentence_fsa & A, mutable_lexic & lex, int src, int dest)
    : fsa(A), lexic(lex), from(src), to(dest) {}

  // Adds the transition for entry `e`. An entry whose mask evaluates to false
  // is ignored; an exception raised while handling the entry is reported on
  // standard error and swallowed so that the remaining entries are processed.
  void operator()(const dic_lex_entry & e) const {
    try {

      lexical_mask m(e, fsa.lingdef);
      if (m) {
        const int label = lexic.add(m);
        fsa.A.add_trans(from, label, to);
      }

    } catch (const exception & exp) {
      cerr << "error with dic entry '" << e << "' : " << exp.what() << "\n"; 
    }
  }
};

// A dictionary together with the priority it was given on the command line.
struct dico_n_priority {
  int priority;
  dico dic;

  // The dictionary starts out default-constructed; the priority is 10 unless
  // stated otherwise.
  dico_n_priority(int level = 10)
    : priority(level),
      dic()
  {}
};


// Loaded dictionaries, in the order load_dicos() filled them.
typedef vector<dico_n_priority> dicos_set;

// Dictionary paths keyed by priority; std::greater makes iteration go from
// the highest priority to the lowest.
typedef multimap<int, fs::path, std::greater<int> > dicos_path_t;



// Loads every dictionary listed in `dicos_path` into `dicos`.
//
// dicos      : resized to hold one entry per path; entry i receives the
//              priority and the content of the i-th path in iteration order
// dicos_path : (priority, path) pairs; only read here
//
// Progress messages are written to standard output.
void load_dicos(dicos_set & dicos, dicos_path_t & dicos_path) {

  dicos.resize(dicos_path.size());

  dicos_set::iterator slot = dicos.begin();
  const dicos_path_t::const_iterator last = dicos_path.end();

  for (dicos_path_t::const_iterator src = dicos_path.begin(); src != last; ++src) {

    const int level = src->first;
    const fs::path & dicpath = src->second;

    slot->priority = level;
    cout << "loading " << dicpath.string() << " (priority = " << level << ") ...\n";
    slot->dic.read(dicpath);

    ++slot;
  }

  cout << "done. " << dicos.size() << " dicos loaded\n";
}


}// namespace ""

// Entry point: builds the text automaton of a tokenized XML document.
//
// Command line (see usage()):
//   -dic <dic> [<prio>]   dictionary to apply; the optional priority is
//                         recognised by its leading digit and defaults to 10;
//                         the option may be repeated
//   -l <lingdef>          linguistic definition file; when absent, the value
//                         of the LINGDEF environment variable is used
//   -o <out>              output file; when absent, the input path with a
//                         trailing ".gz" removed and its extension replaced
//                         by ".fsa"
//   -imaj -icase -imark   OR the matching MATCH_IGNORE_* flag into the lookup
//                         mode passed to ulookup()
//   -h                    print the synopsis and exit
//   anything else         taken as the input token file
//
// Processing:
//   * the root element of the input must be <document>;
//   * <par> elements are entered but otherwise ignored, <tag> subtrees are
//     skipped, unknown elements are reported on stderr and skipped;
//   * each <tu> element opens a new sentence automaton; each <token> inside
//     it adds one state plus the transitions found for that token;
//   * completed sentences are written to the output by blocs of 1000, each
//     bloc sharing one lexic;
//   * at the end the sentence count is patched into the output file and the
//     unknown words, with their number of occurrences, are stored in a file
//     named "unknown_words" next to the output.
//
// Exit status: 1 on bad arguments or on any std::exception (reported as
// "fatal error"), 0 otherwise.
int main(int argc, char ** argv) try {

  progname = *argv;

  fs::path inpath, outpath, lingdefpath;

  dicos_path_t dicos_path;

  int lookup_type = MATCH_EXACT;
  int nbtoken = 0, nbtag = 0, nbword = 0;


  // default lingdef comes from the environment; a later -l option replaces it.
  // Built with fs::native like every other user-supplied path below.
  char * env_lingdef = getenv("LINGDEF");
  if (env_lingdef) { 
    lingdefpath = fs::path(env_lingdef, fs::native);
  }

  argv++, argc--;

  if (argc == 0) { usage(); }


  while (argc) {

    string arg = *argv;

    if (arg == "-h") {
      usage();

    } else if (arg == "-l") {

      argv++, argc--;
      if (! argc) { cerr << "bad args\n"; exit(1); }

      lingdefpath = fs::path(*argv, fs::native);

    } else if (arg == "-dic") {

      argv++, argc--;
      if (! argc) { cerr << "bad args\n"; exit(1); }

      fs::path dicpath = fs::path(*argv, fs::native);

      int priority = 10;
      // isdigit() is only defined for unsigned char values (and EOF)
      if (argc > 1 && isdigit(static_cast<unsigned char>(argv[1][0]))) { // priority specified
        argv++, argc--;
        priority = lexical_cast<int>(*argv);
      }

      dicos_path.insert(make_pair(priority, dicpath));

    } else if (arg == "-o") {

      argv++, argc--;
      if (! argc) { cerr << "bad args\n"; exit(1); }

      outpath = fs::path(*argv, fs::native);

    } else if (arg == "-imaj") {

      lookup_type |= MATCH_IGNORE_MAJ;

    } else if (arg == "-icase") {

      lookup_type |= MATCH_IGNORE_CASE;

    } else if (arg == "-imark") {

      // the option is spelled with its leading dash, as announced by usage()
      lookup_type |= MATCH_IGNORE_MARKS;

    } else { inpath = fs::path(arg, fs::native); }

    argv++, argc--;
  }

  if (dicos_path.empty()) {
    cerr << "error: no dictionary specified\n";
    exit(1);
  }

  if (inpath.empty()) {
    cerr << "error: no text specified\n";
    exit(1);
  }

  if (lingdefpath.empty()) {
    cerr << "error: no lingdef specified\n";
    exit(1);
  }

  if (outpath.empty()) {

    // derive the output name from the input: drop a trailing ".gz", then
    // replace the remaining extension by ".fsa"
    outpath = inpath;

    if (fs::extension(outpath) == ".gz") {
      outpath = fs::change_extension(outpath, "");
    }
    outpath = fs::change_extension(outpath, ".fsa");
  }


  // load dictionaries

  dicos_set dicos;

  load_dicos(dicos, dicos_path);

  ling_def ldef(lingdefpath);

  xmlreader reader(inpath);

  // look for the root element

  int ret = 0;
  bool inside_tu = false;

  do {
    ret = reader.read();
  } while ((ret == 1) && reader.node_type() != XML_READER_TYPE_ELEMENT);

  reader.check();

  if (ret != 1) {
    throw xml_parse_error("invalid xml document");
  }

  if (xmlStrcmp(reader.const_name(), "document")) {
    throw xml_parse_error("invalid xml document type : " + string(reader.const_name()));
  }

  map<string, int> unknown_words; // unknown form -> number of occurrences
  int nbunknwtok = 0; // number of unknown tokens

  bin_otext_fsa otext(outpath, 0);

  vector<sentence_fsa> bloc;        // sentences not yet written to otext
  vector<dico_pos_set> pos_in_dic;  // per state of the current sentence: the
                                    // dictionary positions still open for a
                                    // compound word starting at that state
  mutable_lexic lexic;              // lexic shared by the sentences of the bloc

  int bloc_size = 1000;
  int currsentence = 0;

  ret = reader.read();

  while (ret == 1) {

    reader.check();

    if (reader.node_type() == XML_READER_TYPE_ELEMENT) {

      string name = reader.const_name();

      if (name == "par") { // for now, drop par elements

        ret = reader.read();
        continue;

      } else if (name == "tag") { // skip the whole subtree

        ret = reader.next();
        continue;

      } else if (name == "tu") { // new sentence

        if (inside_tu) {
          throw runtime_error("bad document file: <tu> elem inside <tu> elem");
        }
        inside_tu = true;
        bloc.resize(bloc.size() + 1);
        bloc.back().set_lingdef(& ldef);
        ret = reader.read();
        continue;
 
      } else if (name == "token") { // new token
      
        nbtoken++;
        if (inside_tu == false) {
          throw runtime_error("bad document: 'token' elem outside 'tu' elem\n");
        }

        xmlNode * node = reader.expand();
        token tok(node);

        if (tok.text.empty()) {
          cerr << "warning: token with empty text\n";
          ret = reader.next();
          continue;
        }

        sentence_fsa & fsa = bloc.back();
        // the state for this token is created below (index fsa.size()), so
        // the state that follows it will be fsa.size() + 1
        int nextq = fsa.size() + 1;

        assert(fsa.size() == pos_in_dic.size());


        // first proceed with compound words: try to extend, with this token,
        // every dictionary position left open at an earlier state

        for (int q = 0; q < pos_in_dic.size(); ++q) {

          int curr_priority = -1;
          txtfsa_trans_inserter trans_inserter(fsa, lexic, q, nextq);
          dico_pos_set new_left;

          // positions are visited from highest to lowest priority
          for (dico_pos_set::iterator it = pos_in_dic[q].begin(), end = pos_in_dic[q].end();
               it != end; ++it) {

            if (it->priority >= curr_priority) {
 
              int nb = ulookup(it->pos, lookup_type, tok.text, 
                               make_function_output_iterator(trans_inserter),
                               make_function_output_iterator(dico_pos_inserter(new_left,
                                                                               it->priority)));
              if (nb) { // once we find a match, we keep track of its priority
                curr_priority = it->priority;
                nbtag += nb;
              }
 
            } else { // low priority, only keep position in dic
            
              ulookup(it->pos, lookup_type, tok.text,
                      null_output_iterator(),
                      make_function_output_iterator(dico_pos_inserter(new_left, it->priority)));
            }
          }
          pos_in_dic[q].swap(new_left);
        }


        int curq = fsa.A.add_state();
        pos_in_dic.resize(fsa.size());

        switch (tok.type) {

        case token::word: {

          nbword++;
          int curr_priority = -1;
          txtfsa_trans_inserter trans_inserter(fsa, lexic, curq, nextq);
          dico_pos_set left;

          { // always add a lex transition

            pos_def * lexpos = ldef.get_pos("lex");

            lexical_mask m(lexpos);
            m.form = m.lemma = tok.text;
            unicode::case_fold(m.case_fold, m.form);
            m["case"] = token::case_names[tok.case_];

            int lbl = lexic.add(m);
            fsa.A.add_trans(curq, lbl, nextq);
          }


          // dictionaries were loaded from highest to lowest priority
          for (dicos_set::iterator it = dicos.begin(), end = dicos.end();
               it != end; ++it) {
          
            if (it->priority >= curr_priority) {
            
              int nb = ulookup(dico::position(it->dic), lookup_type, tok.text,
                                make_function_output_iterator(trans_inserter),
                                make_function_output_iterator(dico_pos_inserter(left,
                                                                                it->priority)));
              if (nb) {
                nbtag += nb;
                curr_priority = it->priority;
              }
 
            } else { // low priority, only keep position in dic
              ulookup(dico::position(it->dic), lookup_type, tok.text,
                      null_output_iterator(),
                      make_function_output_iterator(dico_pos_inserter(left, it->priority)));
            }
          }

          if (curr_priority == -1) { // unknown word

            lexical_mask m(ldef.unknow_pos());
            m.form = m.lemma = tok.text;
            unicode::case_fold(m.case_fold, m.form);
            int lbl = lexic.add(m);
            fsa.A.add_trans(curq, lbl, nextq);

            unknown_words[tok.text]++;
            nbunknwtok++;
          }

          pos_in_dic[curq].swap(left);
        }

          break;

        case token::other_type:
        case token::punctuation: {
          lexical_mask m(ldef.punc_pos());
          m.form = m.lemma = tok.text;
          unicode::case_fold(m.case_fold, m.form);
          int lbl = lexic.add(m);
          fsa.A.add_trans(curq, lbl, nextq);
        }
          break;

        case token::numeric: {
          lexical_mask m(ldef.number_pos());
          m.form = m.lemma = tok.text;
          unicode::case_fold(m.case_fold, m.form);
          int lbl = lexic.add(m);
          fsa.A.add_trans(curq, lbl, nextq);
        }
          break;
        }

        // the token subtree has been consumed through expand(): skip it
        ret = reader.next();
        continue;

      } else { // unknown element : skip subtree

        cerr << "unexpected element :" << name << '\n';
        ret = reader.next();
      }

      continue;

    } else if (reader.node_type() == XML_READER_TYPE_WHITESPACE
               || reader.node_type() == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {

      /* match with one space for compound word */

      for (int q = 0; q < pos_in_dic.size(); ++q) {

        dico_pos_set new_left;

        for (dico_pos_set::iterator it = pos_in_dic[q].begin(), end = pos_in_dic[q].end();
             it != end; ++it) {

          /* lookup for one space in dic */
          ulookup(it->pos, lookup_type, " ",
                  null_output_iterator(),
                  make_function_output_iterator(dico_pos_inserter(new_left, it->priority)));
        }

        pos_in_dic[q].swap(new_left);
      }

      ret = reader.read();

    } else if (reader.node_type() == XML_READER_TYPE_END_ELEMENT) {

      string name = reader.const_name();

      if (name == "tu") { // end of sentence

        inside_tu = false;

        sentence_fsa & fsa = bloc.back();
        int qf = fsa.A.add_state(); // add final state
        fsa.A.set_final(qf);

        pos_in_dic.clear(); // compound words do not span sentences
 
        ++currsentence;

        if ((currsentence % 1000) == 0) {
          cout << currsentence << " sentences proceed ...\n";
        }

        if ((currsentence % bloc_size) == 0) {

          // bloc is full: write its sentences with their common lexic, then
          // start a new bloc with an empty lexic
          LEXIC l(lexic);

          for (int i = 0; i < bloc.size(); ++i) {
            bloc[i].set_lexic(l);
            otext << bloc[i];
          }
          bloc.clear();
          lexic.clear();
        }
      }

      ret = reader.read();

    } else if (reader.node_type() == XML_READER_TYPE_TEXT) {

      // Plain text nodes are ignored (whitespace nodes are handled above).
      // The result must be stored in ret, otherwise the loop cannot notice
      // the end of the input or a read error.
      ret = reader.read();

    } else {
      throw xml_parse_error("invalid xml document : unexpected node type : " 
                            + lexical_cast<string>(reader.node_type()));
    }
  }

  reader.check();

  // flush last bloc
 
  LEXIC l(lexic);

  for (int i = 0; i < bloc.size(); ++i) {
    bloc[i].set_lexic(l);
    otext << bloc[i];
  }

  otext.close();

  // the output was opened with a size of 0: now store the real sentence count
  bin_fsa_set_size(outpath, currsentence);

  {
    fs::path unknow_path = outpath.branch_path() / "unknown_words";
    cout << "storing unknown words in '" << unknow_path.string() << "' ...\n";

    fs::ofstream out(unknow_path);

    // one line per unknown form: "<form> <number of occurrences>"
    for (map<string, int>::iterator it = unknown_words.begin(), end = unknown_words.end();
         it != end; ++it) {
      out << it->first << " " << it->second << "\n";
    }
  }

  cout << "ok. textfsa is in " << outpath.native_file_string()
    <<  " (" << currsentence << " sentences, " << nbtoken << " tokens, " << nbtag << " tags, "
    << nbword << " words, " << unknown_words.size() << " unknown forms, " << nbunknwtok <<
    " unknown tokens)\n";

} catch (exception & e) {
  cerr << "fatal error : " << e.what() << endl;
  exit(1);
}


