#include <vector>
#include <map>
#include <set>
#include <list>

#include <boost/filesystem/path.hpp>
#include <boost/filesystem/fstream.hpp>
#include <boost/filesystem/convenience.hpp>
#include <boost/function_output_iterator.hpp>
#include <boost/scoped_ptr.hpp>

#include <outilex/unicode.h>
#include <outilex/ulookup.h>

#include <outilex/xml.h>
#include <outilex/xmlReader.h>

#include <outilex/token.h>

#include <outilex/bin_text_fsa.h>
#include <outilex/sentence_fsa.h>
#include <outilex/lexic.h>

#include <outilex/serialize.h>
#include <outilex/null_output_iterator.h>


#include <outilex/dico_application.h>


using namespace std;
using namespace boost;

namespace fs = boost::filesystem;


namespace {


// Disabled legacy implementation, kept for reference only: dictionary
// positions were wrapped together with an explicit priority, and collected
// through a function object (dico_pos_inserter) plugged into a function
// output iterator.
//
// NOTE(review): this block cannot be re-enabled as is:
//  - dico_pos_t::operator< reads `priority`, whose declaration and
//    initialisation are commented out;
//  - it typedefs dico_pos_set, which is also defined (with a different type)
//    right after the #endif.
#if 0
struct dico_pos_t {

  dico::position pos;
  //  int priority;

  dico_pos_t(const dico::position & p, int prio) : pos(p)/*, priority(prio)*/ {}

  bool operator<(const dico_pos_t & b) const {
    if (priority != b.priority) { // put highest priority first
      return  priority > b.priority;
    }
    return pos < b.pos;
  }
};

typedef set<dico_pos_t> dico_pos_set;

// Function object inserting every position it receives into a set, tagged
// with the priority given at construction.
struct dico_pos_inserter {

  dico_pos_set & pos_set;
  int priority;

  dico_pos_inserter(set<dico_pos_t> & S, int p) : pos_set(S), priority(p) {}

  void operator()(const dico::position & pos) const {
    pos_set.insert(dico_pos_t(pos, priority));
  }
};
#endif

// Ordered set of positions inside the dictionaries. apply_dics keeps one such
// set per automaton state to follow the dictionary entries that are still
// being matched across several tokens (compound words).
typedef std::set<dico::position> dico_pos_set;

struct txtfsa_trans_inserter {

  sentence_fsa & fsa;
  mutable_lexic & lexic;
  int from, to;

  txtfsa_trans_inserter(sentence_fsa & A, mutable_lexic & lex, int f, int to)
    : fsa(A), lexic(lex), from(f), to(to) {}

  void operator()(const dic_lex_entry & e) const try {

    lexical_mask m(e, fsa.lingdef);
    if (m) {
      int lbl = lexic.add(m);
      fsa.A.add_trans(from, lbl, to);
    }
  } catch (exception & exp) {
    //cerr << "error with dic entry '" << e << "' : " << exp.what() << "\n"; 
  }
};


}// namespace ""



/**
 * Apply a list of dictionaries to a tokenized XML document and write one
 * sentence automaton per <tu> element to the output text automaton.
 *
 * The reader is first advanced to the root element, which must be named
 * "document". The remaining nodes are then processed one by one:
 *  - <par> elements are entered (their content is processed as if the element
 *    was not there);
 *  - <tag> elements and unexpected elements are skipped with their subtree
 *    (unexpected ones are reported on cerr);
 *  - <tu> starts a new sentence; its end tag adds a final state to the
 *    automaton, attaches the lexic, writes the automaton to otext and resets
 *    the per-sentence data;
 *  - <token> adds a new state to the current sentence automaton, extends the
 *    dictionary entries still being matched from previous states (compound
 *    words), then adds the transitions for the token itself according to its
 *    type (word, punctuation/other, numeric);
 *  - whitespace nodes append one space to the sentence text;
 *  - text nodes are reported on cerr and ignored.
 *
 * Dictionary priorities: for a given start state, once a lookup has produced
 * results, the dictionaries visited afterwards with a lower priority no longer
 * produce transitions; they are still traversed so that their positions are
 * kept for the following tokens.
 * NOTE(review): this presumably relies on dicos (and the position sets) being
 * visited by decreasing priority -- confirm against the callers.
 *
 * @param reader      XML reader positioned before the root element.
 * @param dicos       dictionaries to apply; each entry exposes a priority.
 * @param lookup_type passed unchanged to every ulookup call.
 * @param lingdef     linguistic definition used to build the lexical masks.
 * @param otext       output receiving each finished sentence automaton.
 * @param infos       statistics; nbsentence is reset to 0 here, whereas
 *                    nbtoken, nbword, nbtag, nbunknwtok and unknown_words are
 *                    only incremented.
 *
 * @throws xml_parse_error when no root element is found, when the root
 *                         element is not "document" or when an unexpected
 *                         node type is met.
 * @throws runtime_error   on a <tu> nested in a <tu>, or a <token> outside of
 *                         a <tu>.
 */
void apply_dics(xmlreader & reader, const dicos_list & dicos, int lookup_type,
                ling_def * lingdef, otext_fsa & otext, dico_application_infos & infos) {

  int ret = 0;

  // look for the root element

  do {
    ret = reader.read();
  } while ((ret == 1) && reader.node_type() != XML_READER_TYPE_ELEMENT);

  reader.check();

  if (ret != 1) {
    throw xml_parse_error("invalid xml document");
  }

  if (xmlStrcmp(reader.const_name(), "document")) {
    throw xml_parse_error("invalid xml document type : " + string(reader.const_name()));
  }

  sentence_fsa fsa(lingdef);        // automaton of the sentence being built
  vector<dico_pos_set> pos_in_dic;  // per state: dictionary positions still being followed
  mutable_lexic lexic;              // labels of the sentence being built

  bool inside_tu = false;

  // NOTE(review): only nbsentence is reset; the other counters of infos keep
  // accumulating on the values passed in by the caller.
  infos.nbsentence = 0;

  ret = reader.read();

  while (ret == 1) {

    reader.check();

    if (reader.node_type() == XML_READER_TYPE_ELEMENT) {

      string name = reader.const_name();

      if (name == "par") { // for now, drop par elements (but process their content)

        ret = reader.read();
        continue;

      } else if (name == "tag") { // skip the whole subtree

        ret = reader.next();
        continue;

      } else if (name == "tu") { // new sentence

        if (inside_tu) {
          throw runtime_error("bad document file: <tu> elem inside <tu> elem");
        }
        inside_tu = true;

        assert(fsa.size() == 0);

        ret = reader.read();
        continue;

      } else if (name == "token") { // new token

        infos.nbtoken++;

        if (inside_tu == false) {
          throw runtime_error("bad document: 'token' elem outside 'tu' elem\n");
        }

        xmlNode * node = reader.expand();
        token tok(node);

        if (tok.text.empty()) {
          cerr << "warning: token with empty text\n";
          ret = reader.next();
          continue;
        }

        fsa.text += tok.text;

        // the state for this token is added below, after the compound word
        // pass; nextq is the state that will follow it
        int nextq = fsa.size() + 1;

        assert(fsa.size() == pos_in_dic.size());

        /* first proceed with compound words: try to extend with this token
         * every dictionary entry started at a previous state */

        // the number of states does not change inside the loop
        const int nbstates = pos_in_dic.size();

        for (int q = 0; q < nbstates; ++q) {

          int curr_priority = -1;
          txtfsa_trans_inserter trans_inserter(fsa, lexic, q, nextq);
          dico_pos_set new_left;

          for (dico_pos_set::iterator it = pos_in_dic[q].begin(), end = pos_in_dic[q].end();
               it != end; ++it) {

            if (it->priority() >= curr_priority) {

              int nb = ulookup(*it, lookup_type, tok.text,
                               make_function_output_iterator(trans_inserter),
                               inserter(new_left, new_left.begin()));

              if (nb) { // once we find a match, we keep track of its priority
                curr_priority = it->priority();
                infos.nbtag += nb;
              }

            } else { // lower priority, only keep position in dic

              ulookup(*it, lookup_type, tok.text,
                      null_output_iterator(),
                      inserter(new_left, new_left.begin()));
            }
          }
          advance_next_token(new_left);
          pos_in_dic[q].swap(new_left);
        }


        /* then proceed with the token itself, starting from a new state */

        int curq = fsa.A.add_state();
        pos_in_dic.resize(fsa.size());

        switch (tok.type) {

        case token::word: {

          infos.nbword++;
          txtfsa_trans_inserter trans_inserter(fsa, lexic, curq, nextq);

          { // always add a lex transition

            pos_def * lexpos = lingdef->get_pos("lex");

            lexical_mask m(lexpos);
            m.form = tok.text;
            unicode::case_fold(m.case_fold, m.form);

            // the lemma is given a case-independent form
            m.lemma = m.case_fold;

            m["case"] = token::case_names[tok.case_];

            int lbl = lexic.add(m);
            fsa.A.add_trans(curq, lbl, nextq);
          }


          int curr_priority = -1;
          dico_pos_set left;

          for (dicos_list::const_iterator it = dicos.begin(), end = dicos.end();
               it != end; ++it) {

            if (it->priority >= curr_priority) {

              int nb = ulookup(dico::position(*it), lookup_type, tok.text,
                               make_function_output_iterator(trans_inserter),
                               inserter(left, left.begin()));

              if (nb) {
                infos.nbtag += nb;
                curr_priority = it->priority;
              }

            } else { // lower priority, only keep position in dic

              ulookup(dico::position(*it), lookup_type, tok.text,
                      null_output_iterator(),
                      inserter(left, left.begin()));
            }
          }

          if (curr_priority == -1) { // unknown word: no dictionary produced a result

            lexical_mask m(lingdef->unknown_pos());
            m.form = m.lemma = tok.text;
            unicode::case_fold(m.case_fold, m.form);
            int lbl = lexic.add(m);
            fsa.A.add_trans(curq, lbl, nextq);

            infos.unknown_words[tok.text]++;
            infos.nbunknwtok++;
          }

          advance_next_token(left);
          pos_in_dic[curq].swap(left);
        }
          break;

        case token::other_type:
        case token::punctuation: {
          lexical_mask m(lingdef->punc_pos());
          m.form = m.lemma = tok.text;
          unicode::case_fold(m.case_fold, m.form);
          int lbl = lexic.add(m);
          fsa.A.add_trans(curq, lbl, nextq);
        }
          break;

        case token::numeric: {
          lexical_mask m(lingdef->number_pos());
          m.form = m.lemma = tok.text;
          unicode::case_fold(m.case_fold, m.form);
          int lbl = lexic.add(m);
          fsa.A.add_trans(curq, lbl, nextq);
        }
          break;
        }

        ret = reader.next();
        continue;

      } else { // unknown element : skip subtree

        cerr << "unexpected element :" << name << '\n';
        ret = reader.next();
      }

      continue;

    } else if (reader.node_type() == XML_READER_TYPE_WHITESPACE
               || reader.node_type() == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {

      /* spaces are traveled during advance_next_token, so nothing has to be
       * looked up in the dictionaries here */
      // NOTE(review): the space is appended even outside of a <tu>, so the
      // whitespace found between two sentences ends up at the beginning of
      // the next sentence text -- confirm this is intended.
      fsa.text += " ";

      ret = reader.read();

    } else if (reader.node_type() == XML_READER_TYPE_END_ELEMENT) {

      string name = reader.const_name();

      if (name == "tu") { // end of sentence

        inside_tu = false;

        int qf = fsa.A.add_state(); // add final state
        fsa.A.set_final(qf);

        pos_in_dic.clear(); // clear queue

        ++infos.nbsentence;

        if ((infos.nbsentence % 1000) == 0) {
          cout << infos.nbsentence << " sentences proceed ..." << endl;
        }

        // write the sentence, then reset the per-sentence data
        LEXIC l(lexic);
        fsa.set_lexic(l);
        otext << fsa;
        fsa.clear();
        lexic.clear();
      }

      ret = reader.read();

    } else if (reader.node_type() == XML_READER_TYPE_TEXT) {

      cerr << "warning unexpected text in tokenized document\n";

      // the result has to be stored in ret like in every other branch:
      // otherwise the loop cannot notice the end of the input or a read
      // error occurring right after a text node
      ret = reader.read();

    } else {
      throw xml_parse_error("invalid xml document : unexpected node type : "
                            + lexical_cast<string>(reader.node_type()));
    }
  }
  reader.check();
}


