#include <vector>
#include <set>
#include <list>

//#include <boost/lexical_cast.hpp>
#include <boost/filesystem/path.hpp>
#include <boost/filesystem/fstream.hpp>
#include <boost/filesystem/convenience.hpp>
#include <boost/function_output_iterator.hpp>
#include <boost/scoped_ptr.hpp>


#include <outilex/unicode.h>
#include <outilex/ulookup.h>

#include <outilex/xml.h>
#include <outilex/xmlReader.h>

#include <outilex/token.h>

#include <outilex/bin_text_fsa.h>
#include <outilex/sentence_fsa.h>
#include <outilex/lexic.h>

#include <outilex/serialize.h>
#include <outilex/null_output_iterator.h>

#include <boost/progress.hpp>
#include <boost/scoped_ptr.hpp>
#include <boost/filesystem/path.hpp>
#include <boost/filesystem/fstream.hpp>

#include <outilex/text_fsa.h>
#include <outilex/sentence_fsa.h>
#include <outilex/wrtn_grammar.h>
#include <outilex/wrtn_chart.h>
#include <outilex/wrtn_parser.h>
#include <outilex/wparsing_helper.h>
#include <outilex/text_transduction.h>


#include <libxml/tree.h>
#include <libxml/parser.h>
#include <libxml/xmlreader.h>

#include <cerrno>
#include <cstdio>
#include <cstdlib>
#include <exception>
#include <iostream>
#include <sstream>
#include <string>

#include "Python.h"

#include "Tokenization.h"
#include "Segmentation.h"
#include "DocFilters.h"

// --- To remove when no memory leaks ---
#include <sys/types.h>
#include <unistd.h>
#include <sys/wait.h>




// -- // Uncomment for debug // -- //
//#define DODEBUG

// DEBUG(x) streams the expression `x` to std::cerr (followed by std::endl)
// when DODEBUG is defined. Otherwise it expands to an empty statement and `x`
// is NOT evaluated, so side effects inside the argument only happen in debug
// builds. The do/while(0) wrapper makes the macro a single statement that
// needs its trailing ';', which keeps it safe inside unbraced if/else.
#ifdef DODEBUG
#define DEBUG(x) do { std::cerr << x << std::endl; } while (0)
#else
#define DEBUG(x) do { } while (0)
#endif

using namespace std;
using namespace boost;

namespace aaa {
  // Debug bookkeeping counter: segment() increments it for every document it
  // creates; destruct_xmlDoc() decrements it from inside a DEBUG trace only.
  int count(0);
}

extern "C" static void destruct_xmlDoc(void *x) {
  DEBUG("Destructing xml document: " << x << " / " << --aaa::count);
  delete  (DocumentPtr)x;
}


// -------------------- segmentation ------------------------------------
// Due to a memory leak, we fork and use a temporary file

extern "C" {
/**
 * Python entry point `segment(text)`.
 *
 * Filters, tokenizes and segments `text` in a forked child process (the
 * workaround for the memory leak mentioned above), which dumps the resulting
 * XML tree to a temporary file. The parent then reloads that file into a new
 * Document.
 *
 * @param self  unused.
 * @param args  Python tuple holding one string: the text to segment.
 * @return a new PyCObject wrapping the Document (released by
 *         destruct_xmlDoc), or NULL with a Python exception set.
 */
static PyObject * segment(PyObject *self, PyObject *args)
{
  const char *text;

  if (!PyArg_ParseTuple(args, "s", &text))
    return NULL;

  // mkstemp() creates the file atomically; tmpnam() only returns a name that
  // another process may grab before we open it.
  char filename[] = P_tmpdir "/outilex-XXXXXX";
  const int fd = mkstemp(filename);
  if (fd < 0) {
    PyErr_SetFromErrno(PyExc_OSError);
    return NULL;
  }
  ::close(fd); // the child reopens the file by name
  DEBUG("Temporary file is " << filename);

  // Fork to get back memory!!!
  const pid_t pid = fork();

  if (pid < 0) {
    /* Some sort of error in fork() */
    PyErr_SetFromErrno(PyExc_OSError); // read errno before remove() can change it
    remove(filename);
    return NULL;
  }

  if (pid == 0) {
    /* This is the child process */
    int status = 1;
    try {
      // Memory leak with those operations!
      // When fixed, only keep these lines without dumping to a file and reloading !
      DocumentPtr child_doc = PreFilter(text, "txt");
      if (child_doc && child_doc->xmltree) {
        xmlDocPtr xml = child_doc->xmltree;

        doTokenization(xml);
        doSegmentation(xml);

        FILE *file = fopen(filename, "w");
        if (file) {
          const bool dumped = xmlDocDump(file, xml) >= 0;
          const bool closed = fclose(file) == 0;
          if (dumped && closed) status = 0;
        }
      }
      DEBUG("Done for " << filename);
    } catch (...) {
      // Never let an exception unwind into this copy of the interpreter:
      // the child must always terminate here.
    }
    // _exit(): skip atexit handlers and do not flush stdio buffers that were
    // inherited from the parent.
    _exit(status);
  }

  /* This means we are in the parent process */
  int wstatus = 0;
  pid_t waited;
  do {
    waited = waitpid(pid, &wstatus, 0); /* wait for the child to finish */
  } while (waited < 0 && errno == EINTR);

  if (waited != pid || !WIFEXITED(wstatus) || WEXITSTATUS(wstatus) != 0) {
    remove(filename);
    PyErr_SetString(PyExc_RuntimeError, "segment: the segmentation subprocess failed");
    return NULL;
  }

  /* Load the XML file */
  DocumentPtr document = 0;
  try {
    document = new Document();
    DEBUG("Loading " << filename);
    document->xmltree = xmlReadFile(filename, NULL, 0);
  } catch (...) {
    delete document;
    remove(filename);
    PyErr_SetString(PyExc_RuntimeError, "segment: cannot create the document");
    return NULL;
  }

  /* Delete the temporary file, whether or not parsing succeeded */
  remove(filename);

  if (document->xmltree == 0) {
    delete document;
    PyErr_SetString(PyExc_RuntimeError, "segment: cannot parse the segmentation output");
    return NULL;
  }

  PyObject *result = PyCObject_FromVoidPtr(document, destruct_xmlDoc);
  if (!result) {
    // The wrapper could not be created (exception already set): do not leak.
    delete document;
    return NULL;
  }
  DEBUG("Created document " << document); aaa::count++;
  return result;
}
}


/*
  $BINPATH/apply-dic -dic $DICTPATH/dela-fr-public.idx  10 -imaj -icase -imark -l $DIR/lingdef.xml "$TMPDIR/text.segmentation"
  opt: $BINPATH/make-wrtn -l $DIR/lingdef.xml $GRAMMAR.xgrf
  $BINPATH/wrtn-txt-transduct -l $DIR/lingdef.xml -gram $GRAMMAR.wrtn -txt -m -o "$file.done" "$TMPDIR/text.fsa"
*/


extern "C" static void destruct_dictionary(void *x) {
  std::cout << "Destructing dictionary: " << x << std::endl;
  delete  (dico*)x;
}

extern "C" {
/**
 * Python entry point `load_dictionary(path)`.
 *
 * Allocates a dico and reads it from `path`.
 *
 * @param self  unused.
 * @param args  Python tuple holding one string: the dictionary path.
 * @return a new PyCObject wrapping the dico (released by
 *         destruct_dictionary), or NULL with a Python exception set.
 */
static PyObject *load_dictionary(PyObject *self, PyObject *args) {
  const char *text;
  if (!PyArg_ParseTuple(args, "s", &text))
    return NULL;

  dico *dic = 0;
  try {
    // Allocation is inside the try so that no C++ exception can escape into
    // the interpreter.
    dic = new dico;
    dic->read(std::string(text));
  } catch (const std::exception & e) {
    delete dic;
    PyErr_SetString(PyExc_RuntimeError, e.what());
    return NULL;
  } catch (...) {
    delete dic;
    PyErr_SetString(PyExc_RuntimeError, "load_dictionary: cannot load dictionary");
    return NULL;
  }

  DEBUG("Created dictionary " << dic << " with path " << text);

  PyObject *handle = PyCObject_FromVoidPtr(dic, destruct_dictionary);
  if (!handle) {
    delete dic; // wrapper creation failed (exception already set)
  }
  return handle;
}
}

extern "C" static void destruct_lingdef(void *x) {
  std::cout << "Destructing dictionary: " << x << std::endl;
  delete  (ling_def*)x;
}

extern "C" {
/**
 * Python entry point `load_lingdef(path)`.
 *
 * Constructs a ling_def from the file at `path`.
 *
 * @param self  unused.
 * @param args  Python tuple holding one string: the lingdef path.
 * @return a new PyCObject wrapping the ling_def (released by
 *         destruct_lingdef), or NULL with a Python exception set.
 */
static PyObject *load_lingdef(PyObject *self, PyObject *args) {
  const char *path;
  if (!PyArg_ParseTuple(args, "s", &path))
    return NULL;

  ling_def *ldef = 0;
  try {
    ldef = new ling_def(std::string(path));
  } catch (const std::exception & e) {
    // Returning NULL without an exception set would surface as a SystemError.
    PyErr_SetString(PyExc_RuntimeError, e.what());
    return NULL;
  } catch (...) {
    PyErr_SetString(PyExc_RuntimeError, "load_lingdef: cannot load linguistic definition");
    return NULL;
  }

  PyObject *handle = PyCObject_FromVoidPtr(ldef, destruct_lingdef);
  if (!handle) {
    delete ldef; // wrapper creation failed (exception already set)
  }
  return handle;
}
}





// -------------------- apply-dictionary ------------------------------------

/**
 * xmlreader variant that walks an already-built DOM tree instead of reading
 * from a file.
 */
class xmlreader2 : public xmlreader {
public:
  /**
   * Closes the reader, then re-targets it at the tree held by `doc` through
   * xmlReaderWalker().
   *
   * @throws std::runtime_error when the walker cannot be created.
   */
  void open(DocumentPtr doc) {
    close();
    reader = xmlReaderWalker(doc->xmltree);

    if (reader) {
      return;
    }
    throw std::runtime_error("xmlreader: cannot open DOM document!?!");
    //      xmlTextReaderSetErrorHandler(reader, xml_reader_default_error_handler, & status);
  }
};

/**
 * A dictionary paired with the priority it is applied with.
 */
struct dico_n_priority {
  int priority;
  dico *dic;

  // Once a parameter has a default argument, every parameter after it must
  // have one too; `_dic` therefore defaults to a null pointer. Existing
  // two-argument calls are unaffected.
  dico_n_priority(int pri = 10, dico *_dic = 0) : priority(pri), dic(_dic) {}
};
  
/**
 * A position inside a dictionary together with that dictionary's priority.
 */
struct dico_pos_t {

  dico::position pos;
  int priority;

  dico_pos_t(const dico::position & p, int prio) : pos(p), priority(prio) {}

  // Ordering used by dico_pos_set: greater priorities come first; entries
  // with equal priority are ordered by their dictionary position.
  bool operator<(const dico_pos_t & b) const {
    return (priority == b.priority) ? (pos < b.pos)
                                    : (priority > b.priority);
  }
};

// List of dictionaries, each with its priority.
typedef vector<dico_n_priority> dicos_set;
// Set of dictionary positions, ordered by dico_pos_t::operator<.
typedef set<dico_pos_t> dico_pos_set;
  
struct txtfsa_trans_inserter {

  sentence_fsa & fsa;
  mutable_lexic & lexic;
  int from, to;

  txtfsa_trans_inserter(sentence_fsa & A, mutable_lexic & lex, int f, int to)
    : fsa(A), lexic(lex), from(f), to(to) {}

  void operator()(const dic_lex_entry & e) const try {

    lexical_mask m(e, fsa.lingdef);
    if (m) {
      int lbl = lexic.add(m);
      fsa.A.add_trans(from, lbl, to);
    }
  } catch (exception & exp) {
    cerr << "error with dic entry '" << e << "' : " << exp.what() << "\n"; 
  }
};

/**
 * Functor that records dictionary positions into a dico_pos_set, tagging each
 * one with a fixed priority.
 */
struct dico_pos_inserter {

  dico_pos_set & pos_set;
  int priority;

  dico_pos_inserter(set<dico_pos_t> & target, int prio)
    : pos_set(target), priority(prio) {}

  // Inserts `pos` paired with this inserter's priority.
  void operator()(const dico::position & pos) const {
    const dico_pos_t entry(pos, priority);
    pos_set.insert(entry);
  }
};

extern "C" static PyObject *apply_dictionary(PyObject *self, PyObject *args)
{
  PyObject * py_doc, * py_ldef;
  PyObject * py_dic_list;
  const char *path;

  DEBUG("Applying dictionnary!");
  if (!PyArg_ParseTuple(args, "O!O!O!s", &PyCObject_Type, &py_ldef, &PyCObject_Type, &py_doc, &PyList_Type, &py_dic_list, &path))
    return NULL;


  ling_def &ldef = *(ling_def*)PyCObject_AsVoidPtr(py_ldef);
  DocumentPtr doc = (DocumentPtr)PyCObject_AsVoidPtr(py_doc);
  DEBUG( "doc ptr is " << doc);

  dicos_set dicos;
  for(int i = 0, N = PyList_Size(py_dic_list); i < N; i++) {
    PyObject * item = PyList_GetItem(py_dic_list, i);
    for(int j = 0, Nj = PyList_Size(item); j < Nj; j++) {
      dico *dic = (dico*) PyCObject_AsVoidPtr(PyList_GetItem(item, j));
      dicos.push_back(dico_n_priority(N-i, dic));
      DEBUG( "Using dictionary " << dic << " with priority " << N-i);
    }
  }

  std::string outpath(path);


  try {
    xmlreader2 reader;
    reader.open(doc);

    // lookup for root element

    int lookup_type = MATCH_EXACT;
    // "imaj" option
    lookup_type |= MATCH_IGNORE_MAJ;
    // "icas"
    lookup_type |= MATCH_IGNORE_CASE;
    // "imarks"
    lookup_type |= MATCH_IGNORE_MARKS;

    int nbtoken = 0, nbtag = 0, nbword = 0;


    int ret = 0;
    bool inside_tu = false;

    do {
      ret = reader.read();
    } while ((ret == 1) && reader.node_type() != XML_READER_TYPE_ELEMENT);

    reader.check();


    if (ret != 1) {
      throw xml_parse_error("invalid xml document");
    }

    if (xmlStrcmp(reader.const_name(), "document")) {
      throw xml_parse_error("invalid xml document type : " + string(reader.const_name()));
    }

    //set<string> unknown_words;
    map<string, int> unknown_words;
    int nbunknwtok = 0; // number of unknown tokens

    bin_otext_fsa otext(outpath, 0);

    vector<sentence_fsa> bloc;
    vector<dico_pos_set> pos_in_dic;
    mutable_lexic lexic;

    int bloc_size = 1000;
    //int currpos = -1; 
    int currsentence = 0;

    ret = reader.read();

    while (ret == 1) {

      reader.check();

      if (reader.node_type() == XML_READER_TYPE_ELEMENT) {

	string name = reader.const_name();

	if (name == "par") { // for now, drop par elements

	  ret = reader.read();
	  continue;

	} else if (name == "tag") {

	  ret = reader.next();
	  continue;

	} else if (name == "tu") { // new sentence

	  if (inside_tu) {
	    throw runtime_error("bad document file: <tu> elem inside <tu> elem");
	  }
	  inside_tu = true;
	  bloc.resize(bloc.size() + 1);
	  bloc.back().set_lingdef(& ldef);
	  ret = reader.read();
	  continue;
 
	} else if (name == "token") { // new token
      
	  nbtoken++;
	  if (inside_tu == false) {
	    throw runtime_error("bad document: 'token' elem outside 'tu' elem\n");
	  }

	  xmlNode * node = reader.expand();
	  token tok(node);

	  if (tok.text.empty()) {
	    cerr << "warning: token with empty text\n";
	    ret = reader.next();
	    continue;
	  }

	  sentence_fsa & fsa = bloc.back();
	  int nextq = fsa.size() + 1;

	  assert(fsa.size() == pos_in_dic.size());

	  //cerr << "token = " << tok.text << " nextq = " << nextq << endl;


	  // first proceed with compound words

	  for (int q = 0; q < pos_in_dic.size(); ++q) {

	    int curr_priority = -1;
	    //          set<dic_lex_entry> res;
	    txtfsa_trans_inserter trans_inserter(fsa, lexic, q, nextq);
	    dico_pos_set new_left;

	    for (dico_pos_set::iterator it = pos_in_dic[q].begin(), end = pos_in_dic[q].end();
		 it != end; ++it) {

	      if (it->priority >= curr_priority) {
 
		int nb = ulookup(it->pos, lookup_type, tok.text, 
				 make_function_output_iterator(trans_inserter),
				 make_function_output_iterator(dico_pos_inserter(new_left,
										 it->priority)));
		if (nb) { // once we find a match, we keep track of its priority
		  curr_priority = it->priority;
		  nbtag += nb;
		}
 
	      } else { // low priority, only keep position in dic
            
		ulookup(it->pos, lookup_type, tok.text,
			null_output_iterator(),
			make_function_output_iterator(dico_pos_inserter(new_left, it->priority)));
	      }
	    }
	    pos_in_dic[q].swap(new_left);
	  }


	  int curq = fsa.A.add_state();
	  pos_in_dic.resize(fsa.size());

	  switch (tok.type) {

	  case token::word: {

	    nbword++;
	    int curr_priority = -1;
	    txtfsa_trans_inserter trans_inserter(fsa, lexic, curq, nextq);
	    dico_pos_set left;

	    { // always add a lex transition

	      pos_def * lexpos = ldef.get_pos("lex");

	      lexical_mask m(lexpos);
	      m.form = m.lemma = tok.text;
	      unicode::case_fold(m.case_fold, m.form);
	      m["case"] = token::case_names[tok.case_];

	      int lbl = lexic.add(m);
	      fsa.A.add_trans(curq, lbl, nextq);
	    }


	    for (dicos_set::iterator it = dicos.begin(), end = dicos.end();
		 it != end; ++it) {
          
	      if (it->priority >= curr_priority) {
            
		int nb = ulookup(dico::position(*it->dic), lookup_type, tok.text,
				 make_function_output_iterator(trans_inserter),
				 make_function_output_iterator(dico_pos_inserter(left,
										 it->priority)));
		if (nb) {
		  nbtag += nb;
		  curr_priority = it->priority;
		}
 
	      } else {
		ulookup(dico::position(*it->dic), lookup_type, tok.text,
			null_output_iterator(),
			make_function_output_iterator(dico_pos_inserter(left, it->priority)));
	      }
	    }

	    if (curr_priority == -1) { // unknow word

	      lexical_mask m(ldef.unknow_pos());
	      m.form = m.lemma = tok.text;
	      unicode::case_fold(m.case_fold, m.form);
	      int lbl = lexic.add(m);
	      fsa.A.add_trans(curq, lbl, nextq);

	      //unknown_words.insert(tok.text);
	      unknown_words[tok.text]++;
	      nbunknwtok++;
	    }

	    pos_in_dic[curq].swap(left);
	  }

	    break;

	  case token::other_type:
	  case token::punctuation: {
	    lexical_mask m(ldef.punc_pos());
	    m.form = m.lemma = tok.text;
	    unicode::case_fold(m.case_fold, m.form);
	    int lbl = lexic.add(m);
	    fsa.A.add_trans(curq, lbl, nextq);
	  }
	    break;

	  case token::numeric: {
	    lexical_mask m(ldef.number_pos());
	    m.form = m.lemma = tok.text;
	    unicode::case_fold(m.case_fold, m.form);
	    int lbl = lexic.add(m);
	    fsa.A.add_trans(curq, lbl, nextq);
	  }
	    break;
	  }

	  ret = reader.next();

	  // cerr << "end of new token\n";
	  continue;

	} else { // unknow element : skip subtree

	  cerr << "unexpected element :" << name << '\n';
	  ret = reader.next();
	}

	continue;

      } else if (reader.node_type() == XML_READER_TYPE_WHITESPACE
		 || reader.node_type() == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {

	/* match with one space for compound word */

	for (int q = 0; q < pos_in_dic.size(); ++q) {

	  dico_pos_set new_left;

	  for (dico_pos_set::iterator it = pos_in_dic[q].begin(), end = pos_in_dic[q].end();
	       it != end; ++it) {

	    /* lookup for one space in dic */
	    ulookup(it->pos, lookup_type, " ",
		    null_output_iterator(),
		    make_function_output_iterator(dico_pos_inserter(new_left, it->priority)));
	  }

	  pos_in_dic[q].swap(new_left);
	}

	ret = reader.read();

      } else if (reader.node_type() == XML_READER_TYPE_END_ELEMENT) {

	string name = reader.const_name();

	if (name == "tu") { // end of sentence

	  inside_tu = false;
	  //cerr << "\nend of sentence\n";

	  sentence_fsa & fsa = bloc.back();
	  int qf = fsa.A.add_state(); // add final state
	  fsa.A.set_final(qf);

	  pos_in_dic.clear(); // clear queue
 
	  ++currsentence;

	  /*
	    if ((currsentence % 100) == 0) {
	    cerr << '.';
	    if ((currsentence % 1000) == 0) { cerr << ' '; }
	    }
	  */
	  if ((currsentence % 1000) == 0) {
	    cout << currsentence << " sentences proceed ...\n";
	  }

	  if ((currsentence % bloc_size) == 0) {

	    //cerr << "new bloc\n";
        
	    LEXIC l(lexic);

	    for (int i = 0; i < bloc.size(); ++i) {
	      bloc[i].set_lexic(l);
	      otext << bloc[i];
	    }
	    bloc.clear();
	    lexic.clear();
	  }
	}

	ret = reader.read();

      } else if ((reader.node_type() == XML_READER_TYPE_TEXT)
		 || (reader.node_type() == XML_READER_TYPE_WHITESPACE)
		 || (reader.node_type() == XML_READER_TYPE_SIGNIFICANT_WHITESPACE)) {

	reader.read();

      } else {
	throw xml_parse_error("invalid xml document : unexpected node type : " 
			      + lexical_cast<string>(reader.node_type()));
      }
    }

    reader.check();

    // flush last bloc
 
    LEXIC l(lexic);

    for (int i = 0; i < bloc.size(); ++i) {
      bloc[i].set_lexic(l);
      otext << bloc[i];
    }

    otext.close();

    // set size
    //      bin_fsa_set_size(outpath, currsentence);


  } catch(...) {
    return NULL;
  }   

  Py_RETURN_NONE;
}









// -------------------- load_grammar ------------------------------------

extern "C" static void destruct_wrtn_grammar(void *x) {
  std::cout << "Destructing grammar: " << x << std::endl;
  delete  (wrtn_grammar*)x;
}


extern "C" {
/**
 * Python entry point `load_wrtn_grammar(lingdef, path)`.
 *
 * Constructs a wrtn_grammar from the file at `path`, using the given
 * linguistic definition.
 *
 * @param self  unused.
 * @param args  Python tuple: a CObject holding a ling_def, then the grammar
 *              path (string).
 * @return a new PyCObject wrapping the grammar (released by
 *         destruct_wrtn_grammar), or NULL with a Python exception set.
 */
static PyObject *load_wrtn_grammar(PyObject *self, PyObject *args) {

  PyObject * py_ldef;
  const char *path;

  if (!PyArg_ParseTuple(args, "O!s", &PyCObject_Type, &py_ldef, &path))
    return NULL;

  // This used to print "Applying dictionnary!" on stderr unconditionally, a
  // leftover copied from apply_dictionary().
  DEBUG("Loading grammar " << path);

  ling_def &ldef = *static_cast<ling_def*>(PyCObject_AsVoidPtr(py_ldef));

  wrtn_grammar *gram = 0;
  try {
    gram = new wrtn_grammar(std::string(path), & ldef);
  } catch (const std::exception & e) {
    // Returning NULL without an exception set would surface as a SystemError.
    PyErr_SetString(PyExc_RuntimeError, e.what());
    return NULL;
  } catch (...) {
    PyErr_SetString(PyExc_RuntimeError, "load_wrtn_grammar: cannot load grammar");
    return NULL;
  }

  PyObject *handle = PyCObject_FromVoidPtr(gram, destruct_wrtn_grammar);
  if (!handle) {
    delete gram; // wrapper creation failed (exception already set)
  }
  return handle;
}
}


// -------------------- wrtn_txt_transduct ------------------------------------
// Apply a grammar to a text automaton.
// Ideally this would return an object; for now the result is written to a file.
//

extern "C" {
/**
 * Python entry point `wrtn_txt_transduct(lingdef, grammar, path, opath)`.
 *
 * Opens the text automaton stored at `path`, parses it with the grammar
 * (surface parsing, longest match) and writes the transduction to `opath` in
 * MERGE mode with TEXT output.
 *
 * @param self  unused.
 * @param args  Python tuple: a CObject holding a ling_def, a CObject holding
 *              a wrtn_grammar, the input path and the output path (strings).
 * @return None, or NULL with a Python exception set.
 */
static PyObject *wrtn_txt_transduct(PyObject *self, PyObject *args) {

  // Get parameters (ldef + grammar)
  PyObject * py_gram, * py_ldef;
  const char *path;
  const char *opath;

  if (!PyArg_ParseTuple(args, "O!O!ss", &PyCObject_Type, &py_ldef, &PyCObject_Type, &py_gram, &path, &opath))
    return NULL;


  ling_def &lingdef = *static_cast<ling_def*>(PyCObject_AsVoidPtr(py_ldef));
  wrtn_grammar &gram = *static_cast<wrtn_grammar*>(PyCObject_AsVoidPtr(py_gram));

  // No C++ exception may cross into the interpreter: everything that can
  // throw runs inside this try block.
  try {
    scoped_ptr<itext_fsa> p_itext(new_itext_fsa(std::string(path), & lingdef));
    if (p_itext.get() == 0) {
      PyErr_SetString(PyExc_IOError, "wrtn_txt_transduct: cannot open input text automaton");
      return NULL;
    }
    itext_fsa & itext = *p_itext;

    ofstream os(opath);
    if (!os) {
      const std::string msg = std::string("wrtn_txt_transduct: cannot open output file ") + opath;
      PyErr_SetString(PyExc_IOError, msg.c_str());
      return NULL;
    }

    txt_trans_mode_type transmode    = MERGE;
    txt_trans_output_type outputmode = TEXT;

    int PARSER_FLAGS = wrtn_parser::SURF_PARSING;
    int total = itext.size();

    wchart_text_transducer transducer(os, gram.start_name(), transmode, outputmode);
    progress_displayer<wchart_text_transducer> displayer(transducer, total);

    wrtn_parse(itext, gram, displayer, PARSER_FLAGS, true); // true for longest_match

  } catch (const std::exception & e) {
    PyErr_SetString(PyExc_RuntimeError, e.what());
    return NULL;
  } catch (...) {
    PyErr_SetString(PyExc_RuntimeError, "wrtn_txt_transduct: unknown error");
    return NULL;
  }

  Py_RETURN_NONE;
}
}


// Method table of the "outilex" Python module; it is handed to
// Py_InitModule() by initoutilex(). Each row is {Python name, C function,
// calling convention, docstring}; all functions take positional arguments.
extern "C" { 
  PyMethodDef outilexMethods[] = {
    {"segment",  segment, METH_VARARGS, "Tokenize a string, returns an XML document."},
    {"load_lingdef", load_lingdef, METH_VARARGS, "Load a linguistic definition file"},
    {"load_dictionary", load_dictionary, METH_VARARGS, "Load a dictionary"},
    {"apply_dictionary",  apply_dictionary, METH_VARARGS, "Apply one or more dictionnaries to a segmented text."},
    {"load_wrtn_grammar", load_wrtn_grammar, METH_VARARGS, "Load a grammar"},
    {"wrtn_txt_transduct",  wrtn_txt_transduct, METH_VARARGS, "Transduct with a WRTN"},
    {NULL, NULL, 0, NULL}        /* Sentinel */
  };
}

// Module initialisation hook: registers the functions listed in
// outilexMethods under the module name "outilex". The value returned by
// Py_InitModule() is not used.
extern "C" PyMODINIT_FUNC
initoutilex(void)
{
  Py_InitModule("outilex", outilexMethods);
}



