#include <iostream>
#include <string>

#include <boost/regex.hpp>
#include <boost/regex/icu.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/assign.hpp>


#include <boost/filesystem/path.hpp>
#include <boost/filesystem/fstream.hpp>
#include <boost/filesystem/convenience.hpp>
#include <boost/timer.hpp>

#include <outilex/xmlWriter.h>
#include <outilex/usage.h>

using namespace std;
using namespace boost;
using namespace boost::assign;
namespace fs = boost::filesystem;

namespace {

// Splits a line into tokens. The single capture group matches one of:
// a run of letters, a run of decimal digits, one punctuation mark,
// one symbol, or a run of whitespace.
u32regex tokenizer_regex = make_u32regex(
    "([[:L*:]]+"    // word
    "|[[:Nd:]]+"    // number
    "|[[:P*:]]"     // punctuation
    "|[[:S*:]]"     // symbol
    "|\\s+)");      // whitespace

// Whole-token classifiers, applied with u32regex_match.

// a token made only of letters
u32regex word_regex = make_u32regex("[[:L*:]]+");

// a token made only of uppercase letters
u32regex upper_regex = make_u32regex("[[:Lu:]]+");

// an uppercase letter followed by at least one more letter
u32regex capit_regex = make_u32regex("[[:Lu:]][[:L*:]]+");

// a token made only of decimal digits
u32regex num_regex = make_u32regex("[[:Nd:]]+");

// a single punctuation mark or symbol
u32regex punc_regex = make_u32regex("[[:P*:]]|[[:S*:]]");

// a run of whitespace
u32regex space_regex = make_u32regex("\\s+");

// end-of-sentence delimiters: '.', '!', '?' or ';'
u32regex eos_regex = make_u32regex("\\.|!|\\?|;");


/* Anti-dictionary: pattern describing the tokens that may be followed by
 * a period without that period marking the end of a sentence
 * (e.g. "Mr.", "Dr.", a single letter, a number).
 *
 * Declared const: a string literal must not be bound to a non-const
 * char pointer (that conversion was removed in C++11).
 */

/* Former set-based version, kept for reference:
set<string> antidic =
  list_of("A")("B")("C")("D")("E")("F")("G")("H")("I")("J")("K")("L")("M")("N")("O")
  ("P")("Q")("R")("S")("T")("U")("V")("W")("X")("Y")("Z")
  ("Mr")("Mrs")("Dr")("Phd")("St");
*/

const char * const antidic =
"[[:L*:]]"            // any single letter
"|[[:Nd:]]+"          // a run of decimal digits
"|Mr|Mrs|Dr|Phd|St"   // common abbreviations
;

// Compiled form of the anti-dictionary pattern; matched against the token
// that precedes a candidate end-of-sentence delimiter.
u32regex antidic_regex = make_u32regex(antidic);

/* segment un flux texte brut encodé en utf8 en phrases et tokens
 * dans le format de systran
 */

void tokenize(istream & is, xmlwriter & writer, int & nbtok, int & nbsentence) {

  writer.start_document();

  writer.start_element("document");
  writer.write_attribute("original_format", "txt");

  writer.start_element("par");

  writer.start_element("tu");

  // end of sentence
  bool eos = false, eoslast = false;
  string lastok;

  int nbline = 0;
  string line;
  while (getline(is, line)) {
  
    u32regex_token_iterator<string::const_iterator>
      tok(make_u32regex_token_iterator(line, tokenizer_regex, 1)), end;


    while (tok != end) {

      const string & txt = tok->str();

      if (u32regex_match(txt, space_regex)) { // space
        writer.write_string(txt);
        ++tok;
        continue;
      }

      eoslast = eos;

      eos = u32regex_match(txt, eos_regex)
        && (! u32regex_match(lastok, antidic_regex));
        //&& (antidic.find(lastok) == antidic.end());
 

      //if (eos) { cerr << "eos with '" << txt << "'\n"; }

      /* on detecte la fin de phrase apres une suite non vide
       * de tokens de fin de phrase
       */
      if (eoslast && ! eos) {
        writer.end_element(); // tu
        ++nbsentence;
        writer.start_element("tu");
      }


      writer.start_element("token");

      if (u32regex_match(txt, word_regex)) { // word

        writer.write_attribute("type", "word");

        if (u32regex_match(txt, upper_regex)) {
          writer.write_attribute("case", "upper");
        } else if (u32regex_match(txt, capit_regex)) {
          writer.write_attribute("case", "capit");
        }

      } else if (u32regex_match(txt, num_regex)) { // numeric

        writer.write_attribute("type", "numeric");

      } else if (u32regex_match(txt, punc_regex)) { // punctuation

        writer.write_attribute("type", "punctuation");
      }

      writer.write_string(txt);

      writer.end_element(); // token

      lastok = txt;
      ++nbtok;
      ++tok;
    }

    writer.write_string("\n");

    ++nbline;
    if ((nbline % 1000) == 0) {
      cerr << nbline << " lines ...\n";
    }
  }

  writer.end_element(); // tu
  writer.end_element(); // par
  writer.end_element(); // document
  writer.end_document();
}


} // namespace ""


// Help text for the tokenize command. It lists every option handled by
// main(): -h, -i and -o, plus the default output path.
const char * USAGE_DESCRIPTION =
"usage: tokenize [-h] [-i] [-o <seg>] <txt>\n"
"\n"
"with\n"
" -h : print this usage message\n"
" -i : indent the resulting XML segmentation file\n"
" -o <seg> : write the segmentation to <seg>\n"
"            (default: <txt> with its extension replaced by .seg)\n";


int main(int argc, char ** argv) try {

  fs::path txtpath, outpath;
  bool indent = false;

  argv++, argc--;

  if (argc == 0) { usage(); }


  while (argc) {
  
    string arg = *argv;

    if (arg[0] == '-') {
    
      if (arg == "-h") {

        usage();

      } else if (arg == "-i") {

        indent = true;

      } else if (arg == "-o") {

        argv++, argc--;
        if (argc == 0) { arg_needed(arg); }

        outpath = fs::path(*argv);

      } else {
        unknown_arg(arg);
      }

    } else {
      txtpath = fs::path(arg);
    }

    argv++, argc--;
  }

  if (txtpath.empty()) { bad_args(); }

  if (outpath.empty()) { outpath = fs::change_extension(txtpath, ".seg"); }


  if (txtpath == outpath) {
    cerr << "error: input and output are the same file (" << txtpath.string() << ")\n";
    exit(1);
  }


  fs::ifstream in(txtpath);

  xmlwriter out(outpath);

  if (indent) { out.set_indent(1); }


  boost::timer tmr;

  int nbtok = 0, nbsentence = 0;
  tokenize(in, out, nbtok, nbsentence);

  cout << "done. result in " << outpath.string() << "\n" 
    << nbtok << " tokens. " << nbsentence << " sentences.\n"
    << tmr.elapsed() << " s. (average of " << (double) nbtok / tmr.elapsed() << " tok/s)\n";

} catch (exception & e) {
  cerr << "fatal error : " << e.what() << "\n";
  exit(1);
}



