#include <cctype>
#include <cstdlib>
#include <exception>
#include <iostream>
#include <list>
#include <map>
#include <set>
#include <stdexcept>
#include <string>
#include <vector>

#include <boost/timer.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/filesystem/path.hpp>
#include <boost/filesystem/fstream.hpp>
#include <boost/filesystem/convenience.hpp>


#include <outilex/xml.h>
#include <outilex/serialize.h>

#include <outilex/bin_text_fsa.h>
#include <outilex/otext_fsa_lexic_buf.h>

#include <outilex/ulookup.h>

#include <outilex/dico_application.h>

#include <outilex/foreach.h>
#include <outilex/usage.h>


using namespace std;
using namespace boost;

namespace fs = boost::filesystem;


// Command-line help text of apply-dic.
// NOTE(review): presumably printed by the usage()/bad_args() helpers declared
// in <outilex/usage.h> -- confirm against that header.
// Every option handled by main() must be listed here.
const char * USAGE_DESCRIPTION =
"usage : apply-dic [-l <lingdef>] [-o <txtfsa>] [ -dic <dic> [<prio>] ... | -dics <dicosfile> ] <segfile>"
"\n"
"with :\n"
" -l <lingdef>    : linguistic definition file (default: LINGDEF environment variable)\n"
" -dic <dic> [N]  : use the dictionary <dic> with the optional priority N (default 10)\n"
" -dics <dics>    : use the dictionaries specified in <dics> file\n"
" -o <fsa>        : specify the name of the result text fsa file\n"
"                   (default: <segfile> with its extension replaced by .fsa)\n"
" <segfile>       : XML segmented text to be processed\n"
" -h              : show this usage\n"
"\n"
" -max <size>     : max lexic size (default 100)\n"
" -imaj           : majuscules in text can match with minuscules in dictionary forms\n"
" -icase          : totally ignore case in text for dictionary lookup\n"
" -imark          : ignore diacritics for dictionary lookup\n";

namespace {

/**
 * Overwrites the integer stored at byte offset 4 of the existing file `p`
 * with `size`, leaving the rest of the file untouched.
 *
 * main() uses it to patch the sentence count into the text fsa once the
 * whole text has been processed.
 * NOTE(review): offset 4 presumably skips a 4-byte header field of the
 * binary text fsa format -- confirm against bin_text_fsa.
 *
 * @param p    path of the binary text fsa file to patch (must already exist)
 * @param size value to store
 * @throws std::runtime_error if the file cannot be opened or written
 */
void bin_fsa_set_size(const fs::path & p, int size) {

  // in|out opens the existing file for update without truncating it
  fs::fstream os(p, std::ios_base::in | std::ios_base::out);
  if (! os) {
    throw std::runtime_error("cannot open '" + p.string() + "' to update its size");
  }

  os.seekp(4, std::ios_base::beg);
  write_int(os, size);
  os.flush();

  // a silent failure here would leave the old size in the file
  if (! os) {
    throw std::runtime_error("cannot write size into '" + p.string() + "'");
  }
}

}// namespace ""

/**
 * Debugging aid: writes the priority of each dictionary of `dicos` on the
 * standard error stream, one per line, each line prefixed with "> ".
 */
void dump_dicos_list(dicos_list &dicos) {
  foreach_(dico & entry, dicos) {
    cerr << "> " << entry.priority << '\n';
  }
}

/// Returns true when `s` is a non-empty string made only of decimal digits.
static bool is_unsigned_integer(const char * s) {
  if (*s == '\0') { return false; }
  for (; *s != '\0'; ++s) {
    // isdigit() is undefined for negative values, so go through unsigned char
    if (! isdigit(static_cast<unsigned char>(*s))) { return false; }
  }
  return true;
}


/**
 * apply-dic entry point.
 *
 * Parses the command line, applies the loaded dictionaries to the segmented
 * XML text and writes the result as a binary text fsa. The unknown forms are
 * written, with their count, to a file named "unknown_words" next to the
 * output, and processing statistics are printed on the standard output.
 *
 * The linguistic definition path comes from the LINGDEF environment variable
 * unless overridden with -l. When -o is not given, the output name is the
 * input name with a trailing ".gz" removed and its extension replaced by
 * ".fsa".
 *
 * @return 0 on success, 1 when a std::exception reaches the top level.
 */
int main(int argc, char ** argv) try {

  fs::path inpath, outpath, lingdefpath;

  int lexic_size = 100; // good trade-off (found empirically)
  dicos_list dicos;

  int lookup_type = MATCH_EXACT;

  {
    // default lingdef location; -l takes precedence when given
    const char * env_lingdef = getenv("LINGDEF");
    if (env_lingdef) {
      lingdefpath = fs::path(env_lingdef, fs::native);
    }
  }

  argv++, argc--; // skip program name
  if (argc == 0) { usage(); }


  while (argc) {

    string arg = *argv;

    if (arg == "-h") {
      usage();

    } else if (arg == "-l") {

      argv++, argc--;
      if (! argc) { arg_needed(arg); }

      lingdefpath = fs::path(*argv, fs::native);

    } else if (arg == "-dic") {

      argv++, argc--;
      if (! argc) { arg_needed(arg); }

      fs::path dicpath = fs::path(*argv, fs::native);

      // the priority is optional: only consume the next argument when it is
      // a plain number, so that a following file name is never eaten
      int priority = 10;
      if (argc > 1 && is_unsigned_integer(argv[1])) {
        argv++, argc--;
        priority = lexical_cast<int>(*argv);
      }

      load_dic(dicos, dicpath, priority);

    } else if (arg == "-dics") {

      argv++, argc--;
      if (! argc) { arg_needed(arg); }

      cout << "loading dics from " << *argv << " ...\n";
      load_dics(dicos, fs::path(*argv, fs::native));
      cout << "ok.\n";

    } else if (arg == "-o") {

      argv++, argc--;
      if (! argc) { arg_needed(arg); }

      outpath = fs::path(*argv, fs::native);

    } else if (arg == "-max") {

      argv++, argc--;
      if (! argc) { arg_needed(arg); }
      lexic_size = boost::lexical_cast<int>(*argv);

    } else if (arg == "-imaj") {

      lookup_type |= MATCH_IGNORE_MAJ;

    } else if (arg == "-icase") {

      lookup_type |= MATCH_IGNORE_CASE;

    } else if (arg == "-imark") {

      lookup_type |= MATCH_IGNORE_MARKS;

    } else {
      // anything else is taken as the input file (the last one wins)
      inpath = fs::path(arg, fs::native);
    }

    argv++, argc--;
  }

  if (lingdefpath.empty()) {
    bad_args("no lingdef specified");
  }

  if (dicos.empty() || inpath.empty()) {
    bad_args("missing argument");
  }


  if (outpath.empty()) {

    // derive the output name from the input one: foo.xml[.gz] -> foo.fsa
    outpath = inpath;

    if (fs::extension(outpath) == ".gz") {
      outpath = fs::change_extension(outpath, "");
    }
    outpath = fs::change_extension(outpath, ".fsa");
  }


  boost::timer tmr;

  ling_def lingdef(lingdefpath);

  xmlreader reader(inpath);

  dico_application_infos infos;

  // the fsa is created with 0 as second argument; the sentence count is only
  // known after processing and is patched into the file below
  bin_otext_fsa binotext(outpath, 0);
  otext_fsa_lexic_buf otext(binotext, lexic_size);

  apply_dics(reader, dicos, lookup_type, &lingdef, otext, infos);

  otext.close();

  // set size
  bin_fsa_set_size(outpath, infos.nbsentence);

  {
    fs::path unknow_path = outpath.branch_path() / "unknown_words";
    cout << "storing unknown words in '" << unknow_path.string() << "' ...\n";

    fs::ofstream out(unknow_path);
    if (! out) {
      // secondary output: report the problem but keep the main result
      cerr << "warning : unable to write unknown words in '" << unknow_path.string() << "'\n";
    }

    // one "<form> <count>" line per unknown form
    for (map<string, int>::iterator it = infos.unknown_words.begin(),
         end = infos.unknown_words.end();
         it != end; ++it) {

      out << it->first << " " << it->second << "\n";
    }
  }

  // read the timer once so that both printed figures agree, and do not
  // divide by zero when the run is shorter than the timer resolution
  const double elapsed = tmr.elapsed();
  const double tok_per_sec = (elapsed > 0.) ? static_cast<double>(infos.nbtoken) / elapsed : 0.;

  cout << "ok. textfsa is in " << outpath.string()
    <<  " (" << infos.nbsentence << " sentences, " << infos.nbtoken << " tokens, "
    << infos.nbtag << " tags, "
    << infos.nbword << " words, " << infos.unknown_words.size() << " unknown forms, "
    << infos.nbunknwtok << " unknown tokens)\n"
    << elapsed << " s. (average of " << tok_per_sec << " tok/s).\n";

  return 0;

} catch (const std::exception & e) {
  // qualified on purpose: both std and boost are imported at file scope
  cerr << "fatal error : " << e.what() << endl;
  return 1;
}


