#include <string>
#include <stdexcept>

#include <boost/lexical_cast.hpp>
#include <boost/filesystem/path.hpp>
#include <boost/filesystem/fstream.hpp>

#include <boost/regex.hpp>
#include <boost/regex/icu.hpp>

#include <outilex/unicode.h>

#include <outilex/serialize.h>
#include <outilex/dico.h>
#include <outilex/dic_indexer.h>

using namespace std;
using namespace boost;

namespace fs = boost::filesystem;

/**
 * Empty the indexer: calls clear() on the automaton and on both feature
 * tables (entry-level features and inflectional features).
 */
void dic_indexer::clear() {
  /* the three members are cleared separately; each call only touches its own member */
  inflex_feats.clear();
  entries_feats.clear();
  fsa.clear();
}


/**
 * Encode a lemma relative to one of its inflected forms.
 *
 * Only the first `lenf` code units of `form` and the first `lenl` code units
 * of `lemma` are considered.
 *
 * Result:
 *  - if both share a non-empty common prefix of length p: the decimal value
 *    of (lenf - p), i.e. the number of code units of the form that follow the
 *    common prefix, immediately followed by the part of the lemma after the
 *    prefix, converted with unicode::utf8_from_u_str;
 *  - otherwise (p == 0): the whole lemma converted the same way, with no
 *    leading number.
 *
 * @param uerror  ICU status, forwarded to unicode::utf8_from_u_str.
 */
string compress_lemma(const vector<UChar> & form, int lenf,
                      const vector<UChar> & lemma, int lenl, UErrorCode & uerror) {

  /* length of the longest common prefix of form[0..lenf) and lemma[0..lenl) */
  int common = 0;
  for (; common < lenl && common < lenf; ++common) {
    if (form[common] != lemma[common]) { break; }
  }

  string encoded;

  /* the count is emitted only when there is something in common */
  if (common > 0) {
    encoded = boost::lexical_cast<string>(lenf - common);
  }

  /* remainder of the lemma, after the shared prefix */
  encoded += unicode::utf8_from_u_str(& lemma[common], lenl - common, uerror);

  return encoded;
}


namespace {

/*
 * Regular expression used to split an inflected form into tokens.
 * The whole alternation is capture group 1, which is the sub-match
 * dic_indexer::add_entry asks the token iterator for.
 * The class names ([[:L*:]], [[:Nd:]], ...) are Unicode general-category
 * properties as understood by Boost.Regex when built with ICU support.
 */
u32regex tokenizer_regex(make_u32regex("(([[:L*:]]|[[:M*:]])+"    // word: run of letters and marks
                                       "|[[:Nd:]]+"    // number: run of decimal digits
                                       "|[[:P*:]]|[[:S*:]]" // one punctuation or one symbol character
                                       "|\\s+)")); // run of whitespace
}

/**
 * Index every inflected form of the dictionary entry `e`.
 *
 * The entry-level feature string (e.pos + e.feats) is registered once in
 * entries_feats. Then, for each inflected form of the entry:
 *  - the form is converted from UTF-8 to UTF-16 and NFD-normalized;
 *  - its inflectional features, followed by '.' and the lemma encoded
 *    relative to the form (see compress_lemma), are registered in
 *    inflex_feats;
 *  - the form is cut into tokens with tokenizer_regex, each token is followed
 *    by a '#', and the resulting key (without the final '#') is inserted in
 *    the automaton with the (entry index, inflectional index) pair as value.
 *
 * A form that produces no token at all (empty form, or a form made only of
 * characters the tokenizer does not match) is not inserted in the automaton:
 * there is no key to insert, and building the [begin, end) range from an
 * empty buffer would be undefined behaviour.
 */
void dic_indexer::add_entry(const dic_entry & e) {

  string efeats = e.pos + e.feats;
  string ifeats;

  int idx = entries_feats.add_if_not_here(efeats, efeats);

  UErrorCode uerror = U_ZERO_ERROR;

  int lenl, lenf, lenb;
  vector<UChar> lemma(64), form(64), buf(64), tokenized_form(64);

  /* lemma: UTF-8 to UTF-16, then canonical decomposition (NFD) */
  lenb = unicode::u_str_from_utf8(buf, e.lemma, uerror);
  lenl = unicode::normalize(lemma, UNORM_NFD, & buf[0], lenb, uerror);

  /* NOTE(review): the status is deliberately not checked here in the original
   * code (the call below is disabled); presumably a failure is reported by the
   * unicode_check done for the first inflected form -- confirm. */
  //unicode_check(uerror);

  for (vector<inflected>::const_iterator it = e.inflecteds.begin();
       it != e.inflecteds.end(); ++it) {

    const inflected & inflex = *it;

    lenb = unicode::u_str_from_utf8(buf, inflex.form, uerror); /* UTF-8 to UTF-16 */
    lenf = unicode::normalize(form, UNORM_NFD, & buf[0], lenb, uerror); /* UTF-16 decomposition */

    unicode_check(uerror);

    /* NOTE(review): the status left by compress_lemma is not checked before
     * the next iteration -- confirm this is intended. */
    ifeats = inflex.feats + '.' + compress_lemma(form, lenf, lemma, lenl, uerror);
    dic_entry_value val(idx, inflex_feats.add_if_not_here(ifeats, ifeats));

    /* zero-terminate the form: the token iterator is built from a bare pointer */
    form.resize(lenf+1);
    form[lenf] = 0;

    /* split the form into tokens (sub-match 1 of tokenizer_regex) */
    u32regex_token_iterator<const UChar *>
      tok(make_u32regex_token_iterator(& form[0], tokenizer_regex, 1)), end;

    /* key = token '#' token '#' ... token '#' */
    tokenized_form.clear();
    while (tok != end) {
      tokenized_form.insert(tokenized_form.end(), tok->first, tok->second);
      tokenized_form.push_back('#');
      ++tok;
    }

    /* no token at all: nothing to index, and &tokenized_form[0] / size() - 1
     * below would be invalid on an empty vector */
    if (tokenized_form.empty()) { continue; }

    /* the end pointer stops before the trailing '#' */
    fsa.add_entry(&tokenized_form[0], (&tokenized_form[0]) + tokenized_form.size() - 1, val);
  }
}


/**
 * Serialize state number `no` of the automaton to `os`.
 *
 * Layout, in order:
 *  - one value written with write_ushort: the number of outgoing transitions
 *    in the low 15 bits, with the high bit (0x8000) set when the state is
 *    final;
 *  - for each transition, in the iteration order of q.trans: its label
 *    (it->first, narrowed to unsigned short) written with write_ushort, then
 *    its destination (it->second) written with write_int3.
 *
 * @throws std::runtime_error if the state has more than 0x7FFF transitions:
 *         the count would then overlap the final-state flag (or be truncated)
 *         and the written file would be silently corrupt.
 */
void dic_indexer::output_state(ostream & os, int no) const {

  const FSA::state & q = fsa.states[no];

  /* the transition count shares its 16 bits with the final flag */
  if (q.trans.size() > 0x7FFFu) {
    throw std::runtime_error("dic_indexer::output_state: too many transitions in a state");
  }

  unsigned short n = static_cast<unsigned short>(q.trans.size());

  if (q.final) { // mark the node as final (one bit flag)
    n = n | 0x8000;
  }

  write_ushort(os, n);

  /* output transitions */

  FSA::state::const_trans_iterator it;
  for (it = q.trans.begin(); it != q.trans.end(); ++it) {

    unsigned short c = static_cast<unsigned short>(it->first);

    write_ushort(os, c);
    write_int3(os, it->second);
  }
}


/**
 * Serialize the value table of the automaton (fsa.values) to `os`.
 *
 * Layout, in order:
 *  - the number of value sets, written with write_int;
 *  - for each value set, in index order: its cardinality written with
 *    write_int2, then for each value of the set (in set order) the entry
 *    index (idx) and the inflectional-features index (feats), each written
 *    with write_int3.
 */
void dic_indexer::output_values(ostream & os) const {

  const int nbsets = fsa.values.size();
  write_int(os, nbsets);

  for (int no = 0; no < nbsets; ++no) {

    const set<dic_entry_value> & entries = fsa.values[no];
    write_int2(os, entries.size());

    set<dic_entry_value>::const_iterator cur = entries.begin();
    while (cur != entries.end()) {
      write_int3(os, cur->idx);   /* lexical entry number */
      write_int3(os, cur->feats); /* morphological features number */
      ++cur;
    }
  }
}


/* Writes one feature table as text: its size on one line, then one entry per line. */
template <typename FeatTable>
static void output_feat_table(std::ostream & os, const FeatTable & table) {
  os << table.size() << '\n';
  for (int i = 0; i < table.size(); ++i) {
    os << table[i] << '\n';
  }
}

/**
 * Write both feature tables to `os` as text.
 *
 * Output: a "%%" line, the entry-level table (entries_feats), a "%%" line,
 * the inflectional table (inflex_feats), and a closing "%%" line. Each table
 * is written as its size on one line followed by one entry per line.
 */
void dic_indexer::output_feats(ostream & os) const {
  os << "%%\n";
  output_feat_table(os, entries_feats);
  os << "%%\n";
  output_feat_table(os, inflex_feats);
  os << "%%\n";
}


/**
 * Serialize the automaton to `os`: the number of states (write_int), then
 * every state in increasing state number (output_state), then the value
 * table (output_values).
 */
void dic_indexer::output_fsa(ostream & os) const {

  /* debugging trace, disabled:
  cerr << "output_fsa: " << fsa.size() << " states, " << entries_feats.size() << " feat entries, "
    << inflex_feats.size() << " inflex feats\n";
  */

  /* header: number of states */
  write_int(os, fsa.size());

  /* the states themselves */
  int state_no = 0;
  while (state_no < fsa.size()) {
    output_state(os, state_no);
    ++state_no;
  }

  /* values attached to the automaton */
  output_values(os);
}



/**
 * Write the whole index to the file `path` (opened in binary mode, truncated
 * if it already exists).
 *
 * File content, in order: DICOMAGIC (write_int), the automaton (output_fsa),
 * then the feature tables (output_feats).
 *
 * On failure to open the file, or if the stream is in a failed state once
 * everything has been written and flushed, a message is printed on cerr and
 * the process exits with status 1. Checking after the writes avoids leaving a
 * truncated or corrupt file behind without any diagnostic.
 */
void dic_indexer::write(const fs::path & path) const {

  fs::ofstream bin(path, ios_base::binary|ios_base::out|ios_base::trunc);

  if (! bin) {
    cerr << "unable to open " << path.string() << endl;
    exit(1);
  }

  write_int(bin, DICOMAGIC);

  output_fsa(bin);
  output_feats(bin);

  /* force buffered data out so that a write error shows up in the stream state */
  bin.flush();

  if (! bin) {
    cerr << "error while writing " << path.string() << endl;
    exit(1);
  }
}


struct dump_entries {

  ostream & os;
  const dic_indexer & dic;

  dump_entries(ostream & _os, const dic_indexer & _dic) : os(_os), dic(_dic) {}

  void operator()(const vector<UChar> & _form, const set<dic_entry_value> & vals) {
    UErrorCode uerror = U_ZERO_ERROR;
    string form = unicode::utf8_from_u_str(_form, uerror);
    os << form << ":\n";
    for (set<dic_entry_value>::const_iterator it = vals.begin(); it != vals.end(); ++it) {
      os << "-> " << dic.entries_feats[(*it).idx] << " " << dic.inflex_feats[(*it).feats] << '\n';
    }
  }
};

void dic_indexer::dump(std::ostream & os) const {
  fsa.apply_lexic(dump_entries(os, *this));
}

