#include <cassert>
#include <exception>
#include <fstream>
#include <iostream>
#include <map>
#include <string>

#include <boost/regex.hpp>
#include <boost/regex/icu.hpp>
#include <boost/lexical_cast.hpp>

#include <boost/filesystem/path.hpp>
#include <boost/filesystem/convenience.hpp>

#include <outilex/xml.h>
#include <outilex/sax.h>

using namespace std;
using namespace boost;

namespace fs = boost::filesystem;

namespace {

// Unicode-aware patterns built with Boost.Regex's ICU support. They are
// compiled once, when this translation unit's globals are initialised.

// Tokenizer: capture group 1 is one token, i.e. a run of letters, a run of
// decimal digits, a single punctuation character or a single symbol
// character. Characters outside these classes never appear in a token.
const u32regex tokenizer_regex =
  make_u32regex("([[:L*:]]+|[[:Nd:]]+|[[:P*:]]|[[:S*:]])");

// Whole-token classifiers, meant to be used with u32regex_match.
const u32regex word_regex  = make_u32regex("[[:L*:]]+");          // letters only
const u32regex upper_regex = make_u32regex("[[:Lu:]]+");          // uppercase letters only
const u32regex capit_regex = make_u32regex("[[:Lu:]][[:L*:]]+");  // uppercase letter, then more letters
const u32regex num_regex   = make_u32regex("[[:Nd:]]+");          // decimal digits only
const u32regex punc_regex  = make_u32regex("[[:P*:]]|[[:S*:]]");  // one punctuation or symbol character



class my_handler : public xml::sax_handler {

public:

  my_handler(xmlwriter & writer_)
    : writer(writer_), nb_sent(0), nb_word(0), in_sentence(false), in_word(false), word() {}

  void start_document() { writer.start_document(); }

  void end_document() { writer.end_document(); }

  void start_element(const string & name, map<string, string> & attrs) {
 
    //cerr << "start elem: " << name << endl;

    if (name == "text") {

      writer.start_element("document");
      writer.write_attribute("original_format", "tree");

      writer.start_element("par");
      writer.write_attribute("id", "1");

    } else if (name == "SENT") {

      assert(! in_sentence);

      writer.start_element("tu");
      writer.write_attribute("id", "s" + lexical_cast<string>(nb_sent));

      in_sentence = true;
      ++nb_sent;

    } else if (name == "w") {

      assert(in_sentence);

      if (attrs["compound"] == "yes") { return; }

      in_word = 1;
      word.clear();
    }
  }

  void end_element(const string & name) {


    if (name == "text") {

      writer.end_element(); // par
      writer.end_element(); // document

    } else if (name == "SENT") {

      writer.end_element(); // tu
      in_sentence = false;

    } else if (name == "w") {

      if (in_word) {
 
        // tokenize the word
        u32regex_token_iterator<string::const_iterator>
          tok(make_u32regex_token_iterator(word, tokenizer_regex, 1)), end;
      
        while (tok != end) {

          writer.start_element("token");

          const string & txt = tok->str();
          
          if (u32regex_match(txt, word_regex)) { // word

            writer.write_attribute("type", "word");

            if (u32regex_match(txt, upper_regex)) {
              writer.write_attribute("case", "upper");
            } else if (u32regex_match(txt, capit_regex)) {
              writer.write_attribute("case", "capit");
            }

          } else if (u32regex_match(txt, num_regex)) { // numeric
          
            writer.write_attribute("type", "numeric");
            
          } else if (u32regex_match(txt, punc_regex)) { // punctuation
          
            writer.write_attribute("type", "punctuation");
          }

          writer.write_string(txt);

          writer.end_element(); // token

          ++nb_word;
          ++tok;
        }
          
      }
      in_word = 0;
    }
  }


  void characters(const string & w) {

    if (in_word) {
      word += w;
    }
  }


public:
  xmlwriter & writer;
  int nb_sent, nb_word;
  bool in_sentence;
  int in_word;
  string word;

  /* in_word :
   * 0 : pas dans un element word, ou alors dans un element word avec attribut compound=yes
   * 1 : a l'entree d'un elem word non compose
   * //inutil2 : texte dans un elem word non compose
   */
};

} // namespace ""

int main(int argc, char ** argv) try {
  
  if (argc < 2) {
    cerr << "bad args\n";
    exit(1);
  }

  argv++, argc--;
  while (argc) {

    cout << "processing " << *argv << " ...\n";

    fs::path in(*argv);
    fs::path out = fs::change_extension(in, ".seg");

    if (in == out) {
      cerr << " error: bad file extension\n";
      argv++, argc--;
      continue;
    }

    xmlwriter writer(out);
    writer.set_indent(1);

    my_handler parser(writer);
    parser.parse(*argv);

    cout << parser.nb_word << " words. " << parser.nb_sent << " sentences.\n";
  
    argv++, argc--;
  }

} catch (exception & e) {
  cerr << "fatal error : " << e.what() << endl;
}

