#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include <string>

#include <boost/lexical_cast.hpp>
#include <boost/filesystem/path.hpp>
#include <boost/filesystem/convenience.hpp>

#include <outilex/xmlReader.h>
#include <outilex/xmlWriter.h>
#include <outilex/xml.h>


using namespace std;
using namespace boost;

namespace fs = boost::filesystem;

namespace {

// Conversion state shared by the SAX callbacks below and by main().

// Output writer; every callback emits the converted document through it.
xmlwriter writer;
// Index of the current sentence; -1 until the first <sentence>/<tu> is seen.
int sentenceno = -1;
// Index of the current state inside the current sentence; reset to -1 at
// each sentence start and incremented for every <p> element.
int stateno = -1;
// Value of the "pos" attribute of the most recently seen <p> element.
int currpos = 0;
int pos0 = 0; // position of the state 0 in current sentence



const char * find_attribute(const char * name, const xmlChar ** atts) {

  if (atts == NULL) { return NULL; }

  while (*atts) {
    if (xmlStrcmp(name, *atts) == 0) {
      //cerr << (char *) *atts << " = " << (char *) (*(atts + 1)) << endl;
      atts++; break;
    }
    atts++, atts++;
  }
  return (const char *) (*atts);
}


// SAX startDocument callback: begins the output document on the shared
// writer. The user-data pointer `d` is not used.
void startDocument(void * d) {
  writer.start_document();
}

// SAX endDocument callback: finishes the output document on the shared
// writer. The user-data pointer `d` is not used.
void endDocument(void * d) {
  writer.end_document();
}


void startElement(void * ctx, const xmlChar * name, const xmlChar ** atts) {
  
  //cerr << "startelem: " << (char *) name << endl;

  const char * text;

  if (xmlStrcmp(name, "par") == 0) { return; } // forget about paragraphs ...

  if (xmlStrcmp(name, "tagged-text") == 0) {
  
    writer.start_element("text_fsa");

  } else if (xmlStrcmp(name, "sentence") == 0 || xmlStrcmp(name, "tu") == 0) {

    ++sentenceno;
    writer.start_element("sentence");
    writer.write_attribute("no", lexical_cast<string>(sentenceno));
    stateno = -1;

  } else if (xmlStrcmp(name, "p") == 0) { // new state
 
    text = find_attribute("pos", atts);

    currpos = lexical_cast<int>(text);

    ++stateno;
    if (stateno != 0) { // close previous state
      writer.end_element();
    } else {
      pos0 = currpos;
    }

    writer.start_element("state");
    writer.write_attribute("no", lexical_cast<string>(stateno));
    writer.write_attribute("pos", lexical_cast<string>(currpos));

  } else if (xmlStrcmp(name, "tag") == 0) {

    text = find_attribute("to", atts);
    int to = lexical_cast<int>(text) - pos0;

    writer.start_element("transition");
    writer.write_attribute("to", lexical_cast<string>(to));

  } else { // recopy element as is (lex_entry, form, lemma, ..)

    writer.start_element(name);

    if (atts != NULL) {
      for (int i = 0; atts[i]; ++i) {
        if (atts[i + 1]) {
          writer.write_attribute(atts[i], atts[i+1]);
          ++i;
        }
      }
    }
  }
}

void endElement(void * ctx, const xmlChar * name) {
  
  if (xmlStrcmp(name, "par") == 0) { return; } // forget about paragraphs ...

  if (xmlStrcmp(name, "p") == 0) {  return; } // ... and pos
 

  if (xmlStrcmp(name, "sentence") == 0 || xmlStrcmp(name, "tu") == 0) {
    // end of sentence: mark state as final and close it

    writer.write_attribute("final", "1");
    writer.end_element(); // state

    writer.end_element(); // sentence


  } else if (xmlStrcmp(name, "tag") == 0) {
 
    writer.end_element(); // transition

  } else { // recopy element as is (lex_entry, form, lemma, ..)

    writer.end_element();
  }
}



void characters(void * ctx, const xmlChar * ch, int len) {

  string str((const char *) ch, len);
  writer.write_string(str);
}



/*
 * SAX callback table handed to xmlSAXUserParseFile() in main().
 *
 * The initializer is positional, so the order of the entries must follow the
 * member order of xmlSAXHandler; the slot names in the comments are those of
 * libxml2's declaration (NOTE(review): confirm against the installed
 * <libxml/parser.h>). Only startDocument, endDocument, startElement,
 * endElement and characters are handled; every other event is left NULL.
 */

xmlSAXHandler mySAXhandler = {
    NULL, /* internalSubset */
    NULL, /* isStandalone */
    NULL, /* hasInternalSubset */
    NULL, /* hasExternalSubset */
    NULL, /* resolveEntity */
    NULL, /* getEntity */
    NULL, /* entityDecl */
    NULL, /* notationDecl */
    NULL, /* attributeDecl */
    NULL, /* elementDecl */
    NULL, /* unparsedEntityDecl */
    NULL, /* setDocumentLocator */
    startDocument, /* startDocument */
    endDocument, /* endDocument */
    startElement, /* startElement */
    endElement, /* endElement */
    NULL, /* reference */
    characters, /* characters */
    NULL, /* ignorableWhitespace */
    NULL, /* processingInstruction */
    NULL, /* comment */
    NULL, /* warning */
    NULL, /* error */
    NULL, /* fatalError */
    NULL, /* getParameterEntity */
    NULL, /* cdataBlock */
    NULL, /* externalSubset */
    1,    /* initialized -- presumably; TODO confirm which value libxml2 expects here */
    NULL, /* _private -- presumably */
    NULL, /* startElementNs */
    NULL, /* endElementNs */
    NULL  /* serror (xmlStructuredErrorFunc) */
};


// Name the program was invoked with (argv[0]); assigned at the top of main().
char * progname;

/*
 * Write the one-line command synopsis to standard output, then terminate
 * the process with exit status 0. Does not return.
 */
void usage() {
  cout << "usage: ";
  cout << progname;
  cout << " [-o <output> ] <tokenized text>\n";
  exit(0);
}

} // namespace anonymous



int main(int argc, char ** argv) {

  try {
  progname = *argv;
  fs::path inpath, outpath;

  argv++, argc--;

  if (! argc) { usage(); }


  while (argc) {
    
    string arg = *argv;
 
    if (arg == "-o") {
      
      argv++, argc--;
      if (! argc) { usage(); }
      
      outpath = fs::path(*argv, fs::native);
 
    } else if (arg == "-h") {
    
      usage();
    
    } else {
      inpath = fs::path(arg, fs::native);
    }
    argv++, argc--;
  }
 

  if (inpath.empty()) {
    cerr << "bad args\n";
    exit(1);
  }
 
  if (outpath.empty()) {
    outpath = fs::change_extension(inpath, ".fsa");
  }

  writer.open(outpath);

  sentenceno = -1;
  stateno = -1;
  currpos = 0;
  pos0 = 0;
  
  int res = xmlSAXUserParseFile(& mySAXhandler, NULL, inpath.native_file_string().c_str());

  if (res) { cerr << "some error occured...\n"; }

  cerr << "done. " << sentenceno + 1 << " sentences. " << currpos - sentenceno << " tokens.\n";

  return res;

  } catch (exception & e) {
    cerr << "fatal error: exception caught: " << e.what() << endl;
    exit(1);
  }
}

