#include <iostream>
#include <sstream>
#include <fstream>
#include <stdexcept>

#include <boost/filesystem/fstream.hpp>
#include <boost/filesystem/convenience.hpp>
#include <boost/regex.hpp>
#include <boost/regex/icu.hpp>
#include <boost/lexical_cast.hpp>

#include <outilex/stringtok.h>
#include <outilex/unitex_grf.h>
#include <outilex/generic_fst.h>
#include <outilex/xml.h>



using namespace std;
using namespace boost;

namespace fs = boost::filesystem;

namespace {
/* unicode regexp to tokenize grf input :
   <[^>]*>                   -> <le.det+m+s>
   ^:(.)+                    -> appel a un sous graphe
   @[^@]+@                   -> parametre
   [[:L*:]]+                 -> forme flechie
   [[:Nd:]]+                 -> 2005
   [[:P*:]]|[[:S*:]]  -> punctuation and symbol
   see http://www.boost.org/libs/regex/doc/character_class_names.html */

u32regex grf_token_regex(make_u32regex("(<[^>]*>"                 // <le.det+p>
                                       "|^:(.)+" // :path:to:subgraph
                                       "|@[^@]+@"                 // @N0=queP@
                                       "|[[:L*:]+"                // forme  
                                       "|[[:Nd:]]+"               // 12321
                                       "|[[:P*:]]|[[:S*:]])"));   // .

template<typename OutputIterator>
void tokenize_grf_label(const std::string & label, OutputIterator out) {

  u32regex_token_iterator<string::const_iterator>
    tok(make_u32regex_token_iterator(label, grf_token_regex, 1)), end;

  while (tok != end) {
    *out = tok->str();
    ++out, ++tok;
  }
}

} // namespace ""



void grf_box::read_line(const std::string & line) {

  input.clear();
  output.clear();
  trans.clear();

  string::size_type endlabel = line.rfind('"');

  if (endlabel == string::npos) {
    throw runtime_error("grf_box::readline: bad grf format");
  }

  /* parse box label */

  string label = line.substr(1, endlabel - 1);
  if (label.empty()) { label = "<E>"; }

  string::size_type slash = label.find('/');

  if (slash != string::npos) {
    output = label.substr(slash + 1);
    label.erase(slash);
  } else {
    output.clear();
  }

  bool inangle = false;
  for (int i = 0; i < label.size(); ++i) {
    
    if (inangle) {
      if (label[i] == '>') { inangle = false; }
    } else {
      if (label[i] == '<') {
        inangle = true;
      } else if (label[i] == '+') { label[i] = '\n'; } // replace '+' separator by '\n'
    }
  }
  
  stringtok(label.substr(0, slash), "\n", back_inserter(input)); 

#if 0

  DO NOT TOKENIZE HERE, but during grf/fst conversion

  vector<string> untokenized_input;
  /* tokenize input
   * ex: "la <pomme de terre.noun>" -> "la" "<pomme de terre.noun>"
   */
  input.resize(untokenized_input.size());

  /* new tokenization, use boost regex on unicode icu strings */

  for (int i = 0; i < untokenized_input.size(); ++i) {

    u32regex_token_iterator<string::const_iterator>
      tok(make_u32regex_token_iterator(untokenized_input[i], grf_token_regex, 1)), end;

    while (tok != end) {
      input[i].push_back(tok->str());
      ++tok;
    }
  }
#endif

  /* read box description */
  istringstream is(line.substr(endlabel + 1));

  is >> x;
  is >> y;

  int ntrans, no;
  is >> ntrans;

  while (is >> no) {
    trans.push_back(no);
  }

  assert(trans.size() == ntrans);
}


/* Load a graph from `path`, replacing the current boxes.
 * Files whose extension is ".xgrf" go through read_xml; anything else is
 * opened as a stream and handed to read_grf8.
 * Throws std::runtime_error when the stream cannot be opened.
 */
void unitex_grf::load(const boost::filesystem::path & path) {

  boxes.clear();

  const bool xml_format = (fs::extension(path) == ".xgrf");
  if (xml_format) {
    read_xml(path);
    return;
  }

  fs::ifstream grf_stream(path);
  if (grf_stream.fail()) {
    throw runtime_error("grf::load: cannot open " + path.string());
  }

  read_grf8(grf_stream);
}



void grf_box::read_xml(xmlNode * node) {

  input.clear();
  output.clear();
  trans.clear();

  xmlNodePtr node1;
  vector<string> transitions;
  char * text;

  text = xmlGetProp(node, "x");
  x = lexical_cast<int>(text);
  xmlFree(text);

  text = xmlGetProp(node, "y");
  y = lexical_cast<int>(text);
  xmlFree(text);
 
  node = node->xmlChildrenNode;
  while (node != NULL) {

    if (xmlStrcmp(node->name, "labels") == 0) {

      node1 = node->xmlChildrenNode;

      while (node1 != NULL) { //reading labels

	if (xmlStrcmp(node1->name, "label") == 0){
	  text = xmlGetProp(node1, "value");
	  input.push_back(text);
          xmlFree(text);
	}	

	node1 = node1->next;
      }

#if 0

     DO NOT TOKENIZE LABELS HERE

      input.resize(untokenized_input.size());
      /* new tokenization, use boost regex on unicode icu strings */
      
      for (int i = 0; i < untokenized_input.size(); ++i) {
	if(untokenized_input[i][0] != ':'){
	  u32regex_token_iterator<string::const_iterator>
	    tok(make_u32regex_token_iterator(untokenized_input[i], grf_token_regex, 1)), end;
	  while (tok != end) {
	    input[i].push_back(tok->str());
	    ++tok;
	  }
	}
	else{
	  input[i].push_back(untokenized_input[i]);
	}
      }
#endif
    }

    if (xmlStrcmp(node->name, "out") == 0) { // reading output
      output = xmlGetConstProp(node,"value");
    }

    if (xmlStrcmp(node->name, "trans") == 0) { // reading output
      string temp = xmlGetConstProp(node,"dest");
      transitions.clear();
      stringtok(temp, " +", back_inserter(transitions));
      for(int i = 0 ; i < transitions.size() ; i++){
	trans.push_back(lexical_cast<int>(transitions[i]));
      }
    }
    node = node->next;
  }
}



/* Emit this box as a <box> element through `writer`.
 * Layout: attributes id / x / y, then <labels number="N"> holding one
 * <label value="..."/> per input, then <out value="..."/>, then
 * <trans dest="..."/> where every destination is followed by one space.
 */
void grf_box::write_xml(xmlwriter & writer, int id) const {

  writer.start_element("box");

  writer.write_attribute("id", lexical_cast<string>(id));
  writer.write_attribute("x", lexical_cast<string>(x));
  writer.write_attribute("y", lexical_cast<string>(y));

  /* labels */
  writer.start_element("labels");
  writer.write_attribute("number", lexical_cast<string>(input.size()));

  for (size_t k = 0; k != input.size(); ++k) {
    writer.start_element("label");
    // Maybe switch to a text node instead of an attribute? ...
    writer.write_attribute("value", lexical_cast<string>(input[k]));
    writer.end_element(); // label
  }
  writer.end_element(); // labels

  /* output */
  writer.start_element("out");
  writer.write_attribute("value", lexical_cast<string>(output));
  writer.end_element(); // out

  /* transitions: "d1 d2 ... dn " (trailing space included) */
  ostringstream dest_list;
  for (size_t k = 0; k != trans.size(); ++k) {
    dest_list << trans[k] << " ";
  }

  writer.start_element("trans");
  writer.write_attribute("dest", dest_list.str());
  writer.end_element(); // trans

  writer.end_element(); // box
}


/* Read every <box> child of `node` into `boxes`, in document order: the
 * k-th <box> element fills boxes[k]. `boxes` must already have been sized
 * by the caller.
 * Throws std::runtime_error when there are more <box> elements than slots,
 * instead of writing past the end of `boxes`.
 */
void unitex_grf::read_xgrf_boxes(xmlNodePtr node){
  size_t next_box = 0;

  for (xmlNodePtr child = node->xmlChildrenNode; child != NULL; child = child->next) {

    if (xmlStrcmp(child->name, (const xmlChar *) "box") != 0) { continue; }

    if (next_box >= boxes.size()) {
      throw runtime_error("grf::read_xml: more <box> elements than declared");
    }

    boxes[next_box].read_xml(child);
    ++next_box;
  }
}

/* Load an .xgrf (XML) graph from `path`, replacing the current boxes.
 *
 * The first node read must be an <xgrf> element, otherwise the graph is
 * left empty. Each <boxes number="N"> child resizes `boxes` to N and is
 * then read with read_xgrf_boxes.
 *
 * Throws std::runtime_error when the file cannot be opened or parsed, or
 * when the "number" attribute is missing or negative, and
 * boost::bad_lexical_cast when it is not a number. The text reader is
 * released on every exit path.
 */
void unitex_grf::read_xml(const boost::filesystem::path & path) {

  /* Owns the text reader and frees it when the function is left,
   * whether by return or by exception. */
  struct reader_guard {
    xmlTextReaderPtr handle;
    explicit reader_guard(xmlTextReaderPtr r) : handle(r) {}
    ~reader_guard() { if (handle != NULL) { xmlFreeTextReader(handle); } }
  };

  boxes.clear();

  reader_guard reader(xmlReaderForFile(path.string().c_str(), NULL, 0));
  if (reader.handle == NULL) {
    throw runtime_error("grf::read_xml: cannot open " + path.string());
  }

  //xmlTextReaderSetParserProp(reader.handle, XML_PARSER_VALIDATE, 1);
  if (xmlTextReaderRead(reader.handle) != 1) {
    throw runtime_error("grf::read_xml: cannot parse " + path.string());
  }

  if ((xmlTextReaderNodeType(reader.handle) == XML_READER_TYPE_ELEMENT)
      && (xmlStrcmp(xmlTextReaderConstName(reader.handle), (const xmlChar *) "xgrf") == 0)) {

    xmlNodePtr root = xmlTextReaderExpand(reader.handle);
    if (root == NULL) {
      throw runtime_error("grf::read_xml: cannot parse " + path.string());
    }

    for (xmlNodePtr cur = root->xmlChildrenNode; cur != NULL; cur = cur->next) {

      if (xmlStrcmp(cur->name, (const xmlChar *) "boxes") != 0) { continue; }

      char * text = xmlGetProp(cur, "number");
      if (text == NULL) {
        throw runtime_error("grf::read_xml: <boxes> without 'number' attribute");
      }
      // copy, then free at once: the conversion below may throw
      const string number_text(text);
      xmlFree(text);

      const int number = lexical_cast<int>(number_text);
      if (number < 0) {
        throw runtime_error("grf::read_xml: negative number of boxes");
      }

      boxes.resize(number);
      read_xgrf_boxes(cur);
    }
  }
}


/* Emit the whole graph through `writer` as
 *   <xgrf> <header/> <boxes number="N"> ...one <box> per box... </boxes> </xgrf>
 * Each box is written with its index in `boxes` as id.
 */
void unitex_grf::write_xml(xmlwriter & writer) const {

  writer.start_element("xgrf");

  /* header: currently empty */
  writer.start_element("header");
  //....
  writer.end_element(); // header

  writer.start_element("boxes");
  writer.write_attribute("number", lexical_cast<string>(boxes.size()));

  const int box_count = static_cast<int>(boxes.size());
  for (int id = 0; id != box_count; ++id) {
    boxes[id].write_xml(writer, id);
  }

  writer.end_element(); // boxes
  writer.end_element(); // xgrf
}


/* Write the graph as an indented XML document to the file `path`:
 * opens an xmlwriter on the path, then wraps the xmlwriter overload of
 * write_xml between start_document and end_document.
 */
void unitex_grf::write_xml(const fs::path & path) const {

  xmlwriter document(path);
  document.set_indent(true);

  document.start_document();
  write_xml(document);
  document.end_document();
}



/* Read a graph in .grf text format from `is`, replacing the current boxes.
 *
 * Expected layout: a first line with '#' at index 0 (or at index 3, after
 * a UTF-8 byte order mark), header lines up to and including the next line
 * starting with '#', one line holding the number of boxes, then one line
 * per box (parsed by grf_box::read_line).
 *
 * Side effect: sets the exception mask of `is` to badbit | failbit, so a
 * truncated file makes getline throw std::ios_base::failure. eofbit is
 * deliberately left out of the mask: it is also raised when the last line
 * is read successfully but has no trailing newline.
 *
 * Throws std::runtime_error on a bad first line or an invalid box count.
 */
void unitex_grf::read_grf8(istream & is) {

  boxes.clear();

  is.exceptions(ios::badbit | ios::failbit);

  string line;

  getline(is, line);

  // line[3] is only inspected when it exists (UTF-8 BOM case)
  const bool has_marker = !line.empty()
    && (line[0] == '#' || (line.size() > 3 && line[3] == '#'));

  if (!has_marker) {
    cerr << "1st line = '" << line << "'\n";
    throw runtime_error("grf::load: bad grf format (missing '#')");
  }

  /* skip headers */
  do {

    getline(is, line);

  } while (line.empty() || line[0] != '#');

  /* read nb boxes */

  getline(is, line);

  int nbboxes = 0;
  if (!(istringstream(line) >> nbboxes) || nbboxes < 0) {
    throw runtime_error("grf::load: bad grf format (invalid number of boxes)");
  }

  boxes.resize(nbboxes);

  for (int i = 0; i < nbboxes; ++i) {
    getline(is, line);
    boxes[i].read_line(line);
  }
}




void grf_to_generic_fst(const unitex_grf & grf, generic_fst & fst) {

  fst.clear();
  fst.resize(grf.size());

  /* set final state (always box #2) */

  fst.set_final(1);

  for (int q = 0; q < grf.size(); ++q) {

    const grf_box & box = grf[q];


    /* tokenize input
     * ex: "la <pomme de terre.noun>" -> "la" "<pomme de terre.noun>"
     */

    vector<vector<string> > input(box.input.size());

    for (int i = 0; i < box.input.size(); ++i) {
      tokenize_grf_label(box.input[i], back_inserter(input[i]));
    }


    for (vector<int>::const_iterator tr = box.trans.begin(); tr != box.trans.end(); ++tr) {

      for (int i = 0; i < input.size(); ++i) {

        if (input[i].empty()) { continue; }

        if (input[i].size() == 1) { // one token (common case)
          
          fst.add_trans(q, input[i][0], box.output, *tr);

        } else { // create a trans per token

          int curq = q;
          int j = 0;
          for (j = 0; j < input[i].size() - 1; ++j) {
            int nq = fst.add_state();
            fst.add_trans(curq, input[i][j], (j == 0) ? box.output : "", nq);
            curq = nq;
          }
          fst.add_trans(curq, input[i][j], "", *tr);
        }
      }
    }
  }
}



/* Write the fixed header of a .grf file to `os`: the "#Unigraph8" line,
 * the display settings, and the closing "#" line, each terminated by a
 * newline character.
 */
void write_grf8_headers(ostream & os) {

  static const char * const header_lines[] = {
    "#Unigraph8",
    "SIZE 1188 840",
    "FONT Times New Roman:  10",
    "OFONT Times New Roman:  10",
    "BCOLOR 16777215",
    "FCOLOR 0",
    "ACOLOR 13487565",
    "SCOLOR 16711680",
    "CCOLOR 255",
    "DBOXES y",
    "DFRAME y",
    "DDATE n",
    "DFILE y",
    "DDIR n",
    "DRIG n",
    "DRST n",
    "FITS 100",
    "PORIENT L",
    "#"
  };

  const size_t line_count = sizeof(header_lines) / sizeof(header_lines[0]);

  for (size_t k = 0; k != line_count; ++k) {
    os << header_lines[k] << '\n';
  }
}


/* Write the graph to `path` in .grf text format: the fixed header, the
 * number of boxes, then one line per box of the form
 *   "in1+in2+.../output" x y ntrans dest1 dest2 ...
 * (the "/output" part is omitted when the box has no output).
 *
 * Throws std::runtime_error when the file cannot be opened, when a box has
 * no input label, or when writing fails.
 */
void unitex_grf::write_grf8(const fs::path & path) const {

  fs::ofstream os(path);
  if (! os) {
    throw runtime_error("grf::write_grf8: cannot open " + path.string());
  }

  write_grf8_headers(os);

  os << boxes.size() << "\n";

  for (size_t b = 0; b < boxes.size(); ++b) {

    const grf_box & box = boxes[b];
    if (box.input.empty()) { throw runtime_error("grf::write_grf8: box with no label?"); }

    /* label: inputs joined by '+', then the optional output */
    os << '"' << box.input[0];
    for (size_t k = 1; k < box.input.size(); ++k) {
      os << "+" << box.input[k];
    }
    if (! box.output.empty()) { os << "/" << box.output; }
    os << '"' << ' ';

    /* position and transitions */
    os << box.x << ' ' << box.y << ' ' << box.trans.size() << ' ';

    for (size_t t = 0; t < box.trans.size(); ++t) {
      os << box.trans[t] << ' ';
    }
    os << '\n';
  }

  // report a failed write instead of silently leaving a truncated file
  os.flush();
  if (! os) {
    throw runtime_error("grf::write_grf8: error while writing " + path.string());
  }
}

