#include <cassert>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

#include <outilex/xml.h>
#include <outilex/xpath.h>

using namespace std;

// Set to true by the "-id" command-line flag in main(); its only reader is
// the dump_id() helper, which is currently compiled out with #if 0.
bool DUMPID;
// Next chunk identifier to hand out (post-incremented by the dump_* functions);
// main() resets it to 0 before dumping each sentence.
int chunkid;

// Disabled helper (compiled out): when DUMPID is set it writes "/<chunkid> "
// to os and advances chunkid. Every call to it in this file is commented out.
#if 0
void dump_id(ostream & os) {
  if (DUMPID) {
    os << "/" << chunkid++ << " ";
  }
}

#endif

/**
 * A dumped syntactic unit: the lemma chosen to represent it and the
 * identifier it was given. An id of -1 means no identifier was assigned.
 */
struct chunk {
  string lemma; // representative lemma, may be empty
  int id;       // chunk identifier, -1 when unset

  /// Builds an empty chunk with no identifier.
  chunk()
    : lemma(),
      id(-1)
  {}

  /// Builds a chunk from a lemma and an optional identifier (default -1).
  chunk(const string & l, int i = -1)
    : lemma(l),
      id(i)
  {}
};

/// Streams a chunk as "(lemma,id)" and returns the stream so calls can be chained.
ostream & operator<<(ostream & os, const chunk & c) {
  return os << "(" << c.lemma << "," << c.id << ")";
}

void dump_rels(const chunk & head, const vector<chunk> & deps, ostream & os = cout) {

  for (int i = 0; i < deps.size(); ++i) {
    os << head.id << " " << deps[i].id << "\t" << head.lemma << " <- " << deps[i].lemma << "\n";
  }
}

/**
 * Writes the surface text of a sentence on a single line.
 *
 * Evaluates the XPath expression ".//w[not(@compound)]" in a context built
 * from (doc, SENT), writes the text content of every selected node followed
 * by a space, then terminates the line with '\n'.
 *
 * @param doc  document passed to the XPath context
 * @param SENT node passed to the XPath context (presumably the context node
 *             the relative expression is evaluated from -- confirm against
 *             the outilex wrapper)
 * @param os   destination stream
 */
void dump_sentence(xmlDoc * doc, xmlNode * SENT, ostream & os) {

  xmlXPathContext * xpath_ctx = xmlXPathNewContext(doc, SENT);
  assert(xpath_ctx);

  xmlXPathObject * xpath_obj = xmlXPathEvalExpression(".//w[not(@compound)]", xpath_ctx);
  assert(xpath_obj);

  // nodesetval can be NULL when the expression selects nothing; treat that
  // as "zero words" instead of aborting.
  xmlNodeSet * nodeset = xpath_obj->nodesetval;
  const int nb_words = nodeset ? nodeset->nodeNr : 0;

  for (int i = 0; i < nb_words; ++i) {

    char * text = (char *) xmlNodeGetContent(nodeset->nodeTab[i]);

    // Streaming a null char* is invalid and can put the stream in a failed
    // state, so a word without content only contributes its separator.
    if (text) {
      os << text;
      xmlFree(text);
    }
    os << ' ';
  }

  os << '\n';

  xmlXPathFreeObject(xpath_obj);
  xmlXPathFreeContext(xpath_ctx);
}

/**
 * Returns the surface form of a word node.
 *
 * - If the node carries a "compound" attribute, the forms of its element
 *   children are computed recursively and joined with single spaces; when
 *   there is no element child the placeholder "?no compound form?" is
 *   returned.
 * - Otherwise the node's text content is returned (empty string when the
 *   node has no content).
 *
 * @param node word node to read
 * @return the surface form as described above
 */
string get_w_form(xmlNode * node) {

  char * compound = xmlGetProp(node, "compound");

  if (!compound) {
    char * txt = (char *) xmlNodeGetContent(node);
    // Building a std::string from a null pointer is undefined behaviour.
    string simple_form(txt ? txt : "");
    if (txt) { xmlFree(txt); }
    return simple_form;
  }

  // Only the presence of the attribute matters, not its value.
  xmlFree(compound);

  string form;
  for (node = node->children; node; node = node->next) {
    if (node->type != XML_ELEMENT_NODE) { continue; }
    form += get_w_form(node) + " ";
  }
  if (form.empty()) { form = "?no compound form? "; }

  // Drop the trailing separator (form is never empty at this point).
  form.resize(form.size() - 1);
  return form;
}


// Forward declaration: dump_chunks() dispatches to the element-specific
// dumpers below, and several of them call back into dump_chunks().
chunk dump_chunks(xmlNode * node, ostream & os);


/**
 * Dumps a clause-like element (the element names routed here are listed in
 * dump_chunks()).
 *
 * Every element child is dumped through dump_chunks(). The first child named
 * "VN" becomes the head; every other child becomes a dependent of that head.
 * The head/dependent relations are then written with dump_rels().
 *
 * @param node element whose children are dumped
 * @param os   destination stream
 * @return the head chunk; a default chunk (empty lemma, id -1) when no "VN"
 *         child was found
 */
chunk dump_sent(xmlNode * node, ostream & os) {

  chunk head;
  vector<chunk> deps;

  for (xmlNode * child = node->children; child; child = child->next) {

    // Test the child, not the parent: text and comment nodes must not be
    // turned into empty dependents.
    if (child->type != XML_ELEMENT_NODE) { continue; }

    const chunk c = dump_chunks(child, os);

    const string name((char *) child->name);
    if (head.id == -1 && name == "VN") {
      head = c;
    } else {
      deps.push_back(c);
    }
  }

  // Relations go to the caller's stream, like the rest of the output.
  dump_rels(head, deps, os);
  return head;
}



/**
 * Dumps a "VN" element.
 *
 * Walks the element children of node:
 *  - "AdP" and "COORD" children are skipped;
 *  - any other child not named "w" is reported on cerr and skipped;
 *  - a "w" with cat "CL" becomes a dependent carrying its "lemma" attribute;
 *  - a "w" with cat "V" supplies the head lemma (the last one wins);
 *  - a "w" with cat "ADV", "PONCT" or "AdP" is ignored;
 *  - any other cat (including a missing attribute, treated as "") is
 *    reported on both os and cerr.
 * The head then receives the next chunk id and the relations are written.
 *
 * @param node the VN element
 * @param os   destination stream
 * @return the head chunk of the verbal nucleus
 */
chunk dump_vn(xmlNode * node, ostream & os) {

  chunk head;
  vector<chunk> deps;

  for (node = node->children; node; node = node->next) {

    if (node->type != XML_ELEMENT_NODE) { continue; }

    const string name((char *) node->name);

    if (name == "AdP" || name == "COORD") { continue; }

    if (name != "w") {
      cerr << "dump_vn: unknown elem: '" << name << "'\n";
      continue;
    }

    // A missing attribute yields a null pointer; never build a string from it.
    const char * cat_attr = xmlGetConstProp(node, "cat");
    const string cat(cat_attr ? cat_attr : "");

    if (cat == "CL") {

      const char * lemma_attr = xmlGetConstProp(node, "lemma");
      deps.push_back(chunk(lemma_attr ? lemma_attr : "", chunkid++));

    } else if (cat == "ADV" || cat == "PONCT" || cat == "AdP") {
      // ignore punctuation and adverbs inside the VN
    } else if (cat == "V") {

      const char * lemma_attr = xmlGetConstProp(node, "lemma");
      head.lemma = lemma_attr ? lemma_attr : "";

    } else {
      os << "?w:cat=" << cat << "? ";
      cerr << "dump_vn: unknown cat: " << cat << endl;
    }
  }

  // The head is numbered after its clitic dependents.
  head.id = chunkid++;
  dump_rels(head, deps, os);
  return head;
}



/**
 * Dumps an "NP" element.
 *
 * The leading run of "w" element children forms the nominal chunk: its lemma
 * is the "lemma" attribute of the first word whose cat is "N" or "PRO" and
 * whose lemma is non-empty. The chunk then receives the next chunk id. Every
 * remaining element child, starting at the first non-"w" element, is dumped
 * through dump_chunks() and attached as a dependent.
 *
 * @param node the NP element
 * @param os   destination stream
 * @return the head chunk (its lemma is empty when no N/PRO word was found)
 */
chunk dump_np(xmlNode * node, ostream & os) {

  node = node->children;

  chunk head;
  vector<chunk> deps;

  // Leading words: look for the head lemma.
  for (; node; node = node->next) {

    if (node->type != XML_ELEMENT_NODE) { continue; }

    const string name((char *) node->name);

    if (name != "w") { break; }

    if (!head.lemma.empty()) { continue; } // lemma already found

    // A missing attribute yields a null pointer; never build a string from it.
    const char * cat_attr = xmlGetConstProp(node, "cat");
    const string cat(cat_attr ? cat_attr : "");

    if (cat == "N" || cat == "PRO") {
      const char * lemma_attr = xmlGetConstProp(node, "lemma");
      head.lemma = lemma_attr ? lemma_attr : "";
    }
  }

  head.id = chunkid++;

  // Remaining element children are the modifiers of the chunk.
  for (; node; node = node->next) {
    if (node->type == XML_ELEMENT_NODE) {
      deps.push_back(dump_chunks(node, os));
    }
  }

  dump_rels(head, deps, os);
  return head;
}

/**
 * Dumps an "AP" element.
 *
 * The leading run of "w" element children forms the adjectival chunk: its
 * lemma is the "lemma" attribute of the first word whose cat is "A" and
 * whose lemma is non-empty. The chunk then receives the next chunk id. Every
 * remaining element child, starting at the first non-"w" element, is dumped
 * through dump_chunks() and attached as a dependent.
 *
 * @param node the AP element
 * @param os   destination stream
 * @return the head chunk (its lemma is empty when no adjective was found)
 */
chunk dump_ap(xmlNode * node, ostream & os) {

  node = node->children;

  chunk head;
  vector<chunk> deps;

  // Leading words: look for the adjective lemma.
  for (; node; node = node->next) {

    if (node->type != XML_ELEMENT_NODE) { continue; }

    const string name((char *) node->name);

    if (name != "w") { break; }

    if (!head.lemma.empty()) { continue; } // lemma already found

    // A missing attribute yields a null pointer; never build a string from it.
    const char * cat_attr = xmlGetConstProp(node, "cat");
    const string cat(cat_attr ? cat_attr : "");

    if (cat == "A") {
      const char * lemma_attr = xmlGetConstProp(node, "lemma");
      head.lemma = lemma_attr ? lemma_attr : "";
    }
  }

  head.id = chunkid++;

  // Remaining element children are the modifiers of the chunk.
  for (; node; node = node->next) {
    if (node->type == XML_ELEMENT_NODE) {
      deps.push_back(dump_chunks(node, os));
    }
  }

  dump_rels(head, deps, os);
  return head;
}


/**
 * Dumps a "PP" element.
 *
 * The leading run of "w" element children forms the prepositional chunk: its
 * lemma is the "lemma" attribute of the first word whose cat is "P" and
 * whose lemma is non-empty. The chunk then receives the next chunk id. Every
 * remaining element child, starting at the first non-"w" element, is dumped
 * through dump_chunks() and attached as a dependent.
 *
 * @param node the PP element
 * @param os   destination stream
 * @return the head chunk (its lemma is empty when no preposition was found)
 */
chunk dump_pp(xmlNode * node, ostream & os) {

  node = node->children;

  chunk head;
  vector<chunk> deps;

  // Leading words: look for the preposition lemma.
  for (; node; node = node->next) {

    if (node->type != XML_ELEMENT_NODE) { continue; }

    const string name((char *) node->name);

    if (name != "w") { break; }

    if (!head.lemma.empty()) { continue; } // lemma already found

    // A missing attribute yields a null pointer; never build a string from it.
    const char * cat_attr = xmlGetConstProp(node, "cat");
    const string cat(cat_attr ? cat_attr : "");

    if (cat == "P") {
      const char * lemma_attr = xmlGetConstProp(node, "lemma");
      head.lemma = lemma_attr ? lemma_attr : "";
    }
  }

  head.id = chunkid++;

  // Remaining element children are the modifiers of the chunk.
  for (; node; node = node->next) {
    if (node->type == XML_ELEMENT_NODE) {
      deps.push_back(dump_chunks(node, os));
    }
  }

  dump_rels(head, deps, os);
  return head;
}


/**
 * Dumps a stand-alone "w" element as a chunk of its own.
 *
 *  - cat "PONCT": the chunk lemma is the word form followed by "PUNC";
 *  - cat "ADV", "C", "P", "V", "CL" or "I": the chunk lemma is the "lemma"
 *    attribute (empty when the attribute is missing);
 *  - any other cat (including a missing attribute, treated as ""): the
 *    problem is reported on cerr and a chunk "w.error" without id (-1) is
 *    returned. Categories "A", "N" and "D" are not handled here.
 * Recognised categories consume the next chunk id.
 *
 * @param node the w element
 * @param os   destination stream (currently unused: nothing is written to it)
 * @return the chunk built for the word
 */
chunk dump_w(xmlNode * node, ostream & os) {

  // A missing attribute yields a null pointer; never build a string from it.
  const char * cat_attr = xmlGetConstProp(node, "cat");
  const string cat(cat_attr ? cat_attr : "");

  if (cat == "PONCT") {
    return chunk(get_w_form(node) + "PUNC", chunkid++);
  }

  if (cat == "ADV" || cat == "C" || cat == "P"
      || cat == "V" || cat == "CL" || cat == "I") {

    const char * lemma_attr = xmlGetConstProp(node, "lemma");
    return chunk(lemma_attr ? lemma_attr : "", chunkid++);
  }

  cerr << "dump_w: unknow cat : '" << cat << "'\n";
  return chunk("w.error");
}


/**
 * Dispatches a node to the dumper that matches its element name.
 *
 *  - "SENT", "VPinf", "Srel", "Sint", "VPpart", "Ssub", "COORD", "AdP"
 *    are handled by dump_sent();
 *  - "VN", "NP", "AP", "PP" and "w" have their own dump_* function;
 *  - any other element is reported on os ("?name? ") and on cerr.
 *
 * @param node node to dump
 * @param os   destination stream
 * @return the chunk produced by the selected dumper; a default chunk (empty
 *         lemma, id -1) for non-element nodes and unknown elements
 */
chunk dump_chunks(xmlNode * node, ostream & os) {

  if (node->type != XML_ELEMENT_NODE) { return chunk(); }

  const string name((char *) node->name);

  if (name == "SENT" || name == "VPinf" || name == "Srel" || name == "Sint"
      || name == "VPpart"
      || name == "Ssub" || name == "COORD"
      || name == "AdP") {
    return dump_sent(node, os);
  }

  if (name == "VN") { return dump_vn(node, os); }
  if (name == "NP") { return dump_np(node, os); }
  if (name == "AP") { return dump_ap(node, os); }
  if (name == "PP") { return dump_pp(node, os); }
  if (name == "w")  { return dump_w(node, os); }

  os << "?" << name << "? ";
  // Closing quote added so the element name is delimited on both sides,
  // like the other diagnostics in this file.
  cerr << "dump_chunks: unknown elem : '" << name << "'\n";
  return chunk();
}



/**
 * Entry point.
 *
 * Usage: <program> [-id] <file.xml>
 *
 * Parses the XML file, selects every node matching "//SENT" and, for each
 * one, prints "SENT #<i> : " followed by its surface text, the dependency
 * relations produced by dump_chunks(), the resulting head chunk and a line
 * containing only ".". Chunk ids restart at 0 for each sentence.
 *
 * Exits with status 1 on bad arguments, on a file that cannot be parsed, or
 * when a std::exception reaches main.
 */
int main(int argc, char ** argv) try {

  if (argc < 2) { cerr << "bad args\n"; exit(1); }

  // Optional leading "-id" flag: record it and shift the arguments.
  if (strcmp(argv[1], "-id") == 0) {
    DUMPID = true;
    argc--, argv++;
  }

  if (argc < 2) { cerr << "bad args\n"; exit(1); }

  // A parse failure depends on user input, so it is reported explicitly
  // rather than through assert (which disappears under NDEBUG).
  xmlDoc * doc = xmlParseFile(argv[1]);
  if (!doc) {
    cerr << "fatal error : unable to parse '" << argv[1] << "'\n";
    exit(1);
  }

  xmlXPathContext * xpath_ctx = xmlXPathNewContext(doc);
  assert(xpath_ctx);

  xmlXPathObject * xpath_obj = xmlXPathEvalExpression("//SENT", xpath_ctx);
  assert(xpath_obj);

  // nodesetval can be NULL when nothing matches; treat that as no sentence.
  xmlNodeSet * nodeset = xpath_obj->nodesetval;
  const int nb_sent = nodeset ? nodeset->nodeNr : 0;

  for (int i = 0; i < nb_sent; ++i) {

    xmlNode * node = nodeset->nodeTab[i];

    cout << "SENT #" << i << " : ";

    dump_sentence(doc, node, cout);
    cout << "\n";

    // Chunk ids are local to a sentence.
    chunkid = 0;
    chunk head = dump_chunks(node, cout);
    cout << "HEAD:" << head << "\n";
    cout << ".\n";
  }

  xmlXPathFreeObject(xpath_obj);
  xmlXPathFreeContext(xpath_ctx);
  xmlFreeDoc(doc);

  return 0;

} catch (const exception & e) {
  cerr << "fatal error : " << e.what() << "\n";
  exit(1);
}

