#include "Tokenization.h"

using namespace std;

/**
 * Replace the text content of a <par> element by its tokenized form.
 *
 * Every direct text-node child of `par` is removed and its content appended
 * to a single string. That string is handed to G_Preprocess(), and one XML
 * node is created (token::CreateXmlNode()) for each resulting token that is
 * not a tag. The created nodes are chained together and appended to `par`,
 * after any non-text children that were already there.
 *
 * Token ownership:
 *  - non-separator tokens are stored in the `_private` field of their node
 *    and are therefore NOT deleted here;
 *  - separator tokens are deleted once their node has been created;
 *  - tag tokens produce no node and are deleted immediately.
 *
 * Leading created nodes that are not <token> elements are moved in front of
 * `par` instead of being inserted into it.
 *
 * @param par  the <par> element to tokenize; nothing is done when it is NULL
 *             or has no children.
 */
static void parsePar(xmlNodePtr par) {
  if (par == NULL || par->children == NULL) return;

  /* Collect the text of every direct text child and drop those children. */
  string currenttext;
  for (xmlNodePtr cur = par->children; cur != NULL; ) {
    xmlNodePtr next = cur->next;   /* saved first: cur may be freed below */
    if (cur->type == XML_TEXT_NODE) {
      xmlChar *s = xmlNodeGetContent(cur);

      xmlUnlinkNode(cur);
      xmlFreeNode(cur);

      /* xmlNodeGetContent may return NULL when there is no content */
      if (s != NULL) {
        currenttext += reinterpret_cast<char *>(s);
        xmlFree(s);
      }
    }
    cur = next;
  }

  list<token *> tokens;
  G_Preprocess(currenttext, tokens);

  xmlNodePtr tokenized = NULL;        /* last node of the chain being built */
  xmlNodePtr firsttokenized = NULL;   /* head of that chain */

  /* NOTE(review): nothing in this function ever sets this flag to true, so
     the "strip the first character of the separator" branch below is
     currently unreachable. It is kept as is; confirm whether the code that
     used to raise the flag was removed on purpose. */
  bool delete_next_separator = false;

  for (list<token *>::iterator it = tokens.begin(); it != tokens.end(); ++it) {
    token *t = *it;

    if (t->GetType() == _tag) {
      /* Tags get no node and are referenced nowhere else from here:
         release them instead of leaking them. */
      delete t;
      continue;
    }

    xmlNodePtr Nt = t->CreateXmlNode();
    if (Nt == NULL) {
      /* No node could be created: skip this token rather than
         dereferencing a null pointer. */
      delete t;
      continue;
    }

    if (t->GetType() != _separator) {
      delete_next_separator = false;
      Nt->_private = t;   /* keep token in private field of the xml node */
    } else {
      /* Do not keep "separator" tokens since they are merged in the xml tree */
      delete t;
      if (delete_next_separator) {
        xmlChar *s = xmlNodeGetContent(Nt);
        if (s != NULL) {
          std::string space = reinterpret_cast<char *>(s);
          xmlFree(s);
          if (!space.empty()) {
            delete_next_separator = false;
            if (utf8len(space.c_str()) == 1) {
              /* the separator is a single character: drop the node */
              xmlFreeNode(Nt);
              continue;
            }
            /* otherwise remove only its first character */
            space.erase(0, utf8charlen(space.c_str()));
            xmlChar *encoded = xmlEncodeEntitiesReentrant(
                Nt->doc, reinterpret_cast<const xmlChar *>(space.c_str()));
            xmlNodeSetContent(Nt, encoded);
            xmlFree(encoded);
          }
        }
      }
    }

    /* rebuild xml tree: append Nt to the chain of created nodes */
    if (tokenized != NULL) {
      tokenized->next = Nt;
      Nt->prev = tokenized;
    } else {
      firsttokenized = Nt;
    }
    tokenized = Nt;
  }

  /* The chain must not start with nodes that are not <token> elements.
     According to the original author, regular spaces have already been
     filtered out upstream, but ideographic or other non-standard spaces can
     remain; such leading nodes are moved before the <par> tag. */
  while (firsttokenized != NULL &&
         !(firsttokenized->type == XML_ELEMENT_NODE &&
           xmlStrcmp(firsttokenized->name, (const xmlChar *)"token") == 0)) {
    xmlNodePtr next = firsttokenized->next;
    xmlUnlinkNode(firsttokenized);
    xmlAddPrevSibling(par, firsttokenized);
    firsttokenized = next;
  }

  if (firsttokenized != NULL) xmlAddChildList(par, firsttokenized);
}

/**
 * Tokenize a whole document.
 *
 * When the root element of `doc` is named "document", parsePar() is called
 * on each of its direct children that is an element named "par". Any other
 * root (or no root at all) leaves the document untouched.
 *
 * @param doc  the libxml2 document to process.
 */
void doTokenization(xmlDocPtr doc) {
  xmlNodePtr root = xmlDocGetRootElement(doc);
  if (root == NULL) return;
  if (xmlStrcmp(root->name, (const xmlChar *) "document") != 0) return;

  /* Only direct children of the root are examined. */
  for (xmlNodePtr node = root->xmlChildrenNode; node != NULL; node = node->next) {
    if (node->type != XML_ELEMENT_NODE) continue;
    if (xmlStrcmp(node->name, (const xmlChar *) "par") != 0) continue;
    parsePar(node);
  }
}
