

#include <string>
#include <map>
using namespace std;
#include "DocFilters.h"
#include "CharsetConvert.h"

/* Numeric sentence id of the translation unit currently being flattened.
   PostFilterTXT resets it to 1 before each run; parseParTu overwrites it from
   the "id" attribute of every tu / tu_group element it descends into and
   prints it in the "<SYSTRAN sentence_id N>" marker. */
static int tuid;

/**
 * Recursively flattens the XML sub-tree rooted at `par` into plain text.
 *
 * @param doc               Owning document, needed by xmlNodeListGetString.
 * @param par               Node to process (document, par, tu_group, tu,
 *                          transtu or token).
 * @param res               Output text; everything is appended to it.
 * @param intu              Non-zero once the walk is inside a tu / transtu.
 * @param normalize_output  0: emit tokens, text and tag content as found.
 *                          Non-zero: emit only text located inside a
 *                          translation unit, prefix each unit with
 *                          "<SYSTRAN sentence_id N>" and end it with "\n";
 *                          tag elements are skipped.
 *                          2: additionally source `tu` elements are emitted
 *                          even inside a tu_group on the target side, and a
 *                          `transtu` is followed by an extra "\n".
 * @param tu_group          Non-zero when a tu_group has been seen; selects
 *                          between `tu` and `transtu` as the unit to emit.
 * @param post_token        Text appended after the next token / text node,
 *                          then cleared.
 * @param istarget          Non-zero to produce the target (translated) side.
 * @param nb_par, par_num   Only forwarded to the recursive calls; the progress
 *                          reporting that used them is commented out.
 * @param sentence_beginning Text and tag content met between two translation
 *                          units; it is flushed in front of the next unit.
 * @return 1 on success. A 0 returned by a recursive call is propagated.
 */
static int
parseParTu(xmlDocPtr doc,xmlNodePtr par,string &res,int intu,int normalize_output,int tu_group,string &post_token,int istarget,long nb_par, long par_num, std::string &sentence_beginning) {
  /* Closing text of the user mark-up opened by a bmark, keyed by mark id.
     Nothing fills it as long as the bmark handling below stays disabled. */
  std::map<string,string> opened_mark;

  const bool par_is_tu = !xmlStrcmp(par->name, (const xmlChar *)"tu");
  const bool par_is_transtu = !xmlStrcmp(par->name, (const xmlChar *)"transtu");

  /* `first` is true when `par` itself is the translation unit to output. */
  const int first = (tu_group && istarget && par_is_transtu) ||
                    ((!istarget || !tu_group || normalize_output==2) && par_is_tu);
  if (first) {
    if (normalize_output) {
      /* Large enough for any int; tuid comes from document data. */
      char sid[32];
      snprintf(sid, sizeof(sid), "%d", tuid);
      res += "<SYSTRAN sentence_id " + (string)sid + ">";
    }
    res += sentence_beginning;
    sentence_beginning = "";
  }

  if (par->type==XML_ELEMENT_NODE && !xmlStrcmp(par->name, (const xmlChar *)"token") &&
      (!normalize_output || intu)) {
    /* A token is a leaf: on the source side prefer its "source" attribute,
       otherwise use its text content. */
    xmlChar *s;
    if (!istarget && xmlHasProp(par, (const xmlChar *)"source"))
      s = xmlGetProp(par, (const xmlChar *)"source");
    else
      s = xmlNodeGetContent(par);

    /* ostring should be re-encoded here */
    if (s)                       /* libxml2 may hand back NULL */
      res += (const char *)s;
    res += post_token;
    post_token = "";
    if (s)
      xmlFree(s);
  }
  else {
    for (xmlNodePtr cur = par->children; cur != NULL; cur = cur->next) {
      const bool is_element  = (cur->type == XML_ELEMENT_NODE);
      const bool is_mark     = is_element &&
                               (!xmlStrcmp(cur->name, (const xmlChar *)"bmark") ||
                                !xmlStrcmp(cur->name, (const xmlChar *)"emark"));
      const bool is_tu       = is_element && !xmlStrcmp(cur->name, (const xmlChar *)"tu");
      const bool is_transtu  = is_element && !xmlStrcmp(cur->name, (const xmlChar *)"transtu");
      const bool is_tu_group = is_element && !xmlStrcmp(cur->name, (const xmlChar *)"tu_group");
      const bool is_token    = is_element && !xmlStrcmp(cur->name, (const xmlChar *)"token");
      const bool is_par      = is_element && !xmlStrcmp(cur->name, (const xmlChar *)"par");
      const bool is_tag      = is_element && !xmlStrcmp(cur->name, (const xmlChar *)"tag");

      /* User-markup */
      if (is_mark && !xmlHasProp(cur, (const xmlChar *)"disabled")) {
        xmlChar *id = xmlGetProp(cur, (const xmlChar *)"id");
        if (id) {
          if (*cur->name == 'e') { // emark: emit what the matching bmark registered
            std::map<string,string>::const_iterator mark = opened_mark.find((const char *)id);
            if (mark != opened_mark.end())
              res += mark->second.c_str();
          } else {
            /* bmark handling is disabled. It needs the "type" and "value"
               attributes of the node:
            xmlChar *type=xmlGetProp(cur,(const xmlChar *)"type");
            xmlChar *value=xmlGetProp(cur,(const xmlChar *)"value");
            string mvalue;
            string malt;
            switch (GetUserMarkup((const char *)type,(const char *)value,mvalue,malt)) {
            case m_raw:
              // raw markup
              res+=mvalue;
            case m_bool:
              // boolean markup - only display alt, if any
              if (malt.length()) opened_mark[(const char *)id]="/"+malt;
              break;
            case m_native:
              break;
            case m_pxml:
              // pseudo xml markup: there is no native markup in text
              res+=(string)"<"+mvalue+(string)">";
              if (!malt.length()) {
                opened_mark[(const char *)id]=(string)"</"+mvalue+(string)">";
              } else {
                opened_mark[(const char *)id]=(string)"/"+malt+(string)"</"+mvalue+(string)">";
              }
              break;
            default:
              break;
            }
            if (type) xmlFree(type);
            if (value) xmlFree(value);
            */
          }
          xmlFree(id);
        }
      }
      else if ((!tu_group && is_tu) ||
               (tu_group && istarget && is_transtu) ||
               (tu_group && (!istarget || normalize_output==2) && is_tu) ||
               is_tu_group || is_token || is_par) {
        /* pseudo recursivity - to handle par, par/tu, and par/tu/token */
        if (is_par) {
          //if(!setstatus(nb_par,par_num++,XMLFLOW_STEP_POSTFILTER_TXT))
          //return 0;
        } else {
          if (is_tu || is_tu_group) {
            xmlChar *sid = xmlGetProp(cur, (const xmlChar *)"id");
            if (sid && *sid)
              tuid = atol((char *)sid + 1); /* sid="s\d+" */
            if (sid)
              xmlFree(sid);
          }
          /* Sticky: once a tu_group is met, the following siblings are
             handled in tu_group mode as well. */
          tu_group |= is_tu_group;
        }
        const int subintu = intu || is_tu || is_transtu;
        if (!parseParTu(doc,cur,res,subintu,normalize_output,tu_group,post_token,istarget,nb_par,par_num,sentence_beginning))
          return 0;
      }
      else if (!normalize_output && is_tag) {
        xmlChar *s = xmlNodeListGetString(doc, cur->xmlChildrenNode, 1);
        if (s) {
          /* A tag sitting directly under par/document lies between two
             units: keep it for the beginning of the next sentence. */
          if (!xmlStrcmp(cur->parent->name, (const xmlChar *)"par") ||
              !xmlStrcmp(cur->parent->name, (const xmlChar *)"document"))
            sentence_beginning += (char *)s;
          else
            res += (char *)s;
          xmlFree(s);
        }
      }
      else if (cur->type==XML_TEXT_NODE && (!normalize_output || intu)) {
        xmlChar *s = xmlNodeGetContent(cur);
        /* ostring should be re-encoded here */
        /* A NULL content is handled as an empty string. */
        const char *ostring = s ? (const char *)s : "";
        const bool between_units =
          !xmlStrcmp(cur->parent->name, (const xmlChar *)"par") ||
          !xmlStrcmp(cur->parent->name, (const xmlChar *)"document");

        if (between_units &&
            (strspn(ostring, " \t\n\r") != strlen(ostring) ||
             sentence_beginning.length())) {
          /* if we are in between 2 tu and the sentence content is not
             only space, then copy the node at the beginning of next sentence
          */
          sentence_beginning += ostring;
        } else {
          res += ostring;
          res += post_token;
          post_token = "";
        }
        if (s)
          xmlFree(s);
      }
    }
  }

  if (first && normalize_output) {
    res += "\n";
    if (normalize_output==2 && par_is_transtu)
      res += "\n";
  }
  return 1;
}

/**
 * Hook for prefixing the output with a byte-order mark.
 *
 * The BOM logic is currently disabled (kept below for reference), so the text
 * comes back unchanged and `target_charset` is not looked at.
 *
 * @param s               Text to return.
 * @param target_charset  Name of the output charset (unused for now).
 * @return A copy of `s`.
 */
std::string AddBOM (const std::string &s,const char *target_charset) {
  (void)target_charset;
  /* Disabled:
  if (OptTrue("TXT_UTF8_BOM")) {
    if (!strcmp(target_charset,"UTF-8"))
      return "\xef\xbb\xbf"+s;
  }
  */
  std::string unchanged(s);
  return unchanged;
}

/**
 * Flattens a whole document into text.
 *
 * Checks that the root element is named "document", then lets parseParTu walk
 * the tree. Text still pending in the "sentence beginning" buffer once the
 * walk is over is appended at the end.
 *
 * @param doc       Parsed XML document.
 * @param res       Output text; the result is appended to it.
 * @param istarget  Non-zero to produce the target side (passed to parseParTu).
 * @return 1 on success, 0 if there is no root element, the root is not
 *         "document" (ERROR_PARSING_XML is written to stderr in both cases),
 *         or parseParTu reports a failure.
 */
static int
parseDoc(xmlDocPtr doc,string &res,int istarget) {

  xmlNodePtr root = xmlDocGetRootElement(doc);

  if (root == NULL || xmlStrcmp(root->name, (const xmlChar *) "document")) {
    fprintf(stderr,"ERROR_PARSING_XML\n");
    return 0;
  }

  int target_format=0;
  //if (OptEqual("TARGET_FORMAT","text/plain")) target_format=1;
  //if (OptEqual("NOTRAN","3")) target_format=2;
  string post_token;

  /* Count the par elements directly under the root element. The walk starts
     from the root itself because doc->children may be a comment, a processing
     instruction or a DTD placed before the root. */
  long nb_par=0;
  for (xmlNodePtr child=root->children; child; child=child->next)
    if (child->type==XML_ELEMENT_NODE && !xmlStrcmp(child->name,(const xmlChar*)"par"))
      nb_par++;

  std::string sentence_beginning;
  if (!parseParTu(doc,root,res,0,target_format,0,post_token,istarget,nb_par,1,sentence_beginning)) return 0;
  /* Flush whatever was waiting for a following sentence that never came. */
  res+=sentence_beginning;

  return 1;
}


/**
 * Produces the plain-text rendering of the target side of document `D`.
 *
 * The XML tree of `D` is flattened by parseDoc. The text is then converted
 * from UTF-8 to the target charset when that charset is not UTF-8 (it is
 * hard-wired to "UTF-8" for now) and passed through AddBOM.
 *
 * @param D  Document whose `xmltree` is read.
 * @return The text, or an empty string if parsing fails or the charset
 *         convertor cannot be set up (INCORRECT_CHARSET goes to stderr).
 */
std::string PostFilterTXT(DocumentPtr D)
{
  // Start value of the sentence counter; later replaced by the id found on
  // the (trans?tu) elements.
  tuid = 1;

  const bool istarget = true;
  std::string text;
  if (!parseDoc(D->xmltree, text, istarget))
    return "";

  // Encoding conversion

  const char *target_charset = "UTF-8";//OptGetValue("TARGET_CHARSET");

  const bool already_utf8 = (strcmp(target_charset, "UTF-8") == 0);
  if (already_utf8)
    return AddBOM(text, target_charset);

  DefaultCharsetConvertor conv("UTF-8", target_charset);
  if (!conv.IsOk()) {
    fprintf(stderr,"INCORRECT_CHARSET\n");
    return "";
  }
  return AddBOM(conv.Convert(text), target_charset);
}
