
#include <string.h>
#include <ctype.h>

#include "ParseTXT.h"

extern int max_line_length;

/*
 * Splits the leading and trailing separators off Buf.
 *
 * A separator byte is any byte in 1..32 (control characters and space);
 * when mail_translate is set, '>' also counts as a LEADING separator.
 *
 * Buf            in/out: NUL-terminated text. It is truncated in place where
 *                the trailing separator run starts. The leading run is only
 *                copied out; it is NOT removed from Buf.
 * preSep         out: copy of the leading separator run.
 * postSep        out: copy of the trailing separator run.
 * mail_translate treat '>' as part of the leading separator run.
 * thread         out: true when a '>' was found in the leading run.
 *
 * Returns the index in Buf at which the terminating NUL was written (start
 * of the trailing run).
 *
 * preSep and postSep point into function-local static buffers: they are
 * overwritten by the next call, and the function is not reentrant.
 * Each run is capped at the buffer capacity (1999 bytes); separator bytes
 * beyond the cap are simply left in Buf, so no input is lost and the static
 * buffers can no longer be overrun.
 */
static int chomp (char *Buf,
		  const char *&preSep,
		  const char*&postSep,
		  bool mail_translate,
		  bool &thread) {

  static char b[2000];
  static char a[2000];

  const int maxPost=(int)sizeof(b)-1;
  const int maxPre=(int)sizeof(a)-1;
  const int len=(int)strlen(Buf);

  thread=false;

  /* Walk backwards over the trailing run, taking at most maxPost bytes. */
  int l=len-1;
  while(l>=0 && (len-1-l)<maxPost && (Buf[l]<=32 && Buf[l]>0))
    l--;
  strcpy(b,Buf+l+1); /* fits: at most maxPost bytes plus the NUL */
  Buf[l+1]=0;
  int postSepplace=l+1;
  postSep=b;

  /* Copy the leading run, taking at most maxPre bytes. */
  l=0;
  while(l<maxPre && Buf[l] &&
	((Buf[l]<=32 && Buf[l]>0) || (mail_translate && Buf[l]=='>')))
  {
    if(Buf[l]=='>')
      thread=true;
    a[l]=Buf[l];
    l++;
  }
  a[l]=0;
  preSep=a;
  return postSepplace;
}

/*
 * Tests whether buf starts with a segmentation marker of the form
 *   <SYSTRAN sentence_id N>
 * where at least one whitespace character separates the three parts, N is
 * one or more decimal digits, and whitespace may precede the closing '>'.
 *
 * Returns the length of the marker in bytes (including the closing '>'),
 * or 0 when buf does not start with such a marker.
 *
 * Characters are converted to unsigned char before being handed to the
 * <ctype.h> classifiers: passing a negative char value to them is undefined.
 */
static int ismarkseg(const char *buf) {
  const char *refbuf=buf;

  if (strncmp(buf,"<SYSTRAN",8)) return 0;
  buf+=8;

  /* mandatory whitespace before the attribute name */
  if (!isspace((unsigned char)*buf)) return 0;
  while(isspace((unsigned char)*buf)) buf++;

  if (strncmp(buf,"sentence_id",11)) return 0;
  buf+=11;

  /* mandatory whitespace before the number */
  if (!isspace((unsigned char)*buf)) return 0;
  while(isspace((unsigned char)*buf)) buf++;

  /* at least one digit */
  if (!isdigit((unsigned char)*buf)) return 0;
  while(isdigit((unsigned char)*buf)) buf++;

  /* optional whitespace, then the closing bracket */
  while(isspace((unsigned char)*buf)) buf++;
  if (*buf!='>') return 0;

  return (int)((buf+1)-refbuf);
}

/*
 * Converts the plain text s into XML nodes appended under root_xml.
 *
 * The text is duplicated and consumed chunk by chunk:
 *  - when preseg mode is compiled in (DYNAMIC_PRESEG without ENGINEv5) and
 *    the text contains a "<SYSTRAN sentence_id N>" marker, chunks are the
 *    spans between such markers;
 *  - otherwise, when is_wrapped is false, every line (together with the run
 *    of \n, \r, \f characters that ends it) is a chunk;
 *  - otherwise (is_wrapped true) a chunk ends at an empty line, or at a line
 *    break where the current line plus the first word of the next line is
 *    shorter than 0.9 * max_line_length (i.e. the break cannot be a wrap).
 *
 * For every chunk, the separators reported by chomp() are added as text
 * content of root_xml, and the remaining text becomes a <par id="N"> child
 * (N counts from 1).  Inside a <par>, words are added as text content.  A
 * whitespace run is added as text content when it is one or two spaces, a
 * single no-break space, starts with \n, \r or \f, or contains a line break;
 * every other run is wrapped in a <tag space="1"> element.
 *
 * mail_translate is hard-wired to false in this function, so the branches
 * handling quoted mail ('>' prefixes, mail headers) are currently inactive.
 *
 * Always returns ecOK.
 * NOTE(review): the result of strdup() is not checked, and s must not be
 * NULL; no error code for these cases is visible from this file.
 */
int ecTxtParse(xmlNodePtr root_xml,const char *s,bool is_wrapped) {

  /* UTF-8 encoding of U+00A0 (no-break space).  Written with escapes so the
     literal does not depend on the encoding of this source file. */
  static const char nbsp[]="\xC2\xA0";
  const char nbsp_lead=nbsp[0];

  int par_id=1;
  char *Buf=strdup(s);
  char *refBuf=Buf;       /* start of the duplicate, for the final free() */
  char endbuffer=*Buf;    /* character currently replaced by a NUL at *Buf */

#if !defined(ENGINEv5) && defined(DYNAMIC_PRESEG)
  int preseg=0;
  {
    /* Automatic detection of preseg mode */
    int d=0;
    const char *markseg=Buf-1;
    while ((markseg=strstr(markseg+1,"<SYSTRAN"))!=NULL
	   && !(d=ismarkseg(markseg)));
    if (d) {
      OptSetValue("preseg","1");
      OptSetValue("segmentation_mark","1");
      preseg=1;
    }
  }
#endif

  bool mail_translate = 0;

  while(endbuffer) {
    size_t decb=0;   /* offset from Buf to the start of the next chunk */
    *Buf=endbuffer;  /* Restore the character that terminated the previous chunk */
#if !defined(ENGINEv5) && defined(DYNAMIC_PRESEG)
    char *rBuf=Buf;
    if (preseg) {
      /* Find the next valid marker.  Step past a "<SYSTRAN" that is not a
         valid marker, otherwise strstr() would find it again forever. */
      while ((Buf=strstr(Buf,"<SYSTRAN"))!=NULL
	     && !(decb=ismarkseg(Buf)))
	++Buf;
      if (!Buf) endbuffer=0;
      else {
	endbuffer=*(Buf+decb);
	*Buf=0;
	decb+=Buf-rBuf;
      }
      Buf=rBuf;
    } else
#endif
    {
      size_t t;
      for(t=0;Buf[t];++t) {
	if(is_wrapped) {
	  if (Buf[t]=='\n' || Buf[t]=='\r') {
	    /* Check whether this line break can be a wrap: it cannot when
	       the current line plus the following word would still have
	       fitted within max_line_length. */
	    size_t sizebef=1;
	    while(t>=sizebef && !strchr("\r\n",Buf[t-sizebef]))
	      sizebef++;
	    size_t sizeaft=1;
	    while(Buf[t+sizeaft] && !strchr("\r\n \t:",Buf[t+sizeaft]))
	      sizeaft++;
	    if (sizebef+sizeaft<0.9*max_line_length) break;
	  }
	  /* An empty line (blanks only between two line breaks) ends the chunk. */
	  if(Buf[t]=='\n') { // Unix
	    size_t u=t+1;
	    while(Buf[u] && (strchr(" \t\f",Buf[u]) ||
			     (mail_translate && Buf[u]=='>')))
	      ++u;
	    if (Buf[u]=='\n')
	      break;
	  } else if (Buf[t]=='\r' && Buf[t+1]=='\n') { // Windows
	    size_t u=t+2;
	    while(Buf[u] && (strchr(" \t\f",Buf[u]) ||
			     (mail_translate && Buf[u]=='>')))
	      ++u;
	    if (Buf[u]=='\r' && Buf[u+1]=='\n')
	      break;
	  } else if (Buf[t]=='\r') { // Mac
	    size_t u=t+1;
	    while(Buf[u] && (strchr(" \t\f",Buf[u]) ||
			     (mail_translate && Buf[u]=='>')))
	      ++u;
	    if(Buf[u]=='\r')
	      break;
	  }
	} else {
	  if(Buf[t]=='\n' || Buf[t]=='\r')
	    break;
	}
      }

      /* The run of line terminators belongs to the current chunk. */
      size_t r=strspn(Buf+t,"\n\r\f");
      endbuffer=Buf[t+r];
      Buf[t+r]=0;
      decb=t+r;
    }

    char *realBuf=Buf;
    /* Skip a UTF-8 signature.  The bytes are compared as unsigned char:
       with a signed plain char they are negative and would never match. */
    if ((unsigned char)Buf[0]==0xef &&
	(unsigned char)Buf[1]==0xbb &&
	(unsigned char)Buf[2]==0xbf)
      realBuf+=3;

    char *chompBase=realBuf; /* postSepplace below is an index from here */
    const char *preSep;
    const char *postSep;
    bool in_thread;
    int postSepplace=chomp(realBuf,preSep,postSep,mail_translate,in_thread);
    (void)in_thread; /* only needed by the disabled mail-thread handling */

    if (*preSep) {
      xmlNodeAddContent (root_xml,(const xmlChar *)preSep);
    }

    realBuf+=strlen(preSep);

    if (*realBuf) {
      xmlNodePtr par_xml=xmlNewChild(root_xml,NULL,(const xmlChar *)"par",NULL);
      char pars[16]; /* large enough for any int in decimal plus the NUL */
      snprintf(pars,sizeof pars,"%d",par_id);
      par_id++;
      xmlNewProp(par_xml, (const xmlChar *)"id", (const xmlChar *)pars);

      bool unwrap_next_line=false;

      if(mail_translate) {
	// Look if we have some mail headers -> the line should not be wrapped
	char *p=realBuf;
	while(*p && strchr(" >\t",*p))
	  ++p;
	if(!strncmp(p,"----",4)) { // ----- Original Message -------
	  while(*p=='-')
	    ++p;
	  while(*p && !strchr("\n\r-",*p))
	    ++p;
	  if(!strncmp(p,"----",4))
	    unwrap_next_line=true;
	} else if(*p && !strchr(" :\n\t\r",*p)) { // From: bla
	  while(*p && !strchr(" :\n\t\r",*p))
	    ++p;

	  while(*p && strchr(" \t",*p))
	    ++p;

	  if(*p==':')
	    unwrap_next_line=true;
	}
      }

      while(*realBuf) {

	/* 1. Emit the next word (everything up to whitespace or nbsp). */
	char *p = realBuf;
	while(*p && !strchr(" \t\r\n\f",*p) && strncmp(p,nbsp,2))
	  ++p;
	char refp=*p;
	*p=0;

	xmlNodeAddContent(par_xml,(const xmlChar *)realBuf);

	*p=refp;
	realBuf=p;
	if(!*realBuf)
	  break;

	/* 2. Collect the whitespace run that follows the word. */
	bool initial_indenting=false; /* run contains a line break */

	while(*p && (strchr(" \t\r\n\f",*p) || !strncmp(p,nbsp,2) ||
	      (mail_translate && initial_indenting && *p=='>'))) {
	  if(strchr("\n\r",*p))
	    initial_indenting=true;

	  /* A no-break space is two bytes long. */
	  if(*p==nbsp_lead)
	    p+=2;
	  else
	    p++;
	}

	refp=*p;
	*p=0;

	/* 3. Emit the run, either as plain text or as a <tag space="1">. */
	if(mail_translate && initial_indenting && strchr(realBuf,'>')) {
	  // Mail thread
	  xmlNodePtr tag=xmlNewTextChild(par_xml,0,(const xmlChar *)"tag",
					 (const xmlChar *)realBuf);
	  xmlSetProp(tag,(const xmlChar *)"space",(const xmlChar *)"1");
	  *p=refp;
	} else if
	  ((*realBuf==' ' && (p-realBuf==1 ||
			      (p-realBuf==2 && realBuf[1]==' '))) ||
	   (*realBuf==nbsp_lead && p-realBuf==2) ||
	   (*realBuf=='\n' || *realBuf=='\r' || *realBuf=='\f') ||
	   initial_indenting) {
	  /* 1 or 2 spaces, 1 nbsp, initial indenting, \r, \n : Normal space */

	  xmlNodeAddContent(par_xml,(const xmlChar *)realBuf);

	  *p=refp;
	} else {
	  /* Formatting space */
	  if(0) {
	    /* Disabled "segmenting" mode: a formatting space would close the
	       current <par> and open a new one. */
	    xmlAddChild(par_xml->parent,xmlNewText((const xmlChar *)realBuf));
	    par_xml=xmlNewChild(par_xml->parent,
				NULL,
				(const xmlChar *)"par",
				NULL);
	    snprintf(pars,sizeof pars,"%d",par_id);
	    par_id++;
	    xmlNewProp(par_xml, (const xmlChar *)"id", (const xmlChar *)pars);
	    *p=refp;

	    /* When there are formatting spaces in a line, switch to line
	     * pars for the next line */
	    unwrap_next_line=true;
	  } else {
	    xmlNodePtr tag=xmlNewTextChild(par_xml,0,(const xmlChar *)"tag",
					   (const xmlChar *)realBuf);
	    xmlSetProp(tag,(const xmlChar *)"space",(const xmlChar *)"1");
	    *p=refp;
	  }
	}
	realBuf=p;

	if(unwrap_next_line && is_wrapped) {
#if !defined(ENGINEv5) && defined(DYNAMIC_PRESEG)
	  if(!preseg && is_wrapped)
#endif
	  {
	    /* Shrink the current chunk so that it ends on the current line. */
	    char *eol=p;
	    while(*eol && *eol!='\n' && *eol!='\r')
	      ++eol;
	    if(*eol) {
	      eol+=strspn(eol,"\n\r\f")-1;
	      Buf[decb]=endbuffer;
	      endbuffer=*eol;
	      *eol=0;
	      decb=eol-Buf;
	      /* Undo chomp()'s truncation.  The index it returned is relative
	         to the pointer it was given, which is past the UTF-8
	         signature when one was skipped. */
	      chompBase[postSepplace]=*postSep;
	      postSep="";
	    }
	  }
	  unwrap_next_line=false;
	}
      } // while(*realBuf)
    }
    if (*postSep) {
      xmlNodeAddContent (root_xml,(const xmlChar *)postSep);
    }
    Buf+=decb;
  }
  free(refBuf);
  return ecOK;
}
