#include "Segmentation.h"

#include <cstdio>

using namespace std;

// Index of the token after which G_Segmentation last reported a sentence
// boundary. Reset to 0 by G_InitSegmentationPar(). In the visible part of this
// file the value is only written, never read.
static int lastcut;

//Internal functions
/**
 * Tells whether a punctuation token is an ellipsis, i.e. a run of at least
 * three '.' characters, optionally wrapped as "(...)" or "[...]".
 *
 * @param t        token to inspect (must not be NULL: it is dereferenced)
 * @param enclosed [out] set to 1 when the text is wrapped in matching () or [],
 *                 to 0 otherwise; left untouched when t is not a punctuation
 *                 token
 * @return the number of dots (>=3) when the token is an ellipsis, 0 otherwise
 */
static int isEllipsis(token *t,int &enclosed) {
  if (*t!=_punctuation) return 0;

  const char *text=t->c_str();
  int begin=0;
  int end=strlen(text);

  // The test on text[begin] short-circuits for an empty string, so text[end-1]
  // is only read when the string has at least one character.
  const bool inParens  =(text[begin]=='(' && text[end-1]==')');
  const bool inBrackets=(text[begin]=='[' && text[end-1]==']');
  if (inParens || inBrackets) {
    enclosed=1;
    ++begin;
    --end;
  }
  else {
    enclosed=0;
  }

  // Should include presentation character for ellipsis
  int ndots=0;
  for (const char *c=text+begin;c<text+end;++c) {
    if (*c!='.') return 0;
    ++ndots;
  }
  return ndots>=3 ? ndots : 0;
}

/**
 * Returns 1 when s contains at least one ASCII space (U+0020), 0 otherwise.
 * Tabs and line breaks are not counted as spaces.
 */
static int hasspace(const string &s) {
  const string::size_type where=s.find(' ');
  return where!=string::npos ? 1 : 0;
}

/**
 * Returns 1 when s contains no ASCII space (U+0020) at all, 0 as soon as one
 * is found. An empty string therefore yields 1; tabs and line breaks are not
 * counted as spaces.
 */
static int nothasspace(const string &s) {
  for (string::size_type i=0;i<s.size();++i) {
    if (s[i]==' ') return 0;
  }
  return 1;
}

/**
 * Resets the per-paragraph segmentation state: the index of the last
 * reported sentence cut goes back to 0.
 */
void G_InitSegmentationPar()
{
  lastcut = 0;
}

/**
 * Decides whether a sentence boundary falls right after token T[p].
 *
 * The rules below are tried in order; the first rule that reaches a `return`
 * gives the answer. A rule group that matches its outer condition but none of
 * its sub-rules falls through to the following groups.
 *   1.1  "WORD."          1.2  "NUM."
 *   2    ellipsis         3    generic punctuation rules
 *   3bis dash runs, Spanish inverted marks, separator characters, CJK marks
 *   4    word whose source text ends with '.' and that is the last token
 *
 * @param T           token array. The caller in this file stores NULL at
 *                    index 0 and after the last token, so T[p-1] / T[p+1] may
 *                    be NULL. Several tests hand T[p-1] / T[p+1] to tok_*
 *                    helpers without a NULL check; presumably these helpers
 *                    tolerate NULL -- TODO confirm.
 * @param lsep        not used by this function
 * @param ssep        textual separators; ssep[p-1] is read as the text that
 *                    precedes T[p] and ssep[p] as the text that follows it
 * @param p           index of the token being examined
 * @param max         index of the last token; bounds some look-ahead at p+1
 * @param next        [out] always set to p+1
 * @param addtoken    [out] set by rule 4 to a newly created "." token;
 *                    untouched otherwise
 * @param deletetoken [out] set by rule 1.1-h' to T+p (the period to drop);
 *                    untouched otherwise
 * @return 1 when a sentence boundary is placed after T[p] (lastcut is then
 *         set to p), 0 otherwise
 */
int G_Segmentation(token **T,list<xmlNodePtr> *lsep,const string *ssep,int p,int max,int &next,token *&addtoken,token **&deletetoken) {
  next=p+1;

  // 1.1- "WORD."
  // T[p] is a period glued (no separator text) to a preceding non-CJKV word.
  if (tok_eq(T[p],".") && tok_isword(T[p-1]) && utf8len(ssep[p-1])==0 &&
      dynamic_cast<token_word*>(T[p-1])->alphabet!=f_CJKV) {
    int rspace=utf8len(ssep[p]);   // length of the separator text after the period
    int split=1;                   // stays 1 unless sub-rule a/ or b/ clears it

    // the preceding word already contains a '.' in its normalized form
    bool acronym=strchr(T[p-1]->norm.c_str(),'.')!=0;

    // a/ "WORD.WORD"       -> do not split but check if abbrv
    if (!rspace && tok_isword(T[p+1])) {
      split=0;goto lookup;
    }

    // a'/ "WORD.NUMBER"     -> do not split
    // The period is re-tagged with subtype gc_Pc and its XML attributes refreshed.
    if (!rspace && tok_isnum(T[p+1])) {
      //cerr<<"--(1.1-a')--"<<endl;
      ((token_punctuation*)T[p])->subt=token_punctuation::gc_Pc;
      T[p]->SetAttribute(T[p]->refXml);
      return 0;
    }

    // b/ "WORD.[,)]"          -> do not split but check if abbrv
    if (!rspace && tok_in(T[p+1],",)")) {
      //cerr<<"--(1.1-b)--"<<endl;
      split=0;goto lookup;
    }

    //     c / length(WORD)=1 && uppercase(WORD)    -> do not split
    // (the test is "one character long and not lowercase")
    if (utf8len(T[p-1]->c_str())==1 && !tok_islower(T[p-1])) {
      //cerr<<"--(1.1-c)--"<<endl;
      ((token_punctuation*)T[p])->subt=token_punctuation::gc_Pc;
      T[p]->SetAttribute(T[p]->refXml);
      return 0;
    }

    //   h'/WORD is an acronym (previously regrouped by entity recognition)
    //      and the next token is lowercase -> do not split
    if (acronym && tok_islower(T[p+1])) {
      //cerr<<"--(1.1-h)--"<<endl;
      /* in that case we decide to drop the following period, since the
         tokenization did not "ask" for it: its source text is appended to the
         word and the caller is told to delete T[p] */
      T[p-1]->src+=T[p]->src;
      deletetoken=T+p;
      T[p-1]->RefreshXmlContent();
      return 0;
    }
    
    // Look WORD in dictionary
    // (no dictionary lookup is performed in the visible code: the label only
    // applies the "do not split" decision taken by a/ or b/; when split is
    // still 1 control falls through to the rules below)
  lookup:
    if(!split) {
      ((token_punctuation*)T[p])->subt=token_punctuation::gc_Pc;
      T[p]->SetAttribute(T[p]->refXml);
      return 0;
    }
  }

  // 1.2- "NUM."
  // T[p] is a period glued (no separator text) to a preceding number.
  if (tok_eq(T[p],".") && tok_isnum(T[p-1]) && utf8len(ssep[p-1])==0) {

    // a/ "NUM. NUM"   -> do not split
    if (T[p+1] && tok_isnum(T[p+1])) {
	//cerr<<"--(1.2-a)--"<<endl;      
      ((token_punctuation*)T[p])->subt=token_punctuation::gc_Pc;
      T[p]->SetAttribute(T[p]->refXml);
      return 0;
    }

    // b/ "NUM. WORD"
    //     WORD is lowercase   -> do not split
    if (T[p+1] && tok_isword(T[p+1]) && tok_islower(T[p+1])) {
	//cerr<<"--(1.2-b)--"<<endl;      
      ((token_punctuation*)T[p])->subt=token_punctuation::gc_Pc;
      T[p]->SetAttribute(T[p]->refXml);
      return 0;
    }

  }

  // Handling of ellipsis: normally three dots, but we can find more.
  // Non-final ellipsis can be found in official [] or frequent ().
  // See reference documentation below
  int enclosed=0;
  int lellipsis=isEllipsis(T[p],enclosed);   // number of dots, 0 if not an ellipsis

  //  2- Ellipsis
  if (lellipsis) {

    // a/ "(...)" or "[...]"            -> do not split and skip block
    if (enclosed) {
      return 0;  // real ellipsis: this great man (...) whose record
    }
    // b/ "...."                        -> ellipsis plus dot. split
    if (lellipsis==4) {
      lastcut=p;
      return 1;  // official sentence end ellipsis: 3+1 dots
    }
    // c/ "WORD... "                    -> split
    // (no space before the ellipsis, a space after it), except when followed
    // by closing/strong punctuation, by a quote that is itself last or
    // followed by a space, or by a lowercase token
    if (nothasspace(ssep[p-1]) && hasspace(ssep[p])) {
      if(T[p+1] && tok_ispunct(T[p+1]) && tok_in(T[p+1],">)}]:!?"))
	return 0;
      if (T[p+1] && tok_ispunct(T[p+1]) && tok_in(T[p+1],"\"'") && 
	(p+1==max || hasspace(ssep[p+1])))
	return 0;
      if (T[p+1] && tok_islower(T[p+1])) return 0;
      else                               { lastcut=p; return 1; }
    }
    // d/ (default)                     -> do not split
    return 0;
  }

  /* Trivial rules for segmentation */

  // The ideographic full stop is normalized to "." so that the generic rules
  // below treat it like a regular period.
  if (tok_eq(T[p],"。")) {
   T[p]->norm="."; // Could not do it before, because of entity rec. rules
                   // (and previous segmentation rules)
   T[p]->RefreshXmlContent();
  }

  // 3- generic rules
  if (tok_ispunct(T[p])) {
    int nlspace=nothasspace(ssep[p-1]);   // 1 when no space precedes T[p]
    int nrspace=nothasspace(ssep[p]);     // 1 when no space follows T[p]

    // a/ "[.!?]["']"   -> do not split (apply a' on next three tokens)
    if (nrspace && (tok_in(T[p],".!?")) &&
	tok_ispunct(T[p+1]) && tok_in(T[p+1],"\"'") &&
	(p+1==max || hasspace(ssep[p+1]) || tok_eq(T[p+2],"."))) {
      return 0;
    }

    // a'/ "[.!?]["'] "   -> split after \"
    if (nlspace && tok_in(T[p],"\"'") &&
	tok_ispunct(T[p-1]) && (tok_in(T[p-1],"?!.") || tok_eq(T[p-1],"..."))
	  && hasspace(ssep[p])) {
      lastcut=p;
      return 1;
    }

    // b/ "[.:;!?]([>\)\}\]].:!?)"   -> do not split (apply b'/b" on next tokens)
    if (nrspace && (tok_in(T[p],".:;!?)")) &&
	tok_ispunct(T[p+1]) && tok_in(T[p+1],">)}].:!?")) {
      return 0;
    }

    // b'/ "[.:;!?]([>\)\}\]]) ."   -> do not split
    if (nlspace && tok_in(T[p],">)}].:!?") &&
	tok_ispunct(T[p-1]) && (tok_in(T[p-1],".:;!?)") || tok_eq(T[p-1],"...")) &&
	tok_ispunct(T[p+1]) && tok_in(T[p+1],".")) {
      return 0;
    }

    // b"/ "[.:;!?]([>\)\}\]])"   -> split after $1
    // (unless the previous punctuation was re-tagged gc_Pc by an earlier
    // "do not split" rule)
    if (nlspace && tok_in(T[p],">)}].:!?") &&
	tok_ispunct(T[p-1]) && (tok_in(T[p-1],".:;!?)") || tok_eq(T[p-1],"...")) &&
	((token_punctuation*)T[p-1])->subt!=token_punctuation::gc_Pc) {
      lastcut=p;
      return 1;
    }

    // c'/ ".."   -> do not split after first dot
    if (tok_in(T[p],".") && T[p+1] && tok_in(T[p+1],".")) {
      return 0;
    }
    
    // c/ ".!:;?" (strong punct)   -> split
    if ((tok_in(T[p],".:;!?"))) {
      lastcut=p;
      return 1;
    }
  }

    // 3bis- 
    // a/--* -> split before and after
    // The current token and the next one (when it exists) are both checked:
    // a normalized form of two or more characters made only of '-' triggers
    // a cut after T[p].
    for(int k=p;k<=p+1 && k<=max;++k)
      if (T[k]->norm.size()>1 && 
	  T[k]->norm.find_first_not_of("-")==std::string::npos) {
	lastcut=p;
	return 1;
      }

    // b/Spanish ¿/! -> split before
    if(p+1<=max && (tok_eq(T[p+1],"¿") || tok_eq(T[p+1],"¡"))) {
      lastcut=p;
      return 1;
    }
    
    // c/ in segmentation_characters -> split before and after
    // Comma-separated list of separator strings; each entry is compared with
    // strncmp over the entry's byte length, i.e. as a prefix of the token's
    // normalized form.
    const char *segmentation_characters="·,•,|";

    if(segmentation_characters && *segmentation_characters) {
      while(*segmentation_characters) {
	// find the end of the current list entry
	const char *comma=segmentation_characters;
	while(*comma && *comma!=',')
	  ++comma;
	// test the current token and, when it exists, the next one
	for(int k=p;k<=p+1 && k<=max;++k)
	  if(comma!=segmentation_characters && 
	     !strncmp(segmentation_characters,T[k]->norm.c_str(),
	              comma-segmentation_characters)) {
	    lastcut=p;
    	    return 1;
	  }
	// step over the comma, or stop on the terminating NUL
	segmentation_characters=*comma?comma+1:comma;
      }
    }

// #if defined(LANG_SRC_KO) || defined(LANG_SRC_JA) || defined(LANG_SRC_ZH)
    // d/CJK ● ★ ☆ -> split before
    // NOTE(review): unlike b/ and e/, T[p+1] is not guarded by p+1<=max here;
    // this relies on tok_eq accepting the NULL that pads the array -- confirm.
    if (tok_eq(T[p+1],"●") || tok_eq(T[p+1],"★") || tok_eq(T[p+1],"☆")) { 
      lastcut=p;
      return 1;
    }

    // e/CJK 【,】-> split respectively before/after
    if (tok_eq(T[p],"】") || (p+1<=max && tok_eq(T[p+1],"【"))) {
      lastcut=p;
      return 1;
    }

    // f/CJK ∶ -> split after
    if (tok_eq(T[p],"∶")) {
      lastcut=p;
      return 1;
    }
// #endif

  // 4- acronyms ending sentence rules
  // A word whose source text ends with '.' and that has no following token:
  // the final period is given back as a separate "." token through addtoken.
  if (tok_isword(T[p]) && !T[p]->src.empty() && T[p]->src[T[p]->src.length()-1]=='.' && !T[p+1])
    {
      addtoken=newtoken(".",".",_punctuation);
      lastcut=p;
      /* but remove the '.' in the src field of T[p] */
      T[p]->src.erase(T[p]->src.length()-1);
      T[p]->RefreshXmlContent();
      return 1;
    }
  return 0;
}


/**
 *
 */
static int
parsePar(xmlNodePtr par,int &tu_id) {

  xmlNodePtr cur = par->children;
  int ntoken=0;
  /* counting the number of tokens in the paragraph */
  while (cur != NULL) {
    xmlNodePtr next=cur->next;
    if (cur->type==XML_ELEMENT_NODE && (!xmlStrcmp(cur->name, (const xmlChar *)"token"))){
      ntoken++;
    }
    cur=next;
  }

  /* No sentences in a par without a single token! */
  if (ntoken==0) return 1;

  // building the structure on which segmentation will be applied
  // lsep contains the list of tags between tokens lsep[0] contains tags before
  // lsep[ntokens] contains trailing tags
  // ssep[xx] contains textual separators (space, cr, tabs)
  token **tokens=new token *[1+ntoken+1];
  xmlNodePtr *ntokens=new xmlNodePtr[ntoken+1];
  list<xmlNodePtr> *lsep=new list<xmlNodePtr>[ntoken+1];
  string *ssep=new string[ntoken+1];
  /* special segmentation mark. specsep=-1/0/1 for 
                          (do not segmente,default,segmente) */
  int *specsep=new int[ntoken+1];
  memset(specsep,0,(ntoken+1)*sizeof(int));
  int current_tok=0;
  cur=par->children;

  while (cur!=NULL) {
    xmlNodePtr next=cur->next;
    if (cur->type==XML_ELEMENT_NODE && (!xmlStrcmp(cur->name, (const xmlChar *)"token"))){
      if (!cur->_private) cur->_private=(void*)CreateToken(cur);
      tokens[++current_tok]=(token*)cur->_private;
      ntokens[current_tok]=cur;
    }
    else {
      lsep[current_tok].push_back(cur);
      if (cur->type==XML_TEXT_NODE) {
	char *sep=(char *)xmlNodeGetContent(cur);
	ssep[current_tok]+=sep;
	xmlFree(sep);
	if (cur->_private) { delete (token*)cur->_private;cur->_private=NULL; }
      }
    }
    xmlUnlinkNode(cur);
    cur=next;
  }

  // Tokens array starts and ends with NULL
  tokens[current_tok+1]=NULL;
  tokens[0]=NULL;

  std::list<xmlNodePtr> ts_to_remove;
  
  char stuid[10];
  sprintf(stuid,"s%d",tu_id++);
  token::InitId(1);
  /* we send initial separators */
  {
    {for(list<xmlNodePtr>::iterator it = lsep[0].begin();
 	 it != lsep[0].end(); ++it) {
      xmlAddChild(par, *it);
    }}
  }


  /* and prepare first tu */
  xmlNodePtr tu=xmlNewNode (NULL,(const xmlChar *)"tu");
  xmlSetProp(tu,(const xmlChar *)"id",(const xmlChar *)stuid);
  xmlAddChild(par,tu);

  G_InitSegmentationPar();

  /* next contains the number of token on which we will restart segmenting:
     indeed the segmentation function G_Segmentation returns value
       - presence of a sentence end at the current token
       - when we can restart segmenting.
     this parameter allows segmentation to define negative rules *there cannot
     be a sentence ends here*
  */

  /*
    TODO: in 
    M.{\v <systag type="seg" subtype="tu" value="not"/>} Bill{\v <systag type="seg" subtype="tu"/>} Clinton
    space before Clinton should be before tu start: it does not work
  */

  int next=0;
  for(current_tok=1;current_tok<=ntoken;current_tok++) {
    xmlAddChild(tu,ntokens[current_tok]);
    /* Set an id the current token */
    tokens[current_tok]->SetId();

    /* to avoid rebuilding the complete xml node associated to this token
       we just add the Id attribute here */
    sprintf(stuid,"t%d",tokens[current_tok]->GetId());
    xmlSetProp(ntokens[current_tok],
	       (const xmlChar *)"id",(const xmlChar *)stuid);

    int dec=0;
    token *addtoken=NULL;
    token **deletetoken=NULL;

    bool new_tu=false;
    
    /* If sentence ends, insert new tu */ 
    if (specsep[current_tok]==1 || 
	(specsep[current_tok]!=-1 &&
	 current_tok>=next && 
	 ((dec=G_Segmentation(tokens,lsep,ssep,current_tok,ntoken,next,addtoken,deletetoken))!=0))) 
      {
	for(;dec>1 && current_tok<=ntoken;current_tok++)
	  {
	    {for (list<xmlNodePtr>::iterator it=lsep[current_tok].begin();
		  it!=lsep[current_tok].end(); ++it) {
	      xmlAddChild(tu,*it);
	    }}
	    if (current_tok==ntoken) break;
	    xmlAddChild(tu,ntokens[current_tok+1]);
	    /* Set an id the current token */
	    tokens[current_tok+1]->SetId();

	    /* to avoid rebuilding the complete xml node associated to this token
	       we just add the Id attribute here */
	    sprintf(stuid,"t%d",tokens[current_tok+1]->GetId());
	    xmlSetProp(ntokens[current_tok+1],
		       (const xmlChar *)"id",(const xmlChar *)stuid);
	    dec--;
	  }

	/* add potential extra token restored by segmentation: typical U.S. = U.S. . */
	if (addtoken) {
	  addtoken->SetId();
	  xmlNodePtr Ntok=addtoken->CreateXmlNode();
	  Ntok->_private=addtoken;
	  xmlAddChild(tu,Ntok);
	}

	if (current_tok!=ntoken) {
	  new_tu=true;

	  tu=xmlNewNode (NULL,(const xmlChar *)"tu");
	  sprintf(stuid,"s%d",tu_id++);
	  xmlSetProp(tu,(const xmlChar *)"id",(const xmlChar *)stuid);
	  xmlAddChild(par,tu);
	}
      }
    
    /* In any case insert trailing tags */
    /* insert the trailing tags at par level when between tus */
    {
      {for (list<xmlNodePtr>::iterator it=lsep[current_tok].begin();
	    it!=lsep[current_tok].end(); ++it) {
	if(new_tu)
	  xmlAddPrevSibling(tu,*it);
	else
  	  xmlAddChild(current_tok==ntoken?par:tu,*it);
      }}
    }
    if (deletetoken) {
      xmlUnlinkNode((*deletetoken)->refXml);
      xmlFreeNode((*deletetoken)->refXml);
      delete *deletetoken;
      *deletetoken=NULL;
    }
  }

  delete [] tokens;
  delete [] ntokens;
  delete [] lsep;
  delete [] ssep;
  delete [] specsep;
}

/**
 * Segments a whole document into sentences.
 *
 * When the root element of doc is named "document", every direct child
 * element named "par" is handed to parsePar(). The <tu> counter starts at 0
 * and is shared by all paragraphs of the document. Nothing is done when the
 * document has no root or when the root has another name.
 *
 * @param doc  the XML document to segment (modified in place)
 */
void doSegmentation(xmlDocPtr doc) {
  xmlNodePtr root = xmlDocGetRootElement(doc);
  if (root == NULL) return;
  if (xmlStrcmp(root->name, (const xmlChar *) "document") != 0) return;

  int tu_id = 0;
  for (xmlNodePtr node = root->xmlChildrenNode; node != NULL; node = node->next) {
    const bool isPar = node->type==XML_ELEMENT_NODE &&
                       xmlStrcmp(node->name, (const xmlChar *) "par") == 0;
    if (isPar) {
      parsePar(node, tu_id);
    }
  }
}
