
// $Id: token.cc,v 1.1 2005/02/25 15:31:46 anonymous Exp $

#include <cstdio>
#include <string>
#include "token.h"

using namespace std;

// Printable names of the token types. Used in this file both as
// type_name[type] (enum value -> name) and for name -> enum lookups by
// string comparison. The array has no terminating NULL entry.
char *type_name[]={"undef","separator","numeric","punctuation","symbol","entity","word","tag","unknown"};

// Two-letter category names (presumably the Unicode general-category
// abbreviations -- confirm). Like type_name, the array has no terminating
// NULL entry.
char *gc_name[]={"Lu","Ll","Lt","Lm","Lo","Mn","Mc","Me","Nd","Nl","No","Pc","Pd","Ps","Pe","Pi","Pf","Po","Sm","Sc","Sk","So","Zs","Zl","Zp","Cc","Cf","Cs","Co","Cn"};

// Forward declarations of the name -> enum lookup helpers defined further
// down in this file.
tokentype  GetTokenType(const char *);
gc_val GetGc   (const char *);
f_val  GetF    (const char *);

// Integer <-> string conversion helpers; their definitions are not in the
// visible part of this file.
string Int2String(int);
int String2Int(string);

void token::RefreshXmlContent() {
  xmlNodeSetContent(refXml,(const xmlChar *)"");
  xmlNodeAddContent(refXml,(const xmlChar*)norm.c_str());
  if (src!=norm) {
    /* Set src attribute only if different from norm */
    xmlSetProp(refXml,(const xmlChar *)"source",(const xmlChar *)src.c_str());
  } else {
    xmlUnsetProp(refXml,(const xmlChar *)"source");
  }

}

// ---- class token_separator -------------------------------------------------

// Builds a separator token from source text s and normalized text n.
// The sub-type is left at gc_undef.
token_separator::token_separator(const string &s, const string &n)
  : subt(gc_undef)
{
  type=_separator;
  src=s;
  norm=n;
  id=0;
  translation_id=0;
}

// Builds a separator token from source text s and normalized text n,
// with the sub-type taken from the integer gc.
token_separator::token_separator(const string &s, const string &n, int gc)
  : subt(static_cast<gc_subt>(gc))
{
  type=_separator;
  src=s;
  norm=n;
  id=0;
  translation_id=0;
}

// Creates an XML text node holding the normalized text, stores it in
// refXml and returns it.
xmlNodePtr token_separator::CreateXmlNode() {
  xmlNodePtr textNode = xmlNewText((const xmlChar *)norm.c_str());
  refXml=textNode;
  return textNode;
}

// No-op: separator tokens write no attributes to the node.
void token_separator::SetAttribute(xmlNodePtr /*cur*/) {
}

// ---- class token_numeric ---------------------------------------------------

// Builds a numeric token from source text s and normalized text n,
// with a word weight of 1.
token_numeric::token_numeric(const string &s, const string &n)
{
  type=_numeric;
  src=s;
  norm=n;
  id=0;
  translation_id=0;
  wordweight=1;
}

// Numeric tokens add nothing to the node built by the base class.
xmlNodePtr token_numeric::CreateXmlNode() {
  return token::CreateXmlNode();
}

// No-op: numeric tokens write no attributes to the node.
void token_numeric::SetAttribute(xmlNodePtr /*cur*/) {
}

// ---- class token_punctuation -----------------------------------------------

// Builds a punctuation token from source text s and normalized text n.
// The sub-type is left at gc_undef.
token_punctuation::token_punctuation(const string &s, const string &n)
  : subt(gc_undef)
{
  type=_punctuation;
  src=s;
  norm=n;
  id=0;
  translation_id=0;
}

// Builds a punctuation token from source text s and normalized text n,
// with the sub-type taken from the integer gc.
token_punctuation::token_punctuation(const string &s, const string &n, int gc)
  : subt(static_cast<gc_subt>(gc))
{
  type=_punctuation;
  src=s;
  norm=n;
  id=0;
  translation_id=0;
}

// Builds the base-class node, then lets SetAttribute add or remove the
// "subt" attribute on it.
xmlNodePtr token_punctuation::CreateXmlNode() {
  xmlNodePtr node=token::CreateXmlNode();
  SetAttribute(node);
  return node;
}


// Writes the "subt" attribute of cur from gc_s[subt], or removes the
// attribute when subt equals -1.
// NOTE(review): the constructors default subt to gc_undef while this test
// compares against -1 -- confirm that the two are the same value.
void token_punctuation::SetAttribute(xmlNodePtr cur) {
  const xmlChar *const subtAttr=(const xmlChar *)"subt";

  if (subt==-1) {
    xmlUnsetProp(cur,subtAttr);
    return;
  }
  xmlSetProp(cur,subtAttr,(const xmlChar *)gc_s[(int)subt]);
}

// ---- class token_symbol ----------------------------------------------------

// Builds a symbol token from source text s and normalized text n.
// The sub-type is left at gc_undef.
token_symbol::token_symbol(const string &s, const string &n)
  : subt(gc_undef)
{
  type=_symbol;
  src=s;
  norm=n;
  id=0;
  translation_id=0;
}

// Builds a symbol token from source text s and normalized text n,
// with the sub-type taken from the integer gc.
token_symbol::token_symbol(const string &s, const string &n, int gc)
  : subt(static_cast<gc_subt>(gc))
{
  type=_symbol;
  src=s;
  norm=n;
  id=0;
  translation_id=0;
}

// Returns the base-class node unchanged; subt is not written to it here.
xmlNodePtr token_symbol::CreateXmlNode() {
  return token::CreateXmlNode();
}

// No-op: symbol tokens write no attributes to the node.
void token_symbol::SetAttribute(xmlNodePtr /*cur*/) {
}

// ---- class token_entity ----------------------------------------------------

// Values written to the "subt" attribute of entity tokens; indexed by subt
// in token_entity::SetAttribute. Index 0 maps to the empty string.
// NOTE(review): the order presumably mirrors the entity_subt enum declared
// in the header -- confirm when adding entries.
const char *token_entity::entity_subt_name[] =
  {
    "",
    "email",
    "acronym",
    "uri",
    "date",
    "numeric",
    "propernoun",
    "address",
    "ip",
  };

// Copy assignment: assigns the token base part and then subt.
// Self-assignment leaves the object untouched.
const token_entity &token_entity::operator=(const token_entity &t) {
  if(this!=&t) {
    static_cast<token &>(*this)=t;
    subt=t.subt;
  }
  return *this;
}

// Copy constructor: copies the token base part and subt from t, then sets
// wordweight to 1 regardless of the value held by t.
token_entity::token_entity(const token_entity &t)
  : token(t)
{
  subt=t.subt;
  wordweight=1;
}

// Empty body: no cleanup is performed here.
token_entity::~token_entity()
{
}

// Builds an entity token from source text s and normalized text n, with
// sub-type e_undef and a word weight of 1.
token_entity::token_entity(const string &s, const string &n)
  : subt(e_undef)
{
  type=_entity;
  src=s;
  norm=n;
  id=0;
  translation_id=0;
  wordweight=1;
}

// Builds an entity token from source text s and normalized text n, with
// the sub-type taken from the integer e and a word weight of 1.
token_entity::token_entity(const string &s, const string &n, int e)
  : subt(static_cast<entity_subt>(e))
{
  type=_entity;
  src=s;
  norm=n;
  id=0;
  translation_id=0;
  wordweight=1;
}

// Builds the base-class node, then lets SetAttribute write the entity's
// "subt" attribute on it.
xmlNodePtr token_entity::CreateXmlNode() {
  xmlNodePtr node=token::CreateXmlNode();
  SetAttribute(node);
  return node;
}

// Writes the "subt" attribute of cur, using the entity_subt_name entry
// selected by subt.
void token_entity::SetAttribute(xmlNodePtr cur) {
  const char *subtName=entity_subt_name[subt];
  xmlSetProp(cur,(const xmlChar *)"subt",(const xmlChar *)subtName);
}

// ---- class token_word ------------------------------------------------------

// Builds a word token from source text s and normalized text n.
// alph is converted to f_val for the alphabet, pos starts empty and the
// case is computed from n by ::GetCase.
token_word::token_word(const string &s, const string &n, int alph)
  : alphabet(static_cast<f_val>(alph)),
    pos("")
{
  type=_word;
  src=s;
  norm=n;
  id=0;
  translation_id=0;
  wordweight=1;
  wordcase=::GetCase(n.c_str());
}

// Builds a word token from source text s, normalized text n and pos value p.
// alph is converted to f_val for the alphabet and the case is computed
// from n by ::GetCase.
token_word::token_word(const string &s, const string &n, const string &p,
                       int alph)
  : alphabet(static_cast<f_val>(alph)),
    pos(p)
{
  type=_word;
  src=s;
  norm=n;
  id=0;
  translation_id=0;
  wordweight=1;
  wordcase=::GetCase(n.c_str());
}

// Builds the base-class node, then lets SetAttribute maintain the
// "alph", "pos" and "case" attributes on it.
xmlNodePtr token_word::CreateXmlNode() {
  xmlNodePtr node=token::CreateXmlNode();
  SetAttribute(node);
  return node;
}

// Returns a copy of the stored pos value.
string token_word::GetPos()
{
  return pos;
}

// Replaces the stored pos value with p.
void token_word::SetPos(string p)
{
  pos=p;
}

// Synchronizes the "alph", "pos" and "case" attributes of cur with this
// word. Each attribute is either written or removed, in that order.
void token_word::SetAttribute(xmlNodePtr cur) {
  const xmlChar *const alphAttr=(const xmlChar *)"alph";
  const xmlChar *const posAttr=(const xmlChar *)"pos";
  const xmlChar *const caseAttr=(const xmlChar *)"case";

  // "alph" is removed when the alphabet is f_na.
  if (alphabet==f_na)
    xmlUnsetProp(cur,alphAttr);
  else
    xmlSetProp(cur,alphAttr,(const xmlChar *)f_s[(int)alphabet]);

  // "pos" is removed when the stored value is empty.
  if (pos.empty())
    xmlUnsetProp(cur,posAttr);
  else
    xmlSetProp(cur,posAttr,(const xmlChar *)pos.c_str());

  // "case" is removed for _clower words and for the f_CJKV alphabet.
  if (wordcase==_clower || alphabet==f_CJKV)
    xmlUnsetProp(cur,caseAttr);
  else
    xmlSetProp(cur,caseAttr,(const xmlChar *)c_case[wordcase]);
}

// ---- class token_tag -------------------------------------------------------

// Builds a tag token with zero ids and hasspace cleared.
token_tag::token_tag()
{
  type=_tag;
  hasspace=false;
  id=0;
  translation_id=0;
}

// Builds the base-class node, then lets SetAttribute write the tag's
// attributes on it.
xmlNodePtr token_tag::CreateXmlNode() {
  xmlNodePtr node=token::CreateXmlNode();
  SetAttribute(node);
  return node;
}

// Writes the "type" attribute of cur and, when hasspace is set, a
// "space" attribute with value "1". An existing "space" attribute is not
// removed when hasspace is false.
void token_tag::SetAttribute(xmlNodePtr cur) {
  xmlSetProp(cur,(const xmlChar *)"type",(const xmlChar *)type_name[type]);
  if(!hasspace)
    return;
  xmlSetProp(cur,(const xmlChar *)"space",(const xmlChar *)"1");
}

// ---- class token_unknown ---------------------------------------------------

// Builds an unknown-type token from source text s and normalized text n.
token_unknown::token_unknown(const string &s, const string &n)
{
  type=_unknown;
  src=s;
  norm=n;
  id=0;
  translation_id=0;
}

// Builds the base-class node, then lets SetAttribute write the "type"
// attribute on it.
xmlNodePtr token_unknown::CreateXmlNode() {
  xmlNodePtr node=token::CreateXmlNode();
  SetAttribute(node);
  return node;
}

// Writes the "type" attribute of cur from type_name[type].
void token_unknown::SetAttribute(xmlNodePtr cur) {
  const char *typeName=type_name[type];
  xmlSetProp(cur,(const xmlChar *)"type",(const xmlChar *)typeName);
}

// ---- class token -----------------------------------------------------------

// Definitions of token's static members; both start at 1. The code that
// advances them is not in the visible part of this file.
int token::currentid=1;
int token::translation_currentid=1;

// Default constructor: empty src/norm, type _undef, case _cna, no XML node
// attached, all flags cleared except translate.
token::token()
{
  // Text and classification.
  norm="";
  src=norm;
  type=_undef;
  wordcase=_cna;

  // Identifiers and weights.
  id=0;
  translation_id=0;
  weight=1;
  wordweight=0;

  // Pointers to external data.
  refXml=NULL;
  ts=NULL;

  // Flags.
  hasbeennormalized=false;
  hasbeenfoundinindex=false;
  translate=true;
  no_entity=false;
  no_entity_misc=false;
}

// Builds a generic token from source text s and normalized text n.
// All other members get the same defaults as in the default constructor:
// type _undef, case _cna, no XML node attached, all flags cleared except
// translate.
token::token(const string &s, const string &n) {
  wordweight=0;
  weight=1;
  id=0;
  translation_id=0;
  hasbeennormalized=false;
  hasbeenfoundinindex=false;
  src=s;
  norm=n;
  type=_undef;
  // Previously left uninitialized here (the default constructor sets it),
  // so copying such a token read an indeterminate value.
  wordcase=_cna;
  refXml=NULL;
  ts=NULL;
  translate=true;
  no_entity=false;
  no_entity_misc=false;
}

// Base-class version: always returns an empty string.
string token::GetPos()
{
  return string();
}

// Base-class version: the argument is ignored.
void token::SetPos(string /*p*/)
{
}

// Base-class version: writes no attributes to the node.
void token::SetAttribute(xmlNodePtr /*cur*/)
{
}

// Replaces the stored tagsep list with a copy of ltagsep.
void token::SetTagSep(list<xmlNodePtr> ltagsep)
{
  tagsep=ltagsep;
}

// Returns a reference to the stored tagsep list (not a copy).
list<xmlNodePtr> &token::GetTagSep()
{
  return tagsep;
}

// Creates a <token> element whose content is norm, adds the optional
// "source", "translate", "no_entity" and "no_entity_misc" attributes,
// stores the node in refXml and returns it.
xmlNodePtr token::CreateGenericXmlNode() {
  const xmlChar *const one=(const xmlChar *)"1";
  xmlNodePtr node = xmlNewNode(NULL,(const xmlChar *)"token");

  // Content is appended rather than set: setting it appeared to be buggy
  // when the content is '&'.
  xmlNodeAddContent(node,(const xmlChar *)norm.c_str());

  // "source" is written only when it differs from the normalized text.
  if (src!=norm) {
    xmlSetProp(node,(const xmlChar *)"source",(const xmlChar *)src.c_str());
  }

  // The flags below are written only when they differ from their defaults.
  if (!translate) {
    xmlSetProp(node,(const xmlChar *)"translate",(const xmlChar *)"0");
  }
  if (no_entity) {
    xmlSetProp(node,(const xmlChar *)"no_entity",one);
  }
  if (no_entity_misc) {
    xmlSetProp(node,(const xmlChar *)"no_entity_misc",one);
  }

  refXml=node;
  return node;
}

// Creates the generic <token> element and adds the "type" attribute
// (from type_name[type]) and the "id" attribute (formatted as "t<id>").
xmlNodePtr token::CreateXmlNode() {
  xmlNodePtr cur = CreateGenericXmlNode();
  xmlSetProp(cur,(const xmlChar *)"type",(const xmlChar *)type_name[type]);

  {
    // 't' + sign + digits of the widest int + NUL fit comfortably in 24
    // bytes. The former 10-byte buffer overflowed for ids of nine or more
    // digits. snprintf additionally bounds the write to the buffer size.
    char tid[24];
    snprintf(tid,sizeof(tid),"t%d",id);
    xmlSetProp(cur,(const xmlChar *)"id",(const xmlChar *)tid);
  }

  return cur;
}

// ----------------------------------------------------------------------------

// Builds a token from an XML node.
//
// A text node becomes a separator token. An element node is converted
// according to its "type" attribute, or to a word token when it has none.
// The node content is used as the normalized text and the "source"
// attribute (falling back to the content) as the source text. A parsed
// "id" attribute other than -1 is passed to SetId, and finally the token's
// SetAttribute is invoked on cur.
//
// Returns a newly allocated token, or NULL (after logging to stderr) when
// cur is NULL or is neither a text nor an element node.
token *CreateToken(xmlNodePtr cur) {
  token *tok;
  int id=-1;
  string text, type, source;

  if (!cur) {
    fprintf(stderr,"[CreateToken] xmlNodePtr must not be NULL.\n");
    return NULL;
  }

  if (cur->type==XML_TEXT_NODE) {
    // PCDATA is token_separator
    type="separator";
  }
  else if (cur->type!=XML_ELEMENT_NODE) {
    fprintf(stderr,"[CreateToken] xmlNodePtr must be a XML_ELEMENT_NODE.\n");
    return NULL;
  }

  xmlChar *x_text   = xmlNodeGetContent(cur);
  xmlChar *x_id     = xmlGetProp(cur,(const xmlChar *)"id");
  xmlChar *x_type   = xmlGetProp(cur,(const xmlChar *)"type");
  xmlChar *x_source = xmlGetProp(cur,(const xmlChar *)"source");

  if (x_text) {
    text = (char *) x_text;
    xmlFree(x_text);
  }
  if (x_id) {
    // token::CreateXmlNode writes ids as "t<number>". Skip that prefix so
    // such ids parse to their number instead of 0. Plain numeric ids are
    // handled exactly as before.
    const char *digits = (const char *) x_id;
    if (*digits=='t') ++digits;
    id = atoi(digits);
    xmlFree(x_id);
  }
  if (x_type) {
    type = (char *) x_type;
    xmlFree(x_type);
  }
  if (x_source) {
    source = (char *) x_source;
    xmlFree(x_source);
  } else
    source=text;

  if(type.length()) {
    tok=newtoken(source,text,GetTokenType(type.c_str()));
  }
  else {
    // default type is word
    tok=new token_word(source,text,GetAlphabet(text.c_str()));
  }
  if (id!=-1) tok->SetId(id);
  tok->SetAttribute(cur);
  return tok;
}

// Factory: allocates the token subclass matching type, built from source
// text s and normalized text n. Types without a dedicated case yield a
// plain token.
token *newtoken(const string &s, const string &n, tokentype type) {
  token *created=NULL;

  switch(type) {
  case _separator:
    created=new token_separator(s,n);
    break;
  case _numeric:
    created=new token_numeric(s,n);
    break;
  case _punctuation:
    created=new token_punctuation(s,n);
    break;
  case _symbol:
    created=new token_symbol(s,n);
    break;
  case _entity:
    created=new token_entity(s,n);
    break;
  case _word:
    // The alphabet is derived from the normalized text.
    created=new token_word(s,n,GetAlphabet(n.c_str()));
    break;
  case _unknown:
    created=new token_unknown(s,n);
    break;
  default:
    created=new token(s,n);
    break;
  }

  return created;
}

// Factory variant that also takes a pos value p. Only word tokens use p;
// every other type is built exactly as by newtoken(s,n,type).
token *newtoken(const string &s, const string &n, const string &p,
                tokentype type) {
  if (type==_word) {
    // The alphabet is derived from the normalized text.
    return new token_word(s,n,p,GetAlphabet(n.c_str()));
  }
  return newtoken(s,n,type);
}

// Factory: builds a token whose source text is s and whose normalized text,
// type and flags are read from the XML node xtok.
//
// The type comes from the "type" attribute ("undef" when absent or not
// found in type_name). "translate"="0" clears translate, "no_entity"="1"
// sets no_entity, and "no_entity"="misc" or "no_entity_misc"="1" sets
// no_entity_misc.
token *newtoken(const string &s,xmlNodePtr xtok) {
  // Normalized text = node content (empty when there is none).
  std::string normText;
  xmlChar *rawContent=xmlNodeGetContent(xtok);
  if(rawContent) {
    normText=(char *) rawContent;
    xmlFree(rawContent);
  }

  // Type name from the "type" attribute.
  std::string typeText="undef";
  xmlChar *rawType=xmlGetProp(xtok,(xmlChar*)"type");
  if(rawType) {
    typeText=(char *)rawType;
    xmlFree(rawType);
  }

  // Map the name to its enum value by scanning type_name.
  tokentype resolved=_undef;
  const unsigned typeCount=sizeof(type_name)/sizeof(char *);
  unsigned idx=0;
  while(idx<typeCount) {
    if(strcmp(type_name[idx],typeText.c_str())==0) {
      resolved=static_cast<tokentype>(idx);
      break;
    }
    ++idx;
  }

  token *res=newtoken(s,normText,resolved);

  // Optional flags.
  xmlChar *rawTranslate=xmlGetProp(xtok,(xmlChar*)"translate");
  if(rawTranslate) {
    if(xmlStrcmp(rawTranslate,(const xmlChar *)"0")==0)
      res->translate=false;
    xmlFree(rawTranslate);
  }

  xmlChar *rawNoEntity=xmlGetProp(xtok,(xmlChar*)"no_entity");
  if(rawNoEntity) {
    if(xmlStrcmp(rawNoEntity,(const xmlChar *)"1")==0)
      res->no_entity=true;
    if(xmlStrcmp(rawNoEntity,(const xmlChar *)"misc")==0)
      res->no_entity_misc=true;
    xmlFree(rawNoEntity);
  }

  xmlChar *rawNoEntityMisc=xmlGetProp(xtok,(xmlChar*)"no_entity_misc");
  if(rawNoEntityMisc) {
    if(xmlStrcmp(rawNoEntityMisc,(const xmlChar *)"1")==0)
      res->no_entity_misc=true;
    xmlFree(rawNoEntityMisc);
  }

  return res;
}

// Maps a type name to its tokentype value by scanning type_name.
// Returns _undef when type is NULL or matches no entry.
tokentype GetTokenType(const char *type) {
  if (!type) return _undef;

  // type_name has no NULL sentinel, so the scan is bounded by the array
  // size; the former sentinel-based loop read past the end for unknown
  // names.
  const unsigned count=sizeof(type_name)/sizeof(type_name[0]);
  for (unsigned i=0 ; i<count ; i++) {
    if (!strcmp(type_name[i],type)) return static_cast<tokentype>(i);
  }
  return _undef;
}

// Maps a category name to its gc_val value by scanning gc_name.
// Returns gc_undef when gc is NULL or matches no entry.
gc_val GetGc(const char *gc) {
  if (!gc) return gc_undef;

  // gc_name has no NULL sentinel, so the scan is bounded by the array
  // size; the former sentinel-based loop read past the end for unknown
  // names.
  const unsigned count=sizeof(gc_name)/sizeof(gc_name[0]);
  for (unsigned i=0 ; i<count ; i++) {
    if (!strcmp(gc_name[i],gc)) return static_cast<gc_val>(i);
  }
  return gc_undef;
}

// Maps a name to its f_val value by scanning f_s up to its first NULL
// entry. Returns f_undef when nothing matches.
// NOTE(review): f_s is defined outside this file -- confirm that it really
// ends with a NULL entry, otherwise this scan runs past the array.
f_val GetF(const char *f) {
  int idx=0;
  while (f_s[idx]) {
    if (strcmp(f_s[idx],f)==0)
      return (f_val) idx;
    ++idx;
  }
  return f_undef;
}

// Two tokens compare equal (1) when src, norm and type all match;
// otherwise the result is 0. No other member takes part in the comparison.
int operator==(const token& t1, const token& t2) {
  if (!(t1.src == t2.src))
    return 0;
  if (!(t1.norm == t2.norm))
    return 0;
  return (t1.type == t2.type) ? 1 : 0;
}

// Allocates a copy of t, choosing the concrete class from t.type and
// copy-constructing it from t viewed as that class.
// NOTE(review): there is no case for _tag, so tag tokens are copied through
// the default branch as plain token objects -- confirm this is intended.
token *copyToken(const token &t) {
  token *dup=NULL;

  switch(t.type) {
  case _separator:
    dup=new token_separator(static_cast<const token_separator &>(t));
    break;
  case _numeric:
    dup=new token_numeric(static_cast<const token_numeric &>(t));
    break;
  case _punctuation:
    dup=new token_punctuation(static_cast<const token_punctuation &>(t));
    break;
  case _symbol:
    dup=new token_symbol(static_cast<const token_symbol &>(t));
    break;
  case _entity:
    dup=new token_entity(static_cast<const token_entity &>(t));
    break;
  case _word:
    dup=new token_word(static_cast<const token_word &>(t));
    break;
  case _unknown:
    dup=new token_unknown(static_cast<const token_unknown &>(t));
    break;
  default:
    dup=new token(t);
    break;
  }

  return dup;
}

extern "C" int isDate(void *t)
{
  if(!t)
    return false;

  token *tok=reinterpret_cast<token *>(t);
  return tok->type==_entity && dynamic_cast<token_entity *>(tok)->subt==token_entity::e_date;
}

// C-callable accessor: returns the hasbeennormalized flag of the token t
// points to, or 0 for a NULL pointer. t must point to a token object.
extern "C" int HasBeenNormalized(void *t)
{
  if(t) {
    const token *tok=static_cast<const token *>(t);
    return tok->hasbeennormalized;
  }
  return false;
}

// ----------------------------------------------------------------------------
