/* $Id: ParseHTML.cc,v 1.1 2005/02/25 15:31:46 anonymous Exp $ */

#include <libxml/xmlmemory.h>
#include <libxml/parser.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <string>
#include <list>
#include <ctype.h>

#include "DocFilters.h"

//#include "portable.h"
using namespace std;

//#include "debuglevel.h"
#include "ParseHTML.h"
//#include "HTML/htmltype.h"
#include "document.h"
#include "Unicode.h"
/*
#include "Options.h"
#include "markup.h"

#include "StringID.h"
*/
#include "EncodingIANA.h"
#include "Case.h"
//#include "LangDefs.h"

//#include "debuglevel.h"
/*
#include "lang.h"
#include "status.h"
*/
// Success code returned by the parsing routines of this file.
#define ecOK 1
// Counter consumed by documentHTML::AddBmark to build an id for marks that
// carry no "id" attribute of their own.
int global_idgen=1;

/* Returns the decimal text representation of `i`. */
std::string Int2String(int i) {
  char digits[100];

  // Bounded formatting: the result is the same, the write can never overrun.
  snprintf(digits,sizeof(digits),"%d",i);
  return std::string(digits);
}

/* Two attributes are equal when both their names and their values match. */
bool operator==(const attr &a1,const attr &a2) {
  if (!(a1.name==a2.name))
    return false;
  return a1.value==a2.value;
}

/* Two attributes differ as soon as either the name or the value differs. */
bool operator!=(const attr &a1,const attr &a2) {
  if (a1.name!=a2.name)
    return true;
  return a1.value!=a2.value;
}

/* Re-encodes `text` in place with the converter currently attached to the
   document being parsed (pCurrentDocH->current_conv). When no converter is
   set the text is left untouched: the input is then supposed to be UTF-8. */
void ConvertText(std::string &text)
{
  if (!pCurrentDocH->current_conv) {
    // Disabled fallback, kept for reference: switch to the configured source
    // charset as soon as non-UTF-8 text shows up, since non-UTF-8 text cannot
    // be added to the XML tree.
    //
    //   if (!check_utf8(text.c_str())) {
    //     DefaultCharsetConvertor *conv =
    //       new DefaultCharsetConvertor(OptGetValue("SOURCE_CHARSET"),"UTF-8");
    //     if(conv->IsOk()) {
    //       pCurrentDocH->current_conv=conv;
    //       ConvertDoc(pCurrentDocH->getRoot(),*conv);
    //       text=pCurrentDocH->current_conv->Convert(text);
    //     }
    //   }
    return;
  }

  text=pCurrentDocH->current_conv->Convert(text);
}

/* Re-encodes, through `conv`, every text node found below `n`.
   This is used once a charset has been discovered, to fix the text that was
   already stored in the tree.

   n           - subtree root; a NULL node is accepted and ignored
   conv        - converter applied to the content of each text node
   source_only - when true, <transtu> elements are not descended into */
void ConvertDoc(xmlNodePtr n,CharsetConvertor &conv,bool source_only) {
  // The document root may not exist yet (newPar() tests it for NULL as well).
  if (!n)
    return;

  for (xmlNodePtr cur=n->xmlChildrenNode;cur;cur=cur->next) {
    if (cur->type==XML_ELEMENT_NODE) {
      if(!source_only || xmlStrcmp(cur->name,(const xmlChar *)"transtu"))
        ConvertDoc(cur,conv,source_only);
    }
    else if (cur->type==XML_TEXT_NODE && cur->content) {
      // The converted copy is built before the node content is replaced,
      // because replacing the content releases the buffer being read.
      std::string current_text=conv.Convert((char *)cur->content);
      xmlNodeSetContent(cur,(const xmlChar *)current_text.c_str());
    }
  }
}

static int replaceentity(std::string &s,std::string::size_type k) {
  string entity;
  std::string::size_type start=k;
  std::string::size_type len=s.size();
  std::string utf8character;
  
  if (!len || s[k]!='&') 
    return ecHTML_INTERNALERROR;

  if(s.size()<=k+2)
    return 1;
  
  ++k;
  
  if (s[k]=='#') {
    // Unicode or iso1 entity - it seems that any code <= 255 refers to iso1?
    k++;
    int code=0;
    if (s[k]=='x' || s[k]=='X') { // Hexadecimal reference
      entity.append(1,s[k]);
      while(++k<len && isxdigit(s[k])) {
	code=code*16+(isdigit(s[k])?(s[k])-'0':toupper(s[k])-'A'+10);
	entity.append(1,s[k]);
      }
    } else { // Decimal reference
      while(k<len && isdigit(s[k])) entity.append(1,s[k++]);
      code=atol(entity.c_str());
    }
    if (k<len && s[k]==';') ++k; 
    // apparently the ';' is not so much required... &#171
    if (code>=127 && code <=159) { // Illegal reference characters, but used
  //   for corresponding CP1252 characters;
  //			   we need to convert the code
      DefaultCharsetConvertor conv("WINDOWS-1252","UTF-8");
      utf8character=conv.Convert(std::string(1,char(code)));
    } else    
      utf8character=unicode2utf8(code);
  } else {
    while(k<len && isdigit(s[k]) || (s[k]>='a' && s[k]<='z') ||
	                                            (s[k]>='A' && s[k]<='Z'))
      entity.append(1,s[k++]);
    if (k<len && s[k]==';') k++; 
    // apparently the ';' is not so much required... &gt&gt

    entitydef t;
    
    if (entity.length() && findEntity(entity.c_str(),t))
      // We could return a ecHTML_BADENTITY error, but since it is a very
      // common mistake, we assume we just have a regular '&' and no entity
      utf8character=t.equiventity;
  }
  
  if (!utf8character.empty()) {
    //print_debug("filter:html",3,"replacing entity %s by %s\n",s.substr(start,k-start).c_str(),utf8character.c_str());
    s.replace(start,k-start,utf8character);
  }
  
  return 1;
}

/* Post-processing pass over `cur` and its following siblings, which:
   - replaces entities by their content in text nodes;
   - moves leading spaces and non-breaking spaces of a text node into the
     <tag> element that immediately precedes it, and flags that tag with
     space="1".
   Only <par> and <document> elements are descended into. Text nodes with
   no content, or empty content, are skipped. */
static void ReplaceEntities(xmlNodePtr cur)
{
  // UTF-8 encoding of U+00A0 (non-breaking space), spelled out as bytes so
  // that the comparison does not depend on the encoding of this source file.
  static const char nbsp[]="\xC2\xA0";

  for (;cur;cur=cur->next) {
    if (cur->type==XML_ELEMENT_NODE &&
	(!xmlStrcmp(cur->name,(const xmlChar *)"par") ||
	 !xmlStrcmp(cur->name,(const xmlChar *)"document")) &&
	cur->children) {
      ReplaceEntities(cur->children);
    } else if(cur->type==XML_TEXT_NODE) {
      const char *p=(const char *) cur->content;

      // Nothing to decode and no leading space to move. This also prevents
      // strchr() from matching the string terminator below.
      if(!p || !*p)
	continue;

      if(strchr(p,'&')) {
	std::string s=p;
	std::string::size_type k=0;

	while((k=s.find('&',k))!=std::string::npos) {
	  replaceentity(s,k);
	  // Step over the replacement (or the plain '&') so that decoded
	  // text is never decoded a second time.
	  ++k;
	}

	xmlNodeSetContent(cur,(const xmlChar *)s.c_str());
	p=(const char *) cur->content;
	if(!p || !*p)
	  continue;
      }

      if(strchr(" \n\r\t",*p) || !strncmp(p,nbsp,2))
        if(cur->prev && cur->prev->type==XML_ELEMENT_NODE &&
  	   !xmlStrcmp(cur->prev->name,(const xmlChar *)"tag")) {
	   std::string::size_type k=0;

	   // Measure the leading run of blanks / non-breaking spaces.
	   while(p[k]) {
	    if(strchr(" \n\r\t",p[k]))
	      ++k;
	    else if (!strncmp(p+k,nbsp,2))
	      k+=2;
	    else
	      break;
	   }

	   // Both pieces are copied before the node is modified, since `p`
	   // points into the content that is about to be replaced.
	   const std::string leading(p,k);
	   const std::string remainder(p+k);

           // all the html tags are space="..."
	   xmlSetProp(cur->prev,(xmlChar *)"space",(xmlChar *)"1");
	   xmlAddChild(cur->prev,
	               xmlNewText((const xmlChar *)leading.c_str()));
	   xmlNodeSetContent(cur,(const xmlChar*)remainder.c_str());
	}
    }
  }
}

/* Writes the pending tag text, if any, as a <tag> child of the current
   paragraph, then resets the pending-tag state.
   The element always gets a space="0|1" attribute; it also gets
   restricted="source|target" when `restricted` says so, and html_block="1"
   when the pending tag is a block tag. */
void documentHTML::FlushTag(restriction restricted) {
  if (par && !current_tag.empty()) {
    ConvertText(current_tag);

    xmlNodePtr node=xmlNewTextChild(par,NULL,(const xmlChar *)"tag",
                                    (const xmlChar *)current_tag.c_str());

    const char *space_flag=taghasspace?"1":"0";
    xmlSetProp(node,(const xmlChar *)"space",(const xmlChar *)space_flag);

    const char *side=NULL;
    if(restricted==_source)
      side="source";
    else if(restricted==_target)
      side="target";
    if(side)
      xmlSetProp(node,(const xmlChar *)"restricted",(const xmlChar *)side);

    if(html_block_tag)
      xmlSetProp(node,(const xmlChar *)"html_block",(const xmlChar *)"1");

    current_tag.erase();
  }
  taghasspace=0;
}

/* Appends the pending text, if any, to the current paragraph and forgets
   the trailing-space bookkeeping. */
void documentHTML::FlushText() {
  const bool has_pending=par && !current_text.empty();

  if (has_pending) {
    ConvertText(current_text);
    xmlNodeAddContent(par,(const xmlChar *)current_text.c_str());
    current_text.erase();
  }
  textendingspace.erase();
}

/* Extracts the value of the "mso-<m>:" declaration from a style attribute.
   Returns NULL when `a` is not a style attribute or does not contain the
   declaration. Otherwise returns the text that follows "mso-<m>:", cut at
   the first of ' " ; } - the returned pointer refers to a function-local
   static buffer that the next successful call overwrites. */
static const char *msoValue(const attr &a,string m) {
  if (strcasecmp(a.name.c_str(),"style")!=0)
    return NULL;

  const string key="mso-"+m+":";
  const char *base=a.value.c_str();
  const char *hit=strstr(base,key.c_str());
  if (!hit)
    return NULL;

  static string r;
  r=a.value.substr((hit-base)+key.length());

  string::size_type stop=0;
  while(stop<r.length() && !strchr("'\";}",r[stop]))
    ++stop;
  r.erase(stop);

  return r.c_str();
}

string documentHTML::AddBmark(std::list<attr> &la) {
  FlushText();
  FlushTag();
  //FlushTs();

  string sid="?";
  string type;

  {
    /* first find if this mark was not previously just closed because of
       overlapping of spans                                               */
    for(std::list<attr>::const_iterator a=la.begin();
	a!=la.end();
	++a)
      if (strcasecmp(a->name.c_str(),"id")==0) { sid=a->value;break; }
    if (sid!="?" && SystranMarkMet[sid]) {
      /* We have already opened and likely closed this tag 
	 let us check if we can find the Emark */
      xmlNodePtr cur=par->last;
      bool found=0;
      while(cur) {
	if (cur->type==XML_ELEMENT_NODE && !xmlStrcmp(cur->name,(const xmlChar*)"emark")) {
	  xmlChar *closeid=xmlGetProp(cur,(const xmlChar *)"id");
	  if (closeid && !xmlStrcmp(closeid,(const xmlChar*)sid.c_str())) {
	    xmlFree(closeid);found=true;break; /* we found it */
	  }
	}
	cur=cur->prev;
      }
      if (found) {
	/* remove the previous ending mark */
	xmlUnlinkNode(cur);
	xmlFreeNode(cur);
	return sid;
      }
      /* we have not found the closing emark: this is odd - and means
	 that we are still in the current mark? let us just ignore
	 this mark */
      return "-";
    }
  }

  xmlNodePtr bmark=xmlNewChild(par,NULL,(const xmlChar *)"bmark",NULL);
  for(std::list<attr>::iterator a=la.begin();
      a!=la.end();
      ++a) {
    /* attribute name can be case-insensitive */
    {for(unsigned int i=0;i<a->name.length();i++)a->name[i]=tolower(a->name[i]);}
    if (a->name=="id" || a->name=="action" || a->name=="value") {
      xmlSetProp(bmark,(const xmlChar*)a->name.c_str(),(const xmlChar *)a->value.c_str());
    }
    if (a->name=="class") {
      /* attribute value should not be case-insensitive - but let us be sure */
      {for(unsigned int i=0;i<a->value.length();i++)a->value[i]=tolower(a->value[i]);}
      if (!strncmp(a->value.c_str(),"systran_",8)) {
	type=a->value.c_str()+8;
        
	std::string::size_type k=type.find("_");
			
	if(k!=std::string::npos)
	  type.erase(k);
			
	xmlSetProp(bmark,(const xmlChar*)"type",(const xmlChar *)a->value.c_str()+8);
      }
    }
  }
  if (sid=="?") {
    /* There was no id - we create one temporarily */
    sid=type+"_-"+Int2String(global_idgen++);
    xmlSetProp(bmark,(const xmlChar *)"id",(const xmlChar *)sid.c_str());
  }
  SystranMarkMet[sid]=true;
  return sid;
}

void documentHTML::AddEmark(const string &id) {
  FlushText();
  FlushTag();
  //FlushTs();
  if (id=="-") return; // This mark was dropped by AddBmark
  xmlNodePtr bmark=xmlNewChild(par,NULL,(const xmlChar *)"emark",NULL);
  xmlSetProp(bmark,(const xmlChar*)"id",(const xmlChar *)id.c_str());
}

/* Counts the line breaks of `s`: "\r\n", a lone '\r' and a lone '\n' each
   count for one. Scanning stops at the first NUL character. */
int countNL(const std::string &s) {
  int lines=0;

  for(const char *p=s.c_str();*p;++p) {
    switch(*p) {
    case '\r':
      // A '\n' right after a '\r' belongs to the same line break.
      if(p[1]=='\n')
	++p;
      ++lines;
      break;
    case '\n':
      ++lines;
      break;
    default:
      break;
    }
  }

  return lines;
}

/* Queues the markup `s` as (part of) the pending tag.

   s           - raw text of the tag or of the formatting white space
   formatspace - true when `s` is white space kept for formatting
   restricted  - when set, `s` is emitted as a tag of its own, carrying that
                 restriction
   block_tag   - block flag of the tag; a change of flag flushes the pending
                 tag first

   White space that ended the preceding text is moved into the pending tag.
   A formatting space other than " " or "  " opens a new paragraph unless it
   holds a line break that is to be wrapped. */
void documentHTML::AddTag(const string &s, bool formatspace, restriction restricted,bool block_tag) {
  if (!textendingspace.empty()) {
    current_tag+=textendingspace;
    taghasspace=1;
  }

  FlushText();
  //FlushTs();

  if(block_tag!=html_block_tag) {
    FlushTag();
    html_block_tag=block_tag;
  }

  if(strncmp(s.c_str(),"<img",4)==0)
    taghasspace=1;

  // Decide whether the line break(s) of `s` are plain wrapping: a single
  // break follows the state left by the previous one, several never do.
  const int count=countNL(s);
  bool wrap_this_newline=true;
  if(count==1)
    wrap_this_newline=wrap_next_newline;
  else if(count>1)
    wrap_this_newline=false;
  if(count!=0)
    wrap_next_newline=true;

  const bool no_line_break=(s.find_first_of("\r\n")==std::string::npos);
  const bool segmenting=formatspace /*&& OptGetValue("formatting_spaces")==std::string("segmenting") */
                        && s!=" " && s!="  "
                        && (no_line_break || !wrap_this_newline);

  if(segmenting) {
    /* Formatting space */
    newPar();
    taghasspace=1;
    current_tag+=s;
    if(count<=1)
      wrap_next_newline=false;
    return;
  }

  if(restricted) {
    FlushTag();
    current_tag+=s;
    taghasspace=formatspace;
    FlushTag(restricted);
    return;
  }

  if(formatspace)
    taghasspace=1;
  current_tag+=s;
}

/* Appends `s` to the pending text after flushing the pending tag, and adds
   its length to the counter of language `lang`, to the amount of text added
   to the current paragraph and to the overall character count. */
void documentHTML::AddText(const string &s,const string &lang) {
  FlushTag();
  //FlushTs();

  const string::size_type added=s.length();

  MapLang[lang]+=added;
  textendingspace.erase();
  current_text.append(s);
  addedText+=added;
  countc+=added;
}

/* Records the white space `s`:
   - inside the pending tag when there is one;
   - else as a flag when a typeset is pending (new_ts);
   - else as a single normalized ' ' in the text, the original white space
     being remembered in case a tag follows. */
void documentHTML::AddSpace(const string &s) {
  const bool has_line_break=(s.find_first_of("\r\n")!=std::string::npos);
  if(has_line_break)
    wrap_next_newline=true;

  if (!current_tag.empty()) {
    taghasspace=1;
    current_tag+=s;
    return;
  }

  if (new_ts) {
    tshasspace=1;
    return;
  }

  // Space is integrated in text normalized as a space
  if (textendingspace.empty())
    current_text+=" ";
  // but also preserved if followed by a tag
  textendingspace+=s;
}

/* Opens a new <par id="n"> under the root, unless there is no root or the
   current paragraph has not received any text yet. Pending text and tag are
   flushed into the previous paragraph first, and the set of marks met is
   reset. */
void documentHTML::newPar() {
  if (!root || (par && !addedText))
    return;

  FlushText();
  FlushTag();
  tshasspace=0;
  par=xmlNewChild(root,NULL,(const xmlChar *)"par",NULL);

  /* copy the current ts on this paragraph. this is critical for
     spreading typeset status on a high level component:
     example <body lang="fr"><div></div><div></div> */
  //new_ts=true;
  //FlushTs();

  // Bounded formatting, with room for any int value: the former 10-byte
  // buffer could not hold a 10-digit id plus its terminator.
  char pars[16];
  snprintf(pars,sizeof(pars),"%d",par_id++);
  xmlNewProp(par, (const xmlChar *)"id", (const xmlChar *)pars);
  addedText=0;
  SystranMarkMet.clear();
}

/* Flags the current paragraph, if any, with notypeset="true". */
void documentHTML::setNoTypesetPar() {
  if(par) {
    xmlSetProp(par,(const xmlChar*)"notypeset",(const xmlChar*)"true");
    // Otherwise, we are not sure a new paragraph will be opened!
    addedText=1;
  }
}

/* Closes block `nblock` by unstacking saved states until the state whose
   block name is `nblock` has been popped; lower level blocks left open are
   closed on the way.
   Returns BAD - leaving the stack as it is at that point - when the current
   block level is above `lblock` or when the stack runs empty; CORRECT
   otherwise. */
CHECK_CODE documentHTML::closetag(const string &nblock,typeblock lblock) {
  for(;;) {
    if (lblock<current.block_level)
      return BAD;

    const bool matched=(nblock==current.block_name);

    if (!psave)
      return BAD;

    /* Unstack one state */
    SAVE *top=psave;
    current=*top;
    SAVE *below=top->pNext;
    delete top;
    psave=below;

    if (matched)
      return CORRECT;
  }
}

void documentHTML::close() {
  FlushText();
  FlushTag();
  //FlushTs();
  while(psave) {
    SAVE *next=psave->pNext;
    delete psave;
    psave=next;
  }
}

/* Kind of construct recognised by readelement(): an opening tag, a markup
   declaration or processing instruction ("<!...>" / "<?...>"), a comment
   ("<!-- ... -->") or a closing tag ("</...>"). */
enum etype { _htmltag,_htmlmarkup_declaration,_htmlcomment,_htmlclose };

/* Reads the markup construct that starts at *Buf, which must be a '<'.

   Buf         - in: start of the construct; out: first character after it
   e           - out: kind of construct read
   name        - out (appended to): upper-cased element name, or the whole
                 comment text ("<!--" and "-->" included) for a comment
   la          - out (appended to): attributes, in order of appearance
   xhtml_slash - set to true when an opening tag ends with "/>"

   Returns ecOK when a construct was consumed (malformed ones are reported
   on stderr and consumed as well as possible), ecHTML_INTERNALERROR when
   *Buf is not '<', and 0 when the text must not be handled as an element:
   - comment, declaration or opening tag met while in skip mode (Buf is then
     NOT restored: the caller is expected to do it);
   - a "\>" sequence met inside the tag (Buf is restored). */
static int readelement(char *&Buf,etype &e,string &name,std::list<attr> &la,bool &xhtml_slash) {
  char *oldBuf=Buf;
  if (*Buf!='<')
    return ecHTML_INTERNALERROR;
  Buf++;
  e=_htmltag;

  /* It is a comment - just skip everything
     output: e=comment, and name is the complete comment
     (includes <!-- -->) */
  if (*Buf=='!') {
    /* if we are in skip mode, we are not supposed to look at comment */
    if (pCurrentDocH->current.his==hisSkip) return 0;
    if (strncmp(Buf,"!--",3)==0) {
      char *endcomment=strstr(Buf,"-->");
      if (!endcomment) {
	fprintf(stderr,"Error parsing html ecHTML_CLOSECOMMENT\n");
	/* We extend the comment to the end of the buffer */
	name+=Buf-1;
	Buf+=strlen(Buf);
      } else {
        /* Terminate the buffer right after "-->" for the time of the copy */
        char nextc=endcomment[3];
        endcomment[3]=0;
        name+=Buf-1;
        endcomment[3]=nextc;
        Buf=endcomment+3;
      }

      e=_htmlcomment;
      return ecOK;
    }
  }

  /* It is a closing tag */
  if (*Buf=='/') { e=_htmlclose;Buf++; }
  else if (*Buf=='!' || *Buf=='?') {
    /* if we are in skip mode, we are not supposed to look at any meta */
    if (pCurrentDocH->current.his==hisSkip) return 0;
    /* `block` holds the quote character we are inside of, if any: a '>'
       between quotes does not close the declaration */
    char block=0;
    e=_htmlmarkup_declaration;
    while(*Buf && (block!=0 || *Buf!='>')) {
      if (block && *Buf==block) block=0;
      else if (!block && strchr("\"'",*Buf)) block=*Buf;
      Buf++;
    }
    if (*Buf=='>') {
      Buf++;
      return ecOK; }
    fprintf(stderr,"Error parsing html ecHTML elementclose\n");
    /* We do as if we had just seen the '>' as the last character */
    return ecOK;
  }

  if (pCurrentDocH->current.his==hisSkip && e!=_htmlclose) return 0;

  /* looking for element name
     (characters go through unsigned char before any <ctype.h> call: a
     negative char value is not a valid argument for these functions) */
  while(isalpha((unsigned char)*Buf) || isdigit((unsigned char)*Buf) ||
        (*Buf && strchr("-:_",*Buf)))
    name.append(1,(char)toupper((unsigned char)*(Buf++)));

  while(*Buf && *Buf!='>' && (*Buf!='/' || *(Buf+1)!='>')) {
    if (*Buf=='\\' && Buf[1]=='>')
      { Buf=oldBuf;return 0; }
    // accept cols="40"wrap="virtual"
    if (!isspace((unsigned char)*Buf) && Buf[-1]!='\"' && Buf[-1]!='\'' &&
        !isspace((unsigned char)Buf[-1])) {
      fprintf(stderr,"ERROR_PARSING_HTML,ecHTML_TAGNAME");
      while(*Buf && *Buf!='>')
	++Buf;
      break;
    }

    while(isspace((unsigned char)*Buf)) Buf++;

    if (isalpha((unsigned char)*Buf) || *Buf=='-' || *Buf==':' || (*Buf>='0' && *Buf<='9')) {
      /* looking for a=b */
      attr a;
      while(isalpha((unsigned char)*Buf) || (*Buf && strchr("-:",*Buf)) || (*Buf>='0' && *Buf<='9'))
	a.name.append(1,*(Buf++));
      while(isspace((unsigned char)*Buf)) Buf++;
      /* accept a only: the value is then left empty */
      if (*Buf=='=') {
	++Buf;
	while(isspace((unsigned char)*Buf)) Buf++;
	/* a='...' a="..." */
	if (*Buf=='\'' || *Buf=='"') {
	  char cclose=*(Buf++);
	  while(*Buf && *Buf!=cclose)
	    {
	      a.value.append(1,(char)*(Buf++));
	    }
	  if (!*Buf)
	    {
	      /* unterminated value: the attribute is not recorded */
	      fprintf(stderr,"ERROR_PARSING_HTML,ecHTML_ATTVALUECLOSE");
	      break;
	    }
	  Buf++;
	}
	else {
	  while(*Buf && !isspace((unsigned char)*Buf) && *Buf!='>')
	    a.value.append(1,(char)*(Buf++));
	}
      }
      la.push_back(a);
    }
    else break;
  }
  if (*Buf=='>') { Buf++; return ecOK; }

  if (*Buf=='/' && *(Buf+1)=='>' && e==_htmltag) {
    xhtml_slash=true;
    Buf+=2;
    return ecOK;
  }

  fprintf(stderr,"ERROR_PARSING_HTML,ecHTML_ELEMENTCLOSE\n");

  /* We try to catch the end of the tag... */
  if(*Buf)
    while(*Buf && *Buf!='>')
      ++Buf;

  if(*Buf)
    ++Buf;

  return ecOK;
}

/* Returns a copy of `s` in which the ASCII letters 'A'..'Z' are lower-cased;
   every other character is left as it is. */
static std::string lc(const std::string &s) {
  std::string lowered(s);

  for(std::string::iterator it=lowered.begin();it!=lowered.end();++it)
    if (*it>='A' && *it<='Z')
      *it=*it-'A'+'a';

  return lowered;
}

/* Serializes an element back to markup.
   A comment gives "<!-- name -->". Anything else gives '<' (or "</" for a
   closing tag), the lower-cased name, the attributes and '>' (" />" when
   `xhtml_slash` is set). An attribute whose value is "<e>" is written
   without value; other values are double-quoted, or single-quoted when they
   contain a double quote. */
string DumpTag(etype &e,string &name,std::list<attr> &la,bool xhtml_slash=false) {
  if (e==_htmlcomment) return "<!-- "+name+" -->";

  string out(e==_htmlclose?"</":"<");
  out+=lc(name);

  for(std::list<attr>::const_iterator it=la.begin();it!=la.end();++it) {
    out+=" ";
    out+=it->name;

    if (it->value=="<e>")
      continue;

    const char quote=(it->value.find("\"",0)==string::npos)?'"':'\'';
    out+="=";
    out+=quote;
    out+=it->value;
    out+=quote;
  }

  if(xhtml_slash)
    out+=" /";
  out+=">";
  return out;
}

/*void documentHTML::AddTs(const htmlchp &chp) {
  FlushText();
  FlushTag();

  current_ts=chp;
  new_ts=true;
  }*/

/* Stores `value` as a sub-flow: a <par subflowid="sfi_r_N"> (raw) or
   <par subflowid="sfi_n_N"> element inserted just before the current
   paragraph.
   Returns the placeholder "$sfi_..." that designates the sub-flow, or an
   empty string when there is no current paragraph. */
string documentHTML::AddSubFlow(string value,bool raw) {
  if (!par)
    return "";

  ConvertText(value);

  xmlNodePtr sfpar=xmlNewNode(NULL,(const xmlChar *)"par");
  xmlAddPrevSibling(par,sfpar);
  xmlNodeAddContent(sfpar,(const xmlChar *)value.c_str());

  string sfi="sfi_";
  sfi+=raw?"r_":"n_";
  sfi+=Int2String(++subflowid);
  xmlSetProp(sfpar,(const xmlChar *)"subflowid",(const xmlChar*)sfi.c_str());

  return "$"+sfi;
}

/*void documentHTML::FlushTs() {
  if(par && new_ts) {
    current_ts.toXml(par);
  
    if(tshasspace) current_text+=" ";

    tshasspace=0;
    new_ts=false;
  }
  }*/

std::string PropertyValue(const std::string &tag_name,
                          const std::list<attr> &la,
			  std::string &property) {
  if(!strcasecmp(tag_name.c_str(),"u"))
    return "continuous";
  else if(!strcasecmp(tag_name.c_str(),"font")) {
    std::list<attr>::const_iterator it;
    for(it=la.begin();it!=la.end() && strcasecmp(it->name.c_str(),"face");++it) 
      ;
    if(it!=la.end()) {
      property="font_id";
      std::string value=it->value;
      std::string::size_type k=0;
      
      while((k=value.find(' ',k))!=std::string::npos)
	value.erase(k,1);

      k=0;
      
      while((k=value.find(',',k))!=std::string::npos)
	value.replace(k,1,";");
      
      return value;
    }
  } else if(!strcasecmp(tag_name.c_str(),"span")) {
    std::list<attr>::const_iterator it;
    for(it=la.begin();it!=la.end() && strcasecmp(it->name.c_str(),"style");++it)
      ;
    std::string::size_type debut=0;
    if(it!=la.end() && (debut=std::string(SetLowerCase(it->value))
                              .find("font-family:"))!=std::string::npos) {
      if(debut>0 && it->value[debut-1]!=' ' && it->value[debut-1]!=';')
	;
      else {
	debut=it->value.find(":",debut)+1;
	while(it->value[debut]==' ')
	  ++debut;
	char closing=0;
	if(it->value[debut]=='\'' || it->value[debut]=='"') {
	  closing=it->value[debut];
	  ++debut;
	}
	std::string::size_type end;
	if(closing)
	  end=it->value.find(closing,debut);
	else
	  end=it->value.find(";",debut);
	property="font_id";
	return it->value.substr(debut,end-debut);
      }
    }
  }
  return "1";
}

/* Parses the markup construct starting at *Buf and records it in the
   current document (pCurrentDocH).

   Buf   - in: position of the '<'; out: first character after the
           construct. Restored to its input value when readelement() does
           not return ecOK.
   state - not referenced in this body.

   Returns the code given by readelement() when it is not ecOK, 1 otherwise.

   Outline:
   - comments and markup declarations are stored verbatim as tags;
   - opening tags: <meta> charset detection, systran / mso <span> handling,
     extraction of sub-flows (alt, title, prompt, ...), then the tag is
     emitted and, unless it is a NOCLOSE tag, a state is pushed;
   - closing tags: the matching state(s) are unstacked through closetag(). */
static int parseelement(char *&Buf,HIS &state) {

  //fprintf(stdout,"  parseelement(%s)\n",Buf);

  string element_name;
  etype e;
  std::list<attr> la;
  bool xhtml_slash=false;
  documentHTML::restriction restricted=documentHTML::_both;
  int err;
  char *refBuf=Buf; // In case there is a problem - we re-set Buf
  if ((err=readelement(Buf,e,element_name,la,xhtml_slash))!=ecOK) { Buf=refBuf;return err; }



  //fprintf(stdout,"  readelement => element_name=%s\n",element_name.c_str());
  //fprintf(stdout,"  readelement => Buf=%s\n",Buf);

  // tagref = the exact source text of the construct, [refBuf,Buf): the
  // buffer is NUL-terminated at Buf for the time of the copy.
  string tagref;
  char refc=*Buf;*Buf=0;tagref=refBuf;*Buf=refc;
  
  switch (e) {
  case _htmlcomment:
  case _htmlmarkup_declaration:
    pCurrentDocH->AddTag(tagref,false);
    break;
  case _htmltag:
    {

      /* Check for meta charset tag */

      // 1.<meta http-equiv="Content-Type" content="text/html;
      // charset=iso-8859-1">
      // (most common)
      //
      // 2.<meta http-equiv="charset" content="iso-8859-1">
      // (less common)
      //
      // 3.<meta charset="iso-8859-1">
      // (Internet Explorer only, not recommended)

      // meta recognition
      if (strcasecmp(element_name.c_str(),"meta")==0) {
	string content;
	int incontent=0;
	for(std::list<attr>::const_iterator a=la.begin();
	    a!=la.end();
	    ++a) {
	  if (strcasecmp(a->name.c_str(),"content")==0) content=a->value;
	  else if (strcasecmp(a->name.c_str(),"http-equiv")==0)
	    {
	      if (strcasecmp(a->value.c_str(),"content-type")==0 ||
		  strcasecmp(a->value.c_str(),"charset")==0) incontent=1;
	    }
	  else if (strcasecmp(a->name.c_str(),"charset")==0) {
	    content=a->value;
	    incontent=1;
	    break;
	  }
	}
	if (incontent) {
	  // Extract the charset name: what follows the first ';' if any,
	  // with an optional "charset=" prefix, up to the next white space.
	  string charset;
	  const char *p;
	  if ((p=strchr(content.c_str(),';'))!=NULL) {
	    p++;
	  }
	  else p=content.c_str();
	  while(isspace(*p)) p++;
	  if (strncasecmp(p,"charset",7)==0) {
	    while(*p && *p!='=') p++;
	    if (*p) p++;
	  }
	  while(*p && !isspace(*p)) charset.append(1,*(p++));

	  if (charset.length()) {
#ifndef LANG_SRC_RU
	      /* for russian, do not accept the internal meta tag in html
		 because, we are pretty sure of the charset we are working
		 with, thanks to check_ru and MapIso functions - most of the
		 time the charset indicated in RU HTML is erroneous */
	    if(!pCurrentDocH->current_conv
#if defined(APP_LanguageID) || defined(APP_SysTools)
	       && charset.find("1251",0)==string::npos
#endif
	       ) {
	      /* Found a charset ! We switch to this encoding */
	      DefaultCharsetConvertor *conv=new DefaultCharsetConvertor(charset,"UTF-8");
	      if (conv->IsOk()) {
		// we need to convert all text already in tree!
		// (the converter is handed over to the document here; it is
		// not released in this function)
		pCurrentDocH->current_conv=conv;
		ConvertDoc(pCurrentDocH->getRoot(),*conv);
		//OptSetValue("SOURCE_CHARSET",charset.c_str());
		/*if(!OptTrue("USER_TARGET_CHARSET") && 
		    IsAdmissibleCharsetForLang(charset.c_str(),LANG_TGT))
		    OptSetValue("TARGET_CHARSET",charset.c_str());*/
		restricted=documentHTML::_source;
	      } else
		delete conv;
	    } else
#endif
	      restricted=documentHTML::_source;
	  }
	}
      } // meta recognition

      // span recognition
      if (pCurrentDocH->current.his!=hisSkip && !strcasecmp(element_name.c_str(),"span")) {
	/* look if it is some systran span tags - in that case we remove it from html */
	for(std::list<attr>::const_iterator a=la.begin();
	    a!=la.end();
	    ++a) {
	  /* looks for systran tags */
	  if (strcasecmp(a->name.c_str(),"class")==0) {
	    if (strncasecmp(a->value.c_str(),"systran_",8)==0) {
	      /* we found it, push the tag */
	      pCurrentDocH->push_state();
	      pCurrentDocH->current.block_name="SPAN";
	      pCurrentDocH->current.block_level=TYPESET;
	      pCurrentDocH->current.bmarkid=pCurrentDocH->AddBmark(la);
	      return 1;
	    }
	  }

	  /* looks for mso tags - there is a style='...;mso-special-format:...;...' */
	  const char *msoV=msoValue(*a,"special-format");
	  if (msoV && strcmp(msoV,"bullet")==0) {
	    /* push it as a tag */
	    pCurrentDocH->AddTag(DumpTag(e,element_name,la),false,restricted);
	    pCurrentDocH->push_state();
	    pCurrentDocH->current.block_name="SPAN";
	    pCurrentDocH->current.block_level=MSO;
	    pCurrentDocH->current.his=hisSkip;
	    return 1;
	  }
	}
      } // span recognition

      /* Look for the sub-flows: alt,prompt,title params */
      {
	for(std::list<attr>::iterator a=la.begin();
	    a!=la.end();
	    ++a) {
	  if (strcasecmp(a->name.c_str(),"alt")==0 || 
	      strcasecmp(a->name.c_str(),"title")==0 || 
	      strcasecmp(a->name.c_str(),"prompt")==0 ||
	      (strcasecmp(a->name.c_str(),"string")==0 &&
	       strcasecmp(element_name.c_str(),"v:textpath")==0) ||
	      (strcasecmp(a->name.c_str(),"value")==0 &&
	       strcasecmp(element_name.c_str(),"input")==0)
	      ) {
	    bool ok=true;
	    
	    // The value of an <input> is a sub-flow only for type="submit".
	    if(strcasecmp(a->name.c_str(),"value")==0 && strcasecmp(element_name.c_str(),"input")==0) {
	      ok=false;
	      for(std::list<attr>::iterator b=la.begin();b!=la.end();++b)
		if(!strcasecmp(b->name.c_str(),"type") && !strcasecmp(b->value.c_str(),"submit")) {
		  ok=true;
		  break;
		}
	    }
		
	    /* It is a sub-flow: the attribute value is replaced by the
	       placeholder returned by AddSubFlow */
	    if(ok)
	      a->value=pCurrentDocH->AddSubFlow(a->value,true);
	  }
	}
      }

      // Unknown elements are handled as single tags that are never closed.
      tagdef t;
      if (!findTag(element_name.c_str(),t))
	{
	  t.nametag=element_name.c_str();
	  t.ttag=SINGLETAG;
	  t.segmentingtag=0;
	  t.leveltag=NOCLOSE;
	  t.property=0;
	}

      // Disabled option (the condition is the constant 0).
      if(
	 /*(OptTrue("DONOTTRANSLATE_THREAD") && !strcmp(t.nametag,"BLOCKQUOTE"))
		      || (OptTrue("DONOTTRANSLATE_PRE") && !strcmp(t.nametag,"PRE"))
	 */
		      0) {
	t.ttag=SKIPTAG;
	t.leveltag=TOPLEVEL;
      }
#ifdef DEBUG_FILTER
      cerr<<"[==>"<<element_name<<"/"<<t.ttag<<"]"<<endl;
#endif
      if (pCurrentDocH->current.his!=hisSkip) {
	// NOTE(review): && binds tighter than ||, so this reads
	// (!t.property && A) || B: an existing property is overwritten with
	// "generic" for SKIPTAG/TYPESET tags - confirm that this is intended.
	if(!t.property && (t.ttag==TYPESETTAG && t.leveltag==TYPESET) ||
	                  (t.ttag==SKIPTAG && t.leveltag==TYPESET))
	  t.property="generic";
	
        // Empty anchor: either <a ... /> or <a ...></a>; in the second case
        // the closing tag is consumed here as well.
        if(!strcmp(t.nametag,"A") && (xhtml_slash ||
                                      !strncasecmp(Buf,"</a>",4)))
        {
          if(!xhtml_slash) {
            xhtml_slash=true;
            Buf+=4;
          }
          
          // Empty A tags are real tags, not ts, if there is a name attribute
          std::list<attr>::const_iterator it;
          for(it=la.begin();
              it!=la.end() && strcasecmp(it->name.c_str(),"name");
              ++it)
            ;
          if(it!=la.end())
            t.property=0;
        }
        
	if(!t.property) {
	  if(t.nametag==pCurrentDocH->current.block_name &&
	     t.leveltag==TYPESET)
	    // <p>bla<p>bli: close 1st <p>
	    // <div>bla<div>bli: don't close 1st <div>
	    pCurrentDocH->closetag(element_name,t.leveltag);

#ifdef LANG_TGT_AR
	  /* If we have a toplevel tag, then add dir=rtl for ENAR */
	  if (element_name=="BODY" || element_name=="HTML" || element_name=="HEAD") {
	    pCurrentDocH->AddTag(DumpTag(e,element_name,la,xhtml_slash),false,documentHTML::_source,t.ttag!=INSETTAG && t.ttag!=SKIPTAG);
	    bool existdir=false;
	    string previousdirvalue;
	    std::list<attr>::iterator a;
	    for(a=la.begin();
		a!=la.end();
		++a) {
	      if (strcasecmp(a->name.c_str(),"dir")==0) {
		previousdirvalue=a->value;
		a->value="rtl";
		existdir=true;
		break;
	      }
	    }
	    if (!existdir) {
	      attr sa;
	      sa.name="dir";
	      sa.value="rtl";
	      la.push_back(sa);
	    }
	    pCurrentDocH->AddTag(DumpTag(e,element_name,la,xhtml_slash),false,documentHTML::_target,t.ttag!=INSETTAG && t.ttag!=SKIPTAG);
	    if (existdir) {
	      a->value=previousdirvalue;
	    } else la.pop_back();
	  } else
#endif
	    pCurrentDocH->AddTag(DumpTag(e,element_name,la,xhtml_slash),false,restricted,t.ttag!=INSETTAG && t.ttag!=SKIPTAG);
	}

	if (t.leveltag!=NOCLOSE) {
	  pCurrentDocH->push_state();
	  pCurrentDocH->current.block_name=t.nametag;
	  pCurrentDocH->current.block_level=t.leveltag;
	  if(pCurrentDocH->current.block_name=="PRE")
	    pCurrentDocH->current.his=hisPre;
	  
	  // NOTE(review): lang_spec is set below but never read in this body.
	  bool lang_spec=false;

	  /* Is there some language specification? */
	  {
	    for(std::list<attr>::iterator a=la.begin();
		a!=la.end();
		++a) {
	      const char *msov=msoValue(*a,"ansi-language");
	      if (msov || strcasecmp(a->name.c_str(),"lang")==0 || strcasecmp(a->name.c_str(),"xml:lang")==0) {
		/* check if it is a valid one */
		// NOTE(review): lang and loca are only used by the disabled
		// AnalyseLanguage call.
		char lang[3];
		char loca[3];
		//if (AnalyseLanguage((msov?msov:a->value.c_str()),lang,loca)) 
		//{
		    //print_debug("filter:html",5,"setting language %s_%s\n",lang,loca);
		    //pCurrentDocH->current.chp.set("lang",lang);
		    lang_spec=true;
		    //}
		if (msov) {
		  // Rebuild the style value without its mso-ansi-language
		  // declaration.
		  std::string result;
		  std::string::size_type pos=0;

		  do {
		    std::string::size_type refpos=pos;
		    pos=a->value.find(";",pos);
		    std::string element=a->value.substr(
			refpos,
			(pos==std::string::npos)?pos:pos-refpos);
		    if(strncmp(element.c_str(),"mso-ansi-language",17)) {
		      if(!result.empty())
			result+=";";
		      result+=element;
		    }
		    if(pos!=std::string::npos)
		      ++pos;
		  } while(pos!=std::string::npos);

		  a->value=result;
		} else
		  // The lang attribute is removed; the iterator is not used
		  // again since the loop is left right after.
    		  la.erase(a);
		break;
	      }
	    }
	  }
	    
          if(t.property && !xhtml_slash) {
	    // NOTE(review): with the chp / AddTs calls disabled, the values
	    // computed in this branch are not stored anywhere.
	    std::string tag=DumpTag(e,element_name,la);
	    std::string property=t.property;
	    
	    std::string value=PropertyValue(element_name,la,property);
	    
	    std::string::size_type percent=0;

	    while((percent=tag.find('%',percent))!=std::string::npos) {
	      tag.replace(percent,1,"&#037;");
	    }
	  
	    ConvertText(tag);
	    
	    std::string ending_tag=
	      "</"+tag.substr(
		  1,
		  tag.find_first_of(" \t\r\n>",tag.find_first_not_of(" \t\r\n"))
		    -1)
	      +">";
	    //pCurrentDocH->current.chp.set(property,value,tag,ending_tag);

	    const char *dnt_font=NULL; //OptGetValue("DNT_FONT");
	    
	    if(property=="font_id" && dnt_font && *dnt_font) {
	      while(*dnt_font) {
		const char *comma=dnt_font;
		while(*comma && *comma!=',')
		  ++comma;
		if(comma!=dnt_font &&
		   !strncasecmp(dnt_font,value.c_str(),comma-dnt_font)) {
		  //pCurrentDocH->current.chp.set("lang","XX");
		}
		dnt_font=*comma?comma+1:comma;
	      }
	    }

	    //pCurrentDocH->AddTs(pCurrentDocH->current.chp);
	  }
	  else {
	    //if (lang_spec) // set Ts for language changes
	    //pCurrentDocH->AddTs(pCurrentDocH->current.chp);
	  }

	  if (t.ttag==SKIPTAG && t.leveltag==TOPLEVEL && !xhtml_slash)
	    pCurrentDocH->current.his=hisSkip;
	}
	if (t.segmentingtag) pCurrentDocH->newPar();
	
	if(t.ttag==NOTYPESETTAG && t.leveltag!=NOCLOSE)
	  pCurrentDocH->setNoTypesetPar();
      } else {
	// Skip mode: the tag is kept verbatim.
	pCurrentDocH->AddTag(tagref,false);
      }
    }
    break;
  case _htmlclose:
    {
      /* check if we are closing a span systran_mark tag */
      if (pCurrentDocH->current.his!=hisSkip && !strcasecmp(element_name.c_str(),"span")) {
	if (pCurrentDocH->current.bmarkid.length()) {
	  pCurrentDocH->AddEmark(pCurrentDocH->current.bmarkid);
	  pCurrentDocH->closetag(element_name,TYPESET);
	  return 1;
	}
	/* just a regular span */
      }
      tagdef t;
      if (!findTag(element_name.c_str(),t))
	{
	  fprintf(stderr,"[filter:html] unknown tag %s\n",element_name.c_str());
	  t.nametag=element_name.c_str();
	  t.ttag=SINGLETAG;
	  t.segmentingtag=0;
	  t.leveltag=NOCLOSE;
	  t.property=0;
	}
      // Disabled option (the condition is the constant 0).
      if(
	 /*(OptTrue("DONOTTRANSLATE_THREAD") && !strcmp(t.nametag,"BLOCKQUOTE"))
		      || (OptTrue("DONOTTRANSLATE_PRE") && !strcmp(t.nametag,"PRE"))
	 */
		     0 ) {
	t.ttag=SKIPTAG;
	t.leveltag=TOPLEVEL;
        } 
   	// In skip mode, only the closing tag of the skipped block is handled;
   	// any other closing tag is kept verbatim.
   	if (pCurrentDocH->current.his==hisSkip && element_name!=pCurrentDocH->current.block_name)
	{
	  pCurrentDocH->AddTag(tagref,false);
	}
      else {
	if(t.property || (((t.ttag==TYPESETTAG && t.leveltag==TYPESET) || (t.ttag==SKIPTAG && t.leveltag==TYPESET)) && pCurrentDocH->current.block_level!=MSO))
  	  pCurrentDocH->closetag(element_name,t.leveltag);
        else {
	  /* upgrade mso skip span tags */ 
	  if (pCurrentDocH->current.block_level==MSO) t.leveltag=MSO;
  	  pCurrentDocH->AddTag(DumpTag(e,element_name,la),false,documentHTML::_both,t.ttag!=INSETTAG && t.ttag!=SKIPTAG);
	  pCurrentDocH->closetag(element_name,t.leveltag);
	}
	if (t.segmentingtag) pCurrentDocH->newPar();
      }
      break;
    }
  } // switch end
  return 1;
}

int ecHtmlParse(const char *s,DocumentPtr D) {

  //fprintf(stdout,"  ecHtmlParse(%s)\n",s);

  char *Buf=strdup(s);
  char *refBuf=Buf;
  int err;
  HIS state=hisNorm;

  if (Buf[0]==0xef && Buf[1]==0xbb && Buf[2]==0xbf) // utf-8 signature
    Buf+=3;

  long Buflen=strlen(Buf);
  
  while(*Buf) {
#ifdef DEBUG_FILTER
    char A[11];
    strncpy(A,Buf,10);
    cerr<<"state="<<pCurrentDocH->current.his<<"---"<<A<<"..."<<endl;
#endif
    char cBuf[2];
    cBuf[1]=0;
    cBuf[0]=*Buf;
    if (*Buf=='<') {
      //if(!setstatus(Buflen,Buf-refBuf,XMLFLOW_STEP_PREFILTER_HTML)) return ecABORTED;
    
      if ((err=parseelement(Buf,state))!=ecOK && pCurrentDocH->current.his!=hisSkip)
	return err;
      if (err!=ecOK) 
	{ pCurrentDocH->AddTag(cBuf,false); }
      else { // successfully passed one element
	continue;
      }
    }
    else if (strchr(" \t\n\r",*Buf)) {
      std::string ws;
      while(*Buf && strchr(" \t\n\r",*Buf))
	ws+=std::string(Buf++,1);
      Buf--;
      if(pCurrentDocH->current.his!=hisPre || ws==" " /*||
	  (OptGetValue("formatting_spaces")!=std::string("keep") && 
	  OptGetValue("formatting_spaces")!=std::string("segmenting"))*/) {
	pCurrentDocH->AddSpace(ws);
      } else {
        pCurrentDocH->AddTag(ws,true);
      }
    }
    else {
      switch (pCurrentDocH->current.his) {
      case hisNorm:
      case hisPre:
	{

	  //fprintf(stdout,"AddText, cBuf=%s\n",cBuf);
	  pCurrentDocH->AddText(cBuf,"EN"/*pCurrentDocH->current.chp.getuniq("lang")*/);
	  break;
	}
      case hisSkip:
	{
	  pCurrentDocH->AddTag(cBuf,false);
	  break;
	}
      }
    }
    Buf++;
  }

  pCurrentDocH->FlushText();
  pCurrentDocH->FlushTag();
  //pCurrentDocH->FlushTs();

  free(refBuf);

  if(!pCurrentDocH->current_conv && !D->mimeencoding.empty() && D->mimeencoding!="UTF-8") {
    DefaultCharsetConvertor conv(D->mimeencoding,"UTF-8");
    if(conv.IsOk())
      ConvertDoc(pCurrentDocH->getRoot(),conv);
  }
  
  pCurrentDocH->close();

  /* We could not do it before, since thier translation must not be reencoded */  
  ReplaceEntities(pCurrentDocH->getRoot());
  
  /* The charset might have changed during the html analysis */
  //D->mimeencoding=OptGetValue("SOURCE_CHARSET");

  int j=0;
  map<string,int>::const_iterator it;
  for(it=pCurrentDocH->MapLang.begin();it!=pCurrentDocH->MapLang.end();j++,it++)
  {
    Document::lm l;
    l.language=it->first;
    l.ratio=pCurrentDocH->countc?(100*it->second/(pCurrentDocH->countc)):0;
    //print_debug("filter:html",1,"language map [%s]\t%d\n",l.language.c_str(),l.ratio);
    D->langmap.push_back(l);
  }

  return ecOK;
}
