%{

#include "Tokenizer.h"

/* Metacharacter bytes reserved for representing tags inside UTF-8 text. */
/* None of the three values occurs in a well-formed (RFC 3629) UTF-8 sequence.
   NOTE(review): the UTF8-6 definition in this file still accepts 0xFD as a
   lead byte, and the initialisers do not fit a signed char - confirm both
   are intended. */
static const char CHARACTER_TAGSEG=0xfd;
static const char CHARACTER_TAG=0xfe;
static const char CHARACTER_TAGSPACE=0xff;

/* True while the last consumed match was a separator (or nothing has been
   consumed yet). Set by the separator rule and the parser constructor,
   cleared by MVCP; several rules REJECT when it is false. */
static bool space_before=true;

/* Bookkeeping executed at the start of most actions: advance currentpos by
   the length of the current match and record that it was not a separator. */
#define MVCP { /* printf("%d\n",__LINE__); */ currentpos+=yyleng; space_before=false; }

%}

  /* Generate a C++ scanner class using the "Gtok" prefix, without yywrap and
     without interactive buffering. */
%option c++ noyywrap never-interactive PREFIX="Gtok"

  /* Exclusive start conditions: one for Arabic-script numbers and one per
     word alphabet. Because they are exclusive (%x), a rule written without a
     start condition is only active in INITIAL. */
%x numericARABIC wordARABIC wordCYRILLIC wordLATIN wordGREEK wordUNDEF

  /* ASCII Latin letters.
     NOTE(review): neither LATINLETTER nor UKRAINIANLETTER is referenced by
     the rules visible in this file. */
LATINLETTER [a-zA-Z]

  /* Cyrillic letters, each written as a quoted literal; besides the
     Ukrainian-specific letters the list also contains the basic
     U+0410-U+044F range and a few others such as Ј and Џ. */
UKRAINIANLETTER ("Ё"|"Є"|"І"|"Ї"|"Ј"|"Џ"|"А"|"Б"|"В"|"Г"|"Д"|"Е"|"Ж"|"З"|"И"|"Й"|"К"|"Л"|"М"|"Н"|"О"|"П"|"Р"|"С"|"Т"|"У"|"Ф"|"Х"|"Ц"|"Ч"|"Ш"|"Щ"|"Ъ"|"Ы"|"Ь"|"Э"|"Ю"|"Я"|"а"|"б"|"в"|"г"|"д"|"е"|"ж"|"з"|"и"|"й"|"к"|"л"|"м"|"н"|"о"|"п"|"р"|"с"|"т"|"у"|"ф"|"х"|"ц"|"ч"|"ш"|"щ"|"ъ"|"ы"|"ь"|"э"|"ю"|"я"|"ё"|"є"|"і"|"ї"|"ј"|"Ґ"|"ґ")

  /* ARABICFINAL / ARABICINITIAL: each alternative is one literal byte
     (0xFB, 0xFC, 0xFD or 0xFE) followed by one byte taken from the bracketed
     class. They are only used by the <wordARABIC> trailing-context rule.
     NOTE(review): the byte pairs look like the high and low bytes of code
     points in the Arabic Presentation Forms blocks (U+FBxx-U+FExx), not like
     UTF-8 encodings; in UTF-8 those code points are three-byte sequences
     starting with 0xEF, so these patterns presumably never match UTF-8
     input - confirm. */
ARABICFINAL ("\xFB"[\x51\x53\x57\x5B\x5F\x63\x67\x6B\x6F\x73\x77\x7B\x7F\x83\x85\x87\x89\x8B\x8D\x8F\x93\x97\x9B\x9F\xA1\xA5\xA7\xAB\xAF\xB1\xD4\xD8\xDA\xDC\xDF\xE1\xE3\xE5\xEB\xED\xEF\xF1\xF3\xF5\xF7\xFA\xFD]|"\xFC"[\x64-\x96]|"\xFD"[\x11-\x2C\x3C\x51\x58\x5A\x5B\x5E\x5F\x62\x64\x66\x67\x69\x6A\x6C\x6E\x6F\x71\x74-\x85\x87\x8B\x96\x97\x99\x9A\x9B\x9C\x9E-\xB3\xB6\xB7\xB9\xBB\xBC\xBD\xBE\xBF\xC0\xC1\xC2\xC6\xC7]|"\xFE"[\x82\x84\x86\x88\x8A\x8E\x90\x94\x96\x9A\x9E\xA2\xA6\xAA\xAC\xAE\xB0\xB2\xB6\xBA\xBE\xC2\xC6\xCA\xCE\xD2\xD6\xDA\xDE\xE2\xE6\xEA\xEE\xF0\xF2\xF6\xF8\xFA\xFC])

ARABICINITIAL ("\xFB"[\x54\x58\x5C\x60\x64\x68\x6C\x70\x74\x78\x7C\x80\x90\x94\x98\x9C\xA2\xA8\xAC\xD5\xE6\xE8\xF8\xFB\xFE]|"\xFC"[\x97-\xDE]|"\xFD"[\x2D-\x33\x50\x52\x53\x54\x55\x56\x57\x59\x5C\x5D\x60\x61\x63\x65\x68\x6B\x6D\x70\x72\x73\x77\x7D\x83\x86\x88\x89\x8A\x8C\x8D\x8E\x8F\x92\x93\x94\x95\x98\x9D\xB4\xB5\xB8\xBA\xC3\xC4\xC5]|"\xFE"[\x8B\x91\x97\x9B\x9F\xA3\xA7\xB3\xB7\xBB\xBF\xC3\xC7\xCB\xCF\xD3\xD7\xDB\xDF\xE3\xE7\xEB\xF3])

  /* Fullwidth digits U+FF10-U+FF19; the numeric rule accepts them next to
     ASCII digits. */
FULLWIDTHDIGIT ("０"|"１"|"２"|"３"|"４"|"５"|"６"|"７"|"８"|"９")

  /* Regular (ASCII) punctuation marks, emitted unchanged. */
PUNCTUATION  [.;:!?,]

  /* One UTF-8 encoded character, classified by its lead byte only.
     The definition is deliberately permissive: lead bytes 0xC0/0xC1 and the
     historical 5- and 6-byte forms are accepted. NUL and the bytes 0xFE and
     0xFF are not part of any alternative and end up in the "illegal
     character" rule. */
UTF8-2      [\xC0-\xDF][\x80-\xBF]
UTF8-3      [\xE0-\xEF][\x80-\xBF][\x80-\xBF]
UTF8-4      [\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF]
UTF8-5      [\xF8-\xFB][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]
UTF8-6      [\xFC-\xFD][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]
UTF8        [\x01-\x7f]|{UTF8-2}|{UTF8-3}|{UTF8-4}|{UTF8-5}|{UTF8-6}

%%
  /* ---------------------------------------------------------------- */
  /* Punctuation                                                      */
  /* ---------------------------------------------------------------- */

  /* General Unicode punctuation */


  /* The goal here is to normalize "localized" punctuation; transfer will
     later re-localize it for the target language. By default (..en) no
     transfer is needed, because the target uses the normalized punctuation.
     The original punctuation is kept as the token source so that the source
     text can be regenerated.
     Each rule below passes the matched text, its normalized form and a
     general-category tag to token_punctuation.
  */

	/* 0x060C Arabic comma */
"،"	   MVCP;addtoken(new token_punctuation(yytext,",",gc_Po));
	/* 0x061F Arabic question mark */
"؟"	   MVCP;addtoken(new token_punctuation(yytext,"?",gc_Po));
	/* 0x061B Arabic semicolon */
"؛"	   MVCP;addtoken(new token_punctuation(yytext,";",gc_Po));
	/* 0x06D4 Arabic full stop */
"۔"	   MVCP;addtoken(new token_punctuation(yytext,".",gc_Po));

        /* 0x3001 IDEOGRAPHIC COMMA (rule disabled) */
      /* "、"        MVCP;addtoken(new token_punctuation(yytext,",",gc_Po)); */
        /* 0x3002 IDEOGRAPHIC FULL STOP (rule disabled) */
      /* "。"        MVCP;addtoken(new token_punctuation(yytext,".",gc_Po)); */

       /* Ukrainian apostrophe is part of the word, normalize to ' */

                       /* 201C  left double quotation mark */
"“"    MVCP;addtoken(new token_punctuation(yytext,"\"",gc_Pi));
                       /* 201D  right double quotation mark */
"”"    MVCP;addtoken(new token_punctuation(yytext,"\"",gc_Pf));
                       /* 2019  right single quotation mark */
"’"    MVCP;addtoken(new token_punctuation(yytext,"'",gc_Pf));
                       /* 2018  left single quotation mark, normalized to a backquote */
"‘"    MVCP;addtoken(new token_punctuation(yytext,"`",gc_Pi));

                       /* 2015 horizontal bar, ASCII hyphen-minus, FF0D fullwidth hyphen-minus */
                       /* NOTE(review): single dashes are tagged gc_Pc while the "--" sequence rule uses gc_Pd - confirm. */
("―"|"-"|"－")    MVCP;addtoken(new token_punctuation(yytext,"-",gc_Pc));

  /* 2013 en dash, 2014 em dash */
("–"|"—") {
  MVCP;
  addtoken(new token_punctuation(yytext,"-",gc_Pc));
}

                       /* 0x00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
"«"    MVCP;addtoken(new token_punctuation(yytext,"\"",gc_Pi));
                       /* 0x00BB RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
"»"    MVCP;addtoken(new token_punctuation(yytext,"\"",gc_Pf));
                       /* 0x300C LEFT CORNER BRACKET */
"「"    MVCP;addtoken(new token_punctuation(yytext,"\"",gc_Pi));
                       /* 0x300D RIGHT CORNER BRACKET */
"」"    MVCP;addtoken(new token_punctuation(yytext,"\"",gc_Pf));
                       /* 0x300E LEFT WHITE CORNER BRACKET */
"『"    MVCP;addtoken(new token_punctuation(yytext,"'",gc_Pi));
                       /* 0x300F RIGHT WHITE CORNER BRACKET */
"』"    MVCP;addtoken(new token_punctuation(yytext,"'",gc_Pf));
                       /* 0x3014 LEFT TORTOISE SHELL BRACKET */
"〔"    MVCP;addtoken(new token_punctuation(yytext,"(",gc_Ps));
                       /* 0x3015 RIGHT TORTOISE SHELL BRACKET */
"〕"    MVCP;addtoken(new token_punctuation(yytext,")",gc_Pe));

  /* Regular punctuation list: a single mark, source and normalized form identical */

{PUNCTUATION}      MVCP;addtoken(new token_punctuation(yytext,yytext));

  /* Sequences of two or more '-' or '*' are kept together as one token;
     being longer, they win over the single-character rules. */

"-""-"+     MVCP;addtoken(new token_punctuation(yytext,yytext,gc_Pd));
"*""*"+	MVCP;addtoken(new token_punctuation(yytext,yytext,gc_Po));

  /* Ellipsis: "[...]", "(...)", three or more dots, or any run of dots that
     contains at least one U+2026 HORIZONTAL ELLIPSIS. */
"[...]"  |
"(...)"  |
"..."\.* |
\.*"…"(\.|"…")*   {
                    MVCP;
                    /* The normalized form spells every U+2026 as three ASCII
                       dots; the 3 below is the length of that character in
                       this (UTF-8) source file. The token source keeps the
                       text as written. */
                    std::string normalized(yytext);
                    for(std::string::size_type at=normalized.find("…");
                        at!=std::string::npos;
                        at=normalized.find("…",at)) {
                      normalized.replace(at,3,"...");
                    }
                    addtoken(new token_punctuation(yytext,normalized.c_str()));
                  }

  /* One or more backquotes: always an opening (initial) quote */
"`"+	MVCP;addtoken(new token_punctuation(yytext,yytext,gc_Pi));

  /* The following punctuation marks are ambiguous: each may be a plain
     quote, an opening quote or a closing quote. They are emitted as gc_Po
     here and have to be disambiguated later. */
"'"+	MVCP;addtoken(new token_punctuation(yytext,yytext,gc_Po));


"\""+ MVCP;addtoken(new token_punctuation(yytext,yytext,gc_Po));


  /* ---------------------------------------------------------------- */
  /* Symbols                                                          */
  /* ---------------------------------------------------------------- */

  /* ---------------------------------------------------------------- */
  /* Special characters                                               */
  /* ---------------------------------------------------------------- */

  /* 03C2 GREEK SMALL LETTER FINAL SIGMA */
  /* Normalized to 03C3 GREEK SMALL LETTER SIGMA; the source keeps the
     character as written. */
  /* The start conditions are exclusive, so the rule must name wordGREEK to
     be active while a Greek word is being built. Without a start condition
     it only fired in INITIAL, where no word is open, and never inside the
     words where a final sigma actually occurs. */
<wordGREEK>ς	MVCP;appendcurrentword(yytext,"σ");

                        /* 0640 ARABIC TATWEEL: kept in the word source, dropped from the normalized form */
<wordARABIC>"ـ"	       MVCP;appendcurrentword(yytext,"");

                        /* 00AD SOFT HYPHEN inside a word of any alphabet: kept in the source, dropped from the normalized form */
<wordARABIC,wordLATIN,wordUNDEF,wordGREEK,wordCYRILLIC>"­"	MVCP;appendcurrentword(yytext,"");

  /* ---------------------------------------------------------------- */
  /* Numeric sequences                                                */
  /* ---------------------------------------------------------------- */

   /* Complete number entity - can mix fullwidth and ASCII characters:
      optional '-', digits with ',' group separators, optional '.' fraction,
      optional '%' (possibly preceded by spaces); the second pattern covers
      numbers that start with the decimal point. */
"-"?([0-9]|{FULLWIDTHDIGIT})+((","|"，")([0-9]|{FULLWIDTHDIGIT})+)*(("."|"．")([0-9]|{FULLWIDTHDIGIT})+)?(" "*("%"|"％"))? |
"-"?("."|"．")([0-9]|{FULLWIDTHDIGIT})+(" "*("%"|"％"))? {
      if(*yytext=='-' && !space_before) {
        /* A '-' glued to the preceding text is not a sign: let the
           other rules handle it. */
        REJECT;
      } else {
        MVCP;
        string norm;
        char buffer[8];
        forutf8init();
        /* Build the normalized form one UTF-8 character at a time. */
        forutf8(buffer,yytext) {
           unirdata *ud=findutfdescrpt(buffer);
           const char *half=0;
           /* Halfwidth - fullwidth forms: use the decomposition mapping
              when one is defined. The descriptor is checked first because
              findutfdescrpt() can return null. */
           if (ud && !postprocess && uCodePage(ud)>0xff00 && uCodePage(ud)<0xfffe)
             half=uCharacterDecompositionMapping(ud);
           norm+=(half ? half : buffer);
        }
        addtoken(new token_numeric(yytext,norm.c_str()));
      }
}

   /* Arabic-script numerics. The generic {UTF8} rule enters numericARABIC
      when it meets an Arabic decimal digit; each rule below appends the
      matched character to the current word source and its ASCII counterpart
      to the normalized form. */
<numericARABIC>{
                        /* 0660 arabic-indic digit 0 */
"٠" MVCP;appendcurrentword(yytext,"0");
                        /* 0661 arabic-indic digit 1 */
"١" MVCP;appendcurrentword(yytext,"1");	
                        /* 0662 arabic-indic digit 2 */
"٢" MVCP;appendcurrentword(yytext,"2");
                        /* 0663 arabic-indic digit 3 */
"٣" MVCP;appendcurrentword(yytext,"3");
                        /* 0664 arabic-indic digit 4 */
"٤" MVCP;appendcurrentword(yytext,"4");
                        /* 0665 arabic-indic digit 5 */
"٥" MVCP;appendcurrentword(yytext,"5");
                        /* 0666 arabic-indic digit 6 */
"٦" MVCP;appendcurrentword(yytext,"6");
                        /* 0667 arabic-indic digit 7 */
"٧" MVCP;appendcurrentword(yytext,"7");
                        /* 0668 arabic-indic digit 8 */
"٨" MVCP;appendcurrentword(yytext,"8");
                        /* 0669 arabic-indic digit 9 */
"٩" MVCP;appendcurrentword(yytext,"9");
                        /* 066B arabic decimal separator, normalized to '.' */
"٫" MVCP;appendcurrentword(yytext,".");
                        /* 066C arabic thousands separator, normalized to ',' */
"٬" MVCP;appendcurrentword(yytext,",");
                        /* 066A arabic percent sign, normalized to '%' */
"٪" MVCP;appendcurrentword(yytext,"%");
                        /* 06F0-06F9 extended arabic-indic digits 0-9 */
"۰" MVCP;appendcurrentword(yytext,"0");
"۱" MVCP;appendcurrentword(yytext,"1");
"۲" MVCP;appendcurrentword(yytext,"2");
"۳" MVCP;appendcurrentword(yytext,"3");
"۴" MVCP;appendcurrentword(yytext,"4");
"۵" MVCP;appendcurrentword(yytext,"5");
"۶" MVCP;appendcurrentword(yytext,"6");
"۷" MVCP;appendcurrentword(yytext,"7");
"۸" MVCP;appendcurrentword(yytext,"8");
"۹" MVCP;appendcurrentword(yytext,"9");

}

   /* End of input inside a number: emit what has been collected and stop. */
<numericARABIC><<EOF>> addtoken(new token_numeric(currentwordsource,currentwordnorm));return 1;
   /* Any other character ends the number: emit it, return to INITIAL and
      push the character back (yyless(0)) so that it is scanned again there. */
<numericARABIC>{UTF8} addtoken(new token_numeric(currentwordsource,currentwordnorm));BEGIN(INITIAL);yyless(0);


  /* ---------------------------------------------------------------- */
  /* Misc entities                                                    */
  /* ---------------------------------------------------------------- */

   /* Detection of URIs, see http://www.ietf.org/rfc/rfc2396.txt;
      "," has been added to the characters allowed in parameters.

       Common types are:
         ftp://ftp.is.co.za/rfc/rfc1808.txt
         gopher://spinaltap.micro.umn.edu/00/Weather/California/Los%20Angeles
         http://www.math.uio.no/faq/compression-faq/part1.html
         mailto:mduerst@ifi.unizh.ch
         news:comp.infosystems.www.servers.unix
         telnet://melvyl.ucop.edu/
         file:192.177.23.21:~rasparail/tmp/olive.dat
       The patterns cannot be fully generic, since a bare word such as
       "toto" is also a valid URI.

       First alternative: an explicit scheme followed by the URI body and an
       optional "#fragment". */

(("ftp"|"mail"|"mailto"|"gopher"|"https"|"http"|"news"|"telnet"|"file")":")("/"|"//"|"")[-0-9a-zA-Z;/?:@&=+$._!~*'()%,]+("#"[-0-9a-zA-Z;/?:@&=+$\\._!~*'()%,]+|"")  |
	/*       URLs without a <scheme> must be more specific: */
	/*       a dotted number triple, a host starting with "w" or "ftp", or a host ending in a two-letter or well-known suffix - in all cases followed by a "/" path  */
([0-9]+"."[0-9]+"."[0-9]+|("w"|"ftp")[-a-zA-Z0-9]+"."[-a-zA-Z0-9._]+|[-a-zA-Z0-9._]+"."([a-zA-Z][a-zA-Z]|"com"|"net"|"org"|"edu"|"biz"|"info"|"gov"|"COM"|"NET"|"ORG"|"EDU"|"BIZ"|"INFO"|"GOV"))(":"[0-9]+|"")("/"[-0-9a-zA-Z;/?:@&=+$._!~*'()%,]+)("#"[-0-9a-zA-Z;/?:@&=+$\\._!~*'()%,]+|"") |
     /* www.anything.com, optionally with a port, no path required */
"www."([-a-zA-Z0-9._]+)"."([a-zA-Z][a-zA-Z]|"com"|"net"|"org"|"edu"|"biz"|"info"|"gov"|"COM"|"NET"|"ORG"|"EDU"|"BIZ"|"INFO"|"GOV")(":"[0-9]+|"") |
        /* Windows paths: drive letter, ':' and backslash-separated components */
[A-Za-z]":"(("\\"[A-Za-z0-9]+("."[A-Za-z0-9]+)?)+("\\")?|"\\") {
        MVCP;
	/* Work on a copy: unput() below may clobber yytext. */
	string uri=yytext;
	if (strchr(";?.!:",uri[uri.length()-1])!=0) {
	    /* Do not keep trailing punctuation, it is probably not part of
	       the uri: give the character back to the scanner and take it
	       out of the position count again. */
	  unput(uri[uri.length()-1]);currentpos-=1;
	  uri.erase(uri.length()-1);
	}
	addtoken(new token_entity(uri.c_str(),uri.c_str(),token_entity::e_uri));
}

  /* Unix paths: optional "." or "..", a "/" and any run of path characters.
     Only accepted at the start of a token (right after a separator or at the
     beginning of the input); otherwise the match is REJECTed and the text is
     left to the other rules. */
("."|"..")?"/"[a-zA-Z0-9_./-]* {
  if(!space_before) {
    REJECT;
  } else {
    MVCP;
    /* Work on a copy: unput() below may clobber yytext. */
    std::string path=yytext;
    if(strchr(".",path[path.length()-1])) {
      /* do not keep a trailing '.', it is likely a sentence period:
         push it back and uncount it */
      unput(path[path.length()-1]);currentpos--;
      path.erase(path.length()-1);
    }
    addtoken(new 
      token_entity(path.c_str(),path.c_str(),token_entity::e_uri));
  }
}

  /* IP addresses: four dot-separated groups of one to three digits. The
     pattern guarantees the shape, so only the 0-255 range of each group is
     left to verify; an out-of-range group hands the text back to the other
     rules. */
([0-9]{1,3}"."){3}[0-9]{1,3} {
  int octet[4];
  sscanf(yytext,"%d.%d.%d.%d",&octet[0],&octet[1],&octet[2],&octet[3]);
  bool inrange=true;
  for(int k=0;k<4;++k) {
    if(octet[k]>255) {
      inrange=false;
      break;
    }
  }
  if(inrange) {
    MVCP;
    addtoken(new token_entity(yytext,yytext,token_entity::e_ip));
  } else {
    REJECT;
  }
}

  /* ---------------------------------------------------------------- */
  /* CISCO entities...                                                */
  /* ---------------------------------------------------------------- */

  /* File- and hostnames: a name ending in one of the listed extensions or
     domain suffixes. Only accepted at the start of a token; otherwise the
     match is REJECTed. */
[a-zA-Z0-9_.-]*"."("gif"|"jpg"|"html"|"htm"|"tar"|"txt"|"cer"|"log"|"mib"|"gz"|"cm"|"sh"|"com"|"net"|"gov"|"org"|"edu"|"biz"|"info") {
  if(!space_before) {
    REJECT;
  } else {
    MVCP;
    addtoken(new token_entity(yytext,yytext,token_entity::e_uri));
  }
}

  /*
     Generic entity for CISCO: basically any printable-ASCII run with a digit
     inside, plus entities of the form [a-z]+().
     The action unconditionally REJECTs, so this rule currently never
     produces a token; the text is always handed to the other rules.
  */
[\x21-\x7f]*[0-9][\x21-\x7f]* |
[a-zA-Z]+"()" {
  REJECT;
}

  /* Unix paths are recognised by the rule placed before the IP-address
     rule. The byte-identical copy that used to sit here could never produce
     a token: it was only reached after the first copy had REJECTed, i.e.
     with space_before still false, and then REJECTed as well. It has been
     removed. */

  /* ---------------------------------------------------------------- */
  /* Separators...                                                    */
  /* ---------------------------------------------------------------- */

  /* Every separator character becomes its own token_separator and sets
     space_before, which the path, file-name and numeric rules consult.
     NOTE(review): the last alternative appears to be a no-break space
     (U+00A0); if it is a plain space it merely repeats the class above -
     confirm. */
"　" | /* 0x3000 Ideograph space */
[ \t\n\r] |
" "    MVCP;space_before=true;addtoken(new token_separator(yytext,yytext));

  /* ---------------------------------------------------------------- */
  /* Remaining is generic utf8...                                     */
  /* ---------------------------------------------------------------- */


  /* Inside an Arabic word: a character matching ARABICFINAL that is directly
     followed (trailing context, not consumed) by one matching ARABICINITIAL
     is appended to the current word unchanged. */
<wordARABIC>{ARABICFINAL}/{ARABICINITIAL} {
        MVCP;appendcurrentword(yytext,yytext);
}

  /* End of input while a word is being built: emit the word collected so
     far, then stop scanning. */
<wordARABIC,wordLATIN,wordUNDEF,wordGREEK,wordCYRILLIC><<EOF>> {
	addtoken(new token_word(currentwordsource,currentwordnorm,currentwordalphabet));
	return 1;
}

<wordARABIC,wordLATIN,wordUNDEF,wordGREEK,wordCYRILLIC>{UTF8} {
    /* A word state only consumes letters that belong to the alphabet of the
       word being built. */
    unirdata *desc=findutfdescrpt(yytext);
    const bool sameAlphabet=
        desc && isletter(uGeneralCategory(desc)) && currentwordalphabet==uFamily(desc);

    if (!sameAlphabet) {
        /* Anything else closes the current word ... */
        addtoken(new token_word(currentwordsource,currentwordnorm,currentwordalphabet));
        /* ... restores the initial state ... */
        BEGIN(INITIAL);
        /* ... and gives the whole match back so that it is scanned again there. */
        yyless(0);
    }
    else {
        MVCP;
        /* Characters whose code page lies above 0xfb4f and below 0xfffe are
           replaced by their decomposition mapping when one is defined
           (never in postprocess mode). */
        const char *half=0;
        if (!postprocess && uCodePage(desc)>0xfb4f && uCodePage(desc)<0xfffe) {
            half=uCharacterDecompositionMapping(desc);
        }
        if (half) {
            /* Push the replacement back, last byte first, so that it is read
               next; it will be counted when rescanned, hence the correction
               of currentpos. */
            for(int i=strlen(half)-1;i>=0;--i) unput(half[i]);
            currentpos-=strlen(half);
        }
        else {
            appendcurrentword(yytext,yytext);
        }
    }
}

{UTF8} {

  /* Very general scheme: classify one character through its Unicode
     descriptor and either emit a token directly or switch to the start
     condition that will collect a word / an Arabic-script number. */
  MVCP;
  unirdata *ud=findutfdescrpt(yytext);
  int done=0;
  /* The descriptor is null for characters that have none, so it has to be
     tested before any accessor is applied to it. */
  if (ud && !postprocess && uCodePage(ud)>0xff00 && uCodePage(ud)<0xfffe) {
     /* Halfwidth - fullwidth forms */
     /* let us convert it to the half-width character - if defined: the
        replacement is pushed back (last byte first) and rescanned, so it
        must not be counted twice in currentpos */
     const char *half=uCharacterDecompositionMapping(ud);
     if (half) {
        for(int i=strlen(half)-1;i>=0;--i) unput(half[i]);
        currentpos-=strlen(half);
        done=1;
     }
  }

  if (!ud) {
    addtoken(new token_unknown(yytext,yytext));
  }
  else if (!done) {
    if (isletter(uGeneralCategory(ud))) { // includes diacritics mark
      /* Start of a word: reset the word buffers and enter the state of the
         letter's alphabet. CJKV letters are emitted one by one instead. */
      currentwordnorm="";currentwordsource="";
      currentwordalphabet=uFamily(ud);
      switch (uFamily(ud)) {
      case f_LATIN:
        BEGIN(wordLATIN);break;
      case f_CYRILLIC:
        BEGIN(wordCYRILLIC);break;
      case f_ARABIC:
        BEGIN(wordARABIC);break;
      case f_GREEK:
        BEGIN(wordGREEK);break;
      case f_CJKV:
        addtoken(new token_word(yytext,yytext,f_CJKV));break;
      default:
        BEGIN(wordUNDEF);break;
      }
      if (uFamily(ud)!=f_CJKV) {
        /* do not count this "switching" character, it will be reparsed; */
        currentpos-=yyleng;
        yyless(0);
      }
    }
    else if (issymbol(uGeneralCategory(ud)) ||
             !strcmp(yytext,"@") ||
             !strcmp(yytext,"&") ||
             !strcmp(yytext,"#") ||
             !strcmp(yytext,"%") ||
             !strcmp(yytext,"*")
        ) {
      addtoken(new token_symbol(yytext,yytext));
    }
    else if (ispunctuation(uGeneralCategory(ud))) {
      /* Punctuation without a dedicated rule keeps its Unicode general
         category as sub-type. */
      token *t=new token_punctuation(yytext,yytext);
      ((token_punctuation*)t)->subt=(token_punctuation::gc_subt)uGeneralCategory(ud);
      addtoken(t);
    }
    else if (uGeneralCategory(ud)==gc_Cf) {
      /* formatting characters - we drop them */
    }
    else if (uGeneralCategory(ud)==gc_Nd) {
      /* NOTE(review): decimal digits of any family other than Arabic
         produce no token at all here - confirm that dropping them is
         intended. */
      if (uFamily(ud)==f_ARABIC) {
        currentwordnorm="";currentwordsource="";
        currentwordalphabet=uFamily(ud);
        BEGIN(numericARABIC);
        /* the digit is rescanned (and counted) in numericARABIC */
        currentpos-=yyleng;
        yyless(0);
      }
    }
    else {
      addtoken(new token_unknown(yytext,yytext));
    }
  }
}

        /* Fallback in every state: a byte that no other rule accepted, i.e.
           one that cannot start a character of the UTF8 definition (for
           instance input that is not UTF-8). It is counted, reported on
           stderr and produces no token. */

<numericARABIC,wordARABIC,wordCYRILLIC,wordLATIN,wordGREEK,wordUNDEF>. |
.      {MVCP;
          fprintf(stderr, "Illegal UTF-8 character encountered\n");
}

        /* End of input in the INITIAL state: nothing is pending, stop scanning. */
<<EOF>> return 1;

%%

// Prepares a scanner that reads the NUL-terminated string Str and hands its
// tokens to `output`. `postprocess` is stored for the rules, which skip the
// fullwidth/halfwidth conversion when it is set.
GtokStringParser::GtokStringParser(const char *Str,
                                   bool postprocess,
                                   list<token*> &output) {
  // Scanning state: nothing consumed yet, and the start of the input counts
  // as "after a separator".
  currentpos = 0;
  space_before = true;

  // LexerInput() walks this pointer forward; the string itself is only read.
  InStr = const_cast<char *>(Str);

  this->postprocess = postprocess;
  this->output = &output;
}

// Nothing to release: the parser only holds a pointer to the caller's output
// list and a pointer into the caller's input string.
GtokStringParser::~GtokStringParser()
{
  // output is not freed since it is the return value of Preprocess function
}

// Redirects the scanner input to InStr.
//
// Copies up to max_size characters from the remaining input string into buf,
// advances InStr past them and returns the number of characters copied
// (0 once the string is exhausted, which the scanner treats as end of input).
//
// The copy is NUL-terminated only when there is room left for the
// terminator: buf is only known to hold max_size characters, so a
// terminator after a completely filled buffer would be written out of
// bounds. The scanner relies on the returned count, not on the terminator.
int GtokStringParser::LexerInput(char *buf, int max_size)
{
  if (!InStr || !buf || max_size <= 0) return 0;

  int size = 0;
  while (size < max_size && *InStr)
    buf[size++] = *InStr++;

  if (size < max_size) buf[size] = 0;
  return size;
}

// Tokenizes the string s with a GtokStringParser bound to `output`.
// postprocess is forwarded to the parser. The value returned by yylex() is
// discarded.
// NOTE(review): the rules create tokens with `new`; presumably addtoken()
// stores them in `output` and the caller owns them afterwards - confirm.
void G_Preprocess(const string &s, bool postprocess, list<token *> &output)
{
  GtokStringParser parser(s.c_str(), postprocess, output);
  parser.yylex();
}

// Convenience overload: same as the three-argument G_Preprocess with
// postprocess set to false.
void G_Preprocess(const string &s, list<token *> &output)
{
  G_Preprocess(s, false, output);
}
