#include "Unicode.h"

using namespace std;

// Returns nonzero when byte b lies in the closed interval [lo, 0xbf].
// A NUL byte always fails this test, so a truncated sequence is rejected
// before any byte past the terminator is read.
static int utf8_in_range(unsigned char b, unsigned char lo) {
  return b>=lo && b<=0xbf;
}

// Check validity of the NUL-terminated string t against the (legacy, up to
// six-byte) utf8 grammar below.  Returns 1 when every sequence is well
// formed, 0 otherwise.
int check_utf8(const char *t) {
  //  [0x00-0x7f]                                                #One-byte
  //| [0xc2-0xdf][0x80-0xbf]                                     #Two-byte
  //| 0xe0[0xa0-0xbf][0x80-0xbf]                                 #Three-byte
  //| [0xe1-0xef][0x80-0xbf][0x80-0xbf]                          #Three-byte
  //| 0xf0[0x90-0xbf][0x80-0xbf][0x80-0xbf]                      #Four-byte
  //| [0xf1-0xf7][0x80-0xbf][0x80-0xbf][0x80-0xbf]               #Four-byte
  //| 0xf8[0x88-0xbf][0x80-0xbf][0x80-0xbf][0x80-0xbf]           #Five-byte
  //| [0xf9-0xfb][0x80-0xbf][0x80-0xbf][0x80-0xbf][0x80-0xbf]    #Five-byte
  //| 0xfc[0x84-0xbf][0x80-0xbf][0x80-0xbf][0x80-0xbf][0x80-0xbf]#Six-byte
  //| 0xfd[0x80-0xbf][0x80-0xbf][0x80-0xbf][0x80-0xbf][0x80-0xbf]#Six-byte

  // Bytes must be compared as unsigned: with a signed char every byte is
  // <= 0x7f and the whole grammar collapses to "accept everything".
  const unsigned char *T=(const unsigned char *)t;

  while(*T) {
    const unsigned char lead=*T;

    if (lead<=0x7f) {
      T+=1;
      continue;
    }
    if (lead<0xc2 || lead>0xfd)
      return 0;

    unsigned int len;            // total length of the sequence
    unsigned char second_lo=0x80; // lower bound of the second byte

    if (lead<=0xdf) {
      len=2;
    } else if (lead<=0xef) {
      len=3;
      if (lead==0xe0) second_lo=0xa0;
    } else if (lead<=0xf7) {
      len=4;
      if (lead==0xf0) second_lo=0x90;
    } else if (lead<=0xfb) {
      len=5;
      if (lead==0xf8) second_lo=0x88;
    } else {
      len=6;
      if (lead==0xfc) second_lo=0x84;
    }

    // The second byte has a lead-specific lower bound (rejects overlong
    // encodings); the remaining ones are plain continuation bytes.
    if (!utf8_in_range(T[1],second_lo))
      return 0;
    for (unsigned int i=2;i<len;i++)
      if (!utf8_in_range(T[i],0x80))
        return 0;

    T+=len;
  }

  return 1;
}

/* Reference: Unicode Transformation Format, Roman Czyborra (adapted). */
/* Encodes the code point uc as a NUL-terminated utf8 sequence of one to
   four bytes.  The result lives in a static buffer that is overwritten by
   the next call; values of 0x200000 and above yield the empty string. */
const char* unicode2utf8(unsigned int uc)
{
        static char ret[5];

        /* One byte: the code point is its own encoding. */
        if (uc < 0x80) {
                ret[0]=uc;
                ret[1]=0;
                return ret;
        }

        /* Pick the lead byte and the number of continuation bytes. */
        unsigned char lead;
        int trailing;
        if (uc < 0x800) {
                lead = 0xC0 | (uc >> 6);
                trailing = 1;
        } else if (uc < 0x10000) {
                lead = 0xE0 | (uc >> 12);
                trailing = 2;
        } else if (uc < 0x200000) {
                lead = 0xF0 | (uc >> 18);
                trailing = 3;
        } else {
                return "";
        }

        ret[0]=lead;
        /* Each continuation byte carries six payload bits, most
           significant group first. */
        for (int pos=1; pos<=trailing; pos++) {
                const unsigned char cont = 0x80 | ((uc >> (6*(trailing-pos))) & 0x3F);
                ret[pos]=cont;
        }
        ret[trailing+1]=0;
        return ret;
}

// Converts one ISO-8859-1 byte to its utf8 encoding (one or two bytes).
// The result is NUL-terminated and stored in a static buffer that the next
// call overwrites.
const char *iso12utf8(char ic) {
  static char ret[5];

  // Work on the unsigned value: shifting a negative signed char would
  // sign-extend and produce 0xFF as lead byte instead of 0xC2/0xC3.
  const unsigned char c=(unsigned char)ic;

  if (c<0x80) { ret[0]=ic;ret[1]=0;return ret; }

  ret[0]=(char)(0xC0 | (c >> 6));
  ret[1]=(char)(0x80 | (c & 0x3F));
  ret[2]=0;
  return ret;
}

// Returns the byte length announced by the first byte of S: 0 for the
// terminating NUL and for 0xfe/0xff, otherwise 1 to 6.  Only the first byte
// is inspected; bytes below 0xc0 (including stray continuation bytes) count
// as length 1.
size_t utf8charlen(const char *S) {
  const unsigned char lead=(unsigned char)*S;

  if (lead==0) return 0;
  if (lead>=0xfe) return 0;

  size_t len=6;
  if (lead<0xc0)       len=1;
  else if (lead<=0xdf) len=2;
  else if (lead<=0xef) len=3;
  else if (lead<=0xf7) len=4;
  else if (lead<=0xfb) len=5;
  return len;
}

// Collects the combining marks (general categories Mn, Mc, Me) found in the
// decomposition mapping of the first character of utf8char, recursing into
// decomposition members that are not marks themselves.
// Returns 0 when the character has no descriptor or no decomposition
// mapping; otherwise a pointer to a static string (possibly empty) that is
// overwritten by the next call.
const char *GetAccents(const char *utf8char)
{
  unirdata *u=findutfdescrpt(utf8char);
  if(!u)
    return 0;

  const char *a=uCharacterDecompositionMapping(u);
  if(a==0)
    return 0;

  // Accumulate in a local: the recursive calls below overwrite the shared
  // static buffer, so building directly into it would lose (and then
  // self-append) what was gathered so far.
  std::string collected;

  while(*a) {
    size_t k=utf8charlen(a);
    if(k==0)  // undecodable lead byte: cannot advance, stop here
      break;

    unirdata *v=findutfdescrpt(a);
    if(v) {
      if(uGeneralCategory(v)==gc_Mn ||
         uGeneralCategory(v)==gc_Mc ||
         uGeneralCategory(v)==gc_Me)
        collected+=std::string(a,k);
      else {
        const char *acc=GetAccents(a);
        if(acc)
          collected+=acc;  // copied before any further recursive call
      }
    }
    // No descriptor (v==0): nothing to collect for this member.

    a+=k;
  }

  static std::string result;
  result=collected;
  return result.c_str();
}

const char* RemoveAccent(const char *utf8char, const char *utf8acc)
/* TODO: be much more general... */
{
  static std::string result;
  result=utf8char;
  
  unirdata *u=findutfdescrpt(utf8char);
  
  if(!u)
    return result.c_str();
  
  const char *a=uCharacterDecompositionMapping(u);

  if(!a || utf8len(a)!=2)
    return result.c_str();

  size_t k=utf8charlen(a);
  
  if(strcmp(a+k,utf8acc))
    return result.c_str();

  result=std::string(a,utf8charlen(a));
  
  return result.c_str();
}

/* Returns the precomposed form of utf8char carrying the combining accent
   utf8acc.  Only U+0301 (acute) on a fixed set of Latin/Greek letters and
   U+0303 (tilde) on A/N/O/a/n/o are handled; anything else returns utf8char
   unchanged.  Latin letters are matched on the first byte only, Greek
   letters on the first two bytes.  The result lives in a static string
   overwritten by the next call. */
/* TODO: generalization to other types of accents, to multiple accents... */
const char* SetAccent(const char *utf8char, const char *utf8acc)
{
  struct LatinPair { char base; const char *composed; };
  struct GreekPair { const char *base; const char *composed; };

  static const LatinPair acuteLatin[]={
    {'A',"Á"},{'E',"É"},{'I',"Í"},{'O',"Ó"},{'U',"Ú"},{'Y',"Ý"},
    {'a',"á"},{'e',"é"},{'i',"í"},{'o',"ó"},{'u',"ú"},{'y',"ý"}
  };
  static const GreekPair acuteGreek[]={
    {"α","ά"},{"Α","Ά"},{"ε","έ"},{"Η","Ή"},
    {"η","ή"},{"Ε","Έ"},{"ι","ί"},{"ϊ","ΐ"},
    {"Ι","Ί"},{"ο","ό"},{"Ο","Ό"},{"υ","ύ"},
    {"ϋ","ΰ"},{"Υ","Ύ"},{"ω","ώ"},{"Ω","Ώ"}
  };
  static const LatinPair tildeLatin[]={
    {'A',"Ã"},{'N',"Ñ"},{'O',"Õ"},
    {'a',"ã"},{'n',"ñ"},{'o',"õ"}
  };

  static std::string result;

  const unsigned acc=utf82unicode(utf8acc);
  const char *composed=0;  // stays 0 when no precomposed form is known

  if(acc==0x0301) { /* Combining Acute Accent */
    const size_t nLatin=sizeof(acuteLatin)/sizeof(acuteLatin[0]);
    for(size_t i=0;i<nLatin && !composed;i++)
      if(utf8char[0]==acuteLatin[i].base)
        composed=acuteLatin[i].composed;

    const size_t nGreek=sizeof(acuteGreek)/sizeof(acuteGreek[0]);
    for(size_t i=0;i<nGreek && !composed;i++)
      if(!strncmp(utf8char,acuteGreek[i].base,2))
        composed=acuteGreek[i].composed;
  } else if(acc==0x0303) { /* Combining Tilde */
    const size_t nTilde=sizeof(tildeLatin)/sizeof(tildeLatin[0]);
    for(size_t i=0;i<nTilde && !composed;i++)
      if(utf8char[0]==tildeLatin[i].base)
        composed=tildeLatin[i].composed;
  }

  if(composed)
    result=composed;
  else
    result=utf8char;

  return result.c_str();
}  

// Follows the chain "first character of the decomposition mapping" starting
// from utf8 until a character without descriptor or without decomposition
// mapping is reached, and returns the last first-character found.
// Returns 0 when utf8 itself has no descriptor or no decomposition mapping.
// The result lives in a static string overwritten by the next call.
const char *GetFirstUTF8DecompChar(const char *utf8) {
  static std::string U;

  const char *cursor=utf8;
  bool decomposed=false;

  for(;;) {
    unirdata *u=findutfdescrpt(cursor);
    if (!u)
      break;

    char *a=uCharacterDecompositionMapping(u);
    if(a==0)
      break;

    // Keep only the first character of the mapping, then try to decompose
    // that character in turn.
    U=a;
    U.erase(utf8charlen(a));
    cursor=U.c_str();
    decomposed=true;
  }

  return decomposed ? U.c_str() : 0;
}

int HasUTF8DecompChar(const char *utf8, const char *decomputf8) {
  unirdata *u=findutfdescrpt(utf8);
  if (!u)
    return 0;
  
  char *a=uCharacterDecompositionMapping(u);
  if(a==0)
    return 0;

  while(*a) {
    size_t k=utf8charlen(a);
    if(!strncmp(decomputf8,a,k))
      return 1;
    else if(HasUTF8DecompChar(a,decomputf8))
      return 1;

    a+=k;
  }

  return 0;
}

// Counts the characters of S by hopping from lead byte to lead byte with
// utf8charlen(); counting stops at the first position where utf8charlen()
// returns 0.
size_t utf8len(const char *S) {
  size_t count=0;
  for(size_t step=utf8charlen(S); step!=0; step=utf8charlen(S)) {
    ++count;
    S+=step;
  }
  return count;
}

// Returns pszUtf8 moved forward by nChars characters, or by fewer when
// utf8charlen() reports 0 (end of string or undecodable byte) first.
const char *utf8advance(const char *pszUtf8, size_t nChars)
{
  size_t skipped=0;
  while (skipped<nChars) {
    const size_t width=utf8charlen(pszUtf8);
    if (width==0) break;
    pszUtf8+=width;
    ++skipped;
  }
  return pszUtf8;
}

// Returns a pointer to the position nChars characters before the end of
// pszUtf8.  When the string holds nChars characters or fewer, the start of
// the string is returned.
const char *utf8backup(const char *pszUtf8, size_t nChars)
{
  const size_t nLen = utf8len(pszUtf8);
  // size_t is unsigned: subtracting first and testing "< 0" afterwards can
  // never clamp, so compare before subtracting.
  const size_t nKeep = (nChars < nLen) ? nLen - nChars : 0;
  return utf8advance(pszUtf8, nKeep);
}

// Copies the first character of pszUtf8 (as sized by utf8charlen()) into
// pszCopy and NUL-terminates it.  Returns the number of bytes copied, not
// counting the terminator.  A NULL source returns 0 and leaves pszCopy
// untouched.  pszCopy must have room for the character plus the terminator.
size_t utf8copychar(const char *pszUtf8, char *pszCopy)
{
  if (!pszUtf8) return 0;

  const size_t width = utf8charlen(pszUtf8);
  for (size_t left = width; left > 0; --left)
    *pszCopy++ = *pszUtf8++;
  *pszCopy = 0;

  return width;
}

// Decodes the first character of S (one to four bytes) into its code point.
// Returns 0 for the empty string, for a lead byte of 0xf8 or above, and
// when the string ends before the announced number of bytes.  Continuation
// bytes are only checked for being non-NUL; their top bits are masked off.
unsigned int utf82unicode(const char *S) {
  const unsigned char *b=(const unsigned char *)S;
  const unsigned char lead=b[0];

  if (lead==0 || lead>=0xfe) return 0;
  if (lead<0x80) return lead;

  if (b[1]==0) return 0;
  if (lead<0xe0)
    return ((lead&0x1f)<<6) | (b[1]&0x3f);

  if (b[2]==0) return 0;
  if (lead<0xf0)
    return ((lead&0x0f)<<12) | ((b[1]&0x3f)<<6) | (b[2]&0x3f);

  if (b[3]==0) return 0;
  if (lead<0xf8)
    return ((lead&0x07)<<18) | ((b[1]&0x3f)<<12) | ((b[2]&0x3f)<<6) | (b[3]&0x3f);

  return 0; // five- and six-byte leads are not decoded
}

static int compare_unidata(const void *a,const void *b) {
  return uCodePage((unidata *)a)-uCodePage((unidata *)b);
}

// Statically initialised unidata record tagged with family f_CJKV and
// general category gc_Lo.  The initialiser list is longer when DEBUGUNI is
// defined, which suggests unidata carries extra debug-only fields in that
// configuration (the name string "<CJKV>" among them).
// NOTE(review): presumably the shared descriptor for CJKV characters that
// have no table entry of their own -- confirm against Unicode.h, the
// record is not referenced in this file.
unidata refCJKV=
{0x0000
#ifdef DEBUGUNI
,"<CJKV>"
#endif
,f_CJKV
,gc_Lo
#ifdef DEBUGUNI
,0,bc_L
#endif
,NULL
#ifdef DEBUGUNI
,0,0,0,m_N,"",""
#endif
,0x0,0x0
#ifdef DEBUGUNI
,0x0
#endif
};

// Returns the descriptor for code point ucscode.
//  - Code points below 545 index ud_ref directly.
//  - Code points inside the three CJK/Hangul intervals below, and code
//    points not found in the table, are returned as the pseudo-pointer
//    ud_ref_end+ucscode rather than a real table entry.
//    NOTE(review): presumably the accessor macros in Unicode.h recognise
//    pointers beyond ud_ref_end and recover the code point from them --
//    confirm there.
//  - Everything else is looked up with bsearch() over ud_ref[1..].
// Not reentrant: ud_ref[0] is temporarily overwritten to serve as the
// search key.
unirdata *findunidescrpt(unsigned int ucscode) {
  if(ucscode<545) // Optimization
    return (ud_ref+ucscode);

  const bool cjkUnified=(ucscode>=0x4e00 && ucscode<=0x9faf); /* CJK Unified Ideographs */
  const bool hangulSyll=(ucscode>=0xac00 && ucscode<=0xd7af); /* Hangul Syllables */
  const bool cjkCompat =(ucscode>=0xf900 && ucscode<=0xfa5f); /* CJK Compatibility Ideographs */
  if (cjkUnified || hangulSyll || cjkCompat)
    return (unirdata*)(ud_ref_end+ucscode);

  /* Entry 0 of the table is borrowed as the key passed to bsearch: a local
     variable cannot be used because the key must sit at an address
     <= ud_ref_end, which is critical in the unirdata definition. */
  unidata &ucode=ud_ref[0];
  ucode.CodePage=ucscode;
  void *ud=bsearch((void*)&ucode,ud_ref+1,size_ud_ref-1,sizeof(unidata),compare_unidata);
  ucode.CodePage=0;  // restore entry 0
  if (ud) return ((unirdata*)ud);

  /* by default */
  return (unirdata*)(ud_ref_end+ucscode);  
}

// Decodes the first character of utf8 and returns its descriptor, or NULL
// when utf82unicode() yields 0 (empty or undecodable input).
unirdata *findutfdescrpt(const char *utf8) {
  const unsigned int codePoint=utf82unicode(utf8);
  return codePoint ? findunidescrpt(codePoint) : NULL;
}

// Converts the multibyte string s to a swstring by repeatedly calling
// smbtowc(); conversion stops at the first call that returns 0 or a
// negative value.
swstring ToSWString(const char *s) {
  swstring wide;
  swchar decoded;

  int consumed=smbtowc(&decoded,s,10);
  while (consumed>0) {
    wide+=decoded;
    s+=consumed;
    consumed=smbtowc(&decoded,s,10);
  }
#if defined(DEBUG) && !defined(WIN32)
  if (consumed==-1) {
#ifdef LDEBUG
    cerr<<"incorrect mb character: "<<s<<endl;
#endif
  }
#endif
  return wide;
}

// Convenience overload: converts the C string held by s.
swstring ToSWString(const string &s) {
  const char *raw=s.c_str();
  return ToSWString(raw);
}

// Converts w back to a multibyte string with swctomb(), stopping at the
// first character for which swctomb() returns 0 or a negative value.
// The result lives in a static string overwritten by the next call.
const char *  FromSWString(const swstring &w) {
  static string s;
  s.erase();

  char buf[5];
  for(unsigned int i=0;i<w.length();++i) {
    const int written=swctomb(buf,w[i]);
    if (written<=0) break;
    buf[written]=0;  // swctomb() does not terminate in every configuration
    s.append(buf);
  }
  return s.c_str();
}

/* Check if the utf-8 word has at least one letter in alphabet.
   Returns 1 as soon as a character is found whose general category is a
   letter and whose family equals alphabet (any family when alphabet is
   f_undef); returns 0 otherwise. */
int HasLetterInAlphabet(const char *utf8,f_val alphabet) {
  forutf8init();
  /* Room for the longest character utf8charlen() can announce (6 bytes)
     plus the terminator, matching the buffers of the ko_wd_equal_*
     helpers.  NOTE(review): assumes forutf8 copies one character into s,
     as utf8copychar() does -- confirm against the macro definition. */
  char s[7]; 
  forutf8(s,utf8) {
    unirdata *u=findutfdescrpt(s);
    if (u && isletter(uGeneralCategory(u)) &&
	(alphabet==f_undef || uFamily(u)==alphabet)) return 1;
  }
  return 0;
}

// Returns the single family shared by all letters of utf8 whose family is
// not f_na.  Returns f_na when no such letter exists, and f_undef when a
// letter has family f_undef or when two letters disagree.
f_val GetAlphabet(const char *utf8) {
  f_val alph=f_na;

  forutf8init();
  // Room for the longest character utf8charlen() can announce (6 bytes)
  // plus the terminator.  NOTE(review): assumes forutf8 copies one
  // character into s, as utf8copychar() does -- confirm against the macro.
  char s[7]; 
  forutf8(s,utf8) {
    unirdata *u=findutfdescrpt(s);
    if (u && isletter(uGeneralCategory(u)) && uFamily(u)!=f_na) {
      if(uFamily(u)==f_undef || (alph!=f_na && alph!=uFamily(u)))
	return f_undef;
      else
	alph=uFamily(u);
    }
  }
  return alph;
}

// Converts an ISO-8859-1 string to utf8 by encoding each byte with
// iso12utf8() and concatenating the results.
string iso1toutf8(const string &s) {
  string converted;
  for(string::const_iterator it=s.begin();it!=s.end();++it)
    converted.append(iso12utf8(*it));
  return converted;
}

// Decodes one character of s into *pwc and returns the number of bytes it
// occupies, or 0 at the end of the string (in which case *pwc is left
// untouched).  With XMLFLOW the input is utf8, otherwise every byte is one
// character.  The parameter n is not used.
int smbtowc(swchar *pwc, const char *s, size_t n) {
  if (!*s) return 0; // end of char...
#ifdef XMLFLOW
  *pwc=utf82unicode(s);
  return utf8charlen(s);
#else
  // Go through unsigned char so that bytes >= 0x80 are not sign-extended
  // when swchar is wider than char (swctomb() only accepts values <= 255).
  *pwc=(swchar)(unsigned char)s[0];
  return 1;
#endif
}

// Decodes the first utf8 character of s; an empty string yields 0.
swchar sonembtowc(const char *s) {
  if (*s==0)
    return 0; // end of char...
  const unsigned int codePoint=utf82unicode(s);
  return codePoint;
}

// Writes the multibyte form of wc into s and returns the number of bytes
// produced.  With XMLFLOW the output is the NUL-terminated utf8 encoding;
// otherwise a single unterminated byte is written for values up to 255 and
// -1 is returned for anything larger.
int swctomb(char *s, swchar wc) {
#ifdef XMLFLOW
  const char *encoded=unicode2utf8(wc);
  strcpy(s,encoded);
  return strlen(s);
#else
  if (wc>255)
    return -1;
  s[0]=(char)wc;
  return 1;
#endif
}

// Returns nonzero when the first character of utf8 has a descriptor whose
// general category is a letter; 0 otherwise (including for empty input,
// where the loop body never runs).
int isLetter(const char *utf8) {
  forutf8init();
  // Room for the longest character utf8charlen() can announce (6 bytes)
  // plus the terminator.  NOTE(review): assumes forutf8 copies one
  // character into s, as utf8copychar() does -- confirm against the macro.
  char s[7]; 
  forutf8(s,utf8) {
    unirdata *u=findutfdescrpt(s);
    // Only the first character is examined: return on the first iteration.
    return (u && isletter(uGeneralCategory(u)));
  }
  return 0;
}

// Loosely validates s (up to its first NUL byte) as utf8 and returns a
// cleaned copy in which control characters below 32 other than \n, \r and
// \t are replaced by a space.
// Returns "XXX" when a continuation byte is missing or malformed, and "X"
// when a lead byte of 0xfe or above is followed by three continuation
// bytes.  Sequences announced by lead bytes 0xf0..0xfd are all treated as
// four bytes long.
std::string CheckUTF8(const std::string &s) {
  // Read from the caller's buffer and build the result separately; the
  // previous strdup()'d working copy leaked on every early return.
  const unsigned char *S=(const unsigned char*)s.c_str();
  std::string result;

  while (*S) {
    if (*S<32 && *S!='\n' && *S!='\r' && *S!='\t') {
      /* do not refuse the string for those characters that are utf-8 valid
	 but not accepted for xml */
      result+=' ';S++;continue;
    }
    if (*S<=0x7f) { result+=(char)*S;S++;continue; }
    // The NUL test comes first in each check, so bytes past the terminator
    // are never read.
    if (!S[1] || (S[1]&0xc0)!=0x80) return "XXX";
    if (*S< 0xe0) { result.append((const char*)S,2);S+=2;continue; }
    if (!S[2] || (S[2]&0xc0)!=0x80) return "XXX";
    if (*S< 0xf0) { result.append((const char*)S,3);S+=3;continue; }
    if (!S[3] || (S[3]&0xc0)!=0x80) return "XXX";
    if (*S< 0xfe) { result.append((const char*)S,4);S+=4;continue; }
    return "X";
  }

  return result;
}

// Escapes s for use as a file name: every byte >= 128, every byte <= 32
// and the '%' character itself are replaced by '%' followed by exactly two
// lowercase hexadecimal digits (the format UnProtectFileName() expects).
// All other bytes are copied unchanged.
std::string ProtectFileName(const std::string &s) {
  static const char digits[]="0123456789abcdef";
  std::string r;
  for(std::string::size_type i=0;i<s.length();i++) {
    // Classify and format the byte as unsigned: a negative char passed to
    // "%x" prints eight digits, which overflowed the old 5-byte buffer.
    const unsigned char c=(unsigned char)s[i];
    if (c>=128 || c<=32 || c=='%') {
      r+='%';
      r+=digits[c>>4];
      r+=digits[c&0x0f];
    }
    else r+=s[i];
  }
  return r;
}

// Value of one lowercase hexadecimal digit character.
#define hex(a) ((a)>='a'?(a)-'a'+10:(a)-'0')

// Reverses ProtectFileName(): each "%xy" (two lowercase hex digits) becomes
// the byte it encodes.  When s contains a byte >= 128 or <= 32, or a '%'
// not followed by two lowercase hex digits, it was not produced by
// ProtectFileName() and is returned unchanged.
std::string UnProtectFileName(const std::string &s) {
  std::string r;
  for(std::string::size_type i=0;i<s.length();i++) {
    const unsigned char c=(unsigned char)s[i];
    if (c>=128 || c<=32) return s; // It was obviously not a protected filename
    if (c=='%') {
      // "i+2 >= length" instead of "i >= length-2": the subtraction wraps
      // around for strings shorter than two bytes and let s[i+2] be read
      // out of range.
      if (i+2>=s.length() || 
	  !strchr("0123456789abcdef",s[i+1]) ||
	  !strchr("0123456789abcdef",s[i+2])) return s; // It was obviously not a protected filename
      int chr=hex(s[i+1])*16+hex(s[i+2]);
      i+=2;
      r+=static_cast<char>(chr);
    }
    else r+=s[i];
  }
  return r;
}

// Returns s unchanged when check_utf8() accepts it; otherwise returns a
// copy in which every byte >= 128 and every byte < 32 is replaced by '?'.
string LoosyUTF8convert(const string &s) {
  if (check_utf8(s.c_str())) return s;
  string r;
  unsigned int i;
  for(i=0;i<s.length();i++) {
    // Test the unsigned value so the result does not depend on whether
    // plain char is signed on the target platform.
    const unsigned char c=(unsigned char)s[i];
    if (c>=128 || c<32) {
      r+="?";
    }
    else r+=s[i];
  }
  return r;
}

#ifdef XMLFLOW
/* Korean specific decomposition code */

//Unicode to combination code transformation. 
//
//Note: Unicode has three code areas corresponding to the three 
//components for a Korean letter: first consonant, medial vowel, final consonant
//
//unicode (U+1100~U+1112) is prepared for initial consonant; 
//unicode (U+1161~U+1175) is prepared for medial vowel;
//unicode (U+11A8~U+11C2) is prepared for final consonant.
//Currently, unicode (U+1113~U+1119), (U+1175~U+11A2), and (U+11c3~U+11f9) are not 
//used in Korean writing.
//
//unicode (U+3130~U+318E) is prepared for notating individual jamo (consonant or vowel); 
//unicode (U+AC00~U+D7A3) is prepared for syllable (letter).
//
//Korean combination code consists of three parts: 
//initial consonant, medial vowel, and final consonant.
//Each part has its own sequence number in its correspondent code area.
//
//The following function splits unicode into three component (consonant, vowel, consonant): 
//initial (5 bits), medial (5 bits), final (5 bits), 
//and compose them into 2 bytes' code: [initial medial final].
//
// eg., (U+B0A0) NAL(initial: NIEUN, medial:A, final:RIEUL) 
//       => (00010 00000 01000) 
// Index of HANGUL CHOSEONG 'NIEUN'   in (U+1100~U+1112) is 2,
// Index of HANGUL JUNGSEONG 'A'      in (U+1161~U+1175) is 0,
// Index of HANGUL JONGSEONG 'RIEUL'  in (U+11A8~u+11C2) is 8.

// Table sizes, fixed a priori.  UNIJAMOTABSIZE is also used as the declared
// array bound of UniKoreanJamoTblSND, so supplying more initialisers than
// that is a compile error.
#define UNIJAMOTABSIZE	28
#define UNIJAMOTABSIZE_LJAMO	19

// Compatibility-jamo code points (U+3130 block) searched by
// ko_UniCodeToCombCode() when l_jamo is false; the index of a match becomes
// the combination code.  Entry 0 (0x3130) shares index 0 with "not found".
// NOTE(review): the 28 entries presumably follow the order of the 28 final
// consonant (jongseong) slots of a Hangul syllable -- confirm.
static const unsigned int UniKoreanJamoTblSND[UNIJAMOTABSIZE]=
{
0x3130,0x3131,0x3132,0x3133,0x3134,0x3135,0x3136,//nh
0x3137,0x3139,0x313A,0x313B,0x313C,0x313D,//ls
0x313E,0x313F,0x3140,0x3141,0x3142,0x3144,//s
0x3145,0x3146,0x3147,0x3148,0x314A,0x314B,//kh
0x314C,0x314D,0x314E  //h
};

// Compatibility-jamo code points (U+3130 block) searched by
// ko_UniCodeToCombCode() when l_jamo is true; the index of a match is
// shifted into bits 10 and above of the combination code.  The 19 entries
// are the consonants that can start a syllable, in ascending order.
// Entry 14 is U+314A: the former value 0x214A lies outside the jamo block,
// broke the ascending sequence and could therefore never be matched.
static const unsigned int UniKoreanJamoTblSND_ljamo[]=
{
  0x3131,0x3132,0x3134,0x3137,0x3138,0x3139, //l
  0x3141,0x3142,0x3143,0x3145,0x3146,//ss
  0x3147,0x3148,0x3149,0x314A,0x314B,//kh
  0x314C,0x314D,0x314E//h
};

// Code point intervals (inclusive) used by the Korean helpers below to
// classify a character before converting it to a combination code.

// Precomposed Hangul syllables.
// NOTE(review): the commentary earlier in this file gives the syllable area
// as U+AC00~U+D7A3, while the upper bound here is 0xD79F -- confirm whether
// the last four syllables are excluded on purpose.
static const unsigned int UnicodeKoreanBase=0xAC00;
static const unsigned int UnicodeKoreanLast=0xD79F;

// First jamo area (U+1100 block); ko_UniCodeToCombCode() maps every code
// point in it to combination code 0.
static const unsigned int UnicodeKoreanJamoBase1=0x1100;
static const unsigned int UnicodeKoreanJamoLast1=0x11FA;

// Second jamo area (U+3130 block, individual jamo); looked up in the
// UniKoreanJamoTblSND tables.
static const unsigned int UnicodeKoreanJamoBase2=0x3130;
static const unsigned int UnicodeKoreanJamoLast2=0x314E;

// Returns true when uniCode is a precomposed Hangul syllable whose final
// consonant index is 8; false for every other syllable and for any code
// point outside the syllable area.
static bool ko_UniCodeToCombCode_L(unsigned int uniCode)
{
    /* what about the 2 other areas? */
    if (uniCode<UnicodeKoreanBase || uniCode>UnicodeKoreanLast)
        return 0;

    const unsigned int offset=uniCode-UnicodeKoreanBase;
    const unsigned int withinInitial=offset%(21*28); //VCount*TCount
    const unsigned int jongSung=withinInitial%28;    //TCount
    return (jongSung==0x08);
}

// Converts uniCode to a 15-bit combination code stored in combCode:
//  - precomposed syllable: [initial:5][medial:5][final:5];
//  - first jamo area (U+1100 block): always 0;
//  - second jamo area (U+3130 block): index of the code point in
//    UniKoreanJamoTblSND, or, when l_jamo is true, index in
//    UniKoreanJamoTblSND_ljamo shifted left by 10 (the initial field);
//    0 when the code point is not in the table;
//  - anything else: 0.
static void ko_UniCodeToCombCode(unsigned int uniCode, unsigned int &combCode,bool l_jamo=false)
{
    if (uniCode>=UnicodeKoreanBase && uniCode<=UnicodeKoreanLast)
      {
	unsigned int choSung,jungSung,jongSung;
	uniCode=uniCode-UnicodeKoreanBase;
        choSung=uniCode/(21*28); //VCount*TCount;
        uniCode=uniCode%(21*28); //VCount*TCount;
        jungSung=uniCode/28;  //TCount
        uniCode=uniCode%28;   //TCount
        jongSung=uniCode;
        combCode=choSung;
        combCode=(combCode<<5)+jungSung;
        combCode=(combCode<<5)+jongSung;
	return;
      }
    if (uniCode>=UnicodeKoreanJamoBase1 && uniCode<=UnicodeKoreanJamoLast1)
      {
        combCode=0x0;
	return;
      }
    if (uniCode>=UnicodeKoreanJamoBase2 && uniCode<=UnicodeKoreanJamoLast2)
      {
	size_t fi;
	const unsigned int *table;
	size_t table_size;
	if(l_jamo) {
	  table=UniKoreanJamoTblSND_ljamo;
	  table_size=UNIJAMOTABSIZE_LJAMO;
	} else {
	  table=UniKoreanJamoTblSND;
	  table_size=UNIJAMOTABSIZE;
	}
	
	// Bounds check first: testing table[fi] before fi<table_size read
	// one element past the end whenever the code point was absent.
        for(fi=0;fi<table_size && uniCode!=table[fi];fi++)
	  ;
	
        if(fi>=table_size)
	  combCode=0x0;
        else
	  combCode=fi;
	if(l_jamo)
	  combCode<<=10;
	return;
      }
    combCode=0;
    return;
}

// Returns 1 when str matches the beginning of wpstr, 0 otherwise.  Every
// character of str but the last must match byte for byte; the last one may
// be partial: its combination code (computed with l_jamo=true) must equal
// that of the corresponding wpstr character either in full, with the final
// field cleared, or with the medial and final fields cleared.
// An empty str matches.
int ko_wd_equal_beg(const char *wpstr, const char *str) {
  char s[7];
  int remaining=utf8len(str);
  
  const char *p=wpstr;
  forutf8init();
  forutf8(s,str) {
    --remaining;

    if(remaining==0) { // last character, may be partial
      unsigned partial,full;
      
      ko_UniCodeToCombCode(utf82unicode(s),partial,true);
      ko_UniCodeToCombCode(utf82unicode(p),full,true);

      const bool same=(partial==full);
      const bool sameWithoutFinal=(partial==((full>>5)<<5));
      const bool sameInitialOnly=(partial==((full>>10)<<10));
      return (same || sameWithoutFinal || sameInitialOnly) ? 1 : 0;
    }

    if(strncmp(s,p,forutf8ls)!=0)
      return 0;

    p+=utf8charlen(p);
  }

  return 1;
}

// Returns 1 when str matches the end of wpstr, 0 otherwise.  wpstr is first
// advanced until it holds no more characters than str.  The first character
// of str may be partial: its combination code must equal that of the
// corresponding wpstr character either in full, or restricted to the final
// field, or restricted to the medial and final fields.  The remaining
// characters must match byte for byte.
int ko_wd_equal_end(const char *wpstr, const char *str) {
  char s[7];
  const size_t wanted=utf8len(str);
  
  // Skip the leading characters of wpstr that str cannot cover.
  const char *p=wpstr;
  while(utf8len(p)>wanted)
    p+=utf8charlen(p);
  
  forutf8init();

  bool atFirst=true;
  forutf8(s,str) {
    if(atFirst) { // first character, may be partial
      atFirst=false;
      
      unsigned partial,full;
      
      ko_UniCodeToCombCode(utf82unicode(s),partial);
      ko_UniCodeToCombCode(utf82unicode(p),full);

      const bool same=(partial==full);
      const bool sameFinal=(partial==(full&0x1F));        // low 5 bits: (1<<5)-1
      const bool sameMedialFinal=(partial==(full&0x03FF)); // low 10 bits: (1<<10)-1
      if(!(same || sameFinal || sameMedialFinal))
	return 0;
    } else if(strncmp(s,p,forutf8ls)) {
      return 0;
    }

    p+=utf8charlen(p);
  }

  return 1;
}

//Returns the final-consonant field (low 5 bits of the combination code) of
//the last character of utfa; nonzero therefore means that the last letter
//ends with a consonant.  Returns 0 for an empty string and when the last
//character has no descriptor.
int ko_cons(char *utfa)
{
  unsigned int combCode;

  if (!*utfa) return 0;
  
  /* Move to the last utf8 character of the string if not a single character */
  const char *last=utf8backup(utfa,1);
  unirdata *ru=findutfdescrpt(last);
  /* findutfdescrpt() returns NULL for an undecodable character; do not
     hand that to uCodePage(). */
  if (!ru) return 0;
  int cp=uCodePage(ru);
  
  ko_UniCodeToCombCode(cp,combCode);
  
  return (combCode & 0x1f);
}

//Returns nonzero when the last character of utfa is a precomposed Hangul
//syllable whose final-consonant index is 8 (see ko_UniCodeToCombCode_L).
//Returns 0 for an empty string and when the last character has no
//descriptor.
int ko_cons_l(char *utfa)
{
   if (!*utfa) return 0;

   /* Move to the last utf8 character of the string if not a single character */
   const char *last=utf8backup(utfa,1);
   unirdata *ru=findutfdescrpt(last);
   /* findutfdescrpt() returns NULL for an undecodable character; do not
      hand that to uCodePage(). */
   if (!ru) return 0;
   int cp=uCodePage(ru);

   return ko_UniCodeToCombCode_L(cp);
}


// Copies source into target, then rewrites the last character of target:
//   'ha'  becomes 'toy'    -> returns 1
//   'hay' becomes 'toy_e'  -> returns 2
// Any other ending leaves the copy untouched and returns 0.
// target must have room for source plus one extra character (the 'hay'
// replacement is one character longer than what it replaces).
int ko_changeKorEnd(char *source,char *target)
{
   strcpy(target,source);
   char *tail=(char*)utf8backup(target,1);

   int outcome=0;
   if(strcmp(tail,"하")==0) {
      strcpy(tail,"되");
      outcome=1;
   } else if(strcmp(tail,"해")==0) {
      strcpy(tail,"되어");
      outcome=2;
   }
   return outcome;
}

//
//Conjugates two letters given as combination codes.
//A word can have several letters, such as 'English'=>YENG_E
//and a letter YENG consists of three parts: 
//consonant1+vowel+consonant2, in unicode: (U+110B)+(U+1167)+(U+11BC)
//
//comb3 receives comb1|comb2 only when comb1 has an empty final field
//(low 5 bits), and comb2 carries nothing but a non-empty final field
//(bits 5..14 clear, low 5 bits set).  In every other case comb3 is set to
//the marker 0xffff, meaning "no combination possible".
//
static void ko_conjugation(unsigned int comb1,unsigned int comb2,unsigned int &comb3)
{
    const bool firstHasNoFinal=((comb1&0x1f)==0);
    const bool secondIsFinalOnly=((comb2&0x7fe0)==0) && ((comb2&0x1f)!=0);

    if(firstHasNoFinal && secondIsFinalOnly)
        comb3=comb1|comb2;
    else
        comb3=0xffff;
}

//Korean combination code to unicode transformation: unpacks the three
//5-bit fields [initial medial final] and recomposes the syllable code point.
// eg., NAL (initial: NIEUN, medial:A, final:RIEUL)
//      (00010 00000 01000) => (U+B0A0) 
static void ko_CombCodeToUniCode(unsigned int combCode, unsigned int &uniCode)
{
    const unsigned int choSung =(combCode&0x7c00)>>10; // initial consonant
    const unsigned int jungSung=(combCode&0x03e0)>>5;  // medial vowel
    const unsigned int jongSung= combCode&0x1f;        // final consonant

    uniCode=UnicodeKoreanBase+(choSung*21+jungSung)*28+jongSung;
}

//A Korean letter consists of three parts: consonant1+vowel+consonant2
//The following function conjugates two Korean words and writes the result
//to utfc (which must be large enough for both inputs).  Always returns true.
//
//When either input is empty the other one is copied.  When utfb does not
//start with an individual jamo of the U+3130 block, or when a descriptor
//cannot be found for the characters involved, the two strings are simply
//concatenated.  Otherwise:
//(1) the last character of utfa and the first of utfb are converted to
//    combination codes, (2) the two codes are conjugated, (3) on success
//    the combined letter replaces the last character of utfa and the rest
//    of utfb is appended; on failure the strings are concatenated.
int ko_conjugateString(char *utfa,char *utfb,char *utfc)
{
   if (!*utfa) 
   {
    strcpy(utfc,utfb);
    return true; 
   }
   else if(!*utfb)
   {
    strcpy(utfc,utfa);
    return true;  
   }

   char *lasta=(char*)utf8backup(utfa,1);
   unirdata *unia=findutfdescrpt(lasta);
   unirdata *unib=findutfdescrpt(utfb);

   /* findutfdescrpt() returns NULL for an undecodable character; nothing
      can be combined then, and NULL must not reach uCodePage(). */
   if(!unia || !unib)
   {
     strcpy(utfc,utfa);
     strcat(utfc,utfb);
     return true;
   }

   unsigned int cpa=uCodePage(unia);
   unsigned int cpb=uCodePage(unib);

   //check if ub contain individual jongSung
   if(!(cpb>=UnicodeKoreanJamoBase2&&cpb<=UnicodeKoreanJamoLast2))
   {
     /* if not we can simply cat the 2 strings */
     strcpy(utfc,utfa);
     strcat(utfc,utfb);
     return true;
   }
   else
   {
     /* if so, then make the combination */
     /* NOTE(review): a non-Hangul last character of utfa yields comb1==0,
        the same code as U+AC00 -- confirm callers only pass Hangul here. */
     unsigned int comb1,comb2,comb3;
     unsigned int unitmp;
     ko_UniCodeToCombCode(cpa,comb1);
     ko_UniCodeToCombCode(cpb,comb2);
     ko_conjugation(comb1,comb2,comb3);
     strcpy(utfc,utfa);
     if(comb3!=0xFFFF) {
       ko_CombCodeToUniCode(comb3,unitmp);
       /* drop the last character of the copy, then append the combined
          letter and what follows the jamo in utfb */
       char *lastc=(char*)utf8backup(utfc,1);
       *lastc=0;
       strcat(utfc,unicode2utf8(unitmp));
       strcat(utfc,utfb+utf8charlen(utfb));
     } else
       strcat(utfc,utfb);
   }
   return true;
}



/* Unicode sub-blocks - used essentially by CJK, all of them being f_CJKV */

// Nonzero when the first character of s has a code point in
// [0x3040, 0x3094).  Returns 0 when the character has no descriptor
// (empty or undecodable input).
int is_hiragana(char *s)
{
  unirdata *ru=findutfdescrpt(s);
  if (!ru) return 0;  // findutfdescrpt() yields NULL for ""/invalid input
  int cp=uCodePage(ru);
  return (0x3040<=cp && cp<0x3094);
}

// Nonzero when the first character of s has a code point in
// [0x30a1, 0x30fa) or equal to 0x30fc.  Returns 0 when the character has
// no descriptor (empty or undecodable input).
int is_katakana(char *s)
{
  unirdata *ru=findutfdescrpt(s);
  if (!ru) return 0;  // findutfdescrpt() yields NULL for ""/invalid input
  int cp=uCodePage(ru);
  return (0x30a1<=cp && cp<0x30fa) || cp==0x30fc;
}

// Nonzero when the first character of s has a code point in
// [0x4e00, 0x9fff).  Returns 0 when the character has no descriptor
// (empty or undecodable input).
int is_hanji(char *s)
{
  unirdata *ru=findutfdescrpt(s);
  if (!ru) return 0;  // findutfdescrpt() yields NULL for ""/invalid input
  int cp=uCodePage(ru);
  return (0x4e00<=cp && cp<0x9fff);        
}

// Nonzero when the first character of s has a code point in
// [0x3130, 0x314e] or [0xac00, 0xd79f].  Returns 0 when the character has
// no descriptor (empty or undecodable input).
int is_hangul(char *s)
{
  unirdata *ru=findutfdescrpt(s);
  if (!ru) return 0;  // findutfdescrpt() yields NULL for ""/invalid input
  int cp=uCodePage(ru);
  return (0x3130<=cp && cp<=0x314e) || (cp>=0xac00 && cp<=0xd79f);        
}

// Returns the name of the block/group containing the code point of ru, as
// listed in the table below, or "undef" when no interval matches.
// Intervals are inclusive and are tried in table order.
const char *uGroup(unirdata *ru)
{
  struct BlockRange { int first; int last; const char *label; };
  static const BlockRange blocks[]={
    {0x0000,0x024F,"Latin"},
    {0x0370,0x03FF,"Greek"},
    {0x0400,0x052F,"Cyrillic"},
    {0x0530,0x058F,"Armenian"},
    {0x0590,0x05FF,"Hebrew"},
    {0x0600,0x06FF,"Arabic"},
    {0x0700,0x074F,"Syriac"},
    {0x0780,0x07BF,"Thaana"},
    {0x0900,0x097F,"Devanagari"},
    {0x0980,0x09FF,"Bengali"},
    {0x0A00,0x0A7F,"Gurmukhi"},
    {0x0A80,0x0AFF,"Gujarati"},
    {0x0B00,0x0B7F,"Oriya"},
    {0x0B80,0x0BFF,"Tamil"},
    {0x0C00,0x0C7F,"Telugu"},
    {0x0C80,0x0CFF,"Kannada"},
    {0x0D00,0x0D7F,"Malayalam"},
    {0x0D80,0x0DFF,"Sinhala"},
    {0x0E00,0x0E7F,"Thai"},
    {0x0E80,0x0EFF,"Lao"},
    {0x0F00,0x0FFF,"Tibetan"},
    {0x1000,0x109F,"Myanmar"},
    {0x10A0,0x10FF,"Georgian"},
    {0x1100,0x11FF,"Hangul Jamo"},
    {0x1200,0x137F,"Ethiopic"},
    {0x13A0,0x13FF,"Cherokee"},
    {0x1400,0x167F,"Unified Canadian Aboriginal Syllabics"},
    {0x1680,0x169F,"Ogham"},
    {0x16A0,0x16FF,"Runic"},
    {0x1700,0x171F,"Tagalog"},
    {0x1720,0x173F,"Hanunoo"},
    {0x1740,0x175F,"Buhid"},
    {0x1760,0x177F,"Tagbanwa"},
    {0x1780,0x17FF,"Khmer"},
    {0x1800,0x18AF,"Mongolian"},
    {0x1E00,0x1EFF,"Latin"},
    {0x1F00,0x1FFF,"Greek"},
    {0x2000,0x206F,"General Punctuation"},
    {0x2070,0x209F,"Superscripts and Subscripts"},
    {0x20A0,0x20CF,"Currency Symbols"},
    {0x2100,0x214F,"Letterlike Symbols"},
    {0x2150,0x218F,"Number Forms"},
    {0x2190,0x21FF,"Arrows"},
    {0x2200,0x22FF,"Mathematical Operators"},
    {0x2300,0x23FF,"Miscellaneous Technical"},
    {0x2400,0x243F,"Control Pictures"},
    {0x2440,0x245F,"Optical Character Recognition"},
    {0x2460,0x24FF,"Enclosed Alphanumerics"},
    {0x2500,0x257F,"Box Drawing"},
    {0x2580,0x259F,"Block Elements"},
    {0x25A0,0x25FF,"Geometric Shapes"},
    {0x2600,0x26FF,"Miscellaneous Symbols"},
    {0x2700,0x27BF,"Dingbats"},
    {0x27C0,0x27EF,"Miscellaneous Mathematical Symbols-A"},
    {0x27F0,0x27FF,"Supplemental Arrows-A"},
    {0x2800,0x28FF,"Braille Patterns"},
    {0x2900,0x297F,"Supplemental Arrows-B"},
    {0x2980,0x29FF,"Miscellaneous Mathematical Symbols-B"},
    {0x2A00,0x2AFF,"Supplemental Mathematical Operators"},
    {0x2E80,0x2EFF,"CJK Radicals Supplement"},
    {0x2F00,0x2FDF,"Kangxi Radicals"},
    {0x2FF0,0x2FFF,"Ideographic Description Characters"},
    {0x3000,0x303F,"CJK Symbols and Punctuation"},
    {0x3040,0x309F,"Katakana/Hiragana"},
    {0x30A0,0x30FF,"Katakana/Hiragana"},
    {0x3100,0x312F,"Bopomofo"},
    {0x3130,0x318F,"Hangul Compatibility Jamo"},
    {0x3190,0x319F,"Kanbun"},
    {0x31A0,0x31BF,"Bopomofo Extended"},
    {0x31F0,0x31FF,"Katakana Phonetic Extensions"},
    {0x3200,0x32FF,"Enclosed CJK Letters and Months"},
    {0x3300,0x33FF,"CJK Compatibility"},
    {0x3400,0x4DBF,"CJK Unified Ideographs Extension A"},
    {0x4E00,0x9FFF,"CJK Unified Ideographs"},
    {0xA000,0xA48F,"Yi Syllables"},
    {0xA490,0xA4CF,"Yi Radicals"},
    {0xAC00,0xD7AF,"Hangul Syllables"},
    {0xD800,0xDB7F,"High Surrogates"},
    {0xDB80,0xDBFF,"High Private Use Surrogates"},
    {0xDC00,0xDFFF,"Low Surrogates"},
    {0xE000,0xF8FF,"Private Use Area"},
    {0xF900,0xFAFF,"CJK Compatibility Ideographs"},
    {0xFB00,0xFB4F,"Alphabetic Presentation Forms"},
    {0xFB50,0xFDFF,"Arabic"},
    {0xFE00,0xFE0F,"Variation Selectors"},
    {0xFE20,0xFE2F,"Combining Half Marks"},
    {0xFE30,0xFE4F,"CJK Compatibility Forms"},
    {0xFE50,0xFE6F,"Small Form Variants"},
    {0xFE70,0xFEFF,"Arabic Presentation Forms-B"},
    {0xFF00,0xFFEF,"Halfwidth and Fullwidth Forms"},
    {0xFFF0,0xFFFF,"Specials"}
  };

  const int cp=uCodePage(ru);

  const size_t nBlocks=sizeof(blocks)/sizeof(blocks[0]);
  for (size_t i=0;i<nBlocks;i++)
    if (cp>=blocks[i].first && cp<=blocks[i].last)
      return blocks[i].label;

  return "undef";
}
#endif
