#ifndef _UNICODE_H_
#define _UNICODE_H_

#include <string>
#include <vector>
#include <stdexcept>

#include <unicode/unorm.h>
#include <unicode/utypes.h>
#include <unicode/uchar.h>
#include <unicode/ustring.h>


// Exception type thrown by the helpers in this header when an ICU call
// reports a failure. It carries only the message given at construction.
class unicode_error : public std::runtime_error {
public:
  unicode_error(const std::string & msg)
    : std::runtime_error(msg)
  {
  }
};

// Throws unicode_error when `uerror` is a failure code according to
// U_FAILURE; otherwise does nothing. The exception message is the fixed
// prefix "Unicode ICU error: " followed by the name ICU gives the code.
inline void unicode_check(UErrorCode & uerror) {
  if (!U_FAILURE(uerror)) {
    return;
  }

  std::string message("Unicode ICU error: ");
  message += u_errorName(uerror);
  throw unicode_error(message);
}


namespace unicode {


  // Type used throughout this namespace for a single code point value;
  // an alias for ICU's UChar32.
  typedef UChar32 code_point;


// UTF-8 sequence / codepoint translation

// Strips the length-marker bits from a UTF-8 lead byte held in `c`, leaving
// only its payload bits. `count` is the number of trail bytes that follow:
// with 0 the low 7 bits are kept, otherwise the low (6 - count) bits.
inline void u8_mask_lead_byte(code_point & c, int count) {
  const code_point payload_mask =
    (count == 0) ? 0x7f : ((1 << (6 - count)) - 1);
  c &= payload_mask;
}
// Function wrapper around ICU's U8_COUNT_TRAIL_BYTES macro: returns the
// number of trail bytes the macro reports for the lead byte `c`.
inline int u8_count_trail_bytes(char c) {
  const int trail = U8_COUNT_TRAIL_BYTES(c);
  return trail;
}


// Decodes the UTF-8 sequence that starts at `it` and returns its code point.
// `it` is advanced past every byte that was consumed (lead byte plus up to
// three trail bytes). No validation is performed and there is no end-of-range
// check: the caller must guarantee that a complete sequence is readable.
template<typename Iterator>
code_point utf8_next(Iterator & it) {

  const int trail = u8_count_trail_bytes(*it);

  code_point cp = *it;
  ++it;
  u8_mask_lead_byte(cp, trail);

  // Only sequences with one to three trail bytes are expanded; any other
  // count leaves the masked lead byte as the result.
  if (trail >= 1 && trail <= 3) {
    for (int k = 0; k < trail; ++k) {
      cp = (cp << 6) | (*it & 0x3f);
      ++it;
    }
  }

  return cp;
}

template<typename CharIterator>
class utf8_iterator {

  static const code_point UNDEF = 0xffffffff;

  CharIterator it;
  code_point   c;


  void skip() {
    int count = u8_count_trail_bytes(*it) + 1;
    while (count) { ++it; --count; }
  }
  
  void compute() {

    c = *it;
    int count = u8_count_trail_bytes(c);

//    std::cerr << "compute: c=" << c << " count=" << count;
    u8_mask_lead_byte(c, count);
//    std::cerr << " after maskleadbyte: c=" << c << std::endl;
    switch (count) {
    case 3:
      ++it; c = (c << 6) | (*it & 0x3f);
    case 2:
      ++it; c = (c << 6) | (*it & 0x3f);
    case 1:
      ++it; c = (c << 6) | (*it & 0x3f);
    }
  }


public:

  inline utf8_iterator(CharIterator _it) : it(_it), c(UNDEF) {}

  inline code_point operator*() {
    if (c == UNDEF) { compute(); }
    return c;
  }

  inline utf8_iterator<CharIterator> & operator++() {
    if (c == UNDEF) {
      skip();
    } else { ++it; }
    c = UNDEF;
    return *this;
  }
  
  template<typename T>
  bool operator==(const utf8_iterator<T> & b) const {
    return c == b.c && it == b.it;
  }

  template<typename T>
  bool operator!=(const utf8_iterator<T> & b) const {
    return c != b.c || it != b.it;
  }
};



// Convenience factory: wraps `it` in a utf8_iterator, letting the compiler
// deduce the underlying iterator type.
template<typename CharIterator>
inline utf8_iterator<CharIterator> make_utf8_iterator(CharIterator it) {
  utf8_iterator<CharIterator> wrapped(it);
  return wrapped;
}

// Writes the UTF-8 encoding of `c` through `out`, one byte per assignment.
// Values up to 0x7f produce one byte, up to 0x7ff two, up to 0xffff three,
// and anything larger four. The value is not validated. `out` is taken by
// value, so the caller's iterator is not advanced.
template<typename OutputIterator>
inline void codepoint_to_utf8(code_point c, OutputIterator out) {

  if (c <= 0x7f) {
    // Single byte.
    *out = c; ++out;
    return;
  }

  // Lead byte, plus the upper continuation bytes for the longer forms.
  if (c <= 0x7ff) {
    *out = (c >> 6) | 0xc0; ++out;
  } else if (c <= 0xffff) {
    *out = (c >> 12) | 0xe0; ++out;
    *out = ((c >> 6) & 0x3f) | 0x80; ++out;
  } else {
    *out = (c >> 18) | 0xf0; ++out;
    *out = ((c >> 12) & 0x3f) | 0x80; ++out;
    *out = ((c >> 6) & 0x3f) | 0x80; ++out;
  }

  // Every multi-byte form ends with the low six bits.
  *out = (c & 0x3f) | 0x80; ++out;
}

// Replaces the contents of `res` with the UTF-8 encoding of `cp`.
inline void codepoint_to_string(code_point cp, std::string & res) {
  res.clear();
  codepoint_to_utf8(cp, std::back_inserter(res));
}

/* libicu wrapper */

// Lower-cases a single UTF-16 code unit through ICU's u_tolower. The result
// is converted back to UChar on return.
inline UChar tolower(UChar c) {
  const UChar32 lowered = u_tolower(c);
  return lowered;
}

// True when ICU classifies `c` in one of the three mark categories
// (non-spacing, enclosing or combining spacing mark).
inline bool is_mark(code_point c) {
  switch (u_charType(c)) {
  case U_NON_SPACING_MARK:
  case U_ENCLOSING_MARK:
  case U_COMBINING_SPACING_MARK:
    return true;
  default:
    return false;
  }
}


// True when ICU's u_isalpha reports `c` as alphabetic.
inline bool is_alpha(code_point c) {
  return u_isalpha(c) != 0;
}
// True when ICU's u_ispunct reports `c` as punctuation.
inline bool is_punct(code_point c) {
  return u_ispunct(c) != 0;
}
// True when ICU's u_isdigit reports `c` as a digit.
inline bool is_digit(code_point c) {
  return u_isdigit(c) != 0;
}
// Returns the value ICU's u_charType reports for `c`.
inline int8_t property(code_point c) {
  const int8_t category = u_charType(c);
  return category;
}



inline int u_str_from_utf8(std::vector<UChar> & to, const char * from, int len, UErrorCode & uerror) {
  int res;
  u_strFromUTF8(& to[0], to.size(), & res, from, len, & uerror);
  if (res >= to.size()) {
    uerror = U_ZERO_ERROR;
    to.resize(res);
    u_strFromUTF8(& to[0], to.size(), & res, from, len, & uerror);
  }
  return res;
}


inline int u_str_from_utf8(std::vector<UChar> & to, const std::string & from, UErrorCode & uerror) {
  return u_str_from_utf8(to, from.data(), from.size(), uerror);
}

inline int u_str_from_utf8(std::vector<UChar> & to, const std::string & from) {
  UErrorCode uerror = U_ZERO_ERROR;
  int res = u_str_from_utf8(to, from.data(), from.size(), uerror);
  unicode_check(uerror);
  return res;
}


inline int utf8_from_u_str(std::vector<char> & dest, const UChar * src, int srclen,
                           UErrorCode & uerror) {

  int destlen;
  u_strToUTF8(& dest[0], dest.size(), & destlen, src, srclen, & uerror);

  if (destlen > dest.size()) {
    uerror = U_ZERO_ERROR;
    dest.resize(destlen);
    u_strToUTF8(& dest[0], dest.size(), & destlen, src, srclen, & uerror);
  }
  return destlen;
}


inline void utf8_from_u_str(std::string & str, const UChar * src, int srclen, UErrorCode & uerror) {
  std::vector<char> v(srclen * 2);
  int len = utf8_from_u_str(v, src, srclen, uerror);
  str.assign(& v[0], len);
}

inline void utf8_from_u_str(std::string & dest, std::vector<UChar> src, int srclen) {
  UErrorCode uerror = U_ZERO_ERROR;
  utf8_from_u_str(dest, & src[0], srclen, uerror);
  unicode_check(uerror);
}

inline void utf8_from_u_str(std::string & dest, std::vector<UChar> src, UErrorCode & uerror) {
  utf8_from_u_str(dest, & src[0], src.size(), uerror);
}

inline void utf8_from_u_str(std::string & dest, std::vector<UChar> src) {
  UErrorCode uerror = U_ZERO_ERROR;
  utf8_from_u_str(dest, src, uerror);
  unicode_check(uerror);
}




inline std::string utf8_from_u_str(const UChar * src, int srclen, UErrorCode & uerror) {
  std::vector<char> v(srclen * 2);
  int len = utf8_from_u_str(v, src, srclen, uerror);
  return std::string(& v[0], len);
}

inline std::string utf8_from_u_str(const std::vector<UChar> & src, UErrorCode & uerror) {
  std::vector<char> v(src.size() * 2);
  int len = utf8_from_u_str(v, & src[0], src.size(), uerror);
  return std::string(& v[0], len);
}

inline int normalize(std::vector<UChar> & dest, UNormalizationMode mode,
                     const UChar * src, int srclen, UErrorCode & uerror) {

  int destlen = unorm_normalize(src, srclen, mode, 0, & dest[0], dest.size(), & uerror);

  if (destlen > dest.size()) {
    uerror = U_ZERO_ERROR;

    dest.resize(destlen);
    destlen = unorm_normalize(src, srclen, mode, 0, & dest[0], dest.size(), & uerror);
  }

  return destlen; 
}

inline void normalize(std::vector<UChar> & str, UNormalizationMode mode, UErrorCode & uerror) {
  std::vector<UChar> buf(str.size() * 2);
  int lenb = normalize(buf, mode, & str[0], str.size(), uerror);
  buf.resize(lenb);
  str.swap(buf);
}

inline void normalize(std::vector<UChar> & str, UNormalizationMode mode) {
  UErrorCode uerror = U_ZERO_ERROR;
  normalize(str, mode, uerror);
  unicode_check(uerror);
}


// Normalizes the UTF-8 string `text` in place with the given `mode`: the text
// is converted to UTF-16, normalized, and converted back. The helpers it
// calls raise unicode_error if ICU reports a failure.
//
// An empty `text` is returned unchanged without calling into ICU.
inline void normalize(std::string & text, UNormalizationMode mode) {

  if (text.empty()) {
    // Nothing to normalize; this also keeps the helpers away from
    // zero-length scratch buffers.
    return;
  }

  // UTF-16 never needs more code units than the UTF-8 input has bytes.
  std::vector<UChar> ustr(text.size());
  int len = u_str_from_utf8(ustr, text);
  ustr.resize(len);
  normalize(ustr, mode);
  utf8_from_u_str(text, ustr);
}


inline int case_fold(std::vector<UChar> & dest, const std::vector<UChar> & from, int len) {

  UErrorCode uerror = U_ZERO_ERROR;

  int capacity = dest.size();
  
  int res = u_strFoldCase(& dest[0], capacity, & from[0],  len, U_FOLD_CASE_DEFAULT, & uerror);

  if (res > capacity) {
    uerror = U_ZERO_ERROR;
    dest.resize(res);
    res = u_strFoldCase(& dest[0], res, & from[0],  len, U_FOLD_CASE_DEFAULT, & uerror);
  }
  unicode_check(uerror);
  return res;
}


// Stores in `dest` the case-folded form of the UTF-8 string `from`: the text
// is converted to UTF-16, folded, and converted back to UTF-8. The helpers it
// calls raise unicode_error if ICU reports a failure.
//
// An empty `from` yields an empty `dest` without calling into ICU.
inline void case_fold(std::string & dest, const std::string & from) {

  if (from.empty()) {
    dest.clear();
    return;
  }

  // Pre-size the UTF-16 buffer: the conversion needs a non-empty buffer to
  // write into, and UTF-16 never needs more code units than the UTF-8 input
  // has bytes.
  std::vector<UChar> From(from.size());
  int Fromlen = u_str_from_utf8(From, from);

  std::vector<UChar> Dest(Fromlen);

  int len = case_fold(Dest, From, Fromlen);
  utf8_from_u_str(dest, Dest, len);
}

}; // unicode namespace

#endif

