#include <cstring>
#include <string>
#include <vector>
#include <outilex/escape.h>
#include <outilex/unicode.h>
#include <outilex/lexical_mask.h>

using namespace std;

namespace {

/* Split a "pos+feat-feat..." specification into its components.
 *
 * The text located before the first '+' or '-' is appended to res as the
 * first token (the whole text when it contains no sign, an empty string when
 * it starts with a sign). Each following token starts at a '+' or '-' and
 * keeps that sign as its first character, e.g. "noun+m-s" gives
 * "noun", "+m", "-s".
 *
 * Tokens are appended to res, which is not cleared first. An empty text
 * appends nothing.
 */
void feats_tokenize(std::vector<std::string> & res, const std::string & text) {

  if (text.empty()) { return; }

  static const char SIGNS[] = "+-";

  // Positions are kept as size_type: storing the result of find_first_of()
  // in an int truncates it and makes the comparison with npos rely on
  // sign extension.
  std::string::size_type start = text.find_first_of(SIGNS);

  // leading token (the POS); substr(0, npos) is the whole text
  res.push_back(text.substr(0, start));

  while (start != std::string::npos) {

    const std::string::size_type next = text.find_first_of(SIGNS, start + 1);

    // the last token runs up to the end of the text
    const std::string::size_type count =
      (next == std::string::npos) ? std::string::npos : next - start;

    res.push_back(text.substr(start, count));
    start = next;
  }
}


/* Characters regarded as punctuation symbols by is_punct().
 * Declared const: a string literal must not be bound to a non-const char *.
 */
const char * const PUNC = "(){}[]<>,?;.:/=+%$*-_!'\"&@#\\";

/* Returns true when c is one of the characters listed in PUNC.
 * The NUL character is rejected explicitly, because strchr() would otherwise
 * match the terminator of PUNC and report it as punctuation.
 */
inline bool is_punct(char c) {
  return c != '\0' && std::strchr(PUNC, c) != NULL;
}

} // namespace ""



void lexical_mask::read_text(const std::string & txt, ling_def * lingdef) {

  //  cerr << "lexmask::read_text(" << txt << ")\n";

  unspec();

  if (!((txt[0] == '<' && txt[txt.size() - 1] == '>') 
      || (txt[0] == '{' && txt[txt.size() - 1] == '}'))) {
    
    // if mask not between '<','>' or '{','}', assume plain form 
 
    form = lemma = txt;
    unicode::case_fold(case_fold, form);
    char c = txt[0];

    if (c >= '0' && c <= '9') { // number

      pos = lingdef->number_pos();
    
    } else if (is_punct(c)) { // punctuation symbol

      pos = lingdef->punc_pos();

    } else {
      pos = lingdef->get_pos("lex");
    }
 
    return;
  }

  /* string between <,> or {,}
   * if <> : basic lexmask
   * if {} : lexical entry -> retrieve defaults values  for unspecified attrs
   */

  bool is_a_lexical_entry = (txt[0] == '{');


  string::size_type dot = txt.rfind('.');

  if (dot != string::npos) { // there is a dot, form are specified
  
    string::size_type comma = find_unescaped(txt, ',');

    if (comma < dot) { // inflexionnal form specified 

      // cerr << "parsing form = " << txt.substr(1, comma - 1) << endl;

      form = unescape(txt.substr(1, comma - 1));
      if (! form.empty()) { unicode::case_fold(case_fold, form); }
 
    } else { comma = 0; }

    // cerr << "parsing lemma = " << txt.substr(comma + 1, dot - (comma + 1)) << endl;

    lemma = unescape(txt.substr(comma + 1, dot - (comma + 1)));

  } else { dot = 0; }
 
  /* form are parsed, now parse POS and features
   */

  // look for ':' (DELA notation)
  string::size_type colon = txt.find(':', dot + 1);

  if (colon == string::npos) {
    colon = txt.size() - 1;
  }

  vector<string> v; 
  feats_tokenize(v, txt.substr(dot + 1, colon - (dot + 1)));

  if (v.empty()) { return; } // no POS specified


  /* check for void POS
   */

  if (v[0] == "void") { clear(); return; }

  /* v[0] contains the pos 
   * v[1], ..., v[n] contain the feat set
   */

  pos = lingdef->get_pos(v[0]);
  if (pos == NULL) { 
    if (v.size() == 1 && dot == 0) { // assume canical form (e.g. <pomme>)
      pos = pos_unspec;
      lemma = v[0];
      return;
    }
    throw xml_parse_error("lexmask: unknow pos : '" + v[0] + "'"); 
  }

  for (int i = 1; i < v.size(); i++) {

    feat_set fs; attr_def * attr;

    if (! pos->get_feat_set(v[i], attr, fs)) {
      string unk = v[0] + v[i];
      if (unknow_attributes.find(unk) == unknow_attributes.end()) {
        cerr << "warning: in label : '" << txt << "': unknow feat :" << v[i] << '\n'; 
        unknow_attributes.insert(unk);
      }
      continue;
    }

    feats_map::iterator it = feats.find(attr);
    if (it == feats.end()) {
      feats[attr] = fs;
    } else {
      feats[attr] |= fs;
    }
  }

  // add morpho feats after ':' (DELA notation)
  string feat("++");
  int nbcolon = 0;
  while (colon < txt.size() - 1) {

    if (txt[colon] != ':') {

      feat[1] = txt[colon];

      feat_set fs; attr_def * attr;

      if (pos->get_feat_set(feat, attr, fs)) {

        feats_map::iterator it = feats.find(attr);

        if (it == feats.end()) {
          feats[attr] = fs;
        } else {
          feats[attr] |= fs;
        } 

      } else {
        cerr << "error with label : '" << txt << "': unknow feat :" << feat << '\n'; 
      }        

    } else { nbcolon++; }

    colon++;
  }

  if (nbcolon > 1) { cerr << "warning : label '" << txt << "' probably wrongly loaded\n"; }

  if (is_a_lexical_entry) { // retrieve default values ...

    int nattrs = pos->nb_attrs();
    for (int i = 0; i < nattrs; ++i) {

      attr_def * attr = pos->get_attr(i);

      if (attr->get_default_value() != -1) { // if a default value exists

        if (feats.find(attr) == feats.end()) { // and if the attribute is not explicitly defined
          // set it to default
          feat_set fs(attr->get_type(), attr->get_default_value());
          feats[attr] = fs;
        }
      }
    }
  }
}

namespace {

/* Write one attribute and its feature values on os.
 *
 * The work is delegated to dump_feat_val_set() of the feature set's type,
 * which receives the attribute name, the value held by fs, the result of
 * attr->do_shortcut() and the output stream.
 */
void feat_val_dump_text(attr_def * attr, const feat_set & fs, std::ostream & os) {

  fs.type->dump_feat_val_set(attr->get_name(), fs.val, attr->do_shortcut(), os);
}

} // namespace ""

void lexical_mask::dump_text(std::ostream & os) const {

  if (empty()) { os << "<void>"; return; }

  ling_def * ldef = 0;
  if (pos != pos_unspec) {
    ldef = pos->lingdef;
    if (pos == ldef->punc_pos()
        && form.size() == 1
        && is_punct(form[0])) { // special case
      os << form;
      return;
    }
  }

  os << '<';

  if (! form.empty()) {

    os << escape(form, ",.") << ',' << escape(lemma, ",.") << '.';

  } else if (! lemma.empty()) {
  
    os << escape(lemma, ",.") << '.';
  }


  if (pos != pos_unspec) {

    os << pos->get_name();

    for (feats_map::const_iterator it = feats.begin(); it != feats.end(); ++it) {
      feat_val_dump_text(it->first, it->second, os);
    }
  }
  os << '>';
}

