#include <iostream>

#include <boost/regex.hpp>
#include <boost/regex/icu.hpp>


#include <outilex/wrtn_chart.h>
#include <outilex/fsa_decoration.h>

#include <outilex/stringtok.h>
#include <outilex/escape.h>

#include <outilex/information_extraction.h>


using namespace std;
using namespace boost;

namespace {


void output_match(const wrtn_chart & chart, const wrtn_match & match, ostream & os) {

  int size = match.path.size();
  for (int i = 0; i < size; ++i) {
    
    const syntref & ref = match.path[i];
    
    if (! match.out[i].empty()) { os << "'" << match.out[i] << "'"; }
 
    if (ref.transno < 0) {
      output_match(chart, chart[ref.qno][- ref.transno - 1], os);
    } else {
      os << chart.fsa.get_trans(ref.qno, ref.transno).in().form << ' ';
    }
  }

  if (! match.out[size].empty()) { os << "'" << match.out[size] << "'"; }
}



void add_feats_from_output(ostringstream & code, string & lemma,
                           const sentence_fsa & text,
                           const syntref & ref, const string & label) {

  //cerr << "add_feats: code = " << code.str() << " label = '" << label << "'\n";

  vector<string> codes;
  stringtok(label, "+,[]", back_inserter(codes));

  for (int n = 0; n < codes.size(); ++n) {

    if (codes[n].empty()) { continue; }

    if (codes[n][0] == '^') { // retrieve feat from the lexical entry

      if (ref.transno < 0) {
        cerr << "error: in label " << label << ": " << codes[n] << " under a subcall\n";
        continue;
      }

      const lexical_mask & m = text.get_trans(ref.qno, ref.transno).in();
      const string attrname = codes[n].substr(1);

      if (attrname == "lemma") {

        lemma = m.lemma;

      } else {

        const feat_set & fs = m[attrname];
        code << "+" << attrname << "=" << fs.get_val_text();
      }

    } else {
      code << "+" << codes[n];
    }
  }
  //cerr << "out of add_feats: code = " << code.str() << " label = '" << label << "'\n\n";
}



/* Mark a lexical transition for future deletion
 * -> its destination state is set to -1.
 *
 * Transitions whose label has the part of speech returned by
 * text.lingdef->lex_pos() (the <lex> transitions) are always kept.
 *
 * NOTE(review): `trans` is held by value. If sentence_fsa::get_trans()
 * returns a reference to the stored transition (and not a handle on it),
 * the assignment below only modifies a local copy -- confirm.
 */

void mark_for_deletion(sentence_fsa & text, const syntref & ref) {

  sentence_fsa::transition trans = text.get_trans(ref.qno, ref.transno);

  const bool deletable = (trans.label().pos != text.lingdef->lex_pos());
  if (! deletable) { return; } // always keep <lex> transitions

  trans.to() = -1;
}


//const string NOTALEAF = "/not-a-leaf/";

/* Pattern used to find the tags inside a graph output. Its single capture
 * group matches either
 *  - an opening tag : '[' followed by one or more [[:L*:]] characters, or
 *  - a closing tag  : one or more [[:L*:]] characters followed by ']'.
 * [[:L*:]] is presumably the Boost.Regex/ICU class for the Unicode letter
 * categories -- see the Boost.Regex Unicode documentation.
 * The object is built once, when this translation unit is initialized.
 */
u32regex output_token_regex(make_u32regex("(\\[[[:L*:]]+"
                                          "|[[:L*:]]+\\])"));


void process_output(const string & out, const string & grfname,
                    string & segment, string & text,
                    vector<string> & stack, bool & LEAF, xmlwriter & writer) {

  u32regex_token_iterator<string::const_iterator>
    tok(make_u32regex_token_iterator(out, output_token_regex, 1)), end;


  while (tok != end) {

    const string & txt = tok->str();


    if (txt[0] == '[') { // opening tag

      string name = txt.substr(1);

      stack.push_back(name);
      writer.start_element(name);

      LEAF = true;
      text.clear();

      if (stack.size() == 1) { segment.clear(); }

    } else { // closing tag 

      if (stack.empty()) {
        throw runtime_error("in graph '" + grfname + "' : tag ']' mismatches");
      }


      if (stack.size() == 1) {
        writer.write_element("segment", segment);
      } else if (LEAF) {
        writer.write_string(text); 
      }

      writer.end_element();
      stack.pop_back();
      LEAF = false;
    }

    ++tok;
  }

}

void process_match(const wrtn_chart & chart, const wrtn_match & match,
                   string & segment, string & text, vector<string> & stack,
                   bool & LEAF, xmlwriter & writer) {


  const sentence_fsa & fsa = chart.fsa;

  int size = match.path.size();

  int i = 0; // we need it outside loop
  for (i = 0; i < size; ++i) {

    process_output(match.out[i], match.name, segment, text, stack, LEAF, writer);

    /* travel */
    const syntref & ref = match.path[i];

    if (ref.transno < 0) { // sub-graph

      process_match(chart, chart[ref.qno][-ref.transno - 1],
                               segment, text, stack, LEAF, writer);

    } else { // lexical trans

      if (! stack.empty()) {
        segment += fsa.get_trans(ref.qno, ref.transno).in().form + " ";
        if (LEAF) {
          text += fsa.get_trans(ref.qno, ref.transno).in().form + " ";
        }
      }
    }
  }


  /* end of match : look for last output
   */

  process_output(match.out[i], match.name, segment, text, stack, LEAF, writer);
}


/* Entry point : emit the XML rendering of one top-level match.
 *
 * Sets up a fresh state (empty segment and text, empty tag stack) and
 * delegates to the recursive process_match() overload.
 *
 * Throws runtime_error if some opened tags are still on the stack once
 * the whole match has been processed; exceptions raised while processing
 * the match are propagated unchanged.
 */
void process_match(wrtn_chart & chart, const wrtn_match & match, xmlwriter & writer) {

  string segment, text;
  vector<string> stack;

  // explicitly initialized : the flag is handed by reference to the
  // callees, so it must never carry an indeterminate value
  bool LEAF = false;

  process_match(chart, match, segment, text, stack, LEAF, writer);

  if (! stack.empty()) { throw runtime_error("unbalanced '[' and ']' tags"); }
}



}; // namespace ""



/* Extract the matches of the graph `axiom` recorded in `chart` and write
 * them through `writer`.
 *
 * For each position q of the chart, the entry returned by
 * chart[q].lower_bound() for a probe match named `axiom` is examined
 * (the probe is built with the largest int and double values; presumably
 * this makes lower_bound return the best ranked match for that name --
 * confirm against the ordering of wrtn_chart). The entry is processed
 * only if it is named `axiom` and if its `to` is strictly greater than
 * the `to` of every match processed so far, so that matches which do not
 * go farther than a previous one are ignored.
 *
 * Returns the number of matches processed. Exceptions thrown by
 * process_match() are propagated.
 */
int extract_infos(wrtn_chart & chart, const string & axiom, xmlwriter & writer) {

  // probe used as search key in each chart entry
  wrtn_match probe(axiom,
                   numeric_limits<int>::max(),
                   synt_path_type(),
                   vector<string>(),
                   numeric_limits<double>::max());

  const int nbpos = chart.size();

  int found    = 0;
  int farthest = 0; // largest `to` among the matches already processed

  for (int q = 0; q < nbpos; ++q) {

    wrtn_chart::match_by_val_iterator cand = chart[q].lower_bound(probe);
    wrtn_chart::match_by_val_iterator stop = chart[q].val_end();

    // only the first candidate is considered, and only if it is a match
    // of the axiom going farther than the previous ones
    if ((cand != stop)
        && (cand->name == axiom)
        && (farthest < cand->to)) {

      farthest = cand->to;
      process_match(chart, *cand, writer);
      ++found;
    }
  }

  return found;
}

