#include <cassert>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <string>

#include <outilex/sax.h>

using namespace std;


/**
 * SAX handler that dumps the raw text of the words of an XML corpus.
 *
 * Character data received while inside a <w> element is copied to the output
 * stream, and a single space is written when that <w> element closes.
 * Each closing </SENT> writes an empty line.  A <w> element whose "compound"
 * attribute equals "yes" does not start a word and is not counted.
 *
 * The counters and the stream are public so that the caller can read the
 * totals once parsing is finished.
 *
 * NOTE(review): start_element / end_element / characters presumably override
 * virtual callbacks declared by xml::sax_handler -- confirm in outilex/sax.h.
 */
class my_handler : public xml::sax_handler {

public:

  /**
   * Builds a handler writing to @p out.
   * Only a reference is stored: the stream must outlive the handler.
   */
  explicit my_handler(ostream & out)
    : os(out), nb_sent(0), nb_word(0), in_sentence(false), in_word(false) { }

  /**
   * Called on an opening tag.
   *
   * @param name   element name
   * @param attrs  attributes of the element (name -> value)
   */
  void start_element(const string & name, const map<string, string> & attrs) {

    if (name == "SENT") {

      // sentences are not expected to nest
      assert(! in_sentence);
      in_sentence = true;
      ++nb_sent;

    } else if (name == "w") {

      // words are only expected inside a sentence
      assert(in_sentence);

      // an element flagged compound="yes" is skipped: it neither opens a
      // word nor contributes to the word count
      map<string, string>::const_iterator it = attrs.find("compound");
      if (it != attrs.end() && it->second == "yes") { return; }

      in_word = true;
      ++nb_word;
    }
  }

  /**
   * Called on a closing tag.
   *
   * @param name  element name
   */
  void end_element(const string & name) {

    if (name == "SENT") {

      // sentences are separated by an empty line
      os << "\n\n";
      in_sentence = false;

    } else if (name == "w") {

      // emit the separator only when a word was actually opened
      // (i.e. not when closing a skipped compound element)
      if (in_word) { os << ' '; }
      in_word = false;
    }
  }

  /**
   * Called with character data; copied verbatim when inside a word,
   * dropped otherwise.
   *
   * @param w  the character data
   */
  void characters(const string & w) {
    if (in_word) { os << w; }
  }

public:
  ostream & os;        // destination of the extracted text
  int nb_sent;         // number of <SENT> elements opened so far
  int nb_word;         // number of non-compound <w> elements opened so far
  bool in_sentence;    // true between <SENT> and </SENT>
  bool in_word;        // true between a non-compound <w> and the next </w>
};

int main(int argc, char ** argv) {
  
  if (argc < 2) {
    cerr << "bad args\n";
    exit(1);
  }

  ofstream out("corpus-raw.txt");
  my_handler parser(out);

  argv++, argc--;
  while (argc) {
    cout << "processing " << *argv << " ...\n";
    out << "[--- " << *argv << " ---]\n\n";
    parser.parse(*argv);
    out << "\n\n";
    argv++, argc--;
  }

  cout << "done.\ntotal of " << parser.nb_word << " words. " << parser.nb_sent << " sentences.\n";
}

