#include <string>
#include <vector>

#include <boost/regex.hpp>
#include <boost/regex/icu.hpp>

#include <outilex/tokenization.h>

using namespace std;
using namespace boost;

// Unicode-aware pattern describing one token. The outermost parentheses form
// capture group 1, which wraps all of the alternatives listed below.
u32regex tokenizer_regex = make_u32regex(
    "("
      "([[:L*:]]+|[[:M*:]])"  // word: a run of letters, or one single mark
      "|[[:Nd:]]+"            // number: a run of decimal digits
      "|[[:P*:]]"             // a single punctuation character
      "|[[:S*:]]"             // a single symbol character
      "|\\s+"                 // a run of whitespace
    ")");
// "|\\s+)")); // space

void tokenize(const string & text, vector<string> & res) {

  u32regex_token_iterator<string::const_iterator>
    tok(make_u32regex_token_iterator(text, tokenizer_regex, 1)), end;

  while (tok != end) {
    res.push_back(tok->str());
    ++tok;
  }
}
