#!/bin/bash

# Abort on the first failing command and on any reference to an unset variable.
set -e -u

# Installation roots; everything below is resolved relative to these.
OLIVEXDIR=$HOME/olivex
MYUNITEX=$HOME/unitex/French

# Alphabet file and preprocessing graphs handed to the tools further down:
# SENTENCE is applied by Fst2Txt in -merge mode, REPLACE in -replace mode.
ALPHABET=$MYUNITEX/Alphabet.txt
SENTENCE=$MYUNITEX/Graphs/Preprocessing/Sentence/Sentence.fst2
REPLACE=$MYUNITEX/Graphs/Preprocessing/Replace/Replace.fst2

# Normalization grammar passed to Txt2Fst2; cleared by the -dontnorm flag.
NORMFST2=$MYUNITEX/Graphs/Normalization/Norm.fst2

# Dictionary list: every line of $OLIVEXDIR/dicos that does not contain a '#'.
# The value is later expanded unquoted so that each entry becomes its own
# argument to Dico.
# grep exits non-zero when the file is unreadable or when no line survives the
# filter; under `set -e` that used to end the script without any explanation,
# so report it explicitly (the exit status stays 1).
if ! DICOS=$(grep -v '#' "$OLIVEXDIR/dicos"); then
  echo "error: no dictionary could be read from $OLIVEXDIR/dicos" >&2
  exit 1
fi

# DELAFCORRESP is given to textfst2xml2 via -c.
# LINGDEF is only referenced by the disabled cleantextfsa step.
DELAFCORRESP=$OLIVEXDIR/delaf-corresp-fr
LINGDEF=$OLIVEXDIR/lingdef.xml

# Directory under which the converted text and its _snt working directory
# are created.
CORPUSDIR=/tmp/Corpus

# Command used to convert the input text before processing; the -iso flag
# replaces it with i2u16.
TOU16=u82u16



# Command-line handling.
# Accepted form: preprocess [-dontnorm] [-iso] <text>
# The two flags are only recognised in that order, each at most once.

# No argument at all: print the usage line and leave with status 0.
if [ $# = 0 ]; then
  echo "usage: preprocess [-dontnorm] [-iso] <text>"
  exit 0
fi

# The script runs under `set -u`, and each `shift` below may consume the last
# remaining argument. A bare "$1" would then abort with "unbound variable"
# instead of reaching the usage message, hence ${1:-} everywhere.

# -dontnorm: clear NORMFST2 so no normalization grammar is passed to Txt2Fst2.
if [ "${1:-}" = "-dontnorm" ]; then
  NORMFST2=""
  shift
fi

# -iso: convert the input with i2u16 instead of the default converter.
if [ "${1:-}" = "-iso" ]; then
  TOU16=i2u16
  shift
fi

# After the flags, an input text name is mandatory.
if [ -z "${1:-}" ]; then
  echo "usage: preprocess <utf8txt>"
  exit 1
fi


# Working paths, all derived from the input name with any trailing ".txt"
# removed:
#   TEXT   - converted copy of the input, under $CORPUSDIR
#   SNT    - same stem with a .snt suffix (the file the tools below operate on)
#   SNTDIR - same stem with a _snt suffix (directory holding text.fst2)
ORIG=$1
TEXT=$CORPUSDIR/${ORIG%.txt}.txt
SNT=${TEXT%.txt}.snt
SNTDIR=${TEXT%.txt}_snt

# Stop early on a missing/unreadable input. Previously only `cat` failed, in
# the middle of a pipeline where `set -e` does not see it, and processing went
# on with whatever the conversion produced from empty input.
if [ ! -r "$ORIG" ]; then
  echo "error: cannot read input text: $ORIG"
  exit 1
fi

# SNTDIR and TEXT share the same parent, so this also creates the directory
# that receives $TEXT.
mkdir -p "$SNTDIR"

echo "unicode conversion ..."
# Output of addbom followed by the converted input, written as one file.
{ addbom; "$TOU16" < "$ORIG"; } > "$TEXT"

if [ ! -e "$TEXT" ]; then echo "error no text: $TEXT"; exit 1; fi

echo "normalisation ..."
# NOTE(review): presumably this is the step that produces $SNT from $TEXT;
# confirm against the Normalize tool's documentation.
Normalize "$TEXT" > /dev/null

echo "sentences tokenization ..."
Fst2Txt "$SNT" "$SENTENCE" "$ALPHABET" -merge > /dev/null

echo "replace ..."
Fst2Txt "$SNT" "$REPLACE" "$ALPHABET" -replace > /dev/null

echo "tokenization ..."
Tokenize "$SNT" "$ALPHABET" > /dev/null

echo "Dico application ..."
# DICOS holds a whitespace-separated list of dictionaries; it is left unquoted
# on purpose so that each one is passed as a separate argument.
# shellcheck disable=SC2086
Dico "$SNT" "$ALPHABET" $DICOS > /dev/null

echo "Text fst2 ..."
# NORMFST2 is empty after -dontnorm; ${var:+...} then expands to nothing at
# all rather than to an empty argument.
Txt2Fst2 "$SNT" "$ALPHABET" -clean ${NORMFST2:+"$NORMFST2"} > /dev/null

# FST2  - text automaton expected in $SNTDIR after the step above
# UFST2 - the same automaton after rembom | u162u8
# FSA   - only referenced by the disabled cleanup step at the end
# RES   - final result, written next to the original input
FST2=${SNTDIR}/text.fst2
UFST2=${SNTDIR}/utext.fst2
FSA=${TEXT%.txt}.fsa.gz
RES=${ORIG%.txt}.fsa2.gz

if [ ! -e "$FST2" ]; then
  echo "error: fst2 $FST2 not found"
  exit 1
fi

echo "unicode deconversion ..."
rembom < "$FST2" | u162u8 > "$UFST2"

if [ ! -e "$UFST2" ]; then
  echo "error cannot find $UFST2"
  exit 1
fi

echo "XML translation ..."
# Diagnostics of the translation go to ./fst2xmlerrors in the current directory.
textfst2xml2 -gz -c "$DELAFCORRESP" -o "$RES" "$UFST2" 2> fst2xmlerrors


# The cleanup pass is currently disabled; only the notice is printed.
echo "NOT text fsa cleanup ..."

#cleantextfsa -addpos -gz -l $LINGDEF -o $RES $FSA 2> cleanfsa-errors
echo "done. result in $RES."

