#!/bin/bash
#
# Configuration preamble: resolves the olivex directory, the Unitex French
# resources and the auxiliary data files used by the rest of this script.
#
# Optional environment variables read here:
#   ODICOS        file listing the dictionaries to apply (see DICOFILE)
#   DELAFCORRESP  replaces the default $OLIVEXDIR/delaf-corresp-fr
#   LINGDEF       replaces the default $OLIVEXDIR/lingdef.xml

# Abort on any unhandled command failure and on use of an unset variable.
set -e -u


OLIVEXDIR=$HOME/olivex

# When ~/.olivex exists, its content replaces the default olivex directory.
if [ -f ~/.olivex ]; then
  OLIVEXDIR=$(cat ~/.olivex)
fi

# The ODICOS environment variable may name an alternative file that
# contains the dictionaries to apply to the text.

DICOFILE=${ODICOS:-$OLIVEXDIR/dicos.unitex}


MYUNITEX=$HOME/unitex/French

ALPHABET=$MYUNITEX/Alphabet.txt
SENTENCE=$MYUNITEX/Graphs/Preprocessing/Sentence/Sentence.fst2
REPLACE=$MYUNITEX/Graphs/Preprocessing/Replace/Replace.fst2

NORMFST2=$MYUNITEX/Graphs/Normalization/Norm.fst2

# Keep every line of the dictionary file that contains no '#'.
# grep exits non-zero when it selects no line or cannot read the file; under
# 'set -e' that used to end the script without any message, so report it.
if ! DICOS=$(grep -v '#' "$DICOFILE"); then
  echo "error: no dictionary could be read from $DICOFILE" >&2
  exit 1
fi

# The default must expand $OLIVEXDIR (a missing '$' used to yield the literal
# relative path "OLIVEXDIR/delaf-corresp-fr").
DELAFCORRESP=${DELAFCORRESP:-$OLIVEXDIR/delaf-corresp-fr}
LINGDEF=${LINGDEF:-$OLIVEXDIR/lingdef.xml}

# Directory under which the working copies of the text are created.
CORPUSDIR=/tmp/Corpus


# Script name without its directory part, used in the usage message.
PROG=${0##*/}

# Print the command-line synopsis on stdout, then terminate the script with
# exit status 0. Reads the global PROG for the program name.
usage() {
  local options='[-iso][-gz][-dontnorm][-bin|-xml]'
  printf '%s\n' "usage: $PROG $options <text>"
  exit 0
}

# Default parameters; the options parsed below may override them.
ORIG=""        # input text file; the last non-option argument wins
TOU16=u82u16   # conversion command the input text is piped through
FORMAT="bin"   # requested output format: "bin" or "xml"
COMPRESS=""    # set to "-gz" by the -gz option, empty otherwise

# Consume every argument: recognised options update the parameters above,
# anything else is taken as the input file name.
while [ $# -gt 0 ]; do
  case "$1" in
    -help)     usage ;;
    -iso)      TOU16=i2u16 ;;
    -dontnorm) NORMFST2="" ;;   # empty value: no normalization graph is passed on
    -xml)      FORMAT="xml" ;;
    -bin)      FORMAT="bin" ;;
    -gz)       COMPRESS="-gz" ;;
    *)         ORIG="$1" ;;
  esac
  shift
done

# A missing input file is an error. usage() always exits with status 0, so it
# is run in a subshell: the synopsis goes to stderr and the script itself
# then fails with status 1 (previously this path reported success).
if [ -z "$ORIG" ]; then
  (usage) >&2
  exit 1
fi

if [ ! -f "$ORIG" ]; then
  echo "error: file $ORIG not found" >&2
  exit 1
fi


# Working files live under $CORPUSDIR and are named after the input path
# with its .txt suffix removed: the converted text, the .snt file and the
# _snt directory.
TEXT=$CORPUSDIR/${ORIG%.txt}.txt
SNT=${TEXT%.txt}.snt
SNTDIR=${TEXT%.txt}_snt


# Path expansions are quoted throughout so that file names containing
# whitespace or glob characters reach each tool as a single argument.
mkdir -p "$SNTDIR"

echo "unicode conversion ..."
# Output of addbom followed by the converted input text, written to $TEXT.
(addbom; "$TOU16" < "$ORIG") > "$TEXT"

if [ ! -e "$TEXT" ]; then echo "error no text: $TEXT" >&2; exit 1; fi

echo "normalisation ..."
# NOTE(review): $SNT is never created explicitly in this script; presumably
# Normalize writes it next to $TEXT — confirm against the Unitex tools.
Normalize "$TEXT" > /dev/null

echo "sentences tokenization ..."
Fst2Txt "$SNT" "$SENTENCE" "$ALPHABET" -merge > /dev/null

echo "replace ..."
Fst2Txt "$SNT" "$REPLACE" "$ALPHABET" -replace > /dev/null

echo "tokenization ..."
Tokenize "$SNT" "$ALPHABET" > /dev/null

echo "Dico application ..."
# DICOS holds a whitespace-separated list of dictionaries and is left
# unquoted on purpose so that each one becomes its own argument.
# shellcheck disable=SC2086
Dico "$SNT" "$ALPHABET" $DICOS > /dev/null

echo "Text fst2 ..."
# NORMFST2 is empty after -dontnorm; the :+ form then passes no argument at
# all instead of an empty string.
Txt2Fst2 "$SNT" "$ALPHABET" -clean ${NORMFST2:+"$NORMFST2"} > /dev/null

# Files of this stage: the text automaton expected in the _snt directory,
# its re-encoded copy, and the compressed automaton derived from it.
FST2=${SNTDIR}/text.fst2
UFST2=${SNTDIR}/utext.fst2
FSA=${TEXT%.txt}.fsa.gz

if [ ! -e "$FST2" ]; then
  echo "error: fst2 $FST2 not found" >&2
  exit 1
fi

echo "unicode deconversion ..."
# Feed the automaton through rembom and u162u8; paths are quoted so names
# containing whitespace stay intact.
rembom < "$FST2" | u162u8 > "$UFST2"

if [ ! -e "$UFST2" ]; then
  echo "error cannot find $UFST2" >&2
  exit 1
fi

echo "XML translation ..."
# Diagnostics of textfst2xml are collected in ./fst2xmlerrors.
textfst2xml -gz -c "$DELAFCORRESP" -o "$FSA" "$UFST2" 2> fst2xmlerrors

# Choose the extension of the result file from the requested output format.
case "$FORMAT" in
  xml) # NOTE(review): ".gz" for the xml format looks like it was meant to
       # be ".xml" — confirm with the consumers of the result file before
       # changing it; the existing value is kept here.
       EXT=.gz
       # Plain assignment: the former "$EXT=$EXT.gz" expanded to a command
       # name and aborted the script; the test is quoted because COMPRESS
       # is empty unless -gz was given.
       if [ "$COMPRESS" = "-gz" ]; then EXT=$EXT.gz; fi
       ;;
  bin) EXT=.bin
       ;;
  *)   echo "bad format : $FORMAT" >&2
       exit 1
       ;;
esac
# The result is written next to the input file, .txt suffix replaced by $EXT.
RES=${ORIG%.txt}$EXT

echo "text fsa cleanup ..."
# COMPRESS is passed only when non-empty; diagnostics of cleantextfsa are
# collected in ./cleanfsa-errors.
cleantextfsa -addpos ${COMPRESS:+"$COMPRESS"} -l "$LINGDEF" -o "$RES" "$FSA" 2> cleanfsa-errors

echo "done. result in $RES."

