diff --git a/README b/README index d937dfff16be1caa6f47afc2a05fcbbffb079606..26ff19ae52c26c211b85466011d84e4fcdb77fd7 100644 --- a/README +++ b/README @@ -110,3 +110,8 @@ VINF infinitive verb form VPP past participle VPR present participle VS subjunctive verb form + +When using normalization options, other tags may appear: +- when using -n, Y means "non-last token of a multi-token unit", X means "multiword/multitag token" +- when using -N, Y means "non-last token of a multi-token unit", multiword/multitag tokens are annotated with tags of the form T1+T2+...+Tn (ex.: chépa/CLS+V+ADV) + diff --git a/bin/MElt-eval.in b/bin/MElt-eval.in index 3efe0b769c80db0e690ce27686d03c2837212a1e..c8f9d7ab1494a0f4051d54ed2b8fc66b05632366 100644 --- a/bin/MElt-eval.in +++ b/bin/MElt-eval.in @@ -5,32 +5,54 @@ PYTHONPATH=@pkgpythondir@:$PYTHONPATH BINDIR=@bindir@ MODEL=unspecified_model LOWERCAPOPTION= +NORMALIZEOPTION= +LANGUAGE= NE=0 +REMOVE_SXPIPE_COMMENTS_OR_CAT="cat" -while getopts :m:Sdhv o +while getopts :m:l:nNSdhv o do case "$o" in v) echo $VERSION && exit 1;; h) echo "Usage: MElt-eval [ -dS ] -m trained-model-name eval-corpus" \ && echo " -m [/path/to/model] Use the MElt model given instead of the default one (which is the model 'fr' for French, trained on the FTB and the Lefff, installed in @datadir@/fr)" \ - && echo "-S process named entities before tagging, using SxPipe grammars distributed with MElt (the model must have been trained with the -S option)" \ - && echo "-h Output this help message" \ - && echo "-v Output MElt version" \ + && echo " -S process named entities before tagging, using SxPipe grammars distributed with MElt (the model must have been trained with the -S option)" \ + && echo " -n Use -n option for temporary normalization (requires -l)" \ + && echo " -N Use -N option for temporary normalization (requires -l)" \ + && echo " -l Specify language for de-noisification options" \ + && echo " -h Output this help message" \ + && echo " -v Output MElt version" \ && exit 1;; - m) MODEL=$OPTARG - ;; + m) MODEL=$OPTARG;; + l) LANGUAGE=$OPTARG;; S) NE=1;; + n) NORMALIZEOPTION="-n -l ${LANGUAGE} -c" + REMOVE_SXPIPE_COMMENTS_OR_CAT="perl -pe \"s/\{.*?\}\\s*//g;\"" + ;; + N) NORMALIZEOPTION="-N -l ${LANGUAGE} -c" + REMOVE_SXPIPE_COMMENTS_OR_CAT="perl -pe \"s/\{.*?\}\\s*//g;\"" + ;; d) LOWERCAPOPTION="-d";; \?) echo "Invalid option: -$OPTARG" >&2 exit 1 ;; - :) # ici $OPTARG==d + :) echo "Error: value needed after '-r' or '-C'" >&2 exit 4 ;; esac done +if [ "x$NORMALIZEOPTION" != "x" ] +then + if [ "x$LANGUAGE" == "x" ] + then + echo "Error: the language must be specified using option -l if the normalizer (-n or -N) is used" >&2 + exit 4 + fi +fi + + shift $(($OPTIND - 1)) # so that $1 is indeed the first argument AFTER those already parsed by getopts # usage: MElt-train trained-model-name eval-corpus [option for MElt] @@ -39,13 +61,13 @@ MODEL_BASE=$(basename "${MODEL}") if [ $NE -eq 1 ] then echo "Evaluating model $MODEL_BASE on corpus $1 (named-entity-aware setup)" >&2 - echo "cat \"$1\" | perl @bindir@/brown2txt.pl -S | tee \"$1.txt\" | MElt -S ${LOWERCAPOPTION} -m \"$MODEL\" > \"$1.txt.melted.$MODEL_BASE\"" >&2 - cat "$1" | perl -pe "s/_A_ANONYMISER//g" | perl @bindir@/brown2txt.pl | tee "$1.txt" | MElt -S ${LOWERCAPOPTION} -m "$MODEL" > "$1.txt.melted.$MODEL_BASE" + echo "cat \"$1\" | perl @bindir@/brown2txt.pl -S | tee \"$1.txt\" | MElt -S ${LOWERCAPOPTION} ${NORMALIZEOPTION} -m \"$MODEL\" | ${REMOVE_SXPIPE_COMMENTS_OR_CAT} > \"$1.txt.melted.$MODEL_BASE\"" >&2 + cat "$1" | perl -pe "s/_A_ANONYMISER//g" | perl @bindir@/brown2txt.pl | tee "$1.txt" | MElt -S ${NORMALIZEOPTION} ${LOWERCAPOPTION} -m "$MODEL" | ${REMOVE_SXPIPE_COMMENTS_OR_CAT} > "$1.txt.melted.$MODEL_BASE" PYTHONPATH=$PYTHONPATH python @bindir@/eval_brown.py -p "$1.txt.melted.$MODEL_BASE" -g "$1" -d "$MODEL/tag_dict.json" else echo "Evaluating model $MODEL_BASE on corpus $1" >&2 - echo "cat \"$1\" | perl @bindir@/brown2txt.pl | tee \"$1.txt\" | MElt -r ${LOWERCAPOPTION} -m \"$MODEL\" > \"$1.txt.melted.$MODEL_BASE\"" >&2 - cat "$1" | perl -pe "s/_A_ANONYMISER//g" | perl @bindir@/brown2txt.pl | tee "$1.txt" | MElt -r ${LOWERCAPOPTION} -m "$MODEL" > "$1.txt.melted.$MODEL_BASE" + echo "cat \"$1\" | perl @bindir@/brown2txt.pl | tee \"$1.txt\" | MElt ${NORMALIZEOPTION} -r ${LOWERCAPOPTION} -m \"$MODEL\" | ${REMOVE_SXPIPE_COMMENTS_OR_CAT} > \"$1.txt.melted.$MODEL_BASE\"" >&2 + cat "$1" | perl -pe "s/_A_ANONYMISER//g" | perl @bindir@/brown2txt.pl | tee "$1.txt" | MElt ${NORMALIZEOPTION} -r ${LOWERCAPOPTION} -m "$MODEL" | ${REMOVE_SXPIPE_COMMENTS_OR_CAT} > "$1.txt.melted.$MODEL_BASE" PYTHONPATH=$PYTHONPATH python @bindir@/eval_brown.py -p "$1.txt.melted.$MODEL_BASE" -g "$1" -d "$MODEL/tag_dict.json" fi diff --git a/bin/MElt-train.in b/bin/MElt-train.in index ae7c7a84ef1234062491611cc38cdce250ee3b98..24d640a121d87b2be067ca77c301ca3082719792 100644 --- a/bin/MElt-train.in +++ b/bin/MElt-train.in @@ -21,12 +21,14 @@ FEATURE_SELECTION_PARAMETERS= while getopts :f:r:C:m:sSDMvh o do case "$o" in v) echo $VERSION && exit 1;; - h) echo "Usage: MElt-train [ -rmMCS ] trained-model-name lexicon training-corpus test-corpus" \ + h) echo "Usage: MElt-train [ -rmsvhfMCS ] trained-model-name lexicon training-corpus test-corpus" \ && echo "-r [n_reruns] Number of successive runs for MegaM (used only in the multiclass MaxEnt setting)" \ && echo "-C [multiclass|multitron] Classifier type: multiclass MaxEnt (often a bit better, much slower to train) or multitron (multiclass perceptron, faster to train, default)" \ && echo "-m [n_iter] Maximum number of iterations for each run in MegaM (default is 100 for multiclass and 40 for multitron)" \ - && echo "-S train a named-entity-aware tagger, using SxPipe grammars distributed with MElt" \ - && echo "-D dump megam model" \ + && echo "-f [parameters] Use specified values as parameters (default is win:2,pwin:2,lex_wd:1,lex_lhs:1,lex_rhs:1,pln:4,sln:4,rpln:1,rsln:0,ffthrsld:2,norm:0 - see bin/MElt_tagger.py.in for details)" \ + && echo "-S Train a named-entity-aware tagger, using SxPipe grammars distributed with MElt" \ + && echo "-s Do not train the lemmatizer" \ + && echo "-D Dump megam model" \ && echo "-h Output this help message" \ && echo "-v Output MElt version" \ && exit 1;; diff --git a/bin/MElt_XML_preprocessor.pl b/bin/MElt_XML_preprocessor.pl index 621acaea53277775e865b81421717bb6279e9fed..02bc3a8573f91a39910932aa6e2c5ba97d651062 100644 --- a/bin/MElt_XML_preprocessor.pl +++ b/bin/MElt_XML_preprocessor.pl @@ -37,3 +37,9 @@ while (<>) { print $_; } + +__END__ + +<a> + ceci est un <b>superbe</b> texte +</a> diff --git a/bin/MElt_explore_featureset_space.pl b/bin/MElt_explore_featureset_space.pl index 9f06f886570715499f71a90aa217bb3daf95662d..22740911a66aeac6f6022335a59991f1f1da54cc 100644 --- a/bin/MElt_explore_featureset_space.pl +++ b/bin/MElt_explore_featureset_space.pl @@ -30,15 +30,15 @@ if (0) { # default $features{ffthrsld} = "1,2,3"; $features{norm} = "0"; } else { # expés réduites mais multiclass PERCEO - $features{lex_lhs} = "0"; + $features{lex_lhs} = "1"; $features{lex_rhs} = "1"; $features{lex_wd} = "1"; - $features{pln} = "3,4"; + $features{pln} = "3,4,5"; + $features{sln} = "4,5,6,7"; $features{pwin} = "2"; $features{win} = "2"; - $features{sln} = "4,5,6"; - $features{rpln} = "0,1"; - $features{rsln} = "0,1,2"; + $features{rpln} = "0,1,2,3"; + $features{rsln} = "0,1,2,3"; $features{ffthrsld} = "1,2"; $features{norm} = "0"; } @@ -51,6 +51,7 @@ $lang = shift || "fr"; $modeltype = shift || "multitron"; $maxiter = shift || 0; $is_NE_aware = shift || 0; +$dryrun = shift || 0; $maxiter = 40 if $maxiter == 0; @@ -92,6 +93,8 @@ for (@options) { } print STDERR "$n\n"; +exit (0) if $dryrun; + for (@options) { unless (-d "$lang-$modeltype$modeltypesuffix-$NE_option-$_") { system("MElt-train -s -C $modeltype -m $maxiter $NE_option -f $_ $lang-$modeltype$modeltypesuffix-$NE_option-$_ $lex $train $test && rm $lang-$modeltype$modeltypesuffix-$NE_option-$_.trainingdata && rm -f $lang-$modeltype$modeltypesuffix-$NE_option-$_/*.json && rm -f $lang-$modeltype$modeltypesuffix-$NE_option-$_/*.npy && rm -f $lang-$modeltype$modeltypesuffix-$NE_option-$_/*.ftl") == 0 or die ""; diff --git a/bin/MElt_tagger.py.in b/bin/MElt_tagger.py.in index 3d082d64e9c4a4aab4f5feca7bcea43a20e0ea9e..9e792a813d809faa37b0bf43df8e80d3b44baaf9 100644 --- a/bin/MElt_tagger.py.in +++ b/bin/MElt_tagger.py.in @@ -490,6 +490,8 @@ class BrownReader(CorpusReader): wasCapOnly = CAPONLYLINE_RE.match(line) token_list = [] for item in line.split(' '): + if item == '': + continue wdtag = WD_TAG_RE.match(item) if (wdtag): wd,tag = wdtag.groups() @@ -498,7 +500,7 @@ class BrownReader(CorpusReader): else: token_list.append( (wd,tag) ) else: - print >> sys.stderr, "Warning: Incorrect token/tag pair: \""+item+"\""+" --- line: "+line + print >> sys.stderr, "Warning: Incorrect token/tag pair: \""+(item.encode('utf8'))+"\""+" --- line: "+line return token_list @@ -1111,7 +1113,7 @@ def debug_n_best_sequence(n_best_sequences): def tag_dict(file_path): tag_dict = defaultdict(dict) - for s in BrownReader(file_path,encoding="utf-8"): + for s in BrownReader(file_path): for wd,tag in s: tag_dict[wd][tag] = 1 return tag_dict @@ -1120,7 +1122,7 @@ def tag_dict(file_path): def word_list(file_path,t=5): word_ct = {} - for s in BrownReader(file_path,encoding="utf-8"): + for s in BrownReader(file_path): for wd,tag in s: word_ct[wd] = word_ct.get(wd,0) + 1 filtered_wd_list = {} diff --git a/bin/build_tag_dict.py b/bin/build_tag_dict.py index fd5811fb0265e9d31d87c8ded17ec2063eee2378..bd787bf3573f1c68a961da0464dbf1fadaab4802 100644 --- a/bin/build_tag_dict.py +++ b/bin/build_tag_dict.py @@ -52,6 +52,8 @@ class BrownReader(): wasCapOnly = CAPONLYLINE_RE.match(line) token_list = [] for item in line.split(' '): + if item == '': + continue wdtag = WD_TAG_RE.match(item) if (wdtag): wd,tag = wdtag.groups() diff --git a/bin/eval_brown.py b/bin/eval_brown.py index e89b8880278dda7506a639c059cc70a827a7131f..2fa49a89228c828b1e18789deaeff0ea00b9fa17 100644 --- a/bin/eval_brown.py +++ b/bin/eval_brown.py @@ -144,9 +144,9 @@ class AccuracySink(ResultSink): print "Confusion matrix" print print '\tas' - print 'is\t',"\t".join(self.tags.keys()) + print 'is\t',"\t".join(self.tags.keys()).encode('utf8') for tag1 in self.tags.keys(): - print tag1,'\t', + print tag1.encode('utf8'),'\t', for tag2 in self.tags.keys(): print self.matrix.get((tag1,tag2),0),'\t', print @@ -177,7 +177,7 @@ class AccuracySink(ResultSink): f = 0.0 if r+p: f = (2*r*p)/(r+p) - print "%10s | %10s %10s %10s | %10s %10s %10s" %(tag, round(r,3), round(p,3), round(f,3), corr, pred, total) + print "%10s | %10s %10s %10s | %10s %10s %10s" %(tag.encode('utf8'), round(r,3), round(p,3), round(f,3), corr, pred, total) print "-"*80 return @@ -197,7 +197,7 @@ def compare_files( gold_file, pred_file, sink, known_words={}, quiet=True, encod pwd,ptag = pred_s[i] except IndexError: ptag = "UNK" - print >> sys.stderr, "Warning: Missing prediction for Sentence #%s, token %s" %(s_ct,i) + print >> sys.stderr, "Warning: Missing prediction for Sentence #%s, token %s" %(s_ct,i.encode('utf8')) # update sinks sink.update(gwd,gtag,ptag,known_words) # errors diff --git a/configure.ac b/configure.ac index 565ed571c46a02f17e35db58d31642e914f7d711..00b082bb6609a8cdfe0c6b047997605de88064e2 100644 --- a/configure.ac +++ b/configure.ac @@ -12,11 +12,6 @@ if test "x$PERL" = x; then AC_MSG_ERROR(perl not found) fi -# look for dicos -# AC_PATH_ALEXINA -# AC_PATH_LEFFF -# AM_CONDITIONAL(HAVE_LEFFF, test -n "$lefffdir") - AC_CONFIG_FILES( data/Makefile bin/Makefile pkgpythonlib/Makefile diff --git a/data/Makefile.am b/data/Makefile.am index 8202e74bfc0c807133943da8649ad26351de2a2c..b92b487306c4412b9e622295ac12efab3668c219 100644 --- a/data/Makefile.am +++ b/data/Makefile.am @@ -189,8 +189,7 @@ lefff.ftb4tags: update_lefff_data %.reg: % cat $* | perl ../bin/lexicon2features.pl > $*.data cut -f1,2 $*.data > $*.data.1,2 - megam.opt -nc -maxi 20 multiclass $*.data.1,2 > $*.model -# megam.opt -nc -maxi 20 multitron $*.data.1,2 > $*.model + megam.opt -nc -maxi 100 -repeat 5 multiclass $*.data.1,2 > $*.model cat $* | perl -pe "s/ /_/g" > $*.us cat $*.data | perl ../bin/megam_predict.pl $*.model | perl ../bin/pred2reg.pl | paste $* - | cut -f1,2,3,6 > $@ diff --git a/data/lex2ftb4cats.pl b/data/lex2ftb4cats.pl index fc89dc0577f4967f3762222435c455a4e736462c..f03d7b5938bfe0928717129bba9d366796d1f60d 100755 --- a/data/lex2ftb4cats.pl +++ b/data/lex2ftb4cats.pl @@ -7,6 +7,63 @@ use utf8; # cat /usr/local/share/enlex/?.lex /usr/local/share/enlex/ponct.lex /usr/local/share/enlex/{lostandfound,addons}.lex | recode l1..u8 | grep -v "@?" | perl data/lex2ftb4cats.pl -l en -ms -o data/enlex_ms +# cat /usr/local/share/enlex/?.lex /usr/local/share/enlex/ponct.lex /usr/local/share/enlex/{lostandfound,addons}.lex | recode l1..u8 | grep -v "@?" | perl lex2ftb4cats.pl -l en -ms -o enlex.ms.meltlex +# cat /usr/local/share/enlex/?.lex /usr/local/share/enlex/ponct.lex /usr/local/share/enlex/{lostandfound,addons}.lex | recode l1..u8 | grep -v "@?" | perl lex2ftb4cats.pl -l en -nms -o enlex.meltlex + +# cat /usr/local/share/delex/*.lex | recode l1..u8 | perl lex2ftb4cats.pl -l de -ms -o delex.ms.meltlex +# cat /usr/local/share/delex/*.lex | recode l1..u8 | perl lex2ftb4cats.pl -l de -nms -o delex.meltlex + +# cat /usr/local/share/leffe/*.lex | recode l1..u8 | perl lex2ftb4cats.pl -l es -ms -o leffe.ms.meltlex +# cat /usr/local/share/leffe/*.lex | recode l1..u8 | perl lex2ftb4cats.pl -l es -nms -o leffe.meltlex + +# cat /usr/local/share/saldo/*.lex | recode l1..u8 | perl lex2ftb4cats.pl -l sv -ms -o saldo.ms.meltlex +# cat /usr/local/share/saldo/*.lex | recode l1..u8 | perl lex2ftb4cats.pl -l sv -nms -o saldo.meltlex + +# cat /usr/local/share/sloleks/*.lex | recode l2..u8 | perl lex2ftb4cats.pl -l si -ms -o sloleks.ms.meltlex +# cat /usr/local/share/sloleks/*.lex | recode l2..u8 | perl lex2ftb4cats.pl -l si -nms -o sloleks.meltlex +# cat /usr/local/share/sloleks/*.lex | recode l2..u8 | perl lex2ftb4cats.pl -l si -lms -o sloleks.ms2.meltlex + +# cat /usr/local/share/lefff/*.lex | recode l1..u8 | perl lex2ftb4cats.pl -l fr -ms -o lefff.ms.meltlex +# cat /usr/local/share/lefff/*.lex | recode l1..u8 | perl lex2ftb4cats.pl -l fr -nms -o lefff.meltlex +# cat /usr/local/share/lefff/*.lex | recode l1..u8 | perl lex2ftb4cats.pl -l fr -lms -o lefff.ms2.meltlex + +# cat /usr/local/share/multext-east-4-0-ro/*.lex | recode l2..u8 | perl lex2ftb4cats.pl -l ro -ms -o multext-east-4-0-ro.ms.meltlex +# cat /usr/local/share/multext-east-4-0-ro/*.lex | recode l2..u8 | perl lex2ftb4cats.pl -l ro -nms -o multext-east-4-0-ro.meltlex +# cat /usr/local/share/multext-east-4-0-ro/*.lex | recode l2..u8 | perl lex2ftb4cats.pl -l ro -lms -o multext-east-4-0-ro.ms2.meltlex + +# cat /usr/local/share/multext-east-4-0-et/*.lex | perl lex2ftb4cats.pl -l et -ms -o multext-east-4-0-et.ms.meltlex +# cat /usr/local/share/multext-east-4-0-et/*.lex | perl lex2ftb4cats.pl -l et -nms -o multext-east-4-0-et.meltlex +# cat /usr/local/share/multext-east-4-0-et/*.lex | perl lex2ftb4cats.pl -l et -lms -o multext-east-4-0-et.ms2.meltlex + +# cat /usr/local/share/multext-east-4-0-bg/*.lex | perl lex2ftb4cats.pl -l bg -ms -o multext-east-4-0-bg.ms.meltlex +# cat /usr/local/share/multext-east-4-0-bg/*.lex | perl lex2ftb4cats.pl -l bg -nms -o multext-east-4-0-bg.meltlex +# cat /usr/local/share/multext-east-4-0-bg/*.lex | perl lex2ftb4cats.pl -l bg -lms -o multext-east-4-0-bg.ms2.meltlex + +# cat /usr/local/share/multext-east-4-0-hu/*.lex | recode l2..u8 | perl lex2ftb4cats.pl -l hu -ms -o multext-east-4-0-hu.ms.meltlex +# cat /usr/local/share/multext-east-4-0-hu/*.lex | recode l2..u8 | perl lex2ftb4cats.pl -l hu -nms -o multext-east-4-0-hu.meltlex +# cat /usr/local/share/multext-east-4-0-hu/*.lex | recode l2..u8 | perl lex2ftb4cats.pl -l hu -lms -o multext-east-4-0-hu.ms2.meltlex + +# cat /usr/local/share/delex/*.lex | recode l1..u8 | perl lex2ftb4cats.pl -l de -ms -o delex.ms.meltlex +# cat /usr/local/share/delex/*.lex | recode l1..u8 | perl lex2ftb4cats.pl -l de -nms -o delex.meltlex + +# cat /usr/local/share/inmdb/*.lex | recode l1..u8 | perl lex2ftb4cats.pl -l ga -ms -o inmdb.ms.meltlex +# cat /usr/local/share/inmdb/*.lex | recode l1..u8 | perl lex2ftb4cats.pl -l ga -nms -o inmdb.meltlex + +# cat /usr/local/share/leffga/*.lex | recode l1..u8 | perl lex2ftb4cats.pl -l gl -ms -o leffga.ms.meltlex +# cat /usr/local/share/leffga/*.lex | recode l1..u8 | perl lex2ftb4cats.pl -l gl -nms -o leffga.meltlex + +# cat /usr/local/share/dela_gr/*.lex | perl lex2ftb4cats.pl -l el -ms -o dela_gr.ms.meltlex +# cat /usr/local/share/dela_gr/*.lex | perl lex2ftb4cats.pl -l el -nms -o dela_gr.meltlex + +# cat /usr/local/share/pollex/*.lex | recode l2..u8 | perl lex2ftb4cats.pl -l sk -ms -o pollex.ms.meltlex +# cat /usr/local/share/pollex/*.lex | recode l2..u8 | perl lex2ftb4cats.pl -l sk -nms -o pollex.meltlex + +# cat /usr/local/share/enlex/?.lex /usr/local/share/enlex/ponct.lex /usr/local/share/enlex/{lostandfound,addons}.lex | recode l1..u8 | grep -v "@?" | perl lex2ftb4cats.pl -l en -ms -o enlex.ms.meltlex +# cat /usr/local/share/enlex/?.lex /usr/local/share/enlex/ponct.lex /usr/local/share/enlex/{lostandfound,addons}.lex | recode l1..u8 | grep -v "@?" | perl lex2ftb4cats.pl -l en -nms -o enlex.meltlex + +# cat /usr/local/share/morph_it/*.lex | recode l1..u8 | perl lex2ftb4cats.pl -l it -ms -o morph_it.ms.meltlex +# cat /usr/local/share/morph_it/*.lex | recode l1..u8 | perl lex2ftb4cats.pl -l it -nms -o morph_it.meltlex + $output_file = "lefff.ftb4tags"; $stdout = 0; @@ -294,7 +351,7 @@ unless ($all) { } while (<TMP>) { chomp; - /^(.*?)\t(.*?)((?:__.*?)?)\t(.*)$/ || die; + /^(.*?)\t(.*?)((?:__.*?)?)\t(.*)$/ || die $_; $f = $1; $cat = $2; $synt = $3; diff --git a/data/normalization/en/ngrams b/data/normalization/en/ngrams index 89322bc0fb769535bd7103a4c9e75c88a87ad219..2d80d217d59cbbf8fe1c528cd0934e049f1c0c5c 100644 --- a/data/normalization/en/ngrams +++ b/data/normalization/en/ngrams @@ -25,8 +25,7 @@ 456 alot a lot 426 pls please 408 ng ng -382 gon gon -328 rly rly +328 rly really 326 mmbtu mmbtu 279 mws mws 263 approx. approximately @@ -61,7 +60,7 @@ 110 didnt did n't 110 gty gty 107 wh = en when -105 recieved recieved +105 recieved received 104 thurs thursday 103 tommorrow tomorrow 100 passcode code @@ -84,7 +83,7 @@ 82 ent ent 81 ([^ {}]+)i = ze $1ize 81 mid- mid- -79 recieve recieve +79 recieve receive 79 ([^ {}]+) = ry $1ry 78 conf. conf. 78 rebook rebook @@ -95,7 +94,7 @@ 75 co-operation cooperation 73 cashout cash-out 72 mistransmission transmission -70 plc plc +70 plc company 69 chnages changes 69 non-refundable refundable 67 username name @@ -105,10 +104,10 @@ 65 demarc demarc 64 re-send re-send 64 non-binding non-binding -63 tx tx -62 seperate seperate +63 tx thanks +62 seperate separate 62 post-id post-id -62 plz plz +62 plz please 61 undercollections collections 60 taht that 60 pre-petition petition @@ -214,3 +213,17 @@ 0 & # 039 ; ve 've 0 & # 039 ; d 'd 0 & # (\d+) ; &#$1; +0 w/o without +0 they ain't they are n't +0 there ain't there are n't +0 They ain't They are n't +0 There ain't There are n't +0 ain't I am n't I +0 I ain't I am n't +0 i ain't I am n't +0 ain't is n't +0 wanna want to +0 gonna going to +0 ain't is n't +0 Gonna Going to +0 Wanna Want to diff --git a/data/normalization/fr/ngrams b/data/normalization/fr/ngrams index 8ab2c4f2ea53f1cfd8506958664b6f04fcc9e24f..b055d10bf30793ed00f82670c137b23d3e56aa02 100644 --- a/data/normalization/fr/ngrams +++ b/data/normalization/fr/ngrams @@ -1105,13 +1105,13 @@ ek twa 19 7 aprem 19 Slt c salut c' est 19 gro bizou 19 -ala maizon 19 -a taleur 19 +ala maizon à  la maison 19 +a taleur à tout à  l' heure 19 Slt sava salut ça va 19 rien . 19 Ou lé 19 chérie ! 19 -bone soiré 19 +bone soiré bonne soirée 19 le match 19 pr twa 19 ton num 19 @@ -1711,7 +1711,7 @@ rép 14 Kosa 14 malad malade 14 ? alor 14 -me bégné 14 +me bégné baignais 14 ds ma 14 fai ? 14 lui a 14 @@ -1925,7 +1925,7 @@ la ou 13 mi té 13 bien sur 13 Bsr bonsoir 13 -je voulé 13 +je voulé je voulais 13 parti a 13 stp , 13 Cava Ça va 13 @@ -2285,10 +2285,9 @@ Koman Comment 12 koi toi ? quoi toi ? 12 ma mèr ma mère 12 pu d 12 -o moin 12 +o moin au moins 12 suis a la suis à la 12 pa entendu 12 -soiré . 12 di rien 12 fr koi 12 chance pr 12 @@ -2540,7 +2539,7 @@ d choz 11 soirè soirée 11 ce mat 11 g + 11 -toi sava 11 +toi sava toi ça va 11 Merci a 11 JE Je 11 Jespèr j' espère 11 @@ -2554,63 +2553,63 @@ je rentre 11 et mi 11 jtai je t' ai 11 Sui Suis 11 -ms bon 11 -on vera 11 +ms bon mais bon 11 +on vera on verra 11 a toi et à toi et 11 -1 truc 10 +1 truc un truc 10 ai pa 10 sa pa 10 -fér 10 +fér faire 10 je vous 10 -dsl g 10 +dsl g désolé j' ai 10 c tou c' est tout 10 dsl désolé 10 bonne journé 10 _ACC_F lé 10 ce jour 10 -ki te 10 -gt en 10 +ki te qui te 10 +gt en j' étais en 10 gros bisou 10 -jc ke 10 -i ve 10 -pensé a 10 -bne fete 10 +jc ke je sais que 10 +i ve il veut 10 +pensé a pensé à 10 +bne fete bonne fête 10 Tfé Tu fais 10 -c po 10 +c po c' est pas 10 ? _SENT_BOUND 10 -si ca 10 +si ca si ça 10 _ACC_F lol 10 me prendre 10 -é moi 10 +é moi et moi 10 se voit 10 prenom prénom 10 -trè for 10 +trè for très fort 10 pa avoir 10 -a ki 10 +a ki à qui 10 le devoir 10 zuma 10 -ki vien 10 -en tt 10 +ki vien qui vient 10 +en tt en tout 10 svoi 10 toi oci toi aussi 10 emmene emmène 10 anne année 10 kosa ou 10 biz ! 10 -mé el 10 +mé el mais elle 10 momt moment 10 x 10 moin . 10 2 ma 10 pour une 10 pa ton 10 -Bne soirée 10 +Bne soirée Bonne soirée 10 é d 10 ben g 10 -c bn 10 +c bn c' est bon 10 Papa 10 maman é 10 -etwa ? 10 +etwa et toi 10 ek sa 10 sa a 10 Hey ! 10 @@ -2621,28 +2620,27 @@ un moune 10 toi ta 10 . tu va . tu vas 10 t ek 10 -nimporte koi 10 +nimporte koi n' importe quoi 10 tro cool 10 -a coz 10 +a coz à cause 10 Snif 10 gro bizou gros bisous 10 -Coucou c 10 +Coucou c Coucou c' est 10 etwa et toi 10 peu . 10 J espère que J' espère que 10 -ché mwa 10 +ché mwa chez moi 10 aime ma 10 -ce sOir 10 -A oué 10 -Tu a 10 -kan c 10 +ce sOir ce soir 10 +A oué ah ouais 10 +Tu a Tu as 10 +kan c quand c' est 10 c chian c' est chiant 10 -jpense a 10 +jpense a je pense à 10 toute la 10 pa ni 10 kelkun quelqu'un 10 -WE 10 -javé oublié 10 +WE week-end 10 Ouè Oui 10 tro ! 10 el ! 10 @@ -3748,20 +3746,20 @@ news . 8 Kmt 8 bégné é 8 dit ke 8 -jmen 8 +jmen je m' en 8 ma chambre 8 ke je v que je vais 8 -enf1 8 +enf1 enfin 8 veu pa 8 c mol 8 -komence 8 +komence commence 8 peur ! 8 pa tro tar pas trop tard 8 ? bien 8 aussi je 8 bien la 8 c t c' était 8 -pa kome 8 +pa kome pas comme 8 kan il 8 recu . 8 soir é 8 @@ -3785,6 +3783,7 @@ Anz ! 8 fac ! 8 film ! 8 t ! 8 +presk presque preske presque 8 ptite petite 8 ma fait m' a fait 8