-
Bruno Guillaume authoredBruno Guillaume authored
conll_tool.ml 14.03 KiB
open Printf
open Conll
open Conllx
let rec list_extract n = function
| [] -> failwith "list_extract"
| h::t when n=0 -> (h,t)
| h::t -> let (x,r) = list_extract (n-1) t in (x,h::r)
let dump_id_sentence corpus =
Array.iter
(fun (id,conll) ->
let sentence = match Conll.get_sentence conll with
| Some s -> s
| None -> Conll.build_sentence conll in
printf "%s#%s\n" id sentence
) corpus
let add_text corpus =
Array.map
(fun (id,conll) ->
match Conll.get_sentence conll with
| Some _ -> (id,conll)
| None -> (id,{conll with meta = conll.Conll.meta @ [(sprintf "# text = %s" (Conll.build_sentence conll))] })
) corpus
let sentid corpus =
let new_corpus = Array.map
(fun (id, conll) ->
(id, Conll.normalize_multiwords (Conll.ensure_sentid_in_meta ~default:id conll))
) corpus in
Conll_corpus.dump new_corpus
let fusion corpus =
let new_corpus = Array.map
(fun (id, conll) -> (id, Conll.normalize_multiwords conll)
) corpus in
Conll_corpus.dump new_corpus
let split corpus id_list =
let (i,o) = List.partition
(function
| (id,conll) when List.mem id id_list -> true
| _ -> false
) (Array.to_list corpus) in
(Array.of_list i, Array.of_list o)
let sub corpus id_list =
let sub_list = CCList.filter_map
(fun id ->
CCArray.find_map (fun (i,c) -> if i=id then Some (i,c) else None) corpus
) id_list in
Array.of_list sub_list
let print_usage () =
List.iter (fun x -> printf "%s\n" x)
[
"Usage: conll_tool.native <subcommand> <args>";
"subcommands are:";
" * normalize <input_corpus_file> <output_corpus_file>";
" dump the input corpus in a normalized way (sorting of features, …)";
" * sentences <corpus_file>";
" dump on stdout the list of \"id#sentence\" contained in the <corpus_file>";
" * sentid <corpus_file>";
" dump the input <corpus_file> with #sentid moved from features into metadata when necessary";
" * add_text <input_corpus_file> <output_corpus_file>";
" add the 'text' meta data build from conll line (for French)";
" * random <corpus_file> <num>";
" split input <corpus_file> into a randomly selected subset on sentence large enough to have at least <num> tokens";
" output is stored in two files with extension _sub.conll (the extracted part) and _rem.conll (the remaining sentences)";
" * split <corpus_file> <id_file>";
" split input <corpus_file> in two files with extension _in.conll (the sentid belongs to <id_file>) and _out.conll (the remaining sentences)";
" * order <corpus_file> <id_file>";
" order input <corpus_file> follwing id giveni in <id_file>";
" * fusion <corpus_file>";
" dump the input <corpus_file> with new lines for fusion words (data taken from _UD_mw_span and _UD_mw_fusion special features)";
" * web_anno <corpus_file> <basename> <size>";
" split <corpus_file> into several set of <size> sentences for inclusion in web_anno. Output files are nammed <basename>_xx.conll";
" * cut <corpus_file> <basename> <size>";
" split <corpus_file> into several set of <size> sentences (metadata are kept, unlike in web_anno subcommand). Output files are nammed <basename>_xx.conll";
" * ustat <corpus_id> <corpus_file>";
" output the list of triple (xpos_gov, label, xpos_dep) with the number of occurences. Output is a list of lines like this one:";
" PRON -[obl]-> NOUN ==> 10";
" also build a local file <corpus_id>_utable.php with the html code for stat browsing";
" * xstat <corpus_id> <corpus_file>";
" same as upos with XPOS stat insted of UPOS";
" * pat <corpus_file> <patch_file>";
" apply a patch file produced by annot_tool (pat stands for \"Post Annot Tool\")";
" * merge <sentid1> <sentid2> <new_sentid> <input_corpus_file> <output_corpus_file>";
" merge two sentences in one";
]
let _ =
match List.tl (Array.to_list Sys.argv) with
(* pat stands for post annot_tool *)
| ["pat"; corpus_name; at_file] ->
let corpus = Conll_corpus.load corpus_name in
let at_list = CCIO.(with_in at_file read_lines_l) in
List.iter (fun at ->
match Str.split (Str.regexp "__\\|#\\|\\.svg#") at with
| [sentid; pos; lab] ->
begin
match CCArray.find_idx (fun (id,_) -> id=sentid) corpus with
| None -> printf "ERROR: sentid \"%s\" not found in corpus\n" sentid; exit 1
| Some (i,(_,conll)) ->
let new_conll = Conll.set_label (Conll.Id.of_string pos) lab conll in
corpus.(i) <- (sentid,new_conll)
end
| _ -> printf "ERROR: cannot parse annot_tool output \"%s\"\n" at; exit 1
) at_list;
Conll_corpus.dump corpus
| "pat"::_ -> printf "ERROR: sub-command \"pat\" expects two arguments\n"; print_usage ()
| ["random"; corpus_name; min_tokens_string] ->
let basename = Filename.basename corpus_name in
let corpus = Conll_corpus.load corpus_name in
Random.self_init ();
let min_tokens =
try int_of_string min_tokens_string
with Failure _ -> printf "ERROR: sub-command \"random\" second arg must be int\n"; print_usage (); exit 0 in
let full_size = Array.length corpus in
let full_list = Array.to_list corpus in
let rec loop size bound (sub, rem) =
if bound < 0
then (sub, rem)
else
let n = Random.int size in
let ((id,conll),new_rem) = list_extract n rem in
loop (size-1) (bound - (List.length conll.Conll.lines)) ((id,conll)::sub, new_rem) in
let (sub,rem) = loop full_size min_tokens ([],full_list) in
Conll_corpus.save (basename^"_sub.conll")
(Array.of_list (List.sort (fun (id1,_) (id2,_) -> Stdlib.compare id1 id2) sub));
Conll_corpus.save (basename^"_rem.conll") (Array.of_list rem)
| "random"::_ -> printf "ERROR: sub-command \"random\" expects two arguments\n"; print_usage ()
| ["split"; corpus_name; id_file] ->
let basename = Filename.basename corpus_name in
let corpus = Conll_corpus.load corpus_name in
let id_list = CCIO.(with_in id_file read_lines_l) in
let (c_in, c_out) = split corpus id_list in
Conll_corpus.save (basename^"_in.conll") c_in;
Conll_corpus.save (basename^"_out.conll") c_out;
printf "Splitting; IN: %d sentences, %d tokens; OUT: %d sentences, %d tokens\n"
(Array.length c_in)
(Conll_corpus.token_size c_in)
(Array.length c_out)
(Conll_corpus.token_size c_out)
| ["split"; corpus_name; id_file; corpus_in] ->
let corpus = Conll_corpus.load corpus_name in
let id_list = CCIO.(with_in id_file read_lines_l) in
let c_in = sub corpus id_list in
Conll_corpus.save (corpus_in) c_in
| "split"::_ -> printf "ERROR: sub-command \"split\" expects two or three arguments\n"; print_usage ()
| ["order"; corpus_name; id_file] ->
let corpus = Conll_corpus.load corpus_name in
let id_list = CCIO.(with_in id_file read_lines_l) in
let new_corpus = Array.of_list (
List.map
(fun id -> match Conll_corpus.get id corpus with
| Some g -> (id, g)
| None -> failwith ("unkonwn id "^id)
) id_list
) in
Conll_corpus.dump new_corpus
| ["fusion"; corpus_name] ->
let corpus = Conll_corpus.load corpus_name in
fusion corpus
| "fusion"::_ -> printf "ERROR: sub-command \"fusion\" expects one argument\n"; print_usage ()
| ["sentences"; corpus_name] ->
let corpus = Conll_corpus.load corpus_name in
dump_id_sentence corpus
| "sentences"::_ -> printf "ERROR: sub-command \"sentences\" expects one argument\n"; print_usage ()
| ["sentid"; corpus_name] ->
let corpus = Conll_corpus.load corpus_name in
sentid corpus
| "sentid"::_ -> printf "ERROR: sub-command \"sentid\" expects one argument\n"; print_usage ()
| ["dot"; corpus_name] ->
let corpus = Conll_corpus.load corpus_name in
printf "%s" (Conll.to_dot (snd corpus.(0)))
| ["dot"; corpus_name; output_file] ->
let corpus = Conll_corpus.load corpus_name in
Conll.save_dot output_file (snd corpus.(0))
| "dot"::_ -> printf "ERROR: sub-command \"dot\" expects one argument\n"; print_usage ()
| ["normalize"; corpus_in; corpus_out] ->
let corpus = Conll_corpus.load corpus_in in
Conll_corpus.save corpus_out corpus
| "normalize"::_ -> printf "ERROR: sub-command \"normalize\" expects two arguments\n"; print_usage ()
| ["add_text"; corpus_in; corpus_out] ->
let corpus = Conll_corpus.load corpus_in in
let new_corpus = add_text corpus in
Conll_corpus.save corpus_out new_corpus
| "add_text"::_ -> printf "ERROR: sub-command \"add_text\" expects two arguments\n"; print_usage ()
| "ustat" :: corpus_id :: corpus_files ->
let corpus = Conll_corpus.load_list corpus_files in
let stat = Stat.build Stat.Upos corpus in
Stat.dump stat;
let html = Stat.to_html corpus_id stat in
CCIO.with_out (corpus_id ^ "_utable.php") (fun oc -> CCIO.write_line oc html)
| "ustat"::_ -> printf "ERROR: sub-command \"ustat\" expects at least two argument\n"; print_usage ()
| "xstat" :: corpus_id :: corpus_files ->
let corpus = Conll_corpus.load_list corpus_files in
let stat = Stat.build Stat.Xpos corpus in
Stat.dump stat;
let html = Stat.to_html corpus_id stat in
CCIO.with_out (corpus_id ^ "_xtable.php") (fun oc -> CCIO.write_line oc html)
| "xstat"::_ -> printf "ERROR: sub-command \"xstat\" expects at least two argument\n"; print_usage ()
| ["web_anno"; corpus_name; base_output; size_string] ->
begin
match int_of_string_opt size_string with
| None -> printf "ERROR: sub-command \"web_anno\" third argument \"%s\" must be int \n" size_string; print_usage ()
| Some size ->
let corpus = Conll_corpus.load corpus_name in
Conll_corpus.web_anno corpus base_output size
end
| "web_anno"::_ -> printf "ERROR: sub-command \"web_anno\" expects 3 arguments\n"; print_usage ()
| ["merge"; sentid1; sentid2; new_sentid; corpus_in; corpus_out; ] ->
let corpus = Conll_corpus.load corpus_in in
(match (CCArray.find_idx (fun (id,_) -> id=sentid1) corpus, CCArray.find_idx (fun (id,_) -> id=sentid2) corpus) with
| (Some (pos1,(id1,c1)), Some (pos2,(id2,c2))) ->
if pos2 - pos1 <> 1
then printf "ERROR Merge is possible only on consecutive sentences\n"
else
begin
let new_conll = Conll.merge new_sentid c1 c2 in
corpus.(pos1) <- (id1, new_conll);
corpus.(pos2) <- ("__REMOVE__", Conll.void);
Conll_corpus.save corpus_out corpus
end
| (None, _) -> printf "ERROR No index \"%s\"\n" sentid1
| (_,None) -> printf "ERROR No index \"%s\"\n" sentid1
)
| "merge"::_ -> printf "ERROR: sub-command \"merge\" expects 5 arguments\n"; print_usage ()
| ["cut"; corpus_in; base_output; subsize; ] ->
begin
match int_of_string_opt subsize with
| None -> printf "ERROR: sub-command \"cut\" second argument \"%s\" must be int \n" subsize; print_usage ()
| Some size ->
let corpus = Conll_corpus.load corpus_in in
let len = Array.length corpus in
let last_ballot = (len-1) / size in
for i = 0 to last_ballot do
let out = sprintf "%s_%03d" base_output i in
Conll_corpus.save_sub out (i*size) (min ((i+1)*size-1) (len-1)) corpus
done
end
| "cut"::_ -> printf "ERROR: sub-command \"cut\" expects 3 arguments\n"; print_usage ()
| ["sud_to_json"] ->
begin
try
let cx = Conllx_corpus.read_stdin ~config:(Conllx_config.build "sud") () in
Array.iter (fun (_,conllx) ->
let json = Conllx.to_json conllx in
Printf.printf "%s\n" (Yojson.Basic.pretty_to_string json)
) (Conllx_corpus.get_data cx)
with
| Conllx_error js -> printf " === Conllx_error === \n%s\n ====================\n" (Yojson.Basic.pretty_to_string js)
end
| ["sud_of_json"] ->
begin
try
let json = Yojson.Basic.from_channel stdin in
let conll = Conllx.of_json json in
Printf.printf "%s\n" (Conllx.to_string ~config:(Conllx_config.build "sud") conll)
with
| Conllx_error js -> printf " === Conllx_error === \n%s\n ====================\n" (Yojson.Basic.pretty_to_string js)
end
| ["ud_to_json"] ->
begin
try
let cx = Conllx_corpus.read_stdin ~config:(Conllx_config.build "ud") () in
Array.iter (fun (_,conllx) ->
let json = Conllx.to_json conllx in
Printf.printf "%s\n" (Yojson.Basic.pretty_to_string json)
) (Conllx_corpus.get_data cx)
with
| Conllx_error js -> printf " === Conllx_error === \n%s\n ====================\n" (Yojson.Basic.pretty_to_string js)
end
| ["ud_of_json"] ->
begin
try
let json = Yojson.Basic.from_channel stdin in
let conll = Conllx.of_json json in
Printf.printf "%s\n" (Conllx.to_string ~config:(Conllx_config.build "ud") conll)
with
| Conllx_error js -> printf " === Conllx_error === \n%s\n ====================\n" (Yojson.Basic.pretty_to_string js)
end
| ["seq_to_json"] ->
begin
try
let cx = Conllx_corpus.read_stdin ~config:(Conllx_config.build "sequoia") () in
Array.iter (fun (_,conllx) ->
let json = Conllx.to_json conllx in
Printf.printf "%s\n" (Yojson.Basic.pretty_to_string json)
) (Conllx_corpus.get_data cx)
with
| Conllx_error js -> printf " === Conllx_error === \n%s\n ====================\n" (Yojson.Basic.pretty_to_string js)
end
| ["seq_of_json"] ->
begin
try
let json = Yojson.Basic.from_channel stdin in
let conll = Conllx.of_json json in
Printf.printf "%s\n" (Conllx.to_string ~config:(Conllx_config.build "sequoia") ~columns:Conllx_columns.frsemcor conll)
with
| Conllx_error js -> printf " === Conllx_error === \n%s\n ====================\n" (Yojson.Basic.pretty_to_string js)
end
| [] -> print_usage ()
| x :: _ -> printf "ERROR: unknown sub-command \"%s\"\n" x; print_usage ()