Commit 2fa9e731 authored by bguillaum's avatar bguillaum
Browse files

version 0.29:

* renumbering of conll output
* ignore UDT lines with "i-j" number in UDT Conll

git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/semagramme/libcaml-grew/trunk@8779 7838e531-6607-4d57-9587-6c381814729c
parent dc37a6f8
VERSION = 0.28.1
VERSION = 0.29
DATA_DIR = @prefix@@datarootdir@/libgrew/
DOC_DIR = @prefix@@datarootdir@/libgrew-doc/
......
......@@ -791,8 +791,15 @@ module G_graph = struct
let nodes = Gid_map.fold
(fun gid node acc -> (gid,node)::acc)
graph.map [] in
(* sort nodes wrt position *)
let snodes = List.sort (fun (_,n1) (_,n2) -> G_node.position_comp n1 n2) nodes in
(* renumbering of nodes to have a consecutive sequence of int 1 --> n, in case of node deletion or addition *)
let snodes = List_.mapi
(fun i (gid,node) -> (gid, G_node.set_position (float i) node)
) snodes in
let get_num gid =
let gnode = List.assoc gid snodes in
if G_node.is_conll_root gnode
......@@ -836,8 +843,8 @@ module G_graph = struct
(get_num gid)
(match G_fs.get_string_atom "phon" fs with Some p -> p | None -> "_e_")
(match G_fs.get_string_atom "lemma" fs with Some p -> p | None -> "_e_")
(match G_fs.get_string_atom "cat" fs with Some p -> p | None -> "X")
(match G_fs.get_string_atom "pos" fs with Some p -> p | None -> "X")
(match G_fs.get_string_atom "cat" fs with Some p -> p | None -> "_")
(match G_fs.get_string_atom "pos" fs with Some p -> p | None -> "_")
(G_fs.to_conll ~exclude: ["phon"; "lemma"; "cat"; "pos"; "position"] fs)
(String.concat "|" govs)
(String.concat "|" labs)
......
......@@ -388,22 +388,30 @@ module Conll = struct
) (Str.split (Str.regexp "|") morph)
(* Replace an empty field by the "_" placeholder; any other string is returned unchanged. *)
let underscore = function
  | "" -> "_"
  | s -> s
(* [Dash] is no longer raised by [contain_dash]; the declaration is kept so that
   any other code referring to it still compiles. *)
exception Dash

(* [contain_dash s] is [true] iff the character '-' occurs somewhere in [s].
   [parse_line] uses it to recognise lines whose number field has the "i-j" form.
   Relies on the standard [String.contains] rather than an exception-based early exit. *)
let contain_dash s = String.contains s '-'
let parse_line file_name (line_num, line) =
match Str.split (Str.regexp "\t") line with
| num :: _ when contain_dash num -> None
| [ num; phon; lemma; pos1; pos2; morph; govs; dep_labs; _; _ ] ->
begin
try
let gov_list = if govs = "_" then [] else Str.split (Str.regexp "|") govs
and lab_list = if dep_labs = "_" then [] else Str.split (Str.regexp "|") dep_labs in
let deps = List.combine gov_list lab_list in
{line_num = line_num;
num = num;
phon = underscore phon;
lemma = underscore lemma;
pos1 = underscore pos1;
pos2 = underscore pos2;
morph = parse_morph file_name line_num morph;
deps = deps;
Some {
line_num = line_num;
num = num;
phon = underscore phon;
lemma = underscore lemma;
pos1 = underscore pos1;
pos2 = underscore pos2;
morph = parse_morph file_name line_num morph;
deps = deps;
}
with exc -> Error.build ~loc:(Loc.file_line file_name line_num) "[Conll.load] illegal line, exc=%s\n>>>>>%s<<<<<<" (Printexc.to_string exc) line
end
......@@ -411,9 +419,9 @@ module Conll = struct
let load file_name =
let lines = File.read_ln file_name in
List.map (parse_line file_name) lines
List_.opt_map (parse_line file_name) lines
let parse file_name lines = List.map (parse_line file_name) lines
let parse file_name lines = List_.opt_map (parse_line file_name) lines
(* Orders two lines by their [num] field exactly as stored, using the generic comparison.
   Comparing [float_of_string] of both fields would be preferable,
   but this would break the dicho_find function. *)
let compare l1 l2 =
  let key line = line.num in
  Pervasives.compare (key l1) (key l2)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment