Commit 72c42330 authored by bguillaum's avatar bguillaum

take ranges into account in conlls

git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/semagramme/libcaml-grew/trunk@8829 7838e531-6607-4d57-9587-6c381814729c
parent 71a13255
......@@ -559,18 +559,26 @@ end (* module Massoc_make *)
(* ================================================================================ *)
module Id = struct
type name = string
type t = int
type table = name array
type 'a gtable = 'a array * ('a -> string)
let gbuild ?(loc:Loc.t option) key (table,conv) =
try Array_.dicho_find key table
with Not_found -> Error.build ?loc "Identifier '%s' not found" (conv key)
let build ?(loc:Loc.t option) string table =
try Array_.dicho_find string table
with Not_found -> Error.build ?loc "Identifier '%s' not found" string
let gbuild_opt key (table, _) =
try Some (Array_.dicho_find key table)
with Not_found -> None
type name = string
type table = string array
let build ?(loc:Loc.t option) key table =
try Array_.dicho_find key table
with Not_found -> Error.build ?loc "Identifier '%s' not found" key
let build_opt string table =
try Some (Array_.dicho_find string table)
let build_opt key table =
try Some (Array_.dicho_find key table)
with Not_found -> None
end (* module Id *)
......
......@@ -241,15 +241,23 @@ end
(* ================================================================================ *)
module Id: sig
type name = string
type t = int
type table = name array
type 'a gtable = 'a array * ('a -> string)
(* [Stop] is raised if [string] is not in [gtable] *)
val gbuild: ?loc:Loc.t -> 'a -> 'a gtable -> t
val gbuild_opt: 'a -> 'a gtable -> t option
type name = string
type table = string array
(* [Stop] is raised if [string] is not in [table] *)
val build: ?loc:Loc.t -> name -> table -> t
val build_opt: name -> table -> t option
end
(* ================================================================================ *)
......
......@@ -210,9 +210,10 @@ module G_graph = struct
type t = {
meta: (string * string) list; (* meta-informations *)
map: G_node.t Gid_map.t; (* node description *)
fusion: (Gid.t * (Gid.t * string)) list; (* the list of fusion word considered in UD conll *)
}
let empty = {meta=[]; map=Gid_map.empty}
let empty = {meta=[]; map=Gid_map.empty; fusion=[]}
(* ---------------------------------------------------------------------- *)
let rename mapping graph =
......@@ -337,13 +338,15 @@ module G_graph = struct
)
) map_without_edges full_edge_list in
{meta=gr_ast.Ast.meta; map=map}
{meta=gr_ast.Ast.meta; map=map; fusion = []}
(* -------------------------------------------------------------------------------- *)
let of_conll ?loc lines =
let of_conll ?loc (meta, lines, range_lines) =
let sorted_lines = Conll.root :: (List.sort Conll.compare lines) in
let table = Array.of_list (List.map (fun line -> line.Conll.num) sorted_lines) in
let gtable = (Array.of_list (List.map (fun line -> line.Conll.num) sorted_lines), string_of_int) in
let map_without_edges =
List_.foldi_left
......@@ -356,10 +359,10 @@ module G_graph = struct
(fun acc line ->
(* add line number information in loc *)
let loc = Loc.opt_set_line line.Conll.line_num loc in
let dep_id = Id.build ?loc line.Conll.num table in
let dep_id = Id.gbuild ?loc line.Conll.num gtable in
List.fold_left
(fun acc2 (gov, dep_lab) ->
let gov_id = Id.build ?loc gov table in
let gov_id = Id.gbuild ?loc gov gtable in
let edge = G_edge.make ?loc dep_lab in
(match map_add_edge acc2 (Gid.Old gov_id) edge (Gid.Old dep_id) with
| Some g -> g
......@@ -369,7 +372,17 @@ module G_graph = struct
)
) acc line.Conll.deps
) map_without_edges lines in
{meta=[]; map=map_with_edges}
let fusion =
List.map
(fun {Conll.first; last; fusion} ->
( Gid.Old (Id.gbuild ?loc first gtable),
(Gid.Old (Id.gbuild ?loc last gtable),
fusion)
)
) range_lines in
{meta; map=map_with_edges; fusion}
(* -------------------------------------------------------------------------------- *)
(** input : "Le/DET/le petit/ADJ/petit chat/NC/chat dort/V/dormir ./PONCT/." *)
......@@ -384,17 +397,17 @@ module G_graph = struct
| _ -> [] in
{
Conll.line_num=0;
num = sprintf "%d" (i+1);
num = i+1;
phon;
lemma;
pos1 = "_";
pos2 = pos;
morph;
deps = [(sprintf "%d" i, "SUC")]
deps = [(i, "SUC")]
}
| _ -> Error.build "[Graph.of_brown] Cannot parse Brown item >>>%s<<< (expected \"phon/POS/lemma\")" item
) units in
of_conll conll_lines
of_conll ([], conll_lines, [])
(* -------------------------------------------------------------------------------- *)
let opt_att atts name =
......@@ -441,7 +454,7 @@ module G_graph = struct
new_map
| _ -> Log.critical "[G_graph.of_xml] Not a wellformed <R> tag"
) nodes_without_edges r_list in
{meta=[]; map=final_map}
{meta=[]; map=final_map; fusion=[]}
| _ -> Log.critical "[G_graph.of_xml] Not a <D> tag"
(* -------------------------------------------------------------------------------- *)
......@@ -819,8 +832,18 @@ module G_graph = struct
) graph.map Gid_map.empty in
let buff = Buffer.create 32 in
List.iter (fun (k,v) -> bprintf buff "# %s: %s\n" k v) graph.meta;
List.iter
(fun (gid, node) ->
begin
try
let (gid_last,fusion) = List.assoc gid graph.fusion in
bprintf buff "%g-%g\t%s\t_\t_\t_\t_\t_\t_\t_\t_\n"
(get_num gid) (get_num gid_last) fusion
with
| Not_found -> ()
end;
if not (G_node.is_conll_root node)
then
let gov_labs = try Gid_map.find gid govs_labs with Not_found -> [] in
......
......@@ -103,7 +103,7 @@ module G_graph: sig
val build: ?locals: Label.decl array -> Ast.gr -> t
val of_conll: ?loc:Loc.t -> Conll.line list -> t
val of_conll: ?loc:Loc.t -> Conll.t -> t
(** input : "Le/DET/le petit/ADJ/petit chat/NC/chat dort/V/dormir ./PONCT/."
It supposes that "SUC" is defined in current relations *)
......
......@@ -66,7 +66,7 @@ module G_node = struct
let of_conll ?loc line =
if line = Conll.root
then { empty with conll_root=true }
else { empty with fs = G_fs.of_conll ?loc line; position = float_of_string line.Conll.num }
else { empty with fs = G_fs.of_conll ?loc line; position = float line.Conll.num }
let remove (id_tar : Gid.t) label t = {t with next = Massoc_gid.remove id_tar label t.next}
......
......@@ -369,24 +369,35 @@ end (* Domain *)
module Conll = struct
type line = {
line_num: int;
num: string;
num: int;
phon: string;
lemma: string;
pos1: string;
pos2: string;
morph: (string * string) list;
deps: (string * string ) list;
deps: (int * string ) list;
}
let root = { line_num = -1; num="0"; phon="ROOT"; lemma="__"; pos1="_X"; pos2=""; morph=[]; deps=[] }
let root = { line_num = -1; num=0; phon="ROOT"; lemma="__"; pos1="_X"; pos2=""; morph=[]; deps=[] }
let line_to_string l =
let (gov_list, lab_list) = List.split l.deps in
sprintf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s"
sprintf "%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s"
l.num l.phon l.lemma l.pos1 l.pos2
(match l.morph with [] -> "_" | list -> String.concat "|" (List.map (fun (f,v) -> sprintf "%s=%s" f v) list))
(String.concat "|" (gov_list))
(String.concat "|" (lab_list))
(List_.to_string string_of_int "|" gov_list)
(String.concat "|" lab_list)
type range_line = {
first: int;
last: int;
fusion: string;
}
let range_line_to_string l =
sprintf "%d-%d\t%s\t_\t_\t_\t_\t_\t_\t_\t_" l.first l.last l.fusion
type t = (string * string) list * line list * range_line list
let parse_morph file_name line_num = function
| "_" -> []
......@@ -401,42 +412,48 @@ module Conll = struct
let underscore s = if s = "" then "_" else s
exception Dash
let contain_dash s =
try String.iter (function '-' -> raise Dash | _ -> ()) s; false
with Dash -> true
let parse_line file_name (line_num, line) =
match Str.split (Str.regexp "\t") line with
| num :: _ when contain_dash num -> None
| [ num; phon; lemma; pos1; pos2; morph; govs; dep_labs; _; _ ] ->
begin
try
let gov_list = if govs = "_" then [] else Str.split (Str.regexp "|") govs
and lab_list = if dep_labs = "_" then [] else Str.split (Str.regexp "|") dep_labs in
let deps = List.combine gov_list lab_list in
Some {
line_num = line_num;
num = num;
phon = underscore phon;
lemma = underscore lemma;
pos1 = underscore pos1;
pos2 = underscore pos2;
morph = parse_morph file_name line_num morph;
deps = deps;
}
with exc -> Error.build ~loc:(Loc.file_line file_name line_num) "[Conll.load] illegal line, exc=%s\n>>>>>%s<<<<<<" (Printexc.to_string exc) line
end
| l -> Error.build ~loc:(Loc.file_line file_name line_num) "[Conll.load] illegal line, %d fields (10 are expected)\n>>>>>%s<<<<<<" (List.length l) line
let parse file_name lines =
List.fold_right
(fun (line_num, line) (acc_meta, acc_basic, acc_fusion) ->
if line.[0] = '#'
then
begin
match Str.bounded_split (Str.regexp "\\(# *\\)\\|\\( *: *\\)") line 2 with
| [tag; value] -> ((tag, value) :: acc_meta, acc_basic, acc_fusion)
| _ -> (acc_meta, acc_basic, acc_fusion)
end
else
match Str.split (Str.regexp "\t") line with
| [ f1; phon; lemma; pos1; pos2; morph; govs; dep_labs; _; _ ] ->
begin
try
match Str.split (Str.regexp "-") f1 with
| [f;l] -> (acc_meta, acc_basic, {first=int_of_string f; last=int_of_string l; fusion=phon}:: acc_fusion)
| [num] ->
let gov_list = if govs = "_" then [] else List.map int_of_string (Str.split (Str.regexp "|") govs)
and lab_list = if dep_labs = "_" then [] else Str.split (Str.regexp "|") dep_labs in
let deps = List.combine gov_list lab_list in
(acc_meta, {
line_num = line_num;
num = int_of_string num;
phon = underscore phon;
lemma = underscore lemma;
pos1 = underscore pos1;
pos2 = underscore pos2;
morph = parse_morph file_name line_num morph;
deps = deps;
}::acc_basic, acc_fusion)
| _ -> Error.build ~loc:(Loc.file_line file_name line_num) "[Conll.load] illegal field one >>>>>%s<<<<<<" f1
with exc -> Error.build ~loc:(Loc.file_line file_name line_num) "[Conll.load] illegal line, exc=%s\n>>>>>%s<<<<<<" (Printexc.to_string exc) line
end
| l -> Error.build ~loc:(Loc.file_line file_name line_num) "[Conll.load] illegal line, %d fields (10 are expected)\n>>>>>%s<<<<<<" (List.length l) line
) lines ([], [], [])
let load file_name =
let lines = File.read_ln file_name in
List_.opt_map (parse_line file_name) lines
let parse file_name lines = List_.opt_map (parse_line file_name) lines
parse file_name lines
(* We would prefer to compare the float equivalent of l1.num l2.num but this would break the dicho_find function *)
let compare l1 l2 = Pervasives.compare ((* float_of_string *) l1.num) ((* float_of_string *) l2.num)
let compare l1 l2 = Pervasives.compare l1.num l2.num
end (* module Conll *)
(* ================================================================================ *)
......
......@@ -142,23 +142,33 @@ end (* module Domain *)
(* ================================================================================ *)
module Conll: sig
type line = {
line_num: int;
num: string;
phon: string;
lemma: string;
pos1: string;
pos2: string;
morph: (string * string) list;
deps: (string * string ) list;
}
line_num: int;
num: int;
phon: string;
lemma: string;
pos1: string;
pos2: string;
morph: (string * string) list;
deps: (int * string ) list;
}
val root: line
val line_to_string: line -> string
val root:line
type range_line = {
first: int;
last: int;
fusion: string;
}
val range_line_to_string: range_line -> string
type t = (string * string) list * line list * range_line list
val load: string -> line list
val load: string -> t
val parse: string -> (int * string) list -> line list
val parse: string -> (int * string) list -> t
val compare: line -> line -> int
end (* module Conll *)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment