Commit f39ae3c4 authored by bguillaum

Improve Brown-format loading (more robust, and takes the sentence identifier into account)

git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/semagramme/libcaml-grew/trunk@8755 7838e531-6607-4d57-9587-6c381814729c
parent 00ab496e
......@@ -373,11 +373,15 @@ module G_graph = struct
(* -------------------------------------------------------------------------------- *)
(** input : "Le/DET/le petit/ADJ/petit chat/NC/chat dort/V/dormir ./PONCT/." *)
let of_brown brown =
let of_brown ?sentid brown =
let units = Str.split (Str.regexp " ") brown in
let conll_lines = List_.mapi
(fun i item -> match Str.split (Str.regexp "/") item with
| [phon;pos;lemma] ->
(fun i item -> match Str.full_split (Str.regexp "/[A-Z'+'']+/") item with
| [Str.Text phon; Str.Delim pos; Str.Text lemma] ->
let pos = String.sub pos 1 ((String.length pos)-2) in
let morph = match (i,sentid) with
| (0,Some id) -> [("sentid", id)]
| _ -> [] in
{
Conll.line_num=0;
num = sprintf "%d" (i+1);
......@@ -385,10 +389,10 @@ module G_graph = struct
lemma;
pos1 = "_";
pos2 = pos;
morph = [];
morph;
deps = [(sprintf "%d" i, "SUC")]
}
| _ -> failwith "Unexpected MElt output"
| _ -> Error.build "[Graph.of_brown] Cannot parse Brown item >>>%s<<< (expected \"phon/POS/lemma\")" item
) units in
of_conll conll_lines
......
......@@ -107,7 +107,7 @@ module G_graph: sig
(** input : "Le/DET/le petit/ADJ/petit chat/NC/chat dort/V/dormir ./PONCT/."
It assumes that the "SUC" relation is defined in the current relations *)
val of_brown: string -> t
val of_brown: ?sentid: string -> string -> t
val of_xml: Xml.xml -> t
(* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ *)
......
......@@ -131,10 +131,10 @@ let of_conll file_name line_list =
Instance.from_graph graph
) ()
let of_brown brown =
let of_brown ?sentid brown =
handle ~name:"of_brown"
(fun () ->
let graph = G_graph.of_brown brown in
let graph = G_graph.of_brown ?sentid brown in
Instance.from_graph graph
) ()
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment