Commit 29fd6d3f authored by bguillaum's avatar bguillaum
Browse files

- add conll format support

git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/semagramme/libcaml-grew/trunk@6723 7838e531-6607-4d57-9587-6c381814729c
parent d14601e3
...@@ -46,11 +46,19 @@ end ...@@ -46,11 +46,19 @@ end
module Feature_structure = struct module Feature_structure = struct
(* list are supposed to be striclty ordered wrt compare*) (* list are supposed to be striclty ordered wrt compare*)
type t = Feature.t list type t = Feature.t list
let build ?domain ast_fs = let build ?domain ast_fs =
let unsorted = List.map (Feature.build ?domain) ast_fs in let unsorted = List.map (Feature.build ?domain) ast_fs in
List.sort Feature.compare unsorted List.sort Feature.compare unsorted
let of_conll line =
let morph_fs =
List.map (fun (feat_name, feat_value) -> Feature.Equal (feat_name, [feat_value])) line.Conll.morph in
Feature.Equal ("phon", [line.Conll.phon]) ::
Feature.Equal ("lemma", [line.Conll.lemma]) ::
Feature.Equal ("cat", [line.Conll.pos2]) ::
morph_fs
let empty = [] let empty = []
let rec get name = function let rec get name = function
......
...@@ -10,6 +10,8 @@ module Feature_structure: sig ...@@ -10,6 +10,8 @@ module Feature_structure: sig
val build: ?domain:Ast.domain -> Ast.feature list -> t val build: ?domain:Ast.domain -> Ast.feature list -> t
val of_conll: Conll.line -> t
val get: string -> t -> string list option val get: string -> t -> string list option
val get_atom: string -> t -> string option val get_atom: string -> t -> string option
......
...@@ -195,7 +195,6 @@ module G_graph = struct ...@@ -195,7 +195,6 @@ module G_graph = struct
| Feat of (Gid.t * string) | Feat of (Gid.t * string)
| String of string | String of string
let map_add_edge map id_src label id_tar = let map_add_edge map id_src label id_tar =
let node_src = let node_src =
(* Not found can be raised when adding an edge from pos to neg *) (* Not found can be raised when adding an edge from pos to neg *)
...@@ -204,21 +203,18 @@ module G_graph = struct ...@@ -204,21 +203,18 @@ module G_graph = struct
| None -> None | None -> None
| Some new_node -> Some (Gid_map.add id_src new_node map) | Some new_node -> Some (Gid_map.add id_src new_node map)
let build ?domain ?(locals=[||]) full_node_list full_edge_list = let build ?domain ?(locals=[||]) full_node_list full_edge_list =
let (named_nodes, constraints) = let named_nodes =
let rec loop already_bound = function let rec loop already_bound = function
| [] -> ([],[]) | [] -> []
| (ast_node, loc) :: tail -> | (ast_node, loc) :: tail ->
let (tail_nodes, tail_const) = loop (ast_node.Ast.node_id :: already_bound) tail in let tail = loop (ast_node.Ast.node_id :: already_bound) tail in
if List.mem ast_node.Ast.node_id already_bound if List.mem ast_node.Ast.node_id already_bound
then (tail_nodes, (ast_node, loc)::tail_const) then Log.fcritical "[GRS] [Graph.build] try to build a graph with twice the same node id '%s'" ast_node.Ast.node_id
else (G_node.build ?domain (ast_node, loc) :: tail_nodes, tail_const) in else G_node.build ?domain (ast_node, loc) :: tail in
loop [] full_node_list in loop [] full_node_list in
(* let named_nodes = List.map (Node.build ?domain) full_node_list in *)
let sorted_nodes = List.sort (fun (id1,_) (id2,_) -> Pervasives.compare id1 id2) named_nodes in let sorted_nodes = List.sort (fun (id1,_) (id2,_) -> Pervasives.compare id1 id2) named_nodes in
let (sorted_ids, node_list) = List.split sorted_nodes in let (sorted_ids, node_list) = List.split sorted_nodes in
...@@ -245,6 +241,27 @@ module G_graph = struct ...@@ -245,6 +241,27 @@ module G_graph = struct
{map=map;lub=Array.length table} {map=map;lub=Array.length table}
let of_conll lines =
let nodes =
List.fold_left
(fun acc line -> Gid_map.add line.Conll.num (G_node.of_conll line) acc)
Gid_map.empty lines in
let nodes_with_edges =
List.fold_left
(fun acc line ->
if line.Conll.gov=0
then acc
else
let gov_node = Gid_map.find line.Conll.gov acc in
match G_node.add_edge (G_edge.make line.Conll.dep_lab) line.Conll.num gov_node with
| None -> acc
| Some new_node -> Gid_map.add line.Conll.gov new_node acc
) nodes lines in
{map = nodes_with_edges; lub=1+(Gid_map.fold (fun _ _ acc -> acc+1) nodes_with_edges 0)}
(* ---------------------------------------------------- *) (* ---------------------------------------------------- *)
(* Update functions *) (* Update functions *)
(* ---------------------------------------------------- *) (* ---------------------------------------------------- *)
......
...@@ -65,6 +65,9 @@ module G_graph: sig ...@@ -65,6 +65,9 @@ module G_graph: sig
Ast.node list -> Ast.node list ->
Ast.edge list -> Ast.edge list ->
t t
val of_conll: Conll.line list -> t
val to_gr: t -> string val to_gr: t -> string
val to_dot: ?main_feat:string -> ?deco:Deco.t -> t -> string val to_dot: ?main_feat:string -> ?deco:Deco.t -> t -> string
val to_dep: ?main_feat:string -> ?deco:Deco.t -> t -> string val to_dep: ?main_feat:string -> ?deco:Deco.t -> t -> string
......
...@@ -42,6 +42,13 @@ module G_node = struct ...@@ -42,6 +42,13 @@ module G_node = struct
next = Massoc.empty; next = Massoc.empty;
} ) } )
let of_conll line = {
fs = Feature_structure.of_conll line;
pos = Some line.Conll.num;
next = Massoc.empty;
}
let remove id_tar label t = {t with next = Massoc.remove id_tar label t.next} let remove id_tar label t = {t with next = Massoc.remove id_tar label t.next}
let remove_key node_id t = let remove_key node_id t =
......
...@@ -29,6 +29,8 @@ module G_node: sig ...@@ -29,6 +29,8 @@ module G_node: sig
val add_edge: G_edge.t -> int -> t -> t option val add_edge: G_edge.t -> int -> t -> t option
val build: ?domain:Ast.domain -> Ast.node -> (Id.name * t) val build: ?domain:Ast.domain -> Ast.node -> (Id.name * t)
val of_conll: Conll.line -> t
val pos_comp: t -> t -> int val pos_comp: t -> t -> int
val build_neighbour: t -> t val build_neighbour: t -> t
......
...@@ -31,6 +31,9 @@ module Instance = struct ...@@ -31,6 +31,9 @@ module Instance = struct
let graph = G_graph.build gr_ast.Ast.nodes gr_ast.Ast.edges in let graph = G_graph.build gr_ast.Ast.nodes gr_ast.Ast.edges in
{ empty with graph = graph } { empty with graph = graph }
let of_conll lines =
{ empty with graph = G_graph.of_conll lines }
let rev_steps t = let rev_steps t =
{ t with big_step = match t.big_step with { t with big_step = match t.big_step with
| None -> None | None -> None
......
...@@ -17,6 +17,8 @@ module Instance : sig ...@@ -17,6 +17,8 @@ module Instance : sig
val build: Ast.gr -> t val build: Ast.gr -> t
val of_conll: Conll.line list -> t
(* rev_steps reverse the small step list: during rewriting, the last rule is in the head of the list and the reverse is needed for display *) (* rev_steps reverse the small step list: during rewriting, the last rule is in the head of the list and the reverse is needed for display *)
val rev_steps: t -> t val rev_steps: t -> t
......
...@@ -435,3 +435,39 @@ module Html = struct ...@@ -435,3 +435,39 @@ module Html = struct
fprintf out_ch "</html>\n"; fprintf out_ch "</html>\n";
end end
module Conll = struct
type line = {
num: int;
phon: string;
lemma: string;
pos1: string;
pos2: string;
morph: (string * string) list;
gov: int;
dep_lab: string;
}
let parse_morph morph =
List.map
(fun feat ->
match Str.split (Str.regexp "=") feat with
| [feat_name; feat_value] -> (feat_value, feat_value)
| [feat_name] -> (feat_name, "true")
| _ -> Log.fcritical "Cannot not parse CONLL feat '%s' (too many '=')" morph
) (Str.split (Str.regexp "|") morph)
let parse line =
match Str.split (Str.regexp "\t") line with
| [ num; phon; lemma; pos1; pos2; morph; gov; dep_lab; _; _ ] ->
{num = int_of_string num;
phon = phon;
lemma = lemma;
pos1 = pos1;
pos2 = pos2;
morph = parse_morph morph;
gov = int_of_string gov;
dep_lab = dep_lab;
}
| _ -> Log.fcritical "Cannot not parse CONLL line '%s'" line
end
...@@ -148,3 +148,18 @@ module Html: sig ...@@ -148,3 +148,18 @@ module Html: sig
val enter: out_channel -> ?title: string -> ?header: string -> string -> unit val enter: out_channel -> ?title: string -> ?header: string -> string -> unit
val leave: out_channel -> unit val leave: out_channel -> unit
end end
module Conll: sig
type line = {
num: int;
phon: string;
lemma: string;
pos1: string;
pos2: string;
morph: (string * string) list;
gov: int;
dep_lab: string;
}
val parse: string -> line
end
...@@ -64,6 +64,10 @@ let load_gr file = ...@@ -64,6 +64,10 @@ let load_gr file =
raise (File_dont_exists file) raise (File_dont_exists file)
) )
let load_conll file =
let lines = File.read file in
Instance.of_conll (List.map Conll.parse lines)
let rewrite ~gr ~grs ~seq = let rewrite ~gr ~grs ~seq =
try Grs.rewrite grs seq gr try Grs.rewrite grs seq gr
with with
......
...@@ -54,6 +54,8 @@ val empty_gr : Instance.t ...@@ -54,6 +54,8 @@ val empty_gr : Instance.t
*) *)
val load_gr: string -> Instance.t val load_gr: string -> Instance.t
val load_conll: string -> Instance.t
val save_index: dirname:string -> base_names: string list -> unit val save_index: dirname:string -> base_names: string list -> unit
val write_html: val write_html:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment