Commit 82f53a1b authored by bguillaum's avatar bguillaum

improve parse_structure_tree handling

git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/semagramme/libcaml-grew/trunk@9093 7838e531-6607-4d57-9587-6c381814729c
parent d86f274c
......@@ -347,33 +347,12 @@ module Ast = struct
let empty_grs = { domain = None; modules = []; strategies= [] }
(* phrase structure tree *)
type pst =
| Leaf of (Loc.t * string) (* phon *)
| T of (Loc.t * string * pst list)
let rec word_list = function
| Leaf (_, p) -> [p]
| T (_,_,l) -> List.flatten (List.map word_list l)
end (* module Ast *)
(* ================================================================================ *)
module Constituent = struct
type t =
| Leaf of (string * string) (* cat, phon *)
| T of (string * t list)
let to_gr t =
let rec loop gorn nodes edges = function
| Leaf (cat,phon) ->
let fs = [{Ast.name="cat"; kind= Ast.Equality [cat]}, (Loc.empty); {Ast.name="phon"; kind= Ast.Equality [phon]}, (Loc.empty)] in
let node = ({Ast.node_id = gorn; position = None; fs}, Loc.empty) in
(gorn, node :: nodes, edges)
| T (cat, daugthers) ->
let (nodes',edges') = List_.foldi_left
(fun i (acc_nodes, acc_edges) daugther ->
let (id, new_acc_nodes, new_acc_edges) = loop (gorn^(string_of_int i)) acc_nodes acc_edges daugther in
let new_edge = ({Ast.edge_id= None; src = gorn; edge_label_cst= Ast.Pos_list ["_"]; tar= id}, Loc.empty) in
(new_acc_nodes, new_edge :: new_acc_edges)
) (nodes, edges) daugthers in
let fs = [{Ast.name="cat"; kind= Ast.Equality [cat]}, (Loc.empty)] in
let new_nodes = ({Ast.node_id = gorn; position = None; fs}, Loc.empty) :: nodes' in
(gorn, new_nodes, edges') in
let (_,nodes,edges) = loop "N" [] [] t in
{Ast.nodes; edges; meta=[]}
end (* module Constituent *)
......@@ -174,11 +174,6 @@ module Ast : sig
mod_dir: string; (* the directory where the module is defined (for lp file localisation) *)
}
type module_or_include =
| Modul of modul
| Includ of (string * Loc.t)
......@@ -210,13 +205,11 @@ module Ast : sig
}
val empty_grs: grs
end (* module Ast *)
module Constituent : sig
type t =
| Leaf of (string * string) (* cat, phon *)
| T of (string * t list)
val to_gr: t -> Ast.gr
end
(* phrase structure tree *)
type pst =
| Leaf of (Loc.t * string) (* phon *)
| T of (Loc.t * string * pst list)
val word_list: pst -> string list
end (* module Ast *)
......@@ -49,6 +49,10 @@ module G_edge = struct
let make ?loc ?domain string = Label.from_string ?loc ?domain string
let sub = make "__SUB__"
let succ = make "__SUCC__"
let prec = make "__PREC__"
let build ?domain (ast_edge, loc) =
match ast_edge.Ast.edge_label_cst with
| Ast.Pos_list [one] -> Label.from_string ~loc ?domain one
......
......@@ -37,6 +37,10 @@ module G_edge: sig
val make: ?loc:Loc.t -> ?domain:Domain.t -> string -> t
val sub: t
val succ: t
val prec: t
val build: ?domain:Domain.t -> Ast.edge -> t
val is_void: ?domain:Domain.t -> t -> bool
......
......@@ -222,6 +222,11 @@ module G_fs = struct
| s -> ("lemma", Feature_value.build_value ?loc ?domain "lemma" s) :: raw_list1 in
List.sort G_feature.compare raw_list2
(* ---------------------------------------------------------------------- *)
let pst_leaf ?loc ?domain phon = [("phon", Feature_value.build_value ?loc ?domain "phon" phon)]
let pst_node ?loc ?domain cat = [("cat", Feature_value.build_value ?loc ?domain "cat" cat)]
(* ---------------------------------------------------------------------- *)
exception Fail_unif
let unif fs1 fs2 =
......@@ -238,8 +243,8 @@ module G_fs = struct
(* ---------------------------------------------------------------------- *)
let get_main ?main_feat t =
let main_list = match main_feat with
| None -> ["phon";"label"]
| Some string -> Str.split (Str.regexp "\\( *; *\\)\\|#") string in
| None -> ["phon";"label"; "cat"]
| Some string -> (Str.split (Str.regexp "\\( *; *\\)\\|#") string) @ ["phon";"label"; "cat"] in
let rec loop = function
| [] -> (None, t)
| feat_name :: tail ->
......
......@@ -51,6 +51,9 @@ module G_fs: sig
val of_conll: ?loc:Loc.t -> ?domain:Domain.t -> Conll.line -> t
val pst_leaf: ?loc:Loc.t -> ?domain:Domain.t -> string -> t
val pst_node: ?loc:Loc.t -> ?domain:Domain.t -> string -> t
(** [unif t1 t2] returns [Some t] if [t] is the unification of two graph feature structures
[None] is returned if the two feature structures cannot be unified. *)
val unif: t -> t -> t option
......
......@@ -212,7 +212,7 @@ module G_graph = struct
meta: string list; (* meta-informations *)
map: G_node.t Gid_map.t; (* node description *)
fusion: (Gid.t * (Gid.t * string)) list; (* the list of fusion word considered in UD conll *)
highest_index: int; (* the next free interger index *)
highest_index: int; (* the next free integer index *)
}
let empty = {meta=[]; map=Gid_map.empty; fusion=[]; highest_index=0; }
......@@ -258,7 +258,7 @@ module G_graph = struct
| None -> Error.build "[G_node.get_annot_info] No nodes with annot info"
(* -------------------------------------------------------------------------------- *)
let map_add_edge map id_src label id_tar =
let map_add_edge_opt map id_src label id_tar =
let node_src =
(* Not found can be raised when adding an edge from pos to neg *)
try Gid_map.find id_src map with Not_found -> G_node.empty in
......@@ -266,9 +266,16 @@ module G_graph = struct
| None -> None
| Some new_node -> Some (Gid_map.add id_src new_node map)
(* -------------------------------------------------------------------------------- *)
let map_add_edge map id_src label id_tar =
let node_src = Gid_map.find id_src map in
match G_node.add_edge label id_tar node_src with
| Some new_node -> Gid_map.add id_src new_node map
| None -> Log.fbug "[Graph.map_add_edge] duplicate"; exit 2
(* -------------------------------------------------------------------------------- *)
let add_edge graph id_src label id_tar =
match map_add_edge graph.map id_src label id_tar with
match map_add_edge_opt graph.map id_src label id_tar with
| Some new_map -> Some {graph with map = new_map }
| None -> None
......@@ -305,7 +312,7 @@ module G_graph = struct
let i1 = List.assoc ast_edge.Ast.src table in
let i2 = List.assoc ast_edge.Ast.tar table in
let edge = G_edge.build ?domain (ast_edge, loc) in
(match map_add_edge acc i1 edge i2 with
(match map_add_edge_opt acc i1 edge i2 with
| Some g -> g
| None -> Error.build "[GRS] [Graph.build] try to build a graph with twice the same edge %s %s"
(G_edge.to_string ?domain edge)
......@@ -343,7 +350,7 @@ module G_graph = struct
(fun acc2 (gov, dep_lab) ->
let gov_id = Id.gbuild ~loc gov gtable in
let edge = G_edge.make ?domain ~loc dep_lab in
(match map_add_edge acc2 gov_id edge dep_id with
(match map_add_edge_opt acc2 gov_id edge dep_id with
| Some g -> g
| None -> Error.build "[GRS] [Graph.of_conll] try to build a graph with twice the same edge %s %s"
(G_edge.to_string ?domain edge)
......@@ -384,14 +391,43 @@ module G_graph = struct
of_conll ?domain { Conll.file=None; meta=[]; lines=conll_lines; multiwords=[] }
(* -------------------------------------------------------------------------------- *)
let of_const ?domain const =
let gr = Constituent.to_gr const in
build ?domain gr
let of_pst ?domain pst =
let cpt = ref 0 in
let get_pos () = incr cpt; !cpt - 1 in
(* -------------------------------------------------------------------------------- *)
let opt_att atts name =
try Some (List.assoc name atts)
with Not_found -> None
let leaf_list = ref [] in
let rec loop nodes = function
| Ast.Leaf (loc, phon) ->
let fresh_id = get_pos () in
let node = G_node.pst_leaf ~loc ?domain phon fresh_id in
leaf_list := fresh_id :: ! leaf_list;
(fresh_id, Gid_map.add fresh_id node nodes)
| Ast.T (loc, cat, daughters) ->
let fresh_id = get_pos () in
let new_node = G_node.pst_node ~loc ?domain cat fresh_id in
let with_mother = Gid_map.add fresh_id new_node nodes in
let new_nodes = List.fold_left
(fun map daughter ->
let (daughter_id, new_map) = loop map daughter in
map_add_edge new_map fresh_id G_edge.sub daughter_id
) with_mother daughters in
(fresh_id, new_nodes) in
let (_,map) = loop Gid_map.empty pst in
let rec prec_loop map = function
| [] | [_] -> map
| n1 :: n2 :: tail ->
let new_map = prec_loop map (n2 :: tail) in
let with_prec = map_add_edge new_map n1 G_edge.prec n2 in
let with_both = map_add_edge with_prec n2 G_edge.succ n1 in
with_both in
let map_with_prec = prec_loop map !leaf_list in
{meta=[]; map=map_with_prec; fusion = []; highest_index = !cpt}
(* -------------------------------------------------------------------------------- *)
let del_edge ?domain ?edge_ident loc graph id_src label id_tar =
......@@ -754,7 +790,7 @@ module G_graph = struct
bprintf buff "digraph G {\n";
(* bprintf buff " rankdir=LR;\n"; *)
bprintf buff " node [shape=Mrecord];\n";
bprintf buff " node [shape=none];\n";
(* nodes *)
Gid_map.iter
......@@ -775,7 +811,17 @@ module G_graph = struct
Massoc_gid.iter
(fun tar g_edge ->
let deco = List.mem (id,g_edge,tar) deco.G_deco.edges in
bprintf buff " N_%s -> N_%s%s\n" (Gid.to_string id) (Gid.to_string tar) (G_edge.to_dot ?domain ~deco g_edge)
match !Global.debug with
| true when g_edge = G_edge.succ ->
bprintf buff " N_%s -> N_%s [label=\"SUCC\", style=dotted, fontcolor=lightblue, color=lightblue]; {rank=same; N_%s; N_%s };\n"
(Gid.to_string id) (Gid.to_string tar) (Gid.to_string id) (Gid.to_string tar)
| false when g_edge = G_edge.succ ->
bprintf buff " N_%s -> N_%s [style=invis]; {rank=same; N_%s; N_%s };\n"
(Gid.to_string id) (Gid.to_string tar) (Gid.to_string id) (Gid.to_string tar)
| _ when g_edge = G_edge.prec -> ()
| _ when g_edge = G_edge.sub ->
bprintf buff " N_%s -> N_%s [dir=none];\n" (Gid.to_string id) (Gid.to_string tar)
| _ -> bprintf buff " N_%s -> N_%s%s;\n" (Gid.to_string id) (Gid.to_string tar) (G_edge.to_dot ?domain ~deco g_edge)
) (G_node.get_next node)
) graph.map;
......
......@@ -112,7 +112,7 @@ module G_graph: sig
It supposes that "SUC" is defined in current relations *)
val of_brown: ?domain:Domain.t -> ?sentid: string -> string -> t
val of_const: ?domain:Domain.t -> Constituent.t -> t
val of_pst: ?domain:Domain.t -> Ast.pst -> t
(* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ *)
(* Update functions *)
......
......@@ -232,14 +232,13 @@ and standard target = parse
| "re\"" { Buffer.clear buff; string_lex true global lexbuf }
| eof { EOF }
| _ as c { raise (Error (sprintf "unexpected character '%c'" c)) }
| _ as c { raise (Error (sprintf "At line %d: unexpected character '%c'" (lexbuf.Lexing.lex_start_p.Lexing.pos_lnum) c)) }
and const = parse
| [' ' '\t'] { const lexbuf }
| '\n' { incr Global.current_line; const lexbuf}
| '(' { printf ">>>LPAREN<<<\n%!"; LPAREN }
| ')' { printf ">>>RPAREN<<<\n%!"; RPAREN }
| [^'(' ')' ' ']+ as id { printf "ID=>>>%s<<<\n%!" id; ID id }
| '(' { LPAREN }
| ')' { RPAREN }
| [^'(' ')' ' ']+ as id { ID id }
......@@ -113,15 +113,15 @@ module Loader = struct
with Sys_error msg -> Error.parse ~loc:(Loc.file file) "[Grew_loader.pattern] %s" msg
(* ------------------------------------------------------------------------------------------*)
let constituent file =
let phrase_structure_tree file =
try
Global.init file;
let in_ch = open_in file in
let lexbuf = Lexing.from_channel in_ch in
let graph = parse_handle file (Grew_parser.constituent Grew_lexer.const) lexbuf in
let graph = parse_handle file (Grew_parser.phrase_structure_tree Grew_lexer.const) lexbuf in
close_in in_ch;
graph
with Sys_error msg -> Error.parse ~loc:(Loc.file file) "[Grew_loader.constituent] %s" msg
with Sys_error msg -> Error.parse ~loc:(Loc.file file) "[Grew_loader.phrase_structure_tree] %s" msg
end (* module Loader *)
......@@ -137,12 +137,12 @@ module Parser = struct
with Sys_error msg -> Error.parse "[Grew_parser.gr] %s" msg
(* ------------------------------------------------------------------------------------------*)
let constituent s =
let phrase_structure_tree s =
try
Global.init "Not a file";
let lexbuf = Lexing.from_string s in
let graph = parse_handle "Not a file" (Grew_parser.constituent Grew_lexer.const) lexbuf in
let graph = parse_handle "Not a file" (Grew_parser.phrase_structure_tree Grew_lexer.const) lexbuf in
graph
with Sys_error msg -> Error.parse "[Grew_parser.constituent] %s" msg
with Sys_error msg -> Error.parse "[Grew_parser.phrase_structure_tree] %s" msg
end
......@@ -21,11 +21,11 @@ module Loader: sig
val pattern: string -> Ast.pattern
val constituent: string -> Constituent.t
val phrase_structure_tree: string -> Ast.pst
end
module Parser : sig
val gr: string -> Ast.gr
val constituent: string -> Constituent.t
val phrase_structure_tree: string -> Ast.pst
end
\ No newline at end of file
......@@ -74,6 +74,11 @@ module G_node = struct
then { empty with conll_root=true; succ}
else { empty with fs = G_fs.of_conll ?loc ?domain line; position = float line.Conll.id; prec; succ }
let pst_leaf ?loc ?domain phon position =
{ empty with fs = G_fs.pst_leaf ?loc ?domain phon; position = float position }
let pst_node ?loc ?domain cat position =
{ empty with fs = G_fs.pst_node ?loc ?domain cat; position = float position }
let fresh ?domain ?prec ?succ position = { empty with position; prec; succ }
let remove (id_tar : Gid.t) label t = {t with next = Massoc_gid.remove id_tar label t.next}
......
......@@ -55,6 +55,9 @@ module G_node: sig
val add_edge: G_edge.t -> Gid.t -> t -> t option
val build: ?domain:Domain.t -> ?prec:Gid.t -> ?succ:Gid.t -> int -> Ast.node -> (Id.name * t)
val of_conll: ?loc:Loc.t -> ?prec:Gid.t -> ?succ:Gid.t -> ?domain:Domain.t -> Conll.line -> t
val pst_leaf: ?loc:Loc.t -> ?domain:Domain.t -> string -> int -> t
val pst_node: ?loc:Loc.t -> ?domain:Domain.t -> string -> int -> t
val fresh: ?domain:Domain.t -> ?prec:Gid.t -> ?succ:Gid.t -> float -> t
val get_position: t -> float
......
......@@ -101,8 +101,6 @@ let localize t = (t,get_loc ())
%token <string> AROBAS_ID /* @id */
%token <string> COLOR /* @#89abCD */
%token SENT /* SENT */
%token <string> ID /* the general notion of id */
/* %token <Grew_ast.Ast.complex_id> COMPLEX_ID*/
......@@ -123,7 +121,7 @@ let localize t = (t,get_loc ())
/* parsing of the string representation of the constituent representation of Sequoia */
/* EX: "( (SENT (NP (NC Amélioration) (PP (P de) (NP (DET la) (NC sécurité))))))" */
%start <Grew_ast.Constituent.t> constituent
%start <Grew_ast.Ast.pst> phrase_structure_tree
%left SEMIC
%left PLUS
......@@ -734,11 +732,11 @@ pattern:
/*=============================================================================================*/
/* Constituent tree (à la Sequoia) */
/*=============================================================================================*/
constituent:
| LPAREN t=const_tree RPAREN { t }
phrase_structure_tree:
| LPAREN t=pst RPAREN { t }
const_tree:
| LPAREN pos=ID ff=ID RPAREN { Grew_ast.Constituent.Leaf (pos,ff) }
| LPAREN cat=ID list=nonempty_list (const_tree) RPAREN { Grew_ast.Constituent.T (cat, list) }
pst:
| LPAREN pos=ID ff=ID RPAREN { Grew_ast.Ast.T (get_loc(), pos, [Grew_ast.Ast.Leaf (get_loc(), ff)]) }
| LPAREN cat=ID daugthers=nonempty_list (pst) RPAREN { Grew_ast.Ast.T (get_loc(), cat, daugthers) }
%%
......@@ -162,9 +162,7 @@ module Label_domain = struct
| Dash -> ["style=dashed"]
| Solid when deco -> ["style=dotted"]
| Solid -> []) in
match style.text with
| "_" -> sprintf "[arrowhead=none, %s]" (String.concat ", " dot_items)
| _ -> sprintf "[label=\"%s\", %s]" style.text (String.concat ", " dot_items)
sprintf "[label=\"%s\", %s]" style.text (String.concat ", " dot_items)
end
......
......@@ -119,14 +119,14 @@ type t = Grew_graph.G_graph.t
Grew_graph.G_graph.of_brown ?domain brown
) ()
let load_const ?domain file =
let load_pst ?domain file =
if not (Sys.file_exists file)
then raise (File_not_found file)
else
handle ~name:"load_const" ~file
handle ~name:"load_pst" ~file
(fun () ->
let const_ast = Grew_loader.Loader.constituent file in
Grew_graph.G_graph.of_const ?domain const_ast
let const_ast = Grew_loader.Loader.phrase_structure_tree file in
Grew_graph.G_graph.of_pst ?domain const_ast
) ()
let load ?domain file =
......@@ -136,13 +136,13 @@ type t = Grew_graph.G_graph.t
| Some ".gr" -> load_gr ?domain file
| Some ".conll" -> load_conll ?domain file
| Some ".br" | Some ".melt" -> load_brown ?domain file
| Some ".cst" -> load_const ?domain file
| Some ".cst" -> load_pst ?domain file
| _ ->
Log.fwarning "Unknown file format for input graph '%s', try to guess..." file;
let rec loop = function
| [] -> Log.fcritical "[Libgrew.load_graph] Cannot guess input file format of file '%s'. Use .gr or .conll file extension" file
| load_fct :: tail -> try load_fct ?domain file with _ -> loop tail in
loop [load_gr; load_conll; load_brown; load_const]
loop [load_gr; load_conll; load_brown; load_pst]
) ()
let of_gr ?domain ?(grewpy=false) gr_string =
......@@ -151,11 +151,19 @@ type t = Grew_graph.G_graph.t
let of_conll ?domain conll =
handle ~name:"Graph.of_conll" (fun () -> Grew_graph.G_graph.of_conll ?domain conll) ()
let of_const ?domain const =
handle ~name:"of_const"
let of_pst ?domain pst_string =
handle ~name:"of_pst"
(fun () ->
let const_ast = Grew_loader.Parser.constituent const in
(Grew_graph.G_graph.of_const ?domain const_ast)
let pst_ast = Grew_loader.Parser.phrase_structure_tree pst_string in
(Grew_graph.G_graph.of_pst ?domain pst_ast)
) ()
let sentence_of_pst ?domain pst_string =
handle ~name:"of_pst"
(fun () ->
let pst_ast = Grew_loader.Parser.phrase_structure_tree pst_string in
let word_list = Grew_ast.Ast.word_list pst_ast in
Sentence.fr_clean_spaces (String.concat " " word_list)
) ()
let of_brown ?domain ?sentid brown =
......
......@@ -92,6 +92,9 @@ module Graph : sig
val of_brown: ?domain:Domain.t -> ?sentid:string -> string -> t
val of_pst: ?domain:Domain.t -> string -> t
val sentence_of_pst: ?domain:Domain.t -> string -> string
val to_sentence: ?main_feat:string -> t -> string
val to_dot : ?domain:Domain.t -> ?main_feat:string -> ?deco:Deco.t -> t -> string
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment