From 5ffd4d054ea601a8371ac2f4c64c582597711a42 Mon Sep 17 00:00:00 2001 From: Ryan Herbert Date: Tue, 12 Jun 2018 16:24:02 +0100 Subject: [PATCH] fuse.py make ijson parser optional Restore old functionality and make new functionality optional. Due to the amount of time it takes to run the vidjilparser we may need to decide between speed and memory efficiency See #3234 --- tools/fuse.py | 101 +++++++++++++++++++++++++++++++------------------- 1 file changed, 63 insertions(+), 38 deletions(-) diff --git a/tools/fuse.py b/tools/fuse.py index 562581ef0..794f6b7bf 100644 --- a/tools/fuse.py +++ b/tools/fuse.py @@ -333,25 +333,20 @@ class ListWindows(VidjilJson): with open(output, "w") as f: json.dump(self, f, indent=2, default=self.toJson) - def load(self, string, *args, **kwargs): - ''' + def load(self, file_path, *args, **kwargs): if not '.clntab' in file_path: self.load_vidjil(file_path, *args, **kwargs) else: self.load_clntab(file_path, *args, **kwargs) - ''' - self.load_vidjil(string, *args, **kwargs) - - def load_vidjil(self, string, pipeline, verbose=True): - '''init listWindows with data file - Detects and selects the parser according to the file extension.''' - - tmp = json.loads(string, object_hook=self.toPython) - self.d=tmp.d + + def loads(self, string, *args, **kwargs): + self.loads_vidjil(string, *args, **kwargs) + + def init_data(self, data): + self.d=data.d # Be robust against 'null' values for clones if not self.d["clones"]: self.d["clones"] = [] - #self.check_version(file_path) if "diversity" in self.d.keys(): self.d["diversity"] = Diversity(self.d["diversity"]) @@ -360,7 +355,30 @@ class ListWindows(VidjilJson): if 'distribution' not in self.d['reads'].d: self.d['reads'].d['distribution'] = {} - ''' + + self.id_lengths = defaultdict(int) + + print("%%") + for clone in self: + self.id_lengths[len(clone.d['id'])] += 1 + print ("%% lengths .vidjil -> ", self.id_lengths) + try: + print("%% run_v ->", self.d["samples"].d["producer"], self.d["samples"].d["run_timestamp"]) + except KeyError: + pass + + def load_vidjil(self, file_path, pipeline, verbose=True): + '''init listWindows with data file + Detects and selects the parser according to the file extension.''' + extension = file_path.split('.')[-1] + + if verbose: + print("<==", file_path, "\t", end=' ') + + with open(file_path, "r") as f: + self.init_data(json.load(f, object_hook=self.toPython)) + self.check_version(file_path) + if pipeline: # renaming, private pipeline f = '/'.join(file_path.split('/')[2:-1]) @@ -371,21 +389,13 @@ class ListWindows(VidjilJson): f = file_path if verbose: print() - + time = os.path.getmtime(file_path) self.d["samples"].d["timestamp"] = [datetime.datetime.fromtimestamp(time).strftime("%Y-%m-%d %H:%M:%S")] - ''' - - self.id_lengths = defaultdict(int) - print("%%") - for clone in self: - self.id_lengths[len(clone.d['id'])] += 1 - print ("%% lengths .vidjil -> ", self.id_lengths) - try: - print("%% run_v ->", self.d["samples"].d["producer"], self.d["samples"].d["run_timestamp"]) - except KeyError: - pass + def loads_vidjil(self, string, pipeline, verbose=True): + '''init listWindows with a json string''' + self.init_data(json.loads(string, object_hook=self.toPython)) def getTop(self, top): result = [] @@ -687,6 +697,7 @@ def main(): group_options.add_argument('--compress', '-c', action='store_true', help='compress point names, removing common substrings') group_options.add_argument('--pipeline', '-p', action='store_true', help='compress point names (internal Bonsai pipeline)') + group_options.add_argument('--ijson', action='store_true', help='use the ijson vidjilparser') group_options.add_argument('--output', '-o', type=str, default='fused.vidjil', help='output file (%(default)s)') group_options.add_argument('--top', '-t', type=int, default=50, help='keep only clones in the top TOP of some point (%(default)s)') @@ -720,20 +731,31 @@ def main(): vparser = VidjilParser() vparser.addPrefix('clones.item', 'clones.item.top', le, args.top) for path_name in files: - json_clones = vparser.extract(path_name) - clones = json.loads(json_clones) - f += [c['id'] for c in clones["clones"]] + if args.ijson: + json_clones = vparser.extract(path_name) + clones = json.loads(json_clones) + f += [c['id'] for c in clones["clones"]] + else: + jlist = ListWindows() + jlist.load(path_name, args.pipeline) + f += jlist.getTop(args.top) + f = sorted(set(f)) - vparser.reset() - vparser.addPrefix('') - vparser.addPrefix('clones.item', 'clones.item.id', (lambda x, y: x in y), f) + if args.ijson: + vparser.reset() + vparser.addPrefix('') + vparser.addPrefix('clones.item', 'clones.item.id', (lambda x, y: x in y), f) + if args.multi: for path_name in files: - json_reads = vparser.extract(path_name) jlist = ListWindows() - jlist.load(json_reads, args.pipeline) - #jlist.build_stat() + if args.ijson: + json_reads = vparser.extract(path_name) + jlist.loads(json_reads, args.pipeline) + else: + jlist.load(path_name, args.pipeline) + jlist.build_stat() print("\t", jlist, end=' ') @@ -748,11 +770,14 @@ def main(): else: print("### Read and merge input files") for path_name in files: - json_reads = vparser.extract(path_name) jlist = ListWindows() - jlist.load(json_reads, args.pipeline) - #jlist.build_stat() - #jlist.filter(f) + if args.ijson: + json_reads = vparser.extract(path_name) + jlist.loads(json_reads, args.pipeline) + else: + jlist.load(path_name, args.pipeline) + jlist.build_stat() + jlist.filter(f) w1 = Window(1) w2 = Window(2) -- 2.22.0