Commit 5ffd4d05 authored by Ryan Herbert

fuse.py: make ijson parser optional

Restore the old functionality and make the new functionality optional.
Due to the amount of time it takes to run the vidjilparser, we may need
to decide between speed and memory efficiency.

See #3234
parent 77d2a045
...@@ -333,25 +333,20 @@ class ListWindows(VidjilJson): ...@@ -333,25 +333,20 @@ class ListWindows(VidjilJson):
with open(output, "w") as f: with open(output, "w") as f:
json.dump(self, f, indent=2, default=self.toJson) json.dump(self, f, indent=2, default=self.toJson)
def load(self, string, *args, **kwargs): def load(self, file_path, *args, **kwargs):
'''
if not '.clntab' in file_path: if not '.clntab' in file_path:
self.load_vidjil(file_path, *args, **kwargs) self.load_vidjil(file_path, *args, **kwargs)
else: else:
self.load_clntab(file_path, *args, **kwargs) self.load_clntab(file_path, *args, **kwargs)
'''
self.load_vidjil(string, *args, **kwargs) def loads(self, string, *args, **kwargs):
self.loads_vidjil(string, *args, **kwargs)
def load_vidjil(self, string, pipeline, verbose=True):
'''init listWindows with data file def init_data(self, data):
Detects and selects the parser according to the file extension.''' self.d=data.d
tmp = json.loads(string, object_hook=self.toPython)
self.d=tmp.d
# Be robust against 'null' values for clones # Be robust against 'null' values for clones
if not self.d["clones"]: if not self.d["clones"]:
self.d["clones"] = [] self.d["clones"] = []
#self.check_version(file_path)
if "diversity" in self.d.keys(): if "diversity" in self.d.keys():
self.d["diversity"] = Diversity(self.d["diversity"]) self.d["diversity"] = Diversity(self.d["diversity"])
...@@ -360,7 +355,30 @@ class ListWindows(VidjilJson): ...@@ -360,7 +355,30 @@ class ListWindows(VidjilJson):
if 'distribution' not in self.d['reads'].d: if 'distribution' not in self.d['reads'].d:
self.d['reads'].d['distribution'] = {} self.d['reads'].d['distribution'] = {}
'''
self.id_lengths = defaultdict(int)
print("%%")
for clone in self:
self.id_lengths[len(clone.d['id'])] += 1
print ("%% lengths .vidjil -> ", self.id_lengths)
try:
print("%% run_v ->", self.d["samples"].d["producer"], self.d["samples"].d["run_timestamp"])
except KeyError:
pass
def load_vidjil(self, file_path, pipeline, verbose=True):
'''init listWindows with data file
Detects and selects the parser according to the file extension.'''
extension = file_path.split('.')[-1]
if verbose:
print("<==", file_path, "\t", end=' ')
with open(file_path, "r") as f:
self.init_data(json.load(f, object_hook=self.toPython))
self.check_version(file_path)
if pipeline: if pipeline:
# renaming, private pipeline # renaming, private pipeline
f = '/'.join(file_path.split('/')[2:-1]) f = '/'.join(file_path.split('/')[2:-1])
...@@ -371,21 +389,13 @@ class ListWindows(VidjilJson): ...@@ -371,21 +389,13 @@ class ListWindows(VidjilJson):
f = file_path f = file_path
if verbose: if verbose:
print() print()
time = os.path.getmtime(file_path) time = os.path.getmtime(file_path)
self.d["samples"].d["timestamp"] = [datetime.datetime.fromtimestamp(time).strftime("%Y-%m-%d %H:%M:%S")] self.d["samples"].d["timestamp"] = [datetime.datetime.fromtimestamp(time).strftime("%Y-%m-%d %H:%M:%S")]
'''
self.id_lengths = defaultdict(int)
print("%%") def loads_vidjil(self, string, pipeline, verbose=True):
for clone in self: '''init listWindows with a json string'''
self.id_lengths[len(clone.d['id'])] += 1 self.init_data(json.loads(string, object_hook=self.toPython))
print ("%% lengths .vidjil -> ", self.id_lengths)
try:
print("%% run_v ->", self.d["samples"].d["producer"], self.d["samples"].d["run_timestamp"])
except KeyError:
pass
def getTop(self, top): def getTop(self, top):
result = [] result = []
...@@ -687,6 +697,7 @@ def main(): ...@@ -687,6 +697,7 @@ def main():
group_options.add_argument('--compress', '-c', action='store_true', help='compress point names, removing common substrings') group_options.add_argument('--compress', '-c', action='store_true', help='compress point names, removing common substrings')
group_options.add_argument('--pipeline', '-p', action='store_true', help='compress point names (internal Bonsai pipeline)') group_options.add_argument('--pipeline', '-p', action='store_true', help='compress point names (internal Bonsai pipeline)')
group_options.add_argument('--ijson', action='store_true', help='use the ijson vidjilparser')
group_options.add_argument('--output', '-o', type=str, default='fused.vidjil', help='output file (%(default)s)') group_options.add_argument('--output', '-o', type=str, default='fused.vidjil', help='output file (%(default)s)')
group_options.add_argument('--top', '-t', type=int, default=50, help='keep only clones in the top TOP of some point (%(default)s)') group_options.add_argument('--top', '-t', type=int, default=50, help='keep only clones in the top TOP of some point (%(default)s)')
...@@ -720,20 +731,31 @@ def main(): ...@@ -720,20 +731,31 @@ def main():
vparser = VidjilParser() vparser = VidjilParser()
vparser.addPrefix('clones.item', 'clones.item.top', le, args.top) vparser.addPrefix('clones.item', 'clones.item.top', le, args.top)
for path_name in files: for path_name in files:
json_clones = vparser.extract(path_name) if args.ijson:
clones = json.loads(json_clones) json_clones = vparser.extract(path_name)
f += [c['id'] for c in clones["clones"]] clones = json.loads(json_clones)
f += [c['id'] for c in clones["clones"]]
else:
jlist = ListWindows()
jlist.load(path_name, args.pipeline)
f += jlist.getTop(args.top)
f = sorted(set(f)) f = sorted(set(f))
vparser.reset() if args.ijson:
vparser.addPrefix('') vparser.reset()
vparser.addPrefix('clones.item', 'clones.item.id', (lambda x, y: x in y), f) vparser.addPrefix('')
vparser.addPrefix('clones.item', 'clones.item.id', (lambda x, y: x in y), f)
if args.multi: if args.multi:
for path_name in files: for path_name in files:
json_reads = vparser.extract(path_name)
jlist = ListWindows() jlist = ListWindows()
jlist.load(json_reads, args.pipeline) if args.ijson:
#jlist.build_stat() json_reads = vparser.extract(path_name)
jlist.loads(json_reads, args.pipeline)
else:
jlist.load(path_name, args.pipeline)
jlist.build_stat()
print("\t", jlist, end=' ') print("\t", jlist, end=' ')
...@@ -748,11 +770,14 @@ def main(): ...@@ -748,11 +770,14 @@ def main():
else: else:
print("### Read and merge input files") print("### Read and merge input files")
for path_name in files: for path_name in files:
json_reads = vparser.extract(path_name)
jlist = ListWindows() jlist = ListWindows()
jlist.load(json_reads, args.pipeline) if args.ijson:
#jlist.build_stat() json_reads = vparser.extract(path_name)
#jlist.filter(f) jlist.loads(json_reads, args.pipeline)
else:
jlist.load(path_name, args.pipeline)
jlist.build_stat()
jlist.filter(f)
w1 = Window(1) w1 = Window(1)
w2 = Window(2) w2 = Window(2)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment