Commit 996960ce authored by Ryan Herbert

fuse.py prototype (incomplete) refactor: use vidjilparser.py

The objective here is to make some optimisations that increase the
performance of fuse.py by using VidjilParser to extract only the data that
is relevant to the current context, before loading it into Python objects.

See #3234
parent cc0a3cee
...@@ -36,11 +36,13 @@ import os ...@@ -36,11 +36,13 @@ import os
import datetime import datetime
import subprocess import subprocess
import tempfile import tempfile
from operator import itemgetter from operator import itemgetter, le
from utils import * from utils import *
from defs import * from defs import *
from collections import defaultdict from collections import defaultdict
from vidjilparser import VidjilParser
FUSE_VERSION = "vidjil fuse" FUSE_VERSION = "vidjil fuse"
TOOL_SIMILARITY = "../algo/tools/similarity" TOOL_SIMILARITY = "../algo/tools/similarity"
...@@ -331,37 +333,34 @@ class ListWindows(VidjilJson): ...@@ -331,37 +333,34 @@ class ListWindows(VidjilJson):
with open(output, "w") as f: with open(output, "w") as f:
json.dump(self, f, indent=2, default=self.toJson) json.dump(self, f, indent=2, default=self.toJson)
def load(self, file_path, *args, **kwargs): def load(self, string, *args, **kwargs):
'''
if not '.clntab' in file_path: if not '.clntab' in file_path:
self.load_vidjil(file_path, *args, **kwargs) self.load_vidjil(file_path, *args, **kwargs)
else: else:
self.load_clntab(file_path, *args, **kwargs) self.load_clntab(file_path, *args, **kwargs)
'''
self.load_vidjil(string, *args, **kwargs)
def load_vidjil(self, file_path, pipeline, verbose=True): def load_vidjil(self, string, pipeline, verbose=True):
'''init listWindows with data file '''init listWindows with data file
Detects and selects the parser according to the file extension.''' Detects and selects the parser according to the file extension.'''
# name = file_path.split("/")[-1]
extension = file_path.split('.')[-1]
if verbose:
print("<==", file_path, "\t", end=' ')
with open(file_path, "r") as f: tmp = json.loads(string, object_hook=self.toPython)
tmp = json.load(f, object_hook=self.toPython) self.d=tmp.d
self.d=tmp.d # Be robust against 'null' values for clones
# Be robust against 'null' values for clones if not self.d["clones"]:
if not self.d["clones"]: self.d["clones"] = []
self.d["clones"] = [] #self.check_version(file_path)
self.check_version(file_path)
if "diversity" in self.d.keys(): if "diversity" in self.d.keys():
self.d["diversity"] = Diversity(self.d["diversity"]) self.d["diversity"] = Diversity(self.d["diversity"])
else: else:
self.d["diversity"] = Diversity() self.d["diversity"] = Diversity()
if 'distribution' not in self.d['reads'].d: if 'distribution' not in self.d['reads'].d:
self.d['reads'].d['distribution'] = {} self.d['reads'].d['distribution'] = {}
'''
if pipeline: if pipeline:
# renaming, private pipeline # renaming, private pipeline
f = '/'.join(file_path.split('/')[2:-1]) f = '/'.join(file_path.split('/')[2:-1])
...@@ -375,6 +374,7 @@ class ListWindows(VidjilJson): ...@@ -375,6 +374,7 @@ class ListWindows(VidjilJson):
time = os.path.getmtime(file_path) time = os.path.getmtime(file_path)
self.d["samples"].d["timestamp"] = [datetime.datetime.fromtimestamp(time).strftime("%Y-%m-%d %H:%M:%S")] self.d["samples"].d["timestamp"] = [datetime.datetime.fromtimestamp(time).strftime("%Y-%m-%d %H:%M:%S")]
'''
self.id_lengths = defaultdict(int) self.id_lengths = defaultdict(int)
...@@ -716,17 +716,27 @@ def main(): ...@@ -716,17 +716,27 @@ def main():
#filtre #filtre
f = [] f = []
vparser = VidjilParser()
vparser.addPrefix('clones', 'clones.item.top', le, args.top)
for path_name in files: for path_name in files:
jlist = ListWindows() json_clones = vparser.extract(path_name)
jlist.load(path_name, args.pipeline) print(json_clones)
f += jlist.getTop(args.top) clones = json.loads(json_clones)
f += [c['id'] for c in clones["clones"]]
f = sorted(set(f)) f = sorted(set(f))
vparser.reset()
vparser.addPrefix('reads')
vparser.addPrefix('clones.item', 'clones.item.id', (lambda x, y: x in y), f)
vparser.addPrefix('samples')
vparser.addPrefix('vidjil_json_version')
if args.multi: if args.multi:
for path_name in files: for path_name in files:
json_reads = vparser.extract(path_name)
jlist = ListWindows() jlist = ListWindows()
jlist.load(path_name, args.pipeline) jlist.load(json_reads, args.pipeline)
jlist.build_stat() #jlist.build_stat()
print("\t", jlist, end=' ') print("\t", jlist, end=' ')
...@@ -741,10 +751,11 @@ def main(): ...@@ -741,10 +751,11 @@ def main():
else: else:
print("### Read and merge input files") print("### Read and merge input files")
for path_name in files: for path_name in files:
json_reads = vparser.extract(path_name)
jlist = ListWindows() jlist = ListWindows()
jlist.load(path_name, args.pipeline) jlist.load(json_reads, args.pipeline)
jlist.build_stat() #jlist.build_stat()
jlist.filter(f) #jlist.filter(f)
w1 = Window(1) w1 = Window(1)
w2 = Window(2) w2 = Window(2)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment