Commit 2ff77480 authored by Ryan Herbert's avatar Ryan Herbert

Merge branch 'vidjil_parser_prototype' into 'dev'

Vidjil parser prototype

See merge request !201
parents 68fd3a75 38ae3783
Pipeline #36289 canceled with stages
in 1 minute and 8 seconds
#!/usr/bin/env python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
### fuse.py - Vidjil utility to parse and regroup list of clones of different timepoints or origins
......@@ -36,7 +36,7 @@ import os
import datetime
import subprocess
import tempfile
from operator import itemgetter
from operator import itemgetter, le
from utils import *
from defs import *
from collections import defaultdict
......@@ -336,32 +336,47 @@ class ListWindows(VidjilJson):
self.load_vidjil(file_path, *args, **kwargs)
else:
self.load_clntab(file_path, *args, **kwargs)
def loads(self, string, *args, **kwargs):
    '''Init this ListWindows from a .vidjil JSON string (thin wrapper over loads_vidjil()).'''
    self.loads_vidjil(string, *args, **kwargs)
def init_data(self, data):
    '''Initialise this ListWindows from an already-parsed data object,
    normalising missing fields and printing a few debug statistics.'''
    self.d = data.d
    # Be robust against 'null' values for clones
    if not self.d["clones"]:
        self.d["clones"] = []
    # Wrap the diversity section, or install an empty placeholder.
    if "diversity" in self.d:
        self.d["diversity"] = Diversity(self.d["diversity"])
    else:
        self.d["diversity"] = Diversity()
    if 'distribution' not in self.d['reads'].d:
        self.d['reads'].d['distribution'] = {}
    # Tally clone id lengths (debug output only).
    self.id_lengths = defaultdict(int)
    print("%%")
    for clone in self:
        self.id_lengths[len(clone.d['id'])] += 1
    print("%% lengths .vidjil -> ", self.id_lengths)
    try:
        print("%% run_v ->", self.d["samples"].d["producer"], self.d["samples"].d["run_timestamp"])
    except KeyError:
        # producer/run_timestamp are optional sample metadata
        pass
def load_vidjil(self, file_path, pipeline, verbose=True):
'''init listWindows with data file
Detects and selects the parser according to the file extension.'''
# name = file_path.split("/")[-1]
extension = file_path.split('.')[-1]
if verbose:
print("<==", file_path, "\t", end=' ')
with open(file_path, "r") as f:
tmp = json.load(f, object_hook=self.toPython)
self.d=tmp.d
# Be robust against 'null' values for clones
if not self.d["clones"]:
self.d["clones"] = []
self.check_version(file_path)
if "diversity" in self.d.keys():
self.d["diversity"] = Diversity(self.d["diversity"])
else:
self.d["diversity"] = Diversity()
if 'distribution' not in self.d['reads'].d:
self.d['reads'].d['distribution'] = {}
self.init_data(json.load(f, object_hook=self.toPython))
self.check_version(file_path)
if pipeline:
# renaming, private pipeline
f = '/'.join(file_path.split('/')[2:-1])
......@@ -372,20 +387,13 @@ class ListWindows(VidjilJson):
f = file_path
if verbose:
print()
time = os.path.getmtime(file_path)
self.d["samples"].d["timestamp"] = [datetime.datetime.fromtimestamp(time).strftime("%Y-%m-%d %H:%M:%S")]
self.id_lengths = defaultdict(int)
print("%%")
for clone in self:
self.id_lengths[len(clone.d['id'])] += 1
print ("%% lengths .vidjil -> ", self.id_lengths)
try:
print("%% run_v ->", self.d["samples"].d["producer"], self.d["samples"].d["run_timestamp"])
except KeyError:
pass
def loads_vidjil(self, string, pipeline, verbose=True):
    '''init listWindows with a json string

    `pipeline` and `verbose` are accepted for signature parity with
    load_vidjil() but are not used here.'''
    self.init_data(json.loads(string, object_hook=self.toPython))
def getTop(self, top):
result = []
......@@ -687,6 +695,7 @@ def main():
group_options.add_argument('--compress', '-c', action='store_true', help='compress point names, removing common substrings')
group_options.add_argument('--pipeline', '-p', action='store_true', help='compress point names (internal Bonsai pipeline)')
group_options.add_argument('--ijson', action='store_true', help='use the ijson vidjilparser')
group_options.add_argument('--output', '-o', type=str, default='fused.vidjil', help='output file (%(default)s)')
group_options.add_argument('--top', '-t', type=int, default=50, help='keep only clones in the top TOP of some point (%(default)s)')
......@@ -716,17 +725,39 @@ def main():
#filtre
f = []
if args.ijson:
from vidjilparser import VidjilParser
vparser = VidjilParser()
vparser.addPrefix('clones.item', 'clones.item.top', le, args.top)
for path_name in files:
jlist = ListWindows()
jlist.load(path_name, args.pipeline)
f += jlist.getTop(args.top)
if args.ijson:
json_clones = vparser.extract(path_name)
clones = json.loads(json_clones)
if clones["clones"] is not None:
f += [c['id'] for c in clones["clones"]]
else:
jlist = ListWindows()
jlist.load(path_name, args.pipeline)
f += jlist.getTop(args.top)
f = sorted(set(f))
if args.ijson:
vparser.reset()
vparser.addPrefix('')
vparser.addPrefix('clones.item', 'clones.item.id', (lambda x, y: x in y), f)
if args.multi:
for path_name in files:
jlist = ListWindows()
jlist.load(path_name, args.pipeline)
jlist.build_stat()
if args.ijson:
json_reads = vparser.extract(path_name)
jlist.loads(json_reads, args.pipeline)
else:
jlist.load(path_name, args.pipeline)
jlist.build_stat()
print("\t", jlist, end=' ')
......@@ -742,9 +773,13 @@ def main():
print("### Read and merge input files")
for path_name in files:
jlist = ListWindows()
jlist.load(path_name, args.pipeline)
jlist.build_stat()
jlist.filter(f)
if args.ijson:
json_reads = vparser.extract(path_name)
jlist.loads(json_reads, args.pipeline)
else:
jlist.load(path_name, args.pipeline)
jlist.build_stat()
jlist.filter(f)
w1 = Window(1)
w2 = Window(2)
......@@ -780,7 +815,7 @@ def main():
for i in range(len(jlist_fused.d["clones"])) :
fasta += ">>" + str(i) + "\n"
fasta += jlist_fused.d["clones"][i].d["id"] + "\n"
fasta_file = tempfile.NamedTemporaryFile(delete=False)
fasta_file = tempfile.NamedTemporaryFile(mode="w", delete=False)
fasta_file.write(fasta)
try:
out = subprocess.check_output([TOOL_SIMILARITY, "-j", fasta_file.name])
......
......@@ -275,7 +275,7 @@ def populate_variables(var):
for v in var:
try:
key, var = v.split('=')
variables.append(('$' + key, var))
variables = [('$' + key, var)] + variables
except IOError:
raise ShouldException('Error in parsing variable definition: ' + v)
......@@ -681,7 +681,7 @@ class TestSuite():
# Directive -- Options
if l.startswith(DIRECTIVE_OPTIONS):
opts, unknown = options.parse_known_args(l[len(DIRECTIVE_OPTIONS):].split())
self.variables += populate_variables(opts.var)
self.variables = populate_variables(opts.var) + self.variables
if opts.mod:
self.modifiers += ''.join(opts.mod)
continue
......
......@@ -7,6 +7,9 @@ should-get-tests/fuse-doc.tap: should-get-tests/fuse-doc.should-get force
should: should-get-tests/fuse-doc.tap
python3 ../should.py should-get-tests/*.should-get
should-ijson:
python3 ../should.py --var FUSE_OPTIONS=--ijson should-get-tests/fuse-*.should-get
doctests:
@echo "*** Launching python tests..."
python -m doctest -v ../fuse.py
......
!LAUNCH: python ../../fuse.py ../../../algo/tests/data/results-two-clones-1-2.vidjil ../../../algo/tests/data/results-two-clones-1-2.vidjil ../../../algo/tests/data/results-two-clones-1-2.vidjil ; cat fused.vidjil
!LAUNCH: python ../../fuse.py $FUSE_OPTIONS ../../../algo/tests/data/results-two-clones-1-2.vidjil ../../../algo/tests/data/results-two-clones-1-2.vidjil ../../../algo/tests/data/results-two-clones-1-2.vidjil ; cat fused.vidjil
$ Fuse the three files
2: ListWindows: .1000, 1000, 1000.
......
!LAUNCH: python ../../fuse.py ../../../algo/tests/data/no_clones.vidjil ../../../algo/tests/data/results-two-clones-1-2.vidjil -o fused_no_clones.vidjil; cat fused_no_clones.vidjil
!LAUNCH: python ../../fuse.py $FUSE_OPTIONS ../../../algo/tests/data/no_clones.vidjil ../../../algo/tests/data/results-two-clones-1-2.vidjil -o fused_no_clones.vidjil; cat fused_no_clones.vidjil
$ Fuse the three files
2: ListWindows: .0, 1000.
......
......@@ -11,3 +11,5 @@ rZb
--var
VIDJIL_DIR=../../../
--var
FUSE_OPTIONS=
#!/usr/bin/python
import ijson.backends.yajl2_cffi as ijson
from six import string_types
from enum import Enum
# Maps each container-closing ijson event name to the start event it closes,
# e.g. an "end_map" matches a prior "start_map".
MatchingEvent = Enum("MatchingEvent", [("end_map", "start_map"),
                                       ("end_array", "start_array")])
class VidjilWriter(object):
    """Serialises ijson (prefix, event, value) triples back into JSON text.

    Output can optionally be pretty-printed, and can be diverted into an
    internal buffer that is later either flushed (kept) or purged (dropped),
    driven by the `buffering` / `conserveBuffer` flags.
    """

    def __init__(self, pretty=False):
        self.pretty = pretty
        self.buffer = []
        self.buffering = False
        self.conserveBuffer = False

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        pass

    def write(self, prefix, event, value, previous):
        """Format one event; while buffering, stash it and emit nothing."""
        token = self._write(prefix, event, value, previous)
        if not self.buffering:
            return token
        self.buffer.append(token)
        return ""

    def _write(self, prefix, event, value, previous):
        """Turn a single ijson event into its JSON text fragment."""
        tail = '\n' if self.pretty else ''
        if event == 'start_map':
            fmt = '{{'
        elif event == 'end_map':
            fmt = '}}'
        elif event == 'start_array':
            fmt = '['
        elif event == 'end_array':
            fmt = ']'
        elif event == 'map_key':
            fmt = '\"{}\":'
            tail = ''
        elif event == 'string':
            fmt = '\"{}\"'
        else:
            # Scalar events: booleans become JSON lower-case, None becomes null.
            if event == 'boolean':
                value = str(value).lower()
            fmt = 'null' if value is None else '{}'
        padding = ''
        if isinstance(value, string_types):
            value = value.replace("\n", "\\n").replace("\r", "\\r")
        # A comma separates siblings: anything not opening a container or
        # following a key, except container-closing events.
        needs_comma = (previous not in ('', 'map_key', 'start_map', 'start_array')
                       and event not in ('end_map', 'end_array'))
        if needs_comma:
            fmt = ',' + fmt
        if self.pretty and previous != 'map_key' and len(prefix) > 0:
            # One tab per level of the dotted ijson prefix.
            padding = '\t' * len(prefix.split('.'))
        return ('{}' + fmt + tail).format(padding, value)

    def purgeBuffer(self):
        """Discard any buffered output."""
        self.buffer = []
        self.conserveBuffer = False

    def writeBuffer(self):
        """Return the buffered output as one string, then clear the buffer."""
        try:
            return ''.join(self.buffer)
        finally:
            self.purgeBuffer()

    def startBuffering(self):
        self.conserveBuffer = False
        self.buffering = True

    def endBuffering(self):
        """Stop buffering; flush the buffer if marked kept, else drop it."""
        self.buffering = False
        if not self.conserveBuffer:
            self.purgeBuffer()
            return ""
        self.conserveBuffer = False
        return self.writeBuffer()
class VidjilFileWriter(VidjilWriter):
    """VidjilWriter that mirrors everything it writes into a file.

    The file is opened on __enter__ and closed on __exit__, so instances
    must be used as context managers.
    """

    def __init__(self, filepath=None, pretty=False):
        # BUG FIX: the original called super(VidjilWriter, self), which skips
        # VidjilWriter in the MRO (resolving to object), so pretty/buffer/
        # buffering/conserveBuffer were never initialised. Also forward the
        # previously-ignored `pretty` flag.
        super(VidjilFileWriter, self).__init__(pretty)
        self._filepath = filepath
        self.file = None

    def __enter__(self):
        # Text mode: VidjilWriter produces str fragments, not bytes
        # (the original 'wb' would raise TypeError on write under Python 3).
        self.file = open(self._filepath, 'w')
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.file.close()

    def write(self, prefix, event, value, previous):
        """Format one event, write it to the file, and return it."""
        # BUG FIX: super(VidjilWriter, self).write resolved to object and
        # raised AttributeError; delegate to VidjilWriter instead.
        res = super(VidjilFileWriter, self).write(prefix, event, value, previous)
        self.file.write(res)
        return res

    def writeBuffer(self):
        """Flush the parent's buffer and mirror it to the file."""
        res = super(VidjilFileWriter, self).writeBuffer()
        self.file.write(res)
        return res
class Predicate(object):
    """A (field, comparator, value) filter applied to ijson prefixes.

    With no comparator, compare() is a pure field-name match; otherwise the
    field must match AND `comparator(other, value)` must hold.
    """

    def __init__(self, field, comparator, value):
        self.comp = comparator
        self.field = field
        self.value = value

    def compare(self, field, other):
        """Return whether `field` matches and the comparator accepts `other`."""
        if self.comp is None:
            return field == self.field
        try:
            return field == self.field and self.comp(other, self.value)
        except Exception:
            # BUG FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit. A failing comparator (e.g.
            # incomparable types) is still treated as "no match".
            return False
class VidjilParser(object):
    """Streaming filter for .vidjil JSON files, built on ijson.

    Prefixes of interest are registered with addPrefix(), optionally with a
    predicate; extract() then re-serialises only the matching parts of the
    file, buffering candidate sub-objects until a predicate decides whether
    each one is kept.
    """

    def __init__(self, writer=None):
        # Writer used to re-serialise kept events (defaults to in-memory).
        if writer is not None:
            self._writer = writer
        else:
            self._writer = VidjilWriter()
        self._model_prefixes = []
        self.prefixes = []

    def initModel(self, model_path):
        """Record every (prefix, event) pair seen in a reference file, for validate()."""
        with open(model_path, 'rb') as model:
            parser = ijson.parse(model)
            for prefix, event, value in parser:
                if (prefix, event) not in self._model_prefixes:
                    self._model_prefixes.append((prefix, event))

    def validate(self, filepath):
        """Return True when `filepath` exhibits every (prefix, event) pair
        recorded by initModel()."""
        with open(filepath, 'rb') as vfile:
            parser = ijson.parse(vfile)
            model = list(self._model_prefixes)
            for prefix, event, value in parser:
                pair = (prefix, event)
                if pair in model:
                    model.remove(pair)
            return len(model) == 0

    def writer(self):
        return self._writer

    def addPrefix(self, prefix, conditional=None, comp=None, value=None):
        """Register an ijson prefix to extract.

        `conditional` (defaults to `prefix`) is the prefix on which the
        predicate `comp(value_in_file, value)` is evaluated.
        """
        if conditional is None:
            conditional = prefix
        self.prefixes.append((prefix, Predicate(conditional, comp, value)))

    def reset(self):
        """Forget all registered prefixes and any buffered output."""
        self.prefixes = []
        self._writer.purgeBuffer()

    def extract(self, filepath):
        """Parse `filepath` and return the filtered JSON as a string."""
        # BUG FIX: the file handle was opened but never closed; use a
        # context manager so it is released even if parsing fails.
        with open(filepath, 'rb') as vidjilfile:
            parser = ijson.parse(vidjilfile)
            with self.writer() as writer:
                return self._extract(parser, writer)

    def isStartEvent(self, event):
        return event in ['start_map', 'start_array']

    def isEndEvent(self, event):
        # Events that can terminate a buffered region: container ends and scalars.
        return event in ['end_map', 'end_array', 'number', 'string', 'boolean']

    def isMatching(self, mbuffer, other):
        # True when `other` is the end event that closes the start event `mbuffer`.
        if other[1] not in MatchingEvent.__members__:
            return False
        return (mbuffer[0] == other[0]) and (mbuffer[1] == MatchingEvent[other[1]].value)

    def _extract(self, parser, writer):
        previous = ''
        res = ""
        bufferStart = (None, None)
        for prefix, event, value in parser:
            subelem = lambda x, y: x.startswith(y)
            # Keep events lying under a registered prefix, or on the path
            # leading down to one.
            if any(subelem(prefix, item[0])
                   or (subelem(item[0], prefix) and (value is None or subelem(item[0], str(value))))
                   for item in self.prefixes):
                # Entering a registered container: start buffering it until
                # a predicate decides whether it is kept.
                bufferOn = any(prefix == item[0] for item in self.prefixes) and self.isStartEvent(event)
                if bufferOn:
                    bufferStart = (prefix, event)
                    saved_previous = previous
                    self._writer.startBuffering()
                if not self._writer.conserveBuffer \
                   and any((item[1].compare(prefix, value)) for item in self.prefixes):
                    self._writer.conserveBuffer = True
                res += writer.write(prefix, event, value, previous)
                previous = event
                # Flush the buffer as soon as a predicate marked it kept, or
                # drop/flush it when the buffered container closes.
                if (self.writer().buffering and (self.isEndEvent(event) and self.isMatching(bufferStart, (prefix, event)) or self._writer.conserveBuffer)):
                    if not self._writer.conserveBuffer:
                        # Dropped buffer: restore `previous` so comma
                        # placement ignores the discarded events.
                        previous = saved_previous
                    res += self._writer.endBuffering()
        return res
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment