Commit fdacca88 authored by Ryan Herbert

Merge branch 'feature-s/3041-stats' into 'dev'

sample_sets.stats()

See merge request !155
parents 9b658849 ca01199f
Pipeline #18461 failed with stages in 1148 minutes and 22 seconds
@@ -1099,6 +1099,22 @@ Database.prototype = {
return true;
},
updateStatsButton: function() {
var sample_set_ids = [];
$('[name^="sample_set_ids"]:checked').each(function() {
sample_set_ids.push("sample_set_ids=" + $(this).val());
});
var config_id = $('#choose_config').find(':selected').val();
var addr = DB_ADDRESS + '/sample_set/result_files?config_id=' + config_id + '&' + sample_set_ids.join('&');
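        // e.g. '<DB_ADDRESS>/sample_set/result_files?config_id=2&sample_set_ids=3&sample_set_ids=7'
        // (illustrative ids; same URL shape as the export link in the sample_set view)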
$('#stats_button').attr('href', addr);
},
updateStatsSelection: function(cb) {
var $cb = $(cb);
$('[name^="sample_set_ids"]').prop('checked', $cb.is(':checked'));
this.updateStatsButton();
},
// Log functions, to server
// 'quiet' is set to true to avoid infinite loops with timeouts
log : function (lvl, msg) {
......
@@ -10,116 +10,6 @@ if request.env.http_origin:
response.headers['Access-Control-Max-Age'] = 86400
ACCESS_DENIED = "access denied"
STATS_READLINES = 1000 # approx. number of lines in which the stats are searched
STATS_MAXBYTES = 500000 # approx. number of bytes in which the stats are searched
def stats():
start = time.time()
d = custom()
stats_regex = [
# found 771265 40-windows in 2620561 segments (85.4%) inside 3068713 sequences # before 1f501e13 (-> 2015.05)
'in (?P<seg>\d+) segments \((?P<seg_ratio>.*?)\) inside (?P<reads>\d+) sequences',
# found 10750 50-windows in 13139 reads (99.9% of 13153 reads)
'windows in (?P<seg>\d+) reads \((?P<seg_ratio>.*?) of (?P<reads>\d+) reads\)',
# segmentation causes
'log.* SEG_[+].*?-> (?P<SEG_plus>.*?)\n',
'log.* SEG_[-].*?-> (?P<SEG_minus>.*?)\n',
]
# stats by locus
for locus in defs.LOCUS:
locus_regex = locus.replace('+', '[+]')
locus_group = locus.replace('+', 'p')
stats_regex += [ 'log.* %(locus)s.*?->\s*?(?P<%(locus_g)s_reads>\d+)\s+(?P<%(locus_g)s_av_len>[0-9.]+)\s+(?P<%(locus_g)s_clones>\d+)\s+(?P<%(locus_g)s_av_reads>[0-9.]+)\s*\n'
% { 'locus': locus_regex, 'locus_g': locus_group } ]
json_paths = {
'result_file': {
'main_clone': '/clones[0]/name',
'main_clone_reads': '/clones[0]/reads[0]'
},
'fused_file': {
'reads distribution [>= 10%]': 'reads/distribution/0.1',
'reads distribution [>= 1% < 10%]': 'reads/distribution/0.01',
'reads distribution [>= .01% < 1%]': 'reads/distribution/0.001',
'reads distribution [>= .001% < .01%]': 'reads/distribution/0.0001',
'reads distribution [>= .0001% < .001%]': 'reads/distribution/0.00001',
'producer': 'samples/producer'
}
}
keys_patient = [ 'info' ]
keys_file = [ 'sampling_date', 'size_file' ]
keys = []
keys += keys_file
keys += keys_patient
regex = []
for sr in stats_regex:
r = re.compile(sr)
regex += [r]
keys += r.groupindex.keys()
keys += sorted(json_paths['result_file'].keys() + json_paths['fused_file'].keys())
for row in d['query']:
found = {}
results_f = row.results_file.data_file
row_result = vidjil_utils.search_first_regex_in_file(regex, defs.DIR_RESULTS + results_f, STATS_READLINES)
try:
row_result_json = vidjil_utils.extract_fields_from_json(json_paths['result_file'], None, defs.DIR_RESULTS + results_f, STATS_MAXBYTES)
except:
row_result_json = []
fused_file = db((db.fused_file.sample_set_id == row.sample_set.id) & (db.fused_file.config_id == row.results_file.config_id)).select(orderby = ~db.fused_file.id, limitby=(0,1))
if len(fused_file) > 0 and fused_file[0].sequence_file_list is not None:
sequence_file_list = fused_file[0].sequence_file_list.split('_')
try:
pos_in_list = sequence_file_list.index(str(row.sequence_file.id))
row_fused = vidjil_utils.extract_fields_from_json(json_paths['fused_file'], pos_in_list, defs.DIR_RESULTS + fused_file[0].fused_file, STATS_MAXBYTES)
except ValueError:
row_fused = []
else:
row_fused = {}
results_list = [row_result, row_result_json, row_fused]
for key in keys:
for map_result in results_list:
if key in map_result:
row[key] = map_result[key]
found[key] = True
if key not in found:
if key in keys_patient:
row[key] = row.patient[key]
found[key] = True
elif key in keys_file:
row[key] = row.sequence_file[key]
found[key] = True
else:
row[key] = ''
# Re-process some data
keys += ['IGH_av_clones']
for row in d['query']:
row['IGH_av_clones'] = ''
if 'IGH_av_reads' in row:
try:
row['IGH_av_clones'] = '%.4f' % (1.0 / float(row['IGH_av_reads']))
found['IGH_av_clones'] = True
except:
pass
# Keep only non-empty columns
d['stats'] = []
for key in keys:
if key in found:
d['stats'] += [key]
log.debug("patient/stats (%.3fs) %s" % (time.time()-start, request.vars["filter"]))
return d
## return form to create new patient
def add():
......
@@ -212,6 +212,162 @@ def all():
step = step,
page = page)
def stats():
start = time.time()
if not auth.user :
res = {"redirect" : URL('default', 'user', args='login', scheme=True, host=True,
vars=dict(_next=URL('sample_set', 'all', vars={'type': defs.SET_TYPE_PATIENT}, scheme=True, host=True)))
}
return gluon.contrib.simplejson.dumps(res, separators=(',',':'))
isAdmin = auth.is_admin()
if request.vars['type']:
type = request.vars['type']
else :
type = defs.SET_TYPE_GENERIC
## filter
if "filter" not in request.vars :
request.vars["filter"] = ""
search, tags = parse_search(request.vars["filter"])
group_ids = get_involved_groups()
list = SampleSetList(type, tags=tags)
list.load_sample_information()
list.load_anon_permissions()
result = list.get_values()
factory = ModelFactory()
helper = factory.get_instance(type=type)
fields = helper.get_reduced_fields()
## sort result
reverse = False
if request.vars["reverse"] == "true" :
reverse = True
if "sort" in request.vars:
result = sorted(result, key = lambda row : row[request.vars["sort"]], reverse=reverse)
else:
result = sorted(result, key = lambda row : row.id, reverse=not reverse)
result = helper.filter(search, result)
log.debug("%s stat list (%.3fs) %s" % (request.vars["type"], time.time()-start, search))
return dict(query = result,
fields = fields,
helper = helper,
group_ids = group_ids,
isAdmin = isAdmin,
reverse = False)
def result_files():
from zipfile import ZipFile
from cStringIO import StringIO
import types
errors = []
config_id = request.vars['config_id']
sample_set_ids = []
if 'sample_set_ids' in request.vars:
sample_set_ids = request.vars['sample_set_ids']
# little hack since we can't pass array parameters with only one value
if isinstance(sample_set_ids, types.StringTypes):
sample_set_ids = [sample_set_ids]
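        # (web2py yields a plain string, e.g. '12', when the parameter appears
        # once, and a list, e.g. ['12', '13'], when it is repeated)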
if int(config_id) == -1:
config_query = (db.results_file.config_id > 0)
else:
config_query = (db.results_file.config_id == config_id)
left_join = [
db.patient.on(db.patient.sample_set_id == db.sample_set.id),
db.run.on(db.run.sample_set_id == db.sample_set.id),
db.generic.on(db.generic.sample_set_id == db.sample_set.id)
]
q = db(
(db.sample_set.id.belongs(sample_set_ids)) &
(db.sample_set_membership.sample_set_id == db.sample_set.id) &
(db.sequence_file.id == db.sample_set_membership.sequence_file_id) &
(db.results_file.sequence_file_id == db.sequence_file.id) &
(db.results_file.data_file != None) &
config_query
)
results = q.select(db.results_file.ALL, db.sequence_file.ALL, db.sample_set.ALL, db.patient.ALL, db.run.ALL, db.generic.ALL, left=left_join)
sample_types = ['patient', 'run', 'generic']
mf = ModelFactory()
helpers = {}
for t in sample_types:
helpers[t] = mf.get_instance(type=t)
tempfile = StringIO()
zipfile = ZipFile(tempfile, 'w')
metadata = []
for res in results:
log.debug("res: " + str(res))
metadata.append({'id': res.sample_set.id,
'name': helpers[res.sample_set.sample_type].get_name(res[res.sample_set.sample_type]),
'file': res.results_file.data_file,
'set_info': res[res.sample_set.sample_type].info,
'sample_info': res.sequence_file.info})
path = defs.DIR_RESULTS + res.results_file.data_file
zipfile.writestr(res.results_file.data_file, open(path, 'rb').read())
zipfile.writestr('metadata.json', json.dumps(metadata))
zipfile.close()
filename = "export_%s_%s.zip" % ('-'.join(sample_set_ids), str(datetime.date.today()))
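    # e.g. "export_3-7_2018-02-14.zip" for sample sets 3 and 7 (illustrative date)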
response.headers['Content-Type'] = "application/zip"
response.headers['Content-Disposition'] = 'attachment; filename=%s' % filename # to force download as attachment
rtn = tempfile.getvalue()
return rtn
## Stats
def mystats():
start = time.time()
# d = all()
d = custom()
# Build .vidjil file list
f_samples = []
for row in d['query']:
found = {}
f_results = defs.DIR_RESULTS + row.results_file.data_file
f_fused = None
pos_in_fused = None
fused_file = ''
# TODO: fix the following request
# fused_file = db((db.fused_file.sample_set_id == row.sample_set.id) & (db.fused_file.config_id == row.results_file.config_id)).select(orderby = ~db.fused_file.id, limitby=(0,1))
if len(fused_file) > 0 and fused_file[0].sequence_file_list is not None:
sequence_file_list = fused_file[0].sequence_file_list.split('_')
try:
pos_in_fused = sequence_file_list.index(str(row.sequence_file.id))
f_fused = defs.DIR_RESULTS + fused_file[0].fused_file
except ValueError:
pass
metadata = { } # 'patient': row.patient, 'sequence_file': row.sequence_file }
f_samples += [(metadata, f_results, f_fused, pos_in_fused)]
# Send to vidjil_utils.stats
res = vidjil_utils.stats(f_samples)
d = {}
d['stats'] = res
d['f_samples'] = f_samples # TMP, for debug
# Return
log.debug("stats (%.3fs) %s" % (time.time()-start, request.vars["filter"]))
return gluon.contrib.simplejson.dumps(d, separators=(',',':'))
## return form to create new generic sample_set
def add():
......
@@ -27,12 +27,19 @@ class SampleSet(object):
text = self.tag_decorator.decorate(data.info, 'tag', self.type, self.get_list_path())
return self.tag_decorator.sanitize(text)
def get_stats_tagged_info(self, data):
text = self.tag_decorator.decorate(data.info, 'tag', self.type, self.get_stats_path())
return self.tag_decorator.sanitize(text)
def get_configs(self, data):
return data.conf_list
def get_list_path(self):
return '/sample_set/all'
def get_stats_path(self):
return '/sample_set/stats'
def get_config_urls(self, data):
configs = []
for conf in data.conf_list:
@@ -69,6 +76,13 @@ class SampleSet(object):
fields.append({'name': 'files', 'sort': 'file_count', 'call': self.get_files, 'width': 100, 'public': True})
return fields
def get_reduced_fields(self):
fields = []
fields.append({'name': 'name', 'sort': 'name', 'call': self.get_name, 'width': 200, 'public': True})
fields.append({'name': 'info', 'sort': 'info', 'call': self.get_stats_tagged_info, 'width': None, 'public': True})
fields.append({'name': 'files', 'sort': 'file_count', 'call': self.get_files, 'width': 100, 'public': True})
return fields
def get_sequence_count(self, data):
if not hasattr(data, 'sequence_count'):
data.sequence_count = db( (db.sequence_file.id == db.sample_set_membership.sequence_file_id)
@@ -84,6 +98,13 @@ class SampleSet(object):
&(db.results_file.sequence_file_id == db.sequence_file.id)).count()
return data.data_count
def create_filter_string(self, data, keys):
for row in data:
row['string'] = []
for key in keys:
if key in row:
row['string'].append(str(row[key]))
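# e.g. create_filter_string(data, ['name', 'confs', 'groups', 'info']), as in
# the filter() overrides of the Generic, Patient and Run subclasses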
@abstractmethod
def filter(self, filter_str, data):
pass
......
@@ -7,8 +7,8 @@ class Generic(SampleSet):
return 'set'
def filter(self, filter_str, data):
for row in data:
row['string'] = [row['name'], row['confs'], row['groups'], str(row['info'])]
keys = ['name', 'confs', 'groups', 'info']
self.create_filter_string(data, keys)
return filter(lambda row : vidjil_utils.advanced_filter(row['string'], filter_str), data)
def get_info_dict(self, data):
......
@@ -10,6 +10,11 @@ class Patient(SampleSet):
fields.insert(1, {'name': 'birth', 'sort': 'birth', 'call': self.get_birth, 'width': 100, 'public': True})
return fields
def get_reduced_fields(self):
fields = super(Patient, self).get_reduced_fields()
fields.insert(1, {'name': 'birth', 'sort': 'birth', 'call': self.get_birth, 'width': 100, 'public': True})
return fields
def get_name(self, data, anon=None):
return vidjil_utils.anon_names(data.id, data.first_name, data.last_name, anon)
@@ -17,8 +22,8 @@ class Patient(SampleSet):
return "%s" % str(data.birth) if data.birth is not None else ''
def filter(self, filter_str, data):
for row in data:
row['string'] = [row['last_name'], row['first_name'], row['confs'], row['groups'], str(row['birth']), str(row['info'])]
keys = ['last_name', 'first_name', 'confs', 'groups', 'birth', 'info']
self.create_filter_string(data, keys)
return filter(lambda row : vidjil_utils.advanced_filter(row['string'], filter_str), data)
def get_info_dict(self, data):
......
@@ -7,6 +7,11 @@ class Run(SampleSet):
fields.insert(1, {'name': 'run_date', 'sort': 'run_date', 'call': self.get_run_date, 'width': 100, 'public': True})
return fields
def get_reduced_fields(self):
fields = super(Run, self).get_reduced_fields()
fields.insert(1, {'name': 'run_date', 'sort': 'run_date', 'call': self.get_run_date, 'width': 100, 'public': True})
return fields
def get_name(self, data):
return data.name
@@ -17,8 +22,8 @@ class Run(SampleSet):
return "%s" % str(data.run_date) if data.run_date is not None else ''
def filter(self, filter_str, data):
for row in data:
row['string'] = [row['name'], row['confs'], row['groups'], str(row['run_date']), str(row['info'])]
keys = ['name', 'confs', 'groups', 'run_date', 'info']
self.create_filter_string(data, keys)
return filter(lambda row : vidjil_utils.advanced_filter(row['string'], filter_str), data)
def get_info_dict(self, data):
......
@@ -321,6 +321,122 @@ def extract_fields_from_json(json_fields, pos_in_list, filename, max_bytes = None):
return matched_keys
####
STATS_READLINES = 1000 # approx. number of lines in which the stats are searched
STATS_MAXBYTES = 500000 # approx. number of bytes in which the stats are searched
def stats(samples):
stats_regex = [
# found 771265 40-windows in 2620561 segments (85.4%) inside 3068713 sequences # before 1f501e13 (-> 2015.05)
'in (?P<seg>\d+) segments \((?P<seg_ratio>.*?)\) inside (?P<reads>\d+) sequences',
# found 10750 50-windows in 13139 reads (99.9% of 13153 reads)
'windows in (?P<seg>\d+) reads \((?P<seg_ratio>.*?) of (?P<reads>\d+) reads\)',
# segmentation causes
'log.* SEG_[+].*?-> (?P<SEG_plus>.*?)\n',
'log.* SEG_[-].*?-> (?P<SEG_minus>.*?)\n',
]
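    # e.g. the second pattern, applied to the sample line in its comment, captures
    # {'seg': '13139', 'seg_ratio': '99.9%', 'reads': '13153'}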
# stats by locus
for locus in defs.LOCUS:
locus_regex = locus.replace('+', '[+]')
locus_group = locus.replace('+', 'p')
stats_regex += [ 'log.* %(locus)s.*?->\s*?(?P<%(locus_g)s_reads>\d+)\s+(?P<%(locus_g)s_av_len>[0-9.]+)\s+(?P<%(locus_g)s_clones>\d+)\s+(?P<%(locus_g)s_av_reads>[0-9.]+)\s*\n'
% { 'locus': locus_regex, 'locus_g': locus_group } ]
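        # for a locus such as 'IGH' this defines the named groups IGH_reads,
        # IGH_av_len, IGH_clones and IGH_av_reads; '+' is mapped to 'p' since
        # regex group names cannot contain '+'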
json_paths = {
'result_file': {
'main_clone': '/clones[0]/name',
'main_clone_reads': '/clones[0]/reads[0]'
},
'fused_file': {
'reads distribution [>= 10%]': 'reads/distribution/0.1',
'reads distribution [>= 1% < 10%]': 'reads/distribution/0.01',
'reads distribution [>= .01% < 1%]': 'reads/distribution/0.001',
'reads distribution [>= .001% < .01%]': 'reads/distribution/0.0001',
'reads distribution [>= .0001% < .001%]': 'reads/distribution/0.00001',
'producer': 'samples/producer'
}
}
keys_patient = [ 'info' ]
keys_file = [ 'sampling_date', 'size_file' ]
keys = []
keys += keys_file
keys += keys_patient
regex = []
for sr in stats_regex:
r = re.compile(sr)
regex += [r]
keys += r.groupindex.keys()
keys += sorted(json_paths['result_file'].keys() + json_paths['fused_file'].keys())
tab = []
found = {}
for (metadata, f_result, f_fused, pos_in_fused) in samples:
row = {}
row_result = search_first_regex_in_file(regex, f_result, STATS_READLINES)
row['result'] = row_result # TMP, for debug
try:
row_result_json = extract_fields_from_json(json_paths['result_file'], None, f_result, STATS_MAXBYTES)
except:
row_result_json = []
if f_fused:
try:
row_fused = extract_fields_from_json(json_paths['fused_file'], pos_in_fused, f_fused, STATS_MAXBYTES)
except ValueError:
row_fused = []
else:
row_fused = {}
results_list = [row_result, row_result_json, row_fused]
for key in keys:
for map_result in results_list:
if key in map_result:
row[key] = map_result[key]
found[key] = True
if key not in found:
if key in keys_patient:
row[key] = "TODO" + key # metadata['patient'][key]
found[key] = True
elif key in keys_file:
row[key] = "TODO" + key # metadata['sequence_file'][key]
found[key] = True
else:
row[key] = ''
tab += [row]
# Re-process some data
keys += ['IGH_av_clones']
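    # IGH_av_clones: clones per read, i.e. the inverse of IGH_av_reads
    # (average reads per clone)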
for row in tab:
row['IGH_av_clones'] = ''
if 'IGH_av_reads' in row:
try:
row['IGH_av_clones'] = '%.4f' % (1.0 / float(row['IGH_av_reads']))
found['IGH_av_clones'] = True
except:
pass
# Keep only non-empty columns
res = []
for key in keys:
if key in found:
res += [key]
return tab # res # TODO
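# Typical call, mirroring mystats() in the sample_set controller:
#   stats([(metadata, f_result, f_fused, pos_in_fused), ...])
# where f_fused may be None when no fused file matches the sample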
####
SOURCES = "https://github.com/vidjil/vidjil/blob/master/server/web2py/applications/vidjil/%s#L%s"
......
{{import vidjil_utils}}
<div>
<h3>Stats</h3>
<!--
<div class="db_block">
<div class="db_block_left">
search
<input id="db_filter_input" type="text" value="{{=request.vars["filter"]}}"
onchange="db.call('patient/stats', {'config_id' : '{{=request.vars["config_id"]}}',
'filter' : this.value,
'custom_list' : db.getListInput('custom_result[]')} )" >
</div>
<div class="db_block_right">
{{if auth.can_process_file() :}}
config
<span>
<select id="choose_config" name="config" onchange="db.call('patient/stats', {'config_id' : this.value,
'filter' : '{{=request.vars["filter"]}}',
'custom_list' : db.getListInput('custom_result[]') })">
<option value="-1" {{if not config :}}selected{{pass}}> --- </option>
{{for row in db((auth.vidjil_accessible_query(PermissionEnum.read_config.value, db.config) | auth.vidjil_accessible_query(PermissionEnum.admin_config.value, db.config) ) ).select(orderby=~db.config.name) :}}
<option value="{{=row.id }}" {{if row.id==config_id:}}selected{{pass}} >
{{=row.name}}
</option>
{{pass}}
</select>
</span>
{{pass}}
</div>
</div>
</div>
-->
<div id="db_table_container">
<table class="db_table" id="table" border="1">
<thead>
<tr> <!-- <td class="column1"> </td> -->
<td class="column_200"> patient </td>
<td class="column_200"> file name </td>
<!-- Stats -->
{{ for key in stats: }}
<td>{{ =key }}</td>
{{ pass }}
<!-- ----- -->
<td class="column1"> size </td>
<td class="column_200"> config </td>
<td class="column_sep"></td>
<td class="column2">last processing</td>
</tr>
</thead>
<tbody>
{{for row in query :}}
<tr>
<!-- <td> <input type="checkbox" name="custom_result[]" value="{{=row.results_file.id}}" {{if row.checked :}} checked {{pass}}> </td> -->
<td> {{=vidjil_utils.anon_names(row.sequence_file.patient_id, row.patient.first_name, row.patient.last_name)}}</td>
<td {{if row.sequence_file.data_file == None :}} {{=XML("class='inactive' title='file is missing' ")}} {{pass}} id="sequence_file_{{=row.sequence_file.id}}">
{{=row.sequence_file.filename}}
</td>
{{ for key in stats: }}
<td>
{{ if key in row: }}
{{ =row[key] }}
{{ pass }}
</td>
{{ pass }}
<td {{if row.sequence_file.data_file == None :}} {{=XML("class='inactive' title='file is missing' ")}} {{pass}} >
{{=vidjil_utils.format_size(row.sequence_file.size_file)}} </td>
<td> {{=row.config.name}} </td>
<td class="column_sep"></td>
{{if row.results_file.run_date :}}
<td class="button" onclick="db.call('results_file/info', { 'results_file_id' : '{{=row.results_file.id}}' } )"> {{=row.results_file.run_date }}</td>
{{else:}}<td></td>{{pass}}
</tr>
{{pass}}
</tbody>
</table>
<table class="db_table" id="db_fixed_header"></table>
</div>
<!--
<div class="db_block">
<div class="db_block_left">
</div>
<div class="db_block_right">
<span class="button2" onclick="myUrl.loadCustomUrl(db)" > see results </span>
</div>
</div>
-->
</div>
@@ -66,6 +66,7 @@
{{else:}}
<!-- <span class="button2 inactive" onclick="db.call('sample_set/add')" title="you don't have permission to create new {{=helper.get_type_display()}}s"> add sample_set </span> -->
{{pass}}
<span class="button2 devel-mode" onclick="db.call('sample_set/stats', {'type': '{{=helper.get_type()}}', 'filter': '{{=request.vars['filter']}}'})">stats</span>
</div>
<div class="db_block_right">
......
@@ -211,6 +211,9 @@
{{pass}}
{{=row.config.name}} </a>
{{pass}}
{{ if fused_count > 0: }}
<a id="stats_button" target="_blank" class="button2 devel-mode" href="/vidjil/sample_set/result_files?sample_set_ids={{=request.vars['id']}}&config_id=-1">export all results</a>
{{ pass }}
</div>
</div>
......
{{extend 'db_layout.html'}}
{{import vidjil_utils}}
<h3>Stats</h3>
<div class="db_block">
<div class="db_block_left">
search
<input id="db_filter_input" type="text" value="{{=request.vars["filter"]}}"