task.py 34.9 KB
Newer Older
1
# coding: utf8
Mathieu Giraud's avatar
Mathieu Giraud committed
2
3
from __future__ import print_function

4
import json
5
import os
6
import defs
7
import re
8
import os.path
9
10
11
import time
import sys
import datetime
12
13
import random
import xmlrpclib
14
import tools_utils
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30

def assert_scheduler_task_does_not_exist(args):
    """Report whether a scheduler task with these arguments is still live.

    Selects the rows of ``db.scheduler_task`` whose ``args`` column equals
    *args* and whose status is none of FAILED, EXPIRED, TIMEOUT or COMPLETED.

    Returns a ``{"message": ...}`` dict when at least one such row exists,
    ``None`` otherwise.
    """
    finished_states = ("FAILED", "EXPIRED", "TIMEOUT", "COMPLETED")

    # Build: args match AND status differs from every finished state
    live_tasks = (db.scheduler_task.args == args)
    for state in finished_states:
        live_tasks = live_tasks & (db.scheduler_task.status != state)

    if len(db(live_tasks).select()) == 0:
        return None

    return {"message": "task already registered"}

31
32
33
34
35
36
37
38
39
40
41
def compute_contamination(sequence_file_id, results_file_id, config_id):
    """Count, per sample, the clones that may come from another sample of the run.

    The three arguments are parallel lists: for index ``i`` the sample is
    ``sequence_file_id[i]``, its result file is ``results_file_id[i]`` and
    the other samples are looked up with ``config_id[i]``.

    For each sample, every other sequence file belonging to the same "run"
    sample set is inspected, using its most recent non-hidden result for
    that config. A clone of the sample is counted when the other sample
    contains the same clone id with more than ten times as many reads.

    Returns a list with one dict per input sample::

        {"total_clones": int, "total_reads": int,
         "sample": {results_file_id: {"clones": int, "reads": int}}}

    An unparsable JSON file of another sample is reported on stdout
    ('invalid_json') and leaves that sample's counters at zero.
    """
    result = []
    for i, current_file_id in enumerate(sequence_file_id):
        entry = {"total_clones": 0, "total_reads": 0, "sample": {}}
        result.append(entry)

        # Load the sample itself: clone id -> reads
        sample_path = defs.DIR_RESULTS + db.results_file[results_file_id[i]].data_file
        with open(sample_path, "r") as json_data:
            d = json.load(json_data)

        # Be robust against 'null' values for clones
        own_reads = {}
        for clone in d["clones"] or []:
            own_reads[clone["id"]] = clone["reads"][0]

        # Iterate through the other samples of the same run
        sample_set_run = db( (db.sample_set_membership.sequence_file_id == current_file_id)
                           & (db.sample_set_membership.sample_set_id == db.sample_set.id)
                           & (db.sample_set.sample_type == "run") ).select().first()

        if sample_set_run is None:
            continue

        sample_set_id = sample_set_run.sample_set.id
        query = db(  ( db.sample_set.id == sample_set_id )
                   & ( db.sample_set.id == db.sample_set_membership.sample_set_id )
                   & ( db.sequence_file.id == db.sample_set_membership.sequence_file_id)
                   & ( db.sequence_file.id != current_file_id)
                   & ( db.results_file.sequence_file_id == db.sequence_file.id )
                   & ( db.results_file.hidden == False )
                   & ( db.results_file.config_id == config_id[i]  )
                   ).select(db.sequence_file.ALL, db.results_file.ALL, db.sample_set.id,
                            orderby=db.sequence_file.id|~db.results_file.run_date)

        # Rows are grouped by sequence file, newest result first:
        # keep only the first row of each group.
        latest_rows = []
        previous_file_id = 0
        for row in query:
            if row.sequence_file.id != previous_file_id:
                latest_rows.append(row)
                previous_file_id = row.sequence_file.id

        for row in latest_rows:
            other_path = defs.DIR_RESULTS + row.results_file.data_file
            print(other_path)
            counts = {"clones": 0, "reads": 0}
            entry["sample"][row.results_file.id] = counts
            with open(other_path, "r") as json_data2:
                try:
                    other = json.load(json_data2)

                    # Be robust against 'null' values for clones
                    for clone in other["clones"] or []:
                        clone_id = clone["id"]
                        if clone_id not in own_reads:
                            continue
                        if clone["reads"][0] > 10 * own_reads[clone_id]:
                            entry["total_clones"] += 1
                            entry["total_reads"] += own_reads[clone_id]
                            counts["clones"] += 1
                            counts["reads"] += own_reads[clone_id]

                except ValueError:
                    print('invalid_json')

    return result
    
104
def compute_extra(id_file, id_config, min_threshold):
    """Add a per-locus clone count to the latest result file of a sample.

    The most recent non-hidden result file for (*id_file*, *id_config*) is
    loaded. For each locus listed in ``reads.germline``, the number of clones
    whose first read count reaches *min_threshold* percent of the locus total
    is stored as a one-element list in ``reads.distribution``, and the JSON
    file is rewritten in place.

    Clones whose germline has no total in ``reads.germline`` are skipped,
    since no threshold can be computed for them.

    Returns "SUCCESS", or "FAIL" when there is no result file to process or
    when it does not contain valid JSON.
    """
    result = {}
    results_file = db((db.results_file.sequence_file_id == id_file) &
                      (db.results_file.hidden == False) &
                      (db.results_file.config_id == id_config)
                    ).select(orderby=~db.results_file.run_date).first()

    # No row, or a row whose data file has not been stored yet
    if results_file is None or not results_file.data_file:
        print('no_results_file')
        return "FAIL"

    filename = defs.DIR_RESULTS + results_file.data_file
    with open(filename, "rb") as rf:
        try:
            d = json.load(rf)
        except ValueError:
            print('invalid_json')
            return "FAIL"

    # Minimal number of reads a clone needs, per locus
    loci_min = {}
    if 'reads' in d and 'germline' in d['reads']:
        loci_totals = d['reads']['germline']
        for locus in loci_totals:
            result.setdefault(locus, [0])
            loci_min[locus] = loci_totals[locus][0] * (min_threshold/100.0)

    # Be robust against missing or 'null' values for clones
    if d.get('clones') is None:
        d['clones'] = []

    for clone in d['clones']:
        germline = clone['germline']
        if germline not in loci_min:
            continue
        if clone['reads'][0] >= loci_min[germline]:
            result[germline][0] += 1

    d.setdefault('reads', {})['distribution'] = result
    # Text mode: json.dump writes str, which a binary file rejects on Python 3
    with open(filename, 'w') as extra:
        json.dump(d, extra)
    return "SUCCESS"
140

141

142
def schedule_run(id_sequence, id_config, grep_reads=None):
    """Create a results_file entry and queue the analysis task producing it.

    A new ``results_file`` row is inserted for (*id_sequence*, *id_config*);
    it is flagged hidden when *grep_reads* is given. The program named by the
    config is then queued in the scheduler with
    ``[id_sequence, id_config, data_id, grep_reads]`` as arguments. When the
    sequence file has a pre-process flag that is neither "COMPLETED" nor
    "DONE", the queued task is set to "STOPPED".

    Returns a dict with "redirect", "processId" and "message" keys, or the
    ``{"message": ...}`` dict of ``assert_scheduler_task_does_not_exist``
    when a matching task is already registered.
    """
    ts = time.time()
    data_id = db.results_file.insert(sequence_file_id = id_sequence,
                                     run_date = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S'),
                                     hidden = grep_reads is not None,
                                     config_id = id_config )

    args = [id_sequence, id_config, data_id, grep_reads]

    # NOTE(review): args holds the freshly inserted data_id and is compared
    # as str(args); presumably this never matches an existing task, and the
    # row inserted above stays in place on this early return -- confirm.
    err = assert_scheduler_task_does_not_exist(str(args))
    if err:
        log.error(err)
        return err

    program = db.config[id_config].program

    ## add task to scheduler
    task = scheduler.queue_task(program, args,
                                repeats = 1, timeout = defs.TASK_TIMEOUT)

    sequence_file = db.sequence_file[id_sequence]
    pre_process_flag = sequence_file.pre_process_flag
    if pre_process_flag not in ["COMPLETED", "DONE"] and pre_process_flag:
        db.scheduler_task[task.id] = dict(status ="STOPPED")

    db.results_file[data_id] = dict(scheduler_task_id = task.id)

    filename = sequence_file.filename

    res = {"redirect": "reload",
           "processId": task.id,
           "message": "[%s] c%s: process requested - %s %s" % (data_id, id_config, grep_reads, filename)}

    log.info(res)
    return res

182
183
184
185
186
187
188
def schedule_fuse(sample_set_ids, config_ids):
    """Queue one 'refuse' task covering every (sample set, config) pair.

    For each pair, the first non-hidden result file of the sample set for
    that config contributes one argument list
    ``[sequence_file_id, config_id, results_file_id, sample_set_id, False]``.
    Pairs without any result file are ignored. Nothing is queued when no
    pair has a result. Returns None.
    """
    args = []
    for sample_set_id in sample_set_ids:
        for config_id in config_ids:
            row = db((db.sample_set_membership.sample_set_id == sample_set_id)
                   & (db.sample_set_membership.sequence_file_id == db.results_file.sequence_file_id)
                   & (db.results_file.config_id == config_id)
                   & (db.results_file.hidden == False)
                ).select(db.sample_set_membership.sample_set_id, db.sample_set_membership.sequence_file_id,
                        db.results_file.id, db.results_file.config_id).first()
            if row:
                args.append([row.sample_set_membership.sequence_file_id,
                             row.results_file.config_id,
                             row.results_file.id,
                             row.sample_set_membership.sample_set_id,
                             False])

    if args:
        # The whole list is passed as the single argument of run_refuse
        scheduler.queue_task('refuse', [args],
                             repeats = 1, timeout = defs.TASK_TIMEOUT)
198

199
200
201
202
def schedule_compute_extra(id_file, id_config, min_threshold):
    """Queue a single-run 'compute_extra' scheduler task with these arguments."""
    scheduler.queue_task('compute_extra',
                         [id_file, id_config, min_threshold],
                         repeats=1,
                         timeout=defs.TASK_TIMEOUT)

203
def run_vidjil(id_file, id_config, id_data, grep_reads,
               clean_before=False, clean_after=False):
    """Run vidjil-algo on a sequence file and store the result file.

    id_file    -- id of the ``sequence_file`` row to analyse
    id_config  -- id of the ``config`` row providing the command line
    id_data    -- id of the ``results_file`` row receiving the output
    grep_reads -- optional value passed to ``--grep-reads``; when set, the
                  stored result is ``seq/clone.fa-1`` and neither
                  compute_extra nor fuse is run afterwards
    clean_before -- accepted but not used by this function
    clean_after  -- remove the output folder once the result is stored

    If the pre-process of the sequence file is still pending, the task is
    re-queued to start 1200 seconds later and "SUCCESS" is returned at once.

    Returns "SUCCESS". Raises ValueError when the pre-process has failed, and
    re-raises any error met while running vidjil or opening its result.
    """
    from subprocess import Popen, PIPE, STDOUT
    from datetime import timedelta as timed
    try:
        from shlex import quote as shell_quote
    except ImportError:
        # Python 2: the same helper lives in `pipes`
        from pipes import quote as shell_quote

    pre_process_flag = db.sequence_file[id_file].pre_process_flag
    if pre_process_flag == "FAILED":
        print("Pre-process has failed")
        raise ValueError('pre-process has failed')

    ## re-schedule if pre_process is still pending
    if pre_process_flag not in ["COMPLETED", "DONE"] and pre_process_flag:
        print("Pre-process is still pending, re-schedule")

        args = [id_file, id_config, id_data, grep_reads]
        task = scheduler.queue_task("vidjil", args,
                                    repeats = 1, timeout = defs.TASK_TIMEOUT,
                                    start_time=request.now + timed(seconds=1200))
        db.results_file[id_data] = dict(scheduler_task_id = task.id)
        db.commit()
        print(task.id)
        sys.stdout.flush()

        return "SUCCESS"

    ## paths to vidjil and to the sequence files
    upload_folder = defs.DIR_SEQUENCES
    out_folder = defs.DIR_OUT_VIDJIL_ID % id_data

    cmd = "rm -rf "+out_folder
    p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
    p.wait()

    ## path of the sequence file
    row = db(db.sequence_file.id==id_file).select()
    filename = row[0].data_file
    output_filename = defs.BASENAME_OUT_VIDJIL_ID % id_data
    seq_file = upload_folder+filename

    ## vidjil configuration
    vidjil_cmd = db.config[id_config].command

    if 'next' in vidjil_cmd:
        vidjil_cmd = vidjil_cmd.replace('next', '')
        vidjil_cmd = vidjil_cmd.replace(' germline' , defs.DIR_GERMLINE_NEXT)
        cmd = defs.DIR_VIDJIL_NEXT + '/vidjil-algo '
    else:
        vidjil_cmd = vidjil_cmd.replace(' germline' , defs.DIR_GERMLINE)
        cmd = defs.DIR_VIDJIL + '/vidjil-algo '

    if grep_reads:
        # grep_reads ends up in a shell command line: quote it so that it
        # cannot be interpreted by the shell
        vidjil_cmd += ' --grep-reads %s ' % shell_quote(grep_reads)

    os.makedirs(out_folder)
    out_log = out_folder+'/'+output_filename+'.vidjil.log'
    vidjil_log_file = open(out_log, 'w')

    try:
        ## full command
        cmd += ' -o  ' + out_folder + " -b " + output_filename
        cmd += ' ' + vidjil_cmd + ' '+ seq_file

        ## run the vidjil command
        print("=== Launching Vidjil ===")
        print(cmd)
        print("========================")
        sys.stdout.flush()

        p = Popen(cmd, shell=True, stdin=PIPE, stdout=vidjil_log_file, stderr=STDOUT, close_fds=True)
        p.communicate()

        print("Output log in " + out_log)
        sys.stdout.flush()
        db.commit()

        ## Get result file
        if grep_reads:
            out_results = out_folder + '/seq/clone.fa-1'
        else:
            out_results = out_folder + '/' + output_filename + '.vidjil'

        print("===>", out_results)
        results_filepath = os.path.abspath(out_results)

        stream = open(results_filepath, 'rb')
    except Exception:
        print("!!! Vidjil failed, no result file")
        res = {"message": "[%s] c%s: Vidjil FAILED - %s" % (id_data, id_config, out_folder)}
        log.error(res)
        raise
    finally:
        # Closed on both paths, so the log is complete before it is parsed
        vidjil_log_file.close()

    try:
        ## Parse some info in .log
        segmented = re.compile(r"==> segmented (\d+) reads \((\d*\.\d+|\d+)%\)")
        windows = re.compile(r"==> found (\d+) .*-windows in .* segments .* inside (\d+) sequences")
        info = ''
        with open(out_log) as log_lines:
            for l in log_lines:
                m = segmented.search(l)
                if m:
                    print(l, end=' ')
                    segs = int(m.group(1))
                    ratio = m.group(2)
                    info = "%d segmented (%s%%)" % (segs, ratio)
                    continue
                m = windows.search(l)
                if m:
                    print(l, end=' ')
                    wins = int(m.group(1))
                    reads = int(m.group(2))
                    info = "%d reads, " % reads + info + ", %d windows" % wins
                    break

        ## insert into the database
        ts = time.time()

        db.results_file[id_data] = dict(status = "ready",
                                     run_date = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S'),
                                     data_file = stream
                                    )

        db.commit()
    finally:
        stream.close()

    if clean_after:
        clean_cmd = "rm -rf " + out_folder
        p = Popen(clean_cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
        p.wait()

    ## the output of Vidjil is stored as the result for the scheduler
    ## TODO parse result success/fail

    res = {"message": "[%s] c%s: Vidjil finished - %s - %s" % (id_data, id_config, info, out_folder)}
    log.info(res)

    if not grep_reads:
        memberships = db(db.sample_set_membership.sequence_file_id==id_file).select()
        # compute_extra only depends on (file, config): run it once, and only
        # when there is at least one sample set to fuse
        if len(memberships) > 0:
            compute_extra(id_file, id_config, 5)
        for row in memberships:
            sample_set_id = row.sample_set_id
            print(row.sample_set_id)
            run_fuse(id_file, id_config, id_data, sample_set_id, clean_before = False)

    return "SUCCESS"
356

Alexander Shlemov's avatar
Alexander Shlemov committed
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
def run_igrec(id_file, id_config, id_data, clean_before=False, clean_after=False):
    """Run IgReC on a sequence file, annotate its output and store it.

    id_file   -- id of the ``sequence_file`` row to analyse
    id_config -- id of the ``config`` row providing the IgReC arguments
    id_data   -- id of the ``results_file`` row receiving the output
    clean_before, clean_after -- accepted but not used by this function

    The result ``out/igrec.vidjil`` is completed with the original file name
    and the command line, stored in the results_file row, then fused into
    every sample set the sequence file belongs to.

    Returns "SUCCESS"; re-raises any error met while running IgReC or
    opening its result file.
    """
    from subprocess import Popen, PIPE, STDOUT

    upload_folder = defs.DIR_SEQUENCES
    out_folder = defs.DIR_OUT_VIDJIL_ID % id_data

    # FIXME Use shutil instead
    cmd = "rm -rf "+out_folder
    p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
    p.wait()

    ## path of the sequence file
    row = db(db.sequence_file.id==id_file).select()
    filename = row[0].data_file
    seq_file = upload_folder+filename

    ## IgReC configuration
    arg_cmd = db.config[id_config].command

    os.makedirs(out_folder)
    out_log = out_folder + '/vidjil-igrec.log'
    out_results = out_folder + "/out/igrec.vidjil"

    ## full command
    try:
        igrec = defs.DIR_IGREC + '/igrec.py'
        if not os.path.isfile(igrec):
            print("!!! IgReC binary file not found")
        cmd = "%s -s %s -o %s/out %s" % (igrec, seq_file, out_folder, arg_cmd)

        ## run the IgReC command
        print("=== Launching IgReC ===")
        print(cmd)
        print("========================")
        sys.stdout.flush()

        with open(out_log, 'w') as log_file:
            p = Popen(cmd, shell=True, stdin=PIPE, stdout=log_file, stderr=STDOUT, close_fds=True)
            p.wait()

        print("Output log in " + out_log)
        sys.stdout.flush()

        ## Get result file
        print("===>", out_results)
        results_filepath = os.path.abspath(out_results)
        # Only checks that the result file can be opened
        with open(results_filepath, 'rb'):
            pass
    except Exception:
        print("!!! IgReC failed, no result file")
        res = {"message": "[%s] c%s: IgReC FAILED - %s" % (id_data, id_config, out_folder)}
        log.error(res)
        raise

    original_name = row[0].data_file
    with open(results_filepath, 'r') as json_file:
        my_json = json.load(json_file)
    fill_field(my_json, original_name, "original_names", "samples")
    fill_field(my_json, cmd, "commandline", "samples")

    # The file is re-opened for writing rather than updated through the
    # handle used for reading
    with open(results_filepath, 'w') as new_file:
        json.dump(my_json, new_file)

    ## insert into the database
    ts = time.time()

    with open(results_filepath, 'rb') as stream:
        db.results_file[id_data] = dict(status = "ready",
                                     run_date = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S'),
                                     data_file = stream
                                    )
        db.commit()

    res = {"message": "[%s] c%s: IgReC - %s" % (id_data, id_config, out_folder)}
    log.info(res)

    for membership in db(db.sample_set_membership.sequence_file_id==id_file).select():
        sample_set_id = membership.sample_set_id
        print(membership.sample_set_id)
        run_fuse(id_file, id_config, id_data, sample_set_id, clean_before = False)

    return "SUCCESS"

448
def run_mixcr(id_file, id_config, id_data, clean_before=False, clean_after=False):
    """Run MiXCR (align, assemble, exportClones) and store the result.

    id_file   -- id of the ``sequence_file`` row to analyse
    id_config -- id of the ``config`` row; its command is expected to be
                 ``args_align | args_assemble | args_exportClones``
    id_data   -- id of the ``results_file`` row receiving the output
    clean_before, clean_after -- accepted but not used by this function

    The exported file is completed with the MiXCR reports, the original file
    name, the command line and the total number of reads, stored in the
    results_file row, then fused into every sample set the sequence file
    belongs to.

    Returns "SUCCESS"; re-raises any error met while running MiXCR or
    opening its result file.
    """
    from subprocess import Popen, PIPE, STDOUT

    upload_folder = defs.DIR_SEQUENCES
    out_folder = defs.DIR_OUT_VIDJIL_ID % id_data

    cmd = "rm -rf "+out_folder
    p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
    p.wait()

    ## path of the sequence file
    row = db(db.sequence_file.id==id_file).select()
    filename = row[0].data_file
    output_filename = defs.BASENAME_OUT_VIDJIL_ID % id_data
    seq_file = upload_folder+filename

    ## MiXCR configuration
    arg_cmd = db.config[id_config].command

    os.makedirs(out_folder)

    # os.path.join keeps every file inside out_folder whether or not the
    # configured folder ends with a separator
    out_log = os.path.join(out_folder, output_filename + '.mixcr.log')
    report = os.path.join(out_folder, output_filename + '.mixcr.report')
    out_alignments = os.path.join(out_folder, output_filename + '.align.vdjca')
    out_clones = os.path.join(out_folder, output_filename + '.clones.clns')
    out_results = os.path.join(out_folder, output_filename + '.mixcr')

    align_report = report + '.aln'
    assembly_report = report + '.asmbl'

    args_1, args_2, args_3 = '', '', ''
    try:
        args_1, args_2, args_3 = arg_cmd.split('|')
    except ValueError:
        # Wrong number of '|'-separated parts: go on with empty arguments
        print(arg_cmd)
        print("! Bad arguments, we expect args_align | args_assemble | args_exportClones")

    ## full command
    try:
        mixcr = defs.DIR_MIXCR + 'mixcr'
        cmd = mixcr + ' align --save-reads -t 1 -r ' + align_report + ' ' + args_1 + ' ' + seq_file  + ' ' + out_alignments
        cmd += ' && '
        cmd += mixcr + ' assemble -t 1 -r ' + assembly_report + ' ' + args_2 + ' ' + out_alignments + ' ' + out_clones
        cmd += ' && rm ' + out_alignments
        cmd += ' && '
        cmd += mixcr + ' exportClones --format vidjil -germline -id -name -reads -sequence -top -seg -s ' + args_3 + ' ' + out_clones + ' ' + out_results

        ## run the MiXCR command
        print("=== Launching MiXCR ===")
        print(cmd)
        print("========================")
        sys.stdout.flush()

        with open(out_log, 'w') as log_file:
            p = Popen(cmd, shell=True, stdin=PIPE, stdout=log_file, stderr=STDOUT, close_fds=True)
            p.wait()

        print("Output log in " + out_log)
        sys.stdout.flush()

        ## Get result file
        print("===>", out_results)
        results_filepath = os.path.abspath(out_results)
        # Only checks that the result file can be opened
        with open(results_filepath, 'rb'):
            pass
    except Exception:
        print("!!! MiXCR failed, no result file")
        res = {"message": "[%s] c%s: MiXCR FAILED - %s" % (id_data, id_config, out_folder)}
        log.error(res)
        raise

    align_content = get_file_content(align_report)
    assembly_content = get_file_content(assembly_report)
    reports = align_content + assembly_content
    original_name = row[0].data_file
    totalReads = extract_total_reads(assembly_content)

    with open(results_filepath, 'r') as json_file:
        my_json = json.load(json_file)
    fill_field(my_json, reports, "log", "samples", True)
    fill_field(my_json, original_name, "original_names", "samples")
    fill_field(my_json, cmd, "commandline", "samples")
    fill_field(my_json, totalReads, "total", "reads")

    # The file is re-opened for writing rather than updated through the
    # handle used for reading
    with open(results_filepath, 'w') as new_file:
        json.dump(my_json, new_file)

    ## insert into the database
    ts = time.time()

    with open(results_filepath, 'rb') as stream:
        db.results_file[id_data] = dict(status = "ready",
                                     run_date = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S'),
                                     data_file = stream
                                    )
        db.commit()

    res = {"message": "[%s] c%s: MiXCR finished - %s" % (id_data, id_config, out_folder)}
    log.info(res)

    for membership in db(db.sample_set_membership.sequence_file_id==id_file).select():
        sample_set_id = membership.sample_set_id
        print(membership.sample_set_id)
        run_fuse(id_file, id_config, id_data, sample_set_id, clean_before = False)

    return "SUCCESS"
562

563
def run_copy(id_file, id_config, id_data, clean_before=False, clean_after=False):
    """Store the sequence file itself as the result file, without analysis.

    id_file   -- id of the ``sequence_file`` row to copy
    id_config -- id of the config, used for the messages and for the fuse
    id_data   -- id of the ``results_file`` row receiving the copy
    clean_before -- accepted but not used by this function
    clean_after  -- remove the output folder once the copy is stored

    The copy is then fused into every sample set the sequence file belongs
    to. Returns "SUCCESS"; re-raises the IOError when the sequence file
    cannot be opened.
    """
    from subprocess import Popen, PIPE, STDOUT

    ## paths to the output folder and files
    output_filename = defs.BASENAME_OUT_VIDJIL_ID % id_data
    out_folder = defs.DIR_OUT_VIDJIL_ID % id_data

    cmd = "rm -rf "+out_folder
    p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
    p.wait()

    ## path of the sequence file
    row = db(db.sequence_file.id==id_file).select()
    filename = row[0].data_file

    os.makedirs(out_folder)
    out_log = out_folder+'/'+output_filename+'.vidjil.log'
    # The log file is created but nothing is written to it by a copy
    open(out_log, 'w').close()

    print("Output log in "+out_log)
    sys.stdout.flush()
    db.commit()

    ## get the file
    results_filepath = os.path.abspath(defs.DIR_SEQUENCES+row[0].data_file)

    try:
        stream = open(results_filepath, 'rb')
    except IOError:
        print("!!! 'copy' failed, no file")
        res = {"message": "[%s] c%s: 'copy' FAILED - %s" % (id_data, id_config, out_folder)}
        log.error(res)
        # Re-raise the original error so that its details are kept
        raise

    try:
        ## insert into the database
        ts = time.time()

        db.results_file[id_data] = dict(status = "ready",
                                     run_date = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S'),
                                     data_file = db.results_file.data_file.store(stream, row[0].filename)
                                    )
        db.commit()
    finally:
        stream.close()

    if clean_after:
        clean_cmd = "rm -rf " + out_folder
        p = Popen(clean_cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
        p.wait()

    ## the message below is stored as the result for the scheduler
    ## TODO parse result success/fail

    res = {"message": "[%s] c%s: 'copy' finished - %s" % (id_data, id_config, filename)}
    log.info(res)

    for membership in db(db.sample_set_membership.sequence_file_id==id_file).select():
        sample_set_id = membership.sample_set_id
        print(membership.sample_set_id)
        run_fuse(id_file, id_config, id_data, sample_set_id, clean_before = False)

    return "SUCCESS"
Mathieu Giraud's avatar
Mathieu Giraud committed
625
626


627
628
629
630
def run_refuse(args):
    """Run ``run_fuse`` once per entry of *args*.

    Each entry is indexable; its first five items are passed, in order, as
    the first five positional arguments of ``run_fuse``.
    Returns "SUCCESS" once every entry has been processed.
    """
    for entry in args:
        positional = [entry[k] for k in range(5)]
        run_fuse(*positional)
    return "SUCCESS"
Mathieu Giraud's avatar
Mathieu Giraud committed
631

632
def run_fuse(id_file, id_config, id_data, sample_set_id, clean_before=True, clean_after=False):
    """Fuse the latest results of a sample set and store the fused file.

    id_file       -- accepted but not used by this function
    id_config     -- config whose results are fused and whose fuse_command
                     is passed to fuse.py
    id_data       -- id used to name the output folder and files
    sample_set_id -- sample set whose sequence files are fused
    clean_before  -- remove and re-create the output folder first
    clean_after   -- remove the output folder once the fused file is stored

    For each sequence file of the sample set, the most recent non-hidden
    result for the config is given to fuse.py. The output is stored in the
    ``fused_file`` row of (config, sample set), which is created if needed.

    Returns "SUCCESS", or "FAILED" when there is no result file to fuse;
    re-raises any error met while running fuse.py or opening its output.
    """
    from subprocess import Popen, PIPE, STDOUT

    out_folder = defs.DIR_OUT_VIDJIL_ID % id_data
    output_filename = defs.BASENAME_OUT_VIDJIL_ID % id_data + '-%s' % sample_set_id

    if clean_before:
        cmd = "rm -rf "+out_folder
        p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
        p.wait()
        os.makedirs(out_folder)

    fuse_log = out_folder+'/'+output_filename+'.fuse.log'

    ## fuse.py
    output_file = out_folder+'/'+output_filename+'.fused'
    query2 = db( ( db.results_file.sequence_file_id == db.sequence_file.id )
                   & ( db.sample_set_membership.sequence_file_id == db.sequence_file.id)
                   & ( db.sample_set_membership.sample_set_id == sample_set_id)
                   & ( db.results_file.config_id == id_config )
                   & ( db.results_file.hidden == False)
                   ).select( orderby=db.sequence_file.id|~db.results_file.run_date)

    # Rows are grouped by sequence file, newest result first:
    # keep only the first row of each group.
    query = []
    sequence_file_id = 0
    for row in query2:
        if row.sequence_file.id != sequence_file_id:
            query.append(row)
            sequence_file_id = row.sequence_file.id

    # Both strings keep a trailing separator after every item
    files = ""
    sequence_file_list = ""
    for row in query:
        if row.results_file.data_file is not None:
            files += defs.DIR_RESULTS + row.results_file.data_file + " "
            sequence_file_list += str(row.results_file.sequence_file_id) + "_"

    if files == "":
        print("!!! Fuse failed: no files to fuse")
        res = {"message": "[%s] c%s: 'fuse' FAILED - %s no files to fuse" % (id_data, id_config, output_file)}
        log.error(res)
        return "FAILED"

    try:
        fuse_cmd = db.config[id_config].fuse_command
        cmd = "python "+defs.DIR_FUSE+"/fuse.py -o "+ output_file + " " + fuse_cmd + " " + files

        print("=== fuse.py ===")
        print(cmd)
        print("===============")
        sys.stdout.flush()

        with open(fuse_log, 'w') as fuse_log_file:
            p = Popen(cmd, shell=True, stdin=PIPE, stdout=fuse_log_file, stderr=STDOUT, close_fds=True)
            p.communicate()
        print("Output log in "+fuse_log)

        fuse_filepath = os.path.abspath(output_file)

        stream = open(fuse_filepath, 'rb')
    except Exception:
        print("!!! Fuse failed, no .fused file")
        res = {"message": "[%s] c%s: 'fuse' FAILED - %s" % (id_data, id_config, output_file)}
        log.error(res)
        raise

    try:
        ts = time.time()

        fused_files = db( ( db.fused_file.config_id == id_config ) &
                         ( db.fused_file.sample_set_id == sample_set_id )
                     ).select()
        existing_fused_file = None
        if len(fused_files) > 0:
            fused_file = fused_files[0]
            id_fuse = fused_file.id
            existing_fused_file = fused_file.fused_file
        else:
            id_fuse = db.fused_file.insert(sample_set_id = sample_set_id,
                                           config_id = id_config)

        db.fused_file[id_fuse] = dict(fuse_date = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S'),
                                     fused_file = stream,
                                     sequence_file_list = sequence_file_list)
        db.commit()
    finally:
        stream.close()

    if clean_after:
        clean_cmd = "rm -rf " + out_folder
        p = Popen(clean_cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
        p.wait()

        # remove previous fused_file if it exists
        # NOTE(review): out_folder has just been removed above, so this
        # looks like a no-op -- confirm which folder was intended.
        if existing_fused_file is not None:
            clean_cmd = "rm -rf %s/%s" % (out_folder, existing_fused_file)
            p = Popen(clean_cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
            p.wait()

    res = {"message": "[%s] c%s: 'fuse' finished - %s" % (id_data, id_config, db.fused_file[id_fuse].fused_file)}
    log.info(res)

    # Remove temporary fused file (already gone when clean_after removed
    # the whole output folder)
    if os.path.exists(output_file):
        os.remove(output_file)

    return "SUCCESS"
734

735
def custom_fuse(file_list):
    '''
    Fuse an arbitrary list of result files through the fuse server and
    return the decoded content of the resulting .fused file.

    file_list: ids of db.results_file rows (they are joined with ',' for the
               log message, so they are expected to be strings);
               rows whose data_file is None are skipped.

    Returns the JSON-decoded content of the fused file.

    Raises IOError when defs.PORT_FUSE_SERVER is None. Any error raised while
    calling the fuse server or reading its output is logged and re-raised.
    '''
    from subprocess import Popen, PIPE, STDOUT, os

    if defs.PORT_FUSE_SERVER is None:
        raise IOError('This server cannot fuse custom data')

    # Work in a dedicated folder named after a random id
    random_id = random.randint(99999999,99999999999)
    out_folder = os.path.abspath(defs.DIR_OUT_VIDJIL_ID % random_id)
    output_filename = defs.BASENAME_OUT_VIDJIL_ID % random_id

    # Start from an empty output folder
    cmd = "rm -rf "+out_folder
    p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
    p.wait()
    os.makedirs(out_folder)

    res = {"message": "'custom fuse' (%d files): %s" % (len(file_list), ','.join(file_list))}
    log.info(res)

    ## fuse.py
    output_file = out_folder+'/'+output_filename+'.fused'
    files = ""
    for results_file_id in file_list :
        # Fetch the row's data_file once instead of once for the test and once for the path
        data_file = db.results_file[results_file_id].data_file
        if data_file is not None :
            files += os.path.abspath(defs.DIR_RESULTS + data_file) + " "

    try:
        cmd = "python "+ os.path.abspath(defs.DIR_FUSE) +"/fuse.py -o "+output_file+" -t 100 "+files
        proc_srvr = xmlrpclib.ServerProxy("http://%s:%d" % (defs.FUSE_SERVER, defs.PORT_FUSE_SERVER))
        fuse_filepath = proc_srvr.fuse(cmd, out_folder, output_filename)

        # Close the fused file as soon as it has been read
        with open(fuse_filepath, 'rb') as f:
            data = gluon.contrib.simplejson.loads(f.read())
    except:
        # Deliberately catch everything: the failure is logged, then re-raised unchanged
        res = {"message": "'custom fuse' -> IOError"}
        log.error(res)
        raise

    # The fused content is now in memory, the output folder can go
    clean_cmd = "rm -rf " + out_folder
    p = Popen(clean_cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
    p.wait()

    res = {"message": "'custom fuse' -> finished"}
    log.info(res)

    return data
779

HERBERT Ryan's avatar
HERBERT Ryan committed
780
#TODO move this ?
781
782
783
784
785
786
def get_file_content(filename):
    '''
    Return the whole content of filename, read in binary mode.
    '''
    with open(filename, 'rb') as handle:
        return handle.read()

HERBERT Ryan's avatar
HERBERT Ryan committed
787
788
789
790
791
792
793
794
795
796
797
798
799
def fill_field(dest, value, field, parent="", append=False):
    '''
    Store value as the first element of the list dest[field]
    (or dest[parent][field] when parent is neither None nor "").

    A missing field is created as [value]. For an existing field, the
    first element is replaced by value, or has value added to it with +=
    when append is true.
    '''
    use_parent = parent is not None and parent != ""
    target = dest[parent] if use_parent else dest

    # New field: create the one-element list and stop here
    if field not in target:
        target[field] = [value]
        return

    slot = target[field]
    if append:
        slot[0] += value
    else:
        slot[0] = value

HERBERT Ryan's avatar
HERBERT Ryan committed
800
801
802
803
def extract_total_reads(report):
    '''
    Return, as a string, the number that follows the first
    "Total Reads analysed: " found in report.

    Raises AttributeError when report contains no such text.
    '''
    found = re.search("Total Reads analysed: ([0-9]+)", report)
    return found.group(1)
804
805
806
807
808
809
810
811
812
813
814
815
816
817

def schedule_pre_process(sequence_file_id, pre_process_id):
    '''
    Queue a "pre_process" scheduler task for a sequence file.

    sequence_file_id: id of the db.sequence_file row to pre-process
    pre_process_id: id of the pre-process to run on it

    Returns a dict with a "message" key. When
    assert_scheduler_task_does_not_exist() reports an error for these
    arguments, that error dict is logged and returned and nothing is queued.
    Otherwise the task is queued, its id is stored in
    sequence_file.pre_process_scheduler_task_id, and the returned dict also
    carries "redirect": "reload".
    '''
    # Same argument order as the run_pre_process signature
    args = [pre_process_id, sequence_file_id]

    err = assert_scheduler_task_does_not_exist(str(args))
    if err:
        log.error(err)
        return err

    task = scheduler.queue_task("pre_process", args,
                                repeats = 1, timeout = defs.TASK_TIMEOUT)
    db.sequence_file[sequence_file_id] = dict(pre_process_scheduler_task_id = task.id)

    res = {"redirect": "reload",
           "message": "{%s} (%s): process requested" % (sequence_file_id, pre_process_id)}

    log.info(res)
    return res


827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
def get_preprocessed_filename(filename1, filename2):
    '''
    Get the same extension and then get the most common among them

    When filename2 is empty or None, filename1 is returned unchanged.
    Otherwise the result is the common substring of both names (without
    their common extension) followed by that extension; when there is no
    common substring, both stripped names are joined with '_' instead.

    >>> get_preprocessed_filename('lsdkj_toto-tata_mlkmfsdlkf.fastq.gz', 'rete_toto-tata_eekjdf.fastq.gz')
    '_toto-tata_.fastq.gz'
    '''
    if not filename2:
        return filename1
    names = [filename1, filename2]
    extension = tools_utils.get_common_suffpref(names, min(len(filename1), len(filename2)), -1)
    # Strip the extension by counting from the start of each name:
    # x[0:-len(extension)] would be x[0:0] == '' for an empty extension
    # and silently drop both names.
    # NOTE(review): assumes get_common_suffpref returns a (possibly empty) string - confirm
    without_extension = [x[:len(x) - len(extension)] for x in names]
    common = tools_utils.common_substring(without_extension)
    if len(common) == 0:
        common = '_'.join(without_extension)
    return common+extension


844
def run_pre_process(pre_process_id, sequence_file_id, clean_before=True, clean_after=False):
    '''
    Run a pre-process on sequence_file.data_file (and possibly sequence_file.data_file2),
    put the output back in sequence_file.data_file.

    pre_process_id: id of the db.pre_process row holding the command template
    sequence_file_id: id of the db.sequence_file row to pre-process
    clean_before: when true, remove and recreate the output folder before running
    clean_after: when true, remove the output folder once the result is stored

    sequence_file.pre_process_flag is set to "RUN" at start, then to
    "COMPLETED" on success. On any failure it is set to "FAILED" and the
    exception is re-raised. Scheduler tasks in "STOPPED" status linked to
    results of this sequence file are put back to "QUEUED".

    Returns "SUCCESS".
    '''

    from subprocess import Popen, PIPE, STDOUT, os

    sequence_file = db.sequence_file[sequence_file_id]
    db.sequence_file[sequence_file_id] = dict(pre_process_flag = "RUN")
    db.commit()

    out_folder = defs.DIR_PRE_VIDJIL_ID % sequence_file_id
    output_filename = get_preprocessed_filename(get_original_filename(sequence_file.data_file),
                                                get_original_filename(sequence_file.data_file2))

    if clean_before:
        cmd = "rm -rf "+out_folder
        p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
        p.wait()
        os.makedirs(out_folder)

    output_file = out_folder+'/'+output_filename

    pre_process = db.pre_process[pre_process_id]

    try:
        # Fill the placeholders of the command template
        cmd = pre_process.command.replace( "&file1&", defs.DIR_SEQUENCES + sequence_file.data_file)
        if sequence_file.data_file2:
            cmd = cmd.replace( "&file2&", defs.DIR_SEQUENCES + sequence_file.data_file2)
        cmd = cmd.replace( "&result&", output_file)
        cmd = cmd.replace("&pear&", defs.DIR_PEAR)
        # Example of template to add some preprocess shortcut
        # cmd = cmd.replace("&preprocess_template&", defs.DIR_preprocess_template)
        # Where &preprocess_template& is the shortcut to change and
        # defs.DIR_preprocess_template the variable to set into the file defs.py.
        # The value should be the path to access to the preprocess software.

        print("=== Pre-process %s ===" % pre_process_id)
        print(cmd)
        print("===============")
        sys.stdout.flush()

        out_log = out_folder+'/'+output_filename+'.pre.log'

        # The log file receives both stdout and stderr of the command;
        # the 'with' closes it whether the command could be run or not
        with open(out_log, 'w') as log_file:
            os.chdir(defs.DIR_FUSE)
            p = Popen(cmd, shell=True, stdin=PIPE, stdout=log_file, stderr=log_file, close_fds=True)
            p.communicate()
        print("Output log in " + out_log)

        filepath = os.path.abspath(output_file)

        # Fails when the command did not produce its result file
        stream = open(filepath, 'rb')
    except:
        # Deliberately catch everything so that the flag never stays on "RUN";
        # the original exception is re-raised once the failure is recorded
        print("!!! Pre-process failed, no result file")
        res = {"message": "{%s} p%s: 'pre_process' FAILED - %s" % (sequence_file_id, pre_process_id, output_file)}
        log.error(res)
        db.sequence_file[sequence_file_id] = dict(pre_process_flag = "FAILED")
        db.commit()
        raise

    # Now we update the sequence file with the result of the pre-process
    # We forget the initial data_file (and possibly data_file2)
    try:
        db.sequence_file[sequence_file_id] = dict(data_file = stream,
                                                  data_file2 = None,
                                                  pre_process_flag = "COMPLETED")
        db.commit()
    finally:
        # NOTE(review): assumes the DAL has copied the upload by the time the
        # record is written, so the stream is not needed afterwards - confirm
        stream.close()

    #resume STOPPED task for this sequence file
    stopped_task = db(
        (db.results_file.sequence_file_id == sequence_file_id)
        & (db.results_file.scheduler_task_id == db.scheduler_task.id)
        & (db.scheduler_task.status == "STOPPED")
    ).select()

    for row in stopped_task :
        db.scheduler_task[row.scheduler_task.id] = dict(status = "QUEUED")

    db.commit()

    # Dump log in scheduler_run.run_output
    with open(out_log) as logged:
        for l in logged:
            print(l, end=' ')

    # Remove data file from disk to save space (it is now saved elsewhere)
    os.remove(filepath)

    if clean_after:
        clean_cmd = "rm -rf " + out_folder
        p = Popen(clean_cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
        p.wait()

    res = {"message": "{%s} p%s: 'pre_process' finished - %s" % (sequence_file_id, pre_process_id, output_file)}
    log.info(res)

    return "SUCCESS"
    
HERBERT Ryan's avatar
HERBERT Ryan committed
945
    
946
# Scheduler shared by the application: each key is a task name accepted by
# scheduler.queue_task(), each value the function run for that task.
from gluon.scheduler import Scheduler

scheduler = Scheduler(
    db,
    {
        "vidjil": run_vidjil,
        "compute_contamination": compute_contamination,
        "compute_extra": compute_extra,
        "mixcr": run_mixcr,
        "igrec": run_igrec,
        "none": run_copy,
        "pre_process": run_pre_process,
        "refuse": run_refuse,
    },
    heartbeat=defs.SCHEDULER_HEARTBEAT
)