task.py 31.9 KB
Newer Older
1
# coding: utf8
Mathieu Giraud's avatar
Mathieu Giraud committed
2
3
from __future__ import print_function

4
import json
5
import os
6
import defs
7
import re
8
import os.path
9
10
11
import time
import sys
import datetime
12
13
import random
import xmlrpclib
14
import tools_utils
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30

def assert_scheduler_task_does_not_exist(args):
    """Check that no live scheduler task is registered with these arguments.

    A task is considered live unless its status is FAILED, EXPIRED, TIMEOUT
    or COMPLETED.

    args -- value compared with the ``args`` column of ``scheduler_task``
            (callers in this file pass ``str(list_of_arguments)``).

    Returns a ``{"message": ...}`` dict when such a task exists, None otherwise.
    """
    finished_statuses = ("FAILED", "EXPIRED", "TIMEOUT", "COMPLETED")

    # same argument string, and none of the finished statuses
    live_tasks = (db.scheduler_task.args == args)
    for status in finished_statuses:
        live_tasks = live_tasks & (db.scheduler_task.status != status)

    if len(db(live_tasks).select()) == 0:
        return None

    return {"message": "task already registered"}

31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
def compute_contamination(sequence_file_id, results_file_id, config_id):
    """Count, per sample, the clones that are far more abundant in sibling samples.

    The three arguments are parallel lists: entry ``i`` describes one sample
    by its sequence file id, its results file id and its config id.

    For each sample, the other sequence files of the same "run" sample set
    are looked up (latest result per sequence file, same config). A clone of
    the sample is counted when another sample holds more than 10 times its
    reads.

    Returns a list with one dict per input sample:
        {"total_clones": int, "total_reads": int,
         "sample": {results_file_id: {"clones": int, "reads": int}}}
    """
    result = []
    for i, seq_id in enumerate(sequence_file_id):
        entry = {"total_clones": 0, "total_reads": 0, "sample": {}}
        result.append(entry)

        # reads (first value) of every clone of the sample under scrutiny
        own_path = defs.DIR_RESULTS + db.results_file[results_file_id[i]].data_file
        with open(own_path, "r") as json_data:
            d = json.load(json_data)
        list1 = {}
        for clone in d["clones"]:
            list1[clone["id"]] = clone["reads"][0]

        # "run" sample set containing this sequence file, if any
        sample_set_run = db( (db.sample_set_membership.sequence_file_id == seq_id)
                           & (db.sample_set_membership.sample_set_id == db.sample_set.id)
                           & (db.sample_set.sample_type == "run") ).select().first()
        if sample_set_run is None:
            continue

        sample_set_id = sample_set_run.sample_set.id
        query = db(  ( db.sample_set.id == sample_set_id )
                   & ( db.sample_set.id == db.sample_set_membership.sample_set_id )
                   & ( db.sequence_file.id == db.sample_set_membership.sequence_file_id)
                   & ( db.sequence_file.id != seq_id)
                   & ( db.results_file.sequence_file_id == db.sequence_file.id )
                   & ( db.results_file.config_id == config_id[i]  )
                   ).select(db.sequence_file.ALL, db.results_file.ALL, db.sample_set.id,
                            orderby=db.sequence_file.id|~db.results_file.run_date)

        # rows are ordered by sequence file, then by decreasing run date:
        # keep only the first (most recent) result of each sequence file
        latest_rows = []
        last_seen = 0
        for row in query:
            if row.sequence_file.id != last_seen:
                latest_rows.append(row)
                last_seen = row.sequence_file.id

        for row in latest_rows:
            other_path = defs.DIR_RESULTS + row.results_file.data_file
            print(other_path)
            sample = {"clones": 0, "reads": 0}
            entry["sample"][row.results_file.id] = sample

            with open(other_path, "r") as json_data2:
                try:
                    other = json.load(json_data2)
                except ValueError:
                    # unreadable result file: keep the zeroed counters
                    print('invalid_json')
                    continue

            for clone in other["clones"]:
                if clone["id"] not in list1:
                    continue
                own_reads = list1[clone["id"]]
                if clone["reads"][0] > 10 * own_reads:
                    entry["total_clones"] += 1
                    entry["total_reads"] += own_reads
                    sample["clones"] += 1
                    sample["reads"] += own_reads

    return result
    
94

95
def schedule_run(id_sequence, id_config, grep_reads=None):
    """Create a results_file record and queue the analysis task for it.

    id_sequence -- id of the sequence_file to process
    id_config   -- id of the config; its ``program`` field names the task
    grep_reads  -- optional value forwarded as last task argument

    The queued task is set to STOPPED while the pre-process of the sequence
    file is in state WAIT or RUN.

    Returns a dict with a "message" (and a "redirect" on success); when a
    matching task is already registered, the error dict is returned instead.
    """
    ts = time.time()
    data_id = db.results_file.insert(sequence_file_id = id_sequence,
                                     run_date = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S'),
                                     config_id = id_config )

    args = [id_sequence, id_config, data_id, grep_reads]

    # NOTE(review): args contains the freshly inserted data_id, so this check
    # presumably never matches an older task, and the record inserted above is
    # left behind when it does -- confirm the intended behaviour.
    err = assert_scheduler_task_does_not_exist(str(args))
    if err:
        log.error(err)
        return err

    ## add task to scheduler
    program = db.config[id_config].program
    task = scheduler.queue_task(program, args,
                                repeats = 1, timeout = defs.TASK_TIMEOUT)

    sequence_file = db.sequence_file[id_sequence]
    if sequence_file.pre_process_flag in ("WAIT", "RUN"):
        db.scheduler_task[task.id] = dict(status ="STOPPED")

    db.results_file[data_id] = dict(scheduler_task_id = task.id)

    res = {"redirect": "reload",
           "message": "[%s] c%s: process requested - %s %s" % (data_id, id_config, grep_reads, sequence_file.filename)}

    log.info(res)
    return res

133
134
135
136
137
138
139
140
141
142
143
144
def schedule_fuse(sample_set_ids, config_ids):
    """Queue one 'refuse' task covering every (sample set, config) pair with results.

    For each pair, the first results_file row found for a sequence file of
    the sample set with that config contributes one entry
    ``[sequence_file_id, config_id, results_file_id, sample_set_id, False]``.
    Nothing is queued when no pair has a result. Returns None.
    """
    args = []
    for sample_set_id in sample_set_ids:
        for config_id in config_ids:
            row = db((db.sample_set_membership.sample_set_id == sample_set_id)
                   & (db.sample_set_membership.sequence_file_id == db.results_file.sequence_file_id)
                   & (db.results_file.config_id == config_id)
                ).select(db.sample_set_membership.sample_set_id, db.sample_set_membership.sequence_file_id,
                        db.results_file.id, db.results_file.config_id).first()
            if row:
                args.append([row.sample_set_membership.sequence_file_id,
                             row.results_file.config_id,
                             row.results_file.id,
                             row.sample_set_membership.sample_set_id,
                             False])

    if args:
        # the whole list is passed as the single argument of the task
        scheduler.queue_task('refuse', [args],
                             repeats = 1, timeout = defs.TASK_TIMEOUT)
148

149
def _summarize_vidjil_log(out_log):
    """Build a one-line summary from the '==> segmented' / '==> found' log lines.

    Matching lines are echoed on stdout. Reading stops at the first
    '==> found ... windows' line. Returns '' when nothing matched.
    """
    segmented = re.compile(r"==> segmented (\d+) reads \((\d*\.\d+|\d+)%\)")
    windows = re.compile(r"==> found (\d+) .*-windows in .* segments .* inside (\d+) sequences")
    info = ''
    with open(out_log) as log_lines:
        for l in log_lines:
            m = segmented.search(l)
            if m:
                print(l, end=' ')
                segs = int(m.group(1))
                ratio = m.group(2)
                info = "%d segmented (%s%%)" % (segs, ratio)
                continue
            m = windows.search(l)
            if m:
                print(l, end=' ')
                wins = int(m.group(1))
                reads = int(m.group(2))
                info = "%d reads, " % reads + info + ", %d windows" % wins
                break
    return info


def run_vidjil(id_file, id_config, id_data, grep_reads,
               clean_before=False, clean_after=False):
    """Run vidjil-algo on a sequence file and store the result in results_file.

    id_file     -- id of the sequence_file to analyse
    id_config   -- id of the config providing the command-line options
    id_data     -- id of the results_file record to fill
    grep_reads  -- when set, passed to vidjil-algo with ``-FaW`` and the
                   result is taken from ``seq/clone.fa-1``; no fuse is run
    clean_before -- not used: the output folder is always recreated
    clean_after -- remove the output folder once the result is stored

    While the pre-process of the file is WAIT or RUN, the task re-queues
    itself 1200 seconds later and returns "SUCCESS" immediately.

    Returns "SUCCESS". Raises ValueError when the pre-process has failed, and
    re-raises any error met while running vidjil-algo or opening its result.
    """
    from subprocess import Popen, PIPE, STDOUT, os
    from datetime import timedelta as timed
    import shutil
    try:
        from shlex import quote as shell_quote
    except ImportError:  # Python 2
        from pipes import quote as shell_quote

    ## re-schedule if pre_process is still pending
    pre_process_flag = db.sequence_file[id_file].pre_process_flag
    if pre_process_flag in ("WAIT", "RUN"):
        print("Pre-process is still pending, re-schedule")

        args = [id_file, id_config, id_data, grep_reads]
        task = scheduler.queue_task("vidjil", args,
                                    repeats = 1, timeout = defs.TASK_TIMEOUT,
                                    start_time=request.now + timed(seconds=1200))
        db.results_file[id_data] = dict(scheduler_task_id = task.id)
        db.commit()
        print(task.id)
        sys.stdout.flush()

        return "SUCCESS"

    if pre_process_flag == "FAILED":
        print("Pre-process has failed")
        raise ValueError('pre-process has failed')

    ## paths to vidjil and to the sequence files
    upload_folder = defs.DIR_SEQUENCES
    out_folder = defs.DIR_OUT_VIDJIL_ID % id_data

    # start from an empty output folder
    shutil.rmtree(out_folder, ignore_errors=True)

    ## path of the sequence file
    row = db(db.sequence_file.id==id_file).select()
    filename = row[0].data_file
    output_filename = defs.BASENAME_OUT_VIDJIL_ID % id_data
    seq_file = upload_folder+filename

    ## vidjil configuration
    vidjil_cmd = db.config[id_config].command

    # a 'next' keyword in the command selects the alternate installation
    if 'next' in vidjil_cmd:
        vidjil_cmd = vidjil_cmd.replace('next', '')
        vidjil_cmd = vidjil_cmd.replace(' germline' , defs.DIR_GERMLINE_NEXT)
        cmd = defs.DIR_VIDJIL_NEXT + '/vidjil-algo '
    else:
        vidjil_cmd = vidjil_cmd.replace(' germline' , defs.DIR_GERMLINE)
        cmd = defs.DIR_VIDJIL + '/vidjil-algo '

    if grep_reads:
        # the command goes through a shell: quote the caller-provided value
        vidjil_cmd += ' -FaW %s ' % shell_quote("%s" % grep_reads)

    os.makedirs(out_folder)
    out_log = out_folder+'/'+output_filename+'.vidjil.log'

    try:
        ## complete command
        cmd += ' -o  ' + out_folder + " -b " + output_filename
        cmd += ' ' + vidjil_cmd + ' '+ seq_file

        ## run vidjil
        print("=== Launching Vidjil ===")
        print(cmd)
        print("========================")
        sys.stdout.flush()

        # the log file is closed even when the command cannot be started
        with open(out_log, 'w') as vidjil_log_file:
            p = Popen(cmd, shell=True, stdin=PIPE, stdout=vidjil_log_file, stderr=STDOUT, close_fds=True)
            p.communicate()

        print("Output log in " + out_log)
        sys.stdout.flush()
        db.commit()

        ## Get result file
        if grep_reads:
            out_results = out_folder + '/seq/clone.fa-1'
        else:
            out_results = out_folder + '/' + output_filename + '.vidjil'

        print("===>", out_results)
        results_filepath = os.path.abspath(out_results)

        stream = open(results_filepath, 'rb')
    except Exception:
        print("!!! Vidjil failed, no result file")
        res = {"message": "[%s] c%s: Vidjil FAILED - %s" % (id_data, id_config, out_folder)}
        log.error(res)
        raise

    try:
        ## Parse some info in .log
        info = _summarize_vidjil_log(out_log)

        ## store the result in the database
        ts = time.time()
        db.results_file[id_data] = dict(status = "ready",
                                     run_date = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S'),
                                     data_file = stream
                                    )
        db.commit()
    finally:
        stream.close()

    if clean_after:
        shutil.rmtree(out_folder, ignore_errors=True)

    ## TODO parse result success/fail

    res = {"message": "[%s] c%s: Vidjil finished - %s - %s" % (id_data, id_config, info, out_folder)}
    log.info(res)

    if not grep_reads:
        # NOTE(review): run_fuse writes its log in out_folder, which no longer
        # exists when clean_after is set -- confirm this combination is unused.
        for row in db(db.sample_set_membership.sequence_file_id==id_file).select():
            sample_set_id = row.sample_set_id
            print(row.sample_set_id)
            run_fuse(id_file, id_config, id_data, sample_set_id, clean_before = False)

    return "SUCCESS"
301

Alexander Shlemov's avatar
Alexander Shlemov committed
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
def run_igrec(id_file, id_config, id_data, clean_before=False, clean_after=False):
    """Run IgReC on a sequence file, store the result and fuse its sample sets.

    id_file   -- id of the sequence_file to analyse
    id_config -- id of the config; its ``command`` is appended to the IgReC call
    id_data   -- id of the results_file record to fill
    clean_before, clean_after -- accepted but not used by this function

    The result file is completed with the original file name and the command
    line before being stored. Returns "SUCCESS"; re-raises any error met
    while running IgReC or opening its result file.
    """
    from subprocess import Popen, PIPE, STDOUT, os
    import time
    import json
    import shutil

    upload_folder = defs.DIR_SEQUENCES
    out_folder = defs.DIR_OUT_VIDJIL_ID % id_data

    # start from an empty output folder
    shutil.rmtree(out_folder, ignore_errors=True)

    ## path of the sequence file
    row = db(db.sequence_file.id==id_file).select()
    original_name = row[0].data_file
    seq_file = upload_folder + original_name

    ## IgReC arguments from the config
    arg_cmd = db.config[id_config].command

    os.makedirs(out_folder)
    out_log = out_folder + '/vidjil-igrec.log'

    out_results = out_folder + "/out/igrec.vidjil"
    ## complete command
    try:
        igrec = defs.DIR_IGREC + '/igrec.py'
        if not os.path.isfile(igrec):
            # only reported: the command below is attempted anyway
            print("!!! IgReC binary file not found")
        cmd = "%s -s %s -o %s/out %s" % (igrec, seq_file, out_folder, arg_cmd)

        ## run IgReC
        print("=== Launching IgReC ===")
        print(cmd)
        print("========================")
        sys.stdout.flush()

        with open(out_log, 'w') as log_file:
            p = Popen(cmd, shell=True, stdin=PIPE, stdout=log_file, stderr=STDOUT, close_fds=True)
            p.wait()

        print("Output log in " + out_log)
        sys.stdout.flush()

        ## Get result file
        print("===>", out_results)
        results_filepath = os.path.abspath(out_results)
        # opened only to fail here when IgReC produced no result file
        open(results_filepath, 'rb').close()
    except Exception:
        print("!!! IgReC failed, no result file")
        res = {"message": "[%s] c%s: IgReC FAILED - %s" % (id_data, id_config, out_folder)}
        log.error(res)
        raise

    with open(results_filepath, 'r') as json_file:
        my_json = json.load(json_file)
    fill_field(my_json, original_name, "original_names", "samples")
    fill_field(my_json, cmd, "commandline", "samples")

    # the file is reopened for writing rather than updated in place
    # (the original code did so to avoid a "bad file descriptor" error)
    with open(results_filepath, 'w') as new_file:
        json.dump(my_json, new_file)

    ## store the result in the database
    ts = time.time()

    with open(results_filepath, 'rb') as stream:
        db.results_file[id_data] = dict(status = "ready",
                                     run_date = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S'),
                                     data_file = stream
                                    )
        db.commit()

    res = {"message": "[%s] c%s: IgReC - %s" % (id_data, id_config, out_folder)}
    log.info(res)

    for row in db(db.sample_set_membership.sequence_file_id==id_file).select():
        sample_set_id = row.sample_set_id
        print(row.sample_set_id)
        run_fuse(id_file, id_config, id_data, sample_set_id, clean_before = False)

    return "SUCCESS"

393
def run_mixcr(id_file, id_config, id_data, clean_before=False, clean_after=False):
    """Run MiXCR (align, assemble, exportClones) and store the result.

    id_file   -- id of the sequence_file to analyse
    id_config -- id of the config; its ``command`` must hold three
                 '|'-separated argument groups: align | assemble | exportClones
    id_data   -- id of the results_file record to fill
    clean_before, clean_after -- accepted but not used by this function

    The exported file is completed with the MiXCR reports, the original file
    name, the command line and the total number of reads, then stored and
    fused into every sample set containing the file.

    Returns "SUCCESS"; re-raises any error met while running MiXCR or opening
    its result file.
    """
    from subprocess import Popen, PIPE, STDOUT, os
    import time
    import json
    import shutil

    upload_folder = defs.DIR_SEQUENCES
    out_folder = defs.DIR_OUT_VIDJIL_ID % id_data

    # start from an empty output folder
    shutil.rmtree(out_folder, ignore_errors=True)

    ## path of the sequence file
    row = db(db.sequence_file.id==id_file).select()
    original_name = row[0].data_file
    output_filename = defs.BASENAME_OUT_VIDJIL_ID % id_data
    seq_file = upload_folder + original_name

    ## MiXCR arguments from the config
    arg_cmd = db.config[id_config].command

    os.makedirs(out_folder)
    # NOTE(review): these paths are built without a '/' separator, so
    # out_folder presumably ends with one -- confirm against defs.
    out_log = out_folder + output_filename+'.mixcr.log'
    report = out_folder + output_filename + '.mixcr.report'

    out_alignments = out_folder + output_filename + '.align.vdjca'
    out_clones =  out_folder + output_filename + '.clones.clns'
    out_results_file = output_filename + '.mixcr'
    out_results = out_folder + out_results_file

    align_report = report + '.aln'
    assembly_report = report + '.asmbl'

    args_1, args_2, args_3 = '', '', ''
    try:
        args_1, args_2, args_3 = arg_cmd.split('|')
    except ValueError:
        # wrong number of groups: reported, then MiXCR runs without extra arguments
        print(arg_cmd)
        print("! Bad arguments, we expect args_align | args_assemble | args_exportClones")

    ## complete command
    try:
        mixcr = defs.DIR_MIXCR + 'mixcr'
        cmd = mixcr + ' align --save-reads -t 1 -r ' + align_report + ' ' + args_1 + ' ' + seq_file  + ' ' + out_alignments
        cmd += ' && '
        cmd += mixcr + ' assemble -t 1 -r ' + assembly_report + ' ' + args_2 + ' ' + out_alignments + ' ' + out_clones
        cmd += ' && rm ' + out_alignments
        cmd += ' && '
        cmd += mixcr + ' exportClones --format vidjil -germline -id -name -reads -sequence -top -seg -s ' + args_3 + ' ' + out_clones + ' ' + out_results

        ## run MiXCR
        print("=== Launching MiXCR ===")
        print(cmd)
        print("========================")
        sys.stdout.flush()

        with open(out_log, 'w') as log_file:
            p = Popen(cmd, shell=True, stdin=PIPE, stdout=log_file, stderr=STDOUT, close_fds=True)
            p.wait()

        print("Output log in " + out_log)
        sys.stdout.flush()

        ## Get result file
        print("===>", out_results)
        results_filepath = os.path.abspath(out_results)
        # opened only to fail here when MiXCR produced no result file
        open(results_filepath, 'rb').close()
    except Exception:
        print("!!! MiXCR failed, no result file")
        res = {"message": "[%s] c%s: MiXCR FAILED - %s" % (id_data, id_config, out_folder)}
        log.error(res)
        raise

    align_log = get_file_content(align_report)
    assembly_log = get_file_content(assembly_report)
    reports = align_log + assembly_log
    totalReads = extract_total_reads(assembly_log)

    with open(results_filepath, 'r') as json_file:
        my_json = json.load(json_file)
    fill_field(my_json, reports, "log", "samples", True)
    fill_field(my_json, original_name, "original_names", "samples")
    fill_field(my_json, cmd, "commandline", "samples")
    fill_field(my_json, totalReads, "total", "reads")

    # the file is reopened for writing rather than updated in place
    # (the original code did so to avoid a "bad file descriptor" error)
    with open(results_filepath, 'w') as new_file:
        json.dump(my_json, new_file)

    ## store the result in the database
    ts = time.time()

    with open(results_filepath, 'rb') as stream:
        db.results_file[id_data] = dict(status = "ready",
                                     run_date = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S'),
                                     data_file = stream
                                    )
        db.commit()

    res = {"message": "[%s] c%s: MiXCR finished - %s" % (id_data, id_config, out_folder)}
    log.info(res)

    for row in db(db.sample_set_membership.sequence_file_id==id_file).select():
        sample_set_id = row.sample_set_id
        print(row.sample_set_id)
        run_fuse(id_file, id_config, id_data, sample_set_id, clean_before = False)

    return "SUCCESS"
507

508
def run_copy(id_file, id_config, id_data, clean_before=False, clean_after=False):
    """Store the uploaded sequence file itself as the result, then fuse.

    id_file     -- id of the sequence_file whose data file is copied
    id_config   -- id of the config (used for messages and for the fuse)
    id_data     -- id of the results_file record to fill
    clean_before -- not used: the output folder is always recreated
    clean_after -- remove the output folder once the result is stored

    Returns "SUCCESS". Re-raises the IOError met when the uploaded file
    cannot be opened.
    """
    import os
    import shutil

    ## paths to the sequence files and to the output folder
    output_filename = defs.BASENAME_OUT_VIDJIL_ID % id_data
    out_folder = defs.DIR_OUT_VIDJIL_ID % id_data

    # start from an empty output folder
    shutil.rmtree(out_folder, ignore_errors=True)

    ## path of the sequence file
    row = db(db.sequence_file.id==id_file).select()
    filename = row[0].data_file

    os.makedirs(out_folder)
    out_log = out_folder+'/'+output_filename+'.vidjil.log'
    # nothing is written to this log: it is created empty and closed at once
    open(out_log, 'w').close()

    print("Output log in "+out_log)
    sys.stdout.flush()
    db.commit()

    ## fetch the file
    results_filepath = os.path.abspath(defs.DIR_SEQUENCES+filename)

    try:
        stream = open(results_filepath, 'rb')
    except IOError:
        print("!!! 'copy' failed, no file")
        res = {"message": "[%s] c%s: 'copy' FAILED - %s" % (id_data, id_config, out_folder)}
        log.error(res)
        # keep the original error (errno, file name) for the scheduler traceback
        raise

    ## store the result in the database
    ts = time.time()

    try:
        db.results_file[id_data] = dict(status = "ready",
                                     run_date = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S'),
                                     data_file = db.results_file.data_file.store(stream, row[0].filename)
                                    )
        db.commit()
    finally:
        stream.close()

    if clean_after:
        shutil.rmtree(out_folder, ignore_errors=True)

    ## TODO parse result success/fail

    res = {"message": "[%s] c%s: 'copy' finished - %s" % (id_data, id_config, filename)}
    log.info(res)

    for row in db(db.sample_set_membership.sequence_file_id==id_file).select():
        sample_set_id = row.sample_set_id
        print(row.sample_set_id)
        run_fuse(id_file, id_config, id_data, sample_set_id, clean_before = False)

    return "SUCCESS"
Mathieu Giraud's avatar
Mathieu Giraud committed
570
571


572
573
574
575
def run_refuse(args):
    """Call run_fuse once for every entry of ``args``.

    Each entry is indexed from 0 to 4 and passed positionally to run_fuse,
    i.e. as (id_file, id_config, id_data, sample_set_id, clean_before).
    Always returns "SUCCESS".
    """
    for fuse_args in args:
        id_file = fuse_args[0]
        id_config = fuse_args[1]
        id_data = fuse_args[2]
        sample_set_id = fuse_args[3]
        clean_before = fuse_args[4]
        run_fuse(id_file, id_config, id_data, sample_set_id, clean_before)
    return "SUCCESS"
Mathieu Giraud's avatar
Mathieu Giraud committed
576

577
def run_fuse(id_file, id_config, id_data, sample_set_id, clean_before=True, clean_after=False):
    """Fuse the latest results of a sample set with fuse.py and store the output.

    id_file       -- not used by this function
    id_config     -- id of the config (selects the results and the fuse options)
    id_data       -- id used to name the output folder and files
    sample_set_id -- id of the sample set whose results are fused
    clean_before  -- recreate the output folder before running
    clean_after   -- remove the output folder once the fused file is stored

    For each sequence file of the sample set, only the most recent result
    obtained with ``id_config`` is given to fuse.py. The fused_file record
    for (config, sample set) is updated, or created when missing.

    Returns "SUCCESS"; re-raises any error met while running fuse.py or
    opening its output.
    """
    from subprocess import Popen, PIPE, STDOUT, os
    import shutil

    out_folder = defs.DIR_OUT_VIDJIL_ID % id_data
    output_filename = defs.BASENAME_OUT_VIDJIL_ID % id_data + '-%s' % sample_set_id

    if clean_before:
        shutil.rmtree(out_folder, ignore_errors=True)
        os.makedirs(out_folder)

    fuse_log = out_folder+'/'+output_filename+'.fuse.log'

    ## fuse.py
    output_file = out_folder+'/'+output_filename+'.fused'
    query2 = db( ( db.results_file.sequence_file_id == db.sequence_file.id )
                   & ( db.sample_set_membership.sequence_file_id == db.sequence_file.id)
                   & ( db.sample_set_membership.sample_set_id == sample_set_id)
                   & ( db.results_file.config_id == id_config )
                   ).select( orderby=db.sequence_file.id|~db.results_file.run_date)

    # rows are ordered by sequence file, then by decreasing run date:
    # keep only the first (most recent) result of each sequence file
    query = []
    sequence_file_id = 0
    for row in query2:
        if row.sequence_file.id != sequence_file_id:
            query.append(row)
            sequence_file_id = row.sequence_file.id

    with_data = [row for row in query if row.results_file.data_file is not None]
    # every path is followed by a space, every id by an underscore
    files = "".join(defs.DIR_RESULTS + row.results_file.data_file + " " for row in with_data)
    sequence_file_list = "".join(str(row.results_file.sequence_file_id) + "_" for row in with_data)

    try:
        fuse_cmd = db.config[id_config].fuse_command
        cmd = "python "+defs.DIR_FUSE+"/fuse.py -o "+ output_file + " " + fuse_cmd + " " + files

        print("=== fuse.py ===")
        print(cmd)
        print("===============")
        sys.stdout.flush()

        with open(fuse_log, 'w') as fuse_log_file:
            p = Popen(cmd, shell=True, stdin=PIPE, stdout=fuse_log_file, stderr=STDOUT, close_fds=True)
            p.communicate()
        print("Output log in "+fuse_log)

        fuse_filepath = os.path.abspath(output_file)

        stream = open(fuse_filepath, 'rb')
    except Exception:
        print("!!! Fuse failed, no .fused file")
        res = {"message": "[%s] c%s: 'fuse' FAILED - %s" % (id_data, id_config, output_file)}
        log.error(res)
        raise

    try:
        ts = time.time()

        fused_files = db( ( db.fused_file.config_id == id_config ) &
                         ( db.fused_file.sample_set_id == sample_set_id )
                     ).select()
        existing_fused_file = None
        if len(fused_files) > 0:
            fused_file = fused_files[0]
            id_fuse = fused_file.id
            existing_fused_file = fused_file.fused_file
        else:
            id_fuse = db.fused_file.insert(sample_set_id = sample_set_id,
                                           config_id = id_config)

        db.fused_file[id_fuse] = dict(fuse_date = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S'),
                                     fused_file = stream,
                                     sequence_file_list = sequence_file_list)
        db.commit()
    finally:
        stream.close()

    if clean_after:
        shutil.rmtree(out_folder, ignore_errors=True)

        # remove previous fused_file if it exists
        # NOTE(review): the path is looked up inside out_folder, which has just
        # been removed, so this looks like a no-op -- the previous upload is
        # presumably stored elsewhere; confirm the intended location.
        if existing_fused_file is not None:
            shutil.rmtree("%s/%s" % (out_folder, existing_fused_file), ignore_errors=True)

    res = {"message": "[%s] c%s: 'fuse' finished - %s" % (id_data, id_config, db.fused_file[id_fuse].fused_file)}
    log.info(res)

    # Remove temporary fused file (already gone when clean_after removed the folder)
    if os.path.exists(output_file):
        os.remove(output_file)

    return "SUCCESS"
672

673
def custom_fuse(file_list):
    """Fuse an arbitrary list of results through the fuse server.

    file_list -- ids of results_file records; records without data file are
                 left out of the fuse

    The fuse.py command is sent to the XML-RPC server at
    ``defs.FUSE_SERVER:defs.PORT_FUSE_SERVER``; the file it returns is parsed
    as JSON and the temporary folder is removed.

    Returns the parsed content of the fused file.
    Raises IOError when no fuse server port is configured; re-raises any
    error met while calling the server or reading its output.
    """
    import os
    import shutil

    if defs.PORT_FUSE_SERVER is None:
        raise IOError('This server cannot fuse custom data')

    # a random id names the temporary folder and files
    random_id = random.randint(99999999,99999999999)
    out_folder = os.path.abspath(defs.DIR_OUT_VIDJIL_ID % random_id)
    output_filename = defs.BASENAME_OUT_VIDJIL_ID % random_id

    shutil.rmtree(out_folder, ignore_errors=True)
    os.makedirs(out_folder)

    # ids are formatted one by one so that integer ids are accepted too
    res = {"message": "'custom fuse' (%d files): %s" % (len(file_list), ','.join("%s" % file_id for file_id in file_list))}
    log.info(res)

    ## fuse.py
    output_file = out_folder+'/'+output_filename+'.fused'
    files = ""
    for file_id in file_list:
        data_file = db.results_file[file_id].data_file
        if data_file is not None:
            files += os.path.abspath(defs.DIR_RESULTS + data_file) + " "

    try:
        cmd = "python "+ os.path.abspath(defs.DIR_FUSE) +"/fuse.py -o "+output_file+" -t 100 "+files
        proc_srvr = xmlrpclib.ServerProxy("http://%s:%d" % (defs.FUSE_SERVER, defs.PORT_FUSE_SERVER))
        fuse_filepath = proc_srvr.fuse(cmd, out_folder, output_filename)

        with open(fuse_filepath, 'rb') as f:
            data = gluon.contrib.simplejson.loads(f.read())
    except Exception:
        # the temporary folder is left in place in this case
        res = {"message": "'custom fuse' -> IOError"}
        log.error(res)
        raise

    shutil.rmtree(out_folder, ignore_errors=True)

    res = {"message": "'custom fuse' -> finished"}
    log.info(res)

    return data
717

HERBERT Ryan's avatar
HERBERT Ryan committed
718
#TODO move this ?
719
720
721
722
723
724
def get_file_content(filename):
    '''
    Return the whole content of `filename`, read in binary mode.
    '''
    with open(filename, 'rb') as handle:
        return handle.read()

HERBERT Ryan's avatar
HERBERT Ryan committed
725
726
727
728
729
730
731
732
733
734
735
736
737
def fill_field(dest, value, field, parent="", append=False):
    '''
    Store `value` as the first element of the list kept under dest[field].

    dest   -- mapping to update in place
    value  -- value to store
    field  -- key under which the one-element list is kept
    parent -- when neither None nor "", the update is done in dest[parent]
              instead of dest
    append -- when True and the field already exists, `value` is added
              (+=) to the current first element instead of replacing it

    Returns nothing.
    '''
    target = dest if (parent is None or parent == "") else dest[parent]

    if field not in target:
        # First value for this field: create the list
        target[field] = [value]
        return

    if append:
        target[field][0] += value
    else:
        target[field][0] = value

HERBERT Ryan's avatar
HERBERT Ryan committed
738
739
740
741
def extract_total_reads(report):
    '''
    Return, as a string of digits, the number found in the first
    "Total Reads analysed: <number>" occurrence in `report`.

    Raises AttributeError when `report` contains no such occurrence.
    '''
    found = re.search("Total Reads analysed: [0-9]+", report)
    # The number is the last space-separated token of the matched text
    return found.group().rsplit(' ', 1)[-1]
742
743
744
745
746
747
748
749
750
751
752
753
754
755

def schedule_pre_process(sequence_file_id, pre_process_id):
    '''
    Queue a "pre_process" scheduler task for a sequence file.

    sequence_file_id -- id of the db.sequence_file row to pre-process
    pre_process_id   -- id of the pre-process to apply

    Returns a dict with a "message" key. When a task with the same
    arguments is already pending, that dict is the error reported by
    assert_scheduler_task_does_not_exist() and nothing is queued.
    Otherwise the dict also holds "redirect": "reload".
    '''
    args = [pre_process_id, sequence_file_id]

    # Do not queue the same pre-process twice for the same file
    err = assert_scheduler_task_does_not_exist(str(args))
    if err:
        log.error(err)
        return err

    task = scheduler.queue_task("pre_process", args,
                                repeats = 1, timeout = defs.TASK_TIMEOUT)
    # Remember which scheduler task handles this sequence file
    db.sequence_file[sequence_file_id] = dict(pre_process_scheduler_task_id = task.id)

    res = {"redirect": "reload",
           "message": "{%s} (%s): process requested" % (sequence_file_id, pre_process_id)}

    log.info(res)
    return res


765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
def get_preprocessed_filename(filename1, filename2):
    '''
    Get the same extension and then get the most common among them

    When filename2 is empty or None, filename1 is returned unchanged.
    When the two names (extension removed) share no common substring,
    they are joined with '_' instead.

    >>> get_preprocessed_filename('lsdkj_toto-tata_mlkmfsdlkf.fastq.gz', 'rete_toto-tata_eekjdf.fastq.gz')
    '_toto-tata_.fastq.gz'
    '''
    if not filename2:
        return filename1
    extension = tools_utils.get_common_suffpref([filename1, filename2], min(len(filename1), len(filename2)), -1)
    # Use an explicit end index: with an empty extension, x[0:-0] is x[0:0],
    # which would drop the whole name instead of keeping it.
    without_extension = [x[:len(x) - len(extension)] for x in [filename1, filename2]]
    common = tools_utils.common_substring(without_extension)
    if len(common) == 0:
        common = '_'.join(without_extension)
    return common+extension


782
def run_pre_process(pre_process_id, sequence_file_id, clean_before=True, clean_after=False):
    '''
    Run a pre-process on sequence_file.data_file (and possibly sequence_file.data_file2),
    put the output back in sequence_file.data_file.

    pre_process_id   -- id of the db.pre_process row holding the command template
    sequence_file_id -- id of the db.sequence_file row to process
    clean_before     -- when True, wipe and recreate the output folder first
    clean_after      -- when True, remove the output folder once finished

    The pre_process_flag of the sequence file is set to "RUN" at start, then
    to "DONE" on success or "FAILED" on error. Scheduler tasks in the "STOPPED"
    state that are linked to this sequence file are put back to "QUEUED".

    Returns "SUCCESS". Any error raised while running the command or opening
    its result file is logged and re-raised.
    '''
    import shutil
    from subprocess import Popen, PIPE

    sequence_file = db.sequence_file[sequence_file_id]
    db.sequence_file[sequence_file_id] = dict(pre_process_flag = "RUN")
    db.commit()

    out_folder = defs.DIR_PRE_VIDJIL_ID % sequence_file_id
    output_filename = get_preprocessed_filename(get_original_filename(sequence_file.data_file),
                                                get_original_filename(sequence_file.data_file2))

    if clean_before:
        shutil.rmtree(out_folder, ignore_errors=True)
        os.makedirs(out_folder)

    output_file = out_folder+'/'+output_filename

    pre_process = db.pre_process[pre_process_id]

    try:
        # Fill the placeholders of the command template
        cmd = pre_process.command.replace( "&file1&", defs.DIR_SEQUENCES + sequence_file.data_file)
        if sequence_file.data_file2:
            cmd = cmd.replace( "&file2&", defs.DIR_SEQUENCES + sequence_file.data_file2)
        cmd = cmd.replace( "&result&", output_file)
        cmd = cmd.replace("&pear&", defs.DIR_PEAR)

        print("=== Pre-process %s ===" % pre_process_id)
        print(cmd)
        print("===============")
        sys.stdout.flush()

        out_log = out_folder+'/'+output_filename+'.pre.log'
        # 'with' closes the log file even when the command cannot be started
        with open(out_log, 'w') as log_file:
            # The command runs from DIR_FUSE; the working directory of this
            # process is changed for good, as it always was.
            os.chdir(defs.DIR_FUSE)
            p = Popen(cmd, shell=True, stdin=PIPE, stdout=log_file, stderr=log_file, close_fds=True)
            p.communicate()
        print("Output log in " + out_log)

        filepath = os.path.abspath(output_file)

        # Fails when the pre-process did not produce its result file
        stream = open(filepath, 'rb')
    except:
        # Bare except is intentional: whatever interrupts the pre-process,
        # the flag must not stay on "RUN". The error is re-raised.
        print("!!! Pre-process failed, no result file")
        res = {"message": "{%s} p%s: 'pre_process' FAILED - %s" % (sequence_file_id, pre_process_id, output_file)}
        log.error(res)
        db.sequence_file[sequence_file_id] = dict(pre_process_flag = "FAILED")
        db.commit()
        raise

    # Now we update the sequence file with the result of the pre-process
    # We forget the initial data_file (and possibly data_file2)
    try:
        db.sequence_file[sequence_file_id] = dict(data_file = stream,
                                                  data_file2 = None,
                                                  pre_process_flag = "DONE")
        db.commit()
    finally:
        # The result file is deleted below, so the stream is no longer needed
        stream.close()

    #resume STOPPED task for this sequence file
    stopped_task = db(
        (db.results_file.sequence_file_id == sequence_file_id)
        & (db.results_file.scheduler_task_id == db.scheduler_task.id)
        & (db.scheduler_task.status == "STOPPED")
    ).select()

    for row in stopped_task :
        db.scheduler_task[row.scheduler_task.id] = dict(status = "QUEUED")

    db.commit()

    # Dump log in scheduler_run.run_output
    with open(out_log) as dumped_log:
        for l in dumped_log:
            print(l, end=' ')

    # Remove data file from disk to save space (it is now saved elsewhere)
    os.remove(filepath)

    if clean_after:
        shutil.rmtree(out_folder, ignore_errors=True)

    res = {"message": "{%s} p%s: 'pre_process' finished - %s" % (sequence_file_id, pre_process_id, output_file)}
    log.info(res)

    return "SUCCESS"
    
HERBERT Ryan's avatar
HERBERT Ryan committed
878
    
879
from gluon.scheduler import Scheduler

# Scheduler instance: each task name is mapped to the function that runs it
scheduler = Scheduler(
    db,
    {
        "vidjil": run_vidjil,
        "compute_contamination": compute_contamination,
        "mixcr": run_mixcr,
        "igrec": run_igrec,
        "none": run_copy,
        "pre_process": run_pre_process,
        "refuse": run_refuse,
    },
    heartbeat=defs.SCHEDULER_HEARTBEAT,
)