task.py 28.9 KB
Newer Older
1
# coding: utf8
Mathieu Giraud's avatar
Mathieu Giraud committed
2
3
from __future__ import print_function

4
import json
5
import os
6
import defs
7
import re
8
import os.path
9
10
11
import time
import sys
import datetime
12
13
import random
import xmlrpclib
14
import tools_utils
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30

def assert_scheduler_task_does_not_exist(args):
    """
    Look for a scheduler task registered with the same arguments that is not finished.

    A task is seen as finished when its status is FAILED, EXPIRED, TIMEOUT or COMPLETED.

    args: the task arguments, as stored in scheduler_task.args

    Returns a dict with an error "message" when such a task exists, None otherwise.
    """
    finished_statuses = ("FAILED", "EXPIRED", "TIMEOUT", "COMPLETED")

    # Same arguments, and none of the finished statuses
    condition = (db.scheduler_task.args == args)
    for status in finished_statuses:
        condition = condition & (db.scheduler_task.status != status)

    pending_tasks = db(condition).select()
    if len(pending_tasks) == 0:
        return None

    return {"message": "task already registered"}

31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
def compute_contamination(sequence_file_id, results_file_id, config_id):
    """
    Count, for each sample, the clones that are far more abundant in another sample of the same run.

    The three arguments are parallel lists: the sample `sequence_file_id[i]` has the
    results file `results_file_id[i]`, and is compared to results obtained with `config_id[i]`.

    A clone of the sample is a hit when another sample of the same "run" sample set
    holds the same clone id with more than ten times as many reads. For each other
    sequence file, only the results file with the latest run_date is used.

    Returns a list with one dict per sample:
      "total_clones": number of hits, summed over the other samples
      "total_reads": reads of the sample involved in these hits
      "sample": {other results_file id: {"clones": hits, "reads": reads of the sample}}
    """
    result = []
    for i, seq_id in enumerate(sequence_file_id):
        summary = {"total_clones": 0, "total_reads": 0, "sample": {}}
        result.append(summary)

        # Reads of each clone of the sample under scrutiny
        with open(defs.DIR_RESULTS + db.results_file[results_file_id[i]].data_file, "r") as json_data:
            d = json.load(json_data)
        list1 = {}
        for clone in d["clones"]:
            list1[clone["id"]] = clone["reads"][0]

        # Find the run this sample belongs to
        sample_set_run = db( (db.sample_set_membership.sequence_file_id == seq_id)
                           & (db.sample_set_membership.sample_set_id == db.sample_set.id)
                           & (db.sample_set.sample_type == "run") ).select().first()
        if sample_set_run is None:
            continue

        # Results of the other samples of this run, latest first for each sequence file
        sample_set_id = sample_set_run.sample_set.id
        query = db(  ( db.sample_set.id == sample_set_id )
                   & ( db.sample_set.id == db.sample_set_membership.sample_set_id )
                   & ( db.sequence_file.id == db.sample_set_membership.sequence_file_id)
                   & ( db.sequence_file.id != seq_id)
                   & ( db.results_file.sequence_file_id == db.sequence_file.id )
                   & ( db.results_file.config_id == config_id[i]  )
                   ).select(db.sequence_file.ALL, db.results_file.ALL, db.sample_set.id,
                            orderby=db.sequence_file.id|~db.results_file.run_date)

        # Keep only the first (latest) row of each sequence file
        query2 = []
        sfi = 0
        for row in query:
            if row.sequence_file.id != sfi:
                query2.append(row)
                sfi = row.sequence_file.id

        for row in query2:
            other_path = defs.DIR_RESULTS + row.results_file.data_file
            print(other_path)

            other = {"clones": 0, "reads": 0}
            summary["sample"][row.results_file.id] = other

            with open(other_path, "r") as json_data2:
                try:
                    d2 = json.load(json_data2)
                except ValueError:
                    # Unreadable results file: keep the zeroed counters for this sample
                    print('invalid_json')
                    continue

            for clone in d2["clones"]:
                if clone["id"] not in list1:
                    continue
                own_reads = list1[clone["id"]]
                if clone["reads"][0] > 10 * own_reads:
                    summary["total_clones"] += 1
                    summary["total_reads"] += own_reads
                    other["clones"] += 1
                    other["reads"] += own_reads

    return result
    
94

95
def schedule_run(id_sequence, id_config, grep_reads=None):
    """
    Create a results_file for (id_sequence, id_config) and queue the task that will fill it.

    The queued task is the program of the config, called with
    [id_sequence, id_config, id of the new results_file, grep_reads].
    When the pre-process of the sequence file is waiting or running,
    the task is set to STOPPED right after being queued.

    Returns a dict with a "message" (and "redirect" on success).
    """
    ts = time.time()
    data_id = db.results_file.insert(sequence_file_id = id_sequence,
                                     run_date = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S'),
                                     config_id = id_config )

    args = [id_sequence, id_config, data_id, grep_reads]

    # NOTE(review): args holds the id of the results_file inserted just above,
    # so this check looks unable to match a task queued earlier, and the new
    # results_file is kept when it does return an error. To be confirmed.
    err = assert_scheduler_task_does_not_exist(str(args))
    if err:
        log.error(err)
        return err

    program = db.config[id_config].program

    ## add task to scheduler
    task = scheduler.queue_task(program, args,
                                repeats = 1, timeout = defs.TASK_TIMEOUT)

    sequence_file = db.sequence_file[id_sequence]
    if sequence_file.pre_process_flag in ("WAIT", "RUN"):
        db.scheduler_task[task.id] = dict(status ="STOPPED")

    db.results_file[data_id] = dict(scheduler_task_id = task.id)

    filename = sequence_file.filename

    res = {"redirect": "reload",
           "message": "[%s] c%s: process requested - %s %s" % (data_id, id_config, grep_reads, filename)}

    log.info(res)
    return res

133
134
135
136
137
138
139
140
141
142
143
144
def schedule_fuse(sample_set_ids, config_ids):
    """
    Queue one 'refuse' task for every (sample set, config) pair that has a results file.

    For each pair, the first matching row gives the arguments of one fuse:
    [sequence_file id, config id, results_file id, sample_set id, False].
    Nothing is queued when no pair has any results file.
    """
    membership = db.sample_set_membership
    results = db.results_file

    fuse_jobs = []
    for sample_set_id in sample_set_ids:
        for config_id in config_ids:
            found = db((membership.sample_set_id == sample_set_id)
                     & (membership.sequence_file_id == results.sequence_file_id)
                     & (results.config_id == config_id)
                  ).select(membership.sample_set_id, membership.sequence_file_id,
                           results.id, results.config_id).first()
            if not found:
                continue
            fuse_jobs.append([found.sample_set_membership.sequence_file_id,
                              found.results_file.config_id,
                              found.results_file.id,
                              found.sample_set_membership.sample_set_id,
                              False])

    if len(fuse_jobs) == 0:
        return

    scheduler.queue_task('refuse', [fuse_jobs],
                         repeats = 1, timeout = defs.TASK_TIMEOUT)
148

149
def run_vidjil(id_file, id_config, id_data, grep_reads,
               clean_before=False, clean_after=False):
    """
    Run vidjil on a sequence file and store its output in the results_file `id_data`.

    When the pre-process of the sequence file is waiting or running, nothing is run:
    the same task is queued again, to start 1200 seconds later.

    id_file: id of the sequence_file to process
    id_config: id of the config holding the command line options
    id_data: id of the results_file to fill
    grep_reads: when set, given to vidjil with '-FaW'; the stored result is then
                'seq/clone.fa-1' instead of the '.vidjil' file, and no fuse is run
    clean_before: not used, the output folder is always removed before the run
    clean_after: remove the output folder once the result is stored

    Returns "SUCCESS".
    Raises ValueError when the pre-process has failed. Any exception raised while
    running vidjil or opening its result file is logged, then raised again.
    """
    from subprocess import Popen, PIPE, STDOUT, os
    from datetime import timedelta as timed

    pre_process_flag = db.sequence_file[id_file].pre_process_flag

    ## re-schedule if the pre-process is still pending
    if pre_process_flag in ("WAIT", "RUN"):
        print("Pre-process is still pending, re-schedule")

        args = [id_file, id_config, id_data, grep_reads]
        task = scheduler.queue_task("vidjil", args,
                                    repeats = 1, timeout = defs.TASK_TIMEOUT,
                                    start_time=request.now + timed(seconds=1200))
        db.results_file[id_data] = dict(scheduler_task_id = task.id)
        db.commit()
        print(task.id)
        sys.stdout.flush()

        return "SUCCESS"

    if pre_process_flag == "FAILED":
        print("Pre-process has failed")
        raise ValueError('pre-process has failed')

    ## paths to the sequence files and to the output folder
    upload_folder = defs.DIR_SEQUENCES
    out_folder = defs.DIR_OUT_VIDJIL_ID % id_data

    rm_cmd = "rm -rf "+out_folder
    p = Popen(rm_cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
    p.wait()

    ## path of the sequence file
    row = db(db.sequence_file.id==id_file).select()
    filename = row[0].data_file
    output_filename = defs.BASENAME_OUT_VIDJIL_ID % id_data
    seq_file = upload_folder+filename

    ## vidjil options: a 'next' in the command selects the other binary and germlines
    vidjil_cmd = db.config[id_config].command

    if 'next' in vidjil_cmd:
        vidjil_cmd = vidjil_cmd.replace('next', '')
        vidjil_cmd = vidjil_cmd.replace(' germline' , defs.DIR_GERMLINE_NEXT)
        cmd = defs.DIR_VIDJIL_NEXT + '/vidjil-algo '
    else:
        vidjil_cmd = vidjil_cmd.replace(' germline' , defs.DIR_GERMLINE)
        cmd = defs.DIR_VIDJIL + '/vidjil '

    if grep_reads:
        # TODO: security, assert grep_reads XXXX
        # (the value is inserted as is in a command run through a shell)
        vidjil_cmd += ' -FaW "%s" ' % grep_reads

    os.makedirs(out_folder)
    out_log = out_folder+'/'+output_filename+'.vidjil.log'
    vidjil_log_file = open(out_log, 'w')

    try:
        ## full command
        cmd += ' -o  ' + out_folder + " -b " + output_filename
        cmd += ' ' + vidjil_cmd + ' '+ seq_file

        ## run vidjil
        print("=== Launching Vidjil ===")
        print(cmd)
        print("========================")
        sys.stdout.flush()

        p = Popen(cmd, shell=True, stdin=PIPE, stdout=vidjil_log_file, stderr=STDOUT, close_fds=True)
        p.communicate()

        print("Output log in " + out_log)
        sys.stdout.flush()
        db.commit()

        ## Get result file
        if grep_reads:
            out_results = out_folder + '/seq/clone.fa-1'
        else:
            out_results = out_folder + '/' + output_filename + '.vidjil'

        print("===>", out_results)
        results_filepath = os.path.abspath(out_results)

        # Fails here when vidjil did not produce its result file
        stream = open(results_filepath, 'rb')
    except Exception:
        print("!!! Vidjil failed, no result file")
        res = {"message": "[%s] c%s: Vidjil FAILED - %s" % (id_data, id_config, out_folder)}
        log.error(res)
        raise
    finally:
        # The log is closed on success as on failure
        vidjil_log_file.close()

    ## Parse some info in .log
    segmented = re.compile(r"==> segmented (\d+) reads \((\d*\.\d+|\d+)%\)")
    windows = re.compile(r"==> found (\d+) .*-windows in .* segments .* inside (\d+) sequences")
    info = ''
    with open(out_log) as log_lines:
        for l in log_lines:
            m = segmented.search(l)
            if m:
                print(l, end=' ')
                segs = int(m.group(1))
                ratio = m.group(2)
                info = "%d segmented (%s%%)" % (segs, ratio)
                continue
            m = windows.search(l)
            if m:
                print(l, end=' ')
                wins = int(m.group(1))
                reads = int(m.group(2))
                info = "%d reads, " % reads + info + ", %d windows" % wins
                break

    ## store the result in the database
    ts = time.time()
    try:
        db.results_file[id_data] = dict(status = "ready",
                                     run_date = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S'),
                                     data_file = stream
                                    )
    finally:
        stream.close()

    db.commit()

    if clean_after:
        clean_cmd = "rm -rf " + out_folder
        p = Popen(clean_cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
        p.wait()

    ## TODO parse result success/fail

    res = {"message": "[%s] c%s: Vidjil finished - %s - %s" % (id_data, id_config, info, out_folder)}
    log.info(res)

    ## fuse again every sample set holding this sequence file
    if not grep_reads:
        for row in db(db.sample_set_membership.sequence_file_id==id_file).select() :
            sample_set_id = row.sample_set_id
            print(row.sample_set_id)
            run_fuse(id_file, id_config, id_data, sample_set_id, clean_before = False)

    return "SUCCESS"
301

302
def run_mixcr(id_file, id_config, id_data, clean_before=False, clean_after=False):
    """
    Run MiXCR (align, assemble, exportClones) on a sequence file and store the result in the results_file `id_data`.

    The command of the config is expected as 'args_align | args_assemble | args_exportClones'.
    With another number of parts, a warning is printed and the three steps get no extra argument.

    The exported file is completed with the MiXCR reports, the original file name,
    the command line and the total number of reads before being stored.

    id_file: id of the sequence_file to process
    id_config: id of the config holding the command line options
    id_data: id of the results_file to fill
    clean_before: not used, the output folder is always removed before the run
    clean_after: not used

    Returns "SUCCESS". Any exception raised while running MiXCR or opening
    its result file is logged, then raised again.
    """
    from subprocess import Popen, PIPE, STDOUT, os
    import time
    import json

    upload_folder = defs.DIR_SEQUENCES
    out_folder = defs.DIR_OUT_VIDJIL_ID % id_data

    rm_cmd = "rm -rf "+out_folder
    p = Popen(rm_cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
    p.wait()

    ## path of the sequence file
    row = db(db.sequence_file.id==id_file).select()
    filename = row[0].data_file
    output_filename = defs.BASENAME_OUT_VIDJIL_ID % id_data
    seq_file = upload_folder+filename

    ## options from the config
    arg_cmd = db.config[id_config].command

    os.makedirs(out_folder)
    out_log = out_folder + output_filename+'.mixcr.log'
    report = out_folder + output_filename + '.mixcr.report'

    out_alignments = out_folder + output_filename + '.align.vdjca'
    out_clones =  out_folder + output_filename + '.clones.clns'
    out_results_file = output_filename + '.mixcr'
    out_results = out_folder + out_results_file

    align_report = report + '.aln'
    assembly_report = report + '.asmbl'

    arg_cmds = arg_cmd.split('|')
    args_1, args_2, args_3 = '', '', ''
    try:
        args_1, args_2, args_3 = arg_cmds
    except ValueError:
        # Not exactly three parts: go on without extra arguments
        print(arg_cmd)
        print("! Bad arguments, we expect args_align | args_assemble | args_exportClones")

    log_file = open(out_log, 'w')
    try:
        ## full command
        mixcr = defs.DIR_MIXCR + 'mixcr'
        cmd = mixcr + ' align --save-reads -t 1 -r ' + align_report + ' ' + args_1 + ' ' + seq_file  + ' ' + out_alignments
        cmd += ' && '
        cmd += mixcr + ' assemble -t 1 -r ' + assembly_report + ' ' + args_2 + ' ' + out_alignments + ' ' + out_clones
        cmd += ' && rm ' + out_alignments
        cmd += ' && '
        cmd += mixcr + ' exportClones --format vidjil -germline -id -name -reads -sequence -top -seg -s ' + args_3 + ' ' + out_clones + ' ' + out_results

        ## run MiXCR
        print("=== Launching MiXCR ===")
        print(cmd)
        print("========================")
        sys.stdout.flush()

        p = Popen(cmd, shell=True, stdin=PIPE, stdout=log_file, stderr=STDOUT, close_fds=True)
        p.wait()

        print("Output log in " + out_log)
        sys.stdout.flush()

        ## Get result file
        print("===>", out_results)
        results_filepath = os.path.abspath(out_results)

        # Opened only to fail here when MiXCR did not produce its result file
        with open(results_filepath, 'rb'):
            pass
    except Exception:
        print("!!! MiXCR failed, no result file")
        res = {"message": "[%s] c%s: MiXCR FAILED - %s" % (id_data, id_config, out_folder)}
        log.error(res)
        raise
    finally:
        log_file.close()

    align_report = get_file_content(align_report)
    assembly_report = get_file_content(assembly_report)
    reports = align_report + assembly_report
    original_name = row[0].data_file
    totalReads = extract_total_reads(assembly_report)

    ## complete the exported file, then write it back in place
    with open(results_filepath, 'r') as json_file:
        my_json = json.load(json_file)

    fill_field(my_json, reports, "log", "samples", True)
    fill_field(my_json, original_name, "original_names", "samples")
    fill_field(my_json, cmd, "commandline", "samples")
    fill_field(my_json, totalReads, "total", "reads")

    with open(results_filepath, 'w') as new_file:
        json.dump(my_json, new_file)

    ## store the result in the database
    ts = time.time()

    with open(results_filepath, 'rb') as stream:
        db.results_file[id_data] = dict(status = "ready",
                                     run_date = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S'),
                                     data_file = stream
                                    )

    db.commit()

    res = {"message": "[%s] c%s: MiXCR finished - %s" % (id_data, id_config, out_folder)}
    log.info(res)

    ## fuse again every sample set holding this sequence file
    for row in db(db.sample_set_membership.sequence_file_id==id_file).select() :
        sample_set_id = row.sample_set_id
        print(row.sample_set_id)
        run_fuse(id_file, id_config, id_data, sample_set_id, clean_before = False)

    return "SUCCESS"
416

417
def run_copy(id_file, id_config, id_data, clean_before=False, clean_after=False):
    """
    Store a copy of the sequence file itself as the result `id_data`, without running any program.

    id_file: id of the sequence_file to copy
    id_config: id of the config (used for the messages and for the fuse)
    id_data: id of the results_file to fill
    clean_before: not used, the output folder is always removed first
    clean_after: remove the output folder once the result is stored

    Returns "SUCCESS". Raises IOError when the sequence file cannot be opened.
    """
    from subprocess import Popen, PIPE, STDOUT, os

    output_filename = defs.BASENAME_OUT_VIDJIL_ID % id_data
    out_folder = defs.DIR_OUT_VIDJIL_ID % id_data

    rm_cmd = "rm -rf "+out_folder
    p = Popen(rm_cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
    p.wait()

    ## path of the sequence file
    row = db(db.sequence_file.id==id_file).select()
    filename = row[0].data_file

    os.makedirs(out_folder)
    out_log = out_folder+'/'+output_filename+'.vidjil.log'
    # The log file is created but nothing is written into it
    open(out_log, 'w').close()

    print("Output log in "+out_log)
    sys.stdout.flush()
    db.commit()

    ## get the file
    results_filepath = os.path.abspath(defs.DIR_SEQUENCES+row[0].data_file)

    try:
        stream = open(results_filepath, 'rb')
    except IOError:
        print("!!! 'copy' failed, no file")
        res = {"message": "[%s] c%s: 'copy' FAILED - %s" % (id_data, id_config, out_folder)}
        log.error(res)
        # Raise the original error again, keeping its errno and file name
        raise

    ## store the result in the database
    ts = time.time()

    try:
        db.results_file[id_data] = dict(status = "ready",
                                     run_date = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S'),
                                     data_file = db.results_file.data_file.store(stream, row[0].filename)
                                    )
    finally:
        stream.close()
    db.commit()

    if clean_after:
        clean_cmd = "rm -rf " + out_folder
        p = Popen(clean_cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
        p.wait()

    ## TODO parse result success/fail

    res = {"message": "[%s] c%s: 'copy' finished - %s" % (id_data, id_config, filename)}
    log.info(res)

    ## fuse again every sample set holding this sequence file
    for row in db(db.sample_set_membership.sequence_file_id==id_file).select() :
        sample_set_id = row.sample_set_id
        print(row.sample_set_id)
        run_fuse(id_file, id_config, id_data, sample_set_id, clean_before = False)

    return "SUCCESS"
Mathieu Giraud's avatar
Mathieu Giraud committed
479
480


481
482
483
484
def run_refuse(args):
    """
    Run one fuse for each entry of `args`.

    Each entry is indexable and gives, in this order:
    id_file, id_config, id_data, sample_set_id, clean_before.

    Returns "SUCCESS".
    """
    for job in args:
        id_file = job[0]
        id_config = job[1]
        id_data = job[2]
        sample_set_id = job[3]
        clean_before = job[4]
        run_fuse(id_file, id_config, id_data, sample_set_id, clean_before)

    return "SUCCESS"
Mathieu Giraud's avatar
Mathieu Giraud committed
485

486
def run_fuse(id_file, id_config, id_data, sample_set_id, clean_before=True, clean_after=False):
    """
    Run fuse.py on the results of a sample set for one config, and store the output in fused_file.

    For each sequence file of the sample set, only the results file with the
    latest run_date is given to fuse.py. The fused_file row of this
    (config, sample set) is updated, or created when there is none.

    id_file: not used
    id_config: id of the config, selecting the results files and giving the fuse options
    id_data: id of a results_file, used to name the output folder and files
    sample_set_id: id of the sample set to fuse
    clean_before: remove and create again the output folder before the run
    clean_after: remove the output folder once the fused file is stored

    Returns "SUCCESS". Any exception raised while running fuse.py or opening
    its output is logged, then raised again.
    """
    from subprocess import Popen, PIPE, STDOUT, os

    out_folder = defs.DIR_OUT_VIDJIL_ID % id_data
    output_filename = defs.BASENAME_OUT_VIDJIL_ID % id_data + '-%s' % sample_set_id

    if clean_before:
        rm_cmd = "rm -rf "+out_folder
        p = Popen(rm_cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
        p.wait()
        os.makedirs(out_folder)

    fuse_log = out_folder+'/'+output_filename+'.fuse.log'
    output_file = out_folder+'/'+output_filename+'.fused'

    ## results of the sample set for this config, latest first for each sequence file
    query2 = db( ( db.results_file.sequence_file_id == db.sequence_file.id )
                   & ( db.sample_set_membership.sequence_file_id == db.sequence_file.id)
                   & ( db.sample_set_membership.sample_set_id == sample_set_id)
                   & ( db.results_file.config_id == id_config )
                   ).select( orderby=db.sequence_file.id|~db.results_file.run_date)

    # Keep only the first (latest) row of each sequence file
    query = []
    sequence_file_id = 0
    for row in query2 :
        if row.sequence_file.id != sequence_file_id :
            query.append(row)
            sequence_file_id = row.sequence_file.id

    # Results without any data file are left out
    with_data = [row for row in query if row.results_file.data_file is not None]
    files = "".join(defs.DIR_RESULTS + row.results_file.data_file + " " for row in with_data)
    sequence_file_list = "".join(str(row.results_file.sequence_file_id) + "_" for row in with_data)

    fuse_log_file = open(fuse_log, 'w')
    try:
        ## fuse.py
        fuse_cmd = db.config[id_config].fuse_command
        cmd = "python "+defs.DIR_FUSE+"/fuse.py -o "+ output_file + " " + fuse_cmd + " " + files

        print("=== fuse.py ===")
        print(cmd)
        print("===============")
        sys.stdout.flush()

        p = Popen(cmd, shell=True, stdin=PIPE, stdout=fuse_log_file, stderr=STDOUT, close_fds=True)
        p.communicate()
        print("Output log in "+fuse_log)

        fuse_filepath = os.path.abspath(output_file)

        # Fails here when fuse.py did not produce its output
        stream = open(fuse_filepath, 'rb')
    except Exception:
        print("!!! Fuse failed, no .fused file")
        res = {"message": "[%s] c%s: 'fuse' FAILED - %s" % (id_data, id_config, output_file)}
        log.error(res)
        raise
    finally:
        fuse_log_file.close()

    ts = time.time()

    fused_files = db( ( db.fused_file.config_id == id_config ) &
                     ( db.fused_file.sample_set_id == sample_set_id )
                 ).select()

    existing_fused_file = None
    if len(fused_files) > 0:
        fused_file = fused_files[0]
        id_fuse = fused_file.id
        existing_fused_file = fused_file.fused_file
    else:
        id_fuse = db.fused_file.insert(sample_set_id = sample_set_id,
                                       config_id = id_config)

    try:
        db.fused_file[id_fuse] = dict(fuse_date = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S'),
                                     fused_file = stream,
                                     sequence_file_list = sequence_file_list)
    finally:
        stream.close()
    db.commit()

    if clean_after:
        clean_cmd = "rm -rf " + out_folder
        p = Popen(clean_cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
        p.wait()

        # remove previous fused_file if it exists
        # NOTE(review): the path is built in out_folder, which was removed just above - confirm the intended folder
        if existing_fused_file is not None:
            clean_cmd = "rm -rf %s/%s" % (out_folder, existing_fused_file)
            p = Popen(clean_cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
            p.wait()

    res = {"message": "[%s] c%s: 'fuse' finished - %s" % (id_data, id_config, db.fused_file[id_fuse].fused_file)}
    log.info(res)

    # Remove temporary fused file (it is already gone when clean_after removed the folder)
    if os.path.exists(output_file):
        os.remove(output_file)

    return "SUCCESS"
581

582
def custom_fuse(file_list):
    """
    Fuse an arbitrary list of results files through the fuse server, and return the fused data.

    The fuse.py command is sent to the XML-RPC server at
    defs.FUSE_SERVER:defs.PORT_FUSE_SERVER. The path it returns is read,
    and the temporary output folder is then removed.

    file_list: ids of results_file; those without any data file are left out

    Returns the content of the fused file, decoded from JSON.
    Raises IOError when defs.PORT_FUSE_SERVER is None. Any exception raised while
    calling the server or reading its output is logged, then raised again.
    """
    from subprocess import Popen, PIPE, STDOUT, os

    if defs.PORT_FUSE_SERVER is None:
        raise IOError('This server cannot fuse custom data')

    # The output folder is named after a random id
    random_id = random.randint(99999999,99999999999)
    out_folder = os.path.abspath(defs.DIR_OUT_VIDJIL_ID % random_id)
    output_filename = defs.BASENAME_OUT_VIDJIL_ID % random_id

    rm_cmd = "rm -rf "+out_folder
    p = Popen(rm_cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
    p.wait()
    os.makedirs(out_folder)

    res = {"message": "'custom fuse' (%d files): %s" % (len(file_list), ','.join(file_list))}
    log.info(res)

    ## fuse.py
    output_file = out_folder+'/'+output_filename+'.fused'
    paths = []
    for results_file_id in file_list :
        data_file = db.results_file[results_file_id].data_file
        if data_file is not None :
            paths.append(os.path.abspath(defs.DIR_RESULTS + data_file) + " ")
    files = "".join(paths)

    try:
        cmd = "python "+ os.path.abspath(defs.DIR_FUSE) +"/fuse.py -o "+output_file+" -t 100 "+files
        proc_srvr = xmlrpclib.ServerProxy("http://%s:%d" % (defs.FUSE_SERVER, defs.PORT_FUSE_SERVER))
        fuse_filepath = proc_srvr.fuse(cmd, out_folder, output_filename)

        with open(fuse_filepath, 'rb') as f:
            data = gluon.contrib.simplejson.loads(f.read())
    except Exception:
        res = {"message": "'custom fuse' -> IOError"}
        log.error(res)
        raise

    clean_cmd = "rm -rf " + out_folder
    p = Popen(clean_cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
    p.wait()

    res = {"message": "'custom fuse' -> finished"}
    log.info(res)

    return data
626

HERBERT Ryan's avatar
HERBERT Ryan committed
627
#TODO move this ?
628
629
630
631
632
633
def get_file_content(filename):
    """Return the whole content of the file `filename`, read in binary mode."""
    my_file = open(filename, 'rb')
    try:
        return my_file.read()
    finally:
        my_file.close()

HERBERT Ryan's avatar
HERBERT Ryan committed
634
635
636
637
638
639
640
641
642
643
644
645
646
def fill_field(dest, value, field, parent="", append=False):
    """
    Set `value` as the first element of the list dest[field], or of dest[parent][field].

    dest: dict to update in place
    value: value to store
    field: key of the list to fill
    parent: when neither None nor "", the key of the sub-dict of `dest` to work on
    append: when the field already exists, add `value` to its first element with +=
            instead of replacing it

    A missing field is created as a one-element list holding `value`.
    """
    if not (parent is None or parent == ""):
        dest = dest[parent]

    if field not in dest:
        dest[field] = [value]
        return

    if append:
        dest[field][0] += value
    else:
        dest[field][0] = value

HERBERT Ryan's avatar
HERBERT Ryan committed
647
648
649
650
def extract_total_reads(report):
    """
    Return, as a string, the number following the first 'Total Reads analysed: ' of `report`.

    Raises AttributeError when the report holds no such line.
    """
    found = re.search("Total Reads analysed: ([0-9]+)", report)
    return found.group(1)
651
652
653
654
655
656
657
658
659
660
661
662
663
664

def schedule_pre_process(sequence_file_id, pre_process_id):
    """
    Queue the "pre_process" task for a sequence file, unless the same task is already registered.

    The id of the queued task is stored in sequence_file.pre_process_scheduler_task_id.

    Returns a dict with a "message" (and "redirect" on success).
    """
    task_args = [pre_process_id, sequence_file_id]

    already_registered = assert_scheduler_task_does_not_exist(str(task_args))
    if already_registered:
        log.error(already_registered)
        return already_registered

    queued = scheduler.queue_task("pre_process", task_args,
                                  repeats = 1, timeout = defs.TASK_TIMEOUT)
    db.sequence_file[sequence_file_id] = dict(pre_process_scheduler_task_id = queued.id)

    message = "{%s} (%s): process requested" % (sequence_file_id, pre_process_id)
    res = {"redirect": "reload", "message": message}

    log.info(res)
    return res


674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
def get_preprocessed_filename(filename1, filename2):
    '''
    Build the name of a pre-processed file from the names of its two input files.

    The common extension of both names is kept, and put after the most common
    part of what is left of them. Without any common part, both names
    (less the extension) are joined with '_'.
    When filename2 is empty or None, filename1 is returned as is.

    >>> get_preprocessed_filename('lsdkj_toto-tata_mlkmfsdlkf.fastq.gz', 'rete_toto-tata_eekjdf.fastq.gz')
    '_toto-tata_.fastq.gz'
    '''
    if not filename2:
        return filename1
    extension = tools_utils.get_common_suffpref([filename1, filename2], min(len(filename1), len(filename2)), -1)
    # Slice from the start: x[0:-len(extension)] would give '' when there is no common extension
    without_extension = [x[:len(x) - len(extension)] for x in [filename1, filename2]]
    common = tools_utils.common_substring(without_extension)
    if len(common) == 0:
        common = '_'.join(without_extension)
    return common+extension


691
def run_pre_process(pre_process_id, sequence_file_id, clean_before=True, clean_after=False):
    '''
    Run a pre-process on sequence_file.data_file (and possibly sequence_file.data_file2),
    put the output back in sequence_file.data_file.

    The command of the pre-process is run from defs.DIR_FUSE, after replacing
    '&file1&', '&file2&', '&result&' and '&pear&' in it.
    sequence_file.pre_process_flag is set to "RUN" at start, then to "DONE" or "FAILED".
    Once done, the STOPPED tasks linked to results of this sequence file are set to QUEUED.

    pre_process_id: id of the pre_process to run
    sequence_file_id: id of the sequence_file to process
    clean_before: remove and create again the output folder before the run
    clean_after: remove the output folder at the end

    Returns "SUCCESS". Any exception raised while running the command or opening
    its output is logged, then raised again.
    '''
    from subprocess import Popen, PIPE, STDOUT, os

    sequence_file = db.sequence_file[sequence_file_id]
    db.sequence_file[sequence_file_id] = dict(pre_process_flag = "RUN")
    db.commit()

    out_folder = defs.DIR_PRE_VIDJIL_ID % sequence_file_id
    output_filename = get_preprocessed_filename(get_original_filename(sequence_file.data_file),
                                                get_original_filename(sequence_file.data_file2))

    if clean_before:
        rm_cmd = "rm -rf "+out_folder
        p = Popen(rm_cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
        p.wait()
        os.makedirs(out_folder)

    output_file = out_folder+'/'+output_filename

    pre_process = db.pre_process[pre_process_id]

    log_file = None
    try:
        cmd = pre_process.command.replace( "&file1&", defs.DIR_SEQUENCES + sequence_file.data_file)
        if sequence_file.data_file2:
            cmd = cmd.replace( "&file2&", defs.DIR_SEQUENCES + sequence_file.data_file2)
        cmd = cmd.replace( "&result&", output_file)
        cmd = cmd.replace("&pear&", defs.DIR_PEAR)

        print("=== Pre-process %s ===" % pre_process_id)
        print(cmd)
        print("===============")
        sys.stdout.flush()

        out_log = out_folder+'/'+output_filename+'.pre.log'
        log_file = open(out_log, 'w')

        # The working directory of the process is changed: the abspath() below depends on it
        os.chdir(defs.DIR_FUSE)
        p = Popen(cmd, shell=True, stdin=PIPE, stdout=log_file, stderr=log_file, close_fds=True)
        p.communicate()
        print("Output log in " + out_log)

        filepath = os.path.abspath(output_file)

        # Fails here when the pre-process did not produce its output
        stream = open(filepath, 'rb')
    except Exception:
        print("!!! Pre-process failed, no result file")
        res = {"message": "{%s} p%s: 'pre_process' FAILED - %s" % (sequence_file_id, pre_process_id, output_file)}
        log.error(res)
        db.sequence_file[sequence_file_id] = dict(pre_process_flag = "FAILED")
        db.commit()
        raise
    finally:
        # The log may not be opened yet when building the command failed
        if log_file is not None:
            log_file.close()

    # Now we update the sequence file with the result of the pre-process
    # We forget the initial data_file (and possibly data_file2)
    try:
        db.sequence_file[sequence_file_id] = dict(data_file = stream,
                                                  data_file2 = None,
                                                  pre_process_flag = "DONE")
    finally:
        stream.close()
    db.commit()

    # resume STOPPED tasks for this sequence file
    stopped_task = db(
        (db.results_file.sequence_file_id == sequence_file_id)
        & (db.results_file.scheduler_task_id == db.scheduler_task.id)
        & (db.scheduler_task.status == "STOPPED")
    ).select()

    for row in stopped_task :
        db.scheduler_task[row.scheduler_task.id] = dict(status = "QUEUED")

    db.commit()

    # Dump log in scheduler_run.run_output
    with open(out_log) as log_lines:
        for l in log_lines:
            print(l, end=' ')

    # Remove data file from disk to save space (it is now saved elsewhere)
    os.remove(filepath)

    if clean_after:
        clean_cmd = "rm -rf " + out_folder
        p = Popen(clean_cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
        p.wait()

    res = {"message": "{%s} p%s: 'pre_process' finished - %s" % (sequence_file_id, pre_process_id, output_file)}
    log.info(res)

    return "SUCCESS"
    
HERBERT Ryan's avatar
HERBERT Ryan committed
787
    
788
from gluon.scheduler import Scheduler

# Scheduler of the application: each task name is bound to the function running it
scheduler = Scheduler(db,
                      {'vidjil': run_vidjil,
                       'compute_contamination': compute_contamination,
                       'mixcr': run_mixcr,
                       'none': run_copy,
                       'pre_process': run_pre_process,
                       'refuse': run_refuse},
                      heartbeat=defs.SCHEDULER_HEARTBEAT)