# coding: utf8
from __future__ import print_function

import datetime
import json
import os
import os.path
import random
import re
import shutil
import sys
import time
import xmlrpclib

import defs
import tools_utils

def assert_scheduler_task_does_not_exist(args):
    '''
    Tell whether a scheduler task using ``args`` is already registered.

    A task is taken into account unless its status is FAILED, EXPIRED,
    TIMEOUT or COMPLETED.

    args -- value compared with ``scheduler_task.args`` (the callers in this
            file pass ``str()`` of their argument list)

    Returns a ``{"message": ...}`` dict when such a task exists, None otherwise.
    '''
    task = db.scheduler_task
    pending = db((task.args == args)
                 & (task.status != "FAILED")
                 & (task.status != "EXPIRED")
                 & (task.status != "TIMEOUT")
                 & (task.status != "COMPLETED")).select()

    if len(pending) > 0:
        return {"message": "task already registered"}

    return None

31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70
def compute_contamination(sequence_file_id, results_file_id, config_id):
    '''
    Count, for each sample, the clones that are far more abundant in another
    sample of the same run.

    The three arguments are parallel lists: index ``i`` of each one describes
    the same sample (sequence file id, results file id, config id).

    A clone of sample ``i`` is counted when another sequence file of the same
    "run" sample set, analysed with the same config, holds a clone with the
    same id and more than ten times as many reads.

    Returns a list with one dict per sample:
        {"total_clones": int,
         "total_reads": int,
         "sample": {other_results_file_id: {"clones": int, "reads": int}}}
    '''
    result = []
    for i, seq_id in enumerate(sequence_file_id):
        entry = {"total_clones": 0, "total_reads": 0, "sample": {}}
        result.append(entry)

        # Reads of each clone of the sample under scrutiny, keyed by clone id
        with open(defs.DIR_RESULTS + db.results_file[results_file_id[i]].data_file, "r") as json_data:
            d = json.load(json_data)
        own_reads = {}
        for clone in d["clones"]:
            own_reads[clone["id"]] = clone["reads"][0]

        # Look for the "run" sample set this sequence file belongs to
        sample_set_run = db((db.sample_set_membership.sequence_file_id == seq_id)
                            & (db.sample_set_membership.sample_set_id == db.sample_set.id)
                            & (db.sample_set.sample_type == "run")).select().first()
        if sample_set_run is None:
            continue

        sample_set_id = sample_set_run.sample_set.id
        rows = db((db.sample_set.id == sample_set_id)
                  & (db.sample_set.id == db.sample_set_membership.sample_set_id)
                  & (db.sequence_file.id == db.sample_set_membership.sequence_file_id)
                  & (db.sequence_file.id != seq_id)
                  & (db.results_file.sequence_file_id == db.sequence_file.id)
                  & (db.results_file.config_id == config_id[i])
                  ).select(db.sequence_file.ALL, db.results_file.ALL, db.sample_set.id,
                           orderby=db.sequence_file.id|~db.results_file.run_date)

        # Rows are sorted by sequence file, then by decreasing run date:
        # keep only the first row of each sequence file.
        latest_rows = []
        last_seq_id = 0
        for row in rows:
            if row.sequence_file.id != last_seq_id:
                latest_rows.append(row)
                last_seq_id = row.sequence_file.id

        for row in latest_rows:
            # A results_file without stored data cannot be compared (run_fuse
            # skips such rows in the same way).
            if row.results_file.data_file is None:
                continue

            other_path = defs.DIR_RESULTS + row.results_file.data_file
            print(other_path)
            counts = {"clones": 0, "reads": 0}
            entry["sample"][row.results_file.id] = counts

            with open(other_path, "r") as json_data2:
                try:
                    other = json.load(json_data2)
                except ValueError:
                    print('invalid_json')
                    continue

            for clone in other["clones"]:
                clone_id = clone["id"]
                if clone_id in own_reads and clone["reads"][0] > 10 * own_reads[clone_id]:
                    entry["total_clones"] += 1
                    entry["total_reads"] += own_reads[clone_id]
                    counts["clones"] += 1
                    counts["reads"] += own_reads[clone_id]

    return result
    
94

95
def schedule_run(id_sequence, id_config, grep_reads=None):
    '''
    Create a results_file entry and queue the analysis task that will fill it.

    id_sequence -- id of the sequence_file to process
    id_config   -- id of the config; its ``program`` field is the name of the
                   queued task
    grep_reads  -- optional value handed to the task as its last argument

    The task is put in the STOPPED state while the pre-process of the sequence
    file is in the WAIT or RUN state.

    Returns a dict with a "message" key (and "redirect" on success). When
    assert_scheduler_task_does_not_exist() reports an already registered
    task, its error dict is logged and returned instead.
    '''
    now = datetime.datetime.fromtimestamp(time.time())
    data_id = db.results_file.insert(sequence_file_id=id_sequence,
                                     run_date=now.strftime('%Y-%m-%d %H:%M:%S'),
                                     config_id=id_config)

    args = [id_sequence, id_config, data_id, grep_reads]

    # NOTE(review): args contains the data_id inserted just above, so this
    # check can presumably only match a task created for that very row --
    # confirm whether the intent was to compare on (id_sequence, id_config).
    err = assert_scheduler_task_does_not_exist(str(args))
    if err:
        log.error(err)
        return err

    ## add task to scheduler
    program = db.config[id_config].program
    task = scheduler.queue_task(program, args,
                                repeats=1, timeout=defs.TASK_TIMEOUT)

    sequence_file = db.sequence_file[id_sequence]
    if sequence_file.pre_process_flag in ("WAIT", "RUN"):
        db.scheduler_task[task.id] = dict(status="STOPPED")

    db.results_file[data_id] = dict(scheduler_task_id=task.id)

    res = {"redirect": "reload",
           "message": "[%s] c%s: process requested - %s %s" % (data_id, id_config, grep_reads, sequence_file.filename)}

    log.info(res)
    return res

133 134 135 136 137 138 139 140 141 142 143 144
def schedule_fuse(sample_set_ids, config_ids):
    '''
    Queue a single 'refuse' task covering every (sample set, config) pair
    that has at least one results_file.

    For each pair, the first matching row provides the argument list
    [sequence_file_id, config_id, results_file_id, sample_set_id, False]
    later unpacked by run_refuse(). Nothing is queued when no pair matches.
    '''
    fuse_args = []
    for sample_set_id in sample_set_ids:
        for config_id in config_ids:
            membership = db.sample_set_membership
            results = db.results_file
            found = db((membership.sample_set_id == sample_set_id)
                       & (membership.sequence_file_id == results.sequence_file_id)
                       & (results.config_id == config_id)
                       ).select(membership.sample_set_id, membership.sequence_file_id,
                                results.id, results.config_id).first()
            if not found:
                continue
            fuse_args.append([found.sample_set_membership.sequence_file_id,
                              found.results_file.config_id,
                              found.results_file.id,
                              found.sample_set_membership.sample_set_id,
                              False])

    if fuse_args:
        scheduler.queue_task('refuse', [fuse_args],
                             repeats = 1, timeout = defs.TASK_TIMEOUT)
148

149
def run_vidjil(id_file, id_config, id_data, grep_reads,
               clean_before=False, clean_after=False):
    '''
    Run vidjil-algo on a sequence file and store the output in results_file.

    id_file      -- id of the sequence_file to analyse
    id_config    -- id of the config whose ``command`` holds the options
    id_data      -- id of the results_file row that receives the output
    grep_reads   -- when set, it is given to vidjil-algo through -FaW and the
                    stored file is seq/clone.fa-1 instead of the .vidjil file;
                    the fuse step is skipped
    clean_before -- unused, kept for interface compatibility
    clean_after  -- remove the output folder at the end

    While the pre-process of the sequence file is in the WAIT or RUN state,
    the task queues itself again 1200 seconds later and returns.

    Raises ValueError when the pre-process has failed, and re-raises any
    error met while running the command or opening its result file.

    Returns "SUCCESS".
    '''
    from subprocess import Popen, PIPE, STDOUT
    from datetime import timedelta as timed
    try:
        from shlex import quote   # Python 3
    except ImportError:
        from pipes import quote   # Python 2

    ## re-schedule if the pre-process is still pending
    pre_process_flag = db.sequence_file[id_file].pre_process_flag
    if pre_process_flag == "WAIT" or pre_process_flag == "RUN":
        print("Pre-process is still pending, re-schedule")

        args = [id_file, id_config, id_data, grep_reads]
        task = scheduler.queue_task("vidjil", args,
                                    repeats=1, timeout=defs.TASK_TIMEOUT,
                                    start_time=request.now + timed(seconds=1200))
        db.results_file[id_data] = dict(scheduler_task_id=task.id)
        db.commit()
        print(task.id)
        sys.stdout.flush()

        return "SUCCESS"

    if pre_process_flag == "FAILED":
        print("Pre-process has failed")
        raise ValueError('pre-process has failed')

    ## paths to vidjil and to the sequence files
    upload_folder = defs.DIR_SEQUENCES
    out_folder = defs.DIR_OUT_VIDJIL_ID % id_data

    # start from an empty output folder
    shutil.rmtree(out_folder, ignore_errors=True)

    ## path of the sequence file
    row = db(db.sequence_file.id == id_file).select()
    filename = row[0].data_file
    output_filename = defs.BASENAME_OUT_VIDJIL_ID % id_data
    seq_file = upload_folder + filename

    ## vidjil configuration
    vidjil_cmd = db.config[id_config].command

    if 'next' in vidjil_cmd:
        vidjil_cmd = vidjil_cmd.replace('next', '')
        vidjil_cmd = vidjil_cmd.replace(' germline', defs.DIR_GERMLINE_NEXT)
        cmd = defs.DIR_VIDJIL_NEXT + '/vidjil-algo '
    else:
        vidjil_cmd = vidjil_cmd.replace(' germline', defs.DIR_GERMLINE)
        cmd = defs.DIR_VIDJIL + '/vidjil-algo '

    if grep_reads:
        # The command goes through a shell: quote the value so that it is
        # always passed as one literal argument.
        vidjil_cmd += ' -FaW %s ' % quote('%s' % grep_reads)

    os.makedirs(out_folder)
    out_log = out_folder + '/' + output_filename + '.vidjil.log'

    try:
        ## full command line
        cmd += ' -o  ' + out_folder + " -b " + output_filename
        cmd += ' ' + vidjil_cmd + ' ' + seq_file

        ## run vidjil
        print("=== Launching Vidjil ===")
        print(cmd)
        print("========================")
        sys.stdout.flush()

        with open(out_log, 'w') as vidjil_log_file:
            p = Popen(cmd, shell=True, stdin=PIPE, stdout=vidjil_log_file, stderr=STDOUT, close_fds=True)
            p.communicate()

        print("Output log in " + out_log)
        sys.stdout.flush()
        db.commit()

        ## Get result file
        if grep_reads:
            out_results = out_folder + '/seq/clone.fa-1'
        else:
            out_results = out_folder + '/' + output_filename + '.vidjil'

        print("===>", out_results)
        results_filepath = os.path.abspath(out_results)

        stream = open(results_filepath, 'rb')
    except Exception:
        print("!!! Vidjil failed, no result file")
        res = {"message": "[%s] c%s: Vidjil FAILED - %s" % (id_data, id_config, out_folder)}
        log.error(res)
        raise

    try:
        ## Parse some info in .log
        segmented = re.compile(r"==> segmented (\d+) reads \((\d*\.\d+|\d+)%\)")
        windows = re.compile(r"==> found (\d+) .*-windows in .* segments .* inside (\d+) sequences")
        info = ''
        with open(out_log) as log_lines:
            for l in log_lines:
                m = segmented.search(l)
                if m:
                    print(l, end=' ')
                    segs = int(m.group(1))
                    ratio = m.group(2)
                    info = "%d segmented (%s%%)" % (segs, ratio)
                    continue
                m = windows.search(l)
                if m:
                    print(l, end=' ')
                    wins = int(m.group(1))
                    reads = int(m.group(2))
                    info = "%d reads, " % reads + info + ", %d windows" % wins
                    break

        ## store in the database
        ts = time.time()
        db.results_file[id_data] = dict(status="ready",
                                        run_date=datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S'),
                                        data_file=stream)
        db.commit()
    finally:
        stream.close()

    ## the output of Vidjil is stored as the result for the scheduler
    ## TODO parse result success/fail

    res = {"message": "[%s] c%s: Vidjil finished - %s - %s" % (id_data, id_config, info, out_folder)}
    log.info(res)

    if not grep_reads:
        for row in db(db.sample_set_membership.sequence_file_id == id_file).select():
            sample_set_id = row.sample_set_id
            print(row.sample_set_id)
            run_fuse(id_file, id_config, id_data, sample_set_id, clean_before=False)

    # Clean only now: run_fuse(clean_before=False) writes into out_folder
    if clean_after:
        shutil.rmtree(out_folder, ignore_errors=True)

    return "SUCCESS"
301

302
def run_mixcr(id_file, id_config, id_data, clean_before=False, clean_after=False):
    '''
    Run MiXCR (align, assemble, exportClones) on a sequence file, complete the
    exported file and store it in results_file.

    id_file      -- id of the sequence_file to analyse
    id_config    -- id of the config; its ``command`` is expected to be
                    "args_align | args_assemble | args_exportClones"
    id_data      -- id of the results_file row that receives the output
    clean_before -- unused, kept for interface compatibility
    clean_after  -- unused, kept for interface compatibility

    The exported JSON gets the MiXCR reports ("log"), the original file name,
    the command line and the total number of reads before being stored.

    Re-raises any error met while running the command or opening its result.

    Returns "SUCCESS".
    '''
    from subprocess import Popen, PIPE, STDOUT

    upload_folder = defs.DIR_SEQUENCES
    out_folder = defs.DIR_OUT_VIDJIL_ID % id_data

    # start from an empty output folder
    shutil.rmtree(out_folder, ignore_errors=True)

    ## path of the sequence file
    row = db(db.sequence_file.id == id_file).select()
    filename = row[0].data_file
    output_filename = defs.BASENAME_OUT_VIDJIL_ID % id_data
    seq_file = upload_folder + filename

    ## MiXCR configuration
    arg_cmd = db.config[id_config].command

    os.makedirs(out_folder)

    # os.path.join gives the same paths whether or not out_folder ends with
    # a separator, so that every file lands inside out_folder.
    out_log = os.path.join(out_folder, output_filename + '.mixcr.log')
    report = os.path.join(out_folder, output_filename + '.mixcr.report')
    out_alignments = os.path.join(out_folder, output_filename + '.align.vdjca')
    out_clones = os.path.join(out_folder, output_filename + '.clones.clns')
    out_results = os.path.join(out_folder, output_filename + '.mixcr')

    align_report = report + '.aln'
    assembly_report = report + '.asmbl'

    args_1, args_2, args_3 = '', '', ''
    try:
        args_1, args_2, args_3 = arg_cmd.split('|')
    except ValueError:
        # Best effort: go on with empty arguments for the three steps
        print(arg_cmd)
        print("! Bad arguments, we expect args_align | args_assemble | args_exportClones")

    try:
        ## full command line
        mixcr = defs.DIR_MIXCR + 'mixcr'
        cmd = mixcr + ' align --save-reads -t 1 -r ' + align_report + ' ' + args_1 + ' ' + seq_file  + ' ' + out_alignments
        cmd += ' && '
        cmd += mixcr + ' assemble -t 1 -r ' + assembly_report + ' ' + args_2 + ' ' + out_alignments + ' ' + out_clones
        cmd += ' && rm ' + out_alignments
        cmd += ' && '
        cmd += mixcr + ' exportClones --format vidjil -germline -id -name -reads -sequence -top -seg -s ' + args_3 + ' ' + out_clones + ' ' + out_results

        ## run MiXCR
        print("=== Launching MiXCR ===")
        print(cmd)
        print("========================")
        sys.stdout.flush()

        with open(out_log, 'w') as log_file:
            p = Popen(cmd, shell=True, stdin=PIPE, stdout=log_file, stderr=STDOUT, close_fds=True)
            p.wait()

        print("Output log in " + out_log)
        sys.stdout.flush()

        ## Get result file
        print("===>", out_results)
        results_filepath = os.path.abspath(out_results)
        # Opening the file is only a check that MiXCR produced it
        with open(results_filepath, 'rb'):
            pass
    except Exception:
        print("!!! MiXCR failed, no result file")
        res = {"message": "[%s] c%s: MiXCR FAILED - %s" % (id_data, id_config, out_folder)}
        log.error(res)
        raise

    align_text = get_file_content(align_report)
    assembly_text = get_file_content(assembly_report)
    reports = align_text + assembly_text
    totalReads = extract_total_reads(assembly_text)

    with open(results_filepath, 'r') as json_file:
        my_json = json.load(json_file)
    fill_field(my_json, reports, "log", "samples", True)
    fill_field(my_json, filename, "original_names", "samples")
    fill_field(my_json, cmd, "commandline", "samples")
    fill_field(my_json, totalReads, "total", "reads")

    with open(results_filepath, 'w') as new_file:
        json.dump(my_json, new_file)

    ## store in the database
    ts = time.time()
    with open(results_filepath, 'rb') as stream:
        db.results_file[id_data] = dict(status="ready",
                                        run_date=datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S'),
                                        data_file=stream)
        db.commit()

    res = {"message": "[%s] c%s: MiXCR finished - %s" % (id_data, id_config, out_folder)}
    log.info(res)

    for row in db(db.sample_set_membership.sequence_file_id == id_file).select():
        sample_set_id = row.sample_set_id
        print(row.sample_set_id)
        run_fuse(id_file, id_config, id_data, sample_set_id, clean_before=False)

    return "SUCCESS"
416

417
def run_copy(id_file, id_config, id_data, clean_before=False, clean_after=False):
    '''
    Store the sequence file itself as the result (no analysis is run).

    id_file      -- id of the sequence_file to copy
    id_config    -- id of the config, used in messages and for the fuse step
    id_data      -- id of the results_file row that receives the copy
    clean_before -- unused, kept for interface compatibility
    clean_after  -- remove the output folder at the end

    Raises IOError when the sequence file cannot be opened.

    Returns "SUCCESS".
    '''
    output_filename = defs.BASENAME_OUT_VIDJIL_ID % id_data
    out_folder = defs.DIR_OUT_VIDJIL_ID % id_data

    # start from an empty output folder
    shutil.rmtree(out_folder, ignore_errors=True)

    ## path of the sequence file
    row = db(db.sequence_file.id == id_file).select()
    filename = row[0].data_file

    os.makedirs(out_folder)
    out_log = out_folder + '/' + output_filename + '.vidjil.log'
    # Nothing is written in the log: it is only created, as before
    open(out_log, 'w').close()

    print("Output log in " + out_log)
    sys.stdout.flush()
    db.commit()

    ## get the file
    results_filepath = os.path.abspath(defs.DIR_SEQUENCES + filename)

    try:
        stream = open(results_filepath, 'rb')
    except IOError:
        print("!!! 'copy' failed, no file")
        res = {"message": "[%s] c%s: 'copy' FAILED - %s - %s" % (id_data, id_config, filename, out_folder)}
        log.error(res)
        # bare raise: keep the original error and its message
        raise

    ## store in the database
    try:
        ts = time.time()
        db.results_file[id_data] = dict(status="ready",
                                        run_date=datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S'),
                                        data_file=db.results_file.data_file.store(stream, row[0].filename))
        db.commit()
    finally:
        stream.close()

    ## TODO parse result success/fail

    res = {"message": "[%s] c%s: 'copy' finished - %s" % (id_data, id_config, filename)}
    log.info(res)

    for row in db(db.sample_set_membership.sequence_file_id == id_file).select():
        sample_set_id = row.sample_set_id
        print(row.sample_set_id)
        run_fuse(id_file, id_config, id_data, sample_set_id, clean_before=False)

    # Clean only now: run_fuse(clean_before=False) writes into out_folder
    if clean_after:
        shutil.rmtree(out_folder, ignore_errors=True)

    return "SUCCESS"
Mathieu Giraud's avatar
Mathieu Giraud committed
479 480


481 482 483 484
def run_refuse(args):
    '''
    Call run_fuse() once per entry of ``args``.

    Each entry is a sequence whose first five items are, in order, the
    id_file, id_config, id_data, sample_set_id and clean_before arguments of
    run_fuse().

    Returns "SUCCESS".
    '''
    for fuse_args in args:
        id_file = fuse_args[0]
        id_config = fuse_args[1]
        id_data = fuse_args[2]
        sample_set_id = fuse_args[3]
        clean_before = fuse_args[4]
        run_fuse(id_file, id_config, id_data, sample_set_id, clean_before)
    return "SUCCESS"
Mathieu Giraud's avatar
Mathieu Giraud committed
485

486
def run_fuse(id_file, id_config, id_data, sample_set_id, clean_before=True, clean_after=False):
    '''
    Run fuse.py on the latest results of every sequence file of a sample set
    and store the fused file in fused_file.

    id_file       -- unused, kept for interface compatibility
    id_config     -- id of the config: selects the results to fuse and gives
                     the ``fuse_command`` options
    id_data       -- id of the results_file naming the working folder
    sample_set_id -- id of the sample set to fuse
    clean_before  -- empty the working folder before running
    clean_after   -- remove the working folder at the end

    The fused_file row of (config, sample set) is updated, or created when
    there is none yet.

    Re-raises any error met while running fuse.py or opening its output.

    Returns "SUCCESS".
    '''
    from subprocess import Popen, PIPE, STDOUT

    out_folder = defs.DIR_OUT_VIDJIL_ID % id_data
    output_filename = defs.BASENAME_OUT_VIDJIL_ID % id_data + '-%s' % sample_set_id

    if clean_before:
        shutil.rmtree(out_folder, ignore_errors=True)
    # The folder may be missing when the caller did not ask for a clean start
    if not os.path.isdir(out_folder):
        os.makedirs(out_folder)

    fuse_log = out_folder + '/' + output_filename + '.fuse.log'

    ## fuse.py
    output_file = out_folder + '/' + output_filename + '.fused'
    rows = db((db.results_file.sequence_file_id == db.sequence_file.id)
              & (db.sample_set_membership.sequence_file_id == db.sequence_file.id)
              & (db.sample_set_membership.sample_set_id == sample_set_id)
              & (db.results_file.config_id == id_config)
              ).select(orderby=db.sequence_file.id|~db.results_file.run_date)

    # Rows are sorted by sequence file, then by decreasing run date:
    # keep only the first row of each sequence file.
    latest_rows = []
    last_seq_id = 0
    for row in rows:
        if row.sequence_file.id != last_seq_id:
            latest_rows.append(row)
            last_seq_id = row.sequence_file.id

    files = ""
    sequence_file_list = ""
    for row in latest_rows:
        if row.results_file.data_file is not None:
            files += defs.DIR_RESULTS + row.results_file.data_file + " "
            sequence_file_list += str(row.results_file.sequence_file_id) + "_"

    try:
        fuse_cmd = db.config[id_config].fuse_command
        cmd = "python " + defs.DIR_FUSE + "/fuse.py -o " + output_file + " " + fuse_cmd + " " + files

        print("=== fuse.py ===")
        print(cmd)
        print("===============")
        sys.stdout.flush()

        with open(fuse_log, 'w') as fuse_log_file:
            p = Popen(cmd, shell=True, stdin=PIPE, stdout=fuse_log_file, stderr=STDOUT, close_fds=True)
            p.communicate()
        print("Output log in " + fuse_log)

        fuse_filepath = os.path.abspath(output_file)

        stream = open(fuse_filepath, 'rb')
    except Exception:
        print("!!! Fuse failed, no .fused file")
        res = {"message": "[%s] c%s: 'fuse' FAILED - %s" % (id_data, id_config, output_file)}
        log.error(res)
        raise

    try:
        ts = time.time()

        fused_files = db((db.fused_file.config_id == id_config) &
                         (db.fused_file.sample_set_id == sample_set_id)
                         ).select()
        if len(fused_files) > 0:
            id_fuse = fused_files[0].id
        else:
            id_fuse = db.fused_file.insert(sample_set_id=sample_set_id,
                                           config_id=id_config)

        db.fused_file[id_fuse] = dict(fuse_date=datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S'),
                                      fused_file=stream,
                                      sequence_file_list=sequence_file_list)
        db.commit()
    finally:
        stream.close()

    res = {"message": "[%s] c%s: 'fuse' finished - %s" % (id_data, id_config, db.fused_file[id_fuse].fused_file)}
    log.info(res)

    # NOTE(review): the previously stored fused file used to be looked for
    # inside out_folder right after out_folder had been deleted, which could
    # never remove anything. Where stored fused files live is not visible
    # here -- confirm before adding a real cleanup.
    if clean_after:
        # Removes the temporary fused file together with the folder
        shutil.rmtree(out_folder, ignore_errors=True)
    else:
        # Remove temporary fused file
        os.remove(output_file)

    return "SUCCESS"
581

582
def custom_fuse(file_list):
    '''
    Fuse an arbitrary list of results files through the fuse server and
    return the parsed result.

    file_list -- ids of the results_file rows to fuse (joined with ',' for
                 the log message, so the items are expected to be strings)

    The fuse.py command is sent to the XML-RPC server at
    defs.FUSE_SERVER:defs.PORT_FUSE_SERVER; the file it returns is read and
    decoded as JSON. The working folder is removed on success.

    Raises IOError when defs.PORT_FUSE_SERVER is None, and re-raises any
    error met while fusing or reading the result.
    '''
    if defs.PORT_FUSE_SERVER is None:
        raise IOError('This server cannot fuse custom data')
    random_id = random.randint(99999999,99999999999)
    out_folder = os.path.abspath(defs.DIR_OUT_VIDJIL_ID % random_id)
    output_filename = defs.BASENAME_OUT_VIDJIL_ID % random_id

    # start from an empty output folder
    shutil.rmtree(out_folder, ignore_errors=True)
    os.makedirs(out_folder)

    res = {"message": "'custom fuse' (%d files): %s" % (len(file_list), ','.join(file_list))}
    log.info(res)

    ## fuse.py
    output_file = out_folder + '/' + output_filename + '.fused'
    files = ""
    for results_id in file_list:
        data_file = db.results_file[results_id].data_file
        if data_file is not None:
            files += os.path.abspath(defs.DIR_RESULTS + data_file) + " "

    try:
        cmd = "python " + os.path.abspath(defs.DIR_FUSE) + "/fuse.py -o " + output_file + " -t 100 " + files
        proc_srvr = xmlrpclib.ServerProxy("http://%s:%d" % (defs.FUSE_SERVER, defs.PORT_FUSE_SERVER))
        fuse_filepath = proc_srvr.fuse(cmd, out_folder, output_filename)

        with open(fuse_filepath, 'rb') as f:
            # NOTE(review): `gluon` is not imported in the visible part of
            # this file; presumably it is provided by the execution
            # environment -- confirm.
            data = gluon.contrib.simplejson.loads(f.read())
    except Exception:
        res = {"message": "'custom fuse' -> IOError"}
        log.error(res)
        raise

    shutil.rmtree(out_folder, ignore_errors=True)

    res = {"message": "'custom fuse' -> finished"}
    log.info(res)

    return data
626

HERBERT Ryan's avatar
HERBERT Ryan committed
627
#TODO move this ?
628 629 630 631 632 633
def get_file_content(filename):
    '''
    Return the whole content of ``filename``, read in binary mode.
    '''
    my_file = open(filename, 'rb')
    try:
        return my_file.read()
    finally:
        my_file.close()

HERBERT Ryan's avatar
HERBERT Ryan committed
634 635 636 637 638 639 640 641 642 643 644 645 646
def fill_field(dest, value, field, parent="", append=False):
    '''
    Put ``value`` in the first item of the list ``dest[field]``.

    dest   -- dict to update in place
    value  -- value to store
    field  -- key of the list to fill
    parent -- when neither None nor "", the update is done on
              ``dest[parent]`` instead of ``dest``
    append -- when the field already exists, add ``value`` to its first item
              (with +=) instead of replacing it

    When the field does not exist yet, it is created as ``[value]``.
    '''
    target = dest if parent in (None, "") else dest[parent]

    if field not in target:
        target[field] = [value]
        return

    if append:
        target[field][0] += value
    else:
        target[field][0] = value

HERBERT Ryan's avatar
HERBERT Ryan committed
647 648 649 650
def extract_total_reads(report):
    '''
    Return, as a string, the number that follows "Total Reads analysed: "
    in ``report`` (first occurrence).
    '''
    found = re.search("Total Reads analysed: [0-9]+", report)
    return found.group().rsplit(' ', 1)[-1]
651 652 653 654 655 656 657 658 659 660 661 662 663 664

def schedule_pre_process(sequence_file_id, pre_process_id):
    '''
    Queue a "pre_process" task for a sequence file and record the task id in
    ``sequence_file.pre_process_scheduler_task_id``.

    Returns a dict with "redirect" and "message" keys. When
    assert_scheduler_task_does_not_exist() reports an already registered
    task, its error dict is logged and returned instead.
    '''
    task_args = [pre_process_id, sequence_file_id]

    already_registered = assert_scheduler_task_does_not_exist(str(task_args))
    if already_registered:
        log.error(already_registered)
        return already_registered

    queued = scheduler.queue_task("pre_process", task_args,
                                  repeats = 1, timeout = defs.TASK_TIMEOUT)
    db.sequence_file[sequence_file_id] = dict(pre_process_scheduler_task_id = queued.id)

    message = "{%s} (%s): process requested" % (sequence_file_id, pre_process_id)
    res = {"redirect": "reload",
           "message": message}

    log.info(res)
    return res


674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690
def get_preprocessed_filename(filename1, filename2):
    '''
    Build the name of a pre-processed file from the names of its inputs:
    get the extension the two names share, then the most common part among
    what remains, and put them together.

    When ``filename2`` is empty or None, ``filename1`` is returned unchanged.
    When the names (without extension) have nothing in common, they are
    joined with '_'.

    >>> get_preprocessed_filename('lsdkj_toto-tata_mlkmfsdlkf.fastq.gz', 'rete_toto-tata_eekjdf.fastq.gz')
    '_toto-tata_.fastq.gz'
    '''
    if not filename2:
        return filename1
    names = [filename1, filename2]
    extension = tools_utils.get_common_suffpref(names, min(len(filename1), len(filename2)), -1)
    # Slice with an explicit end: x[0:-len(extension)] would give '' for
    # every name when the extension is empty.
    without_extension = [x[:len(x) - len(extension)] for x in names]
    common = tools_utils.common_substring(without_extension)
    if len(common) == 0:
        common = '_'.join(without_extension)
    return common + extension


691
def run_pre_process(pre_process_id, sequence_file_id, clean_before=True, clean_after=False):
    '''
    Run a pre-process on sequence_file.data_file (and possibly sequence_file.data_file2),
    put the output back in sequence_file.data_file.

    pre_process_id   -- id of the pre_process whose ``command`` is run, after
                        substitution of &file1&, &file2&, &result& and &pear&
    sequence_file_id -- id of the sequence_file to process
    clean_before     -- empty the working folder before running
    clean_after      -- remove the working folder at the end

    sequence_file.pre_process_flag is set to "RUN" at the start, then to
    "DONE" on success or "FAILED" on error. On success, the STOPPED scheduler
    tasks attached to the results of this sequence file are set to QUEUED.

    Re-raises any error met while running the command or opening its output.

    Returns "SUCCESS".
    '''
    from subprocess import Popen, PIPE, STDOUT

    sequence_file = db.sequence_file[sequence_file_id]
    db.sequence_file[sequence_file_id] = dict(pre_process_flag="RUN")
    db.commit()

    out_folder = defs.DIR_PRE_VIDJIL_ID % sequence_file_id
    output_filename = get_preprocessed_filename(get_original_filename(sequence_file.data_file),
                                                get_original_filename(sequence_file.data_file2))

    if clean_before:
        shutil.rmtree(out_folder, ignore_errors=True)
        os.makedirs(out_folder)

    output_file = out_folder + '/' + output_filename
    out_log = out_folder + '/' + output_filename + '.pre.log'

    pre_process = db.pre_process[pre_process_id]

    try:
        cmd = pre_process.command.replace("&file1&", defs.DIR_SEQUENCES + sequence_file.data_file)
        if sequence_file.data_file2:
            cmd = cmd.replace("&file2&", defs.DIR_SEQUENCES + sequence_file.data_file2)
        cmd = cmd.replace("&result&", output_file)
        cmd = cmd.replace("&pear&", defs.DIR_PEAR)

        print("=== Pre-process %s ===" % pre_process_id)
        print(cmd)
        print("===============")
        sys.stdout.flush()

        with open(out_log, 'w') as log_file:
            # The working directory is changed for the whole process, not
            # only for the command: the paths below are resolved after it.
            os.chdir(defs.DIR_FUSE)
            p = Popen(cmd, shell=True, stdin=PIPE, stdout=log_file, stderr=log_file, close_fds=True)
            p.communicate()
        print("Output log in " + out_log)

        filepath = os.path.abspath(output_file)

        stream = open(filepath, 'rb')
    except:
        # Deliberately broad: whatever went wrong, the sequence file must be
        # flagged as FAILED before the error is raised again.
        print("!!! Pre-process failed, no result file")
        res = {"message": "{%s} p%s: 'pre_process' FAILED - %s" % (sequence_file_id, pre_process_id, output_file)}
        log.error(res)
        db.sequence_file[sequence_file_id] = dict(pre_process_flag="FAILED")
        db.commit()
        raise

    # Now we update the sequence file with the result of the pre-process
    # We forget the initial data_file (and possibly data_file2)
    try:
        db.sequence_file[sequence_file_id] = dict(data_file=stream,
                                                  data_file2=None,
                                                  pre_process_flag="DONE")
        db.commit()
    finally:
        stream.close()

    # resume STOPPED tasks for this sequence file
    stopped_task = db(
        (db.results_file.sequence_file_id == sequence_file_id)
        & (db.results_file.scheduler_task_id == db.scheduler_task.id)
        & (db.scheduler_task.status == "STOPPED")
    ).select()

    for row in stopped_task:
        db.scheduler_task[row.scheduler_task.id] = dict(status="QUEUED")

    db.commit()

    # Dump log in scheduler_run.run_output
    with open(out_log) as log_lines:
        for l in log_lines:
            print(l, end=' ')

    # Remove data file from disk to save space (it is now saved elsewhere)
    os.remove(filepath)

    if clean_after:
        shutil.rmtree(out_folder, ignore_errors=True)

    res = {"message": "{%s} p%s: 'pre_process' finished - %s" % (sequence_file_id, pre_process_id, output_file)}
    log.info(res)

    return "SUCCESS"
    
HERBERT Ryan's avatar
HERBERT Ryan committed
787
    
788
from gluon.scheduler import Scheduler
789
# Scheduler instance of the application: maps each task name accepted by
# scheduler.queue_task() to the function that runs it.
scheduler = Scheduler(
    db,
    {
        "vidjil": run_vidjil,
        "compute_contamination": compute_contamination,
        "mixcr": run_mixcr,
        "none": run_copy,
        "pre_process": run_pre_process,
        "refuse": run_refuse,
    },
    heartbeat=defs.SCHEDULER_HEARTBEAT)