generate-recombinations.py 2.61 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13
'''Generates artificial VJ recombinations'''

from __future__ import print_function
import json
import fasta
import random

def recombine_VJ(seq5, remove5, N, remove3, seq3):
    name = "%s %d/%s/%d %s" % (seq5.name, remove5, N, remove3, seq3.name)
    seq = seq5.seq[:len(seq5)-remove5] + '\n' + N + '\n' + seq3.seq[remove3:]

    return fasta.Fasta(name, seq)

14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32

def random_sequence(characters, length):
    return ''.join([random.choice(characters) for x in range(length)])

def recombine_VJ_with_removes(seq5, remove5, Nlength, remove3, seq3):
    '''Recombine V and J with a random N, ensuring that the bases in N are not the same that what was ultimately removed from V or J'''
    assert(Nlength >= 1)

    available = ['A', 'C', 'G', 'T']
    if remove5:
        available.remove(seq5.seq[len(seq5)-remove5].upper())
    if remove3:
        c = seq3.seq[remove3-1].upper()
        if c in available:
            available.remove(c)

    return recombine_VJ(seq5, remove5, random_sequence(available, Nlength), remove3, seq3)


33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91
def select_genes(rep5, rep3, at_least=0):
    nb = 0
    for seq5 in rep5:
        yield (seq5, random.choice(rep3))
        nb += 1

    for seq3 in rep3:
        yield (random.choice(rep5), seq3)
        nb += 1

    while nb < at_least:
        yield (random.choice(rep5), random.choice(rep3))
        nb += 1


def generate_to_file(rep5, rep3, f, recomb_function):
    print("  ==>", f)

    with open(f, 'w') as ff:
        nb = 0
        for seq5, seq3 in select_genes(rep5, rep3):
            nb += 1
            seq = recomb_function(seq5, seq3)
            seq.header = seq.header.replace(' ', '_')
            ff.write(str(seq))

    print("  ==> %d recombinations" % nb)


germlines_json = open('germlines.data').read().replace('germline_data = ', '')
germlines = json.loads(germlines_json)


for code in germlines:
    g = germlines[code]
    print("--- %s - %-4s - %s"  % (g['shortcut'], code, g['description']))

    if '4' in g:
        continue

    # Read germlines

    rep5 = []
    for r5 in g['5']:
        rep5 += list(fasta.parse_as_Fasta(open(r5)))

    rep3 = []
    for r3 in g['3']:
        rep3 += list(fasta.parse_as_Fasta(open(r3)))

    print("      5: %3d sequences" % len(rep5),
          "      3: %3d sequences" % len(rep3))


    # Generate recombinations

    generate_to_file(rep5, rep3, '../data/gen/0-removes-%s.should-vdj.fa' % code,
                     (lambda seq5, seq3: recombine_VJ(seq5, 0, 'ATCG', 0, seq3)))

92 93 94
    generate_to_file(rep5, rep3, '../data/gen/5-removes-%s.should-vdj.fa' % code,
                     (lambda seq5, seq3: recombine_VJ_with_removes(seq5, 5, 4, 5, seq3)))

95
    print()