vidjil-to-fasta.py 9.07 KB
Newer Older
1 2 3 4 5
import fuse
import argparse
import base64

MAX_TOP=99999999999
6
REPLACEMENT_WHITESPACE='~'
7 8 9 10 11 12 13 14 15 16 17

def print_fasta_header(header, outfile, options):
    '''
    Print the fasta header in the outfile.
    '''
    header = '>'+header

    if options.no_header_whitespace:
        header = header.replace(' ', '_')
    outfile.write(header+"\n")

18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103
def get_recombination_type(clone):
    '''
    >>> class tmp: d = {};
    >>> clone = tmp()
    >>> clone.d = {'seg': {'4': "D1", '4b': "D2"}}
    >>> get_recombination_type(clone)
    'VDDJ'

    >>> clone.d = {'seg': {'4': "D1"}}
    >>> get_recombination_type(clone)
    'VDJ'

    >>> clone.d = {'seg': {'5': "V", "3": "J"}}
    >>> get_recombination_type(clone)
    'VJ'

    >>> clone.d = {'seg': {'4a': "D1", '4b': "D2", '4c': "D3"}}
    >>> get_recombination_type(clone)
    'VDDDJ'
    '''
    number_of_Ds = ['4c', '4b', '4']
    i = 0
    D_seq = ""
    for d in number_of_Ds:
        if clone.d['seg'].has_key(d):
            D_seq = "D" * (len(number_of_Ds) - i)
            break
        i += 1
    return "V"+D_seq+"J"

def get_gene_positions(clone, end, gene_name):
    '''
    >>> class tmp: d = {};
    >>> clone = tmp()
    >>> clone.d = {'seg': {'5end': 10, '3start': 15}}
    >>> get_gene_positions(clone, 'stop', '5')
    10
    >>> get_gene_positions(clone, 'start', '3')
    15

    >>> clone.d = {'seg': {'5': {'stop': 10}, '3': {'start': 15}}}
    >>> get_gene_positions(clone, 'stop', '5')
    10
    >>> get_gene_positions(clone, 'start', '3')
    15

    >>> clone.d = {'seg': {'5': {'stop': 10}, '4': {'start': 11, 'stop': 12}, '3': {'start': 15}}}
    >>> get_gene_positions(clone, 'stop', '5')
    10
    >>> get_gene_positions(clone, 'start', '3')
    15
    >>> get_gene_positions(clone, 'stop', '4')
    12
    >>> get_gene_positions(clone, 'start', '4')
    11

    >>> clone.d = {'seg': {'5': {'stop': 10}, '4': {'start': 11, 'stop': 12}, '4b': {'start': 13, 'stop': 14}, '3': {'start': 15}}}
    >>> get_gene_positions(clone, 'stop', '5')
    10
    >>> get_gene_positions(clone, 'start', '3')
    15
    >>> get_gene_positions(clone, 'stop', '4')
    12
    >>> get_gene_positions(clone, 'start', '4')
    11
    >>> get_gene_positions(clone, 'stop', '4b')
    14
    >>> get_gene_positions(clone, 'start', '4b')
    13
    '''
    if not clone.d.has_key('seg'):
        return None
    seg = clone.d['seg']
    if seg.has_key(gene_name+end):
        return seg[gene_name+end]
    if end == 'stop' and seg.has_key(gene_name+'end'):
        return seg[gene_name+'end']
    elif seg.has_key(gene_name) \
         and isinstance(seg[gene_name], dict) \
         and seg[gene_name].has_key(end):
        return seg[gene_name][end]
    else:
        return None

def get_vdj_positions(recombination_type, clone):
    '''
104 105 106
    Return the start and end positions of all the genes in order.
    Or None if the clone doesn't have information on gene positions.

107 108
    >>> class tmp: d = {};
    >>> clone = tmp()
109
    >>> clone.d = {'seg': {'5end': 10, '3start': 15}, 'sequence': 'ATTAAAAAAAAAAAAAAAAA'}
110 111 112
    >>> get_vdj_positions('VJ', clone)
    [1, 11, 16, 20]

113
    >>> clone.d = {'seg': {'5': {'stop': 10}, '3': {'start': 15}}, 'sequence': 'ATTAAAAAAAAAAAAAAAAA'}
114 115 116
    >>> get_vdj_positions('VJ', clone)
    [1, 11, 16, 20]

117
    >>> clone.d = {'seg': {'5': {'stop': 10}, '4': {'start': 11, 'stop': 12}, '3': {'start': 15}}, 'sequence': 'ATTAAAAAAAAAAAAAAAAA'}
118 119 120
    >>> get_vdj_positions('VDJ', clone)
    [1, 11, 12, 13, 16, 20]

121
    >>> clone.d = {'seg': {'5': {'stop': 10}, '4': {'start': 11, 'stop': 12}, '4b': {'start': 13, 'stop': 14}, '3': {'start': 15}}, 'sequence': 'ATTAAAAAAAAAAAAAAAAA'}
122 123 124 125 126 127 128
    >>> get_vdj_positions('VDDJ', clone)
    [1, 11, 12, 13, 14, 15, 16, 20]
    >>> get_vdj_positions('VDJ', clone)
    [1, 11, 12, 13, 16, 20]
    >>> get_vdj_positions('VJ', clone)
    [1, 11, 16, 20]

129 130
    >>> clone.d = {'seg': {'foo': 'bar'}}
    >>> get_vdj_positions('VDJ', clone)
131 132 133 134 135
    '''
    positions = [1]
    if not clone.d.has_key('seg'):
        return None
    seg = clone.d['seg']
136 137 138 139
    gene_pos_stop_5 = get_gene_positions(clone, 'stop', '5')
    if gene_pos_stop_5 is None:
        return None
    positions.append(gene_pos_stop_5+1)
140 141 142 143 144 145 146 147 148 149 150 151 152
    recombination_type = recombination_type[1:-1]
    i = 0
    for d in recombination_type:
        if len(recombination_type) == 1:
            d_name = '4'
        else:
            d_name = '4'+chr(ord('a')+i)
        if not seg.has_key(d_name) and d_name == '4a':
            d_name = '4'
        positions.append(get_gene_positions(clone, 'start', d_name)+1)
        positions.append(get_gene_positions(clone, 'stop', d_name)+1)
        i+=1
    positions.append(get_gene_positions(clone, 'start', '3')+1)
153 154
    if (clone.d.has_key('sequence')):
        positions.append(len(clone.d['sequence']))
155 156 157 158 159 160
    else:
        # Arbitrary end
        positions.append(positions[-1] + 50)
    return positions


161
def write_fuse_to_fasta(data, outfile, used_names, current_filename, options, metadata=''):
162 163 164 165 166 167 168 169 170 171 172
    '''
    Write the top clones (if specified in options.top)
    in the fasta file opened in the outfile.
    used_names is a dictionary that lists the sequence names
    used so far as well as their number of occurrences (prevents
    having sequences with the same name)
    '''

    if options.top < MAX_TOP:
        data.filter(data.getTop(options.top))

173 174 175 176 177
    if options.no_header_whitespace:
        spacer = REPLACEMENT_WHITESPACE
    else:
        spacer = ' '

178 179
    for clone in data:
        if clone.d.has_key('sequence') and isinstance(clone.d['sequence'], basestring)\
180 181 182
        and len(clone.d['sequence']) > 0 and clone.d.has_key('seg'):
            recombination = get_recombination_type(clone)
            name = recombination+spacer
183 184 185 186
            # try:
            positions = get_vdj_positions(recombination, clone)
            # except:
            #     continue
187 188
            if positions is None:
                continue
189
            name += spacer.join(map(str, positions))+spacer
190
            if not clone.d.has_key('name'):
191
                name += "Anonymous"
192
            else:
193
                name += clone.d['name'].replace(' ', spacer)
194 195 196 197 198 199 200 201 202 203 204
            additional_header_info = []
            if name in used_names:
                used_names[name] += 1
                additional_header_info.append(str(used_names[name]))
            else:
                used_names[name] = 1

            if options.germline:
                additional_header_info.append('germline=%s'%clone.d['germline'])
            if len(options.sample_name) > 0:
                sample_name = eval(options.sample_name)
205 206 207
            else:
                sample_name = current_filename
            additional_header_info.append('sample_name=%s' % sample_name)
208

209 210 211
            if len(metadata) > 0:
                additional_header_info.append(metadata.replace(' ', spacer))

212
            if len(additional_header_info) > 0:
213
                additional_header_info = spacer+'#'+spacer+spacer.join(additional_header_info)
214 215 216 217 218 219 220
            else:
                additional_header_info = ''
            print_fasta_header('%s%s' % (name, additional_header_info),\
                               outfile, options)
            outfile.write(clone.d['sequence']+"\n")


221
def process_files(args):
222 223 224
    outfile = open(args.output, 'w')

    used_names = {}
225 226 227 228 229 230
    current_file = 0

    if len(args.metadata) != len(args.file):
        # Not of the same length: ignore metadata
        args.metadata = ['' for i in range(len(args.file))]

231
    for vidjil in args.file:
232 233 234 235 236 237
        try:
            data = fuse.ListWindows()
            data.load(vidjil, "")
        except Exception:
            print "** Warning ** file %s could not be loaded" % vidjil
        else:
238 239
            write_fuse_to_fasta(data, outfile, used_names, vidjil, args, args.metadata[current_file])
        current_file += 1
240 241

    outfile.close()
242 243 244 245 246 247 248 249

parser = argparse.ArgumentParser(description = 'Converts one or several .vidjil file to a .fasta file. The resulting .fasta file can notably be indexed',
                                     epilog = 'Example: python %(prog)s -o output.fasta in1.vidjil in2.vidjil in3.vidjil')

parser.add_argument('--top', '-t', type=int, default=MAX_TOP, help = 'Keep only the top most clones. By default keep all the clones for which we have enough information.')
parser.add_argument('--no-header-whitespace', '-w', action='store_true', help='Replace all whitespaces in the fasta header with '+REPLACEMENT_WHITESPACE)
parser.add_argument('--output', '-o', help='Name of the output FASTA file [REQUIRED]', required=True)
parser.add_argument('--sample-name', '-n', default='', help = 'Provide the sample name in the fasta header. Some Python code can be provided as soon as it returns a string')
250
parser.add_argument('--metadata', '-d', default = [], action='append', help = 'Provide metadata for each file. The option must be called each time for each file, in the same order as the files are given')
251 252 253
parser.add_argument('--germline', '-g', action='store_true', help = 'When set, provide the germline of the sequence in the additional header informations')
parser.add_argument('file', nargs='+', help='Input (.vidjil/.clntab) files')

254 255 256 257
if __name__ == '__main__':
    args = parser.parse_args()

    process_files(args)