vidjil-to-fasta.py 10.2 KB
Newer Older
1 2 3 4 5
import fuse
import argparse
import base64

MAX_TOP=99999999999
6
REPLACEMENT_WHITESPACE='~'
7 8 9 10 11 12 13 14 15 16 17

def print_fasta_header(header, outfile, options):
    '''
    Print the fasta header in the outfile.
    '''
    header = '>'+header

    if options.no_header_whitespace:
        header = header.replace(' ', '_')
    outfile.write(header+"\n")

18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103
def get_recombination_type(clone):
    '''
    >>> class tmp: d = {};
    >>> clone = tmp()
    >>> clone.d = {'seg': {'4': "D1", '4b': "D2"}}
    >>> get_recombination_type(clone)
    'VDDJ'

    >>> clone.d = {'seg': {'4': "D1"}}
    >>> get_recombination_type(clone)
    'VDJ'

    >>> clone.d = {'seg': {'5': "V", "3": "J"}}
    >>> get_recombination_type(clone)
    'VJ'

    >>> clone.d = {'seg': {'4a': "D1", '4b': "D2", '4c': "D3"}}
    >>> get_recombination_type(clone)
    'VDDDJ'
    '''
    number_of_Ds = ['4c', '4b', '4']
    i = 0
    D_seq = ""
    for d in number_of_Ds:
        if clone.d['seg'].has_key(d):
            D_seq = "D" * (len(number_of_Ds) - i)
            break
        i += 1
    return "V"+D_seq+"J"

def get_gene_positions(clone, end, gene_name):
    '''
    >>> class tmp: d = {};
    >>> clone = tmp()
    >>> clone.d = {'seg': {'5end': 10, '3start': 15}}
    >>> get_gene_positions(clone, 'stop', '5')
    10
    >>> get_gene_positions(clone, 'start', '3')
    15

    >>> clone.d = {'seg': {'5': {'stop': 10}, '3': {'start': 15}}}
    >>> get_gene_positions(clone, 'stop', '5')
    10
    >>> get_gene_positions(clone, 'start', '3')
    15

    >>> clone.d = {'seg': {'5': {'stop': 10}, '4': {'start': 11, 'stop': 12}, '3': {'start': 15}}}
    >>> get_gene_positions(clone, 'stop', '5')
    10
    >>> get_gene_positions(clone, 'start', '3')
    15
    >>> get_gene_positions(clone, 'stop', '4')
    12
    >>> get_gene_positions(clone, 'start', '4')
    11

    >>> clone.d = {'seg': {'5': {'stop': 10}, '4': {'start': 11, 'stop': 12}, '4b': {'start': 13, 'stop': 14}, '3': {'start': 15}}}
    >>> get_gene_positions(clone, 'stop', '5')
    10
    >>> get_gene_positions(clone, 'start', '3')
    15
    >>> get_gene_positions(clone, 'stop', '4')
    12
    >>> get_gene_positions(clone, 'start', '4')
    11
    >>> get_gene_positions(clone, 'stop', '4b')
    14
    >>> get_gene_positions(clone, 'start', '4b')
    13
    '''
    if not clone.d.has_key('seg'):
        return None
    seg = clone.d['seg']
    if seg.has_key(gene_name+end):
        return seg[gene_name+end]
    if end == 'stop' and seg.has_key(gene_name+'end'):
        return seg[gene_name+'end']
    elif seg.has_key(gene_name) \
         and isinstance(seg[gene_name], dict) \
         and seg[gene_name].has_key(end):
        return seg[gene_name][end]
    else:
        return None

def get_vdj_positions(recombination_type, clone):
    '''
104 105 106
    Return the start and end positions of all the genes in order.
    Or None if the clone doesn't have information on gene positions.

107 108
    >>> class tmp: d = {};
    >>> clone = tmp()
109
    >>> clone.d = {'seg': {'5end': 10, '3start': 15}, 'sequence': 'ATTAAAAAAAAAAAAAAAAA'}
110 111 112
    >>> get_vdj_positions('VJ', clone)
    [1, 11, 16, 20]

113
    >>> clone.d = {'seg': {'5': {'stop': 10}, '3': {'start': 15}}, 'sequence': 'ATTAAAAAAAAAAAAAAAAA'}
114 115
    >>> get_vdj_positions('VJ', clone)
    [1, 11, 16, 20]
116 117 118 119
    >>> get_vdj_positions('VDJ', clone)
    [1, 11, 16, 20]
    >>> get_vdj_positions('VDDJ', clone)
    [1, 11, 16, 20]
120

121
    >>> clone.d = {'seg': {'5': {'stop': 10}, '4': {'start': 11, 'stop': 12}, '3': {'start': 15}}, 'sequence': 'ATTAAAAAAAAAAAAAAAAA'}
122 123 124
    >>> get_vdj_positions('VDJ', clone)
    [1, 11, 12, 13, 16, 20]

125
    >>> clone.d = {'seg': {'5': {'stop': 10}, '4': {'start': 11, 'stop': 12}, '4b': {'start': 13, 'stop': 14}, '3': {'start': 15}}, 'sequence': 'ATTAAAAAAAAAAAAAAAAA'}
126 127 128 129 130 131 132
    >>> get_vdj_positions('VDDJ', clone)
    [1, 11, 12, 13, 14, 15, 16, 20]
    >>> get_vdj_positions('VDJ', clone)
    [1, 11, 12, 13, 16, 20]
    >>> get_vdj_positions('VJ', clone)
    [1, 11, 16, 20]

133 134
    >>> clone.d = {'seg': {'foo': 'bar'}}
    >>> get_vdj_positions('VDJ', clone)
135 136 137 138 139 140 141 142 143 144

    >>> clone.d = {'seg': {'5': 'IGHV', '3': 'IGHJ', '5end': 20}}
    >>> get_vdj_positions('VJ', clone)

    >>> clone.d = {'seg': {'5': 'IGHV', '3': 'IGHJ', '3start': 20}}
    >>> get_vdj_positions('VJ', clone)

    >>> clone.d = {'seg': {'5': 'IGHV', '3': 'IGHJ'}}
    >>> get_vdj_positions('VJ', clone)

145 146 147 148 149
    '''
    positions = [1]
    if not clone.d.has_key('seg'):
        return None
    seg = clone.d['seg']
150
    gene_pos_stop_5 = get_gene_positions(clone, 'stop', '5')
151 152
    gene_pos_start_3 = get_gene_positions(clone, 'start', '3')
    if gene_pos_stop_5 is None or gene_pos_start_3 is None:
153 154
        return None
    positions.append(gene_pos_stop_5+1)
155 156 157 158 159 160 161 162 163
    recombination_type = recombination_type[1:-1]
    i = 0
    for d in recombination_type:
        if len(recombination_type) == 1:
            d_name = '4'
        else:
            d_name = '4'+chr(ord('a')+i)
        if not seg.has_key(d_name) and d_name == '4a':
            d_name = '4'
164 165 166 167 168 169
        gene_pos_start_4 = get_gene_positions(clone, 'start', d_name)
        gene_pos_stop_4 = get_gene_positions(clone, 'stop', d_name)
        if gene_pos_start_4 is not None and gene_pos_stop_4 is not None:
            positions.append(gene_pos_start_4+1)
            positions.append(gene_pos_stop_4+1)
            i+=1
170
    positions.append(gene_pos_start_3+1)
171 172
    if (clone.d.has_key('sequence')):
        positions.append(len(clone.d['sequence']))
173 174 175 176 177 178
    else:
        # Arbitrary end
        positions.append(positions[-1] + 50)
    return positions


179
def write_fuse_to_fasta(data, outfile, used_names, current_filename, options, metadata=''):
180 181 182 183 184 185 186
    '''
    Write the top clones (if specified in options.top)
    in the fasta file opened in the outfile.
    used_names is a dictionary that lists the sequence names
    used so far as well as their number of occurrences (prevents
    having sequences with the same name)
    '''
187
    clones_percentage = {}
188 189 190 191

    if options.top < MAX_TOP:
        data.filter(data.getTop(options.top))

192 193 194 195 196
    if options.no_header_whitespace:
        spacer = REPLACEMENT_WHITESPACE
    else:
        spacer = ' '

197 198
    for clone in data:
        if clone.d.has_key('sequence') and isinstance(clone.d['sequence'], basestring)\
199 200 201
        and len(clone.d['sequence']) > 0 and clone.d.has_key('seg'):
            recombination = get_recombination_type(clone)
            name = recombination+spacer
202
            positions = get_vdj_positions(recombination, clone)
203 204
            if positions is None:
                continue
205
            name += spacer.join(map(str, positions))+spacer
206
            if not clone.d.has_key('name'):
207
                name += "Anonymous"
208
            else:
209
                name += clone.d['name'].replace(' ', spacer)
210
            additional_header_info = []
211 212 213 214 215 216 217

            #Percentage
            #take the max reads number of the samples (in case of multiple
            #samples)
            max_sample = max(clone.d['reads'])
            #take the index corresponding to the max_sample
            index_max_sample = clone.d['reads'].index(max_sample)
218
            reads_total_nb = data.d['reads'].d['segmented'][index_max_sample]
219 220
            percentage = float(max_sample)/reads_total_nb

221 222 223 224 225 226 227 228 229 230
            if name in used_names:
                used_names[name] += 1
                additional_header_info.append(str(used_names[name]))
            else:
                used_names[name] = 1

            if options.germline:
                additional_header_info.append('germline=%s'%clone.d['germline'])
            if len(options.sample_name) > 0:
                sample_name = eval(options.sample_name)
231 232 233
            else:
                sample_name = current_filename
            additional_header_info.append('sample_name=%s' % sample_name)
234

235 236
            additional_header_info.append('percentage=%s'%percentage)

237 238 239
            if len(metadata) > 0:
                additional_header_info.append(metadata.replace(' ', spacer))

240
            if len(additional_header_info) > 0:
241
                additional_header_info = spacer+'#'+spacer+spacer.join(additional_header_info)
242 243 244 245 246 247 248
            else:
                additional_header_info = ''
            print_fasta_header('%s%s' % (name, additional_header_info),\
                               outfile, options)
            outfile.write(clone.d['sequence']+"\n")


249
def process_files(args):
250 251 252
    outfile = open(args.output, 'w')

    used_names = {}
253 254 255 256 257 258
    current_file = 0

    if len(args.metadata) != len(args.file):
        # Not of the same length: ignore metadata
        args.metadata = ['' for i in range(len(args.file))]

259
    for vidjil in args.file:
260 261 262 263 264 265
        try:
            data = fuse.ListWindows()
            data.load(vidjil, "")
        except Exception:
            print "** Warning ** file %s could not be loaded" % vidjil
        else:
266 267
            write_fuse_to_fasta(data, outfile, used_names, vidjil, args, args.metadata[current_file])
        current_file += 1
268 269

    outfile.close()
270 271 272 273 274 275 276 277

parser = argparse.ArgumentParser(description = 'Converts one or several .vidjil file to a .fasta file. The resulting .fasta file can notably be indexed',
                                     epilog = 'Example: python %(prog)s -o output.fasta in1.vidjil in2.vidjil in3.vidjil')

parser.add_argument('--top', '-t', type=int, default=MAX_TOP, help = 'Keep only the top most clones. By default keep all the clones for which we have enough information.')
parser.add_argument('--no-header-whitespace', '-w', action='store_true', help='Replace all whitespaces in the fasta header with '+REPLACEMENT_WHITESPACE)
parser.add_argument('--output', '-o', help='Name of the output FASTA file [REQUIRED]', required=True)
parser.add_argument('--sample-name', '-n', default='', help = 'Provide the sample name in the fasta header. Some Python code can be provided as soon as it returns a string')
278
parser.add_argument('--metadata', '-d', default = [], action='append', help = 'Provide metadata for each file. The option must be called each time for each file, in the same order as the files are given')
279 280 281
parser.add_argument('--germline', '-g', action='store_true', help = 'When set, provide the germline of the sequence in the additional header informations')
parser.add_argument('file', nargs='+', help='Input (.vidjil/.clntab) files')

282 283 284 285
if __name__ == '__main__':
    args = parser.parse_args()

    process_files(args)