fasta.py 1.97 KB
Newer Older
1

2
import sys
3

4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
COMPLEMENT_NUCLEOTIDE = {
    'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C',
    'Y': 'R', 'R': 'Y', # pyrimidine (CT) / purine (AG)
    'W': 'S', 'S': 'W', # weak (AT) / strong (GC)
    'K': 'M', 'M': 'K', #  keto (TG) / amino (AC)
    'B': 'V', 'V': 'B', 'D': 'H', 'H': 'D',
    'N': 'N'
}

def revcomp(seq):
    '''Returns the reverse complement of a sequence

    >>> revcomp('ACGNTT')
    'AANCGT'
    '''
    rc = ''
    for nucl in seq[::-1]:
21 22
        try:
            rc += COMPLEMENT_NUCLEOTIDE[nucl.upper()]
23
        except KeyError:
24 25
            sys.stderr.write("! Unknown nucleotide : '%s' " % nucl + seq)
            rc += 'N'
26 27
    return rc

28 29 30 31 32 33 34 35 36 37 38
def parse(fasta, endline=''):
    '''Iterates over sequences in a fasta files, yielding (header, sequence) pairs'''

    header = ''
    sequence = ''
    
    for l in fasta:
        l = l.strip()

        if not l:
            continue
39 40 41

        if l[0] == '#':
            continue
42 43 44 45 46 47 48 49 50 51 52 53
    
        if l[0] == '>':
            if header or sequence:
                yield (header, sequence)
            header = l[1:]
            sequence = ''

        else:
            sequence += l + endline
            
    if header or sequence:
        yield (header, sequence)
54

55 56
def extract_field_if_exists(s, separator, field_number):
    fields = s.split(separator)
57 58 59 60
    if len(fields) > field_number:
        return fields[field_number]
    return str

61 62 63 64 65 66 67 68 69 70 71
def parse_as_Fasta(fasta):
    for (header, sequence) in parse(fasta):
        yield Fasta(header, sequence)


class Fasta():

    def __init__(self, header, sequence):
        self.header = header
        self.seq = sequence

72 73 74
    def revcomp(self):
        self.seq = revcomp(self.seq)

75 76
    @property
    def name(self):
77
        return extract_field_if_exists(self.header, '|', 1)
78 79 80

    @property
    def species(self):
81
        return extract_field_if_exists(self.header, '|', 2)
82 83 84 85 86 87 88

    def __len__(self):
        return len(self.seq)
    
    def __str__(self):
        return '>%s\n%s\n' % (self.header, self.seq)