Attention une mise à jour du service Gitlab va être effectuée le mardi 30 novembre entre 17h30 et 18h00. Cette mise à jour va générer une interruption du service dont nous ne maîtrisons pas complètement la durée mais qui ne devrait pas excéder quelques minutes. Cette mise à jour intermédiaire en version 14.0.12 nous permettra de rapidement pouvoir mettre à votre disposition une version plus récente.

Commit c15efbd6 authored by Marc Duez's avatar Marc Duez
Browse files
parents dd249f3b 40c08830
......@@ -31,6 +31,14 @@
#include "../lib/gzstream.h"
// http://stackoverflow.com/a/5840160/4475279
unsigned long long filesize(const char* filename)
{
std::ifstream in(filename, std::ifstream::ate | std::ifstream::binary);
return in.tellg();
}
void Fasta::init(int extract_field, string extract_separator)
{
this -> extract_field = extract_field ;
......@@ -133,11 +141,16 @@ OnlineFasta::~OnlineFasta() {
}
void OnlineFasta::init() {
char_nb = 0;
line_nb = 0;
line = getInterestingLine();
current.seq = NULL;
}
unsigned long long OnlineFasta::getPos() {
return char_nb;
}
size_t OnlineFasta::getLineNb() {
return line_nb;
}
......@@ -226,6 +239,7 @@ string OnlineFasta::getInterestingLine(int state) {
string line;
while (line.length() == 0 && hasNext() && getline(*input, line)) {
line_nb++;
char_nb += line.length() + 1;
remove_trailing_whitespaces(line);
if (line.length() && line[0] == '#' && state != FASTX_FASTQ_SEP)
......@@ -274,8 +288,11 @@ ostream &operator<<(ostream &out, const Sequence &seq) {
return out;
}
int nb_sequences_in_fasta(string f)
int nb_sequences_in_fasta(string f, bool approx)
{
if (approx)
return approx_nb_sequences_in_fasta(f);
OnlineFasta *sequences = new OnlineFasta(f, 1, " ");
int nb_sequences = 0 ;
......@@ -288,3 +305,33 @@ int nb_sequences_in_fasta(string f)
cout << " ==> " << nb_sequences << " sequences" << endl;
return nb_sequences ;
}
#define SAMPLE_APPROX_NB_SEQUENCES 200
int approx_nb_sequences_in_fasta(string f)
{
OnlineFasta *sequences = new OnlineFasta(f, 1, " ");
int nb_sequences = 0 ;
while (nb_sequences < SAMPLE_APPROX_NB_SEQUENCES && sequences->hasNext())
{
sequences->next();
nb_sequences++ ;
}
cout << " ==> " ;
if (sequences->hasNext())
{
cout << "approx. " ;
float ratio = (float) filesize(f.c_str()) / (float) sequences->getPos();
nb_sequences = (int) (ratio * nb_sequences);
}
cout << nb_sequences << " sequences" << endl;
delete sequences ;
return nb_sequences ;
}
......@@ -27,6 +27,8 @@ typedef enum {
#include "tools.h"
unsigned long long filesize(const char* filename);
class Fasta
{
void init(int extract_field, string extract_separator);
......@@ -91,6 +93,7 @@ class OnlineFasta {
string line;
bool input_allocated;
size_t line_nb;
unsigned long long char_nb;
public:
......@@ -113,7 +116,12 @@ class OnlineFasta {
int extract_field=0, string extract_separator="|");
~OnlineFasta();
/**
* @return the position in the file
*/
unsigned long long getPos();
/**
* @return the current line number
*/
......@@ -166,6 +174,7 @@ ostream &operator<<(ostream &out, const Sequence &seq);
* Count the number of sequences in a Fasta file
* @return the number of sequences
*/
int nb_sequences_in_fasta(string f);
int nb_sequences_in_fasta(string f, bool approx = false);
int approx_nb_sequences_in_fasta(string f);
#endif
......@@ -79,6 +79,10 @@ bool Segmenter::isDSegmented() const {
return dSegmented;
}
bool KmerSegmenter::isDetected() const {
return detected;
}
// Chevauchement
string Segmenter::removeChevauchement()
......@@ -241,9 +245,18 @@ KmerSegmenter::KmerSegmenter(Sequence seq, Germline *germline)
CountKmerAffectAnalyser ckaa(*(germline->index), sequence);
pair <KmerAffect, KmerAffect> max12 = ckaa.sortLeftRight(ckaa.max12(forbidden));
if (max12.first.isUnknown() || max12.second.isUnknown())
{
because = UNSEG_TOO_FEW_ZERO ;
return ;
}
strand = nb_strand[0] > nb_strand[1] ? -1 : 1 ;
computeSegmentation(strand, max12.first, max12.second);
if (!detected)
because = UNSEG_TOO_FEW_ZERO ;
// The pseudo-germline should never take precedence over the regular germlines
score = 1 ;
}
......@@ -313,7 +326,8 @@ KmerSegmenter::~KmerSegmenter() {
delete kaa;
}
KmerMultiSegmenter::KmerMultiSegmenter(Sequence seq, MultiGermline *multigermline, ostream *out_unsegmented, double threshold)
KmerMultiSegmenter::KmerMultiSegmenter(Sequence seq, MultiGermline *multigermline, ostream *out_unsegmented,
double threshold, int nb_reads_for_evalue)
{
int best_score_seg = 0 ; // Best score, segmented sequences
int best_score_unseg = 0 ; // Best score, unsegmented sequences
......@@ -383,9 +397,9 @@ KmerMultiSegmenter::KmerMultiSegmenter(Sequence seq, MultiGermline *multigermlin
if (threshold_nb_expected > NO_LIMIT_VALUE)
if (the_kseg->isSegmented()) {
// the_kseg->evalue also depends on the number of germlines from the *Multi*KmerSegmenter
the_kseg->evalue = getNbExpected();
the_kseg->evalue = getNbExpected(nb_reads_for_evalue);
pair <double, double> p = getNbExpectedLeftRight();
pair <double, double> p = getNbExpectedLeftRight(nb_reads_for_evalue);
the_kseg->evalue_left = p.first;
the_kseg->evalue_right = p.second;
......@@ -401,14 +415,14 @@ KmerMultiSegmenter::KmerMultiSegmenter(Sequence seq, MultiGermline *multigermlin
}
}
double KmerMultiSegmenter::getNbExpected() const {
pair <double, double> p = getNbExpectedLeftRight();
double KmerMultiSegmenter::getNbExpected(int multiplier) const {
pair <double, double> p = getNbExpectedLeftRight(multiplier);
return (p.first + p.second);
}
pair<double,double> KmerMultiSegmenter::getNbExpectedLeftRight() const {
pair<double,double> KmerMultiSegmenter::getNbExpectedLeftRight(int multiplier) const {
pair <double, double> p = the_kseg->getKmerAffectAnalyser()->getLeftRightProbabilityAtLeastOrAbove();
return pair<double, double>(p.first * multi_germline->germlines.size(), p.second * multi_germline->germlines.size());
return pair<double, double>(p.first * multiplier * multi_germline->germlines.size(), p.second * multiplier * multi_germline->germlines.size());
}
KmerMultiSegmenter::~KmerMultiSegmenter() {
......@@ -425,12 +439,10 @@ void KmerSegmenter::computeSegmentation(int strand, KmerAffect before, KmerAffec
max = kaa->getMaximum(before, after);
// We labeled it detected if there were both enough affect_5 and enough affect_3
bool detected = (max.nb_before_left + max.nb_before_right >= DETECT_THRESHOLD)
detected = (max.nb_before_left + max.nb_before_right >= DETECT_THRESHOLD)
&& (max.nb_after_left + max.nb_after_right >= DETECT_THRESHOLD);
if (! max.max_found) {
// ----------------------------
// This code is only here to set the UNSEG cause. It could be streamlined.
if ((strand == 1 && max.nb_before_left == 0)
|| (strand == -1 && max.nb_after_right == 0))
because = detected ? UNSEG_AMBIGUOUS : UNSEG_TOO_FEW_V ;
......@@ -440,7 +452,6 @@ void KmerSegmenter::computeSegmentation(int strand, KmerAffect before, KmerAffec
because = detected ? UNSEG_AMBIGUOUS : UNSEG_TOO_FEW_J ;
} else
because = UNSEG_AMBIGUOUS;
// ----------------------------
} else {
Vend = max.first_pos_max;
Jstart = max.last_pos_max + 1;
......
......@@ -169,6 +169,7 @@ class KmerSegmenter : public Segmenter
string affects;
public:
bool isDetected() const;
int score;
KmerSegmenter();
......@@ -206,18 +207,18 @@ class KmerMultiSegmenter
* @param threshold: threshold of randomly expected segmentation
*/
KmerMultiSegmenter(Sequence seq, MultiGermline *multigermline, ostream *out_unsegmented,
double threshold = THRESHOLD_NB_EXPECTED);
double threshold = THRESHOLD_NB_EXPECTED, int nb_reads_for_evalue = 1);
/**
* @return expected number of Segmenter that would have yield the maximum score by chance
*/
double getNbExpected() const;
double getNbExpected(int multiplier) const;
/**
* @return expected number of Segmenter that would have yield the maximum score by chance
* on the left part of the read and on the right part of the read respectively.
*/
pair<double,double> getNbExpectedLeftRight() const;
pair<double,double> getNbExpectedLeftRight(int multiplier) const;
~KmerMultiSegmenter();
......
......@@ -11,7 +11,7 @@ WindowsStorage *WindowExtractor::extract(OnlineFasta *reads, MultiGermline *mult
size_t w,
map<string, string> &windows_labels,
int stop_after, int only_nth_read, bool keep_unsegmented_as_clone,
double nb_expected) {
double nb_expected, int nb_reads_for_evalue) {
init_stats();
WindowsStorage *windowsStorage = new WindowsStorage(windows_labels);
......@@ -38,7 +38,7 @@ WindowsStorage *WindowExtractor::extract(OnlineFasta *reads, MultiGermline *mult
*out_affects << reads->getSequence();
}
KmerMultiSegmenter kmseg(reads->getSequence(), multigermline, out_affects, nb_expected);
KmerMultiSegmenter kmseg(reads->getSequence(), multigermline, out_affects, nb_expected, nb_reads_for_evalue);
KmerSegmenter *seg = kmseg.the_kseg ;
int read_length = seg->getSequence().sequence.length();
......
......@@ -40,6 +40,7 @@ class WindowExtractor {
* @param w: length of the window
* @param windows_labels: Windows that must be kept and registered as such.
* @param nb_expected: maximal e-value of the segmentation
* @param nb_reads_for_evalue: number of reads, used for e-value computation. Can be approximate or faked.
* @return a pointer to a WindowsStorage that will contain all the windows.
* It is a pointer so that the WindowsStorage is not duplicated.
* @post Statistics on segmentation will be provided through the getSegmentationStats() methods
......@@ -49,7 +50,7 @@ class WindowExtractor {
size_t w,
map<string, string> &windows_labels,
int stop_after=-1, int only_nth_reads=1, bool keep_unsegmented_as_clone=false,
double nb_expected = THRESHOLD_NB_EXPECTED);
double nb_expected = THRESHOLD_NB_EXPECTED, int nb_reads_for_evalue = 1);
/**
* @return the average length of sequences whose segmentation has been classified as seg
......
......@@ -26,6 +26,18 @@ void testOnlineFasta1() {
TAP_TEST(nb_seq == 5, TEST_O_FASTA_HAS_NEXT, "");
}
void testFastaNbSequences() {
TAP_TEST(nb_sequences_in_fasta("../../germline/IGHV.fa") == 344, TEST_FASTA_NB_SEQUENCES, "ccc");
int a1 = approx_nb_sequences_in_fasta("../../germline/IGHV.fa");
TAP_TEST(a1 >= 340 && a1 <= 348, TEST_FASTA_NB_SEQUENCES, "");
int a2 = nb_sequences_in_fasta("../../data/Stanford_S22.fasta", true);
TAP_TEST(a2 >= 13100 && a2 <= 13200, TEST_FASTA_NB_SEQUENCES, "");
}
void testFasta1() {
Fasta fa("../../data/test1.fa");
Fasta fq("../../data/test1.fq");
......@@ -284,6 +296,7 @@ void testNChooseK() {
void testTools() {
testOnlineFasta1();
testFastaNbSequences();
testFasta1();
testFastaAdd();
testFastaAddThrows();
......
......@@ -14,6 +14,7 @@ enum {
TEST_FASTA_ADD,
TEST_FASTA_INVALID_FILE,
TEST_FASTA_OUT,
TEST_FASTA_NB_SEQUENCES,
TEST_CREATE_SEQUENCE_LABEL_FULL,
TEST_CREATE_SEQUENCE_LABEL,
TEST_CREATE_SEQUENCE_SEQUENCE,
......@@ -167,6 +168,7 @@ inline void declare_tests() {
RECORD_TAP_TEST(TEST_FASTA_ADD, "Fasta add() method");
RECORD_TAP_TEST(TEST_FASTA_INVALID_FILE, "Fasta with invalid file");
RECORD_TAP_TEST(TEST_FASTA_OUT, "Test operator<< with Fasta");
RECORD_TAP_TEST(TEST_FASTA_NB_SEQUENCES, "Nb sequences in Fasta");
RECORD_TAP_TEST(TEST_CREATE_SEQUENCE_LABEL_FULL, "create_sequence: label_full field");
RECORD_TAP_TEST(TEST_CREATE_SEQUENCE_LABEL, "create_sequence: label field");
RECORD_TAP_TEST(TEST_CREATE_SEQUENCE_SEQUENCE, "create_sequence: sequence field");
......
#include <core/tools.h>
#include <iostream>
#include <cassert>
#include <core/fasta.h>
#include <core/kmerstore.h>
#include <core/kmeraffect.h>
#include <core/affectanalyser.h>
#include <core/segment.h>
#include "tests.h"
using namespace std;
const string seq[] = {"CCCCCCCCCCCCCCCCCCCC", "C lots of",
"GGGGGGGGGGGGGGGGGGGG", "G lots of",
"AAAAAAAAAAAAAAAAAAAA", "A lots of",
"TTTTTTTTTTTTTTTTTTTT", "T lots of",
"TTTTATTTTATTTTATTTTA", "U : TTTA lots of",
"AAAACAAAACAAAACAAAAC", "V : AAAC lots of"};
const int nb_seq = 6;
template<typename Index>
Index *createIndex(int k, bool revcomp) {
Index *index = new Index(k, revcomp);
for (int i=0; i < nb_seq; i++)
index->insert(seq[2*i], seq[2*i+1]);
return index;
}
void testFasta1() {
Fasta fa("data/test1.fa");
Fasta fq("data/test1.fq");
TAP_TEST(fa.size() == fq.size(), TEST_FASTA_SIZE, "");
for (int i=0; i < fa.size(); i++) {
TAP_TEST(fa.label(i) == fq.label(i), TEST_FASTA_LABEL, "");
TAP_TEST(fa.label_full(i) == fq.label_full(i), TEST_FASTA_LABEL_FULL, "");
TAP_TEST(fa.sequence(i) == fq.sequence(i), TEST_FASTA_SEQUENCE, "");
}
}
template<template <class> class T>
void testKmerStoreWithKmer(int k, bool revcomp, int test_id ) {
T<Kmer> *index = createIndex<T<Kmer> >(k, revcomp);
for (int i = 0; i < nb_seq-2; i++) {
string tmp = seq[2*i].substr(0, k);
if (revcomp) {
TAP_TEST((*index)[tmp].count == (seq[2*i].length()-k+1)*2, test_id, "");
} else {
TAP_TEST((*index)[tmp].count == (seq[2*i].length()-k+1), test_id, "");
}
}
delete index;
}
template<template <class> class T, class KAffect>
void testAffectAnalyser1() {
const int k = 4;
const bool revcomp = false;
T<KAffect> *index = createIndex<T<KAffect> >(k, revcomp);
KmerAffectAnalyser<KAffect> kaa(*index, "AAAACCCCCGGGGG");
for (int i = 2; i < nb_seq-1; i++) {
KAffect current_affect("", seq[2*i+1], 1);
TAP_TEST(kaa.count(current_affect) == 0, TEST_AA_COUNT, "");
TAP_TEST(kaa.first(current_affect) == (int)string::npos, TEST_AA_FIRST, "");
TAP_TEST(kaa.last(current_affect) == (int)string::npos, TEST_AA_LAST, "");
}
for (int i = 0; i < 2; i++) {
KAffect current_affect("", seq[2*i+1], 1);
TAP_TEST(kaa.count(current_affect) == 2, TEST_AA_COUNT, "");
TAP_TEST(kaa.getAffectation(kaa.first(current_affect)) == current_affect, TEST_AA_GET_AFFECT, "");
TAP_TEST(kaa.getAffectation(kaa.last(current_affect)) == current_affect, TEST_AA_GET_AFFECT, "");
}
TAP_TEST(kaa.count(KAffect("", seq[2*(nb_seq-1)+1], 1)) == 1, TEST_AA_COUNT, "");
TAP_TEST((kaa.first(KAffect("", seq[2*(nb_seq-1)+1], 1))
== kaa.last(KAffect("", seq[2*(nb_seq-1)+1], 1)))
== 1, TEST_AA_FIRST, "");
TAP_TEST(kaa.getAffectation(3).isUnknown(), TEST_AA_PREDICATES, "");
TAP_TEST(kaa.getAffectation(8).isUnknown(), TEST_AA_PREDICATES, "");
TAP_TEST(kaa.getAffectation(0).isAmbiguous(), TEST_AA_PREDICATES, "");
TAP_TEST(kaa.getDistinctAffectations().size() == 5, TEST_AA_GET_DISTINCT_AFFECT, "");
delete index;
}
template<template <class> class T, class KAffect>
void testAffectAnalyser2() {
const int k = 5;
const bool revcomp = true;
T<KAffect> *index = createIndex<T<KAffect> >(k, revcomp);
KmerAffectAnalyser<KAffect> kaa(*index, "TTTTTGGGGG");
TAP_TEST(kaa.getAffectation(1) == KAffect("", seq[2*(nb_seq-1)+1], -1), TEST_AA_GET_AFFECT, "");
TAP_TEST(kaa.count(kaa.getAffectation(1)) == 1, TEST_AA_GET_AFFECT, "");
TAP_TEST(kaa.getAffectation(0) == kaa.getAffectation(10 - k), TEST_AA_GET_AFFECT, "");
for (int i = 2; i < 10 - k; i++)
TAP_TEST(kaa.getAffectation(i).isUnknown(), TEST_AA_PREDICATES, "");
TAP_TEST(kaa.getDistinctAffectations().size() == 3, TEST_AA_GET_DISTINCT_AFFECT, "");
delete index;
}
void testSegmentationBug1(int delta_min, int delta_max) {
string buggy_sequences = "bugs/kmersegment.fa";
int k = 14;
bool rc = true;
Fasta seqV("data/Repertoire/TRGV.fa");
Fasta seqJ("data/Repertoire/TRGJ.fa");
IKmerStore<KmerAffect> *index = new ArrayKmerStore<KmerAffect>(k, rc);
index->insert(seqV, "V");
index->insert(seqJ, "J");
OnlineFasta input(buggy_sequences);
while (input.hasNext()) {
input.next();
KmerAffectAnalyser<KmerAffect> *kaa = new KmerAffectAnalyser<KmerAffect>(*index, input.getSequence().sequence);
set<KmerAffect> distinct_a = kaa->getDistinctAffectations();
int strand = 0;
for (set<KmerAffect>::iterator it = distinct_a.begin();
it != distinct_a.end() && strand != 2; it++) {
if (! it->isAmbiguous() && ! it->isUnknown()) {
if (strand == 0)
strand = affect_strand(it->affect);
else if ((strand == 1 && affect_strand(it->affect) == -1)
|| (strand == -1 && affect_strand(it->affect) == 1))
strand = 2;
}
}
int stats[STATS_SIZE];
Segmenter *segment = new KmerSegmenter(input.getSequence(), index,
delta_min, delta_max, stats);
if (strand == 2
|| (strand == 1
&& (kaa->last(AFFECT_V) == (int)string::npos
|| kaa->first(AFFECT_J) == (int) string::npos))
|| (strand == -1
&& (kaa->first(AFFECT_V) == (int)string::npos
|| kaa->last(AFFECT_J) == (int) string::npos)))
TAP_TEST(! segment->isSegmented(), TEST_BUG_SEGMENTATION, "");
delete segment;
delete kaa;
}
delete index;
}
int main(void) {
TAP_START(NB_TESTS);
declare_tests();
TAP_TEST(complement("AATCAGactgactagATCGAn") == "TTAGTCTGACTGATCTAGCTN", TEST_REVCOMP, "");
TAP_TEST(revcomp("AATCAGactgactagATCGAn") == "NTCGATCTAGTCAGTCTGATT", TEST_REVCOMP, "");
TAP_TEST(revcomp("") == "", TEST_REVCOMP, "");
TAP_TEST(revcomp("aaaaaa") == "TTTTTT", TEST_REVCOMP, "");
testFasta1();
testKmerStoreWithKmer<ArrayKmerStore>(5, false, TEST_ARRAY_KMERSTORE);
testKmerStoreWithKmer<ArrayKmerStore>(5, true, TEST_ARRAY_KMERSTORE_RC);
testKmerStoreWithKmer<MapKmerStore>(5, false, TEST_MAP_KMERSTORE);
testKmerStoreWithKmer<MapKmerStore>(5, true, TEST_MAP_KMERSTORE_RC);
testAffectAnalyser1<ArrayKmerStore,KmerAffect>();
testAffectAnalyser2<ArrayKmerStore,KmerAffect>();
testAffectAnalyser1<ArrayKmerStore,KmerStringAffect>();
testAffectAnalyser2<ArrayKmerStore,KmerStringAffect>();
// Bug with one sequence on segmentation
testSegmentationBug1(-10, 15);
TAP_END_TEST_EXIT
}
......@@ -811,6 +811,10 @@ int main (int argc, char **argv)
cout << "Germlines loaded" << endl ;
cout << *multigermline ;
cout << endl ;
// Number of reads for e-value computation
int nb_reads_for_evalue = (expected_value > NO_LIMIT_VALUE) ? nb_sequences_in_fasta(f_reads, true) : 1 ;
//////////////////////////////////
//$$ Read sequence files
......@@ -959,7 +963,8 @@ int main (int argc, char **argv)
we.setAffectsOutput(out_affects);
}
WindowsStorage *windowsStorage = we.extract(reads, multigermline, w, windows_labels, max_reads_processed, only_nth_read, keep_unsegmented_as_clone, expected_value);
WindowsStorage *windowsStorage = we.extract(reads, multigermline, w, windows_labels, max_reads_processed, only_nth_read, keep_unsegmented_as_clone,
expected_value, nb_reads_for_evalue);
windowsStorage->setIdToAll();
size_t nb_total_reads = we.getNbReads();
......@@ -1255,7 +1260,7 @@ int main (int argc, char **argv)
seg.toJsonList(&json_seg);
// Re-launch also a KmerMultiSegmenter, for control purposes (affectations, evalue)
KmerMultiSegmenter kmseg(seg.getSequence(), multigermline, 0, expected_value);
KmerMultiSegmenter kmseg(seg.getSequence(), multigermline, 0, expected_value, nb_reads_for_evalue);
KmerSegmenter *kseg = kmseg.the_kseg ;
kseg->toJsonList(&json_seg);
if (verbose)
......
......@@ -181,8 +181,10 @@ different clones. Setting =-w= to lower values is not recommended.
The =-e= option sets the maximal e-value accepted for segmenting a sequence.
If a segmentation has a higher e-value, it will not be segmented.
The e-value computation takes into account both the number of reads in the
input sequence and the number of locus searched for.
The default value is 'all', but the recommended value is to use something
like 1e-6 for datasets with a billion of reads.
like 1, 1e-2 or 1.
Further help on this point will come in next releases.
** Threshold on clone output
......
......@@ -24,7 +24,13 @@ def parse(fasta, endline=''):
if header or sequence:
yield (header, sequence)
def extract_field_if_exists(str, separator, field_number):
fields = str.split(separator)
if len(fields) > field_number:
return fields[field_number]
return str
def parse_as_Fasta(fasta):
for (header, sequence) in parse(fasta):
yield Fasta(header, sequence)
......@@ -38,11 +44,11 @@ class Fasta():
@property
def name(self):
return self.header.split('|')[1]
return extract_field_if_exists(self.header, '|', 1)
@property
def species(self):
return self.header.split('|')[2]
return extract_field_if_exists(self.header, '|', 2)
def __len__(self):
return len(self.seq)
......
......@@ -7,8 +7,8 @@ import random
random.seed(33328778554)
def recombine_VJ(seq5, remove5, N, remove3, seq3, code):
name = "%s %d/%s/%d %s %s " % (seq5.name, remove5, N, remove3, seq3.name, code)
def recombine_VJ(seq5, remove5, N, remove3, seq3):
name = "%s %d/%s/%d %s" % (seq5.name, remove5, N, remove3, seq3.name)
seq = seq5.seq[:len(seq5)-remove5] + '\n' + N + '\n' + seq3.seq[remove3:]
return fasta.Fasta(name, seq)
......@@ -17,7 +17,7 @@ def recombine_VJ(seq5, remove5, N, remove3, seq3, code):
def random_sequence(characters, length):
return ''.join([random.choice(characters) for x in range(length)])
def recombine_VJ_with_removes(seq5, remove5, Nlength, remove3, seq3, code):
def recombine_VJ_with_removes(seq5, remove5, Nlength, remove3, seq3):
'''Recombine V and J with a random N, ensuring that the bases in N are not the same that what was ultimately removed from V or J'''
assert(Nlength >= 1)
......@@ -29,36 +29,53 @@ def recombine_VJ_with_removes(seq5, remove5, Nlength, remove3, seq3, code):
if c in available:
available.remove(c)
return recombine_VJ(seq5, remove5, random_sequence(available, Nlength), remove3, seq3, code)
return recombine_VJ(seq5, remove5, random_sequence(available, Nlength), remove3, seq3)