Commit 8170f55f authored by Mathieu Giraud's avatar Mathieu Giraud

vidjil.cpp, core/fasta, core/windowExtractor: -x/-X options work everywhere,...

vidjil.cpp, core/fasta, core/windowExtractor: -x/-X options work everywhere, especially with '-c segment'

The handling of these options was moved from WindowExtractor to OnlineFasta.
This push was sponsored by E. Macron.
parents 2e4da038 67f7b536
......@@ -107,14 +107,19 @@ const string& Fasta::sequence(int index) const{ return reads[index].sequence; }
// OnlineFasta
OnlineFasta::OnlineFasta(int extract_field, string extract_separator):
input(NULL), extract_field(extract_field), extract_separator(extract_separator){}
OnlineFasta::OnlineFasta(int extract_field, string extract_separator,
int nb_sequences_max, int only_nth_sequence):
input(NULL),
extract_field(extract_field), extract_separator(extract_separator),
nb_sequences_max(nb_sequences_max), only_nth_sequence(only_nth_sequence){}
OnlineFasta::OnlineFasta(const string &input,
int extract_field, string extract_separator)
int extract_field, string extract_separator,
int nb_sequences_max, int only_nth_sequence)
:input(new igzstream(input.c_str())),
extract_field(extract_field),
extract_separator(extract_separator)
extract_separator(extract_separator),
nb_sequences_max(nb_sequences_max), only_nth_sequence(only_nth_sequence)
{
if (this->input->fail()) {
delete this->input;
......@@ -127,9 +132,11 @@ OnlineFasta::OnlineFasta(const string &input,
}
OnlineFasta::OnlineFasta(istream &input,
int extract_field, string extract_separator)
int extract_field, string extract_separator,
int nb_sequences_max, int only_nth_sequence)
:input(&input), extract_field(extract_field),
extract_separator(extract_separator)
extract_separator(extract_separator),
nb_sequences_max(nb_sequences_max), only_nth_sequence(only_nth_sequence)
{
input_allocated = false;
init();
......@@ -143,6 +150,8 @@ OnlineFasta::~OnlineFasta() {
}
void OnlineFasta::init() {
nb_sequences_parsed = 0;
nb_sequences_returned = 0;
char_nb = 0;
line_nb = 0;
line = getInterestingLine();
......@@ -161,10 +170,30 @@ Sequence OnlineFasta::getSequence() {
return current;
}
bool OnlineFasta::hasNextData() {
return ((!input->eof()) || line.length() > 0);
}
bool OnlineFasta::hasNext() {
return (! input->eof()) || line.length() > 0;
return hasNextData()
&& ((nb_sequences_max == NO_LIMIT_VALUE) || (nb_sequences_returned < nb_sequences_max));
}
void OnlineFasta::skipToNthSequence() {
// Possibly skip some reads, when only_nth_sequence > 1
while (hasNextData())
if (nb_sequences_parsed % only_nth_sequence)
{
nb_sequences_returned--;
next();
continue ;
}
else
return ;
}
void OnlineFasta::next() {
fasta_state state = FASTX_UNINIT;
......@@ -178,7 +207,7 @@ void OnlineFasta::next() {
current.seq = NULL;
}
if (hasNext()) {
if (hasNextData()) {
switch(line[0]) {
case '>': state=FASTX_FASTA; break;
case '@': state=FASTX_FASTQ_ID; break;
......@@ -187,14 +216,16 @@ void OnlineFasta::next() {
}
// Identifier line
nb_sequences_parsed++;
nb_sequences_returned++;
current.label_full = line.substr(1);
current.label = extract_from_label(current.label_full, extract_field, extract_separator);
line = getInterestingLine();
while (hasNext() && ((state != FASTX_FASTA || line[0] != '>')
while (hasNextData() && ((state != FASTX_FASTA || line[0] != '>')
&& (state != FASTX_FASTQ_QUAL || line[0] != '@'))) {
if (hasNext()) {
if (hasNextData()) {
switch(state) {
case FASTX_FASTA: case FASTX_FASTQ_ID:
// Sequence
......@@ -237,11 +268,13 @@ void OnlineFasta::next() {
} else
unexpectedEOF();
skipToNthSequence();
}
string OnlineFasta::getInterestingLine(int state) {
string line;
while (line.length() == 0 && hasNext() && getline(*input, line)) {
while (line.length() == 0 && hasNextData() && getline(*input, line)) {
line_nb++;
char_nb += line.length() + 1;
remove_trailing_whitespaces(line);
......
......@@ -97,12 +97,18 @@ class OnlineFasta {
size_t line_nb;
unsigned long long char_nb;
int nb_sequences_parsed;
int nb_sequences_returned;
int nb_sequences_max;
int only_nth_sequence;
public:
/**
* Default constructor
*/
OnlineFasta(int extract_field=0, string extract_separator="|");
OnlineFasta(int extract_field=0, string extract_separator="|",
int nb_sequences_max=NO_LIMIT_VALUE, int only_nth_sequence=1);
/**
* Open the file. No sequence is read at first.
......@@ -112,10 +118,12 @@ class OnlineFasta {
* well-formed
*/
OnlineFasta(const string &input,
int extract_field=0, string extract_separator="|");
int extract_field=0, string extract_separator="|",
int nb_sequences_max=NO_LIMIT_VALUE, int only_nth_sequence=1);
OnlineFasta(istream &input,
int extract_field=0, string extract_separator="|");
int extract_field=0, string extract_separator="|",
int nb_sequences_max=NO_LIMIT_VALUE, int only_nth_sequence=1);
~OnlineFasta();
......@@ -138,6 +146,11 @@ class OnlineFasta {
/**
* @return true iff we did not reach yet the end of the file.
*/
bool hasNextData();
/**
* @return true iff we did not reach yet both the end of the file and the maximal number of returned sequences
*/
bool hasNext();
/**
......@@ -154,6 +167,11 @@ class OnlineFasta {
*/
void init();
/**
* Skip to the next sequence that is a multiple of 'only_nth_sequence'
*/
void skipToNthSequence();
/**
* Reads line in the input stream until we have a line with at least one
* non-whitespace character.
......
......@@ -29,7 +29,6 @@
#define JSON_REMEMBER_BEST 4 /* The number of V/D/J predictions to keep */
#define NO_LIMIT_VALUE -1
#define BAD_EVALUE 1e10
#define THRESHOLD_NB_EXPECTED 1.0 /* Threshold of the accepted expected value for number of found k-mers */
......
#ifndef TOOLS_H
#define TOOLS_H
#define NO_LIMIT_VALUE -1 // Value for 'all' on command-line options
#define MAX_SEED_SIZE 50 // Spaced seed buffer
#define FIRST_POS 0 // Numbering of the base pairs
......
......@@ -18,23 +18,17 @@ WindowExtractor::WindowExtractor(MultiGermline *multigermline): out_segmented(NU
WindowsStorage *WindowExtractor::extract(OnlineFasta *reads,
size_t w,
map<string, string> &windows_labels, bool only_labeled_windows,
int stop_after, int only_nth_read, bool keep_unsegmented_as_clone,
bool keep_unsegmented_as_clone,
double nb_expected, int nb_reads_for_evalue) {
init_stats();
WindowsStorage *windowsStorage = new WindowsStorage(windows_labels);
windowsStorage->setMaximalNbReadsPerWindow(max_reads_per_window);
int nb_reads_all = 0;
unsigned long long int bp_total = 0;
while (reads->hasNext() && (int) nb_reads != stop_after) {
while (reads->hasNext()) {
reads->next();
nb_reads_all++;
if (nb_reads_all % only_nth_read)
continue ;
nb_reads++;
if (out_affects) {
......
......@@ -56,7 +56,7 @@ class WindowExtractor {
WindowsStorage *extract(OnlineFasta *reads,
size_t w,
map<string, string> &windows_labels, bool only_labeled_windows=false,
int stop_after=-1, int only_nth_reads=1, bool keep_unsegmented_as_clone=false,
bool keep_unsegmented_as_clone=false,
double nb_expected = THRESHOLD_NB_EXPECTED, int nb_reads_for_evalue = 1);
/**
......
!LAUNCH: ../../vidjil -c segment -x 2 -G ../../germline/IGH ../../data/Stanford_S22.fasta
$ Segments the good number of sequences in Stanford S22
2: >lcl
......@@ -26,6 +26,26 @@ void testOnlineFasta1() {
TAP_TEST(nb_seq == 5, TEST_O_FASTA_HAS_NEXT, "");
}
void testOnlineFastaMaxNth() {
OnlineFasta fa("../../data/test1.fa", 0, "|", 2, 2);
// First sequence is 'seq2', because only_nth_sequence = 2
TAP_TEST(fa.hasNext(), TEST_O_FASTA_HAS_NEXT, "");
fa.next();
Sequence s2 = fa.getSequence();
TAP_TEST(s2.label == "seq2", TEST_O_FASTA_GET_SEQUENCE, "Expected seq2, found " + s2.label);
// Second sequence is 'seq4', because only_nth_sequence = 2
TAP_TEST(fa.hasNext(), TEST_O_FASTA_HAS_NEXT, "");
fa.next();
Sequence s4 = fa.getSequence();
TAP_TEST(s4.label == "seq4", TEST_O_FASTA_GET_SEQUENCE, "Expected seq4, found " + s4.label);
// No more sequences, because nb_sequences_max = 2
TAP_TEST(!fa.hasNext(), TEST_O_FASTA_HAS_NEXT, "Expected (pseudo) end of file");
}
void testFastaNbSequences() {
TAP_TEST(nb_sequences_in_fasta("../../germline/IGHV.fa") == 348, TEST_FASTA_NB_SEQUENCES, "ccc");
......@@ -304,6 +324,7 @@ void testNChooseK() {
void testTools() {
testOnlineFasta1();
testOnlineFastaMaxNth();
testFastaNbSequences();
testFasta1();
testFastaAdd();
......
......@@ -337,8 +337,8 @@ int main (int argc, char **argv)
float ratio_reads_clone = DEFAULT_RATIO_READS_CLONE;
// int average_deletion = 4; // Average number of deletion in V or J
int max_reads_processed = -1;
int max_reads_processed_sample = -1;
int max_reads_processed = NO_LIMIT_VALUE;
int max_reads_processed_sample = NO_LIMIT_VALUE;
float ratio_representative = DEFAULT_RATIO_REPRESENTATIVE;
unsigned int max_auditionned = DEFAULT_MAX_AUDITIONED;
......@@ -879,11 +879,19 @@ int main (int argc, char **argv)
//////////////////////////////////
//$$ Read sequence files
int only_nth_read = 1 ;
if (max_reads_processed_sample != NO_LIMIT_VALUE)
{
only_nth_read = nb_sequences_in_fasta(f_reads) / max_reads_processed_sample;
max_reads_processed = max_reads_processed_sample ;
cout << "Processing every " << only_nth_read << "th read" << endl ;
}
OnlineFasta *reads;
try {
reads = new OnlineFasta(f_reads, 1, read_header_separator);
reads = new OnlineFasta(f_reads, 1, read_header_separator, max_reads_processed, only_nth_read);
} catch (const invalid_argument e) {
cerr << ERROR_STRING << "Vidjil cannot open reads file " << f_reads << ": " << e.what() << endl;
exit(1);
......@@ -989,14 +997,6 @@ int main (int argc, char **argv)
//////////////////////////////////
//$$ Kmer Segmentation
int only_nth_read = 1 ;
if (max_reads_processed_sample > 0)
{
only_nth_read = nb_sequences_in_fasta(f_reads) / max_reads_processed_sample;
max_reads_processed = max_reads_processed_sample ;
cout << "Processing every " << only_nth_read << "th read" << endl ;
}
cout << endl;
cout << "Loop through reads, looking for windows" << endl ;
......@@ -1031,7 +1031,7 @@ int main (int argc, char **argv)
WindowsStorage *windowsStorage = we.extract(reads, w,
windows_labels, only_labeled_windows,
max_reads_processed, only_nth_read, keep_unsegmented_as_clone,
keep_unsegmented_as_clone,
expected_value, nb_reads_for_evalue);
windowsStorage->setIdToAll();
size_t nb_total_reads = we.getNbReads();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment