Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 0673ada5 authored by eu_susan's avatar eu_susan
Browse files

New Util.*pp: printing method added

New Definitions.hpp: PosType struct
parent da2deb96
No related branches found
No related tags found
No related merge requests found
......@@ -14,7 +14,7 @@ using namespace std;
static const unsigned int ST_ARM = 25; // stem arm length
static const unsigned int STL_MIN = 5; // stem loop minimum length
static const unsigned int STL_MAX = 20; // stem loop maximun length
static const float LIMIT = 0.0; // free energy threshold
static const float LIMIT = -20.6; //0.0; // free energy threshold
static const unsigned int W = 2; // word size
static const unsigned int LIMIT_int3ini = 30;
......@@ -39,6 +39,17 @@ static const int T = 3;
static const unsigned int X = 4; // alphabet size
// =============================================================================
// Metainfo for the positions of the pre-miRNA in the genome (auto set threshold)
struct PosType
{
string sequenceId;
int begin;
int end;
string strand;
};
// =============================================================================
// Features of the alignment path
......@@ -52,7 +63,7 @@ struct PairType
// =============================================================================
// Optimisation parameters
static const unsigned int DW = 6; // diagonal width at first line ( max diagonal during alignment = 2*DW-1 )
static const unsigned int DW = 5; // diagonal width at first line ( max diagonal during alignment = 2*DW-1 )
static const unsigned int tbl = 4; // t-blocks length //MODIF
// =============================================================================
......
// =============================================================================
/// @file Util.cpp
#include "Definitions.hpp"
#include "Options.hpp"
#include "Util.hpp"
#include "kseq.h"
#include <fstream>
#include<stack>
#include<fstream>
KSEQ_INIT(gzFile, gzread) // fasta parser (by Heng Li)
// =============================================================================
void Util::print_align( vector<PairType>& align_path, string& sequence, ofstream& ofile, int stArm_size, int loop_size, string& strand )
{
// === Bracket notation
stack<char> itemNotationSS;
// print notation for 5p arm
for( unsigned int i = 0; i < align_path.size(); ++i )
{
if( align_path[i].type == MA )
{
ofile << "(";
itemNotationSS.push(')');
}
else if( align_path[i].type == GL )
{
itemNotationSS.push('.');
}
else if( align_path[i].type == GU )
{
ofile << ".";
}
else //MI
{
ofile << ".";
itemNotationSS.push('.');
}
}
// print notation for loop
for( unsigned int i=0; i < loop_size; ++i )
{
ofile << ".";
}
// print notation for 3p loop
while( !itemNotationSS.empty() )
{
ofile << itemNotationSS.top();
itemNotationSS.pop();
}
ofile << endl;
// === Beautiful structure
// =====================================================
// Top strand
// --- 1) MI_Stem
for( int i = 0; i < align_path.size(); ++i )
{
if( align_path[i].type == MI || align_path[i].type == GL || align_path[i].type == GU )
{
if( strand == "forward" )
ofile << align_path[i].nuc1;
else
ofile << align_path[i].nuc2;
}
else
{
ofile << " ";
}
}
// --- 1) MI_Loop
string loop = sequence.substr( stArm_size, loop_size );
string loopTop, loopBot;
char hangNuc1, hangNuc2, hangNuc3;
hangNuc1 = hangNuc2 = hangNuc3 = ' ';
if( loop.size() % 2 == 0 )
{
loopTop = loop.substr( 0, loop.size()/2 );
loopBot = loop.substr( loop.size()/2, loop.size()/2 );
hangNuc1 = loopTop[ loopTop.size()-1 ];
loopTop.pop_back();
}
else
{
loopTop = loop.substr( 0, (loop.size()/2)+1 );
loopBot = loop.substr( (loop.size()/2)+1, loop.size()/2 );
hangNuc1 = loopTop[ loopTop.size()-2 ]; // antepenultimo
hangNuc2 = loopTop.back();
loopTop.pop_back();
loopTop.pop_back();
}
hangNuc3 = loopBot.front();
ofile << loopTop << endl;
// --- 2) MA_Stem
for( int i = 0; i < align_path.size(); ++i )
{
if( align_path[i].type == MA )
{
if( strand == "forward" )
ofile << align_path[i].nuc1;
else
ofile << align_path[i].nuc2;
}
else
{
ofile << " ";
}
}
// --- 2) MA_Loop
for( int i = 0; i < loopTop.size(); ++i )
ofile << " ";
ofile << hangNuc1 << endl;
// =====================================================
// Alignment
// --- 3) ALIGN_Stem
for( int i = 0; i < align_path.size(); ++i )
{
if( align_path[i].type == MA )
ofile << "|";
else
ofile << " ";
}
// --- 3) ALIGN_Loop
for( int i = 0; i < loopTop.size(); ++i )
ofile << " ";
ofile << hangNuc2 << endl;
// =====================================================
// Bottom strand
// --- 4) MA_Stem
for( int i = 0; i < align_path.size(); ++i )
{
if( align_path[i].type == MA )
{
if( strand == "forward" )
ofile << align_path[i].nuc2;
else
ofile << align_path[i].nuc1;
}
else
{
ofile << " ";
}
}
// --- 4) MA_Loop
for( int i = 0; i < loopTop.size(); ++i )
ofile << " ";
ofile << hangNuc3 << endl;
// --- 5) MI_Stem
for( int i = 0; i < align_path.size(); ++i )
{
if( align_path[i].type == MI || align_path[i].type == GL || align_path[i].type == GU )
{
if( strand == "forward" )
ofile << align_path[i].nuc2;
else
ofile << align_path[i].nuc1;
}
else
{
ofile << " ";
}
}
// --- 5) MI_Loop
for ( string::reverse_iterator rit = loopBot.rbegin(); rit != loopBot.rend()-1; ++rit )
ofile << *rit;
ofile << endl;
}
// =============================================================================
void Util::gen_random( vector<PosType>& pos, string& genome_file, string& genome_random )
{
gzFile file = gzopen( genome_file.c_str(), "r" );
kseq_t *kseq = kseq_init( file );
int l;
// --- Get subsequeces: premirnas regions
string preMirnas;
while( (l = kseq_read(kseq)) >= 0 )
{
string seqTmp = kseq->seq.s;
string sequenceId = kseq->name.s;
string sequence = clean_sequence( seqTmp );
for( int i = 0; i < pos.size(); ++i)
{
if( sequenceId == pos[i].sequenceId )
{
string subseq = sequence.substr( pos[i].begin, pos[i].end - pos[i].begin + 1 );
if( pos[i].strand == "-" )
rev_comp( subseq );
preMirnas += subseq;
}
}
}
// --- Compute nucleotide frequency
int a = count( preMirnas.begin(), preMirnas.end(), 'A' );
a = a + count( preMirnas.begin(), preMirnas.end(), 'a' );
int t = count( preMirnas.begin(), preMirnas.end(), 'T' );
t = t + count( preMirnas.begin(), preMirnas.end(), 't' );
int c = count( preMirnas.begin(), preMirnas.end(), 'C' );
c = c + count( preMirnas.begin(), preMirnas.end(), 'c' );
int g = count( preMirnas.begin(), preMirnas.end(), 'G' );
g = g + count( preMirnas.begin(), preMirnas.end(), 'g' );
int n = count( preMirnas.begin(), preMirnas.end(), 'N' );
n = n + count( preMirnas.begin(), preMirnas.end(), 'N' );
float freqA = a / (float)preMirnas.size();
float freqT = t / (float)preMirnas.size();
float freqC = c / (float)preMirnas.size();
//float freqG = g / (float)preMirnas.size();
// --- Generate random sequence
for( unsigned int i = 0; i < preMirnas.size()-n; ++i )
{
float r = ((float) rand() / (RAND_MAX));
if( r < freqA )
genome_random += "A";
else if( r < freqA+freqT )
genome_random += "T";
else if( r < freqA+freqT+freqC )
genome_random += "C";
else
genome_random += "G";
}
}
// =============================================================================
void Util::rev_comp( string& sequence )
{
replace( sequence.begin(), sequence.end(), 'A', 't' );
replace( sequence.begin(), sequence.end(), 'T', 'a' );
replace( sequence.begin(), sequence.end(), 'C', 'g' );
replace( sequence.begin(), sequence.end(), 'G', 'c' );
transform( sequence.begin(), sequence.end(), sequence.begin(), ::toupper );
reverse( sequence.begin(), sequence.end() );
}
// =============================================================================
void Util::parse_gff( string& filename, vector<PosType>& positions )
{
fstream file( filename.c_str() );
string line;
while( getline( file, line ) )
{
if( line[0] != '#' )
{
vector<string> tokens;
split( line, "\t", tokens );
if( tokens[2] == "miRNA_primary_transcript" )
{
PosType pt;
pt.sequenceId = tokens[0];
pt.begin = atoi( tokens[3].c_str() );
pt.end = atoi( tokens[4].c_str() );
pt.strand = tokens[6];
positions.push_back( pt );
}
}
}
}
// =============================================================================
void Util::split(const string& str, const string& delimiters , vector<string>& tokens)
{
string::size_type lastPos = str.find_first_not_of(delimiters, 0);
string::size_type pos = str.find_first_of(delimiters, lastPos);
while (string::npos != pos || string::npos != lastPos)
{
tokens.push_back(str.substr(lastPos, pos - lastPos));
lastPos = str.find_first_not_of(delimiters, pos);
pos = str.find_first_of(delimiters, lastPos);
}
}
// =============================================================================
......@@ -268,3 +590,101 @@ bool Util::is_in_restricted_seq( string r_seq, string sequenceID, string* premiR
// Don't use the option 'restricted sequence'.
return( true );
}
// =============================================================================
/// \fn void Glocal( Sequence *sq, Align& al, Options& opt )
/// \brief Look for miRNAs candidates in a sequence.
/// \param sq pointer to Sequence
/// \param al object for the alignemnt
/// \param opt object with the options set by the user
/*void Glocal( Sequence *pSq, Align& al, Options& opt )
{
for( unsigned int i = 0; i < pSq->seq.size() - (opt.st_arm + opt.stl_max + opt.st_arm); ++i )
{
string premirna = pSq->seq.substr( i, opt.st_arm + opt.st_arm + opt.stl_max );
size_t found = premirna.find( 'N', 0 );
if( found == string::npos )
{
string arm1 = pSq->seq.substr( i, opt.st_arm );
string arm2 = pSq->seq.substr( i+opt.st_arm+1, opt.st_arm + opt.stl_max );
reverse( arm2.begin(), arm2.end() );
// --- Alignment
al.forward( arm1, arm2 );
al.compute_maxj();
unsigned int maxj = al.get_maxj();
al.backward_glocal( arm1, arm2, arm1.size(), maxj );
// --- Compute the free energy
vector<PairType> path = al.get_path();
float energy = eSYM; // Symmetry penalty
energy += al.compute_stack( path ); // Stacking energy
energy += al.compute_dangle( path ); // Dangling nucleotide
energy += al.compute_internal( path ); // Internal loops (=mismatches)
if( energy > opt.limit ){ continue; }
energy += al.compute_bulge( path ); // Bulges (=gaps)
// --- Just mirna candidates under the threshold
if( energy < opt.limit )
{
unsigned int arm2_beg = al.get_minj(); // coordinate in arm scale
unsigned int arm2_end = al.get_maxj(); // idem
unsigned int arm2_end_prime = (i+(2*opt.st_arm)+opt.stl_max) - arm2_end; // coordinate in genome scale
unsigned int arm2_beg_prime = arm2_end_prime - (arm2_end - arm2_beg); // idem
unsigned int loop_size = arm2_beg_prime - i+opt.st_arm + 1;
unsigned int loop_beg = i+opt.st_arm + 1;
unsigned int loop_end = loop_beg + loop_size;
stringstream sst;
sst << pSq->id << ":" <<\
i+1 << ":" <<\
arm2_end_prime+1 << ":" <<\
loop_beg+1 << ":" <<\
loop_end+1 << ":" <<\
energy << ":" <<\
pSq->strand;
// header
string header = sst.str();
//sequence
if( pSq->strand == "reverse" )
{
// to be condescendent with the mirbase gff
reverse( pSq->seq.begin(), pSq->seq.end() );
}
string sequence = pSq->seq.substr( i, arm2_end_prime );
//pSq->ofile << ">" << header << endl;
//pSq->ofile << sequence << endl;
//ofile << ">" << header << endl;
//ofile << sequence << endl;
}
}
else
{
i += found;
}
}
}*/
......@@ -4,9 +4,13 @@
#ifndef UTIL_H_
#define UTIL_H_
#include "Definitions.hpp"
#include<zlib.h>
#include<string.h>
#include<string>
#include<algorithm>
#include<vector>
using namespace std;
......@@ -18,10 +22,25 @@ using namespace std;
class Util
{
public:
/// \fn
void print_align( vector<PairType>& align_path, string& sequence, ofstream& ofile, int stArm_size, int loop_size, string& strand );
/// \fn
void gen_random( vector<PosType>& pos, string& genome_file, string& genome_random );
/// \fn
void parse_gff( string& filename, vector<PosType>& positions );
/// \fn
void rev_comp( string& sequence );
/// \fn
void split(const string& str, const string& delimiters , vector<string>& tokens);
/// \fn Util::isATCG( char nucleotide )
/// \brief
/// \param
bool isATCG( char nucleotide );
/// \fn string Util::clean_sequence( string& sequence )
......@@ -29,21 +48,18 @@ class Util
/// Substitute masked sequences (lower case letters),
/// if they exists, by N.
/// \param sequence Nucleotide sequence.
string clean_sequence( string& sequence );
/// \fn int Util::encode_dimer( char first, char second )
/// \brief Encode a dimer (two nucleotides) in an integer.
/// \param first First nucleotide of the dimer.
/// \param second Second nucleotide of the dimer.
int encode_dimer( char first, char second );
/// \fn int Util::encode_closure(char a, char b);
/// \brief Encode the matching closure of a internal loop 1x1.
/// \param a First nucleotide of the match.
/// \param b Second nucleotide of the match.
int encode_closure(char a, char b);
/// \fn int Util::encode_closure( char a, char b, char c, char d );
......@@ -52,32 +68,27 @@ class Util
/// \param b Second nucleotide of the first match.
/// \param c First nucleotide of the second match.
/// \param d Second nucleotide of the second match.
int encode_closure( char a, char b, char c, char d );
/// \fn int Util::encode(char a);
/// \brief Encodes the nucleotide of the single mismatch of internal 1x1.
/// \param a Nucleotide of the single mismatch.
int encode(char a);
/// \fn int Util::encode(char a, char b);
/// \brief Encodes the nucleotides of a mismatch (int22).
/// \param a First nucleotide of a mismatch.
/// \param b Second nucleotide of a mismatch.
int encode(char a, char b);
/// \fn int Util::encode_dangle(char a);
/// \brief Encodes the dangling nucleotide of a dangle end.
/// \param a Dangling nucleotide.
int encode_dangle(char a);
/// \fn string Util::restricted_seq();
/// \brief Select only the sequences wanted in the genome -> we just read the file and stock sequences ID.
/// \param r_seq is the restricted file name.
pair<string*,unsigned int> restricted_seq( string r_seq );
/// \fn bool Util::is_in_restricted_seq( string r_seq, string sequenceID, string* premiRNA, unsigned int taille, string file_name)
......@@ -88,7 +99,6 @@ class Util
/// \param premiRNA is the vector with all wanted sequence ID.
/// \param taille is the length of the premiRNA vector (nb of sequence ID)
/// \param output_file is the output file name.
bool is_in_restricted_seq( string r_seq, string sequenceID, string* premiRNA, unsigned int taille, string output_file);
};
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment