Commit 32e67797 authored by NOEL Philippe's avatar NOEL Philippe

Post .gitignore changes

parent 92b99489
/target
**/*.rs.bk
Cargo.lock
.vscode/*
.idea/*
\ No newline at end of file
[package]
name = "pdbparser"
version = "1.0.0"
authors = ["NOEL Philippe <philippe.noel@inria.fr>"]
edition = "2018"
include = ["Cargo.toml", "src/**/*.rs", "tests/**/*.rs", "README.md"]
[[bin]]
name = "pdbparser"
path = "src/main.rs"
[lib]
name = "pdbparser"
path = "src/lib.rs"
[dependencies]
# PDBparser
**PDBparser is a library written in Rust to read and select atoms in protein structure files in the [PDB format](http://www.wwpdb.org/documentation/file-format)**
## Usage
Add this to your `Cargo.toml`:
```toml
[dependencies]
pdbparser = { git = "ssh://git@gitlab.inria.fr/pnoel/pdbparser.git" }
```
and this to your crate root:
```rust
extern crate pdbparser;
```
Here's a simple example that reads a PDB file from `tests/tests_file`:
```rust
extern crate pdbparser;
use pdbparser::*;
fn main() {
let my_prot = parse_pdb("tests/tests_file/5jpq.pdb", "5jpq");
println!("Prot : {} \nn chain: {}\nn res: {}\nn atom: {}",
my_prot.name, my_prot.get_number_chain(),
my_prot.get_number_residue(),
my_prot.get_number_atom());
println!("Reduce protein");
let chain_a = my_prot.select_atoms("chain a").unwrap();
println!("Prot : {} \nn chain: {}\nn res: {}\nn atom: {}",
chain_a.name, chain_a.get_number_chain(),
chain_a.get_number_residue(),
chain_a.get_number_atom());
}
```
## Todo
- [ ] : PDB Writer
- [ ] : Structure to keep information on nucleic acids/lipids/water
- [ ] : More options to select atoms (alpha carbons, atoms near another atom, ...)
- [ ] : Support of PDBx/mmCIF format
//! # pdbparser
//!
//! `pdbparser` is a library to manipulate protein structures. It can parse and filter PDB files.
//! You can create a protein structure by parsing a file with the `parse_pdb` function, then filter it with `Protein::select_atoms`.
//!
mod pdb;
pub use self::pdb::read_pdb::parse_pdb as parse_pdb;
pub use self::pdb::protein::Protein;
pub use self::pdb::atom::Atom;
pub use self::pdb::residue::Residue;
pub use self::pdb::chain::Chain;
extern crate pdbparser;
use pdbparser::*;
/// Demo binary: parse the bundled `5jpq` structure, print its chain, residue
/// and atom counts, then print the same summary for the "chain a" selection.
fn main() {
    // Full structure.
    let my_prot = parse_pdb("tests/tests_file/5jpq.pdb", "5jpq");
    let full_chains = my_prot.get_number_chain();
    let full_residues = my_prot.get_number_residue();
    let full_atoms = my_prot.get_number_atom();
    println!(
        "Prot : {} \nn chain: {}\nn res: {}\nn atom: {}",
        my_prot.name, full_chains, full_residues, full_atoms
    );

    println!("Reduce protein");

    // Reduced structure: only the atoms matched by the "chain a" pattern.
    let chain_a = my_prot.select_atoms("chain a").unwrap();
    let sub_chains = chain_a.get_number_chain();
    let sub_residues = chain_a.get_number_residue();
    let sub_atoms = chain_a.get_number_atom();
    println!(
        "Prot : {} \nn chain: {}\nn res: {}\nn atom: {}",
        chain_a.name, sub_chains, sub_residues, sub_atoms
    );
}
use std::ops::Deref;
/// An `Atom` is a sub-structure linked to a `Residue`.
///
/// It stores the following properties:
/// - the atom name;
/// - the atom number (atomid);
/// - the x, y and z coordinates;
/// - whether the atom is a constituent of the protein backbone.
#[derive(Debug)]
pub struct Atom {
    /// Atom name exactly as given to `Atom::new`.
    pub name: String,
    /// Atom number (atomid).
    pub number: u64,
    /// Coordinates, in `[x, y, z]` order.
    pub coord: [f32; 3],
    /// `true` when `name` is one of the backbone atom names (see `Atom::new`).
    pub is_backbone: bool,
}

impl Atom {
    /// Create a new `Atom` from a name, a number and `[x, y, z]` coordinates.
    ///
    /// The atom is flagged as backbone when its name is exactly
    /// "C", "CA", "N", "O", "OT1" or "OT2" (the comparison is case sensitive).
    ///
    /// # Examples
    ///
    /// ```
    /// use pdbparser;
    ///
    /// let hydrogen = pdbparser::Atom::new(String::from("HT1"), 1, [0.0, 0.0, 0.0]);
    /// assert!(!hydrogen.is_backbone);
    /// ```
    pub fn new(name: String, number: u64, coord: [f32; 3]) -> Atom {
        // Classify before `name` is moved into the struct.
        let is_backbone = match name.deref() {
            "C" | "CA" | "N" | "O" | "OT1" | "OT2" => true,
            _ => false,
        };
        Atom {
            name,
            number,
            coord,
            is_backbone,
        }
    }

    /// Return a copy of the atom name.
    ///
    /// The atom is only borrowed, so it stays usable after the call and the
    /// getter can be called through a shared reference.
    ///
    /// # Examples
    ///
    /// ```
    /// use pdbparser;
    ///
    /// let ca = pdbparser::Atom::new(String::from("CA"), 2, [0.0, 0.0, 0.0]);
    /// assert_eq!("CA", ca.name());
    /// assert_eq!(2, ca.number);
    /// ```
    pub fn name(&self) -> String {
        self.name.clone()
    }

    /// Compute the Euclidean distance between this atom and `a`.
    ///
    /// # Examples
    ///
    /// ```
    /// use pdbparser;
    ///
    /// let h1 = pdbparser::Atom::new(String::from("HT1"), 1, [1.0, 5.0, 2.0]);
    /// let h2 = pdbparser::Atom::new(String::from("HT1"), 1, [11.0, 17.0, 5.0]);
    ///
    /// assert_eq!(15.905973, h1.compute_distance(&h2));
    /// ```
    pub fn compute_distance(&self, a: &Atom) -> f32 {
        // Sum of the squared per-axis differences, then the square root.
        self.coord
            .iter()
            .zip(a.coord.iter())
            .map(|(p, q)| (p - q).powi(2))
            .sum::<f32>()
            .sqrt()
    }
}
use super::residue::Residue;
/// A `Chain` is a sub-structure linked to a `Protein`.
/// It holds a one-character name and the list of its `Residue`s.
///
#[derive(Debug)]
pub struct Chain {
    /// One-character chain name.
    pub name: char,
    /// Residues of the chain, in insertion order.
    pub lst_res: Vec<Residue>,
}

impl Chain {
    /// Build a chain called `name` that contains no residue yet.
    ///
    /// # Examples
    ///
    /// ```
    /// use pdbparser;
    ///
    /// let my_chain = pdbparser::Chain::new('a');
    /// ```
    pub fn new(name: char) -> Chain {
        let lst_res = Vec::new();
        Chain { name, lst_res }
    }

    /// Append the residue `r` at the end of the chain.
    ///
    /// # Examples
    ///
    /// ```
    /// use pdbparser;
    ///
    /// let mut my_chain = pdbparser::Chain::new('a');
    /// let lys = pdbparser::Residue::new(String::from("lysine"), 1);
    ///
    /// my_chain.add_res(lys);
    ///
    /// assert_eq!(1, my_chain.get_number_residue());
    /// ```
    pub fn add_res(&mut self, r: Residue) {
        self.lst_res.push(r);
    }

    /// Number of residues currently stored in the chain.
    ///
    /// # Examples
    ///
    /// ```
    /// use pdbparser;
    ///
    /// let my_chain = pdbparser::Chain::new('a');
    ///
    /// assert_eq!(0, my_chain.get_number_residue());
    /// ```
    pub fn get_number_residue(&self) -> u64 {
        let count = self.lst_res.len();
        count as u64
    }

    /// Look a residue up by its number and hand back a mutable reference to it.
    ///
    /// The first residue whose `res_num` equals `n` is returned; `None` is
    /// returned when the chain has no such residue.
    ///
    /// # Examples
    ///
    /// ```
    /// use pdbparser;
    ///
    /// let mut my_chain = pdbparser::Chain::new('a');
    /// let lys = pdbparser::Residue::new(String::from("lysine"), 1);
    /// my_chain.add_res(lys);
    ///
    /// assert_eq!(1, my_chain.lst_res[0].res_num);
    /// {
    ///     let mut res_ref = my_chain.get_residue_ref(1).unwrap();
    ///     res_ref.res_num = 4;
    /// }
    /// assert_eq!(4, my_chain.lst_res[0].res_num);
    /// ```
    pub fn get_residue_ref(&mut self, n: u64) -> Option<&mut Residue> {
        self.lst_res.iter_mut().find(|residue| residue.res_num == n)
    }

    /// Name of the chain.
    pub fn get_name(&self) -> char {
        self.name
    }
}
\ No newline at end of file
pub mod atom;
pub mod protein;
pub mod residue;
pub mod read_pdb;
pub mod chain;
mod selection_atom;
\ No newline at end of file
use super::atom::Atom;
use super::chain::Chain;
use super::residue::Residue;
use super::selection_atom;
use std::char;
/// A `Protein` is a structure extracted from a PDB file. It stores `Chain` structure(s).
///
/// The `Protein` is the main structure to manipulate.
#[derive(Debug)]
pub struct Protein {
    /// Name given to the protein when it was created.
    pub name: String,
    /// Chains of the protein, in insertion order.
    pub lst_chain: Vec<Chain>,
    /// Name of the chain most recently passed to `add_chain`
    /// (`' '` as long as no chain has been added).
    last_chain_added: char,
}

impl Protein {
    /// Create a new, empty protein structure called `n`.
    ///
    /// # Examples
    ///
    /// ```
    /// use pdbparser;
    ///
    /// let my_prot = pdbparser::Protein::new(String::from("my_prot"));
    /// ```
    pub fn new(n: String) -> Protein {
        Protein {
            name: n,
            lst_chain: Vec::new(),
            last_chain_added: ' ',
        }
    }

    /// Get the name of the protein.
    ///
    /// # Examples
    ///
    /// ```
    /// use pdbparser;
    ///
    /// let my_prot = pdbparser::Protein::new(String::from("my_prot"));
    ///
    /// assert_eq!("my_prot", my_prot.name());
    /// ```
    pub fn name(&self) -> &str {
        &self.name
    }

    /// Return `true` if a chain named `c` is in the protein.
    ///
    /// # Examples
    ///
    /// ```
    /// use pdbparser;
    ///
    /// let mut my_prot = pdbparser::Protein::new(String::from("my_prot"));
    /// let my_chain_a = pdbparser::Chain::new('n');
    /// my_prot.add_chain(my_chain_a);
    ///
    /// assert!(my_prot.is_chain('n'));
    /// ```
    pub fn is_chain(&self, c: char) -> bool {
        self.lst_chain.iter().any(|chain| chain.get_name() == c)
    }

    /// Return a mutable reference to the first chain named `c`, or `None` if
    /// the protein has no such chain.
    ///
    /// # Examples
    ///
    /// ```
    /// use pdbparser;
    ///
    /// let mut my_prot = pdbparser::Protein::new(String::from("my_prot"));
    /// my_prot.add_chain(pdbparser::Chain::new('n'));
    /// assert_eq!('n', my_prot.lst_chain[0].get_name());
    /// {
    ///     let mut reference = my_prot.get_chain_ref('n').unwrap();
    ///     reference.name = 'a';
    /// }
    /// assert_eq!('a', my_prot.lst_chain[0].get_name());
    /// ```
    pub fn get_chain_ref(&mut self, c: char) -> Option<&mut Chain> {
        self.lst_chain.iter_mut().find(|chain| chain.name == c)
    }

    /// Get the number of chains in the protein.
    ///
    /// # Examples
    ///
    /// ```
    /// use pdbparser;
    ///
    /// let my_prot = pdbparser::Protein::new(String::from("my_prot"));
    ///
    /// assert_eq!(0, my_prot.get_number_chain());
    /// ```
    pub fn get_number_chain(&self) -> u32 {
        self.lst_chain.len() as u32
    }

    /// Return the number of residues in the protein (summed over all chains).
    ///
    /// # Examples
    ///
    /// ```
    /// use pdbparser;
    ///
    /// let mut my_prot = pdbparser::Protein::new(String::from("my_prot"));
    /// let mut my_chain = pdbparser::Chain::new('n');
    /// let lys = pdbparser::Residue::new(String::from("lysine"), 1);
    /// let pro = pdbparser::Residue::new(String::from("proline"), 2);
    ///
    /// my_chain.add_res(lys);
    /// my_chain.add_res(pro);
    /// my_prot.add_chain(my_chain);
    ///
    /// assert_eq!(2, my_prot.get_number_residue());
    /// ```
    pub fn get_number_residue(&self) -> u64 {
        self.lst_chain
            .iter()
            .map(|chain| chain.lst_res.len() as u64)
            .sum()
    }

    /// Return the number of atoms in the protein (summed over all residues of
    /// all chains).
    ///
    /// # Examples
    ///
    /// ```
    /// use pdbparser;
    ///
    /// let my_prot = pdbparser::parse_pdb("tests/tests_file/f2.pdb", "f2");
    /// assert_eq!(1085, my_prot.get_number_atom());
    /// ```
    pub fn get_number_atom(&self) -> u64 {
        self.lst_chain
            .iter()
            .flat_map(|chain| chain.lst_res.iter())
            .map(|res| res.lst_atom.len() as u64)
            .sum()
    }

    /// Add a new chain structure to the protein.
    ///
    /// # Examples
    ///
    /// ```
    /// use pdbparser;
    ///
    /// let mut my_prot = pdbparser::Protein::new(String::from("my_prot"));
    /// let my_chain_a = pdbparser::Chain::new('n');
    ///
    /// my_prot.add_chain(my_chain_a);
    ///
    /// assert_eq!(1, my_prot.get_number_chain());
    /// ```
    pub fn add_chain(&mut self, c: Chain) {
        self.last_chain_added = c.get_name();
        self.lst_chain.push(c);
    }

    /// Add one atom to the protein (used by the parser).
    ///
    /// The atom is stored in the residue numbered `res_number` of the chain
    /// named `chain`. The chain and the residue are created on the fly when
    /// they do not exist yet; `res_name` is only used when the residue has to
    /// be created.
    ///
    /// TODO (from the original author): change this to a macro.
    pub fn update_protein(&mut self, chain: char, res_name: String, res_number: u64, atom_name: String, atom_number: u64, coord: [f32; 3]) {
        // Reuse the chain if it already exists, otherwise create it first.
        let chain_ref = match self.get_chain_ref(chain) {
            Some(existing) => existing,
            None => {
                self.add_chain(Chain::new(chain));
                self.get_chain_ref(chain)
                    .expect("the chain was added just above")
            }
        };
        // Same for the residue inside that chain.
        let residue_ref = match chain_ref.get_residue_ref(res_number) {
            Some(existing) => existing,
            None => {
                chain_ref.add_res(Residue::new(res_name, res_number));
                chain_ref
                    .get_residue_ref(res_number)
                    .expect("the residue was added just above")
            }
        };
        residue_ref.add_atom(Atom::new(atom_name, atom_number, coord));
    }

    /// Return the atom numbers of every atom of the protein, in storage order
    /// (chain by chain, residue by residue).
    ///
    /// Can be used in other programs like rrmsd_map to select specific atoms.
    ///
    /// # Examples
    /// ```
    /// use pdbparser;
    ///
    /// let my_prot = pdbparser::parse_pdb("tests/tests_file/f2_adn.pdb", "f2");
    /// let atom_index = my_prot.get_atom_index();
    ///
    /// assert_eq!(atom_index[0], 1);
    /// assert_eq!(atom_index[1], 2);
    /// ```
    pub fn get_atom_index(&self) -> Vec<u64> {
        self.lst_chain
            .iter()
            .flat_map(|chain| chain.lst_res.iter())
            .flat_map(|res| res.lst_atom.iter())
            .map(|atom| atom.number)
            .collect()
    }

    /// Select atoms from a pattern and return them as a new protein structure.
    ///
    /// Returns `None` (after printing a message) when the pattern cannot be
    /// parsed.
    ///
    /// The pattern can use the keywords "Chain", "Resid" or "Backbone" (keywords are not case sensitive).
    ///
    /// ## "Chain"
    /// The Chain keyword is used to select chains. It must be followed by one or two chain names separated by the "to" keyword.
    /// The chain name is case sensitive.
    /// examples:
    ///     "Chain A" will select only the chain A.
    ///     "Chain A to D" will select chains A, B, C and D.
    ///
    /// ## "Resid"
    /// The Resid keyword is used to select residues. It must be followed by one or two residue numbers separated by the "to" keyword.
    /// When the protein has multiple chains, Resid selects the residue(s) in all chains.
    /// examples:
    ///     "Resid 1" will select only the residue 1 of each chain
    ///     "Resid 12 to 50" will select residues 12, 13, .., 50 for all chains
    ///
    /// ## "Backbone"
    /// The Backbone keyword is used to select the backbone atoms of each residue. It takes no parameter.
    ///
    /// ## Special keyword "and"
    /// You can use the keyword "and" to combine 2 or more different selections.
    /// examples:
    ///     "Chain A and Resid 40 to 150"
    ///
    /// # Examples
    ///
    /// ```
    /// use pdbparser;
    ///
    /// let my_prot = pdbparser::parse_pdb("tests/tests_file/f2.pdb", "f2");
    ///
    /// assert_eq!(66, my_prot.get_number_residue());
    /// assert_eq!(1085, my_prot.get_number_atom());
    ///
    /// let prot_backbone = my_prot.select_atoms("resid 10 to 50 and backbone").unwrap();
    ///
    /// assert_eq!(41, prot_backbone.get_number_residue());
    /// assert_eq!(164, prot_backbone.get_number_atom());
    /// ```
    // TODO: the method is naive and needs to be improved,
    // e.g. do not walk through a chain that is not selected.
    pub fn select_atoms(&self, pattern: &str) -> Option<Protein> {
        let mut n_prot = Protein::new(self.name.clone());

        let select = match selection_atom::parse_select(&pattern) {
            Some(x) => x,
            None => {
                println!("Can't parse the protein with these attributes");
                return None;
            }
        };

        // Every atom is tested individually and copied into the new protein
        // when it matches the selection.
        for chain in &self.lst_chain {
            let c_chain = chain.name;
            for residue in &chain.lst_res {
                let c_res = residue.res_num;
                for atom in &residue.lst_atom {
                    if selection_atom::atom_match(&select, c_chain, c_res, atom.is_backbone) {
                        n_prot.update_protein(c_chain, residue.name.clone(), c_res, atom.name.clone(), atom.number, atom.coord);
                    }
                }
            }
        }
        Some(n_prot)
    }
}
use std::fs::File;
use std::io::prelude::*;
use std::io::BufReader;
use std::process;
use super::protein::Protein;
/// Convert a text field into an `f32`.
///
/// Surrounding whitespace (spaces, `\n`, ...) is trimmed before the
/// conversion.
///
/// # Errors
/// When the field is not a valid float, the parse error is printed and
/// `0.0` is returned.
///
fn parse_float(s: &str) -> f32 {
    s.trim().parse::<f32>().unwrap_or_else(|err| {
        println!("{}", err);
        0.0
    })
}
/// Convert a text field into an `i64`.
///
/// Surrounding whitespace (spaces, `\n`, ...) is trimmed first. The field is
/// read as a decimal number; if that fails it is read as a hexadecimal
/// number, because in large PDB files the atom number can exceed 99,999 and
/// VMD then writes it in hexadecimal.
///
/// # Errors
/// When the field is neither a valid decimal nor a valid hexadecimal number,
/// the decimal parse error is printed and `0` is returned.
///
fn parse_int(s: &str) -> i64 {
    let field = s.trim();
    field.parse::<i64>().unwrap_or_else(|decimal_err| {
        // Second chance: hexadecimal notation.
        i64::from_str_radix(field, 16).unwrap_or_else(|_| {
            println!("{}", decimal_err);
            0
        })
    })
}
/// Parse the PDB file at path `pdb` and return a protein structure called `name`.
///
/// Only `ATOM` and `HETATM` records whose residue name is a known protein
/// residue are kept; every other line is ignored. Records that are too short
/// to hold all the expected columns are reported and skipped.
///
/// If the file cannot be opened, an error message is printed and the process
/// exits with status 1.
///
/// # Panics
/// Panics if a line cannot be read from the file (I/O error or invalid UTF-8).
///
/// # Examples
/// ```
/// use pdbparser;
/// let my_prot = pdbparser::parse_pdb("tests/tests_file/f2.pdb", "f2");
/// assert_eq!(66, my_prot.get_number_residue());
/// ```
pub fn parse_pdb(pdb: &str, name: &str) -> Protein {
    // Residue names accepted as protein residues.
    const PROTEIN_RESIDUES: &[&str] = &[
        "ARG", "HIS", "HSE", "HSD", "LYS", "ASP", "GLU", "SER", "THR", "ASN", "GLN", "CYS",
        "SEC", "GLY", "PRO", "ALA", "VAL", "ILE", "LEU", "MET", "PHE", "TYR", "TRP",
    ];

    // Check if the file exists and/or can be read
    let file = match File::open(pdb) {
        Ok(f) => f,
        Err(e) => {
            println!("Could not open the file \"{}\"\nError: {}", pdb, e);
            process::exit(1);
        }
    };

    let report_malformed = |record: &str| println!("Skipping malformed record: {}", record);

    let reader = BufReader::new(file);
    let mut protein = Protein::new(name.to_string());

    for line in reader.lines() {
        let l = line.expect("failed to read a line from the PDB file");
        if !(l.starts_with("HETATM") || l.starts_with("ATOM")) {
            continue;
        }

        // First get the residue name: only protein residues are parsed further.
        // `get` is used instead of indexing so that a truncated record cannot panic.
        let residue_name = match l.get(17..20) {
            Some(field) => field.trim(),
            None => {
                report_malformed(&l);
                continue;
            }
        };
        if !is_protein_res(residue_name, PROTEIN_RESIDUES) {
            continue;
        }

        // NOTE(review): the wwPDB layout puts the atom name in columns 13-16 and
        // the alternate-location indicator in column 17; this slice covers both.
        // Kept as in the original - confirm it is intended.
        let columns = (
            l.get(6..11),
            l.get(12..17),
            l.get(21..22),
            l.get(22..26),
            l.get(30..38),
            l.get(38..46),
            l.get(46..54),
        );
        let (serial, atom_name, chain, resid, x, y, z) = match columns {
            (Some(serial), Some(atom_name), Some(chain), Some(resid), Some(x), Some(y), Some(z)) => {
                (serial, atom_name, chain, resid, x, y, z)
            }
            _ => {
                report_malformed(&l);
                continue;
            }
        };

        let chain = chain
            .chars()
            .next()
            .expect("a one-byte slice always holds exactly one char");
        let atom_number = parse_int(serial);
        let residue_number = parse_int(resid);
        let coord = [parse_float(x), parse_float(y), parse_float(z)];

        // Add the atom to the protein.
        // NOTE: the `as u64` casts wrap negative numbers around.
        protein.update_protein(
            chain,
            residue_name.to_string(),
            residue_number as u64,
            atom_name.trim().to_string(),
            atom_number as u64,
            coord,
        );
    }
    protein
}
/// Tell whether the residue name `r` belongs to the list `lst`.
///
/// `r` is upper-cased before the comparison, so the test is case-insensitive
/// with respect to `r`; the entries of `lst` are compared as they are.
///
fn is_protein_res(r: &str, lst: &[&str]) -> bool {
    let upper = r.to_uppercase();
    lst.iter().any(|known| upper == *known)
}
\ No newline at end of file
use super::atom::Atom;
/// A `Residue` is a sub-structure linked to a `Chain`.
/// It holds the list of its `Atom`s (empty right after `Residue::new`).
/// It stores the following properties:
/// - res name;
/// - res number (resid);
/// - a list of atom(s)
///
#[derive(Debug)]
pub struct Residue {
/// Residue name, stored exactly as given to `Residue::new`.
pub name: String,
/// Residue number (resid); `Chain::get_residue_ref` looks residues up by this value.
pub res_num: u64,
/// Atoms of the residue, in insertion order.
pub lst_atom: Vec<Atom>,
}
impl Residue {
/// Create a new Residue structure with an empty list of atom.
/// The Residue have a name and a number
///
/// # Examples
///
/// ````
/// use pdbparser;
///
/// let lys = pdbparser::Residue::new(String::from("lysine"), 1);
///
/// ````
pub fn new(name: String, res_num: u64) -> Residue {
Residue {
name,
res_num,
lst_atom: Vec::new(),
}
}
/// Get the name of the residue
///
pub fn name(&self) -> String {
self.name.clone()
}
/// Get the number of the residue
///
pub fn get_res_num(&self) -> u64 {
self.res_num
}
/// Get the number of Atom in the Residue
///
/// # Examples
///
/// ````
/// use pdbparser;
///