Commit 7f66bc2e authored by MARIJON Pierre

Initial commit

NCTC_id.lst:
NCTC9078,ERS523585
NCTC11131,ERS715428
NCTC13095,ERS659595
NCTC10988,ERS798845
NCTC11126,ERS462976
NCTC11435,ERS950474
NCTC11800,ERS513146
NCTC12694,ERS1188834
NCTC12841,ERS879562
NCTC12993,ERS473437
NCTC13348,ERS450001
NCTC13543,ERS636085
NCTC5050,ERS513128
NCTC7152,ERS685290
# Assembly graph analysis of fragmented long-read bacterial genome assemblies

## Requirements

- Canu 1.6
- minimap 0.2-r123
- minimap2
- miniasm 0.2-r128
- assembly_report https://gitlab.inria.fr/pmarijon/assembly_report version b583f4c427df8ad33926a0ab3a18377796bfce13
- SRA_dow_assembly https://gitlab.inria.fr/pmarijon/SRA_dow_assembly version dfef00bcbc6618ea20a5721bcffac4a83dced573
- python >= 3 with:
    - begin
- python 2 with biopython (for `script/blaster.py`)
- matplotlib and matplotlib_venn (for `script/venn_of_common_path.py`); see the install sketch below
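The Python dependencies can be installed with pip. A minimal sketch, assuming the usual PyPI package names for the modules imported by the scripts in this repository (the `begins` package provides the `begin` module):

```bash
# Python 3 dependencies
pip3 install begins matplotlib matplotlib-venn
# Python 2 dependency, for script/blaster.py only
pip2 install biopython
```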
## Dataset
| NCTC ID | ENA ID |
| --------- | ---------- |
| NCTC9078 | ERS523585 |
| NCTC11131 | ERS715428 |
| NCTC13095 | ERS659595 |
| NCTC10988 | ERS798845 |
| NCTC11126 | ERS462976 |
| NCTC11435 | ERS950474 |
| NCTC11800 | ERS513146 |
| NCTC12694 | ERS1188834 |
| NCTC12841 | ERS879562 |
| NCTC12993 | ERS473437 |
| NCTC13348 | ERS450001 |
| NCTC13543 | ERS636085 |
| NCTC5050 | ERS513128 |
| NCTC7152 | ERS685290 |
## Reproduction steps

1. Download the datasets and extract the reads with dextractor, saving them in the directory `${data}`.
2. Run the assemblies with SRA_dow_assembly: `SRA_dow_assembly -i NCTC_id.lst -d ${data} -a . finish`. The Canu and mini{map|asm} results are saved in the local directory.
3. Run assembly_report on each assembly: `assembly_report -i ${NCTC_id}/canu -m ${NCTC_id}/minimap/minimap.paf -M ${NCTC_id}/miniasm/miniasm.gfa -o ${NCTC_id}/assembly_report`
4. Run contig assignment: `./script/blaster.py ${NCTC_id}/canu/contigs.fasta`. The BLAST XML files are saved in the `${NCTC_id}/canu` directory and the contig assignment file `taxonomic_assign.lst` in the `${NCTC_id}` directory (running it on the minimap assembly requires a small script modification).
5. Run contig classification (contained in another contig or not): `./script/contig_classification.sh`. This script maps each assembly's contigs against themselves, writing `${NCTC_id}/${NCTC_id}.paf` and `${NCTC_id}/contig2contig.lst`.
6. Filter the paths found: `./script/filter_contain_nogenomic.py NCTC[0-9]*`. This script creates `${NCTC_id}/assembly_report/path_{canu|minimap}_extremity_result_filtred.csv`.
7. Count short and long paths against a per-dataset threshold: `./script/long_short.py NCTC[0-9]*`. The results are written to standard output. A sketch chaining steps 3 to 7 is shown after this list.
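Steps 3 to 7 can be driven by a loop like the following. This is a minimal sketch built from the commands above; it assumes step 2 already produced the `${NCTC_id}/canu`, `${NCTC_id}/minimap`, and `${NCTC_id}/miniasm` directories in the current directory and that the scripts are executable:

```bash
#!/bin/bash
# Per-dataset report and contig assignment (steps 3 and 4).
for NCTC_id in NCTC[0-9]*; do
    assembly_report -i ${NCTC_id}/canu \
                    -m ${NCTC_id}/minimap/minimap.paf \
                    -M ${NCTC_id}/miniasm/miniasm.gfa \
                    -o ${NCTC_id}/assembly_report
    ./script/blaster.py ${NCTC_id}/canu/contigs.fasta
done
# Containment, filtering, and path counting (steps 5 to 7).
./script/contig_classification.sh
./script/filter_contain_nogenomic.py NCTC[0-9]*
./script/long_short.py NCTC[0-9]*
```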
## Other scripts

- `script/gfaminiasm2fasta.py` converts a miniasm GFA into a FASTA file
- `script/venn_of_common_path.py` generates, for each dataset, a Venn diagram of the paths found on the Canu and minimap assemblies
- `script/subset_gfa.py` generates a new GFA restricted to a subset of the longest reads

Example invocations are sketched below.
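These command lines are read off the `begin.start` entry points in the scripts; the input and output file names are placeholders, and the exact option syntax for default-valued arguments depends on the `begins` library:

```bash
./script/gfaminiasm2fasta.py miniasm.gfa contigs.fasta
./script/venn_of_common_path.py venn_ NCTC9078 NCTC11131   # writes venn_NCTC9078.png, venn_NCTC11131.png
./script/subset_gfa.py miniasm.gfa miniasm_sub.gfa --subset 0.3
```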
#!/usr/bin/python2
# -*- coding: utf-8 -*-
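# script/blaster.py: assign each contig of a Canu assembly a taxonomic
# category (genomic / plasmidic / undefined) by megablasting it against
# NCBI nt and tallying the hit titles (README step 4).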
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
from Bio import SeqIO
import tempfile
import os
import shutil
import sys
species = {
"NCTC11126" : "Escherichia coli", # done
"NCTC9078" : "Escherichia coli", # done
"NCTC7152" : "Escherichia coli", # done
"NCTC11131" : "Escherichia coli", # done
"NCTC13095" : "Klebsiella planticola", # done
"NCTC5050" : "Klebsiella pneumoniae", # done
"NCTC12993" : "Kluyvera cryocrescens", # done
"NCTC11800" : "Providencia stuartii", # done
"NCTC13543" : "Rhizobium radiobacter",
"NCTC12694" : "Salmonella enterica",
"NCTC13348" : "Salmonella enterica", # done
"NCTC10988" : "Staphylococcus aureus", # done
"NCTC12841" : "Streptococcus pyogenes", # done
"NCTC11435" : "Vibrio mimicus" # done
}
skipped_tigs = {
"NCTC11126" : [],
"NCTC9078" : [],
"NCTC7152" : [],
"NCTC11131" : [],
"NCTC13095" : [],
"NCTC5050" : [],
"NCTC12993" : [],
"NCTC11800" : [],
"NCTC13543" : [],
"NCTC12694" : [],
"NCTC13348" : [],
"NCTC10988" : [],
"NCTC12841" : [],
"NCTC11435" : [],
}
def analyze_blast (contigs_file, path, species, genomic_by_length = []):
print "# blast analysis"
category = dict()
records = list(SeqIO.parse(contigs_file, "fasta"))
for record in records:
print record.name, "length", len(record)
if record.name in genomic_by_length:
category[record.name] = { 'plasmidic' : 0, 'genomic' : 50, 'undefined' : 0 }
else:
output_filename = os.path.join(path,"{}.xml".format(record.name))
category[record.name] = { 'plasmidic' : 0, 'genomic' : 0, 'undefined' : 0 }
try:
result_handle = open(output_filename)
blast_records = list(NCBIXML.parse(result_handle))
assert (len(blast_records) == 1), "hum ... more blast records than queries"
for block_aligns in blast_records:
alignments = list(block_aligns.alignments)
if alignments:
for al in alignments:
# print "\t%s %d %d" % (al.title,len(record),al.length)
if "plasmid" in al.title:
category[record.name]['plasmidic'] += 1
elif species in al.title:
category[record.name]['genomic'] += 1
elif species.split()[0] in al.title:
category[record.name]['genomic'] += 1
elif "complete genome" in al.title and al.length > 3000000:
category[record.name]['genomic'] += 1
else:
category[record.name]['undefined'] += 1
print "\t{}".format(category[record.name])
# for hsp in al.hsps:
# print "\t\t%d %d" % (hsp.query_start, hsp.query_end)
# print "\t\t%d %d" % (hsp.sbjct_start, hsp.sbjct_end)
# print "\t\t%d" % (hsp.expect)
else:
print "No alignment found"
            except IOError:  # no BLAST output file for this contig
                print "No Blast output for " + record.name
with open(os.path.join(path,"..","taxonomic_assign.lst"),"w") as output_fileid:
for record in records:
s = sum([ category[record.name][k] for k in category[record.name] ])
try:
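                # assign a category only if >= 80% of this contig's hits agree on it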
c = [ k for k in category[record.name] if ((category[record.name][k]*1.0/s) >= 0.8) ]
if len(c) == 1:
output_fileid.write("{};{}\n".format(record.name,c[0]))
else:
output_fileid.write("{};{}\n".format(record.name,'undefined'))
except ZeroDivisionError as e:
output_fileid.write("{};{}\n".format(record.name,'none'))
def launch_blast_and_save_output (contigs_file, path, skipped_tigs = []):
print "# blast computation"
records = list(SeqIO.parse(contigs_file, "fasta"))
genomic_by_length = []
for record in records:
print record.name, "length", len(record)
if record.name in skipped_tigs:
print "\tdo not blast this tig"
else:
if len(record) > 1000000:
print "\tsufficient length to be genomic"
genomic_by_length.append(record.name)
else:
output_filename = os.path.join(path,"{}.xml".format(record.name))
if os.path.exists(output_filename):
print "\tskipped"
else:
result_handle = NCBIWWW.qblast("blastn", "nt", record.seq, megablast=True)
with open(output_filename, "w") as out_handle:
out_handle.write(result_handle.read())
                    result_handle.close()
return genomic_by_length
if __name__ == "__main__":
assert(len(species) == 14)
contigs_file = sys.argv[1]
nctc_sample = os.path.split(contigs_file)[0].split("/")[0]
print "### NCTC sample number ", nctc_sample
sp = species[nctc_sample] #sys.argv[2]
# get the path of the contigs_file
contigs_file_path = os.path.dirname(contigs_file)
genomic_by_length = launch_blast_and_save_output (contigs_file, contigs_file_path, skipped_tigs[nctc_sample])
analyze_blast(contigs_file, contigs_file_path, sp, genomic_by_length)
#!/bin/bash
# script/contig_classification.sh: all-vs-all mapping of each assembly's
# contigs with minimap2, then list contained contigs (README step 5).
for dataset in $(ls -d NCTC[0-9]*)
do
echo $dataset
minimap2 -x asm10 -X /home/pierre.marijon/NCTC_dextractor/$dataset/canu/canu.contigs.fasta /home/pierre.marijon/NCTC_dextractor/$dataset/canu/canu.contigs.fasta > $dataset.paf 2> /dev/null
python script/parse_paf.py $dataset.paf > /home/pierre.marijon/NCTC_dextractor/$dataset/contig2contig.lst
done
#!/usr/bin/env python3
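# script/filter_contain_nogenomic.py: remove from the extremity-path results
# every path touching a contig that is contained in another contig
# (contig2contig.lst) or not assigned as genomic (taxonomic_assign.lst).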
import csv
import begin
def filter_extremity_file(id_NCTC, tools, remove):
out_file = id_NCTC+"/assembly_report/path_{}_extremity_result_filtred.csv"
in_file = id_NCTC+"/assembly_report/find_path/path_{}_extremity_result.csv"
with open(out_file.format(tools), "w") as fh_out, \
open(in_file.format(tools)) as fh_in:
reader = csv.reader(fh_in)
writer = csv.writer(fh_out)
for row in reader:
if not (row[0] in remove or row[2] in remove):
writer.writerow(row)
def reformat_tigname(tigname):
    # "tig00000042" -> "42": drop the "tig" prefix, leading zeros, and whitespace
    return tigname[3:].lstrip("0").strip()
def jobs(id_NCTC):
remove = set()
with open(id_NCTC+"/contig2contig.lst") as fh:
for line in fh:
remove.add(reformat_tigname(line))
with open(id_NCTC+"/taxonomic_assign.lst") as fh:
for line in fh:
tig_name, tig_type = line.strip().split(";")
if tig_type != "genomic":
remove.add(reformat_tigname(tig_name))
print(id_NCTC, remove)
filter_extremity_file(id_NCTC, "canu", remove)
filter_extremity_file(id_NCTC, "minimap", remove)
@begin.start
def main(*id_list):
for id_NCTC in id_list:
jobs(id_NCTC)
#!/usr/bin/env python3
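# script/gfaminiasm2fasta.py: write the sequence ("S") records of a miniasm
# GFA as FASTA (assumes miniasm's four-column S lines: S, name, sequence, LN tag).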
import begin
@begin.start
def main(gfa, fasta):
with open(fasta, "w") as fh_out, open(gfa) as fh:
for line in fh:
if line.startswith("S"):
_, tig_id, tig_seq, _ = line.split("\t")
fh_out.write(">{}\n{}\n".format(tig_id, tig_seq))
#!/usr/bin/env python3
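# script/long_short.py: for each dataset, count the inter-contig paths shorter
# and longer than a per-assembler threshold (README step 7); the threshold is
# the read count of the smallest multi-read contig, floored at 10.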
import begin
import csv
tigInfo_template = "{}/canu/canu.contigs.layout.tigInfo"
minimap_gfa_template = "{}/miniasm/miniasm.gfa"
extremity_template = "{}/assembly_report/path_{}_extremity_result_filtred.csv"
def get_short_long(filename, threshold):
long_path = 0
short_path = 0
tig2dist = dict()
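    # key by unordered contig pair so the A->B and B->A paths count once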
with open(filename, "r") as fh:
for row in csv.DictReader(fh):
tig2dist[frozenset((row["tigA"], row["tigB"]))] = int(row["path_len"])
for v in tig2dist.values():
if v <= threshold:
short_path += 1
else:
long_path += 1
return short_path, long_path
def get_long_short_canu(dataset):
threshold = float("inf")
with open(tigInfo_template.format(dataset), "r") as fh:
for row in csv.DictReader(fh, delimiter='\t'):
if row["tigClass"] == "contig" and row["coverage"] != "1.00":
if int(row["numChildren"]) < threshold:
threshold = int(row["numChildren"])
if threshold < 10: threshold = 10
short_path, long_path = get_short_long(extremity_template.format(dataset,
"canu"),
threshold)
return (dataset, str(short_path), str(long_path), str(threshold))
def get_long_short_minimap(dataset):
    # Count the reads per unitig: in a miniasm GFA each "S" record is followed
    # by one "a" line per constituent read.
    tig2len = dict()
    name = "remove"
    nb = 0
    with open(minimap_gfa_template.format(dataset), "r") as fh:
        for line in fh:
            row = line.split('\t')
            if row[0] == "S":
                tig2len[name] = nb
                name = row[1]
                nb = 0
            elif row[0] == "a":
                nb += 1
    tig2len[name] = nb  # flush the last unitig
    del tig2len["remove"]
threshold = float("inf")
for tig, nb in tig2len.items():
if nb > 1 and nb < threshold:
threshold = nb
if threshold < 10: threshold = 10
short_path, long_path = get_short_long(extremity_template.format(dataset,
"minimap"),
threshold)
return (str(short_path), str(long_path), str(threshold))
@begin.start
def main(*datasets):
print(",".join(["dataset", "canu", "short", "long", "threshold", "minimap", "short", "long", "threshold"]))
for d in datasets:
print(",".join(get_long_short_canu(d) + get_long_short_minimap(d)))
#!/usr/bin/env python3
# script/parse_paf.py: read an all-vs-all contig PAF and print contigs that are
# contained in another one (matching bases cover > 75% of the contig length);
# called by script/contig_classification.sh.
import sys
to_remove = set()
for line in open(sys.argv[1]):
line = line.strip().split()
lengthA = int(line[1])
lengthB = int(line[6])
matching_bases = int(line[9]) # https://lh3.github.io/minimap2/minimap2.html
query = line[0]
target = line[5]
ratioA = matching_bases*1.0/lengthA
ratioB = matching_bases*1.0/lengthB
if ratioA > 0.75:
sys.stderr.write(query + " " + target + " " + str(ratioA) + '\n')
to_remove.add(query)
if ratioB > 0.75:
sys.stderr.write(target + " " + query + " " + str(ratioB) + '\n')
to_remove.add(target)
for x in to_remove: print(x)
#!/usr/bin/env python3
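# script/subset_gfa.py: keep only the `subset` fraction of longest reads and
# the links between them (read lengths taken from the LN:i: tag of "S" lines).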
import csv
import begin
@begin.start
@begin.convert(subset=float)
def main(gfa, sub_gfa, subset=0.3):
reads2len = dict()
with open(gfa) as fh:
reader = csv.reader(fh, delimiter='\t')
for line in reader:
if line[0] == "S":
reads2len[line[1]] = int(line[3].split(":")[2])
reads_sort_len = sorted(reads2len, key=lambda x: reads2len[x], reverse=True)
selected_read = set(reads_sort_len[:round(len(reads_sort_len)*subset)])
with open(gfa) as fh:
with open(sub_gfa, "w") as sub:
writer = csv.writer(sub, delimiter='\t')
reader = csv.reader(fh, delimiter='\t')
for line in reader:
if line[0] == "S":
if line[1] in selected_read:
writer.writerow(line)
elif line[0] == "L":
if line[1] in selected_read and line[3] in selected_read:
writer.writerow(line)
else:
writer.writerow(line)
#!/usr/bin/env python3
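# script/venn_of_common_path.py: for each dataset, draw a two-set Venn diagram
# of the contig-pair paths found in the filtered Canu and minimap extremity
# results, saved as <out_prefix><NCTC_id>.png.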
import begin
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
extremity_path = "{}/assembly_report/path_{}_extremity_result_filtred.csv"
def job(out_prefix, id_NCTC):
canu_path = set()
with open(extremity_path.format(id_NCTC, "canu")) as fh:
fh.readline()
for line in fh:
line = line.split(",")
canu_path.add(frozenset((line[0], line[2])))
print(id_NCTC, canu_path)
minimap_path = set()
with open(extremity_path.format(id_NCTC, "minimap")) as fh:
fh.readline()
for line in fh:
line = line.split(",")
minimap_path.add(frozenset((line[0], line[2])))
print(id_NCTC, minimap_path)
venn2([canu_path, minimap_path], set_labels = ('Canu path', 'Minimap path'))
plt.savefig(out_prefix + id_NCTC + ".png")
plt.clf()
@begin.start
def main(out_prefix, *id_list):
for id_NCTC in id_list:
job(out_prefix, id_NCTC)