Commit f1e0d1c5 authored by MARIJON Pierre

Basic pipeline

data*
.snakemake
# Created by https://www.gitignore.io/api/python
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
### Python Patch ###
.venv/
### Python.VirtualEnv Stack ###
# Virtualenv
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
[Bb]in
[Ii]nclude
[Ll]ib
[Ll]ib64
[Ll]ocal
[Ss]cripts
pyvenv.cfg
pip-selfcheck.json
# End of https://www.gitignore.io/api/python
contigs_seq = config["contigs"]
contig_base_name = contigs_seq[:contigs_seq.rfind(".")]
script_path = os.path.join(config["package_path"], "script/")
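# The rules below assume the following config keys, supplied by config.yaml
# plus `--config` overrides from the command line (the values shown are
# hypothetical examples, not defaults shipped with this commit):
#   contigs:      "asm/contigs.fasta"
#   graph:        "asm/contigs.gfa"
#   input:        "reads/reads.fasta"
#   read_type:    "pb"                  # or "ont"
#   package_path: "/path/to/knot/"
#   minimap:      bin, map_option, ovl_option (see config.yaml below)
#   yacrd:        bin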
rule build_AGG:
input:
search = "path_extremity_search.csv",
ovl_graph = raw_read_base_name + "_splited.gfa",
read2asm = "read2asm.paf",
asm_graph = config["graph"],
tig2tig = contig_base_name + "_filtred.gfa",
output:
"AAG.csv"
shell:
"python -m knot.path_search {input.search} {output} {input.ovl_graph} {input.read2asm} {input.asm_graph} {input.tig2tig}"
rule filter_contigs:
input:
contigs_seq
output:
contig_base_name + "_filtred.fasta"
shell:
"python -m knot.filter_tig {input} {output} -t 1000000"
rule map_contigs:
input:
contig_base_name + "_filtred.fasta"
output:
contig_base_name + "_filtred.paf"
shell:
" ".join([config["minimap"]["bin"],
config["minimap"]["map_option"]+config["read_type"],
"{input}", "{input}",
">", "{output}"
])
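# With the shipped config this shell line expands to, e.g. for read_type "pb":
#   minimap2 -x map-pb contigs_filtred.fasta contigs_filtred.fasta > contigs_filtred.paf
# i.e. the filtered contigs are mapped against themselves.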
rule map_read_contigs:
input:
asm = contig_base_name + "_filtred.fasta",
read = reads_path
output:
"read2asm.paf"
shell:
" ".join([config["minimap"]["bin"],
config["minimap"]["map_option"]+config["read_type"],
"{input.read}", "{input.asm}",
">", "{output}"
])
rule find_extremity:
input:
read2asm = "read2asm.paf",
asm_graph = config["graph"],
tig2tig = contig_base_name + "_filtred.gfa",
read2read = raw_read_base_name + "_splited.gfa"
output:
"path_extremity_search.csv"
shell:
"python -m knot.extremity_search {input.asm_graph} {input.tig2tig} {input.read2asm} {input.read2read} {output}"
reads_path = config["input"]
raw_read_base_name = reads_path[:reads_path.rfind(".")]
rule mapping:
input:
reads = "{path}.fasta"
output:
mapping = "{path}.paf"
shell:
" ".join([config["minimap"]["bin"],
config["minimap"]["ovl_option"]+config["read_type"],
"{input}", "{input}",
">", "{output}"
])
rule spliting:
input:
mapping = "{path}.paf",
reads = "{path}.fasta"
output:
splited_read = "{path}_splited.fasta",
yacrd = "{path}.yacrd"
shell:
config["yacrd"]["bin"]+" -i {input.mapping} -s {input.reads} -o {output.yacrd}"
rule overlap2stringgraph:
input:
"{path}.paf"
output:
"{path}.gfa"
shell:
"paf2gfa -c -i {input} {output}"
#!/usr/bin/env python3
# std import
import io
import os
import csv
import argparse
import subprocess
# project import
# ArgumentParser subclass whose help output is followed by snakemake's own help
class MyArgumentParser(argparse.ArgumentParser):
def print_help(self):
super().print_help()
print()
subprocess.run(["snakemake", "--help"])
def main(args = None):
if args is None:
import sys
args = sys.argv[1:]
parser = MyArgumentParser(prog="KNOT")
    # available single-letter options:
# b e g i j m o q u w x y z
# A B C E G H I J K L M N Q V W X Y Z
parser.add_argument("-C", "--contigs", required=True,
help="fasta file than contains contigs")
parser.add_argument("-g", "--graph", required=True,
help="contig graph")
parser.add_argument("-i", "--input", required=True,
help="read used for assembly")
parser.add_argument("--read-type", choices=["pb", "ont"], default="pb",
help="type of input read, default pb")
parser.add_argument("--clean", action="store_true")
    args, unknown_args = parser.parse_known_args(args)
args = vars(args)
package_path = os.path.dirname(__file__) + os.sep
snakemake_rule = os.path.join(package_path, "report.rules")
snakemake_config_path = os.path.join(package_path, "config.yaml")
call = ["snakemake", "report",
"--configfile", snakemake_config_path,
"--config",
"input="+args["input"],
"graph="+args["graph"],
"contigs="+args["contigs"],
"read_type="+args["read_type"],
"package_path="+package_path,
"--snakefile", snakemake_rule
]
if args["clean"]:
call.append("-S")
out = subprocess.run(call, stdout=subprocess.PIPE)
        fakefile = io.StringIO(out.stdout.decode("utf-8"))
        fakefile.seek(0)
        reader = csv.reader(fakefile, delimiter="\t")
        next(reader)  # skip the summary header line
        for row in reader:
            if row[0] != args["graph"]:  # never remove the input graph
                rm_out = subprocess.run(["rm", "-rf", row[0]])
return
    call += unknown_args
out = subprocess.call(call)
if __name__ == "__main__":
import sys
main(sys.argv[1:])
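# Sketch of an invocation (hypothetical file names; assumes this module is the
# package entry point). Options the parser does not know, such as -j, fall
# through to snakemake via unknown_args:
#   python -m knot -C contigs.fasta -g contigs.gfa -i reads.fasta --read-type ont -j 4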
out_path: "knot_report"
minimap:
bin: "minimap2"
ovl_option: "-x ava-"
map_option: "-x map-"
yacrd:
bin: "yacrd"
# std import
import csv
from collections import defaultdict
def get_valid_read(read2read):
    # collect the names of reads involved in at least one overlap ("L" line)
    result = set()
reader = csv.reader(read2read, delimiter="\t")
for row in reader:
if row[0] == "L":
result.add(row[1])
result.add(row[3])
return result
def get_tig2posread(read2tig, valid_read):
    # map each contig to (position on contig, read name, strand) for valid
    # reads whose alignment covers more than 70% of their length
    result = defaultdict(list)
reader = csv.reader(read2tig, delimiter="\t")
for row in reader:
if row[5] not in valid_read:
continue
if int(row[8]) - int(row[7]) > 0.7 * int(row[6]):
result[row[0]].append((int(row[2]), row[5], row[4]))
return result
def get_ext_ovl(asm_graph, tig2tig):
    # collect pairs of contig extremities already linked by an overlap, either
    # in the assembly graph or in the contig-against-contig mapping graph
    result = set()
    signe2pos = {"+": "begin", "-": "end", "begin": "+", "end": "-"}
for row in asm_graph:
row = row.strip().split('\t')
if row[0] == "L":
result.add((row[1]+signe2pos[row[2]], row[3]+signe2pos[row[4]]))
for row in tig2tig:
row = row.strip().split('\t')
if row[0] == "L":
result.add((row[1]+signe2pos[row[2]], row[3]+signe2pos[row[4]]))
return result
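# A minimal sketch of how the two parsers above compose, using in-memory
# handles instead of real GFA/PAF files; the records below are made up for
# illustration only.
if __name__ == "__main__":
    import io

    toy_gfa = io.StringIO("S\tread1\t*\n"
                          "S\tread2\t*\n"
                          "L\tread1\t+\tread2\t-\t100M\n")
    # PAF columns: query, qlen, qstart, qend, strand, target, tlen, tstart, tend, ...
    toy_paf = io.StringIO("tig1\t50000\t100\t8100\t+\t"
                          "read1\t10000\t500\t9500\t8000\t9000\t60\n")

    valid = get_valid_read(toy_gfa)         # {"read1", "read2"}
    print(get_tig2posread(toy_paf, valid))  # {'tig1': [(100, 'read1', '+')]}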
#!/usr/bin/env python3
import sys
import argparse
from knot.extremity_search import *
def main(args=None):
if args is None:
args = sys.argv[1:]
parser = argparse.ArgumentParser(prog="extremity_search")
parser.add_argument("asm_graph", type=argparse.FileType('r'),
help="assembly graph")
parser.add_argument("tig2tig", type=argparse.FileType('r'),
help="tig2tig graph")
parser.add_argument("read2tig", type=argparse.FileType('r'),
help="read mapped against asm")
parser.add_argument("read2read", type=argparse.FileType('r'),
help="SG graph")
parser.add_argument("output", type=argparse.FileType('w'),
help="file where extremity are writed")
args = parser.parse_args(args)
args = vars(args)
valid_read = get_valid_read(args["read2read"]) # read name
tig2posread = get_tig2posread(args["read2tig"], valid_read) # tig -> (begin, end, name)
for k in tig2posread:
tig2posread[k].sort()
    ext_not_search = list(sum(get_ext_ovl(args["asm_graph"], args["tig2tig"]), ()))  # flatten the linked extremity pairs: these extremities need no search
print("tig","read","strand_to_tig", sep=",", file=args["output"])
for tig in tig2posread.keys():
ext = tig+"_begin"
if ext in ext_not_search:
continue
print(ext, tig2posread[tig][0][1], tig2posread[tig][0][2],
sep=",", file=args["output"])
ext = tig+"_end"
if ext in ext_not_search:
continue
print(ext, tig2posread[tig][-1][1], tig2posread[tig][-1][2],
sep=",", file=args["output"])
if __name__ == '__main__':
main(sys.argv[1:])
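# Invoked by the find_extremity rule of the pipeline as:
#   python -m knot.extremity_search <asm_graph> <tig2tig.gfa> read2asm.paf <reads>_splited.gfa path_extremity_search.csv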
#!/usr/bin/env python3
import sys
import argparse
from Bio import SeqIO
def main(args=None):
if args is None:
args = sys.argv[1:]
parser = argparse.ArgumentParser(prog="filter_contig")
parser.add_argument("-t", "--threshold", default=100.000, type=int,
help="Only sequence with size upper than threshold are write in output default 100.000")
parser.add_argument("input", type=argparse.FileType('r'), help="input fasta")
parser.add_argument("output", type=argparse.FileType('w'), help="output fasta")
args = parser.parse_args(args)
args = vars(args)
for record in SeqIO.parse(args["input"], "fasta"):
if len(record.seq) > args["threshold"]:
SeqIO.write(record, args["output"], "fasta")
if __name__ == '__main__':
main(sys.argv[1:])
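# Invoked by the filter_contigs rule of the pipeline as:
#   python -m knot.filter_tig <contigs.fasta> <contigs>_filtred.fasta -t 1000000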
#!/usr/bin/env python3
# std import
import csv
import sys
import logging
import argparse
import itertools
# project import
from knot import extremity_search
from knot.path_search import paths
from knot.path_search import gfa2sg
# Logging configuration
logger = logging.getLogger()
logger.setLevel(logging.INFO)
formatter = logging.Formatter("%(levelname)s :: %(message)s")
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)
def main(args = None):
# argument parsing
if args is None:
args = sys.argv[1:]
parser = argparse.ArgumentParser(prog="paf2gfa",
formatter_class=argparse.
ArgumentDefaultsHelpFormatter)
parser.add_argument("search", type=argparse.FileType('r'))
parser.add_argument("result", type=argparse.FileType('w'))
parser.add_argument("ovl_graph", type=argparse.FileType('r'))
parser.add_argument("read2asm", type=argparse.FileType('r'))
parser.add_argument("asm_graph", type=argparse.FileType('r'))
parser.add_argument("tig2tig", type=argparse.FileType('r'))
args = vars(parser.parse_args(args))
# gfa to sg
sg = gfa2sg.get_sg(args["ovl_graph"])
args["ovl_graph"].seek(0)
# get info about contig
valid_read = extremity_search.get_valid_read(args["ovl_graph"])
tig2reads = {tig: {v[1] for v in val} for tig, val in extremity_search.get_tig2posread(args["read2asm"], valid_read).items()}
# build list of search
tig_ext = list()
reader = csv.DictReader(args["search"])
for row in reader:
tig_ext.append(tuple(row.values()))
print("tig1", "read1", "tig2", "read2", "path_len", "nbread_contig",
sep=",", file=args["result"])
for ext1, ext2 in itertools.permutations(tig_ext, 2):
        if ext1[0][:-6] == ext2[0][:-4] or ext1[0][:-4] == ext2[0][:-6]:
            continue  # same contig: strip the "_begin" (6 chars) / "_end" (4 chars) suffixes and compare names
node1 = ext1[1] + "-" if ext1[2] == "+" else ext1[1] + "+"
node2 = ext2[1] + "-" if ext2[2] == "+" else ext2[1] + "+"
path = paths.get_path(sg, node1, node2)
if path:
nbread_contig = paths.format_node_contig_counter(
paths.path_through_contig(tig2reads, path),
tig2reads)
print(ext1[0], node1, ext2[0], node2, len(path), nbread_contig,
sep=",", file=args["result"])
    for ext1, ext2 in extremity_search.get_ext_ovl(args["asm_graph"], args["tig2tig"]):
        print(ext1, "no", ext2, "no", 0, "", sep=",", file=args["result"])
if __name__ == "__main__":
main()
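# Invoked by the build_AGG rule of the pipeline as:
#   python -m knot.path_search path_extremity_search.csv AAG.csv <reads>_splited.gfa read2asm.paf <asm_graph> <tig2tig.gfa>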
# std import
import logging
# pip import
import gfapy
import networkx as nx
# project import
def get_sg(gfa_file):
# Convert gfa in SG
logging.info("begin parseGFA")
graph = read_gfa_from_handler(gfa_file)
logging.info("end parseGFA")
G = nx.DiGraph()
logging.info("populate graph begin")
populate_graph(graph, G)
logging.info("populate graph end")
print_info(G)
logging.info("transitive reduction begin")
G = transitive_reduction(G)
logging.info("transitive reduction end")
nb_trans = check_trans_red(G)
if nb_trans > 0:
logging.warning("Nb error in transitive reduction "+str(nb_trans))
print_info(G)
return G
def read_gfa_from_handler(handler):
gfa = gfapy.Gfa()
for line in handler:
gfa.append(line)
return gfa
def populate_graph(gfa, G):
    is_contain = isContain(gfa)
    # add each dovetail overlap in both orientations
    for d in gfa.dovetails:
__add_dovetails(d, G, is_contain)
__add_dovetails(d.complement(), G, is_contain)
def __add_dovetails(d, G, is_contain):
G.add_node(d.from_name + d.from_orient, containment=is_contain(d.from_name))
G.add_node(d.to_name + d.to_orient, containment=is_contain(d.to_name))
    # derive an edge quality from the NM and om tags when both are present
    if d.NM is None or d.om is None or d.om == 0:
        qual_val = 0.0
    else:
        qual_val = (d.NM / d.om) * 100.0
G.add_edge(d.from_name + d.from_orient,
d.to_name + d.to_orient,
qual=qual_val,
len=d.alignment.length_on_query(),
overhang_len=d.to_segment.length - d.alignment.length_on_query(),
overhang_maplen=d.get("om"),
weight=1)
class isContain:
    # callable recording which segments appear as the contained side of a
    # containment ("C") line
    def __init__(self, gfa):
        self.containment2pos = {l.to_name: i for i, l in enumerate(gfa.containments)}
def __call__(self, node):
return node in self.containment2pos
def index(self, node):
return self.containment2pos[node]
def print_info(G):
logging.debug("begin transitiv reduction")
logging.debug("nb edge : " + str(G.number_of_edges()))
logging.debug("nb node : " + str(G.number_of_nodes()))
logging.debug("nb strong components : " + str(nx.number_strongly_connected_components(G)))
logging.debug("nb weak components : " + str(nx.number_weakly_connected_components(G)))
def check_trans_red(G):
    # count edges x->z still implied by a two-step path x->y->z, i.e.
    # transitive edges that the reduction should have removed
    nb_error = 0
for x in G.nodes():
for y in G.successors(x):
for z in (set(G.successors(x)) & set(G.successors(y))):
nb_error += 1
return nb_error
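# get_sg() above calls transitive_reduction(), which is not defined in this
# commit. A minimal sketch of what it could look like is given below: it
# removes every edge x->z for which a two-step path x->y->z exists, which is
# exactly the condition check_trans_red() counts. This is an assumption about
# the intended behaviour, not the original implementation (a string-graph
# reduction would normally also take overlap lengths into account, as in
# Myers 2005).
def transitive_reduction(G):
    reduced = G.copy()
    for x in G.nodes():
        succ = set(G.successors(x))
        for y in succ:
            # any edge x->z that is also reachable through y is transitive
            for z in succ & set(G.successors(y)):
                if reduced.has_edge(x, z):
                    reduced.remove_edge(x, z)
    return reduced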