Commit f296ee83 authored by BELCOUR Arnaud's avatar BELCOUR Arnaud

Add reactions extraction for each cluster.

Add creation of upset graph using intervene.
parent eb64d615
......@@ -14,25 +14,33 @@ usage:
option:
-h --help Show help.
-r --reactions=FILE pathname of the file containing reactions in each species of the comparison.
-o --output=FILE pathname of the output file.
-o --output=FOLDER path to the output folder.
"""
import docopt
import itertools
import pandas as pa
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import os
import seaborn as sns
import subprocess
sns.set_style("white")
sns.set('poster', rc={'figure.figsize':(50,37)}, font_scale=4)
from scipy.cluster.hierarchy import dendrogram
from collections import defaultdict
from scipy.cluster.hierarchy import dendrogram, fcluster
from scipy.spatial import distance
from sklearn.metrics.pairwise import pairwise_distances
from fastcluster import linkage
from scipy.spatial.distance import squareform, pdist
def dendrogram_creation(reaction_file, output_pathname):
def reaction_figure_creation(reaction_file, output_folder):
# Check if output_folder exists, if not create it.
if not os.path.isdir("{0}".format(output_folder)):
os.mkdir("{0}".format(output_folder))
# Read the reactions file with pandas.
reactions_dataframe = pa.read_csv(reaction_file, sep='\t')
# Keep column containing absence-presence of reactions.
......@@ -58,15 +66,78 @@ def dendrogram_creation(reaction_file, output_pathname):
linkage_matrix = linkage(distance_matrix_jaccard, method="average")
# Draw a dendrogram of the clustering.
dendrogram(linkage_matrix, labels=absence_presence_matrix.index, leaf_font_size=40)
reaction_dendrogram = dendrogram(linkage_matrix, labels=absence_presence_matrix.index, leaf_font_size=40)
plt.savefig(output_folder+'/reaction_dendrogram.png')
# Extract species in each clusteR.
k = len(set(reaction_dendrogram['color_list']))
results = fcluster(linkage_matrix, k, criterion='maxclust')
species = absence_presence_matrix.index.tolist()
cluster_species = dict(zip(species, results))
cluster_classes = defaultdict(list)
for key, value in cluster_species.items():
cluster_classes[value].append(key)
# Extract reactions in each cluster.
cluster_reactions = {}
for cluster in cluster_classes:
reactions_temp = []
for species in cluster_classes[cluster]:
species_reactions_dataframe = reactions_dataframe[reactions_dataframe[species] == True]
reactions_temp.extend(species_reactions_dataframe.index.tolist())
cluster_reactions[cluster] = set(reactions_temp)
all_reactions = [reactions for reactions in cluster_reactions.values()]
cluster_intersections = set.intersection(*all_reactions)
cluster_subintersection = {}
# Extract intersection between clusters.
for cluster_number in reversed(range(len(cluster_reactions))):
if cluster_number != 0 and cluster_number != 1:
for set_list in itertools.combinations(cluster_reactions, cluster_number):
tmp_reactions = [cluster_reactions[cluster] for cluster in set_list]
cltemp = set.intersection(*tmp_reactions)
intersection_temp = cltemp - cluster_intersections
cluster_subintersection[set_list] = intersection_temp
# Create reactions which intersect for each cluster.
cluster_subsubintersection = {}
for cluster in cluster_classes:
species_intersections = []
for set_list in cluster_subintersection:
if cluster in set_list:
species_intersections.append(cluster_subintersection[set_list])
cluster_subsubintersection[cluster] = set([j for i in species_intersections for j in i])
# Extract reactions unique for each cluster.
cluster_unique = {}
for cluster in cluster_classes:
cluster_unique[cluster] = cluster_reactions[cluster]-cluster_intersections-cluster_subsubintersection[cluster]
input_folder = output_folder + '/temp_data/'
path_to_intervene = 'intervene'
if not os.path.isdir("{0}".format(input_folder)):
os.mkdir("{0}".format(input_folder))
# Create data for creating upset graph using intervene.
for cluster in cluster_classes:
df = pa.DataFrame({'_'.join(cluster_classes[cluster]): list(cluster_reactions[cluster])})
df.to_csv(input_folder+'/'+'_'.join(cluster_classes[cluster])+'.tsv', sep='\t', index=None)
plt.savefig(output_pathname)
cmd = '{0} upset -i {1}/*.tsv --type list -o {2} --figtype svg'.format(path_to_intervene, input_folder, output_folder)
subprocess.call(cmd, shell=True)
def main():
args = docopt.docopt(__doc__)
reaction_pathname = args["--reactions"]
output_pathname = args["--output"]
dendrogram_creation(reaction_pathname, output_pathname)
reaction_figure_creation(reaction_pathname, output_pathname)
main()
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment