Commit 00fcb322 authored by VIGNET Pierre's avatar VIGNET Pierre
Browse files

Add doc; remove test blobs on solution_sort module

parent 70f543b4
......@@ -45,7 +45,8 @@ def launch_researchs(args):
def launch_sort(args):
"""Parse a solution file and sort all frontier places in alphabetical order.
"""Parse a solution file or a directory of solution files,
and sort all frontier places in alphabetical order.
"""
# Module import
......@@ -230,7 +231,8 @@ def main():
parser_solutions_sort = subparsers.add_parser('sort_solutions',
help=launch_sort.__doc__)
parser_solutions_sort.add_argument('sol_file',
help="Solution file (output of 'compute_macs' command).")
help="Solution file or directory with solution files" + \
"(output of 'compute_macs' command).")
parser_solutions_sort.set_defaults(func=launch_sort)
......
......@@ -21,15 +21,32 @@
# Dyliss team
# IRISA Campus de Beaulieu
# 35042 RENNES Cedex, FRANCE
"""This module provides some functions to do some analyzes on the output
files of Cadbiom.
"""
from __future__ import unicode_literals
from __future__ import print_function
import os
import glob
from collections import Counter
from collections import defaultdict
import csv
## Handle output files #########################################################
def get_solutions(file_descriptor):
"""return a tuple of the original line + the stripped line containing the solution"""
"""Generator of solution lines and corresponding stripped lines.
.. note:: Events are not returned, only sets of frontier places.
:param: Opened file.
:type: <file>
:return: Line (without '\n') and stripped line (with '\t' replaced by ' '
(except for final '\t')).
:rtype: <tuple <str>, <str>>
"""
for line in file_descriptor:
# Remove possible \t separator from first line (frontier solution)
......@@ -40,161 +57,69 @@ def get_solutions(file_descriptor):
if stripped_line == '':
continue
# Remove events or other lines
if stripped_line[0] not in ('%', '=', ' '):
# print(stripped_line)
# Sort in lower case, remove ' ' empty elements
yield line, stripped_line
## Sort functions ##############################################################
def sort_solutions(path):
    """Sort the frontier places of a solution file or of a directory of files.

    When ``path`` is a directory, every entry located directly inside it
    whose name contains ``cam`` is handed to ``sort_solutions_in_file``;
    sub-directories are not explored. When ``path`` is a file, it is
    handed to ``sort_solutions_in_file`` directly.

    :param path: Path of a solution file, or of a directory of solution files.
    :type path: <str>
    :raises AssertionError: If ``path`` is neither an existing file nor an
        existing directory.
    """
    if os.path.isdir(path):
        # os.path.join only inserts the separator when it is missing, so
        # 'dir' and 'dir/' produce the same pattern.
        for solution_file in glob.glob(os.path.join(path, '*cam*')):
            sort_solutions_in_file(solution_file)
    elif os.path.isfile(path):
        sort_solutions_in_file(path)
    else:
        # Raised explicitly (instead of an ``assert`` statement) so that the
        # check is still performed when Python runs with -O, while keeping
        # the exception type unchanged for existing callers.
        raise AssertionError(
            "Not an existing file or directory: {}".format(path)
        )
def sort_solutions_in_file(filepath):
    """Sort all solutions in the given file in alphabetical order.

    Each solution line yielded by ``get_solutions`` is replaced in the file
    content by the same places sorted case-insensitively and joined with a
    single space.

    .. warning:: The file is modified in place.

    :param filepath: Filepath to be opened and in which solutions will be
        sorted.
    :type filepath: <str>
    """
    solutions = dict()

    with open(filepath, 'r+') as fd:
        # Get old line as key and ordered line as value
        for line, stripped_line in get_solutions(fd):
            # split(' ') yields '' (never ' ') for consecutive or trailing
            # spaces: drop these empty elements before sorting.
            places = [place for place in stripped_line.split(' ') if place]
            # Sort in lower case so the order does not depend on the case
            solutions[line] = " ".join(sorted(places, key=lambda s: s.lower()))

        # Rewind the whole file and load all the content
        fd.seek(0)
        file_text = fd.read()

        # Replace old sols with the new ones
        for original_sol, sorted_sol in solutions.items():
            file_text = file_text.replace(original_sol, sorted_sol)

        # Rewind the whole file and write all text in the current opened file
        fd.seek(0)
        fd.write(file_text)
        # The rewritten text can be shorter than the original one (empty
        # elements removed): cut the leftover bytes of the old content.
        fd.truncate()
################################################################################
def get_cam_lines(file):
    """Return the set of stripped solution lines found in a file.

    :param file: Path of the solution file to read.
    :type file: <str>
    :return: Set of the stripped lines yielded by ``get_solutions``
        (duplicated lines are merged).
    :rtype: <set <str>>
    """
    # The file is only read here: opening it read-only (instead of 'r+')
    # also works on files without write permission.
    with open(file, 'r') as fd:
        return {stripped_line for _, stripped_line in get_solutions(fd)}
def make_matrix(path):
    """Print, for each '*cam.txt' file of a directory, the gene patterns
    found in its solutions.

    For every solution line returned by ``get_cam_lines``, the subset of the
    hard-coded genes of interest whose name appears in the line is computed;
    the set of distinct subsets (and its size) is printed per file.
    The function waits for user input after each file, then prints the number
    of files processed.

    .. note:: Interactive debugging helper; relies on ``raw_input``
        (Python 2).

    :param path: Directory containing the '*cam.txt' files.
    :type path: <str>
    """
    import glob
    import os

    genes = ('COL1A1_gene', 'MMP2_gene', 'MMP9_gene', 'TGFB1_gene',
             'TIMP1_gene', 'decorin_gene')

    nb_files = 0
    # os.path.join adds the directory separator only when it is missing
    for nb_files, cam_file in enumerate(
            glob.glob(os.path.join(path, '*cam.txt')), 1):
        # One frozenset of matching genes per solution; the enclosing set
        # keeps only the distinct combinations.
        patterns = {
            frozenset(gene for gene in genes if gene in cam)
            for cam in get_cam_lines(cam_file)
        }

        print('file:', cam_file)
        print(patterns, len(patterns))
        raw_input('pause')

    print("nb files", nb_files)
if __name__ == "__main__":
make_matrix('./docker_results/')
exit()
import glob
from collections import Counter
total_cams = list()
# Get each cams for each file
for file in glob.glob('./docker_results/*cam.txt'):
temp_cams = get_cam_lines(file)
total_cams += list(temp_cams)
# Print the number of cams for the given file
print(file, len(temp_cams))
# Verification of duplicated cams (number > 1)
print([k for k, v in Counter(total_cams).items() if v != 1])
print("total cams:", len(total_cams))
print("len counter", len(Counter(total_cams)))
# Check new results vs old results for SRP9 cams
old_cams = get_cam_lines('/media/DATA/Projets/dyliss_tgf/cadbiom/data/pid_and_clock_no_perm_p21corrected_start_SRP9_complete.txt')
new_cams = get_cam_lines('./bite/Whole NCI-PID database translated into CADBIOM formalism(and)_SRP9_cam.txt')
print("Anciennes", len(old_cams)) # 317
print("Nouvelles", len(new_cams)) # 557
print("Intersection", len(old_cams & new_cams)) # 221
diff = old_cams - new_cams
print("Anciennes non retrouvées", len(diff)) # 96
print("Nouvelles en plus", len(new_cams - old_cams)) # 336
# print("Anciennes non retrouvées", diff)
# Get list of lists of frontier places in old cams not found this time
diff_pb = [problematic_cam.split(' ') for problematic_cam in diff] # Len: 96
def sort_solutions(path):
"""Entry point for sorting solutions.
# Common frontier places in problematic cams
common_frontier_places = set(diff_pb[0]).intersection(*diff_pb)
# print("Places communes aux solutions non retrouvées", common_frontier_places)
print("Nombre de places communes aux solutions non retrouvées", len(common_frontier_places)) # 14
Parse a solution file and sort all frontier places in alphabetical order.
# Result:
set([u'SRF_nucl', u'ERK1_2', u'STAT5_cy', u'IL2Rgamma_JAK3_intToMb',
u'IL2Rbeta_JAK1_LCK_intToMb', u'JNK1_2', u'MEKK1', u'ELK1_nucl',
u'FOS_cy_gene', u'SRP9_gene', u'IL2_glycosylation_exCellRegion',
u'ceramide_start', u'JUN_gene', u'IL2Ralpha_intToMb_gene'])
:param: Filepath or directory path containing Cadbiom solutions.
:type: <str>
"""
# changement de nom dans chaque solution de l'ancien fichier: ceramide_start > ceramide
new_ancient_cams = {cam.replace('ceramide_start', 'ceramide') for cam in old_cams}
print("Anciennes retouchees", len(new_ancient_cams)) # 317 (normal)
print("Anciennes non retrouvées", len(new_ancient_cams - new_cams)) # 0
# => tout a été retrouvé
# Check valid input file/directory
assert os.path.isfile(path) or os.path.isdir(path)
if os.path.isdir(path):
# Search for *cam* files located directly in the directory (not recursive)
# (cam.txt, cam_complete.txt, cam_step.txt)
path = path if path[-1] == '/' else path + '/'
[sort_solutions_in_file(file) for file in glob.glob(path + '*cam*')]
else:
sort_solutions_in_file(path)
################################################################################
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment