Mentions légales du service
Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
Source_Encoding
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Deploy
Releases
Package registry
Model registry
Operate
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Admin message
GitLab upgrade completed. Current version is 17.11.3.
Show more breadcrumbs
dnarXiv
Source_Encoding
Commits
adaa1986
Commit
adaa1986
authored
1 year ago
by
BOULLE Olivier
Browse files
Options
Downloads
Patches
Plain Diff
refactorings, comments, generalise zipping methods, improve merging small files
parent
be62c47f
Branches
Branches containing commit
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
post_processing.py
+3
-11
3 additions, 11 deletions
post_processing.py
pre_processing.py
+101
-46
101 additions, 46 deletions
pre_processing.py
with
104 additions
and
57 deletions
post_processing.py
+
3
−
11
View file @
adaa1986
...
@@ -12,12 +12,6 @@ do the opposite of pre_processing.py
...
@@ -12,12 +12,6 @@ do the opposite of pre_processing.py
take dna sequences and convert them into files
take dna sequences and convert them into files
"""
"""
def
compressed_name
(
file_path
):
return
file_path
+
"
.gz
"
def
uncompressed_name
(
file_path
):
return
file_path
.
replace
(
"
.gz
"
,
""
)
def
convert_to_binary
(
input_dir_path
,
compressed_dir_path
):
def
convert_to_binary
(
input_dir_path
,
compressed_dir_path
):
"""
"""
...
@@ -26,7 +20,7 @@ def convert_to_binary(input_dir_path, compressed_dir_path):
...
@@ -26,7 +20,7 @@ def convert_to_binary(input_dir_path, compressed_dir_path):
for
filename
in
os
.
listdir
(
input_dir_path
):
for
filename
in
os
.
listdir
(
input_dir_path
):
file_path
=
os
.
path
.
join
(
input_dir_path
,
filename
)
file_path
=
os
.
path
.
join
(
input_dir_path
,
filename
)
result_file_path
=
os
.
path
.
join
(
compressed_dir_path
,
compressed_name
(
filename
))
result_file_path
=
os
.
path
.
join
(
compressed_dir_path
,
pre_processing
.
get_
compressed_name
(
filename
))
# checking if it is a file
# checking if it is a file
if
os
.
path
.
isfile
(
file_path
):
if
os
.
path
.
isfile
(
file_path
):
...
@@ -44,14 +38,12 @@ def uncompress_files(compressed_dir_path, uncompressed_dir_path):
...
@@ -44,14 +38,12 @@ def uncompress_files(compressed_dir_path, uncompressed_dir_path):
"""
"""
for
filename
in
os
.
listdir
(
compressed_dir_path
):
for
filename
in
os
.
listdir
(
compressed_dir_path
):
file_path
=
os
.
path
.
join
(
compressed_dir_path
,
filename
)
file_path
=
os
.
path
.
join
(
compressed_dir_path
,
filename
)
result
_file_path
=
os
.
path
.
join
(
uncompressed_dir_path
,
uncompressed_name
(
filename
))
uncompressed
_file_path
=
os
.
path
.
join
(
uncompressed_dir_path
,
pre_processing
.
get_
uncompressed_name
(
filename
))
# checking if it is a file
# checking if it is a file
if
os
.
path
.
isfile
(
file_path
):
if
os
.
path
.
isfile
(
file_path
):
pre_processing
.
unzip_file
(
file_path
)
pre_processing
.
unzip_file
(
file_path
,
uncompressed_file_path
)
# move the unzipped file
os
.
replace
(
uncompressed_name
(
file_path
),
result_file_path
)
elif
os
.
path
.
isdir
(
file_path
):
elif
os
.
path
.
isdir
(
file_path
):
print
(
"
error post processing (uncompress_files) : directory found in compressed_dir_path
"
,
filename
)
print
(
"
error post processing (uncompress_files) : directory found in compressed_dir_path
"
,
filename
)
...
...
This diff is collapsed.
Click to expand it.
pre_processing.py
+
101
−
46
View file @
adaa1986
...
@@ -17,28 +17,72 @@ splitting files that are too large for 1 assembly
...
@@ -17,28 +17,72 @@ splitting files that are too large for 1 assembly
"""
"""
def
zip_file
(
file_path
,
output_path
):
def
zip_file
(
file_path
,
output_path
,
compression_type
=
"
gzip
"
):
"""
"""
split
the file
with gzip
and write it at the output path
compress
the file and write it at the output path
"""
"""
compression_command
=
"
gzip -c9
"
+
file_path
+
"
>
"
+
output_path
+
"
.gz
"
subprocess
.
run
(
'
/bin/bash -c
"
$COMMAND
"'
,
shell
=
True
,
env
=
{
'
COMMAND
'
:
compression_command
})
if
compression_type
==
"
gzip
"
:
compression_command
=
"
gzip -c9
"
+
file_path
+
"
>
"
+
output_path
subprocess
.
run
(
'
/bin/bash -c
"
$COMMAND
"'
,
shell
=
True
,
env
=
{
'
COMMAND
'
:
compression_command
})
return
if
compression_type
==
"
cmix
"
:
if
not
os
.
path
.
isfile
(
output_path
):
compression_command
=
"
/udd/oboulle/Documents/result_analysis/compression_analysis/cmix/cmix -c
"
+
file_path
+
"
"
+
output_path
subprocess
.
run
(
'
/bin/bash -c
"
$COMMAND
"'
,
shell
=
True
,
env
=
{
'
COMMAND
'
:
compression_command
})
else
:
print
(
"
already done
"
,
output_path
)
return
# type not supported
print
(
"
compression error, unknown format:
"
,
compression_type
)
exit
(
0
)
def
unzip_file
(
file_path
):
def
unzip_file
(
file_path
,
output_path
,
compression_type
=
"
gzip
"
):
"""
"""
un
zip
the file and write it just where it is
un
compress
the file and write it just where it is
"""
"""
decompression_command
=
"
gzip -d
"
+
file_path
subprocess
.
run
(
'
/bin/bash -c
"
$COMMAND
"'
,
shell
=
True
,
env
=
{
'
COMMAND
'
:
decompression_command
})
if
compression_type
==
"
gzip
"
:
decompression_command
=
"
gzip -d
"
+
file_path
subprocess
.
run
(
'
/bin/bash -c
"
$COMMAND
"'
,
shell
=
True
,
env
=
{
'
COMMAND
'
:
decompression_command
})
def
compressed_name
(
file_path
):
# move the unzipped file to the defined output path
return
file_path
+
"
.gz
"
os
.
replace
(
get_uncompressed_name
(
file_path
,
"
gzip
"
),
output_path
)
return
def
uncompressed_name
(
file_path
):
return
file_path
.
replace
(
"
.gz
"
,
""
)
if
compression_type
==
"
cmix
"
:
decompression_command
=
"
/udd/oboulle/Documents/result_analysis/compression_analysis/cmix/cmix -d
"
+
file_path
+
"
"
+
output_path
subprocess
.
run
(
'
/bin/bash -c
"
$COMMAND
"'
,
shell
=
True
,
env
=
{
'
COMMAND
'
:
decompression_command
})
return
# type not supported
print
(
"
decompression error, unknown format:
"
,
compression_type
)
exit
(
0
)
def
get_compressed_name
(
file_path
,
compression_type
=
"
gzip
"
):
if
compression_type
==
"
gzip
"
:
return
file_path
+
"
.gz
"
if
compression_type
==
"
cmix
"
:
return
file_path
+
"
.cx
"
# type not supported
print
(
"
get_compressed_name error, unknown format:
"
,
compression_type
)
exit
(
0
)
def
get_uncompressed_name
(
file_path
,
compression_type
=
"
gzip
"
):
if
compression_type
==
"
gzip
"
:
return
file_path
.
replace
(
"
.gz
"
,
""
)
if
compression_type
==
"
cmix
"
:
return
file_path
.
replace
(
"
.cx
"
,
""
)
# type not supported
print
(
"
get_uncompressed_name error, unknown format:
"
,
compression_type
)
exit
(
0
)
def
insert_path_in_files
(
input_dir_path
:
str
,
rearanged_files_dir_path
:
str
)
->
None
:
def
insert_path_in_files
(
input_dir_path
:
str
,
rearanged_files_dir_path
:
str
)
->
None
:
"""
"""
...
@@ -86,18 +130,18 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str)
...
@@ -86,18 +130,18 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str)
# compress the file
# compress the file
file_path
=
os
.
path
.
join
(
rearanged_files_dir_path
,
filename
)
file_path
=
os
.
path
.
join
(
rearanged_files_dir_path
,
filename
)
comp
ressed
_file_path
=
os
.
path
.
join
(
compressed_dir_path
,
filename
)
comp_file_path
=
get_compressed_name
(
os
.
path
.
join
(
compressed_dir_path
,
filename
)
)
# checking if it is a file
# checking if it is a file
if
os
.
path
.
isfile
(
file_path
):
if
os
.
path
.
isfile
(
file_path
):
zip_file
(
file_path
,
comp
ressed
_file_path
)
zip_file
(
file_path
,
comp_file_path
)
elif
os
.
path
.
isdir
(
file_path
):
elif
os
.
path
.
isdir
(
file_path
):
print
(
"
error pre processing (compress_all) : directory found in rearanged dir path
"
,
filename
)
print
(
"
error pre processing (compress_all) : directory found in rearanged dir path
"
,
filename
)
exit
(
0
)
exit
(
0
)
# get binary size of the compressed file
# get binary size of the compressed file
binary_len
=
len
(
file_to_dna
.
convert_file_to_bits
(
comp
ressed_name
(
compressed
_file_path
))
)
binary_len
=
len
(
file_to_dna
.
convert_file_to_bits
(
comp_file_path
))
if
binary_len
<=
max_binary_length
:
# if acceptable length, it's perfect
if
binary_len
<=
max_binary_length
:
# if acceptable length, it's perfect
files_compressed_size
[
filename
]
=
binary_len
# save the compressed size for this file
files_compressed_size
[
filename
]
=
binary_len
# save the compressed size for this file
...
@@ -105,7 +149,7 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str)
...
@@ -105,7 +149,7 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str)
else
:
else
:
# file too large, need to split it
# file too large, need to split it
os
.
remove
(
comp
ressed_name
(
compressed
_file_path
)
)
# delete the compressed file
os
.
remove
(
comp_file_path
)
# delete the compressed file
# read the original file as bytes
# read the original file as bytes
with
open
(
file_path
,
"
rb
"
)
as
input_file
:
with
open
(
file_path
,
"
rb
"
)
as
input_file
:
...
@@ -147,10 +191,10 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str)
...
@@ -147,10 +191,10 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str)
with
open
(
split_file_path
,
"
wb
"
)
as
f
:
# write the bytes content
with
open
(
split_file_path
,
"
wb
"
)
as
f
:
# write the bytes content
f
.
write
(
split_file_bytes_content
)
f
.
write
(
split_file_bytes_content
)
compressed_subfile_path
=
get_compressed_name
(
get_uncompressed_name
(
comp_file_path
)
+
split_file_footer
)
# compress the split_file to the compressed directory
# compress the split_file to the compressed directory
zip_file
(
split_file_path
,
compressed_file_path
+
split_file_footer
)
zip_file
(
split_file_path
,
compressed_
sub
file_path
)
compressed_subfile_path
=
compressed_name
(
compressed_file_path
+
split_file_footer
)
# check the size of the subfile
# check the size of the subfile
binary_len
=
len
(
file_to_dna
.
convert_file_to_bits
(
compressed_subfile_path
))
binary_len
=
len
(
file_to_dna
.
convert_file_to_bits
(
compressed_subfile_path
))
...
@@ -166,28 +210,35 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str)
...
@@ -166,28 +210,35 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str)
# if the sub file is recreated, the new size will overwrite this one
# if the sub file is recreated, the new size will overwrite this one
def
merge_short_files
(
files_compressed_size
:
dict
):
def
merge_short_files
(
files_compressed_size
:
dict
):
"""
each molecule can store a maximum fixed number of bits
since it
'
s more efficient to have a few long molecules than a lot of short molecules,
small files can be merged in one same molecule
"""
#sorted_files_sizes = sorted(files_compressed_size.items(), key=lambda item: item[1], reverse=True) # sort dict by sizes from highest to lowest
#sorted_files_sizes = sorted(files_compressed_size.items(), key=lambda item: item[1], reverse=True) # sort dict by sizes from highest to lowest
files_compressed_size_bis
=
{}
#
files_compressed_size_bis = {}
merged_files_paths
=
[]
#
merged_files_paths = []
new_merge
=
False
# true if at least one new merge has been made
new_merge
=
False
#
set to
true if at least one new merge has been made
for
i
,
filename
in
enumerate
(
list
(
files_compressed_size
.
keys
())[:
-
1
]):
for
i
,
filename
in
enumerate
(
list
(
files_compressed_size
.
keys
())[:
-
1
]):
unmerged
=
True
# set to false if a merge is made
file_compressed_size
=
files_compressed_size
[
filename
]
# get size of the compressed file
file_compressed_size
=
files_compressed_size
[
filename
]
if
file_compressed_size
>=
max_binary_length
:
# impossible to merge because too large
if
file_compressed_size
is
None
or
file_compressed_size
>=
max_binary_length
:
# impossible to merge because too large
, or has already been used in a merge (set to None)
continue
continue
# skip this file
for
filename_2
in
list
(
files_compressed_size
.
keys
())[
i
+
1
:]:
for
filename_2
in
list
(
files_compressed_size
.
keys
())[
i
+
1
:]:
file_compressed_size_2
=
files_compressed_size
[
filename_2
]
file_compressed_size_2
=
files_compressed_size
[
filename_2
]
if
file_compressed_size
+
file_compressed_size_2
<=
max_binary_length
:
if
file_compressed_size_2
is
None
or
file_compressed_size
+
file_compressed_size_2
<=
max_binary_length
:
# the sum of the 2 compressed files is lower than what can be stored,
# so the original files will be merged and recompressed,
# the compression of a merging is supposed to be smaller than the sum of compressions of each file
#
mer
ge the
2
file
s
# ge
t
the
binary content of each
file
with
open
(
os
.
path
.
join
(
rearanged_files_dir_path
,
filename
),
"
rb
"
)
as
input_file
:
with
open
(
os
.
path
.
join
(
rearanged_files_dir_path
,
filename
),
"
rb
"
)
as
input_file
:
bytes_content
=
b
""
.
join
(
input_file
.
readlines
())
bytes_content
=
b
""
.
join
(
input_file
.
readlines
())
with
open
(
os
.
path
.
join
(
rearanged_files_dir_path
,
filename_2
),
"
rb
"
)
as
input_file
:
with
open
(
os
.
path
.
join
(
rearanged_files_dir_path
,
filename_2
),
"
rb
"
)
as
input_file
:
...
@@ -196,15 +247,17 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str)
...
@@ -196,15 +247,17 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str)
# remove the "merged_" from the name of already merged files for visibility
# remove the "merged_" from the name of already merged files for visibility
merged_file_name
=
"
merged_
"
+
filename
.
replace
(
"
merged_
"
,
""
)
+
"
+
"
+
filename_2
.
replace
(
"
merged_
"
,
""
)
merged_file_name
=
"
merged_
"
+
filename
.
replace
(
"
merged_
"
,
""
)
+
"
+
"
+
filename_2
.
replace
(
"
merged_
"
,
""
)
merged_file_path
=
os
.
path
.
join
(
rearanged_files_dir_path
,
merged_file_name
)
merged_file_path
=
os
.
path
.
join
(
rearanged_files_dir_path
,
merged_file_name
)
with
open
(
merged_file_path
,
"
wb
"
)
as
f
:
# write the bytes content
compressed_merged_file_path
=
get_compressed_name
(
os
.
path
.
join
(
compressed_dir_path
,
merged_file_name
))
print
(
"
new merge :
"
,
filename
,
"
and
"
,
filename_2
)
with
open
(
merged_file_path
,
"
wb
"
)
as
f
:
# write the sum of bytes content
f
.
write
(
bytes_content
+
bytes_content_2
)
f
.
write
(
bytes_content
+
bytes_content_2
)
compressed_merged_file_path
=
os
.
path
.
join
(
compressed_dir_path
,
merged_file_name
)
# compress the merged file created
# add the merged file path to the bis dict with its compressed size
zip_file
(
merged_file_path
,
compressed_merged_file_path
)
zip_file
(
merged_file_path
,
compressed_merged_file_path
)
merged_binary_len
=
len
(
file_to_dna
.
convert_file_to_bits
(
compressed_name
(
compressed_merged_file_path
)))
# test its size just in case, but it should fit in a molecule
merged_binary_len
=
len
(
file_to_dna
.
convert_file_to_bits
(
compressed_merged_file_path
))
if
merged_binary_len
>=
max_binary_length
:
if
merged_binary_len
>=
max_binary_length
:
print
(
"
error merging result too large
"
,
compressed_merged_file_path
)
print
(
"
error merging result too large
"
,
compressed_merged_file_path
)
...
@@ -213,23 +266,25 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str)
...
@@ -213,23 +266,25 @@ def compress_and_split(rearanged_files_dir_path: str, compressed_dir_path: str)
# add the merged file to the dict because it can still be used for other merging if it's short enough
# add the merged file to the dict because it can still be used for other merging if it's short enough
files_compressed_size
[
merged_file_name
]
=
merged_binary_len
files_compressed_size
[
merged_file_name
]
=
merged_binary_len
# remove the 2 compressed files of the 2 files
# remove the 2 old compressed files of the 2 files
os
.
remove
(
os
.
path
.
join
(
compressed_dir_path
,
compressed_name
(
filename
)))
os
.
remove
(
os
.
path
.
join
(
compressed_dir_path
,
get_compressed_name
(
filename
)))
os
.
remove
(
os
.
path
.
join
(
compressed_dir_path
,
compressed_name
(
filename_2
)))
os
.
remove
(
os
.
path
.
join
(
compressed_dir_path
,
get_compressed_name
(
filename_2
)))
# set the compressed size of the 2 files to None to avoid them to be reused for merging
files_compressed_size
[
filename
]
=
None
files_compressed_size
[
filename_2
]
=
None
# set the compressed size of the 2 files to a too high number to avoid them to be reused for merging
new_merge
=
True
# keep in memory that at least one new merge has been made in this loop
files_compressed_size
[
filename
]
=
2
*
max_binary_length
files_compressed_size
[
filename_2
]
=
2
*
max_binary_length
new_merge
=
True
break
# leave the second loop, but others merges can still be done in the continuation of the first loop
break
# continue to try to create other merging if at least one merge has been made
# continue to try to create other merging if at least one merge has been made
# otherwise, the loop can end since it no longer finds possible merges
if
new_merge
:
if
new_merge
:
print
(
"
continue merging...
"
)
print
(
"
continue merging...
"
)
print
(
files_compressed_size
)
merge_short_files
(
files_compressed_size
)
merge_short_files
(
files_compressed_size
)
# otherwise, the loop can end since it no longer finds possible merges
print
(
files_compressed_size
)
print
(
files_compressed_size
)
merge_short_files
(
files_compressed_size
)
merge_short_files
(
files_compressed_size
)
...
@@ -245,7 +300,7 @@ def convert_to_sequence(compressed_dir_path, payload_fragments_dir_path):
...
@@ -245,7 +300,7 @@ def convert_to_sequence(compressed_dir_path, payload_fragments_dir_path):
# checking if it is a file
# checking if it is a file
if
os
.
path
.
isfile
(
file_path
):
if
os
.
path
.
isfile
(
file_path
):
output_file_path
=
os
.
path
.
join
(
payload_fragments_dir_path
,
uncompressed_name
(
filename
))
output_file_path
=
os
.
path
.
join
(
payload_fragments_dir_path
,
get_
uncompressed_name
(
filename
))
dna_sequence
=
file_to_dna
.
encode_file
(
file_path
,
output_file_path
)
# convert binaries into a dna sequence and save result in the output file
dna_sequence
=
file_to_dna
.
encode_file
(
file_path
,
output_file_path
)
# convert binaries into a dna sequence and save result in the output file
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment