Skip to content

Commit

Permalink
moving sequence_neighborhood_main to extract_neighborhood file
Browse files Browse the repository at this point in the history
  • Loading branch information
skafaie committed Sep 15, 2024
1 parent e0753a0 commit c99bd0a
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 87 deletions.
89 changes: 88 additions & 1 deletion sarand/extract_neighborhood.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,20 @@
import shutil
import multiprocessing
from csv import DictReader
from pathlib import Path

from sarand.util.logger import LOG
from sarand.utils import (
reverse_sign,
find_node_name,
exist_in_path,
compare_two_sequences,
read_path_info_from_align_file
read_path_info_from_align_file,
delete_lines_started_with
)

from sarand.config import SEQ_DIR_NAME, SEQ_NAME_PREFIX

OUT_DIR = "output"
TEMP_DIR = "temp"

Expand Down Expand Up @@ -1891,6 +1895,89 @@ def neighborhood_sequence_extraction(
)
return seq_file, paths_info_file

def sequence_neighborhood_main(
params,
gfa_file,
amr_seq_align_info,
debug: bool
):
"""
The core function to extract the neighborhood of AMRs
Parameters:
params: the list pf parameters imported from params.py
bandage_path: the path to bandage executable file
gfa_file: the file containing the assembly graph
amr_seq_align_info: the alignment info (AMR alignment in the graph)
Return:
the list of files containing extracted sequence and the details of nodes representing them
"""

# seq_files = []
# path_info_files = []
LOG.info(f"Extracting neighborhood sequences with length = {params.neighbourhood_length}")

# remove paths from GFA file
# AM: We don't want to modify the users input data so this now goes to a new file
"""
AM: Since we don't want to modify the users input data, this is now being written
to a new file in the output directory. This could be in a temporary directory
instead.
"""
path_gfa_new = Path(params.output_dir) / f'{gfa_file.stem}_no_paths.gfa'
delete_lines_started_with("P", gfa_file, path_gfa_new)
gfa_file = path_gfa_new

sequence_dir = os.path.join(
params.output_dir,
SEQ_DIR_NAME,
f'{SEQ_DIR_NAME}_{params.neighbourhood_length}',
)
os.makedirs(sequence_dir, exist_ok=True)

# If running single threaded do not add any overhead using multiprocessing pool
if params.num_cores == 1:
lists = list()
for x in amr_seq_align_info:
lists.append(neighborhood_sequence_extraction(gfa_file,
params.neighbourhood_length,
sequence_dir,
params.min_target_identity,
SEQ_NAME_PREFIX,
1000, # should this really be an option? path_node_threshold
params.max_kmer_size,
params.extraction_timeout,
params.assembler, x))
else:
p_extraction = partial(
neighborhood_sequence_extraction,
gfa_file,
params.neighbourhood_length,
sequence_dir,
params.min_target_identity,
SEQ_NAME_PREFIX,
1000, # should this really be an option? path_node_threshold
params.max_kmer_size,
params.extraction_timeout,
params.assembler
)
with Pool(params.num_cores) as p:
lists = list(p.imap(p_extraction, amr_seq_align_info))
seq_files, path_info_files = zip(*lists)

# AM: To clean up the file we created earlier
os.remove(path_gfa_new)

if debug:
try_dump_to_disk(
{
'seq_files': seq_files,
'path_info_files': path_info_files
},
Path(sequence_dir) / 'debug_sequence_neighborhood_main.json'
)

return seq_files, path_info_files


if __name__ == "__main__":
pass
Expand Down
88 changes: 2 additions & 86 deletions sarand/full_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@
ANNOTATION_DIR
from sarand.external.graph_aligner import GraphAligner, GraphAlignerParams
from sarand.external.bandage import Bandage, BandageParams
from sarand.extract_neighborhood import neighborhood_sequence_extraction
#from sarand.extract_neighborhood import neighborhood_sequence_extraction
from sarand.extract_neighborhood import sequence_neighborhood_main
from sarand.model.fasta_seq import FastaSeq
from sarand.util.file import try_dump_to_disk
from sarand.util.logger import LOG
Expand All @@ -44,7 +45,6 @@
amr_name_from_comment,
extract_name_from_file_name,
restricted_amr_name_from_modified_name,
delete_lines_started_with,
read_path_info_from_align_file_with_multiple_amrs,
annotate_sequence,
extract_amr_sequences,
Expand Down Expand Up @@ -1163,90 +1163,6 @@ def seq_annotation_main(params, seq_files, path_info_files, amr_files, debug: bo
return all_seq_info_lists, annotation_files


def sequence_neighborhood_main(
params,
gfa_file,
amr_seq_align_info,
debug: bool
):
"""
The core function to extract the neighborhood of AMRs
Parameters:
params: the list pf parameters imported from params.py
bandage_path: the path to bandage executable file
gfa_file: the file containing the assembly graph
amr_seq_align_info: the alignment info (AMR alignment in the graph)
Return:
the list of files containing extracted sequence and the details of nodes representing them
"""

# seq_files = []
# path_info_files = []
LOG.info(f"Extracting neighborhood sequences with length = {params.neighbourhood_length}")

# remove paths from GFA file
# AM: We don't want to modify the users input data so this now goes to a new file
"""
AM: Since we don't want to modify the users input data, this is now being written
to a new file in the output directory. This could be in a temporary directory
instead.
"""
path_gfa_new = Path(params.output_dir) / f'{gfa_file.stem}_no_paths.gfa'
delete_lines_started_with("P", gfa_file, path_gfa_new)
gfa_file = path_gfa_new

sequence_dir = os.path.join(
params.output_dir,
SEQ_DIR_NAME,
f'{SEQ_DIR_NAME}_{params.neighbourhood_length}',
)
os.makedirs(sequence_dir, exist_ok=True)

# If running single threaded do not add any overhead using multiprocessing pool
if params.num_cores == 1:
lists = list()
for x in amr_seq_align_info:
lists.append(neighborhood_sequence_extraction(gfa_file,
params.neighbourhood_length,
sequence_dir,
params.min_target_identity,
SEQ_NAME_PREFIX,
1000, # should this really be an option? path_node_threshold
params.max_kmer_size,
params.extraction_timeout,
params.assembler, x))
else:
p_extraction = partial(
neighborhood_sequence_extraction,
gfa_file,
params.neighbourhood_length,
sequence_dir,
params.min_target_identity,
SEQ_NAME_PREFIX,
1000, # should this really be an option? path_node_threshold
params.max_kmer_size,
params.extraction_timeout,
params.assembler
)
with Pool(params.num_cores) as p:
lists = list(p.imap(p_extraction, amr_seq_align_info))
seq_files, path_info_files = zip(*lists)

# AM: To clean up the file we created earlier
os.remove(path_gfa_new)

if debug:
try_dump_to_disk(
{
'seq_files': seq_files,
'path_info_files': path_info_files
},
Path(sequence_dir) / 'debug_sequence_neighborhood_main.json'
)

return seq_files, path_info_files


def full_pipeline_main(params):
"""
Main runner function for the sarand workflow
Expand Down

0 comments on commit c99bd0a

Please sign in to comment.