moving sequence_neighborhood_main to extract_neighborhood file

beiko-lab · Sep 15, 2024 · c99bd0a · c99bd0a
1 parent e0753a0
commit c99bd0a
Show file tree

Hide file tree

Showing 2 changed files with 90 additions and 87 deletions.
diff --git a/sarand/extract_neighborhood.py b/sarand/extract_neighborhood.py
@@ -26,16 +26,20 @@
 import shutil
 import multiprocessing
 from csv import DictReader
+from pathlib import Path
 
 from sarand.util.logger import LOG
 from sarand.utils import (
     reverse_sign,
     find_node_name,
     exist_in_path,
     compare_two_sequences,
-    read_path_info_from_align_file
+    read_path_info_from_align_file,
+    delete_lines_started_with
 )
 
+from sarand.config import SEQ_DIR_NAME, SEQ_NAME_PREFIX
+
 OUT_DIR = "output"
 TEMP_DIR = "temp"
 
@@ -1891,6 +1895,89 @@ def neighborhood_sequence_extraction(
         )
     return seq_file, paths_info_file
 
+def sequence_neighborhood_main(
+        params,
+        gfa_file,
+        amr_seq_align_info,
+        debug: bool
+):
+    """
+    The core function to extract the neighborhood of AMRs
+    Parameters:
+        params: the list pf parameters imported from params.py
+		bandage_path: the path to bandage executable file
+        gfa_file: the file containing the assembly graph
+        amr_seq_align_info: the alignment info (AMR alignment in the graph)
+    Return:
+        the list of files containing extracted sequence and the details of nodes representing them
+    """
+
+    # seq_files = []
+    # path_info_files = []
+    LOG.info(f"Extracting neighborhood sequences with length = {params.neighbourhood_length}")
+
+    # remove paths from GFA file
+    # AM: We don't want to modify the users input data so this now goes to a new file
+    """
+    AM: Since we don't want to modify the users input data, this is now being written
+    to a new file in the output directory. This could be in a temporary directory
+    instead.
+    """
+    path_gfa_new = Path(params.output_dir) / f'{gfa_file.stem}_no_paths.gfa'
+    delete_lines_started_with("P", gfa_file, path_gfa_new)
+    gfa_file = path_gfa_new
+
+    sequence_dir = os.path.join(
+        params.output_dir,
+        SEQ_DIR_NAME,
+        f'{SEQ_DIR_NAME}_{params.neighbourhood_length}',
+    )
+    os.makedirs(sequence_dir, exist_ok=True)
+
+    # If running single threaded do not add any overhead using multiprocessing pool
+    if params.num_cores == 1:
+        lists = list()
+        for x in amr_seq_align_info:
+            lists.append(neighborhood_sequence_extraction(gfa_file,
+                                                          params.neighbourhood_length,
+                                                          sequence_dir,
+                                                          params.min_target_identity,
+                                                          SEQ_NAME_PREFIX,
+                                                          1000,  # should this really be an option? path_node_threshold
+                                                          params.max_kmer_size,
+                                                          params.extraction_timeout,
+                                                          params.assembler, x))
+    else:
+        p_extraction = partial(
+            neighborhood_sequence_extraction,
+            gfa_file,
+            params.neighbourhood_length,
+            sequence_dir,
+            params.min_target_identity,
+            SEQ_NAME_PREFIX,
+            1000,  # should this really be an option? path_node_threshold
+            params.max_kmer_size,
+            params.extraction_timeout,
+            params.assembler
+        )
+        with Pool(params.num_cores) as p:
+            lists = list(p.imap(p_extraction, amr_seq_align_info))
+    seq_files, path_info_files = zip(*lists)
+
+    # AM: To clean up the file we created earlier
+    os.remove(path_gfa_new)
+
+    if debug:
+        try_dump_to_disk(
+            {
+                'seq_files': seq_files,
+                'path_info_files': path_info_files
+            },
+            Path(sequence_dir) / 'debug_sequence_neighborhood_main.json'
+        )
+
+    return seq_files, path_info_files
+
 
 if __name__ == "__main__":
     pass

diff --git a/sarand/full_pipeline.py b/sarand/full_pipeline.py
@@ -31,7 +31,8 @@
     ANNOTATION_DIR
 from sarand.external.graph_aligner import GraphAligner, GraphAlignerParams
 from sarand.external.bandage import Bandage, BandageParams
-from sarand.extract_neighborhood import neighborhood_sequence_extraction
+#from sarand.extract_neighborhood import neighborhood_sequence_extraction
+from sarand.extract_neighborhood import sequence_neighborhood_main
 from sarand.model.fasta_seq import FastaSeq
 from sarand.util.file import try_dump_to_disk
 from sarand.util.logger import LOG
@@ -44,7 +45,6 @@
     amr_name_from_comment,
     extract_name_from_file_name,
     restricted_amr_name_from_modified_name,
-    delete_lines_started_with,
     read_path_info_from_align_file_with_multiple_amrs,
     annotate_sequence,
     extract_amr_sequences,
@@ -1163,90 +1163,6 @@ def seq_annotation_main(params, seq_files, path_info_files, amr_files, debug: bo
     return all_seq_info_lists, annotation_files
 
 
-def sequence_neighborhood_main(
-        params,
-        gfa_file,
-        amr_seq_align_info,
-        debug: bool
-):
-    """
-    The core function to extract the neighborhood of AMRs
-    Parameters:
-        params: the list pf parameters imported from params.py
-		bandage_path: the path to bandage executable file
-        gfa_file: the file containing the assembly graph
-        amr_seq_align_info: the alignment info (AMR alignment in the graph)
-    Return:
-        the list of files containing extracted sequence and the details of nodes representing them
-    """
-
-    # seq_files = []
-    # path_info_files = []
-    LOG.info(f"Extracting neighborhood sequences with length = {params.neighbourhood_length}")
-
-    # remove paths from GFA file
-    # AM: We don't want to modify the users input data so this now goes to a new file
-    """
-    AM: Since we don't want to modify the users input data, this is now being written
-    to a new file in the output directory. This could be in a temporary directory
-    instead.
-    """
-    path_gfa_new = Path(params.output_dir) / f'{gfa_file.stem}_no_paths.gfa'
-    delete_lines_started_with("P", gfa_file, path_gfa_new)
-    gfa_file = path_gfa_new
-
-    sequence_dir = os.path.join(
-        params.output_dir,
-        SEQ_DIR_NAME,
-        f'{SEQ_DIR_NAME}_{params.neighbourhood_length}',
-    )
-    os.makedirs(sequence_dir, exist_ok=True)
-
-    # If running single threaded do not add any overhead using multiprocessing pool
-    if params.num_cores == 1:
-        lists = list()
-        for x in amr_seq_align_info:
-            lists.append(neighborhood_sequence_extraction(gfa_file,
-                                                          params.neighbourhood_length,
-                                                          sequence_dir,
-                                                          params.min_target_identity,
-                                                          SEQ_NAME_PREFIX,
-                                                          1000,  # should this really be an option? path_node_threshold
-                                                          params.max_kmer_size,
-                                                          params.extraction_timeout,
-                                                          params.assembler, x))
-    else:
-        p_extraction = partial(
-            neighborhood_sequence_extraction,
-            gfa_file,
-            params.neighbourhood_length,
-            sequence_dir,
-            params.min_target_identity,
-            SEQ_NAME_PREFIX,
-            1000,  # should this really be an option? path_node_threshold
-            params.max_kmer_size,
-            params.extraction_timeout,
-            params.assembler
-        )
-        with Pool(params.num_cores) as p:
-            lists = list(p.imap(p_extraction, amr_seq_align_info))
-    seq_files, path_info_files = zip(*lists)
-
-    # AM: To clean up the file we created earlier
-    os.remove(path_gfa_new)
-
-    if debug:
-        try_dump_to_disk(
-            {
-                'seq_files': seq_files,
-                'path_info_files': path_info_files
-            },
-            Path(sequence_dir) / 'debug_sequence_neighborhood_main.json'
-        )
-
-    return seq_files, path_info_files
-
-
 def full_pipeline_main(params):
     """
     Main runner function for the sarand workflow