Merge pull request beiko-lab#4 from beiko-lab/master

Fin's changes
maguire-lab · Oct 8, 2022 · 13e54de · 13e54de
2 parents 71d7e40 + f3eea62
commit 13e54de
Show file tree

Hide file tree

Showing 6 changed files with 41 additions and 57 deletions.
diff --git a/README.md b/README.md
@@ -33,6 +33,8 @@ You can test your install has worked by...
 
 This will execute sarand on a test dataset and check all the expected outputs are created correctly.
 
+
+
 ## Usage
 
 All of sarand's parameters can be set using the command line flags.
@@ -84,21 +86,22 @@ optional arguments:
 ### Output
 All results will be available in the specified output directory (default is `sarand_results_` followed by a timestamp).
 
-	Here is the list of important directories and files that can be seen there and a short description of their content.
-	- `AMR_info`: this directory contains the list of identified AMR sequences.
-	  - AMR_info/sequences/:The sequence of identified AMRs, from graph, is stored here, with a name similar to their original name (file name is generated by calling `sarand/utils.py::restricted_amr_name_from_modified_name(amr_name_from_title(amr_original_name)))`
-	  - AMR_info/alignments/: The alignment details for all AMR sequences are stored here.
-	- `sequences_info/sequences_info_{neighbourhood_length}/`: This directory stores the information of extracted neighborhood sequences from the assembly graph.
-	  - sequences_info/sequences_info_{params.neighbourhood_length}/sequences/: the extracted sequences in the neighborhood of each AMR are stored in a file like `ng_sequences_{AMR_NAME}_{params.neighbourhood_length}_{DATE}.txt`.
-	  For each extracted sequence, the first line denotes the corresponding path, where the nodes representing the AMR sequence are placed in '[]'.
-	  The next line denotes the extracted sequence where the AMR sequence is in lower case letters and the neighborhood is in upper case letters.
-	  - sequences_info/sequences_info_{params.neighbourhood_length}/paths_info/: The information of nodes representing the AMR neighborhood including their name, the part of the sequence represented by each node (start position and end position) as well as their coverage is stored in a file like `ng_sequences_{AMR_NAME}_{params.neighbourhood_length}_{DATE}.csv`
-	- `annotations/annotations_{params.neighbourhood_length}`: The annotation details are stored in this directory.
-	  - annotations/annotations_{params.neighbourhood_length}/annotation_{AMR_NAME}_{params.neighbourhood_length}: this directory contains all annotation details for a given AMR.
-	    - gene_comparison_<AMR_NAME>.png: An image visualizing annotations
-	    - annotation_detail_{AMR_NAME}.csv: the list of annotations of all extracted sequences for an AMR gene
-	    - trimmed_annotation_info_{AMR_NAME}.csv: the list of unique annotations of all extracted sequences for an AMR gene
-	    - coverage_annotation_{COVERAGE_DIFFERENCE}_{AMR_NAME}.csv:
-	    the list of the annotations in which the gene coverage difference from the AMR gene coverage is less than GENE_COVERAGE_DIFFERENCE value.
-	    - prokka_dir_extracted{NUM}_{DATE}: it contains the output of prokka for annotation of a sequence extracted from the neighborhood of the target AMR gene in the assembly graph.
-	    - rgi_dir: contains RGI annotation details for all extracted neighborhood sequences of the target AMR gene.
+Here is the list of important directories and files that can be seen there and a short description of their content.
+
+* `AMR_info`: this directory contains the list of identified AMR sequences.
+    * AMR_info/sequences/: The sequences of identified AMRs, extracted from the graph, are stored here, each with a name similar to its original name (the file name is generated by calling `sarand/utils.py::restricted_amr_name_from_modified_name(amr_name_from_title(amr_original_name))`).
+    * AMR_info/alignments/: The alignment details for all AMR sequences are stored here.
+
+* `sequences_info/sequences_info_{neighbourhood_length}/`: This directory stores the information of extracted neighborhood sequences from the assembly graph.
+    * sequences_info/sequences_info_{params.neighbourhood_length}/sequences/: the extracted sequences in the neighborhood of each AMR are stored in a file like `ng_sequences_{AMR_NAME}_{params.neighbourhood_length}_{DATE}.txt`.
+For each extracted sequence, the first line denotes the corresponding path, where the nodes representing the AMR sequence are placed in '[]'. The next line denotes the extracted sequence where the AMR sequence is in lower case letters and the neighborhood is in upper case letters.
+    * sequences_info/sequences_info_{params.neighbourhood_length}/paths_info/: The information of nodes representing the AMR neighborhood including their name, the part of the sequence represented by each node (start position and end position) as well as their coverage is stored in a file like `ng_sequences_{AMR_NAME}_{params.neighbourhood_length}_{DATE}.csv`
+
+* `annotations/annotations_{params.neighbourhood_length}`: The annotation details are stored in this directory.
+    * `annotations/annotations_{params.neighbourhood_length}/annotation_{AMR_NAME}_{params.neighbourhood_length}`: this directory contains all annotation details for a given AMR.
+        * `gene_comparison_<AMR_NAME>.png`: An image visualizing annotations
+        * `annotation_detail_{AMR_NAME}.csv`: the list of annotations of all extracted sequences for an AMR gene
+        * `trimmed_annotation_info_{AMR_NAME}.csv`: the list of unique annotations of all extracted sequences for an AMR gene
+        * `coverage_annotation_{COVERAGE_DIFFERENCE}_{AMR_NAME}.csv`: the list of the annotations in which the gene coverage difference from the AMR gene coverage is less than the GENE_COVERAGE_DIFFERENCE value.
+        * `prokka_dir_extracted{NUM}_{DATE}`: it contains the output of prokka for annotation of a sequence extracted from the neighborhood of the target AMR gene in the assembly graph.
+        * `rgi_dir`: contains RGI annotation details for all extracted neighborhood sequences of the target AMR gene.
diff --git a/sarand/annotation_visualization.py b/sarand/annotation_visualization.py
@@ -14,9 +14,9 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import math
+import sys
 from PIL import Image
 import argparse
-import csv
 from csv import DictReader
 import logging
 import shutil
@@ -41,7 +41,7 @@ def show_images(image_list, main_title, output, cols=1, title_list=None):
         title_list = ["Image (%d)" % i for i in range(1, n_images + 1)]
     fig = plt.figure()
     for n, (image, title) in enumerate(zip(image_list, title_list)):
-        a = fig.add_subplot(math.ceil(n_images / float(cols)), cols, n + 1)
+        _ = fig.add_subplot(math.ceil(n_images / float(cols)), cols, n + 1)
         plt.imshow(image)
         ax = plt.gca()
         ax.axes.xaxis.set_ticks([])

diff --git a/sarand/extract_neighborhood.py b/sarand/extract_neighborhood.py
@@ -18,26 +18,18 @@
 import os
 import errno
 import gfapy
-import re
-import argparse
-import difflib
 import datetime
 import csv
-import collections
 import subprocess
-from Bio import SeqIO
 from gfapy.sequence import rc
 import shutil
 import logging
-import enum
 import multiprocessing
 from csv import DictReader
 
 from sarand.utils import (
     reverse_sign,
-    find_node_orient,
     find_node_name,
-    find_node_name_orient,
     exist_in_path,
     compare_two_sequences,
     read_path_info_from_align_file,
@@ -1534,7 +1526,7 @@ def check_if_similar_ng_extractions_exist(
             and new_end_pos == end_pos
         ):
             similar_path["down_stream"] = i
-            found_dwon_stream = True
+            found_down_stream = True
 
     if found_up_stream and found_down_stream:
         return similar_path
@@ -1568,7 +1560,7 @@ def check_if_similar_ng_extractions_exist(
                 new_last_node == second_last_node or new_second_last_node == last_node
             ):
                 similar_path["down_stream"] = i
-                found_dwon_stream = True
+                found_down_stream = True
             if found_up_stream and found_down_stream:
                 return similar_path
 
@@ -1600,7 +1592,7 @@ def order_path_nodes(path_nodes, amr_file, out_dir, threshold=90):
         # run blast query for alignement
         blast_file_name = os.path.join(out_dir, "blast.csv")
         blast_file = open(blast_file_name, "w")
-        blast_command = subprocess.run(
+        subprocess.run(
             [
                 "blastn",
                 "-query",

diff --git a/sarand/full_pipeline.py b/sarand/full_pipeline.py
@@ -17,40 +17,29 @@
 
 import sys
 import os
-import errno
 import copy
 import datetime
 import csv
 import collections
 import subprocess
 import shutil
-import matplotlib.pyplot as plt
 import logging
 from functools import partial
 from multiprocessing.pool import Pool
-import seaborn as sns
-import pandas as pd
 
 from sarand.extract_neighborhood import neighborhood_sequence_extraction
 from sarand.annotation_visualization import visualize_annotation
 from sarand.utils import (
     retrieve_AMR,
-    extract_files,
     create_fasta_file,
     annotate_sequence,
     split_up_down_info,
     seqs_annotation_are_identical,
     similar_seq_annotation_already_exist,
     amr_name_from_comment,
-    amr_name_from_title,
-    retreive_original_amr_name,
     extract_name_from_file_name,
     restricted_amr_name_from_modified_name,
-    extract_info_from_overlap_file,
-    read_path_info_from_align_file,
     read_path_info_from_align_file_with_multiple_amrs,
-    extract_path_info_for_amrs,
-    compare_two_sequences,
     delete_lines_started_with,
 )
 
@@ -1160,6 +1149,7 @@ def full_pipeline_main(params):
         pdb.set_trace()
         sys.exit(1)
 
+    # not used anywhere? @Somayeh
     send_amr_align_info = False
     if unique_amr_path_list and len(unique_amr_path_list) == len(unique_amr_files):
         send_amr_align_info = True
@@ -1182,11 +1172,10 @@ def full_pipeline_main(params):
 	amr_seq_align_info
     )
 
-    coverage_annotation_list = []
     all_seq_info_lists, annotation_file_list = seq_annotation_main(
         params, seq_files, path_info_files, unique_amr_files
     )
-
+    # never used? @Somayeh
     coverage_annotation_list = seq_annotation_trim_main(
         params, unique_amr_files, all_seq_info_lists, annotation_file_list, True
     )

diff --git a/sarand/sarand_main.py b/sarand/sarand_main.py
@@ -8,7 +8,6 @@
 import pkg_resources
 from pathlib import Path
 
-from sarand import full_pipeline, utils
 from sarand.__init__ import __version__
 from sarand.full_pipeline import full_pipeline_main
 from sarand.utils import (
@@ -105,12 +104,12 @@ def main():
         "--neighbourhood_length",
         default=1000,
         type=validate_range(int, 0, 100000),
-        help="Size of gene neighbourhood to extract from the " "assembly graph",
+        help="Size of gene neighbourhood to extract from the assembly graph",
     )
     parser.add_argument(
         "-o",
         "--output_dir",
-        help="Output folder for current " "run of sarand",
+        help="Output folder for current run of sarand",
         default=Path(f"sarand_results_{run_time}"),
     )
     parser.add_argument(
@@ -137,9 +136,9 @@ def main():
 
     args = parser.parse_args()
     # check dependencies work
-    cwd = os.getcwd()
-    PROKKA_COMMAND_PREFIX = 'docker run -v '+cwd+':/data staphb/prokka:latest '
     dependencies = ["Bandage --version", "prokka --version", "blastn -version"]
+    #cwd = os.getcwd()
+    #PROKKA_COMMAND_PREFIX = 'docker run -v '+cwd+':/data staphb/prokka:latest '
     #dependencies = ["/media/Data/tools/Bandage_Ubuntu_dynamic_v0_8_1/Bandage --version",PROKKA_COMMAND_PREFIX+ "prokka --version", "blastn -version"]
     if not args.no_rgi:
         dependencies.append("rgi main --version")

diff --git a/sarand/utils.py b/sarand/utils.py
@@ -251,7 +251,7 @@ def run_RGI(
         "--exclude_nudge",
     ]
     if include_loose:
-        carg_list.append("--include_loose")
+        arg_list.append("--include_loose")
     rgi_command = subprocess.run(arg_list, stdout=subprocess.PIPE, check=True)
     logging.info(rgi_command.stdout.decode("utf-8"))
     seq_info_list = []
@@ -317,7 +317,7 @@ def annotate_sequence(
         + datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")
     )
     prefix_name = "mygenome_" + seq_description
-    arg_list = [            
+    arg_list = [
         "prokka",
         "--metagenome",
         "--outdir",
@@ -328,10 +328,10 @@ def annotate_sequence(
         "--notrna",
         seq_file_name,
     ]
-    cwd = os.getcwd()
-    PROKKA_COMMAND_PREFIX = 'docker run -v '+cwd+':/data staphb/prokka:latest '
-    pre_list = PROKKA_COMMAND_PREFIX.strip().split(" ")
-    #arg_list = pre_list + arg_list    
+    #cwd = os.getcwd()
+    #PROKKA_COMMAND_PREFIX = 'docker run -v '+cwd+':/data staphb/prokka:latest '
+    #pre_list = PROKKA_COMMAND_PREFIX.strip().split(" ")
+    #arg_list = pre_list + arg_list
     prokka_command = subprocess.run(arg_list, stdout=subprocess.PIPE, check=True)
     logging.info(prokka_command.stdout.decode("utf-8"))
     # move prokka directory to the right address
@@ -484,7 +484,7 @@ def compare_two_sequences(
     # run blast query for alignement
     blast_file_name = os.path.join(output_dir, "blast" + blast_ext + ".csv")
     blast_file = open(blast_file_name, "w")
-    blast_command = subprocess.run(
+    subprocess.run(
         [
             "blastn",
             "-query",
@@ -709,6 +709,7 @@ def read_path_info_from_align_file_with_multiple_amrs(align_file, threshold=99):
 
 def extract_path_info_for_amrs(all_align_files, unique_amr_files, amr_count, threshold):
     """ """
+    unique_amr_path_list = []
     if len(all_align_files) == amr_count:
         amr_align_files = extract_unique_align_files(all_align_files, unique_amr_files)
         for align_file in amr_align_files:
@@ -802,7 +803,7 @@ def check_dependencies(programs):
     missing = False
     for program in programs:
         try:
-            program_name = program.split()[0]            
+            program_name = program.split()[0]
             output = subprocess.run(
                 program,
                 shell=True,