Skip to content

Commit

Permalink
Run bigscape v2 (#251)
Browse files Browse the repository at this point in the history
* add BiG-SCAPE 2 to dependencies

* add example config for bigscape 2

* implement running BiG-SCAPE

* fix bigscape2 dependency

* copy db file properly

* remove cluster arg

* run ruff formatter

* fix ruff check issues

* ensure str for mypy static type checking

* Move configuration to correct file

* use os.path.join instead of string concat

* fix merge mistake

* remove extra bigscape 2 files

* add missing library

* add validator for bigscape version

* add test for bigscape version

* fix typo

* add simple run testing

* add test to check for nonexistent input path

* add info to docstring

* add exception on invalid version number

* move log to after validation

* add version info to log

* use specific exception

* rework return codes and exceptions

* add wrong version test

* add invalid path test for v2

* specify exception

* fix tests not correctly running

* change imports to reflect style in other tests

* specify exception type

* add minimal test data

* add real data tests

* remove class

* force string for mypy

* Apply suggestions from code review

Co-authored-by: Cunliang Geng <[email protected]>

* add exceptions to docstring

* add docstring to tests

* use tmp path instead of data path

* add missing typing

* add explanation of cluster mode

* parameterize tests

* remove two gbks

* better documentation

* skip tests with dataset

* do not check output code within run

* move log

* add test with incorrect parameters for runtime exception

* remove temporary nplinker.toml

* add stderr to error log

* add import needed for skipping test on CI

* Apply suggestions from code review

Co-authored-by: Cunliang Geng <[email protected]>

* expand docstring

* Apply suggestions from code review

Co-authored-by: Cunliang Geng <[email protected]>

* fix ruff complaints

---------

Co-authored-by: Cunliang Geng <[email protected]>
  • Loading branch information
adraismawur and CunliangGeng authored Jul 17, 2024
1 parent 1e23e57 commit f767bfa
Show file tree
Hide file tree
Showing 10 changed files with 3,565 additions and 27 deletions.
15 changes: 15 additions & 0 deletions bin/install-nplinker-deps
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ pip install -q -U pip setuptools
echo "🔥 Start installing BigScape ..."
[[ -d BiG-SCAPE ]] || git clone https://github.com/medema-group/BiG-SCAPE.git
cd BiG-SCAPE
git reset --hard
git config --add advice.detachedHead false # disable advice
git config pull.ff only
git checkout master
Expand All @@ -136,6 +137,20 @@ echo "🔥 Start installing BigScape ..."
chmod 775 Annotated_MIBiG_reference
ln -sf $LIB_PATH/BiG-SCAPE/bigscape.py $PY_PATH/bin
cd ..
# Install BiG-SCAPE v2 next to v1: clone the `dev` branch into its own
# directory, pin it to a known commit, install its Python dependencies and
# link the entry script into $PY_PATH/bin as `bigscape-v2.py`.
# blob size limit to remove large files left in history
[[ -d BiG-SCAPE-v2 ]] || git clone -b dev --filter=blob:limit=10m https://github.com/medema-group/BiG-SCAPE.git BiG-SCAPE-v2
cd BiG-SCAPE-v2
# The key must be spelled `advice.detachedHead`; a misspelled key is stored
# silently and the detached-HEAD advice is still printed on checkout.
git config --add advice.detachedHead false # disable advice
git checkout 99a4c2e4923bb50e175b2e619c2cee0a14918789 # Commits on Jun 14, 2024
# Python packages installed for BiG-SCAPE v2
pip install click
pip install sqlalchemy
pip install pyhmmer
pip install tqdm
chmod 754 bigscape.py
# Link under a distinct name so it does not overwrite the v1 `bigscape.py` link
ln -sf $LIB_PATH/BiG-SCAPE-v2/bigscape.py $PY_PATH/bin/bigscape-v2.py
cd ..


echo -e "✅ BigScape installed successfully\n"

#--- Install FastTree (not support Windows, required by BigScape)
Expand Down
31 changes: 22 additions & 9 deletions src/nplinker/arranger.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,21 +304,34 @@ def _run_bigscape(self) -> None:
default BiG-SCAPE directory.
"""
self.bigscape_running_output_dir.mkdir(exist_ok=True, parents=True)

version = self.config.bigscape.version

run_bigscape(
self.antismash_dir,
self.bigscape_running_output_dir,
self.config.bigscape.parameters,
version,
)
for f in glob(
str(
self.bigscape_running_output_dir
/ "network_files"
/ "*"
/ "mix"
/ "mix_clustering_c*.tsv"

if version == 1:
for f in glob(
str(
self.bigscape_running_output_dir
/ "network_files"
/ "*"
/ "mix"
/ "mix_clustering_c*.tsv"
)
):
shutil.copy(f, self.bigscape_dir)
elif version == 2:
shutil.copy(
self.bigscape_running_output_dir / "data_sqlite.db",
self.bigscape_dir,
)
):
shutil.copy(f, self.bigscape_dir)
else:
raise ValueError(f"Invalid BiG-SCAPE version: {version}")

def arrange_strain_mappings(self) -> None:
"""Arrange the strain mappings file.
Expand Down
1 change: 1 addition & 0 deletions src/nplinker/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ def load_config(config_file: str | PathLike) -> Dynaconf:
# BigScape
Validator("bigscape.parameters", required=True, is_type_of=str),
Validator("bigscape.cutoff", required=True, is_type_of=str),
Validator("bigscape.version", required=True, is_type_of=int),
# Scoring
## `scoring.methods` must be a list of strings and must contain at least one of the
## supported scoring methods.
Expand Down
10 changes: 9 additions & 1 deletion src/nplinker/data/nplinker.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ podp_id = ""
# The default value is "INFO".
level = "INFO"
# The log file to append log messages.
# The value is optional.
# The value is optional.
# If not set or use empty string, log messages will not be written to a file.
# The file will be created if it does not exist. Log messages will be appended to the file if it exists.
file = "path/to/logfile"
Expand All @@ -43,6 +43,9 @@ version = "3.1"

[bigscape]
# The parameters to use for running BiG-SCAPE.
# Version of BiG-SCAPE to run. Make sure to change the parameters property below as well
# when changing versions.
version = 1
# Required bigscape parameters are `--mix`, `--include_singletons` and `--cutoffs`. NPLinker needs
# them to run the analysis properly.
# Parameters that must NOT exist: `--inputdir`, `--outputdir`, `--pfam_dir`. NPLinker will
Expand All @@ -51,6 +54,11 @@ version = "3.1"
# `mibig.version` to the version of mibig in bigscape.
# The default value is "--mibig --clans-off --mix --include_singletons --cutoffs 0.30".
parameters = "--mibig --clans-off --mix --include_singletons --cutoffs 0.30"
# for version 2, use the following parameters string:
# parameters = "--mibig_version 3.1 --include_singletons --gcf_cutoffs 0.30"
# Note that BiG-SCAPE v2 has subcommands. NPLinker requires the "cluster" subcommand and its parameters to be used.
# NPLinker will automatically set the following parameters: `--pfam_path`, `--inputdir` and `--outputdir`. So, do not set them here.
# BiG-SCAPE v2 also runs a --mix analysis by default, and does not need this to be included.
# Which bigscape cutoff to use for NPLinker analysis.
# There might be multiple cutoffs in bigscape output.
# Note that this value must be a string.
Expand Down
107 changes: 90 additions & 17 deletions src/nplinker/genomics/bigscape/runbigscape.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import subprocess
import sys
from os import PathLike
from typing import Literal


logger = logging.getLogger(__name__)
Expand All @@ -15,33 +16,105 @@ def run_bigscape(
antismash_path: str | PathLike,
output_path: str | PathLike,
extra_params: str,
):
bigscape_py_path = "bigscape.py"
logger.info(
f'run_bigscape: input="{antismash_path}", output="{output_path}", extra_params={extra_params}"'
)
version: Literal[1, 2] = 1,
) -> bool:
"""Runs BiG-SCAPE to cluster BGCs.
The behavior of this function is slightly different depending on the version of
BiG-SCAPE that is set to run using the configuration file.
Mostly this means a different set of parameters is used between the two versions.
The AntiSMASH output directory should be a directory that contains GBK files.
The directory can contain subdirectories, in which case BiG-SCAPE will search
recursively for GBK files. E.g.:
```
example_folder
├── organism_1
│  ├── organism_1.region001.gbk
│  ├── organism_1.region002.gbk
│  ├── organism_1.region003.gbk
│  ├── organism_1.final.gbk <- skipped!
│  └── ...
├── organism_2
│  ├── ...
└── ...
```
By default, only GBK Files with "cluster" or "region" in the filename are
accepted. GBK Files with "final" in the filename are excluded.
Args:
antismash_path: Path to the antismash output directory.
output_path: Path to the output directory where BiG-SCAPE will write its results.
extra_params: Additional parameters to pass to BiG-SCAPE.
version: The version of BiG-SCAPE to run. Must be 1 or 2.
Returns:
True if BiG-SCAPE ran successfully, False otherwise.
Raises:
ValueError: If an unexpected BiG-SCAPE version number is specified.
FileNotFoundError: If the antismash_path does not exist or if the BiG-SCAPE python
script could not be found.
RuntimeError: If BiG-SCAPE fails to run.
Examples:
>>> from nplinker.genomics.bigscape import run_bigscape
>>> run_bigscape(antismash_path="./antismash", output_path="./output",
... extra_params="--help", version=1)
"""
# switch to correct version of BiG-SCAPE
if version == 1:
bigscape_py_path = "bigscape.py"
elif version == 2:
bigscape_py_path = "bigscape-v2.py"
else:
raise ValueError("Invalid BiG-SCAPE version number. Expected: 1 or 2.")

try:
subprocess.run([bigscape_py_path, "-h"], capture_output=True, check=True)
except Exception as e:
raise Exception(f"Failed to find/run bigscape.py (path={bigscape_py_path}, err={e})") from e
raise FileNotFoundError(
f"Failed to find/run BiG-SCAPE executable program (path={bigscape_py_path}, err={e})"
) from e

if not os.path.exists(antismash_path):
raise Exception(f'antismash_path "{antismash_path}" does not exist!')
raise FileNotFoundError(f'antismash_path "{antismash_path}" does not exist!')

logger.info(f"Running BiG-SCAPE version {version}")
logger.info(
f'run_bigscape: input="{antismash_path}", output="{output_path}", extra_params={extra_params}"'
)

# configure the IO-related parameters, including pfam_dir
args = [bigscape_py_path, "-i", antismash_path, "-o", output_path, "--pfam_dir", PFAM_PATH]
# assemble arguments. first argument is the python file
args = [bigscape_py_path]

# version 2 points to specific Pfam file, version 1 points to directory
# version 2 also requires the cluster subcommand
if version == 1:
args.extend(["--pfam_dir", PFAM_PATH])
elif version == 2:
args.extend(["cluster", "--pfam_path", os.path.join(PFAM_PATH, "Pfam-A.hmm")])

# add input and output paths. these are unchanged
args.extend(["-i", str(antismash_path), "-o", str(output_path)])

# append the user supplied params, if any
if len(extra_params) > 0:
args.extend(extra_params.split(" "))

logger.info(f"BiG-SCAPE command: {args}")
result = subprocess.run(args, stdout=sys.stdout, stderr=sys.stderr, check=True)
logger.info(f"BiG-SCAPE completed with return code {result.returncode}")
# use subprocess.CompletedProcess.check_returncode() to test if the BiG-SCAPE
# process exited successfully. This throws an exception for non-zero returncodes
# which will indicate to the PODPDownloader module that something went wrong.
result.check_returncode()

return True
result = subprocess.run(args, stdout=sys.stdout, stderr=sys.stderr)

# return true on any non-error return code
if result.returncode == 0:
logger.info(f"BiG-SCAPE completed with return code {result.returncode}")
return True

# otherwise log details and raise a runtime error
logger.error(f"BiG-SCAPE failed with return code {result.returncode}")
logger.error(f"output: {str(result.stdout)}")
logger.error(f"stderr: {str(result.stderr)}")

raise RuntimeError(f"Failed to run BiG-SCAPE with error code {result.returncode}")
2 changes: 2 additions & 0 deletions src/nplinker/nplinker_default.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ to_use = true
version = "3.1"

[bigscape]
version = 1
parameters = "--mibig --clans-off --mix --include_singletons --cutoffs 0.30"

cutoff = "0.30"

[scoring]
Expand Down
Loading

0 comments on commit f767bfa

Please sign in to comment.