From 3ef0c01dd883798a2f0709aa320f2c547740fab6 Mon Sep 17 00:00:00 2001 From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com> Date: Thu, 6 Jun 2024 17:50:01 +0200 Subject: [PATCH 01/14] update readmes --- README.md | 17 +++++++++++------ tests/data_1/README.md | 12 ++++++------ tests/data_5/README.md | 4 ++-- tests/data_6/README.md | 23 +++++++---------------- 4 files changed, 26 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 412dd01..b6785d1 100755 --- a/README.md +++ b/README.md @@ -21,18 +21,19 @@ pip install git+https://github.com/cbg-ethz/VILOCA@master ### Example To test your installation run VILOCA `tests/data_1`: ``` -viloca run -b test_aln.cram -f test_ref.fasta -z scheme.insert.bed --mode use_quality_scores +viloca run -b test_aln.cram -f test_ref.fasta --mode use_quality_scores ``` - +Another example can be found in `tests/data_6`: If the sequencing amplicon strategy is known, we recommend using the amplicon-mode of the program, which takes as input the `.insert.bed` - file: -`viloca run -b test_aln.cram -f test_ref.fasta -z scheme.insert.bed --mode use_quality_scores` +`viloca run -f reference.fasta -b reads.shotgun.bam -w 90 --mode use_quality_scores -z scheme.insert.bed` + +If there is no information on the sequencing amplicon strategy available, run: +`viloca run -f reference.fasta -b reads.shotgun.bam -w 90 --mode use_quality_scores` If the sequencing quality scores are not trustable, the sequencing error parameters can also be learned: -`viloca run -b test_aln.cram -f test_ref.fasta -z scheme.insert.bed --mode learn_error_params`. +`viloca run -f reference.fasta -b reads.shotgun.bam -w 90 --mode learn_error_params`. -If there is no information on the sequencing amplicon strategy available, run: -`viloca run -b test_aln.cram -f test_ref.fasta --mode use_quality_scores` ### Parameters There are several parameters available: @@ -70,3 +71,7 @@ This is the same setup as used in the CI at [`.github/workflows/test.yaml`](.git ```bash poetry run python3 -m cProfile -m shorah shotgun ... ``` + +### Applications + +You can find several applications of VILOCA at https://github.com/cbg-ethz/viloca_applications. diff --git a/tests/data_1/README.md b/tests/data_1/README.md index 9f48efd..fc83c0b 100644 --- a/tests/data_1/README.md +++ b/tests/data_1/README.md @@ -1,10 +1,10 @@ -### Sample files to test `shorah shotgun` +### Sample files to test `VILOCA` -Use files in this directory to test shorah in shotgun mode. -The reads data comes from the +Use files in this directory to test VILOCA. +The reads data comes from the [test-data](https://github.com/cbg-ethz/V-pipe/tree/master/testdata/2VM-sim/20170904/raw_data) of [V-pipe](https://cbg-ethz.github.io/V-pipe/) -and has been processed with the pipeline using the `bwa` +and has been processed with the pipeline using the `bwa` [option](https://github.com/cbg-ethz/V-pipe/wiki/options#aligner): ```ini @@ -16,8 +16,8 @@ The sorted bam file has been further compressed with samtools for space saving: [user@host shotgun_test]$ samtools view -T test_ref.fasta -C -O cram,embed_ref,use_bzip2,use_lzma,level=9,seqs_per_slice=1000000 -o test_aln.cram V-pipe/work/samples/2VM-sim/20170904/alignments/REF_aln.bam -You can then run `shorah shotgun` as follows +You can then run `viloca` as follows - [user@host shotgun_test]$ shorah shotgun -b test_aln.cram -f test_ref.fasta + [user@host shotgun_test]$ viloca run -b test_aln.cram -f test_ref.fasta The output files will be `snv/SNVs_0.010000_final.vcf` and `snv/SNVs_0.010000_final.csv`. diff --git a/tests/data_5/README.md b/tests/data_5/README.md index 80d3d51..21f4bb0 100644 --- a/tests/data_5/README.md +++ b/tests/data_5/README.md @@ -1,9 +1,9 @@ ### Test to check fil.cpp implementation accounting for long deletions -Test files `SNV.txt` and `SNVs_0.010000.txt` are obtained by running `shorah shutgun`, e.g: +Test files `SNV.txt` and `SNVs_0.010000.txt` are obtained by running `viloca run`, e.g: ``` -shorah shotgun -a 0.1 -w 42 -x 100000 -p 0.9 -c 0 -r REF:42-272 -R 42 -b test_aln.cram -f ref.fasta +viloca run -a 0.1 -w 42 -x 100000 -p 0.9 -c 0 -r REF:42-272 -R 42 -b test_aln.cram -f ref.fasta ``` The test script `test_long_deletions.py` uses `pysam` and `NumPy`, which can be installed using pip or conda. diff --git a/tests/data_6/README.md b/tests/data_6/README.md index 70baae0..3479aae 100644 --- a/tests/data_6/README.md +++ b/tests/data_6/README.md @@ -1,27 +1,18 @@ -### Sample files to test `shorah shotgun` +### Sample files to test `VILOCA` Use files in this directory to test shorah in shotgun mode. The reads data have been generated with V-pipe's benchmarking framework (simulated with parameters: ```seq_tech~illumina__seq_mode~shotgun__seq_mode_param~nan__read_length~90__genome_size~90__coverage~100__haplos~5@5@10@5@10@geom@0.75```) The reads are from one single amplicon of length 90, meaning the reference is of length 90 and each read is of length 90bps. -To run ShoRAH's original Gibbs sampler use the following command: -``` -poetry run shorah shotgun -f reference.fasta -b reads.shotgun.bam -w 90 --sampler shorah -``` -or -``` -poetry run shorah shotgun -f reference.fasta -b reads.shotgun.bam -z scheme.insert.bed --sampler shorah -``` - To use the new inference method using the sequencing quality scores use: ``` -poetry run shorah shotgun -f reference.fasta -b reads.shotgun.bam -w 90 --sampler use_quality_scores --alpha 0.0001 --n_max_haplotypes 100 --n_mfa_starts 1 --conv_thres 0.0001 +viloca run -f reference.fasta -b reads.shotgun.bam -w 90 --mode use_quality_scores ``` -To use the new inference method learning the sequencing error parameter: +To use the model that is learning the sequencing error parameter: ``` -poetry run shorah shotgun -f reference.fasta -b reads.shotgun.bam -w 90 --sampler -learn_error_params --alpha 0.0001 --n_max_haplotypes 100 --n_mfa_starts 1 --conv_thres 0.0001 +viloca run -f reference.fasta -b reads.shotgun.bam -w 90 --mode -learn_error_params ``` - -In the new inference method reads are filtered (and weighted respectively) such that only a set of unique reads are processed. This mode can be switch off by setting the parameter `--non-unique_modus`, e.g.: +To run VILOCA with the insert file run: +``` +viloca run -f reference.fasta -b reads.shotgun.bam -w 90 --mode use_quality_scores -z scheme.insert.bed ``` -poetry run shorah shotgun -f reference.fasta -b reads.shotgun.bam -w 90 --sampler -learn_error_params --alpha 0.0001 --n_max_haplotypes 100 --n_mfa_starts 1 --conv_thres 0.0001 --non-unique_modus From 348e40e1075346a7605fabfb3e8ee90836604f15 Mon Sep 17 00:00:00 2001 From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com> Date: Thu, 6 Jun 2024 17:50:28 +0200 Subject: [PATCH 02/14] update version number --- environment.yml | 2 +- pyproject.toml | 6 +++--- viloca/cli.py | 15 +++++---------- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/environment.yml b/environment.yml index 4c562d4..8e5f6b6 100644 --- a/environment.yml +++ b/environment.yml @@ -6,4 +6,4 @@ dependencies: - htslib >=1.9 - boost-cpp >=1.56 - pip: - - https://github.com/spaceben/shorah/releases/download/canary-eeec049/ShoRAH-0.1.0.tar.gz \ No newline at end of file + - https://github.com/cbg-ethz/VILOCA/archive/refs/tags/viloca-v1.0.0.tar.gz diff --git a/pyproject.toml b/pyproject.toml index 42da0bd..ebc301c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,9 +1,9 @@ [tool.poetry] name = "VILOCA" -version = "0.1.0" -description = "SHOrt Reads Assembly into Haplotypes" +version = "1.0.0" +description = "VIral LOcal haplotype reconstruction and mutation CAlling for short and long read data" license = "GPL-3.0-only" -authors = ["Benjamin Langer , Lara Fuhrmann "] +authors = ["Ivan Topolsky", "Benjamin Langer , Lara Fuhrmann "] build = "build.py" packages = [ { include = "viloca" } diff --git a/viloca/cli.py b/viloca/cli.py index 96273b3..aee2cd2 100644 --- a/viloca/cli.py +++ b/viloca/cli.py @@ -2,11 +2,6 @@ # Copyright 2007-2018 # Niko Beerenwinkel, -# Nicholas Eriksson, -# Moritz Gerstung, -# Lukas Geyrhofer, -# Kerensa McElroy, -# Osvaldo Zagordi, # ETH Zurich # This file is part of ShoRAH. @@ -21,7 +16,7 @@ # GNU General Public License for more details. # You should have received a copy of the GNU General Public License -# along with ShoRAH. If not, see . +# along with VILOCA. If not, see . """ Module that contains the command line app. Why does this file exist, and why not put this in __main__? @@ -115,8 +110,8 @@ def main(): type=str, dest="f", help="reference genome in fasta format") parent_parser.add_argument("-a", "--alpha", metavar='FLOAT', required=False, - type=float, dest="a", default=0.1, - help="alpha in dpm sampling (controls the probability of creating new classes)") + type=float, dest="a", default=0.0001, + help="alpha") parent_parser.add_argument("-r", "--region", metavar='chrm:start-stop', required=False, type=str, dest="r", default='', @@ -194,8 +189,8 @@ def main(): help="Number of starts for inference type mean_field_approximation.") parser_shotgun.add_argument('--mode', choices=['shorah','learn_error_params','use_quality_scores'], - default='shorah', dest="inference_type", - help="Mode in which to run VILOCA: shorah, learn_error_params, use_quality_scores") + default='use_quality_scores', dest="inference_type", + help="Mode in which to run VILOCA: shorah, learn_error_params, use_quality_scores, ShoRAH refers to the method from https://github.com/cbg-ethz/shorah.") parser_shotgun.add_argument('--non-unique_modus', action='store_false', dest="unique_modus", help="For inference: Make read set unique with read weights. Cannot be used with --mode shorah.") From 08dd11af2aed3eee88983ddb69065c60ae4bd8ab Mon Sep 17 00:00:00 2001 From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com> Date: Thu, 6 Jun 2024 17:50:44 +0200 Subject: [PATCH 03/14] remove duplicate --- viloca/b2w.py | 182 -------------------------------------------------- 1 file changed, 182 deletions(-) diff --git a/viloca/b2w.py b/viloca/b2w.py index d3f4e40..686a2b8 100644 --- a/viloca/b2w.py +++ b/viloca/b2w.py @@ -296,188 +296,6 @@ def _run_one_window(samfile, window_start, reference_name, window_length,control return convert_to_printed_fmt(arr), arr_read_qualities_summary, arr_read_summary, pos_filter -def parallel_run_one_window( - reference_filename, - minimum_reads, - tiling, - region_end, - idx, - window_start, - window_length, - control_window_length, - alignment_file, - reference_name, - win_min_ext, - permitted_reads_per_location, - counter, - exact_conformance_fix_0_1_basing_in_reads, - indel_map, - max_ins_at_pos, - extended_window_mode, - exclude_non_var_pos_threshold -): - """ - build one window. - """ - reffile = pysam.FastaFile(reference_filename) - - samfile = pysam.AlignmentFile( - alignment_file, - "r", # auto-detect bam/cram (rc) - reference_filename=reference_filename, - threads=1 - ) - - reads = open(f"reads_{idx}.fas", "w") - - logging.info(f"Working on window (1-based) @ {window_start+1}") - -# (arr, arr_read_qualities_summary, arr_read_summary, -# counter, control_window_length, pos_filter) = _run_one_window( - (arr, arr_read_qualities_summary, arr_read_summary, - pos_filter) = _run_one_window( - samfile, - window_start - 1, # make 0 based - reference_name, - window_length, - control_window_length, - math.floor(win_min_ext * window_length), - dict(permitted_reads_per_location), # copys dict ("pass by value") - counter, - exact_conformance_fix_0_1_basing_in_reads, - indel_map, - max_ins_at_pos, - extended_window_mode, - exclude_non_var_pos_threshold - ) - - logging.debug(f"Window length: {control_window_length}") - - window_end = window_start + window_length - 1 - file_name = f'w-{reference_name}-{window_start}-{window_end}' - - # TODO solution for backward conformance - if len(tiling) > 1: - end_extended_by_a_window = region_end + (tiling[1][0]-tiling[0][0])*3 - else: - end_extended_by_a_window = region_end + window_length*3 - - for read in arr_read_summary: - if idx == len(tiling) - 1 and read[1] > end_extended_by_a_window: - continue - # TODO reads.fas not FASTA conform, +-0/1 mixed - # TODO global end does not really make sense, only for conformance - # read name, global start, global end, read start, read end, read - reads.write( - f'{read[0]}\t{tiling[0][0]-1}\t{end_extended_by_a_window}\t{read[1]}\t{read[2]}\t{read[3]}\n' - ) - - reads.close() - - if (idx != len(tiling) - 1 # except last - and len(arr) > 0) or len(tiling) == 1: # suppress output if window empty - - _write_to_file(arr, file_name + '.reads.fas') - if arr_read_qualities_summary is not None: - with open(file_name + '.qualities.npy', 'wb') as f: - np.save(f, np.asarray(arr_read_qualities_summary, dtype=np.int64), allow_pickle=True) - - ref = reffile.fetch(reference=reference_name, start=window_start-1, end=window_end) - - if extended_window_mode: - for file_name_comp, char in [("extended-ref", "X"), ("ref", "-")]: - res_ref = _build_one_full_read( - list(ref), list(ref), None, None, - window_start-1, window_end-1, - indel_map, max_ins_at_pos, extended_window_mode, char - )[0] - - k = max(0, control_window_length - len(res_ref)) - res_ref += k * "N" - assert_condition = control_window_length == len(res_ref) - - if exclude_non_var_pos_threshold > 0 and file_name_comp == "ref": - _write_to_file([ - f'>{reference_name} {window_start}\n' + res_ref - ], file_name + '.envp-full-ref.fas') - - envp_ref = np.array(list(res_ref)) - envp_ref[~pos_filter] = "=" - _write_to_file([ - f'>{reference_name} {window_start}\n' + "".join(envp_ref) - ], file_name + '.envp-ref.fas') - - reduced_ref = np.array(list(res_ref))[pos_filter] - res_ref = "".join(reduced_ref) - assert_condition = (control_window_length == - len(reduced_ref) + len(pos_filter) - pos_filter.sum()) - - _write_to_file([ - f'>{reference_name} {window_start}\n' + res_ref - ], f'{file_name}.{file_name_comp}.fas') - - assert assert_condition, ( - f""" - Reference ({file_name_comp}) does not have same length as the window. - Location: {file_name} - Ref: {len(res_ref)} - Win: {control_window_length} - """ - ) - - else: - k = max(0, control_window_length - len(ref)) - ref += k * "N" - - if exclude_non_var_pos_threshold > 0: - full_file_name = file_name + '.envp-full-ref.fas' - else: - full_file_name = file_name + '.ref.fas' - - _write_to_file([ - f'>{reference_name} {window_start}\n' + ref - ], full_file_name) - - assert control_window_length == len(ref), ( - f""" - Reference does not have same length as the window. - Location: {file_name} - Ref: {len(ref)} - Win: {control_window_length} - """ - ) - - if exclude_non_var_pos_threshold > 0: - envp_ref = np.array(list(ref)) - envp_ref[~pos_filter] = "=" - _write_to_file([ - f'>{reference_name} {window_start}\n' + "".join(envp_ref) - ], file_name + '.envp-ref.fas') - reduced_ref = np.array(list(ref))[pos_filter] - _write_to_file([ - f'>{reference_name} {window_start}\n' + "".join(reduced_ref) - ], file_name + '.ref.fas') - - assert (control_window_length == len(envp_ref) and - control_window_length == len(reduced_ref) + len(pos_filter) - pos_filter.sum()), ( - f""" - Reference does not have same length as the window. - Location: {file_name} - Envp Ref: {len(envp_ref)} - Ref: {len(reduced_ref)} - Win: {control_window_length} - """ - ) - - if len(arr) > minimum_reads: - line = ( - f'{file_name}.reads.fas\t{reference_name}\t{window_start}\t' - f'{window_end}\t{len(arr)}' - ) - _write_to_file([line], f"coverage_{idx}.txt") - - - def update_tiling(tiling, extended_window_mode, max_ins_at_pos): """ input tiling: From 98438cb4c80a5ce8ec3f7d3f7ac553af43fcd59d Mon Sep 17 00:00:00 2001 From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com> Date: Thu, 6 Jun 2024 17:53:49 +0200 Subject: [PATCH 04/14] remove unused packages --- pyproject.toml | 2 -- 1 file changed, 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ebc301c..baa46fc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,9 +18,7 @@ biopython = "^1.79" numpy = "^1.21.4" pysam = "^0.18.0" pybind11 = "^2.9.0" -PyYAML = "^6.0" scipy = "^1.7.3" -bio = "^1.3.3" pandas = "^1.3.5" [tool.poetry.dev-dependencies] From 005909beed964c2bbc665deff9f0a57775cbd69f Mon Sep 17 00:00:00 2001 From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com> Date: Thu, 6 Jun 2024 18:23:34 +0200 Subject: [PATCH 05/14] change default viloca mode --- tests/data_1/shotgun_test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data_1/shotgun_test.sh b/tests/data_1/shotgun_test.sh index 77925d3..b57e539 100755 --- a/tests/data_1/shotgun_test.sh +++ b/tests/data_1/shotgun_test.sh @@ -1,4 +1,4 @@ #!/bin/bash -viloca run -a 0.1 -w 201 -x 100000 -p 0.9 -c 0 \ +viloca run -a 0.1 -w 201 --mode shorah -x 100000 -p 0.9 -c 0 \ -r HXB2:2469-3713 -R 42 -f test_ref.fasta -b test_aln.cram --out_format csv "$@" From b2b76aa20ce73b66c8ca230ed5105a2b6345cec0 Mon Sep 17 00:00:00 2001 From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com> Date: Fri, 7 Jun 2024 10:41:32 +0200 Subject: [PATCH 06/14] b2w and inference: rerun windows that failed + check if window was processed already before adding process to process list --- viloca/b2w.py | 52 ++++++++++++++++++++++++----------------------- viloca/shotgun.py | 11 ++++++---- 2 files changed, 34 insertions(+), 29 deletions(-) diff --git a/viloca/b2w.py b/viloca/b2w.py index 686a2b8..36bb114 100644 --- a/viloca/b2w.py +++ b/viloca/b2w.py @@ -578,30 +578,31 @@ def build_windows(alignment_file: str, tiling_strategy: TilingStrategy, all_processes = [] for idx, (window_start, window_length, control_window_length) in enumerate(tiling): counter = counter_list[idx] - p = Process( - target=parallel_run_one_window, - args=( - reference_filename, - minimum_reads, - tiling, - region_end, - idx, - window_start, # 1-based - window_length, - control_window_length, - alignment_file, - reference_name, - win_min_ext, - permitted_reads_per_location, - counter, # 0-based - exact_conformance_fix_0_1_basing_in_reads, - indel_map, - max_ins_at_pos, # 0-based - extended_window_mode, - exclude_non_var_pos_threshold, + if not os.path.isfile(f"coverage_{idx}.txt"): + p = Process( + target=parallel_run_one_window, + args=( + reference_filename, + minimum_reads, + tiling, + region_end, + idx, + window_start, # 1-based + window_length, + control_window_length, + alignment_file, + reference_name, + win_min_ext, + permitted_reads_per_location, + counter, # 0-based + exact_conformance_fix_0_1_basing_in_reads, + indel_map, + max_ins_at_pos, # 0-based + extended_window_mode, + exclude_non_var_pos_threshold, + ) ) - ) - all_processes.append(p) + all_processes.append(p) for p in all_processes: p.start() @@ -609,8 +610,9 @@ def build_windows(alignment_file: str, tiling_strategy: TilingStrategy, for p in all_processes: p.join() if p.exitcode != 0: - logging.debug("[b2w] A process was killed. Terminating the program.") - exit(1) + p.start() + logging.debug("[b2w] A process was killed. Terminating the program. Process was restarted.") + #exit(1) logging.debug("[b2w] All processes completed successfully.") diff --git a/viloca/shotgun.py b/viloca/shotgun.py index 95618f2..cac74b1 100644 --- a/viloca/shotgun.py +++ b/viloca/shotgun.py @@ -307,8 +307,10 @@ def win_to_run(alpha_w, seed, inference_type, n_max_haplotypes, n_mfa_starts, un for f1 in file1: winFile, chr1, beg, end, cov = f1.rstrip().split('\t') - j = min(300_000, int(cov) * 15) - rn_list.append((winFile, j, alpha_w, seed, inference_type, n_max_haplotypes, n_mfa_starts, unique_modus, inference_convergence_threshold)) + output_name = output_dir + winFile.split("/")[-1][:-4] + "-" + "cor.fas" + if not os.path.isfile(output_name): + j = min(300_000, int(cov) * 15) + rn_list.append((winFile, j, alpha_w, seed, inference_type, n_max_haplotypes, n_mfa_starts, unique_modus, inference_convergence_threshold)) del end del(beg, chr1) @@ -515,8 +517,9 @@ def main(args): for p in all_processes: p.join() if p.exitcode != 0: - logging.debug("[shotgun] A process was killed. Terminating the program.") - exit(1) + p.start() + logging.debug("[shotgun] A process was killed. Restart process.") + #exit(1) logging.debug("[shotgun] All processes completed successfully.") From 37b31a05bf8f527f9b3f2188c7f13b79036c6165 Mon Sep 17 00:00:00 2001 From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com> Date: Fri, 7 Jun 2024 10:47:44 +0200 Subject: [PATCH 07/14] correct variable --- viloca/shotgun.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viloca/shotgun.py b/viloca/shotgun.py index cac74b1..d4f4a24 100644 --- a/viloca/shotgun.py +++ b/viloca/shotgun.py @@ -307,7 +307,7 @@ def win_to_run(alpha_w, seed, inference_type, n_max_haplotypes, n_mfa_starts, un for f1 in file1: winFile, chr1, beg, end, cov = f1.rstrip().split('\t') - output_name = output_dir + winFile.split("/")[-1][:-4] + "-" + "cor.fas" + output_name = winFile.split("/")[-1][:-4] + "-" + "cor.fas" if not os.path.isfile(output_name): j = min(300_000, int(cov) * 15) rn_list.append((winFile, j, alpha_w, seed, inference_type, n_max_haplotypes, n_mfa_starts, unique_modus, inference_convergence_threshold)) From 433347770ef844334a255d96182a8fef3e23226d Mon Sep 17 00:00:00 2001 From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com> Date: Mon, 10 Jun 2024 13:32:27 +0200 Subject: [PATCH 08/14] reference to manuscript --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index b6785d1..357c90a 100755 --- a/README.md +++ b/README.md @@ -7,6 +7,8 @@ are written in different programming languages and provide error correction, haplotype reconstruction and estimation of the frequency of the different genetic variants present in a mixed sample. +The corresponding manuscript can be found here: https://www.biorxiv.org/content/10.1101/2024.06.06.597712v1 + --- ### Installation From ecda5abd9e44185066fff6ed87c89f6b003bf87a Mon Sep 17 00:00:00 2001 From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com> Date: Mon, 10 Jun 2024 16:59:38 +0200 Subject: [PATCH 09/14] add viloca conda package --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 357c90a..9bce1e2 100755 --- a/README.md +++ b/README.md @@ -15,6 +15,12 @@ The corresponding manuscript can be found here: https://www.biorxiv.org/content/ For installation miniconda is recommended: https://docs.conda.io/en/latest/miniconda.html. We recommend to install VILOCA in a clean conda environment: ``` +conda create --name env_viloca --channel conda-forge --channel bioconda viloca +conda activate env_viloca +``` + +If you want to install the `master` branch use: +``` conda create --name env_viloca --channel conda-forge --channel bioconda libshorah conda activate env_viloca pip install git+https://github.com/cbg-ethz/VILOCA@master From a644e9b85f18c957df9dba1c7f6d2e4fe30540a9 Mon Sep 17 00:00:00 2001 From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com> Date: Tue, 11 Jun 2024 09:51:49 +0200 Subject: [PATCH 10/14] restarting failed process doesn't work, exit again --- viloca/b2w.py | 5 ++--- viloca/shotgun.py | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/viloca/b2w.py b/viloca/b2w.py index 36bb114..961a34e 100644 --- a/viloca/b2w.py +++ b/viloca/b2w.py @@ -610,9 +610,8 @@ def build_windows(alignment_file: str, tiling_strategy: TilingStrategy, for p in all_processes: p.join() if p.exitcode != 0: - p.start() - logging.debug("[b2w] A process was killed. Terminating the program. Process was restarted.") - #exit(1) + logging.debug("[b2w] A process was killed. Terminating the program.") + exit(1) logging.debug("[b2w] All processes completed successfully.") diff --git a/viloca/shotgun.py b/viloca/shotgun.py index d4f4a24..4a40cb0 100644 --- a/viloca/shotgun.py +++ b/viloca/shotgun.py @@ -517,9 +517,8 @@ def main(args): for p in all_processes: p.join() if p.exitcode != 0: - p.start() - logging.debug("[shotgun] A process was killed. Restart process.") - #exit(1) + logging.debug("[shotgun] A process was killed. Terminating program.") + exit(1) logging.debug("[shotgun] All processes completed successfully.") From 279cc634a5d7856b9b280aef5b29a7b647d97407 Mon Sep 17 00:00:00 2001 From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com> Date: Wed, 12 Jun 2024 16:27:47 +0200 Subject: [PATCH 11/14] fix: check if files already exist --- viloca/shotgun.py | 93 ++++++++++++++++++++++++----------------------- 1 file changed, 48 insertions(+), 45 deletions(-) diff --git a/viloca/shotgun.py b/viloca/shotgun.py index 4a40cb0..45c34e4 100644 --- a/viloca/shotgun.py +++ b/viloca/shotgun.py @@ -307,7 +307,7 @@ def win_to_run(alpha_w, seed, inference_type, n_max_haplotypes, n_mfa_starts, un for f1 in file1: winFile, chr1, beg, end, cov = f1.rstrip().split('\t') - output_name = winFile.split("/")[-1][:-4] + "-" + "cor.fas" + output_name = winFile.split(".fas")[0] + "-" + "cor.fas" if not os.path.isfile(output_name): j = min(300_000, int(cov) * 15) rn_list.append((winFile, j, alpha_w, seed, inference_type, n_max_haplotypes, n_mfa_starts, unique_modus, inference_convergence_threshold)) @@ -432,53 +432,56 @@ def main(args): # run b2w logging.info('starting b2w') - try: - if ignore_indels == True: - raise NotImplementedError('This argument was deprecated.') - b2w_logging((in_bam, in_fasta, win_length, incr, win_min_ext, - max_coverage, cov_thrd, region, ignore_indels)) + if not os.path.isfile(f"coverage.txt"): + try: + if ignore_indels == True: + raise NotImplementedError('This argument was deprecated.') + b2w_logging((in_bam, in_fasta, win_length, incr, win_min_ext, + max_coverage, cov_thrd, region, ignore_indels)) + + if path_insert_file == None and region == "": # special case if no region defined + samfile = pysam.AlignmentFile( + in_bam, + "r", # auto-detect bam/cram (rc) + reference_filename=in_fasta, + threads=1 + ) + if samfile.nreferences != 1: + raise NotImplementedError("There are multiple references in this alignment file.") + strategy = tiling.EquispacedTilingStrategy( + f"{samfile.references[0]}:1-{samfile.lengths[0]}", + win_length, + incr, + False, + True + ) + elif path_insert_file == None: + strategy = tiling.EquispacedTilingStrategy(region, win_length, incr, True) + else: + strategy = tiling.PrimerTilingStrategy(path_insert_file) + if region != "": + logging.warn(f"region is set to {region} but is not used with this tiling strategy") + + logging.info(f"Using tiling strategy: {type(strategy).__name__}") - if path_insert_file == None and region == "": # special case if no region defined - samfile = pysam.AlignmentFile( + b2w.build_windows( in_bam, - "r", # auto-detect bam/cram (rc) - reference_filename=in_fasta, - threads=1 + strategy, + win_min_ext, + max_coverage, + cov_thrd, + in_fasta, + extended_window_mode=extended_window_mode, + exclude_non_var_pos_threshold=exclude_non_var_pos_threshold, + maxthreads=maxthreads ) - if samfile.nreferences != 1: - raise NotImplementedError("There are multiple references in this alignment file.") - strategy = tiling.EquispacedTilingStrategy( - f"{samfile.references[0]}:1-{samfile.lengths[0]}", - win_length, - incr, - False, - True - ) - elif path_insert_file == None: - strategy = tiling.EquispacedTilingStrategy(region, win_length, incr, True) - else: - strategy = tiling.PrimerTilingStrategy(path_insert_file) - if region != "": - logging.warn(f"region is set to {region} but is not used with this tiling strategy") - - logging.info(f"Using tiling strategy: {type(strategy).__name__}") - - b2w.build_windows( - in_bam, - strategy, - win_min_ext, - max_coverage, - cov_thrd, - in_fasta, - extended_window_mode=extended_window_mode, - exclude_non_var_pos_threshold=exclude_non_var_pos_threshold, - maxthreads=maxthreads - ) - logging.info('finished b2w') - - except Exception as e: - logging.debug(e) - sys.exit('b2w run not successful') + logging.info('finished b2w') + + except Exception as e: + logging.debug(e) + sys.exit('b2w run not successful') + else: + logging.info('coverage.txt file already exists, hence skip b2w and use what is in directory.') aligned_reads = parse_aligned_reads('reads.fas') if len(aligned_reads) == 0: From 7ced8b8330259f057e624f364f9db8db79c36603 Mon Sep 17 00:00:00 2001 From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com> Date: Sun, 10 Nov 2024 16:51:41 +0100 Subject: [PATCH 12/14] add minimum number of iterations --- .../use_quality_scores/analyze_results.py | 2 +- viloca/local_haplotype_inference/use_quality_scores/cavi.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/viloca/local_haplotype_inference/use_quality_scores/analyze_results.py b/viloca/local_haplotype_inference/use_quality_scores/analyze_results.py index ed0a6b2..4093ff2 100644 --- a/viloca/local_haplotype_inference/use_quality_scores/analyze_results.py +++ b/viloca/local_haplotype_inference/use_quality_scores/analyze_results.py @@ -31,7 +31,7 @@ def haplotypes_to_fasta(state_curr_dict, output_dir): ave_reads = state_curr_dict["weight" + str(k)] if ave_reads==0: # this haplotype will not be reported as there are no reads - # supporting it. + # supporting it. continue head = ( diff --git a/viloca/local_haplotype_inference/use_quality_scores/cavi.py b/viloca/local_haplotype_inference/use_quality_scores/cavi.py index 3ff3c58..bfce3d9 100644 --- a/viloca/local_haplotype_inference/use_quality_scores/cavi.py +++ b/viloca/local_haplotype_inference/use_quality_scores/cavi.py @@ -114,8 +114,9 @@ def run_cavi( iter = 0 converged = False elbo = 0 + min_number_iterations = 10 state_curr_dict = state_init_dict - while converged is False: + while (converged is False) or (iter < min_number_iterations): if iter <= 1: digamma_alpha_sum = digamma(state_curr_dict["alpha"].sum(axis=0)) @@ -155,7 +156,7 @@ def run_cavi( break elif (history_elbo[-2] > elbo) and np.abs(elbo - history_elbo[-2]) > 1e-08: exit_message = "Error: ELBO is decreasing." - break + #break elif np.abs(elbo - history_elbo[-2]) < convergence_threshold: converged = True exit_message = "ELBO converged." From 822f341890f390ad7e856a56988b70e7c60120ca Mon Sep 17 00:00:00 2001 From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com> Date: Sun, 10 Nov 2024 16:52:13 +0100 Subject: [PATCH 13/14] pass unique_modus argument --- .../use_quality_scores/run_dpm_mfa.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/viloca/local_haplotype_inference/use_quality_scores/run_dpm_mfa.py b/viloca/local_haplotype_inference/use_quality_scores/run_dpm_mfa.py index b62a84f..eb0c692 100644 --- a/viloca/local_haplotype_inference/use_quality_scores/run_dpm_mfa.py +++ b/viloca/local_haplotype_inference/use_quality_scores/run_dpm_mfa.py @@ -33,7 +33,7 @@ def main( K, alpha0, alphabet="ACGT-", - unique_modus=False, + unique_modus=True, convergence_threshold=1e-03, ): @@ -46,7 +46,7 @@ def main( reference_binary, ref_id = preparation.load_reference_seq(fref_in, alphabet) reads_list, qualities = preparation.load_fasta_and_qualities( - freads_in, fname_qualities, alphabet, False + freads_in, fname_qualities, alphabet, unique_modus ) reads_seq_binary, reads_weights = preparation.reads_list_to_array(reads_list) reads_log_error_proba = preparation.compute_reads_log_error_proba( @@ -147,9 +147,9 @@ def main( sys.argv[1], sys.argv[2], sys.argv[3], - int(sys.argv[4]), + sys.argv[4], int(sys.argv[5]), - float(sys.argv[6]), - sys.argv[7], + int(sys.argv[6]), + float(sys.argv[7]), ) # freads_in, fref_in, output_dir, n_starts, K, alpha0, alphabet = 'ACGT-' From 0064241caaa0e9cb52eacc72da749471d915a93e Mon Sep 17 00:00:00 2001 From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com> Date: Tue, 26 Nov 2024 09:16:59 +0100 Subject: [PATCH 14/14] Revert "Merge pull request #38 from cbg-ethz/enable-unique-modus" This reverts commit ffd4fd13575b9412fda3d689e5b3b588dc1587ef, reversing changes made to 279cc634a5d7856b9b280aef5b29a7b647d97407. --- .../use_quality_scores/analyze_results.py | 2 +- .../use_quality_scores/cavi.py | 5 ++--- .../use_quality_scores/run_dpm_mfa.py | 10 +++++----- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/viloca/local_haplotype_inference/use_quality_scores/analyze_results.py b/viloca/local_haplotype_inference/use_quality_scores/analyze_results.py index 4093ff2..ed0a6b2 100644 --- a/viloca/local_haplotype_inference/use_quality_scores/analyze_results.py +++ b/viloca/local_haplotype_inference/use_quality_scores/analyze_results.py @@ -31,7 +31,7 @@ def haplotypes_to_fasta(state_curr_dict, output_dir): ave_reads = state_curr_dict["weight" + str(k)] if ave_reads==0: # this haplotype will not be reported as there are no reads - # supporting it. + # supporting it. continue head = ( diff --git a/viloca/local_haplotype_inference/use_quality_scores/cavi.py b/viloca/local_haplotype_inference/use_quality_scores/cavi.py index bfce3d9..3ff3c58 100644 --- a/viloca/local_haplotype_inference/use_quality_scores/cavi.py +++ b/viloca/local_haplotype_inference/use_quality_scores/cavi.py @@ -114,9 +114,8 @@ def run_cavi( iter = 0 converged = False elbo = 0 - min_number_iterations = 10 state_curr_dict = state_init_dict - while (converged is False) or (iter < min_number_iterations): + while converged is False: if iter <= 1: digamma_alpha_sum = digamma(state_curr_dict["alpha"].sum(axis=0)) @@ -156,7 +155,7 @@ def run_cavi( break elif (history_elbo[-2] > elbo) and np.abs(elbo - history_elbo[-2]) > 1e-08: exit_message = "Error: ELBO is decreasing." - #break + break elif np.abs(elbo - history_elbo[-2]) < convergence_threshold: converged = True exit_message = "ELBO converged." diff --git a/viloca/local_haplotype_inference/use_quality_scores/run_dpm_mfa.py b/viloca/local_haplotype_inference/use_quality_scores/run_dpm_mfa.py index eb0c692..b62a84f 100644 --- a/viloca/local_haplotype_inference/use_quality_scores/run_dpm_mfa.py +++ b/viloca/local_haplotype_inference/use_quality_scores/run_dpm_mfa.py @@ -33,7 +33,7 @@ def main( K, alpha0, alphabet="ACGT-", - unique_modus=True, + unique_modus=False, convergence_threshold=1e-03, ): @@ -46,7 +46,7 @@ def main( reference_binary, ref_id = preparation.load_reference_seq(fref_in, alphabet) reads_list, qualities = preparation.load_fasta_and_qualities( - freads_in, fname_qualities, alphabet, unique_modus + freads_in, fname_qualities, alphabet, False ) reads_seq_binary, reads_weights = preparation.reads_list_to_array(reads_list) reads_log_error_proba = preparation.compute_reads_log_error_proba( @@ -147,9 +147,9 @@ def main( sys.argv[1], sys.argv[2], sys.argv[3], - sys.argv[4], + int(sys.argv[4]), int(sys.argv[5]), - int(sys.argv[6]), - float(sys.argv[7]), + float(sys.argv[6]), + sys.argv[7], ) # freads_in, fref_in, output_dir, n_starts, K, alpha0, alphabet = 'ACGT-'