From 3ef0c01dd883798a2f0709aa320f2c547740fab6 Mon Sep 17 00:00:00 2001
From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com>
Date: Thu, 6 Jun 2024 17:50:01 +0200
Subject: [PATCH 01/14] update readmes

---
 README.md              | 17 +++++++++++------
 tests/data_1/README.md | 12 ++++++------
 tests/data_5/README.md |  4 ++--
 tests/data_6/README.md | 23 +++++++----------------
 4 files changed, 26 insertions(+), 30 deletions(-)
diff --git a/README.md b/README.md
index 412dd01..b6785d1 100755
--- a/README.md
+++ b/README.md
@@ -21,18 +21,19 @@ pip install git+https://github.com/cbg-ethz/VILOCA@master
 ### Example
 To test your installation run VILOCA `tests/data_1`:
 ```
-viloca run -b test_aln.cram -f test_ref.fasta -z scheme.insert.bed --mode use_quality_scores
+viloca run -b test_aln.cram -f test_ref.fasta --mode use_quality_scores
 ```
 
-
+Another example can be found in  `tests/data_6`:
 If the sequencing amplicon strategy is known, we recommend using the amplicon-mode of the program, which takes as input the `<smth>.insert.bed` - file:
-`viloca run -b test_aln.cram -f test_ref.fasta -z scheme.insert.bed --mode use_quality_scores`
+`viloca run -f reference.fasta -b reads.shotgun.bam -w 90 --mode use_quality_scores -z scheme.insert.bed`
+
+If there is no information on the sequencing amplicon strategy available, run:
+`viloca run -f reference.fasta -b reads.shotgun.bam -w 90 --mode use_quality_scores`
 
 If the sequencing quality scores are not trustable, the sequencing error parameters can also be learned:
-`viloca run -b test_aln.cram -f test_ref.fasta -z scheme.insert.bed --mode learn_error_params`.
+`viloca run -f reference.fasta -b reads.shotgun.bam -w 90 --mode learn_error_params`.
 
-If there is no information on the sequencing amplicon strategy available, run:
-`viloca run -b test_aln.cram -f test_ref.fasta --mode use_quality_scores`
 
 ### Parameters
 There are several parameters available:  
@@ -70,3 +71,7 @@ This is the same setup as used in the CI at [`.github/workflows/test.yaml`](.git
 ```bash
 poetry run python3 -m cProfile -m shorah shotgun ...
 ```
+
+### Applications
+
+You can find several applications of VILOCA at https://github.com/cbg-ethz/viloca_applications.
diff --git a/tests/data_1/README.md b/tests/data_1/README.md
index 9f48efd..fc83c0b 100644
--- a/tests/data_1/README.md
+++ b/tests/data_1/README.md
@@ -1,10 +1,10 @@
-### Sample files to test `shorah shotgun`
+### Sample files to test `VILOCA`
 
-Use files in this directory to test shorah in shotgun mode.
-The reads data comes from the 
+Use files in this directory to test VILOCA.
+The reads data comes from the
 [test-data](https://github.com/cbg-ethz/V-pipe/tree/master/testdata/2VM-sim/20170904/raw_data)
 of [V-pipe](https://cbg-ethz.github.io/V-pipe/)
-and has been processed with the pipeline using the `bwa` 
+and has been processed with the pipeline using the `bwa`
 [option](https://github.com/cbg-ethz/V-pipe/wiki/options#aligner):
 
 ```ini
@@ -16,8 +16,8 @@ The sorted bam file has been further compressed with samtools for space saving:
 
     [user@host shotgun_test]$ samtools view -T test_ref.fasta -C -O cram,embed_ref,use_bzip2,use_lzma,level=9,seqs_per_slice=1000000 -o test_aln.cram V-pipe/work/samples/2VM-sim/20170904/alignments/REF_aln.bam
 
-You can then run `shorah shotgun` as follows
+You can then run `viloca` as follows
 
-    [user@host shotgun_test]$ shorah shotgun -b test_aln.cram -f test_ref.fasta
+    [user@host shotgun_test]$ viloca run -b test_aln.cram -f test_ref.fasta
 
 The output files will be `snv/SNVs_0.010000_final.vcf` and `snv/SNVs_0.010000_final.csv`.
diff --git a/tests/data_5/README.md b/tests/data_5/README.md
index 80d3d51..21f4bb0 100644
--- a/tests/data_5/README.md
+++ b/tests/data_5/README.md
@@ -1,9 +1,9 @@
 ### Test to check fil.cpp implementation accounting for long deletions
 
-Test files `SNV.txt` and `SNVs_0.010000.txt` are obtained by running `shorah shutgun`, e.g:
+Test files `SNV.txt` and `SNVs_0.010000.txt` are obtained by running `viloca run`, e.g:
 
 ```
-shorah shotgun -a 0.1 -w 42 -x 100000 -p 0.9 -c 0 -r REF:42-272 -R 42 -b test_aln.cram -f ref.fasta
+viloca run -a 0.1 -w 42 -x 100000 -p 0.9 -c 0 -r REF:42-272 -R 42 -b test_aln.cram -f ref.fasta
 ```
 
 The test script `test_long_deletions.py` uses `pysam` and `NumPy`, which can be installed using pip or conda.  
diff --git a/tests/data_6/README.md b/tests/data_6/README.md
index 70baae0..3479aae 100644
--- a/tests/data_6/README.md
+++ b/tests/data_6/README.md
@@ -1,27 +1,18 @@
-### Sample files to test `shorah shotgun`
+### Sample files to test `VILOCA`
 
 Use files in this directory to test shorah in shotgun mode. The reads data have been generated with V-pipe's benchmarking framework (simulated with parameters: ```seq_tech~illumina__seq_mode~shotgun__seq_mode_param~nan__read_length~90__genome_size~90__coverage~100__haplos~5@5@10@5@10@geom@0.75```)
 
 The reads are from one single amplicon of length 90, meaning the reference is of length 90 and each read is of length 90bps.
 
-To run ShoRAH's original Gibbs sampler use the following command:
-```
-poetry run shorah shotgun -f reference.fasta -b reads.shotgun.bam -w 90 --sampler shorah
-```
-or
-```
-poetry run shorah shotgun -f reference.fasta -b reads.shotgun.bam -z scheme.insert.bed --sampler shorah
-```
-
 To use the new inference method using the sequencing quality scores use:
 ```
-poetry run shorah shotgun -f reference.fasta -b reads.shotgun.bam -w 90 --sampler use_quality_scores --alpha 0.0001 --n_max_haplotypes 100 --n_mfa_starts 1 --conv_thres 0.0001
+viloca run -f reference.fasta -b reads.shotgun.bam -w 90 --mode use_quality_scores
 ```
-To use the new inference method learning the sequencing error parameter:
+To use the model that is learning the sequencing error parameter:
 ```
-poetry run shorah shotgun -f reference.fasta -b reads.shotgun.bam -w 90 --sampler -learn_error_params --alpha 0.0001 --n_max_haplotypes 100 --n_mfa_starts 1 --conv_thres 0.0001
+viloca run -f reference.fasta -b reads.shotgun.bam -w 90 --mode -learn_error_params
 ```
-
-In the new inference method reads are filtered (and weighted respectively) such that only a set of unique reads are processed. This mode can be switch off by setting the parameter `--non-unique_modus`,  e.g.:
+To run VILOCA with the insert file run:
+```
+viloca run -f reference.fasta -b reads.shotgun.bam -w 90 --mode use_quality_scores -z scheme.insert.bed
 ```
-poetry run shorah shotgun -f reference.fasta -b reads.shotgun.bam -w 90 --sampler -learn_error_params --alpha 0.0001 --n_max_haplotypes 100 --n_mfa_starts 1 --conv_thres 0.0001 --non-unique_modus

From 348e40e1075346a7605fabfb3e8ee90836604f15 Mon Sep 17 00:00:00 2001
From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com>
Date: Thu, 6 Jun 2024 17:50:28 +0200
Subject: [PATCH 02/14] update version number

---
 environment.yml |  2 +-
 pyproject.toml  |  6 +++---
 viloca/cli.py   | 15 +++++----------
 3 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/environment.yml b/environment.yml
index 4c562d4..8e5f6b6 100644
--- a/environment.yml
+++ b/environment.yml
@@ -6,4 +6,4 @@ dependencies:
   - htslib >=1.9
   - boost-cpp >=1.56
   - pip:
-    - https://github.com/spaceben/shorah/releases/download/canary-eeec049/ShoRAH-0.1.0.tar.gz 
\ No newline at end of file
+    - https://github.com/cbg-ethz/VILOCA/archive/refs/tags/viloca-v1.0.0.tar.gz
diff --git a/pyproject.toml b/pyproject.toml
index 42da0bd..ebc301c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,9 +1,9 @@
 [tool.poetry]
 name = "VILOCA"
-version = "0.1.0"
-description = "SHOrt Reads Assembly into Haplotypes"
+version = "1.0.0"
+description = "VIral LOcal haplotype reconstruction and mutation CAlling for short and long read data"
 license = "GPL-3.0-only"
-authors = ["Benjamin Langer <blanger@student.ethz.ch>, Lara Fuhrmann <lara.fuhrmann@bsse.ethz.ch>"]
+authors = ["Ivan Topolsky", "Benjamin Langer <blanger@student.ethz.ch>, Lara Fuhrmann <lara.fuhrmann@bsse.ethz.ch>"]
 build = "build.py"
 packages = [
     { include = "viloca" }
diff --git a/viloca/cli.py b/viloca/cli.py
index 96273b3..aee2cd2 100644
--- a/viloca/cli.py
+++ b/viloca/cli.py
@@ -2,11 +2,6 @@
 
 # Copyright 2007-2018
 # Niko Beerenwinkel,
-# Nicholas Eriksson,
-# Moritz Gerstung,
-# Lukas Geyrhofer,
-# Kerensa McElroy,
-# Osvaldo Zagordi,
 # ETH Zurich
 
 # This file is part of ShoRAH.
@@ -21,7 +16,7 @@
 # GNU General Public License for more details.
 
 # You should have received a copy of the GNU General Public License
-# along with ShoRAH.  If not, see <http://www.gnu.org/licenses/>.
+# along with VILOCA.  If not, see <http://www.gnu.org/licenses/>.
 """
 Module that contains the command line app.
 Why does this file exist, and why not put this in __main__?
@@ -115,8 +110,8 @@ def main():
                           type=str, dest="f", help="reference genome in fasta format")
 
     parent_parser.add_argument("-a", "--alpha", metavar='FLOAT', required=False,
-                               type=float, dest="a", default=0.1,
-                               help="alpha in dpm sampling (controls the probability of creating new classes)")
+                               type=float, dest="a", default=0.0001,
+                               help="alpha")
 
     parent_parser.add_argument("-r", "--region", metavar='chrm:start-stop', required=False, type=str,
                                dest="r", default='',
@@ -194,8 +189,8 @@ def main():
                                 help="Number of starts for inference type mean_field_approximation.")
 
     parser_shotgun.add_argument('--mode', choices=['shorah','learn_error_params','use_quality_scores'],
-                                default='shorah', dest="inference_type",
-                                help="Mode in which to run VILOCA: shorah,  learn_error_params, use_quality_scores")
+                                default='use_quality_scores', dest="inference_type",
+                                help="Mode in which to run VILOCA: shorah,  learn_error_params, use_quality_scores, ShoRAH refers to the method from https://github.com/cbg-ethz/shorah.")
 
     parser_shotgun.add_argument('--non-unique_modus', action='store_false', dest="unique_modus",
                                 help="For inference: Make read set unique with read weights. Cannot be used with --mode shorah.")

From 08dd11af2aed3eee88983ddb69065c60ae4bd8ab Mon Sep 17 00:00:00 2001
From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com>
Date: Thu, 6 Jun 2024 17:50:44 +0200
Subject: [PATCH 03/14] remove duplicate

---
 viloca/b2w.py | 182 --------------------------------------------------
 1 file changed, 182 deletions(-)

diff --git a/viloca/b2w.py b/viloca/b2w.py
index d3f4e40..686a2b8 100644
--- a/viloca/b2w.py
+++ b/viloca/b2w.py
@@ -296,188 +296,6 @@ def _run_one_window(samfile, window_start, reference_name, window_length,control
     return convert_to_printed_fmt(arr), arr_read_qualities_summary, arr_read_summary, pos_filter
 
 
-def parallel_run_one_window(
-    reference_filename,
-    minimum_reads,
-    tiling,
-    region_end,
-    idx,
-    window_start,
-    window_length,
-    control_window_length,
-    alignment_file,
-    reference_name,
-    win_min_ext,
-    permitted_reads_per_location,
-    counter,
-    exact_conformance_fix_0_1_basing_in_reads,
-    indel_map,
-    max_ins_at_pos,
-    extended_window_mode,
-    exclude_non_var_pos_threshold
-):
-    """
-    build one window.
-    """
-    reffile = pysam.FastaFile(reference_filename)
-
-    samfile = pysam.AlignmentFile(
-        alignment_file,
-        "r", # auto-detect bam/cram (rc)
-        reference_filename=reference_filename,
-        threads=1
-    )
-
-    reads = open(f"reads_{idx}.fas", "w")
-
-    logging.info(f"Working on window (1-based) @ {window_start+1}")
-
-#    (arr, arr_read_qualities_summary, arr_read_summary,
-#    counter, control_window_length, pos_filter) = _run_one_window(
-    (arr, arr_read_qualities_summary, arr_read_summary,
-     pos_filter) = _run_one_window(
-        samfile,
-        window_start - 1, # make 0 based
-        reference_name,
-        window_length,
-        control_window_length,
-        math.floor(win_min_ext * window_length),
-        dict(permitted_reads_per_location), # copys dict ("pass by value")
-        counter,
-        exact_conformance_fix_0_1_basing_in_reads,
-        indel_map,
-        max_ins_at_pos,
-        extended_window_mode,
-        exclude_non_var_pos_threshold
-    )
-
-    logging.debug(f"Window length: {control_window_length}")
-
-    window_end = window_start + window_length - 1
-    file_name = f'w-{reference_name}-{window_start}-{window_end}'
-
-    # TODO solution for backward conformance
-    if len(tiling) > 1:
-        end_extended_by_a_window = region_end + (tiling[1][0]-tiling[0][0])*3
-    else:
-        end_extended_by_a_window = region_end + window_length*3
-
-    for read in arr_read_summary:
-        if idx == len(tiling) - 1 and read[1] > end_extended_by_a_window:
-            continue
-        # TODO reads.fas not FASTA conform, +-0/1 mixed
-        # TODO global end does not really make sense, only for conformance
-        # read name, global start, global end, read start, read end, read
-        reads.write(
-            f'{read[0]}\t{tiling[0][0]-1}\t{end_extended_by_a_window}\t{read[1]}\t{read[2]}\t{read[3]}\n'
-        )
-
-    reads.close()
-
-    if (idx != len(tiling) - 1 # except last
-        and len(arr) > 0) or len(tiling) == 1: # suppress output if window empty
-
-        _write_to_file(arr, file_name + '.reads.fas')
-        if arr_read_qualities_summary is not None:
-            with open(file_name + '.qualities.npy', 'wb') as f:
-                np.save(f, np.asarray(arr_read_qualities_summary, dtype=np.int64), allow_pickle=True)
-
-        ref = reffile.fetch(reference=reference_name, start=window_start-1, end=window_end)
-
-        if extended_window_mode:
-            for file_name_comp, char in [("extended-ref", "X"), ("ref", "-")]:
-                res_ref = _build_one_full_read(
-                    list(ref), list(ref), None, None,
-                    window_start-1, window_end-1,
-                    indel_map, max_ins_at_pos, extended_window_mode, char
-                )[0]
-
-                k = max(0, control_window_length - len(res_ref))
-                res_ref += k * "N"
-                assert_condition = control_window_length == len(res_ref)
-
-                if exclude_non_var_pos_threshold > 0 and file_name_comp == "ref":
-                    _write_to_file([
-                        f'>{reference_name} {window_start}\n' + res_ref
-                    ], file_name + '.envp-full-ref.fas')
-
-                    envp_ref = np.array(list(res_ref))
-                    envp_ref[~pos_filter] = "="
-                    _write_to_file([
-                        f'>{reference_name} {window_start}\n' + "".join(envp_ref)
-                    ], file_name + '.envp-ref.fas')
-
-                    reduced_ref = np.array(list(res_ref))[pos_filter]
-                    res_ref = "".join(reduced_ref)
-                    assert_condition = (control_window_length ==
-                                        len(reduced_ref) + len(pos_filter) - pos_filter.sum())
-
-                _write_to_file([
-                    f'>{reference_name} {window_start}\n' + res_ref
-                ], f'{file_name}.{file_name_comp}.fas')
-
-                assert assert_condition, (
-                    f"""
-                        Reference ({file_name_comp}) does not have same length as the window.
-                        Location: {file_name}
-                        Ref: {len(res_ref)}
-                        Win: {control_window_length}
-                    """
-                )
-
-        else:
-            k = max(0, control_window_length - len(ref))
-            ref += k * "N"
-
-            if exclude_non_var_pos_threshold > 0:
-                full_file_name = file_name + '.envp-full-ref.fas'
-            else:
-                full_file_name = file_name + '.ref.fas'
-
-            _write_to_file([
-                f'>{reference_name} {window_start}\n' + ref
-            ], full_file_name)
-
-            assert control_window_length == len(ref), (
-                f"""
-                    Reference does not have same length as the window.
-                    Location: {file_name}
-                    Ref: {len(ref)}
-                    Win: {control_window_length}
-                """
-            )
-
-            if exclude_non_var_pos_threshold > 0:
-                envp_ref = np.array(list(ref))
-                envp_ref[~pos_filter] = "="
-                _write_to_file([
-                    f'>{reference_name} {window_start}\n' + "".join(envp_ref)
-                ], file_name + '.envp-ref.fas')
-                reduced_ref = np.array(list(ref))[pos_filter]
-                _write_to_file([
-                    f'>{reference_name} {window_start}\n' + "".join(reduced_ref)
-                ], file_name + '.ref.fas')
-
-                assert (control_window_length == len(envp_ref) and
-                        control_window_length == len(reduced_ref) + len(pos_filter) - pos_filter.sum()), (
-                    f"""
-                        Reference does not have same length as the window.
-                        Location: {file_name}
-                        Envp Ref: {len(envp_ref)}
-                        Ref: {len(reduced_ref)}
-                        Win: {control_window_length}
-                    """
-                )
-
-        if len(arr) > minimum_reads:
-            line = (
-                f'{file_name}.reads.fas\t{reference_name}\t{window_start}\t'
-                f'{window_end}\t{len(arr)}'
-            )
-            _write_to_file([line], f"coverage_{idx}.txt")
-
-
-
 def update_tiling(tiling, extended_window_mode, max_ins_at_pos):
     """
     input tiling:

From 98438cb4c80a5ce8ec3f7d3f7ac553af43fcd59d Mon Sep 17 00:00:00 2001
From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com>
Date: Thu, 6 Jun 2024 17:53:49 +0200
Subject: [PATCH 04/14] remove unused packages

---
 pyproject.toml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index ebc301c..baa46fc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,9 +18,7 @@ biopython = "^1.79"
 numpy = "^1.21.4"
 pysam = "^0.18.0"
 pybind11 = "^2.9.0"
-PyYAML = "^6.0"
 scipy = "^1.7.3"
-bio = "^1.3.3"
 pandas = "^1.3.5"
 
 [tool.poetry.dev-dependencies]

From 005909beed964c2bbc665deff9f0a57775cbd69f Mon Sep 17 00:00:00 2001
From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com>
Date: Thu, 6 Jun 2024 18:23:34 +0200
Subject: [PATCH 05/14] change default viloca mode

---
 tests/data_1/shotgun_test.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/data_1/shotgun_test.sh b/tests/data_1/shotgun_test.sh
index 77925d3..b57e539 100755
--- a/tests/data_1/shotgun_test.sh
+++ b/tests/data_1/shotgun_test.sh
@@ -1,4 +1,4 @@
 #!/bin/bash
 
-viloca run -a 0.1 -w 201 -x 100000 -p 0.9 -c 0 \
+viloca run -a 0.1 -w 201 --mode shorah -x 100000 -p 0.9 -c 0 \
 -r HXB2:2469-3713 -R 42 -f test_ref.fasta -b test_aln.cram --out_format csv "$@"

From b2b76aa20ce73b66c8ca230ed5105a2b6345cec0 Mon Sep 17 00:00:00 2001
From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com>
Date: Fri, 7 Jun 2024 10:41:32 +0200
Subject: [PATCH 06/14] b2w and inference: rerun windows that failed + check if
 window was processed already before adding process to process list

---
 viloca/b2w.py     | 52 ++++++++++++++++++++++++-----------------------
 viloca/shotgun.py | 11 ++++++----
 2 files changed, 34 insertions(+), 29 deletions(-)

diff --git a/viloca/b2w.py b/viloca/b2w.py
index 686a2b8..36bb114 100644
--- a/viloca/b2w.py
+++ b/viloca/b2w.py
@@ -578,30 +578,31 @@ def build_windows(alignment_file: str, tiling_strategy: TilingStrategy,
     all_processes = []
     for idx, (window_start, window_length, control_window_length) in enumerate(tiling):
         counter = counter_list[idx]
-        p = Process(
-            target=parallel_run_one_window,
-            args=(
-                reference_filename,
-                minimum_reads,
-                tiling,
-                region_end,
-                idx,
-                window_start, # 1-based
-                window_length,
-                control_window_length,
-                alignment_file,
-                reference_name,
-                win_min_ext,
-                permitted_reads_per_location,
-                counter, # 0-based
-                exact_conformance_fix_0_1_basing_in_reads,
-                indel_map,
-                max_ins_at_pos, # 0-based
-                extended_window_mode,
-                exclude_non_var_pos_threshold,
+        if not os.path.isfile(f"coverage_{idx}.txt"):
+            p = Process(
+                target=parallel_run_one_window,
+                args=(
+                    reference_filename,
+                    minimum_reads,
+                    tiling,
+                    region_end,
+                    idx,
+                    window_start, # 1-based
+                    window_length,
+                    control_window_length,
+                    alignment_file,
+                    reference_name,
+                    win_min_ext,
+                    permitted_reads_per_location,
+                    counter, # 0-based
+                    exact_conformance_fix_0_1_basing_in_reads,
+                    indel_map,
+                    max_ins_at_pos, # 0-based
+                    extended_window_mode,
+                    exclude_non_var_pos_threshold,
+                    )
                 )
-            )
-        all_processes.append(p)
+            all_processes.append(p)
 
     for p in all_processes:
       p.start()
@@ -609,8 +610,9 @@ def build_windows(alignment_file: str, tiling_strategy: TilingStrategy,
     for p in all_processes:
         p.join()
         if p.exitcode != 0:
-            logging.debug("[b2w] A process was killed. Terminating the program.")
-            exit(1)
+            p.start()
+            logging.debug("[b2w] A process was killed. Terminating the program. Process was restarted.")
+            #exit(1)
 
     logging.debug("[b2w] All processes completed successfully.")
 
diff --git a/viloca/shotgun.py b/viloca/shotgun.py
index 95618f2..cac74b1 100644
--- a/viloca/shotgun.py
+++ b/viloca/shotgun.py
@@ -307,8 +307,10 @@ def win_to_run(alpha_w, seed, inference_type, n_max_haplotypes, n_mfa_starts, un
 
     for f1 in file1:
         winFile, chr1, beg, end, cov = f1.rstrip().split('\t')
-        j = min(300_000, int(cov) * 15)
-        rn_list.append((winFile, j, alpha_w, seed, inference_type, n_max_haplotypes, n_mfa_starts, unique_modus, inference_convergence_threshold))
+        output_name = output_dir + winFile.split("/")[-1][:-4] + "-" + "cor.fas"
+        if not os.path.isfile(output_name):
+            j = min(300_000, int(cov) * 15)
+            rn_list.append((winFile, j, alpha_w, seed, inference_type, n_max_haplotypes, n_mfa_starts, unique_modus, inference_convergence_threshold))
 
     del end
     del(beg, chr1)
@@ -515,8 +517,9 @@ def main(args):
     for p in all_processes:
         p.join()
         if p.exitcode != 0:
-            logging.debug("[shotgun] A process was killed. Terminating the program.")
-            exit(1)
+            p.start()
+            logging.debug("[shotgun] A process was killed. Restart process.")
+            #exit(1)
 
     logging.debug("[shotgun] All processes completed successfully.")
 

From 37b31a05bf8f527f9b3f2188c7f13b79036c6165 Mon Sep 17 00:00:00 2001
From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com>
Date: Fri, 7 Jun 2024 10:47:44 +0200
Subject: [PATCH 07/14] correct variable

---
 viloca/shotgun.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/viloca/shotgun.py b/viloca/shotgun.py
index cac74b1..d4f4a24 100644
--- a/viloca/shotgun.py
+++ b/viloca/shotgun.py
@@ -307,7 +307,7 @@ def win_to_run(alpha_w, seed, inference_type, n_max_haplotypes, n_mfa_starts, un
 
     for f1 in file1:
         winFile, chr1, beg, end, cov = f1.rstrip().split('\t')
-        output_name = output_dir + winFile.split("/")[-1][:-4] + "-" + "cor.fas"
+        output_name = winFile.split("/")[-1][:-4] + "-" + "cor.fas"
         if not os.path.isfile(output_name):
             j = min(300_000, int(cov) * 15)
             rn_list.append((winFile, j, alpha_w, seed, inference_type, n_max_haplotypes, n_mfa_starts, unique_modus, inference_convergence_threshold))

From 433347770ef844334a255d96182a8fef3e23226d Mon Sep 17 00:00:00 2001
From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com>
Date: Mon, 10 Jun 2024 13:32:27 +0200
Subject: [PATCH 08/14] reference to manuscript

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index b6785d1..357c90a 100755
--- a/README.md
+++ b/README.md
@@ -7,6 +7,8 @@ are written in different programming languages and provide error correction,
 haplotype reconstruction and estimation of the frequency of the different
 genetic variants present in a mixed sample.
 
+The corresponding manuscript can be found here: https://www.biorxiv.org/content/10.1101/2024.06.06.597712v1
+
 ---
 
 ### Installation

From ecda5abd9e44185066fff6ed87c89f6b003bf87a Mon Sep 17 00:00:00 2001
From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com>
Date: Mon, 10 Jun 2024 16:59:38 +0200
Subject: [PATCH 09/14] add viloca conda package

---
 README.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/README.md b/README.md
index 357c90a..9bce1e2 100755
--- a/README.md
+++ b/README.md
@@ -15,6 +15,12 @@ The corresponding manuscript can be found here: https://www.biorxiv.org/content/
 For installation miniconda is recommended: https://docs.conda.io/en/latest/miniconda.html.
 We recommend to install VILOCA in a clean conda environment:
 ```
+conda create --name env_viloca --channel conda-forge --channel bioconda viloca
+conda activate env_viloca
+```
+
+If you want to install the `master` branch use:
+```
 conda create --name env_viloca --channel conda-forge --channel bioconda libshorah
 conda activate env_viloca
 pip install git+https://github.com/cbg-ethz/VILOCA@master

From a644e9b85f18c957df9dba1c7f6d2e4fe30540a9 Mon Sep 17 00:00:00 2001
From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com>
Date: Tue, 11 Jun 2024 09:51:49 +0200
Subject: [PATCH 10/14] restarting failed process doesn't work, exit again

---
 viloca/b2w.py     | 5 ++---
 viloca/shotgun.py | 5 ++---
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/viloca/b2w.py b/viloca/b2w.py
index 36bb114..961a34e 100644
--- a/viloca/b2w.py
+++ b/viloca/b2w.py
@@ -610,9 +610,8 @@ def build_windows(alignment_file: str, tiling_strategy: TilingStrategy,
     for p in all_processes:
         p.join()
         if p.exitcode != 0:
-            p.start()
-            logging.debug("[b2w] A process was killed. Terminating the program. Process was restarted.")
-            #exit(1)
+            logging.debug("[b2w] A process was killed. Terminating the program.")
+            exit(1)
 
     logging.debug("[b2w] All processes completed successfully.")
 
diff --git a/viloca/shotgun.py b/viloca/shotgun.py
index d4f4a24..4a40cb0 100644
--- a/viloca/shotgun.py
+++ b/viloca/shotgun.py
@@ -517,9 +517,8 @@ def main(args):
     for p in all_processes:
         p.join()
         if p.exitcode != 0:
-            p.start()
-            logging.debug("[shotgun] A process was killed. Restart process.")
-            #exit(1)
+            logging.debug("[shotgun] A process was killed. Terminating program.")
+            exit(1)
 
     logging.debug("[shotgun] All processes completed successfully.")
 

From 279cc634a5d7856b9b280aef5b29a7b647d97407 Mon Sep 17 00:00:00 2001
From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com>
Date: Wed, 12 Jun 2024 16:27:47 +0200
Subject: [PATCH 11/14] fix: check if files already exist

---
 viloca/shotgun.py | 93 ++++++++++++++++++++++++-----------------------
 1 file changed, 48 insertions(+), 45 deletions(-)

diff --git a/viloca/shotgun.py b/viloca/shotgun.py
index 4a40cb0..45c34e4 100644
--- a/viloca/shotgun.py
+++ b/viloca/shotgun.py
@@ -307,7 +307,7 @@ def win_to_run(alpha_w, seed, inference_type, n_max_haplotypes, n_mfa_starts, un
 
     for f1 in file1:
         winFile, chr1, beg, end, cov = f1.rstrip().split('\t')
-        output_name = winFile.split("/")[-1][:-4] + "-" + "cor.fas"
+        output_name = winFile.split(".fas")[0] + "-" + "cor.fas"
         if not os.path.isfile(output_name):
             j = min(300_000, int(cov) * 15)
             rn_list.append((winFile, j, alpha_w, seed, inference_type, n_max_haplotypes, n_mfa_starts, unique_modus, inference_convergence_threshold))
@@ -432,53 +432,56 @@ def main(args):
     # run b2w
 
     logging.info('starting b2w')
-    try:
-        if ignore_indels == True:
-            raise NotImplementedError('This argument was deprecated.')
-        b2w_logging((in_bam, in_fasta, win_length, incr, win_min_ext,
-            max_coverage, cov_thrd, region, ignore_indels))
+    if not os.path.isfile(f"coverage.txt"):
+        try:
+            if ignore_indels == True:
+                raise NotImplementedError('This argument was deprecated.')
+            b2w_logging((in_bam, in_fasta, win_length, incr, win_min_ext,
+                max_coverage, cov_thrd, region, ignore_indels))
+
+            if path_insert_file == None and region == "": # special case if no region defined
+                samfile = pysam.AlignmentFile(
+                    in_bam,
+                    "r", # auto-detect bam/cram (rc)
+                    reference_filename=in_fasta,
+                    threads=1
+                )
+                if samfile.nreferences != 1:
+                    raise NotImplementedError("There are multiple references in this alignment file.")
+                strategy = tiling.EquispacedTilingStrategy(
+                    f"{samfile.references[0]}:1-{samfile.lengths[0]}",
+                    win_length,
+                    incr,
+                    False,
+                    True
+                )
+            elif path_insert_file == None:
+                strategy = tiling.EquispacedTilingStrategy(region, win_length, incr, True)
+            else:
+                strategy = tiling.PrimerTilingStrategy(path_insert_file)
+                if region != "":
+                    logging.warn(f"region is set to {region} but is not used with this tiling strategy")
+
+            logging.info(f"Using tiling strategy: {type(strategy).__name__}")
 
-        if path_insert_file == None and region == "": # special case if no region defined
-            samfile = pysam.AlignmentFile(
+            b2w.build_windows(
                 in_bam,
-                "r", # auto-detect bam/cram (rc)
-                reference_filename=in_fasta,
-                threads=1
+                strategy,
+                win_min_ext,
+                max_coverage,
+                cov_thrd,
+                in_fasta,
+                extended_window_mode=extended_window_mode,
+                exclude_non_var_pos_threshold=exclude_non_var_pos_threshold,
+                maxthreads=maxthreads
             )
-            if samfile.nreferences != 1:
-                raise NotImplementedError("There are multiple references in this alignment file.")
-            strategy = tiling.EquispacedTilingStrategy(
-                f"{samfile.references[0]}:1-{samfile.lengths[0]}",
-                win_length,
-                incr,
-                False,
-                True
-            )
-        elif path_insert_file == None:
-            strategy = tiling.EquispacedTilingStrategy(region, win_length, incr, True)
-        else:
-            strategy = tiling.PrimerTilingStrategy(path_insert_file)
-            if region != "":
-                logging.warn(f"region is set to {region} but is not used with this tiling strategy")
-
-        logging.info(f"Using tiling strategy: {type(strategy).__name__}")
-
-        b2w.build_windows(
-            in_bam,
-            strategy,
-            win_min_ext,
-            max_coverage,
-            cov_thrd,
-            in_fasta,
-            extended_window_mode=extended_window_mode,
-            exclude_non_var_pos_threshold=exclude_non_var_pos_threshold,
-            maxthreads=maxthreads
-        )
-        logging.info('finished b2w')
-
-    except Exception as e:
-        logging.debug(e)
-        sys.exit('b2w run not successful')
+            logging.info('finished b2w')
+
+        except Exception as e:
+            logging.debug(e)
+            sys.exit('b2w run not successful')
+    else:
+        logging.info('coverage.txt file already exists, hence skip b2w and use what is in directory.')
 
     aligned_reads = parse_aligned_reads('reads.fas')
     if len(aligned_reads) == 0:

From 7ced8b8330259f057e624f364f9db8db79c36603 Mon Sep 17 00:00:00 2001
From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com>
Date: Sun, 10 Nov 2024 16:51:41 +0100
Subject: [PATCH 12/14] add minimum number of iterations

---
 .../use_quality_scores/analyze_results.py                    | 2 +-
 viloca/local_haplotype_inference/use_quality_scores/cavi.py  | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/viloca/local_haplotype_inference/use_quality_scores/analyze_results.py b/viloca/local_haplotype_inference/use_quality_scores/analyze_results.py
index ed0a6b2..4093ff2 100644
--- a/viloca/local_haplotype_inference/use_quality_scores/analyze_results.py
+++ b/viloca/local_haplotype_inference/use_quality_scores/analyze_results.py
@@ -31,7 +31,7 @@ def haplotypes_to_fasta(state_curr_dict, output_dir):
         ave_reads = state_curr_dict["weight" + str(k)]
         if ave_reads==0:
             # this haplotype will not be reported as there are no reads
-            # supporting it. 
+            # supporting it.
             continue
 
         head = (
diff --git a/viloca/local_haplotype_inference/use_quality_scores/cavi.py b/viloca/local_haplotype_inference/use_quality_scores/cavi.py
index 3ff3c58..bfce3d9 100644
--- a/viloca/local_haplotype_inference/use_quality_scores/cavi.py
+++ b/viloca/local_haplotype_inference/use_quality_scores/cavi.py
@@ -114,8 +114,9 @@ def run_cavi(
     iter = 0
     converged = False
     elbo = 0
+    min_number_iterations = 10
     state_curr_dict = state_init_dict
-    while converged is False:
+    while (converged is False) or (iter < min_number_iterations):
 
         if iter <= 1:
             digamma_alpha_sum = digamma(state_curr_dict["alpha"].sum(axis=0))
@@ -155,7 +156,7 @@ def run_cavi(
                 break
             elif (history_elbo[-2] > elbo) and np.abs(elbo - history_elbo[-2]) > 1e-08:
                 exit_message = "Error: ELBO is decreasing."
-                break
+                #break
             elif np.abs(elbo - history_elbo[-2]) < convergence_threshold:
                 converged = True
                 exit_message = "ELBO converged."

From 822f341890f390ad7e856a56988b70e7c60120ca Mon Sep 17 00:00:00 2001
From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com>
Date: Sun, 10 Nov 2024 16:52:13 +0100
Subject: [PATCH 13/14] pass unique_modus argument

---
 .../use_quality_scores/run_dpm_mfa.py                  | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/viloca/local_haplotype_inference/use_quality_scores/run_dpm_mfa.py b/viloca/local_haplotype_inference/use_quality_scores/run_dpm_mfa.py
index b62a84f..eb0c692 100644
--- a/viloca/local_haplotype_inference/use_quality_scores/run_dpm_mfa.py
+++ b/viloca/local_haplotype_inference/use_quality_scores/run_dpm_mfa.py
@@ -33,7 +33,7 @@ def main(
     K,
     alpha0,
     alphabet="ACGT-",
-    unique_modus=False,
+    unique_modus=True,
     convergence_threshold=1e-03,
 ):
 
@@ -46,7 +46,7 @@ def main(
     reference_binary, ref_id = preparation.load_reference_seq(fref_in, alphabet)
 
     reads_list, qualities = preparation.load_fasta_and_qualities(
-        freads_in, fname_qualities, alphabet, False
+        freads_in, fname_qualities, alphabet, unique_modus
     )
     reads_seq_binary, reads_weights = preparation.reads_list_to_array(reads_list)
     reads_log_error_proba = preparation.compute_reads_log_error_proba(
@@ -147,9 +147,9 @@ def main(
         sys.argv[1],
         sys.argv[2],
         sys.argv[3],
-        int(sys.argv[4]),
+        sys.argv[4],
         int(sys.argv[5]),
-        float(sys.argv[6]),
-        sys.argv[7],
+        int(sys.argv[6]),
+        float(sys.argv[7]),
     )
 # freads_in, fref_in, output_dir, n_starts, K, alpha0, alphabet = 'ACGT-'

From 0064241caaa0e9cb52eacc72da749471d915a93e Mon Sep 17 00:00:00 2001
From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com>
Date: Tue, 26 Nov 2024 09:16:59 +0100
Subject: [PATCH 14/14] Revert "Merge pull request #38 from
 cbg-ethz/enable-unique-modus"

This reverts commit ffd4fd13575b9412fda3d689e5b3b588dc1587ef, reversing
changes made to 279cc634a5d7856b9b280aef5b29a7b647d97407.
---
 .../use_quality_scores/analyze_results.py              |  2 +-
 .../use_quality_scores/cavi.py                         |  5 ++---
 .../use_quality_scores/run_dpm_mfa.py                  | 10 +++++-----
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/viloca/local_haplotype_inference/use_quality_scores/analyze_results.py b/viloca/local_haplotype_inference/use_quality_scores/analyze_results.py
index 4093ff2..ed0a6b2 100644
--- a/viloca/local_haplotype_inference/use_quality_scores/analyze_results.py
+++ b/viloca/local_haplotype_inference/use_quality_scores/analyze_results.py
@@ -31,7 +31,7 @@ def haplotypes_to_fasta(state_curr_dict, output_dir):
         ave_reads = state_curr_dict["weight" + str(k)]
         if ave_reads==0:
             # this haplotype will not be reported as there are no reads
-            # supporting it.
+            # supporting it. 
             continue
 
         head = (
diff --git a/viloca/local_haplotype_inference/use_quality_scores/cavi.py b/viloca/local_haplotype_inference/use_quality_scores/cavi.py
index bfce3d9..3ff3c58 100644
--- a/viloca/local_haplotype_inference/use_quality_scores/cavi.py
+++ b/viloca/local_haplotype_inference/use_quality_scores/cavi.py
@@ -114,9 +114,8 @@ def run_cavi(
     iter = 0
     converged = False
     elbo = 0
-    min_number_iterations = 10
     state_curr_dict = state_init_dict
-    while (converged is False) or (iter < min_number_iterations):
+    while converged is False:
 
         if iter <= 1:
             digamma_alpha_sum = digamma(state_curr_dict["alpha"].sum(axis=0))
@@ -156,7 +155,7 @@ def run_cavi(
                 break
             elif (history_elbo[-2] > elbo) and np.abs(elbo - history_elbo[-2]) > 1e-08:
                 exit_message = "Error: ELBO is decreasing."
-                #break
+                break
             elif np.abs(elbo - history_elbo[-2]) < convergence_threshold:
                 converged = True
                 exit_message = "ELBO converged."
diff --git a/viloca/local_haplotype_inference/use_quality_scores/run_dpm_mfa.py b/viloca/local_haplotype_inference/use_quality_scores/run_dpm_mfa.py
index eb0c692..b62a84f 100644
--- a/viloca/local_haplotype_inference/use_quality_scores/run_dpm_mfa.py
+++ b/viloca/local_haplotype_inference/use_quality_scores/run_dpm_mfa.py
@@ -33,7 +33,7 @@ def main(
     K,
     alpha0,
     alphabet="ACGT-",
-    unique_modus=True,
+    unique_modus=False,
     convergence_threshold=1e-03,
 ):
 
@@ -46,7 +46,7 @@ def main(
     reference_binary, ref_id = preparation.load_reference_seq(fref_in, alphabet)
 
     reads_list, qualities = preparation.load_fasta_and_qualities(
-        freads_in, fname_qualities, alphabet, unique_modus
+        freads_in, fname_qualities, alphabet, False
     )
     reads_seq_binary, reads_weights = preparation.reads_list_to_array(reads_list)
     reads_log_error_proba = preparation.compute_reads_log_error_proba(
@@ -147,9 +147,9 @@ def main(
         sys.argv[1],
         sys.argv[2],
         sys.argv[3],
-        sys.argv[4],
+        int(sys.argv[4]),
         int(sys.argv[5]),
-        int(sys.argv[6]),
-        float(sys.argv[7]),
+        float(sys.argv[6]),
+        sys.argv[7],
     )
 # freads_in, fref_in, output_dir, n_starts, K, alpha0, alphabet = 'ACGT-'