From 0020fa262a5cbbee7045f2abefc3948c85d1b46d Mon Sep 17 00:00:00 2001 From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com> Date: Thu, 2 May 2024 15:40:34 +0200 Subject: [PATCH] Fix b2w (#35) * return to before, make ref2binary okay for Ns * remove duplicate function update_tiling() * add new variable original_minimum_overlap for exteneded_window_mode since last_aligned_pos is in orignal reference genome space --- tests/test_tiling.py | 2 +- viloca/b2w.py | 28 ++----------------- .../learn_error_params/preparation.py | 3 +- .../use_quality_scores/preparation.py | 3 +- viloca/tiling.py | 5 ++-- 5 files changed, 10 insertions(+), 31 deletions(-) diff --git a/tests/test_tiling.py b/tests/test_tiling.py index af40ced..89e38fc 100644 --- a/tests/test_tiling.py +++ b/tests/test_tiling.py @@ -26,7 +26,7 @@ def test_equispaced_use_full_reference_as_region(): assert actual[0][0] == 1 assert actual[0][1] == 201 - assert actual[-1][0] == 2748 #2949 #2748 (old number where last windows where excluded) + assert actual[-1][0] == 2949 #2748 (old number where last windows where excluded) #assert actual[-1][0] + 201 < 3000 diff --git a/viloca/b2w.py b/viloca/b2w.py index d3953b2..d3f4e40 100644 --- a/viloca/b2w.py +++ b/viloca/b2w.py @@ -157,6 +157,7 @@ def _run_one_window(samfile, window_start, reference_name, window_length,control # TODO: window_length original_window_length = window_length window_length = control_window_length + original_minimum_overlap = minimum_overlap if extended_window_mode: # this is now done intilaly for all windows #for pos, val in max_ins_at_pos.items(): @@ -208,7 +209,8 @@ def _run_one_window(samfile, window_start, reference_name, window_length,control if (first_aligned_pos + minimum_overlap < window_start + 1 + window_length - and last_aligned_pos >= window_start + minimum_overlap - 2 # TODO justify 2 + and last_aligned_pos >= window_start + original_minimum_overlap - 2 # TODO justify 2 + #last_aligned_pos: is in the orginal reference genome space (not in the extended_window_mode-space) and len(full_read) >= minimum_overlap): num_inserts_right_of_read = 0 @@ -684,30 +686,6 @@ def parallel_run_one_window( -def update_tiling(tiling, extended_window_mode, max_ins_at_pos): - """ - input tiling: - - return: tiling = [ - (window_start, original_window_length, control_window_length, counter) - for each window - ] - """ - update_tiling = [] - - for idx, (window_start, window_length) in enumerate(tiling): - original_window_length = window_length - if extended_window_mode: - for pos, val in max_ins_at_pos.items(): - if window_start <= pos < window_start + original_window_length: - window_length += val - update_tiling.append((window_start,original_window_length, window_length)) - else: - update_tiling.append((window_start,original_window_length, window_length)) - - return update_tiling - - def build_windows(alignment_file: str, tiling_strategy: TilingStrategy, win_min_ext: float, maximum_reads: int, minimum_reads: int, reference_filename: str, diff --git a/viloca/local_haplotype_inference/learn_error_params/preparation.py b/viloca/local_haplotype_inference/learn_error_params/preparation.py index 4b82217..fe80925 100644 --- a/viloca/local_haplotype_inference/learn_error_params/preparation.py +++ b/viloca/local_haplotype_inference/learn_error_params/preparation.py @@ -103,5 +103,6 @@ def reference2binary(reference_seq, alphabet): length_seq = len(reference_seq) reference_table = np.zeros((length_seq, len(alphabet))) for base_position, base in enumerate(str(reference_seq)): - reference_table[base_position][alphabet.index(base.upper())] = 1 + if alphabet.find(base) >= 0: + reference_table[base_position][alphabet.index(base.upper())] = 1 return reference_table diff --git a/viloca/local_haplotype_inference/use_quality_scores/preparation.py b/viloca/local_haplotype_inference/use_quality_scores/preparation.py index add95ff..beab500 100644 --- a/viloca/local_haplotype_inference/use_quality_scores/preparation.py +++ b/viloca/local_haplotype_inference/use_quality_scores/preparation.py @@ -158,5 +158,6 @@ def reference2binary(reference_seq, alphabet): length_seq = len(reference_seq) reference_table = np.zeros((length_seq, len(alphabet))) for base_position, base in enumerate(str(reference_seq)): - reference_table[base_position][alphabet.index(base.upper())] = 1 + if alphabet.find(base) >= 0: + reference_table[base_position][alphabet.index(base.upper())] = 1 return reference_table diff --git a/viloca/tiling.py b/viloca/tiling.py index 30fd56f..2665f2b 100644 --- a/viloca/tiling.py +++ b/viloca/tiling.py @@ -103,9 +103,8 @@ def get_window_tilings(self) -> List[Tuple[int, int]]: self.end - 1, self.incr )) - while window_positions[-1] + self.window_length >= self.end: - del window_positions[-1] # FIXME uncommented to create one single window - #window_positions.append(self.end - 1 - self.window_length) + #while window_positions[-1] + self.window_length >= self.end: + # del window_positions[-1] # FIXME uncommented to create one single window else: window_positions = list(range(