diff --git a/PostProcess/extraction.sh b/PostProcess/extraction.sh index 641ee33..b663668 100755 --- a/PostProcess/extraction.sh +++ b/PostProcess/extraction.sh @@ -1,8 +1,7 @@ #!/bin/bash - ##NOTE AWK & GREP REPORT NO STDOUT IF NO MATCHES ARE FOUND (AWK DO NOT PRODUCE ANY OUTPUT) -set -e # trace all errors +# set -e # trace all errors #PARAM $1 is ref targets file #PARAM $2 is var targets file @@ -12,40 +11,21 @@ dir=$(dirname $1) #common targets extraction LC_ALL=C sort -u -T "$dir" $1 >$1.sort.txt LC_ALL=C sort -u -T "$dir" $2 >$2.sort.txt -cp $2.sort.txt $3.common_targets.txt -# LC_ALL=C comm -12 $1.sort.txt $2.sort.txt >$3.common_targets.txt +# cp $2.sort.txt $3.common_targets.txt +LC_ALL=C comm -12 $1.sort.txt $2.sort.txt >$3.common_targets.txt #Semi common targets extraction -touch $3.ref.chr_pos.txt -# LC_ALL=C awk '{print $4"\t"$5"\t"$6}' $1.sort.txt >$3.ref.chr_pos.txt -# LC_ALL=C grep -F -f $3.ref.chr_pos.txt $2.sort.txt >$3.semi_common_targets.txt #Seleziono i targets di var che hanno la stessa chr pos in ref -cp $2.sort.txt $3.semi_common_targets.txt +LC_ALL=C awk '{print $4"\t"$5"\t"$6}' $1.sort.txt >$3.ref.chr_pos.txt +LC_ALL=C awk 'NR==FNR{a[$0]; next} {if ($4"\t"$5"\t"$6 in a) print $0}' $3.ref.chr_pos.txt $2.sort.txt >$3.semi_common_targets.txt #Aggiungo i target del ref: ora semicommon contiene: target con iupac e targets senza iupac corrispondenti; # target senza iupac del var e corrispondenti target senza iupac del ref -# LC_ALL=C grep -F -f $3.ref.chr_pos.txt $1.sort.txt >>$3.semi_common_targets.txt -# LC_ALL=C sort -u -T "$dir" $3.semi_common_targets.txt >$3.semi_common_targets.sort.txt -cp $3.semi_common_targets.txt $3.semi_common_targets.sort.txt +LC_ALL=C awk 'NR==FNR{a[$0]; next} ($4"\t"$5"\t"$6) in a' $3.ref.chr_pos.txt $1.sort.txt >>$3.semi_common_targets.txt +LC_ALL=C sort -u -T "$dir" $3.semi_common_targets.txt >$3.semi_common_targets.sort.txt #unique variant targets extraction -# LC_ALL=C comm -13 $3.semi_common_targets.sort.txt 
$2.sort.txt >$3.unique_targets.txt -cp $2.sort.txt $3.unique_targets.txt +LC_ALL=C comm -13 $3.semi_common_targets.sort.txt $2.sort.txt >$3.unique_targets.txt mv $3.semi_common_targets.sort.txt $3.semi_common_targets.txt #Remove tmp files, NOTE maybe keep first two and change name to $1 and $2 ? rm $1.sort.txt $2.sort.txt $3.ref.chr_pos.txt - -# # OLD semi common targets extraction -# LC_ALL=C awk '{print $4"\t"$5}' $1 > ref.chr_pos.txt -# LC_ALL=C sort -T ./ -u ref.chr_pos.txt > ref.chr_pos.sort.txt -# LC_ALL=C grep -F -f ref.chr_pos.sort.txt $2 > $3.semi_common_targets.txt -# LC_ALL=C awk '{print $4"\t"$5}' $3.semi_common_targets.txt > semi_common.chr_pos.txt -# LC_ALL=C sort -T ./ -u semi_common.chr_pos.txt > semi_common.chr_pos.sort.txt -# LC_ALL=C grep -F -f semi_common.chr_pos.sort.txt $1 >> $3.semi_common_targets.txt -# LC_ALL=C sort -T ./ -u $3.semi_common_targets.txt > semi_common_targets.sort.txt - -# # OLD unique variant targets extraction -# #LC_ALL=C sort $2 > $2.sort.txt -# LC_ALL=C comm -13 semi_common_targets.sort.txt $2.sort.txt > $3.unique_targets.txt - -# rm ref.chr_pos.txt ref.chr_pos.sort.txt semi_common_targets.sort.txt $2.sort.txt semi_common.chr_pos.txt semi_common.chr_pos.sort.txt $1.sort.txt diff --git a/PostProcess/merge_alt_chr.sh b/PostProcess/merge_alt_chr.sh index cf854ac..86b7dd9 100755 --- a/PostProcess/merge_alt_chr.sh +++ b/PostProcess/merge_alt_chr.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -e # trace all failures +set -e # trace all failures dir=$(dirname $1) fileIn=$1 @@ -8,18 +8,17 @@ fileOut=$2 STARTTIME=$(date +%s) -chroms=($(cut -f 5 $fileIn | tail -n +2 | LC_ALL=C grep -v "_" | LC_ALL=C sort -T "$dir" | uniq)) +chroms=($(awk 'NR>1 && !/_/ {print $5}' $fileIn | LC_ALL=C sort -T "$dir" | uniq)) head -1 $fileIn >$fileOut for chrom in ${chroms[@]}; do echo $chrom - #awk -v "key=$chrom" '$5 == key {print($0)}' $fileIn > $fileIn.$chrom.ref - grep -F -w "$chrom" $fileIn >$fileIn.$chrom.ref + awk -v key="$chrom" '$5 == key' $fileIn
>$fileIn.$chrom.ref cut -f 3 $fileIn.$chrom.ref | LC_ALL=C sort -T "$dir" | uniq >$fileIn.$chrom.ref.targets - LC_ALL=C grep -F "${chrom}_" $fileIn >$fileIn.$chrom.alt # | awk '$5 ~ "_" {print($0)}' - LC_ALL=C grep -v -F -f $fileIn.$chrom.ref.targets $fileIn.$chrom.alt >$fileIn.$chrom.merged + awk -v chrom="$chrom" '$0 ~ chrom"_" {print($0)}' $fileIn >$fileIn.$chrom.alt + awk 'NR==FNR{a[$0];next} !($0 in a)' $fileIn.$chrom.ref.targets $fileIn.$chrom.alt >$fileIn.$chrom.merged rm $fileIn.$chrom.ref.targets rm $fileIn.$chrom.alt cat $fileIn.$chrom.ref $fileIn.$chrom.merged >>$fileOut diff --git a/PostProcess/merge_close_targets_cfd.sh b/PostProcess/merge_close_targets_cfd.sh index b970f07..e329593 100755 --- a/PostProcess/merge_close_targets_cfd.sh +++ b/PostProcess/merge_close_targets_cfd.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -e # capture any failure +# set -e # capture any failure fileIn=$1 fileOut=$2 @@ -28,8 +28,9 @@ echo "Sorting done in $(($ENDTIME - $STARTTIME)) seconds" # rm $fileIn.sorted.tmp echo "Merging contiguous targets" # python remove_contiguous_samples_cfd.py $fileIn.sorted $fileOut $thresh $chrom $position $total $true_guide $snp_info $cfd -python remove_contiguous_samples_cfd.py $fileIn $fileOut $thresh $chrom $position $total $true_guide $snp_info $cfd $sort_pivot $sorting_criteria_scoring $sorting_criteria || { - echo "CRISPRme ERROR: contigous SNP removal failed (script: ${0} line $((LINENO-1)))" >&2 - exit 1 -} +python remove_contiguous_samples_cfd.py $fileIn $fileOut $thresh $chrom $position $total $true_guide $snp_info $cfd $sort_pivot $sorting_criteria_scoring $sorting_criteria +# python remove_contiguous_samples_cfd_new.py $fileIn $fileOut $thresh $chrom $position $total $true_guide $snp_info $cfd $sort_pivot $sorting_criteria_scoring $sorting_criteria || { +# echo "CRISPRme ERROR: contigous SNP removal failed (script: ${0} line $((LINENO-1)))" >&2 +# exit 1 +# } # rm $fileIn.sorted diff --git a/PostProcess/post_analisi_indel.sh 
b/PostProcess/post_analisi_indel.sh index ebb2e52..16aaf9a 100755 --- a/PostProcess/post_analisi_indel.sh +++ b/PostProcess/post_analisi_indel.sh @@ -1,6 +1,13 @@ #!/bin/bash -set -e +# set -e + +output_folder=$1 +ref_folder=$2 +ref_name=$(basename $2) +vcf_folder=$3 #!/bin/bash + +set -e output_folder=$1 ref_folder=$2 @@ -28,22 +35,49 @@ echo "Processing INDELs results for $key, starting post-analysis" true_chr=$key fake_chr="fake$true_chr" -# LC_ALL=C grep -F -w $fake_chr "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" >"$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" +awk -v fake_chr="$fake_chr" '$0 ~ fake_chr {print}' "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" >"$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" +awk -v fake_chr="$fake_chr" '$0 ~ fake_chr {print}' "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" >"$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" +header=$(head -1 $output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt) +sed -i 1i"$header" "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" + +./analisi_indels_NNN.sh "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/${fake_chr}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}" "$annotation_file" 
"$dict_folder/log_indels_$vcf_name" "$ref_folder/$true_chr.fa" $mm $bDNA $bRNA "$guide_file" "$pam_file" "$output_folder" || { + echo "CRISPRme ERROR: indels analysis failed (script: ${0} line $((LINENO - 1)))" >&2 + exit 1 +} +rm "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" +rm "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" + +vcf_name=$(basename $3) +guide_file=$4 +guide_name=$(basename $4) +mm=$5 +bDNA=$6 +bRNA=$7 +annotation_file=$8 +annotation_name=$(basename $8) +pam_file=$9 +pam_name=$(basename $9) +# sampleID=${10} +dict_folder=${10} + +final_res=${11} +final_res_alt=${12} + +key=${13} + +echo "Processing INDELs results for $key, starting post-analysis" +true_chr=$key +fake_chr="fake$true_chr" + # create file to prevent void grep failing touch "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" # create file to prevent void grep failing touch "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" -LC_ALL=C grep -F -w $fake_chr "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" >"$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" +awk -v fake_chr="$fake_chr" '$0 ~ fake_chr {print}' "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" >"$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" header=$(head -1 $output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt) sed -i 1i"$header" 
"$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" ./analisi_indels_NNN.sh "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/${fake_chr}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}" "$annotation_file" "$dict_folder/log_indels_$vcf_name" "$ref_folder/$true_chr.fa" $mm $bDNA $bRNA "$guide_file" "$pam_file" "$output_folder" || { - echo "CRISPRme ERROR: indels analysis failed (script: ${0} line $((LINENO-1)))" >&2 - exit 1 + echo "CRISPRme ERROR: indels analysis failed (script: ${0} line $((LINENO - 1)))" >&2 + exit 1 } -# rm "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" -# rm "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" -# tail -n +2 "$output_folder/${fake_chr}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}_$key.bestMerge.txt" >> "$final_res" #"$output_folder/${fake_chr}_${guide_name}_${mm}_${bDNA}_${bRNA}.bestCFD.txt.tmp" -# tail -n +2 "$output_folder/${fake_chr}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}_$key.altMerge.txt" >> "$final_res_alt" #"$output_folder/${fake_chr}_${guide_name}_${mm}_${bDNA}_${bRNA}.altCFD.txt.tmp" -# rm "$output_folder/${fake_chr}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}_$key.bestMerge.txt" -# rm "$output_folder/${fake_chr}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}_$key.altMerge.txt" diff --git a/PostProcess/post_analisi_snp.sh b/PostProcess/post_analisi_snp.sh index c1c8423..cfb2d28 100755 --- a/PostProcess/post_analisi_snp.sh +++ b/PostProcess/post_analisi_snp.sh @@ 
-1,6 +1,6 @@ #!/bin/bash -set -e # trace all command failures +set -e # trace all command failures output_folder=$1 ref_folder=$2 @@ -23,16 +23,15 @@ final_res_alt=${12} key=${13} - echo "Processing SNPs for $key" -LC_ALL=C grep -F -w $key "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" >"$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" +awk -v key="$key" '$0 ~ ("(^|[^[:alnum:]_])" key "([^[:alnum:]_]|$)")' "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" >"$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" if ! [ -f "$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" ]; then cp "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" "$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" else - LC_ALL=C grep -F -w $key "$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" >"$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" + awk -v key="$key" '$0 ~ ("(^|[^[:alnum:]_])" key "([^[:alnum:]_]|$)")' "$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" >"$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" fi ./scriptAnalisiNNN_v3.sh "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.${key}" "$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.${key}"
"$output_folder/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}_$key" "$annotation_file" "${dict_folder}/my_dict_${key}.json" "${ref_folder}/${key}.fa" $mm $bDNA $bRNA "$guide_file" "$pam_file" "$output_folder" || { - echo "CRISPRme ERROR: SNP analysis failed (script: ${0} line $((LINENO-1)))" >&2 + echo "CRISPRme ERROR: SNP analysis failed (script: ${0} line $((LINENO - 1)))" >&2 exit 1 } rm "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" diff --git a/PostProcess/post_analysis_only.sh b/PostProcess/post_analysis_only.sh index 907730c..1d2c6e8 100755 --- a/PostProcess/post_analysis_only.sh +++ b/PostProcess/post_analysis_only.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -e # trace all failures +set -e # trace all failures #file for automated search of guide+pam in reference and variant genomes @@ -122,7 +122,7 @@ if [ "$vcf_name" != "_" ]; then fi ./pool_post_analisi_snp.py $output_folder $ref_folder $vcf_name $guide_file $mm $bDNA $bRNA $annotation_file $pam_file $sampleID $dict_folder $final_res $final_res_alt $ncpus || { - echo "CRISPRme ERROR: indels postprocessing failed (script: ${0} line $((LINENO-1)))" >&2 + echo "CRISPRme ERROR: indels postprocessing failed (script: ${0} line $((LINENO - 1)))" >&2 exit 1 } @@ -153,10 +153,10 @@ else fi for key in "${real_chroms[@]}"; do echo "Processing $key" - LC_ALL=C grep -F -w $key "$output_folder/crispritz_targets/${ref_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" >"$output_folder/crispritz_targets/${ref_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" + awk -v key="$key" '$0 ~ ("(^|[^[:alnum:]_])" key "([^[:alnum:]_]|$)")' "$output_folder/crispritz_targets/${ref_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" >"$output_folder/crispritz_targets/${ref_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" touch
"$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" ./scriptAnalisiNNN_v3.sh "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" "$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" "${ref_name}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}_$key" "$annotation_file" "_" "$ref_folder" $mm $bDNA $bRNA "$guide_file" "$pam_file" "$sampleID" "$output_folder" || { - echo "CRISPRme ERROR: analysis failed (script: ${0} line $((LINENO-1)))" >&2 + echo "CRISPRme ERROR: analysis failed (script: ${0} line $((LINENO - 1)))" >&2 exit 1 } rm "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" @@ -181,7 +181,7 @@ if [ "$vcf_name" != "_" ]; then echo "Post-analysis INDELs Start: "$(date +%F-%T) >>$output_folder/$log ./pool_post_analisi_indel.py $output_folder $ref_folder $vcf_folder $guide_file $mm $bDNA $bRNA $annotation_file $pam_file $sampleID "$output_folder/log_indels_$vcf_name" $final_res $final_res_alt $ncpus || { - echo "CRISPRme ERROR:indels analysis failed (script: ${0} line $((LINENO-1)))" >&2 + echo "CRISPRme ERROR:indels analysis failed (script: ${0} line $((LINENO - 1)))" >&2 exit 1 } echo "Post-analysis INDELs End: "$(date +%F-%T) >>$output_folder/$log @@ -198,7 +198,7 @@ cd "$starting_dir" echo "Merging Close Targets Start: "$(date +%F-%T) >>$output_folder/$log ./merge_close_targets_cfd.sh $final_res $final_res.trimmed $merge_t || { - echo "CRISPRme ERROR: CFD targets merge failed (script: ${0} line $((LINENO-1)))" >&2 + echo "CRISPRme ERROR: CFD targets merge failed (script: ${0} line $((LINENO - 1)))" >&2 exit 1 } mv $final_res.trimmed $final_res @@ -209,8 +209,8 @@ mv $final_res.trimmed.discarded_samples $final_res_alt echo "Merging Close Targets End: "$(date +%F-%T) 
>>$output_folder/$log echo "Merging Alternative Chromosomes Start: "$(date +%F-%T) >>$output_folder/$log -./merge_alt_chr.sh $final_res $final_res.chr_merged || { - echo "CRISPRme ERROR: alternative targets merge failed (script: ${0} line $((LINENO-1)))" >&2 +./merge_alt_chr.sh $final_res $final_res.chr_merged || { + echo "CRISPRme ERROR: alternative targets merge failed (script: ${0} line $((LINENO - 1)))" >&2 exit 1 } #rm $final_res.trimmed @@ -231,7 +231,7 @@ if ! [ -d "$output_folder/cfd_graphs" ]; then mkdir $output_folder/cfd_graphs fi ./assemble_cfd_graphs.py $output_folder || { - echo "CRISPRme ERROR: CFD graph creation failed (script: ${0} line $((LINENO-1)))" >&2 + echo "CRISPRme ERROR: CFD graph creation failed (script: ${0} line $((LINENO - 1)))" >&2 exit 1 } mv $output_folder/snps.CFDGraph.txt $output_folder/cfd_graphs diff --git a/PostProcess/remove_contiguous_samples_cfd.py b/PostProcess/remove_contiguous_samples_cfd.py index 4fc0cd9..ca80ac6 100755 --- a/PostProcess/remove_contiguous_samples_cfd.py +++ b/PostProcess/remove_contiguous_samples_cfd.py @@ -19,643 +19,661 @@ $ python remove_contiguous_samples_cfd.py input.txt output.txt 10 2 3 4 5 6 7 8 score """ - -from typing import List, Any, Tuple, Dict, Callable +from typing import List, Tuple, Dict, Callable, Union from io import TextIOWrapper from time import time import sys +import os -INPUT_ARG_COUNT = 12 # expected input args -FLAG_ALT_ONLY = 12 # target column for ALT target +INPUT_ARG_COUNT = 12 +FLAG_ALT_ONLY = 12 +SORTING_PIVOTS = ["score", "total"] SORTING_CRITERIA = {"mm+bulges": 0, "mm": 2, "bulges": 1} -def parse_input_args(args: List[str]) -> List[Any]: +class MergeTargets: """ - Parses the input arguments and returns a list of parsed values. + Represents a class for merging targets with specified criteria for sorting + and indexing. Args: - args (List[str]): The input arguments to be parsed. + args (List[str]): List of arguments containing input parameters for + target merging. 
Returns: - List[Any]: A list of parsed values. + None Raises: - ValueError: If no input argument is provided or if the number of input arguments is not equal to INPUT_ARG_COUNT. - ValueError: If the input argument types are invalid. + FileNotFoundError: If the input targets file is not found. + ValueError: If there are issues with the specified merge range, column + indices, or sorting criteria. + """ + + def __init__(self, args: List[str]) -> None: + self._infname = args[0] # input targets + if not os.path.exists(self._infname) or not os.path.isfile(self._infname): + raise FileNotFoundError(f"{self._infname} not found") + self._outfname = args[1] # output merged targets + self._rangebp = int(args[2]) # merge bp range + if self._rangebp <= 0: + raise ValueError(f"Invalid merge range ({self._rangebp})") + self._chromidx = int(args[3]) - 1 # chromosome index + if self._chromidx != 4: + raise ValueError( + f"Chromosome data is expected on column 5, got {self._chromidx}" + ) + self._posidx = int(args[4]) - 1 # position index + if self._posidx != 6: + raise ValueError( + f"Position data is expected on column 7, got {self._posidx}" + ) + self._mmb_idx = int(args[5]) - 1 # mm+bulges index + if self._mmb_idx != 10: + raise ValueError( + f"MM+Bulges data is expected on column 11, got {self._mmb_idx}" + ) + self._guide_idx = int(args[6]) - 1 # guide index + if self._guide_idx != 15: + raise ValueError( + f"Guide data is expected on column 16, got {self._guide_idx}" + ) + self._snp_idx = int(args[7]) - 1 # snp info index + if self._snp_idx != 18: + raise ValueError(f"SNP data is expected on column 19, got {self._snp_idx}") + self._score_idx = int(args[8]) - 1 # score index + if self._score_idx != 20: + raise ValueError( + f"Score data is expected on column 21, got {self._score_idx}" + ) + self._sort_criterion = args[9] # sorting criterion (pivot) + if self._sort_criterion not in SORTING_PIVOTS: + raise ValueError( + f"Allowed sort pivots: {SORTING_PIVOTS}, got 
{self._sort_criterion}" + ) + self._sorting_criteria_scoring = args[10] # sorting criteria (score is pivot) + self._sorting_criteria = args[11] # sorting criteria (mm+bulges is pivot) + + @property + def targets_fname(self) -> str: + return self._infname + + @property + def targets_fname_merged(self) -> str: + return self._outfname + + @property + def targets_fname_discarded(self) -> str: + return f"{self._outfname}.discarded_samples" + + @property + def rangebp(self) -> int: + return self._rangebp + + @property + def chromidx(self) -> int: + return self._chromidx + + @property + def posidx(self) -> int: + return self._posidx + + @property + def mmbidx(self) -> int: + return self._mmb_idx + + @property + def guideidx(self) -> int: + return self._guide_idx + + @property + def snpidx(self) -> int: + return self._snp_idx + + @property + def scoreidx(self) -> int: + return self._score_idx + + @property + def sort_pivot(self) -> str: + return self._sort_criterion + + @property + def sorting_criteria_scoring(self) -> str: + return self._sorting_criteria_scoring + + @property + def sorting_criteria(self) -> str: + return self._sorting_criteria + + +def parse_input_args(args: List[str]) -> MergeTargets: + """ + Parses the input arguments to create an instance of MergeTargets for handling + target merging based on the provided arguments. + + Args: + args (List[str]): List of input arguments to be parsed for target merging. - Examples: - >>> parse_input_args(['input.txt', 'output.txt', '10', '2', '3', '4', '5', '6', '7', '8', 'score', 'mm', 'mm+bulges,mm']) - ['input.txt', 'output.txt', 10, 1, 2, 3, 4, 5, 6, 'sort', 'mm', 'mm+bulges,mm'] + Returns: + MergeTargets: An instance of MergeTargets class for further processing. 
""" if not args: raise ValueError("No input argument provided") if len(args) != INPUT_ARG_COUNT: - raise ValueError(f"Expected {INPUT_ARG_COUNT} input arguments, got {len(args)}") - try: - # Use more descriptive variable names - ( - infname, - outfname, - rangebp, - chrom_idx, - position_idx, - mm_bul_count_idx, - guide_idx, - snp_info_idx, - score_idx, - sort_criterion, - sorting_criteria_scoring, - sorting_criteria, - ) = ( - args[0], - args[1], - int(args[2]), - int(args[3]) - 1, - int(args[4]) - 1, - int(args[5]) - 1, - int(args[6]) - 1, - int(args[7]) - 1, - int(args[8]) - 1, - args[9], - args[10], - args[11], - ) - except (ValueError, TypeError) as e: - raise ValueError("Invalid input argument types") from e - return [ - infname, - outfname, - rangebp, - chrom_idx, - position_idx, - mm_bul_count_idx, - guide_idx, - snp_info_idx, - score_idx, - sort_criterion, - sorting_criteria_scoring, - sorting_criteria, - ] - - -def read_raw_targets(targets_file: str) -> Tuple[str, List[List[str]]]: - """ - Reads and parses a targets file. + raise ValueError(f"Expected {INPUT_ARG_COUNT} arguments, got {len(args)}") + return MergeTargets(args) # parse input arguments + + +def initialize_targets_cluster(rangebp: int) -> Tuple[int, str, str, str, List[str]]: + """ + Initializes the targets cluster by setting the previous cluster position, + guide, chromosome, SNP, and an open starting cluster. Args: - targets_file (str): The path to the targets file. + rangebp (int): The range of base pairs for the cluster. Returns: - Tuple[str, List[List[str]]]: A tuple containing the header of the targets file and the parsed data. + Tuple[int, str, str, str, List[str]]: Tuple containing the initialized + cluster parameters. 
+ """ + + pos_prev = -(rangebp + 1) # previous cluster position + guide_prev = "" # previous cluster's guide + chrom_prev = "" # previous cluster's chromosome + snp_prev = "" # previous cluster snp + cluster = [] # open starting cluster + return pos_prev, guide_prev, chrom_prev, snp_prev, cluster - Raises: - OSError: If parsing the targets file fails. - Examples: - >>> read_raw_targets('targets.txt') - ('Header line', [['data1', 'data2'], ['data3', 'data4']]) +def parse_target(target: str) -> List[str]: """ + Parses the fields of a target string and returns a list of the parsed fields. - try: - with open(targets_file, mode="r") as infile: - header = infile.readline() # recover targets file header - targets_data = [line.split() for line in infile] # parse line fields - except OSError as e: - raise OSError(f"Parsing targets file {targets_file} failed!") from e - return header, targets_data + Args: + target (str): The target string to be parsed. + + Returns: + List[str]: List of parsed fields extracted from the target string. + """ + + # parse target's fields + return target.strip().split() def open_targets_cluster( guide: str, - prev_guide: str, + guide_prev: str, chrom: str, - prev_chrom: str, + chrom_prev: str, pos: int, - prev_pos: int, + pos_prev: int, rangebp: int, ) -> bool: """ - Determines whether to open a new targets cluster based on the given conditions. + Determines whether a new targets cluster should be opened based on changes + in guide, chromosome, and position within a specified range of base pairs. Args: - guide (str): The current guide. - prev_guide (str): The previous guide. - chrom (str): The current chromosome. - prev_chrom (str): The previous chromosome. - pos (int): The current position. - prev_pos (int): The previous position. - rangebp (int): The maximum range in base pairs for samples to be considered contiguous. + guide (str): Current guide. + guide_prev (str): Previous guide. + chrom (str): Current chromosome. 
+ chrom_prev (str): Previous chromosome. + pos (int): Current position. + pos_prev (int): Previous position. + rangebp (int): Range of base pairs for cluster merging. Returns: bool: True if a new targets cluster should be opened, False otherwise. """ - # if the condition is satisfied open a new tragets cluster - return guide != prev_guide or chrom != prev_chrom or pos - prev_pos > rangebp + return guide != guide_prev or chrom != chrom_prev or pos - pos_prev > rangebp -def merge_targets_by_snp( - cluster: List[List[str]], snp_info_idx: int, pos_idx: int, guide_idx: int -) -> Tuple[List[List[str]], Dict[Tuple[str, str], List[List[str]]]]: +def cluster_targets_by_pos_snp( + cluster: List[List[str]], guideidx: int, snpidx: int, posidx: int +) -> Tuple[List[List[str]], Dict[Tuple[int, str], List[List[str]]]]: """ - Merges targets in a cluster based on SNP information. + Clusters targets based on position and SNP information, separating them into + reference and alternative targets. Args: - cluster (List[List[str]]): The cluster of targets to be merged. - snp_info_idx (int): The index of the SNP info column in the target data. - pos_idx (int): The index of the position column in the target data. - guide_idx (int): The index of the guide column in the target data. + cluster (List[List[str]]): List of target clusters. + guideidx (int): Index of the guide. + snpidx (int): Index of the SNP information. + posidx (int): Index of the position. Returns: - Tuple[List[List[str]], Dict[Tuple[str, str], List[List[str]]]]: A tuple containing the merged reference targets and a dictionary of merged variant targets. - - Raises: - ValueError: If the cluster is empty. 
- - Examples: - >>> merge_targets_by_snp([['target1', 'n', 'pos1'], ['target2', 'A', 'pos2'], ['target3', 'A', 'pos1']], 1, 2, 3) - ([['target1', 'n', 'pos1']], {('pos2', 'A'): [['target2', 'A', 'pos2']], ('pos1', 'A'): [['target3', 'A', 'pos1']]}]) + Tuple[List[List[str]], Dict[Tuple[int, str], List[List[str]]]: A tuple + containing the reference targets list and a dictionary of alternative + targets grouped by position and SNP information. """ - if not cluster: - raise ValueError("Empty cluster, nothing to merge") - reference_targets = [] # REF targets list - variants = {} # dictionary storing ALT targets positions and info + # initialize reference and alternative targets lists + reference_targets, alternative_targets = [], {} + samplesidx, snpididx, afidx = guideidx - 2, snpidx - 2, snpidx - 1 for target in cluster: - if target[snp_info_idx] == "n": # reference target + if target[snpidx] == "n": # reference target reference_targets.append(target) - else: # ALT target - pos_t, snp_info_t = target[pos_idx], target[snp_info_idx] - if (pos_t, snp_info_t) in variants: # merge samples with identical targets - variants[(pos_t, snp_info_t)][0][guide_idx - 2] += ( - "," + target[guide_idx - 2] - ) - variants[(pos_t, snp_info_t)][0][snp_info_idx - 2] += ( - "," + target[snp_info_idx - 2] - ) - variants[(pos_t, snp_info_t)][0][snp_info_idx - 1] += ( - "," + target[snp_info_idx - 1] - ) - else: # add new ALT target - variants[(pos_t, snp_info_t)] = [target] - return reference_targets, variants + continue + # alternative target + pos_t, snp_info_t = target[posidx], target[snpidx] + if (pos_t, snp_info_t) in alternative_targets: + alternative_targets[(pos_t, snp_info_t)][0][ + samplesidx + ] += f",{target[samplesidx]}" # add samples + alternative_targets[(pos_t, snp_info_t)][0][ + snpididx + ] += f",{target[snpididx]}" # add snp ids + alternative_targets[(pos_t, snp_info_t)][0][ + afidx + ] += f",{target[afidx]}" # add allele frequencies + else: + alternative_targets[(pos_t, 
snp_info_t)] = [target] + return reference_targets, alternative_targets -def recover_alt_targets( - variants: Dict[Tuple[str, str], List[List[str]]], noref: bool +def recover_alternative_targets_list( + alternative_targets: Dict[Tuple[int, str], List[List[str]]], noref: bool ) -> List[List[str]]: """ - Recovers alternative targets from a dictionary of variants. + Recovers the list of alternative targets from a dictionary of alternative + targets grouped by position and SNP information. Args: - variants (Dict[Tuple[str, str], List[List[str]]]): A dictionary containing merged variant targets. - noref (bool): Flag indicating whether to mark non-reference targets. + alternative_targets (Dict[Tuple[int, str], List[List[str]]): Dictionary + of alternative targets grouped by position and SNP information. + noref (bool): Flag indicating whether to include reference targets. Returns: - List[List[str]]: A list of recovered alternative targets. - - Examples: - >>> recover_alt_targets({('pos2', 'A'): [['target2', 'A', 'pos2']], ('pos1', 'A'): [['target3', 'A', 'pos1']]}, True) - [['target2', 'A', 'pos2'], ['target3', 'A', 'pos1']] + List[List[str]]: List of alternative targets with updated flags. """ - alternative_targets = [] - for value in variants.values(): - for target in value: + alternative_targets_list = [] # initialize the list + for v in alternative_targets.values(): + for target in v: + # check whether the target is uniquely found on alternative sequences target[FLAG_ALT_ONLY] = "y" if noref else target[FLAG_ALT_ONLY] - alternative_targets.append(target) - return alternative_targets + alternative_targets_list.append(target) + return alternative_targets_list -def remove_duplicate_data(target: List[str], idx: int) -> str: +def remove_duplicates(target: List[str], idx: int) -> str: """ - Removes duplicate data from a target list. + Removes duplicates from a target list at the specified index and returns the + unique values as a comma-separated string. 
Args: - target (List[str]): The target list. - idx (int): The index of the target element to remove duplicates from. + target (List[str]): List of target values. + idx (int): Index of the target list to remove duplicates from. Returns: - str: The target element with duplicates removed. - - Examples: - >>> remove_duplicate_data(['target1', 'A,A,A,A,A,A,A,A,A'], 1) - 'A' + str: Comma-separated string of unique values after removing duplicates. """ return ",".join(set(target[idx].split(","))) -def remove_duplicate_alt_targets( - targets_alt: List[List[str]], redundant_idxs: List[int] +def unique_values( + targets: List[List[str]], snpidx: int, guideidx: int ) -> List[List[str]]: """ - Removes duplicate data from alternative targets. + Returns a list of targets with unique values for SNP information, SNP ID, + allele frequency, and samples. + + Args: + targets (List[List[str]]): List of target clusters. + snpidx (int): Index of the SNP information. + guideidx (int): Index of the guide. + + Returns: + List[List[str]]: List of targets with unique values for specified fields. 
+ """ + + snpididx, afidx, samplesidx = snpidx - 2, snpidx - 1, guideidx - 2 + targets_filt = [] + # remove duplicate values + for target in targets: + target[snpidx] = remove_duplicates(target, snpidx) # snp info + target[snpididx] = remove_duplicates(target, snpididx) # snp id + target[afidx] = remove_duplicates(target, afidx) # af + target[samplesidx] = remove_duplicates(target, samplesidx) # samples + targets_filt.append(target) # update target + return targets_filt + + +def construct_cluster( + cluster: List[List[str]], + guideidx: int, + snpidx: int, + posidx: int, + scoreidx: int, + mmbidx: int, + sort_pivot: str, + sorting_criteria: str, + sorting_criteria_scoring: str, + outfile: TextIOWrapper, + outfiledisc: TextIOWrapper, +) -> None: + """ + Constructs target clusters by processing and sorting reference and + alternative targets based on specified criteria, and writes the results to + output files. Args: - targets_alt (List[List[str]]): The list of alternative targets. - redundant_idxs (List[int]): The list of indices indicating the elements to remove duplicates from. + cluster (List[List[str]]): List of target clusters. + guideidx (int): Index of the guide. + snpidx (int): Index of the SNP information. + posidx (int): Index of the position. + scoreidx (int): Index of the score. + mmbidx (int): Index of the MM+Bulges. + sort_pivot (str): Sorting pivot. + sorting_criteria (str): Sorting criteria. + sorting_criteria_scoring (str): Sorting criteria for scoring. + outfile (TextIOWrapper): Output file for reported alignments. + outfiledisc (TextIOWrapper): Output file for alternative alignments. Returns: - List[List[str]]: The list of alternative targets with duplicates removed. 
+ None + """ + + if not cluster: # avoid crashes when cluster is empty + return + # recover reference and alternative targets + reference_targets, alternative_targets = cluster_targets_by_pos_snp( + cluster, guideidx, snpidx, posidx + ) + noref = ( + not reference_targets + ) # check whether only alternative targets have been found + # recover alternative targets list + alternative_targets = recover_alternative_targets_list(alternative_targets, noref) + # remove duplicates values on snp info, snp id, af, and samples columns + alternative_targets = unique_values(alternative_targets, snpidx, guideidx) + # sort targets + score = sort_pivot == SORTING_PIVOTS[0] + sorting_criteria = sorting_criteria_scoring if score else sorting_criteria + criteria = initialize_sorting_criteria(sorting_criteria, score, scoreidx, mmbidx) + if reference_targets: + reference_targets = sort_targets(reference_targets, criteria) + if alternative_targets: + alternative_targets = sort_targets(alternative_targets, criteria) + # write targets to reported or alternative alignments files + write_best_targets( + reference_targets, + alternative_targets, + noref, + score, + scoreidx, + mmbidx, + outfile, + outfiledisc, + ) - Examples: - >>> remove_duplicate_alt_targets([['target1', 'A,A,A,A,A,A,A,A,A'], ['target2', 'C,C,C,C,C,C,C,C,C']], [1]) - [['target1', 'A'], ['target2', 'C']] + +def write_best_targets( + reference_targets: List[List[str]], + alternative_targets: List[List[str]], + noref: bool, + score: bool, + scoreidx: int, + mmbidx: int, + outfile: TextIOWrapper, + outfiledisc: TextIOWrapper, +) -> None: """ + Writes the best targets to the reported alignments file and alternative + alignments file based on specified criteria. 
- targets_alt_polished = [] # remove duplicate data from ALT target - for target in targets_alt: - for idx in redundant_idxs: - target[idx] = remove_duplicate_data(target, idx) - targets_alt_polished.append(target) - assert len(targets_alt_polished) == len(targets_alt) - return targets_alt_polished + Args: + reference_targets (List[List[str]]): List of reference targets. + alternative_targets (List[List[str]]): List of alternative targets. + noref (bool): Flag indicating if no reference target is found. + score (bool): Flag indicating if scoring is used for comparison. + scoreidx (int): Index of the score. + mmbidx (int): Index of the MM+Bulges. + outfile (TextIOWrapper): Output file for reported alignments. + outfiledisc (TextIOWrapper): Output file for alternative alignments. + Returns: + None + """ -def sorting_score( - criteria: List[str], score_idx: int, mm_bul_count_idx: int -) -> Callable: + if noref: # no reference target found + target = alternative_targets.pop(0) # pop best target + target[scoreidx - 1] = str(len(alternative_targets)) + elif reference_targets and alternative_targets: # targets found in both + if score: + target = ( + alternative_targets.pop(0) + if float(reference_targets[0][scoreidx]) + < float(alternative_targets[0][scoreidx]) + else reference_targets.pop(0) + ) + elif int(alternative_targets[0][mmbidx]) < int(reference_targets[0][mmbidx]): + target = alternative_targets.pop(0) + else: + target = reference_targets.pop(0) + target[scoreidx - 1] = str(len(reference_targets) + len(alternative_targets)) + else: # no alternative target found + target = reference_targets.pop(0) + target[scoreidx - 1] = str(len(reference_targets)) + target = "\t".join(target) + outfile.write(f"{target}\n") + # write the alternative alignments targets + for target in reference_targets + alternative_targets: + target[scoreidx - 1] = str(len(reference_targets) + len(alternative_targets)) + target = "\t".join(target) + outfiledisc.write(f"{target}\n") + + +def 
sort_targets(targets: List[List[str]], criteria: Callable) -> List[List[str]]: """ - Creates a sorting key function based on the given criteria. Score has highest - priority and is always included in the sorting function + Sorts the list of targets based on the specified criteria function. Args: - criteria (List[str]): The sorting criteria. - score_idx (int): The index of the score column in the data. - mm_bul_count_idx (int): The index of the mismatch and bulge count column in the data. + targets (List[List[str]]): List of targets to be sorted. + criteria (Callable): Sorting criteria function. Returns: - Callable: A sorting key function that can be used with the `sorted` function. + List[List[str]]: Sorted list of targets based on the provided criteria. + """ + + return sorted(targets, key=criteria) - Examples: - >>> sorting_score(['mm+bulges', 'mm'], 8, 5) - . at 0x7f1234567890> + +def sorting_score(criteria: List[str], score_idx: int, mmb_idx: int) -> Callable: """ + Returns a sorting function based on the specified criteria for scoring, + MM+Bulges index, and multiple criteria. - if len(criteria) == 1: # one criterion + Args: + criteria (List[str]): List of sorting criteria. + score_idx (int): Index of the score. + mmb_idx (int): Index of the MM+Bulges. + + Returns: + Callable: Sorting function based on the provided criteria. 
+ """ + + if len(criteria) == 1: # single criterion return lambda x: ( -float(x[score_idx]), - int(x[mm_bul_count_idx - SORTING_CRITERIA[criteria[0]]]), + int(x[mmb_idx - SORTING_CRITERIA[criteria[0]]]), ) elif len(criteria) == 2: return lambda x: ( -float(x[score_idx]), - int(x[mm_bul_count_idx - SORTING_CRITERIA[criteria[0]]]), - int(x[mm_bul_count_idx - SORTING_CRITERIA[criteria[1]]]), + int(x[mmb_idx - SORTING_CRITERIA[criteria[0]]]), + int(x[mmb_idx - SORTING_CRITERIA[criteria[1]]]), ) # base case (all three ) return lambda x: ( -float(x[score_idx]), - int(x[mm_bul_count_idx - SORTING_CRITERIA[criteria[0]]]), - int(x[mm_bul_count_idx - SORTING_CRITERIA[criteria[1]]]), - int(x[mm_bul_count_idx - SORTING_CRITERIA[criteria[2]]]), + int(x[mmb_idx - SORTING_CRITERIA[criteria[0]]]), + int(x[mmb_idx - SORTING_CRITERIA[criteria[1]]]), + int(x[mmb_idx - SORTING_CRITERIA[criteria[2]]]), ) -def sorting_fewest(criteria: List[str], mm_bul_count_idx: int) -> Callable: +def sorting_fewest(criteria: List[str], mmb_idx: int) -> Callable: """ - Creates a sorting key function based on the input criteria. The sorting - criteria include mismatches, bulges, and mismatches+bulges + Returns a sorting function based on the specified criteria for the fewest + MM+Bulges index. Args: - criteria (List[str]): The sorting criteria. - mm_bul_count_idx (int): The index of the mismatch and bulge count column in the data. + criteria (List[str]): List of sorting criteria. + mmb_idx (int): Index of the MM+Bulges. Returns: - Callable: A sorting key function that can be used with the `sorted` function. - - Examples: - >>> sorting_fewest(['mm+bulges', 'mm'], 5) - . at 0x7f1234567890> + Callable: Sorting function based on the provided criteria for the fewest + MM+Bulges index. 
""" if len(criteria) == 1: # one criterion - return lambda x: (int(x[mm_bul_count_idx - SORTING_CRITERIA[criteria[0]]])) + return lambda x: (int(x[mmb_idx - SORTING_CRITERIA[criteria[0]]])) elif len(criteria) == 2: return lambda x: ( - int(x[mm_bul_count_idx - SORTING_CRITERIA[criteria[0]]]), - int(x[mm_bul_count_idx - SORTING_CRITERIA[criteria[1]]]), + int(x[mmb_idx - SORTING_CRITERIA[criteria[0]]]), + int(x[mmb_idx - SORTING_CRITERIA[criteria[1]]]), ) # base case (all three ) return lambda x: ( - int(x[mm_bul_count_idx - SORTING_CRITERIA[criteria[0]]]), - int(x[mm_bul_count_idx - SORTING_CRITERIA[criteria[1]]]), - int(x[mm_bul_count_idx - SORTING_CRITERIA[criteria[2]]]), + int(x[mmb_idx - SORTING_CRITERIA[criteria[0]]]), + int(x[mmb_idx - SORTING_CRITERIA[criteria[1]]]), + int(x[mmb_idx - SORTING_CRITERIA[criteria[2]]]), ) -def define_sorting_criteria( - sorting_criteria: str, score: bool, score_idx: int, mm_bul_count_idx: int +def initialize_sorting_criteria( + sorting_criteria: str, score: bool, score_idx: int, mmb_idx: int ) -> Callable: """ - Defines the sorting criteria based on the given parameters. + Initializes the sorting criteria function based on the specified sorting + criteria, score flag, score index, and MM+Bulges index. Args: - sorting_criteria (str): The sorting criteria as a comma-separated string. - score (bool): Flag indicating whether to prioritize sorting by score. - score_idx (int): The index of the score column in the data. - mm_bul_count_idx (int): The index of the mismatch and bulge count column in the data. + sorting_criteria (str): Comma-separated string of sorting criteria. + score (bool): Flag indicating if scoring is used for sorting. + score_idx (int): Index of the score. + mmb_idx (int): Index of the MM+Bulges. Returns: - Callable: A sorting key function based on the defined criteria. - - Raises: - ValueError: If the number of sorting criteria exceeds 3. - ValueError: If unknown sorting criteria are provided. 
- - Examples: - >>> define_sorting_criteria('mm+bulges,mm', True, 8, 5) - . at 0x7f1234567890> + Callable: Sorting criteria function based on the provided parameters. """ criteria = sorting_criteria.split(",") - # if score: - # print("score") - # else: - # print("fewest") - # print(criteria) if len(criteria) > 3: raise ValueError("Mismatching sorting criteria selected") if any(c not in SORTING_CRITERIA for c in criteria): raise ValueError("Unknown sorting criteria") - if score: # sort by criteria (priority to score) - return sorting_score(criteria, score_idx, mm_bul_count_idx) - return sorting_fewest(criteria, mm_bul_count_idx) # sort by criteria - - -def report_best_targets( - targets: List[List[str]], score_idx: int, outfile: TextIOWrapper -) -> List[List[str]]: - """ - Reports the best targets and writes them to an output file. - - Args: - targets (List[List[str]]): The list of targets. - score_idx (int): The index of the score column in the data. - outfile (TextIOWrapper): The output file to write the best target. - - Returns: - List[List[str]]: The remaining targets after removing the best target. 
- - Examples: - >>> report_best_targets([['target1', 'A', 'score1'], ['target2', 'C', 'score2']], 2, outfile) - [['target2', 'C', 'score2']] - """ - - # count remaining targets - targets[0][score_idx - 1] = str(len(targets) - 1) - reportline = "\t".join(targets[0]) # write best target to merge file - outfile.write(f"{reportline}\n") - best = targets.pop(0) # pop best target from list - return targets + if score: + return sorting_score(criteria, score_idx, mmb_idx) + return sorting_fewest(criteria, mmb_idx) -def report_best_targets_ref_alt( - targets_ref: List[List[str]], - targets_alt: List[List[str]], - score: bool, - mm_bul_count_idx: int, - score_idx: int, - outfile: TextIOWrapper, -) -> Tuple[List[List[str]], List[List[str]]]: +def split_targets(input_args: MergeTargets) -> None: """ - Reports the best targets (reference and alternative) and writes them to an output file. + Splits and processes the input targets file into best and alternative targets + files based on specified criteria. Args: - targets_ref (List[List[str]]): The list of reference targets. - targets_alt (List[List[str]]): The list of alternative targets. - score (bool): Flag indicating whether to prioritize sorting by score. - mm_bul_count_idx (int): The index of the mismatch and bulge count column in the target data. - score_idx (int): The index of the score column in the target data. - outfile (TextIOWrapper): The output file to write the best targets. - - Returns: - Tuple[List[List[str]], List[List[str]]]: A tuple containing the remaining reference targets and the remaining alternative targets. 
- - Examples: - >>> report_best_targets_ref_alt([['target1', 'A', 'score1']], [['target2', 'C', 'score2']], True, 6, 5, outfile) - """ - - # compare values (score or mm+bulges) to determine the best target - if score: # compare scores - if float(targets_ref[0][score_idx]) >= float(targets_alt[0][score_idx]): - targets_ref[0][score_idx - 1] = str( - len(targets_ref) + len(targets_alt) - 1 - ) # recover remaining targets - reportline = "\t".join(targets_ref[0]) - outfile.write(f"{reportline}\n") - best = targets_ref.pop(0) # remove best target - else: - targets_alt[0][score_idx - 1] = str( - len(targets_ref) + len(targets_alt) - 1 - ) # recover remaining targets - reportline = "\t".join(targets_alt[0]) - outfile.write(f"{reportline}\n") - best = targets_alt.pop(0) # remove best target - elif int(targets_ref[0][mm_bul_count_idx]) <= int(targets_alt[0][mm_bul_count_idx]): - targets_ref[0][score_idx - 1] = str( - len(targets_ref) + len(targets_alt) - 1 - ) # recover remaining targets - reportline = "\t".join(targets_ref[0]) - outfile.write(f"{reportline}\n") - best = targets_ref.pop(0) # remove best target - else: - targets_alt[0][score_idx - 1] = str( - len(targets_ref) + len(targets_alt) - 1 - ) # recover remaining targets - reportline = "\t".join(targets_alt[0]) - outfile.write(f"{reportline}\n") - best = targets_alt.pop(0) # remove best target - return targets_ref, targets_alt - - -def report_remaining_targets( - targets_ref: List[List[str]], - targets_alt: List[List[str]], - score_idx: int, - outfile: TextIOWrapper, -) -> None: - """ - Reports the discarded targets (reference and alternative) and writes them to an output file. - - Args: - targets_ref (List[List[str]]): The list of remaining reference targets. - targets_alt (List[List[str]]): The list of remaining alternative targets. - score_idx (int): The index of the score column in the data. - outfile (TextIOWrapper): The output file to write the remaining targets. 
+ input_args (MergeTargets): Object containing input arguments for target + splitting and processing. Returns: None - Examples: - >>> report_remaining_targets([['target1', 'A', 'score1']], [['target2', 'C', 'score2']], 3, outfile) - """ - - for i, target in enumerate(targets_ref): - targets_ref[i][score_idx - 1] = str(len(targets_ref) + len(targets_alt) - 1) - reportline = "\t".join(target) - outfile.write(f"{reportline}\n") - for i, target in enumerate(targets_alt): - targets_alt[i][score_idx - 1] = str(len(targets_ref) + len(targets_alt) - 1) - reportline = "\t".join(target) - outfile.write(f"{reportline}\n") - - -def recover_best_targets( - cluster: List[List[str]], - snp_info_idx: int, - pos_idx: int, - guide_idx: int, - score_idx: int, - mm_bul_count_idx: int, - criterion: str, - sorting_criteria_scoring: str, - sorting_criteria: str, - outfile: TextIOWrapper, - outfile_discarded: TextIOWrapper, -) -> None: - """ - Recovers the best targets from a cluster and writes them to an output file. - - Args: - cluster (List[List[str]]): The cluster of targets. - snp_info_idx (int): The index of the SNP info column in the target data. - pos_idx (int): The index of the position column in the target data. - guide_idx (int): The index of the guide column in the target data. - score_idx (int): The index of the score column in the target data. - mm_bul_count_idx (int): The index of the mismatch and bulge count column in the target data. - criterion (str): The criterion for selecting the best targets (score or fewest). - sorting_criteria_scoring (str): The sorting criteria for the targets (scoring has highest priority). - sorting_criteria (str): The sorting criteria for the targets. - outfile (TextIOWrapper): The output file to write the best targets. - outfile_discarded (TextIOWrapper): The output file to write the discarded targets. 
- - Returns: - None - - Examples: - >>> recover_best_targets(cluster, 2, 3, 4, 5, 6, 'score', 'mm+bulges,mm', outfile, outfile_discarded) + Raises: + OSError: If an error occurs during the process of splitting and processing + the targets. """ - if not cluster: - return # avoids potential crash at first iteration when opened new cluster - targets_ref, targets_alt_dict = merge_targets_by_snp( - cluster, snp_info_idx, pos_idx, guide_idx - ) - noref = not targets_ref # only ALT targets - targets_alt = remove_duplicate_alt_targets( - recover_alt_targets(targets_alt_dict, noref), - [snp_info_idx, snp_info_idx - 2, snp_info_idx - 1, guide_idx - 2], - ) # recover ALT targets - # sort by CFD or CRISTA score (no impact when using non Cas9 proteins) - if criterion == "score": - # recover sorting criteria order (score has priority) - criteria = define_sorting_criteria( - sorting_criteria_scoring, True, score_idx, mm_bul_count_idx - ) - else: # sort on mm and bulges only (fewest) - # recover sorting criteria order (no score) - criteria = define_sorting_criteria( - sorting_criteria, False, score_idx, mm_bul_count_idx - ) - if targets_ref: - targets_ref = sorted(targets_ref, key=criteria) - if targets_alt: - targets_alt = sorted(targets_alt, key=criteria) - # recover best targets - if noref: # only alt targets - targets_alt = report_best_targets(targets_alt, score_idx, outfile) - elif targets_ref and targets_alt: # both ref and alt targets - score = criterion == "score" - targets_ref, targets_alt = report_best_targets_ref_alt( - targets_ref, targets_alt, score, mm_bul_count_idx, score_idx, outfile - ) - else: # only ref targets - targets_ref = report_best_targets(targets_ref, score_idx, outfile) - report_remaining_targets(targets_ref, targets_alt, score_idx, outfile_discarded) + # open best, and alternative targets files + try: + with open(input_args.targets_fname, mode="r") as infile, open( + input_args.targets_fname_merged, mode="w" + ) as outfile, 
open(input_args.targets_fname_discarded, mode="w") as discfile: + # write header to outfiles + header = infile.readline().strip() + outfile.write(f"{header}\n") + discfile.write(f"{header}\n") + # begin dividing targets + ( + pos_prev, + guide_prev, + chrom_prev, + snp_prev, + cluster, + ) = initialize_targets_cluster(input_args.rangebp) + for line in infile: + target = parse_target(line) + guide, chrom, pos, snp = ( + target[input_args.guideidx], + target[input_args.chromidx], + int(target[input_args.posidx]), + target[input_args.snpidx], + ) + if open_targets_cluster( + guide, + guide_prev, + chrom, + chrom_prev, + pos, + pos_prev, + input_args.rangebp, + ): + construct_cluster( + cluster, + input_args.guideidx, + input_args.snpidx, + input_args.posidx, + input_args.scoreidx, + input_args.mmbidx, + input_args.sort_pivot, + input_args.sorting_criteria, + input_args.sorting_criteria_scoring, + outfile, + discfile, + ) + cluster = [target] + else: + cluster.append(target) + guide_prev = guide + pos_prev = pos + chrom_prev = chrom + snp_prev = snp + construct_cluster( + cluster, + input_args.guideidx, + input_args.snpidx, + input_args.posidx, + input_args.scoreidx, + input_args.mmbidx, + input_args.sort_pivot, + input_args.sorting_criteria, + input_args.sorting_criteria_scoring, + outfile, + discfile, + ) + except IOError as e: + raise OSError("An error occurred while merging contiguous targets") from e def merge_targets() -> None: """ - Merges targets based on specified criteria and writes the merged targets to output files. + Merges targets by parsing input arguments, splitting the targets, and + displaying the completion time. Returns: None - - Raises: - OSError: If the targets merge fails. 
- - Examples: - >>> merge_targets() """ - start = time() # merging start time + start = time() # start time point input_args = parse_input_args(sys.argv[1:]) - # recover raw targets file header and data - header, targets_data_raw = read_raw_targets(input_args[0]) - # define fnames for merged target files (kept and discarded) - targets_file_merged = input_args[1] - targets_file_merged_discarded = f"{targets_file_merged}.discarded_samples" - # initialize target clustering values - prev_pos, prev_guide, prev_chrom = -(input_args[2] + 1), "", "" - cluster = [] # targets clusters - try: - with open(targets_file_merged, mode="w") as outfile: - with open(targets_file_merged_discarded, mode="w") as outfile_discarded: - # write headers to target merged files - outfile.write(header) - outfile_discarded.write(header) - for target_data in targets_data_raw: - if open_targets_cluster( - target_data[input_args[6]], - prev_guide, - target_data[input_args[3]], - prev_chrom, - int(target_data[input_args[4]]), - prev_pos, - input_args[2], - ): - recover_best_targets( - cluster, - input_args[7], - input_args[4], - input_args[6], - input_args[8], - input_args[5], - input_args[9], - input_args[10], - input_args[11], - outfile, - outfile_discarded, - ) - cluster = [target_data] - else: - cluster.append(target_data) - prev_guide, prev_pos, prev_chrom = ( - target_data[input_args[6]], - int(target_data[input_args[4]]), - target_data[input_args[3]], - ) - recover_best_targets( - cluster, - input_args[7], - input_args[4], - input_args[6], - input_args[8], - input_args[5], - input_args[9], - input_args[10], - input_args[11], - outfile, - outfile_discarded, - ) - except OSError as e: - raise OSError(f"Targets merge failed for {input_args[0]}") from e - sys.stdout.write(f"Merge completed in: {(time() - start):.2f}s\n") + split_targets(input_args) + sys.stdout.write(f"Merge completed in {(time() - start)}s\n") if __name__ == "__main__": diff --git a/PostProcess/scriptAnalisiNNN_v3.sh 
b/PostProcess/scriptAnalisiNNN_v3.sh index c227ddf..afc1f86 100755 --- a/PostProcess/scriptAnalisiNNN_v3.sh +++ b/PostProcess/scriptAnalisiNNN_v3.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -e # trace all commands failures +set -e # trace all commands failures # Script per l'analisi dei targets della ricerca REF e ENR con PAM NNN # Il file dei targets della ricerca sul genoma reference si chiama $REFtargets -> INPUT $1 @@ -42,7 +42,7 @@ echo $jobid # 1) Rimozione duplicati, estrazione semicommon e unique e creazione file total #echo 'Creazione file .total.txt' ./extraction.sh $REFtargets $ENRtargets $jobid || { - echo "CRISPRme ERROR: targets extraction failed (script: ${0} line $((LINENO-1)))" >&2 + echo "CRISPRme ERROR: targets extraction failed (script: ${0} line $((LINENO - 1)))" >&2 exit 1 } # OUTPUT $jobid.common_targets.txt -> Non usato # $jobid.semi_common_targets.txt @@ -63,7 +63,7 @@ rm $jobid.semi_common_targets.minmaxdisr.txt #echo 'Creazione cluster del file .total.txt' # 3) Clustering ./cluster.dict.py $jobid.total.txt 'no' 'True' 'True' "$guide_file" 'total' 'orderChr' || { - echo "CRISPRme ERROR: targets clustering failed (script: ${0} line $((LINENO-1)))" >&2 + echo "CRISPRme ERROR: targets clustering failed (script: ${0} line $((LINENO - 1)))" >&2 exit 1 } # OUTPUT $jobid.total.cluster.txt rm $jobid.total.txt @@ -95,7 +95,7 @@ rm $jobid.total.txt # ./simpleAnalysis_v3.py "$annotationfile" "$jobid.total.cluster.txt" "$jobid" "$dictionaries" "$pam_file" $mismatch "$referencegenome" "$guide_file" $bulgesDNA $bulgesRNA ./new_simple_analysis.py "$referencegenome" "$dictionaries" "$jobid.total.cluster.txt" "${pam_file}" "$jobid" "$mismatch" || { - echo "CRISPRme ERROR: annotation analysis failed (script: ${0} line $((LINENO-1)))" >&2 + echo "CRISPRme ERROR: annotation analysis failed (script: ${0} line $((LINENO - 1)))" >&2 exit 1 } # cp $jobid.bestCFD.txt $jobid.bestCFD.txt.check_analysis @@ -130,15 +130,15 @@ echo 'Sorting and adjusting results' #adjustin 
columns to have the correct order and remove uncessary ones ./adjust_cols.py $jobid.bestCFD.txt || { - echo "CRISPRme ERROR: CFD report cleaning failed (script: ${0} line $((LINENO-1)))" >&2 + echo "CRISPRme ERROR: CFD report cleaning failed (script: ${0} line $((LINENO - 1)))" >&2 exit 1 } ./adjust_cols.py $jobid.bestmmblg.txt || { - echo "CRISPRme ERROR: mismatch+bulges report cleaning failed (script: ${0} line $((LINENO-1)))" >&2 + echo "CRISPRme ERROR: mismatch+bulges report cleaning failed (script: ${0} line $((LINENO - 1)))" >&2 exit 1 } ./adjust_cols.py $jobid.bestCRISTA.txt || { - echo "CRISPRme ERROR: CRISTA report cleaning failed (script: ${0} line $((LINENO-1)))" >&2 + echo "CRISPRme ERROR: CRISTA report cleaning failed (script: ${0} line $((LINENO - 1)))" >&2 exit 1 } # ./adjust_cols.py $jobid.altmmblg.txt diff --git a/PostProcess/submit_job_automated_new_multiple_vcfs.sh b/PostProcess/submit_job_automated_new_multiple_vcfs.sh index f7feac4..c68621d 100755 --- a/PostProcess/submit_job_automated_new_multiple_vcfs.sh +++ b/PostProcess/submit_job_automated_new_multiple_vcfs.sh @@ -532,9 +532,9 @@ while read samples; do if [ -z "$samples" ]; then continue fi - # tail -n +2 $samples >> "$output_folder/.sampleID.txt" - grep -v '#' "${current_working_directory}/samplesIDs/$samples" >>"$output_folder/.sampleID.txt" -done <$sampleID + awk '!/^#/ { print }' "${current_working_directory}/samplesIDs/$samples" >>"$output_folder/.sampleID.txt" +done <"$sampleID" +# done <$sampleID # if [ "$vcf_name" != "_" ]; then touch "$output_folder/.sampleID.txt" sed -i 1i"#SAMPLE_ID\tPOPULATION_ID\tSUPERPOPULATION_ID\tSEX" "$output_folder/.sampleID.txt" || { diff --git a/TODO.txt b/TODO.txt index cd0ab44..d490904 100755 --- a/TODO.txt +++ b/TODO.txt @@ -1,9 +1 @@ --change indexing for non-PAM search (accept no PAM as input, PAM len=0) -use this as notes: - positive strand - standard FASTA - negative strand - reverse complemented FASTA - try to use only one index for any sequence 
with len<=fix_length(30-40bp) - (NO SIGNIFICANT IMPROVEMENTE IN SPEED OR SPACE SAVING WITH THIS OPERATION) - --fix bed file reading in GUI --big report creation with summarized data for single guides (guide|on-target|total_count|cfd_score|crista_score) +TEST WITH AWK ON BIGGER DATA FOR EXTRACTION, SELECTION AND FILTERING (MAYBE ISSUE WITH TIME EXEC??) \ No newline at end of file diff --git a/crisprme.py b/crisprme.py index 561d3f6..666e653 100755 --- a/crisprme.py +++ b/crisprme.py @@ -9,7 +9,7 @@ import re -version = "2.1.4" # CRISPRme version; TODO: update when required +version = "2.1.5" # CRISPRme version; TODO: update when required __version__ = version script_path = os.path.dirname(os.path.abspath(__file__)) @@ -742,6 +742,16 @@ def complete_search(): genome_idx = pam_char + "_" + str(bMax) + "_" + genome_ref ref_comparison = False # os.chdir(script_path) + # write crisprme version to file + with open(outputfolder + "/.command_line.txt", "w") as p: + p.write("input_command\t" + " ".join(sys.argv[:])) + p.write("\n") + p.close() + with open(outputfolder + "/.version.txt", "w") as p: + p.write("crisprme_version\t" + __version__) + p.write("\n") + p.close() + # write parameters to file with open(outputfolder + "/Params.txt", "w") as p: p.write("Genome_selected\t" + genome_ref.replace(" ", "_") + "\n") p.write("Genome_ref\t" + genome_ref + "\n") @@ -1213,8 +1223,9 @@ def complete_test_crisprme(): process. """ - if "--help" in input_args: + if "--help" in input_args or len(input_args) < 3: print_help_complete_test() + sys.exit(1) chrom = "all" if "--chrom" in input_args: # individual chrom to test try: