initial commit, complete config file, add test data

emmarousseau · Aug 12, 2024 · 924eeee · 924eeee
1 parent 1679c59
commit 924eeee
Show file tree

Hide file tree

Showing 8 changed files with 8,064 additions and 0 deletions.
diff --git a/src/bbmap_bbsplit/config.vsh.yaml b/src/bbmap_bbsplit/config.vsh.yaml
@@ -0,0 +1,101 @@
+namespace: "bbmap"
+name: "bbmap_bbsplit"
+description: |
+  Split sequencing reads by mapping them to multiple references simultaneously.
+links:
+  homepage: https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/
+  documentation: https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbmap-guide/
+  repository: https://github.com/BioInfoTools/BBMap/blob/master/sh/bbsplit.sh
+
+license: BBTools Copyright (c) 2014
+
+# Argument names below must match the par_* variables read by script.sh
+# and the flags passed by test.sh.
+argument_groups:
+- name: "Input"
+  arguments:
+  - name: "--id"
+    type: string
+    description: Sample ID
+  - name: "--paired"
+    type: boolean
+    default: false
+    description: Paired fastq files or not?
+  - name: "--input"
+    type: file
+    multiple: true
+    # script.sh splits the value on ",", so the separator is set explicitly.
+    multiple_sep: ","
+    description: Input fastq files, either one or two (paired), separated by ",".
+    example: sample.fastq
+  - name: "--primary_ref"
+    type: file
+    description: Primary reference FASTA
+  - name: "--bbsplit_fasta_list"
+    type: file
+    description: |
+      Path to comma-separated file containing a list of reference genomes to filter reads against with BBSplit.
+      Each line has the form "name,path/to/reference.fasta".
+  - name: "--built_bbsplit_index"
+    type: file
+    description: |
+      Directory with a previously built BBSplit index. When provided, it is used instead of
+      --primary_ref and --bbsplit_fasta_list.
+    example: bbsplit_index
+  - name: "--only_build_index"
+    type: boolean
+    default: false
+    description: true = only build index; false = mapping
+
+- name: "Output"
+  arguments:
+  - name: "--fastq_1"
+    type: file
+    description: |
+      Output file for read 1.
+    direction: output
+    example: read_1.fastq
+  - name: "--fastq_2"
+    type: file
+    description: |
+      Output file for read 2.
+    direction: output
+    example: read_2.fastq
+  # NOTE(review): script.sh currently writes only --fastq_1, --fastq_2 and (in
+  # index-building mode) --bbsplit_index; --primary_fastq, --all_fastq and
+  # --stats are declared but never written. The --all_fastq description is
+  # identical to the one for --primary_fastq; confirm the intended content.
+  - name: "--primary_fastq"
+    type: file
+    description: |
+      Output reads that map to the primary reference.
+    direction: output
+    example: primary.fastq.gz
+  - name: "--all_fastq"
+    type: file
+    description: |
+      Output reads that map to the primary reference.
+    direction: output
+    example: all.fastq.gz
+  - name: "--bbsplit_index"
+    type: file
+    description: |
+      Directory with index files.
+    direction: output
+    example: bbsplit
+  - name: "--stats"
+    type: file
+    description: |
+      Tab-delimited text file containing mapping statistics.
+    direction: output
+    example: stats.txt
+
+resources:
+  - type: bash_script
+    path: script.sh
+
+test_resources:
+  - type: bash_script
+    path: test.sh
+  - path: test_data
+
+engines:
+- type: docker
+  image: ubuntu:22.04
+  setup:
+    - type: docker
+      run: |
+        apt-get update && \
+        apt-get install -y build-essential openjdk-17-jdk wget tar && \
+        wget --no-check-certificate https://sourceforge.net/projects/bbmap/files/BBMap_39.01.tar.gz && \
+        tar xzf BBMap_39.01.tar.gz && \
+        cp -r bbmap/* /usr/local/bin
+    - type: docker
+      run: |
+        echo "${meta_functionality_name}: bbmap: $(bbversion.sh | grep -v "Duplicate cpuset")" > /var/software_versions.txt
+runners:
+  - type: executable
+  - type: nextflow
diff --git a/src/bbmap_bbsplit/script.sh b/src/bbmap_bbsplit/script.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+set -eo pipefail
+
+function clean_up {
+    rm -rf "$tmpdir"
+}
+trap clean_up EXIT 
+
+if [ ! -d "$par_built_bbsplit_index" ]; then
+    other_refs=()
+    while IFS="," read -r name path 
+    do
+        other_refs+=("ref_$name=$path")
+    done < "$par_bbsplit_fasta_list"
+fi
+
+if $par_only_build_index; then
+    if [ -f "$par_primary_ref" ] && [ ${#other_refs[@]} -gt 0 ]; then
+        bbsplit.sh \
+            ref_primary="$par_primary_ref" "${other_refs[@]}" \
+            path=$par_bbsplit_index \
+            threads=${meta_cpus:-1}
+    else
+        echo "ERROR: Please specify as input a primary fasta file along with names and paths to non-primary fasta files."
+    fi
+else
+    IFS="," read -ra input <<< "$par_input"
+    tmpdir=$(mktemp -d "$meta_temp_dir/$meta_functionality_name-XXXXXXXX")
+    index_files=''
+    if [ -d "$par_built_bbsplit_index" ]; then
+        index_files="path=$par_built_bbsplit_index"
+    elif [ -f "$par_primary_ref" ] && [ ${#other_refs[@]} -gt 0 ]; then
+        index_files="ref_primary=$par_primary_ref ${other_refs[@]}"
+    else
+        echo "ERROR: Please either specify a BBSplit index as input or a primary fasta file along with names and paths to non-primary fasta files."
+    fi
+    if $par_paired; then
+        bbsplit.sh \
+            $index_files \
+            threads=${meta_cpus:-1} \
+            in=${input[0]} \
+            in2=${input[1]} \
+            basename=${tmpdir}/%_#.fastq \
+            refstats=bbsplit_stats.txt
+        read1=$(find $tmpdir/ -iname primary_1*)
+        read2=$(find $tmpdir/ -iname primary_2*)
+        cp $read1 $par_fastq_1
+        cp $read2 $par_fastq_2
+    else
+        bbsplit.sh \
+            $index_files \
+            threads=${meta_cpus:-1} \
+            in=${input[0]} \
+            basename=${tmpdir}/%.fastq \
+            refstats=bbsplit_stats.txt
+        read1=$(find $tmpdir/ -iname primary*)
+        cp $read1 $par_fastq_1
+    fi
+fi
+
+exit 0
diff --git a/src/bbmap_bbsplit/test.sh b/src/bbmap_bbsplit/test.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+
+echo ">>> Test $meta_functionality_name"
+
+cat > bbsplit_fasta_list.txt << HERE
+sarscov2,${meta_resources_dir}/test_data/sarscov2.fa
+human,${meta_resources_dir}/test_data/human.fa
+HERE
+
+echo ">>> Building BBSplit index"
+"${meta_executable}" \
+  --primary_ref "${meta_resources_dir}/test_data/genome.fasta" \
+  --bbsplit_fasta_list "bbsplit_fasta_list.txt" \
+  --only_build_index true \
+  --bbsplit_index "BBSplit_index" 
+
+echo ">>> Check whether output exists"
+[ ! -d "BBSplit_index" ] && echo "BBSplit index does not exist!" && exit 1
+[ -z "$(ls -A 'BBSplit_index')" ] && echo "BBSplit index is empty!" && exit 1
+
+echo ">>> Filtering ribosomal RNA reads"
+
+echo ">>> Testing with single-end reads and primary/non-primary FASTA files"
+"${meta_executable}" \
+  --paired false \
+  --input "${meta_resources_dir}/test_data/SRR6357070_1.fastq.gz" \
+  --only_build_index false \
+  --primary_ref "${meta_resources_dir}/test_data/genome.fasta" \
+  --bbsplit_fasta_list "bbsplit_fasta_list.txt" \
+  --fastq_1 "filtered_SRR6357070_1.fastq.gz"
+
+echo ">>> Check whether output exists"
+[ ! -f "filtered_SRR6357070_1.fastq.gz" ] && echo "Filtered reads file does not exist!" && exit 1
+[ ! -s "filtered_SRR6357070_1.fastq.gz" ] && echo "Filtered reads file is empty!" && exit 1
+
+rm filtered_SRR6357070_1.fastq.gz
+
+echo ">>> Testing with paired-end reads and primary/non-primary FASTA files"
+"${meta_executable}" \
+  --paired true \
+  --input "${meta_resources_dir}/test_data/SRR6357070_1.fastq.gz,${meta_resources_dir}/test_data/SRR6357070_2.fastq.gz" \
+  --only_build_index false \
+  --primary_ref "${meta_resources_dir}/test_data/genome.fasta" \
+  --bbsplit_fasta_list "bbsplit_fasta_list.txt" \
+  --fastq_1 "filtered_SRR6357070_1.fastq.gz" \
+  --fastq_2 "filtered_SRR6357070_2.fastq.gz"
+
+echo ">>> Check whether output exists"
+[ ! -f "filtered_SRR6357070_1.fastq.gz" ] && echo "Filtered read 1 file does not exist!" && exit 1
+[ ! -s "filtered_SRR6357070_1.fastq.gz" ] && echo "Filtered read 1 file is empty!" && exit 1
+[ ! -f "filtered_SRR6357070_2.fastq.gz" ] && echo "Filtered read 2 file does not exist!" && exit 1
+[ ! -s "filtered_SRR6357070_2.fastq.gz" ] && echo "Filtered read 2 file is empty!" && exit 1
+
+rm filtered_SRR6357070_1.fastq.gz filtered_SRR6357070_2.fastq.gz
+
+echo ">>> Testing with single-end reads and BBSplit index"
+"${meta_executable}" \
+  --paired false \
+  --input "${meta_resources_dir}/test_data/SRR6357070_1.fastq.gz" \
+  --only_build_index false \
+  --built_bbsplit_index "BBSplit_index" \
+  --fastq_1 "filtered_SRR6357070_1.fastq.gz"
+
+echo ">>> Check whether output exists"
+[ ! -f "filtered_SRR6357070_1.fastq.gz" ] && echo "Filtered reads file does not exist!" && exit 1
+[ ! -s "filtered_SRR6357070_1.fastq.gz" ] && echo "Filtered reads file is empty!" && exit 1
+
+echo ">>> Testing with paired-end reads and BBSplit index"
+"${meta_executable}" \
+  --paired true \
+  --input "${meta_resources_dir}/test_data/SRR6357070_1.fastq.gz,${meta_resources_dir}/test_data/SRR6357070_2.fastq.gz" \
+  --only_build_index false \
+  --built_bbsplit_index "BBSplit_index" \
+  --fastq_1 "filtered_SRR6357070_1.fastq.gz" \
+  --fastq_2 "filtered_SRR6357070_2.fastq.gz"
+
+echo ">>> Check whether output exists"
+[ ! -f "filtered_SRR6357070_1.fastq.gz" ] && echo "Filtered read 1 file does not exist!" && exit 1
+[ ! -s "filtered_SRR6357070_1.fastq.gz" ] && echo "Filtered read 1 file is empty!" && exit 1
+[ ! -f "filtered_SRR6357070_2.fastq.gz" ] && echo "Filtered read 2 file does not exist!" && exit 1
+[ ! -s "filtered_SRR6357070_2.fastq.gz" ] && echo "Filtered read 2 file is empty!" && exit 1
+
+rm filtered_SRR6357070_1.fastq.gz filtered_SRR6357070_2.fastq.gz
+
+echo "All tests succeeded!"
+exit 0
diff --git a/src/bbmap_bbsplit/test_data/SRR6357070_1.fastq.gz b/src/bbmap_bbsplit/test_data/SRR6357070_1.fastq.gz
diff --git a/src/bbmap_bbsplit/test_data/SRR6357070_2.fastq.gz b/src/bbmap_bbsplit/test_data/SRR6357070_2.fastq.gz