diff --git a/src/fq_subsample/config.vsh.yaml b/src/fq_subsample/config.vsh.yaml
new file mode 100644
index 00000000..ea07d342
--- /dev/null
+++ b/src/fq_subsample/config.vsh.yaml
@@ -0,0 +1,77 @@
+name: fq_subsample
+description: fq subsample outputs a subset of records from single or paired FASTQ files.
+keywords: [fastq, subsample, subset]
+links:
+  homepage: https://github.com/stjude-rust-labs/fq/blob/master/README.md
+  documentation: https://github.com/stjude-rust-labs/fq/blob/master/README.md
+  repository: https://github.com/stjude-rust-labs/fq
+license: MIT
+
+argument_groups:
+- name: "Input"
+  arguments:
+  - name: "--input_1"
+    type: file
+    required: true
+    description: First input fastq file to subsample. Accepts both raw and gzipped FASTQ inputs.
+  - name: "--input_2"
+    type: file
+    description: Second input fastq file to subsample. Accepts both raw and gzipped FASTQ inputs.
+
+- name: "Output"
+  arguments:
+  - name: "--output_1"
+    type: file
+    direction: output
+    default: $id.read_1.subsampled.fastq
+    description: Sampled read 1 fastq file. Output will be gzipped if ends in `.gz`.
+  - name: "--output_2"
+    type: file
+    direction: output
+    default: $id.read_2.subsampled.fastq
+    description: Sampled read 2 fastq file. Output will be gzipped if ends in `.gz`.
+
+- name: "Options"
+  arguments:
+  # Exactly one of --probability / --record_count must be given; script.sh enforces this.
+  - name: "--probability"
+    type: double
+    description: The probability a record is kept, as a percentage (0.0, 1.0). Cannot be used with `--record_count`.
+  - name: "--record_count"
+    type: integer
+    description: The exact number of records to keep. Cannot be used with `--probability`.
+  - name: "--seed"
+    type: integer
+    description: Seed to use for the random number generator.
+
+resources:
+  - type: bash_script
+    path: script.sh
+
+test_resources:
+  - type: bash_script
+    path: test.sh
+  - path: test_data
+
+engines:
+  - type: docker
+    image: ubuntu:22.04
+    setup:
+      - type: docker
+        env:
+          - TZ Europe/Brussels
+        # Build fq v0.11.0 from source with cargo and move the release binary to /usr/local/bin.
+        run: |
+          ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone && \
+          apt-get update && \
+          apt-get install -y --no-install-recommends build-essential git-all curl && \
+          curl https://sh.rustup.rs -sSf | sh -s -- -y && \
+          . "$HOME/.cargo/env" && \
+          git clone --depth 1 --branch v0.11.0 https://github.com/stjude-rust-labs/fq.git && \
+          mv fq /usr/local/ && cd /usr/local/fq && \
+          cargo install --locked --path . && \
+          mv /usr/local/fq/target/release/fq /usr/local/bin/
+
+runners:
+  - type: executable
+  - type: nextflow
diff --git a/src/fq_subsample/help.txt b/src/fq_subsample/help.txt
new file mode 100644
index 00000000..6f4a9acf
--- /dev/null
+++ b/src/fq_subsample/help.txt
@@ -0,0 +1,20 @@
+```
+fq subsample -h
+```
+
+Outputs a subset of records
+
+Usage: fq subsample [OPTIONS] --r1-dst <R1_DST> <--probability <PROBABILITY>|--record-count <RECORD_COUNT>> <R1_SRC> [R2_SRC]
+
+Arguments:
+  <R1_SRC>  Read 1 source. Accepts both raw and gzipped FASTQ inputs
+  [R2_SRC]  Read 2 source. Accepts both raw and gzipped FASTQ inputs
+
+Options:
+  -p, --probability <PROBABILITY>    The probability a record is kept, as a percentage (0.0, 1.0). Cannot be used with `record-count`
+  -n, --record-count <RECORD_COUNT>  The exact number of records to keep. Cannot be used with `probability`
+  -s, --seed <SEED>                  Seed to use for the random number generator
+      --r1-dst <R1_DST>              Read 1 destination. Output will be gzipped if ends in `.gz`
+      --r2-dst <R2_DST>              Read 2 destination. Output will be gzipped if ends in `.gz`
+  -h, --help                         Print help
+  -V, --version                      Print version
\ No newline at end of file
diff --git a/src/fq_subsample/script.sh b/src/fq_subsample/script.sh
new file mode 100755
index 00000000..bcc81b40
--- /dev/null
+++ b/src/fq_subsample/script.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+## VIASH START
+## VIASH END
+
+set -eo pipefail
+
+# Exactly one of $par_probability and $par_record_count must be set (exclusive OR).
+if [[ -n $par_probability && -n $par_record_count ]] || [[ -z $par_probability && -z $par_record_count ]]; then
+  echo "FQ/SUBSAMPLE requires exactly one of --probability or --record_count to be specified" >&2
+  exit 1
+fi
+
+# --output_2 has a default value, so it is set even for single-end input;
+# only forward it to fq when a second input file was actually given.
+if [[ -z $par_input_2 ]]; then
+  unset par_output_2
+fi
+
+# Optional flags are emitted only when their parameter is non-empty. Paths are
+# quoted so that file names containing whitespace are passed as one argument.
+fq subsample \
+  ${par_output_1:+--r1-dst "${par_output_1}"} \
+  ${par_output_2:+--r2-dst "${par_output_2}"} \
+  ${par_probability:+--probability "${par_probability}"} \
+  ${par_record_count:+--record-count "${par_record_count}"} \
+  ${par_seed:+--seed "${par_seed}"} \
+  "${par_input_1}" \
+  ${par_input_2:+"${par_input_2}"}
diff --git a/src/fq_subsample/test.sh b/src/fq_subsample/test.sh
new file mode 100644
index 00000000..fd8545e8
--- /dev/null
+++ b/src/fq_subsample/test.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+# Abort as soon as the component itself fails instead of continuing to the file checks.
+set -e
+
+echo ">>> Testing $meta_executable"
+
+echo ">>> Testing for paired-end reads"
+"$meta_executable" \
+  --input_1 "$meta_resources_dir/test_data/a.1.fastq" \
+  --input_2 "$meta_resources_dir/test_data/a.2.fastq" \
+  --record_count 3 \
+  --seed 1 \
+  --output_1 a.1.subsampled.fastq \
+  --output_2 a.2.subsampled.fastq
+
+echo ">> Checking if the correct files are present"
+[ ! -f "a.1.subsampled.fastq" ] && echo "Subsampled FASTQ file for read 1 is missing!" && exit 1
+[ ! -s "a.1.subsampled.fastq" ] && echo "Subsampled FASTQ file is empty!" && exit 1
+[ ! -f "a.2.subsampled.fastq" ] && echo "Subsampled FASTQ file for read 2 is missing" && exit 1
+[ ! -s "a.2.subsampled.fastq" ] && echo "Subsampled FASTQ file is empty" && exit 1
+
+rm a.1.subsampled.fastq a.2.subsampled.fastq
+
+echo ">>> Testing for single-end reads"
+"$meta_executable" \
+  --input_1 "$meta_resources_dir/test_data/a.1.fastq" \
+  --record_count 3 \
+  --seed 1 \
+  --output_1 a.1.subsampled.fastq
+
+echo ">> Checking if the correct files are present"
+[ ! -f "a.1.subsampled.fastq" ] && echo "Subsampled FASTQ file is missing" && exit 1
+[ ! -s "a.1.subsampled.fastq" ] && echo "Subsampled FASTQ file is empty" && exit 1
+
+echo ">>> Tests finished successfully"
+exit 0
diff --git a/src/fq_subsample/test_data/a.1.fastq b/src/fq_subsample/test_data/a.1.fastq
new file mode 100644
index 00000000..4cd6d866
--- /dev/null
+++ b/src/fq_subsample/test_data/a.1.fastq
@@ -0,0 +1,21 @@
+@1
+ACGGCAT
++
+!!!!!!!
+@2
+TACGGCA
++
+!!!!!!!
+@3
+ATACGGC
++
+!!!!!!!
+@4
+CATACGG
++
+!!!!!!!
+@5
+GCATACG
++
+!!!!!!!
+
diff --git a/src/fq_subsample/test_data/a.2.fastq b/src/fq_subsample/test_data/a.2.fastq
new file mode 100644
index 00000000..f9fa80de
--- /dev/null
+++ b/src/fq_subsample/test_data/a.2.fastq
@@ -0,0 +1,20 @@
+@1
+ACGGCAT
++
+!!!!!!!
+@2
+TACGGCA
++
+!!!!!!!
+@3
+ATACGGC
++
+!!!!!!!
+@4
+CATACGG
++
+!!!!!!!
+@5
+GCATACG
++
+!!!!!!!
\ No newline at end of file