Add github actions testing pipeline (#34)
* Add github actions testing pipeline

* add workflow dispatch trigger

* Rename branch to main

* Add steps to workflow

* Remove system update steps from conda install

* Use conda

* conda init

* Source bashrc and publish artifact

* conda init

* source bashrc

* Activate env from script

* Add pipeline running script

* Fix nextflow versions

* Fix path to script

* Create output dir for simulated reads

* gzip reads and add pipeline run to workflow

* Install nextflow to /usr/local/bin

* Fix artifacts

* Prepare artifacts

* Use correct artifacts dir

* Reduce cpus to 4

* Check nextflow config

* Always prepare artifacts

* Fix regex?

* Check contents of artifacts dirs

* Fix artifacts path

* Fix artifacts path

* Add output checking script, mutations file

* Make artifacts name unique

* Work on checking outputs

* Fix permissions

* Add status badge

* Fix output file path

* Rename workflow to tests

* Adjust how we check for failure

* Fix mutation checking

* Add scripts for applying variants to genome, test multiple samples

* Capture more artifacts

* Remove provenance schema

* Clean up workflow
dfornika authored Feb 9, 2024
1 parent 5c07f7e commit 073d7fe
Showing 20 changed files with 446 additions and 0 deletions.
55 changes: 55 additions & 0 deletions .github/data/ERR1664619_mutations.vcf

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions .github/data/reads_to_simulate.csv
@@ -0,0 +1,2 @@
NC000962.3,.github/data/assemblies/NC_000962.3.fa
ERR1664619,.github/data/assemblies/ERR1664619.fa
7 changes: 7 additions & 0 deletions .github/environments/art.yml
@@ -0,0 +1,7 @@
name: art
channels:
  - conda-forge
  - bioconda
  - defaults
dependencies:
  - art=2016.06.05
9 changes: 9 additions & 0 deletions .github/environments/check-outputs.yml
@@ -0,0 +1,9 @@
name: check-outputs
channels:
  - conda-forge
  - bioconda
  - defaults
dependencies:
  - python=3
  - jsonschema=4.20.0
  - pyyaml=6.0.1
73 changes: 73 additions & 0 deletions .github/scripts/apply_variants.py
@@ -0,0 +1,73 @@
#!/usr/bin/env python3


import argparse
import json


def parse_fasta(fasta):
    """
    Parse a fasta file into a header and a sequence.
    """
    with open(fasta, 'r') as f:
        lines = f.readlines()

    header = lines[0].strip()
    sequence = ''.join(lines[1:])
    sequence = sequence.replace('\n', '')
    parsed_fasta = {
        'header': header,
        'sequence': sequence
    }

    return parsed_fasta


def parse_vcf(vcf):
    """
    Parse a VCF file into a list of dicts, one per record.
    """
    parsed_vcf = []
    header = []
    with open(vcf, 'r') as f:
        for line in f:
            if line.startswith('##'):
                continue
            elif line.startswith('#'):
                header = line.strip().lstrip('#').split('\t')
            else:
                vcf_line_split = line.strip().split('\t')
                vcf_dict = dict(zip(header, vcf_line_split))
                vcf_dict['POS'] = int(vcf_dict['POS'])
                vcf_dict['INFO'] = dict([x.split('=') for x in vcf_dict['INFO'].split(';')])
                if 'ANN' in vcf_dict['INFO']:
                    vcf_dict['INFO']['ANN'] = vcf_dict['INFO']['ANN'].split(',')
                parsed_vcf.append(vcf_dict)

    return parsed_vcf


def apply_variants(genome, variants):
    """
    Apply variants to a reference genome. The reference base at each
    (1-based) POS is replaced by the ALT allele; records with ALT '.' are skipped.
    """
    for variant in variants:
        if variant['ALT'] == '.':
            continue
        genome['sequence'] = genome['sequence'][:variant['POS'] - 1] + variant['ALT'] + genome['sequence'][variant['POS']:]

    return genome


def main(args):
    genome = parse_fasta(args.genome)
    variants = parse_vcf(args.variants)
    genome = apply_variants(genome, variants)
    with open(args.output, 'w') as f:
        f.write(genome['header'] + '\n')
        f.write(genome['sequence'] + '\n')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Apply variants from a VCF file to a reference genome')
    parser.add_argument('-g', '--genome', type=str, help='Input reference genome (fasta format)')
    parser.add_argument('-v', '--variants', type=str, help='Variants to apply to the reference genome (vcf format)')
    parser.add_argument('-o', '--output', type=str, help='Output file path (fasta format)')
    args = parser.parse_args()
    main(args)
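
A minimal sketch of the slicing logic used by apply_variants(), with made-up toy inputs rather than repository data: the reference base at the 1-based POS is overwritten by the ALT allele, and records with a '.' ALT are skipped.

# Toy illustration (assumed inputs) of the SNP-replacement logic in apply_variants.py.
genome = {'header': '>toy', 'sequence': 'ACGTACGT'}
variants = [
    {'POS': 3, 'REF': 'G', 'ALT': 'T'},  # SNP: position 3 becomes 'T'
    {'POS': 6, 'REF': 'C', 'ALT': '.'},  # no alternate allele: skipped
]
for variant in variants:
    if variant['ALT'] == '.':
        continue
    pos = variant['POS']
    genome['sequence'] = genome['sequence'][:pos - 1] + variant['ALT'] + genome['sequence'][pos:]
print(genome['sequence'])  # ACTTACGT
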
3 changes: 3 additions & 0 deletions .github/scripts/apply_variants.sh
@@ -0,0 +1,3 @@
#!/bin/bash

.github/scripts/apply_variants.py --genome .github/data/assemblies/NC_000962.3.fa --variants .github/data/ERR1664619_mutations.vcf -o .github/data/assemblies/ERR1664619.fa
118 changes: 118 additions & 0 deletions .github/scripts/check_outputs.py
@@ -0,0 +1,118 @@
#!/usr/bin/env python3

import argparse
import csv
import glob
import json
import urllib.request

from jsonschema import validate
import yaml


def check_provenance_format_valid(provenance_files, schema):
    """
    Check that the provenance files are valid according to the schema.
    """
    for provenance_file in provenance_files:
        with open(provenance_file) as f:
            try:
                provenance = yaml.load(f, Loader=yaml.BaseLoader)
                validate(provenance, schema)
            except Exception as e:
                return False

    return True


def check_expected_mutations(resistance_mutations_files, expected_mutations_by_sample_id):
    """
    Check that the resistance mutations files contain the expected mutations.
    """
    found_mutations_by_sample = {}
    for resistance_mutations_file in resistance_mutations_files:
        with open(resistance_mutations_file) as f:
            reader = csv.DictReader(f)
            for row in reader:
                sample_id = row['sample_id']
                gene = row['gene']
                mutation = row['mutation']
                if sample_id not in found_mutations_by_sample:
                    found_mutations_by_sample[sample_id] = set([])
                if mutation != '':
                    found_mutations_by_sample[sample_id].add(':'.join([gene, mutation]))

    for sample_id, expected_mutations in expected_mutations_by_sample_id.items():
        if sample_id not in found_mutations_by_sample:
            return False
        if expected_mutations != found_mutations_by_sample[sample_id]:
            return False

    return True


def main(args):
    provenance_schema_url = "https://raw.githubusercontent.com/BCCDC-PHL/pipeline-provenance-schema/main/schema/pipeline-provenance.json"
    provenance_schema_path = ".github/data/pipeline-provenance.json"
    urllib.request.urlretrieve(provenance_schema_url, provenance_schema_path)

    provenance_schema = None
    with open(provenance_schema_path) as f:
        provenance_schema = json.load(f)

    provenance_files_glob = f"{args.pipeline_outdir}/**/*_provenance.yml"
    provenance_files = glob.glob(provenance_files_glob, recursive=True)

    resistance_mutations_files_glob = f"{args.pipeline_outdir}/**/*tbprofiler_resistance_mutations.csv"
    resistance_mutations_files = glob.glob(resistance_mutations_files_glob, recursive=True)

    expected_mutations_by_sample_id = {
        'NC000962.3': set([]),
        'ERR1664619': set([
            'inhA:p.Ile194Thr',
            'embA:c.-16C>T',
            'embB:p.Met306Val',
            'embB:p.Met423Thr',
            'gyrA:p.Asp94Ala',
            'rrs:n.1401A>G',
        ]),
    }

    tests = [
        {
            "test_name": "provenance_format_valid",
            "test_passed": check_provenance_format_valid(provenance_files, provenance_schema),
        },
        {
            "test_name": "expected_mutations",
            "test_passed": check_expected_mutations(resistance_mutations_files, expected_mutations_by_sample_id),
        },
    ]

    output_fields = [
        "test_name",
        "test_result"
    ]

    output_path = args.output
    with open(output_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=output_fields, extrasaction='ignore')
        writer.writeheader()
        for test in tests:
            if test["test_passed"]:
                test["test_result"] = "PASS"
            else:
                test["test_result"] = "FAIL"
            writer.writerow(test)

    for test in tests:
        if not test['test_passed']:
            exit(1)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Check outputs')
    parser.add_argument('--pipeline-outdir', type=str, help='Path to the pipeline output directory')
    parser.add_argument('-o', '--output', type=str, help='Path to the output file')
    args = parser.parse_args()
    main(args)
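
A minimal sketch, with assumed toy values rather than real pipeline output, of the comparison performed by check_expected_mutations(): each reported mutation is keyed as 'gene:mutation' per sample, and the collected set must match the expected set exactly for the test to pass.

# Toy illustration (assumed data) of the per-sample set comparison in check_outputs.py.
found_mutations_by_sample = {
    'ERR1664619': {'inhA:p.Ile194Thr', 'rrs:n.1401A>G'},
}
expected_mutations_by_sample_id = {
    'ERR1664619': {'inhA:p.Ile194Thr', 'rrs:n.1401A>G'},
}
test_passed = all(
    sample_id in found_mutations_by_sample
    and found_mutations_by_sample[sample_id] == expected_mutations
    for sample_id, expected_mutations in expected_mutations_by_sample_id.items()
)
print(test_passed)  # True
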
10 changes: 10 additions & 0 deletions .github/scripts/check_outputs.sh
@@ -0,0 +1,10 @@
#!/usr/bin/env bash

source ${HOME}/.bashrc

eval "$(conda shell.bash hook)"

conda activate check-outputs


.github/scripts/check_outputs.py --pipeline-outdir .github/data/test_output -o artifacts/check_outputs_results.csv
3 changes: 3 additions & 0 deletions .github/scripts/create_art_environment.sh
@@ -0,0 +1,3 @@
#!/bin/bash

conda env create -f .github/environments/art.yml
3 changes: 3 additions & 0 deletions .github/scripts/create_output_checking_environment.sh
@@ -0,0 +1,3 @@
#!/bin/bash

conda env create -f .github/environments/check-outputs.yml
5 changes: 5 additions & 0 deletions .github/scripts/download_assemblies.sh
@@ -0,0 +1,5 @@
#!/bin/bash

mkdir -p .github/data/assemblies

curl -o .github/data/assemblies/NC_000962.3.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=NC_000962.3&db=nucleotide&rettype=fasta"
22 changes: 22 additions & 0 deletions .github/scripts/install_conda.sh
@@ -0,0 +1,22 @@
#!/bin/bash
set -eo pipefail

artifacts_dir="artifacts"

echo "Install Miniconda .." >> ${artifacts_dir}/test.log

export PATH=/opt/miniconda3/bin:$PATH

wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh

/bin/bash ~/miniconda.sh -b -p /opt/miniconda3

rm ~/miniconda.sh

echo ". /opt/minconda3/etc/profile.d/conda.sh" >> ~/.bashrc

conda update -n base -c defaults conda

conda install -y -c conda-forge mamba

conda init bash
11 changes: 11 additions & 0 deletions .github/scripts/install_nextflow.sh
@@ -0,0 +1,11 @@
#!/bin/bash

set -eo pipefail

artifacts_dir="artifacts"

echo "Install Nextflow .." >> ${artifacts_dir}/test.log

wget -qO- https://get.nextflow.io | bash

sudo mv nextflow /usr/local/bin/
13 changes: 13 additions & 0 deletions .github/scripts/prepare_artifacts.sh
@@ -0,0 +1,13 @@
#!/bin/bash

artifacts_dir="artifacts"

echo "Prepare artifacts .." >> ${artifacts_dir}/test.log

mkdir -p ${artifacts_dir}/fastq

mv .github/data/fastq/*.fastq.gz ${artifacts_dir}/fastq

mkdir -p ${artifacts_dir}/pipeline_outputs

mv .github/data/test_output/* ${artifacts_dir}/pipeline_outputs
13 changes: 13 additions & 0 deletions .github/scripts/run_pipeline.sh
@@ -0,0 +1,13 @@
#!/bin/bash

set -eo pipefail

sed -i 's/cpus = 8/cpus = 4/g' nextflow.config

nextflow run main.nf \
  -profile conda \
  --cache ${HOME}/.conda/envs \
  --fastq_input .github/data/fastq \
  --outdir .github/data/test_output \
  -with-report .github/data/test_output/nextflow_report.html \
  -with-trace .github/data/test_output/nextflow_trace.tsv
35 changes: 35 additions & 0 deletions .github/scripts/simulate_reads.sh
@@ -0,0 +1,35 @@
#!/bin/bash


source ${HOME}/.bashrc

eval "$(conda shell.bash hook)"

conda activate art

mkdir -p .github/data/fastq

while IFS=',' read -r sample_id assembly; do
    art_illumina \
        --paired \
        --in ${assembly} \
        --fcov 12 \
        --len 150 \
        --mflen 400 \
        --sdev 100 \
        --rndSeed 42 \
        --qShift 0 \
        --qShift2 0 \
        --out .github/data/fastq/${sample_id}_R

    rm -f .github/data/fastq/${sample_id}_R1.aln
    rm -f .github/data/fastq/${sample_id}_R2.aln

    mv .github/data/fastq/${sample_id}_R1.fq .github/data/fastq/${sample_id}_R1.fastq
    mv .github/data/fastq/${sample_id}_R2.fq .github/data/fastq/${sample_id}_R2.fastq

    gzip -f .github/data/fastq/${sample_id}_R1.fastq
    gzip -f .github/data/fastq/${sample_id}_R2.fastq

done < .github/data/reads_to_simulate.csv

50 changes: 50 additions & 0 deletions .github/workflows/tests.yml
@@ -0,0 +1,50 @@
on:
  pull_request:
    branches:
      - main
  push:
    branches:
      - main
  workflow_dispatch:
name: Tests
jobs:
  test:
    strategy:
      fail-fast: false
      matrix:
        nextflow_version: ["21.04.3", "23.10.1"]
    name: Run tests
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@master
      - name: Create Artifacts Directory
        run: mkdir artifacts
      - name: Install Miniconda
        run: bash .github/scripts/install_conda.sh
      - name: Install Nextflow
        env:
          NXF_VER: ${{ matrix.nextflow_version }}
        run: bash .github/scripts/install_nextflow.sh
      - name: Create ART Read-Simulation Environment
        run: bash .github/scripts/create_art_environment.sh
      - name: Download Assemblies
        run: bash .github/scripts/download_assemblies.sh
      - name: Apply Variants
        run: bash .github/scripts/apply_variants.sh
      - name: Simulate Reads
        run: bash .github/scripts/simulate_reads.sh
      - name: Run Pipeline
        run: bash .github/scripts/run_pipeline.sh
      - name: Create Output Checking Environment
        run: bash .github/scripts/create_output_checking_environment.sh
      - name: Check Outputs
        run: bash .github/scripts/check_outputs.sh
      - name: Prepare Artifacts
        if: always()
        run: bash .github/scripts/prepare_artifacts.sh
      - name: Upload Artifacts
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: artifacts-BCCDC-PHL-tbprofiler-nf-nextflow-v${{ matrix.nextflow_version }}-${{ github.run_id }}.${{ github.run_attempt }}
          path: artifacts
