Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add testing pipeline #9

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/data/reads_to_simulate.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
H37Rv,.github/data/assemblies/NC_000962.3.fa
DRR261178,.github/data/assemblies/DRR261178.fa
DRR261179,.github/data/assemblies/DRR261179.fa
DRR261180,.github/data/assemblies/DRR261180.fa
DRR261181,.github/data/assemblies/DRR261181.fa
1,488 changes: 1,488 additions & 0 deletions .github/data/variants/DRR261178.vcf

Large diffs are not rendered by default.

2,196 changes: 2,196 additions & 0 deletions .github/data/variants/DRR261179.vcf

Large diffs are not rendered by default.

1,567 changes: 1,567 additions & 0 deletions .github/data/variants/DRR261180.vcf

Large diffs are not rendered by default.

2,215 changes: 2,215 additions & 0 deletions .github/data/variants/DRR261181.vcf

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions .github/environments/art.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Conda environment providing the ART read simulator,
# used by .github/scripts/simulate_reads.sh.
name: art
channels:
- conda-forge
- bioconda
- defaults
dependencies:
# Pinned to a fixed release for reproducible simulated reads.
- art=2016.06.05
9 changes: 9 additions & 0 deletions .github/environments/check-outputs.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Conda environment for validating pipeline outputs,
# used by .github/scripts/check_outputs.sh (runs check_outputs.py).
name: check-outputs
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- python=3
# jsonschema: validates provenance files against the pipeline-provenance schema.
- jsonschema=4.20.0
# pyyaml: parses *_provenance.yml output files.
- pyyaml=6.0.1
73 changes: 73 additions & 0 deletions .github/scripts/apply_variants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#!/usr/bin/env python3


import argparse
import json

def parse_fasta(fasta):
    """
    Parse a single-record fasta file into a header and a sequence.

    :param fasta: Path to the fasta file. The first line is taken as the
                  header; all remaining lines are concatenated into one
                  sequence string.
    :return: dict with keys 'header' and 'sequence'.
    """
    with open(fasta, 'r') as input_file:
        header = input_file.readline().strip()
        sequence_parts = [line.replace('\n', '') for line in input_file]

    return {
        'header': header,
        'sequence': ''.join(sequence_parts),
    }

def parse_vcf(vcf):
    """
    Parse a VCF file into a list of per-record dicts.

    Keys are taken from the '#CHROM ...' header line. 'POS' is converted to
    int. 'INFO' is converted to a dict; flag-type entries that carry no value
    (e.g. 'INDEL') are stored as True. If an 'ANN' entry is present its value
    is split on ',' into a list of annotations.

    :param vcf: Path to the VCF file.
    :return: list of dicts, one per variant record.
    """
    parsed_vcf = []
    header = []
    with open(vcf, 'r') as f:
        for line in f:
            if line.startswith('##'):
                # Meta-information lines are not needed downstream.
                continue
            elif line.startswith('#'):
                header = line.strip().lstrip('#').split('\t')
            else:
                vcf_line_split = line.strip().split('\t')
                vcf_dict = dict(zip(header, vcf_line_split))
                vcf_dict['POS'] = int(vcf_dict['POS'])
                # Parse INFO with partition rather than split('='): flag
                # entries have no '=', and values may themselves contain '='.
                info = {}
                for entry in vcf_dict['INFO'].split(';'):
                    key, sep, value = entry.partition('=')
                    info[key] = value if sep else True
                vcf_dict['INFO'] = info
                if 'ANN' in vcf_dict['INFO']:
                    vcf_dict['INFO']['ANN'] = vcf_dict['INFO']['ANN'].split(',')
                parsed_vcf.append(vcf_dict)

    return parsed_vcf


def apply_variants(genome, variants):
    """
    Apply variants to a reference genome.

    Each variant replaces len(REF) bases starting at POS (1-based) with ALT,
    so indels change the sequence length correctly (the original code always
    replaced exactly one base). Variants are applied in descending position
    order so that length-changing edits do not shift the coordinates of
    variants that have not been applied yet. Records with ALT == '.' (no
    alternate allele) are skipped.

    :param genome: dict with keys 'header' and 'sequence' (see parse_fasta).
    :param variants: list of variant dicts with 'POS', 'REF', 'ALT' keys
                     (see parse_vcf).
    :return: the genome dict with its 'sequence' mutated.
    """
    sequence = genome['sequence']
    for variant in sorted(variants, key=lambda v: v['POS'], reverse=True):
        if variant['ALT'] == '.':
            continue
        start = variant['POS'] - 1
        end = start + len(variant['REF'])
        sequence = sequence[:start] + variant['ALT'] + sequence[end:]
    genome['sequence'] = sequence

    return genome



def main(args):
    """Apply VCF variants to a reference genome and write the mutated fasta."""
    genome = parse_fasta(args.genome)
    variants = parse_vcf(args.variants)
    mutated = apply_variants(genome, variants)
    with open(args.output, 'w') as output_file:
        output_file.write('\n'.join([mutated['header'], mutated['sequence']]) + '\n')

if __name__ == '__main__':
    # Command-line entry point: apply a sample's VCF to a reference fasta
    # to produce a mutated assembly for read simulation.
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-g', '--genome', type=str, help='Input reference genome')
    parser.add_argument('-v', '--variants', type=str, help='Variants to apply to the reference genome (vcf format)')
    parser.add_argument('-o', '--output', type=str, help='Output file path, (fasta format)')
    args = parser.parse_args()
    main(args)
6 changes: 6 additions & 0 deletions .github/scripts/apply_variants.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash

# Generate a mutated assembly for each sample by applying its VCF to the
# H37Rv reference. Fail fast if any step errors.

set -eo pipefail

reference=".github/data/assemblies/NC_000962.3.fa"

for sample_id in DRR261178 DRR261179 DRR261180 DRR261181; do
    .github/scripts/apply_variants.py \
        --genome "${reference}" \
        --variants ".github/data/variants/${sample_id}.vcf" \
        -o ".github/data/assemblies/${sample_id}.fa"
done
118 changes: 118 additions & 0 deletions .github/scripts/check_outputs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
#!/usr/bin/env python3

import argparse
import csv
import glob
import json
import urllib.request

from jsonschema import validate
import yaml


def check_provenance_format_valid(provenance_files, schema):
    """
    Check that the provenance files are valid according to the schema.

    :param provenance_files: paths to *_provenance.yml files.
    :param schema: parsed JSON schema to validate against.
    :return: True if every file parses as YAML and validates; False otherwise.
    """
    for path in provenance_files:
        with open(path) as handle:
            try:
                document = yaml.load(handle, Loader=yaml.BaseLoader)
                validate(document, schema)
            except Exception:
                # Any parse or validation failure fails the whole check.
                return False

    return True


def check_expected_mutations(resistance_mutations_files, expected_mutations_by_sample_id):
    """
    Check that the resistance mutations files contain the expected mutations.

    Collects 'gene:mutation' strings per sample from the CSVs, then requires
    that every expected sample is present and its found set exactly equals
    the expected set (rows with an empty mutation field register the sample
    without adding a mutation).

    :param resistance_mutations_files: paths to *_resistance_mutations.csv files.
    :param expected_mutations_by_sample_id: dict mapping sample_id -> set of
                                            'gene:mutation' strings.
    :return: True if all expectations are met; False otherwise.
    """
    found_by_sample = {}
    for path in resistance_mutations_files:
        with open(path) as handle:
            for record in csv.DictReader(handle):
                sample = record['sample_id']
                mutations = found_by_sample.setdefault(sample, set())
                if record['mutation'] != '':
                    mutations.add(f"{record['gene']}:{record['mutation']}")

    return all(
        sample in found_by_sample and found_by_sample[sample] == expected
        for sample, expected in expected_mutations_by_sample_id.items()
    )


def main(args):
    """
    Run output checks against a pipeline output directory and write a
    PASS/FAIL summary CSV; exits with status 1 if any check failed.
    """
    # Download the provenance JSON schema used to validate *_provenance.yml files.
    provenance_schema_url = "https://raw.githubusercontent.com/BCCDC-PHL/pipeline-provenance-schema/main/schema/pipeline-provenance.json"
    provenance_schema_path = ".github/data/pipeline-provenance.json"
    urllib.request.urlretrieve(provenance_schema_url, provenance_schema_path)

    provenance_schema = None
    with open(provenance_schema_path) as f:
        provenance_schema = json.load(f)

    # Collect provenance and resistance-mutation files anywhere under the
    # pipeline output directory (recursive glob).
    provenace_files_glob = f"{args.pipeline_outdir}/**/*_provenance.yml"
    provenance_files = glob.glob(provenace_files_glob, recursive=True)

    resistance_mutations_files_glob = f"{args.pipeline_outdir}/**/*tbprofiler_resistance_mutations.csv"
    resistance_mutations_files = glob.glob(resistance_mutations_files_glob, recursive=True)

    # Expected resistance mutations per sample.
    # NOTE(review): these sample IDs ('NC000962.3', 'ERR1664619') do not match
    # the simulated samples in .github/data/reads_to_simulate.csv (H37Rv,
    # DRR2611xx) — confirm which sample IDs the pipeline actually emits.
    expected_mutations_by_sample_id = {
        'NC000962.3': set([]),
        'ERR1664619': set([
            'inhA:p.Ile194Thr',
            'embA:c.-16C>T',
            'embB:p.Met306Val',
            'embB:p.Met423Thr',
            'gyrA:p.Asp94Ala',
            'rrs:n.1401A>G',
        ]),
    }

    tests = [
        {
            "test_name": "provenance_format_valid",
            "test_passed": check_provenance_format_valid(provenance_files, provenance_schema),
        },
        {
            "test_name": "expected_mutations",
            "test_passed": check_expected_mutations(resistance_mutations_files, expected_mutations_by_sample_id),
        },
    ]

    # Only these fields are written to the summary CSV; the boolean
    # 'test_passed' key is dropped via extrasaction='ignore'.
    output_fields = [
        "test_name",
        "test_result"
    ]

    output_path = args.output
    with open(output_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=output_fields, extrasaction='ignore')
        writer.writeheader()
        for test in tests:
            if test["test_passed"]:
                test["test_result"] = "PASS"
            else:
                test["test_result"] = "FAIL"
            writer.writerow(test)

    # Non-zero exit makes the CI job fail when any check did not pass.
    for test in tests:
        if not test['test_passed']:
            exit(1)


if __name__ == '__main__':
    # Command-line entry point for CI output checking.
    parser = argparse.ArgumentParser(description='Check outputs')
    parser.add_argument('--pipeline-outdir', type=str, help='Path to the pipeline output directory')
    parser.add_argument('-o', '--output', type=str, help='Path to the output file')
    args = parser.parse_args()
    main(args)
10 changes: 10 additions & 0 deletions .github/scripts/check_outputs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/usr/bin/env bash

# Run the output-checking script inside the 'check-outputs' conda environment.

set -eo pipefail

source "${HOME}/.bashrc"

eval "$(conda shell.bash hook)"

conda activate check-outputs

# Ensure the results directory exists before the script writes into it.
mkdir -p artifacts

.github/scripts/check_outputs.py --pipeline-outdir .github/data/test_output -o artifacts/check_outputs_results.csv
3 changes: 3 additions & 0 deletions .github/scripts/create_art_environment.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash

# Create the conda environment used for read simulation (ART).
# set -eo pipefail so a failed env creation fails the CI step,
# consistent with the other CI scripts.

set -eo pipefail

conda env create -f .github/environments/art.yml
3 changes: 3 additions & 0 deletions .github/scripts/create_output_checking_environment.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash

# Create the conda environment used for checking pipeline outputs.
# set -eo pipefail so a failed env creation fails the CI step,
# consistent with the other CI scripts.

set -eo pipefail

conda env create -f .github/environments/check-outputs.yml
5 changes: 5 additions & 0 deletions .github/scripts/download_assemblies.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash

# Download the M. tuberculosis H37Rv reference genome (NC_000962.3) from NCBI.

set -eo pipefail

mkdir -p .github/data/assemblies

# -f: fail with a non-zero exit on an HTTP error instead of saving the error
# page as the "genome"; -sS: silent but still report errors.
curl -fsS -o .github/data/assemblies/NC_000962.3.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=NC_000962.3&db=nucleotide&rettype=fasta"
22 changes: 22 additions & 0 deletions .github/scripts/install_conda.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash

# Install Miniconda under /opt/miniconda3 and set up conda for later steps.

set -eo pipefail

artifacts_dir="artifacts"

# Ensure the log destination exists before appending to it.
mkdir -p "${artifacts_dir}"

echo "Install Miniconda .." >> ${artifacts_dir}/test.log

export PATH=/opt/miniconda3/bin:$PATH

wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh

/bin/bash ~/miniconda.sh -b -p /opt/miniconda3

rm ~/miniconda.sh

# Fixed typo: was '/opt/minconda3' (missing 'i'), so conda was never sourced
# in later shells that rely on ~/.bashrc.
echo ". /opt/miniconda3/etc/profile.d/conda.sh" >> ~/.bashrc

conda update -n base -c defaults conda

conda install -y -c conda-forge mamba

conda init bash
11 changes: 11 additions & 0 deletions .github/scripts/install_nextflow.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash

# Install the Nextflow launcher and put it on the PATH.

set -eo pipefail

artifacts_dir="artifacts"

# Ensure the log destination exists before appending to it
# (consistent with the other install scripts).
mkdir -p "${artifacts_dir}"

echo "Install Nextflow .." >> ${artifacts_dir}/test.log

wget -qO- https://get.nextflow.io | bash

sudo mv nextflow /usr/local/bin/
13 changes: 13 additions & 0 deletions .github/scripts/prepare_artifacts.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash

# Collect simulated fastqs and pipeline outputs into the artifacts directory
# for upload by the CI workflow.

set -eo pipefail

artifacts_dir="artifacts"

# Ensure the artifacts directory exists before logging into it.
mkdir -p "${artifacts_dir}"

echo "Prepare artifacts .." >> ${artifacts_dir}/test.log

mkdir -p ${artifacts_dir}/fastq

mv .github/data/fastq/*.fastq.gz ${artifacts_dir}/fastq

mkdir -p ${artifacts_dir}/pipeline_outputs

mv .github/data/test_output/* ${artifacts_dir}/pipeline_outputs
15 changes: 15 additions & 0 deletions .github/scripts/run_pipeline.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash

set -eo pipefail

# Reduce per-process CPU requests from 8 to 4 so the pipeline fits the CI runner.
sed -i 's/cpus = 8/cpus = 4/g' nextflow.config

# Run the pipeline on the simulated reads against the H37Rv reference.
# --mincov 6 lowers the depth threshold to suit the ~10x simulated coverage.
# NOTE(review): --cache assumes conda envs live under ${HOME}/.conda/envs —
# confirm against the CI workflow's env-caching step.
nextflow run main.nf \
-profile conda \
--cache ${HOME}/.conda/envs \
--fastq_input .github/data/fastq \
--ref .github/data/assemblies/NC_000962.3.fa \
--mincov 6 \
--outdir .github/data/test_output \
-with-report .github/data/test_output/nextflow_report.html \
-with-trace .github/data/test_output/nextflow_trace.tsv
35 changes: 35 additions & 0 deletions .github/scripts/simulate_reads.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/bin/bash

# Simulate paired-end Illumina reads for each assembly listed in
# .github/data/reads_to_simulate.csv (format: sample_id,assembly_path).

set -eo pipefail

source "${HOME}/.bashrc"

eval "$(conda shell.bash hook)"

conda activate art

mkdir -p .github/data/fastq

while IFS=',' read -r sample_id assembly; do
    # Fixed seed and quality shifts keep the simulated reads reproducible.
    art_illumina \
      --paired \
      --in "${assembly}" \
      --fcov 10 \
      --len 150 \
      --mflen 400 \
      --sdev 100 \
      --rndSeed 42 \
      --qShift 0 \
      --qShift2 0 \
      --out ".github/data/fastq/${sample_id}_R"

    # ART also writes alignment (.aln) files that are not needed downstream.
    rm -f ".github/data/fastq/${sample_id}_R1.aln"
    rm -f ".github/data/fastq/${sample_id}_R2.aln"

    # Rename .fq -> .fastq and compress to match the pipeline's expected input naming.
    mv ".github/data/fastq/${sample_id}_R1.fq" ".github/data/fastq/${sample_id}_R1.fastq"
    mv ".github/data/fastq/${sample_id}_R2.fq" ".github/data/fastq/${sample_id}_R2.fastq"

    gzip -f ".github/data/fastq/${sample_id}_R1.fastq"
    gzip -f ".github/data/fastq/${sample_id}_R2.fastq"

done < .github/data/reads_to_simulate.csv

Loading
Loading