Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add testing pipeline #9

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/data/reads_to_simulate.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
H37Rv,.github/data/assemblies/NC_000962.3.fa
DRR261178,.github/data/assemblies/DRR261178.fa
DRR261179,.github/data/assemblies/DRR261179.fa
DRR261180,.github/data/assemblies/DRR261180.fa
DRR261181,.github/data/assemblies/DRR261181.fa
1,488 changes: 1,488 additions & 0 deletions .github/data/variants/DRR261178.vcf

Large diffs are not rendered by default.

2,196 changes: 2,196 additions & 0 deletions .github/data/variants/DRR261179.vcf

Large diffs are not rendered by default.

1,567 changes: 1,567 additions & 0 deletions .github/data/variants/DRR261180.vcf

Large diffs are not rendered by default.

2,215 changes: 2,215 additions & 0 deletions .github/data/variants/DRR261181.vcf

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions .github/environments/art.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Conda environment providing the ART read simulator,
# used by .github/scripts/simulate_reads.sh.
name: art
channels:
- conda-forge
- bioconda
- defaults
dependencies:
# Pinned to a fixed release for reproducible simulated reads.
- art=2016.06.05
9 changes: 9 additions & 0 deletions .github/environments/check-outputs.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Conda environment for validating pipeline outputs,
# used by .github/scripts/check_outputs.sh (runs check_outputs.py).
name: check-outputs
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- python=3
# jsonschema: validates provenance files against the pipeline-provenance schema.
- jsonschema=4.20.0
# pyyaml: parses *_provenance.yml output files.
- pyyaml=6.0.1
73 changes: 73 additions & 0 deletions .github/scripts/apply_variants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#!/usr/bin/env python3


import argparse
import json

def parse_fasta(fasta):
    """
    Parse a single-record fasta file into a header and a sequence.

    :param fasta: Path to the fasta file. The first line is taken as the
                  header; all remaining lines are concatenated into one
                  sequence string.
    :return: dict with keys 'header' and 'sequence'.
    """
    with open(fasta, 'r') as input_file:
        header = input_file.readline().strip()
        sequence_parts = [line.replace('\n', '') for line in input_file]

    return {
        'header': header,
        'sequence': ''.join(sequence_parts),
    }

def parse_vcf(vcf):
    """
    Parse a VCF file into a list of per-record dicts.

    Keys are taken from the '#CHROM ...' header line. 'POS' is converted to
    int. 'INFO' is converted to a dict; flag-type entries that carry no value
    (e.g. 'INDEL') are stored as True. If an 'ANN' entry is present its value
    is split on ',' into a list of annotations.

    :param vcf: Path to the VCF file.
    :return: list of dicts, one per variant record.
    """
    parsed_vcf = []
    header = []
    with open(vcf, 'r') as f:
        for line in f:
            if line.startswith('##'):
                # Meta-information lines are not needed downstream.
                continue
            elif line.startswith('#'):
                header = line.strip().lstrip('#').split('\t')
            else:
                vcf_line_split = line.strip().split('\t')
                vcf_dict = dict(zip(header, vcf_line_split))
                vcf_dict['POS'] = int(vcf_dict['POS'])
                # Parse INFO with partition rather than split('='): flag
                # entries have no '=', and values may themselves contain '='.
                info = {}
                for entry in vcf_dict['INFO'].split(';'):
                    key, sep, value = entry.partition('=')
                    info[key] = value if sep else True
                vcf_dict['INFO'] = info
                if 'ANN' in vcf_dict['INFO']:
                    vcf_dict['INFO']['ANN'] = vcf_dict['INFO']['ANN'].split(',')
                parsed_vcf.append(vcf_dict)

    return parsed_vcf


def apply_variants(genome, variants):
    """
    Apply variants to a reference genome.

    Each variant replaces len(REF) bases starting at POS (1-based) with ALT,
    so indels change the sequence length correctly (the original code always
    replaced exactly one base). Variants are applied in descending position
    order so that length-changing edits do not shift the coordinates of
    variants that have not been applied yet. Records with ALT == '.' (no
    alternate allele) are skipped.

    :param genome: dict with keys 'header' and 'sequence' (see parse_fasta).
    :param variants: list of variant dicts with 'POS', 'REF', 'ALT' keys
                     (see parse_vcf).
    :return: the genome dict with its 'sequence' mutated.
    """
    sequence = genome['sequence']
    for variant in sorted(variants, key=lambda v: v['POS'], reverse=True):
        if variant['ALT'] == '.':
            continue
        start = variant['POS'] - 1
        end = start + len(variant['REF'])
        sequence = sequence[:start] + variant['ALT'] + sequence[end:]
    genome['sequence'] = sequence

    return genome



def main(args):
    """Apply VCF variants to a reference genome and write the mutated fasta."""
    genome = parse_fasta(args.genome)
    variants = parse_vcf(args.variants)
    mutated = apply_variants(genome, variants)
    with open(args.output, 'w') as output_file:
        output_file.write('\n'.join([mutated['header'], mutated['sequence']]) + '\n')

if __name__ == '__main__':
    # Command-line entry point: apply a sample's VCF to a reference fasta
    # to produce a mutated assembly for read simulation.
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-g', '--genome', type=str, help='Input reference genome')
    parser.add_argument('-v', '--variants', type=str, help='Variants to apply to the reference genome (vcf format)')
    parser.add_argument('-o', '--output', type=str, help='Output file path, (fasta format)')
    args = parser.parse_args()
    main(args)
6 changes: 6 additions & 0 deletions .github/scripts/apply_variants.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash

# Generate a mutated assembly for each sample by applying its VCF to the
# H37Rv reference. Fail fast if any step errors.

set -eo pipefail

reference=".github/data/assemblies/NC_000962.3.fa"

for sample_id in DRR261178 DRR261179 DRR261180 DRR261181; do
    .github/scripts/apply_variants.py \
        --genome "${reference}" \
        --variants ".github/data/variants/${sample_id}.vcf" \
        -o ".github/data/assemblies/${sample_id}.fa"
done
118 changes: 118 additions & 0 deletions .github/scripts/check_outputs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
#!/usr/bin/env python3

import argparse
import csv
import glob
import json
import urllib.request

from jsonschema import validate
import yaml


def check_provenance_format_valid(provenance_files, schema):
    """
    Check that the provenance files are valid according to the schema.

    :param provenance_files: paths to *_provenance.yml files.
    :param schema: parsed JSON schema to validate against.
    :return: True if every file parses as YAML and validates; False otherwise.
    """
    for path in provenance_files:
        with open(path) as handle:
            try:
                document = yaml.load(handle, Loader=yaml.BaseLoader)
                validate(document, schema)
            except Exception:
                # Any parse or validation failure fails the whole check.
                return False

    return True


def check_expected_mutations(resistance_mutations_files, expected_mutations_by_sample_id):
    """
    Check that the resistance mutations files contain the expected mutations.

    Collects 'gene:mutation' strings per sample from the CSVs, then requires
    that every expected sample is present and its found set exactly equals
    the expected set (rows with an empty mutation field register the sample
    without adding a mutation).

    :param resistance_mutations_files: paths to *_resistance_mutations.csv files.
    :param expected_mutations_by_sample_id: dict mapping sample_id -> set of
                                            'gene:mutation' strings.
    :return: True if all expectations are met; False otherwise.
    """
    found_by_sample = {}
    for path in resistance_mutations_files:
        with open(path) as handle:
            for record in csv.DictReader(handle):
                sample = record['sample_id']
                mutations = found_by_sample.setdefault(sample, set())
                if record['mutation'] != '':
                    mutations.add(f"{record['gene']}:{record['mutation']}")

    return all(
        sample in found_by_sample and found_by_sample[sample] == expected
        for sample, expected in expected_mutations_by_sample_id.items()
    )


def main(args):
    """
    Run output checks against a pipeline output directory and write a
    PASS/FAIL summary CSV; exits with status 1 if any check failed.
    """
    # Download the provenance JSON schema used to validate *_provenance.yml files.
    provenance_schema_url = "https://raw.githubusercontent.com/BCCDC-PHL/pipeline-provenance-schema/main/schema/pipeline-provenance.json"
    provenance_schema_path = ".github/data/pipeline-provenance.json"
    urllib.request.urlretrieve(provenance_schema_url, provenance_schema_path)

    provenance_schema = None
    with open(provenance_schema_path) as f:
        provenance_schema = json.load(f)

    # Collect provenance and resistance-mutation files anywhere under the
    # pipeline output directory (recursive glob).
    provenace_files_glob = f"{args.pipeline_outdir}/**/*_provenance.yml"
    provenance_files = glob.glob(provenace_files_glob, recursive=True)

    resistance_mutations_files_glob = f"{args.pipeline_outdir}/**/*tbprofiler_resistance_mutations.csv"
    resistance_mutations_files = glob.glob(resistance_mutations_files_glob, recursive=True)

    # Expected resistance mutations per sample.
    # NOTE(review): these sample IDs ('NC000962.3', 'ERR1664619') do not match
    # the simulated samples in .github/data/reads_to_simulate.csv (H37Rv,
    # DRR2611xx) — confirm which sample IDs the pipeline actually emits.
    expected_mutations_by_sample_id = {
        'NC000962.3': set([]),
        'ERR1664619': set([
            'inhA:p.Ile194Thr',
            'embA:c.-16C>T',
            'embB:p.Met306Val',
            'embB:p.Met423Thr',
            'gyrA:p.Asp94Ala',
            'rrs:n.1401A>G',
        ]),
    }

    tests = [
        {
            "test_name": "provenance_format_valid",
            "test_passed": check_provenance_format_valid(provenance_files, provenance_schema),
        },
        {
            "test_name": "expected_mutations",
            "test_passed": check_expected_mutations(resistance_mutations_files, expected_mutations_by_sample_id),
        },
    ]

    # Only these fields are written to the summary CSV; the boolean
    # 'test_passed' key is dropped via extrasaction='ignore'.
    output_fields = [
        "test_name",
        "test_result"
    ]

    output_path = args.output
    with open(output_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=output_fields, extrasaction='ignore')
        writer.writeheader()
        for test in tests:
            if test["test_passed"]:
                test["test_result"] = "PASS"
            else:
                test["test_result"] = "FAIL"
            writer.writerow(test)

    # Non-zero exit makes the CI job fail when any check did not pass.
    for test in tests:
        if not test['test_passed']:
            exit(1)


if __name__ == '__main__':
    # Command-line entry point for CI output checking.
    parser = argparse.ArgumentParser(description='Check outputs')
    parser.add_argument('--pipeline-outdir', type=str, help='Path to the pipeline output directory')
    parser.add_argument('-o', '--output', type=str, help='Path to the output file')
    args = parser.parse_args()
    main(args)
10 changes: 10 additions & 0 deletions .github/scripts/check_outputs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/usr/bin/env bash

# Run the output-checking script inside the 'check-outputs' conda environment.

set -eo pipefail

source "${HOME}/.bashrc"

eval "$(conda shell.bash hook)"

conda activate check-outputs

# Ensure the results directory exists before the script writes into it.
mkdir -p artifacts

.github/scripts/check_outputs.py --pipeline-outdir .github/data/test_output -o artifacts/check_outputs_results.csv
3 changes: 3 additions & 0 deletions .github/scripts/create_art_environment.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash

# Create the conda environment used for read simulation (ART).
# set -eo pipefail so a failed env creation fails the CI step,
# consistent with the other CI scripts.

set -eo pipefail

conda env create -f .github/environments/art.yml
3 changes: 3 additions & 0 deletions .github/scripts/create_output_checking_environment.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash

# Create the conda environment used for checking pipeline outputs.
# set -eo pipefail so a failed env creation fails the CI step,
# consistent with the other CI scripts.

set -eo pipefail

conda env create -f .github/environments/check-outputs.yml
5 changes: 5 additions & 0 deletions .github/scripts/download_assemblies.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash

# Download the M. tuberculosis H37Rv reference genome (NC_000962.3) from NCBI.

set -eo pipefail

mkdir -p .github/data/assemblies

# -f: fail with a non-zero exit on an HTTP error instead of saving the error
# page as the "genome"; -sS: silent but still report errors.
curl -fsS -o .github/data/assemblies/NC_000962.3.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=NC_000962.3&db=nucleotide&rettype=fasta"
22 changes: 22 additions & 0 deletions .github/scripts/install_conda.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash

# Install Miniconda under /opt/miniconda3 and set up conda for later steps.

set -eo pipefail

artifacts_dir="artifacts"

# Ensure the log destination exists before appending to it.
mkdir -p "${artifacts_dir}"

echo "Install Miniconda .." >> ${artifacts_dir}/test.log

export PATH=/opt/miniconda3/bin:$PATH

wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh

/bin/bash ~/miniconda.sh -b -p /opt/miniconda3

rm ~/miniconda.sh

# Fixed typo: was '/opt/minconda3' (missing 'i'), so conda was never sourced
# in later shells that rely on ~/.bashrc.
echo ". /opt/miniconda3/etc/profile.d/conda.sh" >> ~/.bashrc

conda update -n base -c defaults conda

conda install -y -c conda-forge mamba

conda init bash
11 changes: 11 additions & 0 deletions .github/scripts/install_nextflow.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash

# Install the Nextflow launcher and put it on the PATH.

set -eo pipefail

artifacts_dir="artifacts"

# Ensure the log destination exists before appending to it
# (consistent with the other install scripts).
mkdir -p "${artifacts_dir}"

echo "Install Nextflow .." >> ${artifacts_dir}/test.log

wget -qO- https://get.nextflow.io | bash

sudo mv nextflow /usr/local/bin/
13 changes: 13 additions & 0 deletions .github/scripts/prepare_artifacts.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash

# Collect simulated fastqs and pipeline outputs into the artifacts directory
# for upload by the CI workflow.

set -eo pipefail

artifacts_dir="artifacts"

# Ensure the artifacts directory exists before logging into it.
mkdir -p "${artifacts_dir}"

echo "Prepare artifacts .." >> ${artifacts_dir}/test.log

mkdir -p ${artifacts_dir}/fastq

mv .github/data/fastq/*.fastq.gz ${artifacts_dir}/fastq

mkdir -p ${artifacts_dir}/pipeline_outputs

mv .github/data/test_output/* ${artifacts_dir}/pipeline_outputs
15 changes: 15 additions & 0 deletions .github/scripts/run_pipeline.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash

set -eo pipefail

# Reduce per-process CPU requests from 8 to 4 so the pipeline fits the CI runner.
sed -i 's/cpus = 8/cpus = 4/g' nextflow.config

# Run the pipeline on the simulated reads against the H37Rv reference.
# --mincov 6 lowers the depth threshold to suit the ~10x simulated coverage.
# NOTE(review): --cache assumes conda envs live under ${HOME}/.conda/envs —
# confirm against the CI workflow's env-caching step.
nextflow run main.nf \
-profile conda \
--cache ${HOME}/.conda/envs \
--fastq_input .github/data/fastq \
--ref .github/data/assemblies/NC_000962.3.fa \
--mincov 6 \
--outdir .github/data/test_output \
-with-report .github/data/test_output/nextflow_report.html \
-with-trace .github/data/test_output/nextflow_trace.tsv
35 changes: 35 additions & 0 deletions .github/scripts/simulate_reads.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/bin/bash

# Simulate paired-end Illumina reads for each assembly listed in
# .github/data/reads_to_simulate.csv (format: sample_id,assembly_path).

set -eo pipefail

source "${HOME}/.bashrc"

eval "$(conda shell.bash hook)"

conda activate art

mkdir -p .github/data/fastq

while IFS=',' read -r sample_id assembly; do
    # Fixed seed and quality shifts keep the simulated reads reproducible.
    art_illumina \
      --paired \
      --in "${assembly}" \
      --fcov 10 \
      --len 150 \
      --mflen 400 \
      --sdev 100 \
      --rndSeed 42 \
      --qShift 0 \
      --qShift2 0 \
      --out ".github/data/fastq/${sample_id}_R"

    # ART also writes alignment (.aln) files that are not needed downstream.
    rm -f ".github/data/fastq/${sample_id}_R1.aln"
    rm -f ".github/data/fastq/${sample_id}_R2.aln"

    # Rename .fq -> .fastq and compress to match the pipeline's expected input naming.
    mv ".github/data/fastq/${sample_id}_R1.fq" ".github/data/fastq/${sample_id}_R1.fastq"
    mv ".github/data/fastq/${sample_id}_R2.fq" ".github/data/fastq/${sample_id}_R2.fastq"

    gzip -f ".github/data/fastq/${sample_id}_R1.fastq"
    gzip -f ".github/data/fastq/${sample_id}_R2.fastq"

done < .github/data/reads_to_simulate.csv

Loading
Loading