Skip to content

Commit

Permalink
Merge pull request #46 from ncbi/release-0.3.0-alpha
Browse files Browse the repository at this point in the history
Release 0.3.0-alpha
  • Loading branch information
pstrope authored Nov 5, 2024
2 parents 5250364 + 5cdbc46 commit f21ac06
Show file tree
Hide file tree
Showing 49 changed files with 1,570 additions and 477 deletions.
2 changes: 1 addition & 1 deletion PRIVACY.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
## Privacy Statement
We do not currently collect EGAPx usage data.

Additional privacy and security policy information can be found on the [NLM Web Policies](https://www.nlm.nih.gov/web_policies.html) page.
Additional privacy and security policy information can be found on the [NLM Web Policies](https://www.nlm.nih.gov/web_policies.html) page.
328 changes: 223 additions & 105 deletions README.md

Large diffs are not rendered by default.

7 changes: 5 additions & 2 deletions examples/input_C_longicornis.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
genome: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/029//603/195/GCF_029603195.1_ASM2960319v2/GCF_029603195.1_ASM2960319v2_genomic.fna.gz
genome: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/029/603/195/GCA_029603195.2_ASM2960319v2/GCA_029603195.2_ASM2960319v2_genomic.fna.gz
reads: txid2530218[Organism] AND biomol_transcript[properties] NOT SRS024887[Accession]
taxid: 2530218
taxid: 2530218
annotation_provider: GenBank submitter
annotation_name_prefix: GCA_029603195.2
locus_tag_prefix: egapxtmp
16 changes: 0 additions & 16 deletions examples/input_D_farinae_minimal.yaml

This file was deleted.

5 changes: 4 additions & 1 deletion examples/input_D_farinae_small.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
genome: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/020/809/275/GCF_020809275.1_ASM2080927v1/GCF_020809275.1_ASM2080927v1_genomic.fna.gz
genome: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/020/809/275/GCA_020809275.1_ASM2080927v1/GCA_020809275.1_ASM2080927v1_genomic.fna.gz
taxid: 6954
reads:
- https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.1
- https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.2
- https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.1
- https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.2
annotation_provider: GenBank submitter
annotation_name_prefix: GCA_020809275.1
locus_tag_prefix: egapxtmp
3 changes: 0 additions & 3 deletions examples/input_D_farinae_small_proteins.yaml

This file was deleted.

6 changes: 6 additions & 0 deletions examples/input_D_farinae_small_readlist.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
genome: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/020/809/275/GCA_020809275.1_ASM2080927v1/GCA_020809275.1_ASM2080927v1_genomic.fna.gz
taxid: 6954
reads: path/to/input_D_farinae_small_reads.txt
annotation_provider: GenBank submitter
annotation_name_prefix: GCA_020809275.1
locus_tag_prefix: egapxtmp
4 changes: 4 additions & 0 deletions examples/input_D_farinae_small_reads.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
set1 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.1
set1 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.2
set2 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.1
set2 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.2
9 changes: 0 additions & 9 deletions examples/input_D_farinae_small_rnaseq.yaml

This file was deleted.

8 changes: 0 additions & 8 deletions examples/input_D_farinae_small_rnaseq_proteins.yaml

This file was deleted.

5 changes: 4 additions & 1 deletion examples/input_Gavia_stellata.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
genome: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/030/936/135/GCF_030936135.1_bGavSte3.hap2/GCF_030936135.1_bGavSte3.hap2_genomic.fna.gz
genome: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/030/936/135/GCA_030936135.1_bGavSte3.hap2/GCA_030936135.1_bGavSte3.hap2_genomic.fna.gz
reads: txid37040[Organism] AND biomol_transcript[properties] NOT SRS024887[Accession]
taxid: 37040
annotation_provider: GenBank submitter
annotation_name_prefix: GCA_030936135.1
locus_tag_prefix: egapxtmp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ workflow annot_builder {
def i = annot_builder_input('outdir', m, '01', gnomon_file, params)
// FIXME: intended params 4-5 to be lists of all input files and all input manifests, but it complained with only one entry
def (all, accept, accept_ftable, annot) = annot_builder_run('outdir', i[0], gencoll_asn, i[1], gnomon_file, genome_asn, params)

emit:
outputs = all
accept_asn = accept
Expand Down Expand Up @@ -158,9 +157,9 @@ process annot_builder_run {
val params
output:
path "${outdir}/*", emit: "all"
path "${outdir}/ACCEPT/accept.asn", emit: "accept", optional: true
path "${outdir}/ACCEPT/accept.ftable_annot", emit: "accept_ftable_annot", optional: true
path "${outdir}/ACCEPT/*.annot", optional: true
path "${outdir}/ACCEPT/accept.asn", emit: "accept"//, optional: true
path "${outdir}/ACCEPT/accept.ftable_annot", emit: "accept_ftable_annot"//, optional: true
path "${outdir}/ACCEPT/*.annot"//, optional: true
script:
"""
mkdir -p $outdir/ACCEPT
Expand Down
File renamed without changes.
24 changes: 24 additions & 0 deletions nf/subworkflows/ncbi/annot_proc/diamond_identify/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl=2

include { merge_params; to_map; shellSplit } from '../../utilities'
include { run_diamond_egap;} from '../../shared/diamond/main'


// Builds the command-line argument string for the diamond wrapper and runs
// run_diamond_egap on the Gnomon proteins against the SwissProt proteins.
// NOTE(review): merge_params is defined in ../../utilities; it presumably
// overlays the entries of `parameters` found under the given section name on
// top of the default string passed as first argument -- confirm there.
workflow diamond_worker {
    take:
        gnomon_prot_ids
        swiss_prot_ids
        gnomon_prot_asn
        swiss_prot_asn
        parameters      // Map : extra parameter and parameter update
    main:
        // Defaults for the inner diamond blastp invocation (section 'diamond_blastp').
        String blastp_args = merge_params('--sam-query-len --very-sensitive --unal 0 --comp-based-stats 0 --masking 0', parameters, 'diamond_blastp')
        // Defaults for the wrapper itself (section 'diamond').
        String wrapper_args = merge_params('-ofmt seq-align-set -query-fmt seq-ids -subject-fmt seq-ids -output-prefix hits', parameters, 'diamond')
        // The blastp arguments travel as one single-quoted value of -blastp-args,
        // followed by the wrapper's own arguments.
        String egap_args = "-blastp-args '${blastp_args}' ${wrapper_args}"

        run_diamond_egap(gnomon_prot_ids, swiss_prot_ids, gnomon_prot_asn, swiss_prot_asn, egap_args)

    emit:
        alignments = run_diamond_egap.out
}
157 changes: 157 additions & 0 deletions nf/subworkflows/ncbi/annot_proc/final_asn_markup/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl=2

include { merge_params } from '../../utilities'

///final_asn
// -scaffolds final_asn_markup.8279392/inp/scaffold.mft
// -chromosomes final_asn_markup.8279392/inp/chromosome.mft
// -annots final_asn_markup.8279392/inp/annotation.mft
// -gene_weights final_asn_markup.8279392/inp/gene_weight.mft
// -locus_lnk final_asn_markup.8279392/inp/gene_assignment.mft
// -out_dir final_asn_markup.8279392/out
// -lsq_dir final_asn_markup.8279392/var/idx
// -gencoll-asn passthrough_gcaccess.8279042/out/gencoll.asn
// -asn-cache sequence_cache
// -nogenbank

// the scaf and chrom files are all in bare_scaffold_asn format, individual chroms, scafs bundled in per Asm and AsmUnit files
// final_asn task also runs asn_cleanup and asnvalidate on everything, examples from runlog below
// and it calls asn_stats, asnval2gbitem (?), gbproject, and load_final_asn_tracking_data after it all, some of which will probably be discarded by egapx

///prime_cache -cache final_asn_markup.8279392/var/nucprots.cache -input-manifest final_asn_markup.8279392/out/nucprot.mft -ifmt asn-seq-entry
///asn_cleanup -basic -i final_asn_markup.8279392/out/raw/scaf/GCF_030936175.1/all_unannotated.asn -o final_asn_markup.8279392/out/scaf/GCF_030936175.1/all_unannotated.asn
///asn_cleanup -basic -i final_asn_markup.8279392/out/raw/scaf/GCF_030936175.1/asm_Primary_Assembly_1.cat.asn -o final_asn_markup.8279392/out/scaf/GCF_030936175.1/asm_Primary_Assembly_1.cat.asn
///asn_cleanup -basic -i final_asn_markup.8279392/out/raw/chrom/GCF_030936175.1/Chr_SUPER_1.asn -o final_asn_markup.8279392/out/chrom/GCF_030936175.1/Chr_SUPER_1.asn
///asn_cleanup -basic -i final_asn_markup.8279392/out/raw/chrom/GCF_030936175.1/Chr_SUPER_10.asn -o final_asn_markup.8279392/out/chrom/GCF_030936175.1/Chr_SUPER_10.asn

///asnvalidate -Q 0 -asn-cache final_asn_markup.8279392/var/nucprots.cache,sequence_cache -v 4 -A -X -Z -o final_asn_markup.8279392/out/val/GCF_030936175.1/asm_Primary_Assembly_1.cat.val -i final_asn_markup.8279392/out/scaf/GCF_030936175.1/asm_Primary_Assembly_1.cat.asn
///asnvalidate -Q 0 -asn-cache final_asn_markup.8279392/var/nucprots.cache,sequence_cache -v 4 -A -X -Z -o final_asn_markup.8279392/out/val/GCF_030936175.1/asm_Primary_Assembly_1001.cat.val -i final_asn_markup.8279392/out/scaf/GCF_030936175.1/asm_Primary_Assembly_1001.cat.asn
///asnvalidate -Q 0 -asn-cache final_asn_markup.8279392/var/nucprots.cache,sequence_cache -v 4 -A -X -Z -o final_asn_markup.8279392/out/val/GCF_030936175.1/Chr_SUPER_1.val -i final_asn_markup.8279392/out/chrom/GCF_030936175.1/Chr_SUPER_1.asn
///asnvalidate -Q 0 -asn-cache sequence_cache -v 4 -A -X -Z -o final_asn_markup.8279392/out/val/GCF_030936175.1/all_nucprots.scaf.val -i final_asn_markup.8279392/out/scaf/GCF_030936175.1/all_nucprots.asn
///asnvalidate -Q 0 -asn-cache sequence_cache -v 4 -A -X -Z -o final_asn_markup.8279392/out/val/GCF_030936175.1/all_nucprots.chrom.val -i final_asn_markup.8279392/out/chrom/GCF_030936175.1/all_nucprots.asn

///asn_stats -input-manifest final_asn_markup.8279392/var/joint.mft -nucprot-manifest final_asn_markup.8279392/out/nucprot.mft -o final_asn_markup.8279392/out/feature_counts.txt -counts-xml-output final_asn_markup.8279392/out/feature_counts.xml -stats-xml-output final_asn_markup.8279392/out/feature_stats.xml -t -break-by assembly-unit -asn-cache sequence_cache -gencoll-asn passthrough_gcaccess.8279042/out/gencoll.asn
///asnval2gbitem -t -asn-cache sequence_cache -asnval-path final_asn_markup.8279392/out/val/GCF_030936175.1 -scaffold-manifest final_asn_markup.8279392/out/GCF_030936175.1.scaffolds.mft -chromosome-manifest final_asn_markup.8279392/out/GCF_030936175.1.chromosomes.mft -nucprot-manifest final_asn_markup.8279392/out/GCF_030936175.1.nucprots.mft -o final_asn_markup.8279392/var/gbench/GCF_030936175.1
///gbproject -collapse final_asn_markup.8279392/var/gbench/GCF_030936175.1 -o final_asn_markup.8279392/out/GCF_030936175.1.gbp
///load_final_asn_tracking_data -feature-counts-xml final_asn_markup.8279392/out/feature_counts.xml -length-stats-xml final_asn_markup.8279392/out/feature_stats.xml -validation-xml final_asn_markup.8279392/out/annot.val.xml


// Subworkflow wrapper around the final_asn process: resolves the 'final_asn'
// parameter section, runs the process once, and re-exports its named outputs.
workflow final_asn_markup {
    take:
        gencoll_asn
        genome_asn
        scaffolds       // asn seqentry
        chromosomes     // asn seqentry
        annots          // asnt seq-annots
        locus_link      // rpt from locus_link
        locustypes      // tsv from locus_link
        parameters      // Map : extra parameter and parameter update
    main:
        // Declared as a local with its own name: the original assigned to an
        // undeclared variable called `params`, which reuses the name of
        // Nextflow's built-in pipeline-parameters object.
        def final_asn_params = merge_params("", parameters, 'final_asn')

        final_asn(gencoll_asn, genome_asn, scaffolds, chromosomes, annots, locus_link, locustypes, final_asn_params)

    emit:
        outputs = final_asn.out.all
        to_convert = final_asn.out.to_convert
        validated = final_asn.out.validated
        stats = final_asn.out.stats
        annotated_genome_asn = final_asn.out.annotated_genome_asn
        annotation_data_comment = final_asn.out.annotation_data_comment
}


// Runs the final_asn tool on the staged genome and annotation inputs, then
// post-processes its output: asn_cleanup on every produced .asn file,
// concatenation of the cleaned files into output/annotated_genome.asn,
// asnvalidate on the scaffold files, and asn_stats over the combined file.
// All per-assembly paths use the fixed directory name 'EGAPx_Test_Assembly'.
process final_asn {
    input:
        path gencoll_asn, stageAs: 'gencoll.asn'
        path genome_asn, stageAs: 'genome/*'
        path scaffolds, stageAs: 'scaffolds' // asn seqentry
        path chromosomes, stageAs: 'chromosomes' // asn seqentry
        path annots, stageAs: 'annots/*' // asnt seq-annots
        path locus_link // tsv rpt
        path locustypes // tsv
        val params
    output:
        path "output/*", emit: "all"
        path "output/scaf/EGAPx_Test_Assembly/*.asn", emit: "to_convert"
        path "output/val/EGAPx_Test_Assembly/*", emit: "validated"
        path "output/stats/*", emit: "stats"
        path "output/annotated_genome.asn", emit: "annotated_genome_asn"
        path "output/annotation_data.cmt", emit: "annotation_data_comment"
    script:
    """
    mkdir -p output
    mkdir -p asncache
    mkdir -p 'EGAPx_Test_Assembly'
    # Load the genome into a local ASN cache and rebuild one text ASN file from it;
    # that file is the only entry of the scaffold manifest.
    prime_cache -cache ./asncache/ -ifmt asn-seq-entry -i $genome_asn -oseq-ids cached_ids -split-sequences
    concat_seqentries -cache ./asncache/ -o "./EGAPx_Test_Assembly/genome.asnb.gz"
    asn_translator -gzip -i "./EGAPx_Test_Assembly/genome.asnb.gz" -o "./EGAPx_Test_Assembly/genome.asnt"
    echo "./EGAPx_Test_Assembly/genome.asnt" > ./scaffold.mft
    # The chromosome manifest is intentionally left empty here.
    touch ./chromosome.mft
    ls -1 annots/* > ./annots.mft
    echo $locus_link > ./locus_link.mft
    echo $locustypes > ./locus_types.mft
    echo "" > ./gene_weights.mft
    ##lds2_indexer -source genome/ -db LDS2
    ## prime_cache
    # EXCEPTION_STACK_TRACE_LEVEL=Warning DEBUG_STACK_TRACE_LEVEL=Warning DIAG_POST_LEVEL=Trace
    final_asn $params -egapx -nogenbank -gencoll-asn $gencoll_asn -asn-cache ./asncache/ \
        -scaffolds ./scaffold.mft -chromosomes ./chromosome.mft \
        -gene_weights ./gene_weights.mft \
        -annots ./annots.mft -locus_lnk ./locus_link.mft -locus_types ./locus_types.mft \
        -S NONE -genbank-mode -out_dir ./output/
    # Move the raw scaffold output aside, clean each file back into place and
    # append the cleaned copy to the combined annotated genome.
    mkdir -p raw/scaf
    mv ./output/scaf/EGAPx_Test_Assembly/*.asn ./raw/scaf
    for f in ./raw/scaf/*.asn; do
        of=./output/scaf/EGAPx_Test_Assembly/\$(basename "\$f")
        asn_cleanup -basic -i "\$f" -o "\$of"
        cat "\$of" >> output/annotated_genome.asn
    done
    # NB if (when) chromosomes is not empty the same logic should be applied to chrom directories
    # The existence check uses ls instead of '[ -s glob ]': the test builtin fails
    # with a usage error as soon as the glob expands to more than one file, which
    # would silently skip this branch.
    if ls ./output/chrom/EGAPx_Test_Assembly/*.asn > /dev/null 2>&1; then
        mkdir -p raw/chrom
        mv ./output/chrom/EGAPx_Test_Assembly/*.asn ./raw/chrom
        for f in ./raw/chrom/*.asn; do
            of=./output/chrom/EGAPx_Test_Assembly/\$(basename "\$f")
            asn_cleanup -basic -i "\$f" -o "\$of"
            cat "\$of" >> output/annotated_genome.asn
        done
    fi
    mkdir -p output/val/EGAPx_Test_Assembly
    for f in ./output/scaf/EGAPx_Test_Assembly/*.asn; do
        asnvalidate -Q 0 -asn-cache ./asncache/ -v 4 -A -X -Z -o "./output/val/EGAPx_Test_Assembly/\$(basename "\$f" .asn).val" -i "\$f"
    done
    # joint manifest is scaffolds, chromosomes, and organelles (not implemented here)
    # take it from annotated_genome.asn
    echo "./output/annotated_genome.asn" > ./joint.mft
    mkdir -p output/stats
    asn_stats -input-manifest ./joint.mft -o output/stats/feature_counts.txt -counts-xml-output output/stats/feature_counts.xml -stats-xml-output output/stats/feature_stats.xml -t -break-by assembly-unit -asn-cache ./asncache/ -gencoll-asn $gencoll_asn -genbank-mode
    """
    stub:
    // Creates one placeholder file for every declared output so that stub runs
    // satisfy the output block without running any tool.
    """
    mkdir -p output/ACCEPT
    echo "1" > output/ACCEPT/something.asn
    mkdir -p output/scaf/EGAPx_Test_Assembly/
    echo "1" > output/scaf/EGAPx_Test_Assembly/genome.asn
    mkdir -p output/val/EGAPx_Test_Assembly/
    echo "1" > output/val/EGAPx_Test_Assembly/genome.val
    mkdir -p output/stats
    echo "1" > output/stats/feature_counts.txt
    echo "1" > output/annotated_genome.asn
    echo "1" > output/annotation_data.cmt
    echo "1" > output/final_asn.log
    """
}
Original file line number Diff line number Diff line change
Expand Up @@ -43,18 +43,19 @@ process run_gnomon_biotype {
"""
mkdir -p output
mkdir -p ./asncache/
prime_cache -cache ./asncache/ -ifmt asnb-seq-entry -i ${swiss_prot_asn} -oseq-ids /dev/null -split-sequences
echo "${raw_blastp_hits.join('\n')}" > raw_blastp_hits.mft
prime_cache -cache ./asncache/ -ifmt asnb-seq-entry -i ${swiss_prot_asn} -oseq-ids spids -split-sequences
prime_cache -cache ./asncache/ -ifmt asnb-seq-entry -i ${models_files} -oseq-ids gnids -split-sequences
lds2_indexer -source genome/ -db LDS2
echo "${raw_blastp_hits.join('\n')}" > raw_blastp_hits.mft
merge_blastp_hits -asn-cache ./asncache/ -nogenbank -lds2 LDS2 -input-manifest raw_blastp_hits.mft -o prot_hits.asn
echo "${models_files.join('\n')}" > models.mft
echo "prot_hits.asn" > prot_hits.mft
echo "${splices_files.join('\n')}" > splices.mft
if [ -z "$denylist" ]
then
gnomon_biotype -gc $gencoll_asn -asn-cache ./asncache/ -nogenbank -gnomon_models models.mft -o output/biotypes.tsv -o_prots_rpt output/prots_rpt.tsv -prot_hits prot_hits.mft -prot_splices splices.mft -reftrack-server 'NONE' -allow_lt631 true
gnomon_biotype -gc $gencoll_asn -asn-cache ./asncache/ -lds2 ./LDS2 -nogenbank -gnomon_models models.mft -o output/biotypes.tsv -o_prots_rpt output/prots_rpt.tsv -prot_hits prot_hits.mft -prot_splices splices.mft -reftrack-server 'NONE' -allow_lt631 true
else
gnomon_biotype -gc $gencoll_asn -asn-cache ./asncache/ -nogenbank -gnomon_models models.mft -o output/biotypes.tsv -o_prots_rpt output/prots_rpt.tsv -prot_denylist $denylist -prot_hits prot_hits.mft -prot_splices splices.mft -reftrack-server 'NONE' -allow_lt631 true
gnomon_biotype -gc $gencoll_asn -asn-cache ./asncache/ -lds2 ./LDS2 -nogenbank -gnomon_models models.mft -o output/biotypes.tsv -o_prots_rpt output/prots_rpt.tsv -prot_denylist $denylist -prot_hits prot_hits.mft -prot_splices splices.mft -reftrack-server 'NONE' -allow_lt631 true
fi
"""
stub:
Expand Down
Loading

0 comments on commit f21ac06

Please sign in to comment.