From f7f116c3e1e22d9c1d41f46b5bd60023554c1394 Mon Sep 17 00:00:00 2001 From: Victor Joukov Date: Tue, 5 Nov 2024 16:38:01 -0500 Subject: [PATCH 1/2] Release 0.3.0-alpha --- PRIVACY.md | 2 +- README.md | 328 +++++++---- examples/input_C_longicornis.yaml | 7 +- examples/input_D_farinae_minimal.yaml | 16 - examples/input_D_farinae_small.yaml | 5 +- examples/input_D_farinae_small_proteins.yaml | 3 - examples/input_D_farinae_small_readlist.yaml | 6 + examples/input_D_farinae_small_reads.txt | 4 + examples/input_D_farinae_small_rnaseq.yaml | 9 - ...input_D_farinae_small_rnaseq_proteins.yaml | 8 - examples/input_Gavia_stellata.yaml | 5 +- .../annot_builder/main.nf | 7 +- .../annotwriter/main.nf | 0 .../best_protein_hits}/main.nf | 0 .../{gnomon => annot_proc}/diamond/main.nf | 0 .../ncbi/annot_proc/diamond_identify/main.nf | 24 + .../ncbi/annot_proc/final_asn_markup/main.nf | 157 ++++++ .../gnomon_biotype/main.nf | 9 +- .../{gnomon => annot_proc}/locus_link/main.nf | 36 +- .../ncbi/annot_proc/locus_track/main.nf | 70 +++ nf/subworkflows/ncbi/annot_proc/main.nf | 107 ++++ .../prot_gnomon_prepare/main.nf | 0 .../ncbi/default/align_sort_sa/main.nf | 3 +- .../ncbi/default/convert_annotations/main.nf | 91 +++ .../gnomon_training_iterations/main.nf | 98 ++++ .../ncbi/gnomon/chainer_wnode/main.nf | 3 + .../ncbi/gnomon/gnomon_wnode/main.nf | 10 +- nf/subworkflows/ncbi/gnomon/main.nf | 60 +- nf/subworkflows/ncbi/main.nf | 76 ++- nf/subworkflows/ncbi/only_gnomon.nf | 21 +- .../ncbi/orthology/diamond_orthology/main.nf | 4 +- .../extract_products_from_models/main.nf | 2 +- .../ncbi/orthology/find_orthologs/main.nf | 48 +- nf/subworkflows/ncbi/orthology/main.nf | 20 +- .../rnaseq_short/bam_bin_and_sort/main.nf | 2 +- .../rnaseq_short/convert_from_bam/main.nf | 2 +- nf/subworkflows/ncbi/rnaseq_short/main.nf | 24 +- .../ncbi/rnaseq_short/star_index/main.nf | 23 +- .../ncbi/rnaseq_short/star_wnode/main.nf | 26 +- nf/subworkflows/ncbi/setup/main.nf | 51 +- 
nf/subworkflows/ncbi/shared/diamond/main.nf | 14 +- .../target_proteins/best_aligned_prot/main.nf | 2 +- nf/subworkflows/ncbi/target_proteins/main.nf | 17 +- nf/ui.nf | 55 +- ui/requirements.txt => requirements.txt | 0 ui/assets/config/docker_image.config | 2 +- ui/assets/config/executor/ncbi-sge.config | 22 +- ui/assets/default_task_params.yaml | 51 +- ui/egapx.py | 517 +++++++++++++++--- 49 files changed, 1570 insertions(+), 477 deletions(-) delete mode 100644 examples/input_D_farinae_minimal.yaml delete mode 100644 examples/input_D_farinae_small_proteins.yaml create mode 100644 examples/input_D_farinae_small_readlist.yaml create mode 100644 examples/input_D_farinae_small_reads.txt delete mode 100644 examples/input_D_farinae_small_rnaseq.yaml delete mode 100644 examples/input_D_farinae_small_rnaseq_proteins.yaml rename nf/subworkflows/ncbi/{default => annot_proc}/annot_builder/main.nf (98%) rename nf/subworkflows/ncbi/{default => annot_proc}/annotwriter/main.nf (100%) rename nf/subworkflows/ncbi/{gnomon/protein_filter => annot_proc/best_protein_hits}/main.nf (100%) rename nf/subworkflows/ncbi/{gnomon => annot_proc}/diamond/main.nf (100%) create mode 100644 nf/subworkflows/ncbi/annot_proc/diamond_identify/main.nf create mode 100644 nf/subworkflows/ncbi/annot_proc/final_asn_markup/main.nf rename nf/subworkflows/ncbi/{gnomon => annot_proc}/gnomon_biotype/main.nf (72%) rename nf/subworkflows/ncbi/{gnomon => annot_proc}/locus_link/main.nf (76%) create mode 100644 nf/subworkflows/ncbi/annot_proc/locus_track/main.nf create mode 100644 nf/subworkflows/ncbi/annot_proc/main.nf rename nf/subworkflows/ncbi/{gnomon => annot_proc}/prot_gnomon_prepare/main.nf (100%) create mode 100644 nf/subworkflows/ncbi/default/convert_annotations/main.nf create mode 100644 nf/subworkflows/ncbi/gnomon-training-iteration/gnomon_training_iterations/main.nf rename ui/requirements.txt => requirements.txt (100%) diff --git a/PRIVACY.md b/PRIVACY.md index a4f87da..e819db8 100644 --- a/PRIVACY.md 
+++ b/PRIVACY.md @@ -3,4 +3,4 @@ ## Privacy Statement We do not currently collect EGAPx usage data. -Additional privacy and security policy information can be found on the [NLM Web Policies](https://www.nlm.nih.gov/web_policies.html) page. +Additional privacy and security policy information can be found on the [NLM Web Policies](https://www.nlm.nih.gov/web_policies.html) page. \ No newline at end of file diff --git a/README.md b/README.md index ca134a3..bbc7842 100644 --- a/README.md +++ b/README.md @@ -2,14 +2,18 @@ EGAPx is the publicly accessible version of the updated NCBI [Eukaryotic Genome Annotation Pipeline](https://www.ncbi.nlm.nih.gov/refseq/annotation_euk/process/). -EGAPx takes an assembly fasta file, a taxid of the organism, and RNA-seq data. Based on the taxid, EGAPx will pick protein sets and HMM models. The pipeline runs `miniprot` to align protein sequences, and `STAR` to align RNA-seq to the assembly. Protein alignments and RNA-seq read alignments are then passed to `Gnomon` for gene prediction. In the first step of `Gnomon`, the short alignments are chained together into putative gene models. In the second step, these predictions are further supplemented by _ab-initio_ predictions based on HMM models. The final annotation for the input assembly is produced as a `gff` file. +EGAPx takes an assembly fasta file, a taxid of the organism, and RNA-seq data. Based on the taxid, EGAPx will pick protein sets and HMM models. The pipeline runs `miniprot` to align protein sequences, and `STAR` to align RNA-seq to the assembly. Protein alignments and RNA-seq read alignments are then passed to `Gnomon` for gene prediction. In the first step of `Gnomon`, the short alignments are chained together into putative gene models. In the second step, these predictions are further supplemented by _ab-initio_ predictions based on HMM models. 
Functional annotation is added to the final structural annotation set based on the type and quality of the model and orthology information. The final annotation for the input assembly is produced as a `gff` file. -We currently have protein datasets posted that are suitable for most vertebrates and arthropods: - - Chordata - Mammalia, Sauropsida, Actinopterygii (ray-finned fishes) +We currently have protein datasets posted that are suitable for most vertebrates, arthropods, and some plants: + - Chordata - Mammalia, Sauropsida, Actinopterygii (ray-finned fishes), other Vertebrates - Insecta - Hymenoptera, Diptera, Lepidoptera, Coleoptera, Hemiptera - Arthropoda - Arachnida, other Arthropoda -We will be adding datasets for plants and other invertebrates in the next couple of months. Fungi, protists and nematodes are currently out-of-scope for EGAPx pending additional refinements. + - Monocots - Liliopsida + - Eudicots - Asterids, Rosids, Fabids, Caryophyllales + + +Fungi, protists and nematodes are currently out-of-scope for EGAPx pending additional refinements. @@ -66,16 +70,6 @@ Input to EGAPx is in the form of a YAML file. reads: [SRA Study ID] reads: SRA query for reads ``` - - If you are using your local reads, then the FASTA/FASTQ files should be provided in the following format: - ``` - reads: - - path_to_Sample1_R1.gz - - path_to_Sample1_R2.gz - - path_to_Sample2_R1.gz - - path_to_Sample2_R2.gz - ``` - - - If you provide an SRA Study ID, all the SRA run ID's belonging to that Study ID will be included in the EGAPx run. - The following are the _optional_ key-value pairs for the input file: @@ -89,7 +83,19 @@ Input to EGAPx is in the form of a YAML file. hmm: path to HMM file ``` - +- The following are _optional_ metadata configuration parameters (not critical for testing EGAPx alpha, will later be used for GenBank submissions): + - Annotation provider. The main contact for this genome assembly. 
+ ``` + annotation_provider: GenBank submitter + ``` + - Annotation name prefix. GenBank assembly accession version. Uniquely identifies annotation products originating from the same annotation run. The resulting annotation name is `-GB_YYYY_MM_DD`. If the GCA acc.ver is not known, do not include this parameter. The annotation name will default to `GB_YYYY_MM_DD`. + ``` + annotation_name_prefix: GCA_#########.1 + ``` + - Locus tag prefix. One to 9-letter prefix to use for naming genes on this genome assembly. If an official locus tag prefix was already reserved from an INSDC organization (GenBank, ENA or DDBJ) for the given BioSample and BioProject pair, provide here. Otherwise, provide a string of your choice. If no value is provided, the prefix 'egapxtmp' will be used. + ``` + locus_tag_prefix: egapxtmp + ``` ## Input example @@ -106,28 +112,57 @@ Input to EGAPx is in the form of a YAML file. - https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/data/Dermatophagoides_farinae_small/SRR9005248.2 ``` + +- If you are using your local reads, then the FASTA/FASTQ files can be provided using the format below. For proper specification of paired-end read files, the filenames must have a shared prefix prior to an underscore character, and the prefix is not shared by any other library: + ``` + reads: + - path/to/se1_reads.fq # path to single-end reads + - path/to/se2_reads.fq + - path/to/pe1_reads_R1.fq # path to paired-end R1 reads + - path/to/pe1_reads_R2.fq # path to paired-end R2 reads + - path/to/pe2_reads_R1.fq + - path/to/pe2_reads_R2.fq + ``` + + Alternatively, you can explicitly set the names and paths to reads sets by following the format below. Here the filenames for the reads can be anything, but the set names for each set has to be unique. 
+ ``` + reads: + - - single_end_library_name1 # set name + - - path/to/se1_reads.fq # file name for single-end reads + - - single_end_library_name2 + - - path/to/se2_reads.fq + - - paired_end_library_name1 # set name + - - path/to/pe1_reads_R1.fq # file name for paired-end R1 reads + - path/to/pe1_reads_R2.fq # file name for paired-end R2 reads + - - paired_end_library_name2 + - - path/to/pe2_reads_R1.fq + - path/to/pe2_reads_R2.fq + ``` + + There is one other option on how to provide RNA-seq data. If you have a large number of local RNA-seq data, you can list them in a file with a set name and a filepath in each line (see `examples/input_D_farinae_small_reads.txt`). Then you can read that file from the input yaml (see `examples/input_D_farinae_small_readlist.yaml`). + - To specify an array of NCBI SRA datasets: ``` reads: - SRR8506572 - SRR9005248 ``` + - If you provide an SRA Study ID, all the SRA run ID's belonging to that Study ID will be included in the EGAPx run. - To specify an SRA entrez query: ``` reads: 'txid6954[Organism] AND biomol_transcript[properties] NOT SRS024887[Accession] AND (SRR8506572[Accession] OR SRR9005248[Accession] )' ``` - **Note:** Both the above examples will have more RNA-seq data than the `input_D_farinae_small.yaml` example. To make sure the entrez query does not produce a large number of SRA runs, please run it first at the [NCBI SRA page](https://www.ncbi.nlm.nih.gov/sra). If there are too many SRA runs, then select a few of them and list it in the input yaml. + **Note:** Both the above examples using SRA reads or SRA entrez query will have more RNA-seq data than the `input_D_farinae_small.yaml` example. To make sure the entrez query does not produce a large number of SRA runs, please run it first at the [NCBI SRA page](https://www.ncbi.nlm.nih.gov/sra). If there are too many SRA runs, then select a few of them and list it in the input yaml. 
-- First, test EGAPx on the example provided (`input_D_farinae_small.yaml`, a dust mite) to make sure everything works. This example usually runs under 30 minutes depending upon resource availability. There are other examples you can try: `input_C_longicornis.yaml`, a green fly, and `input_Gavia_tellata.yaml`, a bird. These will take close to two hours. You can prepare your input YAML file following these examples. +- First, test EGAPx on the example provided (`input_D_farinae_small.yaml`, a dust mite) to make sure everything works. This example usually runs under 30 minutes depending upon resource availability. There are other examples you can try: `input_C_longicornis.yaml`, a green fly, and `input_Gavia_stellata.yaml`, a bird. These will take close to two hours. You can prepare your input YAML file following these examples. ## Run EGAPx - The `egapx` folder contains the following directories: - examples - nf - - test - third_party_licenses - ui @@ -137,7 +172,7 @@ Input to EGAPx is in the form of a YAML file. ``` python -m venv /path/to/new/virtual/environment source /path/to/new/virtual/environment/bin/activate - pip install -r ui/requirements.txt + pip install -r requirements.txt ``` @@ -165,56 +200,58 @@ Input to EGAPx is in the form of a YAML file. - use `-e docker` for using Docker image - use `-e singularity` for using the Singularity image - use `-e biowulf_cluster` for Biowulf cluster using Singularity image - - use '-e slurm` for using SLURM in your HPC. + - use `-e slurm` for using SLURM in your HPC. - Note that for this option, you have to edit `./egapx_config/slurm.config` according to your cluster specifications. - type `python3 ui/egapx.py  -h ` for the help menu ``` - $ ui/egapx.py -h - - - !!WARNING!! - This is an alpha release with limited features and organism scope to collect initial feedback on execution. Outputs are not yet complete and not intended for production use. 
- - usage: egapx.py [-h] [-o OUTPUT] [-e EXECUTOR] [-c CONFIG_DIR] [-w WORKDIR] [-r REPORT] [-n] [-st] - [-so] [-dl] [-lc LOCAL_CACHE] [-q] [-v] [-fn FUNC_NAME] - [filename] + $ ui/egapx.py -help + usage: egapx.py [-h] [-o OUTPUT] [-e EXECUTOR] [-c CONFIG_DIR] [-w WORKDIR] + [-r REPORT] [-n] [-st] [-so] [-ot ORTHO_TAXID] [-dl] + [-lc LOCAL_CACHE] [-q] [-v] [-V] [-fn FUNC_NAME] + [filename] Main script for EGAPx optional arguments: -h, --help show this help message and exit -e EXECUTOR, --executor EXECUTOR - Nextflow executor, one of docker, singularity, aws, or local (for NCBI - internal use only). Uses corresponding Nextflow config file + Nextflow executor, one of docker, singularity, aws, + or local (for NCBI internal use only). Uses + corresponding Nextflow config file -c CONFIG_DIR, --config-dir CONFIG_DIR - Directory for executor config files, default is ./egapx_config. Can be also - set as env EGAPX_CONFIG_DIR + Directory for executor config files, default is + ./egapx_config. Can be also set as env + EGAPX_CONFIG_DIR -w WORKDIR, --workdir WORKDIR - Working directory for cloud executor + Working directory for cloud executor -r REPORT, --report REPORT - Report file prefix for report (.report.html) and timeline (.timeline.html) - files, default is in output directory + Report file prefix for report (.report.html) and + timeline (.timeline.html) files, default is in output + directory -n, --dry-run -st, --stub-run - -so, --summary-only Print result statistics only if available, do not compute result + -so, --summary-only Print result statistics only if available, do not + compute result + -ot ORTHO_TAXID, --ortho-taxid ORTHO_TAXID + Taxid of reference data for orthology tasks -lc LOCAL_CACHE, --local-cache LOCAL_CACHE - Where to store the downloaded files + Where to store the downloaded files -q, --quiet -v, --verbose + -V, --version Report software version -fn FUNC_NAME, --func_name FUNC_NAME - func_name + func_name run: - filename YAML file with input: section 
with at least genome: and reads: parameters + filename YAML file with input: section with at least genome: + and reads: parameters -o OUTPUT, --output OUTPUT Output path download: - -dl, --download-only Download external files to local storage, so that future runs can be - isolated - - + -dl, --download-only Download external files to local storage, so that + future runs can be isolated ``` @@ -229,82 +266,148 @@ This is an alpha release with limited features and organism scope to collect ini N E X T F L O W ~ version 23.10.1 Launching `/../home/user/egapx/ui/../nf/ui.nf` [golden_mercator] DSL2 - revision: c134f40af5 in egapx block -executor > awsbatch (67) -[f5/3007b8] process > egapx:setup_genome:get_genome_info [100%] 1 of 1 ✔ -[32/a1bfa5] process > egapx:setup_proteins:convert_proteins [100%] 1 of 1 ✔ -[96/621c4b] process > egapx:miniprot:run_miniprot [100%] 1 of 1 ✔ -[6d/766c2f] process > egapx:paf2asn:run_paf2asn [100%] 1 of 1 ✔ -[56/f1dd6b] process > egapx:best_aligned_prot:run_best_aligned_prot [100%] 1 of 1 ✔ -[c1/ccc4a3] process > egapx:align_filter_sa:run_align_filter_sa [100%] 1 of 1 ✔ -[e0/5548d0] process > egapx:run_align_sort [100%] 1 of 1 ✔ -[a8/456a0e] process > egapx:star_index:build_index [100%] 1 of 1 ✔ -[d5/6469a6] process > egapx:star_simplified:exec (1) [100%] 2 of 2 ✔ -[64/99ab35] process > egapx:bam_strandedness:exec (2) [100%] 2 of 2 ✔ -[98/a12969] process > egapx:bam_strandedness:merge [100%] 1 of 1 ✔ -[78/0d7007] process > egapx:bam_bin_and_sort:calc_assembly_sizes [100%] 1 of 1 ✔ -[74/bb014e] process > egapx:bam_bin_and_sort:bam_bin (2) [100%] 2 of 2 ✔ -[39/3cdd00] process > egapx:bam_bin_and_sort:merge_prepare [100%] 1 of 1 ✔ -[01/f64e38] process > egapx:bam_bin_and_sort:merge (1) [100%] 1 of 1 ✔ -[aa/47a002] process > egapx:bam2asn:convert (1) [100%] 1 of 1 ✔ -[45/6661b3] process > egapx:rnaseq_collapse:generate_jobs [100%] 1 of 1 ✔ -[64/68bc37] process > egapx:rnaseq_collapse:run_rnaseq_collapse (3) [100%] 9 of 9 ✔ -[18/bff1ac] 
process > egapx:rnaseq_collapse:run_gpx_make_outputs [100%] 1 of 1 ✔ -[a4/76a4a5] process > egapx:get_hmm_params:run_get_hmm [100%] 1 of 1 ✔ -[3c/b71c42] process > egapx:chainer:run_align_sort (1) [100%] 1 of 1 ✔ -[e1/340b6d] process > egapx:chainer:generate_jobs [100%] 1 of 1 ✔ -[c0/477d02] process > egapx:chainer:run_chainer (16) [100%] 16 of 16 ✔ -[9f/27c1c8] process > egapx:chainer:run_gpx_make_outputs [100%] 1 of 1 ✔ -[5c/8f65d0] process > egapx:gnomon_wnode:gpx_qsubmit [100%] 1 of 1 ✔ -[34/6ab0c9] process > egapx:gnomon_wnode:annot (1) [100%] 10 of 10 ✔ -[a9/e38221] process > egapx:gnomon_wnode:gpx_qdump [100%] 1 of 1 ✔ -[bc/8ebca4] process > egapx:annot_builder:annot_builder_main [100%] 1 of 1 ✔ -[5f/6b72c0] process > egapx:annot_builder:annot_builder_input [100%] 1 of 1 ✔ -[eb/1ccdd0] process > egapx:annot_builder:annot_builder_run [100%] 1 of 1 ✔ -[4d/6c33db] process > egapx:annotwriter:run_annotwriter [100%] 1 of 1 ✔ -[b6/d73d18] process > export [100%] 1 of 1 ✔ -Waiting for file transfers to complete (1 files) -Completed at: 27-Mar-2024 11:43:15 -Duration : 27m 36s -CPU hours : 4.2 -Succeeded : 67 +executor > awsbatch (83) +[41/69fb92] process > egapx:setup_genome:get_genome_info [100%] 1 of 1 ✔ +[12/af924a] process > egapx:setup_proteins:convert_proteins [100%] 1 of 1 ✔ +[26/661e33] process > egapx:target_proteins_plane:miniprot:split_proteins [100%] 1 of 1 ✔ +[86/68836c] process > egapx:target_proteins_plane:miniprot:run_miniprot (1) [100%] 1 of 1 ✔ +[f1/2d07a3] process > egapx:target_proteins_plane:paf2asn:run_paf2asn (1) [100%] 1 of 1 ✔ +[05/33457c] process > egapx:target_proteins_plane:best_aligned_prot:run_best_aligned_prot [100%] 1 of 1 ✔ +[41/455b4f] process > egapx:target_proteins_plane:align_filter_sa:run_align_filter_sa [100%] 1 of 1 ✔ +[c9/4627b4] process > egapx:target_proteins_plane:align_sort_sa:run_align_sort [100%] 1 of 1 ✔ +[9b/0b248b] process > egapx:rnaseq_short_plane:star_index:build_index [100%] 1 of 1 ✔ +[79/799e31] process > 
egapx:rnaseq_short_plane:star:run_star (1) [100%] 2 of 2 ✔ +[01/af1f68] process > egapx:rnaseq_short_plane:bam_strandedness:rnaseq_divide_by_strandedness [100%] 1 of 1 ✔ +[65/4107dc] process > egapx:rnaseq_short_plane:bam_bin_and_sort:calc_assembly_sizes [100%] 1 of 1 ✔ +[5d/c69fbf] process > egapx:rnaseq_short_plane:bam_bin_and_sort:bam_bin (2) [100%] 2 of 2 ✔ +[c1/707e59] process > egapx:rnaseq_short_plane:bam_bin_and_sort:merge_prepare [100%] 1 of 1 ✔ +[e3/bba172] process > egapx:rnaseq_short_plane:bam_bin_and_sort:merge (1) [100%] 1 of 1 ✔ +[2b/7c7b6a] process > egapx:rnaseq_short_plane:bam2asn:convert (1) [100%] 1 of 1 ✔ +[23/3a9fba] process > egapx:rnaseq_short_plane:rnaseq_collapse:generate_jobs [100%] 1 of 1 ✔ +[b8/994db8] process > egapx:rnaseq_short_plane:rnaseq_collapse:run_rnaseq_collapse (8) [100%] 9 of 9 ✔ +[da/f769f6] process > egapx:rnaseq_short_plane:rnaseq_collapse:run_gpx_make_outputs [100%] 1 of 1 ✔ +[af/c32ba6] process > egapx:gnomon_plane:chainer:run_align_sort (1) [100%] 1 of 1 ✔ +[7f/bed27d] process > egapx:gnomon_plane:chainer:generate_jobs [100%] 1 of 1 ✔ +[4a/cdb342] process > egapx:gnomon_plane:chainer:run_chainer (7) [100%] 16 of 16 ✔ +[7c/b687bb] process > egapx:gnomon_plane:chainer:run_gpx_make_outputs [100%] 1 of 1 ✔ +[62/e78572] process > egapx:gnomon_plane:gnomon_wnode:gpx_qsubmit [100%] 1 of 1 ✔ +[62/8445b3] process > egapx:gnomon_plane:gnomon_wnode:annot (1) [100%] 10 of 10 ✔ +[57/589794] process > egapx:gnomon_plane:gnomon_wnode:gpx_qdump [100%] 1 of 1 ✔ +[7b/020592] process > egapx:annot_proc_plane:fetch_swiss_prot_asn [100%] 1 of 1 ✔ +[70/34b131] process > egapx:annot_proc_plane:get_swiss_prot_ids [100%] 1 of 1 ✔ +[7d/16a826] process > egapx:annot_proc_plane:prot_gnomon_prepare:prot_gnomon_prepare_p [100%] 1 of 1 ✔ +[a3/a6a568] process > egapx:annot_proc_plane:diamond_worker:run_diamond_egap [100%] 1 of 1 ✔ +[97/e54b4a] process > egapx:annot_proc_plane:best_protein_hits:run_protein_filter_replacement [100%] 1 of 1 ✔ 
+[e3/32a317] process > egapx:annot_proc_plane:gnomon_biotype:run_gnomon_biotype [100%] 1 of 1 ✔ +[89/56953c] process > egapx:annot_proc_plane:annot_builder:annot_builder_main [100%] 1 of 1 ✔ +[7c/28df80] process > egapx:annot_proc_plane:annot_builder:annot_builder_input [100%] 1 of 1 ✔ +[19/781bc2] process > egapx:annot_proc_plane:annot_builder:annot_builder_run [100%] 1 of 1 ✔ +[f5/1140c6] process > egapx:annot_proc_plane:print_fake_lxr_data [100%] 1 of 1 ✔ +[94/0ee74c] process > egapx:annot_proc_plane:orthology_plane:fetch_ortholog_references [100%] 1 of 1 ✔ +[f3/053877] process > egapx:annot_proc_plane:orthology_plane:setup_ext_genome:get_genome_info [100%] 1 of 1 ✔ +[bd/5ededd] process > egapx:annot_proc_plane:orthology_plane:setup_ext_proteins:convert_proteins [100%] 1 of 1 ✔ +[7d/fa5f13] process > egapx:annot_proc_plane:orthology_plane:get_prot_ref_ids [100%] 1 of 1 ✔ +[82/8018fb] process > egapx:annot_proc_plane:orthology_plane:extract_products_from_models:run_extract_products_from_mo... 
[100%] 1 of 1 ✔ +[ce/22bdea] process > egapx:annot_proc_plane:orthology_plane:diamond_orthology:run_diamond_egap [100%] 1 of 1 ✔ +[ed/0d0cdd] process > egapx:annot_proc_plane:orthology_plane:find_orthologs:run_find_orthologs [100%] 1 of 1 ✔ +[56/48bd29] process > egapx:annot_proc_plane:locus_track:run_locus_track [100%] 1 of 1 ✔ +[95/4ad706] process > egapx:annot_proc_plane:locus_link:run_locus_link [100%] 1 of 1 ✔ +[1e/a66cb3] process > egapx:annot_proc_plane:final_asn_markup:final_asn [100%] 1 of 1 ✔ +[f2/391794] process > egapx:annot_proc_plane:annotwriter:run_annotwriter [100%] 1 of 1 ✔ +[4e/6fccc1] process > egapx:convert_annotations:run_converter [100%] 1 of 1 ✔ +[8d/e3225f] process > export [100%] 1 of 1 ✔ +Completed at: 30-Oct-2024 11:46:09 +Duration : 53m 9s +CPU hours : 7.0 +Succeeded : 83 + + +Statistics for example_out/complete.genomic.gff +CDS 33203 +exon 35007 +gene 8828 +lnc_RNA 566 +mRNA 8407 +pseudogene 6 +transcript 4 ``` + ## Output -Look at the output in the out diectory (`example_out`) that was supplied in the command line. The annotation file is called `accept.gff`. +Look at the output in the out directory (`example_out`) that was supplied in the command line. The annotation file is called `complete.genomic.gff`. ``` -accept.gff annot_builder_output +annotated_genome.asn +annotation_data.cmt +complete.cds.fna +complete.genomic.fna +complete.genomic.gff +complete.genomic.gtf +complete.proteins.faa +complete.transcripts.fna nextflow.log +resume.sh run.report.html run.timeline.html run.trace.txt run_params.yaml +stats +validated ``` -The `nextflow.log` is the log file that captures all the process information and their work directories. `run_params.yaml` has all the parameters that were used in the EGAPx run. More information about the process time and resources can be found in the other run* files. - - +Description of the outputs: +* `complete.genomic.gff`: final annotation set in GFF3 format. 
+* `complete.genomic.gtf`: final annotation set in GTF format. +* `complete.genomic.fna`: full genome sequences set in FASTA format. +* `complete.genomic.gtf`: final annotation set in gtf format. +* `complete.cds.fna`: annotated Coding DNA Sequences (CDS) in FASTA format. +* `complete.transcripts.fna`: annotated transcripts in FASTA format (includes UTRs). +* `complete.proteins.faa`: annotated protein products in FASTA format. +* `annotated_genome.asn`: final annotation set in ASN1 format. +Description of the logs and miscellaneous outputs: +* `annot_builder_output/accept.ftable_annot`: intermediate file with accepted annotation models called by GNOMON. +* `annotation_data.cmt`: annotation structured comment file. Used for submission to GenBank. +* `nextflow.log`: main Nextflow log that captures all the process information and their work directories. +* `resume.sh`: Nextflow command for resuming a run from the last successful task. +* `run.report.html`: Nextflow rendered HTML execution report containing run summary, resource usage, and tasks execution. +* `run.timeline.html`: Nextflow rendered HTML timeline for all processes executed in the EGAPx pipeline. +* `run.trace.txt`: Nextflow execution tracing file that contains information about each EGAPx process including runtime and CPU usage. +* `run_params.yaml`: YAML file containing parameters used for the EGAPx run +* `stats`: directory containing features statistics for the final annotation set +* `validated`: directory containing validation warnings and errors for annotated features. Used for submission to GenBank. + +## Interpreting Output + +`stats/feature_counts.xml` contains summary counts of features by model prediction categories determined by GNOMON. + +**NOTE** not all categories are expected to have counts data (e.g. model RefSeq, fully supported, ab initio) + +Genes with `major correction` are likely protein-coding genes with frameshifts and/or internal stops. 
These models include "LOW QUALITY PROTEIN" in the protein FASTA title, are marked up with exception=low-quality sequence region on the mRNA and CDS features, and the annotation is adjusted to meet GenBank criteria (frameshifts are compensated for by 1-2 bp microintrons in the mRNA and CDS features, and internal stops have a transl_except to translate the codon as X instead of a stop). For RefSeq, we set a threshold of no more than 10% of protein-coding genes with major corrections to release the annotation. We recommend users polish assembly sequences if the rate is higher than 10%. + +Counts of protein-coding genes should be considered versus similar species. Low counts may result from insufficient supporting evidence (e.g. low RNAseq coverage or an unusual organism compared to the available protein data). High counts may indicate genome fragmentation or noise from genes annotated on transposons. + +`stats/feature_stats.xml` contains summary statistics on transcript counts per gene, exon counts per transcript, and the counts and length distributions of features by sub-type. ## Intermediate files -In the above log, each line denotes the process that completed in the workflow. The first column (_e.g._ `[96/621c4b]`) is the subdirectory where the intermediate output files and logs are found for the process in the same line, _i.e._, `egapx:miniprot:run_miniprot`. To see the intermediate files for that process, you can go to the work directory path that you had supplied and traverse to the subdirectory `96/621c4b`: +In the nextflow log, you can find work directory paths for each job. You can go to that path, and look for the output files and command logs. For example, to see the files generated during run_miniprot job, run the following command to get the directory path, and list the files within that directory. 
``` -$ aws s3 ls s3://temp_datapath/D_farinae/96/ - PRE 06834b76c8d7ceb8c97d2ccf75cda4/ - PRE 621c4ba4e6e87a4d869c696fe50034/ -$ aws s3 ls s3://temp_datapath/D_farinae/96/621c4ba4e6e87a4d869c696fe50034/ +grep run_miniprot example_out/nextflow.log| grep COMPLETED + +aws s3 ls s3://temp_datapath/D_farinae/86/68836c310a571e6752a33a221d1962/ PRE output/ -2024-03-27 11:19:18 0 -2024-03-27 11:19:28 6 .command.begin -2024-03-27 11:20:24 762 .command.err -2024-03-27 11:20:26 762 .command.log -2024-03-27 11:20:23 0 .command.out -2024-03-27 11:19:18 13103 .command.run -2024-03-27 11:19:18 129 .command.sh -2024-03-27 11:20:24 276 .command.trace -2024-03-27 11:20:25 1 .exitcode -$ aws s3 ls s3://temp_datapath/D_farinae/96/621c4ba4e6e87a4d869c696fe50034/output/ -2024-03-27 11:20:24 17127134 aligns.paf +2024-10-30 10:54:36 0 +2024-10-30 10:59:04 6 .command.begin +2024-10-30 10:59:33 780 .command.err +2024-10-30 10:59:35 780 .command.log +2024-10-30 10:59:32 0 .command.out +2024-10-30 10:54:36 13013 .command.run +2024-10-30 10:54:36 139 .command.sh +2024-10-30 10:59:33 277 .command.trace +2024-10-30 10:59:34 1 .exitcode + +aws s3 ls s3://ncbi-egapx-expires/work/D_farinae/86/68836c310a571e6752a33a221d1962/output/ +2024-10-30 10:59:34 26539116 1.paf ``` ## Offline mode @@ -316,7 +419,7 @@ If you do not have internet access from your cluster, you can run EGAPx in offli ``` rm egap*sif singularity cache clean -singularity pull docker://ncbi/egapx:0.2-alpha +singularity pull docker://ncbi/egapx:0.3-alpha ``` - Clone the repo: @@ -348,7 +451,7 @@ Now edit the file paths of SRA reads files in `examples/input_D_farinae_small.ya - Run `egapx.py` first to edit the `biowulf_cluster.config`: ``` ui/egapx.py examples/input_D_farinae_small.yaml -e biowulf_cluster -w dfs_work -o dfs_out -lc ../local_cache -echo "process.container = '/path_to_/egapx_0.2-alpha.sif'" >> egapx_config/biowulf_cluster.config +echo "process.container = '/path_to_/egapx_0.3-alpha.sif'" >> 
egapx_config/biowulf_cluster.config ``` - Run `egapx.py`: @@ -357,6 +460,21 @@ ui/egapx.py examples/input_D_farinae_small.yaml -e biowulf_cluster -w dfs_work - ``` +## Modifying default parameters + +The default task parameter values are listed in the file `ui/assets/default_task_params.yaml`. If there are cases where you need to change some task parameters from the default values, you can add those to the input yaml file. For example, if you're using RNA-seq from species besides the one being annotated, you can relax the alignment criteria by setting the following parameters in your input yaml: + +``` +tasks: + rnaseq_collapse: + rnaseq_collapse: -high-identity 0.8 + convert_from_bam: + sam2asn: -filter 'pct_identity_gap >= 85' + star_wnode: + star_wnode: -pct-identity 85 +``` + + ## References diff --git a/examples/input_C_longicornis.yaml b/examples/input_C_longicornis.yaml index 78887a2..e262905 100644 --- a/examples/input_C_longicornis.yaml +++ b/examples/input_C_longicornis.yaml @@ -1,3 +1,6 @@ -genome: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/029//603/195/GCF_029603195.1_ASM2960319v2/GCF_029603195.1_ASM2960319v2_genomic.fna.gz +genome: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/029/603/195/GCA_029603195.2_ASM2960319v2/GCA_029603195.2_ASM2960319v2_genomic.fna.gz reads: txid2530218[Organism] AND biomol_transcript[properties] NOT SRS024887[Accession] -taxid: 2530218 \ No newline at end of file +taxid: 2530218 +annotation_provider: GenBank submitter +annotation_name_prefix: GCA_029603195.2 +locus_tag_prefix: egapxtmp \ No newline at end of file diff --git a/examples/input_D_farinae_minimal.yaml b/examples/input_D_farinae_minimal.yaml deleted file mode 100644 index d540ed5..0000000 --- a/examples/input_D_farinae_minimal.yaml +++ /dev/null @@ -1,16 +0,0 @@ -# This is a very minimal example of EGAPx, it fits into 4 CPU cores and 6GB of memory. -# To be able to do this, we culled the input files and some stages of execution. 
-# To limit the requirements you also need to use -e docker_minimal - -genome: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/020/809/275/GCF_020809275.1_ASM2080927v1/GCF_020809275.1_ASM2080927v1_genomic.fna.gz -reads: - - https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.1 - - https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.2 - - https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.1 - - https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.2 -taxid: 6954 -proteins: [] -hmm: https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/gnomon/hmm_parameters/6956.params -tasks: - star_wnode: - star_wnode: -cpus-per-worker 4 diff --git a/examples/input_D_farinae_small.yaml b/examples/input_D_farinae_small.yaml index fedcfe1..4fc5b02 100644 --- a/examples/input_D_farinae_small.yaml +++ b/examples/input_D_farinae_small.yaml @@ -1,7 +1,10 @@ -genome: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/020/809/275/GCF_020809275.1_ASM2080927v1/GCF_020809275.1_ASM2080927v1_genomic.fna.gz +genome: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/020/809/275/GCA_020809275.1_ASM2080927v1/GCA_020809275.1_ASM2080927v1_genomic.fna.gz taxid: 6954 reads: - https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.1 - https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.2 - https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.1 - https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.2 +annotation_provider: GenBank submitter +annotation_name_prefix: GCA_020809275.1 +locus_tag_prefix: egapxtmp \ No newline at end of file diff --git a/examples/input_D_farinae_small_proteins.yaml b/examples/input_D_farinae_small_proteins.yaml 
deleted file mode 100644 index 795ad91..0000000 --- a/examples/input_D_farinae_small_proteins.yaml +++ /dev/null @@ -1,3 +0,0 @@ -genome: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/020/809/275/GCF_020809275.1_ASM2080927v1/GCF_020809275.1_ASM2080927v1_genomic.fna.gz -proteins: https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/target_proteins/6954.faa.gz -taxid: 6954 diff --git a/examples/input_D_farinae_small_readlist.yaml b/examples/input_D_farinae_small_readlist.yaml new file mode 100644 index 0000000..c269afd --- /dev/null +++ b/examples/input_D_farinae_small_readlist.yaml @@ -0,0 +1,6 @@ +genome: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/020/809/275/GCA_020809275.1_ASM2080927v1/GCA_020809275.1_ASM2080927v1_genomic.fna.gz +taxid: 6954 +reads: path/to/input_D_farinae_small_reads.txt +annotation_provider: GenBank submitter +annotation_name_prefix: GCA_020809275.1 +locus_tag_prefix: egapxtmp \ No newline at end of file diff --git a/examples/input_D_farinae_small_reads.txt b/examples/input_D_farinae_small_reads.txt new file mode 100644 index 0000000..ef6b1ed --- /dev/null +++ b/examples/input_D_farinae_small_reads.txt @@ -0,0 +1,4 @@ +set1 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.1 +set1 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.2 +set2 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.1 +set2 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.2 diff --git a/examples/input_D_farinae_small_rnaseq.yaml b/examples/input_D_farinae_small_rnaseq.yaml deleted file mode 100644 index aad27ae..0000000 --- a/examples/input_D_farinae_small_rnaseq.yaml +++ /dev/null @@ -1,9 +0,0 @@ -genome: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/020/809/275/GCF_020809275.1_ASM2080927v1/GCF_020809275.1_ASM2080927v1_genomic.fna.gz -reads: - - 
https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.1 - - https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.2 - - https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.1 - - https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.2 -# To prevent the system from searching for protein bag by taxid we do not specify it here, but we need to specify hmm explicitly in this case. -# taxid: 6954 -hmm: https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/params.asn diff --git a/examples/input_D_farinae_small_rnaseq_proteins.yaml b/examples/input_D_farinae_small_rnaseq_proteins.yaml deleted file mode 100644 index 9d81458..0000000 --- a/examples/input_D_farinae_small_rnaseq_proteins.yaml +++ /dev/null @@ -1,8 +0,0 @@ -genome: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/020/809/275/GCF_020809275.1_ASM2080927v1/GCF_020809275.1_ASM2080927v1_genomic.fna.gz -reads: - - https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.1 - - https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.2 - - https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.1 - - https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.2 -proteins: https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/target_proteins/6954.faa.gz -taxid: 6954 diff --git a/examples/input_Gavia_stellata.yaml b/examples/input_Gavia_stellata.yaml index ad08b5e..c372c66 100644 --- a/examples/input_Gavia_stellata.yaml +++ b/examples/input_Gavia_stellata.yaml @@ -1,3 +1,6 @@ -genome: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/030/936/135/GCF_030936135.1_bGavSte3.hap2/GCF_030936135.1_bGavSte3.hap2_genomic.fna.gz +genome: 
https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/030/936/135/GCA_030936135.1_bGavSte3.hap2/GCA_030936135.1_bGavSte3.hap2_genomic.fna.gz reads: txid37040[Organism] AND biomol_transcript[properties] NOT SRS024887[Accession] taxid: 37040 +annotation_provider: GenBank submitter +annotation_name_prefix: GCA_030936135.1 +locus_tag_prefix: egapxtmp \ No newline at end of file diff --git a/nf/subworkflows/ncbi/default/annot_builder/main.nf b/nf/subworkflows/ncbi/annot_proc/annot_builder/main.nf similarity index 98% rename from nf/subworkflows/ncbi/default/annot_builder/main.nf rename to nf/subworkflows/ncbi/annot_proc/annot_builder/main.nf index 66393a9..9aea4d5 100644 --- a/nf/subworkflows/ncbi/default/annot_builder/main.nf +++ b/nf/subworkflows/ncbi/annot_proc/annot_builder/main.nf @@ -31,7 +31,6 @@ workflow annot_builder { def i = annot_builder_input('outdir', m, '01', gnomon_file, params) // FIXME: intended params 4-5 to be lists of all input files and all input manifests, but it complained with only one entry def (all, accept, accept_ftable, annot) = annot_builder_run('outdir', i[0], gencoll_asn, i[1], gnomon_file, genome_asn, params) - emit: outputs = all accept_asn = accept @@ -158,9 +157,9 @@ process annot_builder_run { val params output: path "${outdir}/*", emit: "all" - path "${outdir}/ACCEPT/accept.asn", emit: "accept", optional: true - path "${outdir}/ACCEPT/accept.ftable_annot", emit: "accept_ftable_annot", optional: true - path "${outdir}/ACCEPT/*.annot", optional: true + path "${outdir}/ACCEPT/accept.asn", emit: "accept"//, optional: true + path "${outdir}/ACCEPT/accept.ftable_annot", emit: "accept_ftable_annot"//, optional: true + path "${outdir}/ACCEPT/*.annot"//, optional: true script: """ mkdir -p $outdir/ACCEPT diff --git a/nf/subworkflows/ncbi/default/annotwriter/main.nf b/nf/subworkflows/ncbi/annot_proc/annotwriter/main.nf similarity index 100% rename from nf/subworkflows/ncbi/default/annotwriter/main.nf rename to 
nf/subworkflows/ncbi/annot_proc/annotwriter/main.nf diff --git a/nf/subworkflows/ncbi/gnomon/protein_filter/main.nf b/nf/subworkflows/ncbi/annot_proc/best_protein_hits/main.nf similarity index 100% rename from nf/subworkflows/ncbi/gnomon/protein_filter/main.nf rename to nf/subworkflows/ncbi/annot_proc/best_protein_hits/main.nf diff --git a/nf/subworkflows/ncbi/gnomon/diamond/main.nf b/nf/subworkflows/ncbi/annot_proc/diamond/main.nf similarity index 100% rename from nf/subworkflows/ncbi/gnomon/diamond/main.nf rename to nf/subworkflows/ncbi/annot_proc/diamond/main.nf diff --git a/nf/subworkflows/ncbi/annot_proc/diamond_identify/main.nf b/nf/subworkflows/ncbi/annot_proc/diamond_identify/main.nf new file mode 100644 index 0000000..ad37a5c --- /dev/null +++ b/nf/subworkflows/ncbi/annot_proc/diamond_identify/main.nf @@ -0,0 +1,24 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl=2 + +include { merge_params; to_map; shellSplit } from '../../utilities' +include { run_diamond_egap;} from '../../shared/diamond/main' + + +workflow diamond_worker { + take: + gnomon_prot_ids + swiss_prot_ids + gnomon_prot_asn + swiss_prot_asn + parameters // Map : extra parameter and parameter update + main: + String diamond_blastp_params = merge_params('--sam-query-len --very-sensitive --unal 0 --comp-based-stats 0 --masking 0', parameters, 'diamond_blastp') + String diamond_regular_params = merge_params('-ofmt seq-align-set -query-fmt seq-ids -subject-fmt seq-ids -output-prefix hits', parameters, 'diamond') + String diamond_egap_params = '-blastp-args \'' + diamond_blastp_params + '\' ' + diamond_regular_params + + run_diamond_egap(gnomon_prot_ids, swiss_prot_ids, gnomon_prot_asn, swiss_prot_asn, diamond_egap_params) + + emit: + alignments = run_diamond_egap.out +} diff --git a/nf/subworkflows/ncbi/annot_proc/final_asn_markup/main.nf b/nf/subworkflows/ncbi/annot_proc/final_asn_markup/main.nf new file mode 100644 index 0000000..16caf53 --- /dev/null +++ 
b/nf/subworkflows/ncbi/annot_proc/final_asn_markup/main.nf @@ -0,0 +1,157 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl=2 + +include { merge_params } from '../../utilities' + +///final_asn +// -scaffolds final_asn_markup.8279392/inp/scaffold.mft +// -chromosomes final_asn_markup.8279392/inp/chromosome.mft +// -annots final_asn_markup.8279392/inp/annotation.mft +// -gene_weights final_asn_markup.8279392/inp/gene_weight.mft +// -locus_lnk final_asn_markup.8279392/inp/gene_assignment.mft +// -out_dir final_asn_markup.8279392/out +// -lsq_dir final_asn_markup.8279392/var/idx +// -gencoll-asn passthrough_gcaccess.8279042/out/gencoll.asn +// -asn-cache sequence_cache +// -nogenbank + +// the scaf and chrom files are all in bare_scaffold_asn format, individual chroms, scafs bundled in per Asm and AsmUnit files +// final_asn task also runs asn_cleanup and asnvalidate on everything, examples from runlog below +// and it calls asn_stats, asnval2gbitem (?) , gbproject, and load_finl_asn_tracking_data after it all, some of which will probably be discarded by egapx + +///prime_cache -cache final_asn_markup.8279392/var/nucprots.cache -input-manifest final_asn_markup.8279392/out/nucprot.mft -ifmt asn-seq-entry +///asn_cleanup -basic -i final_asn_markup.8279392/out/raw/scaf/GCF_030936175.1/all_unannotated.asn -o final_asn_markup.8279392/out/scaf/GCF_030936175.1/all_unannotated.asn +///asn_cleanup -basic -i final_asn_markup.8279392/out/raw/scaf/GCF_030936175.1/asm_Primary_Assembly_1.cat.asn -o final_asn_markup.8279392/out/scaf/GCF_030936175.1/asm_Primary_Assembly_1.cat.asn +///asn_cleanup -basic -i final_asn_markup.8279392/out/raw/chrom/GCF_030936175.1/Chr_SUPER_1.asn -o final_asn_markup.8279392/out/chrom/GCF_030936175.1/Chr_SUPER_1.asn +///asn_cleanup -basic -i final_asn_markup.8279392/out/raw/chrom/GCF_030936175.1/Chr_SUPER_10.asn -o final_asn_markup.8279392/out/chrom/GCF_030936175.1/Chr_SUPER_10.asn + +///asnvalidate -Q 0 -asn-cache 
final_asn_markup.8279392/var/nucprots.cache,sequence_cache -v 4 -A -X -Z -o final_asn_markup.8279392/out/val/GCF_030936175.1/asm_Primary_Assembly_1.cat.val -i final_asn_markup.8279392/out/scaf/GCF_030936175.1/asm_Primary_Assembly_1.cat.asn +///asnvalidate -Q 0 -asn-cache final_asn_markup.8279392/var/nucprots.cache,sequence_cache -v 4 -A -X -Z -o final_asn_markup.8279392/out/val/GCF_030936175.1/asm_Primary_Assembly_1001.cat.val -i final_asn_markup.8279392/out/scaf/GCF_030936175.1/asm_Primary_Assembly_1001.cat.asn +///asnvalidate -Q 0 -asn-cache final_asn_markup.8279392/var/nucprots.cache,sequence_cache -v 4 -A -X -Z -o final_asn_markup.8279392/out/val/GCF_030936175.1/Chr_SUPER_1.val -i final_asn_markup.8279392/out/chrom/GCF_030936175.1/Chr_SUPER_1.asn +///asnvalidate -Q 0 -asn-cache sequence_cache -v 4 -A -X -Z -o final_asn_markup.8279392/out/val/GCF_030936175.1/all_nucprots.scaf.val -i final_asn_markup.8279392/out/scaf/GCF_030936175.1/all_nucprots.asn +///asnvalidate -Q 0 -asn-cache sequence_cache -v 4 -A -X -Z -o final_asn_markup.8279392/out/val/GCF_030936175.1/all_nucprots.chrom.val -i final_asn_markup.8279392/out/chrom/GCF_030936175.1/all_nucprots.asn + +///asn_stats -input-manifest final_asn_markup.8279392/var/joint.mft -nucprot-manifest final_asn_markup.8279392/out/nucprot.mft -o final_asn_markup.8279392/out/feature_counts.txt -counts-xml-output final_asn_markup.8279392/out/feature_counts.xml -stats-xml-output final_asn_markup.8279392/out/feature_stats.xml -t -break-by assembly-unit -asn-cache sequence_cache -gencoll-asn passthrough_gcaccess.8279042/out/gencoll.asn +///asnval2gbitem -t -asn-cache sequence_cache -asnval-path final_asn_markup.8279392/out/val/GCF_030936175.1 -scaffold-manifest final_asn_markup.8279392/out/GCF_030936175.1.scaffolds.mft -chromosome-manifest final_asn_markup.8279392/out/GCF_030936175.1.chromosomes.mft -nucprot-manifest final_asn_markup.8279392/out/GCF_030936175.1.nucprots.mft -o final_asn_markup.8279392/var/gbench/GCF_030936175.1 
+///gbproject -collapse final_asn_markup.8279392/var/gbench/GCF_030936175.1 -o final_asn_markup.8279392/out/GCF_030936175.1.gbp +///load_final_asn_tracking_data -feature-counts-xml final_asn_markup.8279392/out/feature_counts.xml -length-stats-xml final_asn_markup.8279392/out/feature_stats.xml -validation-xml final_asn_markup.8279392/out/annot.val.xml + + +workflow final_asn_markup { + take: + gencoll_asn + genome_asn + scaffolds // asn seqentrs + chromosomes // asn seqentrysseqids + annots // asnt seq-annots + locus_link // rpt from locus_link + locustypes // tsv from locus_link + parameters // Map : extra parameter and parameter update + main: + params = merge_params("", parameters, 'final_asn') + + final_asn(gencoll_asn, genome_asn, scaffolds, chromosomes, annots, locus_link, locustypes, params) + + emit: + outputs = final_asn.out.all + to_convert = final_asn.out.to_convert + validated = final_asn.out.validated + stats = final_asn.out.stats + annotated_genome_asn = final_asn.out.annotated_genome_asn + annotation_data_comment = final_asn.out.annotation_data_comment +} + + +process final_asn { + input: + path gencoll_asn, stageAs: 'gencoll.asn' + path genome_asn, stageAs: 'genome/*' + path scaffolds, stageAs: 'scaffolds' // asn seqentry + path chromosomes, stageAs: 'chromosomes' // asn seqentry + path annots, stageAs: 'annots/*' // asnt seq-annots + path locus_link // tsv rpt + path locustypes // tsv + val params + output: + path "output/*", emit: "all" + path "output/scaf/EGAPx_Test_Assembly/*.asn", emit: "to_convert" + path "output/val/EGAPx_Test_Assembly/*", emit: "validated" + path "output/stats/*", emit: "stats" + path "output/annotated_genome.asn", emit: "annotated_genome_asn" + path "output/annotation_data.cmt", emit: "annotation_data_comment" + script: + """ + mkdir -p output + mkdir -p asncache + mkdir -p 'EGAPx_Test_Assembly' + + prime_cache -cache ./asncache/ -ifmt asn-seq-entry -i $genome_asn -oseq-ids cached_ids -split-sequences + concat_seqentries 
-cache ./asncache/ -o "./EGAPx_Test_Assembly/genome.asnb.gz" + asn_translator -gzip -i "./EGAPx_Test_Assembly/genome.asnb.gz" -o "./EGAPx_Test_Assembly/genome.asnt" + + echo "./EGAPx_Test_Assembly/genome.asnt" > ./scaffold.mft + touch ./chromosome.mft + ls -1 annots/* > ./annots.mft + echo $locus_link > ./locus_link.mft + echo $locustypes > ./locus_types.mft + + echo "" > ./gene_weights.mft + + ##lds2_indexer -source genome/ -db LDS2 + ## prime_cache + # EXCEPTION_STACK_TRACE_LEVEL=Warning DEBUG_STACK_TRACE_LEVEL=Warning DIAG_POST_LEVEL=Trace + + final_asn $params -egapx -nogenbank -gencoll-asn $gencoll_asn -asn-cache ./asncache/ \ + -scaffolds ./scaffold.mft -chromosomes ./chromosome.mft \ + -gene_weights ./gene_weights.mft \ + -annots ./annots.mft -locus_lnk ./locus_link.mft -locus_types ./locus_types.mft \ + -S NONE -genbank-mode -out_dir ./output/ + + mkdir -p raw/scaf + mv ./output/scaf/EGAPx_Test_Assembly/*.asn ./raw/scaf + for f in ./raw/scaf/*.asn; do + of=./output/scaf/EGAPx_Test_Assembly/`basename \$f` + asn_cleanup -basic -i \$f -o \$of + cat \$of >> output/annotated_genome.asn + done + + # NB if (when) chromosomes is not empty the same logic should be applied to chrom directories + if [ -s ./output/chrom/EGAPx_Test_Assembly/*.asn ]; then + mkdir -p raw/chrom + mv ./output/chrom/EGAPx_Test_Assembly/*.asn ./raw/chrom + for f in ./raw/chrom/*.asn; do + of=./output/chrom/EGAPx_Test_Assembly/`basename \$f` + asn_cleanup -basic -i \$f -o \$of + cat \$of >> output/annotated_genome.asn + done + fi + + mkdir -p output/val/EGAPx_Test_Assembly + for f in ./output/scaf/EGAPx_Test_Assembly/*.asn; do + asnvalidate -Q 0 -asn-cache ./asncache/ -v 4 -A -X -Z -o ./output/val/EGAPx_Test_Assembly/`basename \$f .asn`.val -i \$f + done + + # joint manifest is scaffolds, chromosomes, and organelles (not implemented here) + # take it from annotated_genome.asn + echo "./output/annotated_genome.asn" > ./joint.mft + + mkdir -p output/stats + asn_stats -input-manifest ./joint.mft
-o output/stats/feature_counts.txt -counts-xml-output output/stats/feature_counts.xml -stats-xml-output output/stats/feature_stats.xml -t -break-by assembly-unit -asn-cache ./asncache/ -gencoll-asn $gencoll_asn -genbank-mode + """ + stub: + """ + mkdir -p output/ACCEPT + echo "1" > output/ACCEPT/something.asn + mkdir -p output/scaf/EGAPx_Test_Assembly/ + echo "1" > output/scaf/EGAPx_Test_Assembly/genome.asn + mkdir -p output/val/EGAPx_Test_Assembly/ + echo "1" > output/val/EGAPx_Test_Assembly/genome.val + mkdir -p output/stats + echo "1" > output/stats/feature_counts.txt + echo "1" > output/annotated_genome.asn + echo "1" > output/annotation_data.cmt + + + echo "1" > output/final_asn.log + """ +} diff --git a/nf/subworkflows/ncbi/gnomon/gnomon_biotype/main.nf b/nf/subworkflows/ncbi/annot_proc/gnomon_biotype/main.nf similarity index 72% rename from nf/subworkflows/ncbi/gnomon/gnomon_biotype/main.nf rename to nf/subworkflows/ncbi/annot_proc/gnomon_biotype/main.nf index fd10c2a..1c223cd 100644 --- a/nf/subworkflows/ncbi/gnomon/gnomon_biotype/main.nf +++ b/nf/subworkflows/ncbi/annot_proc/gnomon_biotype/main.nf @@ -43,18 +43,19 @@ process run_gnomon_biotype { """ mkdir -p output mkdir -p ./asncache/ - prime_cache -cache ./asncache/ -ifmt asnb-seq-entry -i ${swiss_prot_asn} -oseq-ids /dev/null -split-sequences - echo "${raw_blastp_hits.join('\n')}" > raw_blastp_hits.mft + prime_cache -cache ./asncache/ -ifmt asnb-seq-entry -i ${swiss_prot_asn} -oseq-ids spids -split-sequences + prime_cache -cache ./asncache/ -ifmt asnb-seq-entry -i ${models_files} -oseq-ids gnids -split-sequences lds2_indexer -source genome/ -db LDS2 + echo "${raw_blastp_hits.join('\n')}" > raw_blastp_hits.mft merge_blastp_hits -asn-cache ./asncache/ -nogenbank -lds2 LDS2 -input-manifest raw_blastp_hits.mft -o prot_hits.asn echo "${models_files.join('\n')}" > models.mft echo "prot_hits.asn" > prot_hits.mft echo "${splices_files.join('\n')}" > splices.mft if [ -z "$denylist" ] then - gnomon_biotype -gc 
$gencoll_asn -asn-cache ./asncache/ -nogenbank -gnomon_models models.mft -o output/biotypes.tsv -o_prots_rpt output/prots_rpt.tsv -prot_hits prot_hits.mft -prot_splices splices.mft -reftrack-server 'NONE' -allow_lt631 true + gnomon_biotype -gc $gencoll_asn -asn-cache ./asncache/ -lds2 ./LDS2 -nogenbank -gnomon_models models.mft -o output/biotypes.tsv -o_prots_rpt output/prots_rpt.tsv -prot_hits prot_hits.mft -prot_splices splices.mft -reftrack-server 'NONE' -allow_lt631 true else - gnomon_biotype -gc $gencoll_asn -asn-cache ./asncache/ -nogenbank -gnomon_models models.mft -o output/biotypes.tsv -o_prots_rpt output/prots_rpt.tsv -prot_denylist $denylist -prot_hits prot_hits.mft -prot_splices splices.mft -reftrack-server 'NONE' -allow_lt631 true + gnomon_biotype -gc $gencoll_asn -asn-cache ./asncache/ -lds2 ./LDS2 -nogenbank -gnomon_models models.mft -o output/biotypes.tsv -o_prots_rpt output/prots_rpt.tsv -prot_denylist $denylist -prot_hits prot_hits.mft -prot_splices splices.mft -reftrack-server 'NONE' -allow_lt631 true fi """ stub: diff --git a/nf/subworkflows/ncbi/gnomon/locus_link/main.nf b/nf/subworkflows/ncbi/annot_proc/locus_link/main.nf similarity index 76% rename from nf/subworkflows/ncbi/gnomon/locus_link/main.nf rename to nf/subworkflows/ncbi/annot_proc/locus_link/main.nf index e4d3ffb..bb50a06 100644 --- a/nf/subworkflows/ncbi/gnomon/locus_link/main.nf +++ b/nf/subworkflows/ncbi/annot_proc/locus_link/main.nf @@ -21,7 +21,7 @@ workflow locus_link { parameters // Map : extra parameter and parameter update main: default_params = "" - effective_params = merge_params(default_params, parameters, 'locus_link') + effective_params = merge_params(default_params, parameters, 'locus_type') run_locus_link(best_refseq_prot_hit, orthologs, annotation, gencoll_asn, gnomon_lds2_source, best_prot_hit, track_loci, comparisons, curr_prev_compare, gnomon_biotypes, lxr_data, proteins_asn, name_from_ortholog, default_params) @@ -48,7 +48,7 @@ process run_locus_link { path 
comparisons path curr_prev_compare path gnomon_biotypes - path lxr_data + path lxr_data, stageAs: 'lxr_tracking_data.txt' path proteins_asn path name_from_ortholog val parameters @@ -74,34 +74,12 @@ process run_locus_link { echo "${curr_prev_compare.join('\n')}" > curr_prev_compare.mft echo "${comparisons.join('\n')}" > comparisons.mft str="" - if [ ! -z "$orthologs" ] - then - str="\$str -orthologs $orthologs" - fi - if [ ! -z "$lxr_data" ] - then - str="\$str -lxr $lxr_data" - else - touch lxr_data - str="\$str -lxr lxr_data" - fi + str="\$str -orthologs $orthologs" + str="\$str -lxr $lxr_data" + str="\$str -locus_track $track_loci" + str="\$str -name_from_ortholog_rpt $name_from_ortholog" - if [ ! -z "$track_loci" ] - then - str="\$str -locus_track $track_loci" - else - touch track_loci - str="\$str -locus_track track_loci" - fi - if [ ! -z "$name_from_ortholog" ] - then - str="\$str -name_from_ortholog_rpt $name_from_ortholog" - else - touch name_from_ortholog - str="\$str -name_from_ortholog_rpt name_from_ortholog" - fi - - locus_type -no_acc_reserve -annots annotation.mft -gc $gencoll_asn -gnomon_biotype $gnomon_biotypes -o_stats output/stats.xml -o_locustypes output/locustypes.tsv -o_locus_lnk output/locus.lnk -annotcmp comparisons.mft -annotcmp_pb curr_prev_compare.mft \$str + locus_type -asn-cache ./asncache/ -lds2 ./LDS2 -nogenbank -no_acc_reserve -annots annotation.mft -gc $gencoll_asn -gnomon_biotype $gnomon_biotypes -o_stats output/stats.xml -o_locustypes output/locustypes.tsv -o_locus_lnk output/locus.lnk -annotcmp comparisons.mft -annotcmp_pb curr_prev_compare.mft \$str """ stub: """ diff --git a/nf/subworkflows/ncbi/annot_proc/locus_track/main.nf b/nf/subworkflows/ncbi/annot_proc/locus_track/main.nf new file mode 100644 index 0000000..744b842 --- /dev/null +++ b/nf/subworkflows/ncbi/annot_proc/locus_track/main.nf @@ -0,0 +1,70 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl=2 + +include { merge_params } from '../../utilities' + 
+///netmnt/vast01/gpi/regr/GPIPE_REGR1/system/2024-05-16.prod.build26027/bin/compare_annots +// -q_scope_type ID +// -q_scope_args '' +// -q_ids ./gc_get_molecules.8280202/out/no_organelle.gi +// -alns ./locus_track_compare.8281472/inp/prev_build_align.mft +// -s_scope_type annots-mft-lds +// -s_scope_args ./locus_track_compare.8281472/inp/curr_annotation.mft +// -s_ids ./passthrough_gcaccess.8279042/out/no_organelle.gi +// -o_asn ./locus_track_compare.8281472/out/curr_prev.antcmp.asn +// -o_tab ./locus_track_compare.8281472/out/curr_prev.antcmp.tab + +///netmnt/vast01/gpi/regr/GPIPE_REGR1/system/2024-05-16.prod.build26027/bin/locus_track +// -o_loci ./locus_track_compare.8281472/out/locus_track.rpt +// -o_conflicts ./locus_track_compare.8281472/out/conflicts.rpt +// -o_evidence ./locus_track_compare.8281472/out/evidence.rpt +// -annotset (mft of annot_builder ACCEPT files) curr_annot_set.mft +// -gc (annot assembly, out gc_create) gencoll.asn +// -pb_alns (no pb, empty) prev_build_align.mft +// -lxr ./locus_track_compare.8281472/out/lxr_tracking_data.txt +//// LXR is a File_in to locus_track; it's from a database call that happens in locus_track's action node, so it's saved to the out dir +// -track_Assemblies var/intra_assm.mft +// -track_PrevBuild (no pb) var/curr_prev.mft + +workflow locus_track { + take: + annotation + gencoll_asn + lxr_data + parameters // Map : extra parameter and parameter update + main: + default_params = "" + effective_params = merge_params(default_params, parameters, 'locus_track') + run_locus_track(annotation, gencoll_asn, lxr_data, default_params) + emit: + track_rpt = run_locus_track.out.track_rpt +} + + + +process run_locus_track { + input: + path annotation + path gencoll_asn + path lxr_data //, stageAs: 'lxr_tracking_data.txt' + val parameters + output: + path ('output/locus_track.rpt'), emit: track_rpt + script: + """ + mkdir -p output + ##mkdir -p ./asncache/ + ##prime_cache -cache ./asncache/ -ifmt asnb-seq-entry -i ....
-oseq-ids /dev/null -split-sequences + + echo "${annotation.join('\n')}" > annotation.mft + + locus_track -nogenbank -annotset annotation.mft -gc $gencoll_asn -lxr $lxr_data -o_loci ./output/locus_track.rpt -o_conflicts ./output/conflicts.rpt -o_evidence ./output/evidence.rpt + """ + stub: + """ + mkdir -p output + touch output/locus_track.rpt + touch output/lxr_tracking_data.txt + """ +} + diff --git a/nf/subworkflows/ncbi/annot_proc/main.nf b/nf/subworkflows/ncbi/annot_proc/main.nf new file mode 100644 index 0000000..51644cc --- /dev/null +++ b/nf/subworkflows/ncbi/annot_proc/main.nf @@ -0,0 +1,107 @@ +#!/usr/bin/env nextflow +// gnomon plane workflow +// route data to tasks + +nextflow.enable.dsl=2 + +params.import_prefix = "../../../../nf/subworkflows/ncbi/" // redirected during testing + +include { prot_gnomon_prepare } from "${params.import_prefix}annot_proc/prot_gnomon_prepare/main" +include { diamond_worker} from "${params.import_prefix}annot_proc/diamond_identify/main" +include { best_protein_hits } from "${params.import_prefix}annot_proc/best_protein_hits/main" +include { gnomon_biotype} from "${params.import_prefix}annot_proc/gnomon_biotype/main" +include { fetch_swiss_prot_asn; get_swiss_prot_ids } from "../shared/diamond/main" +include { orthology_plane } from "../orthology/main" +include { locus_track } from "${params.import_prefix}annot_proc/locus_track/main" +include { locus_link } from "${params.import_prefix}annot_proc/locus_link/main" + +include { annot_builder } from "${params.import_prefix}annot_proc/annot_builder/main" +include { final_asn_markup } from "${params.import_prefix}annot_proc/final_asn_markup/main" +include { annotwriter } from "${params.import_prefix}annot_proc/annotwriter/main" + + +params.intermediate = false + + +// curr_tax_id|is_curated_genome|symbol_format_class|extrn_naming_authority|use_in_lxr_client||orth_source_tax_id|schema_ver +// tax_id|0|symbol_format_class|1|0|y|ortho_tax_id(zero for now?)|2 +process 
print_fake_lxr_data { + input: + val tax_id + val symbol_format_class + val ortho_tax_id + output: + path('fake_lxr.tsv'), emit: 'lxr_data' + script: + """ + echo '_TAX_ATTRS_' > 'fake_lxr.tsv' + echo '$tax_id|0|$symbol_format_class|1|0|y|$ortho_tax_id|2' >> 'fake_lxr.tsv' + """ + stub: + """ + echo '_TAX_ATTRS_' > 'fake_lxr.tsv' + echo '9606|0|allupper|1|0|y|9606|2' >> 'fake_lxr.tsv' + """ +} + +workflow annot_proc_plane { + take: + gnomon_models + gencoll_asn + genome_asn + genome_asnb + scaffolds + + //orthologs + //name_from_ortholog + + tax_id // NCBI tax id of the closest taxon to the genome + symbol_format_class // string for how to format gene names + ortho_files /// ortho reference input files + reference_sets // reference sets, for now only swissprot + task_params // task parameters for every task + main: + // Post GNOMON + // might come its own plane + def swiss_prot_asn = fetch_swiss_prot_asn(reference_sets) + def swiss_prot_ids = get_swiss_prot_ids(swiss_prot_asn) + + prot_gnomon_prepare(gnomon_models, task_params.get('prot_gnomon_prepare', [:])) + // Seed Protein-Model Hits + diamond_worker(prot_gnomon_prepare.out.prot_ids, swiss_prot_ids, gnomon_models, swiss_prot_asn, task_params.get('diamond_identify', [:])) + best_protein_hits(gnomon_models, swiss_prot_asn, diamond_worker.out.alignments , task_params.get('best_protein_hits', [:])) + gnomon_biotype(gnomon_models,/*splices_file -- constant*/ [], /*denylist -- constant*/ [], gencoll_asn, swiss_prot_asn, [], diamond_worker.out.alignments,task_params.get('gnomon_biotype', [:])) + + annot_builder(gencoll_asn, gnomon_models, genome_asn, task_params.get('annot_builder', [:])) + def accept_ftable_file = annot_builder.out.accept_ftable_annot + def annot_files = annot_builder.out.annot_files + + lxr_data = print_fake_lxr_data(tax_id, symbol_format_class, ortho_files.get('taxid',0)).lxr_data + + orthology_plane(genome_asnb, gencoll_asn, gnomon_models, annot_files, ortho_files, task_params) + def orthologs = 
orthology_plane.out.orthologs + def name_from_ortholog = orthology_plane.out.name_from_ortholog + + + ///def lxr_data = [] + locus_track(accept_ftable_file, gencoll_asn, lxr_data, task_params.get('locus_track', [:]) ) + + locus_link(/*best_refseq_prot_hit -- best protein hits from refseq plane*/ [], orthologs, accept_ftable_file, + gencoll_asn, gnomon_models, best_protein_hits.out.alignments , locus_track.out.track_rpt, /*comparisons*/ [], /*curr_prev_compare*/ [], + gnomon_biotype.out.biotypes, lxr_data, swiss_prot_asn, name_from_ortholog, task_params.get('locus_link', [:])) + + final_asn_markup(gencoll_asn, genome_asn, scaffolds, /*chromosomes ASN*/ [], annot_builder.out.accept_asn.collect(), locus_link.out.locus, locus_link.out.locustypes, task_params.get('final_asn_markup', [:]) ) + + annotwriter(accept_ftable_file, task_params.get('annotwriter', [:])) + + emit: + locus = locus_link.out.locus + accept_annot_file = accept_ftable_file + final_asn_out = final_asn_markup.out.outputs + to_convert = final_asn_markup.out.to_convert + validated = final_asn_markup.out.validated + stats = final_asn_markup.out.stats + annotated_genome_asn = final_asn_markup.out.annotated_genome_asn + annotation_data_comment = final_asn_markup.out.annotation_data_comment + gff_annotated_file = annotwriter.out.annoted_file +} diff --git a/nf/subworkflows/ncbi/gnomon/prot_gnomon_prepare/main.nf b/nf/subworkflows/ncbi/annot_proc/prot_gnomon_prepare/main.nf similarity index 100% rename from nf/subworkflows/ncbi/gnomon/prot_gnomon_prepare/main.nf rename to nf/subworkflows/ncbi/annot_proc/prot_gnomon_prepare/main.nf diff --git a/nf/subworkflows/ncbi/default/align_sort_sa/main.nf b/nf/subworkflows/ncbi/default/align_sort_sa/main.nf index af3ec37..c4b8be8 100644 --- a/nf/subworkflows/ncbi/default/align_sort_sa/main.nf +++ b/nf/subworkflows/ncbi/default/align_sort_sa/main.nf @@ -35,7 +35,8 @@ process run_align_sort { mkdir -p LDS_Index lds2_indexer -source LDS_Index echo "${alignments.join('\n')}" 
> alignments.mft - align_sort $parameters -input-manifest alignments.mft -o output/sorted_aligns.asn -lds2 LDS_Index/lds2.db + mkdir -p tmp + align_sort $parameters -tmp tmp -input-manifest alignments.mft -o output/sorted_aligns.asn -lds2 LDS_Index/lds2.db """ stub: """ diff --git a/nf/subworkflows/ncbi/default/convert_annotations/main.nf b/nf/subworkflows/ncbi/default/convert_annotations/main.nf new file mode 100644 index 0000000..cea1ce4 --- /dev/null +++ b/nf/subworkflows/ncbi/default/convert_annotations/main.nf @@ -0,0 +1,91 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl=2 + +include { merge_params } from '../../utilities' + + +workflow convert_annotations { + take: + asn_file // Channel: ASN file with annotations + parameters // Map : extra parameter and parameter update + // Use "annotwriter_gff" for GFF3 and "annotwriter_gtf" for GTF in section 'convert_annotations' + main: + run_converter(asn_file, merge_params("", parameters, 'annotwriter_gff'), merge_params("", parameters, 'annotwriter_gtf')) + emit: + genomic_gff = run_converter.out.genomic_gff + genomic_gtf = run_converter.out.genomic_gtf + genomic_fasta = run_converter.out.genomic_fasta + transcripts_fasta = run_converter.out.transcripts_fasta + cds_fasta = run_converter.out.cds_fasta + proteins_fasta = run_converter.out.proteins_fasta +} + + +process run_converter { + input: + path asn_files, stageAs: 'asn_inputs/*' + val gff_params + val gtf_params + output: + path 'output/*.genomic.gff', emit: 'genomic_gff' + path 'output/*.genomic.gtf', emit: 'genomic_gtf' + path 'output/*.genomic.fna', emit: 'genomic_fasta' + path 'output/*.transcripts.fna', emit: 'transcripts_fasta' + path 'output/*.cds.fna', emit: 'cds_fasta' + path 'output/*.proteins.faa', emit: 'proteins_fasta' + script: + //def basename = asn_file.baseName.toString() + def basename = asn_files.first().baseName.toString() + """ + echo "${asn_files.join('\n')}" > ${basename}.mft + mkdir -p output + ##if [ -s ${asn_files} ]; then + mkdir -p 
tmpout + for af in ${asn_files} + do + afb=\$(basename \$af) + annotwriter ${gff_params} -nogenbank -i \${af} -format gff3 -o tmpout/\${afb}.genomic.gff + annotwriter ${gtf_params} -nogenbank -i \${af} -format gtf -o tmpout/\${afb}.genomic.gtf + asn2fasta -nogenbank -i \${af} -nucs-only |sed -e 's/^>lcl|\\(.*\\)/>\\1/' > tmpout/\${afb}.genomic.fna + asn2fasta -nogenbank -i \${af} -feats rna_fasta -o tmpout/\${afb}.transcripts.fna + asn2fasta -nogenbank -i \${af} -feats fasta_cds_na -o tmpout/\${afb}.cds.fna + asn2fasta -nogenbank -i \${af} -prots-only -o tmpout/\${afb}.proteins.faa + done + cat tmpout/*.gff > output/complete.genomic.gff + cat tmpout/*.gtf > output/complete.genomic.gtf + cat tmpout/*.genomic.fna > output/complete.genomic.fna + cat tmpout/*.transcripts.fna > output/complete.transcripts.fna + cat tmpout/*.cds.fna > output/complete.cds.fna + cat tmpout/*.proteins.faa > output/complete.proteins.faa + rm tmpout/* + + ##annotwriter ${gff_params} -nogenbank -i ${asn_files} -format gff3 -o output/${basename}.genomic.gff + ##annotwriter ${gtf_params} -nogenbank -i ${asn_files} -format gtf -o output/${basename}.genomic.gtf + ##asn2fasta -nogenbank -nucs-only -indir asn_inputs -o - |sed -e 's/^>lcl|\\(.*\\)/>\\1/' >output/${basename}.genomic.fna + ##asn2fasta -nogenbank -feats rna_fasta -indir asn_inputs -o output/${basename}.transcripts.fna + ##asn2fasta -nogenbank -feats fasta_cds_na -i -indir asn_inputs -o output/${basename}.cds.fna + ##asn2fasta -nogenbank -prots-only -i -indir asn_inputs -o output/${basename}.proteins.faa + ##else + ## touch output/${basename}.genomic.gff + ## touch output/${basename}.genomic.gtf + ## touch output/${basename}.genomic.fna + ## touch output/${basename}.transcripts.fna + ## touch output/${basename}.cds.fna + ## touch output/${basename}.proteins.faa + ##fi + """ + + stub: + def basename = asn_files.first().baseName.toString() + print(asn_files) + print(basename) + """ + mkdir -p output + echo "Genomic GFF" > 
output/${basename}.genomic.gff + echo "Genomic GTF" > output/${basename}.genomic.gtf + echo "Genomic FASTA" > output/${basename}.genomic.fna + echo "Transcript FASTA" > output/${basename}.transcripts.fna + echo "CDS FASTA" > output/${basename}.cds.fna + echo "Protein FASTA" > output/${basename}.proteins.faa + """ +} diff --git a/nf/subworkflows/ncbi/gnomon-training-iteration/gnomon_training_iterations/main.nf b/nf/subworkflows/ncbi/gnomon-training-iteration/gnomon_training_iterations/main.nf new file mode 100644 index 0000000..cd71997 --- /dev/null +++ b/nf/subworkflows/ncbi/gnomon-training-iteration/gnomon_training_iterations/main.nf @@ -0,0 +1,98 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl=2 +//nextflow.preview.recursion=true + + +include { gnomon_training_iteration; gnomon_training_iteration as gnomon_training_iteration2; + gnomon_training_iteration as gnomon_training_iteration3; gnomon_training_iteration as gnomon_training_iteration4 } from './../utilities' + + +workflow gnomon_training_iterations { + take: + models_file + genome_asn + proteins_asn + chainer_alignments + chainer_evidence_denylist + chainer_gap_fill_allowlist + chainer_trusted_genes + chainer_scaffolds + gnomon_softmask_lds2 + gnomon_softmask_lds2_source + gnomon_scaffolds + max_intron + parameters + main: + gnomon_training_iteration(models_file, genome_asn, proteins_asn ,chainer_alignments,chainer_evidence_denylist,chainer_gap_fill_allowlist, + chainer_trusted_genes, chainer_scaffolds, gnomon_softmask_lds2, + gnomon_softmask_lds2_source, gnomon_scaffolds, max_intron, parameters) + gnomon_training_iteration2(gnomon_training_iteration.out.hmm_params_file, genome_asn, proteins_asn ,chainer_alignments, + chainer_evidence_denylist,chainer_gap_fill_allowlist, chainer_trusted_genes, chainer_scaffolds, gnomon_softmask_lds2, + gnomon_softmask_lds2_source, gnomon_scaffolds, max_intron, parameters) + gnomon_training_iteration3(gnomon_training_iteration2.out.hmm_params_file, genome_asn, proteins_asn 
,chainer_alignments, + chainer_evidence_denylist,chainer_gap_fill_allowlist, chainer_trusted_genes, chainer_scaffolds, gnomon_softmask_lds2, + gnomon_softmask_lds2_source, gnomon_scaffolds, max_intron, parameters) + gnomon_training_iteration4(gnomon_training_iteration3.out.hmm_params_file, genome_asn, proteins_asn ,chainer_alignments, + chainer_evidence_denylist,chainer_gap_fill_allowlist, chainer_trusted_genes, chainer_scaffolds, gnomon_softmask_lds2, + gnomon_softmask_lds2_source, gnomon_scaffolds, max_intron, parameters) + + emit: + hmm_params_file = gnomon_training_iteration4.out.hmm_params_file +} + +workflow gnomon_no_training { + take: + hmm + main: + process_no_training(hmm) + emit: + hmm_params_file = process_no_training.out.file +} + +process process_no_training +{ + input: + val hmm + output: + path "*.params" , emit: 'file' + script: + """ + curl -O ${hmm} + """ + stub: + """ + touch hmm.params + """ +} + + +/* + +//experimental (preview.recursion) +// to be used in the future +workflow gnomon_training_iterations { + take: + models_file + genome_asn + proteins_asn + chainer_alignments + chainer_evidence_denylist + chainer_gap_fill_allowlist + chainer_trusted_genes + chainer_scaffolds + gnomon_softmask_lds2 + gnomon_softmask_lds2_source + gnomon_scaffolds + max_intron + parameters + main: + gnomon_training_iteration + .recurse(models_file, genome_asn, proteins_asn ,chainer_alignments,chainer_evidence_denylist,chainer_gap_fill_allowlist, + chainer_trusted_genes, chainer_scaffolds, gnomon_softmask_lds2, + gnomon_softmask_lds2_source, gnomon_scaffolds, max_intron, parameters) + .times(4) + emit: + hmm_params_file = gnomon_training_iteration.out.hmm_params_file +} + +*/ diff --git a/nf/subworkflows/ncbi/gnomon/chainer_wnode/main.nf b/nf/subworkflows/ncbi/gnomon/chainer_wnode/main.nf index 478eaf8..8baf995 100644 --- a/nf/subworkflows/ncbi/gnomon/chainer_wnode/main.nf +++ b/nf/subworkflows/ncbi/gnomon/chainer_wnode/main.nf @@ -45,6 +45,7 @@ workflow 
chainer_wnode { chains_slices = run_gpx_make_outputs.out.chains_slices evidence = run_gpx_make_outputs.out.evidence evidence_slices = run_gpx_make_outputs.out.evidence_slices + all = run_gpx_make_outputs.out.all } @@ -85,6 +86,7 @@ process generate_jobs { process run_chainer { + label 'long_job' input: path job path alignments @@ -147,6 +149,7 @@ process run_gpx_make_outputs { path "output/chains.*.out.gz.slices", emit: 'chains_slices' path "output/evidence.*.out.gz", emit: 'evidence', optional: true path "output/evidence.*.out.gz.slices", emit: 'evidence_slices', optional: true + path "output/*", emit: 'all' script: """ ls -1 gpx_inputs/* > gpx_inputs.mft diff --git a/nf/subworkflows/ncbi/gnomon/gnomon_wnode/main.nf b/nf/subworkflows/ncbi/gnomon/gnomon_wnode/main.nf index a8b3f7c..cee6ca5 100644 --- a/nf/subworkflows/ncbi/gnomon/gnomon_wnode/main.nf +++ b/nf/subworkflows/ncbi/gnomon/gnomon_wnode/main.nf @@ -95,15 +95,7 @@ process annot { fi lds2=indexed_lds - if [ -n "$softmask_lds2" ]; then - # patch LDS2 to point to the source - files=\$(sqlite3 $softmask_lds2 -cmd "SELECT file_name FROM file" ".exit") - for f in \$files; do - base=\$(basename \$f) - sqlite3 $softmask_lds2 -cmd "UPDATE file SET file_name = '\$base' WHERE file_name = '\$f'" ".exit" - done - lds2+=",$softmask_lds2" - elif [ -n "$softmask" ]; then + if [ -n "$softmask" ]; then mkdir sm_src mv $softmask ./sm_src/ lds2_indexer -source ./sm_src/ -db softmask_lds2 diff --git a/nf/subworkflows/ncbi/gnomon/main.nf b/nf/subworkflows/ncbi/gnomon/main.nf index 5787e1b..302d45a 100644 --- a/nf/subworkflows/ncbi/gnomon/main.nf +++ b/nf/subworkflows/ncbi/gnomon/main.nf @@ -4,17 +4,14 @@ nextflow.enable.dsl=2 -include { chainer_wnode as chainer } from './chainer_wnode/main' -include { gnomon_wnode } from './gnomon_wnode/main' -include { prot_gnomon_prepare } from './prot_gnomon_prepare/main' -include { gnomon_training_iterations } from '../gnomon-training-iteration/main' +params.import_prefix = 
"../../../../nf/subworkflows/ncbi/" // redirected during testing -include { diamond_worker} from './diamond/main' -include { best_protein_hits } from './protein_filter/main' -include { gnomon_biotype} from './gnomon_biotype/main' -include { fetch_swiss_prot_asn; get_swiss_prot_ids } from '../shared/diamond/main' -include { diamond_orthology } from '../orthology/diamond_orthology/main' -include { locus_link } from './locus_link/main' +include { chainer_wnode as chainer } from "./${params.import_prefix}gnomon/chainer_wnode/main" +include { gnomon_wnode } from "./${params.import_prefix}gnomon/gnomon_wnode/main" +include { gnomon_training_iterations } from "./${params.import_prefix}gnomon-training-iteration/gnomon_training_iterations/main" +include { gnomon_biotype} from "./${params.import_prefix}annot_proc/gnomon_biotype/main" +//include { locus_track } from './locus_track/main' +//include { locus_link } from './locus_link/main' params.intermediate = false @@ -27,12 +24,13 @@ workflow gnomon_plane { proteins_asn alignments // list of all relevent input alignments + proteins_trusted // Alternative parameters, one of them should be set // tax_id - NCBI tax id of the closest taxon to the genome // hmm_params - HMM parameters tax_id // NCBI tax id of the closest taxon to the genome hmm_params // HMM parameters - hmm_taxid // NCBI tax id of the taxon of the HMM + train_hmm // Boolean, whether to train HMM // softmask // softmask for GNOMON, optional max_intron // max intron length @@ -40,20 +38,20 @@ workflow gnomon_plane { main: // GNOMON def effective_hmm - if (tax_id == hmm_taxid) { + if (!train_hmm) { effective_hmm = hmm_params } else { effective_hmm = gnomon_training_iterations(hmm_params, genome_asn, proteins_asn, alignments, /* evidence_denylist */ [], /* gap_fill_allowlist */ [], - /* trusted_genes */ [], scaffolds, softmask, + [proteins_trusted].flatten(), scaffolds, softmask, softmask, scaffolds, max_intron, task_params) } - chainer(alignments, effective_hmm, /* 
evidence_denylist */ [], /* gap_fill_allowlist */ [], scaffolds, /* trusted_genes */ [], genome_asn, proteins_asn, task_params.get('chainer', [:])) + chainer(alignments, effective_hmm, /* evidence_denylist */ [], /* gap_fill_allowlist */ [], scaffolds, [proteins_trusted].flatten(), genome_asn, proteins_asn, task_params.get('chainer_wnode', [:])) def gn_models = [] - gnomon_wnode(scaffolds, chainer.out.chains, chainer.out.chains_slices, effective_hmm, [], softmask, genome_asn, proteins_asn, task_params.get('gnomon', [:])) + gnomon_wnode(scaffolds, chainer.out.chains, chainer.out.chains_slices, effective_hmm, [], softmask, genome_asn, proteins_asn, task_params.get('gnomon_wnode', [:])) emit: gnomon_models = gnomon_wnode.out.outputs @@ -62,35 +60,3 @@ workflow gnomon_plane { -workflow post_gnomon_plane { - take: - gnomon_models - gencoll_asn - orthologs - - - // Alternative parameters, one of them should be set - // tax_id - NCBI tax id of the closest taxon to the genome - // hmm_params - HMM parameters - tax_id // NCBI tax id of the closest taxon to the genome - task_params // task parameters for every task - main: - // Post GNOMON - // might come its own plane - def swiss_prot_asn = fetch_swiss_prot_asn() - def swiss_prot_ids = get_swiss_prot_ids(swiss_prot_asn) - - prot_gnomon_prepare(gnomon_models, task_params.get('prot_gnomon_prepare', [:])) - // Seed Protein-Model Hits - diamond_worker(prot_gnomon_prepare.out.prot_ids, swiss_prot_ids, gnomon_models, swiss_prot_asn, task_params.get('diamond', [:])) - best_protein_hits(gnomon_models, swiss_prot_asn, diamond_worker.out.alignments , task_params.get('protein_filter', [:])) - - gnomon_biotype([] /*models*/,/*splices_file -- constant*/ [], /*denylist -- constant*/ [], gencoll_asn, swiss_prot_asn, gnomon_models, diamond_worker.out.alignments,task_params.get('gnomon_biotype', [:])) - locus_link(/*best_refseq_prot_hit -- best protein hits from refseq plane*/ [], orthologs, [] /*annot_builder.out.annot_files*/, - 
gencoll_asn, gnomon_models, best_protein_hits.out.alignments , /*track_loci*/ [], /*comparisons*/ [], /*curr_prev_compare*/ [], - gnomon_biotype.out.biotypes, /*lxr_data*/ [], swiss_prot_asn, /*name_from_ortholog */ [], task_params.get('locus_link', [:])) - - - emit: - locus = locus_link.out.locus -} diff --git a/nf/subworkflows/ncbi/main.nf b/nf/subworkflows/ncbi/main.nf index fe19bd1..8071344 100644 --- a/nf/subworkflows/ncbi/main.nf +++ b/nf/subworkflows/ncbi/main.nf @@ -6,12 +6,14 @@ nextflow.enable.dsl=2 include { rnaseq_short_plane } from './rnaseq_short/main' include { target_proteins_plane } from './target_proteins/main' -include { gnomon_plane; post_gnomon_plane } from './gnomon/main' +include { gnomon_plane } from './gnomon/main' include { orthology_plane } from './orthology/main' +include { annot_proc_plane } from './annot_proc/main' include { setup_genome; setup_proteins } from './setup/main' -include { annot_builder } from './default/annot_builder/main' -include { annotwriter } from './default/annotwriter/main' - +//include { annot_builder } from './annot_proc/annot_builder/main' +//include { final_asn_markup } from './annot_proc/final_asn/main' +//include { annotwriter } from './annot_proc/annotwriter/main' +include {convert_annotations } from './default/convert_annotations/main' params.intermediate = false params.use_orthology = false @@ -20,8 +22,9 @@ params.use_post_gnomon = false workflow egapx { take: - genome // path to genome - proteins // path to proteins, optional + genome // path to genome + proteins // path to proteins, optional + proteins_trusted // path to trusted proteins, optional // Alternative groups of parameters, one of them should be set // reads_query - SRA query in the form accepted by NCBI @@ -38,13 +41,16 @@ workflow egapx { // tax_id - NCBI tax id of the closest taxon to the genome // hmm_params - HMM parameters tax_id // NCBI tax id of the closest taxon to the genome + symbol_format_class // string for how to create gene 
names hmm_params // HMM parameters - hmm_taxid // NCBI tax id of the HMM + train_hmm // Boolean, whether to train HMM // softmask // softmask for GNOMON, optional // max_intron // max intron length genome_size_threshold // the threshold for calculating actual max intron length + ortho_files // files reference genome sequence and annotation for find_orthology + reference_sets // reference sets, for now only swissprot task_params // task parameters for every task main: print "workflow.container: ${workflow.container}" @@ -87,30 +93,44 @@ workflow egapx { def gnomon_models = [] def effective_hmm - gnomon_plane(genome_asn, scaffolds, gencoll_asn, proteins_asn, alignments, tax_id, hmm_params, hmm_taxid, softmask, eff_max_intron, task_params) + gnomon_plane(genome_asn, scaffolds, gencoll_asn, proteins_asn, alignments, proteins_trusted, tax_id, hmm_params, train_hmm, softmask, eff_max_intron, task_params) gnomon_models = gnomon_plane.out.gnomon_models // outputs - annot_builder(gencoll_asn, gnomon_models, genome_asn, task_params.get('annot_builder', [:])) - def accept_annot_file = annot_builder.out.accept_ftable_annot - def annot_files = annot_builder.out.annot_files - - if (params.use_orthology) { - // ORTHOLOGY - orthology_plane(genome_asnb, gencoll_asn, gnomon_models, annot_files, task_params) - def orthologs = orthology_plane.out.orthologs - if (params.use_post_gnomon) { - //POST GNOMON - post_gnomon_plane(gnomon_models, gencoll_asn, orthologs, tax_id, task_params) - } - } - - annotwriter(accept_annot_file, [:]) - annotwriter.out.annoted_file - + + def accept_annot_file = [] + def gff_annotated_file = [] + def final_asn_out = [] + def locus_out = [] + def stats_dir = [] + def annotated_genome_file = [] + def annotation_data_comment_file = [] + annot_proc_plane(gnomon_models, gencoll_asn, genome_asn, genome_asnb, scaffolds, tax_id, symbol_format_class, ortho_files, reference_sets, task_params) + locus_out = annot_proc_plane.out.locus + final_asn_out = 
annot_proc_plane.out.final_asn_out + accept_annot_file = annot_proc_plane.out.accept_annot_file + gff_annotated_file = annot_proc_plane.out.gff_annotated_file + stats_dir = annot_proc_plane.out.stats + annotated_genome_file = annot_proc_plane.out.annotated_genome_asn + annotation_data_comment_file = annot_proc_plane.out.annotation_data_comment + + convert_annotations(annot_proc_plane.out.to_convert, task_params.get('convert_annotations', [:])) + emit: - out_files = annotwriter.out.annoted_file - annot_builder_output = annot_builder.out.outputs - // locus = post_gnomon_plane.out.locus + out_files = gff_annotated_file + out_ggff = convert_annotations.out.genomic_gff + out_ggtf = convert_annotations.out.genomic_gtf + out_gfa = convert_annotations.out.genomic_fasta + out_rna_fa = convert_annotations.out.transcripts_fasta + out_cds_fa = convert_annotations.out.cds_fasta + out_prot_fa = convert_annotations.out.proteins_fasta + annot_builder_output = annot_proc_plane.out.accept_annot_file + locus = locus_out + final_asn_outputs = final_asn_out + validated = annot_proc_plane.out.validated + stats = stats_dir + annotated_genome_asn = annotated_genome_file + annotation_data_comment = annotation_data_comment_file + //converted_outs = converted_outs } diff --git a/nf/subworkflows/ncbi/only_gnomon.nf b/nf/subworkflows/ncbi/only_gnomon.nf index 3544dba..a4f23d3 100644 --- a/nf/subworkflows/ncbi/only_gnomon.nf +++ b/nf/subworkflows/ncbi/only_gnomon.nf @@ -5,12 +5,11 @@ nextflow.enable.dsl=2 include { setup_genome; setup_proteins } from './setup/main' -include { get_hmm_params; run_get_hmm } from './default/get_hmm_params/main' include { chainer_wnode as chainer } from './gnomon/chainer_wnode/main' include { gnomon_wnode } from './gnomon/gnomon_wnode/main' -include { prot_gnomon_prepare } from './gnomon/prot_gnomon_prepare/main' -include { annot_builder } from './default/annot_builder/main' -include { annotwriter } from './default/annotwriter/main' +include { prot_gnomon_prepare } 
from './annot_proc/prot_gnomon_prepare/main' +include { annot_builder } from './annot_proc/annot_builder/main' +include { annotwriter } from './annot_proc/annotwriter/main' include { run_align_sort} from './default/align_sort_sa/main' params.intermediate = false @@ -29,7 +28,7 @@ workflow only_gnomon { // hmm_params - HMM parameters tax_id // NCBI tax id of the closest taxon to the genome hmm_params // HMM parameters - hmm_taxid // NCBI tax id of the taxon of the HMM + train_hmm // Boolean, whether to train HMM // softmask // softmask for GNOMON, optional task_params // task parameters for every task @@ -64,17 +63,7 @@ workflow only_gnomon { // GNOMON - def effective_hmm - if (hmm_params) { - effective_hmm = hmm_params - } else { - tmp_hmm = run_get_hmm(tax_id) - b = tmp_hmm | splitText( { it.split('\n') } ) | flatten - c = b | last - effective_hmm = c - } - - chainer(alignments, effective_hmm, /* evidence_denylist */ [], /* gap_fill_allowlist */ [], scaffolds, /* trusted_genes */ [], genome_asn, proteins_asn, task_params.get('chainer', [:])) + chainer(alignments, hmm_params, /* evidence_denylist */ [], /* gap_fill_allowlist */ [], scaffolds, /* trusted_genes */ [], genome_asn, proteins_asn, task_params.get('chainer', [:])) gnomon_wnode(scaffolds, chainer.out.chains, chainer.out.chains_slices, effective_hmm, [], softmask, genome_asn, proteins_asn, task_params.get('gnomon', [:])) def models = gnomon_wnode.out.outputs diff --git a/nf/subworkflows/ncbi/orthology/diamond_orthology/main.nf b/nf/subworkflows/ncbi/orthology/diamond_orthology/main.nf index 8efd88d..b8b2038 100644 --- a/nf/subworkflows/ncbi/orthology/diamond_orthology/main.nf +++ b/nf/subworkflows/ncbi/orthology/diamond_orthology/main.nf @@ -14,8 +14,8 @@ workflow diamond_orthology { proteins_asn parameters // Map : extra parameter and parameter update main: - String diamond_blastp_params = merge_params('--sam-query-len --very-sensitive --unal 0 --comp-based-stats 0 --masking 0', parameters, 
'diamond_orthology_blastp') - String diamond_regular_params = merge_params('-ofmt seq-align-set -query-fmt seq-ids -subject-fmt seq-ids -output-prefix hits', parameters, 'diamond_orthology') + String diamond_blastp_params = merge_params('--sam-query-len --very-sensitive --unal 0 --comp-based-stats 0 --masking 0', parameters, 'diamond_blastp') + String diamond_regular_params = merge_params('-ofmt seq-align-set -query-fmt seq-ids -subject-fmt seq-ids -output-prefix hits', parameters, 'diamond') String diamond_egap_params = '-blastp-args \'' + diamond_blastp_params + '\' ' + diamond_regular_params run_diamond_egap(gnomon_prot_ids, proteins_ids, gnomon_prot_asn, proteins_asn, diamond_egap_params) diff --git a/nf/subworkflows/ncbi/orthology/extract_products_from_models/main.nf b/nf/subworkflows/ncbi/orthology/extract_products_from_models/main.nf index d6b68e8..765bc50 100644 --- a/nf/subworkflows/ncbi/orthology/extract_products_from_models/main.nf +++ b/nf/subworkflows/ncbi/orthology/extract_products_from_models/main.nf @@ -11,7 +11,7 @@ workflow extract_products_from_models { parameters // Map : extra parameter and parameter update main: default_params = "" - effective_params = merge_params(default_params, parameters, 'extract_products_from_models') + effective_params = merge_params(default_params, parameters, 'extract_products') run_extract_products_from_models(models, effective_params) emit: diff --git a/nf/subworkflows/ncbi/orthology/find_orthologs/main.nf b/nf/subworkflows/ncbi/orthology/find_orthologs/main.nf index 475dd3f..0e81bf8 100644 --- a/nf/subworkflows/ncbi/orthology/find_orthologs/main.nf +++ b/nf/subworkflows/ncbi/orthology/find_orthologs/main.nf @@ -79,36 +79,46 @@ process run_find_orthologs { -ref_prot_url='https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/ortholog_references/9606/current/GCF_000001405.40_GRCh38.p14_protein.faa.gz' 
-ref_genf_url='https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/ortholog_references/9606/current/GCF_000001405.40_GRCh38.p14_genomic.fna.gz' -ref_geng_url='https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/ortholog_references/9606/current/GCF_000001405.40_GRCh38.p14_genomic.gff.gz' +//ref_prot_url='https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/ortholog_references/9606/current/GCF_000001405.40_GRCh38.p14_protein.faa.gz' +//ref_genf_url='https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/ortholog_references/9606/current/GCF_000001405.40_GRCh38.p14_genomic.fna.gz' +//ref_geng_url='https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/ortholog_references/9606/current/GCF_000001405.40_GRCh38.p14_genomic.gff.gz' +//ref_name_url='https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/ortholog_references/9606/name_from_ortholog.rpt' process fetch_ortholog_references { input: + //path ortho_files // map with file sources + path genomic_fna + path genomic_gff + path protein_faa + path name_from_rpt output: - path "output/p14_protein.faa", emit: "p14_protein_faa" - path "output/p14_genomic.fna", emit: "p14_genomic_fna" - path "output/gc1.annot.asnt.gz", emit: "annot_file" + path "output/ref_protein.faa", emit: "ref_protein_faa" + path "output/ref_genomic.fna", emit: "ref_genomic_fna" + path "output/ref_genomic.gff.asnt.gz", emit: "annot_file" + path "output/name_from_ortholog.rpt", emit: "name_from_ortholog" script: """ - curl -O '$ref_prot_url' - curl -O '$ref_genf_url' - curl -O '$ref_geng_url' - gunzip GCF_000001405.40_GRCh38.p14_protein.faa.gz - gunzip GCF_000001405.40_GRCh38.p14_genomic.fna.gz - #gunzip GCF_000001405.40_GRCh38.p14_genomic.gff.gz mkdir -p output - zcat GCF_000001405.40_GRCh38.p14_genomic.gff.gz | multireader -format gff3 | gzip -c > output/gc1.annot.asnt.gz - mv GCF_000001405.40_GRCh38.p14_protein.faa output/p14_protein.faa - mv GCF_000001405.40_GRCh38.p14_genomic.fna output/p14_genomic.fna - #mv GCF_000001405.40_GRCh38.p14_genomic.gff output/p14_genomic.gff + cp 
${genomic_fna} output/ref_genomic.fna.gz + cp ${protein_faa} output/ref_protein.faa.gz + cp ${name_from_rpt} output/name_from_ortholog.rpt + + gunzip output/ref_genomic.fna.gz + gunzip output/ref_protein.faa.gz + zcat ${genomic_gff} | multireader -format gff3 | gzip -c > output/ref_genomic.gff.asnt.gz """ stub: """ mkdir -p output - touch output/p14_protein.faa - touch output/p14_genomic.fna - touch output/gc1.annot.asnt.gz + touch output/ref_protein.faa + touch output/ref_genomic.fna + touch output/ref_genomic.gff.asnt.gz + touch output/name_from_ortholog.rpt + + ##cp ${genomic_fna} output/ref_genomic.fna + ##cp ${genomic_gff} output/ref_genomic.gff + ##cp ${protein_faa} output/ref_protein.faa + ##cp ${name_from_rpt} output/name_from_ortholog.rpt """ } diff --git a/nf/subworkflows/ncbi/orthology/main.nf b/nf/subworkflows/ncbi/orthology/main.nf index 2ce9fd5..71cc70f 100644 --- a/nf/subworkflows/ncbi/orthology/main.nf +++ b/nf/subworkflows/ncbi/orthology/main.nf @@ -4,11 +4,13 @@ nextflow.enable.dsl=2 +params.import_prefix = "../../../../nf/subworkflows/ncbi/" // redirected during testing -include { extract_products_from_models } from './extract_products_from_models/main' -include { find_orthologs; fetch_ortholog_references; } from './find_orthologs/main' -include { diamond_orthology } from './diamond_orthology/main' -include { setup_genome; setup_proteins } from './../setup/main' +include { extract_products_from_models } from "${params.import_prefix}orthology/extract_products_from_models/main" +include { find_orthologs;} from "${params.import_prefix}orthology/find_orthologs/main" +include { fetch_ortholog_references; } from "./find_orthologs/main" +include { diamond_orthology } from "${params.import_prefix}orthology/diamond_orthology/main" +include { setup_ext_genome; setup_ext_proteins } from './../setup/main' include { get_swiss_prot_ids as get_prot_ref_ids } from '../shared/diamond/main' params.intermediate = false @@ -20,12 +22,15 @@ workflow orthology_plane { 
gencoll_asn models annot_files + ortho_files // ref side input files task_params // task parameters for every task main: // Protein alignments - fetch_ortholog_references() - def (scaffolds_ref, gencoll_ref_asn, unpacked_genome_ref, genome_ref_asn, genome_ref_asnb) = setup_genome(fetch_ortholog_references.out.p14_genomic_fna, [], task_params.get('setup', [:])) - def (unpacked_proteins_ref, proteins_ref_asn, proteins_ref_asnb) = setup_proteins(fetch_ortholog_references.out.p14_protein_faa, task_params.get('setup', [:])) + //fetch_ortholog_references() + //fetch_ortholog_references(ortho_files) + fetch_ortholog_references(ortho_files['genomic.fna'], ortho_files['genomic.gff'], ortho_files['protein.faa'], ortho_files['name_from.rpt']) + def (scaffolds_ref, gencoll_ref_asn, unpacked_genome_ref, genome_ref_asn, genome_ref_asnb) = setup_ext_genome(fetch_ortholog_references.out.ref_genomic_fna, [], task_params.get('setup', [:])) + def (unpacked_proteins_ref, proteins_ref_asn, proteins_ref_asnb) = setup_ext_proteins(fetch_ortholog_references.out.ref_protein_faa, task_params.get('setup', [:])) def prot_ref_ids = get_prot_ref_ids(proteins_ref_asnb) //orthology plane extract_products_from_models(annot_files, task_params.get('extract_products_from_models', [:])) @@ -39,5 +44,6 @@ workflow orthology_plane { emit: orthologs = find_orthologs.out.orthologs + name_from_ortholog = fetch_ortholog_references.out.name_from_ortholog } diff --git a/nf/subworkflows/ncbi/rnaseq_short/bam_bin_and_sort/main.nf b/nf/subworkflows/ncbi/rnaseq_short/bam_bin_and_sort/main.nf index ba3e9c5..aed99ee 100644 --- a/nf/subworkflows/ncbi/rnaseq_short/bam_bin_and_sort/main.nf +++ b/nf/subworkflows/ncbi/rnaseq_short/bam_bin_and_sort/main.nf @@ -13,7 +13,7 @@ workflow bam_bin_and_sort { parameters // Map : extra parameter and parameter update main: def assembly_sizes = calc_assembly_sizes(ch_bam.collect()) - def bam_bin_params = "-avg-size-per-bin 200000000 -file-pattern 'bin#.bam' -exclude-organelle" + 
def bam_bin_params = " -file-pattern 'bin#.bam' " out = bam_bin(ch_bam, ch_index, genome, organelle, assembly_sizes, merge_params(bam_bin_params, parameters, 'bam_bin')) bins = out.collect() merge_args = merge_prepare(bins) diff --git a/nf/subworkflows/ncbi/rnaseq_short/convert_from_bam/main.nf b/nf/subworkflows/ncbi/rnaseq_short/convert_from_bam/main.nf index 1dbbcb0..385ce93 100644 --- a/nf/subworkflows/ncbi/rnaseq_short/convert_from_bam/main.nf +++ b/nf/subworkflows/ncbi/rnaseq_short/convert_from_bam/main.nf @@ -38,7 +38,7 @@ process convert { if [ `stat -L -c%s $in_bam` -lt $min_file_size ] && [ `\$samtools view -c $in_bam` -eq 0 ]; then exit 0 fi - tmpdir=`mktemp -d` + tmpdir=`mktemp -d --tmpdir=.` lds2_indexer -source genome/ -db LDS2 # EXCEPTION_STACK_TRACE_LEVEL=Warning DEBUG_STACK_TRACE_LEVEL=Warning DIAG_POST_LEVEL=Trace sam2asn $conv_param -refs-local-by-default -nogenbank -lds2 LDS2 -tmp-dir \$tmpdir -align-counts "${prefix}.align_counts.txt" -o "${prefix}.align.asnb.gz" -strandedness $strandedness -input $in_bam -samtools-path \$samtools diff --git a/nf/subworkflows/ncbi/rnaseq_short/main.nf b/nf/subworkflows/ncbi/rnaseq_short/main.nf index a69c790..d28ea0c 100644 --- a/nf/subworkflows/ncbi/rnaseq_short/main.nf +++ b/nf/subworkflows/ncbi/rnaseq_short/main.nf @@ -4,14 +4,17 @@ nextflow.enable.dsl=2 -include { sra_query } from './sra_qry/main' -include { fetch_sra_fasta } from './fetch_sra_fasta/main' -include { star_index } from './star_index/main' -include { star_wnode as star } from './star_wnode/main' -include { bam_strandedness } from './bam_strandedness/main' -include { bam_bin_and_sort } from './bam_bin_and_sort/main' -include { bam2asn } from './convert_from_bam/main' -include { rnaseq_collapse } from './rnaseq_collapse/main' +params.import_prefix = "../../../../nf/subworkflows/ncbi/" // redirected during testing + +include { sra_query } from "./${params.import_prefix}rnaseq_short/sra_qry/main" +include { fetch_sra_fasta } from 
"./${params.import_prefix}rnaseq_short/fetch_sra_fasta/main" +include { star_index } from "./${params.import_prefix}rnaseq_short/star_index/main" +include { star_wnode as star } from "./${params.import_prefix}rnaseq_short/star_wnode/main" +include { bam_strandedness } from "./${params.import_prefix}rnaseq_short/bam_strandedness/main" +include { bam_bin_and_sort } from "./${params.import_prefix}rnaseq_short/bam_bin_and_sort/main" +include { bam2asn } from "./${params.import_prefix}rnaseq_short/convert_from_bam/main" +include { rnaseq_collapse } from "./${params.import_prefix}rnaseq_short/rnaseq_collapse/main" + params.intermediate = false @@ -52,9 +55,12 @@ workflow rnaseq_short_plane { (sra_metadata, sra_run_list) = sra_query(query, task_params.get('sra_qry', [:])) def reads_fasta_pairs = fetch_sra_fasta(sra_run_list, task_params.get('fetch_sra_fasta', [:])) (ch_align, ch_align_index) = star(scaffolds, reads_fasta_pairs, genome_asn, index, max_intron, task_params.get('star_wnode', [:])) - } else { + } else if (ch_reads) { sra_metadata = reads_metadata (ch_align, ch_align_index) = star(scaffolds, ch_reads, genome_asn, index, max_intron, task_params.get('star_wnode', [:])) + } else { + sra_metadata = reads_metadata + (ch_align, ch_align_index) = star(scaffolds, reads, genome_asn, index, max_intron, task_params.get('star_wnode', [:])) } // diff --git a/nf/subworkflows/ncbi/rnaseq_short/star_index/main.nf b/nf/subworkflows/ncbi/rnaseq_short/star_index/main.nf index 10d5d68..8154501 100644 --- a/nf/subworkflows/ncbi/rnaseq_short/star_index/main.nf +++ b/nf/subworkflows/ncbi/rnaseq_short/star_index/main.nf @@ -5,18 +5,33 @@ include { merge_params } from '../../utilities' process build_index { + debug true label 'big_job' input: path genome_file val parameters output: - path out_dir + path "${out_dir}", emit: "index_dir" script: out_dir = genome_file.toString().replaceFirst(/\.(fa(sta)?|fna)$/, ".index") """ + mkdir -p $out_dir echo "in $genome_file, out $out_dir" - STAR 
$parameters --runMode genomeGenerate --genomeDir $out_dir --genomeFastaFiles $genome_file + # Check that --genomeSAindexNbases is not set + if [[ ! "$parameters" =~ --genomeSAindexNbases ]]; then + # Formula from https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf, section 2.2.5 + sa_index_bases=`python3 -c "import os;import math;print(math.floor(min(14, math.log2(os.path.getsize('$genome_file'))/2 - 1)))"` + effective_parameters="$parameters --genomeSAindexNbases \$sa_index_bases" + else + effective_parameters="$parameters" + fi + echo "ls" + ls + + echo "STAR index effective parameters: \$effective_parameters" + STAR \$effective_parameters --runMode genomeGenerate --genomeDir $out_dir --genomeFastaFiles $genome_file chmod a+rx $out_dir + # (removed leftover duplicate STAR invocation; index already built above with \$effective_parameters) """ stub: @@ -41,8 +56,8 @@ workflow star_index { genome //path: genome parameters // Map : extra parameter and parameter update main: - default_params = "--runThreadN 6 --genomeSAindexNbases 14" + default_params = "--runThreadN 6" build_index(genome, merge_params(default_params, parameters, 'STAR')) emit: - build_index.out + index_dir = build_index.out.index_dir } diff --git a/nf/subworkflows/ncbi/rnaseq_short/star_wnode/main.nf b/nf/subworkflows/ncbi/rnaseq_short/star_wnode/main.nf index 47a5cda..48ff4f8 100644 --- a/nf/subworkflows/ncbi/rnaseq_short/star_wnode/main.nf +++ b/nf/subworkflows/ncbi/rnaseq_short/star_wnode/main.nf @@ -3,17 +3,21 @@ nextflow.enable.dsl=2 include { merge_params } from '../../utilities' +params.verbose = false -def get_effective_params(parameters, use_zcat, max_intron) { + + +def get_effective_params(parameters, use_zcat, max_intron, cpus) { def effective_star_params = "" // Check that parameters for star_wnode doesn't contain 'star-params' boolean back_compat = parameters.get("star_wnode", "").contains("-star-params") if (!back_compat) { - def star_params = "--alignSJoverhangMin 8 
--outFilterMultimapNmax 200 --outFilterMismatchNmax 50 --runThreadN 16 --genomeLoad NoSharedMemory --outSAMtype SAM --outSAMattributes 'NH HI AS nM NM MD jM jI XS MC' --outSAMprimaryFlag AllBestScore --outFilterMultimapScoreRange 50 --seedSearchStartLmax 15 --limitOutSAMoneReadBytes 1000000 --outSJtype None" + def star_params = "--alignSJoverhangMin 8 --outFilterMultimapNmax 200 --outFilterMismatchNmax 50 --runThreadN ${cpus} --genomeLoad NoSharedMemory --outSAMtype SAM --outSAMattributes 'NH HI AS nM NM MD jM jI XS MC' --outSAMprimaryFlag AllBestScore --outFilterMultimapScoreRange 50 --seedSearchStartLmax 15 --limitOutSAMoneReadBytes 1000000 --outSJtype None" effective_star_params = ' -star-params "' + (use_zcat ? "--readFilesCommand zcat " : "") + merge_params(star_params, parameters, 'star-params') + '"' } - def default_params = "-cpus-per-worker 4 -csi-threshold 512000000 -max-intron ${max_intron} -preserve-star-logs" + def default_params = "-cpus-per-worker ${cpus} -csi-threshold 0 -max-intron ${max_intron} -preserve-star-logs" def effective_params = merge_params(default_params, parameters, 'star_wnode') + effective_star_params + // Force CSI threshold to 0 to always use CSI + effective_params = effective_params.replaceAll(/-csi-threshold [0-9]+/, "-csi-threshold 0") if (!back_compat) { // Ad-hoc post processing - remove single quotes from effective_params effective_params = effective_params.replaceAll("'", "") @@ -35,7 +39,7 @@ process run_star { val parameters output: path "*-Aligned.out.Sorted.bam", emit: 'align' - path "*-Aligned.out.Sorted.bam.bai", emit: 'align_index' + path "*-Aligned.out.Sorted.bam.csi", emit: 'align_index' // path "per_run_counts.txt", emit: 'per_run_counts' script: def assembly=genome_file.baseName.toString().replaceFirst(/\.(fa(sta)?|asn[bt]?)$/, "") @@ -47,8 +51,13 @@ process run_star { query_str = fasta_rna_file[0] seqkit_cmd = "seqkit stats ${fasta_rna_file[0]} " } - def effective_params = get_effective_params(parameters, 
use_zcat, max_intron) - // println("Effective STAR parameters: $effective_params") + // For some executors (e.g. SGE) the task.cpus is not set correctly, they allocate correct number of threads through clusterOptions. + // We use ext.cpus to pass the number of cpus here and use clusterOptions to allocate large enough instance + def cpus = task.cpus == 1 && task.ext.cpus ? task.ext.cpus : task.cpus + def effective_params = get_effective_params(parameters, use_zcat, max_intron, cpus) + if (params.verbose) { + println("Effective STAR parameters: $effective_params") + } """ echo "Assembly: ${assembly} sampleID: ${sampleID} Query: ${query_str}" echo "${seqid_list.join('\n')}" > seqid_list.mft @@ -66,11 +75,12 @@ process run_star { stub: def assembly=genome_file.baseName.toString().replaceFirst(/\.(fa(sta)?|asn[bt]?)$/, "") println("Assembly: ${assembly} sampleID: ${sampleID}, max_intron: ${max_intron}") - def effective_params = get_effective_params(parameters, use_zcat, max_intron) + def cpus = task.cpus == 1 && task.ext.cpus ? 
task.ext.cpus : task.cpus + def effective_params = get_effective_params(parameters, use_zcat, max_intron, cpus) println("Effective STAR parameters: $effective_params") """ touch ${assembly}-${sampleID}-Aligned.out.Sorted.bam - touch ${assembly}-${sampleID}-Aligned.out.Sorted.bam.bai + touch ${assembly}-${sampleID}-Aligned.out.Sorted.bam.csi """ } diff --git a/nf/subworkflows/ncbi/setup/main.nf b/nf/subworkflows/ncbi/setup/main.nf index 15b6fa2..8200080 100644 --- a/nf/subworkflows/ncbi/setup/main.nf +++ b/nf/subworkflows/ncbi/setup/main.nf @@ -9,7 +9,7 @@ workflow setup_genome { organelles parameters // Map : extra parameter and parameter update main: - get_genome_info(genome, organelles, parameters) + get_genome_info(genome, organelles, true, parameters) emit: seqid_list = get_genome_info.out.seqid_list gencoll_asn = get_genome_info.out.gencoll_asn @@ -18,6 +18,22 @@ workflow setup_genome { genome_asnb = get_genome_info.out.genome_asnb max_intron = get_genome_info.out.max_intron } +workflow setup_ext_genome { + take: + genome + organelles + parameters // Map : extra parameter and parameter update + main: + get_genome_info(genome, organelles, false, parameters) + emit: + seqid_list = get_genome_info.out.seqid_list + gencoll_asn = get_genome_info.out.gencoll_asn + unpacked_genome = get_genome_info.out.fasta + genome_asn = get_genome_info.out.genome_asn + genome_asnb = get_genome_info.out.genome_asnb + max_intron = get_genome_info.out.max_intron +} + process get_genome_info { @@ -25,6 +41,7 @@ process get_genome_info { input: path fasta_genome_file, stageAs: 'src/*' path organelles + val localize_ids val parameters output: path '*.seqids', emit: 'seqid_list' @@ -46,7 +63,11 @@ process get_genome_info { genome_asn = genome_dir + "/" + base_name_stripped + ".asn" genome_asnb = genome_dir + "/" + base_name_stripped + ".asnb" max_intron = parameters.max_intron - genome_size_threshold = parameters.genome_size_threshold + genome_size_threshold = 
parameters.genome_size_threshold + lcl_id_cmd = "cat" + if(localize_ids) { + lcl_id_cmd = "sed 's/>\\([^ |]\\+\\)\\( .*\\)\\?\$/>lcl\\|\\1\\2/'" + } """ # echo "need_zcat: ${need_zcat}, out_fasta: ${out_fasta}" mkdir -p ${genome_dir} @@ -54,11 +75,11 @@ process get_genome_info { if [[ ${need_zcat} == true ]]; then # zcat ${fasta_genome_file} | sed 's/^\\(>gi|[0-9]\\+\\)|\\?\\([^ ]\\+\\)\\(.*\\)/\\1\\3/' > ${out_fasta} # zcat ${fasta_genome_file} > ${out_fasta} - zcat ${fasta_genome_file} | sed 's/>\\([^ |]\\+\\)\\( .*\\)\\?\$/>lcl\\|\\1\\2/' > ${out_fasta} + zcat ${fasta_genome_file} | ${lcl_id_cmd} > ${out_fasta} else # sed 's/^\\(>gi|[0-9]\\+\\)|\\?\\([^ ]\\+\\)\\(.*\\)/\\1\\3/' ${fasta_genome_file} > ${out_fasta} # mv ${fasta_genome_file} ${out_fasta} - sed 's/>\\([^ |]\\+\\)\\( .*\\)\\?\$/>lcl\\|\\1\\2/' ${fasta_genome_file} > ${out_fasta} + ${lcl_id_cmd} ${fasta_genome_file} > ${out_fasta} fi # Old way, now use gc_get_molecules. For multipart ids with gi first use the second part # grep -oP "^>\\K[^ ]+" ${out_fasta} | sed 's/^\\(gi|[0-9]\\+\\)|\\([^|]\\+|[^|]\\+\\)|\\?/\\2/' >list.seqids @@ -114,17 +135,29 @@ workflow setup_proteins { proteins parameters // Map : extra parameter and parameter update main: - convert_proteins(proteins) + convert_proteins(proteins, true) emit: unpacked_proteins = convert_proteins.out.unpacked_proteins proteins_asn = convert_proteins.out.proteins_asn proteins_asnb = convert_proteins.out.proteins_asnb } +workflow setup_ext_proteins { + take: + proteins + parameters // Map : extra parameter and parameter update + main: + convert_proteins(proteins, false) + emit: + unpacked_proteins = convert_proteins.out.unpacked_proteins + proteins_asn = convert_proteins.out.proteins_asn + proteins_asnb = convert_proteins.out.proteins_asnb +} process convert_proteins { input: path fasta_proteins_file, stageAs: 'src/*' + val localize_ids output: path out_fasta, emit: 'unpacked_proteins' path proteins_asn, emit: 'proteins_asn' @@ -139,13 +172,17 @@ 
process convert_proteins { out_fasta = fasta_dir + "/" + fasta_name proteins_asn = asn_dir + "/" + base_name_stripped + ".asn" proteins_asnb = asn_dir + "/" + base_name_stripped + ".asnb" + lcl_id_cmd = "cat" + if(localize_ids) { + lcl_id_cmd = "sed 's/>\\([^ |]\\+\\)\\( .*\\)\\?\$/>lcl\\|\\1\\2/'" + } """ mkdir -p ${asn_dir} mkdir -p ${fasta_dir} if [[ ${need_zcat} == true ]]; then - zcat ${fasta_proteins_file} | sed 's/>\\([^ |]\\+\\)\\( .*\\)\\?\$/>lcl\\|\\1\\2/' > ${out_fasta} + zcat ${fasta_proteins_file} | ${lcl_id_cmd} > ${out_fasta} else - sed 's/>\\([^ |]\\+\\)\\( .*\\)\\?\$/>lcl\\|\\1\\2/' ${fasta_proteins_file} > ${out_fasta} + ${lcl_id_cmd} ${fasta_proteins_file} > ${out_fasta} fi multireader -flags ParseRawID -out-format asn_text -input ${out_fasta} -output ${proteins_asn} multireader -flags ParseRawID -out-format asn_binary -input ${out_fasta} -output ${proteins_asnb} diff --git a/nf/subworkflows/ncbi/shared/diamond/main.nf b/nf/subworkflows/ncbi/shared/diamond/main.nf index 1ef6801..3648ca9 100644 --- a/nf/subworkflows/ncbi/shared/diamond/main.nf +++ b/nf/subworkflows/ncbi/shared/diamond/main.nf @@ -27,22 +27,18 @@ nextflow.enable.dsl=2 include {to_map; shellSplit } from '../../utilities' -swiss_prot_url='https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/reference_sets/swissprot.asnb.gz' process fetch_swiss_prot_asn { input: + path swiss_prot_url output: - path "output/swissprot.asnb", emit: "swiss_prot_asn" + path "swissprot.asnb", emit: "swiss_prot_asn" script: """ - curl -O '$swiss_prot_url' - gunzip swissprot.asnb.gz - mkdir -p output - mv swissprot.asnb output/swissprot.asnb + zcat ${swiss_prot_url} > swissprot.asnb """ stub: """ - mkdir -p output - touch output/swissprot.asnb + touch swissprot.asnb """ } @@ -55,7 +51,7 @@ process get_swiss_prot_ids { """ mkdir -p output lds2_indexer -db lds -source . 
- sqlite3 ./lds "SELECT txt_id FROM seq_id WHERE orig=1 AND int_id IS NULL;" > output/swiss_prot_ids + sqlite3 ./lds "SELECT txt_id FROM seq_id WHERE orig=1 AND int_id IS NULL;" > output/swiss_prot_ids """ stub: """ diff --git a/nf/subworkflows/ncbi/target_proteins/best_aligned_prot/main.nf b/nf/subworkflows/ncbi/target_proteins/best_aligned_prot/main.nf index 5e7a7ef..eb04fd5 100644 --- a/nf/subworkflows/ncbi/target_proteins/best_aligned_prot/main.nf +++ b/nf/subworkflows/ncbi/target_proteins/best_aligned_prot/main.nf @@ -14,7 +14,7 @@ workflow best_aligned_prot { parameters // Map : extra parameter and parameter update main: default_params = "" - effective_params = merge_params(default_params, parameters, 'best_aligned_prot') + effective_params = merge_params(default_params, parameters, 'best_placement') run_best_aligned_prot(genome_asn_file, proteins_asn_file, alignment_asn_file, gencoll_file, effective_params) emit: diff --git a/nf/subworkflows/ncbi/target_proteins/main.nf b/nf/subworkflows/ncbi/target_proteins/main.nf index 6e3d705..3c4a821 100644 --- a/nf/subworkflows/ncbi/target_proteins/main.nf +++ b/nf/subworkflows/ncbi/target_proteins/main.nf @@ -4,11 +4,13 @@ nextflow.enable.dsl=2 -include { miniprot } from './miniprot/main' -include { align_filter_sa } from './align_filter_sa/main' -include { best_aligned_prot } from './best_aligned_prot/main' -include { paf2asn } from './paf2asn/main' -include { run_align_sort} from '../default/align_sort_sa/main' +params.import_prefix = "../../../../nf/subworkflows/ncbi/" // redirected during testing + +include { miniprot } from "./${params.import_prefix}target_proteins/miniprot/main" +include { align_filter_sa } from "./${params.import_prefix}target_proteins/align_filter_sa/main" +include { best_aligned_prot } from "./${params.import_prefix}target_proteins/best_aligned_prot/main" +include { paf2asn } from "./${params.import_prefix}target_proteins/paf2asn/main" +include { align_sort_sa} from 
"./${params.import_prefix}target_proteins/../default/align_sort_sa/main" params.intermediate = false @@ -30,8 +32,7 @@ workflow target_proteins_plane { def converted_asn = paf2asn.out.asn_file best_aligned_prot(genome_asn, proteins_asn, converted_asn.collect(), gencoll_asn, task_params.get('best_aligned_prot', [:])) align_filter_sa(genome_asn, proteins_asn, best_aligned_prot.out.asn_file, task_params.get('align_filter_sa', [:])) - run_align_sort(genome_asn, proteins_asn,align_filter_sa.out.filtered_file, - "-k subject,subject_start,-subject_end,subject_strand,query,query_start,-query_end,query_strand,-num_ident,gap_count" ) + align_sort_sa(genome_asn, proteins_asn,align_filter_sa.out.filtered_file, task_params.get('align_sort_sa', [:])) emit: - protein_alignments = run_align_sort.out + protein_alignments = align_sort_sa.out } diff --git a/nf/ui.nf b/nf/ui.nf index bd8506c..6b66dc1 100644 --- a/nf/ui.nf +++ b/nf/ui.nf @@ -5,7 +5,6 @@ nextflow.enable.dsl=2 include { egapx } from './subworkflows/ncbi/main' -include { only_gnomon } from './subworkflows/ncbi/only_gnomon' params.verbose = false @@ -13,8 +12,17 @@ params.verbose = false process export { publishDir "${params.output}", mode: 'copy', saveAs: { fn -> fn.substring(fn.lastIndexOf('/')+1) } input: - path out_files + path genomic_gff + path genomic_gtf + path genomic_fasta + path transcript_fasta + path cds_fasta + path proteins_fasta path annot_builder_output, stageAs: 'annot_builder_output/*' + path validated, stageAs: 'validated/*' + path stats, stageAs: 'stats/*' + path annotated_genome_asn + path annotation_data_comment // path locus output: path "*", includeInputs: true @@ -34,55 +42,60 @@ workflow { def input_params = params.get('input', [:]) def genome = input_params.get('genome', []) def proteins = input_params.get('proteins', []) + def proteins_trusted = input_params.get('proteins_trusted', []) def reads_query = input_params.get('reads_query', []) def reads_ids = input_params.get('reads_ids', []) def 
reads = input_params.get('reads', []) def reads_metadata = input_params.get('reads_metadata', []) def organelles = input_params.get('organelles', []) ?: [] def tax_id = input_params.get('taxid', []) + def symbol_format_class = input_params.get('symbol_format_class', []) def hmm_params = input_params.get('hmm', []) ?: [] - def hmm_taxid = input_params.get('hmm_taxid', []) ?: [] + def train_hmm = input_params.get('train_hmm', []) def softmask = input_params.get('softmask', []) ?: [] def max_intron = input_params.get('max_intron', []) def genome_size_threshold = input_params.get('genome_size_threshold', []) + def ortho_files = input_params.get('ortho', []) ?: [] def rnaseq_alignments = input_params.get('rnaseq_alignments', []) ?: [] def protein_alignments = input_params.get('protein_alignments', []) ?: [] + def reference_sets = input_params.get('reference_sets', []) ?: [] def task_params = params.get('tasks', [:]) - def func_name = params.get('func_name', '') if (params.verbose) { println("input params:\ngenome ${genome}") println("proteins ${proteins}") + println("proteins_trusted ${proteins_trusted}") println("reads_query ${reads_query}") println("reads_ids ${reads_ids}") println("reads ${reads}") println("reads_metadata ${reads_metadata}") println("organelles ${organelles}") println("tax_id ${tax_id}") + println("symbol_format_class ${symbol_format_class}") println("hmm_params ${hmm_params}") - println("hmm_taxid ${hmm_taxid}") + println("train_hmm ${train_hmm}") println("softmask ${softmask}") println("max_intron ${max_intron}") println("genome_size_threshold ${genome_size_threshold}") + println("ortho_files ${ortho_files}") println("rnaseq_alignments ${rnaseq_alignments}") println("protein_alignments ${protein_alignments}") - println("func_name ${func_name}") + println("reference_sets ${reference_sets}") // Keep it last as it is large println("task_params ${task_params}") } - if(func_name == 'only_gnomon') { - if (params.verbose) { - print('in gnomon block') - } 
- only_gnomon(genome, proteins, rnaseq_alignments, protein_alignments, organelles, tax_id, hmm_params, hmm_taxid, softmask, task_params) - export(only_gnomon.out.out_files, only_gnomon.out.evidence) - } - else { - if (params.verbose) { - print('in egapx block') - } - egapx(genome, proteins, reads_query, reads_ids, reads, reads_metadata, organelles, tax_id, hmm_params, hmm_taxid, softmask, max_intron, genome_size_threshold, task_params) - // export(egapx.out.out_files, egapx.out.annot_builder_output, egapx.out.locus) - export(egapx.out.out_files, egapx.out.annot_builder_output) - } + egapx(genome, proteins, proteins_trusted, reads_query, reads_ids, reads, reads_metadata, organelles, tax_id, symbol_format_class, hmm_params, train_hmm, softmask, max_intron, genome_size_threshold, ortho_files, reference_sets, task_params) + // export(egapx.out.out_files, egapx.out.annot_builder_output, egapx.out.locus) + //export(egapx.out.out_files, egapx.out.annot_builder_output) + export(egapx.out.out_ggff, + egapx.out.out_ggtf, + egapx.out.out_gfa, + egapx.out.out_rna_fa, + egapx.out.out_cds_fa, + egapx.out.out_prot_fa, + egapx.out.annot_builder_output, + egapx.out.validated, + egapx.out.stats, + egapx.out.annotated_genome_asn, + egapx.out.annotation_data_comment) } diff --git a/ui/requirements.txt b/requirements.txt similarity index 100% rename from ui/requirements.txt rename to requirements.txt diff --git a/ui/assets/config/docker_image.config b/ui/assets/config/docker_image.config index aa41657..8837ebe 100644 --- a/ui/assets/config/docker_image.config +++ b/ui/assets/config/docker_image.config @@ -1 +1 @@ -process.container = 'ncbi/egapx:0.2-alpha' +process.container = 'ncbi/egapx:0.3.0-alpha' \ No newline at end of file diff --git a/ui/assets/config/executor/ncbi-sge.config b/ui/assets/config/executor/ncbi-sge.config index 41daf3b..45f739b 100644 --- a/ui/assets/config/executor/ncbi-sge.config +++ b/ui/assets/config/executor/ncbi-sge.config @@ -5,25 +5,35 @@ // such 
configurations for third party, use docker or singularity for this. Uncomment corresponding line below //docker.enabled = true //singularity.enabled = true -process { - executor = 'sge' - clusterOptions = "-m n -V -P progressive -w n -A annotations-euk -l express=TRUE,h_vmem=INFINITY,m_core=12,osverfull='8*'" -} + env.GP_HOME="/netmnt/vast01/egapx/bin/" env.PATH = "/netmnt/vast01/egapx/bin:/netmnt/vast01/egapx/bin/gp:/netmnt/vast01/egapx/bin/third-party/STAR/bin/Linux_x86_64:\$PATH" + process { + executor = 'sge' memory = 60.GB - time = 3.h + time = 6.h + errorStrategy = 'retry' + maxRetries = 3 + // We can't request SGE 'slots' ('cpus' directive) but we need to pass the number of available threads + // to underlying process to run it correctly. We use 'ext.cpus' for this and use clusterOptions to allocate + // large enough instance via m_thread + ext.cpus = 7 + clusterOptions = "-m n -V -P progressive -w n -A annotations-euk -l express=TRUE,h_vmem=INFINITY,m_thread=8,osverfull='8*'" withLabel: 'big_job' { memory = 120.GB + ext.cpus = 15 + clusterOptions = "-m n -V -P progressive -w n -A annotations-euk -l express=TRUE,h_vmem=INFINITY,m_thread=16,osverfull='8*'" } withLabel: 'huge_job' { memory = 200.GB + ext.cpus = 31 + clusterOptions = "-m n -V -P progressive -w n -A annotations-euk -l express=TRUE,h_vmem=INFINITY,m_thread=32,osverfull='8*'" } withLabel: 'long_job' { - time = 6.h + time = 16.h } } diff --git a/ui/assets/default_task_params.yaml b/ui/assets/default_task_params.yaml index 9569214..057dcda 100644 --- a/ui/assets/default_task_params.yaml +++ b/ui/assets/default_task_params.yaml @@ -2,12 +2,14 @@ tasks: setup: convert_genome: force-local-ids convert_proteins: force-local-ids + align_sort_sa: + align_sort: -k subject,subject_start,-subject_end,subject_strand,query,query_start,-query_end,query_strand,-num_ident,gap_count bam_bin_and_sort: bam_bin: -avg-size-per-bin 200000000 -file-pattern 'bin#.bam' -exclude-organelle bam_strandedness: 
rnaseq_divide_by_strandedness: -min-aligned 1000000 -min-unambiguous 200 -min-unambiguous-pct 2 -percentage-threshold 98 - chainer: + chainer_wnode: chainer_wnode: -altfrac 80.0 -capgap 5 -cdsbonus 0.05 -composite 10000 -end-pair-support-cutoff 0.1 -endprotfrac 0.05 -filters 'remove_single_exon_est_models remove_single_exon_noncoding_models' -high-identity 0.98 -hmaxlen 0.25 -hthresh 0.02 -i3p 14.0 -i5p 7.0 -lenpen 0.005 @@ -25,7 +27,7 @@ tasks: -no-scores getfasta: getfasta: -u full-assembly -bare-accession -use-reference-non-nuclear - gnomon: + gnomon_wnode: annot_wnode: -margin 1000 -mincont 1000 -minlen 225 -mpp 10.0 -ncsp 25 -window 200000 -nonconsens -open generic_action_node: -app annot_wnode @@ -33,9 +35,10 @@ tasks: gpx_qsubmit: '' prot_gnomon_prepare: prot_gnomon_prepare: '' + sra_qry: + sra_query: '' rnaseq_collapse: - gpx_make_outputs: -default-output-name align -slices-for affinity -sort-by job-id - -unzip align + gpx_make_outputs: -default-output-name align -slices-for affinity -sort-by job-id -unzip align gpx_qsubmit: -affinity subject rnaseq_collapse: -backlog 1 -max-jobs 1 -support-non-sra # -rank-counts-precalculated @@ -43,29 +46,49 @@ tasks: star_index: STAR: --runThreadN 8 star_wnode: - gpx_qdump: -unzip '*' - gpx_qsubmit: -affinity subject - star_wnode: -cpus-per-worker 16 -csi-threshold 512000000 -preserve-star-logs + star_wnode: -preserve-star-logs star-params: --alignSJoverhangMin 8 --outFilterMultimapNmax 200 --outFilterMismatchNmax - 50 --runThreadN 16 --genomeLoad NoSharedMemory --outSAMtype SAM --outSAMattributes + 50 --genomeLoad NoSharedMemory --outSAMtype SAM --outSAMattributes 'NH HI AS nM NM MD jM jI XS MC' --outSAMprimaryFlag AllBestScore --outFilterMultimapScoreRange 50 --seedSearchStartLmax 15 --limitOutSAMoneReadBytes 1000000 --outSJtype None miniprot: - split_proteins: -n 100000 + split_proteins: -n 25000 miniprot: -t 31 -p 0.4 --outs=0.4 paf2asn: paf2asn: -prosplign-refinement best_aligned_prot: - best_aligned_prot: 
-asm_alns_filter 'reciprocity = 3' + best_placement: -asm_alns_filter 'reciprocity = 3' chainer_sort_alignments: align_sort: -ifmt seq-align -k subject,subject_start,-subject_end,subject_strand,query,query_start,-query_end,query_strand,-num_ident,gap_count align_filter_sa: align_filter: -filter 'rank=1 OR (pct_identity_gapopen_only > 58 AND (pct_coverage > 50 OR align_length_ungap > 1000))' -ifmt seq-align gnomon_training: gnomon_training: -asn -b - diamond: + diamond_identify: diamond: -query-fmt seq-ids -subject-fmt seq-ids -output-prefix hits -ofmt seq-align-set - diamond_blastp: --sam-query-len --very-sensitive --unal 0 --comp-based-stats 0 --masking 0 + diamond_blastp: --sam-query-len --comp-based-stats 0 --evalue 0.0001 --very-sensitive --masking 0 --unal 0 diamond_orthology: - diamond_orhtology: -ofmt seq-align-set -query-fmt seq-ids -subject-fmt seq-ids -output-prefix hits - diamond_orthology_blastp: --sam-query-len --very-sensitive --unal 0 --comp-based-stats 0 --masking 0 + diamond: -ofmt seq-align-set -query-fmt seq-ids -subject-fmt seq-ids -output-prefix hits + diamond_blastp: --sam-query-len --very-sensitive --unal 0 --max-target-seqs 100 --comp-based-stats 0 --masking 0 + gnomon_biotype: + gnomon_biotype: '' + find_orthologs: + find_orthologs: '' + locus_link: + locus_type: '' + locus_track: + locus_track: '' + final_asn_markup: + final_asn: '-annot-provider "GenBank submitter" -annot-pipeline "NCBI EGAPx"' + extract_products_from_models: + extract_products: '' + best_protein_hits: + align_filter: " -filter 'pct_coverage >= 50' " + align_sort: ' -k query,-bit_score,slen,-align_length -group 1 -top 1 ' + annot_builder: + annot_builder: '' + annotwriter: + annotwriter: '' + convert_annotations: + annotwriter_gff: '' + annotwriter_gtf: '' diff --git a/ui/egapx.py b/ui/egapx.py index 490f0a3..816b8d4 100755 --- a/ui/egapx.py +++ b/ui/egapx.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # requires pip install pyyaml # +from errno import ENOENT import shlex 
import shutil import sys @@ -24,10 +25,18 @@ import yaml +software_version = "0.3.0-alpha" + VERBOSITY_DEFAULT=0 VERBOSITY_QUIET=-1 VERBOSITY_VERBOSE=1 +# Clades with exceptional treatment +PLANTS=33090 +VERTEBRATES=7742 +MAMMALS=40674 +MAGNOLIOPSIDA=3398 + FTP_EGAP_PROTOCOL = "https" FTP_EGAP_SERVER = "ftp.ncbi.nlm.nih.gov" FTP_EGAP_ROOT_PATH = "genomes/TOOLS/EGAP/support_data" @@ -49,11 +58,13 @@ def parse_args(argv): parser.add_argument("-n", "--dry-run", action="store_true", default=False) parser.add_argument("-st", "--stub-run", action="store_true", default=False) parser.add_argument("-so", "--summary-only", help="Print result statistics only if available, do not compute result", action="store_true", default=False) + parser.add_argument("-ot", "--ortho-taxid", default=0, help="Taxid of reference data for orthology tasks") group = parser.add_argument_group('download') group.add_argument("-dl", "--download-only", help="Download external files to local storage, so that future runs can be isolated", action="store_true", default=False) parser.add_argument("-lc", "--local-cache", help="Where to store the downloaded files", default="") parser.add_argument("-q", "--quiet", dest='verbosity', action='store_const', const=VERBOSITY_QUIET, default=VERBOSITY_DEFAULT) parser.add_argument("-v", "--verbose", dest='verbosity', action='store_const', const=VERBOSITY_VERBOSE, default=VERBOSITY_DEFAULT) + parser.add_argument("-V", "--version", dest='report_version', help="Report software version", action='store_true', default=False) parser.add_argument("-fn", "--func_name", help="func_name", default="") return parser.parse_args(argv[1:]) @@ -161,6 +172,20 @@ def download_ftp_dir(self, ftp_path, local_path): ##time.sleep(0.1) ## else: . or .. 
do nothing + def list_ftp_dir(self, ftp_path): + fl = [] + try: + fl = self.ftp.mlsd(ftp_path) + except: + return list() + rl = list() + for ftp_item in fl: + ftp_type = ftp_item[1]['type'] + if ftp_type=='dir' or ftp_type=='cdir' or ftp_type=='pdir': + continue + name = ftp_item[0] + rl.append(name) + return rl def download_egapx_ftp_data(local_cache_dir): global user_cache_dir @@ -198,32 +223,37 @@ def repackage_inputs(run_inputs): return new_inputs -def convert_value(value): +def convert_value(value, key, strict): "Convert paths to absolute paths when possible, look into objects and lists" if isinstance(value, dict): - return {k: convert_value(v) for k, v in value.items()} + return {k: convert_value(v, key, strict) for k, v in value.items()} elif isinstance(value, list): - return [convert_value(v) for v in value] + return [convert_value(v, key, strict) for v in value] else: - if value == '' or re.match(r'[a-z0-9]{2,5}://', value) or not os.path.exists(value): + if not isinstance(value, str) or value == '' or re.match(r'[a-z0-9]{2,5}://', value): # do not convert - this is a URL or empty string or non-file return value + if not os.path.exists(value): + if strict: + raise OSError(ENOENT, f"File for parameter '{key}' doesn't exist", value) + else: + return value # convert to absolute path - return str(Path(value).absolute()) + return str(Path(value).resolve()) -path_inputs = { 'genome', 'hmm', 'softmask', 'reads_metadata', 'organelles', 'proteins', 'reads', 'rnaseq_alignments', 'protein_alignments' } +path_inputs = {'genome', 'hmm', 'softmask', 'reads_metadata', 'organelles', + 'proteins', 'proteins_trusted', 'reads', + 'rnaseq_alignments', 'protein_alignments', 'ortho', 'reference_sets' } def convert_paths(run_inputs): "Convert paths to absolute paths where appropriate" input_root = run_inputs['input'] for key in input_root: if key in path_inputs: - if isinstance(input_root[key], list): - input_root[key] = [convert_value(v) for v in input_root[key]] - else: - 
input_root[key] = convert_value(input_root[key]) + strict = key != 'reads' + input_root[key] = convert_value(input_root[key], key, strict) if 'output' in run_inputs: - run_inputs['output'] = convert_value(run_inputs['output']) + run_inputs['output'] = convert_value(run_inputs['output'], 'output', False) def prepare_reads(run_inputs): @@ -232,45 +262,75 @@ def prepare_reads(run_inputs): if 'reads' not in run_inputs['input'] or 'output' not in run_inputs: return prefixes = defaultdict(list) + has_files = False reads = run_inputs['input']['reads'] if type(reads) == str: del run_inputs['input']['reads'] - run_inputs['input']['reads_query'] = reads - return + if os.path.exists(reads): + has_files = True + with open(reads) as f: + # Parse lines into run name and read file + for line in f: + line = line.strip() + if not line or line[0] == '#': + continue + mo = re.match(r'([^ \t]+)[ \t]+(.*)', line) + if mo: + prefixes[mo.group(1)].append(mo.group(2)) + else: + run_inputs['input']['reads_query'] = reads + return + else: + for rf in run_inputs['input']['reads']: + if type(rf) == str: + name = Path(rf).parts[-1] + mo = re.match(r'([^._]+)', name) + if mo and mo.group(1) != name: + has_files = True + prefixes[mo.group(1)].append(rf) + elif type(rf) == list: + has_files = True + # find if rf is a tuple already in 'fromPairs' format, that is [sample_id, [read1, read2]] + if len(rf) == 2 and type(rf[0]) == str and type(rf[1]) == list: + for r in rf[1]: + if type(r) != str: + raise Exception("Invalid read input") + prefixes[rf[0]].append(r) + else: + # rf is a list of files, we follow star_wnode algorithm to find 'sample_id' + names = list(map(lambda x: re.match(r'([^.]+)', Path(x).parts[-1]).group(1), rf)) + # Find common prefix + names.sort() + if len(names) == 1: + sample_id = names[0] + else: + for i in range(min(len(names[0]), len(names[-1]))): + if names[0][i] != names[-1][i]: + break + sample_id = names[0][0:i] + # Dont end prefix with . 
or _ + i = len(sample_id) - 1 + while i > 0 and sample_id[-1] in "._": + sample_id = sample_id[:-1] + i -= 1 + prefixes[sample_id] = rf # Create fake metadata file for reads containing only one meaningful information piece - pairedness # Have an entry in this file only for SRA runs - files with prefixes SRR, ERR, and DRR and digits # Non-SRA reads are handled by rnaseq_collapse internally - has_files = False - for rf in run_inputs['input']['reads']: - if type(rf) == str: - name = Path(rf).parts[-1] - mo = re.match(r'([^._]+)', name) - if mo and mo.group(1) != name: - has_files = True - prefixes[mo.group(1)].append(rf) - elif type(rf) == list: - has_files = True - names = list(map(lambda x: re.match(r'([^.]+)', Path(x).parts[-1]).group(1), rf)) - # Find common prefix - names.sort() - if len(names) == 1: - sample_id = names[0] - else: - for i in range(min(len(names[0]), len(names[-1]))): - if names[0][i] != names[-1][i]: - break - sample_id = names[0][0:i] - # Dont end prefix with . or _ - i = len(sample_id) - 1 - while i > 0 and sample_id[-1] in "._": - sample_id = sample_id[:-1] - i -= 1 - prefixes[sample_id] = rf if has_files: # len(prefixes): # Always create metadata file even if it's empty output = run_inputs['output'] with tempfile.NamedTemporaryFile(mode='w', delete=False, dir=output, prefix='egapx_reads_metadata_', suffix='.tsv') as f: for k, v in prefixes.items(): + # Check file names in v for matching SRR pattern + mos = [ re.search(r'([SDE]RR[0-9]+)', x) for x in v ] + # Distinct SRRs + srrs = list(set([ mo.group(1) for mo in mos if mo ])) + if len(srrs) > 1: + print(f"Error: Run set {k} has multiple run ids in read files: {srrs}") + sys.exit(1) + if srrs: + k = srrs[0] if re.fullmatch(r'([DES]RR[0-9]+)', k): paired = 'paired' if len(v) == 2 else 'unpaired' # SRR9005248 NA paired 2 2 NA NA NA NA NA NA NA 0 @@ -284,6 +344,99 @@ def prepare_reads(run_inputs): run_inputs['input']['reads_query'] = "[Accession] OR ".join(reads) + "[Accession]" + +''' +from 
/src/internal/gpipe/app/locusXref/populate_tax_tables.cpp + +278 bool fish_lineage = lineage.find("Actinopterygii") != string::npos || +279 lineage.find("Coelacanthimorpha") != string::npos || +280 lineage.find("Chondrichthyes") != string::npos || +281 lineage.find("Dipnoi") != string::npos; +282 int pub_stat = 4, use_in_lxr_client = 1, pref_map_link_type = 448; +283 int tax_id_name_auth = 0; +284 if(gbdiv == "gbdiv_vrt" && fish_lineage) { +285 tax_id_name_auth = 7955; +286 } else if(gbdiv == "gbdiv_pri" || gbdiv == "gbdiv_mam" +287 || gbdiv == "gbdiv_vrt" || gbdiv == "gbdiv_rod") { +288 tax_id_name_auth = 9606; +289 } else if(gbdiv == "gbdiv_pln") { +290 tax_id_name_auth = 3702; +291 } else if(gbdiv == "gbdiv_inv" && +292 (lineage.find("Insecta") != string::npos || +293 lineage.find("Arthropoda") != string::npos)) +294 { +295 tax_id_name_auth = 7227; +296 } +297 string symbol_format_class = "allupper"; +298 string symbol_format = "[A-Z][A-Z0-9][-A-Z0-9]%"; +299 if(tax_id_name_auth == 7227) { +300 symbol_format_class = "NULL"; +301 symbol_format = "NULL"; +302 } else if(gbdiv == "gbdiv_rod" || gbdiv == "gbdiv_inv") { +303 symbol_format_class = "uplow"; +304 symbol_format = "[A-Z][a-z0-9][-a-z0-9]%"; +305 } +306 if(fish_lineage) { +307 //if looks like a fish +308 symbol_format_class = "alllow"; +309 symbol_format = "[a-z][a-z0-9][-a-z0-9]%"; +310 } +''' +def get_symbol_format_class_for_taxid(taxid): + #print('e:get_symbol_format_class_for_taxid') + #print(f'taxid: {taxid}') + lineage, ranks = get_lineage(taxid) + #print(f'lineage: {lineage}') + #print(f'ranks: {ranks}') + + format_class = '' + + ACTINOPTERYGII = 7898 + COELACANTHIMORPHA = 118072 + CHONDRICHTHYES = 7777 + DIPNOMORPHA = 7878 + FISH = [ACTINOPTERYGII,COELACANTHIMORPHA,CHONDRICHTHYES,DIPNOMORPHA] + PRIMATE = 9443 + MAMMAL = 40674 + RODENT = 9989 + VERTEBRATE = 7742 + ANIMALS = 33208 + ## define gbdiv_inv as has ANIMALS but not VERTEBRATE + INSECTA = 50557 + ARTHROPOD = 6656 + VIRIDIPLANTAE = 33090 + 
LEPIDOSAURS = 8504 + AMPHIBIANS = 8292 + + + #test_taxids = [7898, 9989, 7742, 6656, 33090] + #for tt in test_taxids: + # print(f' ct: {tt} {lineage.count(tt)}') + + is_fish = [i for i in lineage if i in FISH] + is_invert = (ANIMALS in lineage) and not (VERTEBRATE in lineage) + + #print(f'is_fish: {is_fish}') + #print(f'is_invert: {is_invert}') + #print(f'ANIM: {(ANIMALS in lineage)}') + #print(f'VERT: {(VERTEBRATE in lineage)}') + #print(f'INSE: {(INSECTA in lineage)}') + #print(f'ARTH: {(ARTHROPOD in lineage)}') + #print(f'LEPI: {(LEPIDOSAURS in lineage)}') + #print(f'AMPH: {(AMPHIBIANS in lineage)}') + + format_class = 'allupper' + if is_invert and (INSECTA in lineage or ARTHROPOD in lineage): + format_class = 'NULL' + elif (RODENT in lineage) or is_invert: + format_class = 'uplow' + elif (LEPIDOSAURS in lineage) or (AMPHIBIANS in lineage) or is_fish: + format_class = 'alllow' + + #print('x:get_symbol_format_class_for_taxid') + return format_class + + def expand_and_validate_params(run_inputs): """ Expand implicit parameters and validate inputs Args: @@ -299,13 +452,17 @@ def expand_and_validate_params(run_inputs): taxid = inputs['taxid'] + if 'symbol_format_class' not in inputs: + symbol_format_class = get_symbol_format_class_for_taxid(taxid) + inputs['symbol_format_class'] = symbol_format_class + if 'genome' not in inputs: print("ERROR: Missing parameter: 'genome'") return False # Check for proteins input and if empty or no input at all, add closest protein bag if 'proteins' not in inputs: - proteins = get_closest_protein_bag(taxid) + proteins,trusted = get_closest_protein_bag(taxid) if not proteins: # Proteins are not specified explicitly and not found by taxid print(f"ERROR: Proteins are not found for tax id {inputs['taxid']}") @@ -313,6 +470,8 @@ def expand_and_validate_params(run_inputs): print(" You can specify proteins manually using 'proteins' parameter") return False inputs['proteins'] = proteins + if trusted: + inputs['proteins_trusted'] = trusted 
has_rnaseq = 'reads' in inputs or 'reads_ids' in inputs or 'reads_query' in inputs if not has_rnaseq: @@ -323,12 +482,12 @@ def expand_and_validate_params(run_inputs): return False if 'hmm' not in inputs: - best_taxid, best_hmm = get_closest_hmm(taxid) + best_hmm, good_match = get_closest_hmm(taxid) inputs['hmm'] = best_hmm - inputs['hmm_taxid'] = best_taxid + inputs['train_hmm'] = not good_match else: # We assume the user knows what they're doing and the training is not necessary - inputs['hmm_taxid'] = taxid + inputs['train_hmm'] = False if 'max_intron' not in inputs: max_intron, genome_size_threshold = get_max_intron(taxid) @@ -337,6 +496,38 @@ def expand_and_validate_params(run_inputs): else: # Given max_intron is a hard limit, no further calculation is necessary inputs['genome_size_threshold'] = 0 + + if 'ortho' not in inputs or inputs['ortho'] is None or len(inputs['ortho']) < 4: + ortho_files = dict() + if 'ortho' in inputs and isinstance(inputs['ortho'], dict): + ortho_files = inputs['ortho'] + + chosen_taxid=0 + if 'taxid' in ortho_files: + chosen_taxid = ortho_files['taxid'] + if chosen_taxid == 0: + chosen_taxid = get_closest_ortho_ref_taxid(taxid) + ortho_files['taxid'] = chosen_taxid + + file_id = ['genomic.fna', 'genomic.gff', 'protein.faa'] + + possible_files = [] + try: + possible_files = get_files_under_path('ortholog_references', f'{chosen_taxid}/current') + except: + print(f'Could not find path for ortho taxid {chosen_taxid}') + return False + for pf in possible_files: + for fi in file_id: + if fi in ortho_files: + continue + if pf.find(fi) > -1: + ortho_files[fi] = pf + + ortho_files['name_from.rpt'] = get_file_path('ortholog_references',f'{chosen_taxid}/name_from_ortholog.rpt') + inputs['ortho'] = ortho_files + if 'reference_sets' not in inputs or inputs['reference_sets'] is None: + inputs['reference_sets'] = get_file_path('reference_sets', 'swissprot.asnb.gz') return True @@ -369,6 +560,13 @@ def get_cache_dir(): data_version_cache = {} def 
get_versioned_path(subsystem, filename): + """ + Retrieve the versioned path for a given subsystem and filename. + + Checks the cache for the subsystem version, and if not available, + downloads the manifest file to populate the cache. Returns the path + including the version if found; otherwise, returns the default path. + """ global data_version_cache global user_cache_dir if not data_version_cache: @@ -415,6 +613,31 @@ def get_file_path(subsystem, filename): return file_path return file_url +def get_files_under_path(subsystem, part_path): + cache_dir = get_cache_dir() + vfn = get_versioned_path(subsystem, part_path) + file_path = os.path.join(cache_dir, vfn) + file_url = f"{FTP_EGAP_ROOT}/{vfn}" + ## look under file_path + files_below = list() + try: + for i in Path(file_path).iterdir(): + files_below.append(str(i)) + if files_below: + return files_below + except: + None + ## if nothing, look under file_url + if not files_below: + ftpd = FtpDownloader() + ftpd.connect(FTP_EGAP_SERVER) + ftp_dir = f'{FTP_EGAP_ROOT_PATH}/{vfn}' + files_found = ftpd.list_ftp_dir(ftp_dir) + files_online = list() + for i in files_found: + files_online.append( f"{FTP_EGAP_ROOT}/{vfn}/{i}") ### .replace('//','/') ) + return files_online + return list() def get_config(script_directory, args): config_file = "" @@ -459,12 +682,13 @@ def get_config(script_directory, args): lineage_cache = {} def get_lineage(taxid): global lineage_cache + ranks = {} if not taxid: - return [] + return [], {} if taxid in lineage_cache: return lineage_cache[taxid] # Try cached taxonomy database - taxonomy_db_name = os.path.join(get_cache_dir(), get_versioned_path("taxonomy", "taxonomy4blast.sqlite3")) + taxonomy_db_name = os.path.join(get_cache_dir(), get_versioned_path("taxonomy", "taxonomy.sqlite3")) if os.path.exists(taxonomy_db_name): conn = sqlite3.connect(taxonomy_db_name) if conn: @@ -472,20 +696,30 @@ def get_lineage(taxid): lineage = [taxid] cur_taxid = taxid while cur_taxid != 1: - c.execute("SELECT 
parent FROM TaxidInfo WHERE taxid = ?", (cur_taxid,)) - cur_taxid = c.fetchone()[0] - lineage.append(cur_taxid) + c.execute("SELECT parent, rank FROM TaxidInfo WHERE taxid = ?", (cur_taxid,)) + parent, rank = c.fetchone() + if rank: + ranks[cur_taxid] = rank + lineage.append(parent) + cur_taxid = parent lineage.reverse() - lineage_cache[taxid] = lineage - return lineage + lineage_cache[taxid] = lineage, ranks + return lineage, ranks # Fallback to API taxon_json_file = urlopen(dataset_taxonomy_url+str(taxid)) taxon = json.load(taxon_json_file)["taxonomy_nodes"][0] lineage = taxon["taxonomy"]["lineage"] lineage.append(taxon["taxonomy"]["tax_id"]) - lineage_cache[taxid] = lineage - return lineage + + ranks_file = urlopen(dataset_taxonomy_url+",".join(map(str, lineage))) + nodes = json.load(ranks_file)["taxonomy_nodes"] + for node in nodes: + if 'rank' in node["taxonomy"]: + ranks[node["taxonomy"]["tax_id"]] = node["taxonomy"]["rank"] + + lineage_cache[taxid] = lineage, ranks + return lineage, ranks def get_tax_file(subsystem, tax_path): @@ -502,7 +736,7 @@ def get_tax_file(subsystem, tax_path): def get_closest_protein_bag(taxid): if not taxid: - return '' + return '','' taxids_file = get_tax_file("target_proteins", "taxid.list") taxids_list = [] @@ -515,7 +749,7 @@ def get_closest_protein_bag(taxid): t = parts[0] taxids_list.append(int(t)) - lineage = get_lineage(taxid) + lineage, _ = get_lineage(taxid) best_taxid = None best_score = 0 @@ -529,14 +763,34 @@ def get_closest_protein_bag(taxid): best_taxid = t if best_score == 0: - return '' + return '','' # print(best_taxid, best_score) - return get_file_path("target_proteins", f"{best_taxid}.faa.gz") + protein_file = get_file_path("target_proteins", f"{best_taxid}.faa.gz") + trusted_file = get_file_path("target_proteins", f"{best_taxid}.trusted_proteins.gi") + return protein_file,trusted_file def get_closest_hmm(taxid): + """ + Given a taxid, this function returns the closest HMM parameters file and a boolean 
indicating whether the match is good or not. + + Parameters: + taxid (int): The taxid for which to find the closest HMM parameters file. + + Returns: + tuple: A tuple containing two elements: + - str: The path to the closest HMM parameters file. + - bool: A boolean indicating whether the match is good or not. + + Raises: + None + + Examples: + >>> get_closest_hmm(123456) + ('/path/to/hmm_parameters/123456.params', True) + """ if not taxid: - return 0, "" + return "", False taxids_file = get_tax_file("gnomon", "hmm_parameters/taxid.list") @@ -551,21 +805,20 @@ def get_closest_hmm(taxid): l = map(lambda x: int(x) if x[-1] != ';' else int(x[:-1]), parts[1].split()) lineages.append((int(t), list(l)+[int(t)])) - #if len(lineages) < len(taxids_list): - # taxonomy_json_file = urlopen(dataset_taxonomy_url+','.join(taxids_list)) - # taxonomy = json.load(taxonomy_json_file)["taxonomy_nodes"] - # lineages = [ (t["taxonomy"]["tax_id"], t["taxonomy"]["lineage"] + [t["taxonomy"]["tax_id"]]) for t in taxonomy ] - - lineage = get_lineage(taxid) + lineage, ranks = get_lineage(taxid) + is_mammal = MAMMALS in lineage best_lineage = None best_taxid = None best_score = 0 + best_good_match = False for (t, l) in lineages: pos1 = 0 last_match = 0 + last_good_match = False for pos in range(len(lineage)): tax_id = lineage[pos] + rank = ranks.get(tax_id, '') while tax_id != l[pos1]: if pos1 + 1 < len(l): pos1 += 1 @@ -573,26 +826,87 @@ def get_closest_hmm(taxid): break if tax_id == l[pos1]: last_match = pos1 + last_good_match = last_good_match or rank == 'GENUS' or (rank == 'FAMILY' and is_mammal) else: break if last_match > best_score: best_score = last_match best_taxid = t best_lineage = l + best_good_match = last_good_match if best_score == 0: - return 0, "" + return "", False + + # Either perfect match or match of the genus or family for mammals + good_match = best_good_match or best_score == len(best_lineage) - 1 # print(best_lineage) # print(best_taxid, best_score) - return best_taxid, 
get_file_path("gnomon", f"hmm_parameters/{best_taxid}.params") + return get_file_path("gnomon", f"hmm_parameters/{best_taxid}.params"), good_match + + +def get_closest_ortho_ref_taxid(taxid): + if not taxid: + return 0 + + taxids_file = get_tax_file("ortholog_references", "taxid.list") + + taxids_list = [] + lineages = [] + for line in taxids_file: + parts = line.decode("utf-8").strip().split('\t') + if len(parts) > 0: + t = parts[0] + taxids_list.append(t) + if len(parts) > 1: + l = map(lambda x: int(x) if x[-1] != ';' else int(x[:-1]), parts[1].split()) + lineages.append((int(t), list(l)+[int(t)])) + + lineage, ranks = get_lineage(taxid) + + is_mammal = MAMMALS in lineage + best_lineage = None + best_taxid = None + best_score = 0 + best_good_match = False + for (t, l) in lineages: + pos1 = 0 + last_match = 0 + last_good_match = False + for pos in range(len(lineage)): + tax_id = lineage[pos] + rank = ranks.get(tax_id, '') + while tax_id != l[pos1]: + if pos1 + 1 < len(l): + pos1 += 1 + else: + break + if tax_id == l[pos1]: + last_match = pos1 + last_good_match = last_good_match or rank == 'GENUS' or (rank == 'FAMILY' and is_mammal) + else: + break + if last_match > best_score: + best_score = last_match + best_taxid = t + best_lineage = l + best_good_match = last_good_match + + + if best_score == 0: + return 0 + + # Either perfect match or match of the genus or family for mammals + good_match = best_good_match or best_score == len(best_lineage) - 1 + ##print(best_lineage) + ##print(best_taxid, best_score) + return best_taxid -PLANTS=33090 -VERTEBRATES=7742 def get_max_intron(taxid): if not taxid: return 0, 0 - lineage = get_lineage(taxid) + lineage, _ = get_lineage(taxid) if PLANTS in lineage: return 300000, 3000000000 elif VERTEBRATES in lineage: @@ -644,7 +958,7 @@ def merge_params(task_params, run_inputs): def print_statistics(output): - accept_gff = Path(output) / 'accept.gff' + accept_gff = Path(output) / 'complete.genomic.gff' print(f"Statistics for 
{accept_gff}") counter = defaultdict(int) if accept_gff.exists(): @@ -663,14 +977,34 @@ def print_statistics(output): print(f"{k:12s} {counter[k]}") +def get_software_version(): + global software_version + if not software_version or software_version[0] == '$': + process_result = subprocess.run(["git", "describe", "--tags"], stdout=subprocess.PIPE) + if process_result.returncode == 0: + version = process_result.stdout.decode('utf-8').strip() + else: + version = "unknown" + else: + version = software_version + return version + + +run_date = datetime.datetime.now().strftime("%Y_%m_%d") + def main(argv): "Main script for EGAPx" - #warn user that this is an alpha release - print("\n!!WARNING!!\nThis is an alpha release with limited features and organism scope to collect initial feedback on execution. Outputs are not yet complete and not intended for production use.\n") # Parse command line args = parse_args(argv) + if args.report_version: + print(f"EGAPx {get_software_version()}") + return 0 + + #warn user that this is an alpha release + print("\n!!WARNING!!\nThis is an alpha release with limited features and organism scope to collect initial feedback on execution. 
Outputs are not yet complete and not intended for production use.\n") + global user_cache_dir if args.local_cache: # print(f"Local cache: {args.local_cache}") @@ -708,16 +1042,50 @@ def main(argv): # Read default task parameters into a dict task_params = yaml.safe_load(open(Path(script_directory) / 'assets' / 'default_task_params.yaml', 'r')) run_inputs = repackage_inputs(yaml.safe_load(open(args.filename, 'r'))) - + + if int(args.ortho_taxid) > 0: + inputs = run_inputs['inputs'] + inputs['ortho'] = {'taxid': int(args.ortho_taxid)} + if not expand_and_validate_params(run_inputs): return 1 + # In GNOMON's chainer module, the default is -minlen 165 and -minscor 25.0, + # use -minlen 225 and -minscor 40.0 for Magnoliopsida and Vertebrates, + lineage, _ = get_lineage(run_inputs['input']['taxid']) + if MAGNOLIOPSIDA in lineage or VERTEBRATES in lineage: + minlen = 225 + minscor = 40.0 + else: + minlen = 165 + minscor = 25.0 + task_params = merge_params(task_params, {'tasks': { 'chainer': {'chainer_wnode': f"-minlen {minlen} -minscor {minscor}"}}}) + + # Add some parameters to specific tasks + inputs = run_inputs['input'] + final_asn_params = f"-annot-software-version {get_software_version()}" + if 'annotation_provider' in inputs: + final_asn_params += f" -annot-provider \"{inputs['annotation_provider']}\"" + if 'annotation_name_prefix' in inputs: + annotation_name_prefix = re.sub(r'\s+', '-', inputs['annotation_name_prefix']) + final_asn_params += f" -annot-name {annotation_name_prefix}-GB_{run_date}" + else: + final_asn_params += f" -annot-name GB_{run_date}" + if 'locus_tag_prefix' in inputs: + locus_tag_prefix = re.sub(r'\s+', '-', inputs['locus_tag_prefix']) + final_asn_params += f" -locus-tag-prefix {locus_tag_prefix}" + task_params = merge_params(task_params, {'tasks': { 'final_asn_markup': {'final_asn': final_asn_params}}} ) + # Command line overrides manifest input if args.output: run_inputs['output'] = args.output # Convert inputs to absolute paths - 
convert_paths(run_inputs) + try: + convert_paths(run_inputs) + except OSError as e: + print(F"{e.strerror}: {e.filename}") + return 1 # Create output directory if needed os.makedirs(run_inputs['output'], exist_ok=True) @@ -766,7 +1134,8 @@ def main(argv): nf_cmd += ["-with-report", f"{args.report}.report.html", "-with-timeline", f"{args.report}.timeline.html"] else: nf_cmd += ["-with-report", f"{output}/run.report.html", "-with-timeline", f"{output}/run.timeline.html"] - + + nf_cmd += ["-with-dag", 'dag.dot'] nf_cmd += ["-with-trace", f"{output}/run.trace.txt"] # if output directory does not exist, it will be created if not os.path.exists(output): From 5cdbc46f76687d42ca8fb208ca184c4600193b58 Mon Sep 17 00:00:00 2001 From: Victor Joukov Date: Tue, 5 Nov 2024 17:00:14 -0500 Subject: [PATCH 2/2] README version fix --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bbc7842..ca4e3d2 100644 --- a/README.md +++ b/README.md @@ -419,7 +419,7 @@ If you do not have internet access from your cluster, you can run EGAPx in offli ``` rm egap*sif singularity cache clean -singularity pull docker://ncbi/egapx:0.3-alpha +singularity pull docker://ncbi/egapx:0.3.0-alpha ``` - Clone the repo: