Merge pull request #191 from MelbourneGenomics/dev

Upgrade 2.4
MelbourneGenomics · Nov 1, 2016 · 423bafc · 423bafc
2 parents 908dce4 + 92af917
commit 423bafc
Show file tree

Hide file tree

Showing 52 changed files with 2,993 additions and 2,096 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,6 @@
 /tools/
 /data/
 /pipeline/config.groovy
-/tasks/nectar_assets/current.manifest.json
+/tasks/nectar/current.manifest.json
 *.pyc
 *.swp
diff --git a/Dockerfile b/Dockerfile
@@ -4,7 +4,7 @@ FROM ubuntu
 SHELL ["/bin/bash", "-c"]
 
 # Install basic linux tools that we need to make python
-RUN apt-get update && apt-get install -y curl make build-essential libsqlite3-dev
+RUN apt-get update && apt-get install -y curl make build-essential libssl-dev zlib1g-dev libsqlite3-dev libreadline-dev git libbz2-dev libz-dev libssl-dev gfortran xorg-dev libcurl4-openssl-dev ncurses-dev openjdk-8-jdk
 
 # Copy in the repository
 ADD . /opt/cpipe

diff --git a/README.md b/README.md
@@ -1,16 +1,39 @@
-Cpipe  
-=======================
+# Cpipe  
 
 Cpipe is a clinically focused exome sequencing pipeline developed
 by the Melbourne Genomics Health Alliance. Cpipe offers an industry
 standard variant calling pipeline with a suite of additional features 
 needed by diagnostic laboratories added on top.
 
+A simplified workflow is presented here. For more specific information, refer to the [documentation](docs/index.md).
+
+## Basic Installation
+
 To set up Cpipe, clone this repository and then run the install script:
 
     git clone https://github.com/MelbourneGenomics/cpipe.git
     cd cpipe
-    ./pipeline/scripts/install.sh
+    cp /path/to/swift_credentials.sh .
+    ./install.sh
+
+For more detailed instructions, have a look at the [installation documentation](docs/install.md).
+
+## Creating your analysis batch
+
+* Next, create the analysis directory and copy in your fastq files.
+   ```bash
+   mkdir -p batches/<batch_identifier>/data
+   cp  <fastq_files> batches/<batch_identifier>/data
+   ```
+* Now, rename your fastqs to ensure they fit the following pattern:
+`sampleID_<anything>_L[0-9]*_R[0-9].fastq.gz`
+* Lastly, create a metadata file for your batch using:
+`./cpipe batch add_batch --batch <batch_identifier> --profile ALL`
+
+For more information about this stage, refer to the [batches documentation](docs/batches.md).
+
+## Running the Pipeline
 
-For further instructions, take a look at the [User Guide](https://melbournegenomics.github.io/docs/Cpipe_User_Guide.pdf).
+Now, all you need to do is run `./cpipe --batch <batch_identifier> run`.
 
+The run command is documented in the [command documentation](docs/commands.md#run).
diff --git a/cpipe b/cpipe
@@ -1,9 +1,52 @@
-#!/usr/bin/env bash
+#!/usr/bin/env bash 
 set -e
 
+function check_java {
+    # Run the java check if necessary
+    if (( $JAVA_CHECK )) ; then
+        doit check_java #> /dev/null
+    fi
+}
+
+function print_general_usage {
+    echo "${normal}Usage: ./cpipe <CPIPE OPTIONS> COMMAND <COMMAND OPTIONS>"
+    echo "${bold}Commands (type --help after any command for more details):"
+    echo "  ${bold}run${normal}: Runs the analysis pipeline"
+    echo "  ${bold}test${normal}: Runs the pipeline tests"
+    echo "  ${bold}batch${normal}: Creates and modifies analysis batches"
+    echo "  ${bold}genelist${normal}: Creates and modifies genelists"
+    echo "  ${bold}metadata${normal}: Creates and modifies sample metadata files"
+    echo "${bold}Cpipe Options"
+    echo "  ${bold}-b, --batch <batch name>"
+    echo "    ${normal}Specify a batch (a subdirectory inside batches) to use for the run and bpipe commands. Defaults to a batch named 'batch'"
+    echo "  ${bold}--help, --usage"
+    echo "    ${normal}Prints this help page"
+    echo "  ${bold}-j, --no-java-check"
+    echo "    ${normal}Disables the java version check. Only do this if you know what you're doing"
+}
+
+function print_metadata_usage {
+    echo "${normal}Usage: ./cpipe <CPIPE OPTIONS> metadata <SUBCOMMAND>"
+    echo "${bold}metadata subcommands (use --help after each for usage options):"
+    echo "  ${bold}check${normal}: Check an existing metadata file"
+    echo "  ${bold}update${normal}: Update an existing metadata file"
+}
+
+function print_run_usage {
+    echo "${normal}Usage: ./cpipe run <COMMAND OPTIONS>"
+    echo "${bold}Run Options"
+    echo "  ${bold}--help, --usage"
+    echo "    ${normal}Prints this help page"
+    echo "  ${bold}-p, --bpipe-options <options>"
+    echo "    ${normal}Specify options to pass to bpipe. Refer to http://docs.bpipe.org/Commands/run/ for reference"
+}
+
 # Set useful variables
 ROOT=$(readlink -f $(dirname $BASH_SOURCE))
 
+# default batch name
+BATCH='batch'
+
 # Load config groovy
 source pipeline/scripts/config_groovy_util.sh
 load_config
@@ -12,32 +55,30 @@ load_config
 source environment.sh
 
 # Printing utilities
-bold=$(tput bold)
-normal=$(tput sgr0)
+if [[ $- == *i* ]]; then
+    bold=$(tput bold)
+    normal=$(tput sgr0)
+else
+    bold=
+    normal=
+fi
 
 #Parse command line arguments
 JAVA_CHECK=1
-ARGS=$(env POSIXLY_CORRECT=1 getopt -o j --long "no-java-check,help,usage" -n $(basename $BASH_SOURCE) -- "$@")
+ARGS=$(env POSIXLY_CORRECT=1 getopt -o jb: --long "no-java-check,help,usage,batch:" -n $(basename $BASH_SOURCE) -- "$@")
 eval set -- "$ARGS"
 while true ; do
     case "$1" in
+        -b|--batch)
+            BATCH=$2
+            shift 2
+        ;;
         -j|--no-java-check)
             JAVA_CHECK=0
             shift 1
         ;;
         --help|--usage)
-            echo "${normal}Usage: ./cpipe <CPIPE OPTIONS> COMMAND <COMMAND OPTIONS>"
-            echo "${bold}Commands (type --help after any command for more details):"
-            echo "  ${bold}pipeline${normal}: Runs the analysis pipeline"
-            echo "  ${bold}test${normal}: Runs the pipeline tests"
-            echo "  ${bold}batch${normal}: Creates and modifies analysis batches"
-            echo "  ${bold}genelist${normal}: Creates and modifies genelists"
-            echo "  ${bold}metadata${normal}: Creates and modifies sample metadata files"
-            echo "${bold}Cpipe Options"
-            echo "  ${bold}--help, --usage"
-            echo "    ${normal}Prints this help page"
-            echo "  ${bold}-j, --no-java-check"
-            echo "    ${normal}Disables the java version check. Only do this if you know what you're doing"
+            print_general_usage
             exit 0
         ;;
         --)
@@ -47,10 +88,6 @@ while true ; do
     esac
 done
 
-# Run the java check if necessary
-if (( $JAVA_CHECK )) ; then
-    doit check_java #> /dev/null
-fi
 
 case "$1" in
     batch)
@@ -66,47 +103,52 @@ case "$1" in
     metadata)
         shift 1
         case "$1" in
+            --help|--usage)
+                print_metadata_usage
+                exit 0
+            ;;
             check)
+                shift 1
                 #e.g. docker run cpipe metadata check < ./batches/batch_identifier/samples.txt
                 python pipeline/scripts/check_metadata.py "$@" < /dev/stdin
             ;;
             update)
+                shift 1
                 #e.g. docker run cpipe metadata update --sample sample_name --name prioritised_genes --value “4:ABC1,ABC2” --target ./batches/batch_identifier/samples.txt
                 python pipeline/scripts/update_metadata.py "$@" < /dev/stdin
             ;;
+            *)
+                print_metadata_usage
+                exit 1
+            ;;
+
         esac
     # e.g. docker run cpipe genelist show_bed --profile profile_name
     ;;
-    pipeline)
+    bpipe)
+        shift 1
+        cd ${ROOT}/batches/${BATCH}/analysis
+        echo `pwd`
+        ../../../bpipe $@
+    ;;
+    run)
 
         # Parse args
         shift 1
-        ARGS=$(getopt -o b:p: --long "batch:,bpipe-options:,help,usage" -n $(basename $BASH_SOURCE) -- "$@")
+        ARGS=$(getopt -o p: --long ",bpipe-options:,help,usage" -n $(basename $BASH_SOURCE) -- "$@")
         eval set -- "$ARGS"
 
         # Default args
-        BATCH='batch'
 
         # Process args - they can specify a batch directory to replace the default 'batch', and they can specify bpipe options manually
         while true ; do
                 case "$1" in
-                    -b|--batch)
-                        BATCH=$2
-                        shift 2
-                    ;;
                     -p|--bpipe-options)
                         BPIPE_OPTIONS=$2
                         shift 2
                     ;;
                     --usage|--help)
-                        echo "${normal}Usage: ./cpipe pipeline <COMMAND OPTIONS>"
-                        echo "${bold}Pipeline Options"
-                        echo "  ${bold}--help, --usage"
-                        echo "    ${normal}Prints this help page"
-                        echo "  ${bold}-b, --batch <batch name>"
-                        echo "    ${normal}Specify a batch to run (a subdirectory inside batches). Defaults to a batch named 'batch'"
-                        echo "  ${bold}-p, --bpipe-options <options>"
-                        echo "    ${normal}Specify options to pass to bpipe. Refer to http://docs.bpipe.org/Commands/Commands/ for reference"
+                        print_run_usage
                         exit 0
                     ;;
                     --)
@@ -116,12 +158,19 @@ case "$1" in
                 esac
         done
 
+        check_java
         mkdir -p batches/${BATCH}/analysis
-        cd batches/${BATCH}/analysis
+        cd ${ROOT}/batches/${BATCH}/analysis
         ../../../bpipe run ${BPIPE_OPTIONS} ../../../pipeline/pipeline.groovy ../samples.txt < /dev/stdin
     ;;
     test)
         shift 1
+        check_java
         pipeline/scripts/run_unit_tests.sh && pipeline/scripts/run_tests.sh detect_mutations_test
     ;;
+    *)
+        echo "Invalid cpipe command!"
+        print_general_usage
+        exit 1
+    ;;
 esac
diff --git a/docs/batches.md b/docs/batches.md
@@ -0,0 +1,108 @@
+# Batches
+
+* [Introduction](#introduction)
+* [Manipulating Batches](#manipulating-batches)
+  * [Creating a Batch](#creating-a-batch)
+  * [Adding More Samples](#adding-more-samples)
+  * [Viewing Batch Information](#viewing-batch-information)
+* [Files](#files)
+  * [Data Directory](#data-directory)
+  * [Sample Metadata](#sample-metadata)
+  * [config.batch.groovy](#configbatchgroovy)
+
+## Introduction
+In Cpipe, a batch is a group of samples to be analysed at the same time. In the filesystem, a batch is a directory inside
+ `cpipe/batches`. For example, a batch named `batch_001` would mean creating a `cpipe/batches/batch_001` directory.
+
+## Manipulating Batches
+### Creating a Batch
+Once you have your fastq files, follow these steps to create a new analysis batch:
+* Create the batch directory and copy in the fastq data:
+
+    ```bash
+    mkdir -p batches/batch_identifier/data
+    cp  your_fastq_files batches/batch_identifier/data
+    ```
+* Create the metadata file using:
+    ```bash
+      ./cpipe batch add_batch --batch <batch_identifier> --profile profile_name
+    ```
+
+  For more information on this command, refer to the [`add_batch` documentation](commands.md#add-batch)
+
+### Adding More Samples
+
+To add more samples to an existing batch, use the `./cpipe batch add_samples` command. Refer to
+[its documentation](./commands.md#add-sample) for more information.
+
+### Viewing Batch Information
+
+Cpipe provides two utility commands for viewing batch information:
+* `./cpipe batch show_batches` lists all the batches in the current installation. Refer to 
+[its documentation](commands.md#show-batches) for more information
+* `./cpipe batch show_batch --batch <batch name>` will list information about an existing batch. Refer to
+[its documentation](commands.md#show-batch) for more information.
+
+## Files
+Inside the batch directory (`batches/<batch name>`) are three fundamental elements, each covered in more detail below:
+ * The `data` subdirectory (mandatory) 
+ * The sample metadata file (mandatory)
+ * A `config.batch.groovy` configuration file (optional)
+
+### Data Directory 
+The data directory is a directory named `data` that will hold all of the fastq samples for this batch. 
+Each sample in this directory  must fit the pattern `<sample id>_<anything>_<lane number>_<read number>.fastq.gz`, for 
+example, `00NA12877_Coriell_000_TGx140395_TL140776_L001_R1.fastq.gz`. The components of this filename are as follows:
+* `sample id` is any unique name for the sample, e.g. `00NA12877` in our example
+* `anything` of course can be anything produced by the sequencer, in this case `Coriell_000_TGx140395_TL140776` 
+* `lane number` must be the letter L followed by the lane number, in this case L001
+* `read number` must be the letter R followed by either 1 or 2, the read number, which is `R1` in this example
+
+### Sample Metadata
+The sample metadata is the `samples.txt` file located in the batch directory (`batches/<batch name>`) of the batch you're
+running. The file is a TSV (tab-separated text file), where each line corresponds to an input sample. Here you can set
+various options about each sample:
+
+|Position | Name | Notes | Allowed Values | Availability|
+|---|---|---|---|---|
+|1 | Batch | Identifier for this batch (for Melbourne Genomics this is a 3-digit ID) | Unrestricted | 2.2+|
+|2 | Sample ID | Identifier for this sample (for Melbourne Genomics this is a 9-digit ID) | Unrestricted | 2.2+|
+|3 | DNA Tube ID |  	 | Unrestricted | 2.2+|
+|4 | Sex |  	 | Male, Female, Unknown, other | 2.2+|
+|5 | DNA Concentration | ng/uL | Numeric [0-9.] | 2.2+|
+|6 | DNA Volume | uL | Unrestricted | 2.2+|
+|7 | DNA Quantity | ng | Numeric [0-9.] | 2.2+|
+|8 | DNA Quality |  	 | Numeric [0-9.] | 2.2+|
+|9 | DNA Date | Date of extraction | Comma separated list of dates of the format  yyyymmdd | 2.2+|
+|10 | Cohort | Name of target region to be analysed for the patient | Unrestricted | 2.2+|
+|11 | Sample Type |  	 | Normal, Tumour | 2.2+|
+|12 | Fastq files |  	 | Comma separated list of FASTQ files found in the data directory. | 2.2+|
+|13 | Prioritised genes | The categories are used to prioritise variants from these genes in the gene priority column of the pipeline output. | Comma separated gene list, space separating priorities, e.g. 3:GABRD,KCNAB2,ALG6 4:CASQ2,HAX1,CHRNB2,KCNJ10 | 2.2+ |
+|14 | Consanguinity |  	 | No, Yes, Suspected, Unknown| 2.2+ |
+|15 | Variants file | Known variants for the disease (not implemented) | Unrestricted |2.2+|
+|16 | Pedigree file | PED specification for trios (see Trio Analysis below) | Unrestricted|2.2+|
+|17 | Ethnicity | For filtering on specific variants (not implemented) | Unknown, European, African, Asian|2.2+ |
+|18 | Variant call group | Comma separated list of samples to call as a group (not implemented) | Unrestricted| 2.2+ |
+|19 | Capture date | Exome capture date | Comma separated list of dates of the format yyyymmdd | 2.2+ |
+|20 | Sequencing Date | Date of sequencing. | Comma separated list of dates of the format  yyyymmdd|2.2+ |
+|21 | Mean Coverage | Total on-target aligned mean coverage, post duplicate removal as obtained by the sequencing laboratory. | Numeric [0-9.]|2.2+ |
+|22 | Duplicate % | Percentage of detected duplicates removed before calculating mean on-target coverage |  | 2.2+ |
+|23 | Machine ID | Provided by the sequencing laboratory | Comma separated list | 2.2+|
+|24 | DNA Extraction Lab |  |   | 2.2+ |
+|25 | Sequencing Lab |  |   | 2.2+ |
+|26 | Exome capture |   |   | 2.2+ |
+|27| Library preparation |  |   |2.2+ |
+|28 | Barcode pool size |   |   |2.2+|
+|29 | Read type |   |   | 2.2+ |
+|30 | Machine type| |   |2.2+ |
+|31 | Sequencing chemistry |    |   | 2.2+ |
+|32|Sequencing software |   |   | 2.2+ |
+|33 | Demultiplex software |    |   |2.2+|
+|34 | Hospital centre | Origin of the patient sample |  	| 2.2+ |
+|35| Sequencing contact | Where sequencing alerts should be sent | Unrestricted | 2.2+ | 
+|36|Pipeline contact | Where pipeline result alerts should be sent | Unrestricted | 2.2+ | 
+|37|Notes | Additional notes or relevant information about the sequencing | Unrestricted |2.2+|
+|38 | Pipeline notes | Additional notes relevant to the operation of the pipeline | Unrestricted | 2.3 |
+|39 | Analysis type | Currently unused | Unrestricted | 2.3 |
+
+### config.batch.groovy
+The final file that can be part of a batch is an optional configuration file. Refer to the [configuration](configuration.md)
+section for more details.