Skip to content

Commit

Permalink
Merge pull request wtsi-npg#242 from ces/devel
Browse files Browse the repository at this point in the history
Add support for loading PacBio ccs secondary analysis BAM files
  • Loading branch information
kjsanger authored Jul 23, 2019
2 parents d5b5189 + 8531523 commit 7dea509
Show file tree
Hide file tree
Showing 18 changed files with 332 additions and 48 deletions.
4 changes: 4 additions & 0 deletions Changes
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@


- Add support for loading PacBio ccs BAM files and setting target = 1
on relevant PacBio sequence files.

Release 2.13.0

- WTSI::NPG::HTS::Illumina::ResultSet - added geno to genotype_regex.
Expand Down
4 changes: 4 additions & 0 deletions MANIFEST
Original file line number Diff line number Diff line change
Expand Up @@ -16510,6 +16510,10 @@ t/data/pacbio/sequel_analysis/001612/tasks/barcoding.tasks.lima-0/lima_output.lb
t/data/pacbio/sequel_analysis/001612/tasks/barcoding.tasks.lima-0/lima_output.removed.bam
t/data/pacbio/sequel_analysis/001612/tasks/barcoding.tasks.lima-0/lima_output.removed.bam.pbi
t/data/pacbio/sequel_analysis/001612/tasks/barcoding.tasks.lima-0/lima_output.removed.subreadset.xml
t/data/pacbio/sequel_analysis/000226/entry-points/acf46f00-12b8-45e6-bc10-b0790f8d6758.subreadset.xml
t/data/pacbio/sequel_analysis/000226/tasks/pbcoretools.tasks.auto_ccs_outputs-0
t/data/pacbio/sequel_analysis/000226/tasks/pbcoretools.tasks.auto_ccs_outputs-0/m64016_190608_025655.ccs.bam
t/data/pacbio/sequel_analysis/000226/tasks/pbcoretools.tasks.auto_ccs_outputs-0/m64016_190608_025655.ccs.bam.pbi
t/data/pacbio/superfoo/24862_627/A01_1/Analysis_Results/m131209_183112_00127_c100579142550000001823092301191430_s1_p0.1.bax.h5
t/data/pacbio/superfoo/24862_627/A01_1/Analysis_Results/m131209_183112_00127_c100579142550000001823092301191430_s1_p0.1.log
t/data/pacbio/superfoo/24862_627/A01_1/Analysis_Results/m131209_183112_00127_c100579142550000001823092301191430_s1_p0.2.bax.h5
Expand Down
Empty file modified bin/npg_pacbio_analysis_monitor.pl
100644 → 100755
Empty file.
84 changes: 54 additions & 30 deletions lib/WTSI/NPG/HTS/PacBio/Annotator.pm
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package WTSI::NPG::HTS::PacBio::Annotator;
use List::AllUtils qw[uniq];
use Moose::Role;
use WTSI::NPG::iRODS::Metadata;
use WTSI::DNAP::Utilities::Params qw[function_params];

our $VERSION = '';

Expand All @@ -13,7 +14,16 @@ with qw[
=head2 make_primary_metadata
Arg [1] PacBio run metadata, WTSI::NPG::HTS::PacBio::Metadata.
Arg [2] Is data R & D? Boolean. Optional, defaults to false.
Named args : data_level Processing level of the data being archived,
e.g. primary (off instrument) or secondary
(subsequently post processed). Optional.
is_target Is target? If false then target flag
is not set. Data is not target where it
is not deplexed or where data at a different
data level is the default for the customer.
Boolean. Defaults to true.
is_r_and_d Is data R & D? Boolean. Defaults to false.
Example : my @avus = $ann->make_primary_metadata($metadata);
Description: Return instrument, run, cell index, collection number, set
Expand All @@ -23,35 +33,49 @@ with qw[
=cut

sub make_primary_metadata {
my ($self, $metadata, $is_r_and_d) = @_;

defined $metadata or
$self->logconfess('A defined metadata argument is required');

my @avus;
push @avus, $self->make_avu($PACBIO_CELL_INDEX, $metadata->cell_index);
push @avus, $self->make_avu($PACBIO_COLLECTION_NUMBER, $metadata->collection_number);
push @avus, $self->make_avu($PACBIO_INSTRUMENT_NAME, $metadata->instrument_name);
push @avus, $self->make_avu($PACBIO_RUN, $metadata->run_name);
push @avus, $self->make_avu($PACBIO_WELL, $metadata->well_name);
push @avus, $self->make_avu($PACBIO_SAMPLE_LOAD_NAME, $metadata->sample_name);

# Deprecated field, used in early version of RS
if ($metadata->has_set_number){
push @avus, $self->make_avu($PACBIO_SET_NUMBER, $metadata->set_number);
}

if ($is_r_and_d) {
# R & D data
push @avus, $self->make_avu($SAMPLE_NAME, $metadata->sample_name);
}
else {
# Production data
push @avus, $self->make_avu($PACBIO_SOURCE, $PACBIO_PRODUCTION);
}

return @avus;
{
  # $self and $metadata are taken positionally; the remaining arguments are
  # handled as named parameters through the function_params parser object.
  my $positional = 2;
  my @named      = qw[data_level is_target is_r_and_d];
  my $params     = function_params($positional, @named);

  # Assemble the primary AVUs for a PacBio data object from its run
  # metadata. The data level AVU is added only when a data level is given,
  # the source AVU depends on the R & D flag and the target AVU is added
  # unless is_target is explicitly false.
  sub make_primary_metadata {
    my ($self, $metadata) = $params->parse(@_);

    if (not defined $metadata) {
      $self->logconfess('A defined meta argument is required');
    }

    # AVUs that are always present, in a fixed order.
    my @avus = (
      $self->make_avu($PACBIO_CELL_INDEX,        $metadata->cell_index),
      $self->make_avu($PACBIO_COLLECTION_NUMBER, $metadata->collection_number),
      $self->make_avu($PACBIO_INSTRUMENT_NAME,   $metadata->instrument_name),
      $self->make_avu($PACBIO_RUN,               $metadata->run_name),
      $self->make_avu($PACBIO_WELL,              $metadata->well_name),
      $self->make_avu($PACBIO_SAMPLE_LOAD_NAME,  $metadata->sample_name),
    );

    if ($params->data_level) {
      push @avus, $self->make_avu($PACBIO_DATA_LEVEL, $params->data_level);
    }

    # Deprecated field, used in early version of RS
    if ($metadata->has_set_number) {
      push @avus, $self->make_avu($PACBIO_SET_NUMBER, $metadata->set_number);
    }

    # R & D data carry the sample name; production data are marked with
    # the production source instead.
    my @source_avus = $params->is_r_and_d
      ? $self->make_avu($SAMPLE_NAME, $metadata->sample_name)
      : $self->make_avu($PACBIO_SOURCE, $PACBIO_PRODUCTION);
    push @avus, @source_avus;

    # An absent is_target counts as true, so only an explicitly false
    # value suppresses the target flag.
    my $is_target = $params->is_target;
    if (!defined $is_target || $is_target) {
      push @avus, $self->make_avu($TARGET, 1);
    }

    return @avus;
  }
}

=head2 make_secondary_metadata
Expand Down
7 changes: 6 additions & 1 deletion lib/WTSI/NPG/HTS/PacBio/Metadata.pm
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,12 @@ has 'results_folder' =>
predicate => 'has_results_folder',
documentation => 'The results folder');


# Optional, read-only ccs indicator, held as a string rather than a Bool.
# NOTE(review): presumably carries the raw IsCCS text from the run XML
# (e.g. 'true' / 'false') - confirm against the parser before changing
# the type.
has 'is_ccs' =>
  (is            => 'ro',
   isa           => 'Str',
   required      => 0,
   predicate     => 'has_is_ccs',
   documentation => 'Is the PacBio data ccs');


around BUILDARGS => sub {
Expand Down
6 changes: 5 additions & 1 deletion lib/WTSI/NPG/HTS/PacBio/RunPublisher.pm
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,11 @@ sub publish_basx_files {
$num_files = $num_processed = $num_errors = scalar @{$files};
}
else {
my @primary_avus = $self->make_primary_metadata($metadata, $is_r_and_d);
my $is_target = $is_r_and_d ? 0 : 1;
my @primary_avus = $self->make_primary_metadata
($metadata,
is_target => $is_target,
is_r_and_d => $is_r_and_d);
my @secondary_avus = $self->make_secondary_metadata(@run_records);
my @extra_avus = $self->make_avu($FILE_TYPE, 'bas');

Expand Down
1 change: 0 additions & 1 deletion lib/WTSI/NPG/HTS/PacBio/Sequel/APIClient.pm
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ use Moose;
use MooseX::StrictConstructor;
use URI;
use URI::Split qw(uri_join);
use Readonly;
use JSON;

with qw[
Expand Down
33 changes: 24 additions & 9 deletions lib/WTSI/NPG/HTS/PacBio/Sequel/AnalysisPublisher.pm
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,14 @@ our $METADATA_SET = 'subreadset';
# Location of source metadata file
our $ENTRY_DIR = 'entry-points';

our $NOT_DEPLEXED = '\.removed\.';

# Well directory pattern
our $WELL_DIRECTORY_PATTERN = '\d+_[A-Z]\d+$';

# Additional sequence filenames permitted for loading
our @FNAME_PERMITTED = qw[removed ccs];

# Data processing level
our $DATA_LEVEL = 'secondary';

has 'analysis_path' =>
(isa => 'Str',
Expand Down Expand Up @@ -106,16 +109,23 @@ sub publish_sequence_files {
if ($tag_id) {
@tag_records = $self->find_pacbio_runs
($self->_metadata->run_name, $self->_metadata->well_name, $tag_id);
} else {
$self->_is_allowed_fname($file) or
$self->logcroak("Unexpected file name for $file");
}

my @records =
(@tag_records == 1) ?
@tag_records :
$self->find_pacbio_runs($self->_metadata->run_name,
$self->_metadata->well_name);

if (@records >= 1) {
my @primary_avus = $self->make_primary_metadata($self->_metadata);
my $is_target = @records > 1 ? 0 : 1;

my @primary_avus = $self->make_primary_metadata
($self->_metadata,
data_level => $DATA_LEVEL,
is_target => $is_target);
my @secondary_avus = $self->make_secondary_metadata(@records);

my ($a_files, $a_processed, $a_errors) =
Expand Down Expand Up @@ -246,14 +256,16 @@ sub _get_tag_from_fname {
my ($bc1, $bc2) = ($1, $2);
$tag_id = ($bc1 == $bc2) ? $bc1 : undef;
}

defined ($tag_id || $file =~ /$NOT_DEPLEXED/smx) or
$self->logcroak("Unexpected deplexed file name : $file");

return $tag_id;
}

sub _dest_path{
# Return 1 when the file name contains exactly one of the permitted name
# tokens from @FNAME_PERMITTED as a dot-delimited component, 0 otherwise
# (i.e. when none or more than one of the tokens is present).
sub _is_allowed_fname {
  my ($self, $file) = @_;

  my $num_matched = 0;
  foreach my $permitted (@FNAME_PERMITTED) {
    if ($file =~ m{[.] $permitted [.]}smx) {
      $num_matched++;
    }
  }

  if ($num_matched == 1) {
    return 1;
  }

  return 0;
}

sub _dest_path {
my ($self) = @_;

@{$self->smrt_names} == 1 or
Expand All @@ -278,6 +290,9 @@ WTSI::NPG::HTS::PacBio::Sequel::AnalysisPublisher
Publishes relevant files to iRODS, adds metadata and sets permissions.
This module is suitable for loading auto secondary analysis output from
demultiplex jobs, ccs analysis and combined demultiplex+ccs analysis.
Since SMRT Link v7 deplexing jobs have produced BAM files for identified
barcode tags and also files named removed.bam (equivalent to tag zero
in Illumina) which contain the reads not assigned to any tag. Expected
Expand Down
7 changes: 7 additions & 0 deletions lib/WTSI/NPG/HTS/PacBio/Sequel/MetaXMLParser.pm
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ our $CELL_INDEX_TAG = 'CellIndex';
our $OUTPUT_TAG = 'OutputOptions';
our $RFOLDER_TAG = 'ResultsFolder';

our $IS_CCS_TAG = 'IsCCS';


=head2 parse_file
Expand Down Expand Up @@ -76,6 +77,9 @@ sub parse_file {
my $results_folder =
$output->getElementsByTagName($prefix . $RFOLDER_TAG)->[0]->string_value;

my $is_ccs = $dom->getElementsByTagName($prefix . $IS_CCS_TAG) ?
$dom->getElementsByTagName($prefix . $IS_CCS_TAG)->[0]->string_value : 0;

return WTSI::NPG::HTS::PacBio::Metadata->new
(file_path => $file_path,
instrument_name => $instrument_name,
Expand All @@ -86,6 +90,7 @@ sub parse_file {
collection_number => $collection_number,
cell_index => $cell_index,
results_folder => $results_folder,
is_ccs => $is_ccs,
);
}

Expand All @@ -106,6 +111,8 @@ WTSI::NPG::HTS::PacBio::Sequel::MetaXMLParser
Parser for the Sequel PacBio metadata XML file(s) found in each SMRT
cell subdirectory of completed run data.
Some fields, e.g. IsCCS, are only found in XML files from ICS version 7+.
=head1 AUTHOR
Guoying Qi E<lt>[email protected]E<gt>
Expand Down
12 changes: 11 additions & 1 deletion lib/WTSI/NPG/HTS/PacBio/Sequel/RunPublisher.pm
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ our $FILE_PREFIX_PATTERN = 'm\d+_\d+_\d+';
# Well directory pattern
our $WELL_DIRECTORY_PATTERN = '\d+_[A-Z]\d+$';

# Data processing level
our $DATA_LEVEL = 'primary';

override '_build_directory_pattern' => sub {
my ($self) = @_;

Expand Down Expand Up @@ -150,7 +153,14 @@ sub publish_sequence_files {
": publishing '$smrt_name' as R and D data");
}

my @primary_avus = $self->make_primary_metadata($metadata, $is_r_and_d);
my $is_target =
($metadata->is_ccs eq 'true' || @run_records > 1 || $is_r_and_d) ? 0 : 1;

my @primary_avus = $self->make_primary_metadata
($metadata,
data_level => $DATA_LEVEL,
is_target => $is_target,
is_r_and_d => $is_r_and_d);
my @secondary_avus = $self->make_secondary_metadata(@run_records);

my $files = $self->list_sequence_files($smrt_name);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
<WellName>A02</WellName>
<Concentration>0</Concentration>
<InsertSize>20000</InsertSize>
<IsCCS>false</IsCCS>
<SampleReuseEnabled>false</SampleReuseEnabled>
<StageHotstartEnabled>false</StageHotstartEnabled>
<SizeSelectionEnabled>false</SizeSelectionEnabled>
Expand Down
Loading

0 comments on commit 7dea509

Please sign in to comment.