Skip to content

Commit

Permalink
Add the necessary fields and logic for storing single-cell sparsity m…
Browse files Browse the repository at this point in the history
…etrics

Add 3 fields in BioAssay to keep track of the number of cells, design
elements and cell-by-design-element pairs with a non-zero expression
value.

See #1273 for details.
  • Loading branch information
arteymix committed Oct 29, 2024
1 parent 7471c00 commit dd9e0f2
Show file tree
Hide file tree
Showing 5 changed files with 192 additions and 57 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@
*/
package ubic.gemma.model.expression.bioAssay;

import ubic.gemma.model.common.auditAndSecurity.Securable;
import org.hibernate.search.annotations.*;
import ubic.gemma.model.common.AbstractDescribable;
import ubic.gemma.model.common.auditAndSecurity.Securable;
import ubic.gemma.model.common.auditAndSecurity.SecuredChild;
import ubic.gemma.model.common.description.DatabaseEntry;
import ubic.gemma.model.expression.arrayDesign.ArrayDesign;
Expand All @@ -30,7 +30,6 @@
import javax.persistence.Transient;
import java.io.Serializable;
import java.util.Date;
import java.util.Objects;

/**
* Represents the bringing together of a biomaterial with an assay of some sort (typically an expression assay). We
Expand All @@ -42,33 +41,102 @@ public class BioAssay extends AbstractDescribable implements SecuredChild, Seria

private static final long serialVersionUID = -7868768731812964045L;

private Boolean isOutlier = false;
private Date processingDate;
@Nullable
private Long sequenceReadCount;
@Nullable
private Integer sequenceReadLength;
@Nullable
private Boolean sequencePairedReads;

/**
* Platform used in this assay.
*/
private ArrayDesign arrayDesignUsed;

/**
* If the sample data was switched to another platform, this is what it was originally.
* If the assay data was switched to another platform, this is what it was originally.
*/
@Nullable
private ArrayDesign originalPlatform;

/**
* Sample used in this assay.
*/
private BioMaterial sampleUsed;

/**
* Accession for this assay.
*/
private DatabaseEntry accession;

/**
* Indicates if the assay should be considered an outlier based on QC.
* <p>
* The audit trail for the owning {@link ubic.gemma.model.expression.experiment.ExpressionExperiment} tracks when
* this was done.
*/
private Boolean isOutlier = false;

/**
* Indicates the date that the assay was processed in the original study. This would correspond to "batch" in the
* experimental design.
*/
@Nullable
private Date processingDate;

/**
* Free-form additional metadata.
*/
@Nullable
private String metadata;

/**
* For sequence-read based data, the total number of reads in the assay, computed from the data as the total of the
* values for the elements assayed.
*/
@Nullable
private Long sequenceReadCount;
/**
* For sequencing-based data, the length of the reads. If it was paired reads, this is understood to be the length
* for each "end". If the read length was variable (due to quality trimming, etc.) this will be treated as
* representing the mean read length.
*/
@Nullable
private Integer sequenceReadLength;
/**
* For sequence-based data, this should be set to true if the sequencing was paired-end reads and false otherwise.
* It should be left as null if it isn't known.
*/
@Nullable
private Boolean sequencePairedReads;
/**
* For RNA-seq representation of representative headers from the FASTQ file(s). If there is more than on FASTQ file,
* this string will contain multiple newline-delimited headers.
*/
@Nullable
private String fastqHeaders;

/**
* Number of cells for the assay that have at least one expressed gene.
* <p>
* If this assay correspond to a pseudo-bulk, it is the number of such cells in the aggregate.
* <p>
* This applies to the preferred set of single-cell vectors.
*/
@Nullable
private Integer numberOfCells;
/**
* Number of design elements in the assay with at least one cell expressing it.
* <p>
* If this assay correspond to a pseudo-bulk, it is the number of such design elements in the aggregate.
* <p>
* This applies to the preferred set of single-cell vectors.
*/
@Nullable
private Integer numberOfDesignElements;
/**
* Number of cell-gene pairs with non-zero expression values.
* <p>
* If this assay correspond to a pseudo-bulk, it is the number of such pairs in the aggregate.
* <p>
* This applies to the preferred set of single-cell vectors.
*/
@Nullable
private Integer numberOfCellsByDesignElements;

@Override
public boolean equals( Object object ) {
if ( this == object )
Expand Down Expand Up @@ -120,11 +188,6 @@ public void setArrayDesignUsed( ArrayDesign arrayDesignUsed ) {
this.arrayDesignUsed = arrayDesignUsed;
}

/**
* @return Used to indicate if the sample should be considered an outlier based on QC. The audit trail for the
* entity tracks
* when this was done.
*/
public Boolean getIsOutlier() {
return this.isOutlier;
}
Expand All @@ -133,15 +196,12 @@ public void setIsOutlier( Boolean isOutlier ) {
this.isOutlier = isOutlier;
}

/**
* @return Indicates the date that the assay was processed in the original study. This would correspond to "batch"
* information and will often be a "scan date" or similar information extracted from the raw data files.
*/
@Nullable
public Date getProcessingDate() {
return this.processingDate;
}

public void setProcessingDate( Date processingDate ) {
public void setProcessingDate( @Nullable Date processingDate ) {
this.processingDate = processingDate;
}

Expand All @@ -160,11 +220,6 @@ public Securable getSecurityOwner() {
return null;
}

/**
* @return For sequence-based data, this should be set to true if the sequencing was paired-end reads and false
* otherwise.
* It should be left as null if it isn't known.
*/
@Nullable
public Boolean getSequencePairedReads() {
return this.sequencePairedReads;
Expand All @@ -174,11 +229,6 @@ public void setSequencePairedReads( @Nullable Boolean sequencePairedReads ) {
this.sequencePairedReads = sequencePairedReads;
}

/**
* @return For sequence-read based data, the total number of reads in the sample, computed from the data as the
* total of the
* values for the elements assayed.
*/
@Nullable
public Long getSequenceReadCount() {
return this.sequenceReadCount;
Expand All @@ -188,12 +238,6 @@ public void setSequenceReadCount( @Nullable Long sequenceReadCount ) {
this.sequenceReadCount = sequenceReadCount;
}

/**
* @return For sequencing-based data, the length of the reads. If it was paired reads, this is understood to be the
* length
* for each "end". If the read length was variable (due to quality trimming, etc.) this will be treated as
* representing the mean read length.
*/
@Nullable
public Integer getSequenceReadLength() {
return this.sequenceReadLength;
Expand All @@ -203,11 +247,12 @@ public void setSequenceReadLength( @Nullable Integer sequenceReadLength ) {
this.sequenceReadLength = sequenceReadLength;
}

@Nullable
public String getMetadata() {
return metadata;
}

public void setMetadata( String metadata ) {
public void setMetadata( @Nullable String metadata ) {
this.metadata = metadata;
}

Expand All @@ -216,21 +261,46 @@ public ArrayDesign getOriginalPlatform() {
return originalPlatform;
}

public void setOriginalPlatform( ArrayDesign originalPlatform ) {
if ( this.originalPlatform != null && !( this.originalPlatform.equals( originalPlatform ) ) ) {
System.err.println( "Warning: setting 'original platform' to a different value?" );
}
public void setOriginalPlatform( @Nullable ArrayDesign originalPlatform ) {
this.originalPlatform = originalPlatform;
}

@Nullable
public String getFastqHeaders() {
return fastqHeaders;
}

public void setFastqHeaders( String fastqHeaders ) {
public void setFastqHeaders( @Nullable String fastqHeaders ) {
this.fastqHeaders = fastqHeaders;
}

@Nullable
public Integer getNumberOfCells() {
return numberOfCells;
}

public void setNumberOfCells( @Nullable Integer numberOfCells ) {
this.numberOfCells = numberOfCells;
}

@Nullable
public Integer getNumberOfDesignElements() {
return numberOfDesignElements;
}

public void setNumberOfDesignElements( @Nullable Integer numberOfDesignElements ) {
this.numberOfDesignElements = numberOfDesignElements;
}

@Nullable
public Integer getNumberOfCellsByDesignElements() {
return numberOfCellsByDesignElements;
}

public void setNumberOfCellsByDesignElements( @Nullable Integer numberOfCellsByDesignElements ) {
this.numberOfCellsByDesignElements = numberOfCellsByDesignElements;
}

public static final class Factory {

public static BioAssay newInstance() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import ubic.gemma.model.expression.experiment.ExpressionExperiment;
import ubic.gemma.model.expression.experiment.FactorValue;
import ubic.gemma.persistence.service.common.auditAndSecurity.AuditTrailService;
import ubic.gemma.persistence.service.expression.bioAssay.BioAssayService;
import ubic.gemma.persistence.service.expression.bioAssayData.BioAssayDimensionService;

import java.util.*;
Expand All @@ -43,22 +44,23 @@ public class SingleCellExpressionExperimentAggregatorServiceImpl implements Sing
@Autowired
private BioAssayDimensionService bioAssayDimensionService;

@Autowired
private BioAssayService bioAssayService;

@Autowired
private AuditTrailService auditTrailService;

/**
*/
@Override
@Transactional
public QuantitationType aggregateVectors( ExpressionExperiment ee, QuantitationType quantitationType, List<BioAssay> cellBAs, boolean makePreferred ) {
// FIXME: this is needed because if EE is not in the session, getPreferredSingleCellDataVectors() will retrieve
// a distinct QT than that of ee.getQuantitationTypes()
ee = expressionExperimentService.loadOrFail( ee.getId() );
ExpressionExperiment finalEe = ee;
Collection<SingleCellExpressionDataVector> vectors = singleCellExpressionExperimentService.getSingleCellDataVectors( ee, quantitationType );
if ( vectors.isEmpty() ) {
throw new IllegalStateException( finalEe + " does not have single-cell vectors for " + quantitationType + "." );
throw new IllegalStateException( ee + " does not have single-cell vectors for " + quantitationType + "." );
}
ExpressionExperiment finalEe = ee;
CellTypeAssignment cta = singleCellExpressionExperimentService.getPreferredCellTypeAssignment( ee )
.orElseThrow( () -> new IllegalStateException( finalEe + " does not have a preferred cell type assignment." ) );
ExperimentalFactor cellTypeFactor = singleCellExpressionExperimentService.getCellTypeFactor( ee )
Expand Down Expand Up @@ -154,17 +156,30 @@ private QuantitationType aggregateVectorsInternal( ExpressionExperiment ee, List
+ "Expression data has been aggregated by " + cellTypeFactorName + " using " + method + "." );
newQt.setIsPreferred( makePreferred );

Map<BioAssay, Integer> cellsByBioAssay = new HashMap<>();
Map<BioAssay, Integer> designElementsByBioAssay = new HashMap<>();
Map<BioAssay, Integer> cellByDesignElementByBioAssay = new HashMap<>();
Collection<RawExpressionDataVector> rawVectors = new ArrayList<>( vectors.size() );
for ( SingleCellExpressionDataVector v : vectors ) {
RawExpressionDataVector rawVector = new RawExpressionDataVector();
rawVector.setExpressionExperiment( ee );
rawVector.setQuantitationType( newQt );
rawVector.setBioAssayDimension( newBad );
rawVector.setDesignElement( v.getDesignElement() );
rawVector.setData( aggregateData( v, newBad, cellTypeAssignment, sourceBioAssayMap, cellTypes, method ) );
rawVector.setData( aggregateData( v, newBad, cellTypeAssignment, sourceBioAssayMap, cellTypes, method, cellsByBioAssay, designElementsByBioAssay, cellByDesignElementByBioAssay ) );
rawVectors.add( rawVector );
}

if ( makePreferred ) {
log.info( "Applying single-cell sparsity metrics to the aggregated assays..." );
for ( BioAssay ba : cellBAs ) {
ba.setNumberOfCells( cellsByBioAssay.getOrDefault( ba, 0 ) );
ba.setNumberOfDesignElements( designElementsByBioAssay.getOrDefault( ba, 0 ) );
ba.setNumberOfCellsByDesignElements( cellByDesignElementByBioAssay.getOrDefault( ba, 0 ) );
}
bioAssayService.update( cellBAs );
}

int newVecs = expressionExperimentService.addRawDataVectors( ee, newQt, rawVectors );
String note = String.format( "Created %d aggregated raw vectors for %s.", newVecs, newQt );
log.info( note );
Expand All @@ -176,7 +191,7 @@ private QuantitationType aggregateVectorsInternal( ExpressionExperiment ee, List
/**
* Aggregate the single-cell data to match the target BAD.
*/
private byte[] aggregateData( SingleCellExpressionDataVector scv, BioAssayDimension bad, CellTypeAssignment cta, Map<BioAssay, BioAssay> sourceBioAssayMap, Map<BioAssay, Characteristic> cellTypes, SingleCellExpressionAggregationMethod method ) {
private byte[] aggregateData( SingleCellExpressionDataVector scv, BioAssayDimension bad, CellTypeAssignment cta, Map<BioAssay, BioAssay> sourceBioAssayMap, Map<BioAssay, Characteristic> cellTypes, SingleCellExpressionAggregationMethod method, Map<BioAssay, Integer> cellsByBioAssay, Map<BioAssay, Integer> designElementsByBioAssay, Map<BioAssay, Integer> cellByDesignElementByBioAssay ) {
List<BioAssay> samples = bad.getBioAssays();
int numSamples = samples.size();
double[] rv = new double[numSamples];
Expand Down Expand Up @@ -214,6 +229,7 @@ private byte[] aggregateData( SingleCellExpressionDataVector scv, BioAssayDimens
} else if ( method == SingleCellExpressionAggregationMethod.LOG1P_SUM ) {
rv[i] = Math.log1p( rv[i] );
}
// TODO: update the metrics
}
return doubleArrayToBytes( rv );
}
Expand Down
Loading

0 comments on commit dd9e0f2

Please sign in to comment.