Deconvolution of mixture using single-cell data

This is the backbone function which takes in single cell expression data to deconvolute spatial transcriptomics spots.

Usage

# S4 method for SingleCellExperiment,ANY
SPOTlight(
  x,
  y,
  ...,
  assay = "counts",
  groups = colLabels(x, onAbsence = "error")
)

# S4 method for ANY,SingleCellExperiment
SPOTlight(x, y, ..., assay = "counts")

# S4 method for Seurat,ANY
SPOTlight(x, y, ..., slot = "counts", assay = "RNA", groups = Idents(x))

# S4 method for ANY,Seurat
SPOTlight(x, y, ..., slot = "counts", assay = "RNA")

# S4 method for ANY,dgCMatrix
SPOTlight(x, y, ..., slot = "counts", assay = "RNA")

# S4 method for dgCMatrix,ANY
SPOTlight(x, y, ..., slot = "counts", assay = "RNA")

# S4 method for ANY,DelayedMatrix
SPOTlight(x, y, ..., slot = "counts", assay = "RNA")

# S4 method for DelayedMatrix,ANY
SPOTlight(x, y, ..., slot = "counts", assay = "RNA")

# S4 method for ANY,ANY
SPOTlight(x, y, ...)

# S4 method for matrix,matrix
SPOTlight(
  x,
  y,
  groups,
  mgs,
  n_top = NULL,
  gene_id = "gene",
  group_id = "cluster",
  weight_id = "weight",
  hvg = NULL,
  scale = TRUE,
  model = c("ns", "std"),
  min_prop = 0.01,
  verbose = TRUE,
  ...
)

Arguments

x, y: single-cell and mixture dataset, respectively. Can be a numeric matrix, SingleCellExperiment or SeuratObjecy.
...: additional parameters.
assay: if the object is of Class Seurat, character string specifying the assay from which to extract the expression matrix. By default "RNA".
groups: vector of group labels for cells in x. When x is a SingleCellExperiment or SeuratObject, defaults to colLabels and Idents(x), respectively.
slot: if the object is of Class Seurat, character string specifying the slot from which to extract the expression matrix. By default "counts".
mgs: data.frame or DataFrame of marker genes. Must contain columns holding gene identifiers, group labels and the weight (e.g., logFC, -log(p-value) a feature has in a given group.
n_top: integer scalar specifying the number of markers to select per group. By default NULL uses all the marker genes to initialize the model.
gene_id, group_id, weight_id: character specifying the column in mgs containing gene identifiers, group labels and weights, respectively.
hvg: character vector containing hvg to include in the model. By default NULL.
scale: logical specifying whether to scale single-cell counts to unit variance. This gives the user the option to normalize the data beforehand as you see fit (CPM, FPKM, ...) when passing a matrix or specifying the slot from where to extract the count data.
model: character string indicating which model to use when running NMF. Either "ns" (default) or "std".
min_prop: scalar in [0,1] setting the minimum contribution expected from a cell type in x to observations in y. By default 0.
verbose: logical. Should information on progress be reported?

Value

a numeric matrix with rows corresponding to samples and columns to groups

Details

SPOTlight uses a Non-Negative Matrix Factorization approach to learn which genes are important for each cell type. In order to drive the factorization and give more importance to cell type marker genes we previously compute them and use them to initialize the basis matrix. This initialized matrices will then be used to carry out the factorization with the single cell expression data. Once the model has learn the topic profiles for each cell type we use non-negative least squares (NNLS) to obtain the topic contributions to each spot. Lastly, NNLS is again used to obtain the proportion of each cell type for each spot by finding the fitting the single-cell topic profiles to the spots topic contributions.

Author

Marc Elosua-Bayes & Helena L. Crowell

Examples

library(scater)
#> Loading required package: SingleCellExperiment
#> Loading required package: SummarizedExperiment
#> Loading required package: MatrixGenerics
#> Loading required package: matrixStats
#> 
#> Attaching package: ‘matrixStats’
#> The following objects are masked from ‘package:Biobase’:
#> 
#>     anyMissing, rowMedians
#> 
#> Attaching package: ‘MatrixGenerics’
#> The following objects are masked from ‘package:matrixStats’:
#> 
#>     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
#>     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
#>     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
#>     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
#>     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
#>     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
#>     colWeightedMeans, colWeightedMedians, colWeightedSds,
#>     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
#>     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
#>     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
#>     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
#>     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
#>     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
#>     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
#>     rowWeightedSds, rowWeightedVars
#> The following object is masked from ‘package:Biobase’:
#> 
#>     rowMedians
#> Loading required package: GenomicRanges
#> Loading required package: stats4
#> Loading required package: S4Vectors
#> 
#> Attaching package: ‘S4Vectors’
#> The following objects are masked from ‘package:base’:
#> 
#>     I, expand.grid, unname
#> Loading required package: IRanges
#> Loading required package: GenomeInfoDb
#> Loading required package: scuttle
#> Loading required package: ggplot2
library(scran)
library(ExperimentHub)
#> Loading required package: AnnotationHub
#> Loading required package: BiocFileCache
#> Loading required package: dbplyr
#> 
#> Attaching package: ‘AnnotationHub’
#> The following object is masked from ‘package:Biobase’:
#> 
#>     cache
library(TENxVisiumData)
#> Loading required package: SpatialExperiment
#> snapshotDate(): 2021-10-19

# Initialize a Hub instance storing a complete set of records
eh <- ExperimentHub()
#> snapshotDate(): 2021-10-19

# Retrieve any records that match our keyword(s) of interest
query(eh, "Tabula Muris Senis droplet Kidney")
#> ExperimentHub with 7 records
#> # snapshotDate(): 2021-10-19
#> # $dataprovider: The Tabula Muris Consortium
#> # $species: Mus musculus
#> # $rdataclass: matrix, H5File, DFrame
#> # additional mcols(): taxonomyid, genome, description,
#> #   coordinate_1_based, maintainer, rdatadateadded, preparerclass, tags,
#> #   rdatapath, sourceurl, sourcetype 
#> # retrieve records with, e.g., 'object[["EH6386"]]' 
#> 
#>            title                                             
#>   EH6386 | Tabula Muris Senis droplet Kidney colData         
#>   EH6387 | Tabula Muris Senis droplet Kidney counts          
#>   EH6388 | Tabula Muris Senis droplet Kidney processed counts
#>   EH6389 | Tabula Muris Senis droplet Kidney rowData         
#>   EH6390 | Tabula Muris Senis droplet Kidney PCA             
#>   EH6391 | Tabula Muris Senis droplet Kidney tSNE            
#>   EH6392 | Tabula Muris Senis droplet Kidney UMAP            
query(eh, "MouseKidneyCoronal")
#> ExperimentHub with 2 records
#> # snapshotDate(): 2021-10-19
#> # $dataprovider: 10X Genomics
#> # $species: Mus musculus
#> # $rdataclass: SpatialExperiment
#> # additional mcols(): taxonomyid, genome, description,
#> #   coordinate_1_based, maintainer, rdatadateadded, preparerclass, tags,
#> #   rdatapath, sourceurl, sourcetype 
#> # retrieve records with, e.g., 'object[["EH6707"]]' 
#> 
#>            title                   
#>   EH6707 | MouseKidneyCoronal      
#>   EH6743 | MouseKidneyCoronal_v3.13

# Get Visium data from 'TENxVisiumData'
spe <- MouseKidneyCoronal()
#> see ?TENxVisiumData and browseVignettes('TENxVisiumData') for documentation
#> loading from cache

# Use symbols instead of Ensembl IDs as feature names
rownames(spe) <- rowData(spe)$symbol

# Load SCE data
library(TabulaMurisSenisData)
sce <- TabulaMurisSenisDroplet(tissues = "Kidney")$Kidney
#> snapshotDate(): 2021-10-19
#> see ?TabulaMurisSenisData and browseVignettes('TabulaMurisSenisData') for documentation
#> loading from cache
#> require(“rhdf5”)
#> see ?TabulaMurisSenisData and browseVignettes('TabulaMurisSenisData') for documentation
#> loading from cache
#> see ?TabulaMurisSenisData and browseVignettes('TabulaMurisSenisData') for documentation
#> loading from cache
#> see ?TabulaMurisSenisData and browseVignettes('TabulaMurisSenisData') for documentation
#> loading from cache
#> see ?TabulaMurisSenisData and browseVignettes('TabulaMurisSenisData') for documentation
#> loading from cache
#> see ?TabulaMurisSenisData and browseVignettes('TabulaMurisSenisData') for documentation
#> loading from cache

# Keep cells from 18m mice with clear cell type annotations
sce <- subset(sce, , age == "18m")
sce <- subset(sce, , ! free_annotation %in% c("nan", "CD45"))

# Get the top 3000 highly variable genes
sce <- logNormCounts(sce)
dec <- modelGeneVar(sce)
hvg <- getTopHVGs(dec, n = 3000)
colLabels(sce) <- colData(sce)$free_annotation

# Get vector indicating which genes
# are neither ribosomal or mitochondrial
genes <- !grepl("^Rp[l|s]|Mt", rownames(sce))

# Compute marker genes
mgs <- scoreMarkers(sce, subset.row = genes)
mgs_ls <- lapply(names(mgs), function(i){
  x <- mgs[[i]]
  # Filter and keep relevant marker genes, those with AUC > 0.8
  x <- x[x$mean.AUC > 0.8, ]
  # Sort the genes from highest to lowest weight
  x <- x[order(x$mean.AUC, decreasing = TRUE), ]
  # Add gene and cluster id to the dataframe
  x$gene <- rownames(x)
  x$cluster <- i
  data.frame(x)
})
mgs_df <- do.call(rbind, mgs_ls)

# split cell indices by identity
idx <- split(seq(ncol(sce)), sce$free_annotation)
# downsample to at most 20 cells per identity
n_cells <- 20
cs_keep <- lapply(idx, function(i) {
  n <- length(i)
  if (n < n_cells)
    n_cells <- n
  sample(i, n_cells)
})
sce <- sce[, unlist(cs_keep)]

res <- SPOTlight(
    x = counts(sce),
    y = counts(spe),
    groups = sce$free_annotation,
    mgs = mgs_df,
    hvg = hvg,
    weight_id = "mean.AUC",
    group_id = "cluster",
    gene_id = "gene")
#> Scaling count matrix
#> Seeding initial matrices
#> Training NMF model
#> Time for training: 8.95min
#> Deconvoluting mixture data