vignettes/batch-correction.Rmd
batch-correction.Rmd
Here, we demonstrate how BANKSY can be used with Harmony for integrating multiple spatial omics datasets in the presence of strong batch effects. We use 10x Visium data of the human dorsolateral prefrontal cortex from Maynard et al (2018). The data comprise 12 samples obtained from 3 subjects, with manual annotation of the layers in each sample.
library(Banksy)
library(SummarizedExperiment)
library(SpatialExperiment)
library(Seurat)
library(scran)
library(data.table)
library(harmony)
library(scater)
library(cowplot)
library(ggplot2)
SEED <- 1000
We fetch the data for all 12 DLPFC samples with the spatialLIBD package. This might take awhile.
library(spatialLIBD)
library(ExperimentHub)
ehub <- ExperimentHub::ExperimentHub()
spe <- spatialLIBD::fetch_data(type = "spe", eh = ehub)
After the download is completed, we trim the SpatialExperiment object, retaining only the counts and some metadata such as the sample identifier and pathology annotations. This saves some memory.
#' Remove NA spots
na_id <- which(is.na(spe$layer_guess_reordered_short))
spe <- spe[, -na_id]
#' Trim
imgData(spe) <- NULL
assay(spe, "logcounts") <- NULL
reducedDims(spe) <- NULL
rowData(spe) <- NULL
colData(spe) <- DataFrame(
sample_id = spe$sample_id,
subject_id = factor(spe$sample_id, labels = rep(paste0("Subject", 1:3), each = 4)),
clust_annotation = factor(as.numeric(spe$layer_guess_reordered_short)),
in_tissue = spe$in_tissue,
row.names = colnames(spe)
)
colnames(spe) <- paste0(colnames(spe), "_", spe$sample_id)
invisible(gc())
We analyse the first sample of each subject due to vignette runtime constraints.
spe <- spe[, spe$sample_id %in% c("151507", "151669", "151673")]
sample_names <- unique(spe$sample_id)
Next, stagger the spatial coordinates across the samples so that spots from different samples do not overlap.
#' Stagger spatial coordinates
locs <- spatialCoords(spe)
locs <- cbind(locs, sample_id = factor(spe$sample_id))
locs_dt <- data.table(locs)
colnames(locs_dt) <- c("sdimx", "sdimy", "group")
locs_dt[, sdimx := sdimx - min(sdimx), by = group]
global_max <- max(locs_dt$sdimx) * 1.5
locs_dt[, sdimx := sdimx + group * global_max]
locs <- as.matrix(locs_dt[, 1:2])
rownames(locs) <- colnames(spe)
spatialCoords(spe) <- locs
Find highly variable features and normalize counts. Here we use
Seurat, but other methods may also be used
(e.g. scran::getTopHVGs
).
#' Get HVGs
seu <- as.Seurat(spe, data = NULL)
seu <- FindVariableFeatures(seu, nfeatures = 2000)
#' Normalize data
scale_factor <- median(colSums(assay(spe, "counts")))
seu <- NormalizeData(seu, scale.factor = scale_factor, normalization.method = "RC")
#' Add data to SpatialExperiment and subset to HVGs
aname <- "normcounts"
assay(spe, aname) <- GetAssayData(seu)
spe <- spe[VariableFeatures(seu),]
Compute BANKSY neighborhood matrices. We use k_geom=18
corresponding to first and second-order neighbors in 10x Visium.
compute_agf <- TRUE
k_geom <- 18
spe <- computeBanksy(spe, assay_name = aname, compute_agf = compute_agf, k_geom = k_geom)
Run PCA on the BANKSY matrix:
lambda <- 0.2
npcs <- 20
use_agf <- TRUE
spe <- runBanksyPCA(spe, use_agf = use_agf, lambda = lambda, npcs = npcs, seed = SEED)
We run Harmony on the PCs of the BANKSY matrix:
set.seed(SEED)
harmony_embedding <- HarmonyMatrix(
data_mat = reducedDim(spe, "PCA_M1_lam0.2"),
meta_data = colData(spe),
vars_use = c("sample_id", "subject_id"),
do_pca = FALSE,
max.iter.harmony = 20,
verbose = FALSE
)
reducedDim(spe, "Harmony_BANKSY") <- harmony_embedding
Next, run UMAP on the ‘raw’ and Harmony corrected PCA embeddings:
spe <- runBanksyUMAP(spe, use_agf = TRUE, lambda = lambda, npcs = npcs)
spe <- runBanksyUMAP(spe, dimred = "Harmony_BANKSY")
Visualize the UMAPs annotated by subject ID:
plot_grid(
plotReducedDim(spe, "UMAP_M1_lam0.2",
point_size = 0.1,
point_alpha = 0.5,
color_by = "subject_id") +
theme(legend.position = "none"),
plotReducedDim(spe, "UMAP_Harmony_BANKSY",
point_size = 0.1,
point_alpha = 0.5,
color_by = "subject_id") +
theme(legend.title = element_blank()) +
guides(colour = guide_legend(override.aes = list(size = 5, alpha = 1))),
nrow = 1,
rel_widths = c(1, 1.2)
)
Cluster the Harmony corrected PCA embedding:
spe <- clusterBanksy(spe, dimred = "Harmony_BANKSY", resolution = 0.55, seed = SEED)
spe <- connectClusters(spe, map_to = "clust_annotation")
Generate spatial plots:
cnm <- clusterNames(spe)[2]
spatial_plots <- lapply(sample_names, function(snm) {
x <- spe[, spe$sample_id == snm]
ari <- aricode::ARI(x$clust_annotation, colData(x)[, cnm])
df <- cbind.data.frame(clust=colData(x)[[cnm]], spatialCoords(x))
ggplot(df, aes(x=sdimy, y=sdimx, col=clust)) +
geom_point(size = 0.5) +
scale_color_manual(values = pals::kelly()[-1]) +
theme_classic() +
theme(
legend.position = "none",
axis.text.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks=element_blank(),
axis.title.x=element_blank(),
axis.title.y=element_blank()) +
labs(title = sprintf("Sample %s - ARI: %s", snm, round(ari, 3))) +
coord_equal()
})
plot_grid(plotlist = spatial_plots, ncol = 3, byrow = FALSE)
Vignette runtime:
#> Time difference of 1.338061 mins
sessionInfo()
#> R version 4.3.2 (2023-10-31)
#> Platform: aarch64-apple-darwin20 (64-bit)
#> Running under: macOS Sonoma 14.2.1
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
#>
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#>
#> time zone: America/Detroit
#> tzcode source: internal
#>
#> attached base packages:
#> [1] stats4 stats graphics grDevices utils datasets methods
#> [8] base
#>
#> other attached packages:
#> [1] ExperimentHub_2.10.0 AnnotationHub_3.10.0
#> [3] BiocFileCache_2.10.1 dbplyr_2.4.0
#> [5] spatialLIBD_1.14.1 cowplot_1.1.3
#> [7] scater_1.30.1 ggplot2_3.4.4
#> [9] harmony_1.2.0 Rcpp_1.0.12
#> [11] data.table_1.15.0 scran_1.30.2
#> [13] scuttle_1.12.0 Seurat_5.0.1
#> [15] SeuratObject_5.0.1 sp_2.1-3
#> [17] SpatialExperiment_1.12.0 SingleCellExperiment_1.24.0
#> [19] SummarizedExperiment_1.32.0 Biobase_2.62.0
#> [21] GenomicRanges_1.54.1 GenomeInfoDb_1.38.6
#> [23] IRanges_2.36.0 S4Vectors_0.40.2
#> [25] BiocGenerics_0.48.1 MatrixGenerics_1.14.0
#> [27] matrixStats_1.2.0 Banksy_0.99.12
#> [29] BiocStyle_2.30.0
#>
#> loaded via a namespace (and not attached):
#> [1] fs_1.6.3 spatstat.sparse_3.0-3
#> [3] bitops_1.0-7 doParallel_1.0.17
#> [5] httr_1.4.7 RColorBrewer_1.1-3
#> [7] tools_4.3.2 sctransform_0.4.1
#> [9] DT_0.31 utf8_1.2.4
#> [11] R6_2.5.1 lazyeval_0.2.2
#> [13] uwot_0.1.16 withr_3.0.0
#> [15] gridExtra_2.3 progressr_0.14.0
#> [17] cli_3.6.2 textshaping_0.3.7
#> [19] spatstat.explore_3.2-6 fastDummies_1.7.3
#> [21] labeling_0.4.3 sass_0.4.8
#> [23] spatstat.data_3.0-4 ggridges_0.5.6
#> [25] pbapply_1.7-2 pkgdown_2.0.7
#> [27] Rsamtools_2.18.0 systemfonts_1.0.5
#> [29] dbscan_1.1-12 aricode_1.0.3
#> [31] dichromat_2.0-0.1 sessioninfo_1.2.2
#> [33] parallelly_1.37.0 attempt_0.3.1
#> [35] maps_3.4.2 limma_3.58.1
#> [37] pals_1.8 rstudioapi_0.15.0
#> [39] RSQLite_2.3.5 BiocIO_1.12.0
#> [41] generics_0.1.3 ica_1.0-3
#> [43] spatstat.random_3.2-2 dplyr_1.1.4
#> [45] Matrix_1.6-5 ggbeeswarm_0.7.2
#> [47] fansi_1.0.6 abind_1.4-5
#> [49] lifecycle_1.0.4 yaml_2.3.8
#> [51] edgeR_4.0.15 SparseArray_1.2.4
#> [53] Rtsne_0.17 paletteer_1.6.0
#> [55] grid_4.3.2 blob_1.2.4
#> [57] promises_1.2.1 dqrng_0.3.2
#> [59] crayon_1.5.2 miniUI_0.1.1.1
#> [61] lattice_0.22-5 beachmat_2.18.0
#> [63] mapproj_1.2.11 KEGGREST_1.42.0
#> [65] magick_2.8.2 pillar_1.9.0
#> [67] knitr_1.45 metapod_1.10.1
#> [69] rjson_0.2.21 future.apply_1.11.1
#> [71] codetools_0.2-19 leiden_0.4.3.1
#> [73] glue_1.7.0 vctrs_0.6.5
#> [75] png_0.1-8 spam_2.10-0
#> [77] gtable_0.3.4 rematch2_2.1.2
#> [79] cachem_1.0.8 xfun_0.42
#> [81] S4Arrays_1.2.0 mime_0.12
#> [83] survival_3.5-7 RcppHungarian_0.3
#> [85] iterators_1.0.14 fields_15.2
#> [87] statmod_1.5.0 bluster_1.12.0
#> [89] interactiveDisplayBase_1.40.0 ellipsis_0.3.2
#> [91] fitdistrplus_1.1-11 ROCR_1.0-11
#> [93] nlme_3.1-164 bit64_4.0.5
#> [95] filelock_1.0.3 RcppAnnoy_0.0.22
#> [97] bslib_0.6.1 irlba_2.3.5.1
#> [99] vipor_0.4.7 KernSmooth_2.23-22
#> [101] colorspace_2.1-0 DBI_1.2.1
#> [103] tidyselect_1.2.0 bit_4.0.5
#> [105] compiler_4.3.2 curl_5.2.0
#> [107] BiocNeighbors_1.20.2 desc_1.4.3
#> [109] DelayedArray_0.28.0 plotly_4.10.4
#> [111] rtracklayer_1.62.0 bookdown_0.37
#> [113] scales_1.3.0 lmtest_0.9-40
#> [115] rappdirs_0.3.3 stringr_1.5.1
#> [117] digest_0.6.34 goftest_1.2-3
#> [119] spatstat.utils_3.0-4 rmarkdown_2.25
#> [121] benchmarkmeData_1.0.4 RhpcBLASctl_0.23-42
#> [123] XVector_0.42.0 htmltools_0.5.7
#> [125] pkgconfig_2.0.3 sparseMatrixStats_1.14.0
#> [127] highr_0.10 fastmap_1.1.1
#> [129] rlang_1.1.3 htmlwidgets_1.6.4
#> [131] shiny_1.8.0 DelayedMatrixStats_1.24.0
#> [133] farver_2.1.1 jquerylib_0.1.4
#> [135] zoo_1.8-12 jsonlite_1.8.8
#> [137] BiocParallel_1.36.0 mclust_6.0.1
#> [139] config_0.3.2 BiocSingular_1.18.0
#> [141] RCurl_1.98-1.14 magrittr_2.0.3
#> [143] GenomeInfoDbData_1.2.11 dotCall64_1.1-1
#> [145] patchwork_1.2.0 munsell_0.5.0
#> [147] viridis_0.6.5 reticulate_1.35.0
#> [149] leidenAlg_1.1.2 stringi_1.8.3
#> [151] zlibbioc_1.48.0 MASS_7.3-60.0.1
#> [153] plyr_1.8.9 parallel_4.3.2
#> [155] listenv_0.9.1 ggrepel_0.9.5
#> [157] deldir_2.0-2 Biostrings_2.70.2
#> [159] sccore_1.0.4 splines_4.3.2
#> [161] tensor_1.5 locfit_1.5-9.8
#> [163] igraph_2.0.1.1 spatstat.geom_3.2-8
#> [165] RcppHNSW_0.6.0 reshape2_1.4.4
#> [167] ScaledMatrix_1.10.0 XML_3.99-0.16.1
#> [169] BiocVersion_3.18.1 evaluate_0.23
#> [171] golem_0.4.1 BiocManager_1.30.22
#> [173] foreach_1.5.2 httpuv_1.6.14
#> [175] RANN_2.6.1 tidyr_1.3.1
#> [177] purrr_1.0.2 polyclip_1.10-6
#> [179] benchmarkme_1.0.8 future_1.33.1
#> [181] scattermore_1.2 rsvd_1.0.5
#> [183] xtable_1.8-4 restfulr_0.0.15
#> [185] RSpectra_0.16-1 later_1.3.2
#> [187] viridisLite_0.4.2 ragg_1.2.7
#> [189] tibble_3.2.1 GenomicAlignments_1.38.2
#> [191] AnnotationDbi_1.64.1 memoise_2.0.1
#> [193] beeswarm_0.4.0 cluster_2.1.6
#> [195] shinyWidgets_0.8.1 globals_0.16.2