We will replicate some of the results from the following paper by Muto et al (2022)
The paper is a study o polycystic kidney disease (PKD) using single-cell RNA sequencing (scRNA-seq) technologies. The main goals of study are:
The original data were downloaded from the NCBI GEO database GSE185948, which contained over 100K nuclei, each with over 30K genes. For the tutorial purposes, we have pre-processed the data set with the following steps:
To understand the above bioinformatics workflow, it is helpful to browse through the Seraut tutorial.
We will first set a working directory, load libraries, and read the scRNA-Seq data
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(pheatmap)
library(umap)
library(broom)
# Install EnhancedVolcano once:
# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install("EnhancedVolcano")
library(EnhancedVolcano)
## Loading required package: ggrepel
getwd()
## [1] "C:/Users/weiga/Dropbox/Courses/bio-47120-2024"
x <- read_tsv("https://borreliabase.org/~wgqiu/muto.tsv")
## Rows: 2400 Columns: 2002
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): id, cell.type
## dbl (2000): PTPRQ, PTPRO, ST6GALNAC3, MAGI2, SERPINE1, SLC8A1, SLC26A7, PLA2...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(x)
## # A tibble: 6 × 2,002
## id cell.type PTPRQ PTPRO ST6GALNAC3 MAGI2 SERPINE1 SLC8A1 SLC26A7
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 PKD_ACAG… PT2 -0.258 -0.338 -0.359 -1.49 1.85 -0.968 -0.405
## 2 PKD_ACCA… DCT -0.385 -0.523 -0.422 1.63 -0.421 0.500 -0.251
## 3 PKD_ACGC… DCT -0.210 -0.355 -0.536 0.490 -0.338 -0.828 -0.297
## 4 PKD_ACGC… z12 -0.798 -0.966 2.10 0.827 -0.199 0.0483 -0.308
## 5 PKD_ACGG… PT1 1.56 -0.211 -0.400 -0.479 0.431 1.81 -0.372
## 6 PKD_ACGG… TAL2 -0.0403 0.0271 -0.365 -1.32 -0.330 -0.256 -0.203
## # ℹ 1,993 more variables: PLA2R1 <dbl>, SLC12A3 <dbl>, AC109466.1 <dbl>,
## # EMCN <dbl>, NTNG1 <dbl>, TIMP3 <dbl>, ZPLD1 <dbl>, LINC01811 <dbl>,
## # CELF2 <dbl>, PLAT <dbl>, ROBO2 <dbl>, FGF1 <dbl>, FMN2 <dbl>,
## # SLC14A2 <dbl>, PAPPA2 <dbl>, PLPP1 <dbl>, ADAMTS19 <dbl>, CLNK <dbl>,
## # LDB2 <dbl>, AC093912.1 <dbl>, ADGRL3 <dbl>, NKAIN2 <dbl>, CLIC5 <dbl>,
## # NPHS2 <dbl>, ADGRF5 <dbl>, FLT1 <dbl>, ADAMTS6 <dbl>, RGS6 <dbl>,
## # DCC <dbl>, SNTG1 <dbl>, CNTNAP5 <dbl>, ENPP2 <dbl>, NPHS1 <dbl>, …
Questions on data dimension:
We will add a categorical column “cohort” to distinguish between PKD (“PKD”) and controls (“CTL”).
We will add another categorical “sample” column for individual samples (PKD1, PKD2, CTL1, CTL2, etc)
x <- x %>%
mutate(cohort = str_remove(id, "_.+")) %>%
mutate(sample = str_remove(id, "_.+_") )
table(x$cohort)
##
## Cont PKD
## 1000 1400
table(x$sample)
##
## Cont1 Cont2 Cont3 Cont4 Cont5 PKD1 PKD3 PKD4 PKD5 PKD6 PKD7 PKD8
## 200 200 200 200 200 200 200 200 200 200 200 200
table(x$cell.type)
##
## CNT_PC DCT ENDO FIB ICA-ICB LEUK PEC PODO PT1 PT2
## 352 204 104 188 122 123 45 45 343 189
## TAL1 TAL2 z12
## 172 483 30
Questions on biological samples:
We will make sure that data are (1) normally distributed, and (2) normalized between individual samples. These are necessary pre-conditions for most statistical analyses.
x.long <- x %>%
pivot_longer(3:2002, names_to = "gene", values_to = "rna.ct") # these are 2000 gene columns
head(x.long)
## # A tibble: 6 × 6
## id cell.type cohort sample gene rna.ct
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 PKD_ACAGCCGAGATAACGT-1_1 PT2 PKD PKD1 PTPRQ -0.258
## 2 PKD_ACAGCCGAGATAACGT-1_1 PT2 PKD PKD1 PTPRO -0.338
## 3 PKD_ACAGCCGAGATAACGT-1_1 PT2 PKD PKD1 ST6GALNAC3 -0.359
## 4 PKD_ACAGCCGAGATAACGT-1_1 PT2 PKD PKD1 MAGI2 -1.49
## 5 PKD_ACAGCCGAGATAACGT-1_1 PT2 PKD PKD1 SERPINE1 1.85
## 6 PKD_ACAGCCGAGATAACGT-1_1 PT2 PKD PKD1 SLC8A1 -0.968
x.long %>%
ggplot(aes(x = sample, y = rna.ct)) +
geom_violin()
x.long %>%
sample_n(1000) %>% # take 1000 random rows; otherwise too much
ggplot(aes(sample = rna.ct)) +
stat_qq() +
stat_qq_line() +
facet_wrap(~sample)
Similar cells are expected to express similar set of genes. Thus, we can identify cell groups by similarity in gene expression levels. As in the iris data set, we will perform heatmap analysis, followed by dimension reduction with PCA and UMAP.
x.heat <- x %>% select(-c(1,2,2003, 2004)) %>% # use only gene columns
as.data.frame()
rownames(x.heat) <- x$id
# create a data frame containing cell types
ann_cell <- data.frame(cell.type = factor(x$cell.type))
rownames(ann_cell) <- x$id
# sample 1/20 of columns and 1/50 of rows (otherwise takes too much resource)
pick.columns <- sample(colnames(x.heat), 100)
x.pick <- x.heat %>%
select(all_of(pick.columns)) %>%
sample_n(200)
pheatmap(x.pick, show_rownames = F,
annotation_row = ann_cell)
Questions:
# run PCA without scaling
x.pca <- prcomp(x.heat)
# show variance explained by each principal component
plot(x.pca)
pca.coord <- as.data.frame(x.pca[[5]])
# extrac PC1 and PC2, add a species column
df.pca <- tibble(pca.coord[1:2]) %>%
mutate(cell.type = x$cell.type)
df.pca %>%
ggplot(aes(x = PC1, y = PC2, color = cell.type)) +
geom_point(size = 0.1, alpha = 0.5) + # increase point size & make transparent
theme_bw() +
theme(legend.position = "bottom") +
stat_ellipse() #
Questions:
x.umap <- umap(x.heat)
df.umap <- data.frame(x.umap$layout) %>%
mutate(cell.type = x$cell.type) %>%
mutate(sample = x$sample) %>%
mutate(cohort = x$cohort)
df.umap %>% ggplot(aes(x = X1, y = X2, color = cell.type)) +
geom_point(alpha = 0.2) +
theme_bw() +
facet_wrap(~cohort)
The goal is to identify genes significantly up- or down-regulated in a particular cell type. To that end, we perform a t-test for each gene between the target cell type and the other cell types.
To balance the sample sizes, we randomly select 100 cells from the target cells and from the non-target cells, respectively.
target.cells <- x %>%
filter(cell.type == 'CNT_PC')
target.cells <- target.cells %>%
sample_n(100)
non.target.cells <- x %>%
filter(cell.type != 'CNT_PC')
non.target.cells <- non.target.cells %>%
sample_n(100) %>%
mutate(cell.type = 'non-target')
y <- bind_rows(target.cells, non.target.cells)
y.long <- y %>%
pivot_longer(3:2002, names_to = "gene", values_to = "rna.ct")
t.out <- y.long %>%
group_by(gene) %>%
do(tidy(t.test(data = ., rna.ct ~ cell.type)))
EnhancedVolcano(toptable = t.out, lab = t.out$gene, x = "estimate", y = "p.value",
# ylim = c(0, 4),
# pCutoff = 0.05,
title = "Tissue-specific genes",
subtitle = "cell type: CNT_PC")
Questions:
We verify top-5 marker genes with a violin plot:
top5 <- t.out %>%
arrange(p.value) %>%
head(5) %>%
pull(gene)
x.long %>%
filter(gene %in% top5) %>%
ggplot(aes(x = cell.type, y = rna.ct)) +
geom_violin() +
geom_jitter(shape = 1, alpha = 0.1) +
coord_flip() +
facet_wrap(~gene)
Questions:
The goal is to identify genes significantly up- or down-regulated in PKD patients (relative to the healthy control) for a particular cell type. To that end, we perform a t-test for each gene between PKD and CTL.
x.cell <- x.long %>%
filter(cell.type == 'CNT_PC')
t.out2 <- x.cell %>%
group_by(gene) %>%
do(tidy(t.test(data = ., rna.ct ~ cohort)))
t.out2
## # A tibble: 2,000 × 11
## # Groups: gene [2,000]
## gene estimate estimate1 estimate2 statistic p.value parameter conf.low
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 A2M -0.0201 -0.162 -0.142 -0.411 0.681 168. -0.116
## 2 ABCA10 0.120 -0.0323 -0.152 1.15 0.254 119. -0.0872
## 3 ABCA4 -0.394 0.194 0.588 -3.09 0.00217 346. -0.646
## 4 ABCA5 0.0641 -0.0277 -0.0919 0.556 0.579 126. -0.164
## 5 ABCA9 -0.0284 -0.271 -0.242 -0.416 0.678 128. -0.163
## 6 ABCB1 0.113 0.0557 -0.0576 0.960 0.339 109. -0.121
## 7 ABCB5 0.0583 0.0413 -0.0170 2.25 0.0270 84.2 0.00678
## 8 ABCC3 -0.117 -0.490 -0.374 -1.63 0.103 301. -0.257
## 9 ABCC4 -0.259 -0.334 -0.0751 -3.28 0.00116 315. -0.414
## 10 ABCC8 0.0790 -0.0167 -0.0958 1.98 0.0487 322. 0.000458
## # ℹ 1,990 more rows
## # ℹ 3 more variables: conf.high <dbl>, method <chr>, alternative <chr>
EnhancedVolcano(toptable = t.out2, lab = t.out2$gene, x = "estimate", y = "p.value",
# ylim = c(0, 4),
pCutoff = 0.01,
title = "PKD associated genes",
subtitle = "cell type: CNT_PC")
Questions:
We verify top-5 marker genes with a violin plot:
top5 <- t.out2 %>%
arrange(p.value) %>%
head(5) %>%
pull(gene)
x.long %>%
filter(gene %in% top5) %>%
ggplot(aes(x = cell.type, y = rna.ct, color = cohort)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(shape = 1, alpha = 0.1) +
coord_flip() +
facet_wrap(~gene)
Questions: