Part 1. Getting started

library(tidyverse) # load library
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.8
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(pheatmap) # load another library (heatmap)
getwd() # show working directory
## [1] "/Users/weigang/Dropbox/Courses/bio-714-spring"
# R as a calculator, control-Enter/Command-Enter
4.50 + 6.7
## [1] 11.2
59/89.6
## [1] 0.6584821
log10(2)
## [1] 0.30103
log2(4) # fold change (FC): 2-fold increase
## [1] 2
log2(0.5) # fold change (FC): 1-fold decrease
## [1] -1
1:10 # a vector
##  [1]  1  2  3  4  5  6  7  8  9 10
seq(from = 1, to = 10, by = 1) # same as above, with a function
##  [1]  1  2  3  4  5  6  7  8  9 10
# save outputs to an object (a data container to store outputs of a function)
x <- seq(from = 1, to = 10, by = 3)
x * 2 # doulbe each value in the vector
## [1]  2  8 14 20
x ^ 2 # square each value
## [1]   1  16  49 100
# get help
?seq # opens the built-in help page
example(seq) # run the built-in example
## 
## seq> seq(0, 1, length.out = 11)
##  [1] 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0
## 
## seq> seq(stats::rnorm(20)) # effectively 'along'
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
## 
## seq> seq(1, 9, by = 2)     # matches 'end'
## [1] 1 3 5 7 9
## 
## seq> seq(1, 9, by = pi)    # stays below 'end'
## [1] 1.000000 4.141593 7.283185
## 
## seq> seq(1, 6, by = 3)
## [1] 1 4
## 
## seq> seq(1.575, 5.125, by = 0.05)
##  [1] 1.575 1.625 1.675 1.725 1.775 1.825 1.875 1.925 1.975 2.025 2.075 2.125
## [13] 2.175 2.225 2.275 2.325 2.375 2.425 2.475 2.525 2.575 2.625 2.675 2.725
## [25] 2.775 2.825 2.875 2.925 2.975 3.025 3.075 3.125 3.175 3.225 3.275 3.325
## [37] 3.375 3.425 3.475 3.525 3.575 3.625 3.675 3.725 3.775 3.825 3.875 3.925
## [49] 3.975 4.025 4.075 4.125 4.175 4.225 4.275 4.325 4.375 4.425 4.475 4.525
## [61] 4.575 4.625 4.675 4.725 4.775 4.825 4.875 4.925 4.975 5.025 5.075 5.125
## 
## seq> seq(17) # same as 1:17, or even better seq_len(17)
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17

Part 2: Data manipulation & visualizaiton

data(iris) # load the famous iris data set

# show summaries
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
tail(iris)
##     Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 145          6.7         3.3          5.7         2.5 virginica
## 146          6.7         3.0          5.2         2.3 virginica
## 147          6.3         2.5          5.0         1.9 virginica
## 148          6.5         3.0          5.2         2.0 virginica
## 149          6.2         3.4          5.4         2.3 virginica
## 150          5.9         3.0          5.1         1.8 virginica
dim(iris)
## [1] 150   5
glimpse(iris)
## Rows: 150
## Columns: 5
## $ Sepal.Length <dbl> 5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9, 5.4, 4.…
## $ Sepal.Width  <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7, 3.…
## $ Petal.Length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5, 1.5, 1.…
## $ Petal.Width  <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1, 0.2, 0.…
## $ Species      <fct> setosa, setosa, setosa, setosa, setosa, setosa, setosa, s…
colnames(iris) # note all variables and names are CASE-SENSITIVE
## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"  "Species"
# filter rows and select columns
x.setosa <- iris %>% # use the pipe operator "ctl-shift-M" 
  filter(Species == 'setosa') # choose rows; Note the double equal (means comparison)

x.iris <- iris %>% 
  select(1:4) %>% # choose columns 1-4
  select(-1) # remove the species column
############
# 3 ways to show data distribution
# Data visualization: show data distribution
# the aes() function maps variables to x, y axis
# the geom_xxx functions render into geometric shapes
# Method 1. Histogram (one dimensional visual), to show distribution of a numerical variable
iris %>% 
  ggplot(aes(x=Sepal.Length)) + # use the aes() function to map variables to axes
  geom_histogram(bins=10) + # choose the graphic types; bins: how many equal-length segments
  facet_wrap(~Species) + # one species per panel (multi-panel plot by a categorical variable)
  scale_y_log10() # apply log10 normalization
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 12 rows containing missing values (geom_bar).

# Method 2. Boxplot: map Species to x axis, map Sepal.Length to y axis
iris %>% 
  ggplot(aes(x = Species, y = Sepal.Length)) + # categroical variable on x-axis, numerical on y-axis
  geom_boxplot() + # choose boxplot
  geom_jitter(shape = 1) # add data points to show sample sizes

# Method 3. Violin plot
iris %>% 
  ggplot(aes(x = Species, y = Sepal.Length)) +
  geom_violin() + # violin plot show side-way histograms
  geom_jitter(shape = 1) # choose symbol

##################

# Scatterplot to show relation between two numerical variables 
iris %>% 
  ggplot(aes(x = Sepal.Length, y = Sepal.Width)) + 
  geom_point()

# Map species (a categorical variable) to colors
iris %>% 
  ggplot(aes(x=Sepal.Length, y= Sepal.Width, color = Species)) + 
  geom_point()

# add regression lines
iris %>% 
  ggplot(aes(x = Sepal.Length, y = Sepal.Width, color = Species)) + 
  geom_point() + 
  geom_smooth(method = "lm") # add a linear regression line
## `geom_smooth()` using formula 'y ~ x'

Part 3. Advanced topic: cluster analysis

The goal is to find groups in samples (in rows) based on the four traits (in columns). The hypothesis is that the samples of the same species should be more similar to each other than to samples from a difference species.

In machine learning, this is a classification problem. The model trained on the 150 samples could be used to identify (predict) species identity of a new sample based on the four measurements.

# Method 1. hierarchical clustering with a heatmap
x.iris <- iris %>% select(-5) # remove the species column (retain only numerical variables)
rownames(x.iris) <- paste("sample", 1:150)

ann_row <- data.frame(species = factor(iris$Species)) # a data frame containing species for each sample
rownames(ann_row) <- paste("sample", 1:150)

pheatmap(x.iris) # basic heatmap: rows are samples, cols are variables, colors are values, scale to normalize among variables (mean =0); row-side dendrogram groups samples with similar values 

# annotated heatmap: scale by each measurement: z-score, standard deviates (mean = 0, in the units of sd; -3 ~ 3 for a normally distributed variable)
pheatmap(x.iris, scale = "column", show_rownames = F, annotation_row = ann_row) # add rowside color to show species

# Method 2. principal component analysis (PCA), which projects this 4-dimensional data set (four measurements) into 2D space
# Alternative dimensional reduction: MDS (multi-dimensional scaling), UMAP (non-linear combination), t-sne (for binary measurements)
# a principal component is a linear combination of original variables
pca.iris <- prcomp(x.iris, scale. = T) # run PCA with scaling to z-scores
plot(pca.iris) # show variance explained by each principal component

biplot(pca.iris) # show sample groups

# Make a better plot
pca.coord <- as.data.frame(pca.iris[[5]]) # get coordinates from the PCA output for PC1 & PC2

df.pca <- tibble(pca.coord[1:2]) %>% 
  mutate(species = iris$Species) # extrac PC1 and PC2, add a species column

# plot 1st & 2nd principal components
df.pca %>% 
  ggplot(aes(x = PC1, y = PC2, color = species)) + 
  geom_point(size = 3, alpha = 0.5) + 
  theme_bw() + 
  theme(legend.position = "bottom") + 
  stat_ellipse() # shows clear separation of setosa samples from the other two species, and partial overlap between the vesicolor and virginica samples

Part 4. A study of gene expression

Read, show & transform data

# read gene expression dataset
ge <- read_tsv("http://borreliabase.org/~wgqiu/data-sets-for-biostat/ge-20.tsv")
## Rows: 20 Columns: 60
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (1): Gene
## dbl (59): EFM192A, EVSAT, MDAMB453, MDAMB231, MDAMB468, BT549, CAL851, EFM19...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# view data: genes in rows, cells in columns
head(ge)
## # A tibble: 6 × 60
##   Gene  EFM192A EVSAT MDAMB453 MDAMB231 MDAMB468 BT549 CAL851 EFM19 BT474 HS606T
##   <chr>   <dbl> <dbl>    <dbl>    <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>  <dbl>
## 1 AGR2    11.9   7.29    13.0      8.58     9.05  3.74  10.1  13.9  13.5    3.81
## 2 MIR2…    8.73  8.96     3.73     3.98    12.6   3.75  11.1  11.9   5.15   3.71
## 3 AZGP1   10.7  13.0     12.1      3.69    12.8   3.78   9.06  8.47  3.72   3.63
## 4 AKR1…    5.27  6.89     4.39    12.8     13.6  12.9   14.6   3.84  5.58  13.2 
## 5 CAV1     6.18  4.03     4.41    13.4     12.3  13.6   13.1   8.54  6.05  12.9 
## 6 BASP1    8.55  4.55     4.95    11.6      4.50 13.1   11.6   4.44 12.0   12.0 
## # … with 49 more variables: MDAMB415 <dbl>, AU565 <dbl>, HS578T <dbl>,
## #   BT20 <dbl>, MDAMB361 <dbl>, MDAMB436 <dbl>, SKBR3 <dbl>, HCC1143 <dbl>,
## #   HCC1937 <dbl>, HCC1569 <dbl>, HCC1395 <dbl>, HCC1187 <dbl>, HCC2218 <dbl>,
## #   HCC1954 <dbl>, CAMA1 <dbl>, HMC18 <dbl>, HCC2157 <dbl>, HS739T <dbl>,
## #   HCC38 <dbl>, DU4475 <dbl>, MDAMB175VII <dbl>, YMB1 <dbl>, T47D <dbl>,
## #   ZR751 <dbl>, HCC70 <dbl>, MCF7 <dbl>, HCC202 <dbl>, MDAMB157 <dbl>,
## #   HCC1419 <dbl>, BT483 <dbl>, HS281T <dbl>, HCC1428 <dbl>, HS274T <dbl>, …
tail(ge)
## # A tibble: 6 × 60
##   Gene  EFM192A EVSAT MDAMB453 MDAMB231 MDAMB468 BT549 CAL851 EFM19 BT474 HS606T
##   <chr>   <dbl> <dbl>    <dbl>    <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>  <dbl>
## 1 SLC4…    7.52  8.94    11.7      7.14     3.68  3.56   4.02 11.8  11.0    9.28
## 2 CAV2     4.13  5.70     3.94    12.2     11.8  12.2   12.3   9.07  4.42  11.6 
## 3 TWIS…    3.31  7.28     3.13     3.02     3.91 11.0    7.08  3.31  9.30  10.1 
## 4 SEPP1   10.6  10.9     11.6      4.57    10.2   4.16   4.58  6.43 10.4    5.60
## 5 FBP1    10.2  11.3      4.57     9.32     6.26  4.06   8.60 11.5  10.5    4.47
## 6 ALDH…    9.93  6.12     4.34     6.71    12.3   9.57  12.6   8.75  5.22   6.96
## # … with 49 more variables: MDAMB415 <dbl>, AU565 <dbl>, HS578T <dbl>,
## #   BT20 <dbl>, MDAMB361 <dbl>, MDAMB436 <dbl>, SKBR3 <dbl>, HCC1143 <dbl>,
## #   HCC1937 <dbl>, HCC1569 <dbl>, HCC1395 <dbl>, HCC1187 <dbl>, HCC2218 <dbl>,
## #   HCC1954 <dbl>, CAMA1 <dbl>, HMC18 <dbl>, HCC2157 <dbl>, HS739T <dbl>,
## #   HCC38 <dbl>, DU4475 <dbl>, MDAMB175VII <dbl>, YMB1 <dbl>, T47D <dbl>,
## #   ZR751 <dbl>, HCC70 <dbl>, MCF7 <dbl>, HCC202 <dbl>, MDAMB157 <dbl>,
## #   HCC1419 <dbl>, BT483 <dbl>, HS281T <dbl>, HCC1428 <dbl>, HS274T <dbl>, …
# change into tall/tidy table (from a wide/untidy table): samples in rows, variables in columns, categories as categorical columns
ge.tall <- ge %>% pivot_longer(2:60, names_to = "cell", values_to = "expression")

# show top rows
head(ge.tall)
## # A tibble: 6 × 3
##   Gene  cell     expression
##   <chr> <chr>         <dbl>
## 1 AGR2  EFM192A       11.9 
## 2 AGR2  EVSAT          7.29
## 3 AGR2  MDAMB453      13.0 
## 4 AGR2  MDAMB231       8.58
## 5 AGR2  MDAMB468       9.05
## 6 AGR2  BT549          3.74

One-dimensional plot for a numeric variable: Histogram

# plot histogram: map expression to X-axis
ge.tall %>% ggplot(aes(x = expression)) +
  geom_histogram(bins = 50)

# show distribution for each gene
ge.tall %>% ggplot(aes(x = expression)) +
  geom_histogram(bins = 50) + 
  facet_wrap(~Gene) # panel as a dimension

Two-dimensional plot for a numeric and a categorical variable: Boxplot

# with boxplot: map Gene to X-axis, expression to Y-axis
ge.tall %>% ggplot(aes(x = Gene, y=expression)) +
  geom_boxplot()

ge.tall %>% ggplot(aes(x = Gene, y=expression)) +
  geom_violin()

# flip coordinates, giving gene names more space
ge.tall %>% ggplot(aes(x = Gene, y=expression)) +
  geom_boxplot() + coord_flip()

# add points
ge.tall %>% ggplot(aes(x = Gene, y=expression)) +
  geom_boxplot() + 
  geom_jitter(color= "gray", shape=1) + 
  coord_flip() + 
#  geom_point(shape = 1) +
  theme_bw()

Heatmap: two categorical variables (Gene & Cell), one numerical variable (gene expression, mapped to color)

# cluster analysis with heatmap
ge.mat <- as.matrix(ge[,-1]) # create a matrix
rownames(ge.mat) <- ge$Gene # add gene names as row names
pheatmap(ge.mat)

pca.ge <- prcomp(t(ge.mat), scale. = T) # run PCA with scaling to z-scores
plot(pca.ge) # show variance explained by each principal component

biplot(pca.ge) # show sample groups

# Make a better plot
pca.coord <- as.data.frame(pca.ge[[5]]) # get coordinates from the PCA output for PC1 & PC2

df.pca <- tibble(pca.coord[1:2]) %>% 
  mutate(cells = colnames(ge.mat)) # extrac PC1 and PC2, add a species column

# plot 1st & 2nd principal components
df.pca %>% 
  ggplot(aes(x = PC1, y = PC2, label = cells)) + 
  geom_point(size = 3, alpha = 0.5) + 
  theme_bw() + 
  geom_text()

#  theme(legend.position = "bottom") 
#  stat_ellipse() # shows clear separation of setosa samples from the other two species, and partial overlap between the vesicolor and virginica samples