Introduction

This is an R Markdown report to visualize computational results of African Swine Fever Virus (ASFV) genomics. Goals of the project include:

Previous computational steps

For more details on the project, clone the project repository at https://edu.gitee.com/huntercollege/repos/huntercollege/asfv-genomics/sources.

Basic genome tree

First, we set working directory, load libraries, and read data. We make a preliminary genome-tree plot using the “ape” package.

setwd("~/Dropbox/Courses/KIZ-Fall-2024/afsv-genomics")
library(ape)
library(readxl)
library(ggtree)
## ggtree v3.14.0 Learn more at https://yulab-smu.top/contribution-tree-data/
## 
## Please cite:
## 
## Guangchuang Yu.  Data Integration, Manipulation and Visualization of
## Phylogenetic Trees (1st edition). Chapman and Hall/CRC. 2022,
## doi:10.1201/9781003279242, ISBN: 9781032233574
## 
## Attaching package: 'ggtree'
## The following object is masked from 'package:ape':
## 
##     rotate
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks ggtree::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::where()  masks ape::where()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggrepel)
t <- read.tree("data/asfv-317-mid.dnd")
plot.phylo(t, show.tip.label = F, type = "fan", no.margin = T)
add.scale.bar()

Read meta data

meta <- read_xlsx("docs/metadata-from-Charles.xlsx", skip = 1, trim_ws = T)
meta <- meta |> mutate(Genotype_p72 = str_extract(Genotype_p72, "\\S+"))

meta <- meta |> mutate(Region = case_when(
  Region == 'Africa' ~ '0_African',
  Region == 'Asia' ~ '1_Asia',
  Region == 'Europe' ~ '2_Europe',
  Region == 'South America' ~ '3_S.America')
  )

head(meta)
## # A tibble: 6 × 11
##   Accession Strain  Location Country Region PubmedURL PubSubDate          Length
##   <chr>     <chr>   <chr>    <chr>   <chr>  <chr>     <dttm>               <dbl>
## 1 NC_044942 BA71    Spain    Spain   2_Eur… 26618713  NA                  180365
## 2 MW723489  Nu1995… Italy: … Italy   2_Eur… 34696424  2021-10-04 00:00:00 181753
## 3 MW800838  Or_1984 Italy: … Italy   2_Eur… 34696424  2021-10-04 00:00:00 181924
## 4 MW736609  6396 WB Italy: … Italy   2_Eur… 34696424  2021-10-04 00:00:00 181756
## 5 MW723492  4996 WB Italy: … Italy   2_Eur… 34696424  2021-10-04 00:00:00 181751
## 6 MW723482  Nu1986  Italy: … Italy   2_Eur… 34696424  2021-10-04 00:00:00 181788
## # ℹ 3 more variables: Host <chr>, Genotype_p72 <chr>, AddInfo <chr>

Streamlined genome tree

We removed the closely related OTUs (mostly genotypes I and II isolates) with tree-walk. A total of 62 genomes remain after tree trimming.

# BASH (not R) command:
biotree --walk "NC_044942" > walk-fromBA71.txt
walk <- read_tsv("data/walk-fromBA71.txt", col_names = c("otu", "dist", "order"))
## Rows: 316 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): otu
## dbl (2): dist, order
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
walk |> 
  ggplot(aes(x = order, y = dist)) +
  geom_line() +
  xlab("tree walk order") +
  ylab("distance from BA71") +
  theme_bw()

Discuss:

There are 3 flat steps in the tree-walk line plot. What do these OTUs represent?

tree <- treeio::read.tree("data/asfv-62.dnd")

# join tree with meta data
tips <- tibble(id=tree$tip.label)
tips <- tips %>% left_join(meta, c("id" = "Accession"))

# color the regions (four colors) and genotypes (rainbow colors)
col <- c("green", "red", "blue", "orange", rainbow(length(unique(meta$Genotype_p72))))

# plot genome tree
ggtree(tree)  %<+% tips +
  geom_treescale(width = 0.01) + 
  geom_tippoint(aes(color = Genotype_p72)) +
  geom_tiplab(align = TRUE, size = 2, aes(color = Region)) +
  scale_color_manual(values = col)  +
  theme(legend.title = element_blank()) 

Tasks and Questions:

Visualize tree inconsistencies

Plot a gene tree:

tree2 <- treeio::read.tree("data/test-gene-tree.dnd")

# join tree with meta data
tips <- tibble(id=tree2$tip.label)
tips <- tips %>% left_join(meta, c("id" = "Accession"))

# color the regions (four colors) and genotypes (rainbow colors)
col <- c("green", "red", "blue", "orange", rainbow(length(unique(meta$Genotype_p72))))

# plot genome tree
ggtree(tree2)  %<+% tips +
  geom_treescale(width = 0.1) + 
  geom_tippoint(aes(color = Genotype_p72)) +
  geom_tiplab(align = TRUE, size = 2, aes(color = Region)) +
  scale_color_manual(values = col)  +
  theme(legend.title = element_blank()) 

If tree topology between the genome and gene trees has major differences, we expect to see major differences in branch length. We first obtain the tree distances from OTUs to the reference genome with the following command:

biotree --dist-all test-gene-tree.dnd | grep "NC_" | sed "s/NC_044942//" | tr -s "\t" | sed "s/^\t//" > dist-gene-tree.tsv
biotree --dist-all asfv-62.dnd | grep "NC_" | sed "s/NC_044942//" | tr -s "\t" | sed "s/^\t//" > dist-genome-tree.tsv
d1 <- read_tsv("data/dist-gene-tree.tsv", col_names = c("otu", "dist"))
## Rows: 61 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): otu
## dbl (1): dist
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
d2 <- read_tsv("data/dist-genome-tree.tsv", col_names = c("otu", "dist"))
## Rows: 61 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): otu
## dbl (1): dist
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
d <- d1 |> left_join(d2, "otu")
colnames(d) <- c("otu", "dist.gene", "dist.genome")
d |>
  ggplot(aes(x = dist.genome, y = dist.gene, label = otu)) +
  geom_point(shape = 1) +
  geom_text_repel() +
  geom_smooth(method = "lm") +
  theme_bw()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## Warning: ggrepel: 45 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Discussion:

Which isolate shows the most inconsistency with the genome-based tree?

Site-specific evolutionary rates for a gene

Nucleotide sites

Obtained by running:

# BASH (not R) command:
iqtree2 -s test.cds --seqtype CODON --rate
nt.rate <- read_tsv("data/test-rates-nt.tsv", skip = 8)
## Rows: 170 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## dbl (4): Site, Rate, Cat, C_Rate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nt.rate <- nt.rate |>
  mutate(codon.site = case_when(
    Site %% 3 == 1 ~ "site-1",
    Site %% 3 == 2 ~ "site-2",
    Site %% 3 == 0 ~ "site-3"
  ))

nt.rate |>
  ggplot(aes(x = Site, y = Rate)) +
  geom_point(shape = 1) +
  geom_line() +
  facet_wrap(~codon.site, nrow = 3) +
  theme_bw()

Amino-acid sites

Obtained by running:

# BASH (not R) command:
iqtree2 -s test-aa.fas --rate

# Or, more robustly (thanks to Alex)
iqtree2 -s test-aa.fa --mlrate
aa.rate <- read_tsv("data/test-rates-aa.tsv", skip = 8)
## Rows: 170 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## dbl (4): Site, Rate, Cat, C_Rate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
aa.seq <- scan("data/test-aa.txt", what = "c")
aas <- str_split_1(aa.seq, pattern = "")

aa.rate <- aa.rate |>
  mutate(aa = aas)

aa.rate |>
  ggplot(aes(x = Site, y = Rate, label = aa)) +
  geom_segment(data = aa.rate, color = "gray", aes(xend = Site, y = 0, yend = Rate)) +
  geom_text() +
  theme_bw()

Validate positive-selected sites

First, we plot log likelihood of the three models. The data was extracted from PAML output file “test.mlc” with the following commands:

# BASH (not R) command:
grep "^lnL" test.mlc | tr -s " " | cut -f2,3 -d" " | sed "s/np://" | tr -d "[):]" | tr ' ' '\t' > test-lnL.tsv

# Or most robustly (to deal with np < 100):
grep "^lnL" paml-runs/F8208_gp152.mlc | tr -s " " | cut -f3,4 -d":" | tr -d "[):]" | sed -E "s/^\s+//" | cut -f1,2 -d" "
like <- read_tsv("data/test-lnL.tsv",col_names = c("num.par", "log.likelihood"))
## Rows: 3 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## dbl (2): num.par, log.likelihood
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
like <- like |>
  mutate(model = c("0-one-ratio", "1-nearly-neutral", "2-positive-selection"))

like |>
  ggplot(aes(x = num.par, y = log.likelihood, label = model)) +
  geom_line() +
  geom_point() +
  geom_text(nudge_y = 5) +
  theme_minimal()

Questions:

Second, we visualize the proportions of sites under negative, neutral, and positive selections, respectively. The output file is extracted with the following BASH commands:

# BASH (not R) command:
grep -w "[pw]:" test.mlc | tail -2 | tr -s " " | tr -d ":" | tr " " "\t" > test-prop.tsv
prop <- read.table("data/test-prop.tsv", row.names = 1, col.names = c("para", "neg.sel", "neutral", "pos.sel"))
pie(as.matrix(prop)[1,], col = c("blue", "green", "red"))

We validate the positive-selected sites by a co-visualization of gene tree and alignment and meta data with “Phyloview” at http://u19.borreliabase.org/. The input files are:

Tasks & Questions: