dir <- system.file("extdata", package="airway", mustWork=TRUE)
## [1] "GSE52778_series_matrix.txt" "Homo_sapiens.GRCh37.75_subset.gtf"
## [3] "quants" "sample_table.csv"
## [5] "SraRunInfo_SRP033351.csv" "SRR1039508_subset.bam"
## [7] "SRR1039509_subset.bam" "SRR1039512_subset.bam"
## [9] "SRR1039513_subset.bam" "SRR1039516_subset.bam"
## [11] "SRR1039517_subset.bam" "SRR1039520_subset.bam"
## [13] "SRR1039521_subset.bam"
list.files(file.path(dir, "quants"))
## [1] "SRR1039508" "SRR1039509"
csvfile <- file.path(dir, "sample_table.csv")
coldata <- read.csv(csvfile, row.names=1, stringsAsFactors=FALSE)
coldata
## SampleName cell dex albut Run avgLength Experiment
## SRR1039508 GSM1275862 N61311 untrt untrt SRR1039508 126 SRX384345
## SRR1039509 GSM1275863 N61311 trt untrt SRR1039509 126 SRX384346
## SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512 126 SRX384349
## SRR1039513 GSM1275867 N052611 trt untrt SRR1039513 87 SRX384350
## SRR1039516 GSM1275870 N080611 untrt untrt SRR1039516 120 SRX384353
## SRR1039517 GSM1275871 N080611 trt untrt SRR1039517 126 SRX384354
## SRR1039520 GSM1275874 N061011 untrt untrt SRR1039520 101 SRX384357
## SRR1039521 GSM1275875 N061011 trt untrt SRR1039521 98 SRX384358
## Sample BioSample
## SRR1039508 SRS508568 SAMN02422669
## SRR1039509 SRS508567 SAMN02422675
## SRR1039512 SRS508571 SAMN02422678
## SRR1039513 SRS508572 SAMN02422670
## SRR1039516 SRS508575 SAMN02422682
## SRR1039517 SRS508576 SAMN02422673
## SRR1039520 SRS508579 SAMN02422683
## SRR1039521 SRS508580 SAMN02422677
coldata <- coldata[1:2,]
coldata$names <- coldata$Run
coldata$files <- file.path(dir, "quants", coldata$names, "quant.sf.gz")
file.exists(coldata$files)
## [1] TRUE TRUE
library("tximeta")
se <- tximeta(coldata)
## importing quantifications
## reading in files with read_tsv
## 1 2
## found matching transcriptome:
## [ GENCODE - Homo sapiens - release 29 ]
## loading existing TxDb created: 2020-10-13 13:42:29
## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi
## loading existing transcript ranges created: 2020-10-13 13:42:31
## fetching genome info for GENCODE
## [1] 205870 2
## [1] "ENST00000456328.2" "ENST00000450305.2" "ENST00000488147.1"
## [4] "ENST00000619216.1" "ENST00000473358.1" "ENST00000469289.1"
gse <- summarizeToGene(se)
## loading existing TxDb created: 2020-10-13 13:42:29
## obtaining transcript-to-gene mapping from database
## loading existing gene ranges created: 2020-10-13 13:44:23
## summarizing abundance
## summarizing counts
## summarizing length
## [1] 58294 2
## [1] "ENSG00000000003.14" "ENSG00000000005.5" "ENSG00000000419.12"
## [4] "ENSG00000000457.13" "ENSG00000000460.16" "ENSG00000000938.12"

## class: RangedSummarizedExperiment
## dim: 58294 8
## metadata(6): tximetaInfo quantInfo ... txomeInfo txdbInfo
## assays(3): counts abundance length
## rownames(58294): ENSG00000000003.14 ENSG00000000005.5 ...
## ENSG00000285993.1 ENSG00000285994.1
## rowData names(1): gene_id
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(3): names donor condition
## [1] "counts" "abundance" "length"
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003.14 708.164 467.962 900.992 424.368 1188.295
## ENSG00000000005.5 0.000 0.000 0.000 0.000 0.000
## ENSG00000000419.12 455.000 510.000 604.000 352.000 583.000
## SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003.14 1090.668 805.929 599.337
## ENSG00000000005.5 0.000 0.000 0.000
## ENSG00000000419.12 773.999 409.999 499.000
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 SRR1039520
## 21100805 19298584 26145537 15688246 25268618 31891456 19683767
## SRR1039521
## 21813903
## GRanges object with 58294 ranges and 1 metadata column:
## seqnames ranges strand | gene_id
## <Rle> <IRanges> <Rle> | <character>
## ENSG00000000003.14 chrX 100627109-100639991 - | ENSG00000000003.14
## ENSG00000000005.5 chrX 100584802-100599885 + | ENSG00000000005.5
## ENSG00000000419.12 chr20 50934867-50958555 - | ENSG00000000419.12
## ENSG00000000457.13 chr1 169849631-169894267 - | ENSG00000000457.13
## ENSG00000000460.16 chr1 169662007-169854080 + | ENSG00000000460.16
## ... ... ... ... . ...
## ENSG00000285990.1 chr14 19244904-19269380 - | ENSG00000285990.1
## ENSG00000285991.1 chr6 149817937-149896011 - | ENSG00000285991.1
## ENSG00000285992.1 chr8 47129262-47132628 + | ENSG00000285992.1
## ENSG00000285993.1 chr18 46409197-46410645 - | ENSG00000285993.1
## ENSG00000285994.1 chr10 12563151-12567351 + | ENSG00000285994.1
## -------
## seqinfo: 25 sequences (1 circular) from hg38 genome
## Seqinfo object with 25 sequences (1 circular) from hg38 genome:
## seqnames seqlengths isCircular genome
## chr1 248956422 FALSE hg38
## chr2 242193529 FALSE hg38
## chr3 198295559 FALSE hg38
## chr4 190214555 FALSE hg38
## chr5 181538259 FALSE hg38
## ... ... ... ...
## chr21 46709983 FALSE hg38
## chr22 50818468 FALSE hg38
## chrX 156040895 FALSE hg38
## chrY 57227415 FALSE hg38
## chrM 16569 TRUE hg38
## DataFrame with 8 rows and 3 columns
## names donor condition
## <factor> <factor> <factor>
## SRR1039508 SRR1039508 N61311 Untreated
## SRR1039509 SRR1039509 N61311 Dexamethasone
## SRR1039512 SRR1039512 N052611 Untreated
## SRR1039513 SRR1039513 N052611 Dexamethasone
## SRR1039516 SRR1039516 N080611 Untreated
## SRR1039517 SRR1039517 N080611 Dexamethasone
## SRR1039520 SRR1039520 N061011 Untreated
## SRR1039521 SRR1039521 N061011 Dexamethasone
## [1] N61311 N61311 N052611 N052611 N080611 N080611 N061011 N061011
## Levels: N052611 N061011 N080611 N61311
## [1] Untreated Dexamethasone Untreated Dexamethasone Untreated
## [6] Dexamethasone Untreated Dexamethasone
## Levels: Untreated Dexamethasone
gse$cell <- gse$donor
gse$dex <- gse$condition
## [1] "Untreated" "Dexamethasone"
# when renaming levels, the order must be preserved!
levels(gse$dex) <- c("untrt", "trt")
library("magrittr")
gse$dex %<>% relevel("untrt")
gse$dex
## [1] untrt trt untrt trt untrt trt untrt trt
## Levels: untrt trt
# gse$dex <- relevel(gse$dex, "untrt")
round( colSums(assay(gse)) / 1e6, 1 )
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 SRR1039520
## 21.1 19.3 26.1 15.7 25.3 31.9 19.7
## SRR1039521
## 21.8
dds <- DESeqDataSet(gse, design = ~ cell + dex)
countdata <- round(assays(gse)[["counts"]])
head(countdata, 3)
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003.14 708 468 901 424 1188
## ENSG00000000005.5 0 0 0 0 0
## ENSG00000000419.12 455 510 604 352 583
## SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003.14 1091 806 599
## ENSG00000000005.5 0 0 0
## ENSG00000000419.12 774 410 499
ddsMat <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~ cell + dex)
## [1] 58294
keep <- rowSums(counts(dds)) > 1
dds <- dds[keep,]
nrow(dds)
## [1] 31604
# at least 3 samples with a count of 10 or higher
keep <- rowSums(counts(dds) >= 10) >= 3
lambda <- 10^seq(from = -1, to = 2, length = 1000)
cts <- matrix(rpois(1000*100, lambda), ncol = 100)
library("vsn")
meanSdPlot(cts, ranks = FALSE)

log.cts.one <- log2(cts + 1)
meanSdPlot(log.cts.one, ranks = FALSE)

vsd <- vst(dds, blind = FALSE)
head(assay(vsd), 3)
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003.14 10.105781 9.852029 10.169726 9.991545 10.424865
## ENSG00000000419.12 9.692244 9.923647 9.801921 9.798653 9.763455
## ENSG00000000457.13 9.449592 9.312186 9.362754 9.459168 9.281415
## SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003.14 10.194490 10.315814 10.002177
## ENSG00000000419.12 9.874703 9.683211 9.845507
## ENSG00000000457.13 9.395937 9.477971 9.477027
## DataFrame with 8 rows and 5 columns
## names donor condition cell dex
## <factor> <factor> <factor> <factor> <factor>
## SRR1039508 SRR1039508 N61311 Untreated N61311 untrt
## SRR1039509 SRR1039509 N61311 Dexamethasone N61311 trt
## SRR1039512 SRR1039512 N052611 Untreated N052611 untrt
## SRR1039513 SRR1039513 N052611 Dexamethasone N052611 trt
## SRR1039516 SRR1039516 N080611 Untreated N080611 untrt
## SRR1039517 SRR1039517 N080611 Dexamethasone N080611 trt
## SRR1039520 SRR1039520 N061011 Untreated N061011 untrt
## SRR1039521 SRR1039521 N061011 Dexamethasone N061011 trt
rld <- rlog(dds, blind = FALSE)
head(assay(rld), 3)
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003.14 9.482613 9.172197 9.558383 9.346001 9.851349
## ENSG00000000419.12 8.860186 9.150196 9.000042 8.995902 8.951327
## ENSG00000000457.13 8.354790 8.166700 8.236582 8.366693 8.121781
## SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003.14 9.587602 9.727248 9.357876
## ENSG00000000419.12 9.091075 8.848782 9.054384
## ENSG00000000457.13 8.282307 8.392384 8.391023
library("dplyr")
library("ggplot2")
dds <- estimateSizeFactors(dds)
df <- bind_rows(
as_data_frame(log2(counts(dds, normalized=TRUE)[, 1:2]+1)) %>%
mutate(transformation = "log2(x + 1)"),
as_data_frame(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst"),
as_data_frame(assay(rld)[, 1:2]) %>% mutate(transformation = "rlog"))
colnames(df)[1:2] <- c("x", "y")
lvls <- c("log2(x + 1)", "vst", "rlog")
df$transformation <- factor(df$transformation, levels=lvls)
ggplot(df, aes(x = x, y = y)) + geom_hex(bins = 80) +
coord_fixed() + facet_grid( . ~ transformation)

sampleDists <- dist(t(assay(vsd)))
sampleDists
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517
## SRR1039509 39.42362
## SRR1039512 32.37620 44.93748
## SRR1039513 51.09677 37.18799 41.79886
## SRR1039516 35.59642 47.54671 34.83458 52.05265
## SRR1039517 51.26314 41.58572 46.89609 40.67315 39.74268
## SRR1039520 32.38578 46.96000 30.35980 48.08669 37.07106 50.38349
## SRR1039521 51.49108 37.57383 47.17283 31.45899 52.62276 41.35941
## SRR1039520
## SRR1039509
## SRR1039512
## SRR1039513
## SRR1039516
## SRR1039517
## SRR1039520
## SRR1039521 43.01502
library("pheatmap")
library("RColorBrewer")
sampleDistMatrix <- as.matrix( sampleDists )
rownames(sampleDistMatrix) <- paste( vsd$dex, vsd$cell, sep = " - " )
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows = sampleDists,
clustering_distance_cols = sampleDists,
col = colors)

library("PoiClaClu")
poisd <- PoissonDistance(t(counts(dds)))
samplePoisDistMatrix <- as.matrix( poisd$dd )
rownames(samplePoisDistMatrix) <- paste( dds$dex, dds$cell, sep=" - " )
colnames(samplePoisDistMatrix) <- NULL
pheatmap(samplePoisDistMatrix,
clustering_distance_rows = poisd$dd,
clustering_distance_cols = poisd$dd,
col = colors)

plotPCA(vsd, intgroup = c("dex", "cell"))

pcaData <- plotPCA(vsd, intgroup = c( "dex", "cell"), returnData = TRUE)
pcaData
## PC1 PC2 group dex cell name
## SRR1039508 -14.311369 -2.6000421 untrt:N61311 untrt N61311 SRR1039508
## SRR1039509 8.058574 -0.7500532 trt:N61311 trt N61311 SRR1039509
## SRR1039512 -9.404122 -4.3920761 untrt:N052611 untrt N052611 SRR1039512
## SRR1039513 14.497842 -4.1323833 trt:N052611 trt N052611 SRR1039513
## SRR1039516 -12.365055 11.2109581 untrt:N080611 untrt N080611 SRR1039516
## SRR1039517 9.343946 14.9115160 trt:N080611 trt N080611 SRR1039517
## SRR1039520 -10.852633 -7.7618618 untrt:N061011 untrt N061011 SRR1039520
## SRR1039521 15.032816 -6.4860576 trt:N061011 trt N061011 SRR1039521
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(x = PC1, y = PC2, color = dex, shape = cell)) +
geom_point(size =3) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
coord_fixed() +
ggtitle("PCA with VST data")

library("glmpca")
gpca <- glmpca(counts(dds), L=2)
gpca.dat <- gpca$factors
gpca.dat$dex <- dds$dex
gpca.dat$cell <- dds$cell
ggplot(gpca.dat, aes(x = dim1, y = dim2, color = dex, shape = cell)) +
geom_point(size =3) + coord_fixed() + ggtitle("glmpca - Generalized PCA")

mds <- as.data.frame(colData(vsd)) %>%
cbind(cmdscale(sampleDistMatrix))
ggplot(mds, aes(x = `1`, y = `2`, color = dex, shape = cell)) +
geom_point(size = 3) + coord_fixed() + ggtitle("MDS with VST data")

mdsPois <- as.data.frame(colData(dds)) %>%
cbind(cmdscale(samplePoisDistMatrix))
ggplot(mdsPois, aes(x = `1`, y = `2`, color = dex, shape = cell)) +
geom_point(size = 3) + coord_fixed() + ggtitle("MDS with PoissonDistances")

## log2 fold change (MLE): dex trt vs untrt
## Wald test p-value: dex trt vs untrt
## DataFrame with 31604 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003.14 739.940717 -0.3611537 0.106869 -3.379419 0.000726392
## ENSG00000000419.12 511.735722 0.2063147 0.128665 1.603509 0.108822318
## ENSG00000000457.13 314.194855 0.0378308 0.158633 0.238479 0.811509461
## ENSG00000000460.16 79.793622 -0.1152590 0.314991 -0.365912 0.714430444
## ENSG00000000938.12 0.307267 -1.3691185 3.503764 -0.390757 0.695977205
## ... ... ... ... ... ...
## ENSG00000285979.1 38.353886 0.3423657 0.359511 0.952310 0.340940
## ENSG00000285987.1 1.562508 0.7064145 1.547295 0.456548 0.647996
## ENSG00000285990.1 0.642315 0.3647333 3.433276 0.106235 0.915396
## ENSG00000285991.1 11.276284 -0.1165515 0.748601 -0.155692 0.876275
## ENSG00000285994.1 3.651041 -0.0960094 1.068660 -0.089841 0.928414
## padj
## <numeric>
## ENSG00000000003.14 0.00531137
## ENSG00000000419.12 0.29318870
## ENSG00000000457.13 0.92255697
## ENSG00000000460.16 0.87298038
## ENSG00000000938.12 NA
## ... ...
## ENSG00000285979.1 0.600750
## ENSG00000285987.1 NA
## ENSG00000285990.1 NA
## ENSG00000285991.1 0.952921
## ENSG00000285994.1 NA
res <- results(dds, contrast=c("dex","trt","untrt"))
mcols(res, use.names = TRUE)
## DataFrame with 6 rows and 2 columns
## type description
## <character> <character>
## baseMean intermediate mean of normalized counts for all samples
## log2FoldChange results log2 fold change (MLE): dex trt vs untrt
## lfcSE results standard error: dex trt vs untrt
## stat results Wald statistic: dex trt vs untrt
## pvalue results Wald test p-value: dex trt vs untrt
## padj results BH adjusted p-values
##
## out of 31604 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 2373, 7.5%
## LFC < 0 (down) : 1949, 6.2%
## outliers [1] : 0, 0%
## low counts [2] : 14706, 47%
## (mean count < 9)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res.05 <- results(dds, alpha = 0.05)
table(res.05$padj < 0.05)
##
## FALSE TRUE
## 13357 3541
resLFC1 <- results(dds, lfcThreshold=1)
table(resLFC1$padj < 0.1)
##
## FALSE TRUE
## 16687 211
results(dds, contrast = c("cell", "N061011", "N61311"))
## log2 fold change (MLE): cell N061011 vs N61311
## Wald test p-value: cell N061011 vs N61311
## DataFrame with 31604 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003.14 739.940717 0.270945 0.152171 1.780534 0.0749886
## ENSG00000000419.12 511.735722 -0.071831 0.182817 -0.392912 0.6943842
## ENSG00000000457.13 314.194855 0.179881 0.225122 0.799036 0.4242696
## ENSG00000000460.16 79.793622 -0.119482 0.441594 -0.270570 0.7867217
## ENSG00000000938.12 0.307267 0.000000 4.997580 0.000000 1.0000000
## ... ... ... ... ... ...
## ENSG00000285979.1 38.353886 0.0589757 0.512391 0.1150989 0.908367
## ENSG00000285987.1 1.562508 1.0216804 2.201861 0.4640078 0.642642
## ENSG00000285990.1 0.642315 -3.0956404 4.852715 -0.6379193 0.523526
## ENSG00000285991.1 11.276284 -0.8779628 1.046963 -0.8385804 0.401705
## ENSG00000285994.1 3.651041 -0.0192351 1.513236 -0.0127112 0.989858
## padj
## <numeric>
## ENSG00000000003.14 0.378828
## ENSG00000000419.12 0.936703
## ENSG00000000457.13 0.820733
## ENSG00000000460.16 0.960662
## ENSG00000000938.12 NA
## ... ...
## ENSG00000285979.1 0.98371
## ENSG00000285987.1 NA
## ENSG00000285990.1 NA
## ENSG00000285991.1 NA
## ENSG00000285994.1 NA
sum(res$pvalue < 0.05, na.rm=TRUE)
## [1] 5170
## [1] 31604
sum(res$padj < 0.1, na.rm=TRUE)
## [1] 4322
resSig <- subset(res, padj < 0.1)
head(resSig[ order(resSig$log2FoldChange), ])
## log2 fold change (MLE): dex trt vs untrt
## Wald test p-value: dex trt vs untrt
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000216490.3 42.3007 -5.72483 1.475652 -3.87952 1.04661e-04
## ENSG00000267339.5 30.5206 -5.39781 0.773017 -6.98278 2.89390e-12
## ENSG00000257542.5 10.0399 -5.25991 1.282001 -4.10289 4.08015e-05
## ENSG00000146006.7 61.6448 -4.49504 0.663821 -6.77147 1.27484e-11
## ENSG00000108700.4 14.6324 -4.09069 0.941842 -4.34328 1.40369e-05
## ENSG00000213240.8 12.0962 -3.87313 1.274133 -3.03981 2.36725e-03
## padj
## <numeric>
## ENSG00000216490.3 9.87853e-04
## ENSG00000267339.5 9.45863e-11
## ENSG00000257542.5 4.30646e-04
## ENSG00000146006.7 3.82632e-10
## ENSG00000108700.4 1.66687e-04
## ENSG00000213240.8 1.44987e-02
head(resSig[ order(resSig$log2FoldChange, decreasing = TRUE), ])
## log2 fold change (MLE): dex trt vs untrt
## Wald test p-value: dex trt vs untrt
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000254692.1 62.2302 10.20714 3.37706 3.02250 2.50700e-03
## ENSG00000179593.15 67.0895 9.50515 1.07705 8.82513 1.09334e-18
## ENSG00000268173.3 46.4370 8.40438 3.38506 2.48279 1.30358e-02
## ENSG00000224712.12 35.5607 7.16686 2.16476 3.31070 9.30628e-04
## ENSG00000109906.13 438.1940 6.37750 0.31381 20.32276 8.08934e-92
## ENSG00000257663.1 24.3946 6.34758 2.09531 3.02942 2.45024e-03
## padj
## <numeric>
## ENSG00000254692.1 1.52167e-02
## ENSG00000179593.15 6.81746e-17
## ENSG00000268173.3 5.94385e-02
## ENSG00000224712.12 6.55240e-03
## ENSG00000109906.13 1.24267e-88
## ENSG00000257663.1 1.49151e-02
topGene <- rownames(res)[which.min(res$padj)]
plotCounts(dds, gene = topGene, intgroup=c("dex"))

library("ggbeeswarm")
geneCounts <- plotCounts(dds, gene = topGene, intgroup = c("dex","cell"),
returnData = TRUE)
ggplot(geneCounts, aes(x = dex, y = count, color = cell)) +
scale_y_log10() + geom_beeswarm(cex = 3)

ggplot(geneCounts, aes(x = dex, y = count, color = cell, group = cell)) +
scale_y_log10() + geom_point(size = 3) + geom_line()

library("apeglm")
resultsNames(dds)
## [1] "Intercept" "cell_N061011_vs_N052611"
## [3] "cell_N080611_vs_N052611" "cell_N61311_vs_N052611"
## [5] "dex_trt_vs_untrt"
res <- lfcShrink(dds, coef="dex_trt_vs_untrt", type="apeglm")
plotMA(res, ylim = c(-5, 5))

res.noshr <- results(dds, name="dex_trt_vs_untrt")
plotMA(res.noshr, ylim = c(-5, 5))

plotMA(res, ylim = c(-5,5))
topGene <- rownames(res)[which.min(res$padj)]
with(res[topGene, ], {
points(baseMean, log2FoldChange, col="dodgerblue", cex=2, lwd=2)
text(baseMean, log2FoldChange, topGene, pos=2, col="dodgerblue")
})

hist(res$pvalue[res$baseMean > 1], breaks = 0:20/20,
col = "grey50", border = "white")

library("genefilter")
topVarGenes <- head(order(rowVars(assay(vsd)), decreasing = TRUE), 20)
mat <- assay(vsd)[ topVarGenes, ]
mat <- mat - rowMeans(mat)
anno <- as.data.frame(colData(vsd)[, c("cell","dex")])
pheatmap(mat, annotation_col = anno)

qs <- c(0, quantile(resLFC1$baseMean[resLFC1$baseMean > 0], 0:6/6))
bins <- cut(resLFC1$baseMean, qs)
levels(bins) <- paste0("~", round(signif((qs[-1] + qs[-length(qs)])/2, 2)))
fractionSig <- tapply(resLFC1$pvalue, bins, function(p)
mean(p < .05, na.rm = TRUE))
barplot(fractionSig, xlab = "mean normalized count",
ylab = "fraction of small p values")

# library("IHW")
# res.ihw <- results(dds, filterFun=ihw)
library("AnnotationDbi")
library("org.Hs.eg.db")
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS"
## [6] "ENTREZID" "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME"
## [11] "GO" "GOALL" "IPI" "MAP" "OMIM"
## [16] "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM" "PMID"
## [21] "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG" "UNIGENE"
## [26] "UNIPROT"
ens.str <- substr(rownames(res), 1, 15)
res$symbol <- mapIds(org.Hs.eg.db,
keys=ens.str,
column="SYMBOL",
keytype="ENSEMBL",
multiVals="first")
res$entrez <- mapIds(org.Hs.eg.db,
keys=ens.str,
column="ENTREZID",
keytype="ENSEMBL",
multiVals="first")
resOrdered <- res[order(res$pvalue),]
head(resOrdered)
## log2 fold change (MAP): dex trt vs untrt
## Wald test p-value: dex trt vs untrt
## DataFrame with 6 rows and 7 columns
## baseMean log2FoldChange lfcSE pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000189221.9 2373.805 3.38765 0.136985 1.84494e-137 3.11758e-133
## ENSG00000120129.5 3420.727 2.96335 0.120850 6.35042e-135 5.36547e-131
## ENSG00000101347.9 14125.584 3.74129 0.157927 2.88983e-127 1.62775e-123
## ENSG00000196136.17 2710.217 3.23518 0.143951 3.68488e-114 1.55668e-110
## ENSG00000152583.12 974.737 4.48641 0.201276 2.94551e-113 9.95466e-110
## ENSG00000211445.11 12512.792 3.75875 0.169536 2.36246e-112 6.65348e-109
## symbol entrez
## <character> <character>
## ENSG00000189221.9 MAOA 4128
## ENSG00000120129.5 DUSP1 1843
## ENSG00000101347.9 SAMHD1 25939
## ENSG00000196136.17 SERPINA3 12
## ENSG00000152583.12 SPARCL1 8404
## ENSG00000211445.11 GPX3 2878
# resOrderedDF <- as.data.frame(resOrdered)[1:100, ]
# write.csv(resOrderedDF, file = "results.csv")
# library("ReportingTools")
# htmlRep <- HTMLReport(shortName="report", title="My report",
# reportDirectory="./report")
# publish(resOrderedDF, htmlRep)
# url <- finish(htmlRep)
# browseURL(url)
resGR <- lfcShrink(dds, coef="dex_trt_vs_untrt", type="apeglm", format="GRanges")
resGR
## GRanges object with 31604 ranges and 5 metadata columns:
## seqnames ranges strand | baseMean
## <Rle> <IRanges> <Rle> | <numeric>
## ENSG00000000003.14 chrX 100627109-100639991 - | 739.940717
## ENSG00000000419.12 chr20 50934867-50958555 - | 511.735722
## ENSG00000000457.13 chr1 169849631-169894267 - | 314.194855
## ENSG00000000460.16 chr1 169662007-169854080 + | 79.793622
## ENSG00000000938.12 chr1 27612064-27635277 - | 0.307267
## ... ... ... ... . ...
## ENSG00000285979.1 chr16 57177349-57181390 + | 38.353886
## ENSG00000285987.1 chr9 84316514-84657077 + | 1.562508
## ENSG00000285990.1 chr14 19244904-19269380 - | 0.642315
## ENSG00000285991.1 chr6 149817937-149896011 - | 11.276284
## ENSG00000285994.1 chr10 12563151-12567351 + | 3.651041
## log2FoldChange lfcSE pvalue padj
## <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003.14 -0.3360046 0.105909 0.000726392 0.00531137
## ENSG00000000419.12 0.1783789 0.122440 0.108822318 0.29318870
## ENSG00000000457.13 0.0299361 0.141095 0.811509461 0.92255697
## ENSG00000000460.16 -0.0555061 0.222787 0.714430444 0.87298038
## ENSG00000000938.12 -0.0115799 0.304740 0.695977205 NA
## ... ... ... ... ...
## ENSG00000285979.1 0.15284386 0.257070 0.340940 0.600750
## ENSG00000285987.1 0.02551527 0.300687 0.647996 NA
## ENSG00000285990.1 -0.00018563 0.304465 0.915396 NA
## ENSG00000285991.1 -0.01507882 0.283931 0.876275 0.952921
## ENSG00000285994.1 -0.00684681 0.293399 0.928414 NA
## -------
## seqinfo: 25 sequences (1 circular) from hg38 genome
ens.str <- substr(names(resGR), 1, 15)
resGR$symbol <- mapIds(org.Hs.eg.db, ens.str, "SYMBOL", "ENSEMBL")
window <- resGR[topGene] + 1e6
strand(window) <- "*"
resGRsub <- resGR[resGR %over% window]
naOrDup <- is.na(resGRsub$symbol) | duplicated(resGRsub$symbol)
resGRsub$group <- ifelse(naOrDup, names(resGRsub), resGRsub$symbol)
status <- factor(ifelse(resGRsub$padj < 0.05 & !is.na(resGRsub$padj),
"sig", "notsig"))
options(ucscChromosomeNames = FALSE)
g <- GenomeAxisTrack()
a <- AnnotationTrack(resGRsub, name = "gene ranges", feature = status)
d <- DataTrack(resGRsub, data = "log2FoldChange", baseline = 0,
type = "h", name = "log2 fold change", strand = "+")
plotTracks(list(g, d, a), groupAnnotation = "group",
notsig = "grey", sig = "hotpink")

dat <- counts(dds, normalized = TRUE)
idx <- rowMeans(dat) > 1
dat <- dat[idx, ]
mod <- model.matrix(~ dex, colData(dds))
mod0 <- model.matrix(~ 1, colData(dds))
svseq <- svaseq(dat, mod, mod0, n.sv = 2)
## Number of significant surrogate variables is: 2
## Iteration (out of 5 ):1 2 3 4 5
## [,1] [,2]
## [1,] 0.2465669 -0.51599084
## [2,] 0.2588137 -0.59462876
## [3,] 0.1384516 0.24920662
## [4,] 0.2179075 0.37716083
## [5,] -0.6042910 -0.06305844
## [6,] -0.6138795 -0.03623320
## [7,] 0.1821306 0.30328185
## [8,] 0.1743002 0.28026195
par(mfrow = c(2, 1), mar = c(3,5,3,1))
for (i in 1:2) {
stripchart(svseq$sv[, i] ~ dds$cell, vertical = TRUE, main = paste0("SV", i))
abline(h = 0)
}

ddssva <- dds
ddssva$SV1 <- svseq$sv[,1]
ddssva$SV2 <- svseq$sv[,2]
design(ddssva) <- ~ SV1 + SV2 + dex
set <- newSeqExpressionSet(counts(dds))
idx <- rowSums(counts(set) > 5) >= 2
set <- set[idx, ]
set <- betweenLaneNormalization(set, which="upper")
not.sig <- rownames(res)[which(res$pvalue > .1)]
empirical <- rownames(set)[ rownames(set) %in% not.sig ]
set <- RUVg(set, empirical, k=2)
pData(set)
## W_1 W_2
## SRR1039508 -0.224881168 0.42992983
## SRR1039509 -0.249022928 0.53858506
## SRR1039512 0.001460949 0.01437385
## SRR1039513 -0.175547525 0.08408354
## SRR1039516 0.599387535 -0.02512358
## SRR1039517 0.590516825 -0.02549392
## SRR1039520 -0.241071948 -0.50369551
## SRR1039521 -0.300841739 -0.51265927
par(mfrow = c(2, 1), mar = c(3,5,3,1))
for (i in 1:2) {
stripchart(pData(set)[, i] ~ dds$cell, vertical = TRUE, main = paste0("W", i))
abline(h = 0)
}

ddsruv <- dds
ddsruv$W1 <- set$W_1
ddsruv$W2 <- set$W_2
design(ddsruv) <- ~ W1 + W2 + dex
library("fission")
data("fission")
ddsTC <- DESeqDataSet(fission, ~ strain + minute + strain:minute)
ddsTC <- DESeq(ddsTC, test="LRT", reduced = ~ strain + minute)
resTC <- results(ddsTC)
resTC$symbol <- mcols(ddsTC)$symbol
head(resTC[order(resTC$padj),], 4)
## log2 fold change (MLE): strainmut.minute180
## LRT p-value: '~ strain + minute + strain:minute' vs '~ strain + minute'
## DataFrame with 4 rows and 7 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## SPBC2F12.09c 174.671 -2.6567195 0.752261 97.2834 1.97415e-19
## SPAC1002.18 444.505 -0.0509321 0.204299 56.9536 5.16955e-11
## SPAC1002.19 336.373 -0.3927490 0.573494 43.5339 2.87980e-08
## SPAC1002.17c 261.773 -1.1387648 0.606129 39.3158 2.05137e-07
## padj symbol
## <numeric> <character>
## SPBC2F12.09c 1.33453e-15 atf21
## SPAC1002.18 1.74731e-07 urg3
## SPAC1002.19 6.48916e-05 urg1
## SPAC1002.17c 3.46682e-04 urg2
fiss <- plotCounts(ddsTC, which.min(resTC$padj),
intgroup = c("minute","strain"), returnData = TRUE)
fiss$minute <- as.numeric(as.character(fiss$minute))
ggplot(fiss,
aes(x = minute, y = count, color = strain, group = strain)) +
geom_point() + stat_summary(fun.y=mean, geom="line") +
scale_y_log10()

## [1] "Intercept" "strain_mut_vs_wt" "minute_15_vs_0"
## [4] "minute_30_vs_0" "minute_60_vs_0" "minute_120_vs_0"
## [7] "minute_180_vs_0" "strainmut.minute15" "strainmut.minute30"
## [10] "strainmut.minute60" "strainmut.minute120" "strainmut.minute180"
res30 <- results(ddsTC, name="strainmut.minute30", test="Wald")
res30[which.min(resTC$padj),]
## log2 fold change (MLE): strainmut.minute30
## Wald test p-value: strainmut.minute30
## DataFrame with 1 row and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## SPBC2F12.09c 174.671 -2.60047 0.634343 -4.09947 4.14099e-05 0.279931
betas <- coef(ddsTC)
colnames(betas)
## [1] "Intercept" "strain_mut_vs_wt" "minute_15_vs_0"
## [4] "minute_30_vs_0" "minute_60_vs_0" "minute_120_vs_0"
## [7] "minute_180_vs_0" "strainmut.minute15" "strainmut.minute30"
## [10] "strainmut.minute60" "strainmut.minute120" "strainmut.minute180"
topGenes <- head(order(resTC$padj),20)
mat <- betas[topGenes, -c(1,2)]
thr <- 3
mat[mat < -thr] <- -thr
mat[mat > thr] <- thr
pheatmap(mat, breaks=seq(from=-thr, to=thr, length=101),
cluster_col=FALSE)
