# source("http://bioconductor.org/biocLite.R")
# biocLite("baySeq")
suppressMessages(library(baySeq))
counts = get(load("counts_Hs.RData"))
dim(counts)
## [1] 27704 6
range(counts)
## [1] 0 899348
is.matrix(counts)
## [1] TRUE
head(counts)
## A1 A2 A3 B1 B2 B3
## ENSG00000101557 2590 2647 3225 3173 3147 3515
## ENSG00000079134 487 488 570 565 663 677
## ENSG00000261724 10 10 14 4 10 9
## ENSG00000158270 13565 14006 16451 9602 9823 10311
## ENSG00000079101 29 23 44 16 20 16
## ENSG00000176912 4 9 5 14 10 20
conditions = get(load("conditions_Hs.RData"))
conditions
## [1] "10mM" "10mM" "10mM" "0mM" "0mM" "0mM"
length(conditions) == ncol(counts) # must be TRUE
## [1] TRUE
length(unique(conditions)) == 2 # simple analysis: just compare two conditions
## [1] TRUE
nr.replicates.group1 = sum(conditions == unique(conditions)[1])
nr.replicates.group1
## [1] 3
nr.replicates.group2 = sum(conditions == unique(conditions)[2])
nr.replicates.group2
## [1] 3
It is better to invoke the real length of the transcripts via the parameter seqlens
(see below).
Transcript sizes can be obtaind from annotation packages:
library(Rsubread)
x <- getInBuiltAnnotation("hg38") # human genome
x[1:5,]
Assuming that the transcript length are not available (but better get them!).
The following might take much time (when running on a single CPU only):
cond1 = unique(conditions)[1]
cond1
## [1] "10mM"
cond2 = unique(conditions)[2]
cond2
## [1] "0mM"
replicates <- as.integer(conditions == cond2) + 1 # required format by baySeq
replicates
## [1] 1 1 1 2 2 2
groups <- list(NDE = rep(1, length(conditions)), DE = replicates)
groups
## $NDE
## [1] 1 1 1 1 1 1
##
## $DE
## [1] 1 1 1 2 2 2
# CD <- new("countData", data = counts, replicates = replicates, groups = groups, seglens = seglens)
CD <- new("countData", data = counts, replicates = replicates, groups = groups)
class(CD)
## [1] "countData"
## attr(,"package")
## [1] "baySeq"
getLibsizes(CD) # for normalization
## A1 A2 A3 B1 B2 B3
## 2060495 2072776 2388694 2504584 2529740 2669190
libsizes(CD) <- getLibsizes(CD) # add the libsizes to the countData object ("CD")
CD@annotation <- data.frame(name = rownames(counts)) # adds annotation to the countData object (just gene names)
# str(CD) # annotation slot created
CD <- getPriors.NB(CD, samplesize = 10000, estimation = "QL", cl = NULL) # cl = NULL : use one CPU only
## Finding priors...
## done.
CD <- getLikelihoods(CD, pET = 'BIC', cl = NULL)
## Finding posterior likelihoods...
## Length of priorReps:0
## Length of priorSubset:27704
## Length of subset:27704
## Length of postRows:27704
## Analysing part 1 of 1
## Preparing data...
## ...............................................
## done.
## Estimating likelihoods...
## ...done!
## .
## done.
topCounts(CD, group = "DE") # extract results
## annotation A1 A2 A3 B1 B2 B3
## ENSG00000243137 ENSG00000243137 23641 23907 29523 280 302 296
## ENSG00000170345 ENSG00000170345 263 232 292 28793 30413 31290
## ENSG00000111799 ENSG00000111799 268937 278560 334921 90710 76936 101573
## ENSG00000125740 ENSG00000125740 13 16 19 12624 12531 13424
## ENSG00000131737 ENSG00000131737 32731 32934 38550 869 973 954
## ENSG00000115461 ENSG00000115461 645324 664023 800642 257452 248876 274757
## ENSG00000060718 ENSG00000060718 647 710 810 26392 23277 29011
## ENSG00000146674 ENSG00000146674 103294 103719 122420 20631 18530 22674
## ENSG00000174059 ENSG00000174059 5862 5943 6995 13 11 21
## ENSG00000137809 ENSG00000137809 10480 10458 12524 912 720 950
## Likelihood ordering FDR.DE FWER.DE
## ENSG00000243137 1 1>2 0 0
## ENSG00000170345 1 2>1 0 0
## ENSG00000111799 1 1>2 0 0
## ENSG00000125740 1 2>1 0 0
## ENSG00000131737 1 1>2 0 0
## ENSG00000115461 1 1>2 0 0
## ENSG00000060718 1 2>1 0 0
## ENSG00000146674 1 1>2 0 0
## ENSG00000174059 1 1>2 0 0
## ENSG00000137809 1 1>2 0 0
test.result.frame = topCounts(CD, group = "DE", decreasing = TRUE, number = nrow(CD@data), normaliseData = FALSE)
ind.A = which(conditions == cond1) # 1 2 3
ind.B = which(conditions == cond2) # 4 5 6
mean.counts.A = apply(counts[,ind.A], 1, mean)
mean.counts.B = apply(counts[,ind.B], 1, mean)
mean.counts.A = mean.counts.A[rownames(test.result.frame)] # re-order according to test.result.frame
mean.counts.B = mean.counts.B[rownames(test.result.frame)] # re-order according to test.result.frame
fold.change = mean.counts.B / mean.counts.A # fold change bigger than 1 if the 2nd condition is bigger
log2FC = log(fold.change, base = 2)
# append to test.result.frame:
test.result.frame$averageA = mean.counts.A
test.result.frame$averageB = mean.counts.B
test.result.frame$FC = fold.change
test.result.frame$log2FC = log2FC
nr.deg = sum(test.result.frame$FDR <= 0.05)
nr.up = sum((test.result.frame$FDR <= 0.05) & (test.result.frame$log2FC >= 0))
nr.down = sum((test.result.frame$FDR <= 0.05) & (test.result.frame$log2FC < 0))
colvec = character()
colvec[which(test.result.frame$FDR <= 0.05)] = "red"
colvec[which(test.result.frame$FDR > 0.05)] = "blue"
maintxt = paste(cond1, "vs.", cond2)
plotMA.CD(CD, samplesA = ind.A, samplesB = ind.B, col = colvec, main = maintxt, font.main = 1, cex = 0.8)
mtext(paste("baySeq: ", nr.deg, "DEG's ", nr.up, "up ", nr.down, "down"), side = 3, col = "blue")
abline(h = c(-2, 2), col = "grey", lty = 2, cex = 0.8) # The horizontal blue lines show 4-fold changes.