What can `baySeq’ do?


Loading the library

# source("https://bioconductor.org/biocLite.R")
# biocLite("baySeq")
suppressMessages(library(baySeq))


Loading RNA-Seq count data

counts = get(load("counts_Hs.RData"))
dim(counts)
## [1] 27704     6
range(counts)
## [1]      0 899348
is.matrix(counts)
## [1] TRUE
head(counts)
##                    A1    A2    A3   B1   B2    B3
## ENSG00000101557  2590  2647  3225 3173 3147  3515
## ENSG00000079134   487   488   570  565  663   677
## ENSG00000261724    10    10    14    4   10     9
## ENSG00000158270 13565 14006 16451 9602 9823 10311
## ENSG00000079101    29    23    44   16   20    16
## ENSG00000176912     4     9     5   14   10    20
conditions = get(load("conditions_Hs.RData"))  
conditions
## [1] "10mM" "10mM" "10mM" "0mM"  "0mM"  "0mM"
length(conditions) == ncol(counts)    # must be TRUE
## [1] TRUE
length(unique(conditions)) == 2       # simple analysis: just compare two conditions 
## [1] TRUE
nr.replicates.group1 = sum(conditions == unique(conditions)[1])
nr.replicates.group1
## [1] 3
nr.replicates.group2 = sum(conditions == unique(conditions)[2])
nr.replicates.group2
## [1] 3


If possible: get annotation data (transcript sizes)

It is better to invoke the real length of the transcripts via the parameter seqlens (see below).
Transcript sizes can be obtaind from annotation packages:

library(Rsubread)
x <- getInBuiltAnnotation("hg38")   # human genome
x[1:5,]


Running baySeq

Assuming that the transcript length are not available (but better get them!).
The following might take much time (when running on a single CPU only):

cond1 = unique(conditions)[1]  
cond1
## [1] "10mM"
cond2 = unique(conditions)[2] 
cond2
## [1] "0mM"
replicates <- as.integer(conditions == cond2) + 1   # required format by baySeq
replicates
## [1] 1 1 1 2 2 2
groups <- list(NDE = rep(1, length(conditions)), DE = replicates) 
groups
## $NDE
## [1] 1 1 1 1 1 1
## 
## $DE
## [1] 1 1 1 2 2 2
#  CD <- new("countData", data = counts, replicates = replicates, groups = groups, seglens = seglens)
CD <- new("countData", data = counts, replicates = replicates, groups = groups)
class(CD)
## [1] "countData"
## attr(,"package")
## [1] "baySeq"
getLibsizes(CD)                  # for normalization 
##      A1      A2      A3      B1      B2      B3 
## 2060495 2072776 2388694 2504584 2529740 2669190
libsizes(CD) <- getLibsizes(CD)  # add the libsizes to the countData object ("CD") 
CD@annotation <- data.frame(name = rownames(counts))  # adds annotation to the countData object (just gene names) 
# str(CD)  # annotation slot created 
CD <- getPriors.NB(CD, samplesize = 10000, estimation = "QL", cl = NULL)  # cl = NULL : use one CPU only
## Finding priors...
## done.
CD <- getLikelihoods(CD, pET = 'BIC', cl = NULL)
## Finding posterior likelihoods...
## Length of priorReps:0
## Length of priorSubset:27704
## Length of subset:27704
## Length of postRows:27704
## Analysing part 1 of 1
## Preparing data...
## ...............................................
## done.
## Estimating likelihoods...
## ...done!
## .
## done.
topCounts(CD, group = "DE")   # extract results 
##                      annotation     A1     A2     A3     B1     B2     B3
## ENSG00000243137 ENSG00000243137  23641  23907  29523    280    302    296
## ENSG00000170345 ENSG00000170345    263    232    292  28793  30413  31290
## ENSG00000111799 ENSG00000111799 268937 278560 334921  90710  76936 101573
## ENSG00000125740 ENSG00000125740     13     16     19  12624  12531  13424
## ENSG00000131737 ENSG00000131737  32731  32934  38550    869    973    954
## ENSG00000115461 ENSG00000115461 645324 664023 800642 257452 248876 274757
## ENSG00000060718 ENSG00000060718    647    710    810  26392  23277  29011
## ENSG00000146674 ENSG00000146674 103294 103719 122420  20631  18530  22674
## ENSG00000174059 ENSG00000174059   5862   5943   6995     13     11     21
## ENSG00000137809 ENSG00000137809  10480  10458  12524    912    720    950
##                 Likelihood ordering FDR.DE FWER.DE
## ENSG00000243137          1      1>2      0       0
## ENSG00000170345          1      2>1      0       0
## ENSG00000111799          1      1>2      0       0
## ENSG00000125740          1      2>1      0       0
## ENSG00000131737          1      1>2      0       0
## ENSG00000115461          1      1>2      0       0
## ENSG00000060718          1      2>1      0       0
## ENSG00000146674          1      1>2      0       0
## ENSG00000174059          1      1>2      0       0
## ENSG00000137809          1      1>2      0       0
test.result.frame =  topCounts(CD, group = "DE", decreasing = TRUE, number = nrow(CD@data), normaliseData = FALSE)


MA plot

ind.A = which(conditions == cond1)      # 1 2 3
ind.B = which(conditions == cond2)      # 4 5 6  
 
mean.counts.A = apply(counts[,ind.A], 1, mean)
mean.counts.B = apply(counts[,ind.B], 1, mean)
mean.counts.A = mean.counts.A[rownames(test.result.frame)]      # re-order according to test.result.frame
mean.counts.B = mean.counts.B[rownames(test.result.frame)]      # re-order according to test.result.frame

fold.change = mean.counts.B / mean.counts.A         # fold change bigger than 1 if the 2nd condition is bigger 
log2FC = log(fold.change, base = 2) 

# append to test.result.frame:    
    
test.result.frame$averageA = mean.counts.A  
test.result.frame$averageB = mean.counts.B  
test.result.frame$FC = fold.change  
test.result.frame$log2FC = log2FC

nr.deg = sum(test.result.frame$FDR <= 0.05)                             
nr.up   = sum((test.result.frame$FDR <= 0.05) & (test.result.frame$log2FC >= 0))        
nr.down = sum((test.result.frame$FDR <= 0.05) & (test.result.frame$log2FC < 0))

colvec = character()
colvec[which(test.result.frame$FDR <= 0.05)] = "red"
colvec[which(test.result.frame$FDR >  0.05)] = "blue" 

maintxt = paste(cond1, "vs.", cond2)   
plotMA.CD(CD, samplesA = ind.A, samplesB = ind.B, col = colvec, main = maintxt, font.main = 1, cex = 0.8)      
mtext(paste("baySeq: ", nr.deg, "DEG's  ", nr.up, "up ", nr.down, "down"), side = 3, col = "blue")   
abline(h = c(-2, 2), col = "grey", lty = 2, cex = 0.8)  # The horizontal blue lines show 4-fold changes.