The package baySeq

What can `baySeq’ do?

Identify differentially expressed genes (DEG’s)
R documentation chapter
Bioconductor
Publication

Loading the library

# source("http://bioconductor.org/biocLite.R")
# biocLite("baySeq")
suppressMessages(library(baySeq))

Loading RNA-Seq count data

counts = get(load("counts_Hs.RData"))
dim(counts)

## [1] 27704     6

range(counts)

## [1]      0 899348

is.matrix(counts)

## [1] TRUE

head(counts)

##                    A1    A2    A3   B1   B2    B3
## ENSG00000101557  2590  2647  3225 3173 3147  3515
## ENSG00000079134   487   488   570  565  663   677
## ENSG00000261724    10    10    14    4   10     9
## ENSG00000158270 13565 14006 16451 9602 9823 10311
## ENSG00000079101    29    23    44   16   20    16
## ENSG00000176912     4     9     5   14   10    20

conditions = get(load("conditions_Hs.RData"))  
conditions

## [1] "10mM" "10mM" "10mM" "0mM"  "0mM"  "0mM"

length(conditions) == ncol(counts)    # must be TRUE

## [1] TRUE

length(unique(conditions)) == 2       # simple analysis: just compare two conditions

## [1] TRUE

nr.replicates.group1 = sum(conditions == unique(conditions)[1])
nr.replicates.group1

## [1] 3

nr.replicates.group2 = sum(conditions == unique(conditions)[2])
nr.replicates.group2

## [1] 3

If possible: get annotation data (transcript sizes)

It is better to invoke the real length of the transcripts via the parameter seqlens (see below).
Transcript sizes can be obtaind from annotation packages:

library(Rsubread)
x <- getInBuiltAnnotation("hg38")   # human genome
x[1:5,]

Running baySeq

Assuming that the transcript length are not available (but better get them!).
The following might take much time (when running on a single CPU only):

cond1 = unique(conditions)[1]  
cond1

## [1] "10mM"

cond2 = unique(conditions)[2] 
cond2

## [1] "0mM"

replicates <- as.integer(conditions == cond2) + 1   # required format by baySeq
replicates

## [1] 1 1 1 2 2 2

groups <- list(NDE = rep(1, length(conditions)), DE = replicates) 
groups

## $NDE
## [1] 1 1 1 1 1 1
## 
## $DE
## [1] 1 1 1 2 2 2

#  CD <- new("countData", data = counts, replicates = replicates, groups = groups, seglens = seglens)
CD <- new("countData", data = counts, replicates = replicates, groups = groups)
class(CD)

## [1] "countData"
## attr(,"package")
## [1] "baySeq"

getLibsizes(CD)                  # for normalization

##      A1      A2      A3      B1      B2      B3 
## 2060495 2072776 2388694 2504584 2529740 2669190

libsizes(CD) <- getLibsizes(CD)  # add the libsizes to the countData object ("CD") 
CD@annotation <- data.frame(name = rownames(counts))  # adds annotation to the countData object (just gene names) 
# str(CD)  # annotation slot created 
CD <- getPriors.NB(CD, samplesize = 10000, estimation = "QL", cl = NULL)  # cl = NULL : use one CPU only

## Finding priors...

## done.

CD <- getLikelihoods(CD, pET = 'BIC', cl = NULL)

## Finding posterior likelihoods...

## Length of priorReps:0

## Length of priorSubset:27704

## Length of subset:27704

## Length of postRows:27704

## Analysing part 1 of 1

## Preparing data...

## ...............................................

## done.

## Estimating likelihoods...

## ...done!

## .

## done.

topCounts(CD, group = "DE")   # extract results

##                      annotation     A1     A2     A3     B1     B2     B3
## ENSG00000243137 ENSG00000243137  23641  23907  29523    280    302    296
## ENSG00000170345 ENSG00000170345    263    232    292  28793  30413  31290
## ENSG00000111799 ENSG00000111799 268937 278560 334921  90710  76936 101573
## ENSG00000125740 ENSG00000125740     13     16     19  12624  12531  13424
## ENSG00000131737 ENSG00000131737  32731  32934  38550    869    973    954
## ENSG00000115461 ENSG00000115461 645324 664023 800642 257452 248876 274757
## ENSG00000060718 ENSG00000060718    647    710    810  26392  23277  29011
## ENSG00000146674 ENSG00000146674 103294 103719 122420  20631  18530  22674
## ENSG00000174059 ENSG00000174059   5862   5943   6995     13     11     21
## ENSG00000137809 ENSG00000137809  10480  10458  12524    912    720    950
##                 Likelihood ordering FDR.DE FWER.DE
## ENSG00000243137          1      1>2      0       0
## ENSG00000170345          1      2>1      0       0
## ENSG00000111799          1      1>2      0       0
## ENSG00000125740          1      2>1      0       0
## ENSG00000131737          1      1>2      0       0
## ENSG00000115461          1      1>2      0       0
## ENSG00000060718          1      2>1      0       0
## ENSG00000146674          1      1>2      0       0
## ENSG00000174059          1      1>2      0       0
## ENSG00000137809          1      1>2      0       0

test.result.frame =  topCounts(CD, group = "DE", decreasing = TRUE, number = nrow(CD@data), normaliseData = FALSE)

MA plot

ind.A = which(conditions == cond1)      # 1 2 3
ind.B = which(conditions == cond2)      # 4 5 6  
 
mean.counts.A = apply(counts[,ind.A], 1, mean)
mean.counts.B = apply(counts[,ind.B], 1, mean)
mean.counts.A = mean.counts.A[rownames(test.result.frame)]      # re-order according to test.result.frame
mean.counts.B = mean.counts.B[rownames(test.result.frame)]      # re-order according to test.result.frame

fold.change = mean.counts.B / mean.counts.A         # fold change bigger than 1 if the 2nd condition is bigger 
log2FC = log(fold.change, base = 2) 

# append to test.result.frame:    
    
test.result.frame$averageA = mean.counts.A  
test.result.frame$averageB = mean.counts.B  
test.result.frame$FC = fold.change  
test.result.frame$log2FC = log2FC

nr.deg = sum(test.result.frame$FDR <= 0.05)                             
nr.up   = sum((test.result.frame$FDR <= 0.05) & (test.result.frame$log2FC >= 0))        
nr.down = sum((test.result.frame$FDR <= 0.05) & (test.result.frame$log2FC < 0))

colvec = character()
colvec[which(test.result.frame$FDR <= 0.05)] = "red"
colvec[which(test.result.frame$FDR >  0.05)] = "blue" 

maintxt = paste(cond1, "vs.", cond2)   
plotMA.CD(CD, samplesA = ind.A, samplesB = ind.B, col = colvec, main = maintxt, font.main = 1, cex = 0.8)      
mtext(paste("baySeq: ", nr.deg, "DEG's  ", nr.up, "up ", nr.down, "down"), side = 3, col = "blue")   
abline(h = c(-2, 2), col = "grey", lty = 2, cex = 0.8)  # The horizontal blue lines show 4-fold changes.