## ----knitr, echo=FALSE, results='hide'-----------------------------------
library("knitr")
opts_chunk$set(tidy=FALSE,dev="pdf",fig.show="hide",
               fig.width=4,fig.height=4.5,
               message=FALSE, warning=FALSE)


## ----style, eval=TRUE, echo=FALSE, results="asis"---------------------------------------
BiocStyle::latex()

## ----options, results="hide", echo=FALSE--------------------------------------
options(digits=3, width=80, prompt=" ", continue=" ")

## ----simResult, cache=TRUE, results='hide'------------------------------------
# load library
library(variancePartition)

# optional step to run analysis in parallel on multicore machines
# Here use 4 threads
# This is strongly recommended since the analysis 
# can be computationally intensive 
library(doParallel)
cl <- makeCluster(4)
registerDoParallel(cl)

# load simulated data:
# geneExpr: matrix of gene expression values
# info: information/metadata about each sample
data(varPartData)

# Specify variables to consider
# Age is continuous so model it as a fixed effect
# Individual and Tissue are both categorical, so model them as random effects
# Note the syntax used to specify random effects
form <- ~ Age + (1|Individual) + (1|Tissue) + (1|Batch) 

# Fit model and extract results
# 1) fit linear mixed model on gene expression
# If categorical variables are specified, a linear mixed model is used
# If all variables are modeled as fixed effects, a linear model is used
# each entry in results is a regression model fit on a single gene
# 2) extract variance fractions from each model fit
# for each gene, returns fraction of variation attributable to each variable 
# Interpretation: the variance explained by each variables after correcting 
# for all other variables
# Note that geneExpr can either be a matrix, 
# and EList output by voom() in the limma package, 
# or an ExpressionSet
varPart <- fitExtractVarPartModel( geneExpr, form, info )

# sort variables (i.e. columns) by median fraction of variance explained
vp <- sortCols( varPart )

# Figure 1a
# Bar plot of variance fractions for the first 10 genes
plotPercentBars( vp[1:10,] )

#
# Figure 1b
# violin plot of contribution of each variable to total variance
plotVarPart( vp )

## ----accessResults, cache=TRUE, warning=FALSE---------------------------------
# Access first entries
head(varPart)

# Access first entries for Individual
head(varPart$Individual)

# sort genes based on variance explained by Individual
head(varPart[order(varPart$Individual, decreasing=TRUE),])

## ----savePlot, cache=TRUE, eval=FALSE-----------------------------------------
#  fig <- plotVarPart( vp )
#  ggsave(file, fig)

## ----plotStratify, cache=TRUE, warning=FALSE----------------------------------
# get gene with the highest variation across Tissues
# create data.frame with expression of gene i and Tissue type for each sample
i <- which.max( varPart$Tissue )
GE <- data.frame( Expression = geneExpr[i,], Tissue = info$Tissue)

# Figure 2a 
# plot expression stratified by Tissue 
plotStratify( Expression ~ Tissue, GE, main=rownames(geneExpr)[i])
#
# get gene with the highest variation across Individuals
# create data.frame with expression of gene i and Tissue type for each sample
i <- which.max( varPart$Individual )
GE <- data.frame( Expression = geneExpr[i,], Individual = info$Individual)

# Figure 2b
# plot expression stratified by Tissue 
label <- paste("Individual:", format(varPart$Individual[i]*100, digits=3), "%")
main <- rownames(geneExpr)[i]
plotStratify(  Expression ~ Individual, GE, colorBy=NULL, text=label, main=main)

## ----cache=TRUE---------------------------------------------------------------
library(lme4)

# fit regression model for the first gene 
fit <- lmer(geneExpr[1,] ~ Age + (1|Individual) + (1|Tissue), info, REML=FALSE )

# extract variance statistics
calcVarPart(fit)

## ----simResult-omit, cache=TRUE, results='hide'-------------------------------
# Fit model
results <- fitVarPartModel( geneExpr, form, info )

# Extract results
varPart <- extractVarPart( results )

## ----simResult-fast-two-step, cache=TRUE, results='hide'----------------------
# Fit model and run summary() function on each model fit 
vpSummaries <- fitVarPartModel( geneExpr, form, info, fxn=summary )

## ----simResult-fast-two-step1, cache=TRUE-------------------------------------
# Show results of summary() for the first gene
vpSummaries[[1]]

## ----echo=TRUE, cache=TRUE, results='hide'------------------------------------
form <- ~ (1|Tissue) + (1|Individual) + (1|Batch) + Age
varPart <- fitExtractVarPartModel( geneExpr, form, info )

## ----echo=TRUE, cache=TRUE, results='hide'------------------------------------
library(limma)
# subtract out effect of Batch
fit <- lmFit( geneExpr, model.matrix(~ Batch, info))
res <- residuals( fit, geneExpr)

# fit model on residuals
form <- ~ (1|Tissue) + (1|Individual) + Age

varPartResid <- fitExtractVarPartModel( res, form, info )

## ----echo=TRUE, cache=TRUE, results='hide'------------------------------------
# subtract out effect of Batch with linear mixed model
modelFit <- fitVarPartModel( geneExpr, ~ (1|Batch), info )
res <- residuals( modelFit )

# fit model on residuals
form <- ~ (1|Tissue) + (1|Individual) + Age

varPartResid <- fitExtractVarPartModel( res, form, info )

## ----echo=TRUE, cache=TRUE, results='hide'------------------------------------
# extract residuals directly without storing intermediate results
residList <- fitVarPartModel( geneExpr, ~ (1|Batch), info, fxn=residuals )

# convert list to matrix
residMatrix = do.call(rbind, residList)

## ----withinTissue, echo=TRUE, cache=TRUE, results='hide'----------------------
# specify formula to model within/between individual variance 
# separately for each tissue
# Note that including +0 ensures each tissue is modeled explicitly
# Otherwise, the first tissue would be used as baseline
form <- ~ (Tissue+0|Individual) + Age + (1|Tissue) + (1|Batch)

# fit model and extract variance percents
varPart <- fitExtractVarPartModel( geneExpr, form, info )

# violin plot
plotVarPart( sortCols(varPart), label.angle=60 )

## ----echo=TRUE, cache=TRUE, results='hide'------------------------------------
form <- ~ (1|Individual) + (1|Tissue) + Age + Height

# fit model
res <- fitVarPartModel( geneExpr[1:4,], form, info )

## ----echo=TRUE, cache=TRUE----------------------------------------------------
# evaluate the collinearity score on the first model fit
# this reports the correlation matrix between coefficients estimates
# for fixed effects
# the collinearity score is the maximum absolute correlation value
# If the collinearity score > .99 then the variance partition 
# estimates may be problematic
# In that case, a least one variable should be omitted
colinearityScore( res[[1]] )

## ----echo=TRUE, cache=TRUE, results='hide'------------------------------------

form <- ~ (1|Individual) + (1|Tissue) + Age + Height

# Specify custom weights
# In this example the weights are simulated from a uniform distribution
# and are not meaningful.
weights <- matrix(runif(length(geneExpr)), nrow=nrow(geneExpr))

# Specify custom weights
res <- fitExtractVarPartModel( geneExpr[1:4,], form, info, weightsMatrix=weights[1:4,] )

## ----echo=TRUE, cache=TRUE, results='hide'------------------------------------
library(limma)
library(edgeR)

# identify genes that pass expression cutoff
isexpr <- rowSums(cpm(geneCounts)>1) >= 0.5 * ncol(geneCounts)

# create data structure with only expressed genes
gExpr <- DGEList(counts=geneCounts[isexpr,])

# Perform TMM normalization
gExpr <- calcNormFactors(gExpr)

# Specify variables to be included in the voom() estimates of uncertainty
# recommend including variables with a small number of categories 
# that explain a substantial amount of variation
design <- model.matrix( ~ Batch, info)

# Estimate precision weights for each gene and sample
# This models uncertainty in expression measurements
vobjGenes <- voom(gExpr, design )

# Define formula
form <- ~ (1|Individual) + (1|Tissue) + (1|Batch) + Age

# variancePartition seamlessly deals with the result of voom()
# by default, it seamlessly models the precision weights
# This can be turned off with useWeights=FALSE
varPart <- fitExtractVarPartModel( vobjGenes, form, info )

## ----echo=TRUE, cache=TRUE, results='hide'------------------------------------
library(DESeq2)

# create DESeq2 object from gene-level counts and metadata
dds <- DESeqDataSetFromMatrix(countData = geneCounts,
	colData = info,
	design = ~ 1)

# Estimate library size correction scaling factors
dds <- estimateSizeFactors(dds)

# identify genes that pass expression cutoff
isexpr <- rowSums(fpm(dds)>1) >= 0.5 * ncol(dds)

# compute log2 Fragments Per Million
# Alternatively, fpkm(), vst() or rlog() could be used
quantLog <- log2( fpm( dds )[isexpr,] + 1)

# Define formula
form <- ~ (1|Individual) + (1|Tissue) + (1|Batch) + Age

# Run variancePartition analysis
varPart <- fitExtractVarPartModel( quantLog, form, info)

## ----echo=TRUE, cache=TRUE, results='hide'------------------------------------
library(tximportData)
library(tximport)
library(readr)

# Get data from folder where tximportData is installed
dir <- system.file("extdata", package = "tximportData")
samples <- read.table(file.path(dir, "samples.txt"), header = TRUE)
files <- file.path(dir, "kallisto", samples$run, "abundance.tsv")
names(files) <- paste0("sample", 1:6)

tx2gene <- read.csv(file.path(dir, "tx2gene.csv"))

# reads results from kallisto
txi <- tximport(files, type = "kallisto", tx2gene = tx2gene, 
	countsFromAbundance = "lengthScaledTPM")

# define metadata (usually read from external source)
info_tximport <- data.frame( Sample = sprintf("sample%d", 1:6), 
	Disease=c("case", "control")[c(rep(1, 3), rep(2, 3) )] )

# Extract counts from kallisto
y <- DGEList( txi$counts )

# compute library size normalization
y <- calcNormFactors(y)

# apply voom to estimate precision weights
design <- model.matrix( ~ Disease, data = info_tximport)
vobj <- voom(y, design)

# define formula
form <- ~ (1|Disease)

# Run variancePartition analysis (on only 10 genes)
varPart_tx <- fitExtractVarPartModel( vobj[1:10,], form, info_tximport)


## ----echo=TRUE, cache=TRUE, results='hide'------------------------------------
library(ballgown)

# Get data from folder where ballgown is installed
data_directory <- system.file('extdata', package='ballgown') 

# Load results of Cufflinks/Tablemaker
bg <- ballgown(dataDir=data_directory, samplePattern='sample', meas='all')

# extract gene-level FPKM quantification
# Expression can be convert to log2-scale if desired
gene_expression <- gexpr(bg)

# extract transcript-level FPKM quantification
# Expression can be convert to log2-scale if desired
transcript_fpkm <- texpr(bg, 'FPKM')

# define metadata (usually read from external source)
info_ballgown <- data.frame( Sample = sprintf("sample%02d", 1:20), 
	Batch = rep(letters[1:4], 5), 
	Disease=c("case", "control")[c(rep(1, 10), rep(2, 10) )] )

# define formula
form <- ~ (1|Batch) + (1|Disease)

# Run variancePartition analysis 
# Gene-level analysis
varPart_gene <- fitExtractVarPartModel( gene_expression, form, info_ballgown)

# Transcript-level analysis
varPart_transcript <- fitExtractVarPartModel( transcript_fpkm, form, info_ballgown)

## ----echo=FALSE, cache=TRUE, results='hide'-----------------------------------
library(variancePartition)

sim_data = function( n, p, var_indiv, var_site){

  info = data.frame(ID = paste("s", 1:n, sep=''), 
    Individual = factor(rep( round(seq(1, n/6, length.out=n/6)),6)), 
    Tissue = factor(sort(rep(toupper(letters[1:3]), n/3))),
    Age = rpois(n, 50))

  geneExpr = matrix(0, nrow=p, ncol=n)
  for( i in 1:p){
    beta_indiv = rnorm(nlevels(info$Individual), 0, sqrt(var_indiv))
    beta_site = rnorm(nlevels(info$Tissue), 0, sqrt(var_site))
    eta = model.matrix( ~ Individual, info) %*% beta_indiv + model.matrix( ~ Tissue+0, info) %*% beta_site

    noise = rbeta(1, 10, 100)
    errVar = var(eta) * (noise) / (1-noise)

    geneExpr[i,] = eta + rnorm(n, 0, sqrt(errVar))
  }
  colnames(geneExpr) = paste("s", 1:ncol(geneExpr), sep='')
  rownames(geneExpr) = paste("gene", 1:nrow(geneExpr), sep='')

  return( list(info=info, geneExpr=geneExpr))
}

plotVar = function( geneExpr, info ){

  form =  ~ (1|Individual) + (1|Tissue)
  varPart = fitExtractVarPartModel( geneExpr, form, info )
  plotVarPart( varPart, col=rainbow(8)[1:3] )
}

plotVarCross = function( geneExpr, info, label.angle=30 ){

  form =  ~ (Tissue+0|Individual) + (1|Tissue)

  #res = fitVarPartModel( geneExpr, form, info )
  #varPart = extractVarPart( res )

  varPart = fitExtractVarPartModel( geneExpr, form, info )

  plotVarPart( varPart, label.angle=label.angle )
}

plotPCA = function(geneExpr, col){

  dcmp = prcomp(t(geneExpr))

  par(mar =  c(4, 4, 1, 1) + 0.1)
  plot(dcmp$x[,1:2], col=col)

}

plotTree = function(geneExpr, col){

  hc = hclust(dist(t(geneExpr)))
  library(dendextend)
  hcd = as.dendrogram(hc)
  labels_colors(hcd) <- col[match(labels(hcd), names(col))]

  par(mar =  c(4, 4, 1, 1) + 0.1, cex=.6)
  plot( hcd, yaxt='n', horiz=TRUE )
}

## ----siteDominant, echo=FALSE, cache=TRUE, results='hide'---------------------
set.seed(1)

n = 60
p = 200

data = sim_data( n, p, 1, 4)

colTissue = rainbow(nlevels(data$info$Tissue))[data$info$Tissue]
names(colTissue) = data$info$ID

colIndiv = rainbow(nlevels(data$info$Individual))[data$info$Individual]
names(colIndiv) = data$info$ID

plotPCA( data$geneExpr, colTissue)

plotPCA( data$geneExpr, colIndiv)

plotTree( data$geneExpr, colTissue)
legend("topleft", legend=levels(data$info$Tissue), fill=rainbow(nlevels(data$info$Tissue))[1:nlevels(data$info$Tissue)], title="Tissue")

plotTree( data$geneExpr, colIndiv)
legend("topleft", legend=levels(data$info$Individual), fill=rainbow(nlevels(data$info$Individual))[1:nlevels(data$info$Individual)], title="Individual")

plotVar( data$geneExpr, data$info )

plotVarCross( data$geneExpr, data$info, label.angle=60 )

## ----IndivDominant, echo=FALSE, cache=TRUE, results='hide'--------------------
set.seed(1)

n = 60
p = 200

data = sim_data( n, p, 3, 1)

colTissue = rainbow(nlevels(data$info$Tissue))[data$info$Tissue]
names(colTissue) = data$info$ID

colIndiv = rainbow(nlevels(data$info$Individual))[data$info$Individual]
names(colIndiv) = data$info$ID

plotPCA( data$geneExpr, colTissue)

plotPCA( data$geneExpr, colIndiv)

plotTree( data$geneExpr, colTissue)
legend("topleft", legend=levels(data$info$Tissue), fill=rainbow(nlevels(data$info$Tissue))[1:nlevels(data$info$Tissue)], title="Tissue")

plotTree( data$geneExpr, colIndiv)
legend("topleft", legend=levels(data$info$Individual), fill=rainbow(nlevels(data$info$Individual))[1:nlevels(data$info$Individual)], title="Individual")

plotVar( data$geneExpr, data$info )

plotVarCross( data$geneExpr, data$info, label.angle=60 )

## ----echo=FALSE, cache=FALSE--------------------------------------------------
# if( "cluster" %in% class(cl) ){
if( exists("cl") ){
	library(doParallel)
	
	# stop cluster and catch warning if invalid
	res = tryCatch( {stopCluster(cl)}, warning = function(x) {
	}, error = function(x) {
	}, finally={
	})
	# warning("STOP CLUSTER!!!!!!!!!!!!!!!!!\n")
}

## ----DE, echo=FALSE, cache=TRUE-----------------------------------------------
set.seed(1)

library(variancePartition)

n = 500
data = data.frame(Sex = c(rep('F', n), rep('M', n)))
data$expression = rnorm(2*n, (as.integer(data$Sex)-1)*2, .3)
data$Sex = factor(data$Sex)

fit = lm(expression ~ Sex, data)
# calcVarPart(fit)
# coef(fit)[2]

plotStratify( expression ~ Sex, data, ylim=c(-6, 9)) +
 annotate("text", x = 0.5, y = 9, hjust=0, label = paste("fold change:", format(coef(fit)[2], digits=3))) +
 annotate("text", x = 0.5, y = 8.2, hjust=0, label = paste("% variance of expression:", format(calcVarPart(fit)[1]*100, digits=3), "%"))


n = 500
data = data.frame(Sex = c(rep('F', n), rep('M', n)))
data$expression = rnorm(2*n, (as.integer(data$Sex)-1)*2, 2.01)
data$Sex = factor(data$Sex)

fit = lm(expression ~ Sex, data)
# calcVarPart(fit)
# coef(fit)[2] 

plotStratify( expression ~ Sex, data, ylim=c(-6, 9)) +
 annotate("text", x = 0.5, y = 9, hjust=0, label = paste("fold change:", format(coef(fit)[2], digits=3))) +
 annotate("text", x = 0.5, y = 8.2, hjust=0, label = paste("% variance of expression:", format(calcVarPart(fit)[1]*100, digits=3), "%"))

## ----sessInfo, results="asis", echo=FALSE-------------------------------------
toLatex(sessionInfo())

## ----resetOptions, results="hide", echo=FALSE---------------------------------
options(prompt="> ", continue="+ ")

