### R code from vignette source 'vignettes/MineICA/inst/doc/MineICA.Rnw'

###################################################
### code chunk number 1: morelib
###################################################
library(Biobase)
library(plyr)
library(ggplot2)
library(foreach)
library(xtable)
library(biomaRt)
library(GOstats)
library(cluster)
library(marray)
library(mclust)
library(RColorBrewer)
library(igraph)
library(Rgraphviz)
library(graph)
library(colorspace)
library(annotate)
library(scales)
library(gtools)


###################################################
### code chunk number 2: lib
###################################################
library(MineICA)


###################################################
### code chunk number 3: helpIcaSet (eval = FALSE)
###################################################
## help(IcaSet)


###################################################
### code chunk number 4: loadMainz
###################################################
## load Mainz expression data and sample annotations.
library(breastCancerMAINZ)
data(mainz)
show(mainz)
## we restrict the data to the 10,000 probe sets with the highest IQR
mainz <- selectFeatures_IQR(mainz,10000)


###################################################
### code chunk number 5: runJade
###################################################
library(JADE)
## Features are mean-centered before ICA computation
exprs(mainz) <- t(apply(exprs(mainz),1,scale,scale=FALSE))
colnames(exprs(mainz)) <- sampleNames(mainz)
## run ICA-JADE
resJade <- runICA(X=exprs(mainz), nbComp=5, method = "JADE", maxit=10000) 


###################################################
### code chunk number 6: fastica (eval = FALSE)
###################################################
## library(fastICA)
## ## Random initializations are used for each iteration of FastICA
## ## Estimates are clustered using hierarchical clustering with average linkage
## res <- clusterFastICARuns(X=exprs(mainz), nbComp=5, alg.type="deflation", nbIt=10, 
##                           funClus="hclust", method="average")


###################################################
### code chunk number 7: buildParams
###################################################
## build params
params <- buildMineICAParams(resPath="mainz/", selCutoff=3, pvalCutoff=0.05)


###################################################
### code chunk number 8: libannot
###################################################
## load annotation package
library(hgu133a.db)


###################################################
### code chunk number 9: lspackage
###################################################
ls("package:hgu133a.db")


###################################################
### code chunk number 10: mart
###################################################
mart <- useMart(biomart="ensembl", dataset="hsapiens_gene_ensembl")


###################################################
### code chunk number 11: martfilters
###################################################
listFilters(mart)[120:125,]


###################################################
### code chunk number 12: martattr
###################################################
listAttributes(mart)[grep(x=listAttributes(mart)[,1],pattern="affy")[1:5],]


###################################################
### code chunk number 13: buildIcaSet
###################################################
## Define typeID, Mainz data originate from affymetrix HG-U133a  microarray 
## and are indexed by probe sets.
## The probe sets are annotated into Gene Symbols
typeIDmainz <-  c(geneID_annotation="SYMBOL", geneID_biomart="hgnc_symbol", 
                    featureID_biomart="affy_hg_u133a")

## define the reference samples if any, here no normal sample is available
refSamplesMainz <- character(0)

resBuild <- buildIcaSet(params=params, A=data.frame(resJade$A), S=data.frame(resJade$S),
                        dat=exprs(mainz), pData=pData(mainz), refSamples=refSamplesMainz,
                        annotation="hgu133a.db", typeID= typeIDmainz,
                        chipManu = "affymetrix", mart=mart)

icaSetMainz <- resBuild$icaSet
params <- resBuild$params


###################################################
### code chunk number 14: showIcaSet
###################################################
icaSetMainz


###################################################
### code chunk number 15: defannot
###################################################
annot <- pData(icaSetMainz)


###################################################
### code chunk number 16: lookvar
###################################################
varLabels(icaSetMainz)[1:5]
icaSetMainz$grade[1:5]


###################################################
### code chunk number 17: lookfeatures
###################################################
featureNames(icaSetMainz)[1:5] # probe set ids
geneNames(icaSetMainz)[1:5] #gene symbols
sampleNames(icaSetMainz)[1:5] 


###################################################
### code chunk number 18: lookdat (eval = FALSE)
###################################################
## head(dat(icaSetMainz)) #probe set level
## head(datByGene(icaSetMainz)) #gene level


###################################################
### code chunk number 19: lookAS (eval = FALSE)
###################################################
## A(icaSetMainz)
## S(icaSetMainz)
## SByGene(icaSetMainz)


###################################################
### code chunk number 20: lookcomp (eval = FALSE)
###################################################
## nbComp(icaSetMainz)
## compNames(icaSetMainz)
## indComp(icaSetMainz)


###################################################
### code chunk number 21: witG
###################################################
witGenes(icaSetMainz)[1:5]
## We can for example modify the second contributing gene 
witGenes(icaSetMainz)[3] <- "KRT16"


###################################################
### code chunk number 22: MineICA.Rnw:362-363 (eval = FALSE)
###################################################
## compNames(icaSetMainz) <- paste("IC",1:nbComp(icaSetMainz),sep="")


###################################################
### code chunk number 23: MineICA.Rnw:371-379 (eval = FALSE)
###################################################
## ## select tumor samples of grade 3
## keepSamples <- sampleNames(icaSetMainz)[icaSetMainz$grade=="3"]
## ## Subset icaSetMainz to the grade-3 samples 
## icaSetMainz[,keepSamples]
## ## Subset icaSetMainz to the grade-3 samples and the first five components
## icaSetMainz[,keepSamples,1:5]
## ## Subset icaSetMainz to the first 10 features
## icaSetMainz[featureNames(icaSetMainz)[1:10],keepSamples]


###################################################
### code chunk number 24: histProj6
###################################################
hist(S(icaSetMainz)[,1], breaks=50, main="Distribution of feature projection on the first component", 
     xlab="projection values")
abline(v=c(3,-3), col="red", lty=2)


###################################################
### code chunk number 25: selectContribGenes
###################################################
## Extract the contributing genes
contrib <- selectContrib(icaSetMainz, cutoff=3, level="genes")
## Show the first contributing genes of the first and third components
sort(abs(contrib[[1]]),decreasing=TRUE)[1:10]
sort(abs(contrib[[3]]),decreasing=TRUE)[1:10]


###################################################
### code chunk number 26: selectContribGenes (eval = FALSE)
###################################################
## ## One can also want to apply different cutoffs depending on the components
## ## for example using the first 4 components:
## contrib <- selectContrib(icaSetMainz[,,1:4], cutoff=c(4,4,4,3), level="genes")


###################################################
### code chunk number 27: extractComp
###################################################
## extract sample contributions and gene projections of the second component
comp2 <- getComp(icaSetMainz, level="genes", ind=2)
## access the sample contributions 
comp2$contrib[1:5]
## access the gene projections
comp2$proj[1:5]


###################################################
### code chunk number 28: runAn
###################################################
## select the annotations of interest
varLabels(icaSetMainz)
# restrict the phenotype data to the variables of interest
keepVar <- c("age","er","grade")
# specify the variables that should be treated as character
icaSetMainz$er <- c("0"="ER-","1"="ER+")[as.character(icaSetMainz$er)]
icaSetMainz$grade <- as.character(icaSetMainz$grade)


###################################################
### code chunk number 29: runAn (eval = FALSE)
###################################################
## ## Run the analysis of the ICA decomposition
## # only enrichment in KEGG gene sets are tested
## runAn(params=params, icaSet=icaSetMainz, writeGenesByComp = TRUE, 
##       keepVar=keepVar, dbGOstats = "KEGG")


###################################################
### code chunk number 30: runAn
###################################################
resPath(params)


###################################################
### code chunk number 31: writeProj
###################################################
resW <- writeProjByComp(icaSet=icaSetMainz, params=params, mart=mart, 
                        level='genes', selCutoffWrite=2.5)
## the description of the contributing genes of each component is contained 
## in res$listAnnotComp which contains the gene id, its projection value, the number and 
## the indices of the components on which it exceeds the threshold, and its description.
head(resW$listAnnotComp[[1]])
## The number of components a gene contributes to is available 
## in res$nbOccInComp
head(resW$nbOccInComp)
## The output HTML files are located in the path:
genesPath(params)


###################################################
### code chunk number 32: plotHeatmaps (eval = FALSE)
###################################################
## ## selection of the variables we want to display on the heatmap
## keepVar <- c("er","grade")
## ## For the second component, select contributing genes using a threshold of 3 
## ## on the absolute projection values,
## ## heatmap with dendrogram
## resH <- plot_heatmapsOnSel(icaSet = icaSetMainz, selCutoff = 3, level = "genes", 
##                            keepVar = keepVar,
##                            doSamplesDendro = TRUE, doGenesDendro = TRUE, keepComp = 3,
##                            heatmapCol = maPalette(low = "blue",high = "red", mid = "yellow", k=44),
##                            file = "heatmapWithDendro", annot2col=annot2col(params))
## 
## 
## ## heatmap where genes and samples are ordered by contribution values
## resH <- plot_heatmapsOnSel(icaSet = icaSetMainz, selCutoff = 3, level = "genes", 
##                            keepVar = keepVar,
##                            doSamplesDendro = FALSE, doGenesDendro = FALSE, keepComp = 3,
##                            heatmapCol = maPalette(low = "blue",high = "red", mid = "yellow", k=44),
##                            file = "heatmapWithoutDendro", annot2col=annot2col(params))
## 


###################################################
### code chunk number 33: runEnrich (eval = FALSE)
###################################################
## ## run enrichment analysis on the first three components of icaSetMainz, 
## ## using gene sets from KEGG and ontology 'Biological Process' (BP) of Gene Ontology (GO)
## resEnrich <- runEnrich(params=params,icaSet=icaSetMainz[,,1:3],
##                        dbs=c("GO"), ontos="BP")


###################################################
### code chunk number 34: runEnrich2 (eval = FALSE)
###################################################
## ## Access results obtained for GO/BP for the first three components 
## # First component, when gene selection was based  on the absolute projection values
## head(resEnrich$GO$BP[[1]]$both)


###################################################
### code chunk number 35: runEnrich2bis
###################################################
structure(list(GOBPID = c("GO:0000236", "GO:0031581", "GO:0045786", 
"GO:0045104", "GO:0010389", "GO:0031577"), Pvalue = c(1.07618237962117e-05, 
1.31817485465954e-05, 0.00010249730291343, 0.00199032614648593, 
0.0022683621486939, 0.0022683621486939), OddsRatio = c(37.2312925170068, 
93.6306818181818, 9.90278637770898, 36.7090301003344, 34.0807453416149, 
34.0807453416149), ExpCount = c(0.144770177343467, 0.0497647484618169, 
0.79623597538907, 0.0678610206297503, 0.0723850886717336, 0.0723850886717336
), Count = c(4L, 3L, 6L, 2L, 2L, 2L), Size = c(32L, 11L, 176L, 
15L, 16L, 16L), Term = c("mitotic prometaphase", "hemidesmosome assembly", 
"negative regulation of cell cycle", "intermediate filament cytoskeleton organization", 
"regulation of G2/M transition of mitotic cell cycle", "spindle checkpoint"
), In_geneSymbols = c("BIRC5,CDC20,CDCA8,CENPN", "DST,COL17A1,KRT14", 
"BIRC5,DST,CDC20,TOP2A,CDC45,MCM10", "DST,KRT14", "TOP2A,CDC45", 
"BIRC5,CDC20")), .Names = c("GOBPID", "Pvalue", "OddsRatio", 
"ExpCount", "Count", "Size", "Term", "In_geneSymbols"), row.names = c(NA, 
6L), class = "data.frame")


###################################################
### code chunk number 36: runEnrich3 (eval = FALSE)
###################################################
## # Second component, when gene selection was based on the negative projection values
## head(resEnrich$GO$BP[[2]]$left)


###################################################
### code chunk number 37: runEnrich3bis
###################################################
structure(list(GOBPID = c("GO:0002253", "GO:0006959", "GO:0006955", 
"GO:0002684", "GO:0048584", "GO:0002429"), Pvalue = c(1.52817137763709e-07, 
4.81288352728676e-07, 1.22396115326329e-06, 6.35955291059939e-06, 
9.25622695313701e-06, 1.06767418066984e-05), OddsRatio = c(18.3424657534247, 
27.0398009950249, 28.8538011695906, 11.2582995951417, 7.46730769230769, 
21.6746411483254), ExpCount = c(0.668838219326819, 0.317046688382193, 
0.415086965018566, 1.11467391304348, 2.47991313789359, 0.308360477741585
), Count = c(8L, 6L, 6L, 8L, 11L, 5L), Size = c(154L, 73L, 177L, 
293L, 571L, 71L), Term = c("activation of immune response", "humoral immune response", 
"immune response", "positive regulation of immune system process", 
"positive regulation of response to stimulus", "immune response-activating cell surface receptor signaling pathway"
), In_geneSymbols = c("IGHG1,IGKC,LCK,PTPRC,PTPN22,TRBC1,IGKV4-1,TRAT1", 
"IGHG1,IGKC,SH2D1A,POU2AF1,CXCL13,IGKV4-1", "MS4A1,IGHD,IGHM,IGJ,CXCL9,CXCL11", 
"IGHG1,IGKC,SH2D1A,CCL5,CXCL13,PTPN22,TRBC1,IGKV4-1", "IGHG1,IGKC,LCK,SH2D1A,PTPRC,CCL5,CXCL13,PTPN22,TRBC1,IGKV4-1,TRAT1", 
"LCK,PTPRC,PTPN22,TRBC1,TRAT1")), .Names = c("GOBPID", "Pvalue", 
"OddsRatio", "ExpCount", "Count", "Size", "Term", "In_geneSymbols"
), row.names = c(NA, 6L), class = "data.frame")



###################################################
### code chunk number 38: runEnrich4 (eval = FALSE)
###################################################
## # Third component
## head(resEnrich$GO$BP[[3]]$both, n=5)


###################################################
### code chunk number 39: runEnrich4bis
###################################################
structure(list(GOBPID = c("GO:0008544", "GO:0034330", "GO:0045104", 
"GO:0060687", "GO:0060512"), Pvalue = c(8.06278555738841e-06, 
0.000234608964462521, 0.00034947787505441, 0.000886129575409258, 
0.000980819533506728), OddsRatio = c(7.96682149966821, 7.90315480557594, 
27.305, 71.5032679738562, 18.1833333333333), ExpCount = c(1.40028954035469, 
0.891965255157438, 0.143865363735071, 0.0479551212450235, 0.201411509229099
), Count = c(9L, 6L, 3L, 2L, 3L), Size = c(146L, 93L, 15L, 5L, 
21L), Term = c("epidermis development", "cell junction organization", 
"intermediate filament cytoskeleton organization", "regulation of branching involved in prostate gland morphogenesis", 
"prostate gland morphogenesis"), In_geneSymbols = c("EGFR,KRT5,KRT14,KRT15,KRT16,KRT17,KLK7,S100A7,CALML5", 
"CDH3,GPM6B,KRT5,KRT14,SFRP1,UGT8", "KRT14,KRT16,SYNM", "ESR1,SFRP1", 
"ESR1,SERPINB5,SFRP1")), .Names = c("GOBPID", "Pvalue", "OddsRatio", 
"ExpCount", "Count", "Size", "Term", "In_geneSymbols"), row.names = c(NA, 
5L), class = "data.frame")


###################################################
### code chunk number 40: stagebis
###################################################
icaSetMainz$grade <- factor(as.character(icaSetMainz$grade), levels=c("1","2","3"))


###################################################
### code chunk number 41: varAn
###################################################
### Qualitative variables
## Compute Wilcoxon and Kruskall-Wallis tests to compare the distribution 
## of the samples according to their grade and ER status on all components.
resQual <- qualVarAnalysis(params=params, icaSet=icaSetMainz, 
                           keepVar=c("er","grade"),
                           adjustBy="none", typePlot="boxplot", 
                           path="qualVarAnalysis/", filename="qualVar")


###################################################
### code chunk number 42: quantVarAn
###################################################
### Quantitative variables
## Compute pearson correlations between variable 'age' and the sample contributions
## on all components.
## We are interested in correlations exceeding 0.3 in absolute value, and plots will only be drawn
## for correlations exceeding this threshold.
resQuant <- quantVarAnalysis(params=params, icaSet=icaSetMainz, keepVar="age", 
                             typeCor="pearson", cutoffOn="cor",
                             cutoff=0.3, adjustBy="none", 
                             path="quantVarAnalysis/", filename="quantVar")


###################################################
### code chunk number 43: quantVarAncor
###################################################
resQuant$cor[3]


###################################################
### code chunk number 44: plotMix
###################################################
resmix <- plotAllMix(A=A(icaSetMainz), nbMix=2, nbBreaks=50)


###################################################
### code chunk number 45: stageOnHist (eval = FALSE)
###################################################
## ## plot the positions of the samples on the second component according to their ER status
## ## (in a file "er.pdf") 
## plotPosAnnotInComp(icaSet=icaSetMainz, keepVar=c("er"), keepComp=2,  
##                    funClus="Mclust")


###################################################
### code chunk number 46: cluster1
###################################################
## clustering of the samples in 1-dim using the vector 
## of sample contributions of the two first components 
## and Gaussian mixture modeling (Mclust)
clus1 <- clusterSamplesByComp(params=params, icaSet=icaSetMainz[,,,1:2],
                              funClus="Mclust", clusterOn="A", nbClus=2, filename="comp1Mclust")

## The obtained clusters are written in the file "comp1Mclus.txt" of the result path.
clus1$clus[[2]][1:5]


###################################################
### code chunk number 47: cluster2
###################################################
clus2 <- clusterSamplesByComp_multiple(params=params, icaSet=icaSetMainz[,,1:2], 
                                     funClus="kmeans", clusterOn=c("A","S"), level="features", 
                                     nbClus=2, filename="comparKmeans")

## The obtained clusters and their comparison with adjusted Rand indices are written 
## in file "comparKmeans.txt" of the result path.

## Both clustering results are stored in a common data.frame
head(clus2$clus)
## Access Rand index
clus2$comparClus


###################################################
### code chunk number 48: clus2var (eval = FALSE)
###################################################
## ## Test the association between the clustering obtained by Mclust for the first
## ## component and the variables:  
## clus2var <- clusVarAnalysis(icaSet=icaSetMainz[,,1:2], params=params, 
##                             keepVar=c("er","grade"), 
##                             resClus=clus1$clus, funClus="Mclust", adjustBy="none", 
##                             doPlot=TRUE, path="clus2var/", filename="resChitests-Mcluscomp1")
## ## Look at the filename which contains p-values and links to the barplots
## ## p-values are also contained in the ouput of the function:
## clus2var
## 
## 


###################################################
### code chunk number 49: clus2var (eval = FALSE)
###################################################
## structure(list(`1` = c(1.657e-06, 2.315e-08), `2` = c(6.775e-07, 
## 0.0001899)), .Names = c("1", "2"), row.names = c("er", "grade"
## ), class = "data.frame")


###################################################
### code chunk number 50: tuut (eval = FALSE)
###################################################
## ## load three other breast cancer datasets also based on  Affymetrix HG-U133a microarray
## library(breastCancerUPP)
## library(breastCancerTRANSBIG)
## library(breastCancerVDX)
## data(upp)
## data(transbig)
## data(vdx)
## ## function to build IcaSet instances from these three datasets
## treat <- function(es, annot="hgu133a.db") {
##     es <- selectFeatures_IQR(es,10000)
##     exprs(es) <- t(apply(exprs(es),1,scale,scale=FALSE))
##     colnames(exprs(es)) <- sampleNames(es)
##     resJade <- runICA(X=exprs(es), nbComp=5, method = "JADE", maxit=10000)
##     resBuild <- buildIcaSet(params=buildMineICAParams(), A=data.frame(resJade$A), S=data.frame(resJade$S),
##                             dat=exprs(es), pData=pData(es), refSamples=character(0),
##                             annotation=annot, typeID= typeIDmainz,
##                             chipManu = "affymetrix", mart=mart)
##     icaSet <- resBuild$icaSet
## }
## icaSetUpp <- treat(upp, annot="hgu133plus2.db")
## icaSetVdx <- treat(vdx)
## icaSetTransbig <- treat(transbig)


###################################################
### code chunk number 51: compareAn (eval = FALSE)
###################################################
## resGraph <- runCompareIcaSets(icaSets=list(icaSetMainz, icaSetUpp, 
##                               icaSetTransbig, icaSetVdx), 
##                               labAn=c("Mainz", "Upp","Transbig","Vdx"), 
##                               type.corr="pearson", level="genes", 
##                               cutoff_zval=0, fileNodeDescr="nodeDescr.txt", 
##                               fileDataGraph="dataGraph.txt", tkplot=TRUE)


###################################################
### code chunk number 52: compareAnColors (eval = FALSE)
###################################################
## barplot(names.arg=unique(resGraph[[2]]$labAn),height=rep(1,4), 
##         col=unique(resGraph[[2]]$col))


###################################################
### code chunk number 53: inter1 (eval = FALSE)
###################################################
## ## comparison of four components included in the clique of the correlation-based graph
## # that includes the second component of Mainz.
## inter <- compareGenes(keepCompByIcaSet =   c(2,2,2,2),  
##                       icaSets = list(icaSetMainz, icaSetTransbig, icaSetUpp, icaSetVdx),
##                       lab=c("Mainz", "Transbig", "Upp", "Vdx"), cutoff=3, 
##                       type="intersection",  annotate=F)
## head(inter)


###################################################
### code chunk number 54: inter1bis
###################################################
structure(list(min_rank = structure(c(1L, 12L, 22L, 12L, 31L, 
31L), .Label = c("1", "10", "101", "11", "12", "13", "14", "16", 
"17", "18", "19", "2", "20", "21", "22", "23", "24", "25", "26", 
"27", "29", "3", "30", "31", "33", "34", "35", "36", "37", "38", 
"4", "40", "41", "43", "46", "47", "5", "51", "52", "53", "54", 
"58", "6", "63", "69", "7", "70", "74", "8", "9"), class = "factor"), 
    median_rank = c(1, 2, 3, 5, 5.5, 6), ranks = c("1,1,1,1", 
    "2,2,3,2", "3,3,4,3", "5,8,2,5", "4,7,9,4", "6,4,6,6"), scaled_proj = c("-8.9,-9.2,9.2,-9.3", 
    "-8.4,-8.7,7.9,-8.9", "-7,-8.3,7.7,-8.8", "-6.4,-6.1,8.4,-7.6", 
    "-6.6,-6.5,6.7,-7.7", "-6.3,-7.8,7.1,-7.6")), .Names = c("min_rank", 
"median_rank", "ranks", "scaled_proj"), row.names = c("IGL@", 
"IGKV4-1", "IGLV2-23", "NKG7", "IGHM", "TNFRSF17"), class = "data.frame")


###################################################
### code chunk number 55: inter2 (eval = FALSE)
###################################################
## ## comparison of four components included in the clique of the correlation-based graph
## # that includes the third component of Mainz.
## inter <- compareGenes(keepCompByIcaSet = c(3,3,3,1), 
##                       icaSets = list(icaSetMainz, icaSetTransbig, icaSetUpp, icaSetVdx),
##                       lab=c("Mainz", "Transbig", "Upp", "Vdx"), cutoff=3, 
##                       type="intersection",  annotate=F)
## head(inter)


###################################################
### code chunk number 56: inter2bis
###################################################
structure(list(min_rank = structure(c(1L, 7L, 21L, 7L, 7L, 16L
), .Label = c("1", "10", "11", "16", "17", "18", "2", "23", "24", 
"25", "26", "3", "32", "38", "39", "4", "40", "42", "43", "49", 
"5", "51", "58", "6", "7", "8", "9"), class = "factor"), median_rank = c(1, 
4, 6.5, 8.5, 9.5, 9.5), ranks = c("1,1,1,3", "5,5,3,2", "7,7,6,5", 
"13,2,4,21", "2,15,15,4", "4,31,12,7"), scaled_proj = c("7.7,8.7,8.1,7.2", 
"6.3,7.2,7.3,7.3", "6.2,7.1,5.9,6.5", "5.7,7.3,7,5.1", "7.3,5.8,5.4,7", 
"6.3,4.6,5.6,6.2")), .Names = c("min_rank", "median_rank", "ranks", 
"scaled_proj"), row.names = c("GABRP", "MIA", "SERPINB5", "KRT14", 
"KRT16", "KRT81"), class = "data.frame")


