## ----install, eval=FALSE-------------------------------------------------
#  ## try http:// if https:// URLs are not supported
#  source("https://bioconductor.org/biocLite.R")
#  biocLite("psichomics")

## ----load, message=FALSE-------------------------------------------------
library(psichomics)

## ----TCGA options--------------------------------------------------------
# Available tumour types
cohorts <- getFirebrowseCohorts()

# Available sample dates
date <- getFirebrowseDates()

# Available data types
dataTypes <- getFirebrowseDataTypes()

## ----download, eval=FALSE------------------------------------------------
#  # Set download folder
#  folder <- getDownloadsFolder()
#  
#  # Download and load most recent junction quantification and clinical data from
#  # TCGA/Firebrowse for Adrenocortical Carcinoma
#  data <- loadFirebrowseData(folder=folder,
#                           cohort="BRCA",
#                           data=c("Clinical", "junction_quantification"),
#                           date="2016-01-28")
#  
#  # Select clinical and junction quantification dataset
#  clinical <- data[[1]]$`Clinical data`
#  junctionQuant <- data[[1]]$`Junction quantification (Illumina HiSeq)`

## ----prepare examples, include=FALSE-------------------------------------
clinical <- readRDS("BRCA_clinical.RDS")

## ----load local, eval=FALSE----------------------------------------------
#  folder <- "~/Downloads/GTEx/"
#  ignore <- c(".aux.", ".mage-tab.")
#  data <- loadLocalFiles(folder, ignore=ignore)
#  
#  # Select clinical and junction quantification dataset
#  clinical <- data[[1]]$`Clinical data`
#  junctionQuant <- data[[1]]$`Junction quantification (Illumina HiSeq)`

## ----quantify options----------------------------------------------------
# Available alternative splicing annotation
annotList <- listSplicingAnnotations()
annotList

## ----prepare to quantify splicing, eval=FALSE----------------------------
#  # Load Human (hg19/GRCh37 assembly) annotation
#  human <- listSplicingAnnotations()[[1]]
#  annotation <- loadAnnotation(human)

## ----event types---------------------------------------------------------
# Available alternative splicing event types (skipped exon, alternative 
# first/last exon, mutually exclusive exons, etc.)
eventType <- getSplicingEventTypes()
eventType
eventType <- "SE"

## ----quantify splicing, eval=FALSE---------------------------------------
#  # Min. reads threshold: number of reads required to quantify a splicing event
#  minReads <- 10
#  
#  psi <- quantifySplicing(annotation, junctionQuant, eventType=eventType,
#                          minReads=minReads)

## ----load splicing, echo=FALSE-------------------------------------------
psi <- readRDS("BRCA_psi.RDS")

## ----check splicing events-----------------------------------------------
# Check the identifier of the splicing events in the resulting table
events <- rownames(psi)
head(events)

## ----survival groups-----------------------------------------------------
# Get available groups in clinical data
cols <- colnames(clinical)

# Check the name from which to retrieve groups of interest
er_expr <- grep("estrogen_receptor_status", cols, value=TRUE)[1]
er_expr <- createGroupByAttribute(col=er_expr, dataset=clinical)

# Discard values other than "positive" and "negative" for ER expression
er_expr <- er_expr[c("positive", "negative")]

############################################################################
# Now the same for patients treated with tamoxifen. However, TCGA presents #
# data for the administred drugs spread through multiple columns, so...    #
############################################################################

# Look for the appropriate columns with the drugs administred
drug_name <- grep("drug_name", cols, value=TRUE)[1:23]

# Using the previous columns, look for either "tamoxifen" or "tamoxiphen"
tamoxifen <- lapply(drug_name, function(i) grep("tamoxi.*", clinical[ , i]))

# Collect all previous results and create a single list named tamoxifen
tamoxifen <- sort(unique(unlist(tamoxifen)))
tamoxifen <- list("tamoxifen"=tamoxifen)

# Combine all previously created groups
groups <- c(er_expr, tamoxifen)

# Assign each patient to a group
patients <- nrow(clinical)
g <- groupPerPatient(groups[c("tamoxifen", "positive")], patients)

# Append the created groups to the clinical dataset
cl <- cbind(clinical, groups=g)

## ----help formula, eval=FALSE--------------------------------------------
#  help(formula)

## ----survival by er expression and tamoxifen-----------------------------
# Create the right-hand side of a formula
formulaStr <- "groups"

# Events are retrieved based on the information available for time to event: if 
# there is info for a patient, the event is considered as occurred
daysToDeath <- "days_to_death"
survTerms  <- processSurvTerms(cl, censoring="right", event=daysToDeath,
                               timeStart=daysToDeath,
                               formulaStr=formulaStr, scale="years")

require("survival")
surv <- survfit(survTerms)
pvalue <- testSurvival(survTerms)

plotSurvivalCurves(surv, pvalue=pvalue)

## ----cox model-----------------------------------------------------------
survTermsCox <- processSurvTerms(cl, censoring="right", event=daysToDeath,
                                 timeStart=daysToDeath,
                                 formulaStr=formulaStr, coxph=TRUE)
summary(survTermsCox)

## ----plot PCA variance---------------------------------------------------
# Percentage of missing values tolerated by splicing event: tolerated missing
# values are replaced with the median value of that splicing event
naTolerance <- 0

# Center bu do not scale values (they are already scaled between 0 and 1)
center <- TRUE
scale  <- FALSE

# Match patients with samples
samples <- colnames(psi)
match <- getPatientFromSample(samples, clinical)

# Filter alternative splicing quantification to colour values by positive or
# negative ER expression
erGroups <- getMatchingSamples(groups[c("positive", "negative")], samples, 
                               clinical, match=match)
filtered_psi <- psi[ , unlist(erGroups)]

# Perform principal component analysis (transpose alternative splicing
# quantification to have samples as rows)
pca <- performPCA(t(filtered_psi), center=center, scale.=scale,
                  naTolerance=naTolerance)

# Plot the variance explained by each principal component
plotVariance(pca)

## ----plot PCA------------------------------------------------------------
plotPCA(pca, pcX="PC1", pcY="PC2", erGroups)

## ----plot loadings-------------------------------------------------------
plotPCA(pca, pcX="PC1", pcY="PC2", individuals = FALSE, loadings = TRUE)

## ----diff splicing 1-----------------------------------------------------
# Choose a method to use when adjusting p-values ("none" is valid)
# help(p.adjust.methods)
pvalueAdjust <- "BH"

# Check sample types available (normal, tumour, etc.)
types <- parseSampleGroups(colnames(psi))
unique(types)

# Analyse by sample types (by default) and perform all statistical analyses (by 
# default)
event <- "SE_10_+_79797062_79799962_79799983_79800373_RPS24"
eventPSI <- psi[event, ]
stats <- diffAnalyses(eventPSI, pvalueAdjust=pvalueAdjust,
                      analyses = c("wilcoxRankSum", "ttest", "kruskal", 
                                   "levene", "fligner"))
# View(stats)
plotDistribution(as.numeric(eventPSI), groups=types)

## ----diff splicing 2-----------------------------------------------------
filter <- grep("Tumor|Normal", types)
gr_event <- types[filter]
eventPSI <- eventPSI[filter]

stats <- diffAnalyses(eventPSI, groups=gr_event, pvalueAdjust=pvalueAdjust,
                      analyses = c("wilcoxRankSum", "ttest", "kruskal", 
                                   "levene", "fligner"))
# View(stats)
plotDistribution(as.numeric(eventPSI), groups=gr_event)

## ----diff splicing 3-----------------------------------------------------
# Filter alternative splicing quantification by positive or negative ER
# expression; match between patient information and samples as such:
erGroups <- getMatchingSamples(groups[c("negative", "positive")], samples, 
                               clinical, match=match)
eventPSI <- psi[event, unlist(erGroups)]
erGroups <- rep(names(erGroups), sapply(erGroups, length))

stats <- diffAnalyses(eventPSI, groups=erGroups, pvalueAdjust=pvalueAdjust,
                      analyses = c("wilcoxRankSum", "ttest", "kruskal", 
                                   "levene", "fligner"))
# View(stats)
plotDistribution(as.numeric(eventPSI), groups=erGroups)

## ----splicing event data-------------------------------------------------
event <- "SE_10_+_79797062_79799962_79799983_79800373_RPS24"

# Assign alternative splicing quantification to patients based on their samples
clinicalPSI <- getPSIperPatient(psi, match, clinical)
eventPSI <- as.numeric(clinicalPSI[event, ])

# Statistics of the alternative splicing event's quantification
summary(eventPSI)

## ----optimal cut-off-----------------------------------------------------
opt <- optimalPSIcutoff(clinical, eventPSI, censoring="right", 
                        event="days_to_death", timeStart="days_to_death")
optimalCutoff <- opt$par # Optimal exon inclusion level
optimalCutoff
optimalPvalue <- opt$value # Respective p-value
optimalPvalue

## ------------------------------------------------------------------------
cutoff <- optimalCutoff
group <- labelBasedOnCutoff(eventPSI, cutoff, label="PSI values")
survTerms <- processSurvTerms(clinical, censoring="right",
                              event="days_to_death", timeStart="days_to_death", 
                              group=group)

require("survival")
surv <- survfit(survTerms)
pvalue <- testSurvival(survTerms)
plotSurvivalCurves(surv, pvalue=pvalue)

## ----plot transcripts----------------------------------------------------
parsed <- parseSplicingEvent(event)
info <- queryEnsemblByEvent(event, species="human", assembly="hg19")
plotTranscripts(info, parsed$pos[[1]])

## ----plot proteins-------------------------------------------------------
parsed <- parseSplicingEvent(event)
info <- queryEnsemblByEvent(event, species="human", assembly="hg19")

# Some of these transcripts have no corresponding protein
proteins <- info$Transcript$Translation$id
protein <- proteins[[6]]

# Convert protein identifier from ENSEMBL to UniProt
uniprot <- ensemblToUniprot(protein)
uniprot
uniprot <- uniprot[[1]]

plotProtein(uniprot)

## ----exploring diff splicing---------------------------------------------
# Filter alternative splicing quantification by positive or negative ER
# expression; match between patient information and samples as such:
erGroups <- getMatchingSamples(groups[c("negative", "positive")], samples, 
                               clinical, match=match)
eventsPSI <- psi[ , unlist(erGroups)]
erGroups  <- rep(names(erGroups), sapply(erGroups, length))

diff <- diffAnalyses(eventsPSI, erGroups)
# View(diff)

## ----survival analysis on filtered events, results="hide"----------------
# Filter statistically significant events as you see fit
filter <- diff$`Wilcoxon p-value (BH adjusted)` <= 0.05
filter <- filter[!is.na(filter)]
events <- rownames(diff[filter, ])

# Assign alternative splicing quantification to patients based on their samples
clinicalPSI <- getPSIperPatient(psi, match, clinical)

# Get time for all survival analyses of interest (the same for all, so this is
# faster)
daysToDeath <- "days_to_death"
survTime <- getColumnsTime(clinical, event=daysToDeath, timeStart=daysToDeath)

# Prepare progress bar
min <- 1
max <- nrow(clinicalPSI)
pb <- txtProgressBar(min, max, "Calculating optimal PSI cut-off", style=3)

# Prepare empty vectors where information will be stored (faster)
optimalCutoff <- rep(NA, max)
optimalPvalue <- rep(NA, max)

# Retrieve the optimal PSI cut-off and respective p-value for multiple events
for (row in min:max) {
    singlePSI <- as.numeric(clinicalPSI[row, ])
    opt <- optimalPSIcutoff(clinical, singlePSI, censoring="right", 
                            event=daysToDeath, timeStart=daysToDeath,
                            survTime=survTime)
    optimalCutoff[row] <- opt$par # Optimal exon inclusion level
    optimalPvalue[row] <- opt$value # Respective p-value
    setTxtProgressBar(pb, row)
}

# Bind columns to differential splicing analysis if desired
diff <- cbind(diff, optimalCutoff, optimalPvalue)
# View(diff)