### R code from vignette source 'Introduction_to_MutationalPatterns.Rnw'
### Encoding: UTF-8

###################################################
### code chunk number 1: style
###################################################
BiocStyle::latex(use.unsrturl=FALSE)


###################################################
### code chunk number 2: options
###################################################
options(width=96)
library(ggplot2)


###################################################
### code chunk number 3: loading_reference_data
###################################################
library(BSgenome)
head(available.genomes())


###################################################
### code chunk number 4: loading_reference_data
###################################################
ref_genome <- "BSgenome.Hsapiens.UCSC.hg19"
library(ref_genome, character.only = TRUE)


###################################################
### code chunk number 5: load_package
###################################################
library(MutationalPatterns)


###################################################
### code chunk number 6: locate_files
###################################################
vcf_files <- list.files(system.file("extdata", package="MutationalPatterns"),
                        pattern = ".vcf", full.names = TRUE)


###################################################
### code chunk number 7: set_sample_names
###################################################
sample_names <- c(
    "colon1", "colon2", "colon3",
    "intestine1", "intestine2", "intestine3",
    "liver1", "liver2", "liver3")


###################################################
### code chunk number 8: read_vcfs_as_granges
###################################################
vcfs <- read_vcfs_as_granges(vcf_files, sample_names, ref_genome)
summary(vcfs)


###################################################
### code chunk number 9: store_tissue_variable
###################################################
tissue <- c(rep("colon", 3), rep("intestine", 3), rep("liver", 3))


###################################################
### code chunk number 10: mutations_from_vcf
###################################################
muts = mutations_from_vcf(vcfs[[1]])
head(muts, 12)


###################################################
### code chunk number 11: mutation_types
###################################################
types = mutation_types(vcfs[[1]])
head(types, 12)


###################################################
### code chunk number 12: mutation_context
###################################################
context = mutation_context(vcfs[[1]], ref_genome)
head(context, 12)


###################################################
### code chunk number 13: type_context
###################################################
type_context = type_context(vcfs[[1]], ref_genome)
head(type_context$types, 12)
head(type_context$context, 12)


###################################################
### code chunk number 14: mut_type_occurrences
###################################################
type_occurrences <- mut_type_occurrences(vcfs, ref_genome)
type_occurrences


###################################################
### code chunk number 15: plot_spectrum
###################################################
p1 = plot_spectrum(type_occurrences)


###################################################
### code chunk number 16: plot_spectrum_2
###################################################
p2 = plot_spectrum(type_occurrences, CT = TRUE)


###################################################
### code chunk number 17: plot_spectrum_3
###################################################
p3 = plot_spectrum(type_occurrences, CT = TRUE, legend = FALSE)


###################################################
### code chunk number 18: combine_plot_spectrum_noeval (eval = FALSE)
###################################################
## library("gridExtra")
## grid.arrange(p1, p2, p3, ncol=3, widths=c(3,3,1.75))


###################################################
### code chunk number 19: combine_plot_spectrum
###################################################
library("gridExtra")
ggsave("combine_plot_spectrum.pdf",
       grid.arrange(p1, p2, p3, ncol=3, widths=c(3,3,1.75)),
       width=10,
       height=3)


###################################################
### code chunk number 20: plot_spectrum_4
###################################################
p4 <- plot_spectrum(type_occurrences, by = tissue, CT = TRUE, legend = TRUE)


###################################################
### code chunk number 21: plot_spectrum_5
###################################################
palette <- c("pink", "orange", "blue", "lightblue", "green", "red", "purple")
p5 <- plot_spectrum(type_occurrences, CT = TRUE, legend = TRUE, colors = palette)


###################################################
### code chunk number 22: combine_plot_spectrum_2_noeval
###################################################
ggsave("combine_plot_spectrum_2.pdf", 
       grid.arrange(p4, p5, ncol=2, widths=c(4,2.3)), 
       width=10, 
       height=3)


###################################################
### code chunk number 23: combine_plot_spectrum_2 (eval = FALSE)
###################################################
## grid.arrange(p4, p5, ncol=2, widths=c(4,2.3))


###################################################
### code chunk number 24: mut_matrix
###################################################
mut_mat <- mut_matrix(vcf_list = vcfs, ref_genome = ref_genome)


###################################################
### code chunk number 25: plot_96_profile_2
###################################################
plot_96_profile(mut_mat[,c(1,4,7)])


###################################################
### code chunk number 26: psuedo_count
###################################################
mut_mat = mut_mat + 0.0001


###################################################
### code chunk number 27: use_nmf
###################################################
library("NMF")
estimate = nmf(mut_mat, rank=2:5, method="brunet", nrun=100, seed=123456)


###################################################
### code chunk number 28: estimate_rank
###################################################
plot(estimate)


###################################################
### code chunk number 29: extract_signatures
###################################################
nmf_res <- extract_signatures(mut_mat, rank = 2)


###################################################
### code chunk number 30: add_column_names
###################################################
colnames(nmf_res$signatures) <- c("Signature A", "Signature B")


###################################################
### code chunk number 31: plot_96_profile
###################################################
plot_96_profile(nmf_res$signatures)


###################################################
### code chunk number 32: plot_contribution
###################################################
rownames(nmf_res$contribution) <- c("Signature A", "Signature B")
pc1 <- plot_contribution(nmf_res$contribution, nmf_res$signature, mode = "relative")


###################################################
### code chunk number 33: plot_contribution_2
###################################################
pc2 <- plot_contribution(nmf_res$contribution, nmf_res$signature, mode = "absolute")
grid.arrange(pc1, pc2)


###################################################
### code chunk number 34: plot_contribution_3
###################################################
pc3 <- plot_contribution(nmf_res$contribution, nmf_res$signature, 
                         mode = "absolute", index = c(1,2))


###################################################
### code chunk number 35: plot_contribution_4
###################################################
pc4 <- plot_contribution(nmf_res$contribution, nmf_res$signature,
                         mode = "absolute", coord_flip = TRUE)
grid.arrange(pc3, pc4, ncol=2)


###################################################
### code chunk number 36: plot_compare_profiles
###################################################
plot_compare_profiles(mut_mat[,1], 
                        nmf_res$reconstructed[,1], 
                        profile_names = c("Original", "Reconstructed"))


###################################################
### code chunk number 37: download_cancer_signatures
###################################################
sp_url <- paste("http://cancer.sanger.ac.uk/cancergenome/assets/",
                "signatures_probabilities.txt", sep = "")

cancer_signatures = read.table(sp_url, sep = "\t", header = TRUE)
# Reorder (to make the order of the trinucleotide changes the same)
cancer_signatures = cancer_signatures[order(cancer_signatures[,1]),]
# Only signatures in matrix
cancer_signatures = as.matrix(cancer_signatures[,4:33])


###################################################
### code chunk number 38: fit_to_signatures
###################################################
fit_res <- fit_to_signatures(mut_mat, cancer_signatures)


###################################################
### code chunk number 39: plot_contribution_3_noeval (eval = FALSE)
###################################################
## # select signatures with some contribution
## select <- which(rowSums(fit_res$contribution) > 0)
## # plot contribution
## plot_contribution ( fit_res$contribution[select,],
##                     cancer_signatures[,select],
##                     coord_flip = FALSE,
##                     mode = "absolute" )


###################################################
### code chunk number 40: plot_contribution_3
###################################################
# select signatures with some contribution
select <- which(rowSums(fit_res$contribution) > 0)
# plot contribution
ggsave("plot_contribution_3.pdf",
       plot_contribution(fit_res$contribution[select,],
                         cancer_signatures[,select],
                         coord_flip = FALSE,
                         mode = "absolute"),
       width=9,
       height=5)


###################################################
### code chunk number 41: plot_compare_profiles_2
###################################################
plot_compare_profiles ( mut_mat[,1], fit_res$reconstructed[,1],
                        profile_names = c("Original", "Reconstructed") )


###################################################
### code chunk number 42: get_genes
###################################################
# For example get known genes table from UCSC for hg19 using 
# biocLite("TxDb.Hsapiens.UCSC.hg19.knownGene")
library("TxDb.Hsapiens.UCSC.hg19.knownGene")
genes_hg19 <- genes(TxDb.Hsapiens.UCSC.hg19.knownGene)


###################################################
### code chunk number 43: strand_from_vcf
###################################################
strand = strand_from_vcf(vcfs[[1]], genes_hg19)
head(strand, 10)


###################################################
### code chunk number 44: mut_matrix_stranded
###################################################
mut_mat_s <- mut_matrix_stranded(vcfs, ref_genome, genes_hg19)
head(mut_mat_s, 10)


###################################################
### code chunk number 45: strand_occurrences
###################################################
strand_counts <- strand_occurrences(mut_mat_s, by=tissue)
head(strand_counts)


###################################################
### code chunk number 46: plot_strand
###################################################
ps1 <- plot_strand(strand_counts, mode = "relative")


###################################################
### code chunk number 47: strand_bias_test
###################################################
strand_bias <- strand_bias_test(strand_counts)


###################################################
### code chunk number 48: plot_strand_bias
###################################################
ps2 <- plot_strand_bias(strand_bias)
grid.arrange(ps1, ps2)


###################################################
### code chunk number 49: extract_signatures
###################################################
nmf_res_strand <- extract_signatures(mut_mat_s, rank = 2)

# Provide signature names
colnames(nmf_res_strand$signatures) <- c("Signature A", "Signature B")


###################################################
### code chunk number 50: plot_192
###################################################
a <- plot_192_profile(nmf_res_strand$signatures)


###################################################
### code chunk number 51: plot_strand_bias
###################################################
b <- plot_signature_strand_bias(nmf_res_strand$signatures)


###################################################
### code chunk number 52: plot_192_profile_noeval (eval = FALSE)
###################################################
## grid.arrange(a, b, ncol=2, widths=c(5,2))


###################################################
### code chunk number 53: plot_192_profile
###################################################
ggsave("plot_192_profile.pdf", 
       grid.arrange(a, b, ncol=2, widths=c(5,2)),
       width = 10,
       height = 5)


###################################################
### code chunk number 54: plot_rainfall_noeval (eval = FALSE)
###################################################
## # Define autosomal chromosomes
## chromosomes <- seqnames(get(ref_genome))[1:22]
## 
## # Make a rainfall plot
## plot_rainfall ( vcfs[[1]], title = names(vcfs[1]),
##                 chromosomes = chromosomes, cex = 1.5, ylim = 1e+09 )


###################################################
### code chunk number 55: plot_rainfall
###################################################
# Define autosomal chromosomes
chromosomes <- seqnames(get(ref_genome))[1:22]

# Make a rainfall plot
ggsave("plot_rainfall.pdf",
       plot_rainfall(vcfs[[1]], title = names(vcfs[1]),
                     chromosomes = chromosomes, cex = 1.5, ylim = 1e+09),
       width=9,
       height=3)


###################################################
### code chunk number 56: plot_rainfall_2
###################################################
chromosomes <- seqnames(get(ref_genome))[1]
plot_rainfall ( vcfs[[1]], title = names(vcfs[1]),
                chromosomes = chromosomes[1], cex = 2, ylim = 1e+09 )


###################################################
### code chunk number 57: install_biomaRt (eval = FALSE)
###################################################
## source("https://bioconductor.org/biocLite.R")
## biocLite("biomaRt")


###################################################
### code chunk number 58: load_biomart
###################################################
library(biomaRt)


###################################################
### code chunk number 59: download_using_biomaRt
###################################################
# regulatory <- useEnsembl(biomart="regulation",
#                          dataset="hsapiens_regulatory_feature",
#                          GRCh = 37)

## Download the regulatory CTCF binding sites and convert them to
## a GRanges object.
# CTCF <- getBM(attributes = c('chromosome_name',
#                             'chromosome_start',
#                             'chromosome_end',
#                             'feature_type_name',
#                             'cell_type_name'),
#              filters = "regulatory_feature_type_name", 
#              values = "CTCF Binding Site", 
#              mart = regulatory)
#
# CTCF_g <- reduce(GRanges(CTCF$chromosome_name,
#                 IRanges(CTCF$chromosome_start,
#                 CTCF$chromosome_end)))
CTCF_g <- readRDS(system.file("states/CTCF_g_data.rds",
                    package="MutationalPatterns"))

## Download the promoter regions and convert them to a GRanges object.

# promoter = getBM(attributes = c('chromosome_name', 'chromosome_start',
#                                 'chromosome_end', 'feature_type_name'),
#                  filters = "regulatory_feature_type_name", 
#                  values = "Promoter", 
#                  mart = regulatory)
# promoter_g = reduce(GRanges(promoter$chromosome_name,
#                     IRanges(promoter$chromosome_start,
#                             promoter$chromosome_end)))
promoter_g <- readRDS(system.file("states/promoter_g_data.rds",
                        package="MutationalPatterns"))

## Download the promoter flanking regions and convert them to a GRanges object.

# flanking = getBM(attributes = c('chromosome_name',
#                                 'chromosome_start',
#                                 'chromosome_end',
#                                 'feature_type_name'),
#                  filters = "regulatory_feature_type_name", 
#                  values = "Promoter Flanking Region", 
#                  mart = regulatory)
# flanking_g = reduce(GRanges(
#                        flanking$chromosome_name,
#                        IRanges(flanking$chromosome_start,
#                        flanking$chromosome_end)))

flanking_g <- readRDS(system.file("states/promoter_flanking_g_data.rds",
                                    package="MutationalPatterns"))


###################################################
### code chunk number 60: combine_genomic_regions
###################################################
regions <- GRangesList(promoter_g, flanking_g, CTCF_g)

names(regions) <- c("Promoter", "Promoter flanking", "CTCF")


###################################################
### code chunk number 61: combine_genomic_regions_2
###################################################
seqlevelsStyle(regions) <- "UCSC"


###################################################
### code chunk number 62: download_bed_data
###################################################
## Get the filename with surveyed/callable regions
surveyed_file <- list.files(system.file("extdata",
                            package = "MutationalPatterns"),
                            pattern = ".bed",
                            full.names = TRUE)

## Import the file using rtracklayer and use the UCSC naming standard
library(rtracklayer)
surveyed <- import(surveyed_file)
seqlevelsStyle(surveyed) <- "UCSC"

## For this example we use the same surveyed file for each sample.
surveyed_list <- rep(list(surveyed), 9)


###################################################
### code chunk number 63: genomic_distribution
###################################################
## Calculate the number of observed and expected number of mutations in
## each genomic regions for each sample.
distr <- genomic_distribution(vcfs, surveyed_list, regions)


###################################################
### code chunk number 64: enrichment_depletion_test
###################################################
## Perform the enrichment/depletion test by tissue type.
distr_test <- enrichment_depletion_test(distr, by = tissue)
head(distr_test)

## Or without specifying the 'by' parameter.
distr_test2 <- enrichment_depletion_test(distr)
head(distr_test2)


###################################################
### code chunk number 65: plot_enrichment_depletion
###################################################
plot_enrichment_depletion(distr_test)


###################################################
### code chunk number 66: sessionInfo
###################################################
toLatex(sessionInfo())