## ----library_install, message=FALSE, cache=FALSE, eval=FALSE---------------
#  source("http://bioconductor.org/biocLite.R")
#  biocLite("GenomicScores")

## ----library_upload, message=FALSE, cache=FALSE----------------------------
library(GenomicScores)

## ----retrieve1, message=FALSE, cache=FALSE---------------------------------
library(phastCons100way.UCSC.hg19)
library(GenomicRanges)
gsco <- phastCons100way.UCSC.hg19

## --------------------------------------------------------------------------
scores(gsco, GRanges(seqnames="chr7", IRanges(start=117232380, width=1)))

## ---- echo=FALSE, eval=FALSE-----------------------------------------------
#  gsco

## ---- echo=FALSE-----------------------------------------------------------
avgs <- readRDS(system.file("extdata", "avgs.rds", package="GenomicScores"))

## ----retrieve2, message=FALSE, cache=FALSE, eval=FALSE---------------------
#  availableGScores()

## ---- echo=FALSE-----------------------------------------------------------
avgs

## ----retrieve3, message=FALSE, cache=FALSE, eval=FALSE---------------------
#  gsco <- getGScores("phastCons100way.UCSC.hg19")

## ----retrieve4, message=FALSE, cache=FALSE---------------------------------
scores(gsco, GRanges(seqnames="chr7", IRanges(start=117232380, width=1)))

## --------------------------------------------------------------------------
library(VariantAnnotation)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)

## --------------------------------------------------------------------------
fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
vcf <- readVcf(fl, "hg19")
seqlevelsStyle(vcf)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
seqlevelsStyle(txdb)

## --------------------------------------------------------------------------
seqlevelsStyle(vcf) <- seqlevelsStyle(txdb)
rd <- rowRanges(vcf)
rd[1:3]

## --------------------------------------------------------------------------
loc <- locateVariants(rd, txdb, AllVariants())
loc[1:3]
table(loc$LOCATION)

## --------------------------------------------------------------------------
loc$PHASTCONS <- scores(gsco, loc, scores.only=TRUE)
loc[1:3]

## ----plot1, fig.cap = "Distribution of phastCons conservation scores in variants across different annotated regions. Diamonds indicate mean values.", echo = FALSE, fig.height=5, fig.wide = TRUE, echo=TRUE----
x <- split(loc$PHASTCONS, loc$LOCATION)
mask <- elementNROWS(x) > 0
boxplot(x[mask], ylab="phastCons score")
points(1:length(x[mask])+0.25, sapply(x[mask], mean, na.rm=TRUE), pch=23, bg="black")

## ---- echo=FALSE, eval=FALSE-----------------------------------------------
#  qfun(gsco)
#  dqfun(gsco)

## ----showpositions, message=FALSE, cache=FALSE-----------------------------
origpcscoCDS <- readRDS(system.file("extdata", "origphastCons100wayhg19CDS.rds", package="GenomicScores"))
origpcscoCDS

length(unique(origpcscoCDS$score))

## --------------------------------------------------------------------------
numDecimals <- function(x) {
  spl <- strsplit(as.character(x+1), "\\.")
  spl <- sapply(spl, "[", 2)
  spl[is.na(spl)] <- ""
  nchar(spl)
}

nd1 <- numDecimals(origpcscoCDS$score)
table(nd1)

## ----showpositions2, message=FALSE, cache=FALSE----------------------------
origpcsco3UTRs <- readRDS(system.file("extdata", "origphastCons100wayhg193UTR.rds", package="GenomicScores"))
origpcsco3UTRs

length(table(origpcsco3UTRs$score))

nd2 <- numDecimals(origpcsco3UTRs$score)
table(nd2)

## --------------------------------------------------------------------------
pkgpcscoCDS <- scores(gsco, origpcscoCDS, scores.only=TRUE)
pkgpcsco3UTRs <- scores(gsco, origpcsco3UTRs, scores.only=TRUE)

## ----plot2, fig.cap = "Original and lossy-compressed phastCons scores. Top panels: comparison of the distribution of values. Bottom panels: comparison of the cumulative distribution", echo = FALSE, fig.height=12, fig.wide = TRUE----
par(mfrow=c(2, 2))
plot(origpcscoCDS$score, jitter(pkgpcscoCDS), pch=19, cex=1,
     xaxt="n", yaxt="n", xlab="Original phastCons scores (CDS)",
     ylab="Compressed phastCons scores (CDS)")
axis(1, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1)
axis(2, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1)
abline(h=seq(0, 1, by=0.1), v=seq(0, 1, by=0.1), lty=3, col="gray")
abline(0, 1)
plot(origpcsco3UTRs$score, jitter(pkgpcsco3UTRs), pch=19, cex=1,
     xaxt="n", yaxt="n", xlab="Original phastCons scores (3' UTR)",
     ylab="Compressed phastCons scores (3' UTR)")
axis(1, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1)
axis(2, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1)
abline(h=seq(0, 1, by=0.1), v=seq(0, 1, by=0.1), lty=3, col="gray")
abline(0, 1)
ForigCDS <- ecdf(origpcscoCDS$score)
FpkgCDS <- ecdf(pkgpcscoCDS)
plot(sort(origpcscoCDS$score), ForigCDS(sort(origpcscoCDS$score)), xaxt="n", yaxt="n",
     pch=".", cex=4, xlab="phastCons scores (CDS)", ylab="F(x)", ylim=c(0, 1))
axis(1, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1)
axis(2, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1)
abline(h=seq(0, 1, by=0.1), v=seq(0, 1, by=0.1), lty=3, col="gray")
points(sort(pkgpcscoCDS), FpkgCDS(sort(pkgpcscoCDS)), pch=19, cex=1)
legend("topleft", c("Original score", "Compressed score"), pch=c(46, 19),
       pt.cex=c(4, 1), inset=0.01, bg="white")
Forig3UTRs <- ecdf(origpcsco3UTRs$score)
Fpkg3UTRs <- ecdf(pkgpcsco3UTRs)
plot(sort(origpcsco3UTRs$score), Forig3UTRs(sort(origpcsco3UTRs$score)), xaxt="n", yaxt="n",
     pch=".", cex=4, xlab="phastCons scores (3'UTR)", ylab="F(x)", ylim=c(0, 1))
axis(1, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1)
axis(2, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1)
abline(h=seq(0, 1, by=0.1), v=seq(0, 1, by=0.1), lty=3, col="gray")
points(sort(pkgpcsco3UTRs), Fpkg3UTRs(sort(pkgpcsco3UTRs)), pch=19, cex=1)
legend("topleft", c("Original score", "Compressed score"), pch=c(46, 19),
       pt.cex=c(4, 1), inset=0.01, bg="white")


## ----session_info, cache=FALSE---------------------------------------------
sessionInfo()

