### R code from vignette source 'vignettes/GWASTools/inst/doc/DataCleaning.Rnw'
### Encoding: UTF-8

###################################################
### code chunk number 1: DataCleaning.Rnw:122-151
###################################################
library(GWASTools)
library(GWASdata)
# Load the SNP annotation (simple data frame)
data(illumina_snp_annot)
# Create a SnpAnnotationDataFrame
snpAnnot <- SnpAnnotationDataFrame(illumina_snp_annot)
# names of columns
varLabels(snpAnnot)
# data
head(pData(snpAnnot))
# Add metadata to describe the columns
meta <- varMetadata(snpAnnot)
meta[c("snpID", "chromosome", "position", "rsID", "alleleA", "alleleB",
  "BeadSetID", "IntensityOnly", "tAA", "tAB", "tBB", "rAA", "rAB", "rBB"), 
  "labelDescription"] <- c("unique integer ID for SNPs",                      
  paste("integer code for chromosome: 1:22=autosomes,",
   "23=X, 24=pseudoautosomal, 25=Y, 26=Mitochondrial, 27=Unknown"), 
  "base pair position on chromosome (build 36)",
  "RS identifier", 
  "alelleA", "alleleB", 
  "BeadSet ID from Illumina",
  "1=no genotypes were attempted for this assay",
  "mean theta for AA cluster",
  "mean theta for AB cluster",
  "mean theta for BB cluster",
  "mean R for AA cluster",
  "mean R for AB cluster",
  "mean R for BB cluster")
varMetadata(snpAnnot) <- meta


###################################################
### code chunk number 2: DataCleaning.Rnw:157-166
###################################################
snpID <- snpAnnot$snpID
snpID <- getSnpID(snpAnnot)
chrom <- snpAnnot[["chromosome"]]
chrom <- getChromosome(snpAnnot)
table(chrom)
chrom <- getChromosome(snpAnnot, char=TRUE)
table(chrom)
position <- getPosition(snpAnnot)
rsID <- getVariable(snpAnnot, "rsID")


###################################################
### code chunk number 3: DataCleaning.Rnw:189-199
###################################################
tmp <- snpAnnot[,c("snpID", "chromosome", "position")]
snp <- getAnnotation(tmp)
snp$flag <- sample(c(TRUE, FALSE), nrow(snp), replace=TRUE)
pData(tmp) <- snp
meta <- getMetadata(tmp)
meta["flag", "labelDescription"] <- "flag"
varMetadata(tmp) <- meta
getVariableNames(tmp)
varLabels(tmp)[4] <- "FLAG"
rm(tmp)


###################################################
### code chunk number 4: DataCleaning.Rnw:227-254
###################################################
# Load the scan annotation (simple data frame)
data(illumina_scan_annot)
# Create a ScanAnnotationDataFrame
scanAnnot <- ScanAnnotationDataFrame(illumina_scan_annot)
# names of columns
varLabels(scanAnnot)
# data
head(pData(scanAnnot))
# Add metadata to describe the columns
meta <- varMetadata(scanAnnot)
meta[c("scanID", "subjectID", "family", "father", "mother",
  "CoriellID", "race", "sex", "status", "genoRunID", "plate",
  "batch", "file"), "labelDescription"] <- 
   c("unique integer ID for scans", 
  "subject identifier (may have multiple scans)",
  "family identifier", 
  "father identifier as subjectID",
  "mother identifier as subjectID",
  "Coriell subject identifier",
  "HapMap population group",
  "sex coded as M=male and F=female",
  "simulated case/control status" ,
  "genotyping instance identifier",
  "plate containing samples processed together for genotyping chemistry",
  "simulated genotyping batch",
  "raw data file")
varMetadata(scanAnnot) <- meta


###################################################
### code chunk number 5: DataCleaning.Rnw:261-266
###################################################
scanID <- scanAnnot$scanID
scanID <- getScanID(scanAnnot)
sex <- scanAnnot[["sex"]]
sex <- getSex(scanAnnot)
subjectID <- getVariable(scanAnnot, "subjectID")


###################################################
### code chunk number 6: DataCleaning.Rnw:329-339
###################################################
geno.nc.file <- "tmp.geno.nc"
# get snp annotation data frame for function
snp <- illumina_snp_annot[,c("snpID", "chromosome", "position")]
ncdfCreate(ncdf.filename = geno.nc.file,
       snp.annotation = snp,
       variables = c("genotype"),
       array.name = "Human1Mv_C",
       genome.build = "36",
       n.samples = 3,
       precision = "single")


###################################################
### code chunk number 7: DataCleaning.Rnw:348-354
###################################################
# first 3 samples only
scan.annotation <- illumina_scan_annot[1:3, c("scanID", "genoRunID", "file")]
names(scan.annotation) <- c("scanID", "scanName", "file")
# indicate which column of SNP annotation is referenced in data files
snp.annotation <- illumina_snp_annot[,c("snpID", "rsID")]
names(snp.annotation) <- c("snpID", "snpName")


###################################################
### code chunk number 8: DataCleaning.Rnw:361-366
###################################################
col.nums <- as.integer(c(1,2,12,13))
names(col.nums) <- c("snp", "sample", "a1", "a2")
# Define a path to the raw data files which are read by 
#   ncdfAddData to access the raw genotypic data
path <- system.file("extdata", "illumina_raw_data", package="GWASdata")


###################################################
### code chunk number 9: DataCleaning.Rnw:375-406
###################################################
diag.geno.file <- "diag.geno.RData"
diag.geno <- ncdfAddData(path = path,
  ncdf.filename = geno.nc.file, 
  snp.annotation = snp.annotation,
  scan.annotation = scan.annotation, 
  sep.type = ",",
  skip.num = 11,
  col.total = 21,
  col.nums = col.nums,
  scan.name.in.file = 1,
  scan.start.index = 1,
  diagnostics.filename = diag.geno.file,
  verbose = FALSE)
# Look at the values included in the "diag.geno" object which holds 
#   all output from the function call
names(diag.geno)
# `read.file' is a vector indicating whether (1) or not (0) each file 
#   specified in the `files' argument was read successfully
table(diag.geno$read.file)
# `row.num' is a vector of the number of rows read from each file
table(diag.geno$row.num) 
# `sample.match' is a vector indicating whether (1) or not (0) 
#   the sample name inside the raw text file matches that in the 
#   sample annotation data.frame
table(diag.geno$sample.match)
# `snp.chk' is a vector indicating whether (1) or not (0) 
#   the raw text file has the expected set of SNP names
table(diag.geno$snp.chk)
# `chk' is a vector indicating whether (1) or not (0) all previous
#   checks were successful and the data were written to the NetCDF file
table(diag.geno$chk)


###################################################
### code chunk number 10: DataCleaning.Rnw:412-435
###################################################
check.geno.file <- "check.geno.RData"
check.geno <- ncdfCheckGenotype(path = path,
  ncdf.filename = geno.nc.file, 
  snp.annotation = snp.annotation,
  scan.annotation = scan.annotation, 
  sep.type = ",",
  skip.num = 11,
  col.total = 21,
  col.nums = col.nums,
  scan.name.in.file = 1,
  check.scan.index = 1:3,
  n.scans.loaded = 3,
  diagnostics.filename = check.geno.file,
  verbose = FALSE)
# Look at the values included in the "check.geno" object which holds 
#   all output from the function call
names(check.geno)
# 'snp.order' is a vector indicating whether (1) or not (0) the snp ids 
#   are in the same order in each file.
table(check.geno$snp.order)
# 'geno.chk' is a vector indicating whether (1) or not (0) the genotypes 
#   in the netCDF match the text file
table(check.geno$geno.chk)


###################################################
### code chunk number 11: DataCleaning.Rnw:445-455
###################################################
(nc <- NcdfGenotypeReader(geno.nc.file))
nscan(nc)
nsnp(nc)
head(getScanID(nc))
head(getSnpID(nc))
head(getChromosome(nc))
head(getPosition(nc))
# genotypes for the first 3 samples and  the first 5 SNPs
getGenotype(nc, snp=c(1,5), scan=c(1,3))
close(nc)


###################################################
### code chunk number 12: DataCleaning.Rnw:484-494
###################################################
qxy.nc.file <- "tmp.qxy.nc"
# get snp annotation data frame for function
snp <- illumina_snp_annot[,c("snpID", "chromosome", "position")]
ncdfCreate(ncdf.filename = qxy.nc.file,
       snp.annotation = snp,
       variables = c("quality","X","Y"),
       array.name = "Human1Mv_C",
       genome.build = "36",
       n.samples = 3,
       precision = "single")


###################################################
### code chunk number 13: DataCleaning.Rnw:501-509
###################################################
# first 3 samples only
scan.annotation <- illumina_scan_annot[1:3, c("scanID", "genoRunID", "file")]
names(scan.annotation) <- c("scanID", "scanName", "file")
# indicate which column of SNP annotation is referenced in data files
snp.annotation <- illumina_snp_annot[,c("snpID", "rsID")]
names(snp.annotation) <- c("snpID", "snpName")
col.nums <- as.integer(c(1,2,5,16,17))
names(col.nums) <- c("snp", "sample", "qs", "x", "y")


###################################################
### code chunk number 14: DataCleaning.Rnw:517-530
###################################################
diag.qxy.file <- "diag.qxy.RData"
diag.qxy <- ncdfAddData(path = path,
  ncdf.filename = qxy.nc.file, 
  snp.annotation = snp.annotation,
  scan.annotation = scan.annotation, 
  sep.type = ",",
  skip.num = 11,
  col.total = 21,
  col.nums = col.nums,
  scan.name.in.file = 1,
  scan.start.index = 1,
  diagnostics.filename = diag.qxy.file,
  verbose = FALSE)


###################################################
### code chunk number 15: DataCleaning.Rnw:536-554
###################################################
scan.annotation <- illumina_scan_annot[1:5,
  c("scanID", "genoRunID", "file")]
names(scan.annotation) <- c("scanID", "scanName", "file")
check.qxy.file <- "check.qxy.RData"
check.qxy <- ncdfCheckIntensity(path = path,
  intenpath = path,
  ncdf.filename = qxy.nc.file, 
  snp.annotation = snp.annotation,
  scan.annotation = scan.annotation, 
  sep.type = ",",
  skip.num = 11,
  col.total = 21,
  col.nums = col.nums,
  scan.name.in.file = 1,
  check.scan.index = 1:3,
  n.scans.loaded = 3,
  diagnostics.filename = check.qxy.file,
  verbose = FALSE)


###################################################
### code chunk number 16: DataCleaning.Rnw:563-577
###################################################
(nc <- NcdfIntensityReader(qxy.nc.file))
nscan(nc)
nsnp(nc)
head(getScanID(nc))
head(getSnpID(nc))
head(getChromosome(nc))
head(getPosition(nc))
# quality score for the first 3 samples and  the first 5 SNPs
getQuality(nc, snp=c(1,5), scan=c(1,3))
# X intensity for the first 3 samples and  the first 5 SNPs
getX(nc, snp=c(1,5), scan=c(1,3))
# Y intensity for the first 3 samples and  the first 5 SNPs
getY(nc, snp=c(1,5), scan=c(1,3))
close(nc)


###################################################
### code chunk number 17: DataCleaning.Rnw:657-667
###################################################
bl.nc.file <- "tmp.bl.nc"
# get snp annotation data frame for function
snp <- illumina_snp_annot[,c("snpID", "chromosome", "position")]
ncdfCreate(ncdf.filename = bl.nc.file,
       snp.annotation = snp,
       variables = c("BAlleleFreq","LogRRatio"),
       array.name = "Human1Mv_C",
       genome.build = "36",
       n.samples = 3,
       precision = "single")


###################################################
### code chunk number 18: DataCleaning.Rnw:673-695
###################################################
# first 3 samples only
scan.annotation <- illumina_scan_annot[1:3, c("scanID", "genoRunID", "file")]
names(scan.annotation) <- c("scanID", "scanName", "file")
# indicate which column of SNP annotation is referenced in data files
snp.annotation <- illumina_snp_annot[,c("snpID", "rsID")]
names(snp.annotation) <- c("snpID", "snpName")
col.nums <- as.integer(c(1,2,20,21))
names(col.nums) <- c("snp", "sample", "ballelefreq", "logrratio")

diag.bl.file <- "diag.bl.RData"
diag.bl <- ncdfAddData(path = path,
  ncdf.filename = bl.nc.file, 
  snp.annotation = snp.annotation,
  scan.annotation = scan.annotation, 
  sep.type = ",",
  skip.num = 11,
  col.total = 21,
  col.nums = col.nums,
  scan.name.in.file = 1,
  scan.start.index = 1,
  diagnostics.filename = diag.bl.file,
  verbose = FALSE)


###################################################
### code chunk number 19: DataCleaning.Rnw:702-706
###################################################
(nc <- NcdfIntensityReader(bl.nc.file))
getBAlleleFreq(nc, snp=c(1,5), scan=c(1,3))
getLogRRatio(nc, snp=c(1,5), scan=c(1,3))
close(nc)


###################################################
### code chunk number 20: DataCleaning.Rnw:710-713
###################################################
file.remove(geno.nc.file, qxy.nc.file, bl.nc.file)
file.remove(diag.geno.file, diag.qxy.file, diag.bl.file)
file.remove(check.geno.file, check.qxy.file)


###################################################
### code chunk number 21: DataCleaning.Rnw:729-738
###################################################
genofile <- system.file("extdata", "illumina_geno.nc", package="GWASdata")
nc <- NcdfGenotypeReader(genofile)
# only NetCDF file
genoData <- GenotypeData(nc)
# with scan annotation
genoData <- GenotypeData(nc, scanAnnot=scanAnnot)
# with scan and SNP annotation
genoData <- GenotypeData(nc, snpAnnot=snpAnnot, scanAnnot=scanAnnot)
genoData


###################################################
### code chunk number 22: DataCleaning.Rnw:745-764
###################################################
nsnp(genoData)
nscan(genoData)
# scan annotation
range(getScanID(genoData))
hasSex(genoData)
table(getSex(genoData))
hasScanVariable(genoData, "subjectID")
head(getScanVariable(genoData, "subjectID"))
getScanVariableNames(genoData)
# snp annotation
range(getSnpID(genoData))
table(getChromosome(genoData, char=TRUE))
head(getPosition(genoData))
hasSnpVariable(genoData, "rsID")
head(getSnpVariable(genoData, "rsID"))
getSnpVariableNames(genoData)
# genotypes
getGenotype(genoData, snp=c(1,5), scan=c(1,5))
close(genoData)


###################################################
### code chunk number 23: DataCleaning.Rnw:771-790
###################################################
# quality score, X and X intensity
qxyfile <- system.file("extdata", "illumina_qxy.nc", package="GWASdata")
nc <- NcdfIntensityReader(qxyfile)
qxyData <- IntensityData(NcdfIntensityReader(qxyfile), 
                         snpAnnot=snpAnnot, scanAnnot=scanAnnot)
qxyData
getQuality(qxyData, snp=c(1,5), scan=c(1,5))
getX(qxyData, snp=c(1,5), scan=c(1,5))
getY(qxyData, snp=c(1,5), scan=c(1,5))
close(qxyData)
# BAF/LRR
blfile <- system.file("extdata", "illumina_bl.nc", package="GWASdata")
nc <- NcdfIntensityReader(blfile)
blData <- IntensityData(NcdfIntensityReader(blfile), 
                        snpAnnot=snpAnnot, scanAnnot=scanAnnot)
blData
getBAlleleFreq(blData, snp=c(1,5), scan=c(1,5))
getLogRRatio(blData, snp=c(1,5), scan=c(1,5))
close(blData)


###################################################
### code chunk number 24: DataCleaning.Rnw:837-850
###################################################
# open the netCDF file and create a GenotypeData object
ncfile <- system.file("extdata", "illumina_geno.nc", package="GWASdata")
nc <- NcdfGenotypeReader(ncfile)
# sex is required for this function, so we need the scan annotation
genoData <-  GenotypeData(nc, scanAnnot=scanAnnot)
# Calculate the number of missing calls for each snp over all samples 
#     for each sex separately
miss <- missingGenotypeBySnpSex(genoData)
# Examine the results
names(miss)
head(miss$missing.counts)
miss$scans.per.sex
head(miss$missing.fraction)


###################################################
### code chunk number 25: DataCleaning.Rnw:859-866
###################################################
# Make sure ordering matches snp annotation
allequal(snpAnnot$snpID, as.numeric(names(miss$missing.fraction)))
snpAnnot$missing.n1 <- miss$missing.fraction
varMetadata(snpAnnot)["missing.n1", "labelDescription"] <- paste(
  "fraction of genotype calls missing over all samples",
  "except that females are excluded for Y chr SNPs")
summary(snpAnnot$missing.n1)


###################################################
### code chunk number 26: DataCleaning.Rnw:873-876
###################################################
hist(snpAnnot$missing.n1, ylim=c(0,100),
     xlab="SNP missing call rate", 
     main="Missing Call Rate for All Probes")


###################################################
### code chunk number 27: DataCleaning.Rnw:879-890
###################################################
# Find the number of SNPs with every call missing
length(snpAnnot$missing.n1[snpAnnot$missing.n1 == 1])
# Fraction of autosomal SNPs with missing call rate < 0.05
x <- snpAnnot$missing.n1[snpAnnot$chromosome < 23]
length(x[x < 0.05]) / length(x)
# Fraction of X chromosome SNPs with missing call rate < 0.05
x <- snpAnnot$missing.n1[snpAnnot$chromosome == 23]
length(x[x < 0.05]) / length(x)
# Fraction of Y chromosome SNPs with missing call rate < 0.05
x <- snpAnnot$missing.n1[snpAnnot$chromosome == 25]
length(x[x < 0.05]) / length(x)


###################################################
### code chunk number 28: DataCleaning.Rnw:908-922
###################################################
# Want to exclude all SNP probes with 100% missing call rate
# Check on how many SNPs to exclude
sum(snpAnnot$missing.n1 == 1)
# Create a variable that contains the IDs of these SNPs to exclude
snpexcl <- snpAnnot$snpID[snpAnnot$missing.n1 == 1]
length(snpexcl)
# Calculate the missing call rate per sample
miss <- missingGenotypeByScanChrom(genoData, snp.exclude=snpexcl)
names(miss)
head(miss$missing.counts)
head(miss$snps.per.chr)
# Check to make sure that the correct number of SNPs were excluded 
sum(miss$snps.per.chr)
nrow(snpAnnot) - sum(miss$snps.per.chr)


###################################################
### code chunk number 29: DataCleaning.Rnw:929-938
###################################################
head(miss$missing.fraction)
# Check the ordering matches the sample annotation file
allequal(names(miss$missing.fraction), scanAnnot$scanID)
# Add the missing call rates vector to the sample annotation file
scanAnnot$missing.e1 <- miss$missing.fraction
varMetadata(scanAnnot)["missing.e1", "labelDescription"] <- paste(
  "fraction of genotype calls missing over all snps with missing.n1<1",
  "except that Y chr SNPs are excluded for females")
summary(scanAnnot$missing.e1)


###################################################
### code chunk number 30: DataCleaning.Rnw:950-953
###################################################
hist(scanAnnot$missing.e1, 
     xlab="Fraction of missing calls over all probes", 
     main="Histogram of Sample Missing Call Rate for all Samples")


###################################################
### code chunk number 31: DataCleaning.Rnw:956-962
###################################################
# Look at missing.e1 for males
summary(scanAnnot$missing.e1[scanAnnot$sex == "M"])
# Look at missing.e1 for females
summary(scanAnnot$missing.e1[scanAnnot$sex == "F"])
# Number of samples with missing call rate > 5%
sum(scanAnnot$missing.e1 > 0.05)


###################################################
### code chunk number 32: DataCleaning.Rnw:972-994
###################################################
auto <- colnames(miss$missing.counts) %in% 1:22
missa <- rowSums(miss$missing.counts[,auto]) / sum(miss$snps.per.chr[auto])
summary(missa)
missx <- miss$missing.counts[,"X"] / miss$snps.per.chr["X"]
summary(missx)
# check they match sample annotation file
allequal(names(missa), scanAnnot$scanID)
allequal(names(missx), scanAnnot$scanID)
# Add these separate sample missing call rates to the sample 
# annotation
scanAnnot$miss.e1.auto <- missa
scanAnnot$miss.e1.xchr <- missx
# Order scanAnnot by missing.e1 so duplicate subjectIDs 
# with a higher missing rate are marked as duplicates
scanAnnot <- scanAnnot[order(scanAnnot$subjectID, scanAnnot$missing.e1),]
scanAnnot$duplicated <- duplicated(scanAnnot$subjectID)
table(scanAnnot$duplicated, useNA="ifany")
# Put scanAnnot back in scanID order; this is very important!!
scanAnnot <- scanAnnot[order(scanAnnot$scanID),]
allequal(scanAnnot$scanID, sort(scanAnnot$scanID))
varMetadata(scanAnnot)["duplicated", "labelDescription"] <- 
  "TRUE for duplicate scan with higher missing.e1"


###################################################
### code chunk number 33: DataCleaning.Rnw:1011-1021
###################################################
# Find the samples with missing.e1 > .05 and make a vector of 
# scanID to exclude from the calculation
scan.exclude <- scanAnnot$scanID[scanAnnot$missing.e1 > 0.05]
# Call missingGenotypeBySnpSex and save the output
miss <- missingGenotypeBySnpSex(genoData, scan.exclude=scan.exclude)
snpAnnot$missing.n2 <- miss$missing.fraction
varMetadata(snpAnnot)["missing.n2", "labelDescription"] <- paste(
  "fraction of genotype calls missing over all samples with missing.e1<0.05",
  "except that females are excluded for Y chr SNPs")
summary(snpAnnot$missing.n2)


###################################################
### code chunk number 34: DataCleaning.Rnw:1031-1041
###################################################
# Create a vector of the SNPs to exclude.
snpexcl <- snpAnnot$snpID[snpAnnot$missing.n2 >= 0.05]
length(snpexcl)
miss <- missingGenotypeByScanChrom(genoData, snp.exclude=snpexcl)
# Add the missing call rates vector to the sample annotation file
scanAnnot$missing.e2 <- miss$missing.fraction
varMetadata(scanAnnot)["missing.e2", "labelDescription"] <- paste(
  "fraction of genotype calls missing over all snps with missing.n2<0.05",
  "except that Y chr SNPs are excluded for females")
summary(scanAnnot$missing.e2)


###################################################
### code chunk number 35: DataCleaning.Rnw:1047-1050
###################################################
hist(scanAnnot$missing.e2, xlab="Fraction of missing calls over all probes 
     with missing call rate < 0.05", 
     main="Histogram of Sample Missing Call Rate for all Samples")


###################################################
### code chunk number 36: DataCleaning.Rnw:1063-1067
###################################################
varLabels(scanAnnot)
# Check how many batches exist and how many samples are in each batch
length(unique(scanAnnot$batch))
table(scanAnnot$batch, useNA="ifany")


###################################################
### code chunk number 37: DataCleaning.Rnw:1070-1074
###################################################
# Plot the distribution of the number of samples per batch.
barplot(table(scanAnnot$batch), 
        ylab="Number of Samples", xlab="Batch", 
        main="Distribution of Samples per Batch")


###################################################
### code chunk number 38: DataCleaning.Rnw:1077-1086
###################################################
# Examine the mean missing call rate per batch for all SNPs
batches <- unique(scanAnnot$batch)
bmiss <- rep(NA,length(batches)); names(bmiss) <- batches
bn <- rep(NA,length(batches)); names(bn) <- batches
for(i in 1:length(batches)) {
  x <- scanAnnot$missing.e1[is.element(scanAnnot$batch, batches[i])]
  bmiss[i] <- mean(x)
  bn[i] <- length(x)
}


###################################################
### code chunk number 39: DataCleaning.Rnw:1097-1099
###################################################
y <- lm(bmiss ~ bn)
anova(y)


###################################################
### code chunk number 40: DataCleaning.Rnw:1102-1106
###################################################
plot(bn, bmiss, 
 xlab="Number of samples per batch", ylab="Mean missing call rate", 
 main="Mean Missing Call Rate vs\nSamples per Batch")
abline(y$coefficients)


###################################################
### code chunk number 41: DataCleaning.Rnw:1127-1135
###################################################
res <- batchChisqTest(genoData, batchVar="batch", return.by.snp=TRUE)
close(genoData)
# chi-square values for each SNP
dim(res$chisq)
# genomic inflation factor
res$lambda
# average chi-square test statistic for each of the batches
res$mean.chisq


###################################################
### code chunk number 42: DataCleaning.Rnw:1140-1158
###################################################
x <- table(scanAnnot$race, useNA="ifany")
x
x[1] / sum(x)
x[2] / sum(x)
x <- table(scanAnnot$race, scanAnnot$batch)
x
# Run an approximate chi-square test to see if there are ethnic effects
chisq <- chisq.test(x)
chisq$p.value
# Calculate the fraction of samples in each batch that are CEU
batches <- unique(scanAnnot$batch)
eth <- rep(NA,length(batches)); names(eth) <- sort(batches)
for(i in 1:length(batches)){
  x <- scanAnnot$race[is.element(scanAnnot$batch, batches[i])]
  xl <- length(x[x == "CEU"])
  eth[i] <- xl / length(x)
}
allequal(names(eth), names(res$mean.chisq))


###################################################
### code chunk number 43: DataCleaning.Rnw:1161-1168
###################################################
# Plot the average Chi-Square test statistic against the 
#     fraction of samples that are CEU
plot(eth, res$mean.chisq, xlab="Fraction of CEU Samples per Batch", 
  ylab="Average Chi-square Test Statistic",
  main="Fraction of CEU Samples per Batch 
  vs Average Chi-square Test Statistic")
abline(v=mean(eth), lty=2, col="red")


###################################################
### code chunk number 44: DataCleaning.Rnw:1199-1207
###################################################
qxyfile <- system.file("extdata", "illumina_qxy.nc", package="GWASdata")
qualNC <- NcdfIntensityReader(qxyfile)
qualData <- IntensityData(qualNC, scanAnnot=scanAnnot)
genofile <- system.file("extdata", "illumina_geno.nc", package="GWASdata")
genoNC <- NcdfGenotypeReader(genofile)
genoData <- GenotypeData(genoNC, scanAnnot=scanAnnot)
qual.results <- qualityScoreByScan(qualData, genoData)
close(qualData)


###################################################
### code chunk number 45: DataCleaning.Rnw:1214-1216
###################################################
hist(qual.results[,"median.quality"], main="Median Genotype Quality Scores 
  of Samples", xlab="Median Quality")


###################################################
### code chunk number 46: DataCleaning.Rnw:1247-1254
###################################################
blfile <- system.file("extdata", "illumina_bl.nc", package="GWASdata")
blNC <- NcdfIntensityReader(blfile)
blData <- IntensityData(blNC, scanAnnot=scanAnnot)
nbins <- rep(12, 3)
slidingBAF12 <- sdByScanChromWindow(blData, genoData, nbins=nbins)
names(slidingBAF12)
dim(slidingBAF12[["21"]])


###################################################
### code chunk number 47: DataCleaning.Rnw:1262-1265
###################################################
sds.chr <- meanSdByChromWindow(slidingBAF12, scanAnnot$sex)
sds.chr[["21"]]
sds.chr[["X"]]     


###################################################
### code chunk number 48: DataCleaning.Rnw:1271-1275
###################################################
res12bin4sd <- findBAFvariance(sds.chr, slidingBAF12, scanAnnot$sex, 
                               sd.threshold=4)
head(res12bin4sd)
table(res12bin4sd[, "chromosome"])


###################################################
### code chunk number 49: DataCleaning.Rnw:1282-1287
###################################################
scanID <- as.integer(res12bin4sd[, "scanID"])
chrom <- as.integer(res12bin4sd[, "chromosome"])
chrom[res12bin4sd[, "chromosome"] == "X"] <- 23
bincode <- paste("Bin", res12bin4sd[, "bin"], sep = " ")
chromIntensityPlot(blData, scanID, chrom, info=bincode, ideogram=FALSE)


###################################################
### code chunk number 50: DataCleaning.Rnw:1290-1291
###################################################
close(blData)


###################################################
### code chunk number 51: DataCleaning.Rnw:1317-1321
###################################################
miss <- missingGenotypeByScanChrom(genoData)
miss.rate <- t(apply(miss$missing.counts, 1, function(x) {
  x / miss$snps.per.chr}))
miss.rate <- as.data.frame(miss.rate)


###################################################
### code chunk number 52: DataCleaning.Rnw:1325-1328
###################################################
cols <- names(miss.rate) %in% c(1:22, "X", "XY")
boxplot(miss.rate[,cols], main="Missingness by Chromosome", 
  ylab="Proportion Missing", xlab="Chromosome")


###################################################
### code chunk number 53: DataCleaning.Rnw:1331-1334
###################################################
boxplot(miss.rate$X ~ scanAnnot$sex, 
  main="X Chromosome Missingness by Sex", 
  ylab="Proportion Missing")


###################################################
### code chunk number 54: DataCleaning.Rnw:1341-1353
###################################################
# Calculate heterozygosity by scan by chromosome
het.results <- hetByScanChrom(genoData)
close(genoData)
# Ensure heterozygosity results are ordered correctly
allequal(scanAnnot$scanID, rownames(het.results))
# Write autosomal and X chr heterozygosity to sample annot
scanAnnot$het.A <- het.results[,"A"]
scanAnnot$het.X <- het.results[,"X"]
varMetadata(scanAnnot)["het.A", "labelDescription"] <- 
  "fraction of heterozygotes for autosomal SNPs"
varMetadata(scanAnnot)["het.X", "labelDescription"] <- 
  "fraction of heterozygotes for X chromosome SNPs"


###################################################
### code chunk number 55: DataCleaning.Rnw:1361-1363
###################################################
boxplot(scanAnnot$het.A ~ scanAnnot$race, 
  main="Autosomal Heterozygosity")


###################################################
### code chunk number 56: DataCleaning.Rnw:1366-1369
###################################################
female <- scanAnnot$sex == "F"
boxplot(scanAnnot$het.X[female] ~ scanAnnot$race[female], 
  main="X Chromosome Heterozygosity in Females")


###################################################
### code chunk number 57: DataCleaning.Rnw:1406-1411
###################################################
qxyfile <- system.file("extdata", "illumina_qxy.nc", package="GWASdata")
intenNC <- NcdfIntensityReader(qxyfile)
inten.by.chrom <- meanIntensityByScanChrom(intenNC)
close(intenNC)
names(inten.by.chrom)


###################################################
### code chunk number 58: DataCleaning.Rnw:1419-1429
###################################################
mninten <- inten.by.chrom[[1]]  # mean intensities
dim(mninten)
# Check to be sure sample ordering is consistent
allequal(scanAnnot$scanID, rownames(mninten))
# Assign each sex a color
xcol <- rep(NA, nrow(scanAnnot))
xcol[scanAnnot$sex == "M"] <- "blue"
xcol[scanAnnot$sex == "F"] <- "red"
nx <- sum(snpAnnot$chromosome == 23)
ny <- sum(snpAnnot$chromosome == 25)


###################################################
### code chunk number 59: DataCleaning.Rnw:1439-1471
###################################################
#All intensities
x1 <-mninten[,"X"]; y1 <- mninten[,"Y"]
main1 <- "Mean X vs \nMean Y Chromosome Intensity"
#Het on X vs X intensity
x2 <- mninten[,"X"]; y2 <- scanAnnot$het.X
main2 <- "Mean X Chromosome Intensity vs
  Mean X Chromosome Heterozygosity"
# Het on X vs Y intensity
y3 <- mninten[,"Y"]; x3 <- scanAnnot$het.X
main3 <- "Mean X Chromosome Heterozygosity vs
  Mean Y Chromosome Intensity"
# X vs A het
x4 <- scanAnnot$het.A[scanAnnot$sex == "F"]
y4 <- scanAnnot$het.X[scanAnnot$sex == "F"]
main4 <- "Mean Autosomal Heterozygosity vs
  Mean X Chromosome Heterozygosity"
cols <- c("blue","red")
mf <- c("male", "female")
xintenlab <- paste("X intensity (n=", nx, ")", sep="")
yintenlab <- paste("Y intensity (n=", ny, ")", sep="")
pdf("DataCleaning-sex.pdf")
par(mfrow=c(2,2))
plot(x1, y1, xlab=xintenlab, ylab=yintenlab, 
  main=main1, col=xcol, cex.main=0.8)
legend("topright",mf,col=cols,pch=c(1,1))
plot(x2, y2, col=xcol, xlab=xintenlab, 
  ylab="X heterozygosity", main=main2, cex.main=0.8)
plot(x3, y3, col=xcol, ylab=yintenlab, 
  xlab="X heterozygosity", main=main3, cex.main=0.8)
plot(x4,y4, col="red", xlab="Autosomal heterozygosity", 
  ylab="X heterozygosity", main=main4, cex.main=0.8)
dev.off()


###################################################
### code chunk number 60: DataCleaning.Rnw:1504-1507
###################################################
ncfile <- system.file("extdata", "illumina_geno.nc", package="GWASdata")
gdsfile <- "tmp_geno.gds"
convertNcdfGds(ncfile, gdsfile)


###################################################
### code chunk number 61: DataCleaning.Rnw:1515-1519
###################################################
snp.auto <- pData(snpAnnot)[snpAnnot$chromosome < 23, ]
snp.auto.nonmiss <- snp.auto[snp.auto$missing.n1 < 0.05, ]
snp.sel <- snp.auto.nonmiss$snpID
length(snp.sel)


###################################################
### code chunk number 62: DataCleaning.Rnw:1528-1537
###################################################
sample.sel <- scanAnnot$scanID[scanAnnot$race == "CEU"]
length(sample.sel)
library(SNPRelate)
gdsobj <- openfn.gds(gdsfile)
ibdobj <- snpgdsIBDMLE(gdsobj, sample.id=sample.sel, snp.id=snp.sel, method="EM")
closefn.gds(gdsobj)
names(ibdobj)
dim(ibdobj$k0)
ibdobj$k0[1:5,1:5]


###################################################
### code chunk number 63: DataCleaning.Rnw:1548-1554
###################################################
ped <- pData(scanAnnot)[scanAnnot$race == "CEU",
  c("family", "subjectID", "father", "mother", "sex")]
dim(ped)
names(ped) <- c("family", "individ", "father", "mother", "sex")
ped[1:5, ]
(chk <- pedigreeCheck(ped))


###################################################
### code chunk number 64: DataCleaning.Rnw:1560-1563
###################################################
dups <- chk$duplicates
uni.ped <- pedigreeDeleteDuplicates(ped, dups)
(chk <- pedigreeCheck(uni.ped))


###################################################
### code chunk number 65: DataCleaning.Rnw:1568-1574
###################################################
ni <- chk$parent.no.individ.entry
parent <- data.frame(family=ni$family, individ=ni$parentID, 
                     father=0, mother=0, sex="F", 
                     stringsAsFactors=FALSE)
ped.complete <- rbind(uni.ped, parent)
(chk <- pedigreeCheck(ped.complete))


###################################################
### code chunk number 66: DataCleaning.Rnw:1580-1587
###################################################
subf <- chk$subfamilies.ident
table(subf$family)
subf.ids <- subf$individ[subf$subfamily == 2]
newfam <- ped.complete$individ %in% subf.ids
ped.complete$family[newfam] <- ped.complete$family[newfam] + 10000
table(ped.complete$family)
pedigreeCheck(ped.complete)


###################################################
### code chunk number 67: DataCleaning.Rnw:1597-1602
###################################################
rels <- pedigreePairwiseRelatedness(ped.complete)
length(rels$inbred.fam)
relprs <- rels$relativeprs
relprs[1:5,]
table(relprs$relation)


###################################################
### code chunk number 68: DataCleaning.Rnw:1612-1634
###################################################
samp <- pData(scanAnnot)[,c("scanID", "subjectID")]
samp <- samp[match(ibdobj$sample.id, samp$scanID),]
names(samp) <- c("scanID", "Individ")
## snpgdsIBDSelection expects character sample.id
ibdobj$sample.id <- as.character(ibdobj$sample.id)
ibd <- snpgdsIBDSelection(ibdobj, kinship.cutoff=1/32)
ibd <- merge(ibd, samp, by.x="ID1", by.y="scanID")
ibd <- merge(ibd, samp, by.x="ID2", by.y="scanID", suffixes=c("1","2"))
ibd$ii <- paste(pmin(ibd$Individ1, ibd$Individ2),
                pmax(ibd$Individ1, ibd$Individ2))

relprs$ii <- paste(pmin(relprs$Individ1, relprs$Individ2),
                   pmax(relprs$Individ1, relprs$Individ2))
ibd <- merge(ibd, relprs[,c("ii","relation")], all.x=TRUE)
names(ibd)[names(ibd) == "relation"] <- "exp.rel"
ibd$exp.rel[ibd$Individ1 == ibd$Individ2] <- "Dup"
ibd$exp.rel[is.na(ibd$exp.rel)] <- "U"
table(ibd$exp.rel, useNA="ifany")

# assign observed relationships
ibd$obs.rel <- ibdAssignRelatedness(ibd$k0, ibd$k1)
table(ibd$obs.rel, useNA="ifany")


###################################################
### code chunk number 69: DataCleaning.Rnw:1641-1643
###################################################
ibdPlot(ibd$k0, ibd$k1, relation=ibd$exp.rel,
  main="HapMap CEU IBD Estimates \nEM Algorithm")


###################################################
### code chunk number 70: DataCleaning.Rnw:1662-1691
###################################################
# Allele frequency calculation in YRI samples
# open the NetCDF file and create a GenotypeData object
nc <- NcdfGenotypeReader(ncfile)
genoData <-  GenotypeData(nc, snpAnnot=snpAnnot, scanAnnot=scanAnnot)
# sample selection - unduplicated YRI samples
sample.excl <- scanAnnot$scanID[scanAnnot$race != "YRI" | scanAnnot$duplicated]
length(sample.excl)
afreq <- alleleFrequency(genoData, scan.exclude=sample.excl)
close(genoData)
# Add allele frequency to SNP annotation
afreq <- afreq[,"all"] # males and females combined
snpAnnot$A.freq.yri <- afreq
varMetadata(snpAnnot)["A.freq.yri", "labelDescription"] <- 
  "frequency of allele A in YRI subjects"
# Initial selection: autosomal SNPs with low missing call rate and MAF>0
init.sel <- snpAnnot$missing.n1 < 0.05 & snpAnnot$chromosome < 23 &
  afreq > 0 & afreq < 1
sum(init.sel)
# Call the function to select SNPs no closer than 15000 bases
set.seed(1001)
n <- 15000
rsnp2 <- apartSnpSelection(snpAnnot$chromosome, snpAnnot$position, 
                           min.dist=n, init.sel=init.sel)
sum(rsnp2)
# Add the SNP selection to the SNP annotation file
snpAnnot$r15kb <- rsnp2
varMetadata(snpAnnot)["r15kb", "labelDescription"] <- paste(
  "Logical indicator for autosomal SNPs with missing.n1<0.05",
  "and YRI MAF>0, at least 15 kb apart")


###################################################
### code chunk number 71: DataCleaning.Rnw:1697-1704
###################################################
sample.sel <- scanAnnot$scanID[scanAnnot$race == "YRI"]
snp.sel <- snpAnnot$snpID[snpAnnot$r15kb]
gdsobj <- openfn.gds(gdsfile)
ibdobj <- snpgdsIBDMoM(gdsobj, sample.id=sample.sel, snp.id=snp.sel)
closefn.gds(gdsobj)
dim(ibdobj$k0)
ibdobj$k0[1:5,1:5]


###################################################
### code chunk number 72: DataCleaning.Rnw:1714-1720
###################################################
ped <- pData(scanAnnot)[scanAnnot$race == "YRI",
  c("family", "subjectID", "father", "mother", "sex")]
dim(ped)
names(ped) <- c("family", "individ", "father", "mother", "sex")
ped[1:5, ]
(chk <- pedigreeCheck(ped))


###################################################
### code chunk number 73: DataCleaning.Rnw:1727-1730
###################################################
dups <- chk$duplicates
uni.ped <- pedigreeDeleteDuplicates(ped, dups)
(chk <- pedigreeCheck(uni.ped))


###################################################
### code chunk number 74: DataCleaning.Rnw:1736-1740
###################################################
subf <- chk$subfamilies.ident
uni.ped <- uni.ped[!(uni.ped$family %in% subf$family),]
table(uni.ped$family)
pedigreeCheck(uni.ped)


###################################################
### code chunk number 75: DataCleaning.Rnw:1746-1774
###################################################
rels <- pedigreePairwiseRelatedness(uni.ped)
length(rels$inbred.fam)
relprs <- rels$relativeprs
relprs[1:5,]
table(relprs$relation)

samp <- pData(scanAnnot)[,c("scanID", "subjectID")]
samp <- samp[match(ibdobj$sample.id, samp$scanID),]
names(samp) <- c("scanID", "Individ")
## snpgdsIBDSelection expects character sample.id
ibdobj$sample.id <- as.character(ibdobj$sample.id)
ibd <- snpgdsIBDSelection(ibdobj, kinship.cutoff=1/32)
ibd <- merge(ibd, samp, by.x="ID1", by.y="scanID")
ibd <- merge(ibd, samp, by.x="ID2", by.y="scanID", suffixes=c("1","2"))
ibd$ii <- paste(pmin(ibd$Individ1, ibd$Individ2),
                pmax(ibd$Individ1, ibd$Individ2))

relprs$ii <- paste(pmin(relprs$Individ1, relprs$Individ2),
                   pmax(relprs$Individ1, relprs$Individ2))
ibd <- merge(ibd, relprs[,c("ii","relation")], all.x=TRUE)
names(ibd)[names(ibd) == "relation"] <- "exp.rel"
ibd$exp.rel[ibd$Individ1 == ibd$Individ2] <- "Dup"
ibd$exp.rel[is.na(ibd$exp.rel)] <- "U"
table(ibd$exp.rel, useNA="ifany")

# assign observed relationships
ibd$obs.rel <- ibdAssignRelatedness(ibd$k0, ibd$k1)
table(ibd$obs.rel, useNA="ifany")


###################################################
### code chunk number 76: DataCleaning.Rnw:1783-1785
###################################################
ibdPlot(ibd$k0, ibd$k1, relation=ibd$exp.rel,
  main="HapMap YRI IBD Estimates \nPLINK MOM Approach")


###################################################
### code chunk number 77: DataCleaning.Rnw:1813-1836
###################################################
filt <- get(data(pcaSnpFilters.hg18))
chrom <- getChromosome(snpAnnot)
pos <- getPosition(snpAnnot)
snpID <- getSnpID(snpAnnot)
snp.filt <- rep(TRUE, length(snpID))
for (f in 1:nrow(filt)) {
  snp.filt[chrom == filt$chrom[f] & filt$start.base[f] < pos
           & pos < filt$end.base[f]] <- FALSE
}
snp.sel <- snpID[snp.filt]
length(snp.sel)

sample.sel <- scanAnnot$scanID[!scanAnnot$duplicated]
length(sample.sel)

gdsobj <- openfn.gds(gdsfile)
snpset <- snpgdsLDpruning(gdsobj, sample.id=sample.sel, snp.id=snp.sel,
                          autosome.only=TRUE, maf=0.05, missing.rate=0.05,
                          method="corr", slide.max.bp=10e6,
                          ld.threshold=sqrt(0.1))

snp.pruned <- unlist(snpset, use.names=FALSE)
length(snp.pruned)


###################################################
### code chunk number 78: DataCleaning.Rnw:1842-1846
###################################################
pca <- snpgdsPCA(gdsobj, sample.id=sample.sel, snp.id=snp.pruned)
names(pca)
length(pca$eigenval)
dim(pca$eigenvect)


###################################################
### code chunk number 79: DataCleaning.Rnw:1852-1860
###################################################
# Calculate the percentage of variance explained 
# by each principal component.  
pc.frac <- pca$eigenval/sum(pca$eigenval)
lbls <- paste("EV", 1:4, "\n", format(pc.frac[1:4], digits=2), sep="")
samp <- pData(scanAnnot)[match(pca$sample.id, scanAnnot$scanID),]
cols <- rep(NA, nrow(samp))
cols[samp$race == "CEU"] <- "blue"
cols[samp$race == "YRI"] <- "red"


###################################################
### code chunk number 80: DataCleaning.Rnw:1863-1865
###################################################
pairs(pca$eigenvect[,1:4], col=cols, labels=lbls,
  main = "CEU: blue, YRI: red")


###################################################
### code chunk number 81: DataCleaning.Rnw:1876-1889
###################################################
par.coord <- pca$eigenvect
rangel <- apply(par.coord, 2, function(x) range(x)[1])
rangeh <- apply(par.coord, 2, function(x) range(x)[2])
std.coord <- par.coord
for (i in 1:14)
  std.coord[,i] <- (par.coord[,i] - rangel[i])/(rangeh[i]-rangel[i]) 
plot(c(0,15), c(0,1), type = 'n', axes = FALSE, ylab = "", xlab = "", 
 main = "Parallel Coordinates Plot
 CEU: blue, YRI: red")
for (j in 1:13)
  for (i in sample(1:nrow(std.coord)) )
    lines(c(j,j+1), std.coord[i,c(j,j+1)], col=cols[i], lwd=0.25)
axis(1, at = 1:14, labels = paste("PC",1:14, sep = "."))


###################################################
### code chunk number 82: DataCleaning.Rnw:1899-1910
###################################################
corr <- snpgdsPCACorr(pca, gdsobj, eig.which=1:4)
closefn.gds(gdsobj)
snp <- snpAnnot[match(corr$snp.id, snpID),]
chrom <- getChromosome(snp, char=TRUE)
pdf("DataCleaning-corr.pdf")
par(mfrow=c(4,1))
for (i in 1:4) {
  snpCorrelationPlot(abs(corr$snpcorr[i,]), chrom,
                     main=paste("Eigenvector",i), ylim=c(0,1))
}
dev.off()


###################################################
### code chunk number 83: DataCleaning.Rnw:1914-1915
###################################################
file.remove(gdsfile)


###################################################
### code chunk number 84: DataCleaning.Rnw:1935-1940
###################################################
princomp <- as.data.frame(pca$eigenvect)
samples.nodup <- pData(scanAnnot)[!scanAnnot$duplicated,]
princomp$scanID <- as.factor(samples.nodup$scanID)
princomp$case.ctrl.status <- as.factor(samples.nodup$status)
princomp$race <- as.factor(samples.nodup$race)


###################################################
### code chunk number 85: DataCleaning.Rnw:1946-1953
###################################################
pc.percent <- 100 * pca$eigenval[1:32]/sum(pca$eigenval)
pc.percent
lbls <- paste("EV", 1:3, "\n", format(pc.percent[1:3], digits=2), "%", sep="")
table(samples.nodup$status)
cols <- rep(NA, nrow(samples.nodup))
cols[samples.nodup$status == 1] <- "green"
cols[samples.nodup$status == 0] <- "magenta"


###################################################
### code chunk number 86: DataCleaning.Rnw:1963-1965
###################################################
pairs(pca$eigenvect[,1:3], col=cols, labels=lbls,
  main = "First Three EVs by Case-Control Status")


###################################################
### code chunk number 87: DataCleaning.Rnw:1968-1970
###################################################
boxplot(princomp[, 1] ~ princomp$case.ctrl.status, 
  ylab = "PC1", main = "PC1 vs. Case-control Status")


###################################################
### code chunk number 88: DataCleaning.Rnw:1973-1975
###################################################
boxplot(princomp[, 2] ~ princomp$case.ctrl.status, 
  ylab = "PC2", main = "PC2 vs. Case-control Status")


###################################################
### code chunk number 89: DataCleaning.Rnw:1978-1980
###################################################
boxplot(princomp[, 3] ~ princomp$case.ctrl.status, 
  ylab = "PC3", main = "PC3 vs. Case-control Status")


###################################################
### code chunk number 90: DataCleaning.Rnw:1983-1992
###################################################
aov.p1 <- aov(princomp[,1] ~ princomp$race * 
  princomp$case.ctrl.status, princomp)
summary(aov.p1)
aov.p2 <- aov(princomp[,2] ~ princomp$race * 
  princomp$case.ctrl.status, princomp)
summary(aov.p2)
aov.p3 <- aov(princomp[,3] ~ princomp$race * 
  princomp$case.ctrl.status, princomp)
summary(aov.p3)


###################################################
### code chunk number 91: DataCleaning.Rnw:2012-2014
###################################################
lm.all <- lm(scanAnnot$missing.e1 ~ scanAnnot$status)
summary(aov(lm.all))


###################################################
### code chunk number 92: DataCleaning.Rnw:2017-2019
###################################################
boxplot(scanAnnot$missing.e1 ~ scanAnnot$status, ylab = 
  "Mean missing call rate", main="Mean missing call rate by case status")


###################################################
### code chunk number 93: DataCleaning.Rnw:2035-2042
###################################################
blfile <- system.file("extdata", "illumina_bl.nc", package="GWASdata")
blnc <- NcdfIntensityReader(blfile)
blData <-  IntensityData(blnc, snpAnnot=snpAnnot, scanAnnot=scanAnnot)

genofile <- system.file("extdata", "illumina_geno.nc", package="GWASdata")
genonc <- NcdfGenotypeReader(genofile)
genoData <-  GenotypeData(genonc, snpAnnot=snpAnnot, scanAnnot=scanAnnot)


###################################################
### code chunk number 94: DataCleaning.Rnw:2047-2050
###################################################
baf.sd <- sdByScanChromWindow(blData, genoData, var="BAlleleFreq")
med.baf.sd <- medianSdOverAutosomes(baf.sd)
low.qual.ids <- med.baf.sd$scanID[med.baf.sd$med.sd > 0.05]


###################################################
### code chunk number 95: DataCleaning.Rnw:2055-2072
###################################################
chrom <- snpAnnot$chromosome
pos <- snpAnnot$position
hla.df <- get(data(HLA.hg18))
hla <- chrom == "6" & pos >= hla.df$start.base & pos <= hla.df$end.base
xtr.df <- get(data(pseudoautosomal.hg18))
xtr <- chrom == "X" & pos >= xtr.df["X.XTR", "start.base"] &
  pos <= xtr.df["X.XTR", "end.base"]
centromeres <- get(data(centromeres.hg18))
gap <- rep(FALSE, nrow(snpAnnot))
for (i in 1:nrow(centromeres)) {
  ingap <- chrom == centromeres$chrom[i] & pos > centromeres$left.base[i] &
    pos < centromeres$right.base[i]
  gap <- gap | ingap
}
ignore <- snpAnnot$missing.n1 == 1 #ignore includes intensity-only and failed snps
snp.exclude <- ignore | hla | xtr | gap
snp.ok <- snpAnnot$snpID[!snp.exclude]


###################################################
### code chunk number 96: DataCleaning.Rnw:2077-2082
###################################################
scan.ids <- scanAnnot$scanID[1:10]
chrom.ids <- 21:23
baf.seg <- anomSegmentBAF(blData, genoData, scan.ids=scan.ids,
  chrom.ids=chrom.ids, snp.ids=snp.ok, verbose=FALSE)
head(baf.seg)


###################################################
### code chunk number 97: DataCleaning.Rnw:2087-2093
###################################################
baf.anom <- anomFilterBAF(blData, genoData, segments=baf.seg, 
  snp.ids=snp.ok, centromere=centromeres, low.qual.ids=low.qual.ids,
  verbose=FALSE)
names(baf.anom)
baf.filt <- baf.anom$filtered
head(baf.filt)


###################################################
### code chunk number 98: DataCleaning.Rnw:2105-2111
###################################################
loh.anom <- anomDetectLOH(blData, genoData, scan.ids=scan.ids,
  chrom.ids=chrom.ids, snp.ids=snp.ok, known.anoms=baf.filt,
  verbose=FALSE)
names(loh.anom)
loh.filt <- loh.anom$filtered
head(loh.filt)


###################################################
### code chunk number 99: DataCleaning.Rnw:2120-2134
###################################################
# create required data frame
baf.filt$method <- "BAF"
if (!is.null(loh.filt)) {
  loh.filt$method <- "LOH"
  cols <- intersect(names(baf.filt), names(loh.filt))
  anoms <- rbind(baf.filt[,cols], loh.filt[,cols])
} else {
  anoms <- baf.filt
}
anoms$anom.id <- 1:nrow(anoms)

stats <- anomSegStats(blData, genoData, snp.ids=snp.ok, anom=anoms,
                      centromere=centromeres)
names(stats)


###################################################
### code chunk number 100: DataCleaning.Rnw:2140-2143
###################################################
snp.not.ok <- snpAnnot$snpID[snp.exclude]
anomStatsPlot(blData, genoData, anom.stats=stats[1,],
  snp.ineligible=snp.not.ok, centromere=centromeres, cex.leg=1)


###################################################
### code chunk number 101: DataCleaning.Rnw:2153-2155
###################################################
lrr.sd <- sdByScanChromWindow(blData, var="LogRRatio", incl.hom=TRUE)
med.lrr.sd <- medianSdOverAutosomes(lrr.sd)


###################################################
### code chunk number 102: DataCleaning.Rnw:2161-2163
###################################################
baf.seg.info <- baf.anom$seg.info
loh.seg.info <- loh.anom$base.info[,c("scanID", "chromosome", "num.segs")]


###################################################
### code chunk number 103: DataCleaning.Rnw:2173-2178
###################################################
snpAnnot$eligible <- !snp.exclude
baf.low.qual <- anomIdentifyLowQuality(snpAnnot, med.baf.sd, baf.seg.info,
  sd.thresh=0.1, sng.seg.thresh=0.0008, auto.seg.thresh=0.0001)
loh.low.qual <- anomIdentifyLowQuality(snpAnnot, med.lrr.sd, loh.seg.info,
  sd.thresh=0.25, sng.seg.thresh=0.0048, auto.seg.thresh=0.0006)


###################################################
### code chunk number 104: DataCleaning.Rnw:2183-2185
###################################################
close(blData)
close(genoData)


###################################################
### code chunk number 105: DataCleaning.Rnw:2200-2211
###################################################
# anomalies to filter
anom.filt <- stats[,c("scanID", "chromosome", "left.base", "right.base")]
# whole.chrom column is required and can be used for sex chromosome
#   anomalies such as XXX
anom.filt$whole.chrom <- FALSE 
# select unique subjects
subj <- scanAnnot$scanID[!scanAnnot$duplicated]
subj.filt.ncdf <- "subj_filt.ncdf"
ncdfSetMissingGenotypes(genofile, subj.filt.ncdf, anom.filt, sample.include=subj)
(nc <- NcdfGenotypeReader(subj.filt.ncdf))
close(nc)


###################################################
### code chunk number 106: DataCleaning.Rnw:2236-2238
###################################################
scan.excl <- scanAnnot$scanID[scanAnnot$missing.e1 >= 0.05]
length(scan.excl)


###################################################
### code chunk number 107: DataCleaning.Rnw:2249-2280
###################################################
snp.excl <- snpAnnot$snpID[snpAnnot$missing.n1 == 1]
length(snp.excl)
genofile <- system.file("extdata", "illumina_geno.nc", package="GWASdata")
genoNC <- NcdfGenotypeReader(genofile)
genoData <- GenotypeData(genoNC, snpAnnot=snpAnnot, scanAnnot=scanAnnot)
dupdisc <- duplicateDiscordance(genoData, subjName.col="subjectID",
  scan.exclude=scan.excl, snp.exclude=snp.excl)
names(dupdisc)
head(dupdisc$discordance.by.snp)
length(dupdisc$discordance.by.subject)
dupdisc$discordance.by.subject[[2]]
# each entry is a 2x2 matrix, but only one value of each
# is important since these are all pairs
npair <- length(dupdisc$discordance.by.subject)
disc.subj <- rep(NA, npair)
subjID <- rep(NA, npair)
race <- rep(NA, npair)
for (i in 1:npair) {
  disc.subj[i] <- dupdisc$discordance.by.subject[[i]][1,2]
  subjID[i] <- names(dupdisc$discordance.by.subject)[i]
  race[i] <- scanAnnot$race[scanAnnot$subjectID == subjID[i]][1]
}
dat <- data.frame(subjID=subjID, disc=disc.subj, pop=race,
                  stringsAsFactors=FALSE)
summary(dat$disc)
# Assign colors for the duplicate samples based on population group.
dat$col <- NA
dat$col[dat$pop == "CEU"] <- "blue"
dat$col[dat$pop == "YRI"] <- "red"
dat <- dat[order(dat$disc),]
dat$rank <- 1:npair


###################################################
### code chunk number 108: DataCleaning.Rnw:2283-2288
###################################################
# Plot the sample discordance rates color coded by race.
plot(dat$disc, dat$rank, col=dat$col, ylab="rank",
  xlab="Discordance rate between duplicate samples", 
  main="Duplicate Sample Discordance by Continental Ancestry")
legend("bottomright", unique(dat$pop), pch=rep(1,2), col=unique(dat$col))


###################################################
### code chunk number 109: DataCleaning.Rnw:2312-2313
###################################################
duplicateDiscordanceProbability(npair)


###################################################
### code chunk number 110: DataCleaning.Rnw:2334-2346
###################################################
men.list <- with(pData(scanAnnot), mendelList(family, subjectID, 
  father, mother, sex, scanID))
res <- mendelListAsDataFrame(men.list)
head(res)
dim(res)
# Only want to use SNPs with missing.n1 < 0.05
snp.excl <- snpAnnot$snpID[snpAnnot$missing.n1 >= 0.05]
length(snp.excl)
mend <- mendelErr(genoData, men.list, snp.exclude=snp.excl)
names(mend)
head(mend$trios)
names(mend$snp)


###################################################
### code chunk number 111: DataCleaning.Rnw:2356-2359
###################################################
# Calculate the error rate
err <- mend$snp$error.cnt / mend$snp$check.cnt
table(err == 0, useNA="ifany")


###################################################
### code chunk number 112: DataCleaning.Rnw:2362-2364
###################################################
plot(err, rank(err), xlab="Error Rate (fraction)",
  ylab="rank", main="Mendelian Error Rate per SNP, ranked")


###################################################
### code chunk number 113: DataCleaning.Rnw:2372-2384
###################################################
fam <- mend$snp$error.cnt
n <- mend$snp$check.cnt
summary(fam)
# SNPs with errors
length(fam[n > 0 & fam > 0])
# SNPs for which more than one family has an error
length(fam[n > 0 & fam > 1])
# Get the SNPs with valid trios for error detection
val <- length(fam[n > 0])
noerr <- length(fam[n > 0 & fam == 0])
# Divide to get fraction with no errors
noerr / val


###################################################
### code chunk number 114: DataCleaning.Rnw:2391-2404
###################################################
snp.sel <- match(names(mend$snp$error.cnt), snpAnnot$snpID)
snpAnnot$mendel.err.count[snp.sel] <- mend$snp$error.cnt
snpAnnot$mendel.err.sampsize[snp.sel] <- mend$snp$check.cnt
allequal(snpAnnot$snpID, sort(snpAnnot$snpID))
# The high number of NA values is due to the filtering out of SNPs 
#    before the Mendelian error rate calculation
sum(is.na(snpAnnot$mendel.err.count))
sum(is.na(snpAnnot$mendel.err.sampsize))
varMetadata(snpAnnot)["mendel.err.count", "labelDescription"] <- 
  paste("number of Mendelian errors detected in trios averaged over",
        "multiple combinations of replicate genotyping instances")
varMetadata(snpAnnot)["mendel.err.sampsize", "labelDescription"] <- 
  "number of opportunities to detect Mendelian error in trios"


###################################################
### code chunk number 115: DataCleaning.Rnw:2416-2433
###################################################
# Get a vector of SNPs to check
snp <- pData(snpAnnot)
snp$err.rate <- snp$mendel.err.count / 
  snp$mendel.err.sampsize
snp <- snp[order(snp$err.rate, decreasing=TRUE),]
snp <- snp[1:9,]
xyfile <- system.file("extdata", "illumina_qxy.nc", package="GWASdata")
xyNC <- NcdfIntensityReader(xyfile)
xyData <- IntensityData(xyNC, snpAnnot=snpAnnot, scanAnnot=scanAnnot)
pdf(file="DataCleaning-mendel.pdf")
par(mfrow = c(3,3))
mtxt <- paste("SNP", snp$rsID, "\nMendelian Error Rate",
   format(snp$err.rate, digits=5))
genoClusterPlot(xyData, genoData, snpID=snp$snpID, main.txt=mtxt,
                cex.main=0.9) 
dev.off()
close(xyData)


###################################################
### code chunk number 116: DataCleaning.Rnw:2446-2463
###################################################
# Calculate the fraction of SNPs with an error for each trio
trios <- mend$trios
trios$Mend.err <- trios$Men.err.cnt/trios$Men.cnt
summary(trios$Mend.err)
# Start by pulling out the vectors needed from `trios'
tmp <- trios[, c("fam.id", "Mend.err")]; dim(tmp)
# Change fam.id to match the sample annotation column name
names(tmp) <- c("family", "Mend.err.rate.fam")
# Merge the variables into the sample annotation file
scanAnnot$mend.err.rate.fam <- NA
for (i in 1:nrow(tmp)) {
  ind <- which(is.element(scanAnnot$family, tmp$family[i]))
  scanAnnot$mend.err.rate.fam[ind] <- tmp$Mend.err.rate.fam[i]
}
head(scanAnnot$mend.err.rate.fam)
varMetadata(scanAnnot)["mend.err.rate.fam", "labelDescription"] <-
  "Mendelian error rate per family"


###################################################
### code chunk number 117: DataCleaning.Rnw:2471-2482
###################################################
# Get the families that have non-NA values for the family 
#     Mendelian error rate
fams <- pData(scanAnnot)[!is.na(scanAnnot$mend.err.rate.fam) & 
  !duplicated(scanAnnot$family), c("family", 
  "mend.err.rate.fam", "race")]
dim(fams)
table(fams$race, useNA="ifany")
# Assign colors for the different ethnicities in these families
pcol <- rep(NA, nrow(fams))
pcol[fams$race == "CEU"] <- "blue"
pcol[fams$race == "YRI"] <- "red"


###################################################
### code chunk number 118: DataCleaning.Rnw:2485-2490
###################################################
plot(fams$mend.err.rate.fam*100, rank(fams$mend.err.rate.fam), 
  main="Mendelian Error rate per Family, ranked",
  xlab="Mendelian error rate per family (percent)", 
  ylab="rank", col=pcol)
legend("bottomright", c("CEU", "YRI"), pch=c(1,1), col=c("blue", "red"))


###################################################
### code chunk number 119: DataCleaning.Rnw:2508-2517
###################################################
head(pData(scanAnnot)[,c("father", "mother")])
nonfounders <- scanAnnot$father != 0 & 
               scanAnnot$mother != 0
table(nonfounders)
scan.excl <- scanAnnot$scanID[scanAnnot$race != "CEU" |
   nonfounders | scanAnnot$duplicated]
length(scan.excl)
hwe <- gwasExactHW(genoData, scan.exclude=scan.excl)
close(genoData)


###################################################
### code chunk number 120: DataCleaning.Rnw:2525-2536
###################################################
names(hwe)
dim(hwe)
# Check on sample sizes for autosomes and X chromosome
hwe$N <- hwe$nAA + hwe$nAB + hwe$nBB
summary(hwe$N[is.element(hwe$chromosome,1:22)])
summary(hwe$N[is.element(hwe$chromosome,23)])
hwe$p.value[1:10]
sum(is.na(hwe$p.value[hwe$chromosome == 24])) # XY
sum(is.na(hwe$p.value[hwe$chromosome == 23])) # X
hwe$MAF[1:10]
hwe[1:3, c("nAA", "nAB", "nBB")]


###################################################
### code chunk number 121: DataCleaning.Rnw:2543-2544
###################################################
summary(hwe$f)


###################################################
### code chunk number 122: DataCleaning.Rnw:2547-2549
###################################################
hist(hwe$f, main="Histogram of the Inbreeding Coefficient 
  For CEU Samples", xlab="Inbreeding Coefficient")


###################################################
### code chunk number 123: DataCleaning.Rnw:2552-2555
###################################################
# Check the MAF of those SNPs with f=1
chkf <- hwe[!is.na(hwe$f) & hwe$f==1,]; dim(chkf)
summary(chkf$MAF)


###################################################
### code chunk number 124: DataCleaning.Rnw:2562-2573
###################################################
hwe.0 <- hwe[hwe$MAF > 0,]; dim(hwe.0)
# Only keep the autosomal SNPs for first plot
pval <- hwe.0$p.value[is.element(hwe.0$chromosome, 1:22)]
length(pval)
pval <- pval[!is.na(pval)]
length(pval)
# X chromosome SNPs for plot 2
pval.x <- hwe.0$p.value[is.element(hwe.0$chromosome, 23)]
length(pval.x)
pval.x <- pval.x[!is.na(pval.x)]
length(pval.x)


###################################################
### code chunk number 125: DataCleaning.Rnw:2575-2582
###################################################
pdf(file = "DataCleaning-hwe.pdf")
par(mfrow=c(2,2))
qqPlot(pval=pval, truncate = FALSE, main="Autosomes, all")
qqPlot(pval=pval, truncate = TRUE, main="Autosomes, truncated")
qqPlot(pval=pval.x, truncate = FALSE, main="X chromosome, all")
qqPlot(pval=pval.x, truncate = TRUE, main="X chromosome, truncated")
dev.off()


###################################################
### code chunk number 126: DataCleaning.Rnw:2588-2591
###################################################
plot(hwe.0$MAF, -log10(hwe.0$p.value), 
  xlab="Minor Allele Frequency", ylab="-log(p-value)", 
  main="Minor Allele Frequency vs\nP-value")


###################################################
### code chunk number 127: DataCleaning.Rnw:2627-2634
###################################################
genoNC <- NcdfGenotypeReader(subj.filt.ncdf)
subjAnnot <- scanAnnot[scanAnnot$scanID %in% getScanID(genoNC),]
genoData <- GenotypeData(genoNC, scanAnnot=subjAnnot)
assoc <- assocTestRegression(genoData, outcome="status", covar.list=c(""),
  ivar.list=c(""), model.type="logistic", robust=TRUE,
  chromosome.set=c(24:26))
close(genoData)


###################################################
### code chunk number 128: DataCleaning.Rnw:2648-2650
###################################################
qqPlot(pval=assoc$model.1.additive.LR.pval.G,
  truncate=TRUE, main="QQ Plot of Likelihood Ratio Test p-values")


###################################################
### code chunk number 129: DataCleaning.Rnw:2658-2660
###################################################
chrom <- getChromosome(snpAnnot, char=TRUE)
chrom.sel <- chrom %in% c("XY", "Y", "M")


###################################################
### code chunk number 130: DataCleaning.Rnw:2663-2665
###################################################
manhattanPlot(assoc$model.1.additive.LR.pval.G,
  chromosome=chrom[chrom.sel])


###################################################
### code chunk number 131: DataCleaning.Rnw:2677-2696
###################################################
# Identify SNPs with lowest p-values
snp <- pData(snpAnnot)[chrom.sel, c("snpID", "rsID")]
allequal(snp$snpID, assoc$snpID)
snp$pval <- assoc$model.1.additive.LR.pval.G
snp <- snp[order(snp$pval),]
snp <- snp[1:9,]
xyfile <- system.file("extdata", "illumina_qxy.nc", package="GWASdata")
xyNC <- NcdfIntensityReader(xyfile)
xyData <- IntensityData(xyNC, snpAnnot=snpAnnot, scanAnnot=scanAnnot)
genofile <- system.file("extdata", "illumina_geno.nc", package="GWASdata")
genoNC <- NcdfGenotypeReader(genofile)
genoData <- GenotypeData(genoNC, snpAnnot=snpAnnot, scanAnnot=scanAnnot)
pdf(file="DataCleaning-cluster.pdf")
par(mfrow = c(3,3))
mtxt <- paste("SNP", snp$rsID, "\np =", format(snp$pval, digits=4))
genoClusterPlot(xyData, genoData, snpID=snp$snpID, main.txt=mtxt) 
dev.off()
close(xyData)
close(genoData)


###################################################
### code chunk number 132: DataCleaning.Rnw:2702-2703
###################################################
unlink(subj.filt.ncdf)


