-
Notifications
You must be signed in to change notification settings - Fork 28
/
cnv.rmd
128 lines (104 loc) · 3.99 KB
/
cnv.rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
```{r cnv-init, echo=FALSE, message=F}
library(knitr)
library(Cairo)
opts_chunk$set(warning=FALSE, fig.width=6, fig.height=6, dev="CairoPNG", stop=TRUE)
library(minfi, quietly=TRUE)
library(CopyNumber450kData, quietly=TRUE)
library(CopyNumber450k, quietly=TRUE)
library(GEOquery)
```
# Test that CopyNumber450k and meffil produce comparable results
```{r}
warning("This script may fail because of CopyNumber450k and minfi version incompatibilities.")
```
## Download example data set
```{r child = 'dataset-450k-demo.rmd'}
```
```{r}
path <- download.450k.demo.dataset()
```
## Copy numbers using meffil
```{r}
library(meffil)
samplesheet <- meffil.create.samplesheet(path)[1:4,]
```
Create references using data from the Bioconductor CopyNumber450kData R package.
```{r, results="hide", message=FALSE}
library(minfi, quietly=TRUE)
library(CopyNumber450kData, quietly=TRUE)
meffil.add.copynumber450k.references(verbose=T)
```
```{r}
meffil.list.cnv.references()
```
Calculate copy numbers using meffil.
```{r comparison-meffil-cnv, cache=T}
segments.meffil <- meffil.calculate.cnv(samplesheet, "copynumber450k", verbose=T) ## 20m
matrix.meffil <- meffil.cnv.matrix(segments.meffil, "450k")
```
## Copy numbers using CopyNumber450k
Load the data using minfi.
```{r, message=FALSE}
raw.minfi <- read.metharray.exp(base = path)
raw.minfi <- raw.minfi[,basename(samplesheet$Basename)]
```
Load control data for estimating copy numbers.
```{r}
library(CopyNumber450k, quietly=TRUE)
data(RGcontrolSetEx)
```
It is necessary to merge the control dataset with the dataset we plan to analyze.
The following code sets up the sample groups and names to make this possible.
```{r}
phenoData(RGcontrolSetEx) <- AnnotatedDataFrame(data.frame(Sample_Group="control",
Sample_Name=colnames(RGcontrolSetEx),
stringsAsFactors=F))
phenoData(raw.minfi) <- AnnotatedDataFrame(data.frame(Sample_Group="data",
Sample_Name=basename(samplesheet$Basename),
stringsAsFactors=F))
sampleNames(raw.minfi) <- phenoData(raw.minfi)@data$Sample_Name
sampleNames(RGcontrolSetEx) <- phenoData(RGcontrolSetEx)@data$Sample_Name
```
Calculate copy numbers using CopyNumber450k.
```{r comparison-minfi-cnv, cache=T}
raw.cnv <- CNV450kSet(combine(raw.minfi, RGcontrolSetEx))
raw.cnv <- dropSNPprobes(raw.cnv, maf_threshold=0.01)
norm.cnv <- normalize(raw.cnv, "quantile")
norm.cnv <- segmentize(norm.cnv, alpha=0.001)
segments.cnv <- getSegments(norm.cnv)
matrix.cnv <- meffil.cnv.matrix(segments.cnv)
```
## Compare meffil and CopyNumber450k results
The diagonals give correlations between methods
for the same samples.
```{r}
r <- cor(matrix.cnv, matrix.meffil, use="p")
quantile(diag(r))
quantile(r)
```
Instead of correlation, compare the number of CpG sites
in variable regions identified by both methods
to the number of such sites identified by either method.
```{r}
identify.variable.sites <- function(segments) {
features <- meffil.get.features("450k")
segments <- segments[which(segments$adjusted.pvalue < 0.05),]
variable.idx <- unlist(lapply(1:nrow(segments), function(i) {
which(features$chromosome == segments$chrom[i]
& features$position >= segments$loc.start[i]
& features$position <= segments$loc.end[i])
}))
features$name[variable.idx]
}
variable.meffil <- lapply(segments.meffil, identify.variable.sites)
variable.cnv <- lapply(segments.cnv, identify.variable.sites)
sapply(variable.meffil, length)
sapply(variable.cnv, length)
in.both <- sapply(1:length(variable.meffil),
function(i) length(intersect(variable.meffil[[i]], variable.cnv[[i]])))
in.either <- sapply(1:length(variable.meffil),
function(i) length(union(variable.meffil[[i]], variable.cnv[[i]])))
in.both/in.either
sapply(variable.meffil, length)/in.either
sapply(variable.cnv, length)/in.either
```