Skip to content

Commit e0efc10

Browse files
committed
Merge branch 'Figures'
2 parents 0efee54 + bcf2237 commit e0efc10

File tree

8 files changed

+422
-25
lines changed

8 files changed

+422
-25
lines changed

DESCRIPTION

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,4 @@ Collate:
2626
'seqlm-package.r'
2727
'segmentation.r'
2828
'datasets.r'
29+
'visualisation.r'

NAMESPACE

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
export(seqlm)
2+
export(seqlmplots)
3+
export(seqlmreport)
24
import(GenomicRanges)
35
import(IRanges)
46
import(Matrix)

R/segmentation.r

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -370,8 +370,12 @@ additional_annotation = function(res, df){
370370
#'
371371
#' Segments genome based on given linear models and and calculates the significance of regions
372372
#'
373-
#' Implementation details
374-
#'
373+
#' The analysis can be time consuming if the whole genome is analysed at once.
374+
#' If the computer has multicore capabilities it is easy to parallelize the
375+
#' calculations. We use the \code{\link{foreach}}framework by Revolution
376+
#' Computing for parallelization. To enable the parallelization one has to
377+
#' register the parallel backend before and this will be used by seqlm.
378+
#'
375379
#' @param values a matrix where columns are samples and rows correspond to the sites
376380
#' @param genome_information \code{\link{GRanges}} object giving the positions
377381
#' of the probes, names should correspond to rownames of values.
@@ -397,7 +401,20 @@ additional_annotation = function(res, df){
397401
#'
398402
#' \dontrun{
399403
#' data(tissue_small)
400-
#' seqlm(tissue_small$values, tissue_small$genome_information, tissue_small$annotation)
404+
#'
405+
#' # Find regions
406+
#' segments = seqlm(tissue_small$values, tissue_small$genome_information, tissue_small$annotation)
407+
#'
408+
#' # The calculation can be parallelized by registering a parallel processing backend
409+
#' library(doParallel)
410+
#' registerDoParallel(cores = 2)
411+
#' segments = seqlm(values = tissue_small$values, genome_information = tissue_small$genome_information, annotation = tissue_small$annotation)
412+
#'
413+
#' # To visualise the results it is possible to plot the most imortant sites and generate a HTML report
414+
#' temp = tempdir()
415+
#' seqlmreport(segments[1:10], tissue_small$values, tissue_small$genome_information, tissue_small$annotation, dir = temp)
416+
#'
417+
#' # To see the results open the index.html file generated into the directory temp
401418
#' }
402419
#' @export
403420
seqlm = function(values, genome_information, annotation, n0 = 1, m0 = 10, sig0 = NA, alpha = 2, max_block_length = 50, max_dist = 1000){
@@ -447,6 +464,13 @@ seqlm = function(values, genome_information, annotation, n0 = 1, m0 = 10, sig0 =
447464
elementMetadata(res) = DataFrame(elementMetadata(res), segment_ann)
448465
}
449466

467+
# Add probe names
468+
names = names(genome_information)
469+
elementMetadata(res) = DataFrame(elementMetadata(res), probes = apply(cbind(res$startIndex, res$endIndex), 1, function(x) paste0(names[x[1]:x[2]], collapse = ";")))
470+
471+
# Remove startIndex and endIndex
472+
elementMetadata(res) = elementMetadata(res)[-(2:3)]
473+
450474
return (res[order(abs(res$tstat), decreasing=TRUE)])
451475
}
452476
##

R/visualisation.r

Lines changed: 253 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,253 @@
1+
## Functions to visualize the regions
2+
fortify_seqlmplot = function(segment, values, annotation, genome_information, expand){
3+
# Get rows from the matrix
4+
segment_expanded = segment
5+
start(segment_expanded) = start(segment_expanded) - expand
6+
end(segment_expanded) = end(segment_expanded) + expand
7+
8+
gi = subsetByOverlaps(genome_information, segment)
9+
gi_expanded = subsetByOverlaps(genome_information, segment_expanded)
10+
11+
values0 = as.matrix(values[names(gi_expanded), ])
12+
13+
# Bring into long format
14+
df = melt(values0, varnames = c("Probe", "Sample"))
15+
16+
# Add annotations
17+
a = data.frame(Sample = colnames(values), Annotation = annotation)
18+
df = merge(df, a)
19+
20+
# Add region information
21+
probe_det = data.frame(Probe = names(gi_expanded), Region = !(gi_expanded %outside% gi), Position = start(gi_expanded))
22+
df = merge(df, probe_det)
23+
24+
# Calculate the box that shows region
25+
reg = probe_det[probe_det$Region,]
26+
nonreg = probe_det[!probe_det$Region,]
27+
28+
smaller = nonreg[nonreg$Position < min(reg$Position),]
29+
bigger = nonreg[nonreg$Position > max(reg$Position),]
30+
31+
if(nrow(smaller) > 0){
32+
diff = min(reg$Position) - max(smaller$Position)
33+
start = min(reg$Position) - min(100, diff / 2)
34+
}
35+
else{
36+
start = min(reg$Position) - 100
37+
}
38+
39+
if(nrow(bigger) > 0){
40+
diff = min(bigger$Position) - max(reg$Position)
41+
end = max(reg$Position) + min(100, diff / 2)
42+
}
43+
else{
44+
end = max(reg$Position) + 100
45+
}
46+
47+
box = data.frame(start = start, end = end)
48+
49+
return(list(df = df, box = box))
50+
}
51+
52+
draw_seqlmplot = function(df, box, ylim, expand){
53+
plot = qplot(x = Position, y = value, geom = c("line", "point"), colour = Annotation, group = Sample, data = df) + geom_rect(aes(xmin = box$start, xmax = box$end, ymin = -Inf, ymax = Inf), colour = "grey20", fill = "grey95") + geom_point() + geom_line() + geom_jitter(position = position_jitter(width = .1)) + scale_y_continuous(limits = ylim) + scale_x_continuous(limits = c(box$start - expand, box$end + expand)) + theme_bw()
54+
}
55+
56+
seqlmplot = function(segment, values, annotation, genome_information, expand, ylim = extendrange(values), filename = NA, ...){
57+
data = fortify_seqlmplot(segment = segment, values = values, annotation = annotation, genome_information = genome_information, expand = expand)
58+
59+
plot = draw_seqlmplot(df = data$df, box = data$box, ylim, expand = expand)
60+
61+
if(is.na(filename)){
62+
print(plot)
63+
}
64+
else{
65+
ggsave(filename, plot, ...)
66+
}
67+
}
68+
69+
70+
#' Visualise the regions
71+
#'
72+
#' Generate plots about the seqlm results
73+
#'
74+
#' The number of results from \code{\link{seqlm}} can be large
75+
#' and visualising all these regions might not be desirable.
76+
#' Therefore, it is advisable to filter the results befor
77+
#' plotting.
78+
#'
79+
#' @param segments selection of significant regions by \code{\link{seqlm}} function
80+
#' @param values same values matrix that was used in \code{seqlm}
81+
#' @param genome_information same genome_information object that was used in \code{seqlm}
82+
#' @param annotation same annotation vector that was used in \code{seqlm}
83+
#' @param expand number of basepairs to extend the region on plot
84+
#' @param ylim two element vector giving the lower and higher limit of the y axis
85+
#' @param dir directory where to put the images, of NA then plots are drawn into the plotting window
86+
#' @param filetype picture filetype
87+
#' @param ... extra parameters to \code{\link{ggsave}}
88+
#' @author Raivo Kolde <rkolde@@gmail.com>
89+
#'
90+
#' @export
91+
seqlmplots = function(segments, values, genome_information, annotation, expand = 100, ylim = extendrange(values), dir = NA, filetype = "png", ...){
92+
# Match values and genome_information
93+
mp = match_positions(values, genome_information)
94+
values = mp$values
95+
genome_information = mp$genome_information
96+
97+
# Draw pictures
98+
for(i in 1:length(segments)){
99+
if(is.na(dir)){
100+
filename = NA
101+
}
102+
else{
103+
filename = file.path(dir, sprintf("%d.%s", i, filetype))
104+
}
105+
seqlmplot(segment = segments[i], values = values, annotation = annotation, genome_information = genome_information, expand = expand, ylim = ylim, filename = filename, ...)
106+
107+
}
108+
}
109+
110+
111+
## seqlm raport
112+
raport_template = '
113+
<!DOCTYPE html>
114+
<html>
115+
<head>
116+
<style type="text/css">.knitr.inline {
117+
background-color: #f7f7f7;
118+
border: solid 0px #b0b0b0
119+
}
120+
.message {
121+
font-style: italic
122+
}
123+
.source,.output,.warning,.error,.message {
124+
padding: 0em 1em;
125+
border: solid 1px #f7f7f7
126+
}
127+
.source {
128+
background-color: #f7f7f7
129+
}
130+
.rimage.left {
131+
text-align: left
132+
}
133+
.rimage.right {
134+
text-align: right
135+
}
136+
.rimage.center {
137+
text-align: center
138+
}
139+
.source {
140+
color: #333
141+
}
142+
.background {
143+
color: #f7f7f7
144+
}
145+
</style>
146+
<title>%s</title>
147+
</head>
148+
<body>
149+
150+
<code class="knitr inline">
151+
<h1> %s </h1>
152+
153+
%s
154+
</code>
155+
</body>
156+
</html>
157+
'
158+
chunk_template = '
159+
<h2> Segment %d </h2>
160+
161+
<table>
162+
<tr>
163+
<td><b>Location</b></td>
164+
<td>%s</td>
165+
</tr>
166+
%s
167+
</table>
168+
169+
<div class="rimage default"><img src="%s" class="plot"/></div>
170+
'
171+
172+
annotation_template = '
173+
<tr>
174+
<td><b>%s</b></td>
175+
<td>%s</td>
176+
</tr>
177+
'
178+
179+
location_template = 'chr%s:%d-%d'
180+
181+
annotation_table = function(x){
182+
res = paste(sprintf(annotation_template, "Coefficient", round(x$coef, 3)),
183+
sprintf(annotation_template, "FDR", sprintf("%.3g", x$fdr)),
184+
sprintf(annotation_template, "Bonferroni", sprintf("%.3g", x$bonferroni)),
185+
sprintf(annotation_template, "Length in probes", x$length),
186+
sprintf(annotation_template, "Length in bp", end(x) - start(x))
187+
)
188+
189+
xx = as.data.frame(elementMetadata(x))
190+
n = which(colnames(xx) == "bonferroni")
191+
192+
if(!(n == ncol(xx))){
193+
for(i in (n + 1):ncol(xx)){
194+
res = paste(res, sprintf(annotation_template, colnames(xx)[i], xx[1, i]), sep = "\n")
195+
}
196+
}
197+
198+
return(res)
199+
}
200+
201+
202+
#' Generate the HTML report for the seqlm results
203+
#'
204+
#' Generate the HTML report for the seqlm results
205+
#'
206+
#' @param segments selection of significant regions by \code{\link{seqlm}} function
207+
#' @param values same values matrix that was used in \code{seqlm}
208+
#' @param genome_information same genome_information object that was used in \code{seqlm}
209+
#' @param annotation same annotation vector that was used in \code{seqlm}
210+
#' @param expand number of basepairs to extend the region on plot
211+
#' @param ylim two element vector giving the lower and higher limit of the y axis
212+
#' @param dir directory where to put the page, if the directory does not exist it will be created
213+
#' @param width picture width in inches
214+
#' @param height picture height in inches
215+
#' @param dpi dots per inch, to calibrate the picture size in pixels
216+
#' @param main title for the report
217+
#'
218+
#' @author Kaspar Martens <kmartens@@ut.ee> Raivo Kolde <rkolde@@gmail.com>
219+
#'
220+
#' @export
221+
seqlmreport = function(segments, values, genome_information, annotation, ylim = extendrange(values), dir = NA, expand = 100, width = 8, height = 5, dpi = 100, main = "seqlm results"){
222+
# Create main directory
223+
if(!file.exists(dir)){
224+
dir.create(dir)
225+
}
226+
227+
# Create image directory
228+
img_dir = file.path(dir, "img")
229+
if(!file.exists(img_dir)){
230+
dir.create(img_dir)
231+
}
232+
233+
# Create images
234+
seqlmplots(segments, values, genome_information, annotation, ylim = ylim, dir = img_dir, expand = expand, width = width, height = height, dpi = dpi)
235+
236+
# Create HTML file
237+
chunks = ''
238+
239+
for(i in 1:length(segments)){
240+
location = sprintf(location_template, seqnames(segments[i]), start(segments[i]), end(segments[i]))
241+
242+
chunk = sprintf(chunk_template, i, location, annotation_table(segments[i]), sprintf("img/%d.png", i))
243+
244+
chunks = paste(chunks, chunk, sep = "\n\n")
245+
}
246+
247+
page = sprintf(raport_template, main, main, chunks)
248+
249+
cat(page, file = file.path(dir, "index.html"))
250+
}
251+
252+
253+
##

README.md

Lines changed: 23 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,24 +3,6 @@ seqlm
33

44
An R package for identification of differentially methylated regions (DMRs) from high density chip, for example Illumina 450K, data.
55

6-
Method
7-
------
8-
The seqlm method works in three stages.
9-
10-
**Stage 1:** The genome is divided into smaller pieces based on a genomic distance cutoff.
11-
12-
**Stage 2:** In each piece probes are segmented into regions that have approximately constant difference between the groups of interest. Example of the segmentation and its process is shown in [schema].
13-
14-
* In sliding windows with variable sizes we fit a linear models to the data.
15-
* For each model we record the description length - the amount of bits needed to describe the data using the model
16-
* Using dynamic programming we find the segmentation that minimizes total description length
17-
18-
**Stage 3:** We assess the relevance of each segment, by using a mixed model where the classes are a fixed effect and a sample is a random effect. This model takes into account the repeated nature of the consecutive methylation measurements. The segments are ordered by their significance.
19-
20-
![Example of seqlm segmentation][schema]
21-
22-
[schema]: https://raw.github.com/raivokolde/seqlm/gh-pages/pics/schema.png "Example of seqlm segmentation"
23-
246
Installation
257
------------
268
The most convenient way to install the package is by using the `devtools` package.
@@ -47,7 +29,7 @@ An example dataset `tissue_small` is included in the package. It contains data c
4729

4830
```s
4931
data(tissue_small)
50-
seqlm(values = tissue_small$values, genome_information = tissue_small$genome_information, annotation = tissue_small$annotation)
32+
segments = seqlm(values = tissue_small$values, genome_information = tissue_small$genome_information, annotation = tissue_small$annotation)
5133
```
5234

5335
The result of the analysis is a GRanges object containing the locations of the regions and associated statistics.
@@ -57,11 +39,32 @@ The analysis can be time consuming, if the whole genome is analysed at once. If
5739
```s
5840
library(doParallel)
5941
registerDoParallel(cores = 2)
60-
seqlm(values = tissue_small$values, genome_information = tissue_small$genome_information, annotation = tissue_small$annotation)
42+
segments = seqlm(values = tissue_small$values, genome_information = tissue_small$genome_information, annotation = tissue_small$annotation)
6143
```
6244

45+
To visualise the results it is possible to plot the most imortant sites and generate a HTML report
46+
temp = tempdir()
47+
seqlmreport(segments[1:10], tissue_small$values, tissue_small$genome_information, tissue_small$annotation, dir = temp)
6348

49+
[Here](???) Is an example of the resulting file.
6450

51+
Method
52+
------
53+
The seqlm method works in three stages.
54+
55+
**Stage 1:** The genome is divided into smaller pieces based on a genomic distance cutoff.
56+
57+
**Stage 2:** In each piece probes are segmented into regions that have approximately constant difference between the groups of interest. Example of the segmentation and its process is shown in [schema].
58+
59+
* In sliding windows with variable sizes we fit a linear models to the data.
60+
* For each model we record the description length - the amount of bits needed to describe the data using the model
61+
* Using dynamic programming we find the segmentation that minimizes total description length
62+
63+
**Stage 3:** We assess the relevance of each segment, by using a mixed model where the classes are a fixed effect and a sample is a random effect. This model takes into account the repeated nature of the consecutive methylation measurements. The segments are ordered by their significance.
64+
65+
![Example of seqlm segmentation][schema]
66+
67+
[schema]: https://raw.github.com/raivokolde/seqlm/gh-pages/pics/schema.png "Example of seqlm segmentation"
6568

6669

6770

0 commit comments

Comments
 (0)