raivokolde
diff --git a/‎DESCRIPTION‎
Lines changed: 1 addition & 0 deletions b/‎DESCRIPTION‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎NAMESPACE‎
Lines changed: 2 additions & 0 deletions b/‎NAMESPACE‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎R/segmentation.r‎
Lines changed: 27 additions & 3 deletions b/‎R/segmentation.r‎
Lines changed: 27 additions & 3 deletions
diff --git a/‎R/visualisation.r‎
Lines changed: 253 additions & 0 deletions b/‎R/visualisation.r‎
Lines changed: 253 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 23 additions & 20 deletions b/‎README.md‎
Lines changed: 23 additions & 20 deletions
@@ -26,3 +26,4 @@ Collate:
     'seqlm-package.r'
     'segmentation.r'
     'datasets.r'
+    'visualisation.r'
@@ -1,4 +1,6 @@
 export(seqlm)
+export(seqlmplots)
+export(seqlmreport)
 import(GenomicRanges)
 import(IRanges)
 import(Matrix)
 
@@ -370,8 +370,12 @@ additional_annotation = function(res, df){
 #' 
 #' Segments genome based on given linear models and and calculates the significance of regions
 #' 
-#' Implementation details
-#'
+#' The analysis can be time consuming if the whole genome is analysed at once.
+#'  If the computer has multicore capabilities it is easy to parallelize the 
+#' calculations. We use the \code{\link{foreach}} framework by Revolution 
+#' Computing for parallelization. To enable the parallelization one has to 
+#' register the parallel backend before and this will be used by seqlm.
+#' 
 #' @param values a matrix where columns are samples and rows correspond to the sites
 #' @param genome_information \code{\link{GRanges}} object giving the positions 
 #' of the probes, names should correspond to rownames of values. 
@@ -397,7 +401,20 @@ additional_annotation = function(res, df){
 #' 
 #' \dontrun{
 #' data(tissue_small)
-#' seqlm(tissue_small$values, tissue_small$genome_information, tissue_small$annotation)
+#' 
+#' # Find regions 
+#' segments = seqlm(tissue_small$values, tissue_small$genome_information, tissue_small$annotation)
+#' 
+#' # The calculation can be parallelized by registering a parallel processing backend
+#' library(doParallel)
+#' registerDoParallel(cores = 2)
+#' segments = seqlm(values = tissue_small$values, genome_information = tissue_small$genome_information, annotation =  tissue_small$annotation)
+#' 
+#' # To visualise the results it is possible to plot the most important sites and generate an HTML report
+#' temp = tempdir()
+#' seqlmreport(segments[1:10], tissue_small$values, tissue_small$genome_information, tissue_small$annotation, dir = temp)
+#' 
+#' # To see the results open the index.html file generated into the directory temp
 #' }
 #' @export
 seqlm = function(values, genome_information, annotation, n0 = 1, m0 = 10, sig0 = NA, alpha = 2, max_block_length = 50, max_dist = 1000){
@@ -447,6 +464,13 @@ seqlm = function(values, genome_information, annotation, n0 = 1, m0 = 10, sig0 =
 		elementMetadata(res) = DataFrame(elementMetadata(res), segment_ann)
 	}
 
+	# Add probe names 
+	names = names(genome_information)
+	elementMetadata(res) = DataFrame(elementMetadata(res), probes = apply(cbind(res$startIndex, res$endIndex), 1, function(x) paste0(names[x[1]:x[2]], collapse = ";")))
+	
+	# Remove startIndex and endIndex
+	elementMetadata(res) = elementMetadata(res)[-(2:3)] 
+	
 	return (res[order(abs(res$tstat), decreasing=TRUE)])
 }
 ##
 
@@ -0,0 +1,253 @@
+## Functions to visualize the regions
+# Prepare the data needed to plot one segment.
+#
+# Arguments:
+#   segment            - range object describing the segment to plot
+#   values             - measurements; rows are selected by the names of
+#                        genome_information, columns are samples
+#   annotation         - per-sample annotation, in the order of colnames(values)
+#   genome_information - named range object with the probe positions
+#   expand             - amount by which the segment is widened on both sides
+#
+# Returns a list with
+#   df  - long format data frame with columns Probe, Sample, value,
+#         Annotation, Region and Position
+#   box - one-row data frame with the start and end of the box that marks
+#         the segment on the plot
+fortify_seqlmplot = function(segment, values, annotation, genome_information, expand){
+	# Widen the segment by `expand` on both sides
+	segment_expanded = segment
+	start(segment_expanded) = start(segment_expanded) - expand
+	end(segment_expanded) = end(segment_expanded) + expand
+
+	gi = subsetByOverlaps(genome_information, segment)
+	gi_expanded = subsetByOverlaps(genome_information, segment_expanded)
+	
+	# Without any probe in the segment the box limits below would be infinite
+	if(length(gi) == 0){
+		stop("segment does not overlap any position in genome_information", call. = FALSE)
+	}
+	
+	# drop = FALSE keeps the probe x sample layout when a single probe is selected
+	values0 = as.matrix(values[names(gi_expanded), , drop = FALSE])
+	
+	# Bring into long format
+	df = melt(values0, varnames = c("Probe", "Sample"))
+	
+	# Add annotations
+	a = data.frame(Sample = colnames(values), Annotation = annotation)
+	df = merge(df, a)
+	
+	# Add region information: Region is TRUE for probes inside the segment itself
+	probe_det = data.frame(Probe = names(gi_expanded), Region = !(gi_expanded %outside% gi), Position = start(gi_expanded)) 
+	df = merge(df, probe_det)
+	
+	# Calculate the box that shows region
+	reg = probe_det[probe_det$Region,]
+	nonreg = probe_det[!probe_det$Region,]
+	
+	reg_min = min(reg$Position)
+	reg_max = max(reg$Position)
+	
+	smaller = nonreg[nonreg$Position < reg_min,]
+	bigger = nonreg[nonreg$Position > reg_max,]
+	
+	# The box reaches at most 100 positions outside the segment and never
+	# further than half way to the nearest probe outside of it
+	if(nrow(smaller) > 0){
+		gap = reg_min - max(smaller$Position)
+		box_start = reg_min - min(100, gap / 2)
+	}
+	else{
+		box_start = reg_min - 100
+	}
+	
+	if(nrow(bigger) > 0){
+		gap = min(bigger$Position) - reg_max
+		box_end = reg_max + min(100, gap / 2)
+	}
+	else{
+		box_end = reg_max + 100
+	}
+	
+	box = data.frame(start = box_start, end = box_end)
+	
+	return(list(df = df, box = box))
+}
+
+# Build the plot for one segment: one line with points per sample, coloured by
+# annotation, and a grey rectangle between box$start and box$end.
+#   df     - long format data with Position, value, Annotation and Sample
+#   box    - data frame with the start and end of the rectangle
+#   ylim   - limits of the y axis
+#   expand - margin added to both sides of the box to get the x axis limits
+draw_seqlmplot = function(df, box, ylim, expand){
+	x_range = c(box$start - expand, box$end + expand)
+	region_box = geom_rect(aes(xmin = box$start, xmax = box$end, ymin = -Inf, ymax = Inf), colour = "grey20", fill = "grey95")
+	
+	fig = qplot(x = Position, y = value, geom = c("line", "point"), colour = Annotation, group = Sample, data = df) +
+		region_box +
+		geom_point() +
+		geom_line() +
+		geom_jitter(position = position_jitter(width = .1)) +
+		scale_y_continuous(limits = ylim) +
+		scale_x_continuous(limits = x_range) +
+		theme_bw()
+	
+	# The plot is returned invisibly so that it is not printed automatically
+	invisible(fig)
+}
+	
+# Plot a single segment. The plot is printed when filename is NA, otherwise it
+# is saved to filename with ggsave; arguments in ... are passed on to ggsave.
+seqlmplot = function(segment, values, annotation, genome_information, expand, ylim = extendrange(values), filename = NA, ...){
+	plot_data = fortify_seqlmplot(segment = segment, values = values, annotation = annotation, genome_information = genome_information, expand = expand)
+	fig = draw_seqlmplot(df = plot_data$df, box = plot_data$box, ylim, expand = expand)
+	
+	if(!is.na(filename)){
+		ggsave(filename, fig, ...)
+	}
+	else{
+		print(fig)
+	}
+}
+	
+ 
+#' Visualise the regions
+#' 
+#' Generate plots about the seqlm results
+#' 
+#' The number of results from \code{\link{seqlm}} can be large 
+#' and visualising all these regions might not be desirable. 
+#' Therefore, it is advisable to filter the results before 
+#' plotting. One plot is produced per segment; when \code{dir} is given the 
+#' files are named by the index of the segment, e.g. \code{1.png}. An empty 
+#' selection of segments produces no plots.
+#'
+#' @param segments selection of significant regions by \code{\link{seqlm}} function 
+#' @param values same values matrix that was used in \code{seqlm}
+#' @param genome_information same genome_information object that was used in \code{seqlm}
+#' @param annotation same annotation vector that was used in \code{seqlm}
+#' @param expand number of basepairs to extend the region on plot
+#' @param ylim two element vector giving the lower and higher limit of the y axis
+#' @param dir  directory where to put the images, if NA then plots are drawn into the plotting window
+#' @param filetype picture filetype, used as the file extension
+#' @param  ... extra parameters to \code{\link{ggsave}}
+#' @author  Raivo Kolde <rkolde@@gmail.com>
+#' 
+#' @export
+seqlmplots = function(segments, values, genome_information, annotation, expand = 100, ylim = extendrange(values), dir = NA, filetype = "png", ...){
+	# Match values and genome_information
+	mp = match_positions(values, genome_information)
+	values = mp$values
+	genome_information = mp$genome_information
+	
+	# Draw pictures; seq_along makes an empty selection a no-op
+	for(i in seq_along(segments)){
+		if(is.na(dir)){
+			filename = NA
+		}
+		else{
+			filename = file.path(dir, sprintf("%d.%s", i, filetype))
+		}
+		seqlmplot(segment = segments[i], values = values, annotation = annotation, genome_information = genome_information,  expand = expand, ylim = ylim, filename = filename, ...)
+		
+	}
+}
+	
+
+## seqlm report
+# Skeleton of the HTML page. The three %s placeholders are filled with sprintf
+# in seqlmreport: the title (once for <title>, once for <h1>) and the
+# concatenated segment chunks.
+raport_template = '
+<!DOCTYPE html>
+<html>
+<head>
+<style type="text/css">.knitr.inline {
+	background-color: #f7f7f7;
+	border: solid 0px #b0b0b0
+}
+.message {
+	font-style: italic
+}
+.source,.output,.warning,.error,.message {
+	padding: 0em 1em;
+	border: solid 1px #f7f7f7
+}
+.source {
+	background-color: #f7f7f7
+}
+.rimage.left {
+	text-align: left
+}
+.rimage.right {
+	text-align: right
+}
+.rimage.center {
+	text-align: center
+}
+.source {
+	color: #333
+}
+.background {
+	color: #f7f7f7
+}
+</style>
+<title>%s</title>
+</head>
+<body>
+
+<code class="knitr inline">
+<h1> %s </h1>
+
+%s
+</code>
+</body>
+</html>
+'
+# HTML for one segment. Placeholders, in the order they are filled in
+# seqlmreport: segment number (%d), location string, extra table rows and
+# the path of the image.
+chunk_template = '
+<h2> Segment %d </h2> 
+
+<table>
+	<tr>
+		<td><b>Location</b></td>
+		<td>%s</td>
+	</tr>
+	%s
+</table>
+
+<div class="rimage default"><img src="%s" class="plot"/></div>
+'
+
+# One table row of the segment description: a bold label and its value
+annotation_template = '
+<tr>
+	<td><b>%s</b></td>
+	<td>%s</td>
+</tr>
+'
+
+# Location string; filled in seqlmreport with the sequence name, start and end
+location_template = 'chr%s:%d-%d'
+
+# Build the HTML table rows that describe one segment x: the fixed statistics
+# first, followed by one row for every metadata column that comes after the
+# "bonferroni" column.
+annotation_table = function(x){
+	table_row = function(label, value) sprintf(annotation_template, label, value)
+	
+	res = paste(
+		table_row("Coefficient", round(x$coef, 3)), 
+		table_row("FDR", sprintf("%.3g", x$fdr)),
+		table_row("Bonferroni", sprintf("%.3g", x$bonferroni)), 
+		table_row("Length in probes", x$length),
+		table_row("Length in bp", end(x) - start(x))
+	)
+	
+	meta = as.data.frame(elementMetadata(x))
+	last_fixed = which(colnames(meta) == "bonferroni")
+	
+	# Columns after "bonferroni" are additional annotations
+	if(last_fixed != ncol(meta)){
+		for(j in (last_fixed + 1):ncol(meta)){
+			res = paste(res, table_row(colnames(meta)[j], meta[1, j]), sep = "\n")
+		}
+	}
+	
+	res
+}
+
+ 
+#' Generate the HTML report for the seqlm results
+#' 
+#' Generate the HTML report for the seqlm results
+#'
+#' The plots of the segments are written into the subdirectory \code{img} of 
+#' \code{dir} and the report itself into the file \code{index.html} in \code{dir}.
+#'
+#' @param segments selection of significant regions by \code{\link{seqlm}} function 
+#' @param values same values matrix that was used in \code{seqlm}
+#' @param genome_information same genome_information object that was used in \code{seqlm}
+#' @param annotation same annotation vector that was used in \code{seqlm}
+#' @param expand number of basepairs to extend the region on plot
+#' @param ylim two element vector giving the lower and higher limit of the y axis
+#' @param dir directory where to put the page, if the directory does not exist 
+#' it will be created. Has to be specified, the default \code{NA} gives an error.
+#' @param width picture width in inches
+#' @param height picture height in inches
+#' @param dpi dots per inch, to calibrate the picture size in pixels
+#' @param main title for the report
+#' 
+#' @author  Kaspar Martens <kmartens@@ut.ee> Raivo Kolde <rkolde@@gmail.com>
+#' 
+#' @export
+seqlmreport = function(segments, values, genome_information, annotation, ylim = extendrange(values), dir = NA, expand = 100, width = 8, height = 5, dpi = 100, main = "seqlm results"){
+	# The report is written to disk, so a directory is required
+	if(length(dir) != 1 || is.na(dir)){
+		stop("dir has to be the path of the directory where the report is written", call. = FALSE)
+	}
+	
+	# Create main directory, including missing parent directories
+	if(!file.exists(dir)){
+		dir.create(dir, recursive = TRUE)
+	}
+	
+	# Create image directory
+	img_dir = file.path(dir, "img")
+	if(!file.exists(img_dir)){
+		dir.create(img_dir)
+	}
+	
+	# Create images
+	seqlmplots(segments, values, genome_information, annotation, ylim = ylim, dir = img_dir, expand = expand, width = width, height = height, dpi = dpi)
+	
+	# Create HTML file
+	chunks = ''
+	
+	# seq_along gives an empty report instead of an error when there are no segments
+	for(i in seq_along(segments)){
+		location = sprintf(location_template, seqnames(segments[i]), start(segments[i]), end(segments[i]))
+		
+		# The image names match the files written by seqlmplots with its default filetype
+		chunk = sprintf(chunk_template, i, location, annotation_table(segments[i]), sprintf("img/%d.png", i))
+		
+		chunks = paste(chunks, chunk, sep = "\n\n")
+	}
+	
+	page = sprintf(raport_template, main, main, chunks)
+	
+	cat(page, file = file.path(dir, "index.html"))
+}
+
+
+##
@@ -3,24 +3,6 @@ seqlm
 
 An R package for identification of differentially methylated regions (DMRs) from high density chip, for example Illumina 450K, data. 
 
-Method
-------
-The seqlm method works in three stages. 
-
-**Stage 1:** The genome is divided into smaller pieces based on a genomic distance cutoff. 
-
-**Stage 2:** In each piece probes are segmented into regions that have approximately constant difference between the groups of interest. Example of the segmentation and its process is shown in [schema].
-
-* In sliding windows with variable sizes we fit a linear models to the data.
-* For each model we record the description length - the amount of bits needed to describe the data using the model
-* Using dynamic programming we find the segmentation that minimizes total description length
-
-**Stage 3:** We assess the relevance of each segment, by using a mixed model where the classes are a fixed effect and a sample is a random effect. This model takes into account the repeated nature of the consecutive methylation measurements. The segments are ordered by their significance.
-
-![Example of seqlm segmentation][schema]
-
-[schema]: https://raw.github.com/raivokolde/seqlm/gh-pages/pics/schema.png "Example of seqlm segmentation"
-
 Installation
 ------------
 The most convenient way to install the package is by using the `devtools` package.
@@ -47,7 +29,7 @@ An example dataset `tissue_small` is included in the package. It contains data c
 
 ```s
 data(tissue_small)
-seqlm(values = tissue_small$values, genome_information = tissue_small$genome_information, annotation =  tissue_small$annotation)
+segments = seqlm(values = tissue_small$values, genome_information = tissue_small$genome_information, annotation =  tissue_small$annotation)
 ```
 
 The result of the analysis is a GRanges object containing the locations of the regions and associated statistics. 
@@ -57,11 +39,32 @@ The analysis can be time consuming, if the whole genome is analysed at once. If
 ```s
 library(doParallel)
 registerDoParallel(cores = 2)
-seqlm(values = tissue_small$values, genome_information = tissue_small$genome_information, annotation =  tissue_small$annotation)
+segments = seqlm(values = tissue_small$values, genome_information = tissue_small$genome_information, annotation =  tissue_small$annotation)
 ```
 
+To visualise the results it is possible to plot the most important sites and generate an HTML report.
+
+```s
+temp = tempdir()
+seqlmreport(segments[1:10], tissue_small$values, tissue_small$genome_information, tissue_small$annotation, dir = temp)
+```
 
+[Here](???) is an example of the resulting file.
 
+Method
+------
+The seqlm method works in three stages. 
+
+**Stage 1:** The genome is divided into smaller pieces based on a genomic distance cutoff. 
+
+**Stage 2:** In each piece probes are segmented into regions that have approximately constant difference between the groups of interest. Example of the segmentation and its process is shown in [schema].
+
+* In sliding windows with variable sizes we fit a linear models to the data.
+* For each model we record the description length - the amount of bits needed to describe the data using the model
+* Using dynamic programming we find the segmentation that minimizes total description length
+
+**Stage 3:** We assess the relevance of each segment, by using a mixed model where the classes are a fixed effect and a sample is a random effect. This model takes into account the repeated nature of the consecutive methylation measurements. The segments are ordered by their significance.
+
+![Example of seqlm segmentation][schema]
+
+[schema]: https://raw.github.com/raivokolde/seqlm/gh-pages/pics/schema.png "Example of seqlm segmentation"