\documentclass[letterpaper,12pt]{article}

\usepackage{epsfig,graphicx,graphics,amssymb,amsmath,latexsym,amsfonts,url,fullpage}
\usepackage[authoryear,round]{natbib}
\usepackage[plainpages=false,pdfpagelabels]{hyperref}

%\usepackage{/usr/lib/R/share/texmf/Sweave}
\usepackage[noblocks]{authblk}
 
% instead of Sweave library
\RequirePackage[T1]{fontenc}
\RequirePackage{graphicx,ae,fancyvrb}
\IfFileExists{upquote.sty}{\RequirePackage{upquote}}{}
\setkeys{Gin}{width=0.8\textwidth}
\DefineVerbatimEnvironment{Sinput}{Verbatim}{fontshape=sl}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{}
\DefineVerbatimEnvironment{Scode}{Verbatim}{fontshape=sl}
\newenvironment{Schunk}{}{} 

%%%%%%%%%%%%%%%%%%%%%%%%%
% For colors

\usepackage{color}
\definecolor{red}{rgb}{1, 0, 0}
\definecolor{green}{rgb}{0, 1, 0}
\definecolor{blue}{rgb}{0, 0, 1}
\definecolor{myblue}{rgb}{0.25, 0, 0.75}
\definecolor{myred}{rgb}{0.75, 0, 0}
\definecolor{gray}{rgb}{0.5, 0.5, 0.5}
\definecolor{purple}{rgb}{0.65, 0, 0.75}
\definecolor{orange}{rgb}{1, 0.65, 0}




\title{Supervised detection of conserved motifs in DNA sequences with \texttt{cosmo}}

\author{Oliver Bembom, Fabian Gallusser, Sandrine Dudoit\\
Division of Biostatistics, University of California, Berkeley}


\date{\today}

% \VignetteIndexEntry{Supervised detection of conserved motifs in DNA sequences with cosmo}
% \VignetteDepends{seqLogo}
% \VignetteKeyword{motif search}

\begin{document}

\maketitle

\tableofcontents


\section{Introduction}

\subsection{Overview}
The Bioconductor R package \texttt{cosmo} implements an algorithm for
searching a set of unaligned DNA sequences for a shared motif that
may, for example, represent a common transcription factor binding site
\citep{Bembom}.
\texttt{cosmo} is extends the popular motif discovery tool \texttt{MEME}
\citep{Bailey95a} in that it allows the search to be supervised
by specifying a set of constraints that the motif to be discovered must
satisfy.  
Such constraints may, for example, consist of bounds on the
information content across certain regions of the unknown motif and
can often be formulated on the basis of prior knowledge about the
structure of the transcription factor in question. 
The user is not required to specify \textit{a priori} the width of the
unknown motif, the distribution of motif occurrences among the input
sequences (OOPS, ZOOPS, or TCM), or a single correct constraint set.
Instead these three model parameters can be selected in a
data-adaptive manner.

\subsection{Motivation}

An important goal in contemporary biology consists of deciphering the
complex network that regulates the expression of an organism's genome.
A central role in this network is played by transcription factors that
regulate gene expression by binding to conserved short sequences in
the vicinity of their target genes \citep{Davidson}.  The discovery
and description of these \textit{binding sites} or \textit{motifs} has
therefore been at the heart of efforts aimed at understanding gene
regulatory networks.

Traditionally, experimental methods have been used for this purpose,
leading to a set of target sites from multiple genes that could then
be aligned to estimate the position weight matrix (PWM) of the motif -
a $4 \times W$ matrix in which position $(j,w)$ gives the probability
of observing nucleotide $j$ in position $w$ of a motif of length $W$.
Currently, however, such position weight matrix estimates are more
commonly obtained by applying pattern discovery algorithms to
functional genomics data.  Modern high-throughput methods such as cDNA
microarrays \citep{Roth, Eisen98,Bussemaker}or SAGE \citep{Powell},
for example, can identify sets of co-regulated genes whose promoter
sequences can then be scanned for statistically over-represented
patterns that are likely transcription factor binding sites
\citep{LawrenceEtAl, Bussemaker}.

While this approach has proven fruitful for the discovery of such
binding sites in yeast, its application to metazoan genomes has met
with considerable difficulty since binding sites tend to be spread out
over much larger regions of genomic sequence.  Efforts at tackling
this signal-to-noise problem have concentrated mostly on phylogenetic
footprinting, i.e.\ cross-species sequence comparisons that remove
noise by focusing on sequences under selective pressure
\citep{Fickett}.  \cite{Sandelin}, however, recently described an
alternative approach that is based on prior knowledge about the
structural class of the mediating transcription factor of interest.
Such knowledge is often available on the basis of genetics or
similarities between biological systems.  For most structurally
related families of transcription factors, there are clear
similarities in the sequences of the sites to which they bind
\citep{Luscombe}.  \cite{Eisen05}, for example, has demonstrated that
motifs bound by proteins with structurally similar DNA binding domains
tend to have similar information content profiles \citep{Schneider}.
Prior knowledge about the structural class of the mediating
transcription factor thus often translates into constraints on the
unknown position weight matrix that can be used to enhance the
performance of pattern discovery algorithms.  \cite{Sandelin} show
that the benefit of such prior knowledge is comparable to the
specificity improvements obtained through phylogenetic footprinting.

Currently, only a few motif finding algorithms such as ANN-Spec
\citep{Workman} or the Gibbs motif sampler \citep{Neuwald, Thompson}
are capable of incorporating prior knowledge about the unknown motif.
These algorithms generally require the user to supply an appropriate
prior distribution on the entries of the corresponding position weight
matrix.  \texttt{cosmo} instead allows the user to target the motif
search by specifying a set of constraints that the 
unknown position weight matrix must satisfy.  The algorithm is based
on a probabilistic model that describes the DNA sequences of interest
through a two-component multinomial mixture model as first introduced
by \cite{Lawrence}, with estimates of the position weight matrix
entries obtained by maximizing the observed data likelihood over the
smaller parameter space corresponding to the imposed constraints.


\section{Methods}

\subsection{Probabilistic models}

\label{sec.models}

\subsubsection{Motifs and background}

\label{sec.motif}

All of the models described below assume that sequences are generated
according to a multinomial mixture model with two components, one that
describes the distribution of nucleotides in the motif, and one that
describes the distribution of nucleotides in the background.
Nucleotides that are part of the length-$W$ transcription factor
binding site are assumed to be generated independently of each other
according to multinomial distributions that are allowed to be
different for each nucleotide in the motif.  Nucleotides that are not
part of a motif are assumed to be generated according to a $k$-th
order Markov model that allows the parameter vector of the multinomial
distribution of the current nucleotide to depend on the previous $k$
nucleotides.

 
\subsubsection{OOPS}
 
The one-occurrence-per-sequence (OOPS) model assumes that every
sequence contains exactly one occurrence of the motif.  For a given
sequence of length $L_i$, any of the $L_i-W+1$ eligible motif starts
are equally likely to be the start site of the motif.  At a given
start site, the motif is equally likely to be present in either one of
the two possible orientations.  For example, a motif with consensus
sequence ATGCCC may be present as ATGCCC or in its reverse complement
orientation as GGGCAT.


\subsubsection{ZOOPS}

The zero-or-one-occurrence-per-sequence (ZOOPS) model assumes that a
given sequence contains one occurrence of the motif with probability
$\pi$ and no occurrences of the motif with probability $1 - \pi$.  For
a given sequence that contains a motif, any of the $L_i-W+1$ eligible
motif starts are equally to be the start site of the motif.  At a
given start site, the motif is equally likely to be present in either
one of the two possible orientations.

\subsubsection{TCM}

The OOPS and ZOOPS models allow at most one occurrence of the motif
per sequence. However, there are many biological examples of DNA
sequences that contain multiple occurrences of the same transcription
factor binding site. \cite{Bailey95a} propose a two-component mixture
(TCM) model for this situation that allows each sequence to contain an
arbitrary number of non-overlapping occurrences of the motif.  This
model assumes that a given sequence is generated by repeatedly
deciding whether to insert a background nucleotide or a motif of width
$W$. As before, a motif is inserted in either one of the two possible
orientations with equal probability.  We denote by $\lambda$ the
probability that a motif is inserted at a given position rather than a
background nucleotide.


\subsection{Constraints}


\subsubsection{Motif intervals}

Many motifs can be conceptually divided into separate intervals that
each correspond to a distinct set of constraints on the position
weight matrix.  In order to specify constraints for \texttt{cosmo}, we
hence first specify how the motif can be divided into separate
intervals.  Since the true motif width is usually unknown, forcing
\texttt{cosmo} to search a range of candidate values, we have to
specify how the width of each interval changes with varying motif
widths.  We offer three possibilities: The length of an interval may
be a fixed number of based pairs no matter what the length of the
whole motif is; alternatively, the length of an interval may always be
a fixed proportion of the length of the whole motif; finally, a motif
may contain one interval that for each motif width is assigned
whatever number of base pairs is left after all intervals of the first
two kinds have been allocated.  Once the motif has been divided into
separate intervals, we can add a number of different constraints to
individual intervals or to the motif as a whole.


\subsubsection{Bound constraints on the information content across an interval}
An important summary measure of a given position weight matrix is its
information content profile.  The information content at position $w$
of the motif is given by
\[ IC(w) = \log_2(J) + \sum_{j=1}^J p_{wj}\log_2(p_{wj}) = \log_2(J) -
entropy(w)
\] where $J$ denotes the number of letters in the alphabet from which
the sequences have been derived so that here $J=4$.  The information
content is measured in bits and, in the case of DNA sequences, ranges
from 0 to 2 bits.  A position in the motif at which all nucleotides
occur with equal probability has an information content of 0 bits,
while a position at which only a single nucleotide can occur has an
information content of 2 bits.  The information content at a given
position can therefore be thought of as giving a measure of the
tolerance for substitutions in that position: Positions that are
highly conserved and thus have a low tolerance for substitutions
correspond to high information content, while positions with a high
tolerance for substitutions correspond to low information content.

It has been shown that the information content at a given position of
a motif is proportional to the number of contacts between the protein
and the base pair at that position.  We therefore expect higher
information content in regions of the motif that are bound by the
transcription factor than in the remaining regions.  If the
transcription factor contains two DNA-binding domains whose target
sequences in the motif are separated by a short stretch of sequence
that does not interact with the protein, we would expect that the
information content of the motif follows a high-low-high pattern.  In
this case, it may be useful to give bounds on the information content
across an individual interval. 



\subsubsection{Shape constraints on the information content profile across an interval}
We may want to exclude position weight matrices from consideration
whose information content profile is sharply discontinuous across a
given interval.  This can be achieved by requiring the information
content profile across that interval to follow a linear or monotone
shape.  In both cases, we may also give bounds on the information
content at the left and right edge of the interval.


\subsubsection{Lower bounds on nucleotide frequencies across an interval} 
We may suspect that a given nucleotide occurs with high frequency
across a certain interval.  In that case, we may require that the
average frequency of a given nucleotide $j$ across all positions in
the interval is no less than some lower bound.  Similarly, we may
require that the GC-content or AT-content across an interval is no
less than some lower bound.  If the length of the interval does not
change with varying motif width, we may also impose lower bounds for
nucleotide frequencies at a single position in that interval.


\subsubsection{Palindromic intervals}
If the DNA-binding domains of the transcription factor are
homodimeric, the DNA stretches that are bound by the transcription
factor will be palindromes of each other.  \texttt{cosmo} allows the
user to specify two intervals that are thought to be palindromic with
respect to each other.  In particular, we require that the frequency
of nucleotide $j$ at position $l$ in the interval equal the frequency
of the palindrome of nucleotide $j$ at position $l$ from the right
edge of interval, to within a given error bound.



\subsubsection{Submotifs}
Families of transcription factors are often characterized by the
occurrence of a certain submotif within the motif.  The exact location
of the submotif within the motif, however, can vary widely.  DNA
sequences bound by transcription factors with an ETS domain, for
example, all contain the stretch GGAA somewhere within the binding
site.  \texttt{cosmo} allows the user to specify such a submotif that
is then required to occur somewhere within the motif of interest, with
nucleotide frequencies of the consensus nucleotides in the submotif
roughly equal to some user-specified frequency.


\subsubsection{Bounds on differences of shape parameters}

Sometimes we may wish to impose constraints on the shape of the
information content that cannot be specified by the shape constraints
described above.  For example, we may wish to require that the
information content across a certain interval is constant, or that the
information content profile be continuous at the junction between two
intervals.

Such constraints can be formulated by giving bounds on the difference
between two shape parameters.  Recall that shape constraints on the
information content profile across an interval are parameterized
using the information content at the left and right edge of the
interval.  Hence we may require that these two quantities be
identical, corresponding to a constant information content profile
across that interval.  As another example, we might require that the
information content at the end of one interval is identical to the
information content at the beginning of the next interval,
corresponding to the constraint that the information content profile
be continuous at the junction between the two intervals.


\subsection{Model selection}


The probabilistic models described above are indexed by by following four parameters: 

\begin{enumerate}

\item The order of the background Markov model.

\item The width of the motif.

\item The type of model used to describe the data generating process
  (OOPS, ZOOPS, or TCM).

\item The set of constraints on the position weight matrix of the motif,

\end{enumerate}

For each one of these four parameters, \texttt{cosmo} allows the user
to either make a manual selection or to have the appropriate index
selected data-adaptively. 
For data-adaptive selection, the user may choose from among a number
of different model selection approaches.

\subsubsection{Likelihood-based validity functionals}

Apart from the likelihood, we also consider Akaike's Information
Criterion AIC \citep{Akaike} and the Bayesian Information Criterion
BIC \citep{Schwarz}. 
BIC has been found to work fairly well for selecting the unknown motif
width, while the likelihood and AIC generally perform quite poorly in
the model selection tasks we consider.


\subsubsection{E-value of the resulting multiple alignment}

The E-value of the multiple alignment consisting of the predicted
motif occurrences is an approximate $p$-value for testing the
null hypothesis that this alignment was obtained from a set of
sequences that were generated entirely from the background
distribution against the alternative hypothesis that the sequences
were generated according to the estimated model.

This measure of statistical significance has been found to work well
for all three model selection problems and forms the default approach
used by \texttt{cosmo} for selecting the motif width as well as the
distribution of motif occurrences.

\subsubsection{Likelihood-based cross-validation}

\label{sec.cv}

Likelihood-based cross-validation is a popular approach to model
selection in the context of density estimation \citep{vdLCV}.  The
general idea of cross-validation is to divide the original dataset
into a training set that is used to estimate the parameters of a given
model and a validation set that is then used to evaluate the
performance of this estimated model.  This performance assessment is
based on an appropriately specified loss function.  In the context of
likelihood-based cross-validation this loss function is taken to be
the Kullback-Leibler divergence ($DKL$), which gives a measure of the
distance between two densities $f$ and $g$.

Likelihood-based cross-validation generally performs rather poorly at
the model selection problems encountered here, presumably because it
is aimed at estimating the entire density of the data-generating
distribution well instead of the lower dimensional functionals of this
density we are concerned with here.


\subsubsection{Cross-validation based on the Euclidean norm}


For selection problems in which $W$ is fixed, notably in selecting
between candidate constraint sets, we may want to use as loss function
the Euclidean norm between a position weight matrix estimate obtained
under a candidate constraint set and an independent position weight
matrix estimate obtained under no constraints.

Cross-validation based on this loss function has been found to perform
well for the purpose of selecting between different candidate
constraint sets and forms the default approach employed by
\texttt{cosmo} for this problem.

\subsubsection{Separate model selection criteria for different parameters}

The user is allowed to specify different model selection criteria for
selecting the different parameters. In fact, the default settings
cause \texttt{cosmo} to select the motif width and the model type
based on the E-value criterion, but the constraint set by
cross-validation based on the Euclidean-norm loss function. 

\texttt{cosmo} handles such situations using the following profiling approach.
For each given combination of constraint set and model, it first
finds the optimal motif width based on the criterion selected for
choosing the motif width. In the next step, it selects the optimal
model type for each given constraint set at the chosen value for the
motif width. Finally, it selects the optimal constraint set for the
chosen values of the motif width and model type. This approach is
computationally attractive since \texttt{cosmo} is not required to
evaluate the different model selection criteria for all possible
candidate models.


\section{Software implementation}

\subsection{Overview}
The supervised motif detection algorithm described above is
implemented in the Bioconductor R package \texttt{cosmo}.
This package offers functions for generating random sequences
according to the three different probabilistic models, functions for
generating R objects representing sets of constraints on the
unknown position weight matrix, as well as a function for carrying out
the algorithm itself.

Before being able to access these functions, the user is required to load the
package using the \texttt{library()} command:  

<<load, echo=TRUE>>=
library(cosmo) 
@ 

\subsection{Simulating sequences}
\label{sec.sim}
The function \texttt{rseq()} allows the user to generate random
sequences according to the OOPS, ZOOPS, or TCM models:  
<<rseqArgs, echo=TRUE>>=
args(rseq)
@ 

\noindent\textbf{INPUT.}
\begin{enumerate}
\item
The number of sequences to be generated, \texttt{numSeqs}.

\item
The number of nucleotides in each sequence, \texttt{seqLength}.
This may be either a single number, in which case that number is taken to be
the common length of all sequence, or a vector of sequence lengths.

\item
The intensity parameter for the ZOOPS and TCM models, \texttt{rate}.
For the ZOOPS model, this corresponds to $\pi$; for the TCM model,
this corresponds to $\lambda$.  

\item 
The position weight matrix of the motif, \texttt{pwm}.

\item
The transition matrix for the background Markov model, \texttt{transMats}.
This is a list of matrices, with the first matrix given the transition
probabilities for the 0th order Markov model, the second matrix giving
the transition probabilities for a 1st order Markov model, and so on.

\item 
The distribution of motif occurrences, \texttt{model}. This is either
``ZOOPS'' or ``TCM''; the OOPS model is a special case of the ``ZOOPS''
model.  

\item 
A choice for whether motifs may only be inserted in the forward strand
orientation instead of allowing the reverse complement orientation as well,
\texttt{posOnly}.

\end{enumerate}

\noindent\textbf{OUTPUT.}
\begin{enumerate}
\item
  A list of the generated sequences, \texttt{seqs}.
\item
  An \texttt{align} object \texttt{motifs} summarizing the positions
  of the inserted motif occurrences.
\item
  An object \texttt{empPWM} of class \texttt{pwm} representing the position
  weight matrix obtained by aligning the inserted motifs.
\end{enumerate}


\noindent\textbf{EXAMPLE.}\\

\noindent The \texttt{cosmo} package contains the following example of a
position weight matrix for a motif of width 8:
<<pwm, echo=TRUE>>=
data(motifPWM)
motifPWM
@ 
The \texttt{seqLogo()} function found in the \texttt{seqLogo} package
can be used to produce the sequence logo shown in figure
\ref{fig.logo1} \citep{SeqLogos}.   
\begin{figure}
\begin{center}
\caption{Sequence logo of motif used for simulating sequences. \label{fig.logo1}}
<<seqLogo1, fig=TRUE, echo=FALSE, height=4,width=6>>=
seqLogo(motifPWM)
@ 
\end{center}
\end{figure}
The \texttt{cosmo} package also contains the following example of
transition matrices needed  for a second-order Markov model for the
distribution of background nucleotides: 
<<transMats, echo=TRUE>>=
data(transMats)
transMats

@ 
We may now generate 20 sequence each of length 100 nucleotides
according to the OOPS model and this position weight matrix and 
background distribution as follows:  
<<rseqEx, echo=TRUE>>=
simSeqs <- rseq(20, 100, 1.0, motifPWM, transMats, "ZOOPS")
simSeqs$motifs

@ 


\subsection{Constructing constraint sets}
\label{sec.conset}

The \texttt{cosmo} package defines classes \texttt{constraintSet} and
\texttt{constraintGroup} that represent a single constraint sets and a
collection of constraint Sets, respectively. 
A \texttt{constraintSet} object is initially constructed using the
function \texttt{makeConSet()}
<<makeConSetArg, echo=TRUE>>=
args(makeConSet)
@ 
that takes as arguments the number of intervals that the motif is to
be divided into,  the types of those intervals and the lengths of
those intervals. 
A constraint set consisting of a 3-bp interval, a variable-length
interval, and another 3-bp interval is constructed as
<<makeConSetEx, echo=TRUE>>=
conSet1 <- makeConSet(numInt=3, type=c("B","V","B"),length=c(3,NA,3))
@ 
\texttt{constraintSet} objects are displayed in the format employed by
the \texttt{cosmoweb} web application
(\url{http://cosmoweb.berkeley.edu}):
<<printConSet, echo=TRUE>>= 
conSet1
@ 
We may now construct a list of constraints that can then be added to
this constraint set. 
To require the information content across the first interval to be
bounded between 1.0 and 2.0, we construct the following
\texttt{boundCon} object: 
<<boundCon1, echo=TRUE>>=
boundCon1 <- makeBoundCon(lower=1.0, upper=2.0)
@ 
Likewise, we construct the following bound constraint for the second interval:
<<boundCon2, echo=TRUE>>=
boundCon2 <- makeBoundCon(lower=0.0, upper=1.0)
@ 
Lastly, we may construct a \texttt{palCon} object to require that
intervals 1 and 3 be palindromes of each other:  
<<palCon, echo=TRUE>>=
palCon1 <- makePalCon(int1=1, int2=3, errBnd=0.05)
@ 
These constraints can now be added to the appropriate intervals of the
initially defined \texttt{constraintSet} object: 
<<addCon, echo=TRUE>>=
constraint <- list(boundCon1, boundCon2, palCon1)
int <- list(1, 2, NA)
conSet1 <- addCon(conSet=conSet1, constraint=constraint, int=int)
conSet1
@ 
We construct a second constraint set that requires the motif to
contain the submotif TATA: 
<<conSet2, echo=TRUE>>=
conSet2 <- makeConSet(numInt=1, type="V",length=NA)
subCon1 <- makeSubMotifCon(submotif="TATA",minfreq=0.9)
conSet2 <- addCon(conSet=conSet2, constraint=subCon1, int=NA)
conSet2
@ 


\subsection{\texttt{cosmo} function}

The \texttt{cosmo()} function carries out the supervised motif
detection algorithm described above.  
<<cosmoArgs, echo=TRUE>>=
args(cosmo)
@ 

\noindent\textbf{INPUT.}
\begin{enumerate}
\item
 A reference to the set of sequence to be analyzed,
 \texttt{seqs}. This may be a list with each element representing a sequence
 in the form of a single string such as "ACGTAGCTAG" ("seq" entry)
 and a description ("desc" entry), the path of a file that contains the
 sequences in FASTA format, or the string ``browse'', in which case
 the user is prompted to browse for a FASTA file containing the input
 sequences. 
 
\item 
 A reference to the constraint sets, \texttt{constraints}. This may be
 a \texttt{constraintSet} object,  a list of such objects, the name of a file 
 containing the constraint definitions in the format used by
 \texttt{cosmoweb}, the string ``None'' for no constraints. If the
 \texttt{cosmoGUI} package has been installed, constraint sets may
 also be defined through an interactive Tcl/Tk-based GUI by specifying
 \texttt{constraints=''GUI''} (see figure \ref{fig.gui}). 

 \item
 The minimum and maximum motif widths to search through,
 \texttt{minW} and \texttt{maxW}.
 \item
   A character vector giving the model types to be considered as
   candidates for the sequences at hand, \texttt{models}. The possible
   candidates are ``OOPS'', ``ZOOPS'', and ``TCM''.

 \item
 A logical indicator for whether motifs are allowed to occur in the
 reverse complement orientation, \texttt{revComp}.

 \item
 The minimum and maximum number of motif occurrences in the entire set
 of sequences, \texttt{minSites} and \texttt{maxSites}.

 \item 
 The number of starting values to use for the constrained optimization
 of the likelihood function, \texttt{starts}. In many cases, increasing
 the number of starting values can help improve the performance of the
 algorithm, whereas decreaseing the number of starting values will
 reduce the computing time.

 \item
 A number of more advanced parameters, pertaining mostly to the model
 selection procedure. The default values will be perfectly sufficient
 for the vast majority of users, with the available options mostly
 given for testing and simulation purposes.

\end{enumerate}



\begin{figure}
  \parbox{175.mm}{
    \centering
    \includegraphics{gui}
  }
\caption{GUI for constructing constraint sets.}
\label{fig.gui}
\end{figure}

\noindent\textbf{OUTPUT.}\\

\noindent The S4 class/method object-oriented programming approach was adopted
to summarize the results of the motif search. Specifically, the output
is an instance of the class \texttt{cosmo}. A brief description of the
class is given next. Please consult the documentation for details,
e.g., using \texttt{class ? cosmo} and \texttt{methods ? cosmo}.

<<cosmoClass, echo=TRUE>>=
slotNames("cosmo")
@ 

\begin{enumerate}
\item
  A list \texttt{seqs} with each element representing one sequence of
  the input dataset in the form of a single string such as
  "ACGTAGCTAG" ("seq" entry) and a description ("desc" entry).
\item
  The estimated position weight matrix, \texttt{pwm}. This is an instance
  of the class \texttt{pwm}, containing additionally the information
  content profile of the position weight matrix and the corresponding
  consensus sequence. Invoking the \texttt{plot()} method on an object
  of class \texttt{pwm} produces a plot of the sequence logo of the
  position weight matrix \citep{Schneider}

\item
  A summary of the model selection process for the order of the
  background Markov model, \texttt{back}. This is a data.frame
  that gives for each candidate order the cross-validated
  Kullback-Leibler divergence.

\item
  The estimated transition matrices for the background Markov model,
  \texttt{tmat}. 

\item
  A summary of the model selection process for selecting the motif
  width, model type, and constraint set, \texttt{cand}. This is a
  data.frame that gives for each candidate model considered the values
  of the relevant model selection criteria.
  
\item
  The selected constraint set, \texttt{cons}. This is an instance of
  the class \texttt{constraintSet}.

\item
  A description of the selected model, \texttt{sel}. This is a
  data.frame that summarized the selections made for the constraint set,
  the model type, the motif width, and the order of the background
  Markov model.


\item
  A summary of the predicted motif occurrences, \texttt{motifs}. This is
  an instance of the class \texttt{align} that gives, for each predicted
  motif occurrence, the sequence name, the position on the sequence, the
  orientation of the motif, the motif itself, and the posterior
  probability of this site being a motif occurrence.
  
\item
  A list \texttt{probs} with each entry giving the posterior
  probabilities of motif occurrences along a given sequence. If the
  motif is more likely to occur in the reverse complement orientation,
  this posterior probability appears with a negative sign. 

\end{enumerate}


\noindent\textbf{EXAMPLE.}\\


\noindent The \texttt{cosmo} package includes the example FASTA file 
\texttt{seq.fasta}. It contains 20 sequences that were simulated as
above according to the OOPS model to each contain one occurrence of
the motif whose sequence logo is given in figure \ref{fig.logo1}.
We can search these sequences for a shared motif, considering as
candidate constraint sets the two constraint sets constructed in
section \ref{sec.conset}, as candidate model types OOPS and TCM, and
as candidate motif widths 7 through 8:

<<example1, echo=TRUE, results=hide>>=
seqFile <- system.file("Exfiles/seq.fasta", package="cosmo")
res <- cosmo(seqs=seqFile, constraints=list(conSet1, conSet2) , minW=7, maxW=8, models=c("OOPS", "TCM")) 
@ 
The \texttt{print()} method outputs the estimated position weight matrix:
<<printPwm, echo=TRUE>>=
print(res)
@ 
A more detailed summary of the results is obtained through the
\texttt{summary()} method: 
<<summary, echo=TRUE>>=
summary(res)
@ 
The \texttt{cand} slot of the \texttt{cosmo} object consists 
of a data frame that summarizes the model selection process:
<<candModels, echo=TRUE>>=
res@cand
@ 
Note that the E-value criterion was evaluated for all candidate models
to select the optimal motif width for each given combination of model
type and constraint. The model type critertion then needs to be only
evaluated for that each combination of model type, constraint set and
selected motif width. The constraint set criterion, lastly, needs to
be evaluated only for the optimal motif widht and model type for each
candidate constraint set. In this case, \texttt{cosmo} selected a
motif width of 8, the one-occurrence-per-sequence model, and the first
constraint set, choices that agree well with the data-generating
distribution described above.

The alignment of predicted motif occurrences is stored in the
\texttt{motifs} slot of the \texttt{cosmo} output object:
<<motifs, echo=TRUE>>=
summary(res@motifs)
@ 
In this, case twenty motif occurrences were predicted, with an E-value
for the resulting alignment of \Sexpr{sprintf("%5.4e",res@motifs@eval)}. Invoking the  
\texttt{plot()} method on the \texttt{cosmo} object \texttt{res}
produces a sequence logo of the estimated motif (figure \ref{fig.logo2}).
Note that the sequence logo of the estimated position weight matrix
agrees well with the sequence logo of the true position weight matrix 
shown in figure \ref{fig.logo1}.  
\begin{figure}
\begin{center}
\caption{Sequence logo. \label{fig.logo2}}
<<seqLogo2, fig=TRUE, echo=FALSE, height=4,width=6>>=
plot(res)
@ 
\end{center}
\end{figure}
A plot of the posterior probabilities along each sequence is obtained by invoking the
\texttt{plot()} method on the \texttt{cosmo} object \texttt{res} with
the argument \texttt{type="prob"}(figure \ref{post.prob}). 
\begin{figure}
\begin{center}
\caption{Posterior probability plot. If the motif is more likely to occur in the
    forward strand orientation, the bar extends upward from the
    horizontal, otherwise it extends downward. \label{post.prob}}
<<postprob, fig=TRUE, echo=FALSE, height=8,width=6>>=
plot(res, type="prob")
@ 
\end{center}
\end{figure}

\subsection{External estimates of the background Markov model}
By default, the Markov model for the distribution of background
nucleotides is estimated from the input sequences supplied by the
user. It is also possible, however, estimate this model from a
separate, usualy larger set of sequences. The user may, for example,
want to use the set of intergenic sequences of the organism of
interest for this purpose. Estimating the background model from a
larger data set allows for more precise estimation and may thus
increase the performance of the algorithm in finding shared motifs in
the input sequences.

A separate set of sequences for the estimation of the background model
may be passed to \texttt{cosmo()} through the \texttt(backSeqs)
argument. As with the \texttt{seqs} argument, this may be either the
string \texttt{browse}, in which a GUI allows the user to browse the
file system for a FASTA file containing the sequences to be used,
another string pointing to this FASTA file, or a list with each
element representing a sequence in the form of a single string such as
"ACGTAGCTAG" ("seq" entry) and a description ("desc" entry).

If the background data set is large, one may wish to estimate the
background Markov model in a preliminary step and then pass the
obtained estimates to all following calls to \texttt{cosmo()}. The
function \texttt{bgModel()} can be used for this purpose:
<<bgModelArgs, echo=TRUE>>=
args(bgModel)
@ 
Its main argument consists of the sequences from which the background
model is to be estimated. If a Markov model of a specific order is
desired, this order may be specified through the \texttt{order}
argument. If \texttt{order==NULL}, the appropriate order is chosen
data-adaptively through likelihood-based cross-validation. This
approach will larger orders, with corresponding models that become
more difficult to estimate, as the amount of available data
increases. The \texttt{maxOrder} argument gives the largest candidate
order that is to be considered. To obtain an estimate of the
background Markov model from a set of example sequences contained in
the file \texttt{bgSeqs}, we might use the call
<<bgModelEx, echo=TRUE>>=
bgFile <- system.file("Exfiles", "bgSeqs", package="cosmo")
tm <- bgModel(bgFile)
@ 
The output produced by \texttt{bgModel()} is a list containing the
selected order, a summary of the Kullback-Leibler divergences for the
different candidate orders, as well as the estimated transition
matrices:   

<<bgModelOutput, echo=TRUE>>=
tm
@ 
 
The \texttt{transMat} element of this list contains one estimated
transition matrix for each order between zero and the selected
order. The entry in cell $(i,j)$ of a given transition matrix gives
the estimated probability of observing nucleotide $j$ in a given
position after having observed the tuple $i$ in the previous $k$
positions. The Kullback-Leibler divergences summarized in the
\texttt{klDivs} element of the output give the estimated risk
for each candidate order corresponding to the minus log loss
function; likelihood-based cross-validation selects the order with the
minimal Kullback-Leibler divergence. The estimated transition matrix
may be passed to \texttt{cosmo()} through the \texttt{transMat} argument:
<<tmatEx, echo=TRUE, results=hide>>=
res <- cosmo(seqs=seqFile, constraints="None" , minW=8, maxW=8, models="OOPS", transMat=tm$transMat)
@ 
<<tmatExOut, echo=TRUE>>=
res 
@ 
\texttt{MEME} allows the user to specify the background Markov model
in a slightly different format. The files it accepts for specifyint a
1st-order Markov model, for example, are of the form
<<bfileEx, echo=FALSE>>=
bfile <- system.file("Exfiles", "bfile", package="cosmo")
s1 <- scan(file = bfile, what = "", sep = "\n", quote = "", 
           allowEscapes = FALSE, quiet = TRUE)
cat(paste(s1,collapse="\n"))
@ 
Such files contain estimates of all relevant tuple frequencies. Note
that these frequencies are different from the conditional
probabilities given in a transition matrix: The tuple frequencies give
an estimate of the probability of observing a given tuple, while the
frequencies contained in a transition matrix give estimates of the
probality of observing a given nucleotide given the previous $k$
nucleotides. Thus, the entries in each row of a transition matrix must
sum to 1.0, not the entries in an entire matrix, as is the case with a
\texttt{MEME}-style tupe frequency matrix. A \texttt{MEME}-style
background file may be passed to \texttt{cosmo()} through the
\texttt{bfile} argument. Alternatively, a \texttt{MEME}-style
background file may be converted into a transition matrix by using the
function \texttt{bfile2tmat()}:
<<bfile2tmat, echo=TRUE>>=
tmat <- bfile2tmat(bfile)
tmat

@ 
\subsection{Software Design}
 
The following features of the programming approach employed in
\texttt{cosmo} may be of interest to users. \\


{\bf Class/method object-oriented programming}. Like many other
Bioconductor packages, \texttt{cosmo} has adopted the \textit{S4
class/method objected-oriented programming approach} presented in
\cite{Chambers}. In particular, a new class, \texttt{cosmo}, is
defined to represent the results obtained by the constrained motif
search algorithm. As discussed to some extent above, several methods
are provided to operate on this class. \\

{\bf Calls to C}. 
The R package was derived from an earlier stand-alone application that
was written entirely in C. This design was necessary to ensure that
the computationally intensive constrained optimization algorithm does
not take too much time. The constrained optimiziation itself is
carried out using the \texttt{donlp2()} function by Peter Spellucci,
available at \url{http://plato.asu.edu/ftp/donlp2/donlp2_intv_dyn.tar.gz}.


\subsection{License}

The \texttt{cosmo} package incorporates two sources of foreign code
whose free use has been restricted to research purposes. Commercial
purposes require permission and licensing by the owners of the
copyright to that code. This is true for the \texttt{donlp2()}
function that is used here to perform the constrained optimization of
the likelihood function as well as of code that is used by \texttt{cosmo}
to compute the E-value criterion. The copyright to \texttt{donlp2()}
is owned by its author, Peter Spellucci; the copyright to the second
piece of code, written by Timoty Bailey, is owned by the Regents of
the University of California.  For these reasons, the \texttt{cosmo}
package must likewise be restricted to research purposes, with
commercial uses requiring permission by Oliver Bembom, Peter
Spellucci, as well as the Regents of the University of California.

The license under with \texttt{donlp2()} is distributed furthermore
requires that its use must be acknowledged in any publication
containing results obtained with \texttt{donlp2()} or parts of it. The
same is hence true for publications containing results obtained with
\texttt{cosmo}. Citation of the author's name and netlib-source is
suitable for this purpose.

\section{Discussion}

The Bioconductor package \texttt{cosmo} implements a constrained motif
detection algorithm that includes as an important special case the
popular motif detection tool \texttt{MEME}, but also allows the user
to enhance the performance of the algorithm by specifying constraints
on the position weight matrix to the be estimated. 

We note that the same algorithm has also been implemented in the form
of a web application, accessible at
\url{http://cosmoweb.berkeley.edu}, that allows users to submit jobs
to designated web servers, with results posted in HTML as well as XML
format on a temporary web page. In this case, constraint definitions
are supplied in a text file according to a straightforward
standard. In particular, the R function \texttt{writeConFile()} in the
\texttt{cosmo} package can be used to convert a \texttt{constraintSet}
or \texttt{constraintGroup} object into such a text file.
Lastly, we have also posted the source code of the original
stand-alone C implementation of the algorithm on this web site.

\begin{thebibliography}{28}
\expandafter\ifx\csname natexlab\endcsname\relax\def\natexlab#1{#1}\fi
\expandafter\ifx\csname url\endcsname\relax
  \def\url#1{{\tt #1}}\fi

\bibitem[Akaike(1973)]{Akaike}
H.~Akaike.
\newblock {\em Information theory and an extension of the maximum likelihood
  principle}.
\newblock Academiai Kiado, 1973.

\bibitem[Bailey and Elkan(1995)]{Bailey95a}
T.L. Bailey and C.P. Elkan.
\newblock Unsupervised learning of multiple motifs in biolpolymers using
  expectation maximization.
\newblock {\em Machine Learning}, pages 51--80, 1995.

\bibitem[Bembom et~al.(2007)Bembom, Kele{\c s}, and van~der Laan]{Bembom}
O.~Bembom, S.~Kele{\c s}, and M.J. van~der Laan.
\newblock Supervised detection of conserved motifs in {DNA} sequences with
  \texttt{cosmo}.
\newblock Statistical Applications in Genetics and Molecular Biology: 
Vol. 6  : Iss. 1, Article 8.
\newblock URL \url{http://www.bepress.com/sagmb/vol6/iss1/art8}.

\bibitem[Bussemaker et~al.(2001)Bussemaker, Li, and Siggia]{Bussemaker}
H.J. Bussemaker, H.~Li, and E.D. Siggia.
\newblock Regulatory element detection using correlation with expression.
\newblock {\em Nature Genetics}, 27:\penalty0 167--171, 2001.

\bibitem[Chambers (1998)]{Chambers}
J.M. Chambers
\newblock {\em Programming with Data: A Guide to the S Language}.
\newblock Springer Verlag, New York, 1998.


\bibitem[Davidson(2001)]{Davidson}
E.~Davidson.
\newblock {\em Genomic Regulatory Systems. Development and Evolution}.
\newblock Academic Press, San Diego, 2001.

\bibitem[Eisen(2005)]{Eisen05}
M.B. Eisen.
\newblock All motifs are not created equal: structural properties of
  transcription factor - {DNA} interactions and the inference of sequences
  specificity.
\newblock {\em Genome Biology}, 6:\penalty0 P7, 2005.

\bibitem[Eisen et~al.(1998)Eisen, Spellman, Brown, and Botstein]{Eisen98}
M.B. Eisen, P.T. Spellman, P.O. Brown, and D.~Botstein.
\newblock Cluster analysis and display of genome-wide expression patterns.
\newblock {\em Proceedings of the National Academy of Science}, 95:\penalty0
  14863--14868, 1998.

\bibitem[Fickett and Wasserman(2000)]{Fickett}
J.W. Fickett and W.W. Wasserman.
\newblock Discovery and modeling of transcriptional regulatory regions.
\newblock {\em Current Opinions in Biotechnology}, 11:\penalty0 19--24, 2000.

\bibitem[Lawrence and Reilly(1990)]{Lawrence}
C.~Lawrence and A.~Reilly.
\newblock An expectation maximization ({EM}) algorithm for the identification
  and characterization of common sites in unaligned biopolymer sequences.
\newblock {\em Proteins: Structure, Function and Genetics}, 7:\penalty0 41--51,
  1990.

\bibitem[Lawrence et~al.(1993)Lawrence, Altschul, Boguski, Liu, Neuwald, and
  Wootton]{LawrenceEtAl}
C.E. Lawrence, S.F. Altschul, M.S. Boguski, J.S. Liu, A.F. Neuwald, and J.C.
  Wootton.
\newblock Detecting subtle sequence signals: a {Gibbs} sampling strategy for
  multiple alignment.
\newblock {\em Science}, 262:\penalty0 208--214, 1993.


\bibitem[Luscombe et~al.(2000)Luscombe, Austin, Berman, and Thornton]{Luscombe}
N.M. Luscombe, S.E. Austin, H.M. Berman, and J.M. Thornton.
\newblock {An overview of the structures of protein-DNA complexes}.
\newblock {\em Genome Biology}, 1:\penalty0 1--37, 2000.

\bibitem[Neuwald et~al.(1995)Neuwald, Liu, and Lawrence]{Neuwald}
A.F. Neuwald, J.S. Liu, and C.E. Lawrence.
\newblock Gibbs motif sampling: detection of bacterial outer membrane repeats.
\newblock {\em Protein Science}, 4:\penalty0 1618--1632, 1995.

\bibitem[Powell(2000)]{Powell}
J.~Powell.
\newblock {SAGE. The serial analysis of gene expression}.
\newblock {\em Methods of Molecular Biology}, 99:\penalty0 297--319, 2000.

\bibitem[Roth et~al.(1998)Roth, Hughes, Estep, and Church]{Roth}
F.P. Roth, J.D. Hughes, P.W. Estep, and G.M. Church.
\newblock {Finding DNA regulatory motifs within unaligned noncoding sequences
  clustered by whole-genome mRNA quantitation}.
\newblock {\em Nature Biotechnology}, 16:\penalty0 939--945, 1998.

\bibitem[Sandelin and Wassermann(2004)]{Sandelin}
A.~Sandelin and W.W. Wassermann.
\newblock Constrained binding site diversity within families of transcription
  factors enhances pattern discovery bioinformatics.
\newblock {\em Journal of Molecular Biology}, 338:\penalty0 207--215, 2004.

\bibitem[Schneider et~al.(1986)Schneider, Stormo, Gold, and
  Ehrenfeucht]{Schneider}
T.~D. Schneider, G.~D. Stormo, L.~Gold, and A.~Ehrenfeucht.
\newblock Information content of binding sites on nucleotide sequences.
\newblock {\em Journal of Molecular Biology}, 188:\penalty0 415--431, 1986.

\bibitem[Schneider and Stephens(1990)Schneider, Stormo, Gold, and
  Ehrenfeucht]{SeqLogos}
T.~D. Schneider, and R.~R. Stephens.
\newblock Sequence Logos: A New Way to Display Consensus Sequences
\newblock {\em Nucleic Acid Research}, 18:\penalty0 6097--6100, 1990.

\bibitem[Schwarz(1978)]{Schwarz}
G.~Schwarz.
\newblock Estimating the dimension of a model.
\newblock {\em Annals of Statistics}, 6:\penalty0 461--464, 1978.


\bibitem[Thompson et~al.(2003)Thompson, Rouchka, and Lawrence]{Thompson}
W.~Thompson, E.C. Rouchka, and C.E. Lawrence.
\newblock Gibbs recursive sampler: finding transcription factor binding sites.
\newblock {\em Nucleic Acid Research}, 31:\penalty0 3580--3585, 2003.

\bibitem[van~der Laan et~al.(2003)van~der Laan, Dudoit, and Kele{\c s}]{vdLCV}
M.J. van~der Laan, S.~Dudoit, and S.~Kele{\c s}.
\newblock Asymptotics optimality of likelihood based cross-validation.
\newblock Technical Report 125, Division of Biostatistics, University of
  California, Berkeley, 2003.
\newblock URL \url{www.bepress.com/ucbbiostat/paper125}.

\bibitem[Workman and Stormo(2000)]{Workman}
C.T. Workman and G.D. Stormo.
\newblock {ANN-Spec: a method for discovering transcription factor binding
  sites with improved specificity}.
\newblock In {\em Proceedings of the Pacific Symposium on Biocomputation},
  pages 467--478, 2000.

\end{thebibliography}


\end{document}
 
 
 
 
