From ea0882f4b176caa065b3ab4eee7ae62afb35d85a Mon Sep 17 00:00:00 2001
From: Sebastian Will <swill@csail.mit.edu>
Date: Mon, 13 Sep 2021 09:57:18 +0200
Subject: [PATCH] Add tutorial on negative design (still draft)

---
 biblio.bib               |   8 ++
 infrared-bookchapter.tex | 199 +++++++++++++++++++++++++++++++++------
 2 files changed, 178 insertions(+), 29 deletions(-)

diff --git a/biblio.bib b/biblio.bib
index 95e9b66..b675ca1 100644
--- a/biblio.bib
+++ b/biblio.bib
@@ -1,5 +1,13 @@
 % Encoding: UTF-8
 
+@InProceedings{Yao2021,
+	author = {Yao, Hua-Ting and Waldisp{\ifmmode\ddot{u}\else\"{u}\fi}hl, J{\ifmmode\acute{e}\else\'{e}\fi}r{\ifmmode\hat{o}\else\^{o}\fi}me and Ponty, Yann and Will, Sebastian},
+	title = {{Taming Disruptive Base Pairs to Reconcile Positive and Negative Structural Design of RNA}},
+	year = {2021},
+	month = {Apr},
+        booktitle = {Research in Computational Molecular Biology - 25nd Annual International Conference, {RECOMB} 2021},
+}
+
 @article{Andronescu2007,
 	author = {Andronescu, Mirela and Condon, Anne and Hoos, Holger H. and Mathews, David H. and Murphy, Kevin P.},
 	title = {{Efficient parameter estimation for RNA secondary structure prediction}},
diff --git a/infrared-bookchapter.tex b/infrared-bookchapter.tex
index f1c1ffa..b49735e 100644
--- a/infrared-bookchapter.tex
+++ b/infrared-bookchapter.tex
@@ -87,14 +87,12 @@ fontsize=\footnotesize
 % the running heads and the author index.
 %
 
-\newcommand{\etal}{ \emph{et al}~}
-\newcommand{\aka}{{a.k.a.}\xspace}
-\newcommand{\ie}{{i.e.}\xspace}
-\newcommand{\cf}{{c.f.}\xspace}
-\newcommand{\eg}{{e.g.}\xspace}
-\newcommand{\wrt}{w.r.t.\xspace}
-\newcommand{\ST}{\text{ such that }}
-\newcommand{\st}{\text{ s.t. }}
+%\newcommand{\etal}{ \emph{et al}~}
+%\newcommand{\aka}{{a.k.a.}\xspace}
+%\newcommand{\ie}{{i.e.}\xspace}
+%\newcommand{\cf}{{c.f.}\xspace}
+%\newcommand{\eg}{{e.g.}\xspace}
+%\newcommand{\wrt}{w.r.t.\xspace}
 
 \newcommand{\Base}[1]{\text{{\sffamily#1}}}
 \newcommand{\Ab}{\Base{A}\xspace}
@@ -158,7 +156,7 @@ Here we present the framework \Infrared, which addresses the multiple demands of
     \item Real-world RNA design applications typically demand for targeting thermodynamic criteria referring to multiple target structures (e.g. on and off-states of riboswitches, binding pockets of aptamers and competing structures in specific energetic relations), potentially under side constraints like moderate \GC-content or avoidance of specific sequence motifs. Thus, we design Infrared as a library that supports programmers to develop complex and potentially novel design strategies based on a declarative constraint framework. This allows application programmers to harness the power of fixed-parameter tractable sampling of designs in an easy to use system.
     
     In this chapter, we guide the reader through the use of this library and show by examples how to develop and run design programs from IPython notebooks. Further examples, covering as well non-design applications, are provided as part of the documentation of \Infrared.  
-    In addition it should be mentioned that we published complete design tools based on the library for the specific applications of the generation of multi-target designs (\RNARedPrint v2, as reimplementation of \RNARedPrint~\cite{hammer2019fixed}) and negative RNA design (\RNAPOND), which we hope are both directly useful to some readers. These tools can as well serve as further examples for the use of \Infrared in command line tools.
+    In addition it should be mentioned that we published complete design tools based on the library for the specific applications of the generation of multi-target designs (\RNARedPrint v2, as reimplementation of \RNARedPrint~\cite{hammer2019fixed}) and negative RNA design (\RNAPOND~\cite{Yao2021}), which we hope are both directly useful to some readers. These tools can as well serve as further examples for the use of \Infrared in command line tools.
 \end{itemize}
 
 \begin{figure}
@@ -510,20 +508,119 @@ show: constraint/function network, tree decomposition, explain why complexity de
     \label{fig:dependency-graph}
 \end{figure}
 
+\subsection{Negative design by direct sampling}
 
-\subsubsection{Using disruptive constraints for negative design}
+Good RNA designs typically must satisfy certain constraints and show high
+(or specific) affinity to the target structures, but as well must avoid
+high affinity to all non-target structures. Objectives of the latter type are called
+negative design criteria (whereas the former are called positive design
+criteria). Intuitively, design towards negative criteria seems harder since
+it requires to avoid affinity to exponentially many structures. (Indeed, as
+  we discussed before, negative design was shown to be NP hard in relevant
+settings.) 
 
-negative design due to disruptive constraints in RNAPOND, which makes use of the powerful FPT solving mechanism
+A classic (single-target) negative design objective, requires the target structure to have
+minimum free energy among all structures of the designed RNA; it can be tested using the Vienna RNA package:
+\begin{Pythoncode}
+def is_mfe_design(sequence, target):
+    fc = RNA.fold_compound(sequence)
+    return fc.eval_structure(target) == fc.mfe()[1]
+\end{Pythoncode}
 
-\TODO{try to code this idea in as simple as possible variant; then see whether we can include code or just discuss the general idea, referring to the RNAPOND paper}
+In many cases, sampling (targeting only positive design criteria) can be sufficient to satisfy negative design criteria. For example, we can easily find designs for any of our three target structures by straightforward ``generate-and-test'':
+\begin{Pythoncode}
+sampler = Sampler(single_target_design_model(target))
+sampler.set_target(0.7 * n, 0.1 * n, 'gc')
+for i in range(50):
+    seq = ass_to_seq(sampler.sample())
+    if is_mfe_design(seq,target):
+        print(f"{i} {seq}")
+\end{Pythoncode}
+where \texttt{design\_model(target)} returns a model for the given target structure with 'gc' feature and a bias to good base pair energy as we have developed it before (see Notes).
+Based on this code, we easily find 10 to 20 solutions from generating 50 (biased) samples for each of our example target structures (in less than 0.1 seconds on a current notebook).
 
-\subsection{Negative design by local search, resampling!?}
+A similar approach can still be sufficient to optimize other negative criteria for single-target design like the frequency of the target structure (see Notes for function \texttt{target\_frequency}).
+The following code regularly finds designs with >80\% target frequency for our example targets in comparable run-time (roughly, twice as long).
+\begin{Pythoncode}
+sampler = Sampler(single_target_design_model(target))
+sampler.set_target(0.7 * n, 0.1 * n, 'gc')
+best = 0
+for i in range(100):
+    seq = ass_to_seq(sampler.sample())
+    freq = target_frequency(seq, target)
+    if freq > best:
+        best = freq
+        print(f"{i} {seq} {freq:.6f}")
+\end{Pythoncode}
+Note that in both cases, we control the \GC content of our designs. This is just one example how the core idea of finding designs by sampling can be extended by further targets or constraints. (Adding prior knowledge as IUPAC string, would be another.) Generally, this approach will be most successful for problems with a medium-sized solution space. This would also apply to many design problems, where only parts of the RNA should be redesigned, while others are kept constant. 
+
+\subsection{Larger single-target designs by constraint-generation}
+For larger and harder design instances, we can apply a constraint-generation strategy following \RNAPOND~\cite{Yao2001}.
+\DOTS
 
-show local search for single target design / ensemble-defect, hint at added complexity in the presence of multiple targets\dots (note: e.g. RNADesign doesn't care).
-provide local resampling as an approach that makes reasonable use of the Infrared engine
-show: optimization of multi-defect
 
-\TODO{Can we design for multiple-targets based on local moves by simple point mutations? What is Steff's RNADesign.pl doing?}
+\begin{Pythoncode}
+def cg_design_iteration(dbps):
+    model = single_target_design_model(target)
+    model.add_constraints(NotBPComp(i, j) for (i, j) in dbps)
+    sampler = Sampler(model, lazy=True)
+    sampler.set_target(0.7 * n, 0.1 * n, 'gc' )
+    if sampler.treewidth() > 10 or not sampler.is_consistent():
+        return dbps[:-1], None
+    ctr = Counter()
+    for i in range(steps):
+        seq = ass_to_seq(sampler.targeted_sample())
+        fc = RNA.fold_compound(seq)
+        mfe, mfe_e = fc.mfe()
+        if fc.eval_structure(target) == mfe_e:
+            return dbps, seq
+        ctr.update(parse(mfe))
+    ndbps = [x[0] for x in ctr.most_common() if x[0] not in bps]
+    return dbps + ndbps[:3], None
+dbps, seq = [], None
+while seq is None: dbps, seq = cg_design_iteration(dbps)
+print(seq)
+\end{Pythoncode}
+
+\subsection{Negative design by stochastic optimization search with partial resampling}
+
+Based on \Infrared we can moreover easily implement stochastic optimization strategies to optimize negative objectives for single or multi-target design. We show how \Infrared supports such strategies based on resampling of connected components of the model's network.
+
+For this purpose, we define a method \texttt{multi\_design\_model} that returns a multi-target design network model for resampling. This model looks essentially like the one presented before, but fixes 
+all nucleotides outside of a specified \texttt{subset} to a provided \texttt{solution} (later, the last solution in stochastic search). This is accomplished by the code snippet
+
+\begin{Pythoncode}
+    for i in range(n):
+        if i not in subset:
+            value = solution.values()[i]
+            model.restrict_domains(i,(value,value))
+\end{Pythoncode}
+In the method (full code given in Notes), we moreover avoid to introduce constraints on or between variables outside of the subset.
+
+We can now implement a Metropolis-Hastings Monte Carlo optimization algorithm as follows
+
+\begin{Pythoncode}
+from random import random
+from math import exp
+def optimize(create_model, objective, steps, temp):
+    model = create_model(None, None)
+    cur = Sampler(model).sample()
+    curval = objective(ass_to_seq(cur))
+    ccs = model.connected_components()
+    weights = [1/len(cc) for cc in ccs]
+    bestval = math.inf
+    for i in range(steps):
+        cc = random.choices(ccs,weights)[0]
+        new = Sampler(create_model(cc, cur)).sample()
+        newval = objective(ass_to_seq(new))
+        if (newval <= curval
+            or random() <= exp(-(newval - curval ) / temp)):
+            cur, curval = new, newval
+            if curval < bestval:
+                best,bestval = cur, curval
+    return (ass_to_seq(best), bestval)
+\end{Pythoncode}
+
 
 \subsection{Further application-specific design goals}
 
@@ -532,30 +629,74 @@ discuss examples of further design goals, e.g. forbidding, enforcing motifs. Sho
 specific negative design objectives, e.g. energy differences between target structures. Example of riboswitch design (artificial, Moerl-like)
 
 \section{Notes}
-\TODO{What should go into Notes? Any technical details that we need to explain more formally?  How to refer to Notes \dots? Some code that we don't want to put into main text could go here as well.}
 
-\subsection{Ensemble defect}
-Ensemble defect is a common negative design objective in single target RNA design. We show code to compute a variant of ensemble defect using the Vienna RNA library.
+\subsection{A model for single target design}
 \begin{Pythoncode}
-def ensemble_defect(sequence, structure):
-    pass
+def single_target_design_model(target):
+    n, bps = len(target), parse(target)
+    model = Model(n, 4)
+    model.add_constraints(BPComp(i, j) for (i, j) in bps)
+    model.add_functions([GCCont(i) for i in range(n)], 'gc')
+    model.add_functions([BPEnergy(i, j, (i-1, j+1) not in bps)
+        for (i,j) in bps], 'energy')
+    model.set_feature_weight(-1.5, 'energy')
+    return model
+\end{Pythoncode}
+
+\subsection{Target frequency in ensemble}
+The frequency of the target structure in the ensemble of the designed sequence is a good example of a negative design criterion.
+\begin{Pythoncode}
+def target_frequency(sequence, target):
+    fc = RNA.fold_compound(sequence)
+    fc.pf()
+    return fc.pr_structure(target)
 \end{Pythoncode}
 
 \subsection{Multi-Defect}
 The multi-defect can be computed with the help of the Vienna RNA library.
 \begin{Pythoncode}
-def multi_defect(sequence, structures, xi=1):
-    n = len(structures)
+def multi_defect(sequence, targets, xi=1):
+    k = len(targets)
     fc = RNA.fold_compound(sequence)
     ee = fc.pf()[1]        
-    eos = [fc.eval_structure(structure)
-        for structure in structures]
-    diff_ee = sum(1/n * eos[i] - ee for i in range(n))
-    diff_targets = sum(2/(n*(n-1)) * abs(eos[i]-eos[j])
-        for i in range(n) for j in range(n) if i<j)
+    eos = [fc.eval_structure(target) for target in targets]
+    diff_ee = sum(1/k * eos[i] - ee for i in range(k))
+    diff_targets = sum(2/(k*(k-1)) * abs(eos[i]-eos[j])
+        for i in range(k) for j in range(k) if i<j)
     return diff_ee + xi * diff_targets
 \end{Pythoncode}
 
+\subsection{Multi-design model with support of partial resampling}
+
+\begin{Pythoncode}
+def multi_design_model(targets,subset=None,solution=None):
+    n = len(targets[0])
+    model = Model(n, 4)
+    if subset is None: subset = set(range(n))
+
+    # fix the entries outside of subset from solution
+    for i in range(n):
+        if i not in subset:
+            value = solution.values()[i]
+            model.restrict_domains(i,(value,value))
+
+    model.add_functions([GCCont(i) for i in range(n) if i in subset], 'gc')
+
+    for target in targets:
+        bps = parse(target)
+        bps = [(i,j) for (i,j) in bps if i in subset or j in subset]
+        model.add_constraints(BPComp(i, j) for (i, j) in bps)
+        model.add_functions([BPEnergy(i, j, (i-1, j+1) not in bps)
+            for (i,j) in bps], 'energy')
+
+    model.set_feature_weight(-1, 'energy')
+    model.set_feature_weight(-0.3, 'gc')
+
+    return model
+\end{Pythoncode}
+
+
+
 \bibliographystyle{plain}
 \bibliography{biblio}
 
-- 
GitLab