\documentclass[reqno]{amsart}
\usepackage{graphicx}
\usepackage{mathrsfs}
\usepackage{upgreek}
\usepackage{url}
\usepackage{hyperref}

\AtBeginDocument{{\noindent\small
\emph{Electronic Journal of Differential Equations},
Vol. 2008(2008), No. 42, pp. 1--31.\newline
ISSN: 1072-6691. URL: http://ejde.math.txstate.edu or http://ejde.math.unt.edu
\newline ftp ejde.math.txstate.edu  (login: ftp)}
\thanks{\copyright 2008 Texas State University - San Marcos.}
\vspace{9mm}}

\begin{document}
\title[\hfilneg EJDE-2008/42\hfil Sobolev gradients]
{Sobolev gradients for differential algebraic equations}

\author[R. Nittka, M. Sauter\hfil EJDE-2008/42\hfilneg]
{Robin Nittka, Manfred Sauter} 

\address{Robin Nittka\newline
University of Ulm, Institute of Applied Analysis,
D-89069 Ulm, Germany}
\email{robin.nittka@uni-ulm.de}

\address{Manfred Sauter \newline
University of Ulm, Institute of Applied Analysis,
D-89069 Ulm, Germany}
\email{manfred.sauter@uni-ulm.de}

\thanks{Submitted March 4, 2008. Published March 20, 2008.}
\subjclass[2000]{65L80, 41A60, 34A09}
\keywords{Differential algebraic equations; weighted Sobolev gradients; 
\hfill\break\indent
steepest descent; non-linear least squares; consistent initial conditions} 

\begin{abstract}
 Sobolev gradients and weighted Sobolev gradients have been used
 for the solution of a variety of ordinary as well as partial
 differential equations. In the article at hand
 we apply this method to linear and non-linear ordinary
 differential algebraic equations and construct suitable gradients 
 for such problems based on a new generic weighting scheme. 
 We explain how they can be put into practice.
 In the last part, we discuss the performance of our publicly available
 implementation on some differential algebraic equations and present 
 further applications.
\end{abstract}

\maketitle
\numberwithin{equation}{section}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{example}[theorem]{Example}
\newtheorem*{remark}{Remark}
\newtheorem{corollary}[theorem]{Corollary}

\newcommand{\apply}[2]{\langle#1,#2\rangle}
\newcommand{\scalar}[2]{\left(#1\mid #2\right)}

\section{Introduction}\label{sec:intro}

Differential algebraic equations (DAE) have a wide range of applications.
Inter alia they appear in electrical circuit simulation~\cite{CRRM96}, control
theory~\cite{KD99}, and in models of mechanical systems~\cite{Sch99}, to name
only a few prominent examples. Recently, a considerable amount of
research has been put into this field, see~\cite{KM06} and references therein.
The general formulation of a DAE is
\begin{equation}\label{generalDAE}
 f(t, u(t), u'(t)) = 0, \quad t \in (0, T)
\end{equation}
with some function $f\colon \mathbb{R} \times \mathbb{R}^n \times \mathbb{R}^n \to \mathbb{R}^m$
whose partial derivative with respect to the third argument may be singular.
A sufficiently smooth function $u\colon (0, T) \to \mathbb{R}^n$ satisfying~\eqref{generalDAE}
is called \emph{solution of the DAE}.
Though looking similar to ordinary differential equations,
differential algebraic equations show fundamental differences in many aspects.
Even for linear problems the solution space can be infinite dimensional, and
initial conditions do in general not impose uniqueness. Furthermore, initial
conditions might not admit a local solution, and
guessing feasible initial conditions is virtually impossible in many
cases~\cite[Subsection 5.3.4]{BCP89:ivpdae}.

Computing a numerical solution to a given DAE is a very delicate task. The
algebraically coupled equations arising in various engineering fields tend to
be numerically difficult~\cite{Cam95:hidae}. For example thermo-fluid systems are
naturally described by high index DAE~\cite{GA00:tfs}, as are DAE
resulting from batch distillation process modeling~\cite{CVSB03:bdp}. 
Most methods need antecedent transformations reducing the coupling. Those
\emph{index reductions} are complicated and can introduce considerable
numerical error by themselves~\cite[Chapter 6]{KM06}. Additionally, the special
structure of the problem is often lost.

Here we present an alternative way to deal with DAE that
has several significant advantages over the usual approaches. We use a
steepest descent method based on Sobolev gradients to minimize an
error functional in an appropriate function space. This very general method has been
successfully employed to treat Ginzburg-Landau equations for superconductivity,
conservation equations, minimal flow problems and minimal surface problems, among
others~\cite{Neu97}. The theoretical framework of Sobolev steepest descent was first presented
by John W.\ Neuberger~\cite{Neu76}. 
Our method treats the given DAE directly, without differentiation or
other prior transformations. Furthermore, it is not necessary to impose
the initial values, which is a great advantage over the step-by-step
methods that are usually employed. But in case one wants to impose
supplementary conditions, this is possible with little additional theoretical
effort. For example, it is possible to solve initial value or boundary value problems. The
only other software for solving differential algebraic boundary value problems we know of is
Ascher's and Spiteri's \texttt{COLDAE}~\cite{AS94}.

We propose steepest descent methods using weighted Sobolev
gradients for the numerical treatment of DAE.
In section~\ref{theory} we provide the underlying theory.
In section~\ref{closedgraph} we take an operator
theoretical point of view to introduce a space which seemingly has not been
considered before in this generality within the literature about
Sobolev steepest descent.
We prove that, in a sense motivated by section~\ref{sobolev_gradient},
this space which is related
the problem itself has advantages over the usual Sobolev spaces.
We continue this idea in section~\ref{fredholm} where we explain that
it is superior also in some other sense involving the Fredholm property.
In section~\ref{nonlinear} we show how various Sobolev gradients can be applied
to fully non-linear DAE, following the usual ideas as well as generalizing
the concept of section~\ref{closedgraph}.
In section~\ref{numerics} we discuss the
discretization techniques used for the numerics, also covering non-linear
problems and supplementary conditions.
Section~\ref{examples} contains details of our publicly available
implementation~\cite{loopback} and shows, via tables and plots,
how our program behaves on some intricate examples.
Finally, section~\ref{conclusion} summarizes our results.

%\input{parts/theory}

\section{Sobolev Steepest Descent}\label{theory}

In section~\ref{sobolev_gradient} we list some basic facts about the theory of Sobolev gradients.
For details we refer to John W.\ Neuberger's monograph~\cite{Neu97}.
In section~\ref{DAEtheory} we focus on the basic case of linear DAE with constant
coefficients. The general form of this equation is
\begin{equation}\label{linearDAE}
 M_1 u'(t) + M_2 u(t) = b(t), \quad t \in (0, T),
\end{equation}
where $M_1, M_2 \in\mathbb{R}^{m\times n}$ are constant matrices.
The function $b\in L^2(0,T;\mathbb{R}^m)$ is called the \emph{inhomogeneity} or \emph{right hand side}.
We look for weak solutions in $L^2(0,T;\mathbb{R}^n)$.

%\input{parts/theory.general}

\subsection{General Setting}\label{sobolev_gradient}

Let $V$ and $H$ be Hilbert spaces, $A \in \mathscr{L}(V,H)$, and $b \in H$.
Usually, $A$ is a differential operator and $V$ an appropriate Sobolev
space. We are looking for solutions $u \in V$ of the equation $Au = b$.

The (continuous) Sobolev gradient approach to this problem is the following.
Define the quadratic functional
\[
 \uppsi\colon V \to \mathbb{R}_+, \; u \mapsto {\textstyle \frac{1}{2}} 
 \left\| Au - b \right\|_H^2
\]
and try to find a zero (or at least a minimizer)
by steepest descent, i.\,e., by solving the
Hilbert space valued ordinary differential equation
\begin{equation}\label{eq:SG}
 \dot\varphi(t) = -\nabla \uppsi\left( \varphi(t) \right), \quad
 \varphi(0) = u_0
\end{equation}
for an arbitrary initial estimate $u_0 \in V$ and letting $t \to \infty$.
Here $\nabla\uppsi(u)$ denotes the unique representation of the Fr\'{e}chet
derivative $\uppsi'(u) \in V'$ as a vector in $V$ whose existence is guaranteed
by the Riesz-Fr\'{e}chet representation theorem.
The derivative of $\uppsi$ is
\begin{equation}\label{eq:gradient}
 \apply{\uppsi'(u)}{h}
  = \scalar{Au - b}{Ah}_H
  = \scalar{A^\ast Au - A^\ast b}{h}_V,
\end{equation}
hence
\[
 \nabla \uppsi(u) = A^\ast Au - A^\ast b.
\]

In~\cite[Theorems~4--6]{Neu97}, the following facts are proved.
\begin{theorem}\label{gradienttheory}
 If $b \in \mathop{\rm Rg} A$, then $\varphi(t)$ defined in~\eqref{eq:SG}\
 converges to some $\omega \in V$ in the norm of $V$, and $A\omega = b$.
 The vector $\omega$ is the zero of $\uppsi$ nearest to
 $u_0$ in the metric of $V$.
 Furthermore, for every $b \in H$ the images $A\varphi(t)$
 converge to $P_{\overline{\mathop{\rm Rg} A}} b$ as $t \to \infty$, i.\,e., to the
 orthogonal projection of $b$ onto the closure of the range of $A$.
\end{theorem}

Thus, we can characterize convergence of $\varphi(t)$ in terms of the range of $A$.

\begin{corollary}\label{convergence}
 There exists a global solution $\varphi$ to the differential equation~\eqref{eq:SG}.
 The trajectory $\left(\varphi(t)\right)_{t \in \mathbb{R}_+}$ converges in $V$ if and only if
 \begin{equation}\label{range}
  P_{\overline{\mathop{\rm Rg} A}}b \in \mathop{\rm Rg} A.
 \end{equation}
 Then the limit is the solution of the problem $Au = P_{\overline{\mathop{\rm Rg} A}}b$ with
 minimal distance to $u_0$.
\end{corollary}

\begin{proof}
 First note that the unique solution of equation~\eqref{eq:SG} is
 \[
  \varphi(t) = \mathrm{e}^{-tA^\ast A}u_0 + \int_0^t \mathrm{e}^{-(t-s)A^\ast A}A^\ast b \, \,\mathrm{d} s.
 \]
 Using the decomposition $b = P_{\overline{\mathop{\rm Rg} A}}b + P_{\ker A^\ast}b$, we
 see that $\varphi(t)$ depends only on $P_{\overline{\mathop{\rm Rg} A}}b$, not on $b$ itself.
 Replacing $b$ by its projection onto $\overline{\mathop{\rm Rg} A}$,
 theorem~\ref{gradienttheory} asserts that under condition~\eqref{range}
 the steepest descent converges and the limit has the claimed property.

 For the converse implication, assume that $\varphi(t)$ converges to some $\omega \in V$. Then
 \[
  A\omega \gets A\varphi(t) \to P_{\overline{\mathop{\rm Rg} A}}b
 \]
 by theorem~\ref{gradienttheory} and continuity of $A$.
 Hence $P_{\overline{\mathop{\rm Rg} A}}b \in \mathop{\rm Rg} A$, 
 and thus condition~\eqref{range} is fulfilled.
\end{proof}

The corollary shows in particular that the operator $A$ has closed range
if and only if $\varphi(t)$ converges for every $b \in H$.
But if $\mathop{\rm Rg} A$ is not closed, then arbitrarily small perturbations of $b$ in the
norm of $H$ alter the convergence behavior.
However, it can be proved that $\dot{\varphi}(t) \to 0$ for every $b \in H$ if
$\uppsi$ is non-negative and convex.

%\input{parts/theory.constantDAE}

\subsection{Application to differential algebraic equations}\label{DAEtheory}

Now we turn to linear, autonomous, first-order DAE, allowing
time-dependent inhomogeneities.
This means we fix matrices $M_1, M_2 \in \mathbb{R}^{m \times n}$
such that $\ker M_1 \neq \{0\}$ and a function $b \in L^2(0,T;\mathbb{R}^m)$ and
consider the DAE
\begin{equation}\label{eq:simpleDAE}
 M_1u' + M_2u = b, \quad
 u \in H^1(0,T;\mathbb{R}^n).
\end{equation}
For $V := H^1(0,T;\mathbb{R}^n)$, $H := L^2(0,T;\mathbb{R}^m)$ and $Au := M_1u' + M_2u$
this fits into the framework of section~\ref{sobolev_gradient}.
For convenience, we frequently identify a matrix $M \in \mathbb{R}^{m \times n}$
with a bounded linear operator from $L^2(0,T;\mathbb{R}^n)$ to $L^2(0,T;\mathbb{R}^m)$
acting as $(Mu)(x) := M(u(x))$. It is obvious that these
operators map $H^1(0,T;\mathbb{R}^n)$ into $H^1(0,T;\mathbb{R}^m)$.

We already have discovered that the steepest descent converges
whenever there is a solution to converge to---and then it picks the nearest solution.
But even if there is no solution the steepest descent might
converge. As we have seen in corollary~\ref{convergence} this happens if and only if
$P_{\overline{\mathop{\rm Rg} A}}b \in \mathop{\rm Rg} A$
for the given $b \in L^2(0,T;\mathbb{R}^m)$. Hence it is natural to ask whether
$\mathop{\rm Rg} A$ is closed because then the steepest descent converges for every $b$.
Unfortunately, in general this is not the case as the following necessary condition
shows.

\begin{proposition}\label{closedrange}
 If the operator $A$ defined above has closed range, then
 \begin{equation}\label{eq:contained}
  \mathop{\rm Rg} \left( M_2|_{\ker M_1} \right) \subset \mathop{\rm Rg} M_1.
 \end{equation}
 In other words, if $A$ has closed range, then
 $M_2$ maps $\ker M_1$ into $\mathop{\rm Rg} M_1$.
\end{proposition}

For the proof we use the following simple observation.
\begin{lemma}\label{diffSpace}
 Let $V$ be a subspace of $\mathbb{R}^n$ and assume that $u \in H^1(0,T;\mathbb{R}^n)$
 satisfies $u(x) \in V$ for almost every $x \in (0,T)$.
 Then $u'(x) \in V$ for almost every $x \in (0,T)$.
\end{lemma}
\begin{proof}
 Let $P_V$ denote a projection of $\mathbb{R}^n$ onto $V$.
 We consider $P_V$ also as an operator on $H^1(0,T;\mathbb{R}^n)$
 defined by pointwise application.
 Then linearity of differentiation yields
 \[
  u'(x) = \left(P_V u\right)'(x) = P_V u'(x) \in V
 \]
 for almost every $x \in (0,T)$. This proves the claim.
\end{proof}

\begin{proof}[Proof of proposition~\ref{closedrange}]
 Assume that $\mathop{\rm Rg} A$ is closed and
 that condition~\eqref{eq:contained} does not hold, i.\,e., that there exists
 a vector $e \in \ker M_1 \subset \mathbb{R}^N$ such that $M_2e \not\in \mathop{\rm Rg} M_1$.
 Fix any sequence $(v_k)$ in $H^1(0,T)$ converging
 to a function $v \in L^2(0,T) \setminus H^1(0,T)$ in the norm of $L^2(0,T)$
 and define $u_k := v_k e$.
 Then $v_k M_2e = Au_k \in \mathop{\rm Rg} A$ for all $k \in \mathbb{N}$ by lemma~\ref{diffSpace},
 hence $v M_2e = \lim v_k M_2e \in \overline{\mathop{\rm Rg} A}$.
 Since we assumed that $\mathop{\rm Rg} A = \overline{\mathop{\rm Rg} A}$ there exists $u \in H^1(0,T;\mathbb{R}^n)$
 such that $Au = v M_2e$. We decompose
 \[
  u = u_1 + u_2,
  \quad \text{where } u_1 := P_{\left(\ker M_1\right)^\perp}u \text{ and }
  u_2 := P_{\ker M_1}u,
 \]
 and note that $u_2' \in \ker M_1$ almost everywhere by the above lemma, whence
 \[
  v M_2e = Au = M_1 u_1' + M_2 u.
 \]
 Now fix a row vector $q \in \mathbb{R}^{1 \times m}$
 satisfying $q M_2 e = 1$ and $q M_1 = 0$.
 Such a vector exists because $e$ is chosen such that
 $\mathop{\rm span} \{ M_2e \} \cap \mathop{\rm Rg} M_1 = \{0\}$.
 Finally, observe that
 \[
  qM_2u = q\left( v M_2e - M_1u_1' \right) = v \not\in H^1(0,T),
 \]
 contradicting $u \in H^1(0,T;\mathbb{R}^n)$.
\end{proof}

The following simple examples of DAE show the different behavior that may
occur regarding the closedness of the range. We will revisit them
in section~\ref{closedgraph}.

\begin{example} \rm
 Let
 $M_1 := \bigl(\begin{smallmatrix} 0 & 0 \\ 1 & 0 \end{smallmatrix}\bigr)$ and
 $M_2 := \bigl(\begin{smallmatrix} 0 & 0 \\ 0 & 1 \end{smallmatrix}\bigr)$. Then
 $A\bigl(\begin{smallmatrix} u \\ v \end{smallmatrix}\bigr) = \bigl(\begin{smallmatrix} 0 \\ u' + v \end{smallmatrix}\bigr)$,
 whence
 $
  \mathop{\rm Rg} A = \bigl\{ \bigl(\begin{smallmatrix} 0 \\ f \end{smallmatrix}\bigr) : f \in
  L^2(0,T) \bigr\}
 $
 is closed.
\end{example}

\begin{example} \rm\label{semiclosedrange}
 Let
 $M_1 := \bigl(\begin{smallmatrix} 1 & 0 \\ 1 & 0 \end{smallmatrix}\bigr)$ and
 $M_2 := \bigl(\begin{smallmatrix} 0 & 0 \\ 0 & 1 \end{smallmatrix}\bigr)$. Then
 $A\bigl(\begin{smallmatrix} u \\ v \end{smallmatrix}\bigr) = \bigl(\begin{smallmatrix} u' \\ u' + v \end{smallmatrix}\bigr)$.
 Proposition~\ref{closedrange} shows that $\mathop{\rm Rg} A$ is not closed.
\end{example}

\begin{example} \rm \label{reallynotclosed}
 Let
 $M_1 := \bigl(\begin{smallmatrix} 0 & 1 \\ 0 & 0 \end{smallmatrix}\bigr)$ and
 $M_2 := \bigl(\begin{smallmatrix} 1 & 0 \\ 0 & 1 \end{smallmatrix}\bigr)$. Then
 $A\bigl(\begin{smallmatrix} u \\ v \end{smallmatrix}\bigr) = \bigl(\begin{smallmatrix} v' + u \\ v \end{smallmatrix}\bigr)$.
 We will prove later that $\mathop{\rm Rg} A$ is not closed, see example~\ref{reallynotclosed2}.
 We point out, however, that this does not follow from proposition~\ref{closedrange}.
\end{example}

As we have seen, we cannot expect the steepest descent to converge for any right hand
side $b$. But some regularity assumption on $b$ might ensure convergence.
More precisely, the authors suggest to investigate whether
$b \in H^1(0,T;\mathbb{R}^m)$ implies $P_{\overline{\mathop{\rm Rg} A}} b \in \mathop{\rm Rg} A$.

%\input{parts/closedness}

\section{Closedness}\label{closedgraph}

Considering $V = H^1(0,T;\mathbb{R}^n)$ as done in section~\ref{theory} is
natural since this space is the maximal subspace of
$L^2(0,T;\mathbb{R}^n)$ for which $u'$ can be defined.
However, noting that the equation
$M_1u' + M_2u = b$ can also be written as $(M_1u)' + M_2u = b$,
we see that it suffices to require $M_1u$ to be in $H^1(0,T;\mathbb{R}^m)$,
which may be the case even if $u \not\in H^1(0,T;\mathbb{R}^n)$. More precisely,
the part of $u$ in $\ker M_1$ is allowed to be only $L^2$ instead of $H^1$.
Indeed, the following lemma shows that this describes the maximal subspace of
$L^2(0,T;\mathbb{R}^n)$ to which $A$ can be extended.

\begin{proposition}\label{Abar}
 Define
 \begin{align*}
  D(\bar{A}) & := \left\{ u \in L^2(0,T;\mathbb{R}^n) : M_1u \in H^1(0,T;\mathbb{R}^m) \right\} \subset L^2(0,T;\mathbb{R}^n), \\
  \bar{A}u & := \left(M_1u\right)' + M_2u.
 \end{align*}
 Then the operator $\bar{A}: L^2(0,T;\mathbb{R}^n) \supset D(\bar{A}) \to L^2(0,T;\mathbb{R}^m)$ is closed.
 It is the closure of the operator
 $A: L^2(0,T;\mathbb{R}^n) \supset H^1(0,T;\mathbb{R}^n) \to L^2(0,T;\mathbb{R}^m)$
 defined in section~\ref{DAEtheory}.
\end{proposition}

\begin{proof}
 Denote $V := D(\bar{A})$. To show that $\bar{A}$ is closed,
 fix a sequence $(u_k)$ in $V$ converging in the norm of $L^2(0,T;\mathbb{R}^n)$
 to a function $u$ such that $\bar{A}u_k$ converges to a
 function $v$ in $L^2(0,T;\mathbb{R}^m)$.
 We have to prove that $u \in V$ and $\bar{A}u = v$.
 Define $w_k := M_1 u_k \in H^1(0,T;\mathbb{R}^m)$.
 Then $w_k \to M_1 u$ in $L^2(0,T;\mathbb{R}^n)$ and
 \[
  \bar{A}u_k = w_k' + M_2u_k \to v \mbox{ in } L^2(0,T;\mathbb{R}^m),
 \]
 hence
 \[
  w_k' \to v - M_2u \mbox{ in } L^2(0,T;\mathbb{R}^m).
 \]
 The differentiation operator on $L^2(0,T;\mathbb{R}^m)$ with domain
 $H^1(0,T;\mathbb{R}^m)$ is closed, hence
 $w_k \to M_1 u$ and $w_k' \to v - M_2 u$ implies that
 $M_1 u \in H^1(0,T;\mathbb{R}^n)$ and $(M_1 u)' = v - M_2 u$.
 This means precisely that $u \in V$ and $\bar{A}u = v$.
 We have shown that $\bar{A}$ is closed.

 Now let $P$ be a projection of $\mathbb{R}^n$ onto $\ker M_1$.
 We claim that
 \begin{equation}\label{eq:AbarDom}
  V = \left\{ u \in L^2(0,T;\mathbb{R}^n) : (I - P)u \in H^1(0,T;\mathbb{R}^n) \right\}.
 \end{equation}
 To see this, note that the restriction
 $\widetilde{M}_1\colon \mathop{\rm Rg}(I - P) \to \mathop{\rm Rg} M_1$ of $M_1$ to
 $\mathop{\rm Rg}(I - P)$ is invertible and satisfies
 $\widetilde{M}_1^{-1} M_1 u = (I - P) u$. This shows that
 $(I - P)u$ is in $H^1(0,T;\mathbb{R}^n)$ whenever $M_1 u$ is in $H^1(0,T;\mathbb{R}^m)$.
 The other inclusion similarly follows from $M_1 u = M_1 (I-P) u$.

 To show that $\bar{A}$ is the closure of $A$, for each $u \in V$
 we have to find a sequence $(u_k) \subset H^1(0,T;\mathbb{R}^n)$ such that $u_k \to u$ in $L^2(0,T;\mathbb{R}^n)$
 and $Au_k \to \bar{A}u$ in $L^2(0,T;\mathbb{R}^m)$. Fix $u \in V$ and define $w := (I - P)u$
 and $v := Pu$. The representation~\eqref{eq:AbarDom} shows that $w \in H^1(0,T;\mathbb{R}^n)$.
 Since $H^1(0,T;\mathbb{R}^n)$ is dense in $L^2(0,T;\mathbb{R}^n)$, there exists a sequence
 $(v_k)$ in $H^1(0,T;\mathbb{R}^n)$ which converges to $v$ in $L^2(0,T;\mathbb{R}^n)$. Define
 $u_k := w + P v_k \in H^1(0,T;\mathbb{R}^n)$.
 Then $u_k \to w + P v = w + v = u$ in $L^2(0,T;\mathbb{R}^n)$, thus
 \[
  Au_k = M_1w' + M_2u_k \to M_1w' + M_2u = \bar{A}u \mbox{ in } L^2(0,T;\mathbb{R}^m).
 \]
 This shows that $(u_k)$ is a sequence with the desired property.
\end{proof}

The following corollary restates the closedness of $\bar{A}$ in different words, using
a well-known characterization of closed operators.

\begin{corollary}\label{graphnorm}
 The space $V := D(\bar{A})$ equipped with the inner product
 \[
  \scalar{u}{v}_V := \scalar{u}{v}_{L^2(0,T;\mathbb{R}^n)} + \scalar{\bar{A}u}{\bar{A}v}_{L^2(0,T;\mathbb{R}^m)}
 \]
 is a Hilbert space. The operator $\bar{A}\colon V \to L^2(0,T;\mathbb{R}^m)$ is bounded.
\end{corollary}

This shows how to apply the method of steepest descent to the operator
$\bar{A}$. In general, this will lead to trajectories and limits which are
different from those obtained by the approach in section~\ref{theory}, since
$\nabla\uppsi$ is taken with respect to some other inner product. So the
question arises which space should be used (also compare to
section~\ref{ssec:gradients}). The next corollary shows that from a
theoretical point of view the situation improves if
$H^1(0,T;\mathbb{R}^n)$ is replaced with $V$.

\begin{lemma}\label{closedisbetter}
 Let $A\colon X \supset D(A) \to Y$ be a closable operator,
 and let $\bar{A}$ be its closure. Then
 $\mathop{\rm Rg} A \subset \mathop{\rm Rg} \bar{A} \subset \overline{\mathop{\rm Rg} A}$.
 In particular, if $\mathop{\rm Rg} A$ is closed, then $\mathop{\rm Rg} \bar{A}$ is closed.
\end{lemma}

\begin{proof}
 The first inclusion is obvious since $\bar{A}$ extends $A$.
 Now let $y \in \mathop{\rm Rg} \bar{A}$. Then there exists $x \in D(\bar{A})$ such
 that $\bar{A}x = y$. By definition of the closure
 there exists a sequence $\left(x_n\right) \subset D(A)$ such that
 $x_n \to x$ in $X$ and $Ax_n \to \bar{A}x = y$ in $Y$.
 But this proves that $y$ is a limit of a sequence in $\mathop{\rm Rg} A$, hence
 $y \in \overline{\mathop{\rm Rg} A}$.
\end{proof}

\begin{corollary}\label{betterconvergence}
 Let $b \in L^2(0,T;\mathbb{R}^m)$ and consider problem~\eqref{linearDAE}.
 If the steepest descent with respect to the inner product in $H^1(0,T;\mathbb{R}^n)$ converges
 for the right hand side $b$, then the steepest descent with respect to the inner product
 from corollary~\ref{graphnorm} converges for that right hand side as well.
\end{corollary}

\begin{proof}
 This follows from corollary~\ref{convergence} combined 
 with lemma~\ref{closedisbetter}.
\end{proof}

To illustrate that using of $\bar{A}$ instead of $A$ may improve the situation, but not
always does, we again consider the examples of section~\ref{DAEtheory}.
Here again, $A$ refers to the operator defined in section~\ref{DAEtheory},
whereas $\bar{A}$ and $V$ are as in corollary~\ref{graphnorm}.
The examples also show that relation~\eqref{eq:contained} is independent
of $\mathop{\rm Rg} \bar{A}$ being closed.

\begin{example} \rm
 Let $M_1$ and $M_2$ be as in example~\ref{semiclosedrange}. Then
 \begin{align*}
  V & = \Big\{ \begin{pmatrix} u \\ v \end{pmatrix} : u \in H^1(0,T), \; v 
  \in L^2(0,T) \Big\}, \\
  \mathop{\rm Rg} \bar{A}
   & = \Big\{ \begin{pmatrix} u' \\ u' + v \end{pmatrix} : u \in H^1(0,T), \; v 
   \in L^2(0,T) \Big\}
   = L^2(0,T;\mathbb{R}^2).
 \end{align*}
 We used that every function in $L^2(0,T)$ is the derivative of a function in $H^1(0,T)$.
 This shows that $\bar{A}$ is surjective. In particular $\mathop{\rm Rg} \bar{A}$ is closed,
 whereas $\mathop{\rm Rg} A$ is not as seen in example~\ref{semiclosedrange}.
\end{example}

\begin{example} \rm\label{reallynotclosed2}
 Consider again the matrices $M_1$ and $M_2$ from example~\ref{reallynotclosed}.
 Then
 \begin{align*}
  V & = \Big\{ \begin{pmatrix} u \\ v \end{pmatrix} : u \in L^2(0,T), \; 
  v \in H^1(0,T) \Big\}, \\
  \mathop{\rm Rg} \bar{A} & = \Big\{ \begin{pmatrix} v' + u \\ v \end{pmatrix} :
   u \in L^2(0,T), \; v \in H^1(0, T) \Big\} = L^2(0,T) \times H^1(0,T).
 \end{align*}
 Hence $\mathop{\rm Rg} \bar{A}$ is dense in $L^2(0,T;\mathbb{R}^2)$, but not closed. By lemma~\ref{closedisbetter} this implies
 that also $\mathop{\rm Rg} A$ is not closed. This proves the claim of example~\ref{reallynotclosed}.
\end{example}

%\input{parts/lojasiewicz}

\section{Fredholm Property}\label{fredholm}

Assuming that there exists a solution of~\eqref{eq:simpleDAE}
we are interested in the convergence behavior of the Sobolev steepest
descent. For example the so-called \L ojasiewicz-Simon inequality can be used to
investigate the rate of convergence~\cite{Har03}.
On the other hand, for the non-linear case treated in the next section a special
instance of this inequality has been used to prove convergence for arbitrary initial
estimates~\cite[Section~4.2]{Neu97}.

A particularly simple method to show that a \L ojasiewicz-Simon inequality
holds locally near a critical point $u_0 \in V$ is by checking
that $\uppsi''(u_0) = A^\ast A$ is a Fredholm operator~\cite[Corollary~3]{Chi06}.
Unfortunately, theorem~\ref{noFredholm} shows that we never are in
this situation when $A$ is the operator of section~\ref{theory}.
This fact is interesting in its own right.
Of course this does not mean that the \L ojasiewicz-Simon inequality
cannot be fulfilled for any steepest descent coming from a DAE;
we give an example at the end of the section.

\begin{lemma}\label{DastD}
 Let $D\colon H^1(0,T) \to L^2(0,T)$, $u \mapsto u'$.
 Then $D^\ast D = I - (I - \Delta_N)^{-1}$, where $\Delta_N$
 denotes the Neumann Laplacian $\Delta_N u = u''$ with domain
 \[
  D(\Delta_N) = \left\{ u \in H^2(0,T) : u'(0) = u'(T) = 0 \right\}.
 \]
\end{lemma}
\begin{proof}
 By definition, $\scalar{D^\ast Du}{v}_{H^1} = \scalar{Du}{Dv}_{L^2}$
 for all $u, v \in H^1(0,T)$. Thus it suffices to show that
 \begin{align*}
  \int_0^T u' v'
   & \stackrel{!}{=} \scalar{\left(I - (I - \Delta_N)^{-1}\right) u}{v}_{H^1} \\
   & = \int_0^T u v + \int_0^T u' v' - \int_0^T \left((I - \Delta_N)^{-1} u \right) v - \int_0^T \left( (I - \Delta_N)^{-1} u \right)' v'.
 \end{align*}
 This is an immediate consequence of the integration by parts formula,
 using that $(I - \Delta_N)^{-1} u \in D(\Delta_N)$. In fact,
 \begin{align*}
  \int_0^T \left( (I - \Delta_N)^{-1} u \right)' v'
   & = \left((I - \Delta_N)^{-1} u\right)' v \bigr|^T_0 - \int_0^T \left( (I - \Delta_N)^{-1} u \right)'' v \\
   & = \int_0^T \left( (I - \Delta_N) (I - \Delta_N)^{-1} u - (I - \Delta_N)^{-1} u \right) v.
 \end{align*}
 Collecting the terms, the claimed identity follows.
\end{proof}
As the embedding of $H^2(0,T)$ into $H^1(0,T)$ is compact, the above lemma shows
that $D^\ast D$ is a compact perturbation of the identity.
This result generalizes to $D\colon H^1(0,T;\mathbb{R}^n) \to L^2(0,T;\mathbb{R}^n)$, $u \mapsto u'$
by considering every component separately.

\begin{theorem}\label{noFredholm}
 Consider the operator $A\colon H^1(0,T;\mathbb{R}^n) \to L^2(0,T;\mathbb{R}^m)$ defined
 by $A := DM_1 + \iota M_2$ as introduced in section~\ref{theory}.
 Here the matrices $M_1$ and $M_2$ act as operators from $H^1(0,T;\mathbb{R}^n)$
 into $H^1(0,T;\mathbb{R}^m)$, and the differentiation $D$ and the embedding $\iota$
 map from $H^1(0,T;\mathbb{R}^n)$ into $L^2(0,T;\mathbb{R}^n)$.
 Then $A^\ast A = M_1^T M_1 + K$ for a compact operator
 $K$ acting on $H^1(0,T;\mathbb{R}^n)$ which shows that $A^\ast A$ is a Fredholm operator if
 and only if $\ker M_1 = \{0\}$.
\end{theorem}

\begin{proof}
 The embedding $\iota$ is compact, hence also $\iota^\ast$ is a compact operator.
 By lemma~\ref{DastD}, $D^\ast D = I + \tilde{K}$ for a compact operator $\tilde{K}$.
 Using the ideal property of compact operators, we obtain
 \[
  A^\ast A = M_1^\ast M_1 + K = M_1^T M_1 + K
 \]
 for a compact operator $K$ on $H^1(0,T;\mathbb{R}^n)$. Because compact
 perturbations of Fredholm operators remain Fredholm operators~\cite[Corollary~4.47]{AA02},
 $A^\ast A$ is a Fredholm operator if and only if $M_1^T M_1$ is.
 If $M_1$ has trivial kernel, then $M_1^T M_1$ is invertible and hence
 a Fredholm operator. If on the other hand $\ker M_1 \neq \{0\}$, then 
 $\dim \ker M_1^T M_1 = \infty$ as an operator on $H^1(0,T;\mathbb{R}^n)$,
 implying that $M_1^T M_1$ is not a Fredholm operator.
\end{proof}

However, the next example shows that $\bar{A}^\ast \bar{A}$ might be a
Fredholm operator even though $A^\ast A$ is not.
This shows that also in this sense we can improve the situation by replacing
$A$ by $\bar{A}$.

\begin{example} \rm
 For
 $M_1 := \bigl(\begin{smallmatrix} 1 & 0 \\ 0 & 0 \end{smallmatrix}\bigr)$
 and
 $M_2 := \bigl(\begin{smallmatrix} 0 & 0 \\ 0 & 1 \end{smallmatrix}\bigr)$
 let $\bar{A}$ be defined as in proposition~\ref{Abar}. It is easy to check that
 \[
  \ker \bar{A} = \Big\{ \begin{pmatrix} u \\ 0 \end{pmatrix} :
   u \equiv c \in \mathbb{R} \Big\}
  \quad\text{and}\quad
  \mathop{\rm Rg} \bar{A} = L^2(0,T) \times L^2(0,T),
 \]
 proving that $\bar{A}$ is a Fredholm operator of index $1$.
 This shows that also $\bar{A}^\ast \bar{A}$ is a Fredholm operator,
 see~\cite[Theorems~4.42 and 4.43]{AA02}.
\end{example}

On the other hand, $\bar{A}^\ast \bar{A}$ is not necessarily 
a Fredholm operator,
e.\,g.\ it is not for $M_1 := \begin{pmatrix} 1 & 0 \end{pmatrix}$ and
$M_2 := \begin{pmatrix} 0 & 1 \end{pmatrix}$.
It would be useful to have a characterization of $\bar{A}^\ast \bar{A}$ being a Fredholm
operator in terms of the matrices $M_1$ and $M_2$. This would provide a tool to investigate
the rate of convergence of the steepest descent.

%\input{parts/nonlinear}

\section{The Non-Linear Case}\label{nonlinear}

In this section we consider the general, fully non-linear first order DAE
\begin{equation}\label{eq:dae:nonlinear}
 f(t,u(t),u'(t)) = 0
\end{equation}
where $f\colon [0,T]\times\mathbb{R}^n\times\mathbb{R}^n\to\mathbb{R}^m$.
We treat this case in utmost generality, 
not caring about convergence. Instead, we focus on
the theoretical background needed to formulate various
steepest descent equations corresponding to the gradients
introduced in sections~\ref{theory} and~\ref{closedgraph}.

We need to formulate the DAE~\eqref{eq:dae:nonlinear} in a functional analytic way in order to apply
Sobolev gradient methods. We want to define the operator
\begin{equation}\label{eq:F:nonlinear}
 F\colon H^1(0,T;\mathbb{R}^n)\to L^2(0,T;\mathbb{R}^m), \; F(u) := t \mapsto f(t,u(t),u'(t))
\end{equation}
and to minimize the (non-linear) functional
\begin{equation}\label{eq:psi:nonlinear}
 \uppsi\colon H^1(0,T;\mathbb{R}^n) \to \mathbb{R}, \; \uppsi(u) := {\textstyle \frac{1}{2}} \| F(u) \|^2_2.
\end{equation}
Such an operator $F$ is frequently called
\emph{Nemytskii operator}~\cite[Chapter~1]{AP93:pna} or
\emph{differential operator}~\cite{AZ90}.
We require it to be well-defined and at least differentiable.
This is the case if $f$ fulfills certain
regularity and growth conditions.
For the sake of completeness, we prove a lemma of this kind.
Similar conditions involving higher order partial derivatives can be found
which guarantee $F$ to be of higher regularity, for example of class $\mathrm{C}^2$.

We say that a function $g\colon [0,T]\times\mathbb{R}^n\times\mathbb{R}^n\to\mathbb{R}^m$ satisfies the
growth assumption~\eqref{eq:growth} if for every compact set $K \subset \mathbb{R}^n$
there exist constants $C, M > 0$ only depending on $f$, $T$ and $K$ such that
\[\tag{G}\label{eq:growth}
 \forall t \in [0,T] \quad \forall x \in K \quad \forall y \in \mathbb{R}^n \quad
 \left| g(t, x, y) \right| \le C \left| y \right| + M
\]
where $|\cdot|$ denotes a norm in $\mathbb{R}^m$ or $\mathbb{R}^n$, respectively.
Similarly, we say that $g$ satisfies the boundedness assumption~\eqref{eq:bounded} if for every
compact set $K \subset \mathbb{R}^n$ there exists $L > 0$ only depending on $f$, $T$ and $K$ such that
\[\tag{B}\label{eq:bounded}
 \forall t \in [0,T] \quad \forall x \in K \quad \forall y \in \mathbb{R}^n \quad
 \left| g(t, x, y) \right| \le L.
\]

\begin{lemma}\label{lemma:FisC1}
 Let $f\colon [0,T]\times\mathbb{R}^n\times\mathbb{R}^n\to\mathbb{R}^m$ be measurable, and denote
 its arguments by $(t, x, y)$. Assume that $f$ is of class
 $\mathrm{C}^2$ with respect to $(x, y)$. We denote the matrix-valued partial derivative of
 $f$ with respect to $x$ by $f_x$, and similarly for $y$ and higher order partial derivatives.
 Assume that $f$, $f_x$, $f_{xx}$ and $f_{xy}$ satisfy~\eqref{eq:growth}
 and that $f_y$ and $f_{yy}$ satisfy~\eqref{eq:bounded}.
 Then $F$ as in~\eqref{eq:F:nonlinear} is a mapping of class $\mathrm{C}^1$
 from $H^1(0,T;\mathbb{R}^n)$ to $L^2(0,T;\mathbb{R}^m)$, and its derivative at $u \in H^1(0,T;\mathbb{R}^n)$
 applied to $h \in H^1(0,T;\mathbb{R}^n)$ is
 \begin{equation}\label{eq:F'}
  \left(F'(u)h\right)(t) = f_x(t, u(t), u'(t))h(t) + f_y(t, u(t), u'(t))h'(t)
 \end{equation}
 for almost every $t \in [0,T]$.
\end{lemma}

\begin{proof}
 Let $u \in H^1(0,T;\mathbb{R}^n)$ be arbitrary. As $H^1(0,T)$ continuously embeds
 into $\mathrm{C}[0,T]$, $u$ can be chosen to be a continuous function.
 Thus there exists $R$ such that $|u(t)| \le R$
 for all $t \in [0,T]$. Let $K$ be the closure of the ball $B(0,R+1)$.
 For this $K$, fix constants $C$, $M$ and $L$
 simultaneously satisfying~\eqref{eq:growth} and~\eqref{eq:bounded}
 for all the functions in the assumptions.
 The estimate $|F(u)(t)| \le C |u'(t)| + M$, $t \in [0,T]$, shows $F(u) \in L^2(0,T;\mathbb{R}^m)$.
 Similarly, for $F'(u)$ defined by~\eqref{eq:F'} we obtain
 \begin{align*}
  \left\|F'(u)h\right\|_2^2 & = \int \left| \left( F'(u) h \right)(t) \right|^2
   \le \int 2 \Big( \left( C |u'(t)| + M \right)^2 |h(t)|^2 + L^2 |h'(t)|^2 \Big) \\
   & \le 4 \left( C^2 \left\|u'\right\|_2^2 + T M^2 \right) \left\|h\right\|_\infty^2 + 2L^2 \left\|h'\right\|_2^2.
 \end{align*}
 Because $H^1(0,T)$ embeds into $L^\infty(0,T)$ continuously,
 this proves the boundedness of $F'(u)$ as an operator from
 $H^1(0,T;\mathbb{R}^n)$ to $L^2(0,T;\mathbb{R}^m)$.

 Next, we show that $F'(u)$ is indeed the derivative of $F$ at $u$.
 For every $t \in \mathbb{R}$ and $x, y \in \mathbb{R}^n$, denote by
 $o_{t,x,y}\colon \mathbb{R}^n \times \mathbb{R}^n \to \mathbb{R}^m$ the error
 in the expansion
 \[
  f(t,x+\varepsilon_1,y+\varepsilon_2) = f(t,x,y) + f_x(t,x,y) \varepsilon_1 
  + f_y(t,x,y) \varepsilon_2 + o_{t,x,y}(\varepsilon_1,\varepsilon_2)
   \Big| \begin{pmatrix} \varepsilon_1 \\ \varepsilon_2 \end{pmatrix} \Big|.
 \]
 We have to show that the error
 \[
  \Bigl( F(u+h)(t) - F(u)(t) - (F'(u)h)(t) \Bigr) 
  \Big| \begin{pmatrix} h(t) \\ h'(t) \end{pmatrix} \Big|^{-1}
   = o_{t,u(t),u'(t)}(h(t),h'(t))
 \]
 approaches zero as a function in $t$ with respect to the norm 
 of $L^2(0,T;\mathbb{R}^m)$ as $h$ tends to zero in $H^1(0,T;\mathbb{R}^n)$.
 For this we employ the estimate
 \[
  \left| g(x+h) - g(x) - g'(x)h \right| \le \sum_{i,j=1}^N \sup_{y \in [x,x+h]} \left| g_{x_i x_j}(y) \right| \left|h_i\right| \left|h_j\right|
 \]
 for functions $g\colon \mathbb{R}^N \to \mathbb{R}$ of class $\mathrm{C}^2$
 which can be verified by iterated applications of the mean value theorem.
 Thus by the assumptions on the second derivatives, for small enough
 $h \in H^1(0,T;\mathbb{R}^n)$ we obtain that
 \begin{align*}
  \bigl| o_{t,u(t), u'(t)}(h(t), h'(t)) \bigr|
   & \le \frac{ \sup \left|f_{xx}\right| 
     \left|h\right|^2 + 2 \sup \left|f_{xy}\right| 
     \left|h\right| \left|h'\right| + \sup 
     \left|f_{yy}\right| \left|h'\right|^2}{\left(|h|^2 + 
      |h'|^2\right)^{1/2}} \\
   & \le 3 \bigl( C (|u'| + |h'|) + M \bigr) |h| + L |h'|
 \end{align*}
 for every $t \in [0,T]$. By similar arguments as above, this estimate 
shows that $o_{\cdot, u(\cdot), u'(\cdot)}(h(\cdot), h'(\cdot))$ goes 
to zero in $L^2(0,T;\mathbb{R}^m)$  as $h$ tends to zero in 
$H^1(0,T;\mathbb{R}^n)$. This proves that $F'(u)$ is the
 derivative of $F$ at $u$.

 Finally, the continuity of the operator-valued function $F'$ on $H^1(0,T;\mathbb{R}^n)$
 can be proved in a similar manner. For this, we have to make use of the growth conditions
 on the second order derivatives.
\end{proof}
\begin{remark} \rm
 The lemma suffices for most applications. For example for quasi-linear problems,
 i.\,e., for $f(t,x,y) = g(t,x) y + h(t,x)$,
 and thus in particular for linear and semi-linear problems,
 the above assumptions are fulfilled whenever $g$ and $h$ are sufficiently smooth,
 independently of their growth behavior.
\end{remark}

The assumptions on $f$ can be weakened by imposing
more regularity on the solution $u$ as the following corollary shows.

\begin{corollary}
 Assume that $f\colon [0,T] \times \mathbb{R}^n \times \mathbb{R}^n$ is of class $\mathrm{C}^2$.
 Then $F$ defined as in~\eqref{eq:F:nonlinear} is a mapping of class $\mathrm{C}^1$
 from $H^2(0,T;\mathbb{R}^n)$ to $L^2(0,T;\mathbb{R}^m)$, and its derivative is as stated in
 equation~\eqref{eq:F'}.
\end{corollary}

\begin{proof}
 Since functions in $H^1(0,T)$ are bounded,
 the values of $f(t, u(t), u'(t))$ remain in a bounded set as $t$ ranges over $[0,T]$ and $u$ ranges over the unit
 ball in $H^2(0,T;\mathbb{R}^n)$, and the same statement holds for the partial derivatives.
 Using this fact, the arguments are similar to the proof of the lemma.
\end{proof}
However, it might happen that solutions of~\eqref{eq:dae:nonlinear} are of class $H^1$ but not of class $H^2$,
see for example equation~\eqref{eq:exbvp:simple} in section~\ref{sec:solspaceest}.
In such cases we impose too much regularity when choosing this Sobolev space.
For a general discussion about the technique
of using spaces of higher order than strictly necessary for Sobolev descent methods,
we refer to~\cite[Section~4.5]{Neu97}.

For the moment, we assume that $F\colon H^1(0,T;\mathbb{R}^n) \to L^2(0,T;\mathbb{R}^n)$
is of class $\mathrm{C}^1$. Later we will need higher regularity.
By the chain rule, the derivative of $\uppsi$ defined in~\eqref{eq:psi:nonlinear} is
\[
 \uppsi'(u)h = \scalar{F(u)}{F'(u)h}_{L^2} = \scalar{F'(u)^\ast F(u)}{h}_{H^1}.
\]
Analogously to the linear case, we define the \emph{$H^1$ Sobolev gradient} as
\[
 \nabla_{H^1}\uppsi(u) = F'(u)^\ast F(u)
\]
and consider trajectories of the corresponding steepest descent equation~\eqref{eq:SG}.
It is possible to find sufficient conditions under which those trajectories converge
to a solution of~\eqref{eq:dae:nonlinear}. In fact, this is one of the main topics
in the monograph~\cite{Neu97}.

However, it is known that for some examples using a weighted Lebesgue measure
for the computation of the Sobolev gradient---giving rise to
\emph{weighted Sobolev gradients}---improves the situation
significantly, cf.~\cite{Mah95:phd,Mah97:conv,Mah97:singular,Mah99:weighted}.
This complements our discussion in section~\ref{closedgraph} where
we showed that the convergence behavior can be improved by choosing an inner
product related to the problem itself.
We now generalize the inner product considered in that section to the non-linear case.
To this end, we equip $H^1(0,T;\mathbb{R}^n)$ with a variable inner product making it into a Riemannian
manifold.  A similar idea has been investigated by Kar\'atson and Neuberger in a recent article~\cite{NeuKar07}
where they identify Newton's method as a steepest descent with respect to a
certain variable inner product. The resulting method is quite similar to what
we present here. However, they make assumptions which are not fulfilled in our case.

For the rest of this section, we make use of the notations of~\cite{Lang95}.
\begin{lemma}\label{Riemannmetric}
 Let $F\colon H^1(0,T;\mathbb{R}^n) \to L^2(0,T;\mathbb{R}^m)$ be of class $\mathrm{C}^2$.
 Choose $\lambda > 0$. Then the mapping
 \[
  g_2\colon H^1(0,T;\mathbb{R}^n) \to \mathcal{L}^2_{\mathrm{sym}}\left(H^1(0,T;\mathbb{R}^n)\right)
 \]
 defined by
 \[
  g_2(u) := \left( (f, g) \mapsto \lambda \scalar{f}{g}_{H^1(0,T;\mathbb{R}^n)} + \scalar{F'(u)f}{F'(u)g}_{L^2(0,T;\mathbb{R}^m)} \right)
 \]
 makes $H^1(0,T;\mathbb{R}^n)$ into an infinite dimensional Riemannian manifold.
\end{lemma}

\begin{proof}
 We choose only one chart as the atlas of the manifold
 $X := H^1(0,T;\mathbb{R}^n)$, namely the identity mapping
 onto the Banach space $\mathbf{E} := H^1(0,T;\mathbb{R}^n)$.
 The tangent bundle is trivial, i.\,e., $TX \cong X \times \mathbf{E}$.
 In this case, a Riemannian metric on $X$ is a sufficiently smooth mapping
 $g = (g_1, g_2)$ from $X$ to $X \times \mathcal{L}^2_{\mathrm{sym}}(\mathbf{E})$
 such that $g_1 = \mathrm{id}$ and $g_2(u)$ is positive definite for every
 $u \in X$. Choose $g = (\mathrm{id}, g_2)$ with $g_2$ as above.
 Then $g$ is of class $\mathrm{C}^1$ by the chain rule, and $g_2(u)$
 is positive definite.
\end{proof}

Here $\lambda > 0$ can be chosen arbitrarily. Large values of $\lambda$
increase the distance of $g_2$ to a singular form, whereas for small values
of $\lambda$ the metric is closer to the original problem.
Both effects are desirable, so one has to find a balance between
these goals when choosing $\lambda$.

We want to apply the steepest descent method on Riemannian manifolds.
For finite dimensional manifolds, a discussion of this can be found for
example in~\cite[Section~7.4]{Udr94}.
We have to compute the gradient $\nabla_g \uppsi$ of the functional $\uppsi$ defined
in~\eqref{eq:psi:nonlinear}. By definition, the gradient at $u \in H^1(0,T;\mathbb{R}^n)$ satisfies
\begin{align*}
 \uppsi'(u) h & = \scalar{F(u)}{F'(u)h}_{L^2} = \scalar{F'(u)^\ast F(u)}{h}_{H^1} \\
  & = \scalar{\nabla_g \uppsi(u)}{h}_g
  = \lambda \scalar{\nabla_g \uppsi(u)}{h}_{H^1} + \scalar{ F'(u) \nabla_g \uppsi(u) }{ F'(u) h }_{L^2}
\end{align*}
for every $h \in H^1(0,T;\mathbb{R}^n)$. Thus, we obtain the representation
\[
 \nabla_g \uppsi(u) = \left( \lambda + F'(u)^\ast F'(u) \right)^{-1} F'(u)^\ast F(u)
\]
for $u \in H^1(0,T;\mathbb{R}^n)$. If $F$ is of class $\mathrm{C}^2$, there
exists a (local) solution to the steepest descent equation~\eqref{eq:SG} for any
initial value $u_0 \in H^1(0,T;\mathbb{R}^n)$.

Note that if the problem is linear, i.\,e., if there exist matrices
$M_1$ and $M_2$ and a function $b$
such that $F(u)(t) = M_1 u'(t) + M_2 u(t) - b(t)$, then the Riemannian
metric in lemma~\ref{Riemannmetric} equals the inner product corresponding
to the graph norm of the operator $Au = M_1 u' + M_2 u$.
Thus our approach indeed generalizes the discussion
of section~\ref{closedgraph} to the non-linear case.

We mention that these considerations lead to numerical computations similar to the
\emph{Levenberg-Marquardt} algorithm. This algorithm adapts
to local properties of the functional by varying $\lambda$.
Of course we could resemble this in our
setting by letting $\lambda$ smoothly depend on $u \in H^1(0,T;\mathbb{R}^n)$, thus introducing
a slightly more complicated Riemannian metric on the space.
If we let $\lambda$ tend to zero, we arrive at
the \emph{Gauss-Newton method} for solving non-linear least squares problems.
For a detailed treatment these methods see for example~\cite[Section 10.3]{NW06:no}.

In the literature about Sobolev gradient methods, one notices that a lot of properties
of linear problems carry over to the non-linear ones under some regularity
conditions. But it seems to be an open question whether there exists a non-linear analogue to
the fact that the Sobolev descent converges to the \emph{nearest} solution
of the equation, if one exists.
It is natural to assume that this question is closely related
to the theory of Riemannian metrics.
More precisely, it is quite possible that up to reparametrization the
trajectories of the steepest descent are geodesics of a suitable
Riemannian metric.
If this is the case, then this fact should be regarded as the appropriate generalization
of the linear result.
Those questions are beyond the scope of this article, but we
propose this investigation as a rewarding topic of research.


%\input{parts/numerics}

\section{Numerics}\label{numerics}

First we deal with general linear non-autonomous DAE. We
explain our discretization and how we calculate a Sobolev gradient. 
In the abstract setting different norms lead to different gradients. We show how this
can be transferred to the finite dimensional numerical setting taking the
\emph{graph norm} introduced in corollary~\ref{graphnorm} as an example.
We introduce several different gradients 
with varying numerical properties.
After that we discuss the overall steepest descent algorithm and the
step size calculation.
Then we move on to the fully non-linear case as in
section~\ref{nonlinear} and show how the numerics of the
linear case can be generalized. Finally, we show how supplementary conditions
can be integrated into Sobolev steepest descent.

%\input{parts/numerics.discretization}

\subsection{Discrete Formulation of Linear DAE}

First, we treat equation~\eqref{linearDAE} where the matrices $M_1$ and $M_2$
may depend on $t\in[0,T]$. For all discretizations we employ the finite differences scheme.
We fix an equidistant partition of $[0,T]$
into $N$ subintervals of length $\delta:= \frac{T}{N}$. We define a finite dimensional
version of a vector valued function $w$ as the vector $\tilde w$ containing
the values $w(0), w(\delta), \dots, w(T)$.
Hence a numerical solution is represented by $\tilde u\in\mathbb{R}^{(N+1)n}$ with structure
\[
 \tilde u = \begin{pmatrix}\tilde u_k\end{pmatrix}_{k=0}^N,\quad \tilde u_k \approx u(\delta k)\in\mathbb{R}^n \text{ for } k=0,\dots,N.
\]

Define the block diagonal matrices
$A,B \in \mathbb{R}^{(N+1)m\times (N+1)n}$ with blocks
$M_1(0)$, $M_1(\delta)$, \dots, $M_1(T)$ and $M_2(0)$, $M_2(\delta)$, \dots, $M_2(T)$, respectively.
An approximation of the functional $\uppsi$ is given by
\begin{equation}\label{eq:psi:discrete}
 \tilde\uppsi\colon \mathbb{R}^{(N+1)n} \to \mathbb{R}_+, \;
  \tilde u \mapsto {\textstyle \frac{T}{2(N+1)}} \bigl\| Q\tilde u - \tilde b \bigr\|^2_{\mathbb{R}^{(N+1)m}},
\end{equation}
where the matrix $Q$ is defined as
\begin{equation}\label{eq:def:Q}
 Q = A D_1 + B,\quad Q\in\mathbb{R}^{(N+1)m\times (N+1)n}
\end{equation}
for a matrix $D_1 \in \mathbb{R}^{(N+1)n\times (N+1)n}$ that numerically differentiates each
component of a discretized function.
The matrix $Q$ is a discrete version of the differential operator of the DAE.
Note that we replaced the $L^2$ function space norm by the corresponding finite dimensional
Euclidean norm.

There are many possible choices for the matrix $D_1$.
We use central differences involving both neighbor grid points in the
interior and forward and backward differences at the
boundary, all of them $\mathcal{O}(\delta^2)$ approximations. For
$n=1$ the differentiation matrix is
\begin{equation}\label{eq:diffD1}
 D^{(1)}_1 = \frac{1}{2\delta}
 \begin{pmatrix}
 1 & -4 & 3\\
 -1 & 0 & 1\\ 
 & \ddots & & \ddots\\ 
 & & -1 & 0 & 1\\
 & & -3 & 4 & -1\\ 
 \end{pmatrix}
 \in\mathbb{R}^{(N+1)\times (N+1)}.
\end{equation}
In general it is
\[
 D_1 = D^{(n)}_1 = D_1^{(1)}\otimes I_n,
\]
where $\otimes$ denotes the \emph{Kronecker matrix product} (see e.\,g.~\cite[p.~220]{KM06})
and $I_n$ the $n\times n$ identity matrix.

%\input{parts/numerics.gradients}

\subsection{Different Gradients in Finite Dimensional Spaces}\label{ssec:gradients}

We regard the derivative $\tilde\uppsi'(\tilde u)$ as a linear functional acting on $\mathbb{R}^{(N+1)n}$.
Then the ordinary Euclidean gradient of $\tilde\uppsi$ at $\tilde u$ can be calculated
in terms of the matrix $Q$ as follows.
\[
 \tilde\uppsi'(\tilde u)h = {\textstyle \frac{T}{N+1}} \scalar{Q\tilde u-\tilde b}{Qh}_{\mathbb{R}^{(N+1)m}} = \scalar{{\textstyle \frac{T}{N+1}} \bigl(Q^T Q\tilde u-Q^T\tilde b\bigr)}{h}_{\mathbb{R}^{(N+1)n}}
\]
This equality holds for all $h\in\mathbb{R}^{(N+1)n}$, thus 
\begin{equation}\label{eq:eucnablaphi}
 \nabla_\mathrm{e} \tilde\uppsi(\tilde u) := {\textstyle \frac{T}{N+1}} \bigl( Q^T Q\tilde u - Q^T\tilde b \bigr)
\end{equation}
is the \emph{Euclidean gradient}.

Now we explain how to compute different Sobolev gradients. 
To this end, note that the above Euclidean gradient does not correspond in any way to the
gradient of $\uppsi$ in the abstract setting.
In fact, $Q^T$ is the adjoint of $Q$ with respect to the Euclidean inner product
whereas in~\eqref{eq:gradient} the adjoint is taken with respect to the norm in $H^1$.
Thus, we have to discretize the $H^1$ inner product and use it to calculate the corresponding finite dimensional
adjoint.

Any inner product can be related to the ordinary Euclidean inner product via a positive definite
matrix. For $H^1(0,T;\mathbb{R}^n)$ we choose
\begin{equation}\label{eq:def:SH}
 S_{\mathrm{H}} := I_{(N+1)n} + D_1^T D_1.
\end{equation}    
By definition, the Sobolev gradient $\nabla_{\mathrm{H}} \tilde\uppsi(\tilde u)$
at the point $\tilde u$ has to satisfy
\[
 \tilde\uppsi'(\tilde u)h = \scalar{\nabla_{\mathrm{H}}\tilde\uppsi(\tilde u)}{h}_{H} = \scalar{S_{\mathrm{H}}\nabla_{\mathrm{H}}\tilde\uppsi(\tilde u)}{h}_{\mathbb{R}^{(N+1)n}} = \scalar{\nabla_\mathrm{e} \tilde\uppsi(\tilde u)}{h}_{\mathbb{R}^{(N+1)n}}
\]
for all $h\in\mathbb{R}^{(N+1)n}$. Therefore, to calculate the gradient $\nabla_{\mathrm{H}}$
numerically it suffices to solve the linear system
\begin{equation}\label{eq:gradientcalculation}
 S_{\mathrm{H}} x = \nabla_\mathrm{e}\tilde\uppsi(\tilde u)
\end{equation}
for the unknown $x\in\mathbb{R}^{(N+1)n}$.

Using the Sobolev gradient $\nabla_{\mathrm{H}}$ instead of $\nabla_\mathrm{e}$
already results in significantly better numerical performance.
Nevertheless, further improvements can be achieved using appropriately weighted Sobolev
gradients. For a detailed treatment of steepest descent in weighted Sobolev spaces
in the context of ODE and PDE with singularities, we refer to~\cite{Mah95:phd}.

Section~\ref{closedgraph} already
indicated the graph norm as a promising candidate for a norm
that is tailored to the structure of the DAE.
Hence we consider inner products in finite dimensions that are
related to the graph norm. Natural candidates are associated with the
positive definite matrices
\begin{equation}\label{eq:def:S}
\begin{split}
 S_{\mathrm{W}_1,\lambda} &:= \lambda I_{(N+1)n} + A^T D_1^T D_1 A,\\
 S_{\mathrm{W}_2,\lambda} &:= \lambda I_{(N+1)n} + A^T D_1^T D_1 A + B^T B,\\
 S_{\mathrm{G},\lambda} &:= \lambda I_{(N+1)n} + Q^T Q,
\end{split} 
\end{equation}
for $\lambda>0$. The identity matrix guarantees positive definiteness, while the
respective other part determines the relation to the DAE.
By choosing $\lambda$ smaller, the graph part gains more
weight.
Note that $S_{\mathrm{G},1}$ is a straightforward discretization of the graph norm.
We can calculate the corresponding Sobolev gradients $\nabla_{\mathrm{W}_1,\lambda}$,
$\nabla_{\mathrm{W}_2,\lambda}$ and $\nabla_{\mathrm{G},\lambda}$ as before by solving linear
systems similar to equation~\eqref{eq:gradientcalculation}.

We mention that the matrices in~\eqref{eq:def:S} are still sparse but structurally more complicated than
the matrix $S_{\mathrm{H}}$ defined in~\eqref{eq:def:SH} which corresponds to the $H^1$ inner product.
The matrix $S_{\mathrm{H}}$ is block-diagonal,
which allows us to solve the linear system individually within each block.
All the $n$ blocks equal $I_{N+1} + (D_1^{(1)})^T D_1^{(1)}$ which
is a band matrix depending only on the choice of numerical differentiation.
As it usually is tridiagonal or pentadiagonal, efficient solvers are
available for the corresponding linear systems.

%\input{parts/numerics.stepsize}

\subsection{Discrete Steepest Descent Algorithm and Step Size Calculation}\label{stepsize}

We want to discretize the continuous steepest descent~\eqref{eq:SG}. Once we
have decided which gradient $\nabla$ to use, we follow the
usual scheme of steepest descent algorithms and the more general \emph{line
search methods}~\cite[Chapter~3]{NW06:no}. First we fix an initial estimate
$\tilde u_0$. Then we know that $-\nabla\tilde{\uppsi}(\tilde{u}_0)$ is a descent direction of
$\tilde\uppsi$ at $\tilde u_0$, i.\,e., $\tilde\uppsi(\tilde u_0)$ locally
decreases along the direction of the negative gradient. More precisely, the
negative gradient specifies the direction in which the directional derivative
(G\^{a}teaux derivative, cf.~\cite{AP93:pna}) becomes minimal among all
directions of unit length which is where the choice of the norm comes in.

For a discretization of the continuous steepest descent~\eqref{eq:SG}, we have
to make steps which are small enough such that $\tilde\uppsi$ still decreases, and large
enough such that it decreases significantly. A straight-forward choice for the
\emph{step size} $s^\ast$ is the least non-negative real number that
minimizes $\tilde\uppsi(\tilde u - s^\ast \nabla)$, assuming that such a
number exists. Here we abbreviated $\nabla\tilde\uppsi(\tilde u)$ by $\nabla$.
Of course, if $\nabla \neq 0$ there exists a positive $s^\ast$ such that
$\tilde\uppsi(\tilde u - s^\ast \nabla) < \tilde\uppsi(\tilde u)$. Since
this is the only occurrence of the gradient in the algorithm, the scaling of the
gradient can be compensated by the choice of $s^\ast$. Thus the results
remain the same if we drop the factor $\frac{T}{N+1}$ in
formula~\eqref{eq:eucnablaphi} for our calculations.

In the linear case it is easy to calculate the optimal $s^\ast$ by interpolation, as
along a line the functional is a quadratic polynomial.
But in the non-linear case this is a more difficult problem. 
In practice, it usually is sufficient to calculate a local minimizer
instead of the global minimizer $s^\ast$. Nocedal and Wright
give a description of sophisticated step-length selection algorithms~\cite[Section~3.5]{NW06:no}.
Those algorithms try to use function values and gradient information
as efficiently as possible and produce step sizes satisfying certain descent conditions.
In our implementation we assume local convexity and search 
along an exponentially increasing sequence for the first increase of $\tilde\uppsi$ on the line. We then
perform a ternary search with this upper bound yielding a local minimizer of $\tilde\uppsi$.

We experienced that usually it is advantageous to damp the step size $s^\ast$,
i.\,e., to multiply $s^\ast$ by a factor $\mu \in (0,1)$,
when using the gradient itself for the direction.
Alternatively, our implementation provides the possibility to incorporate previous step
directions and step sizes into the calculation of the new ones. This
pattern is employed in \emph{non-linear conjugate gradient methods}, and it can be
used with Sobolev gradients as well; see for example the Polak-Ribi\`ere or
Fletcher-Reeves formulae~\cite[Section 5.2]{NW06:no}.

Algorithm~1 %\ref{alg:sd} 
is a summary of our final discrete steepest descent procedure.
This is a straight-forward application of the general discrete steepest descent method
for a given cost functional. Sufficient conditions for convergence to a minimizer
involving convexity and gradient inequalities can be found for example
in~\cite[Chapter~3]{NW06:no}.

%\begin{algorithm}[th]
%\caption{Discrete steepest descent}\label{alg:sd}
%\begin{algorithmic}
%\State Generate some initial guess $\tilde u_0$. \Comment{e.\,g.\ a constant function}
%\State $i \gets 0$
%\While{$\tilde u_i$ does not have target precision}
% \State $\nabla_\mathrm{e} \gets$ Euclidean gradient of $\tilde \uppsi$ at $\tilde u_i$ \Comment{see equation~\eqref{eq:eucnablaphi}}
%    \State Build linear system incorporating supp.\ cond.\ at $\tilde u_i$. \Comment{sections~\ref{ssec:gradients} and~\ref{suppcond}}
%    \State $\nabla_\mathrm{S} \gets$ solution of linear system for right hand side $\nabla_\mathrm{e}$ \Comment{see equation~\eqref{eq:def:SH}}
% \State $s^\ast \gets$ choose good step size for $\nabla_\mathrm{S}$ \Comment{section~\ref{stepsize}}
% \State $\tilde u_{i+1} \gets \tilde u_i - \mu s^\ast \nabla_\mathrm{S}$ \Comment{damped update, $0<\mu\le 1$}
% \State $i \gets i+1$
%\EndWhile
%\end{algorithmic}
%\end{algorithm}

\bigskip
\begin{tabular}{l}
\hline
 \multicolumn{1}{c}{\textbf{Algorithm 1.} Discrete steepest descent}  \\ %label{alg:sd}
\hline 
 Generate some initial guess $\tilde u_0$. 
  \hfill $|$ e.\,g.\ a constant function\\
 $i \gets 0$ \\
\textbf{while} $\tilde u_i$ does not have target precision \textbf{do}\\
\quad $\nabla_\mathrm{e} \gets$ Euclidean gradient of 
 $\tilde \uppsi$ at $\tilde u_i$  
  \hfill$|$ see equation~\eqref{eq:eucnablaphi}\\
\quad Build linear system incorporating supp.\ cond.\ at $\tilde u_i$. 
 \hfill $|$ sections~\ref{ssec:gradients} and~\ref{suppcond} \\
\quad $\nabla_\mathrm{S} \gets$ solution of linear system for right 
 hand side $\nabla_\mathrm{e}$  
 \hfill $|$ see equation~\eqref{eq:def:SH} \\
\quad $s^\ast \gets$ choose good step size for $\nabla_\mathrm{S}$ 
 \hfill $|$ section~\ref{stepsize} \\
\quad $\tilde u_{i+1} \gets \tilde u_i - \mu s^\ast \nabla_\mathrm{S}$ 
 \hfill $|$ damped update, $0<\mu\le 1$ \\
\quad $i \gets i+1$\\
\textbf{end while}\\
\hline
\end{tabular}

%\input{parts/numerics.leastsquares}

\subsection{Least Squares Method}

We now describe the close connection between the Sobolev
gradient $\nabla_{\mathrm{G},\lambda}$ coming from $S_{\mathrm{G},\lambda}$ as in~\eqref{eq:def:S}
and the well-known \emph{least squares method}.
In the limit $\lambda\to 0$ the resulting linear system might
be singular, but is still solvable for the given right hand side. In fact,
for $\lambda\to 0$ the linear system corresponding to equation~\eqref{eq:gradientcalculation} becomes
\[
 Q^T Q x = Q^T(Q\tilde u - \tilde b).
\]
Note that we have rescaled the Euclidean gradient by the factor $\frac{N+1}{T}$
as justified in section~\ref{stepsize}.
Starting the discrete steepest descent at an initial guess $\tilde u_0$
we compute $x$ and take a step of length $\delta \ge 0$ into the direction $-x$.
The parameter $\delta$ is chosen such that $\uppsi(\tilde u - \delta x)$ is minimal.
We claim that $\delta = 1$. In fact, for this $\delta$ we arrive at $\tilde u - x$
which satisfies the normal equations of the problem $Qy = \tilde b$, i.\,e.,
\[
 Q^T Q (\tilde u - x) = Q^T Q \tilde u - \left( Q^T Q \tilde u - Q^T \tilde b \right) = Q^T \tilde b.
\]
This shows that $\tilde u - x$ globally minimizes the functional, thus
proving $\delta = 1$. Moreover, this shows that in the limit descent with $\nabla_{\mathrm{G},\lambda}$ converges
to the solution of the least squares problem in the first step.
Note, however, that positive definiteness is a very desirable property for a linear
system and a direct solution of the normal equations may be numerically
considerably more difficult.

This relation indicates a possible reason why also for the non-linear case
the convergence of the steepest descent is observed to be fastest
for $\nabla_{\mathrm{G},\lambda}$ with small $\lambda$,
at least among the gradients we have used.

%\input{parts/numerics.nonlinear}

\subsection{The Non-Linear Case}\label{numerics:non-linear}

In the setting of equation~\eqref{eq:dae:nonlinear},
define $A(\tilde u)$ and $B(\tilde u)$ as block diagonal matrices with
blocks $f_y(k\delta,\tilde u_k, D_1 \tilde u_k)$ and
$f_x(k\delta,\tilde u_k, D_1\tilde u_k)$ for $k=0,\dots,N$, respectively.
We use the function
\[
 \widetilde F\colon \mathbb{R}^{(N+1)n} \to \mathbb{R}^{(N+1)m}, \;
 \tilde u \mapsto
 \begin{pmatrix}
  f(k\delta, \tilde u_k, D_1 \tilde u_k) \\
 \end{pmatrix}_k
\]
as discretization of $F$ defined by~\eqref{eq:F:nonlinear}.
Observe that $\widetilde F'(\tilde u)h = A(\tilde u) D_1 h + B(\tilde u) h$,
which resembles~\eqref{eq:F'}.
Then $\tilde \uppsi(\tilde u) := \frac{1}{2} \bigl\| \widetilde F(\tilde u) \bigr\|_2^2$
has derivative
\begin{equation}\label{eq:nonlinear:disc}
 \tilde\uppsi'(\tilde u)h = \scalar{\widetilde F(\tilde u)}{\left(A(\tilde u) D_1+B(\tilde u)\right)h} = \scalar{Q(\tilde u)^T \widetilde F(\tilde u)}{h},
\end{equation}
where we set $Q := AD_1 + B$ as in the notation of the linear case.

Now we can proceed as in the linear case. The only difference is that the
matrices $A$ and $B$ depend on the current position $\tilde u$, and hence the positive
definite matrices defined as in~\eqref{eq:def:S} change during the process as
well. This corresponds to steepest descent under a variable inner product
introduced in lemma~\ref{Riemannmetric}.
It is also connected to \emph{quasi-Newton methods} which update
an approximation of the Hessian at each step.
For details on quasi-Newton methods see \cite[Chapter~6]{NW06:no}.

Originally we came up with this method for non-linear DAE as a
direct generalization of the linear case.
Only for a formal justification we have equipped $H^1(0,T;\mathbb{R}^n)$
with a natural structure making it into a Riemannian manifold
leading to the gradient we use.
However, consequently following this second approach we would have been
led to a algorithm which slightly differs from algorithm~1. %\ref{alg:sd}.
As in general Riemannian manifolds do not carry a vector space structure,
there are no ``straight lines'' the steepest descent could follow.
One usually employs the exponential map of the manifold as a substitute,
traveling along geodesics.
Although there is no difference between these two variants for continuous
steepest descent, i.\,e., in the limit of infinitesimally small step size,
for the numerics one has to choose.
We decided in favor of the straight lines since computing
the exponential map means solving an ordinary differential
equation which is a much more complicated operation
unnecessarily complicating the implementation.


%\input{parts/numerics.suppcond}
\subsection{Supplementary Conditions}\label{suppcond}

To support \emph{linear supplementary conditions}, we want the steepest descent
steps to preserve specified features of the initial function. Therefore, we use
Sobolev gradients that do not change these features.
We remark that the methods of this chapter can be applied using any
gradient. We have chosen the space $H^1(0,T;\mathbb{R}^n)$ with its usual norm only for clarity of exposition.
More precisely, let $u_0\in H^1(0,T;\mathbb{R}^n)$ be an initial estimate satisfying the 
supplementary conditions. Denote by $H_\mathrm{a}$ the closed linear subspace
of $H^1(0,T;\mathbb{R}^n)$ such that $u_0 + H_\mathrm{a}$ is the space of all functions
in $H^1(0,T;\mathbb{R}^n)$ satisfying the supplementary conditions. We call $H_\mathrm{a}$
the \emph{space of admissible functions}.

Define the functional $\uppsi_\mathrm{a}$ as 
\[
 \uppsi_\mathrm{a}\colon H_\mathrm{a} \to \mathbb{R}_+, \quad \uppsi_\mathrm{a}(u) := \uppsi(u_0+u)={\textstyle \frac{1}{2}}\left\|F(u_0+u)\right\|^2_{L^2}.
\]
We have to calculate the gradient of $\uppsi_\mathrm{a}$ with
respect to the space $H_\mathrm{a}$ equipped with the inner product induced by $H^1(0,T;\mathbb{R}^n)$.
As this gradient naturally lies in the space of admissible functions, steepest descent
starting with $u_0$ will preserve the supplementary conditions
while minimizing $\uppsi_\mathrm{a}$.

Let $P_\mathrm{a}$ be the orthogonal projection of $H^1(0,T;\mathbb{R}^n)$ onto $H_\mathrm{a}$. 
Now $\uppsi_\mathrm{a}'(u)h=\uppsi'(u_0+u)h$ for $h\in H_\mathrm{a}$, and consequently
\begin{equation}\label{eq:supp}
\begin{split}
 \uppsi_\mathrm{a}'(u)P_\mathrm{a} h &= \uppsi'(u_0+u)P_\mathrm{a} h
 = \scalar{(\nabla\uppsi)(u_0+u)}{P_\mathrm{a} h}_{H^1}\\
 &= \scalar{P_\mathrm{a}(\nabla\uppsi)(u_0+u)}{h}_{H^1}\\
\end{split} 
\end{equation} 
for all $h\in H^1(0,T;\mathbb{R}^n)$. It follows that $(\nabla\uppsi_\mathrm{a})(u) = P_\mathrm{a}(\nabla\uppsi)(u_0+u)$.

Now we transfer this to the finite dimensional setting in a numerically
tractable way. Let $C\in\mathbb{R}^{k\times(N+1)n}$ be a matrix such that $\widetilde H_\mathrm{a} := \ker C$
is a finite dimensional version of $H_\mathrm{a}$.
The set of functions satisfying the supplementary conditions
introduced by the matrix $C$ is given by $\tilde u_0 + \widetilde H_\mathrm{a}$
for any valid function $\tilde u_0$.
We understand $\tilde\uppsi_\mathrm{a}(\tilde u)$ as a functional
on $\widetilde H_\mathrm{a}$ analogously to the above definition of $\uppsi_\mathrm{a}$.

Denote by $\widetilde{P}_\mathrm{a}$ the orthogonal projection in $\mathbb{R}^{(N+1)n}$ onto $\ker{C}$
with respect to the Euclidean inner product.
We search for $\nabla_S \in \widetilde H_\mathrm{a}$ satisfying $\tilde\uppsi_\mathrm{a}(\tilde u) = \scalar{\nabla_S}{h}$ for all
$h\in\widetilde H_\mathrm{a}$.
Similarly to~\eqref{eq:supp}, we calculate for any $h \in \mathbb{R}^{(N+1)n}$
\begin{align*}
 \tilde\uppsi_\mathrm{a}'(\tilde u)\widetilde{P}_\mathrm{a} h
 & = \tilde\uppsi'(\tilde u_0 + \tilde u)\widetilde{P}_\mathrm{a} h
 = \scalar{\bigl(\nabla_\mathrm{e}\tilde\uppsi\bigr)(\tilde u_0 + \tilde u)}{\widetilde{P}_\mathrm{a} h}_\mathrm{e}\\
 & = \scalar{\widetilde{P}_\mathrm{a}\bigl(\nabla_\mathrm{e}\tilde\uppsi\bigr)(\tilde u_0 + \tilde u)}{h}_\mathrm{e}
 \stackrel{!}{=} \scalar{\nabla_S}{\widetilde{P}_\mathrm{a} h}_S
 = \scalar{\widetilde{P}_\mathrm{a} S \widetilde{P}_\mathrm{a} \nabla_S}{h}_\mathrm{e}.
\end{align*}

Defining $S_\mathrm{a} := \widetilde{P}_\mathrm{a} S\widetilde{P}_\mathrm{a}$, it is obvious that $S_\mathrm{a}$ is
positive definite if restricted to $\widetilde H_\mathrm{a}$ since $S$ is positive definite.
To calculate the discrete Sobolev gradient
we have to solve the linear system 
\[
 S_\mathrm{a} x = \widetilde{P}_\mathrm{a} \; (\nabla_\mathrm{e}\uppsi) (\tilde u_0 + \tilde u)
\]
for $x$ in $\widetilde H_\mathrm{a}$. Note that one could use the conjugate gradient method for
solving this system, as the right hand side is in $\widetilde H_\mathrm{a}$,
cf.~\cite[Algorithm~13.2]{CRRM96} and~\cite[Algorithm~5.2]{NW06:no}.

This approach allows us to impose very general linear supplementary conditions, like
boundary conditions or periodic boundary conditions for the function as well as
for its derivative, or other not necessarily local linear conditions at arbitrary grid points.
Only fixing values is computationally unproblematic, as this corresponds to
deleting appropriate columns and rows in $S$ and the calculated gradient. But
more general supplementary conditions result in a non-sparse orthogonal
projection matrix $\widetilde{P}_\mathrm{a}$ and a dense inner product matrix $S_\mathrm{a}$.
This renders sparse solvers useless.
Then it might help to regard the calculation of the gradient under supplementary conditions
as an equality constrained linear least squares optimization problem.
For details, we refer to the book of Golub and van Loen~\cite[Section~12.1]{GL96:la}.


%\input{parts/examples}

\section{Implementation Details and Examples with Tables and Plots}\label{examples}

We present numerical results for some of the problems we used in
the development of our algorithm to illustrate its strengths and weaknesses. 
Altogether we utilized various sample problems from different sources
to test the correctness and to study the performance in representative cases.
The reader is welcome to check out the
source code containing many example problems, which is freely available
online~\cite{loopback}.

In subsection~\ref{ssec:singular} we
discuss the results for an interesting example investigated by Mahavier
who studied ODE problems with singularities
in the context of Sobolev gradients~\cite{Mah97:singular,Mah95:phd}.
Another interesting problem, posing
difficulties to several solvers, is discussed in subsection~\ref{ssec:exnumdiff}
which we found in~\cite{Sch03}.
The results for a few more intricate example problems from the
IVP~Testset~\cite{MIM06:testset} are discussed in
subsection~\ref{ssec:involved}.
One possible application utilizing the feature that no initial conditions
have to be specified is explained in section~\ref{sec:solspaceest}.
In this context, an example exposing an at first sight surprising
behavior is described in~\ref{sssec:nonunique}.
For testing purposes, several boundary value problems have been treated,
among them examples from Ascher and Spiteri~\cite{AS94}.
Other employed test problems stem from the books of Hairer and
Wanner~\cite{HW96}, Kunkel and Mehrmann~\cite{KM06} and Carey, Richardson, Reed
and Mulvaney~\cite{CRRM96} and from the IVP and BVP website of
Cash~\cite{JC:testset}.

When we speak of the \emph{index of a DAE} in an example we always refer to the
\emph{differentiation index} which agrees with the \emph{index of nilpotency} for linear
DAE with constant coefficients~\cite[Section~VII.1]{HW96}. There are several other
index concepts for DAE, each stressing different aspects of the equation~\cite[Section~2.4]{Sch03}.
By the \emph{maximal absolute error} and the \emph{average absolute error} of a numerical approximation
$\tilde u$ we mean
\begin{equation*}
E_\textrm{abs}(\tilde u) := \max_{i=0,\dots,N}\|u(t_i)-\tilde u(t_i) \|_\infty\text{ and }
E_\textrm{avg}(\tilde u) := {\textstyle \frac{T}{N+1}} \sum_{i=0}^N \|u(t_i)-\tilde u(t_i) \|^2_2,
\end{equation*}
respectively, where $u$ is a (highly) exact solution. The value
$\tilde\uppsi(\tilde u)$ is called the \emph{residual at $\tilde u$}.

Please keep in mind that our implementation aims at generality
and could be considerably optimized for more specialized cases.
Thus it would not make sense to give timings of program runs in the following.
We mention that the computer architecture as well as compiler flags have impact
on the numerical values. There are parameters affecting the rate of
convergence and the quality of the numerical results which we do not describe here in detail. 
Those parameters include precision bounds for termination checks of the linear solvers and control
details of the line search in the gradient direction.
However, all parameters which \emph{severely} affect the results are documented.

%\input{parts/examples.odesingular}

\subsection{Non-Linear ODE with Irregular Singularity}\label{ssec:singular}

ODE with meromorphic coefficients can be formulated as a singular ODE which
is a special case of a DAE.
More precisely, one can remove an $k$\textsuperscript{th} order pole at $t_0$ by multiplying
the corresponding equation with $(t-t_0)^k$, thereby getting a 
coefficient function in front of the highest derivative $y^{(n)}$ with a root at $t_0$.
However, these examples are very special and relatively simple examples of DAE
and hence are often not regarded as actual DAE.
The following example is of this type.

We consider the non-linear ordinary differential equation 
\begin{equation*}
\begin{cases}
t^2 y'(t) = 2t y(t) + y(t)^2\text{ for }t\in[0,1]\\
y(1) = 1
\end{cases}
\text{ with solution } y(t) = \frac{t^2}{2-t},
\end{equation*}
which is discussed in~\cite[Section~4]{Mah97:singular}.
Note that Mahavier introduces and employs \emph{weighted Sobolev descent} for such problems. 
He calculates the gradient with respect to a discretized weighted Sobolev space tailored to the problem. 
For semi-linear ODE problems our gradient $\nabla_{\mathrm{W}_1}$ of section~\ref{ssec:gradients} corresponds
directly to the weighted Sobolev gradients there. We solve the above problem on the
interval $[0,1]$ starting with the initial function $u_0(t) = t$. In
tables~\ref{tab:singular100} and~\ref{tab:singular100:err} we did not damp the steepest descent to allow
for comparison with Mahavier's results~\cite{Mah97:singular}. However,
there remain minor differences due to discretization, scaling and the employed line-search method.

Table~\ref{tab:singular100} lists the convergence behavior for several gradients.
The Euclidean gradient shows poor performance whereas the usual Sobolev gradient already improves the
situation significantly. Best performance is achieved by the weighted Sobolev gradient and the gradient
corresponding to the graph norm, the latter being slightly ahead.
Similar observations can be made about the average errors listed in table~\ref{tab:singular100:err}.
Damping with $0.85$ yields considerably better results. For example in the graph norm case with $N=1000$
and $\lambda=1$ this damping yields to convergence in less than $1000$ steps to a residual below $3\cdot 10^{-15}$,
an average error of about $2\cdot 10^{-10}$ and a maximal error of 
about $4\cdot 10^{-4}$. This is significantly
better than what is achieved in the same setting without damping 
after $10000$ steps. In that case the residual is
only reduced to about $1\cdot 10^{-14}$ despite of the higher number of steps. The reason for this improvement
lies in the usual convergence behavior of steepest descent methods. Often they exhibit the tendency to zig-zag~\cite[see Section~3.3]{NW06:no} which
is somewhat mitigated by this simple damping.
For this reason we always used a damping factor of $\mu=0.85$ in the numerics unless otherwise stated.

\begin{table}[th]
\small
\caption{Residuals for grid size $N=100$ without damping ($\mu=1$).}
\label{tab:singular100}
%\begin{tabular}[b]{c*{4}{d{3.5}}}
\begin{tabular}{ccccc}
\hline
& \multicolumn{4}{c}{\bfseries Residual (starting with $4.06\cdot 10^{-1}$
for $\tilde u_0$)} \\ 
\hline  %\cmidrule(l){2-5}
\multicolumn{1}{c}{\bfseries Steps} & 
\multicolumn{1}{c}{\bfseries Euclidean $\nabla_\mathrm{e}$} & \multicolumn{1}{c}{\bfseries Sobolev $\nabla_{\mathrm{H}}$} & \multicolumn{1}{c}{\bfseries Weighted $\nabla_{\mathrm{W}_1}$} & \multicolumn{1}{c}{\bfseries Graph $\nabla_\mathrm{G}$} \\ \hline
$5    $ & $3.7\cdot10^{-1}$ & $6.7\cdot10^{-4}$ & $2.2\cdot10^{-05}$ & $6.4\cdot10^{-06}$ \\
$10   $ & $3.6\cdot10^{-1}$ & $3.8\cdot10^{-4}$ & $3.5\cdot10^{-06}$ & $7.2\cdot10^{-07}$ \\
$20   $ & $3.3\cdot10^{-1}$ & $2.0\cdot10^{-4}$ & $5.9\cdot10^{-07}$ & $9.9\cdot10^{-08}$ \\
$50   $ & $2.8\cdot10^{-1}$ & $7.9\cdot10^{-5}$ & $5.6\cdot10^{-08}$ & $8.4\cdot10^{-09}$ \\
$100  $ & $2.2\cdot10^{-1}$ & $3.7\cdot10^{-5}$ & $9.7\cdot10^{-09}$ & $1.4\cdot10^{-09}$ \\ 
$200  $ & $1.5\cdot10^{-1}$ & $1.6\cdot10^{-5}$ & $1.7\cdot10^{-09}$ & $2.5\cdot10^{-10}$ \\
$500  $ & $7.1\cdot10^{-2}$ & $5.5\cdot10^{-6}$ & $1.8\cdot10^{-10}$ & $3.7\cdot10^{-11}$ \\
$1000 $ & $2.9\cdot10^{-2}$ & $2.3\cdot10^{-6}$ & $4.3\cdot10^{-11}$ & $1.8\cdot10^{-11}$ \\
$2000 $ & $9.8\cdot10^{-3}$ & $9.9\cdot10^{-7}$ & $2.0\cdot10^{-11}$ & $1.5\cdot10^{-11}$ \\
$5000 $ & $1.7\cdot10^{-3}$ & $3.1\cdot10^{-7}$ & $1.5\cdot10^{-11}$ & $1.4\cdot10^{-11}$ \\
$10000$ & $3.9\cdot10^{-4}$ & $1.2\cdot10^{-7}$ & $1.4\cdot10^{-11}$ & $1.4\cdot10^{-11}$ \\
\hline
\end{tabular}
\end{table}

\begin{table}[th]
\small
\caption{Average errors for grid size $N=100$ without damping ($\mu=1$).}
\label{tab:singular100:err}
\begin{tabular}[b]{ccccc}
\hline
& \multicolumn{4}{c}{\bfseries Average Error (starting with 
$6.17\cdot 10^{-2}$ for $\tilde u_0$)} \\
\hline
\multicolumn{1}{c}{\bfseries Steps} & 
\multicolumn{1}{c}{\bfseries Euclidean $\nabla_\mathrm{e}$} & \multicolumn{1}{c}{\bfseries Sobolev $\nabla_{\mathrm{H}}$} & 
\multicolumn{1}{c}{\bfseries Weighted $\nabla_{\mathrm{W}_1}$} & \multicolumn{1}{c}{\bfseries Graph $\nabla_\mathrm{G}$} \\ \hline
$5    $ & $6.0\cdot10^{-2}$ & $7.8\cdot10^{-3}$ & $2.3\cdot10^{-4}$ & $1.0\cdot10^{-4}$ \\
$10   $ & $5.9\cdot10^{-2}$ & $5.7\cdot10^{-3}$ & $8.1\cdot10^{-5}$ & $3.1\cdot10^{-5}$ \\
$20   $ & $5.8\cdot10^{-2}$ & $4.0\cdot10^{-3}$ & $2.8\cdot10^{-5}$ & $9.6\cdot10^{-6}$ \\
$50   $ & $5.4\cdot10^{-2}$ & $2.3\cdot10^{-3}$ & $7.0\cdot10^{-6}$ & $2.2\cdot10^{-6}$ \\
$100  $ & $4.8\cdot10^{-2}$ & $1.5\cdot10^{-3}$ & $2.4\cdot10^{-6}$ & $7.6\cdot10^{-7}$ \\ 
$200  $ & $4.0\cdot10^{-2}$ & $9.4\cdot10^{-4}$ & $8.5\cdot10^{-7}$ & $2.6\cdot10^{-7}$ \\
$500  $ & $2.6\cdot10^{-2}$ & $4.9\cdot10^{-4}$ & $2.1\cdot10^{-7}$ & $6.6\cdot10^{-8}$ \\
$1000 $ & $1.6\cdot10^{-2}$ & $3.0\cdot10^{-4}$ & $7.6\cdot10^{-8}$ & $2.5\cdot10^{-8}$ \\
$2000 $ & $8.6\cdot10^{-3}$ & $1.8\cdot10^{-4}$ & $2.9\cdot10^{-8}$ & $1.2\cdot10^{-8}$ \\
$5000 $ & $3.1\cdot10^{-3}$ & $9.0\cdot10^{-5}$ & $1.1\cdot10^{-8}$ & $9.0\cdot10^{-9}$ \\
$10000$ & $1.3\cdot10^{-3}$ & $5.3\cdot10^{-5}$ & $9.3\cdot10^{-9}$ & $8.0\cdot10^{-9}$ \\
\hline
\end{tabular}
\end{table}

To compare the behavior of the gradients for increasing grid size we
show results for a finer equidistant grid of size $N=10000$ in table~\ref{tab:singular10000}.
Note that the residual of the exact solution is approximately $6.79\cdot 10^{-17}$ for this grid.
The Euclidean gradient completely fails to converge and exhibits horrible numerical performance. The ordinary
Sobolev gradient copes fairly with the finer grid.
Among the gradients with $\lambda=1$, again $\nabla_{\mathrm{W}_1}$ and
$\nabla_\mathrm{G}$ achieve the best results.

However, for all gradients under consideration an appropriate choice
of $\lambda$ improves the numerical performance. 
For example using $\nabla_{\mathrm{W}_1,0.05}$ in the above setting we get 
a residual of $2\cdot 10^{-17}$ after $1000$ steps, with 
$E_\textrm{avg}\approx 10^{-11}$ and $E_\textrm{abs}\approx 2\cdot 10^{-4}$.
But in the case of steepest descent with $\nabla_\mathrm{G}$, the impact of a smaller value is
extraordinary. The rightmost two columns of table~\ref{tab:singular10000} 
show the results for 
$\lambda = 10^{-3}$ and $\lambda = 10^{-5}$. The norm of the gradient 
dropped below $10^{-12}$ in $110$ and $42$ steps, respectively.
With even smaller values of $\lambda$ even better results are achieved. 
For $\lambda=10^{-19}$ and $\mu=1$
we obtain a residual of about $10^{-23}$ in $6$ steps, with 
$E_\textrm{avg}\approx 10^{-15}$ and $E_\textrm{abs}\approx 4\cdot 10^{-6}$.
However, in general choosing $\lambda$ that small leads to failure of the linear solver and huge numerical errors.


\begin{table}[th]
\small
\caption{Residuals for grid size $N=10000$ with damping factor $\mu=0.85$.}
\label{tab:singular10000}
\begin{tabular}[b]{ccccccc}
\hline
& \multicolumn{6}{c}{\bfseries Residual (starting with $4.0\cdot 10^{-1}$ 
for $\tilde u_0$)} \\
\hline
\multicolumn{1}{c}{\bfseries Steps} & 
\multicolumn{1}{c}{\bfseries $\nabla_\mathrm{e}$} & 
\multicolumn{1}{c}{\bfseries $\nabla_{\mathrm{H}}$} &
\multicolumn{1}{c}{\bfseries $\nabla_{\mathrm{W}_1}$} & 
\multicolumn{1}{c}{\bfseries $\nabla_{\mathrm{G},1}$} &
\multicolumn{1}{c}{\bfseries $\nabla_{\mathrm{G},10^{-3}}$} & 
\multicolumn{1}{c}{\bfseries $\nabla_{\mathrm{G},10^{-5}}$} \\ \hline
$5   $ & $4.0\cdot10^{-1}$ & $8.3\cdot10^{-4}$ & $2.6\cdot10^{-05}$ & $5.2\cdot10^{-06}$ & $2.4\cdot10^{-09}$ & $2.4\cdot10^{-09}$ \\  
$10  $ & $4.0\cdot10^{-1}$ & $2.6\cdot10^{-4}$ & $3.9\cdot10^{-06}$ & $5.8\cdot10^{-07}$ & $7.7\cdot10^{-14}$ & $1.5\cdot10^{-17}$ \\
$20  $ & $4.0\cdot10^{-1}$ & $1.2\cdot10^{-4}$ & $4.2\cdot10^{-07}$ & $9.3\cdot10^{-09}$ & $9.8\cdot10^{-16}$ & $1.2\cdot10^{-20}$ \\
$30  $ & $4.0\cdot10^{-1}$ & $5.7\cdot10^{-5}$ & $1.4\cdot10^{-08}$ & $2.5\cdot10^{-09}$ & $2.5\cdot10^{-16}$ & $4.9\cdot10^{-21}$ \\
$40  $ & $4.0\cdot10^{-1}$ & $4.2\cdot10^{-5}$ & $4.6\cdot10^{-09}$ & $1.2\cdot10^{-09}$ & $8.8\cdot10^{-17}$ & $1.3\cdot10^{-21}$ \\
$50  $ & $4.0\cdot10^{-1}$ & $1.5\cdot10^{-5}$ & $3.0\cdot10^{-09}$ & $6.3\cdot10^{-10}$ & $3.5\cdot10^{-17}$ \\
$100 $ & $3.9\cdot10^{-1}$ & $9.1\cdot10^{-6}$ & $2.3\cdot10^{-10}$ & $3.1\cdot10^{-11}$ & $4.3\cdot10^{-19}$ \\
$150 $ & $3.9\cdot10^{-1}$ & $3.3\cdot10^{-6}$ & $3.4\cdot10^{-11}$ & $2.9\cdot10^{-12}$ &  \\
$200 $ & $3.9\cdot10^{-1}$ & $1.2\cdot10^{-6}$ & $1.0\cdot10^{-11}$ & $2.0\cdot10^{-12}$ \\
$400 $ & $3.9\cdot10^{-1}$ & $2.8\cdot10^{-7}$ & $3.3\cdot10^{-13}$ & $4.7\cdot10^{-14}$ \\
$1000$ & $3.5\cdot10^{-1}$ & $6.9\cdot10^{-8}$ & $3.2\cdot10^{-14}$ & $2.6\cdot10^{-15}$ \\ \hline
\multicolumn{1}{c}{$E_\textrm{avg}$} & $5.9\cdot10^{-2}$ & $3.3\cdot10^{-5}$ & $1.3\cdot10^{-09}$ & $3.0\cdot10^{-10}$ & $1.4\cdot10^{-12}$ & $4.5\cdot10^{-14}$ \\
\multicolumn{1}{c}{$E_\textrm{abs}$} & $3.3\cdot10^{-1}$ & $4.3\cdot10^{-2}$ & $8.9\cdot10^{-04}$ & $5.6\cdot10^{-04}$ & $8.5\cdot10^{-05}$ & $1.9\cdot10^{-05}$ \\
\hline
\end{tabular}
\end{table}

In the setting of this singular ODE, Sobolev descent with respect to the graph norm
gives results which are superior to the steepest descent method relying on weighted
Sobolev spaces as in~\cite{Mah97:singular}. Additionally, choosing 
a small $\lambda$ vastly improves the rate of convergence in many cases,
but is numerically more demanding.
However, steepest descent with respect to the graph norm is 
computationally more expensive than weighted Sobolev steepest descent.
This is because $S_\mathrm{G}$ has to be constructed in each iteration because it depends on
$\tilde u$, whereas $S_{\mathrm{W}_1}$ remains the same during the process
for this example.

%\input{parts/examples.difficult}

\subsection{A Small Non-Trivial Linear DAE}\label{ssec:exnumdiff}

Consider for $\eta\in\mathbb{R}$ the non-autonomous linear index $2$ DAE
\begin{equation}\label{eq:exnumdiff}
\begin{pmatrix}1 & 0 \\ 1 & \eta t \end{pmatrix}u'(t) + \begin{pmatrix}1 & \eta t \\ 0 & 1+\eta \end{pmatrix} = \begin{pmatrix}\exp(-t) \\ 0\end{pmatrix}.
\end{equation}
This equation has been introduced by Petzold, Gear and Hsu in~\cite{PGH81}. In the range
$\eta<-0.5$ it is known to pose difficulties to several numerical methods for
DAE, among them the implicit Euler method, BDF, and RadauIIA. More information about this
equation along with some numerical tests can be found in~\cite[Section~4.2]{Sch03}. It has a unique solution given by
\[
    u(t) = \begin{pmatrix}(1-\eta t)\exp(-t) \\ \exp(-t)\end{pmatrix}.
\]
Thus, the only consistent initial value is $u(0)$.
In our test we always set $\eta = -0.8$,
choose the initial function to equal $2$ in both components, and solve the equation on the 
interval $[0,3]$ for the grid size $N=1000$. Apart from the least squares case where we do not use
damping, we always set $\mu=0.85$.
The residual of the initial function is $2.99$, with $E_\text{avg} = 15.37$ and $E_\text{abs} = 1.95$.
In table~\ref{tab:exnumdiff} we show results of the steepest descent method applied to
problem~\eqref{eq:exnumdiff} using the gradient $\nabla_{\mathrm{G},\lambda}$ for different
values of the parameter $\lambda$. To facilitate comparison we also list results of ordinary Sobolev descent employing
$\nabla_{\mathrm{H}}$ and of the least squares method. The latter can be applied since the problem is linear. Steps
which reached optimal residuals with respect to solver and floating 
point precision are marked with ${}^*$.
We omit the results for the Euclidean gradient which decreases the residual
only to about $6.09\cdot 10^{-1}$ in the first $10000$ steps.  In
figures~\ref{fig:difficult:h1} and~\ref{fig:difficult:h2} we illustrate the
development of the steepest descent for these gradients. The individual plots
depict the results' first components after several steps, where darker color
corresponds to more iterations.

Again, descent with $\nabla_{\mathrm{G},1}$ reduces the residual faster than descent
with the ordinary Sobolev gradient. Decreasing $\lambda$ results in even better convergence.
However, note that ordinary Sobolev descent attains superior error values. 
This can be understood by looking at the plots in figure~\ref{fig:difficult:h1}.
The solutions found by ordinary Sobolev descent
approach the solution slowly, but in a very regular and smooth way,
whereas descent according to the graph norm approaches the solution rapidly in the interior of the
interval $[0,3]$ and only afterwards and more slowly at the interval boundaries. 
Regarding the error values, $\lambda$ has to be decreased to $10^{-3}$ for the graph norm gradient to deliver
results on par with descent according to $\nabla_{\mathrm{H}}$.
Interestingly enough, the least squares method needs $10$ steps to reach the optimal residual.
The oscillations depicted in the right plot of figure~\ref{fig:difficult:h2} are due to numerical errors of the linear solver.
If one further decreases $\lambda$, descent with respect to $\nabla_{\mathrm{G},\lambda}$ becomes more similar
to solving the linear least squares problem and starts to show oscillating behavior, too.

\begin{table}
\caption{Results for problem~\eqref{eq:exnumdiff}.}
\label{tab:exnumdiff}
\begin{tabular}{ccccc} 
\hline
\multicolumn{1}{c}{\bfseries Gradient} & 
\multicolumn{1}{c}{\bfseries Steps} &
 \multicolumn{1}{c}{\bfseries Residual} & \multicolumn{1}{c}{\bfseries Avg. Error} &
  \multicolumn{1}{c}{\bfseries Max. Error} \\ \hline
$\nabla_{\mathrm{H}}$  
& $   100$ & $2.6\cdot10^{-04}$ & $5.6\cdot10^{-02}$ & $3.2\cdot10^{-1}$ \\
& $  1000$ & $1.0\cdot10^{-06}$ & $1.1\cdot10^{-03}$ & $1.1\cdot10^{-1}$ \\
& $ 10000$ & $7.9\cdot10^{-09}$ & $5.1\cdot10^{-05}$ & $4.5\cdot10^{-2}$ \\ \hline $\nabla_{\mathrm{G},1}$
& $   100$ & $2.8\cdot10^{-05}$ & $2.1\cdot10^{-01}$ & $2.1\cdot10^{+0}$ \\
& $  1000$ & $7.5\cdot10^{-08}$ & $2.8\cdot10^{-02}$ & $1.8\cdot10^{+0}$ \\
& $ 10000$ & $3.8\cdot10^{-10}$ & $3.1\cdot10^{-03}$ & $8.7\cdot10^{-1}$ \\ \hline $\nabla_{\mathrm{G},10^{-3}}$
& $    10$ & $5.2\cdot10^{-07}$ & $5.4\cdot10^{-02}$ & $2.0\cdot10^{+0}$ \\
& $   100$ & $5.9\cdot10^{-10}$ & $3.8\cdot10^{-03}$ & $9.6\cdot10^{-1}$ \\
& $  1000$ & $6.9\cdot10^{-13}$ & $3.2\cdot10^{-04}$ & $1.3\cdot10^{-1}$ \\
& $ 10000$ & $5.0\cdot10^{-15}$ & $6.5\cdot10^{-05}$ & $2.7\cdot10^{-2}$ \\ \hline $\nabla_{\mathrm{G},10^{-5}}$
& $    10$ & $6.2\cdot10^{-10}$ & $3.8\cdot10^{-03}$ & $9.6\cdot10^{-1}$ \\
& $   100$ & $4.0\cdot10^{-13}$ & $2.7\cdot10^{-04}$ & $1.1\cdot10^{-1}$ \\
& $  1000$ & $6.3\cdot10^{-16}$ & $3.3\cdot10^{-05}$ & $1.4\cdot10^{-2}$ \\
& $ 10000$ & $1.4\cdot10^{-17}$ & $8.2\cdot10^{-06}$ & $4.0\cdot10^{-3}$ \\ \hline $\nabla_{\mathrm{G},10^{-10}}$
& $   5  $ & $1.7\cdot10^{-08}$ & $2.6\cdot10^{-05}$ & $1.0\cdot10^{-2}$ \\
& $  10  $ & $1.3\cdot10^{-16}$ & $1.1\cdot10^{-05}$ & $5.6\cdot10^{-3}$ \\
& $  20  $ & $5.2\cdot10^{-18}$ & $3.4\cdot10^{-06}$ & $2.5\cdot10^{-3}$ \\
& $  60  $ & $2.8\cdot10^{-23}$ & $1.3\cdot10^{-11}$ & $5.0\cdot10^{-6}$ \\
& $ 300^*$ & $2.9\cdot10^{-28}$ & $4.8\cdot10^{-11}$ & $7.9\cdot10^{-6}$ \\ \hline Least Squares
& $  1   $ & $1.7\cdot10^{-13}$ & $8.0\cdot10^{-02}$ & $4.6\cdot10^{-1}$ \\ 
& $  5   $ & $3.2\cdot10^{-23}$ & $1.1\cdot10^{-11}$ & $5.2\cdot10^{-6}$ \\ 
& $ 10^* $ & $2.3\cdot10^{-28}$ & $4.8\cdot10^{-11}$ & $7.9\cdot10^{-6}$ \\ 
\hline
\end{tabular}
\end{table}

\begin{figure}[ht] 
%\input{figures/difficult_h1}
\begin{center}
\begin{tabular}{cc}
\setlength{\unitlength}{0.0500bp}
\begin{picture}(3328.00,2169.60)(0,150)%
\begin{picture}(3328.00,2469.60)
\put(0,0){\includegraphics{figures/difficult_h1}}
\put(560,320){\makebox(0,0)[r]{0}}
\put(560,712){\makebox(0,0)[r]{0.5}}
\put(560,1103){\makebox(0,0)[r]{1}}
\put(560,1495){\makebox(0,0)[r]{1.5}}
\put(560,1886){\makebox(0,0)[r]{2}}
\put(560,2278){\makebox(0,0)[r]{2.5}}
\put(656,160){\makebox(0,0){0}}
\put(1089,160){\makebox(0,0){0.5}}
\put(1523,160){\makebox(0,0){1}}
\put(1956,160){\makebox(0,0){1.5}}
\put(2389,160){\makebox(0,0){2}}
\put(2823,160){\makebox(0,0){2.5}}
\put(3256,160){\makebox(0,0){3}}
\put(2617,2100){\makebox(0,0)[r]{$(1+0.8t)\exp(-t)$}}
\end{picture}
\end{picture}
&
%\input{figures/difficult_gr}
\setlength{\unitlength}{0.0500bp}
\begin{picture}(3328.00,2169.60)(0,150)%
\begin{picture}(3328.00,2469.60)
\put(560,320){\makebox(0,0)[r]{-0.5}}
\put(560,646){\makebox(0,0)[r]{ 0}}
\put(560,973){\makebox(0,0)[r]{ 0.5}}
\put(560,1299){\makebox(0,0)[r]{ 1}}
\put(560,1625){\makebox(0,0)[r]{ 1.5}}
\put(560,1952){\makebox(0,0)[r]{ 2}}
\put(560,2278){\makebox(0,0)[r]{ 2.5}}
\put(656,160){\makebox(0,0){ 0}}
\put(1089,160){\makebox(0,0){ 0.5}}
\put(1523,160){\makebox(0,0){ 1}}
\put(1956,160){\makebox(0,0){ 1.5}}
\put(2389,160){\makebox(0,0){ 2}}
\put(2823,160){\makebox(0,0){ 2.5}}
\put(3256,160){\makebox(0,0){ 3}}
\put(2617,2100){\makebox(0,0)[r]{$(1+0.8t)\exp(-t)$}}
\put(0,0){\includegraphics{figures/difficult_gr}}
\end{picture}
\end{picture}
\end{tabular}
\end{center}
\caption{Some descent steps with $\nabla_{\mathrm{H}}$ (left)
and with $\nabla_{\mathrm{G},1}$ (right).}
\label{fig:difficult:h1} %\label{fig:difficult:gr}
\end{figure}


\begin{figure}[ht]
%\input{figures/difficult_gr10}}
\begin{center}
\begin{tabular}{cc}
\setlength{\unitlength}{0.0500bp}
\begin{picture}(3328.00,2169.60)(0,150)%
\begin{picture}(3328.00,2469.60)
\put(560,320){\makebox(0,0)[r]{0}}
\put(560,712){\makebox(0,0)[r]{0.5}}
\put(560,1103){\makebox(0,0)[r]{1}}
\put(560,1495){\makebox(0,0)[r]{1.5}}
\put(560,1886){\makebox(0,0)[r]{2}}
\put(560,2278){\makebox(0,0)[r]{2.5}}
\put(656,160){\makebox(0,0){0}}
\put(1089,160){\makebox(0,0){0.5}}
\put(1523,160){\makebox(0,0){1}}
\put(1956,160){\makebox(0,0){1.5}}
\put(2389,160){\makebox(0,0){2}}
\put(2823,160){\makebox(0,0){2.5}}
\put(3256,160){\makebox(0,0){3}}
\put(2617,2100){\makebox(0,0)[r]{$(1+0.8t)\exp(-t)$}}
\put(0,0){\includegraphics{figures/difficult_gr10}}
\end{picture}
\end{picture}
&
%\input{figures/difficult_ls}
\setlength{\unitlength}{0.0500bp}
\begin{picture}(3328.00,2169.60)(0,150)%
\begin{picture}(3328.00,2469.60)
\put(560,320){\makebox(0,0)[r]{-0.5}}
\put(560,646){\makebox(0,0)[r]{0}}
\put(560,973){\makebox(0,0)[r]{0.5}}
\put(560,1299){\makebox(0,0)[r]{1}}
\put(560,1625){\makebox(0,0)[r]{1.5}}
\put(560,1952){\makebox(0,0)[r]{2}}
\put(560,2278){\makebox(0,0)[r]{2.5}}
\put(656,160){\makebox(0,0){0}}
\put(1089,160){\makebox(0,0){0.5}}
\put(1523,160){\makebox(0,0){1}}
\put(1956,160){\makebox(0,0){1.5}}
\put(2389,160){\makebox(0,0){2}}
\put(2823,160){\makebox(0,0){2.5}}
\put(3256,160){\makebox(0,0){3}}
\put(2617,2100){\makebox(0,0)[r]{$(1+0.8t)\exp(-t)$}}
\put(0,0){\includegraphics{figures/difficult_ls}}
\end{picture}
\end{picture}
\end{tabular}
\end{center}
\caption{left: some descent steps with $\nabla_{\mathrm{G},10^{-10}}$, \newline
right: development solving least squares problem}
\label{fig:difficult:h2}  %\label{fig:difficult:gr10}
\end{figure}

\subsection{More Involved Test Problems}\label{ssec:involved}

We ran our implementation on two problems of the
IVP~Testset~\cite{MIM06:testset} of the University of Bari (formerly released by
CWI Amsterdam).

\subsubsection{Chemical Akzo Nobel problem}\label{AkzoNobel}
This is a initial value problem consisting of a stiff semi-linear DAE of index $1$ with $n=m=6$. As the square-root is taken
in several components the domain of $\uppsi$ is not the whole space $H^1(0,180;\mathbb{R}^6)$. This 
poses difficulties for the line search algorithm, as we have to ensure that we do not leave the
domain of definition, decreasing the step width if necessary.

Another problem is that our implementation does not cope well with stiff problems. This is not surprising,
as we did not incorporate any mechanisms to refine the grid (also compare to section~\ref{implissues}).
But the algorithm tries to find the optimal numerical solution a fixed given grid. We can, however, apply
our method in a step-by-step manner by solving the equation on short intervals
if we ensure by appropriate initial conditions that we can glue the solutions together.
This works reasonably well and as a byproduct ensures that the solution stays in the domain of definition.
Still, it is computationally quite demanding to get highly exact numerical results
with this approach.

\subsubsection{Andrews' Squeezing Mechanism}

The equation of Andrews' squeezing mechanism is a non-stiff semi-linear index
$3$ DAE with $14$ differential and $13$ algebraic equations. It is described in
detail by Hairer and Wanner~\cite[Section~VII.7]{HW96}. However, our
implementation is not designed to cope with the extreme scaling issues between
the individual components and has problems with this equation.  To get a rough
idea, be aware that at a fixed position the solution vector has the structure
$y=\begin{pmatrix} q & \dot{q} & \ddot{q} & \lambda\end{pmatrix}^T$ , where
$q\in\mathbb{R}^7$ and $\lambda\in\mathbb{R}^6$.  For the correct solution, the
magnitude of $y_5$ is of order $10^{-2}$ on the interval $[0,0.003]$, whereas
$y_{16}$ is of order $10^6$. Without appropriate preconditioning, the linear
solver cannot handle this discrepancy.  A preconditioning strategy for index
$3$ DAE arising from multibody systems is proposed in~\cite{BDT08:scaling} in
the context of Newmark integration schemes.


%\input{parts/examples.dimestimation}

\subsection{Solution Space Estimation}\label{sec:solspaceest}

The structure of the set of solutions of general DAE can be quite
complicated. Even the calculation of consistent initial values is a non-trivial
task (see section~\ref{sec:intro}).
The presented steepest descent method allows to start from any initial function.
The choice of the initial function determines 
the calculated numerical solution. Thus, it is natural to ask whether valuable
information about the set of solutions can be acquired by running the steepest
descent method multiple times for a large number of sufficiently different
initial functions.

The question arises which initial functions to choose to get
sufficiently diverse solutions and whether this method really has the tendency to
exhaust the full set of solutions.
Here, we always took linear functions as initial estimates. We
generated them by prescribing random function values,
uniformly distributed in $[-2,2]^n$, at both interval boundary points of
$[0,T]$. Figure~\ref{fig:consistent} shows plots of initial values of
corresponding numerical solutions.
The $3$-dimensional plots belong to
specifically designed linear DAE with known solution space.

%\input{figures/linear_dim_plane}
\begin{figure}[ht]
\begin{center}
\begin{tabular}{cc}
\setlength{\unitlength}{0.0500bp}
\begin{picture}(3328.00,1869.60)(0,300)%
\begin{picture}(3328.00,2469.60)
\put(2000,437){\makebox(0,0){-40}}
\put(1574,511){\makebox(0,0){-20}}
\put(1148,586){\makebox(0,0){ 0}}
\put(721,660){\makebox(0,0){ 20}}
\put(3024,934){\makebox(0,0){-10}}
\put(2593,708){\makebox(0,0){ 0}}
\put(2162,482){\makebox(0,0){ 10}}
\put(461,1068){\makebox(0,0)[r]{-40}}
\put(461,1219){\makebox(0,0)[r]{-20}}
\put(461,1369){\makebox(0,0)[r]{ 0}}
\put(461,1519){\makebox(0,0)[r]{ 20}}
\put(461,1670){\makebox(0,0)[r]{ 40}}
\put(0,0){\includegraphics{figures/linear_dim_plane}}
\end{picture}
\end{picture}
&
%plots/linear_dim_space
\setlength{\unitlength}{0.0500bp}
\begin{picture}(3328.00,1869.60)(0,300)%
\begin{picture}(3328.00,2469.60)
\put(2000,437){\makebox(0,0){-4}}
\put(1441,534){\makebox(0,0){-1}}
\put(881,632){\makebox(0,0){ 2}}
\put(3024,934){\makebox(0,0){-2.5}}
\put(2737,783){\makebox(0,0){-1}}
\put(2449,632){\makebox(0,0){ 0.5}}
\put(2162,482){\makebox(0,0){ 2}}
\put(461,1068){\makebox(0,0)[r]{-1.5}}
\put(461,1269){\makebox(0,0)[r]{-0.5}}
\put(461,1469){\makebox(0,0)[r]{ 0.5}}
\put(461,1670){\makebox(0,0)[r]{ 1.5}}
\put(0,0){\includegraphics{figures/linear_dim_space}}
\end{picture}
\end{picture}
\end{tabular}
\end{center}
\caption{A $2$-dimensional space (left) and $3$-dimentional space (right)
 of consistent initial values.}
\label{fig:consistent} %\label{fig:consistent3dim}
\end{figure}

\begin{figure}[ht] 
%\input{figures/figure_eight_dim}
\begin{center}
\begin{tabular}{cc}
\setlength{\unitlength}{0.0500bp}
\begin{picture}(3328.00,2169.60)(0,150)%
\begin{picture}(3328.00,2469.60)
\put(560,320){\makebox(0,0)[r]{-1}}
\put(560,712){\makebox(0,0)[r]{-0.5}}
\put(560,1103){\makebox(0,0)[r]{ 0}}
\put(560,1495){\makebox(0,0)[r]{ 0.5}}
\put(560,1886){\makebox(0,0)[r]{ 1}}
\put(560,2278){\makebox(0,0)[r]{ 1.5}}
\put(656,160){\makebox(0,0){-1}}
\put(1306,160){\makebox(0,0){-0.5}}
\put(1956,160){\makebox(0,0){ 0}}
\put(2606,160){\makebox(0,0){ 0.5}}
\put(3256,160){\makebox(0,0){ 1}}
\put(2757,2100){\makebox(0,0)[r]{numerical initial values}}
\put(0,0){\includegraphics{figures/figure_eight_dim}}
\end{picture}
\end{picture}
&
%\input{figures/figure_eight_dim_few} 
\setlength{\unitlength}{0.0500bp}
\begin{picture}(3328.00,2169.60)(0,150)%
\begin{picture}(3328.00,2469.60)
\put(560,320){\makebox(0,0)[r]{-1}}
\put(560,712){\makebox(0,0)[r]{-0.5}}
\put(560,1103){\makebox(0,0)[r]{ 0}}
\put(560,1495){\makebox(0,0)[r]{ 0.5}}
\put(560,1886){\makebox(0,0)[r]{ 1}}
\put(560,2278){\makebox(0,0)[r]{ 1.5}}
\put(656,160){\makebox(0,0){-1}}
\put(1306,160){\makebox(0,0){-0.5}}
\put(1956,160){\makebox(0,0){ 0}}
\put(2606,160){\makebox(0,0){ 0.5}}
\put(3256,160){\makebox(0,0){ 1}}
\put(2757,2100){\makebox(0,0)[r]{ numerical initial values}}
\put(0,0){\includegraphics{figures/figure_eight_dim_few}}
\end{picture}
\end{picture}
\end{tabular}
\end{center}
\caption{Numerical initial values of the Figure Eight Problem; left: 10000 points, right: 100 points}
\label{fig:figureeight} \label{fig:figureeightfew}
%\caption{Plots of numerically consistent initial values.}\label{fig:consistent}
\end{figure}

\subsubsection{The Figure Eight Problem}

We ran our steepest descent implementation $10000$ times, starting with random
initial functions, on the problem 
\begin{equation*}
\begin{cases}
u_1(t)^2 + u_1'(t)^2 - 1 = 0, \\
2 u_1(t) u_1'(t) - u_2(t) = 0, \\
\end{cases}
\end{equation*}
where $t\in[0,1]$. We used a uniform grid with $N=300$ grid points,
a damping factor of $0.85$ and made $30$ descent steps using the gradient 
$\nabla_{\mathrm{G},10^{-5}}$. Usually the residual dropped below $10^{-16}$
within the first $12$ descent steps. We only cared about the function value
at $t=0$ of the numerical solution. The plot of these numerically
consistent initial conditions is shown in figure~\ref{fig:figureeight}. From
the picture it becomes evident why this problem is called the ``figure eight
problem''. In the right picture, only the first $100$ initial values are shown. Obviously the distribution
is not uniform. We found this equation in the sample problem collection of a DAE
solver written by Rheinboldt~\cite{Rhe00:samp}.

We remark that this is the only problem in the whole article to which
lemma~\ref{lemma:FisC1} does not apply.
In fact, the example actually does not fit into our framework because
$F$ as in~\eqref{eq:F:nonlinear} does not even map into $L^2(0,T;\mathbb{R}^m)$.
Nevertheless, the numerics work without any difficulties.


\subsubsection{Dimension Estimation}

For linear DAE the set of consistent initial values, in the following denoted
by $C\subset\mathbb{R}^n$, is a finite dimensional affine subspace of $\mathbb{R}^n$. To estimate
its dimension we produce a large amount of numerically consistent initial values.
When using these vectors to determine the dimension of $C$ one faces the problem that
they are numerically disturbed and have almost surely full
dimension. 

Assume we have a number of numerical initial values
$v_1,\dots v_N$, $N\gg n$. A first idea to determine the dimension is based on the
Gaussian elimination method with \emph{complete pivoting}, which is also called
\emph{total} or \emph{maximal pivoting}~\cite[Section~3.4]{GL96:la}. First we shift the
vectors $v_j$ such that they have sample mean $0$. If one takes the matrix $A$
containing all the $v_j$ as columns one expects the total pivots during the
Gaussian elimination process to decrease and to relate in some way to the
dimension of this point set. The index of the most significant drop would then
be the estimate of the dimension of $C$. 

A second approach utilizes \emph{principal component analysis}~(PCA), which is a well established method to reduce the dimension of data by introducing a
change to a lower dimensional new coordinate system.
An introduction to PCA is given by
Lay~\cite[Section~8.5]{Lay94:laa}. More precisely, for some target dimension
$d\in\mathbb{N}_0,\,d\le n$, PCA can be used to determine a shift $\hat c\in\mathbb{R}^n$ and
an orthogonal projection $\hat P\in\mathbb{R}^{n\times n}$ onto a $d$-dimensional
linear subspace of $\mathbb{R}^n$ which minimizes the quadratic error
\[
    E(d) := \min\biggl\{\sum_{i=1}^N \left\| (I-P)(v_i - c) \right\|^2_2 : \text{$c\in\mathbb{R}^n$, $P$ a $d$-dim.\ orth.\ proj.}\biggr\}.
\]

Since we are only interested in the error $E(d)$, the necessary calculations
are relatively simple. One
can show that $\hat c$ can be chosen independently of $d$ as the sample mean of the
vectors $v_i,\, i=1,\dots,N$. The scaled covariance matrix
\[
    S := \sum_{i=1}^N (r_i-\hat{c})(r_i-\hat{c})^T,
\]
is a positive semi-definite matrix whose eigenvalues we denote by
$\lambda_1\ge\dots\ge\lambda_n$. It can be shown that
\[
    E(d) = \sum_{i=d+1}^n \lambda_i.
\]
Using a numerical method to calculate eigenvalues, we are able to compute $E(d)$. 
We estimate the dimension as $d^*=0$ if the entire variance $E(0)$ is below some bound
and otherwise as the minimal $d^*\in\mathbb{N}_0$ such that $E(d^*) \le 0.999 \cdot E(0)$.
Hence we reduce the dimension as much as possible
while still preserving $99.9 \%$ of the entire variance.

We successfully applied this to several small linear DAE with index up to $3$.
Using the theory of regular matrix pencils \cite[Section~VII.1]{HW96} it is easy to
construct linear DAE with constant coefficients with certain index and known solution 
space. Table~\ref{tab:dimension} shows the results for such a constructed index $3$ system
with $n=m=13$ where the solution space is $4$-dimensional.
We point out that the method using pivots also works surprisingly well.
However, the automatic decision should be improved. In this example, it finds $99.9\%$ of 
the variance in a subspace of dimension $3$ although the most significant drop
occurs between the fourth and fifth eigenvalue.

Unfortunately, this method has some deficiencies. One problem is the dependency
on highly accurate numerical solutions, which are harder to obtain for the
higher index case. Additionally, it is heavily dependent on the scaling of the
problem and tends to underestimate the real dimension for problems with a
higher dimensional solution space because of insufficient diversity of the
numerical solutions. The latter problem could possibly be addressed by a more
sophisticated choice of initial functions.

\begin{table}[th]
\caption{left: pivots and eigenvalues for an index $3$ DAE with $4$-dimensional solution space,
right: logarithmic plot of the pivots' absolute values and the eigenvalues}
\label{tab:dimension}
\begin{center}
\begin{tabular}{cc}
\begin{tabular}[b]{crr}%
\hline%
\multicolumn{1}{c}{\bfseries Number} & 
\multicolumn{1}{c}{\bfseries Total pivots} & 
\multicolumn{1}{c}{\bfseries Eigenvalues} \\ \hline 
$1 $ & $-13.1$\hspace{0.5cm} & $13468.2$\hspace{0.5cm} \\ 
$2 $ & $ 12.8$\hspace{0.5cm} & $10269.7$\hspace{0.5cm} \\ 
$3 $ & $  8.5$\hspace{0.5cm} & $ 3517.9$\hspace{0.5cm} \\ 
$4 $ & $ -1.3$\hspace{0.5cm} & $   22.0$\hspace{0.5cm} \\ \hline
$5 $ & $  0.9\cdot 10^{-04}$  & $    1.6\cdot 10^{-08}$  \\ 
$6 $ & $  0.6\cdot 10^{-04}$  & $    6.2\cdot 10^{-09}$  \\ 
$7 $ & $ -1.5\cdot 10^{-07}$  & $    8.5\cdot 10^{-14}$  \\ 
$8 $ & $ -1.1\cdot 10^{-07}$  & $    2.3\cdot 10^{-14}$  \\ 
$9 $ & $  7.8\cdot 10^{-10}$  & $    6.5\cdot 10^{-19}$  \\ 
$10$ & $  5.2\cdot 10^{-10}$  & $    4.9\cdot 10^{-19}$  \\ 
$11$ & $ -3.9\cdot 10^{-10}$  & $    3.6\cdot 10^{-19}$  \\ 
$12$ & $  4.2\cdot 10^{-11}$  & $    2.1\cdot 10^{-21}$  \\ 
$13$ & $ -1.5\cdot 10^{-20}$  & $    4.1\cdot 10^{-35}$  \\ 
\hline
\end{tabular}
&
%\input{figures/table_dim13}
\setlength{\unitlength}{0.0500bp}
\begin{picture}(2900.00,3276.00)(550,0)%
\begin{picture}(2900.00,3276.00)
\put(1136,320){\makebox(0,0)[r]{$10^{-35}$}}
\put(1136,665){\makebox(0,0)[r]{$10^{-30}$}}
\put(1136,1011){\makebox(0,0)[r]{$10^{-25}$}}
\put(1136,1356){\makebox(0,0)[r]{$10^{-20}$}}
\put(1136,1702){\makebox(0,0)[r]{$10^{-15}$}}
\put(1136,2047){\makebox(0,0)[r]{$10^{-10}$}}
\put(1136,2393){\makebox(0,0)[r]{$10^{-5}$}}
\put(1136,2738){\makebox(0,0)[r]{$10^{0}$}}
\put(1136,3084){\makebox(0,0)[r]{$10^{5}$}}
\put(1407,160){\makebox(0,0){2}}
\put(1756,160){\makebox(0,0){4}}
\put(2105,160){\makebox(0,0){6}}
\put(2455,160){\makebox(0,0){8}}
\put(2804,160){\makebox(0,0){10}}
\put(3153,160){\makebox(0,0){12}}
\put(2593,861){\makebox(30,0)[r]{partial pivoting}}
\put(2593,670){\makebox(30,0)[r]{total pivoting}}
\put(2593,479){\makebox(30,0)[r]{eigenvalues}}
\put(0,0){\includegraphics{figures/table_dim13}}
\end{picture}
\end{picture}
\end{tabular}
\end{center}
\end{table}

\subsubsection{Detection of Non-Uniqueness}\label{sssec:nonunique}
A DAE might have an infinite dimensional solution space, and even for given
initial values the problem need not have a unique solution.
An example of a linear DAE which exhibits
non-uniqueness for some given initial value is
\begin{equation}\label{eq:nonunique}
\begin{pmatrix} -t & t^2 \\ -1 & t\end{pmatrix} u' + \begin{pmatrix} 1 & 0 \\ 0 & 1\end{pmatrix} u = 0,\quad u(0)=0\text{ for $t\in[0,2]$.}
\end{equation}
This problem is taken from Kunkel and Mehrmann~\cite[Chapter~8,
Exercise~3]{KM06}. The left plot in figure~\ref{fig:nonunique:ivp} shows the first component of
$100$ numerical solutions having started at random linear functions
satisfying the initial condition of equation~\eqref{eq:nonunique}. The
residuals of the numerical solutions are of magnitude $10^{-12}$.

Another interesting example problem was presented by Ascher and
Spiteri~\cite[Example~2]{AS94} as a test problem for their boundary value problem
solver \texttt{COLDAE}. This problem is a boundary value problem admitting
exactly $2$ different classical solutions. 
One can simplify it to an equivalent initial value problem as follows, without changing 
the behavior of our program significantly.
\begin{equation}\label{eq:exbvp:simple}
\begin{split}
u'(t) & = y(t) + \cos(t) \\
0 & = (u(t) - \sin(t))(y(t) - \exp(t))
\end{split} \quad \text{on }[0,1],\text{ where } u(0)=0
\end{equation}
The two solutions are
\begin{equation*}
\begin{pmatrix} u(t) \\ y(t) \end{pmatrix} = \begin{pmatrix} \sin(t)+\exp(t)-1\\ \exp(t)\end{pmatrix}
\quad\text{ and }\quad
\begin{pmatrix} u(t) \\ y(t) \end{pmatrix} = \begin{pmatrix} \sin(t)\\ 0\end{pmatrix}.
\end{equation*}

For the numerics we used a uniform grid with $N=1000$ and a damping factor of $0.85$.
Applying our method using the discretized $H^1$ norm and
starting from random linear initial functions which
satisfy the initial conditions, we experienced poor convergence speed and
arrived at an approximation for the second solution most of the time.
The right plot in figure~\ref{fig:nonunique:ivp} shows a numerical solution after
$10000$ steps using $H^1$ descent. Its residual is about $6.41\cdot 10^{-8}$ and $\tilde y$ 
obviously deviates from $y(t)=0$, especially at the interval boundaries. 
Using the discretized graph norm the steepest descent converged in around $100$ steps to a
numerical solution with a residual of about $10^{-20}$.

However, for both gradients certain initial functions resulted in unexpected numerical solutions,
e.\,g.\ the left plot in figure~\ref{fig:bvp:generalized}. The jump in the component
$\tilde y$ and the bend in the first component harm classical differentiability.
However, looking at equation~\eqref{eq:exbvp:simple} we see that $y(t)$ need not be differentiable
if we understand the problem as in section~\ref{closedgraph}.
In fact, our numerical solution resembles a weak solution in $D(\bar{A})$ (compare to proposition~\ref{Abar}).
This shows that even with finite
differences we are able to find weak solutions in accordance with the abstract theory.
This particular plot was generated using graph norm descent. Ordinary
$H^1$ descent has a tendency to smooth things out which interferes with jump solutions
and results in worse convergence.

However, our discretization with finite differences yields problems. Using
steepest descent with $\nabla_{G}$, we also get solutions as depicted in the
right picture in figure~\ref{fig:bvp:problem}. This originates in the central
differences of~\eqref{eq:diffD1} used for the numerical approximation of the
derivative.  In this case we experience a decoupling of the grid into even and
odd indices where $\tilde y(t)\approx \exp(t)$.  There, $\tilde u$ oscillates
between two possible local solutions.  Using the finite element method instead
would solve such deficiencies.

\begin{figure}[ht] 
%\input{figures/nonunique_dim}
\begin{center}
\begin{tabular}{cc}
\setlength{\unitlength}{0.0500bp}
\begin{picture}(3328.00,2169.60)(0,150)%
\begin{picture}(3328.00,2469.60)
\put(368,334){\makebox(0,0)[r]{-2}}
\put(368,723){\makebox(0,0)[r]{-1}}
\put(368,1112){\makebox(0,0)[r]{ 0}}
\put(368,1500){\makebox(0,0)[r]{ 1}}
\put(368,1889){\makebox(0,0)[r]{ 2}}
\put(368,2278){\makebox(0,0)[r]{ 3}}
\put(464,160){\makebox(0,0){ 0}}
\put(1162,160){\makebox(0,0){ 0.5}}
\put(1860,160){\makebox(0,0){ 1}}
\put(2558,160){\makebox(0,0){ 1.5}}
\put(3256,160){\makebox(0,0){ 2}}
\put(2477,2085){\makebox(0,0)[r]{first comp.\ of solutions}}
\put(0,0){\includegraphics{figures/nonunique_dim}}
\end{picture}
\end{picture}
&
%\input{figures/bvp_h1_normal}
\setlength{\unitlength}{0.0500bp}
\begin{picture}(3328.00,2169.60)(0,150)%
\begin{picture}(3328.00,2469.60)
\put(560,532){\makebox(0,0)[r]{\strut{} 0}}%
\put(560,951){\makebox(0,0)[r]{\strut{} 0.3}}%
\put(560,1370){\makebox(0,0)[r]{\strut{} 0.6}}%
\put(560,1789){\makebox(0,0)[r]{\strut{} 0.9}}%
\put(560,2208){\makebox(0,0)[r]{\strut{} 1.2}}%
\put(656,160){\makebox(0,0){\strut{} 0}}%
\put(1176,160){\makebox(0,0){\strut{} 0.2}}%
\put(1696,160){\makebox(0,0){\strut{} 0.4}}%
\put(2216,160){\makebox(0,0){\strut{} 0.6}}%
\put(2736,160){\makebox(0,0){\strut{} 0.8}}%
\put(3256,160){\makebox(0,0){\strut{} 1}}%
\put(2487,2085){\makebox(0,0)[r]{first component $\tilde u(t)$}}
\put(2487,1894){\makebox(0,0)[r]{last component $\tilde y(t)$}}
\put(2487,1703){\makebox(0,0)[r]{$\sin(t)$}}
\put(0,0){\includegraphics{figures/bvp_h1_normal}}
\end{picture}
\end{picture}
\end{tabular}
\end{center}
\caption{left: example of IVP without unique solution,\newline
right: results for \eqref{eq:exbvp:simple} after $H^1$ descent}
\label{fig:nonunique:ivp} %\label{fig:bvp:h1}
\end{figure}

\begin{figure}[ht] 
%\input{figures/bvp_gr_generalized}
\begin{center}
\begin{tabular}{cc}
\setlength{\unitlength}{0.0500bp}
\begin{picture}(3328.00,2169.60)(0,150)%
\begin{picture}(3328.00,2469.60)
\put(368,538){\makebox(0,0)[r]{ 0}}
\put(368,973){\makebox(0,0)[r]{ 1}}
\put(368,1408){\makebox(0,0)[r]{ 2}}
\put(368,1843){\makebox(0,0)[r]{ 3}}
\put(368,2278){\makebox(0,0)[r]{ 4}}
\put(464,160){\makebox(0,0){ 0}}
\put(1022,160){\makebox(0,0){ 0.2}}
\put(1581,160){\makebox(0,0){ 0.4}}
\put(2139,160){\makebox(0,0){ 0.6}}
\put(2698,160){\makebox(0,0){ 0.8}}
\put(3256,160){\makebox(0,0){ 1}}
\put(1221,2085){\makebox(0,0)[r]{$\tilde u(t)$}}
\put(1221,1894){\makebox(0,0)[r]{$\sin(t)$}}
\put(1221,1703){\makebox(0,0)[r]{$\tilde y(t)$}}
\put(1221,1512){\makebox(0,0)[r]{$\exp(t)$}}
\put(0,0){\includegraphics{figures/bvp_gr_generalized}}
\end{picture}
\end{picture}
&
%\input{figures/bvp_gr_problems}
\setlength{\unitlength}{0.0500bp}
\begin{picture}(3328.00,2169.60)(0,150)%
\begin{picture}(3328.00,2469.60)
\put(368,320){\makebox(0,0)[r]{-1}}
\put(368,810){\makebox(0,0)[r]{ 0}}
\put(368,1299){\makebox(0,0)[r]{ 1}}
\put(368,1789){\makebox(0,0)[r]{ 2}}
\put(368,2278){\makebox(0,0)[r]{ 3}}
\put(464,160){\makebox(0,0){ 0}}
\put(1022,160){\makebox(0,0){ 0.2}}
\put(1581,160){\makebox(0,0){ 0.4}}
\put(2139,160){\makebox(0,0){ 0.6}}
\put(2698,160){\makebox(0,0){ 0.8}}
\put(3256,160){\makebox(0,0){ 1}}
\put(1221,2085){\makebox(0,0)[r]{$\tilde u(t)$}}
\put(1221,1894){\makebox(0,0)[r]{$\tilde y(t)$}}
\put(0,0){\includegraphics{figures/bvp_gr_problems}}
\end{picture}
\end{picture}
\end{tabular}
\end{center}
\caption{left: generalized numerical solution for~\eqref{eq:exbvp:simple},\newline
right: example illustrating problems of the discretization}
\label{fig:bvp:generalized} \label{fig:bvp:problem}
\end{figure}
%\caption{Plots for DAE exhibiting non-uniqueness.} \label{fig:nonunique}

%\input{parts/examples.implementation}

\subsection{Implementation\label{ssec:implementation}}

We implemented our software in the \texttt{C++} programming language using the
paradigm of compile-time polymorphism. It makes heavy use of the Standard
Template Library and the basic linear algebra routines provided by the
\texttt{ublas} library, which is part of the well known Boost
project~\cite{BOOST}. The software was developed using GNU/Linux on x86
architecture and compiled with the GCC \texttt{C++} compiler.  For solving the
involved linear systems we used the conjugate gradient
method~\cite[Chapter~5]{NW06:no}, Gauss-Seidel with successive
over-relaxation~\cite[Section~10.1]{GL96:la} and the impressive PARDISO
solver~\cite{SG06:pardiso}, being part of the Intel Math Kernel
Library~\cite{MKL}, which can solve sparse positive definite and indefinite
systems efficiently via parallel factorization.  We also used functions
provided by the MKL to calculate the singular value decomposition of a matrix.
We need this to construct projection matrices for general supplementary
conditions. For auxiliary calculations we used the computer algebra system
Maple.

\subsection{Possible Improvements}\label{implissues}

\subsubsection{Finite elements}\label{implissues:finiteelements}
Finite element methods could turn out to be a rewarding alternative to the finite
difference scheme admitting convergence even if the solutions fail to be
smooth. This should be not too difficult to implement. Numerically it is
more demanding because numerical integration methods have to be employed to get the
coefficients for the chosen basis.

\subsubsection{Non-uniform grids}
Analyzing the structure of the given DAE, it may be possible to calculate an
optimal grid for the discretization, or to refine the grid during the descent
process. Refinements of the grid are technically easy to integrate, since new
intermediate grid points can be inserted interpolating the current values in a
neighborhood. However, updating to the new matrices is an expensive operation.

\subsubsection{Functionals defined only on subsets of $H^1(0,T;\mathbb{R}^n)$}
In Section~\ref{nonlinear}, we assumed the function $f$ to be defined on
$[0,T]\times\mathbb{R}^n\times\mathbb{R}^n$. If $f$ can only be evaluated on a subset of this
space (e.\,g.\ because of a square root) the domain $D(\uppsi)$ of $\uppsi$ is not
the whole space $H^1(0,T;\mathbb{R}^n)$. Usual steepest descent does not
respect this and can leave the domain, even if there exists a
solution $u \in D(\uppsi)$. We have discussed this phenomenon in section~\ref{AkzoNobel}.
This issue could probably be addressed by assigning a penalty to the Sobolev
gradient calculation prohibiting strong tendencies towards the boundary of
the domain $D(\uppsi)$.

\subsubsection{Other projections}
The projection onto the feasible functions used in Section~\ref{suppcond}
does not have to be the orthogonal one.
One can choose among all projections trying to find one with beneficial properties,
i.\,e., a sparse projection that still is numerically stable.
Basic building blocks of such matrices have been constructed in~\cite{BHKL85}.

\subsubsection{Combination with conventional methods}
If desired, one can mix conventional techniques with our approach.
For example one could estimate consistent initial conditions using Sobolev
descent locally at the left boundary, then run a multi-step method to
calculate a rough approximate solution,
and then refine this initial guess again by Sobolev gradient methods
on the whole interval.

\subsubsection{Implementation Issues}
Our step size control should be replaced by more robust line search
algorithms enforcing Wolfe conditions, cf.~\cite[Chapter~3]{NW06:no}.
Error estimates, failure tolerance, and a decent user interface
have to be provided. The efficiency of the whole algorithm
has to be improved in order to meet the standards of current DAE solvers.

%\input{parts/conclusion}

\section{Conclusion}\label{conclusion}

As pointed out before, the method of Sobolev steepest descent differs greatly
from the usual step-by-step methods, thus introducing both new kinds of
problems and advantages.

Our approach has some drawbacks. The procedure itself tends to be expensive
in terms of runtime and memory usage compared to the conventional
multi-step methods. It is complicated to generate an appropriate mesh, since
we have to fix a mesh size a priori whereas step-by-step methods may
adjust their mesh according to local error estimates.
Such refinements can be put into practice in our setting, too, but changes of the
mesh are expensive. Moreover, convergence of the Sobolev descent
is guaranteed only under restrictive conditions. Hence currently the user has to
investigate for each kind of DAE separately whether the algorithm behaves as desired.
It certainly is an interesting problem for further research to find general
conditions under which Sobolev descent finds a solution for a DAE.

Yet, a new technique also introduces new possibilities. We regard it as one of
the main features that no initial conditions need to be supplied. Only some
initial estimate for the solution is needed, not necessarily a good one. In
general, it is difficult to find consistent initial conditions for DAE.
In~\cite[Subsection~5.3.4]{BCP89:ivpdae} they state, that ``Often the most
difficult part of solving a DAE system in applications is to determine a
consistent set of initial conditions with which to start the computation''. For
more information on this topic see~\cite{BHP98:cic} or~\cite[Chapter~2]{Yeo97:ii}.

Another advantage is the possibility to impose arbitrary linear supplementary
conditions, even non-local ones. The authors do not know of any other program
that can handle such general data.
In principle, the user can point the algorithm towards a solution with particular
characteristics by providing it with a suitable initial estimate, although
admittedly it is not clear in what sense the algorithm respects this hint.
Moreover, no previous transformations such
as differentiation of the equations have to be applied, and hence we do not
artificially increase the number of equations and the numerical errors.

As the next step for the theory of solving DAE via Sobolev descent
the authors suggest to generalize the results of section~\ref{closedgraph}
to the non-autonomous, the semi-linear and the fully non-linear case.
We mention that the concept of the graph norm for Sobolev gradient descent is rather
generic and easily generalizes to arbitrary differential problems, even
involving non-local and partial differential operators, to which the
theory could finally be extended.
Although it is easier and more common to use an ordinary Sobolev space for
all applications, we emphasize that using a metric more closely related
to the equation itself obviously improves the algorithm. Thus this modification
should at least be considered whenever Sobolev gradients are employed.
It may provide some insight to discuss the effects of the different metrics
and to compare the convergence theoretically.

As for the continuation of our practical efforts, one should consider to address the
deficiencies of our implementation discussed in~\ref{implissues}.
In particular, it seems to be important to put~\ref{implissues:finiteelements}
into practice since finite element methods are the more natural choice
when working with Sobolev spaces.

\subsection*{Acknowledgments}\mbox{}\\
This article has been inspired by John W.\ Neuberger who suggested applying 
Sobolev gradient methods to the field of DAE. 
The authors would like to thank him for his patience and aid.
This work has been started during the authors' stay at the
University of North Texas as a guests. 

Parts of this article were developed while one of the authors was sponsored
by the graduate school ``Mathematical Analysis of Evolution, 
Information and Complexity'' of the University of Ulm.

\begin{thebibliography}{00}

\bibitem{AA02}
Y.A.~Abramovich and C.D.~Aliprantis, \emph{{An Invitation to Operator Theory}},
  American Mathematical Society, 2002.

\bibitem{AP93:pna}
A.~Ambrosetti and G.~Prodi, \emph{{A Primer of Nonlinear Analysis}},
  Cambridge Univ.\ Press, 1993.

\bibitem{AS94}
U.M.~Ascher and R.J.~Spiteri, \emph{{Collocation Software for Boundary Value
  Differential-Algebraic Equations}}, SIAM Journal on Scientific Computing
  \textbf{15} (1994), no.~4, 938--952.

\bibitem{AZ90}
J.~Appell and P.P.~Zabrejko, \emph{{Nonlinear superposition operators}},
  Cambridge Univ.\ Press, 1990.

\bibitem{BCP89:ivpdae}
K.E.~Brenan, S.L.~Campbell, and L.R.~Petzold, \emph{{Numerical Solution of
  Initial-Value Problems in Differential-Algebraic Equations}}, Elsevier
  Science Publishing, New York, 1989.

\bibitem{BDT08:scaling}
C.L.~Bottasso, D.~Dopico, and L.~Trainelli, \emph{{On the optimal scaling of
  index three DAEs in multibody dynamics}}, Multibody System Dynamics
  \textbf{19} (2008), no.~1--2, 3--20.

\bibitem{BHKL85}
M.W.~Berry, M.T.~Heath, I.~Kaneko, M.~Lawo, R.J.~Plemmons, and R.C.~Ward,
  \emph{{An Algorithm to Compute a Sparse Basis of the Null Space}}, Num.\
  Mathematik \textbf{47} (1985), no.~4, 483--504.

\bibitem{BHP98:cic}
P.N.~Brown, A.C.~Hindmarsh, and L.R.~Petzold, \emph{{Consistent Initial
  Condition Calculation for Differential-Algebraic Systems}}, SIAM Journal on
  Scientific Computing \textbf{19} (1998), no.~5, 1495--1512.

\bibitem{BOOST}
\emph{Boost \texttt{C++} Library}, {\url{http://www.boost.org}}.

\bibitem{Cam95:hidae}
S.L.~Campbell, \emph{{High-Index Differential Algebraic Equations}}, Mechanics
  Based Design of Structures and Machines \textbf{23} (1995), no.~2, 199--222.

\bibitem{JC:testset}
J.~Cash, \emph{BVP and IVP software page},
  {\url{http://www.ma.ic.ac.uk/~jcash}}.

\bibitem{Chi06}
R.~Chill, \emph{{The {\L}ojasiewicz-Simon gradient inequality in Hilbert
  spaces}},\\ {\url{http://www.math.univ-metz.fr/~chill/procloja.pdf}}, 2006.

\bibitem{CRRM96}
G.F.~Carey, W.B.~Richardson, C.S.~Reed, and B.J.~Mulvaney, \emph{{Circuit,
  Device and Process Simulation: Mathematical and Numerical Aspects}}, Wiley,
  1996.

\bibitem{CVSB03:bdp}
E.F.~Costa, R.C.~Vieira, A.R.~Secchi, and E.C.~Biscaia, \emph{{Dynamic
  simulation of high-index models of batch distillation processes}}, Latin
  American Applied Research \textbf{33} (2003), 155--160.

\bibitem{GA00:tfs}
B.W.~Gordon and H.~Asada, \emph{{Modeling, Realization, and Simulation of
  Thermo-Fluid Systems Using Singularly Perturbed Sliding Manifolds}}, Journal
  of Dynamic Systems, Measurement, and Control \textbf{122} (2000), 699--707.

\bibitem{GL96:la}
G.H.~Golub and C.F.~van~Loan, \emph{{Matrix Computations}}, third ed., John
  Hopkins University Press, Baltimore and London, 1996.

\bibitem{Har03}
A.~Haraux, M.~Ali~Jendoubi, and O.~Kavian, \emph{{Rate of decay to equilibrium
  in some semilinear parabolic equations}}, Journal of Evolution Equations
  \textbf{3} (2003), no.~3, 463--484.

\bibitem{HW96}
E.~Hairer and G.~Wanner, \emph{{Solving ordinary differential equations, 2.
  Stiff and differential algebraic problems}}, second revised ed., Springer
  Series in Computational Mathematics, vol.~14, Springer, 1996.

\bibitem{KD99}
A.~Kumar and P.~Daoutidis, \emph{{Control of nonlinear differential algebraic
  equation systems}}, Chapman \& Hall/CRC, 1999.

\bibitem{KM06}
P.~Kunkel and V.~Mehrmann, \emph{{Differential-Algebraic Equations: Analysis
  and Numerical Solution}}, Textbooks in Mathematics, European Mathematical
  Society, 2006.

\bibitem{NeuKar07}
J.~Kar{\'a}tson and J.W.~Neuberger, \emph{{Newton's method in the context of
  gradients}}, Electronic Journal of Differential Equations \textbf{2007}
  (2007), no.~124, 1--13.

\bibitem{Lang95}
S.A.~Lang, \emph{{Differential and Riemannian Manifolds}}, Springer, 1995.

\bibitem{Lay94:laa}
D.C.~Lay, \emph{{Linear Algebra and Its Applications}}, Addison-Wesley,
  1994.

\bibitem{Mah95:phd}
W.T.~Mahavier, \emph{{A Numerical Method for Solving Singular Differential
  Equations Utilizing Steepest Descent in Weighted Sobolev Spaces}}, Ph.D.
  thesis, University of North Texas, 1995.

\bibitem{Mah97:conv}
W.T.~Mahavier, \emph{{A convergence result for discreet steepest decent in weighted
  sobolev spaces}}, Abstract and Applied Analysis \textbf{2} (1997), no.~1,
  67--72.

\bibitem{Mah97:singular}
W.T.~Mahavier, \emph{{A numerical method utilizing weighted Sobolev descent to solve
  singular differential equations}}, Nonlinear World \textbf{4} (1997), no.~4,
  435--456.

\bibitem{Mah99:weighted}
W.T.~Mahavier, \emph{{Weighted Sobolev Descent for singular first order partial
  Differential Equations}}, Southwest Journal of Pure and Applied Mathematics
  \textbf{1} (1999), 41--50.

\bibitem{MIM06:testset}
F.~Mazzia, F.~Iavernaro, and C.~Magherini, \emph{{Test Set for Initial Value
  Problem Solvers}},\\
  {\url{http://pitagora.dm.uniba.it/~testset/}}, 2006, Release 2.3 September 2006.

\bibitem{MKL}
\emph{Intel Math Kernel Library},\\
  {\url{http://www.intel.com/cd/software/products/asmo-na/eng/perflib/mkl/}}

\bibitem{Neu76}
J.W.~Neuberger, \emph{{Projection Methods for Linear and Nonlinear Systems of
  Partial Differential Equations}}, Dundee Conference on Differential
  Equations, vol. 564, {Springer Lecture Notes}, 1976, pp.~341--349.

\bibitem{Neu97}
J.W.~Neuberger, \emph{{Sobolev gradients and differential equations}}, Springer, 1997.

\bibitem{loopback}
R.~Nittka and M.~Sauter, \emph{{Implementation with source code and examples of Sobolev Gradients for
  Differential Algebraic Equations}},\\
  {\url{http://cantor.mathematik.uni-ulm.de/m5/nittka/research/2007/sobolev_dae/}}, 2007.

\bibitem{NW06:no}
J.~Nocedal and S.J.~Wright, \emph{{Numerical Optimization}}, second ed.,
  Springer Series in Operations Research, Springer, 2006.

\bibitem{PGH81}
L.~R.~Petzold, C.~W.~Gear, and H.~H.~Hsu, \emph{{Differential-Algebraic
  Equations Revisited}}, Proceedings Oberwolfach Workshop on Stiff Equations,
  Institut f{\"u}r Geometrie und Praktische Mathematik der TH Aachen, June
  1981, Bericht 9.

\bibitem{Rhe00:samp}
W.C.~Rheinboldt, \emph{Sample problems for \texttt{dae\_solve.tgz}},\\
  {\url{http://www.netlib.org/ode/daesolve/}}, 2000.

\bibitem{Sch03}
S.~Schulz, \emph{Four Lectures on Differential-Algebraic Equations}, \\
  {\url{http://www.math.auckland.ac.nz/Research/Reports/}} (497), 2003.

\bibitem{SG06:pardiso}
O.~Schenk and K.~G{\"a}rtner, \emph{{On fast factorization pivoting methods for
  sparse symmetric indefinite systems}}, Electronic Transactions on Numerical
  Analysis \textbf{23} (2006), 158--179.

\bibitem{Udr94}
C.~Udriste, \emph{{Convex Functions and Optimization Methods on Riemannian
  Manifolds}}, Springer, 1994.

\bibitem{Sch99}
R.~von~Schwerin, \emph{{Multibody System Simulation: Numerical Methods,
  Algorithms, and Software}}, Springer, 1999.

\bibitem{Yeo97:ii}
K.D.~Yeomans, \emph{{Initialization Issues in General Differential Algebraic
  Equation Integrators}}, Ph.D. thesis, North Carolina State University, 1997.

\end{thebibliography}

\end{document}