\documentclass[11pt]{article}
\setlength{\topmargin}{-1cm} \setlength{\oddsidemargin}{0in}
\setlength{\textheight}{9in} \setlength{\textwidth}{6.51in}
\usepackage{amsfonts,amssymb}
\author{
Simon Lo\footnotemark[1], Michael Monagan\footnotemark[1], Allan
Wittkopf \thanks{Supported by NSERC of Canada and the MITACS NCE of
Canada.}\\
{\tt \{sclo,mmonagan,wittkopf\}@cecm.sfu.ca } \\
Centre for Experimental and Constructive Mathematics, \\
Department of Mathematics, Simon Fraser University, \\
Burnaby, B.C., V5A 1S6, Canada. }
\title{A Modular Algorithm for Computing the Characteristic Polynomial of an Integer Matrix in Maple}
\date{}
\begin{document}
\maketitle
\begin{abstract}
Let $\mathbf{A}$ be an $n \times n$ matrix of integers. In this
paper we present details of our Maple implementation of a modular
method for computing the characteristic polynomial of $\mathbf{A}$.
Our implementation considers several different representations for
the computation modulo primes, including the use of double precision
floats. The algorithm presently implemented in Maple releases 7--9
is the Berkowitz algorithm. We present some timings comparing the
two methods on a large sparse matrix arising from an application in
combinatorics, and also on some randomly generated dense matrices.
\end{abstract}
\section{Introduction}
Let $\mathbf{A}$ be an $n \times n$ matrix of integers. One way to
compute the characteristic polynomial $c(x) = \det(x \mathbf{I} -
\mathbf{A}) \in \mathbb{Z}[x]$ is to evaluate the characteristic
matrix at $n$ points, compute $n$ determinants of integer matrices,
then interpolate to obtain the characteristic polynomial. The
determinants of integer matrices can be computed using a
fraction-free Gaussian elimination algorithm (see Chapter 9 of
Geddes et al.\ \cite{geddes}) in $O(n^3)$ integer multiplications and
divisions. This approach leads to an algorithm that requires
$O(n^4)$ integer operations.

The algorithm presently implemented in Maple releases 7--9 is the
Berkowitz algorithm \cite{berkowitz}. It is a division free
algorithm and thus can be used to compute the characteristic
polynomial of a matrix over any commutative ring $R$. It does
$O(n^4)$ multiplications in $R$. In \cite{jounaidi}, Abdeljaoued
described a Maple implementation of a sequential version of the
Berkowitz algorithm and compared it with the interpolation method
and other methods. His implementation improves with sparsity,
requiring $O(n^3)$ multiplications when the matrix has $O(n)$
non-zero entries.

In section 2 we present details of our Maple implementation of a
modular method for computing $c(x)$, the characteristic polynomial
of $\mathbf{A}$. The algorithm computes the characteristic
polynomial modulo a sequence of primes and applies the Chinese
remainder theorem. Our implementation considers several different
representations for the computation modulo primes, including the use
of double precision floats. In section 3 we present some timings
comparing our modular algorithm implementations and the Berkowitz
algorithm on a $364 \times 364$ sparse matrix arising from an
application in combinatorics from Quaintance \cite{quaintance}, and
also some timings on randomly generated large dense matrices.

In section 4 we present two improvements that reduce the
running time. In section 5 we give a short asymptotic comparison of
the algorithms.
\section{A Dense Deterministic Modular Algorithm}
Let $\mathbf{A} \in \mathbb{Z}^{n \times n}$ be the input matrix. We
want to compute the characteristic polynomial $c(x)= \det(x
\mathbf{I} - \mathbf{A}) \in \mathbb{Z}[x]$. We use a
modular algorithm: we compute the characteristic
polynomial of $\mathbf{A}$ modulo a sequence of primes $p_1, p_2,
p_3, \ldots$ using the Hessenberg matrix algorithm, then apply the
Chinese remainder algorithm to reconstruct $c(x).$ The cost of
the Hessenberg approach is $O(n^3)$ arithmetic operations in
$\mathbb{Z}_p$ for each prime $p$. An alternative to the Hessenberg
matrix approach would be a Krylov approach which has the same
asymptotic complexity. In the Krylov approach one starts with a
random vector $v \in \mathbb{Z}^n_p$ and computes the Krylov
sequence of vectors
\[ v, {\bf A} v, {\bf A}^2 v, {\bf A}^3 v, \ldots, {\bf A}^n v \]
stopping when a linear dependence between them is found. This linear
dependence provides a factor of the minimal polynomial of
$\mathbf{A}$. Here is our modular algorithm.
\bigskip
\noindent {\bf Input:} Matrix $\mathbf{A} \in \mathbb{Z}^{n \times n}$ \\
{\bf Output:} Characteristic polynomial $c(x) = \det(x \mathbf{I} - \mathbf{A}) \in \mathbb{Z}[x]$
\begin{enumerate}
\item Compute a bound $S$ (see below) larger than the largest coefficient of $c(x)$.
\item Choose $t$ machine primes $p_1, p_2,\ldots, p_t $ such that
$ m = \prod_{i=1}^t p_i > 2S . $
\item {\bf for} $i=1$ {\bf to} $t$ {\bf do}
\begin{enumerate}
\item $\mathbf{A}_i \gets \mathbf{A} \bmod p_i.$
\item Compute $c_i(x)$ --- the characteristic polynomial of $\mathbf{A}_i$ over
$\mathbb{Z}_{p_i}$ via the Hessenberg algorithm.
\end{enumerate}
\item Apply the Chinese remainder theorem: \\
\hspace*{5mm} Solve $c(x) \equiv c_i(x) \pmod {p_i}$ for $c(x) \in \mathbb{Z}_m[x].$
\item Output $c(x)$ in the symmetric range for $\mathbb{Z}_m.$
\end{enumerate}
\noindent We can compute a bound $S$ for the largest coefficient
of $c(x)$ by modifying a bound for the determinant of $\mathbf{A}$. We have
\[ |\det(\mathbf{A})| \le n! \prod_{i=1}^n \max_{j=1}^n |a_{i,j}|, \]
so we may take for $S$ this bound applied to the matrix $\mathbf{A'}$ where
\[ a'_{i,j} =
\cases{1+ \left| a_{i,i} \right| &$i=j$\cr a_{i,j}&otherwise,} \]
that is, $S = n! \prod_{i=1}^n \max_{j=1}^n |a'_{i,j}|.$
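\bigskip
\noindent Since only the length of $S$ matters for choosing the
number of primes, it can be estimated with double precision
logarithms. Here is a hedged C sketch (our own illustrative helper
with an assumed name and row-major layout, not code from the Maple
library):
\begin{verbatim}
#include <math.h>

/* Least t such that t primes, each exceeding 2^(bits-1), have a
   product m > 2*S, where S = n! * prod_i max_j |a'_{i,j}| is the
   coefficient bound above.  A is n x n, row major. */
int primes_needed(int n, const long *A, int bits) {
    double log2S = lgamma(n + 1.0) / log(2.0);       /* log2(n!) */
    for (int i = 0; i < n; i++) {
        double mx = 1.0 + fabs((double) A[i*n + i]); /* 1 + |a_ii| */
        for (int j = 0; j < n; j++)
            if (j != i && fabs((double) A[i*n + j]) > mx)
                mx = fabs((double) A[i*n + j]);
        log2S += log2(mx);
    }
    /* one extra bit for the factor 2 and one to absorb rounding */
    return (int) ceil((log2S + 2.0) / (double)(bits - 1));
}
\end{verbatim}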
\subsection{Hessenberg Algorithm}
Recall that a square matrix $\mathbf{M} = (m_{i,j})$ is in upper
Hessenberg form if $m_{i,j}=0$ for all $i \ge j+2,$ in other words,
the entries below the first subdiagonal are zero.
\[ \left( \begin{array}{ccccccc}
m_{1,1} & m_{1,2} & m_{1,3} & \cdots & m_{1,n-2} & m_{1,n-1} & m_{1,n} \\
m_{2,1} & m_{2,2} & m_{2,3} & \cdots & m_{2,n-2} & m_{2,n-1} & m_{2,n} \\
0 & m_{3,2} & m_{3,3} & \cdots & m_{3,n-2} & m_{3,n-1} & m_{3,n} \\
0 & 0 & m_{4,3} & \cdots & m_{4,n-2} & m_{4,n-1} & m_{4,n} \\
\vdots & \vdots & \ddots & \ddots & \vdots & \vdots & \vdots \\
0 & 0 & 0 & \ddots & m_{n-1,n-2} & m_{n-1,n-1} & m_{n-1,n} \\
0 & 0 & 0 & \cdots & 0 & m_{n,n-1} & m_{n,n} \\
\end{array} \right) \]
\noindent The Hessenberg algorithm (see \cite{cohen}) consists of
the following two main steps:
\begin{description}
\item[Step 1:] Reduce the matrix $\mathbf{M} \in \mathbb{Z}_p^{n \times n}$ into the upper Hessenberg form using a
series of elementary row and column operations while preserving the characteristic
polynomial. In the algorithm below,
$\mathbf{R}_i$ denotes the $i$'th row of $\mathbf{M}$
and $\mathbf{C}_j$ the $j$'th column of $\mathbf{M}$.
In total, $O(n^3)$ operations in $\mathbb{Z}_p$ are performed.
\bigskip
\noindent {\bf Input:} Matrix $\mathbf{M} \in \mathbb{Z}_p^{n \times n}$ \\
{\bf Output:} Matrix $\mathbf{M}$ in upper Hessenberg form with
the same eigenvalues
\begin{tabbing}
000\=000\=000\=000\=\kill
{\bf for} $j=1$ {\bf to} $n-2$ {\bf do} \\
\>search for a nonzero entry $m_{i,j}$ where $j+2 \le i \le
n$ \\
\>{\bf if} found such entry {\bf then} \\
\>\>{\bf do}
$\mathbf{R}_i \leftrightarrow \mathbf{R}_{j+1}$ and
$\mathbf{C}_i \leftrightarrow \mathbf{C}_{j+1}$
{\bf if} $m_{j+1,j}=0$ \\
\>\>{\bf for} $k=j+2$ {\bf to} $n$ {\bf do} \\
\>\>\>{\bf comment} reduce using $m_{j+1,j}$ as pivot \\
\>\>\>$u \gets m_{k,j}\, {m_{j+1,j}}^{-1}$ \\
\>\>\>$\mathbf{R}_k \gets \mathbf{R}_k - u \mathbf{R}_{j+1}$ \\
\>\>\>$\mathbf{C}_{j+1} \gets \mathbf{C}_{j+1} + u \mathbf{C}_k$ \\
\>\>{\bf end for} \\
\>{\bf end if} \\
\>{\bf comment} now the first $j$ columns of $\mathbf{M}$ are in upper Hessenberg form \\
{\bf end for} \\
\end{tabbing}
It is clear from the algorithm that at each step of the outer
for loop we perform elementary row and column operations, and
that at the termination of the outer for loop the entire matrix
has been reduced to upper Hessenberg form. Let $\mathbf{H}$ be the
matrix $\mathbf{M}$ reduced to upper Hessenberg form, let the
elementary matrix $\mathbf{E}_j$ represent the $j$'th
elementary row operation and let $\mathbf{E} = \mathbf{E}_1 \mathbf{E}_2
\cdots \mathbf{E}_{n-2}$. We can write $\mathbf{H} =
\mathbf{E} \mathbf{M} \mathbf{E}^{-1}$ since the elementary
column operations that we perform in the algorithm are inverses
of elementary row operations. Thus, this is a similarity transformation.
To see that $\mathbf{H}$ has the same characteristic polynomial as the matrix $\mathbf{M},$
note that $\mathbf{H} = \mathbf{E} \mathbf{M} \mathbf{E}^{-1}$
implies
\[ \begin{array}{r@{\;=\;}l}
\det(x \mathbf{I} - \mathbf{H}) &
\det(x \mathbf{I} - \mathbf{E} \mathbf{M} \mathbf{E}^{-1}) \\
& \det(\mathbf{E} (x \mathbf{I}) \mathbf{E}^{-1} - \mathbf{E} \mathbf{M}
\mathbf{E}^{-1}) \\
& \det(\mathbf{E} (x \mathbf{I} - \mathbf{M})
\mathbf{E}^{-1}) \\
& \det(\mathbf{E}) \det(x \mathbf{I} - \mathbf{M})
\det(\mathbf{E}^{-1}) \\
& \det(\mathbf{E}) \det(x \mathbf{I} - \mathbf{M})
\det (\mathbf{E})^{-1} \\
& \det(x \mathbf{I} - \mathbf{M}).
\end{array} \]
\bigskip
\item[Step 2:] The characteristic polynomial $c(x) = p_{n+1}(x) \in \mathbb{Z}_p[x]$ of the upper Hessenberg form can be
efficiently computed bottom up using $O(n^2)$ operations in $\mathbb{Z}_p[x]$ ($O(n^3)$ operations in
$\mathbb{Z}_p$)
from the following recurrence for $p_k(x).$
\[ p_{k+1}(x) =
\cases{ 1 & $k=0$ \cr
(x-m_{k,k})\,p_k(x) - \sum\limits_{i=1}^{k-1} (\prod\limits_{j=i}^{k-1} m_{j+1,j})\, m_{i,k}\, p_i(x) & $1 \le k \le n$ }
\]
\noindent The algorithm below computes the above recurrence bottom
up, and clearly shows that $O(n^2)$ operations in $\mathbb{Z}_p[x]$
are required; a C sketch of both steps is given after this
description.

\noindent {\bf Input:} Matrix $\mathbf{M} \in \mathbb{Z}_p^{n \times n}$ in upper Hessenberg form\\
{\bf Output:} Characteristic polynomial $c(x) \in \mathbb{Z}_p[x]$ of $\mathbf{M}$
\begin{tabbing}
000\=000\=000\=000\=\kill
$p_1(x) \gets 1$ \\
{\bf for} $k=1$ {\bf to} $n$ {\bf do} \\
\>$p_{k+1}(x) \gets (x-m_{k,k})\,p_k(x)$ \\
\>$t \gets 1$ \\
\>{\bf for} $i=1$ {\bf to} $k-1$ {\bf do} \\
\>\>$t \gets t\,m_{k-i+1,k-i}$ \\
\>\>$p_{k+1}(x) \gets p_{k+1}(x) - t\,m_{k-i,k}\,p_{k-i}(x)$ \\
\>{\bf end for} \\
{\bf end for} \\
{\bf output} $p_{n+1}(x)$ \\
\end{tabbing}
\end{description}
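\bigskip
\noindent To make the two steps concrete, here is a hedged C sketch
of the reduction and the recurrence for a single machine prime $p <
2^{31}$. It is a minimal illustration under our own conventions
(dense row-major storage, entries reduced to $[0,p)$, inverses by
Fermat's little theorem, the names {\tt hessenberg} and
{\tt charpoly} of our choosing); it is not the tuned external code
described in section 3.
\begin{verbatim}
#include <stdint.h>
#include <stdlib.h>

#define M(i,j) m[(i)*(size_t)n + (j)]

/* a^(p-2) mod p = a^(-1) mod p, for p prime and a != 0 mod p */
static uint64_t inv_mod(uint64_t a, uint64_t p) {
    uint64_t r = 1, e = p - 2;
    for (; e; e >>= 1, a = a * a % p)
        if (e & 1) r = r * a % p;
    return r;
}

/* Step 1: reduce m to upper Hessenberg form by similarity
   transformations, O(n^3) operations in Z_p */
void hessenberg(uint64_t *m, int n, uint64_t p) {
    for (int j = 0; j < n - 2; j++) {
        int i = j + 1;                 /* search for a nonzero pivot */
        while (i < n && M(i,j) == 0) i++;
        if (i == n) continue;          /* column already in shape */
        if (i > j + 1) {               /* swap R_i <-> R_{j+1} ... */
            for (int k = 0; k < n; k++) {
                uint64_t t = M(i,k); M(i,k) = M(j+1,k); M(j+1,k) = t;
            }
            for (int k = 0; k < n; k++) {  /* ... and C_i <-> C_{j+1} */
                uint64_t t = M(k,i); M(k,i) = M(k,j+1); M(k,j+1) = t;
            }
        }
        uint64_t piv = inv_mod(M(j+1,j), p);
        for (int k = j + 2; k < n; k++) {
            uint64_t u = M(k,j) * piv % p;
            for (int l = 0; l < n; l++)    /* R_k <- R_k - u R_{j+1} */
                M(k,l) = (M(k,l) + (p - u) * M(j+1,l)) % p;
            for (int l = 0; l < n; l++)    /* C_{j+1} <- C_{j+1} + u C_k */
                M(l,j+1) = (M(l,j+1) + u * M(l,k)) % p;
        }
    }
}

/* Step 2: characteristic polynomial of the Hessenberg form from the
   recurrence; on return c[d] is the coefficient of x^d, c[n] = 1 */
void charpoly(const uint64_t *m, int n, uint64_t p, uint64_t *c) {
    /* row k-1 of P holds p_k(x), a monic polynomial of degree k-1 */
    uint64_t *P = calloc((size_t)(n + 1) * (n + 1), sizeof *P);
    P[0] = 1;                                    /* p_1(x) = 1 */
    for (int k = 1; k <= n; k++) {
        const uint64_t *pk = P + (size_t)(k - 1) * (n + 1);
        uint64_t *pk1 = P + (size_t)k * (n + 1);
        for (int d = k; d >= 1; d--)             /* p_{k+1} = x p_k ... */
            pk1[d] = pk[d - 1];
        for (int d = 0; d < k; d++)              /* ... - m_{k,k} p_k */
            pk1[d] = (pk1[d] + (p - M(k-1,k-1)) * pk[d]) % p;
        uint64_t t = 1;
        for (int i = 1; i < k; i++) {
            t = t * M(k-i,k-i-1) % p;            /* prod of m_{j+1,j} */
            uint64_t s = t * M(k-i-1,k-1) % p;   /* t * m_{k-i,k} */
            const uint64_t *pi = P + (size_t)(k-i-1) * (n + 1);
            for (int d = 0; d <= k-i-1; d++)     /* subtract s p_{k-i} */
                pk1[d] = (pk1[d] + (p - s) * pi[d]) % p;
        }
    }
    for (int d = 0; d <= n; d++) c[d] = P[(size_t)n * (n + 1) + d];
    free(P);
}
\end{verbatim}
\noindent Calling {\tt hessenberg} and then {\tt charpoly} on
$\mathbf{A} \bmod p_i$ produces the image $c_i(x)$ required in step
3(b) of the modular algorithm.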
\section{Timings and Implementations}
In order to improve the running time of our algorithm, we have
implemented the Hessenberg algorithm over $\mathbb{Z}_p$ for a
machine prime $p$ in the C programming language, and the rest of the
algorithm in Maple. We used the Maple external function interface to
call the C code (see \cite{maple}). We have implemented both 32-bit
and 64-bit integer versions, and also several versions using 64-bit
double precision floating point values for comparison.

The following table gives some timings (in seconds) of our
modular Hessenberg algorithm for a sparse $364\times364$ input
matrix arising from an application in combinatorics (see
\cite{quaintance}). Rows 1--9 below are for the modular algorithm
using different implementations of arithmetic for $\mathbb{Z}_p.$
The accelerated floating point version {\bf fprem} using 25-bit
primes generally gives the best times.
\begin{center}
\begin{tabular}{|l||*{5}{r @{} l|}}
\hline
Versions &
\multicolumn{2}{l|}{{ Xeon}} &
\multicolumn{2}{l|}{{ Opteron}} &
\multicolumn{2}{l|}{{ AXP2800}} &
\multicolumn{2}{l|}{{ Pentium M}} &
\multicolumn{2}{l|}{{ Pentium 4}} \\
& \multicolumn{2}{l|}{{ 2.8 GHz}} &
\multicolumn{2}{l|}{{ 2.0 GHz}} &
\multicolumn{2}{l|}{{ 2.08 GHz}} &
\multicolumn{2}{l|}{{ 2.00 GHz}} &
\multicolumn{2}{l|}{{ 2.80 GHz}} \\
\hline
\hline
{\bf 64int} & 100&.7 & 107&.4 & & & & & & \\ \hline
{\bf 32int} & 66&.3 & 73&.0 & 76&.8 & 35&.6 & 57&.4 \\ \hline
{\bf new 32int} &49&.7 & 54&.7 & 56&.3 & 25&.5 & 39&.6 \\ \hline
{\bf fmod} & 29&.5 & 32&.1 & 33&.0 & 35&.8 & 81&.1 \\ \hline
{\bf trunc} & 67&.8 & 73&.7 & 69&.6 & 88&.5 & 110&.6 \\ \hline
{\bf modtr} & 56&.3 & 62&.5 & 59&.5 & 81&.0 & 82&.6 \\ \hline
{\bf new fmod} & 11&.0 & 11&.6 & 14&.5 & 15&.2 & 28&.8 \\ \hline
{\bf fprem} & 10&.4 & 10&.9 & 13&.7 & 13&.9 & 26&.8 \\ \hline
{\bf fLA} & 17&.6 & 19&.9 & 21&.9 & 26&.2 & 27&.3 \\ \hline
{\bf Berkowitz} & 2053&.6 & 2262&.6 & & & & & & \\
\hline
\end{tabular}
\end{center}
\bigskip
{\large \bf \noindent Implementations}
\begin{description}
\item[64int] The 64-bit integer version is implemented using the \emph{long long int} datatype in C,
or equivalently the \emph{integer[8]} datatype in Maple. We can use
32-bit primes. All modular arithmetic first executes the
corresponding 64-bit integer machine instruction, then reduces the
result mod $p$ because we work in $\mathbb{Z}_p$. We allow both
positive and negative integers of magnitude less than $p$.
\item[32int] The 32-bit integer version is similar, but implemented using the \emph{long int} datatype in C,
or equivalently the \emph{integer[4]} datatype in Maple. 16-bit
primes are used here.
\item[new 32int] This is an improved {\bf 32int}, with
various hand/compiler optimizations.
\item[fmod] This 64-bit float version is implemented using the \emph{double} datatype in C,
or equivalently the \emph{float[8]} datatype in Maple. 64-bit float
operations are used to simulate integer operations. Operations such
as addition, subtraction and multiplication are followed by a call
to the C library function \emph{fmod()} to reduce the results mod
$p$, since we are working in $\mathbb{Z}_{p}$. We allow both
positive and negative floating point representations of integers
with magnitude less than $p$ (see the sketch following this list).
\item[trunc] This 64-bit float version is similar to above, but uses
the C library function \emph{trunc()} instead of \emph{fmod()}. To
compute $b \gets a \bmod p$, we first compute $c \gets a-p \times
\emph{trunc(} a/p \emph{)}$, then $b \gets c$ if $c \neq \pm p$, $b
\gets 0$ otherwise. The \emph{trunc()} function rounds towards zero
to the nearest integer.
\item[modtr] A modified {\bf trunc} version,
where we do not do the extra check for equality to $\pm p$ at the
end. So to compute $b \gets a \bmod p$, we actually compute $b \gets
a-p \times \emph{trunc(} a/p \emph{)}$, which results in $-p \leq b
\leq p$.
\item[new fmod] An improved {\bf fmod} version, where
we have reduced the number of times \emph{fmod()} is called. In
other words, we reduce the results mod $p$ only when the number of
accumulated arithmetic operations on an entry exceeds a certain
threshold. In order to allow this, we are restricted to use 25-bit
primes. We call this delayed mod acceleration. See the next
subsection.
\item[fprem] Equivalent to the {\bf new fmod} version, but
implemented via direct assembly programming using the {\it fprem}
instruction, removing the function call overhead and making some
further efficiency improvements.
\item[fLA] An improved {\bf trunc} version using
delayed mod acceleration. It is the default used in Maple's
\verb"LA:-Modular" routines.
\end{description}
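\bigskip
\noindent For concreteness, here is what one modular multiplication
looks like in the {\bf fmod}, {\bf trunc} and {\bf modtr} styles.
This is a hedged sketch of ours (the function names are not from the
Maple kernel), with $p < 2^{26}$ so that products of operands of
magnitude less than $p$ are exact in a 53-bit double:
\begin{verbatim}
#include <math.h>

/* fmod style: reduce with the C library fmod(); the result has
   the sign of a*b and magnitude less than p */
double mulmod_fmod(double a, double b, double p) {
    return fmod(a * b, p);
}

/* trunc style: c = t - p*trunc(t/p), then fix up c = +-p, which
   can occur because t/p is rounded before trunc() is applied */
double mulmod_trunc(double a, double b, double p) {
    double t = a * b;
    double c = t - p * trunc(t / p);
    return (c == p || c == -p) ? 0.0 : c;
}

/* modtr style: as trunc, but without the fix up, so the result
   only satisfies -p <= c <= p */
double mulmod_modtr(double a, double b, double p) {
    double t = a * b;
    return t - p * trunc(t / p);
}
\end{verbatim}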
% Allan's stuff
\subsection{Efficiency Considerations}
There are a few considerations for use of floating point for mod $p$
computations. Keeping these in mind, one can implement faster
external code for the algorithms than is possible with the integer
codes, and still have the advantage of using larger primes on 32-bit
machines.
\begin{enumerate}
\item Although floating point computations can represent
integers up to $53$ bits exactly, we restrict the modulus to $p <
2^{25}$. This allows for more efficient mod operations, and lets
multiple arithmetic operations (up to 8) occur before we have to
reduce modulo $p$. We call this the delayed mod acceleration (see
the sketch following this list).
\item Leveraging the smaller primes allows up to 8 computations
(using a maximal size prime) to occur before we must perform a
$\bmod$. This can be efficiently utilized in row-based algorithms,
as a counter associated with each row can count the number of
operations performed, and the algorithms can be made to only perform
the mod once the maximal number of computations is reached.
\item Floating point computations have a number of ways in which
a mod can be performed, including but not limited to: subtracting
from the number $p$ times the truncated product of the number and a
precomputed $1/p$, using the floating point mod instruction
({\it fmod} or {\it fprem}), using {\it trunc}, etc.
\end{enumerate}
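\bigskip
\noindent As an illustration of items 1 and 2, here is a hedged C
sketch of an inner product over $\mathbb{Z}_p$ with the delayed mod
(a minimal version of ours, assuming the entries are already reduced
to $[0,p)$). With $p < 2^{25}$ each product is below $2^{50}$, so
eight products plus one reduced carry still fit exactly under the
$2^{53}$ integer limit of a double:
\begin{verbatim}
#include <math.h>

double dot_mod_delayed(const double *x, const double *y,
                       int n, double p) {
    double acc = 0.0;
    int pending = 0;            /* products since the last fmod() */
    for (int i = 0; i < n; i++) {
        acc += x[i] * y[i];     /* exact: acc stays below 2^53 */
        if (++pending == 8) {   /* reduce before exactness is lost */
            acc = fmod(acc, p);
            pending = 0;
        }
    }
    return fmod(acc, p);
}
\end{verbatim}
\noindent In a row operation such as $\mathbf{R}_k \gets \mathbf{R}_k
- u \mathbf{R}_{j+1}$ the same idea applies, with one counter per
row as described in item 2.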
\noindent In our experiments we found the following: use of the
smaller primes and the delayed mod described in items $1$ and $2$
above increased performance by a factor of $2$--$3$.
With these modifications, use of floating point modular arithmetic
generally demonstrated better performance than integer modular
arithmetic.
The use of the C-library {\it fmod} function or direct assembly
programming using the {\it fprem} instruction (essentially
equivalent modulo function call overhead and some efficiency
improvements made available for our specific use of {\it fmod})
showed better performance than the other floating point schemes,
except on the Pentium 4, on which it was approximately equal. Note
also that on the Pentium M, {\it fprem} was nearly a factor of 2
faster.
% end Allan's stuff
\subsection{Timings for Dense Matrices}
The following table gives some timings (in seconds) of our
modular Hessenberg algorithm using the float ({\bf fprem}) and
integer ({\bf new 32int}) implementations on dense $n \times n$
matrices with uniformly random integer entries between $-$999 and
999. We also compare with Maple's Berkowitz algorithm. The timings
were done on a dual 2.2 GHz Opteron machine running Unix.
\begin{center}
\begin{tabular}{|l||*{4}{r @{} l|}}
\hline
n &
\multicolumn{2}{l|}{{ float}} &
\multicolumn{2}{l|}{{ integer}} &
\multicolumn{2}{l|}{{ Berkowitz}} \\
\hline \hline
50 & $<$0&.1 & 0&.13 & 7&.85 \\ \hline
100 & 0&.61 & 1&.85 & 128&.6 \\ \hline
200 & 8&.82 & 30&.8 & 2248&.1 \\ \hline
300 & 45&.4 & 153&.2 & & \\ \hline
400 & 173&.5 & 493&.4 & & \\ \hline
600 & 1195&.2 & 2973&.1 & & \\ \hline
800 & 4968&.8 & & & & \\ \hline
\end{tabular}
\end{center}
\noindent Note that the Berkowitz algorithm is slower on a dense
$200 \times 200$ integer matrix than on the sparse $364 \times 364$
matrix of the previous subsection; this is a consequence of the cost
of the large integer arithmetic. As the timings show, Maple's
Berkowitz algorithm is much slower than both modular
implementations.

From the above data we can see that the float version is
consistently about 3 times faster than the integer version.
Therefore, in practice, we would always use the float version.
\section{Improvements to the Basic Algorithm}
A method to improve the running time further would be to use the
early termination technique described below. Consider the following
matrix
\[ \mathbf{A} = \left( \begin{array}{cccc}
1 & u & v & w \\
0 & 2 & x & y \\
0 & 0 & 3 & z \\
0 & 0 & 0 & 4 \\
\end{array} \right). \]
The characteristic polynomial of $\mathbf{A}$ is $c(x) =
(x-1)(x-2)(x-3)(x-4) = x^4-10x^3+35x^2-50x+24.$ Notice that the
largest coefficient of $c(x)$ does not depend on any of the entries
$u, v, w, x, y, z.$ So if $u, v, w, x, y, z$ are large, then the
bound $S$ for the largest coefficient of $c(x)$ would be arbitrarily
far off. Our modular algorithm would use too many primes and would
do too many unnecessary modular computations. This observation
suggests that we use an output-sensitive version of the algorithm
and not use a bound at all. We will incrementally apply the Chinese
remainder theorem to reconstruct $c(x)$ and stop the Chinese
remaindering once $\zeta$ consecutive modular images ``remain the
same''.

Let $p_1, p_2,\ldots, p_s, p_{s+1}, \ldots, p_{s+\zeta}$ be machine
primes, $c_i(x) \equiv c(x) \pmod{p_i}$, and $C_i(x) \equiv c(x)
\pmod{p_1 \cdots p_i}$. Application of the Chinese remainder theorem
allows us to construct $C_i(x)$ from $c_i(x)$ and $C_{i-1}(x)$. This
is an incremental version of Garner's algorithm. Now suppose that
$C_s(x) = C_{s+1}(x) = \cdots = C_{s+\zeta}(x),$ then there is a high
probability that $c(x) = C_s(x).$ Choosing $\zeta$ carefully will
ensure that the probability of premature termination of the Chinese
remaindering is low.
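Here is a hedged C sketch of one step of the incremental Chinese
remaindering for a single coefficient, using the GNU GMP library
(the function and the stability test are our own illustration;
Maple's internal Garner routine differs):
\begin{verbatim}
#include <gmp.h>

/* On entry C is the image of a coefficient mod m; fold in the new
   image c mod p.  On exit C is the image mod m*p and m has been
   multiplied by p.  Returns 1 if the image was stable, i.e.
   unchanged as an element of the symmetric range, else 0. */
int garner_step(mpz_t C, mpz_t m, unsigned long c, unsigned long p) {
    mpz_t P, t;
    mpz_init_set_ui(P, p);
    mpz_init(t);
    mpz_invert(t, m, P);                      /* t = m^(-1) mod p */
    unsigned long r = mpz_fdiv_ui(C, p);      /* r = C mod p */
    unsigned long d = (c >= r) ? c - r : c + p - r;
    mpz_mul_ui(t, t, d);
    unsigned long digit = mpz_fdiv_ui(t, p);  /* mixed-radix digit */
    mpz_addmul_ui(C, m, digit);               /* now C = c mod p */
    mpz_mul_ui(m, m, p);
    mpz_clear(P); mpz_clear(t);
    /* digit 0: C is unchanged; digit p-1: C is unchanged in the
       symmetric range (a negative coefficient of c(x)) */
    return digit == 0 || digit == p - 1;
}
\end{verbatim}
\noindent The remaindering stops once every coefficient of the
$C_i(x)$ reports $\zeta$ consecutive stable steps.
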
This output-sensitive probabilistic version of the modular algorithm
is much faster than the deterministic version when the largest
coefficient of the characteristic polynomial is much smaller than
the bound. On the sparse $364 \times 364$ example in the previous
section, the timing improves by about 30\%.
\subsection{Strongly Connected Components}
For the example above, the Hessenberg algorithm does not need to do
any work since $\mathbf{A}$ is already in upper Hessenberg form. Now
consider the matrix $\mathbf{A}^T$, i.e.,
\[ \mathbf{A}^T = \left( \begin{array}{cccc}
1 & 0 & 0 & 0 \\
u & 2 & 0 & 0 \\
v & x & 3 & 0 \\
w & y & z & 4 \\
\end{array} \right) \]
which has the same characteristic polynomial. The algorithm would be
required to do some actual reductions. This observation lead us to
ask the following question: For what matrix structures can we
compute the characteristic polynomial quickly? Consider a matrix of
the form
\[ \mathbf{B} = \left( \begin{array}{cccc}
a & b & w & x \\
c & d & y & z \\
0 & 0 & e & f \\
0 & 0 & g & h \\
\end{array} \right). \]
Let $c_\mathbf{B}(x)$ be the characteristic polynomial of
$\mathbf{B}$. Then $c_\mathbf{B}(x) = c_{\mathbf{B}_1}(x)
c_{\mathbf{B}_2}(x)$ where
\[ \mathbf{B}_1 = \left( \begin{array}{cc}
a & b \\
c & d \\
\end{array} \right)
~{\rm and}~
\mathbf{B}_2 = \left( \begin{array}{cc}
e & f \\
g & h \\
\end{array} \right).
\]
If the input matrix is block upper (lower) triangular, then the
computation of the characteristic polynomial is quick, since the
characteristic polynomial is just the product of the characteristic
polynomials of the diagonal blocks. But often matrices are not block
upper (lower) triangular, but are row and column permutations of
block upper (lower) triangular matrices. For example,
\[ \mathbf{P} = \left( \begin{array}{cccc}
h & 0 & g & 0 \\
z & d & y & c \\
f & 0 & e & 0 \\
x & b & w & a \\
\end{array} \right) \]
is the matrix $\mathbf{B}$ with rows 1,4 and columns 1,4
interchanged, and is no longer block upper triangular. The main
observation here is that simultaneous row and column interchanges do
not modify the characteristic polynomial (the arguments of section
2.1 also work here), so that if the rows and columns of a matrix are
permuted by the same permutation, then the characteristic polynomial
remains unchanged.
In order to apply the above observation, we need to efficiently
compute the permutation that would make the matrix block upper
triangular. It turns out that we can compute this permutation in
linear time by finding the strongly connected components of a
directed graph.
Let $\mathbf{A}$ be an $n \times n$ matrix. We denote by
$Graph(\mathbf{A})$ the weighted directed graph with $n$ vertices
such that $\mathbf{A}$ is the adjacency matrix of
$Graph(\mathbf{A})$. Recall that a directed graph $G$ is strongly
connected if for each pair of vertices $u, v \in G, u \ne v$, there
exists a path $u \leadsto v.$ Also, every directed graph can be
partitioned into maximal strongly connected components. Note that
permuting the vertices corresponds to permuting the rows and columns
of $\mathbf{A}.$
Denote by $\mathbf{A}_{(u_1,u_2,...,u_r),(v_1,v_2,...,v_s)}$ the $r
\times s$ submatrix of $\mathbf{A}$ such that
\[(\mathbf{A}_{(u_1,u_2,...,u_r),(v_1,v_2,...,v_s)})_{i,j} =
\mathbf{A}_{u_i,v_j}.\] The method works as follows:
\begin{enumerate}
\item Compute (see below) the $k$ strongly connected components of
$Graph(\mathbf{A}): V_1, V_2, ..., V_k$ where $V_i =
\{{v_i}_1,{v_i}_2,...,{v_i}_{n_i}\}.$
\item For $1 \le i \le k$, compute the characteristic
polynomial of the submatrix
$\mathbf{A}_{({v_i}_1,{v_i}_2,...,{v_i}_{n_i}),({v_i}_1,{v_i}_2,...,{v_i}_{n_i})}.
$ We denote these submatrices as $\mathbf{A}_{V_i,V_i}.$
\item Output the product of the characteristic polynomials computed
in step 2; this is the characteristic polynomial $c(x)$ of $\mathbf{A}$.
\end{enumerate}
To see why the above method works let the $k$ strongly connected
components of $G = Graph(\mathbf{A})$ be $V_1, ..., V_k$. We can
topologically sort the $k$ strongly connected components such that
$V_i \prec V_j$ implies there does not exist $u \in V_i, v \in V_j$
such that $(v,u) \in G$. Without loss of generality assume $V_1
\prec V_2 \prec ... \prec V_k.$ Consider relabeling the vertices in
increasing topological order (which is not unique) and consider the
adjacency matrix $\mathbf{A}'$ that represents the relabeled graph.
First, it is clear that $\mathbf{A}'$ has the same characteristic
polynomial as $\mathbf{A}$ since $\mathbf{A}'$ is obtained by
permuting rows and columns of $\mathbf{A}$. Note that the $(i,j)$'th
blocks of $\mathbf{A}'$, denoted by $\mathbf{A}'_{i,j}$ are
precisely the submatrices $\mathbf{A}_{V_i,V_j}$. Therefore, the
diagonal $\mathbf{A}'_{i,i}$'s are precisely the
$\mathbf{A}_{V_i,V_i}$'s. Furthermore, the relabeling guarantees
that $\mathbf{A}'_{i,j} = \mathbf{0}$ for $i > j$, that is,
$\mathbf{A}'$ is block upper triangular, and hence $c(x) =
\prod_{i=1}^k c_{\mathbf{A}_{V_i,V_i}}(x).$

The strongly connected components are found in linear time with
Tarjan's algorithm. Here is the Maple code.
\begin{verbatim}
# NB: the head of this listing, down to the inner while loop, was
# corrupted and has been reconstructed along Tarjan's algorithm.
SCCsplit := proc(M::Matrix)
local StrongConnect,A,n,m,u,j,Stack,Number,Lowlink,OnStack;
global i,k,top,SCC;
  StrongConnect := proc(v) local w,t,j;
  global i,k,top,SCC;
    i := i+1;
    Number[v] := i;  Lowlink[v] := i;
    top := top+1;  Stack[top] := v;  OnStack[v] := true;
    for w in A[v] do
      if Number[w] = 0 then          # tree edge: recurse
        StrongConnect(w);
        Lowlink[v] := min(Lowlink[v],Lowlink[w])
      elif OnStack[w] then           # back or cross edge
        Lowlink[v] := min(Lowlink[v],Number[w])
      end if
    end do;
    if Lowlink[v] = Number[v] then   # v is the root of a component
      t := top;
      while t>0 and Number[Stack[t]] >= Number[v] do
        OnStack[Stack[t]] := false;
        t := t-1;
      end do;
      SCC[k] := [seq(Stack[j],j=t+1..top)];
      top := t;
      k := k+1
    end if
  end proc:
  n,m := op(1,M);
  A := [seq([seq(`if`(M[i,j]=0,NULL,j),j=1..n)],i=1..n)];
  i := 0;
  k := 1;
  Stack := Array(1..n);
  top := 0;
  Number := Array(1..n);
  Lowlink := Array(1..n);
  OnStack := Array(1..n,fill=false);
  for u to n do
    if Number[u] = 0 then StrongConnect(u) end if
  end do;
  [seq(`if`(nops(SCC[i])=1 and M[op(SCC[i]),op(SCC[i])]=0,
            NULL, M[SCC[i],SCC[i]]),i=1..k-1)]
end proc:
\end{verbatim}
The output is a list of non-zero square matrices
$\mathbf{A}_1,\mathbf{A}_2,...,\mathbf{A}_r$. Let $m=\sum_{i=1}^r
{\rm dim}(\mathbf{A}_i).$ If the input $\mathbf{M}$ is an $n \times
n$ matrix, the output satisfies
\[c_{\mathbf{M}}(x) = x^{n-m} \prod_{i=1}^r c_{\mathbf{A}_i}(x)\]
where $c_{\mathbf{A}}(x)$ is the characteristic polynomial of
$\mathbf{A}$.
Running this on the sparse $364 \times 364$ matrix yields 12 blocks
of sizes $\{5,9,10,22,48,54,76,93\}.$ It takes 0.25 seconds. Note
that the line
\begin{center}
\verb"A:=[seq([seq(`if`(M[i,j]=0,NULL,j),j=1..n)],i=1..n)]"
\end{center}
took 0.20 out of a total of 0.25 seconds. Therefore if the matrix is
sparse, one needs to implement more efficient procedures for
extracting the column or row indices of the nonzero entries of a
matrix.
In total, the two improvements mentioned above have reduced the time
for computing the characteristic polynomial for the sparse $364
\times 364$ example to under 0.5 seconds, compared to 10.4 seconds
without the improvements!
\section{Asymptotic Comparison of the Methods}
Let $\mathbf{A}$ be an $n \times n$ matrix of integers. To compare
the running times of the Berkowitz algorithm and the modular
algorithm, we suppose that the entries of $\mathbf{A}$ are bounded
by $B^m$ in magnitude, that is, they are $m$ base $B$ digits in
length. For both algorithms, we need a bound $S$ on the size of the
coefficients of the characteristic polynomial $c(x)$. A generic
bound on the size of the determinant of $\mathbf{A}$ is sufficient
since this is the largest coefficient of $c(x)$. The magnitude of
the determinant of $\mathbf{A}$ is bounded by $S = n! B^{mn}$ and
its length is bounded by $n \log_B n + m n$ base $B$ digits. If $B >
2^{15}$ then we may assume $\log_B n < 2$ in practice and hence the
length of the determinant is $O(m n)$ base $B$ digits.
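Written out, the number of base $B$ digits is
\[ \log_B S = \log_B \left( n!\, B^{mn} \right) = \log_B n! + mn
\le n \log_B n + mn, \]
and the first term is at most $2n$ once $\log_B n < 2$.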
In Berkowitz's algorithm, the $O(n^4)$ integer multiplications are
on integers of average size $O(m n)$ digits in length, hence the
complexity (assuming classical integer arithmetic is used) is $O(n^4
(mn)^2)$. Since Maple 9 uses the FFT for integer multiplication and
division, the complexity is reduced to $\tilde{O}(n^5 m)$.
In the modular algorithm, we will need $O(mn)$ machine primes. The
cost of reducing the $n^2$ integers in $\mathbf{A}$ modulo one prime
is $O(m n^2)$. The cost of computing the characteristic polynomial
modulo each prime $p$ is $O(n^3)$. The cost of the Chinese
remaindering assuming a classical method for the Chinese remainder
algorithm (which is what Maple uses) is $O(n (mn)^2)$. Thus the
total complexity is $O(mn \cdot mn^2 + mn \cdot n^3 + n (mn)^2) =
O(m^2 n^3 + m n^4)$.

If we assume $m=O(n),$ that is, the size of the integers grows
proportionally with the size of the matrix, the complexities of the
Berkowitz algorithm and the modular algorithm are $\tilde{O}(n^6)$
and $O(n^5)$ respectively.
If we have $|A_{i,j}|