%-------------------------------------------------------------------------------
% Computer Methods in Applied Mechanics and Engineering
@string{CMAME={Computer Methods Appl. Mech. Eng.}
}
% Applicable Algebra in Engineering, Communication, and Computing
@string{AAECC={Applic. Algebra in Eng. Comm. and Comput.}
}
% BIT Numerical Mathematics
@string{BIT={BIT Numer. Math.}
}
% ACM Transactions on Mathematical Software:
@string{TOMS={{ACM} Trans. Math. Softw.}
}
% Communications of the ACM
@string{CACM={Commun. ACM}
}
% Journal of the ACM
@string{JACM={J. ACM}
}
% ACM SIGNUM Newsletter
@string{SIGNUM={ACM SIGNUM Newsletter}
}
% SIAM Journal on Matrix Analysis and Applications:
@string{SIMAX={{SIAM} J. Matrix Anal. Appl.}
}
% SIAM Journal on Numerical Analysis:
@string{SINUM={{SIAM} J. Numer. Anal.}
}
% SIAM Review
@string{SIREV={{SIAM} Review}
}
% SIAM Journal on Scientific Computing
% SIAM Journal on Scientific and Statistical Computing
@string{SISC={{SIAM} J. Sci. Comput.}
}
% IMA Journal of Applied Mathematics
@string{IMAJAM={{IMA} J. Appl. Math.}
}
% IMA Journal of Numerical Analysis
@string{IMAJNA={{IMA} J. Numer. Anal.}
}
% International Journal of Supercomputer Applications:
@string{IJSA={Intl. J. Supercomp. Appl.} }
@string{IJSAHPC={Intl. J. Supercomp. Appl. High Perf. Comput.}
}
% International Journal of Numerical Methods in Engineering:
@string{IJNME={Intl. J. Numer. Methods Eng.}
}
% Communications in Numerical Methods in Engineering
@string{CNME={Comm. Numer. Methods Eng.}
}
% SIAM Journal on Algebraic and Discrete Methods:
@string{SIAMJADM={{SIAM} J. Alg. Disc. Meth.}
}
% SIAM Journal on Discrete Mathematics:
@string{SIAMJDM={{SIAM} J. Disc. Math.}
}
% SIAM Journal on Applied Mathematics:
@string{SIAMJAM={{SIAM} J. Appl. Math.}
}
% SIAM Journal on Computing
@string{SICOMP={{SIAM} J. Comput.}
}
% Parallel Computing
@string{PC={Parallel Computing}
}
% International Journal of High Speed Computing
@string{IJHSC={Intl. J. High Speed Computing}
}
% Linear Algebra and its Applications
@string{LAA={Linear Algebra Appl.}
}
% Journal of Parallel and Distributed Computing:
@string{JPDC={J. Parallel Distrib. Comput.}
}
% Numerical Linear Algebra with Applications:
@string{NLAA={Numer. Linear Algebra Appl.}
}
% Mathematics of Computation
@string{MATHCOMP={Math. Comp.}
}
% Mathematical Programming
@string{MATHPROG={Math. Program.}
}
% Numerische Mathematik:
@string{NUMERMATH={Numer. Math.}
}
% IEEE Transactions on Computers:
@string{IEEETC={IEEE Trans. Comput.}
}
% IEEE Trans. on Computer-Aided Design of Integrated Circuits and Systems
@string{IEEETCAD={IEEE Trans. Computer-Aided Design Integ. Circ. Sys.}
}
% IEEE Transactions on Circuit Theory
@string{IEEETCT={IEEE Trans. Circuit Theory}
}
% IEEE Transactions on Parallel and Distributed Systems:
@string{IEEETPDS={IEEE Trans. Parallel Distributed Systems}
}
% IEEE Trans. Circuits and Systems
@string{IEEETCS={IEEE Trans. Circuits and Systems}
}
% J. of Computational and Applied Mathematics
@string{JCAM={J. Comput. Appl. Math.}
}
% J. SIAM
@string{JSIAM={J. {SIAM}}
}
% SIAM Journal on Computing
@string{SIAMCOMP={{SIAM} J. Comput.}
}
% International Journal of Parallel Programming
% index at http://ftp.math.utah.edu/pub/tex/bib/toc/intjparallelprogram.html
@string{IJPP={Intl. J. Parallel Programming}
}
% Congressus Numerantium
@string{CONGNUM={Cong. Numer.}
}
% Future Generation Computer Systems
@string{FGCS={Future Generation Comp. Sys.}
}
% Concurrency: Practice and Experience
@string{CPE={Concurrency: Pract. Exp.}
}
% Management Science
@string{MSCI={Management Sci.}
}
% Electronic Transactions on Numerical Analysis
@string{ETNA={Electronic Trans. on Numerical Analysis}
}
% Proceedings of the IEEE
@string{PROCIEEE={Proc. IEEE}
}
% The Computer Journal
@string{CJ={The Computer Journal}
}
% International Journal Computer Mathematics
@string{IJCM={Intl. J. Computer Mathematics}
}
% Computer Physics Communications
@string{CPC={Computer Physics Comm.}
}
% Theoretical Computer Science
@string{TCS={Theoretical Comp. Sci.}
}
% Annals of Operations Research
@string{AOR={Annals of Oper. Res.}
}
% Computers and Structures
@string{CAS={Computers and Structures}
}
% Computing
@string{COMP={Computing}
}
% Advances in Computer Methods for Partial Differential Equations
@string{ACMPDE={Advances in Computer Methods for Partial Differential Equations}
}
% Algorithms
@string{ALGO={Algorithms}
}
% Bell System Tech. J.
@string{BELL={Bell System Tech. J.}
}
% Can. J. Math.
@string{CANMATH={Can. J. Math.}
}
% Computer Physics Reports
@string{CPR={Computer Physics Reports}
}
% Computers and Mathematics with Applications
@string{CMA={Computers and Mathematics with Applications}
}
% Computers in Chem. Eng.
@string{CCE={Computers in Chem. Eng.}
}
% Czechoslovak Math J.
@string{CZECH={Czechoslovak Math J.}
}
% Electr. Eng.
@string{EE={Electr. Eng.}
}
% IEEE Trans. Magnetics
@string{IEEETM={IEEE Trans. Magnetics}
}
% IEEE Trans. Power Apparatus and Systems
@string{IEEETPAS={IEEE Trans. Power Apparatus and Systems}
}
% IEEE Trans. Power Systems
@string{IEEETPS={IEEE Trans. Power Systems}
}
% IMA Preprint Series
@string{IMAPREPRINT={IMA Preprint Series}
}
% J. Physics: Conference Series
@string{JPHYS={J. Physics: Conference Series}
}
% J. of Scientific Computing
@string{JSC={J. of Scientific Computing}
}
% Journal of Systems Architecture
@string{JSA={J. Systems Architecture}
}
% ORSA J. on Computing
@string{ORSA={ORSA J. on Computing}
}
% Parallel Processing Letters
@string{PARALETTERS={Parallel Processing Letters}
}
% Proceedings in Applied Mathematics and Mechanics
@string{PAMM={Proc. Applied Math. Mech.}
}
% The Journal of Supercomputing
@string{JSUPER={J. Supercomputing}
}
% VLSI Design
@string{VLSI={VLSI Design}
}
% IEEE Transactions on Very Large Scale Integration (VLSI) Systems
@string{IEEETVLSI={IEEE Trans. VLSI Sys.}
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%A ---------------------------------------------------------------------------
%got
@techreport{Adlers98,
author={Adlers, M.},
title={Computing sparse orthogonal factors in {{MATLAB}}},
institution={Dept. of Mathematics, Link{\"o}ping University},
year={1998},
number={LiTH-MAT-R-1998-19},
address={Link{\"o}ping, Sweden},
month=apr
}
%got
@inproceedings{AgrawalKleinRavi93,
author={Agrawal, A. and Klein, P. and Ravi, R.},
title={Cutting down on fill using nested dissection: provably good elimination orderings},
pages={31--55},
booktitle={Graph Theory and Sparse Matrix Computation},
series={IMA Volumes in Applied Mathematics},
year={1993},
publisher={Springer-Verlag},
address={New York},
editor={George, A. and Gilbert, J. R. and Liu, J. W. H.},
volume={56},
}
%got
@inproceedings{Agullo13,
author={Agullo, E. and Amestoy, P. R. and Buttari, A. and Guermouche, A. and Joslin, G. and L'Excellent, J.-Y. and Li, X. S. and Napov, A. and Rouet, F.-H. and Sid-Lakhdar, W. M. and Wang, S. and Weisbecker, C. and Yamazaki, I.},
title={Recent Advances in Sparse Direct Solvers},
booktitle={Proc. 22nd Conference on Structural Mechanics in Reactor Technology},
address={San Francisco},
month=aug,
year={2013},
}
%got
@techreport{AgulloButtariGuermoucheLopez14,
author={Agullo, E. and Buttari, A. and Guermouche, A. and Lopez, F.},
title={Implementing multifrontal sparse solvers for multicore architectures
with Sequential Task Flow runtime systems},
institution={Institut de Recherche en Informatique de Toulouse (IRIT)},
year={2014},
number={IRI/RT--2014-03--FR},
note={to appear in ACM Transactions on Mathematical Software}
}
%got
@article{AgulloGuermoucheLExcellent08,
author={Agullo, E. and Guermouche, A. and L'Excellent, J.-Y.},
title={A parallel out-of-core multifrontal method: storage of factors on disk and analysis of models for an out-of-core active memory},
journal=PC,
volume={34},
number={6-8},
pages={296--317},
year={2008},
}
%got
@article{AgulloGuermoucheLExcellent10,
author={Agullo, E. and Guermouche, A. and L'Excellent, J.-Y.},
title={Reducing the {I/O} Volume in Sparse Out-of-core Multifrontal Methods},
journal=SISC,
volume={31},
number={6},
pages={4774--4794},
year={2010},
url={ http://dx.doi.org/10.1137/080720061 },
}
%got
@article{Alaghband89,
author={Alaghband, G.},
title={Parallel pivoting combined with parallel reduction and fill-in control},
journal=PC,
year={1989},
volume={11},
pages={201--221},
}
%got
@article{Alaghband95,
author={Alaghband, G.},
title={Parallel sparse matrix solution and performance},
journal=PC,
volume={21},
number={9},
pages={1407--1430},
year={1995},
url={ http://dx.doi.org/10.1016/0167-8191(95)00029-N },
keywords={Sparse linear system},
keywords={LU decomposition},
keywords={Shared memory multiprocessor},
keywords={Parallel pivoting strategy},
keywords={Back substitution},
abstract={A parallel solution to the large sparse systems of linear
equations is presented. The solution method is based on a parallel
pivoting technique for LU decomposition on a shared memory MIMD
multiprocessor. At each application of the algorithm to the matrix
several pivots for reducing the matrix in parallel are generated.
During parallel pivoting steps only symmetric permutations are
possible. Unsymmetric permutation for numerical stability however is
possible during single pivoting steps. We will report on switching
between parallel and single pivoting steps to assure numerical
stability. Once the matrix is decomposed, the parallel pivoting
information is used to solve structurally identical matrices
repeatedly. The algorithms, their implementation, and the performance
of the solution methods on actual multiprocessors are presented. Based
on the resulting triangular matrix structure, two algorithms for back
substitution are presented and their performance is compared.},
}
%got
@article{AlaghbandJordan89,
author={Alaghband, G. and Jordan, H. F.},
year={1989},
title={Sparse {Gaussian} Elimination with Controlled Fill-in on a Shared Memory Multiprocessor},
journal=IEEETC,
volume={38},
pages={1539--1557},
}
%got
@incollection{AlvaradoPothenSchreiber93,
author={Alvarado, F. L. and Pothen, A. and Schreiber, R.},
title={Highly Parallel Sparse Triangular Solution},
pages={141--158},
annote={was techreport AlvaradoPothenSchreiber92, Univ Waterloo, CS-92-51},
booktitle={Graph Theory and Sparse Matrix Computation},
series={IMA Volumes in Applied Mathematics},
year={1993},
publisher={Springer-Verlag},
address={New York},
editor={George, A. and Gilbert, J. R. and Liu, J. W. H.},
volume={56},
}
%got
@article{AlvaradoSchreiber93,
author={Alvarado, F. L. and Schreiber, R.},
title={Optimal Parallel Solution of Sparse Triangular Systems},
journal=SISC,
volume={14},
number={2},
pages={446--460},
year={1993},
url={ http://dx.doi.org/10.1137/0914027 },
}
%got
@article{AlvaradoYuBetancourt90,
author={Alvarado, F. L. and Yu, D. C. and Betancourt, R.},
journal=IEEETPS,
title={Partitioned sparse {$A^{-1}$} methods},
year={1990},
volume={5},
number={2},
pages={452--459},
abstract={The classic Ax=b problem in the partitioning of sparse vector
matrices is solved by constructing factored components of the inverses
of L and U, the triangular factors of matrix A. The number of
additional fill-ins in the partitioned inverses of L and U can be made
zero. The number of partitions is related to the path length of sparse
vector methods. Allowing some fill-in in the partitioned inverses of L
and U results in fewer partitions. Ordering algorithms most suitable
for sparsity preservation in the inverses of L and U require addition
fill-in in L and U themselves. Tests on practical power system matrices
with from 118 to 1993 nodes indicate that the proposed approach is
competitive in serial environments, and appears more suitable for
parallel environments. Because sparse vectors are not required, the
approach works not only for short-circuit calculations but also for
power flow and stability computations},
keywords={load flow;matrix algebra;power systems;stability;power
flow;power system;short-circuit calculations;sparse vector matrices
partitioning;stability computations;Equations;Parallel
processing;Partitioning algorithms;Power systems;Sparse
matrices;Topology},
url={ http://dx.doi.org/10.1109/59.54552 },
}
%got
@article{AmestoyAshcraftBoiteauButtariLExcellentWeisbecker15,
author={Amestoy, P. R. and Ashcraft, C. C. and Boiteau, O. and Buttari, A. and L'Excellent, J.-Y. and Weisbecker, C.},
title={Improving multifrontal methods by means of block low-rank representations},
year={2015},
journal=SISC,
volume={37},
number={3},
pages={A1451--A1474},
}
%got
@article{Amestoy13,
author={Amestoy, P. R. and Buttari, A. and Joslin, G. and L'Excellent, J.-Y. and Sid-Lakhdar, W. M. and Weisbecker, C. and Forzan, M. and Pozza, C. and Perrin, R. and Pellissier, V.},
title={Shared memory parallelism and low-rank approximation techniques applied to direct solvers in {FEM} simulation},
journal=IEEETM,
year={2013},
}
%got
@article{AmestoyDavisDuff96,
author={Amestoy, P. R. and Davis, T. A. and Duff, I. S.},
title={An approximate minimum degree ordering algorithm},
journal=SIMAX,
year={1996},
volume={17},
number={4},
pages={886--905},
}
%got
@article{AmestoyDavisDuff04,
author={Amestoy, P. R. and Davis, T. A. and Duff, I. S.},
title={Algorithm 837: {AMD}, an approximate minimum degree ordering algorithm},
journal=TOMS,
volume={30},
number={3},
month=sep,
year={2004},
pages={381--388},
url={ http://dx.doi.org/10.1145/1024074.1024081 },
abstract={AMD is a set of routines that implements the approximate
minimum degree ordering algorithm to permute sparse matrices prior to
numerical factorization. There are versions written in both C and
Fortran 77. A MATLAB interface is included.},
}
%got
@incollection{AmestoyDaydeDuff89,
author={Amestoy, P. R. and Dayd\'{e}, M. J. and Duff, I. S.},
year={1989},
title={Use of Level-3 {BLAS} Kernels in the Solution of Full and Sparse Linear Equations},
editor={Delhaye, J.-L. and Gelenbe, E.},
booktitle={High Performance Computing},
address={Amsterdam},
publisher={North-Holland},
pages={19--31},
annote={f},
keywords={multifrontal}
}
%got
@article{AmestoyDuff89,
author={Amestoy, P. R. and Duff, I. S.},
year={1989},
title={Vectorization of a Multiprocessor Multifrontal Code},
journal=IJSA,
volume={3},
number={3},
pages={41--59},
}
%got
@article{AmestoyDuff93,
author={Amestoy, P. R. and Duff, I. S.},
title={Memory management issues in sparse multifrontal methods on multiprocessors},
journal=IJSA,
year={1993},
volume={7},
number={1},
pages={64--82},
}
%got
@article{AmestoyDuffGuermoucheSlavova10,
author={Amestoy, P. R. and Duff, I. S. and Guermouche, A. and Slavova, Tz.},
title={Analysis of the solution phase of a parallel multifrontal solver},
journal=PC,
volume={36},
pages={3--15},
year={2010},
}
%got
@article{AmestoyDuffLExcellent00,
author={Amestoy, P. R. and Duff, I. S. and L'Excellent, J.-Y.},
title={Multifrontal parallel distributed symmetric and unsymmetric solvers},
journal=CMAME,
volume={184},
pages={501--520},
year={2000},
}
%got
@article{AmestoyDuffLExcellentKoster01,
author={Amestoy, P. R. and Duff, I. S. and L'Excellent, J.-Y. and Koster, J.},
title={A fully asynchronous multifrontal solver using distributed dynamic scheduling},
journal=SIMAX,
volume=23,
number=1,
year={2001},
pages={15--41},
}
%got
@article{AmestoyDuffLExcellentLi01,
author={Amestoy, P. R. and Duff, I. S. and L'Excellent, J.-Y. and Li, X. S.},
title={Analysis and Comparison of Two General Sparse Solvers for Distributed Memory Computers},
journal=TOMS,
volume={27},
number={4},
pages={388--421},
month=dec,
year={2001},
url={ http://dx.doi.org/10.1145/504210.504212 },
abstract={This paper provides a comprehensive study and comparison of
two state-of-the-art direct solvers for large sparse sets of linear
equations on large-scale distributed-memory computers. One is a
multifrontal solver called MUMPS, the other is a supernodal solver
called superLU. We describe the main algorithmic features of the two
solvers and compare their performance characteristics with respect to
uniprocessor speed, interprocessor communication, and memory
requirements. For both solvers, preorderings for numerical stability
and sparsity play an important role in achieving high parallel
efficiency. We analyse the results with various ordering algorithms.
Our performance analysis is based on data obtained from runs on a
512-processor Cray T3E using a set of matrices from real applications.
We also use regular 3D grid problems to study the scalability of the
two solvers.},
}
%got
@article{AmestoyDuffLExcellentLi03,
author={Amestoy, P. R. and Duff, I. S. and L'Excellent, J.-Y. and Li, X. S.},
title={Impact of the implementation of {MPI} point-to-point communications on the performance of two general sparse solvers},
journal=PC,
volume={29},
number={7},
pages={833--947},
year={2003},
}
%got
@article{AmestoyDuffLExcellentRobertRouetUcar12,
author={Amestoy, P. R. and Duff, I. S. and L'Excellent, J.-Y. and Robert, Y. and Rouet, F. H. and U\c{c}ar, B.},
title={On computing inverse entries of a sparse matrix in an out-of-core environment},
journal=SISC,
volume={34},
number={4},
pages={1975--1999},
year={2012},
annote={INPT-IRIT technical report RT-APO-10-06},
}
%got
@article{AmestoyDuffLExcellentRouet15,
author={Amestoy, P. R. and Duff, I. S. and L'Excellent, J.-Y. and Rouet, F. H.},
title={Parallel computation of entries of {$A^{-1}$}},
year={2015},
journal=SISC,
volume={37},
number={2},
pages={C268--C284},
}
%got
@article{AmestoyDuffPraletVoemel03,
author={Amestoy, P. R. and Duff, I. S. and Pralet, S. and V\"omel, C.},
title={Adapting a parallel sparse direct solver to architectures with clusters of {SMP}s},
journal=PC,
volume={29},
number={11--12},
pages={1645--1668},
year={2003},
annote={Parallel and distributed scientific and engineering computing},
url={ http://dx.doi.org/10.1016/j.parco.2003.05.010 },
keywords={Sparse linear systems},
keywords={MUMPS},
keywords={Distributed memory algorithms},
keywords={Task scheduling},
keywords={Dynamic scheduling}
}
%got
@article{AmestoyDuffPuglisi96,
author={Amestoy, P. R. and Duff, I. S. and Puglisi, C.},
title={Multifrontal {QR} factorization in a multiprocessor environment},
journal=NLAA,
year={1996},
volume={3},
number={4},
pages={275--300},
url={ http://dx.doi.org/10.1002/(SICI)1099-1506(199607/08)3:4<275::AID-NLA83>3.0.CO;2-7 },
abstract={We describe the design and implementation of a parallel QR
decomposition algorithm for a large sparse matrix A. The algorithm is
based on the multifrontal approach and makes use of Householder
transformations. The tasks are distributed among processors according
to an assembly tree which is built from the symbolic factorization of
the matrix $A^TA$. We first address uniprocessor issues and then discuss
the multiprocessor implementation of the method. We consider the
parallelization of both the factorization phase and the solve phase. We
use relaxation of the sparsity structure of both the original matrix
and the frontal matrices to improve the performance. We show that, in
this case, the use of Level 3 BLAS can lead to very significant gains
in performance. We use the eight processor Alliant FX/80 at CERFACS to
illustrate our discussion.}
}
%got
@article{AmestoyDuffVoemel04,
author={Amestoy, P. R. and Duff, I. S. and V\"omel, C.},
title={Task Scheduling in an Asynchronous Distributed Memory Multifrontal Solver},
journal=SIMAX,
volume={26},
number={2},
pages={544--565},
year={2004},
url={ http://dx.doi.org/10.1137/S0895479802419877 },
}
%got
@article{AmestoyGuermoucheLExcellentPralet06,
author={Amestoy, P. R. and Guermouche, A. and L'Excellent, J.-Y. and Pralet, S.},
title={Hybrid scheduling for the parallel solution of linear systems},
journal=PC,
volume={32},
number={2},
pages={136--156},
year={2006},
annote={Parallel Matrix Algorithms and Applications (PMAA'04)},
url={ http://dx.doi.org/10.1016/j.parco.2005.07.004 },
keywords={Sparse matrices},
keywords={Parallel multifrontal method},
keywords={Dynamic scheduling},
keywords={Memory}
}
%got
@inproceedings{AmestoyLExcellentRouetSidLakhdar14,
author={Amestoy, P. R. and L'Excellent, J.-Y. and Rouet, F.-H. and Sid-Lakhdar, W. M.},
title={Modeling {1D} distributed-memory dense kernels for an asynchronous multifrontal sparse solver},
booktitle={Proc. High-Performance Computing for Computational Science, {VECPAR} 2014},
address={Eugene, Oregon, USA},
year={2014},
}
%got
@inproceedings{AmestoyLExcellentSidLakhdar14,
author={Amestoy, P. R. and L'Excellent, J.-Y. and Sid-Lakhdar, W. M.},
title={Characterizing asynchronous broadcast trees for multifrontal factorizations},
booktitle={Proc. SIAM Workshop on Combinatorial Scientific Computing (CSC14)},
address={Lyon, France},
month=jul,
pages={51--53},
year={2014},
}
%got
@article{AmestoyLiNg07,
author={Amestoy, P. R. and Li, X. S. and Ng, E.},
title={Diagonal {Markowitz} Scheme with Local Symmetrization},
journal=SIMAX,
volume={29},
number={1},
pages={228--244},
year={2007},
url={ http://dx.doi.org/10.1137/050637315 },
}
%got
@article{AmestoyLiPralet07,
author={Amestoy, P. R. and Li, X. S. and Pralet, S.},
title={Unsymmetric Ordering Using A Constrained {Markowitz} Scheme},
journal=SIMAX,
volume={29},
number={1},
pages={302--327},
year={2007},
url={ http://dx.doi.org/10.1137/050622547 },
}
%got
@article{AmestoyPuglisi02,
author={Amestoy, P. R. and Puglisi, C.},
title={An unsymmetrized multifrontal {LU} factorization},
journal=SIMAX,
year={2002},
volume={24},
pages={553--569},
url={ http://dx.doi.org/10.1137/S0895479800375370 },
}
%got
@article{AmitHall81,
author={Amit, R. and Hall, C.},
month=feb,
year={1981},
title={Storage Requirements for Profile and Frontal Elimination},
journal=SINUM,
volume={19},
number={1},
pages={205--218},
}
%got (Tim owns hardcopy of book)
@book{LAPACK,
author={Anderson, E. and Bai, Z. and Bischof, C. H. and Blackford, S.
and Demmel, J. W. and Dongarra, J. J. and {Du Croz}, J. and
Greenbaum, A. and Hammarling, S. and McKenney, A. and Sorensen, D. C.},
title={{LAPACK} Users' Guide},
publisher={SIAM},
year={1999},
address={Philadelphia, PA},
edition={3rd},
note={ http://www.netlib.org/lapack/lug/ }
}
%got
@article{AndersonSaad89,
author={Anderson, E. and Saad, Y.},
title={Solving sparse triangular linear systems on parallel computers},
journal=IJHSC,
volume={01},
number={01},
pages={73--95},
year={1989},
url={ http://dx.doi.org/10.1142/S0129053389000056 },
}
%got
@article{ArioliDemmelDuff89,
author={Arioli, M. and Demmel, J. W. and Duff, I. S.},
title={Solving sparse linear systems with sparse backward error},
journal=SIMAX,
year={1989},
volume={10},
number={2},
pages={165--190},
}
%got
@article{ArioliDuffDeRijk89,
author={Arioli, M. and Duff, I. S. and {de Rijk}, P. P. M.},
year={1989},
title={On the augmented systems approach to sparse least-squares problems},
journal=NUMERMATH,
volume={55},
pages={667--684},
}
%got
@article{ArioliDuffGouldReid90,
author={Arioli, M. and Duff, I. S. and Gould, N. I. M. and Reid, J. K.},
title={Use of the {P4} and {P5} algorithms for in-core factorization of sparse matrices},
journal=SISC,
year={1990},
volume={11},
pages={913--927},
}
%got
@article{ArnoldParrDewe83,
author={Arnold, C. P. and Parr, M. I. and Dewe, M. B.},
year={1983},
title={An Efficient Parallel Algorithm for the Solution of Large Sparse Linear Matrix Equations},
journal=IEEETC,
volume={C-32},
number={3},
pages={265-272},
}
%%ASHCRAFT --------------------------------------------------------------------
%got (Tim has hardcopy)
@techreport{Ashcraft87b,
author={Ashcraft, C. C.},
title={A vector implementation of the multifrontal method for large sparse, symmetric positive definite systems},
institution={Boeing Computer Services},
year={1987},
address={Seattle, WA},
number={ETA-TR-51},
}
%got
@incollection{Ashcraft93,
author={Ashcraft, C. C.},
title={The fan-both family of column-based distributed {Cholesky} factorization algorithms},
pages={159--190},
booktitle={Graph Theory and Sparse Matrix Computation},
series={IMA Volumes in Applied Mathematics},
year={1993},
publisher={Springer-Verlag},
address={New York},
editor={George, A. and Gilbert, J. R. and Liu, J. W. H.},
volume={56},
}
%got
@article{Ashcraft95,
author={Ashcraft, C. C.},
title={Compressed graphs and the minimum degree algorithm},
journal=SISC,
pages={1404--1411},
volume=16,
year={1995},
}
%got
@article{AshcraftEisenstatLiu90,
author={Ashcraft, C. C. and Eisenstat, S. C. and Liu, J. W. H.},
title={A fan-in algorithm for distributed sparse numerical factorization},
journal=SISC,
year={1990},
volume={11},
number={3},
pages={593--599},
url={ http://dx.doi.org/10.1137/0911033 },
}
%got
@techreport{AshcraftEisenstatLiuSherman90,
author={Ashcraft, C. C. and Eisenstat, S. C. and Liu, J. W. H. and Sherman, A. H.},
title={A Comparison of Three Column-based Distributed Sparse Factorization Schemes},
institution={Yale University},
number={YALEU/DCS/RR-810},
year={1990},
address={New Haven, CT},
}
%got
@article{AshcraftGrimes89,
author={Ashcraft, C. C. and Grimes, R. G.},
title={The influence of relaxed supernode partitions on the multifrontal method},
journal=TOMS,
year={1989},
volume={15},
number={4},
pages={291--309},
}
%got
@inproceedings{AshcraftGrimes99,
author={Ashcraft, C. C. and Grimes, R. G.},
title={{SPOOLES:} An object-oriented sparse matrix library},
booktitle={Proc. 1999 SIAM Conf. Parallel Processing for Scientific Computing},
year={1999},
month=mar,
note={ http://www.netlib.org/linalg/spooles }
}
%got
@article{AshcraftGrimesLewis98,
author={Ashcraft, C. C. and Grimes, R. G. and Lewis, J. G.},
title={Accurate symmetric indefinite linear equation solvers},
journal=SIMAX,
year={1998},
volume={20},
number={2},
pages={513--561},
}
%got
@article{AshcraftGrimesLewisPeytonSimon87,
author={Ashcraft, C. C. and Grimes, R. G. and Lewis, J. G. and Peyton, B. W. and Simon, H. D.},
year={1987},
title={Progress in Sparse Matrix Methods for Large Linear Systems on Vector Supercomputers},
journal=IJSA,
volume={1},
number={4},
pages={10--30},
}
%got
@article{AshcraftLiu97,
author={Ashcraft, C. C. and Liu, J. W. H.},
title={Using domain decomposition to find graph bisectors},
journal=BIT,
year={1997},
volume={37},
pages={506--534},
keywords={block Kernighan-Lin}
}
%got
@article{AshcraftLiu98b,
author={Ashcraft, C. C. and Liu, J. W. H.},
title={Applications of the {Dulmage}-{Mendelsohn} decomposition and network flow to graph bisection improvement},
journal=SIMAX,
year={1998},
volume={19},
number={2},
pages={325--354},
}
%got
@article{AshcraftLiu98,
author={Ashcraft, C. C. and Liu, J. W. H.},
title={Robust Ordering of Sparse Matrices using Multisection},
journal=SIMAX,
year={1998},
volume={19},
number={3},
pages={816--832},
url={ http://dx.doi.org/10.1137/S0895479896299081 },
}
%%A continued -----------------------------------------------------------------
%got
@article{AvronShklarskiToledo08,
author={Avron, H. and Shklarski, G. and Toledo, S.},
title={Parallel unsymmetric-pattern multifrontal sparse {LU} with column preordering},
journal=TOMS,
volume={34},
number={2},
year={2008},
pages={1--31},
url={ http://dx.doi.org/10.1145/1326548.1326550 },
publisher={ACM},
address={New York, NY, USA},
}
%got
@article{AykanatCambazogluUcar08,
author={Aykanat, C. and Cambazoglu, B. B. and U\c{c}ar, B.},
title={Multi-level direct K-way hypergraph partitioning with multiple constraints and fixed vertices},
journal=JPDC,
volume={68},
number={5},
pages={609--625},
year={2008},
url={ http://dx.doi.org/10.1016/j.jpdc.2007.09.006 },
abstract={K-way hypergraph partitioning has an ever-growing use in
parallelization of scientific computing applications. We claim that
hypergraph partitioning with multiple constraints and fixed vertices
should be implemented using direct K-way refinement, instead of the
widely adopted recursive bisection paradigm. Our arguments are based on
the fact that recursive-bisection-based partitioning algorithms perform
considerably worse when used in the multiple constraint and fixed
vertex formulations. We discuss possible reasons for this performance
degradation. We describe a careful implementation of a multi-level
direct K-way hypergraph partitioning algorithm, which performs better
than a well-known recursive-bisection-based partitioning algorithm in
hypergraph partitioning with multiple constraints and fixed vertices.
We also experimentally show that the proposed algorithm is effective in
standard hypergraph partitioning.}
}
%got
@article{AykanatPinarCatalyurek04,
author={Aykanat, C. and Pinar, A. and \c{C}ataly\"{u}rek, \"U. V.},
title={Permuting Sparse Rectangular Matrices into Block-Diagonal Form},
journal=SISC,
volume={25},
number={6},
pages={1860--1879},
year={2004},
url={ http://dx.doi.org/10.1137/S1064827502401953 },
abstract={We investigate the problem of permuting a sparse rectangular
matrix into block-diagonal form. Block-diagonal form of a matrix grants
an inherent parallelism for solving the deriving problem, as recently
investigated in the context of mathematical programming, LU
factorization, and QR factorization. To represent the nonzero structure
of a matrix, we propose bipartite graph and hypergraph models that
reduce the permutation problem to those of graph partitioning by vertex
separator and hypergraph partitioning, respectively. Our experiments on
a wide range of matrices, using the state-of-the-art graph and
hypergraph partitioning tools MeTiS and PaToH, revealed that the
proposed methods yield very effective solutions both in terms of
solution quality and runtime. }
}
%got
@inproceedings{AzadHalappanavarRajamanickamBomanKhanPothen12,
author={Azad, A. and Halappanavar, M. and Rajamanickam, S. and Boman, E. and Khan, A. and Pothen, A.},
title={Multithreaded algorithms for maximum matching in bipartite graphs},
booktitle={Proc. of 26th IPDPS},
pages={860--872},
year={2012},
}
%%B ---------------------------------------------------------------------------
%got
@article{BankRose90,
author={Bank, R. E. and Rose, D. J.},
title={On the complexity of sparse {Gaussian} elimination via bordering},
year={1990},
journal=SISC,
volume={11},
number={1},
pages={145--160},
}
%got
@article{BankSmith87,
author={Bank, R. E. and Smith, R. K.},
year={1987},
title={General Sparse Elimination Requires No Permanent Integer Storage},
journal=SISC,
volume={8},
number={4},
pages={574--584},
url={ http://www.netlib.org/linalg/bsmp.f }
}
%got
@article{BarnardPothenSimon95,
author={Barnard, S. T. and Pothen, A. and Simon, H. D.},
title={A spectral algorithm for envelope reduction of sparse matrices},
journal=NLAA,
year={1995},
volume={2},
pages={317--334},
url={ http://dx.doi.org/10.1002/nla.1680020402 },
abstract={The problem of reordering a sparse symmetric matrix to
reduce its envelope size is considered. A new spectral algorithm for
computing an envelope-reducing reordering is obtained by associating a
Laplacian matrix with the given matrix and then sorting the components
of a specified eigenvector of the Laplacian. This Laplacian eigenvector
solves a continuous relaxation of a discrete problem related to
envelope minimization called the minimum 2-sum problem. The permutation
vector computed by the spectral algorithm is a closest permutation
vector to the specified Laplacian eigenvector. Numerical results show
that the new reordering algorithm usually computes smaller envelope sizes
than those obtained from the current standards such as the
Gibbs-Poole-Stockmeyer (GPS) algorithm or the reverse Cuthill-McKee
(RCM) algorithm in SPARSPAK, in some cases reducing the envelope by
more than a factor of two. }
}
%got
@article{BennerMontryWeigand87,
author={Benner, R. E. and Montry, G. R. and Weigand, G. G.},
title={Concurrent multifrontal methods: shared memory, cache, and frontwidth issues},
journal=IJSA,
year={1987},
volume={1},
number={3},
pages={26--44},
}
%GET
@article{Berge57,
author={Berge, C.},
title={Two theorems in graph theory},
journal={Proceedings of the National Academy of Sciences of the United States of America},
volume={43},
number={9},
pages={842--844},
year={1957},
publisher={National Academy of Sciences}
}
%got
@article{BermanSchnitger90,
author={Berman, P. and Schnitger, G.},
title={On the Performance of the Minimum Degree Ordering for {Gaussian} Elimination},
journal=SIMAX,
year={1990},
volume={11},
number={1},
pages={83--88},
}
%got
@article{BerryDahlhausHeggernesSimonet08,
author={Berry, A. and Dahlhaus, E. and Heggernes, P. and Simonet, G.},
title={Sequential and parallel triangulating algorithms for Elimination Game and new insights on Minimum Degree },
journal=TCS,
volume={409},
number={3},
pages={601--616},
year={2008},
url={ http://dx.doi.org/10.1016/j.tcs.2008.09.059 },
keywords={Minimum degree},
keywords={Minimal triangulation},
keywords={Chordal graphs},
keywords={Parallel and sequential algorithms },
abstract={Elimination Game is a well-known algorithm that simulates
Gaussian elimination of matrices on graphs, and it computes a
triangulation of the input graph. The number of fill edges in the
computed triangulation is highly dependent on the order in which
Elimination Game processes the vertices, and in general the produced
triangulations are neither minimum nor minimal. In order to obtain a
triangulation which is close to minimum, the Minimum Degree heuristic
is widely used in practice, but until now little was known on the
theoretical mechanisms involved. In this paper we show some interesting
properties of Elimination Game; in particular that it is able to
compute a partial minimal triangulation of the input graph regardless
of the order in which the vertices are processed. This results in a new
algorithm to compute minimal triangulations that are sandwiched between
the input graph and the triangulation resulting from Elimination Game.
One of the strengths of the new approach is that it is easily
parallelizable, and thus we are able to present the first parallel
algorithm to compute such sandwiched minimal triangulations. In
addition, the insight that we gain through Elimination Game is used to
partly explain the good behavior of the Minimum Degree algorithm. We
also give a new algorithm for producing minimal triangulations that is
able to use the minimum degree idea to a wider extent. }
}
%got
@article{Berry71,
author={Berry, R. D.},
month=jan,
year={1971},
title={An Optimal Ordering of Electronic Circuit Equations for a Sparse Matrix Solution},
journal=IEEETCT,
volume={CT-18},
number={1},
pages={40--50},
}
%got
@article{BhatHabaskiLiuNguyenPeeters93,
author={Bhat, M. V. and Habashi, W. G. and Liu, J. W. H. and Nguyen, V. N. and Peeters, M. F.},
title={A note on nested dissection for rectangular grids},
journal=SIMAX,
year={1993},
volume={14},
number={1},
pages={253--258},
}
%got (Tim has book from TAMU library)
@incollection{BirkhoffGeorge73,
author={Birkhoff, G. and George, A.},
year={1973},
title={Elimination by Nested Dissection},
editor={Traub, J. F.},
booktitle={Complexity of Sequential and Parallel Numerical Algorithms},
publisher={New York: Academic Press},
pages={221-269},
}
%got
@article{BischofHansen91,
author={Bischof, C. H. and Hansen, P. C.},
title={Structure-Preserving and Rank-Revealing {QR}-Factorizations},
publisher={SIAM},
year={1991},
journal=SISC,
volume={12},
number={6},
pages={1332-1350},
keywords={rank-revealing QR-factorization; rank-deficient problems; numerical rank; incremental condition estimation; sparse matrices},
url={ http://dx.doi.org/10.1137/0912073 },
abstract={The rank-revealing QR-factorization (RRQR factorization) is a
special QR-factorization that is guaranteed to reveal the numerical
rank of the matrix under consideration. This makes the
RRQR-factorization a useful tool in the numerical treatment of many
rank-deficient problems in numerical linear algebra. In this paper, a
framework is presented for the efficient implementation of RRQR
algorithms, in particular, for sparse matrices. A sparse RRQR-algorithm
should seek to preserve the structure and sparsity of the matrix as
much as possible while retaining the ability to capture safely the
numerical rank. To this end, the paper proposes to compute an initial
QR-factorization using a restricted pivoting strategy guarded by
incremental condition estimation (ICE), and then applies the algorithm
suggested by Chan and Foster to this QR-factorization. The column
exchange strategy used in the initial QR factorization will exploit the
fact that certain column exchanges do not change the sparsity
structure, and compute a sparse QR-factorization that is a good
approximation of the sought-after RRQR-factorization. Due to quantities
produced by ICE, the Chan/Foster RRQR algorithm can be implemented very
cheaply, thus verifying that the sought-after RRQR-factorization has
indeed been computed. Experimental results on a model problem show that
the initial QR-factorization is indeed very likely to produce
RRQR-factorization. Fill-in is comparable with other methods, and
little additional effort is required to implement the Chan/Foster
postprocessing step. Compared to alternative strategies, the new
algorithm allows to a greater extent the use of Householder
transformations (instead of Givens rotations), and requires fewer
touches of the data, while requiring not more (and sometimes
substantially fewer) floating-point operations. These characteristics
make the algorithm attractive for sparse problems, and a good candidate
for parallel computers as well.},
}
%got
@article{BischofLewisPierce90,
author={Bischof, C. H. and Lewis, J. G. and Pierce, D. J.},
title={Incremental condition estimation for sparse matrices},
publisher={SIAM},
year={1990},
journal=SIMAX,
volume={11},
pages={644--659},
url={ http://dx.doi.org/10.1137/0611047 },
abstract={Incremental condition estimation provides an estimate for the
smallest singular value of a triangular matrix. In particular, it gives
a running estimate of the smallest singular value of a triangular
factor matrix as the factor is generated one column or row at a time.
An incremental condition estimator for dense matrices was originally
suggested by Bischof. In this paper this scheme is generalized to
handle sparse triangular matrices, especially those that are factors of
sparse matrices. Numerical experiments on a variety of matrices
demonstrate the reliability of this scheme in estimating the smallest
singular value. A partial description of its implementation in a sparse
matrix factorization code further illustrates its practicality.},
}
%got
@article{Bjork84,
author={Bj\"{o}rck, A.},
title={A General Updating Algorithm for Constrained Linear Least Squares Problems},
journal=SISC,
volume={5},
number={2},
pages={394-402},
year={1984},
URL={ http://dx.doi.org/10.1137/0905029 },
abstract={ Linear least squares problems which are sparse except for a
small subset of dense equations can be efficiently solved by an
updating method. Often the least squares solution is also required to
satisfy a set of linear constraints, which again can be divided into
sparse and dense subsets. This paper develops an updating algorithm for
the solution of such problems. The algorithm is completely general in
that no restrictive assumption on the rank of any subset of equations
or constraints is made },
}
%got
@article{Bjork88,
author={Bj\"{o}rck, A.},
title={A direct method for sparse least squares problems with lower and upper bounds},
journal=NUMERMATH,
year={1988},
volume={54},
pages={19-32},
}
%got (Tim owns hardcopy of book)
@book{Bjork96,
author={Bj\"{o}rck, A.},
title={Numerical methods for least squares problems},
publisher={SIAM},
year={1996},
address={Philadelphia, PA}
}
%got
@article{BjorkDuff80,
author={Bj\"{o}rck, A. and Duff, I. S.},
title={A direct method for sparse linear least squares problems},
journal=LAA,
year={1980},
volume={34},
pages={43--67},
}
%got
@article{Bjorstad87,
author={Bj{\o}rstad, P. E.},
month=jul,
year={1987},
title={A Large Scale, Sparse, Secondary Storage, Direct Linear Equation Solver for Structural Analysis and its Implementation on Vector and Parallel Architectures},
journal=PC,
volume={5},
pages={3--12},
keywords={parallel algorithm Cholesky factorization symmetric matrices positive definite matrices}
}
%GET
@book{ScaLAPACK97,
author={Blackford, L. and Choi, J. and Cleary, A. and D'Azevedo, E. and Demmel, J. and Dhillon, I. and Dongarra, J. and Hammarling, S. and Henry, G. and Petitet, A. and Stanley, K. and Walker, D. and Whaley, R.},
title={{ScaLAPACK} Users' Guide},
publisher={Society for Industrial and Applied Mathematics},
year={1997},
url={ http://dx.doi.org/10.1137/1.9780898719642 },
}
%GET
@article{BomanCatalyurekChevalierDevine12,
author={Boman, E. G. and {\c{C}}ataly{\"u}rek, {\"U}. V. and Chevalier, C. and Devine, K. D.},
title={The Zoltan and Isorropia parallel toolkits for combinatorial scientific computing: Partitioning, ordering and coloring},
journal={Scientific Programming},
volume={20},
number={2},
pages={129--150},
year={2012},
publisher={Hindawi Publishing Corporation}
}
%got
@techreport{BomanHendrickson96,
author={Boman, E. G. and Hendrickson, B.},
title={A multilevel algorithm for reducing the envelope of sparse matrices},
institution={Stanford University},
year={1996},
number={SCCM-96-14},
address={Stanford, CA},
}
%got
@article{BrainmanToledo01,
author={Brainman, I. and Toledo, S.},
title={Nested-dissection orderings for sparse {LU} with partial pivoting},
journal=SIMAX,
year={2002},
volume={23},
pages={998-1012},
}
%got
@article{BraytonGustavsonWilloughby70,
author={Brayton, R. K. and Gustavson, F. G. and Willoughby, R. A.},
year={1970},
title={Some Results on Sparse Matrices},
journal=MATHCOMP,
volume={24},
number={112},
pages={937-954},
}
%got (Tim has book from TAMU library)
@incollection{BrownWait81,
author={Brown, N. G. and Wait, R.},
year={1981},
title={A Branching Envelope Reducing Algorithm for Finite Element Meshes},
editor={Duff, I. S.},
booktitle={Sparse Matrices and Their Uses},
publisher={New York: Academic Press},
pages={315-324},
keywords={ordering profile reduction finite element method}
}
%got
@inproceedings{BuiJones93,
author={Bui, T. and Jones, C.},
title={A heuristic for reducing fill in sparse matrix factorization},
booktitle={Proc. 6th SIAM Conf. Parallel Processing for Scientific Computation},
year={1993},
pages={445-452},
organization={SIAM},
url={ http://www.osti.gov/scitech/biblio/54439 },
abstract={We present a heuristic that helps to improve the quality of
the bisection returned by the Kernighan-Lin and greedy graph bisection
algorithms. This in turns helps to reduce the amount of fill-in
produced by separator-based algorithms that reorder a matrix before
factorization. We also describe the performance of our heuristic on
graphs from the Harwell-Boeing collection of sparse matrix test
problems, and compare them with known results by other methods on the
same graphs.},
}
%got (Tim has book from TAMU library)
@incollection{Bunch73,
author={Bunch, J. R.},
year={1973},
title={Complexity of Sparse Elimination},
editor={Traub, J. F.},
booktitle={Complexity of Sequential and Parallel Numerical Algorithms},
publisher={New York: Academic Press},
pages={197--220},
keywords={survey},
}
%GET
@article{Bunch74,
author={Bunch, J. R.},
title={Partial pivoting strategies for symmetric matrices},
journal=SINUM,
volume={11},
pages={521--528},
year={1974},
}
%GET
@article{BunchKaufman77,
author={Bunch, J. R. and Kaufman, L.},
title={Some stable methods for calculating inertia and solving symmetric linear systems},
year={1977},
journal=MATHCOMP,
volume={31},
pages={163--179},
}
%got
@article{Buttari13,
author={Buttari, A.},
title={Fine-grained multithreading for the multifrontal {QR} factorization of sparse matrices},
journal=SISC,
volume={35},
number={4},
pages={C323--C345},
year={2013},
}
%got
@article{Bykat77,
author={Bykat, A.},
title={A note on an element ordering scheme},
journal=IJNME,
year={1977},
volume={11},
number={1},
publisher={John Wiley \& Sons, Ltd},
url={ http://dx.doi.org/10.1002/nme.1620110120 },
pages={194--198},
abstract={This paper contains an algorithm for the ordering of
elements in the finite element method, which can be utilized in the
Non-assembly Method of the author and in the Frontal Solution Method of
B. M. Irons.},
}
%%C ---------------------------------------------------------------------------
%got (Tim has hardcopy)
@inproceedings{Calahan73,
author={Calahan, D. A.},
year={1973},
title={Parallel Solution of Sparse Simultaneous Linear Equations},
booktitle={Proceedings of the 11th Annual Allerton Conference on Circuits and System Theory},
pages={729-735},
}
%got
@article{CardenalDuffJimenez98,
author={Cardenal, J. and Duff, I. S. and Jim\'enez, J. M.},
title={Solution of sparse quasi-square rectangular systems by {Gaussian} elimination},
journal=IMAJNA,
volume={18},
number={2},
pages={165-177},
year={1998},
url={ http://dx.doi.org/10.1093/imanum/18.2.165 },
abstract={We present a general method for the linear least-squares
solution of overdetermined and underdetermined systems. The method is
particularly efficient when the coefficient matrix is quasi-square,
that is when the number of rows and number of columns is almost the
same. The numerical methods for linear least-squares problems and
minimum-norm solutions do not generally take account of this special
characteristic. The proposed method is based on {LU} factorization of
the original quasi-square matrix A, assuming that A has full rank. In
the overdetermined case, the {LU} factors are used to compute a basis
for the null space of $A^T$. The right-hand side vector b is then
projected onto this subspace and the least-squares solution is obtained
from the solution of this reduced problem. In the case of
underdetermined systems, the desired solution is again obtained through
the solution of a reduced system. The use of this method may lead to
important savings in computational time for both dense and sparse
matrices. It is also shown in the paper that, even in cases where the
matrices are quite small, sparse solvers perform better than dense
solvers. Some practical examples that illustrate the use of the method
are included.}
}
%GET
@article{CatalyurekAykanat99,
author={\c{C}ataly\"{u}rek, \"U. V. and Aykanat, C.},
title={Hypergraph-partitioning-based decomposition for parallel sparse-matrix vector multiplication},
journal=IEEETPDS,
volume={10},
number={7},
pages={673--693},
year={1999},
publisher={IEEE}
}
%GET
@inproceedings{CatalyurekAykanat01,
author={\c{C}ataly\"{u}rek, \"U. V. and Aykanat, C.},
title={A fine-grain hypergraph model for {2D} decomposition of sparse matrices},
booktitle={Proc. 15th IEEE Intl. Parallel and Distrib. Proc. Symp: IPDPS '01},
pages={1199--1204},
year={2001},
organization={IEEE}
}
%got
@misc{CatalyurekAykanat11,
author={\c{C}ataly\"{u}rek, \"U. V. and Aykanat, C.},
title={{PaToH}: Partitioning Tool for Hypergraphs},
howpublished={ http://bmi.osu.edu/umit/software.html },
year={2011},
}
%got
@inproceedings{CatalyurekDobrianGebremedhinHalappanavarPothen11,
author={\c{C}ataly\"{u}rek, \"U. V. and Dobrian, F. and Gebremedhin, A. and Halappanavar, M. and Pothen, A.},
title={Distributed memory algorithms for matching and coloring},
booktitle={Proc. of IPDPS (Workshop on Parallel Computing and Optimization)},
year={2011},
}
%got
@article{CatalyurekAykanatKayaaslan11,
author={\c{C}ataly\"urek, \"U. V. and Aykanat, C. and Kayaaslan, E.},
title={Hypergraph Partitioning-Based Fill-Reducing Ordering for Symmetric Matrices},
journal=SISC,
volume={33},
number={4},
pages={1996--2023},
year={2011},
url={ http://dx.doi.org/10.1137/090757575 },
abstract={A typical first step of a direct solver for the linear
system $Mx=b$ is reordering of the symmetric matrix M to improve
execution time and space requirements of the solution process. In this
work, we propose a novel nested-dissection-based ordering approach that
utilizes hypergraph partitioning. Our approach is based on the
formulation of graph partitioning by vertex separator (GPVS) problem as
a hypergraph partitioning problem. This new formulation is immune to
deficiency of GPVS in a multilevel framework and hence enables better
orderings. In matrix terms, our method relies on the existence of a
structural factorization of the input M matrix in the form of $M=AA^T$
(or $M=AD^2A^T$). We show that the partitioning of the row-net
hypergraph representation of the rectangular matrix A induces a GPVS of
the standard graph representation of matrix M. In the absence of such
factorization, we also propose simple, yet effective structural
factorization techniques that are based on finding an edge clique cover
of the standard graph representation of matrix M, and hence applicable
to any arbitrary symmetric matrix M. Our experimental evaluation has
shown that the proposed method achieves better ordering in comparison
to state-of-the-art graph-based ordering tools even for symmetric
matrices where structural $M=AA^T$ factorization is not provided as an
input. For matrices coming from linear programming problems, our method
enables even faster and better orderings. }
}
%got
@article{ChanGeorge80,
author={Chan, W. M. and George, A.},
year={1980},
title={A Linear Time Implementation of the Reverse {Cuthill-McKee} Algorithm},
journal=BIT,
volume={20},
pages={8--14},
}
%got
@inproceedings{ChenMalkowskiKandemirRaghavan05,
author={Chen, G. and Malkowski, K. and Kandemir, M. and Raghavan, P.},
booktitle={Proc. 19th IEEE Parallel and Distributed Processing Symposium},
title={Reducing power with performance constraints for parallel sparse applications},
year={2005},
month=apr,
abstract={Sparse and irregular computations constitute a large
fraction of applications in the data-intensive scientific domain. While
every effort is made to balance the computational workload in such
computations across parallel processors, achieving sustained near
machine-peak performance with close-to-ideal load balanced
computation-to-processor mapping is inherently difficult. As a result,
most of the time, the loads assigned to parallel processors can exhibit
significant variations. While there have been numerous past efforts
that study this imbalance from the performance viewpoint, to our
knowledge, no prior study has considered exploiting the imbalance for
reducing power consumption during execution. Power consumption in
large-scale clusters of workstations is becoming a critical issue as
noted by several recent research papers from both industry and
academia. Focusing on sparse matrix computations in which underlying
parallel computations and data dependencies can be represented by
trees, this paper proposes schemes that save power through
voltage/frequency scaling. Our goal is to reduce overall energy
consumption by scaling the voltages/frequencies of those processors
that are not in the critical path; i.e., our approach is oriented
towards saving power without incurring performance penalties.},
keywords={computational workload balancing; data dependency;
data-intensive scientific domain; frequency scaling; large-scale
workstation clusters; parallel computation; parallel processing;
parallel sparse application; performance constraint; power consumption
reduction; voltage scaling; energy conservation; parallel processing;
performance evaluation; power consumption; resource allocation;
workstation clusters;},
url={ http://dx.doi.org/10.1109/IPDPS.2005.378 },
}
%got
@article{ChenDavisHagerRajamanickam09,
author={Chen, Y. and Davis, T. A. and Hager, W. W. and Rajamanickam, S.},
title={Algorithm 887: {CHOLMOD}, Supernodal Sparse {Cholesky} Factorization and Update/Downdate},
journal=TOMS,
volume={35},
number={3},
year={2008},
pages={1--14},
url={ http://dx.doi.org/10.1145/1391989.1391995 },
publisher={ACM},
address={New York, NY, USA},
abstract={CHOLMOD is a set of routines for factorizing sparse
symmetric positive definite matrices of the form $A$ or $AA^T$,
updating/downdating a sparse Cholesky factorization, solving linear
systems, updating/downdating the solution to the triangular system
$Lx=b$, and many other sparse matrix functions for both symmetric and
unsymmetric matrices. Its supernodal Cholesky factorization relies on
LAPACK and the Level-3 BLAS, and obtains a substantial fraction of the
peak performance of the BLAS. Both real and complex matrices are
supported. CHOLMOD is written in ANSI/ISO C, with both C and
MATLAB\texttrademark interfaces. It appears in MATLAB 7.2 as
\verb'x=A\b' when \verb'A' is sparse symmetric positive definite, as
well as in several other sparse matrix functions. }
}
%got
@article{ChenRenWangYang15,
author={Chen, X. and Ren, L. and Wang, Y. and Yang, H.},
title={{GPU}-accelerated sparse {LU} factorization for circuit simulation with performance modeling},
year={2015},
journal=IEEETPDS,
volume={26},
number={3},
pages={786--795},
}
%got
@article{ChenWangYang13,
author={Chen, X. and Wang, Y. and Yang, H.},
title={{NICSLU}: an adaptive sparse matrix solver for parallel circuit simulation},
year={2013},
journal=IEEETCAD,
volume={32},
number={2},
pages={261--274},
}
%got
@article{ChenTewarson72,
author={Chen, Y. T. and Tewarson, R. P.},
title={On the fill-in when sparse vectors are orthonormalized},
year={1972},
journal=COMP,
volume={9},
number={1},
url={ http://dx.doi.org/10.1007/BF02236376 },
publisher={Springer-Verlag},
pages={53--56},
}
%got
@article{ChenTewarson72b,
author={Chen, Y. T. and Tewarson, R. P.},
title={On the optimal choice of pivots for the {Gaussian} elimination},
year={1972},
journal=COMP,
volume={9},
number={3},
url={ http://dx.doi.org/10.1007/BF02246733 },
publisher={Springer-Verlag},
pages={245--250},
}
%got
@article{Cheng73,
author={Cheng, K. Y.},
title={Minimizing the bandwidth of sparse symmetric matrices},
year={1973},
journal=COMP,
volume={11},
number={2},
url={ http://dx.doi.org/10.1007/BF02252900 },
publisher={Springer-Verlag},
pages={103--110},
}
%got
@article{Cheng73b,
author={Cheng, K. Y.},
title={Note on minimizing the bandwidth of sparse, symmetric matrices},
year={1973},
journal=COMP,
volume={11},
number={1},
url={ http://dx.doi.org/10.1007/BF02239468 },
publisher={Springer-Verlag},
pages={27--30},
}
%got
@article{ChevalierPellegrini08,
author={Chevalier, C. and Pellegrini, F.},
title={{PT-SCOTCH}: a tool for efficient parallel graph ordering},
journal=PC,
year={2008},
volume={34},
number={6-8},
pages={318--331},
url={ http://dx.doi.org/10.1016/j.parco.2007.12.001 },
}
%got
@article{ChuGeorge90,
author={Chu, E. and George, A.},
title={Sparse Orthogonal Decomposition on a Hypercube Multiprocessor},
publisher={SIAM},
year={1990},
journal=SIMAX,
volume={11},
number={3},
pages={453-465},
keywords={sparse matrix; orthogonal decomposition; parallel computation; Givens rotation; row/submatrix merging; hypercube multiprocessors},
url={ http://dx.doi.org/10.1137/0611031 },
}
%got
@techreport{ChuGeorgeLiuNg84,
author={Chu, E. and George, A. and Liu, J. W. H. and Ng, E. G.},
month=nov,
year={1984},
title={{SPARSPAK}: Waterloo Sparse Matrix Package, User's Guide for {SPARSPAK-A}},
institution={Univ. of Waterloo Dept.~of Computer Science},
number={CS-84-36},
keywords={SPARSPAK users guide symmetric},
note={ https://cs.uwaterloo.ca/research/tr/1984/CS-84-36.pdf },
address={Waterloo, Ontario},
}
%got
@article{CliffeDuffScott98,
author={Cliffe, K. A. and Duff, I. S. and Scott, J. A.},
title={Performance issues for frontal schemes on a cache-based high-performance computer},
journal=IJNME,
year={1998},
volume={42},
number={1},
publisher={John Wiley \& Sons, Ltd},
url={ http://dx.doi.org/10.1002/(SICI)1097-0207(19980515)42:1<127::AID-NME357>3.0.CO;2-K },
pages={127--143},
keywords={unsymmetric sparse matrices, frontal solver, direct
methods, finite elements, BLAS, computational kernels},
abstract={We consider the implementation of a frontal code for the
solution of large sparse unsymmetric linear systems on a
high-performance computer where data must be in the cache before
arithmetic operations can be performed on it. In particular, we show
how we can modify the frontal solution algorithm to enhance the
proportion of arithmetic operations performed using Level 3 BLAS thus
enabling better reuse of data in the cache. We illustrate the effects
of this on Silicon Graphics Power Challenge machines using problems
which arise in real engineering and industrial applications.}
}
%got
@article{ColemanEdenbrandtGilbert86,
author={Coleman, T. F. and Edenbrandt, A. and Gilbert, J. R.},
title={Predicting fill for sparse orthogonal factorization},
journal=JACM,
year={1986},
volume={33},
pages={517-532},
}
%got
@article{Collins73,
author={Collins, R. J.},
year={1973},
title={Bandwidth Reduction by Automatic Renumbering},
journal=IJNME,
publisher={John Wiley \& Sons, Ltd},
volume={6},
number={3},
pages={345-356},
url={ http://dx.doi.org/10.1002/nme.1620060306 },
}
%got
@article{Conroy90,
author={Conroy, J. M.},
title={Parallel nested dissection},
journal=PC,
year={1990},
volume={16},
pages={139-156},
}
%got
@article{ConroyKratzerLucasNaiman98,
author={Conroy, J. M. and Kratzer, S. G. and Lucas, R. F. and Naiman, A. E.},
title={Data-parallel sparse {LU} factorization},
journal=SISC,
volume={19},
number={2},
pages={584--604},
year={1998},
annote={was tech report ConroyKratzerLucasNaiman94 RC-TR-94-124},
url={ http://dx.doi.org/10.1137/S1064827594276412 },
abstract={Sparse matrix factorization is a computational bottleneck in
many scientific and engineering problems. This paper examines the
problem of factoring large sparse matrices on data-parallel computers.
A multifrontal approach is presented in which only the fine-grain
concurrency found within the elimination of each supernode is
exploited. Throughput approaching that of large dense matrix
factorizations is demonstrated on two data-parallel systems, the MasPar
MP-2 and the Thinking Machines CM-5.}
}
%got (Tim owns hardcopy of book)
@book{CormenLeisersonRivest90,
author={Cormen, T. H. and Leiserson, C. E. and Rivest, R. L.},
title={Introduction to Algorithms},
publisher={MIT Press},
address={Cambridge, MA},
year={1990},
}
%got
@inproceedings{CozetteGuermoucheUtard04,
title={Adaptive paging for a multifrontal solver},
author={Cozette, O. and Guermouche, A. and Utard, G.},
year={2004},
booktitle={Proc. 18th Intl. Conf. on Supercomputing},
publisher={ACM Press},
pages={267-276},
}
%got
@article{CraneGibbsPooleStockmeyer76,
author={Crane, H. L. and Gibbs, N. E. and Poole, W. G. and Stockmeyer, P. K.},
month=dec,
year={1976},
title={Algorithm 508: Matrix Bandwidth and Profile Reduction},
journal=TOMS,
volume={2},
number={4},
pages={375--377},
}
%got
@article{CurtisReid71,
author={Curtis, A. R. and Reid, J. K.},
year={1971},
title={The Solution of Large Sparse Unsymmetric Systems of Linear Equations},
journal=IMAJAM,
volume={8},
number={3},
pages={344-353},
url={ http://dx.doi.org/10.1093/imamat/8.3.344 },
keywords={unsymmetric matrices}
}
%got
@incollection{Cuthill72,
author={Cuthill, E.},
title={Several Strategies for Reducing the Bandwidth of Matrices},
pages={157--166},
editor={Rose, D. J. and Willoughby, R. A.},
booktitle={Sparse Matrices and Their Applications},
address={New York},
publisher={Plenum Press},
year={1972},
url={ http://link.springer.com/book/10.1007%2F978-1-4615-8675-3 },
}
%got
@inproceedings{CuthillMcKee69,
author={Cuthill, E. and McKee, J.},
title={Reducing the Bandwidth of Sparse Symmetric Matrices},
booktitle={Proc. 24th Conf. of the {ACM}},
publisher={Brandon Press},
address={New Jersey},
year={1969},
pages={157--172},
}
%%D ---------------------------------------------------------------------------
%got
@techreport{DamhaugReid96,
author={Damhaug, A. C. and Reid, J. K.},
title={{MA46}: a {Fortran} code for direct solution of sparse unsymmetric linear systems of equations from finite-element applications},
number={RAL-TR-96-010},
institution={Rutherford Appleton Lab},
year={1996},
address={Oxon, England},
}
%got
@article{DaveDuff87,
author={Dave, A. K. and Duff, I. S.},
year={1987},
title={Sparse Matrix Calculations on the {CRAY-2}},
journal=PC,
volume={5},
pages={55-64},
keywords={CRAY frontal methods parallel algorithms},
annote={Report CSS 197, AERE Harwell, and in Proc. Int. Conf. on Vector
and Parallel Computing, Loen, Norway, June 2-6, 1986},
}
%%DAVIS -----------------------------------------------------------------------
%got
@article{Davis04_algo,
author={Davis, T. A.},
title={Algorithm 832: {UMFPACK V4.3}, an unsymmetric-pattern multifrontal method},
journal=TOMS,
volume={30},
number={2},
month=jun,
year={2004},
pages={196--199},
url={ http://dx.doi.org/10.1145/992200.992206 },
abstract={An ANSI C code for sparse {LU} factorization is
presented that combines a column pre-ordering strategy with a
right-looking unsymmetric-pattern multifrontal numerical factorization.
The pre-ordering and symbolic analysis phase computes an upper bound on
fill-in, work, and memory usage during the subsequent numerical
factorization. User-callable routines are provided for ordering and
analyzing a sparse matrix, computing the numerical factorization,
solving a system with the {LU} factors, transposing and permuting a
sparse matrix, and converting between sparse matrix representations.
The simple user interface shields the user from the details of the
complex sparse factorization data structures by returning simple
handles to opaque objects. Additional user-callable routines are
provided for printing and extracting the contents of these opaque
objects. An even simpler way to use the package is through its MATLAB
interface. UMFPACK is incorporated as a built-in operator in MATLAB 6.5
as {\tt x=A}$\backslash${\tt b} when {\tt A} is sparse and
unsymmetric.},
}
%got
@article{Davis04,
author={Davis, T. A.},
title={A column pre-ordering strategy for the unsymmetric-pattern multifrontal method},
journal=TOMS,
volume={30},
number={2},
month=jun,
year={2004},
pages={165--195},
url={ http://dx.doi.org/10.1145/992200.992205 },
abstract={A new method for sparse {LU} factorization is presented
that combines a column pre-ordering strategy with a right-looking
unsymmetric-pattern multifrontal numerical factorization. The column
ordering is selected to give a good a priori upper bound on fill-in and
then refined during numerical factorization (while preserving the
bound). Pivot rows are selected to maintain numerical stability and to
preserve sparsity. The method analyzes the matrix and automatically
selects one of three pre-ordering and pivoting strategies. The number
of nonzeros in the {LU} factors computed by the method is typically less
than or equal to those found by a wide range of unsymmetric sparse LU
factorization methods, including left-looking methods and prior
multifrontal methods.},
}
%got
@article{Davis05,
author={Davis, T. A.},
title={Algorithm 849: A Concise Sparse {Cholesky} Factorization Package},
journal=TOMS,
volume={31},
number={4},
year={2005},
pages={587--591},
abstract={The LDL software package is a set of short, concise
routines for factorizing symmetric positive-definite sparse matrices,
with some applicability to symmetric indefinite matrices. Its primary
purpose is to illustrate much of the basic theory of sparse matrix
algorithms in as concise a code as possible, including an elegant
method of sparse symmetric factorization that computes the
factorization row-by-row but stores it column-by-column. The entire
symbolic and numeric factorization consists of less than 50 lines of
code. The package is written in C, and includes a MATLAB interface.}
}
%got
@book{Davis06book,
author={Davis, T. A.},
title={Direct Methods for Sparse Linear Systems},
publisher={SIAM},
year={2006},
address={Philadelphia, PA}
}
%got
@article{Davis11a,
author={Davis, T. A.},
title={Algorithm 915: {SuiteSparseQR}, multifrontal multithreaded rank-revealing sparse {QR} factorization},
journal=TOMS,
year={2011},
volume={38},
number={1},
pages={8:1--8:22},
}
%got
@book{Davis11b,
author={Davis, T. A.},
title={{MATLAB} Primer},
publisher={Chapman \& Hall/CRC Press},
year={2011},
address={Boca Raton},
edition={Eighth}
}
%got
@article{Davis13algo,
author={Davis, T. A.},
title={Algorithm 930: {FACTORIZE}, an object-oriented linear system solver for {MATLAB}},
journal=TOMS,
year={2013},
volume={39},
number={4},
pages={28:1--28:18},
}
%got
@article{DavisDavidson88,
author={Davis, T. A. and Davidson, E. S.},
year={1988},
title={Pairwise Reduction for the Direct, Parallel Solution of Sparse Unsymmetric Sets of Linear Equations},
journal=IEEETC,
volume={37},
number={12},
pages={1648--1654},
keywords={PSOLVE pairwise-pivoting parallel unsymmetric sparse vector hardware}
}
%got
@article{DavisDuff97,
author={Davis, T. A. and Duff, I. S.},
title={An unsymmetric-pattern multifrontal method for sparse {LU} factorization},
journal=SIMAX,
year={1997},
volume={18},
number={1},
pages={140--158},
}
%got
@article{DavisDuff99,
author={Davis, T. A. and Duff, I. S.},
title={A combined unifrontal/multifrontal method for unsymmetric sparse matrices},
journal=TOMS,
volume={25},
number={1},
pages={1--20},
month=mar,
year={1999},
url={ http://dx.doi.org/10.1145/305658.287640 },
abstract={We discuss the organization of frontal matrices in
multifrontal methods for the solution of large sparse sets of
unsymmetric linear equations. In the multifrontal method, work on a
frontal matrix can be suspended, the frontal matrix can be stored for
later reuse, and a new frontal matrix can be generated. There are thus
several frontal matrices stored during the factorization, and one or
more of these are assembled (summed) when creating a new frontal
matrix. Although this means that arbitrary sparsity patterns can be
handled efficiently, extra work is required to sum the frontal matrices
together and can be costly because indirect addressing is required. The
(uni)frontal method avoids this extra work by factorizing the matrix
with a single frontal matrix. Rows and columns are added to the frontal
matrix, and pivot rows and columns are removed. Data movement is
simpler, but higher fill-in can result if the matrix cannot be permuted
into a variable-band form with small profile. We consider a combined
unifrontal/multifrontal algorithm to enable general fill-in reduction
orderings to be applied without the data movement of previous
multifrontal approaches. We discuss this technique in the context of a
code designed for the solution of sparse systems with unsymmetric
pattern.},
}
%got
@article{DavisGilbertLarimoreNg04_algo,
author={Davis, T. A. and Gilbert, J. R. and Larimore, S. I. and Ng, E. G.},
title={Algorithm 836: {COLAMD}, a column approximate minimum degree ordering algorithm},
journal=TOMS,
volume={30},
number={3},
month=sep,
year={2004},
pages={377--380},
url={ http://dx.doi.org/10.1145/1024074.1024080 },
abstract={Two codes are discussed, COLAMD and SYMAMD, that
compute approximate minimum degree orderings for sparse matrices in two
contexts: (1) sparse partial pivoting, which requires a sparsity
preserving column pre-ordering prior to numerical factorization, and
(2) sparse Cholesky factorization, which requires a symmetric
permutation of both the rows and columns of the matrix being
factorized. These orderings are computed by COLAMD and SYMAMD,
respectively. The ordering from COLAMD is also suitable for sparse QR
factorization, and the factorization of matrices of the form
$\m{A}\tr\m{A}$ and $\m{AA}\tr$, such as those that arise in
least-squares problems and interior point methods for linear
programming problems. The two routines are available both in MATLAB
and C-callable forms. They appear as built-in routines in MATLAB
Version 6.0.},
}
%got
@article{DavisGilbertLarimoreNg04,
author={Davis, T. A. and Gilbert, J. R. and Larimore, S. I. and Ng, E. G.},
journal=TOMS,
title={A Column Approximate Minimum Degree Ordering Algorithm},
volume={30},
number={3},
month=sep,
year={2004},
pages={353--376},
url={ http://dx.doi.org/10.1145/1024074.1024079 },
abstract={Sparse Gaussian elimination with partial pivoting
computes the factorization $\m{PAQ}=\m{LU}$ of a sparse matrix $\m{A}$,
where the row ordering $\m{P}$ is selected during factorization using
standard partial pivoting with row interchanges. The goal is to select
a column preordering, $\m{Q}$, based solely on the nonzero pattern of
$\m{A}$, that limits the worst-case number of nonzeros in the
factorization. The fill-in also depends on $\m{P}$, but $\m{Q}$ is
selected to reduce an upper bound on the fill-in for any subsequent
choice of $\m{P}$. The choice of $\m{Q}$ can have a dramatic impact on
the number of nonzeros in $\m{L}$ and $\m{U}$. One scheme for
determining a good column ordering for $\m{A}$ is to compute a
symmetric ordering that reduces fill-in in the Cholesky factorization
of $\m{A}\tr\m{A}$. A conventional minimum degree ordering algorithm
would require the sparsity structure of $\m{A}\tr\m{A}$ to be computed,
which can be expensive both in terms of space and time since
$\m{A}\tr\m{A}$ may be much denser than $\m{A}$. An alternative is to
compute $\m{Q}$ directly from the sparsity structure of $\m{A}$; this
strategy is used by MATLAB's COLMMD preordering algorithm. A new
ordering algorithm, COLAMD, is presented. It is based on the same
strategy but uses a better ordering heuristic. COLAMD is faster and
computes better orderings, with fewer nonzeros in the factors of the
matrix.},
annote={refers to Algo 836},
}
%got
@article{DavisHager99,
author={Davis, T. A. and Hager, W. W.},
title={Modifying a sparse {Cholesky} factorization},
journal=SIMAX,
year={1999},
volume={20},
number={3},
pages={606--627},
}
%got
@article{DavisHager01,
author={Davis, T. A. and Hager, W. W.},
title={Multiple-Rank Modifications of a Sparse {Cholesky} Factorization},
journal=SIMAX,
year={2001},
volume={22},
pages={997--1013},
}
%got
@article{DavisHager05,
author={Davis, T. A. and Hager, W. W.},
title={Row modifications of a sparse {Cholesky} factorization},
journal=SIMAX,
year={2005},
volume={26},
number={3},
pages={621--639},
}
%got
@article{DavisHager09,
author={Davis, T. A. and Hager, W. W.},
title={Dynamic Supernodes in Sparse {Cholesky} Update/Downdate and Triangular Solves},
journal=TOMS,
volume={35},
number={4},
year={2009},
pages={1--23},
url={ http://dx.doi.org/10.1145/1462173.1462176 },
publisher={ACM},
address={New York, NY, USA},
abstract={The supernodal method for sparse Cholesky factorization
represents the factor $L$ as a set of supernodes, each consisting of a
contiguous set of columns of $L$ with identical nonzero pattern. A
conventional supernode is stored as a dense submatrix. While this is
suitable for sparse Cholesky factorization where the nonzero pattern of
$L$ does not change, it is not suitable for methods that modify a
sparse Cholesky factorization after a low-rank change to $A$ (an
update/downdate, $\overline{A}=A \pm WW^T$). Supernodes merge and
split apart during an update/downdate. Dynamic supernodes are
introduced, which allow a sparse Cholesky update/downdate to obtain
performance competitive with conventional supernodal methods. A
dynamic supernodal solver is shown to exceed the performance of the
conventional (BLAS-based) supernodal method for solving triangular
systems. These methods are incorporated into CHOLMOD, a sparse
Cholesky factorization and update/downdate package, which forms the
basis of \verb'x=A\b' in MATLAB when \verb'A' is sparse and symmetric
positive definite. }
}
%got
@article{DavisHu11,
author={Davis, T. A. and Hu, Y.},
title={The {University} of {Florida} sparse matrix collection},
journal=TOMS,
volume={38},
number={1},
month=dec,
year={2011},
pages={1:1--1:25},
url={ http://dx.doi.org/10.1145/2049662.2049663 },
publisher={ACM},
address={New York, NY, USA},
keywords={Graph drawing, multilevel algorithms, performance
evaluation, sparse matrices},
}
%got
@article{DavisNatarajan10,
author={Davis, T. A. and {Palamadai Natarajan}, E.},
title={Algorithm 907: {KLU}, A Direct Sparse Solver for Circuit Simulation Problems},
journal=TOMS,
volume={37},
number={3},
year={2010},
month=sep,
pages={36:1--36:17},
url={ http://dx.doi.org/10.1145/1824801.1824814 },
abstract={KLU is a software package for solving sparse
unsymmetric linear systems of equations that arise in circuit
simulation applications. It relies on a permutation to block triangular
form (BTF), several methods for finding a fill-reducing ordering
(variants of approximate minimum degree and nested dissection), and
Gilbert/Peierls' sparse left-looking {LU} factorization algorithm to
factorize each block. The package is written in C and includes a MATLAB
interface. Performance results comparing KLU with SuperLU, Sparse 1.3,
and UMFPACK on circuit simulation matrices are presented. KLU is the
default sparse direct solver in the $\textrm{Xyce}^{TM}$ circuit
simulation package developed by Sandia National Laboratories.},
}
%got
@article{DavisRajamanickamSidLakhdar16,
author={Davis, T. A. and Rajamanickam, S. and Sid-Lakhdar, W. M.},
title={A survey of direct methods for sparse linear systems},
journal={Acta Numerica},
volume={25},
month=may,
year={2016},
pages={383--566},
numpages = {184},
url={ http://dx.doi.org/10.1017/S0962492916000076 },
abstract = {Wilkinson defined a sparse matrix as one with enough zeros
that it pays to take advantage of them. This informal yet practical
definition captures the essence of the goal of direct methods for
solving sparse matrix problems. They exploit the sparsity of a matrix
to solve problems economically: much faster and using far less memory
than if all the entries of a matrix were stored and took part in
explicit computations. These methods form the backbone of a wide range
of problems in computational science. A glimpse of the breadth of
applications relying on sparse solvers can be seen in the origins of
matrices in published matrix benchmark collections (Duff and Reid
1979a, Duff, Grimes and Lewis 1989a, Davis and Hu 2011). The goal of
this survey article is to impart a working knowledge of the underlying
theory and practice of sparse direct methods for solving linear systems
and least-squares problems, and to provide an overview of the
algorithms, data structures, and software available to solve these
problems, so that the reader can both understand the methods and know
how best to use them. }
}
%got
@article{DavisYew90,
author={Davis, T. A. and Yew, P. C.},
title={A nondeterministic parallel algorithm for general unsymmetric sparse {LU} factorization},
journal=SIMAX,
year={1990},
volume={11},
number={3},
pages={383--402},
keywords={D2 algorithm}
}
%%D continued -----------------------------------------------------------------
%got
@incollection{DaydeDuff97,
author={Dayd\'e, M. J. and Duff, I. S.},
title={The use of computational kernels in full and sparse linear solvers, efficient code design on high-performance {RISC} processors},
year={1997},
booktitle={Vector and Parallel Processing - VECPAR'96},
volume={1215},
series={Lecture Notes in Computer Science},
editor={Palma, J. M. L. M. and Dongarra, J.},
url={http://dx.doi.org/10.1007/3-540-62828-2_116},
publisher={Springer Berlin Heidelberg},
pages={108--139},
abstract={We believe that the availability of portable and efficient
serial and parallel numerical libraries that can be used as building
blocks is extremely important for both simplifying application software
development and improving reliability. This is illustrated by
considering the solution of full and sparse linear systems. We describe
successive layers of computational kernels such as the BLAS, the sparse
BLAS, blocked algorithms for factorizing full systems, direct and
iterative methods for sparse linear systems. We also show how the
architecture of the today's powerful RISC processors may influence
efficient code design.}
}
%got
@article{DeSouzaKeuningsWolseyZone94,
author={{De Souza}, C. and Keunings, R. and Wolsey, L. A. and Zone, O.},
title={A New Approach to Minimising the Frontwidth in Finite Element Calculations},
journal=CMAME,
year={1994},
volume={111},
number={3-4},
pages={323--334},
url={ http://dx.doi.org/10.1016/0045-7825(94)90137-6 },
abstract={We propose a new approach to determine the element ordering
that minimises the frontwidth in finite element computations. The
optimisation problem is formulated using graph theoretic concepts. We
develop a divide-and-conquer strategy which defines a series of graph
partitioning subproblems. The latter are tackled by means of three
different heuristics, namely the Kernighan-Lin deterministic technique,
and the non-deterministic Simulated Annealing and Stochastic Evolution
algorithms. Results obtained for various 2D and 3D finite element
meshes, whether structured or non-structured, reveal the superiority of
the proposed approach relative to the standard Cuthill-McKee greedy
algorithms. Relative improvements in frontwidth are in the range
25-50\% in most cases. These figures translate into a significant 2-4
speedup of the finite element solver phase relative to the standard
Cuthill-McKee ordering. The best results are obtained with the
divide-and-conquer variant that uses the Stochastic Evolution
partitioning heuristic. Numerical experiments indicate that the two
non-deterministic variants of our divide-and-conquer approach are
robust with respect to mesh refinement and vary little in solution
quality from one run to another. }
}
%got
@article{DelCorso99,
author={{Del Corso}, G. M. and Manzini, G.},
title={Finding Exact Solutions to the Bandwidth Minimization Problem},
year={1999},
journal=COMP,
volume={62},
number={3},
url={ http://dx.doi.org/10.1007/s006070050002 },
publisher={Springer Verlag},
keywords={sparse matrices, bandwidth minimization},
pages={189--203},
}
%got
@incollection{DembartNeves77,
author={Dembart, B. and Neves, K. W.},
year={1977},
title={Sparse Triangular Factorization on Vector Computers},
editor={Anderson, P. M.},
booktitle={Exploring Applications of Parallel Processing to Power Systems Applications},
address={California},
publisher={Electric Power Research Institute},
pages={57--101},
}
%got (Tim owns hardcopy of book)
@book{Demmel97,
author={Demmel, J. W.},
title={Applied Numerical Linear Algebra},
publisher={SIAM},
address={Philadelphia},
year={1997},
}
%got
@article{DemmelEisenstatGilbertLiLiu99,
author={Demmel, J. W. and Eisenstat, S. C. and Gilbert, J. R. and Li, X. S. and Liu, J. W. H.},
title={A supernodal approach to sparse partial pivoting},
journal=SIMAX,
year={1999},
volume={20},
number={3},
pages={720--755},
url={ http://dx.doi.org/10.1137/S0895479895291765 },
}
%got
@article{DemmelGilbertLi99,
author={Demmel, J. W. and Gilbert, J. R. and Li, X. S.},
title={An asynchronous parallel supernodal algorithm for sparse {Gaussian} elimination},
journal=SIMAX,
year={1999},
volume={20},
number={4},
pages={915--952},
url={ http://dx.doi.org/10.1137/S0895479897317685 },
}
%got
@inproceedings{DevineBomanHeaphyBisselingCatalyurek06,
author={Devine, K. D. and Boman, E. G. and Heaphy, R. T. and Bisseling, R. H. and \c{C}ataly\"{u}rek, \"U. V.},
title={Parallel Hypergraph Partitioning for Scientific Computing},
booktitle={Proc. of 20th International Parallel and Distributed Processing Symposium (IPDPS'06)},
publisher={IEEE},
year={2006},
}
%got
@incollection{DobrianKumfertPothen00,
author={Dobrian, F. and Kumfert, G. K. and Pothen, A.},
title={The design of sparse direct solvers using object oriented techniques},
booktitle={Adv. in Software Tools in Sci. Computing},
editor={Bruaset, A. M. and Langtangen, H. P. and Quak, E.},
publisher={Springer-Verlag},
year={2000},
pages={89--131},
url={ http://www.netlib.org/linalg/oblio },
}
%got
@incollection{DobrianPothen05,
author={Dobrian, F. and Pothen, A.},
title={Oblio: design and performance},
booktitle={State of the Art in Scientific Computing, Lecture Notes in Computer Science},
editor={Dongarra, J. and Madsen, K. and Wasniewski, J.},
volume={3732},
year={2005},
publisher={Springer-Verlag},
pages={758--767},
}
%got
@article{DongarraDuCrozDuffHammarling90,
author={Dongarra, J. J. and {Du Croz}, J. and Duff, I. S. and Hammarling, S.},
title={A set of level-3 basic linear algebra subprograms},
journal=TOMS,
year={1990},
volume={16},
number={1},
pages={1--17},
url={ http://dx.doi.org/10.1145/77626.79170 },
}
%got (Tim owns hardcopy of book)
@book{DongarraDuffSorensenVanDerVorst98,
author={Dongarra, J. J. and Duff, I. S. and Sorensen, D. C. and {Van der Vorst}, H. A.},
title={Numerical Linear Algebra for High-Performance Computers},
publisher={SIAM},
address={Philadelphia},
year={1998},
url={ http://dx.doi.org/10.1137/1.9780898719611 },
annote={see chap 6, direct solution of sparse linear systems},
}
%%DUFF ------------------------------------------------------------------------
%got
@article{Duff74b,
author={Duff, I. S.},
year={1974},
title={On the Number of Nonzeros Added When {Gaussian} Elimination is Performed on Sparse Random Matrices},
journal=MATHCOMP,
volume={28},
pages={219--230},
annote={Disproves Hsieh and Ghausi, IEEE Trans. Circuit Theory, Vol CT-19, 1972 p329},
}
%got
@article{Duff74,
author={Duff, I. S.},
year={1974},
title={Pivot Selection and Row Ordering in {Givens} Reductions on Sparse Matrices},
journal=COMP,
volume={13},
pages={239--248},
}
%got
@article{Duff77,
author={Duff, I. S.},
title={On Permutations to Block Triangular Form},
journal=IMAJAM,
volume={19},
number={3},
pages={339--342},
year={1977},
url={ http://dx.doi.org/10.1093/imamat/19.3.339 },
abstract={This note clarifies some points relating to the permutation
of a matrix to block triangular form. In particular, the block
triangular form of any matrix which may be permuted to have a zero-free
diagonal is shown to be unique apart from certain well-defined trivial
permutations.},
}
%got
@article{Duff77c,
author={Duff, I. S.},
month=apr,
year={1977},
title={A Survey of Sparse Matrix Research},
journal=PROCIEEE,
volume={65},
number={4},
pages={500--535},
}
%got (Tim has book from TAMU library)
@incollection{Duff79,
author={Duff, I. S.},
year={1979},
title={Practical Comparisons of Codes for the Solution of Sparse Linear Systems},
booktitle={Sparse Matrix Proceedings},
publisher={SIAM},
address={Philadelphia},
editor={Duff, I. S. and Stewart, G. W.},
pages={107--134},
}
%got
@article{Duff81b,
author={Duff, I. S.},
year={1981},
title={Algorithm 575: Permutations for a Zero-Free Diagonal},
journal=TOMS,
volume={7},
number={1},
pages={387--390},
url={ http://dx.doi.org/10.1145/355958.355968 },
keywords={ordering, zero-free diagonal}
}
%got
@article{Duff81z,
author={Duff, I. S.},
title={{ME28}: {A} Sparse Unsymmetric Linear Equation Solver for Complex Equations},
journal=TOMS,
volume={7},
number={4},
pages={505--511},
month=dec,
year={1981},
url={ http://dx.doi.org/10.1145/355972.355978 },
keywords={complex sparse linear equations; drop tolerances; ME28;
numerical software; real and complex arithmetic; sparse matrix},
}
%got
@article{Duff81,
author={Duff, I. S.},
year={1981},
title={On Algorithms for Obtaining a Maximum Transversal},
journal=TOMS,
volume={7},
number={1},
pages={315--330},
url={ http://dx.doi.org/10.1145/355958.355963 },
keywords={zero-free diagonal}
}
%got
@incollection{Duff81e,
author={Duff, I. S.},
year={1981},
title={A Sparse Future},
editor={Duff, I. S.},
booktitle={Sparse Matrices and Their Uses},
publisher={Academic Press},
address={New York},
pages={1--29},
}
%got (Tim has book from TAMU library)
@book{Duff81g,
author={Duff, I. S.},
title={Sparse Matrices and Their Uses},
publisher={Academic Press},
address={New York and London},
year={1981},
}
%got
@article{Duff84,
author={Duff, I. S.},
year={1984},
title={Design Features of a Frontal Code for Solving Sparse Unsymmetric Linear Systems Out-of-Core},
journal=SISC,
volume={5},
pages={270--280},
keywords={MA32 frontal methods unsymmetric matrices out-of-core}
}
%got
@article{Duff84e,
author={Duff, I. S.},
month=sep,
year={1984},
title={Direct Methods for Solving Sparse Systems of Linear Equations},
journal=SISC,
volume={5},
number={3},
pages={605--619},
keywords={survey 56 references performance analysis},
url={ http://dx.doi.org/10.1137/0905043 },
abstract={We survey algorithms and software for solving sparse systems
of linear equations by matrix factorization, paying particular
attention to recent developments. We classify the various algorithms
according to the type of system they solve (i.e. unsymmetric, symmetric
definite, symmetric indefinite, unsymmetric but with symmetric pattern)
and whether they perform pivoting for numerical stability. We consider
both algorithms which work in main memory and those which use auxiliary
storage. We illustrate the performance of the major approaches which
we discuss by runs on test problems. }
}
%got (Tim has book from TAMU library)
@incollection{Duff84f,
author={Duff, I. S.},
title={The Solution of Nearly Symmetric Sparse Linear Systems},
booktitle={Computing Methods in Applied Sciences and Engineering, VI: Proc. 6th Intl. Symposium},
pages={57--74},
publisher={North-Holland},
address={Amsterdam, New York, and London},
editor={Glowinski, R. and Lions, J.-L.},
year={1984},
}
%got (Tim has book from TAMU library)
@incollection{Duff84b,
author={Duff, I. S.},
year={1984},
title={The Solution of Sparse Linear Systems on the {CRAY-1}},
editor={Kowalik, J. S.},
booktitle={High-Speed Computation},
publisher={Springer-Verlag},
address={Berlin},
pages={293--309},
keywords={CRAY frontal methods multifrontal},
annote={Report CSS-125, AERE Harwell, May 1982},
}
%got (Tim has book from TAMU library)
@incollection{Duff84c,
author={Duff, I. S.},
year={1984},
title={A Survey of Sparse Matrix Software},
editor={Cowell, W. R.},
booktitle={Sources and Development of Mathematical Software},
publisher={Prentice-Hall},
address={Englewood Cliffs, NJ},
pages={165--199},
}
%got (Tim has book from TAMU library)
@incollection{Duff85,
author={Duff, I. S.},
year={1985},
title={Data Structures, Algorithms and Software for Sparse Matrices},
editor={Evans, D. J.},
booktitle={Sparsity and Its Applications},
publisher={Cambridge University Press},
address={Cambridge, United Kingdom},
pages={1--29},
}
%got
@article{Duff86,
author={Duff, I. S.},
year={1986},
title={Parallel Implementation of Multifrontal Schemes},
journal=PC,
volume={3},
pages={193--204},
keywords={parallel multifrontal elimination trees garbage collection},
annote={AERE Harwell Report CSS 174},
}
%got
@inproceedings{Duff86b,
author={Duff, I. S.},
year={1986},
title={The Parallel Solution of Sparse Linear Equations},
editor={Handler, W. and Haupt, D. and Jeltsch, R. and Juling, W. and Lange, O.},
booktitle={CONPAR 86, Proc. Conf. on Algorithms and Hardware for Parallel Processing, Lecture Notes in Computer Science 237},
publisher={Springer-Verlag},
address={Berlin},
pages={18--24},
url={ http://dx.doi.org/10.1007/3-540-16811-7 },
}
%got
@inproceedings{Duff88,
author={Duff, I. S.},
title={Parallelism in sparse matrices},
booktitle={Parallel Systems and Computation},
editor={Paul, G. and Almasi, G. S.},
publisher={North-Holland},
address={Amsterdam, New York, and London},
pages={99--106},
year={1988},
}
%got
@article{Duff89d,
author={Duff, I. S.},
year={1989},
title={Direct solvers},
journal=CPR,
volume={11},
pages={1--20},
}
%got
@article{Duff89b,
author={Duff, I. S.},
year={1989},
title={Multiprocessing a Sparse Matrix Code on the {Alliant} {FX/8}},
journal=JCAM,
volume={27},
pages={229--239},
}
%got (Tim has book from TAMU library)
@inproceedings{Duff89c,
author={Duff, I. S.},
year={1989},
title={Parallel algorithms for sparse matrix solution},
booktitle={Parallel computing. Methods, algorithms, and applications},
editor={Evans, D. J. and Sutti, C.},
publisher={Adam Hilger Ltd.},
address={Bristol},
pages={73--82},
url={ https://books.google.com/books?id=2z5ipEZQOZEC },
}
%got
@article{Duff90,
author={Duff, I. S.},
title={The solution of large-scale least-squares problems on supercomputers},
year={1990},
journal=AOR,
volume={22},
number={1},
url={ http://dx.doi.org/10.1007/BF02023055 },
pages={241--252},
}
%got
@incollection{Duff91,
author={Duff, I. S.},
title={Parallel Algorithms for General Sparse Systems},
year={1991},
booktitle={Computer Algorithms for Solving Linear Algebraic Equations},
volume={77},
series={NATO ASI Series},
editor={Spedicato, E.},
url={ http://dx.doi.org/10.1007/978-3-642-76717-3_13 },
publisher={Springer Berlin Heidelberg},
keywords={sparse matrix; sparse equations; vectorization; parallelism; multifrontal; block iterative; Cimmino; augmented system; MA28; symmetric indefinite; constrained optimization; block pivots},
pages={277--297},
}
%got
@article{Duff96,
author={Duff, I. S.},
title={A review of frontal methods for solving linear systems},
journal=CPC,
year={1996},
volume={97},
pages={45--52},
}
%got
@article{Duff00,
author={Duff, I. S.},
title={The impact of high-performance computing in the solution of linear systems: trends and problems},
journal=JCAM,
volume={123},
number={1-2},
pages={515--530},
annote={Numerical Analysis 2000. Vol. III: Linear Algebra },
year={2000},
url={ http://dx.doi.org/10.1016/S0377-0427(00)00401-5 },
keywords={sparse matrices, direct methods, parallelism, matrix factorization, multifrontal methods},
abstract={We review the influence of the advent of high-performance
computing on the solution of linear equations. We will concentrate on
direct methods of solution and consider both the case when the
coefficient matrix is dense and when it is sparse. We will examine the
current performance of software in this area and speculate on what
advances we might expect in the early years of the next century. }
}
%got
@article{Duff04,
author={Duff, I. S.},
title={{MA57}---a code for the solution of sparse symmetric definite and indefinite systems},
journal=TOMS,
volume={30},
number={2},
month=jun,
year={2004},
pages={118--144},
url={ http://dx.doi.org/10.1145/992200.992202 },
abstract={We introduce a new code for the direct solution of
sparse symmetric linear equations that solves indefinite systems with
$2 \times 2$ pivoting for stability. This code, called {\tt MA57}, is
in HSL 2002 and supersedes the well used HSL code {\tt MA27}. We
describe some of the implementation details and emphasize the novel
features of {\tt MA57}. These include restart facilities, matrix
modification, partial solution for matrix factors, solution of multiple
right-hand sides, and iterative refinement and error analysis. The code
is written in Fortran 77, but there are additional facilities within a
Fortran 90 implementation that include the ability to identify and
change pivots. Several of these facilities have been developed
particularly to support optimization applications, and we illustrate
the performance of the code on problems arising therefrom.},
}
%got
@article{Duff07,
author={Duff, I. S.},
year={2007},
title={Developments in matching and scaling algorithms},
journal=PAMM,
volume={7},
number={1},
pages={1010801--1010802},
}
%got
@article{Duff09,
author={Duff, I. S.},
title={The design and use of a sparse direct solver for skew symmetric matrices},
journal=JCAM,
year={2009},
volume={226},
pages={50--54},
}
%got (Tim has hardcopy)
@article{DuffErismanGearReid88,
author={Duff, I. S. and Erisman, A. M. and Gear, C. W. and Reid, J. K.},
title={Sparsity structure and {Gaussian} elimination},
journal=SIGNUM,
year={1988},
volume={23},
pages={2--8},
}
%got
@article{DuffErismanReid76,
author={Duff, I. S. and Erisman, A. M. and Reid, J. K.},
year={1976},
title={On {George}'s Nested Dissection Method},
journal=SINUM,
volume={13},
number={5},
pages={686--695},
}
%got (Tim owns hardcopy of book)
@book{DuffErismanReid86,
author={Duff, I. S. and Erisman, A. M. and Reid, J. K.},
year={1986},
title={Direct Methods for Sparse Matrices},
publisher={Oxford Univ. Press},
address={London},
annote={reprinted 1989 and 2003},
}
%got (Tim has book from TAMU library)
@incollection{DuffGouldLescrenierReid90,
author={Duff, I. S. and Gould, N. I. M. and Lescrenier, M. and Reid, J. K.},
title={The multifrontal method in a parallel environment},
year={1990},
booktitle={Reliable Numerical Computation},
editor={Cox, M. G. and Hammarling, S.},
publisher={Oxford University Press},
address={London},
pages={93--111},
url={ http://ukcatalogue.oup.com/product/9780198535645.do# }
}
%got
@article{DuffGouldReidScottTurner91,
author={Duff, I. S. and Gould, N. I. M. and Reid, J. K. and Scott, J. A. and Turner, K.},
title={The Factorization of Sparse Symmetric Indefinite Matrices},
journal=IMAJNA,
volume={11},
number={2},
pages={181--204},
year={1991},
url={ http://dx.doi.org/10.1093/imanum/11.2.181 },
abstract={The Harwell multifrontal code MA27 is able to solve
symmetric indefinite systems of linear equations such as those that
arise from least-squares and constrained optimization algorithms, but
may sometimes lead to many more arithmetic operations being needed to
factorize the matrix than is required by other strategies. In this
paper, we report on the results of our investigation of this problem.
We have concentrated on seeking new strategies that preserve the
multifrontal principle but follow the sparsity structure more closely
in the case when some of the diagonal entries are zero.}
}
%got
@article{DuffGrimesLewis89,
author={Duff, I. S. and Grimes, R. G. and Lewis, J. G.},
title={Sparse Matrix Test Problems},
journal=TOMS,
year={1989},
volume={15},
number={1},
pages={1--14},
}
%got
@incollection{DuffJohnsson89,
author={Duff, I. S. and Johnsson, L. S.},
title={Node Orderings and Concurrency in Structurally-Symmetric Sparse Problems},
booktitle={Parallel Supercomputing: Methods, Algorithms, and Applications},
editor={Carey, G. F.},
address={New York, NY},
chapter={12},
pages={177--189},
publisher={John Wiley and Sons Ltd.},
year={1989},
}
%got
@article{DuffKayaUcar11,
author={Duff, I. S. and Kaya, K. and U\c{c}ar, B.},
title={Design, Implementation, and Analysis of Maximum Transversal Algorithms},
journal=TOMS,
volume={38},
number={2},
pages={13:1--13:31},
month=dec,
url={ http://dx.doi.org/10.1145/2049673.2049677 },
year={2011},
abstract={We report on careful implementations of seven
algorithms for solving the problem of finding a maximum transversal of
a sparse matrix. We analyse the algorithms and discuss the design
choices. To the best of our knowledge, this is the most comprehensive
comparison of maximum transversal algorithms based on augmenting paths.
Previous papers with the same objective either do not have all the
algorithms discussed in this paper, or they used non-uniform
implementations from different researchers. We use a common base to
implement all of the algorithms and compare their relative performance
on a wide range of graphs and matrices. We systematize, develop, and
use several ideas for enhancing performance. One of these ideas
improves the performance of one of the existing algorithms in most
cases, sometimes significantly. So much so that we use this as the
eighth algorithm in comparisons.},
}
%got
@article{DuffKoster99,
author={Duff, I. S. and Koster, J.},
title={The design and use of algorithms for permuting large entries to the diagonal of sparse matrices},
journal=SIMAX,
year={1999},
volume={20},
number={4},
pages={889--901},
url={ http://dx.doi.org/10.1137/S0895479897317661 },
}
%got
@article{DuffKoster01,
author={Duff, I. S. and Koster, J.},
title={On algorithms for permuting large entries to the diagonal of a sparse matrix},
journal=SIMAX,
year={2001},
volume={22},
number={4},
pages={973--996},
url={ http://dx.doi.org/10.1137/S0895479899358443 },
}
%got
@article{DuffPralet05,
author={Duff, I. S. and Pralet, S.},
title={Strategies for scaling and pivoting for sparse symmetric indefinite problems},
journal=SIMAX,
volume={27},
year={2005},
number={2},
pages={313--340},
url={ http://dx.doi.org/10.1137/04061043X },
}
%got
@article{DuffPralet07,
author={Duff, I. S. and Pralet, S.},
year={2007},
title={Towards stable mixed pivoting strategies for the sequential and parallel solution of sparse symmetric indefinite systems},
journal=SIMAX,
volume={29},
number={3},
pages={1007--1024},
}
%got
@article{DuffReid74,
author={Duff, I. S. and Reid, J. K.},
title={A Comparison of Sparsity Orderings for Obtaining a Pivotal Sequence in {Gaussian} Elimination},
journal=IMAJAM,
volume={14},
number={3},
pages={281--291},
year={1974},
url={ http://dx.doi.org/10.1093/imamat/14.3.281 },
annote={previously AERE Harwell Technical Report 73, March 1973},
abstract={A large number of pivotal strategies for use in conjunction
with Gaussian elimination for solving sparse systems of linear
equations are compared on the grounds of how well they preserve
sparsity, how many arithmetic operations they involve and their
computational cost. Conclusions are based mainly on the results
obtained on test problems.},
}
%got
@article{DuffReid76,
author={Duff, I. S. and Reid, J. K.},
title={A Comparison of Some Methods for the Solution of Sparse Overdetermined Systems of Linear Equations},
journal=IMAJAM,
volume={17},
number={3},
pages={267--280},
year={1976},
url={ http://dx.doi.org/10.1093/imamat/17.3.267 },
keywords={linear least squares survey},
abstract={Four methods for the least squares solution of
overdetermined systems of linear equations are compared from the point
of view of preservation of sparsity. Conclusions are drawn principally
from the results of numerical experiments, and favour the methods of
Peters and Wilkinson (1970) and an augmented matrix approach suggested
by Hachtel (private communication).},
}
%got
@article{DuffReid78b,
author={Duff, I. S. and Reid, J. K.},
year={1978},
title={Algorithm 529: Permutations to Block Triangular Form},
journal=TOMS,
volume={4},
number={2},
pages={189--192},
url={ http://dx.doi.org/10.1145/355780.355790 },
keywords={ordering block triangular form}
}
%got
@article{DuffReid78a,
author={Duff, I. S. and Reid, J. K.},
year={1978},
title={An Implementation of {Tarjan}'s Algorithm for the Block Triangularization of a Matrix},
journal=TOMS,
volume={4},
number={2},
pages={137--147},
url={ http://dx.doi.org/10.1145/355780.355785 },
keywords={ordering, block triangular form, partitioning, reducibility, depth first search}
}
%got (Tim has book from TAMU library)
@incollection{DuffReid79,
author={Duff, I. S. and Reid, J. K.},
year={1979},
title={Performance Evaluation of Codes for Sparse Matrix Problems},
editor={Fosdick, L. D.},
booktitle={Performance Evaluation of Numerical Software; Proc. IFIP TC 2.5 Working Conf.},
address={New York},
publisher={New York: North-Holland},
pages={121--135},
keywords={performance evaluation, matrix collection},
abstract={There is a strict limit to the extent that
analytical methods can be used to assess the effectiveness of
techniques employed in sparse matrix codes. Any full evaluation
therefore demands the running of a realistic set of test problems.
The collections that we have assembled at Harwell are described,
although they are still in their infancy, and we show a number
of examples of their use both during algorithm development
and for assessment of codes written elsewhere.},
}
%got
@article{DuffReid79b,
author={Duff, I. S. and Reid, J. K.},
year={1979},
title={Some Design Features of a Sparse Matrix Code},
journal=TOMS,
volume={5},
number={1},
pages={18--35},
url={ http://dx.doi.org/10.1145/355815.355817 },
keywords={software MA28},
annote={previously Report CSS 48, AERE Harwell, 1977},
}
%got
@article{DuffReid82b,
author={Duff, I. S. and Reid, J. K.},
year={1982},
title={Experience of Sparse Matrix Codes on the {CRAY-1}},
journal=CPC,
volume={26},
pages={293--302},
}
%got
@article{DuffReid83b,
author={Duff, I. S. and Reid, J. K.},
year={1983},
title={The Multifrontal Solution of Indefinite Sparse Symmetric Linear Equations},
journal=TOMS,
volume={9},
number={3},
pages={302--325},
url={ http://dx.doi.org/10.1145/214392.214398 },
keywords={algorithms, experiment, multifrontal, indefinite matrices,
symmetric matrices, minimum degree algorithm, 2x2 pivots, MA27},
annote={Report CSS 122, AERE Harwell, 1982},
}
%got
@article{DuffReid83,
author={Duff, I. S. and Reid, J. K.},
title={A Note on the Work Involved in No-fill Sparse Matrix Factorization},
journal=IMAJNA,
volume={3},
number={1},
pages={37--40},
year={1983},
url={ http://dx.doi.org/10.1093/imanum/3.1.37 },
abstract={We provide an alternative proof of the result of Rose (1972)
that all perfect (no-fill) orderings of a symmetric sparse matrix
require the same number of arithmetic operations during triangular
factorization and show that such a result does not hold for unsymmetric
sparse matrices.},
}
%got
@article{DuffReid84,
author={Duff, I. S. and Reid, J. K.},
year={1984},
title={The Multifrontal Solution of Unsymmetric Sets of Linear Equations},
journal=SISC,
volume={5},
number={3},
pages={633--641},
keywords={multifrontal unsymmetric matrices, MA37},
annote={AERE Harwell Report CSS 133, 1983},
}
%got
@techreport{DuffReid95,
author={Duff, I. S. and Reid, J. K.},
title={{MA47}, a {Fortran} code for the direct solution of indefinite sparse symmetric linear systems},
institution={Rutherford Appleton Laboratory},
year={1995},
number={RAL-95-001},
address={Oxon, England},
month=jan,
}
%got
@article{DuffReid96b,
author={Duff, I. S. and Reid, J. K.},
title={The design of {MA48}: a code for the direct solution of sparse unsymmetric linear systems of equations},
journal=TOMS,
volume={22},
number={2},
pages={187--226},
month=jun,
year={1996},
url={ http://dx.doi.org/10.1145/229473.229476 },
abstract={We describe the design of a new code for the direct
solution of sparse unsymmetric linear systems of equations. The new
code utilizes a novel restructuring of the symbolic and numerical
phases, which increases speed and saves storage without sacrifice of
numerical stability. Other features include switching to full-matrix
processing in all phases of the computation enabling the use of all
three levels of BLAS, treatment of rectangular or rank-deficient
matrices, partial factorization, and integrated facilities for
iterative refinement and error estimation.},
}
%got
@article{DuffReid96,
author={Duff, I. S. and Reid, J. K.},
title={Exploiting zeros on the diagonal in the direct solution of indefinite sparse symmetric linear systems},
journal=TOMS,
volume={22},
number={2},
pages={227--257},
month=jun,
year={1996},
url={ http://dx.doi.org/10.1145/229473.229480 },
abstract={We describe the design of a new code for the solution of
sparse indefinite symmetric linear systems of equations. The principal
difference between this new code and earlier work lies in the
exploitation of the additional sparsity available when the matrix has a
significant number of zero diagonal entries. Other new features have
been included to enhance the execution speed, particularly on vector
and parallel machines.},
}
%got
@article{DuffReidMunksgaardNielsen79,
author={Duff, I. S. and Reid, J. K. and Munksgaard, N. and Nielsen, H. B.},
title={Direct Solution of Sets of Linear Equations whose Matrix is Sparse, Symmetric and Indefinite},
journal=IMAJAM,
volume={23},
number={2},
pages={235--250},
year={1979},
url={ http://dx.doi.org/10.1093/imamat/23.2.235 },
keywords={symmetric matrices indefinite matrices 2x2 pivots},
abstract={We consider the use of 1x1 and 2x2 pivots for direct
solution of sets of linear equations whose matrix is sparse and
symmetric. Inclusion of 2x2 pivots permits a stable decomposition to be
obtained in the indefinite case and we demonstrate that in practice
there is little loss of speed even in positive definite cases. A
pivotal strategy suitable for the sparse case is proposed and compared
experimentally with alternatives. We present an analysis of error,
explain how the stability may be monitored cheaply, discuss automatic
scaling and consider implementation details.},
}
%got
@article{DuffReidScott89,
author={Duff, I. S. and Reid, J. K. and Scott, J. A.},
title={The use of profile reduction algorithms with a frontal code},
journal=IJNME,
year={1989},
volume={28},
number={11},
publisher={John Wiley \& Sons, Ltd},
url={ http://dx.doi.org/10.1002/nme.1620281106 },
pages={2555--2568},
abstract={We study profile reduction algorithms when used to order
the elements for the frontal solution of a system of linear equations
with a symmetric sparsity pattern. We consider two distinct procedures
for producing an efficient element ordering; one based on assembling
the pattern of the finite-element matrix, reordering the variables and
using the new variable order to resequence the elements, and the other
based on generating adjacency lists for the elements themselves and
reordering the elements directly. We compare the results of using
several variants of these algorithms in conjunction with the Harwell
frontal code, MA32, on the CRAY-2 for a range of practical problems. We
find that, given suitable enhancements, both approaches are practical
and neither is consistently superior to the other.},
}
%got
@article{DuffScott96,
author={Duff, I. S. and Scott, J. A.},
title={The design of a new frontal code for solving sparse, unsymmetric systems},
journal=TOMS,
volume={22},
number={1},
pages={30--45},
month=mar,
year={1996},
url={ http://dx.doi.org/10.1145/225545.225550 },
abstract={We describe the design, implementation, and performance
of a frontal code for the solution of large, sparse, unsymmetric
systems of linear equations. The resulting software package, MA42, is
included in Release 11 of the Harwell Subroutine Library and is
intended to supersede the earlier MA32 package. We discuss in detail
the extensive use of higher-level BLAS kernels within MA42 and
illustrate the performance on a range of practical problems on a CRAY
Y-MP, an IBM 3090, and an IBM RISC System/6000. We examine extending
the frontal solution scheme to use multiple fronts to allow MA42 to be
run in parallel. We indicate some directions for future development.},
}
%got
@article{DuffScott99,
author={Duff, I. S. and Scott, J. A.},
title={A frontal code for the solution of sparse positive-definite symmetric systems arising from finite-element applications},
journal=TOMS,
volume={25},
number={4},
pages={404--424},
month=dec,
year={1999},
url={ http://dx.doi.org/10.1145/332242.332243 },
abstract={We describe the design, implementation, and performance
of a frontal code for the solution of large sparse symmetric systems of
linear finite-element equations. The code is intended primarily for
positive-definite systems, since numerical pivoting is not performed.
The resulting software package, MA62, will be included in the Harwell
Subroutine Library. We illustrate the performance of our new code on a
range of problems arising from real engineering and industrial
applications. The performance of the code is compared with that of the
Harwell Subroutine Library general frontal solver MA42 and with other
positive-definite codes from the Harwell Subroutine Library.},
keywords={Algorithms; finite-element equations; Gaussian elimination;
Level 3 BLAS; performance; sparse symmetric linear equations; symmetric
frontal method},
}
%got
@article{DuffScott04,
author={Duff, I. S. and Scott, J. A.},
title={A Parallel Direct Solver for Large Sparse Highly Unsymmetric Linear Systems},
journal=TOMS,
volume={30},
number={2},
month=jun,
year={2004},
pages={95--117},
url={ http://dx.doi.org/10.1145/992200.992201 },
abstract={The need to solve large sparse linear systems of
equations efficiently lies at the heart of many applications in
computational science and engineering. For very large systems when
using direct factorization methods of solution, it can be beneficial
and sometimes necessary to use multiple processors, because of
increased memory availability as well as reduced factorization time. We
report on the development of a new parallel code that is designed to
solve linear systems with a highly unsymmetric sparsity structure using
a modest number of processors (typically up to about 16). The problem
is first subdivided into a number of loosely connected subproblems and
a variant of sparse Gaussian elimination is then applied to each of the
subproblems in parallel. An interface problem in the variables on the
boundaries of the subproblems must also be factorized. We discuss how
our software is designed to achieve the goals of portability, ease of
use, efficiency, and flexibility, and illustrate its performance on an
SGI Origin 2000, a Cray T3E, and a 2-processor Compaq DS20, using
problems arising from real applications.},
annote={MP48},
}
%got
@article{DuffScott05,
author={Duff, I. S. and Scott, J. A.},
year={2005},
title={Stabilized bordered block diagonal forms for parallel sparse solvers},
journal=PC,
volume={31},
pages={275--289},
url={ http://dx.doi.org/10.1016/j.parco.2004.12.008 },
abstract={One possible approach to the solution of large sparse linear
systems is to reorder the system matrix to bordered block diagonal form
and then to solve the block system in parallel. We consider the duality
between singly bordered and doubly bordered block diagonal forms. The
idea of a stabilized doubly bordered block diagonal form is introduced.
We show how a stable factorization of a singly bordered block diagonal
matrix results in a stabilized doubly bordered block diagonal matrix.
We propose using matrix stretching to generate a singly bordered form
from a doubly bordered form. Matrix stretching is compared with two
alternative methods for obtaining a singly bordered form and is shown
to be efficient both in computation time and the quality of the
resulting block structure.},
}
%got
@article{DuffUcar10,
author={Duff, I. S. and U\c{c}ar, B.},
title={On the Block Triangular Form of Symmetric Matrices},
journal=SIREV,
volume={52},
number={3},
pages={455--470},
year={2010},
url={ http://dx.doi.org/10.1137/080720036 },
}
%got
@incollection{DuffUcar12,
author={Duff, I. S. and U\c{c}ar, B.},
title={Combinatorial Problems in Solving Linear Systems},
year={2012},
booktitle={Combinatorial Scientific Computing},
editor={Schenk, O.},
pages={21--68},
chapter={2},
url={ http://dx.doi.org/10.1201/b11644-3 },
publisher={Chapman and Hall/CRC Computational Science},
}
%got
@article{DuffVanderVorst99,
author={Duff, I. S. and {Van der Vorst}, H. A.},
title={Developments and trends in the parallel solution of linear systems},
year={1999},
journal=PC,
volume={25},
pages={1931--1970},
url={ http://dx.doi.org/10.1016/S0167-8191(99)00077-0 },
abstract={In this review paper, we consider some important developments
and trends in algorithm design for the solution of linear systems
concentrating on aspects that involve the exploitation of parallelism.
We briefly discuss the solution of dense linear systems, before
studying the solution of sparse equations by direct and iterative
methods. We consider preconditioning techniques for iterative solvers
and discuss some of the present research issues in this field.}
}
%got
@article{DuffWiberg88,
author={Duff, I. S. and Wiberg, T.},
title={Implementations of {O}$(\sqrt{n}t)$ assignment algorithms},
journal=TOMS,
year={1988},
volume={14},
number={3},
pages={267--287},
month=sep,
url={ http://dx.doi.org/10.1145/44128.44131 },
keywords={algorithms; theory},
}
%%D continued------------------------------------------------------------------
%got
@article{DulmageMendelsohn63,
author={Dulmage, A. L. and Mendelsohn, N. S.},
title={Two algorithms for bipartite graphs},
journal=JSIAM,
volume={11},
year={1963},
pages={183--194},
url={ http://dx.doi.org/10.1137/0111014 },
}
%%E ---------------------------------------------------------------------------
%got
@article{Edlund02,
author={Edlund, O.},
title={A software package for sparse orthogonal factorization and updating},
journal=TOMS,
volume={28},
number={4},
pages={448--482},
month=dec,
year={2002},
url={ http://dx.doi.org/10.1145/592843.592848 },
abstract={Though there is good software for sparse QR
factorization, there is little support for updating and
downdating---something that is absolutely essential in some linear
programming algorithms, for example. This paper describes an
implementation of sparse LQ factorization, including block
triangularization, approximate minimum degree ordering, symbolic
factorization, multifrontal factorization, {\em and\/} updating and
downdating. The factor $Q$ is not retained. The updating algorithm
expands the nonzero pattern of the factor $L$, which is reflected in
the dynamic representation of $L$. The block triangularization is used
as an ``ordering for sparsity'' rather than as a prerequisite for block
backward substitution. In the symbolic factorization, something called
``element counters'' is introduced to reduce the overestimation of the
number of nonzeros that the commonly used methods do. Both the
approximate minimum degree ordering and the symbolic factorization are
done without explicitly forming the nonzero pattern of the symmetric
matrix in the corresponding normal equations. Tests show that the
average time used for a single update or downdate is essentially the
same as the time used for a single forward or backward substitution.
Other parts of the implementation show the same range of performance as
existing code, but cannot be replaced because of the special character
of the systems that are solved.},
}
%got
@techreport{EisenstatGurskySchultzSherman77,
author={Eisenstat, S. C. and Gursky, M. C. and Schultz, M. H. and Sherman, A. H.},
year={1977},
title={The {Yale} Sparse Matrix Package, {II}: The Non-symmetric Codes},
number={114},
institution={Yale Univ. Dept. of Computer Science},
keywords={YSMP unsymmetric matrices},
address={New Haven, CT},
}
%got
@article{EisenstatGurskySchultzSherman82,
author={Eisenstat, S. C. and Gursky, M. C. and Schultz, M. H. and Sherman, A. H.},
year={1982},
title={{Yale} Sparse Matrix Package, {I}: The Symmetric Codes},
journal=IJNME,
volume={18},
number={8},
publisher={John Wiley \& Sons, Ltd},
url={ http://dx.doi.org/10.1002/nme.1620180804 },
pages={1145--1151},
keywords={YSMP symmetric matrices},
}
%got
@article{EisenstatLiu92,
author={Eisenstat, S. C. and Liu, J. W. H.},
title={Exploiting Structural Symmetry in Unsymmetric Sparse Symbolic Factorization},
journal=SIMAX,
year={1992},
volume={13},
number={1},
pages={202--211},
url={ http://dx.doi.org/10.1137/0613017 },
}
%got
@article{EisenstatLiu93a,
author={Eisenstat, S. C. and Liu, J. W. H.},
title={Exploiting Structural Symmetry in a Sparse Partial Pivoting Code},
journal=SISC,
volume={14},
number={1},
pages={253--257},
year={1993},
url={ http://dx.doi.org/10.1137/0914016 },
}
%got
@inproceedings{EisenstatLiu93b,
author={Eisenstat, S. C. and Liu, J. W. H.},
title={Structural Representations of {Schur} Complements in Sparse Matrices},
pages={85--100},
booktitle={Graph Theory and Sparse Matrix Computation},
series={IMA Volumes in Applied Mathematics},
year={1993},
publisher={Springer-Verlag},
address={New York},
editor={George, A. and Gilbert, J. R. and Liu, J. W. H.},
volume={56},
}
%got
@article{EisenstatLiu05,
author={Eisenstat, S. C. and Liu, J. W. H.},
title={The theory of elimination trees for sparse unsymmetric matrices},
journal=SIMAX,
year={2005},
volume={26},
number={3},
pages={686--705},
url={ http://dx.doi.org/10.1137/S089547980240563X },
}
%got
@article{EisenstatLiu05b,
author={Eisenstat, S. C. and Liu, J. W. H.},
title={A tree based dataflow model for the unsymmetric multifrontal method},
journal=ETNA,
year={2005},
volume={21},
pages={1--19},
url={ http://etna.mcs.kent.edu/volumes/2001-2010/vol21/abstract.php?vol=21&pages=1-19 },
abstract={This paper introduces a new model to describe the flow of
data from update to frontal matrix in the unsymmetric multifrontal
method for solving sparse linear systems. The model is based on the
elimination tree of an unsymmetric matrix and consists of the edges in
this tree together with some cross edges. By symmetrically renumbering
the rows and columns of the coefficient matrix using a tree-based
postordering, we can permute the matrix into a bordered block
triangular form while preserving the elimination tree. The model
simplifies when the matrix has this form, which suggests that a
multifrontal implementation based on it should be simpler as well. We
also extend the model to handle pivoting for stability; compare it with
others used in the unsymmetric multifrontal method; and point out the
implications for parallelism.}
}
%got
@article{EisenstatLiu08,
author={Eisenstat, S. C. and Liu, J. W. H.},
title={Algorithmic aspects of elimination trees for sparse unsymmetric matrices},
journal=SIMAX,
year={2008},
volume={29},
number={4},
pages={1363--1381},
url={ http://dx.doi.org/10.1137/050643581 },
}
%got
@article{EisenstatSchultzSherman75,
author={Eisenstat, S. C. and Schultz, M. H. and Sherman, A. H.},
title={Efficient implementation of sparse symmetric {Gaussian} elimination},
journal=ACMPDE,
year={1975},
pages={33--39},
}
%got (Tim has book from TAMU library)
@incollection{EisenstatSchultzSherman76b,
author={Eisenstat, S. C. and Schultz, M. H. and Sherman, A. H.},
year={1976},
title={Applications of an Element Model for {Gaussian} Elimination},
editor={Bunch, J. R. and Rose, D. J.},
booktitle={Sparse Matrix Computations},
publisher={New York: Academic Press},
pages={85--96},
}
%got (Tim has book from TAMU library)
@incollection{EisenstatSchultzSherman76,
author={Eisenstat, S. C. and Schultz, M. H. and Sherman, A. H.},
year={1976},
title={Considerations in the Design of Software for Sparse {Gaussian} Elimination},
editor={Bunch, J. R. and Rose, D. J.},
booktitle={Sparse Matrix Computations},
publisher={New York: Academic Press},
pages={263--273},
keywords={software design YSMP, also discusses unsymmetric method}
}
%got (Tim has book from TAMU library)
@incollection{EisenstatSchultzSherman79,
author={Eisenstat, S. C. and Schultz, M. H. and Sherman, A. H.},
year={1979},
title={Software for Sparse {Gaussian} Elimination with Limited Core Storage},
booktitle={Sparse Matrix Proceedings},
publisher={SIAM},
address={Philadelphia},
editor={Duff, I. S. and Stewart, G. W.},
pages={135--153},
}
%got
@article{EisenstatSchultzSherman81,
author={Eisenstat, S. C. and Schultz, M. H. and Sherman, A. H.},
year={1981},
title={Algorithms and Data Structures for Sparse Symmetric {Gaussian} Elimination},
journal=SISC,
volume={2},
number={2},
pages={225--237},
url={ http://dx.doi.org/10.1137/0902019 },
}
%got
@article{ErismanGrimesLewisPoole85,
author={Erisman, A. M. and Grimes, R. G. and Lewis, J. G. and Poole, W. G.},
month=apr,
year={1985},
title={A Structurally Stable Modification of {Hellerman-Rarick's} {P4} Algorithm for Reordering Unsymmetric Sparse Matrices},
journal=SINUM,
volume={22},
number={2},
pages={369--385},
keywords={ordering, P4 algorithm, unsymmetric matrices, reordering algorithms},
url={ http://dx.doi.org/10.1137/0722022 },
}
%got
@article{ErismanGrimesLewisPoole87,
author={Erisman, A. M. and Grimes, R. G. and Lewis, J. G. and Poole, W. G. and Simon, H. D.},
month=jul,
year={1987},
title={Evaluation of Orderings for Unsymmetric Sparse Matrices},
journal=SISC,
volume={8},
number={4},
pages={600--624},
keywords={ordering, Markowitz ordering, P4 algorithm},
annote={also Report MM-5, ETA Division, Boeing Computer Services, Seattle, WA.}
}
%got
@inproceedings{EswarHuangSadayappan94,
author={Eswar, K. and Huang, C.-H. and Sadayappan, P.},
booktitle={Proceedings of the Scalable High-Performance Computing Conference, 1994},
title={Memory-adaptive parallel sparse {Cholesky} factorization},
year={1994},
month=may,
pages={317--323},
abstract={The problem of Cholesky factorization of sparse
positive-definite matrices on distributed-memory multiprocessors is
considered. A column-based algorithm with the ability to adapt to the
amount of memory available on each processor is presented. Exploiting
the available memory allows the local computation on each processor to
be ordered so that good local efficiencies and dynamic load balance are
achieved. A proof that this distributed algorithm is deadlock-free is
given. Experimental results of an implementation of this algorithm on
an Intel iPSC/860 multiprocessor system are reported},
keywords={concurrency control;distributed memory systems;mathematics
computing;matrix algebra;parallel algorithms;resource
allocation;Cholesky factorization;Intel iPSC/860 multiprocessor
system;column-based algorithm;deadlock-free algorithm;distributed
algorithm;distributed-memory multiprocessors;dynamic load balance;local
efficiencies;memory-adaptive parallel algorithm;ordered local
computation;sparse positive-definite matrices;Aggregates;Data
structures;Distributed algorithms;Distributed computing;Information
science;Multiprocessing systems;Partitioning algorithms;Sparse
matrices;System recovery},
url={ http://dx.doi.org/10.1109/SHPCC.1994.296660 },
}
%got
@inproceedings{EswarHuangSadayappan95,
author={Eswar, K. and Huang, C.-H. and Sadayappan, P.},
booktitle={Proc. 5th Symp. Frontiers of Massively Parallel Computation},
title={On mapping data and computation for parallel sparse {Cholesky} factorization},
year={1995},
month=feb,
pages={171--178},
abstract={When performing the Cholesky factorization of a sparse matrix
on a distributed-memory multiprocessor, the methods used for mapping
the elements of the matrix and the operations constituting the
factorization to the processors can have a significant impact on the
communication overhead incurred. This paper explores how two
techniques, one used when mapping dense Cholesky factorization and the
other used when mapping sparse Cholesky factorization, can be
integrated to achieve a communication-efficient parallel sparse
Cholesky factorization. Two localizing techniques to further reduce the
communication overhead are also described. The mapping strategies
proposed here, as well as other previously proposed strategies fit into
the unifying framework developed in this paper. Communication
statistics for sample sparse matrices are included},
keywords={mathematics computing;parallel algorithms;sparse
matrices;communication overhead;communication
statistics;distributed-memory multiprocessor;localizing
techniques;mapping data;parallel sparse Cholesky factorization
computation;sparse matrices;sparse matrix;Concurrent
computing;Distributed computing;Information science;Sparse
matrices;Statistics;Symmetric matrices},
url={ http://dx.doi.org/10.1109/FMPC.1995.380450 },
}
%got
@inproceedings{EswarSadayappanHuangVisvanathan93,
author={Eswar, K. and Sadayappan, P. and Huang, C.-H. and Visvanathan, V.},
title={Supernodal Sparse {Cholesky} Factorization on Distributed-Memory Multiprocessors},
booktitle={Proc. Intl. Conf. Parallel Processing (ICPP93)},
year={1993},
month=aug,
volume={3},
pages={18--22},
abstract={The concept of supernodes has been widely used in the design
of algorithms for the solution of sparse linear systems of equations.
This paper discusses the use of supernodes in the design of algorithms
for sparse Cholesky factorization on distributed-memory
multiprocessors. A new algorithm that is communication efficient, has
good load balance, and benefits significantly from supernodes is
presented.},
keywords={Algorithm design and analysis;Equations;Information
science;Linear systems;Parallel processing;Performance analysis;Sparse
matrices;Supercomputers;Taxonomy;Workstations},
url={ http://dx.doi.org/10.1109/ICPP.1993.170 },
}
%got
@inproceedings{EswarSadayappanVisvanathan91,
author={Eswar, K. and Sadayappan, P. and Visvanathan, V.},
title={Multifrontal Factorization of Sparse Matrices on Shared-Memory Multiprocessors},
booktitle={Proc. Intl. Conf. on Parallel Processing},
address={Austin, TX},
volume={{III:} Algorithms and Applications},
pages={159--166},
year={1991},
}
%got
@incollection{EswarSadayappanVisvanathan93,
author={Eswar, K. and Sadayappan, P. and Visvanathan, V.},
year={1993},
booktitle={Parallel Computing on Distributed Memory Multiprocessors},
volume={103},
series={NATO ASI Series},
editor={{\"O}zg{\"u}ner, F. and Er\c{c}al, F.},
url={ http://dx.doi.org/10.1007/978-3-642-58066-6_6 },
title={Parallel Direct Solution of Sparse Linear Systems},
publisher={Springer Berlin Heidelberg},
keywords={sparse matrices; Cholesky factorization; parallel algorithms},
pages={119--142},
}
%got (Tim has book from TAMU library)
@book{Evans85b,
editor={Evans, D. J.},
year={1985},
title={Sparsity and Its Applications},
publisher={Cambridge, United Kingdom: Cambridge University Press}
}
%got
@article{Everstine79,
author={Everstine, G. C.},
title={A comparison of three resequencing algorithms for the reduction of matrix profile and wavefront},
journal=IJNME,
volume={14},
number={6},
publisher={John Wiley \& Sons, Ltd},
url={ http://dx.doi.org/10.1002/nme.1620140606 },
pages={837--853},
year={1979},
keywords={ordering profile reduction},
abstract={Three widely-used nodal resequencing algorithms were tested
and compared for their ability to reduce matrix profile and
root-mean-square (rms) wavefront, the latter being the most critical
parameter in determining matrix decomposition time in the NASTRAN
finite element computer program. The three algorithms are Cuthill-McKee
(CM), Gibbs-Poole-Stockmeyer (GPS), and Levy. Results are presented for
a diversified collection of 30 test problems ranging in size from 59 to
2680 nodes. It is concluded that GPS is exceptionally fast, and, for
the conditions under which the test was made, the algorithm best able
to reduce profile and rms wavefront consistently well. An extensive
bibliography of resequencing algorithms is included.},
}
%%F ---------------------------------------------------------------------------
%got
@article{Felippa75,
author={Felippa, C. A.},
year={1975},
title={Solution of Linear Equations with Skyline-Stored Symmetric Matrix},
journal=CAS,
volume={5},
pages={13--29},
keywords={skyline-storage method, symmetric matrices}
}
%got
@article{FenvesLaw83,
author={Fenves, S. J. and Law, K. H.},
title={A two-step approach to finite element ordering},
journal=IJNME,
year={1983},
volume={19},
number={6},
pages={891--911},
publisher={John Wiley \& Sons, Ltd},
url={ http://dx.doi.org/10.1002/nme.1620190610 },
abstract={A two-step approach to finite element ordering is
introduced. The scheme involves ordering of the finite elements first,
based on their adjacency, followed by a local numbering of the nodal
variables. The ordering of the elements is performed by the
Cuthill-McKee algorithm. This approach takes into consideration the
underlying structure of the finite element mesh, and may be regarded as
a 'natural' finite element ordering scheme. The experimental results
show that this two-step scheme is more efficient than the reverse
Cuthill-McKee algorithm applied directly to the nodes, in terms of both
execution time and the number of fill-in entries, particularly when
higher order finite elements are used. In addition to its efficiency,
the two-step approach increases modularity and flexibility in finite
element programs, and possesses potential application to a number of
finite element solution methods.}
}
%got
@inproceedings{FiducciaMattheyses82,
author={Fiduccia, C. M. and Mattheyses, R. M.},
title={A linear-time heuristic for improving network partition},
booktitle={Proc. 19th Design Automation Conf.},
year={1982},
pages={175--181},
address={Las Vegas, NV}
}
%got
@article{Fiedler73,
author={Fiedler, M.},
title={Algebraic connectivity of graphs},
journal=CZECH,
year={1973},
volume={23},
pages={298--305},
}
%got
@article{ForrestTomlin72,
author={Forrest, J. J. H. and Tomlin, J. A.},
title={Updated triangular factors of the basis to maintain sparsity in the product form simplex method},
journal=MATHPROG,
year={1972},
volume={2},
number={1},
pages={263--278},
url={ http://dx.doi.org/10.1007/BF01584548 },
}
%got
@article{FosterDavis13,
author={Foster, L. V. and Davis, T. A.},
title={Algorithm 933: Reliable Calculation of Numerical Rank, Null Space Bases, Pseudoinverse Solutions and Basic Solutions using {SuiteSparseQR}},
journal=TOMS,
volume=40,
number=1,
pages={{7:1}--{7:23}},
year={2013},
}
%got
@article{FuJiaoYang98,
author={Fu, C. and Jiao, X. and Yang, T.},
title={Efficient sparse {LU} factorization with partial pivoting on distributed memory architectures},
journal=IEEETPDS,
year={1998},
volume={9},
number={2},
pages={109--125},
url={ http://doi.ieeecomputersociety.org/10.1109/71.663864 },
abstract={A sparse LU factorization based on Gaussian elimination with
partial pivoting (GEPP) is important to many scientific applications,
but it is still an open problem to develop a high performance GEPP code
on distributed memory machines. The main difficulty is that partial
pivoting operations dynamically change computation and nonzero fill-in
structures during the elimination process. This paper presents an
approach called S* for parallelizing this problem on distributed memory
machines. The S* approach adopts static symbolic factorization to avoid
run-time control overhead, incorporates 2D L/U supernode partitioning
and amalgamation strategies to improve caching performance, and
exploits irregular task parallelism embedded in sparse LU using
asynchronous computation scheduling. The paper discusses and compares
the algorithms using 1D and 2D data mapping schemes, and presents
experimental studies on Cray-T3D and T3E. The performance results for a
set of nonsymmetric benchmark matrices are very encouraging, and S* has
achieved up to 6.878 GFLOPS on 128 T3E nodes. To the best of our
knowledge, this is the highest performance ever achieved for this
challenging problem and the previous record was 2.583 GFLOPS on shared
memory machines}
}
%%G ---------------------------------------------------------------------------
%got
@article{GallivanHansenOstromskyZlatev95,
author={Gallivan, K. A. and Hansen, P. C. and Ostromsky, Tz. and Zlatev, Z.},
title={A locally optimized reordering algorithm and its application to a parallel sparse linear system solver},
year={1995},
journal=COMP,
volume={54},
number={1},
url={ http://dx.doi.org/10.1007/BF02238079 },
publisher={Springer-Verlag},
keywords={65F05; 65Y05; Sparse matrix; general sparsity; Gaussian elimination; drop tolerance; re-ordering; binary tree; block algorithm; coarse-grain parallelism; speed-up},
pages={39--67},
}
%got
@article{GallivanMarsolfWijshoff96,
author={Gallivan, K. A. and Marsolf, B. A. and Wijshoff, H. A. G.},
title={Solving large nonsymmetric sparse linear systems using {MCSPARSE}},
journal=PC,
volume={22},
number={10},
pages={1291--1333},
year={1996},
url={ http://dx.doi.org/10.1016/S0167-8191(96)00047-6 },
keywords={Linear algebra, Sparse linear systems, Cedar system, Implementation, Parallel processing systems},
abstract={In this paper, the methods and implementation techniques
used for the nonsymmetric sparse linear system solver, MCSPARSE on the
Cedar system are described. A novel reordering scheme (H*) upon which
the solver is based is presented. The tradeoffs discussed include
stability and fill-in control, hierarchical parallelism, and load
balancing. Experimental results demonstrating the effectiveness of the
solver with respect to each of these issues are presented. We also
address the implications of this work for other parallel processing
systems.},
}
%got
@article{GaoParlett90,
author={Gao, F. and Parlett, B. N.},
title={A note on communication analysis of parallel sparse Cholesky factorization on a hypercube},
journal=PC,
volume={16},
number={1},
pages={59--60},
year={1990},
url={ http://dx.doi.org/10.1016/0167-8191(90)90158-6 },
keywords={Linear algebra, Matrix computation, Cholesky factorization, Hypercube, Communication costs},
abstract={We give a simpler communication analysis, than as appeared
in George et al. [1], for the nested-dissection method of
column-oriented Cholesky factorization using the subgrid-to-subcube
column assignment for a $k \times k$ grid on a hypercube of p processors. We
prove that the amount of communication any one processor does is $O(k^2)$,
which implies balanced communication as well as the result in George et
al. [1] that the total volume of communication traffic is $O(pk^2)$.},
annote={see GeorgeLiuNg89},
}
%got
@article{Gay91,
author={Gay, D. M.},
title={Massive Memory Buys Little Speed for Complete, In-Core Sparse {Cholesky} Factorizations on Some Scalar Computers},
journal=LAA,
year={1991},
volume={152},
pages={291--314},
keywords={loop-free code generation vs left-looking Cholesky}
}
%got
@incollection{Geist87,
author={Geist, G. A.},
title={Solving Finite Element Problems with Parallel Multifrontal Schemes},
booktitle={Proc. 2nd Conf. on Hypercube Multiprocessors (1986), Knoxville, TN},
year={1987},
publisher={SIAM},
address={Philadelphia},
editor={Heath, M. T.},
pages={656--661},
}
%got
@article{GeistNg89,
author={Geist, G. A. and Ng, E. G.},
title={Task Scheduling for Parallel Sparse {Cholesky} Factorization},
journal=IJPP,
year={1989},
volume={18},
number={4},
pages={291--314},
}
%got
@article{GengOdenVanDeGeijn97,
author={Geng, P. and Oden, J. T. and {van de Geijn}, R. A.},
title={A parallel multifrontal algorithm and its implementation},
journal=CMAME,
volume={149},
number={1-4},
pages={289--301},
year={1997},
annote={Containing papers presented at the Symposium on Advances in Computational Mechanics},
url={ http://dx.doi.org/10.1016/S0045-7825(97)00052-2 },
abstract={In this paper, we describe a multifrontal method for
solving sparse systems of linear equations arising in finite element
and finite difference methods. The method proposed in this study is a
combination of the nested dissection ordering and the frontal method.
It can significantly reduce the storage and computational time required
by the conventional direct methods and is also a natural parallel
algorithm. In addition, the method inherits major advantages of the
frontal method, which include a simple interface with finite element
codes and an effective data structure so that the entire computation is
performed element by element on a series of small linear systems with
dense stiffness matrices. The numerical implementation targets both
distributed-memory machines as well as conventional sequential
machines. Its performance is tested through a series of examples.}
}
%GET
% NOTE(review): this is a paper in a Springer Lecture Notes volume, not a
% journal article, so @incollection (matching the George77b convention in
% this file) is the right entry type -- verify volume/editor details.
@incollection{Gentleman75,
author={Gentleman, W. M.},
title={Row elimination for solving sparse linear systems and least squares problems},
booktitle={Numerical Analysis, Lecture Notes in Mathematics 506},
publisher={Springer-Verlag},
year={1975},
pages={122--133}
}
%%GEORGE -----------------------------------------------------------------------
%GET
@techreport{George71,
author={George, A.},
title={Computer implementation of the finite element method},
year={1971},
number={STAN-CS-71-208},
institution={Stanford University, Department of Computer Science}
}
%got
@incollection{George72,
author={George, A.},
title={Block elimination on finite element systems of equations},
pages={101--114},
editor={Rose, D. J. and Willoughby, R. A.},
booktitle={Sparse Matrices and Their Applications},
address={New York},
publisher={Plenum Press},
year={1972},
url={ http://link.springer.com/book/10.1007%2F978-1-4615-8675-3 },
}
%got
@article{George73,
author={George, A.},
year={1973},
title={Nested Dissection of a Regular Finite Element Mesh},
journal=SINUM,
volume={10},
number={2},
pages={345--363},
keywords={ordering nested dissection},
url={ http://dx.doi.org/10.1137/0710032 },
}
%got
@article{George74,
author={George, A.},
year={1974},
title={On Block Elimination for Sparse Linear Systems},
journal=SINUM,
volume={11},
number={3},
pages={585--603},
keywords={partitioning},
url={ http://dx.doi.org/10.1137/0711050 },
}
%got
@article{George77,
author={George, A.},
year={1977},
title={Numerical experiments using dissection methods to solve n-by-n grid problems},
journal=SINUM,
volume={14},
number={2},
pages={161--179},
url={ http://dx.doi.org/10.1137/0714011 },
}
%got
@incollection{George77b,
author={George, A.},
year={1977},
title={Solution of Linear Systems of Equations: Direct Methods for Finite-Element Problems},
booktitle={Sparse Matrix Techniques, Lecture Notes in Mathematics 572},
publisher={Springer-Verlag},
address={Berlin},
editor={Barker, V. A.},
pages={52--101},
}
%got
@article{George80,
author={George, A.},
year={1980},
title={An Automatic One-Way Dissection Algorithm for Irregular Finite-Element Problems},
journal=SINUM,
volume={17},
number={6},
pages={740--751},
keywords={one-way dissection finite-element method},
url={ http://dx.doi.org/10.1137/0717062 },
}
%got (Tim has book from TAMU library)
@incollection{George81,
author={George, A.},
year={1981},
title={Direct Solution of Sparse Positive Definite Systems: Some Basic Ideas and Open Problems},
editor={Duff, I. S.},
booktitle={Sparse Matrices and Their Uses},
publisher={Academic Press},
address={New York},
pages={283--306},
}
%got
@article{GeorgeHeath80,
author={George, A. and Heath, M. T.},
year={1980},
title={Solution of Sparse Linear Least Squares Problems Using {Givens} Rotations},
journal=LAA,
volume={34},
pages={69--83},
keywords={linear least squares problems Givens rotations},
annote={also in Large Scale Matrix Problems, Bjork, A, Plemmons, R. J., Schneider, H. (eds), North Holland, New York, 1981 }
}
%got
@article{GeorgeHeathLiuNg86,
author={George, A. and Heath, M. T. and Liu, J. W. H. and Ng, E. G.},
year={1986},
title={Solution of Sparse Positive Definite Systems on a Shared-Memory Multiprocessor},
journal=IJPP,
volume={15},
number={4},
pages={309--325},
keywords={Cholesky factorization sparse triangular solutions parallel},
abstract={Algorithms and software for solving sparse symmetric positive
definite systems on serial computers have reached a high state of
development. In this paper, we present algorithms for performing sparse
Cholesky factorization and sparse triangular solutions on a
shared-memory multiprocessor computer, along with some numerical
experiments demonstrating their performance on a Sequent Balance 8000
system.}
}
%got
@article{GeorgeHeathLiuNg88,
author={George, A. and Heath, M. T. and Liu, J. W. H. and Ng, E. G.},
title={Sparse {Cholesky} Factorization on a Local-Memory Multiprocessor},
year={1988},
journal=SISC,
volume={9},
number={2},
pages={327--340},
url={ http://dx.doi.org/10.1137/0909021 },
keywords={parallel {Cholesky} local-memory multiprocessor},
annote={Report CS-86-01, Dept. of Computer Science, York Univ., Ontario
1986, also Report ORNL/TM-9962, Oak Ridge National Laboratory, Oak
Ridge, Tennessee}
}
%got
@article{GeorgeHeathLiuNg89,
author={George, A. and Heath, M. T. and Liu, J. W. H. and Ng, E. G.},
title={Solution of sparse positive definite systems on a hypercube},
journal=JCAM,
year={1989},
volume={27},
pages={129--156},
}
%got
@article{GeorgeHeathNg83,
author={George, A. and Heath, M. T. and Ng, E. G.},
title={A Comparison of Some Methods for Solving Sparse Linear Least-Squares Problems},
publisher={SIAM},
year={1983},
journal=SISC,
volume={4},
number={2},
pages={177--187},
keywords={sparse linear least-squares problems; Peters-Wilkinson method; Givens rotations; normal equations},
url={ http://dx.doi.org/10.1137/0904013 }
}
%got
@article{GeorgeHeathNg84,
author={George, A. and Heath, M. T. and Ng, E. G.},
title={Solution of sparse underdetermined systems of linear equations},
journal=SISC,
year={1984},
volume={5},
number={4},
pages={988--997},
url={ http://dx.doi.org/10.1137/0905068 },
}
%got
@article{GeorgeHeathNgLiu87,
author={George, A. and Heath, M. T. and Ng, E. G. and Liu, J. W. H.},
title={Symbolic {Cholesky} factorization on a local-memory multiprocessor},
journal=PC,
volume={5},
number={1-2},
pages={85--95},
year={1987},
url={ http://dx.doi.org/10.1016/0167-8191(87)90009-3 },
keywords={Parallel algorithm, Cholesky factorization of sparse symmetric matrices, message-passing distributed-memory multiprocessor, Intel iPSC hypercube},
abstract={We present a parallel algorithm for symbolic Cholesky
factorization of sparse symmetric matrices. The symbolic factorization
algorithm complements a parallel numeric factorization algorithm
published earlier. The implementation is designed for a
message-passing, distributed-memory multiprocessor. In addition to
discussing the basic algorithm and data structures required, we also
describe two enhancements that improve performance. Empirical test
results obtained on an Intel iPSC hypercube are given.},
}
%got
@article{GeorgeHeathPlemmons81,
author={George, A. and Heath, M. T. and Plemmons, R. J.},
year={1981},
title={Solution of Large-Scale Sparse Least Squares Problems Using Auxiliary Storage},
journal=SISC,
volume={2},
number={4},
pages={416--429},
url={ http://dx.doi.org/10.1137/0902034 },
abstract={Very large sparse linear least squares problems arise in a
variety of applications, such as geodetic network adjustments,
photogrammetry, earthquake studies, and certain types of finite element
analysis. Many of these problems are so large that it is impossible to
solve them without using auxiliary storage devices. Some problems are
so massive that the storage needed for their solution exceeds the
virtual address space of the largest machines. In this paper we
describe a method for solving such problems on a typical (large)
computer and provide the results of some experiments illustrating the
effectiveness of our approach. The method includes an automatic
partitioning scheme which is essential to the efficient management of
the data on auxiliary files.},
}
%got
@article{GeorgeLiu75,
author={George, A. and Liu, J. W. H.},
month=jun,
year={1975},
title={A Note on Fill for Sparse Matrices},
journal=SINUM,
volume={12},
number={3},
pages={452--454},
url={ http://dx.doi.org/10.1137/0712035 },
}
%got
@article{GeorgeLiu78b,
author={George, A. and Liu, J. W. H.},
year={1978},
title={Algorithms for Matrix Partitioning and the Numerical Solution of Finite Element Systems},
journal=SINUM,
volume={15},
number={2},
pages={297--327},
keywords={partitioning},
url={ http://dx.doi.org/10.1137/0715021 },
}
%got
@article{GeorgeLiu78,
author={George, A. and Liu, J. W. H.},
year={1978},
title={An Automatic Nested Dissection Algorithm for Irregular Finite Element Problems},
journal=SINUM,
volume={15},
number={5},
pages={1053--1069},
keywords={ordering nested dissection},
url={ http://dx.doi.org/10.1137/0715069 },
}
%got
@article{GeorgeLiu79,
author={George, A. and Liu, J. W. H.},
year={1979},
title={The Design of a User Interface for a Sparse Matrix Package},
journal=TOMS,
volume={5},
number={2},
pages={139--162},
url={ http://dx.doi.org/10.1145/355826.355829 },
keywords={software user interface SPARSPAK}
}
%got
@article{GeorgeLiu79c,
author={George, A. and Liu, J. W. H.},
year={1979},
title={An Implementation of a Pseudo-Peripheral Node Finder},
journal=TOMS,
volume={5},
pages={284--295},
keywords={ordering}
}
%got
@article{GeorgeLiu80b,
author={George, A. and Liu, J. W. H.},
year={1980},
title={A Fast Implementation of the Minimum Degree Algorithm Using Quotient Graphs},
journal=TOMS,
volume={6},
number={3},
url={ http://dx.doi.org/10.1145/355900.355906 },
pages={337--358},
annote={see GeorgeLiu79b, 'A Quotient Graph Model for Symmetric
Factorization,' in Sparse Matrix Proceedings 1978, pages 154-175},
}
%got
@article{GeorgeLiu80,
author={George, A. and Liu, J. W. H.},
year={1980},
title={A Minimal Storage Implementation of the Minimum Degree Algorithm},
journal=SINUM,
volume={17},
number={2},
pages={282--299},
keywords={ordering minimum degree},
url={ http://dx.doi.org/10.1137/0717024 },
}
%got
@article{GeorgeLiu80c,
author={George, A. and Liu, J. W. H.},
title={An Optimal Algorithm for Symbolic Factorization of Symmetric Matrices},
year={1980},
journal=SICOMP,
volume={9},
number={3},
pages={583--593},
url={ http://dx.doi.org/10.1137/0209044 },
}
%got (Tim owns hardcopy of book)
@book{GeorgeLiu81,
author={George, A. and Liu, J. W. H.},
title={Computer Solution of Large Sparse Positive Definite Systems},
year={1981},
address={Englewood Cliffs, NJ},
publisher={Prentice-Hall},
keywords={positive definite systems},
}
%got
@article{GeorgeLiu87,
author={George, A. and Liu, J. W. H.},
title={{Householder} reflections versus {Givens} rotations in sparse orthogonal decomposition},
journal=LAA,
year={1987},
volume={88},
pages={223--238},
}
%got
@article{GeorgeLiu89,
author={George, A. and Liu, J. W. H.},
year={1989},
title={The Evolution of the Minimum Degree Ordering Algorithm},
journal=SIREV,
volume={31},
number={1},
pages={1--19},
keywords={minimum degree},
url={ http://dx.doi.org/10.1137/1031001 }
}
%got
@article{GeorgeLiu99,
author={George, A. and Liu, J. W. H.},
title={An object-oriented approach to the design of a user interface for a sparse matrix package},
journal=SIMAX,
year={1999},
volume={20},
number={4},
pages={953--969},
url={ http://dx.doi.org/10.1137/S0895479897317739 },
}
%got
@article{GeorgeLiuNg84,
author={George, A. and Liu, J. W. H. and Ng, E. G.},
year={1984},
title={Row ordering schemes for sparse {Givens} transformations: {I}. {Bipartite} graph model},
journal=LAA,
volume={61},
pages={55--81},
}
%got
@article{GeorgeLiuNg86,
author={George, A. and Liu, J. W. H. and Ng, E. G.},
year={1986},
title={Row ordering schemes for sparse {Givens} transformations: {II}. {Implicit} graph model},
journal=LAA,
volume={75},
pages={203--223},
}
%got
@article{GeorgeLiuNg86b,
author={George, A. and Liu, J. W. H. and Ng, E. G.},
year={1986},
title={Row ordering schemes for sparse {Givens} transformations: {III}. {Analysis} for a model problem},
journal=LAA,
volume={75},
pages={225--240},
}
%got
@article{GeorgeLiuNg88,
author={George, A. and Liu, J. W. H. and Ng, E. G.},
year={1988},
title={A Data Structure for Sparse {QR} and {LU} Factorizations},
journal=SISC,
volume={9},
number={1},
pages={100--121},
url={ http://dx.doi.org/10.1137/0909008 },
}
%got
@article{GeorgeLiuNg89,
author={George, A. and Liu, J. W. H. and Ng, E. G.},
title={Communication results for parallel sparse {Cholesky} factorization on a hypercube},
journal=PC,
volume={10},
number={3},
pages={287--298},
year={1989},
url={ http://dx.doi.org/10.1016/0167-8191(89)90101-4 },
keywords={Parallel computation, linear algebra, sparse linear systems, Cholesky factorization, communication costs, task assignment strategy},
abstract={We consider the problem of reducing data traffic among
processor nodes during the parallel factorization of a sparse matrix on
a hypercube multiprocessor. A task assignment strategy based on the
structure of an elimination tree is presented. This assignment is aimed
at achieving load balancing among the processors and also reducing the
amount of processor-to-processor data communication. An analysis of
regular grid problems is presented, providing a bound on communication
volume generated by the new strategy, and showing that the allocation
scheme is optimal in the asymptotic sense. Some experimental results on
the performance of this scheme are presented.}
}
%got
@article{GeorgeMcIntyre78,
author={George, A. and McIntyre, D. R.},
title={On the application of the minimum degree algorithm to finite element systems},
journal=SINUM,
year={1978},
volume={15},
number={1},
pages={90--112},
url={ http://dx.doi.org/10.1137/0715006 }
}
%got
@article{GeorgeNg83erratum,
author={George, A. and Ng, E. G.},
title={Erratum: On Row and Column Orderings for Sparse Least Squares Problems},
journal=SINUM,
volume={20},
number={4},
pages={872},
year={1983},
url={ http://dx.doi.org/10.1137/0720059 },
}
%got
@article{GeorgeNg83,
author={George, A. and Ng, E. G.},
title={On row and column orderings for sparse least square problems},
journal=SINUM,
year={1983},
volume={20},
number={2},
pages={326--344},
url={ http://dx.doi.org/10.1137/0720022 }
}
%got (Tim has hardcopy)
@article{GeorgeNg84b,
author={George, A. and Ng, E. G.},
title={A new release of {SPARSPAK} - the {Waterloo} sparse matrix package},
journal=SIGNUM,
year={1984},
volume={19},
number={4},
pages={9--13},
}
%got
@techreport{GeorgeNg84,
author={George, A. and Ng, E. G.},
month=nov,
year={1984},
title={{SPARSPAK}: Waterloo Sparse Matrix Package, User's Guide for {SPARSPAK-B}},
institution={Univ. of Waterloo Dept.~of Computer Science},
number={CS-84-37},
keywords={SPARSPAK users guide linear least squares problems},
url={ https://cs.uwaterloo.ca/research/tr/1984/CS-84-37.pdf },
address={Waterloo, Ontario}
}
%got (Tim has hardcopy)
% NOTE(review): year=1985 looks inconsistent with volume 16 -- SIGNUM
% volume 19 appeared in 1984 (see GeorgeNg84b), so volume 16 would predate
% 1985. Verify year/volume against the ACM SIGNUM record.
@article{GeorgeNg85b,
author={George, A. and Ng, E. G.},
title={A brief description of {SPARSPAK} - {Waterloo} sparse linear equations package},
journal=SIGNUM,
year={1985},
volume={16},
number={2},
pages={17--19},
}
%got
@article{GeorgeNg85,
author={George, A. and Ng, E. G.},
year={1985},
title={An Implementation of {Gaussian} Elimination with Partial Pivoting for Sparse Systems},
journal=SISC,
volume={6},
number={2},
pages={390--409},
url={ http://dx.doi.org/10.1137/0906028 },
}
%got
@article{GeorgeNg86,
author={George, A. and Ng, E. G.},
title={Orthogonal reduction of sparse matrices to upper triangular form using {Householder} transformations},
journal=SISC,
year={1986},
volume={7},
number={2},
pages={460--472},
url={ http://dx.doi.org/10.1137/0907031 },
}
%got
@article{GeorgeNg87,
author={George, A. and Ng, E. G.},
year={1987},
title={Symbolic Factorization for Sparse {Gaussian} Elimination with Partial Pivoting},
journal=SISC,
volume={8},
number={6},
pages={877--898},
url={ http://dx.doi.org/10.1137/0908072 },
annote={Report CS-84-43, Dept. of Computer Science, Univ. of Waterloo,
Ontario, 1984.}
}
%got
@article{GeorgeNg88b,
author={George, A. and Ng, E. G.},
year={1988},
title={On the Complexity of Sparse {QR} and {LU} Factorization of Finite-Element Matrices},
journal=SISC,
volume={9},
pages={849--861},
url={ http://dx.doi.org/10.1137/0909057 },
}
%got
@article{GeorgeNg90,
author={George, A. and Ng, E. G.},
title={Parallel sparse {Gaussian} elimination with partial pivoting},
journal=AOR,
year={1990},
volume={22},
number={1},
pages={219--240},
url={ http://dx.doi.org/10.1007/BF02023054 },
}
%got
@article{GeorgePooleVoigt78,
author={George, A. and Poole, W. G. and Voigt, R. G.},
title={Incomplete nested dissection for solving n-by-n grid problems},
journal=SINUM,
year={1978},
volume={15},
number={4},
pages={662--673},
url={ http://dx.doi.org/10.1137/0715044 }
}
%got
@article{GeorgePothen97,
author={George, A. and Pothen, A.},
title={An analysis of spectral envelope-reduction via quadratic assignment problems},
journal=SIMAX,
year={1997},
volume={18},
number={3},
pages={706--732},
url={ http://dx.doi.org/10.1137/S089547989427470X }
}
%got
@article{GeorgeRashwan80,
author={George, A. and Rashwan, H.},
year={1980},
title={On Symbolic Factorization of Partitioned Sparse Symmetric Matrices},
journal=LAA,
volume={34},
pages={145--157},
annote={also in Large Scale Matrix Problems, Bjork, A, Plemmons, R. J., Schneider, H. (eds), North Holland, New York, 1981},
}
%got
@article{GeorgeRashwan85,
author={George, A. and Rashwan, H.},
title={Auxiliary Storage Methods for Solving Finite Element Systems},
journal=SISC,
volume={6},
number={4},
pages={882--910},
year={1985},
url={ http://dx.doi.org/10.1137/0906060 },
}
%%G continued ------------------------------------------------------------------
%got
@inproceedings{GeorgeSaxenaGuptaSinghChoudhury11,
author={George, T. and Saxena, V. and Gupta, A. and Singh, A. and Choudhury, A. R.},
booktitle={Parallel Distributed Processing Symposium (IPDPS), 2011 IEEE International},
title={Multifrontal Factorization of Sparse {SPD} Matrices on {GPUs}},
year={2011},
month={May},
pages={372--383},
abstract={Solving large sparse linear systems is often the most
computationally intensive component of many scientific computing
applications. In the past, sparse multifrontal direct factorization has
been shown to scale to thousands of processors on dedicated
supercomputers resulting in a substantial reduction in computational
time. In recent years, an alternative computing paradigm based on GPUs
has gained prominence, primarily due to its affordability,
power-efficiency, and the potential to achieve significant speedup
relative to desktop performance on regular and structured parallel
applications. However, sparse matrix factorization on GPUs has not been
explored sufficiently due to the complexity involved in an efficient
implementation and concerns of low GPU utilization. In this paper, we
present an adaptive hybrid approach for accelerating sparse
multifrontal factorization based on a judicious exploitation of the
processing power of the host CPU and GPU. We present four different
policies for distributing and scheduling the workload between the host
CPU and the GPU, and propose a mechanism for a runtime selection of the
appropriate policy for each step of sparse Cholesky factorization. This
mechanism relies on auto-tuning based on modeling the best policy
predictor as a parametric classifier. We estimate the classifier
parameters from the available empirical computation time data such that
the expected computation time is minimized. This approach is readily
adaptable for using the current or an extended set of policies for
different CPU-GPU combinations as well as for different combinations of
dense kernels for both the CPU and the GPU.},
url={ http://dx.doi.org/10.1109/IPDPS.2011.44 },
}
%got
@article{GeschiereWijshoff95,
author={Geschiere, J. P. and Wijshoff, H. A. G.},
title={Exploiting large grain parallelism in a sparse direct linear system solver},
journal=PC,
volume={21},
number={8},
pages={1339--1364},
year={1995},
url={ http://dx.doi.org/10.1016/0167-8191(95)00024-I },
keywords={Linear algebra, Sparse linear system, Direct linear system solver, Granularity, Cray Y-MP, Shared-memory multiprocessor, Tasking facility},
abstract={MCSPARSE is a parallel solver based on large grain
parallelism, combined with medium and fine grain parallelism. For the
multiple CPU Cray-systems, the large grain parallelism can be exploited
using Cray's macro-tasking while Cray's micro-tasking facilities can be
used to implement the medium grain parallelism of MCSPARSE. The fine
grain parallelism can be mapped on the parallel fine grain Cray
processor. In this paper, this design together with its performance
results on the Cray Y-MP 4/464 are presented. More specifically, we
study the impact of an elaborate reordering scheme H* on the resulting
efficiency of MCSPARSE.},
}
%got
@article{Gibbs76,
author={Gibbs, N. E.},
month=dec,
year={1976},
title={Algorithm 509: A Hybrid Profile Reduction Algorithm},
journal=TOMS,
volume={2},
number={4},
pages={378--387},
}
%got
@article{GibbsPooleStockmeyer76a,
author={Gibbs, N. E. and Poole, W. G. and Stockmeyer, P. K.},
month=apr,
year={1976},
title={An Algorithm for Reducing the Bandwidth and Profile of a Sparse Matrix},
journal=SINUM,
volume={13},
number={2},
pages={236--250},
}
%got
@article{GibbsPooleStockmeyer76b,
author={Gibbs, N. E. and Poole, W. G. and Stockmeyer, P. K.},
year={1976},
title={A Comparison of Several Bandwidth and Profile Reduction Algorithms},
journal=TOMS,
volume={2},
number={4},
pages={322--330},
}
%%GILBERT ----------------------------------------------------------------------
%got
@article{Gilbert80,
author={Gilbert, J. R.},
title={A Note on the {NP}-Completeness of Vertex Elimination on Directed Graphs},
journal=SIAMJADM,
volume={1},
number={3},
pages={292--294},
year={1980},
url={ http://dx.doi.org/10.1137/0601033 },
}
%got
@article{Gilbert94,
author={Gilbert, J. R.},
title={Predicting structure in sparse matrix computations},
journal=SIMAX,
year={1994},
volume={15},
number={1},
pages={62--79},
url={ http://dx.doi.org/10.1137/S0895479887139455 },
}
%got
@article{GilbertGrigori03,
author={Gilbert, J. R. and Grigori, L.},
title={A Note on the Column Elimination Tree},
journal=SIMAX,
volume={25},
number={1},
pages={143--151},
year={2003},
url={ http://dx.doi.org/10.1137/S0895479801393770 },
}
%got
@article{GilbertHafsteinsson90,
author={Gilbert, J. R. and Hafsteinsson, H.},
title={Parallel symbolic factorization of sparse linear systems},
journal=PC,
volume={14},
number={2},
pages={151--162},
year={1990},
url={ http://dx.doi.org/10.1016/0167-8191(90)90104-H },
keywords={Linear algebra, sparse matrix computation, parallel algorithms, elimination trees},
abstract={The Cholesky factorization of a sparse matrix A can
introduce new nonzeros into the factor matrix. We present an efficient
{CRCW} parallel algorithm to find this fill. Our algorithm takes
$O(\log^2 n)$ time using m* processors, where m* is the number of
nonzeros in the Cholesky factor of A. The algorithm has two stages.
First it finds A's elimination tree, and then uses it to compute the
fill. The part of the algorithm that finds the elimination tree runs in
$O(\log^2 n)$ time using m processors, where m is the number of nonzeros
in A.}
}
%got
@article{GilbertLiNgPeyton01,
author={Gilbert, J. R. and Li, X. S. and Ng, E. G. and Peyton, B. W.},
title={Computing row and column counts for sparse {QR} and {LU} factorization},
journal=BIT,
year={2001},
volume={41},
number={4},
pages={693--710},
url={ http://dx.doi.org/10.1023/A%3A1021943902025 },
}
%got
@article{GilbertLiu93,
author={Gilbert, J. R. and Liu, J. W. H.},
title={Elimination structures for unsymmetric sparse {LU} factors},
journal=SIMAX,
year={1993},
volume={14},
number={2},
pages={334--354},
url={ http://dx.doi.org/10.1137/0614024 },
}
%got
@article{GilbertMillerTeng98,
author={Gilbert, J. R. and Miller, G. L. and Teng, S. H.},
title={Geometric mesh partitioning: {Implementation} and experiments},
journal=SISC,
year={1998},
volume={19},
number={6},
pages={2091--2110},
url={ http://dx.doi.org/10.1137/S1064827594275339 },
}
%got
@article{GilbertMolerSchreiber92,
author={Gilbert, J. R. and Moler, C. and Schreiber, R.},
title={Sparse matrices in {MATLAB}: design and implementation},
journal=SIMAX,
year={1992},
volume={13},
number={1},
pages={333--356},
url={ http://dx.doi.org/10.1137/0613024 },
}
%got
@incollection{GilbertNg93,
author={Gilbert, J. R. and Ng, E. G.},
title={Predicting structure in nonsymmetric sparse matrix factorizations},
pages={107--139},
booktitle={Graph Theory and Sparse Matrix Computation},
series={IMA Volumes in Applied Mathematics},
year={1993},
publisher={Springer-Verlag},
address={New York},
editor={George, A. and Gilbert, J. R. and Liu, J. W. H.},
volume={56},
}
%got
@article{GilbertNgPeyton94,
author={Gilbert, J. R. and Ng, E. G. and Peyton, B. W.},
title={An efficient algorithm to compute row and column counts for sparse {Cholesky} factorization},
journal=SIMAX,
year={1994},
volume={15},
number={4},
pages={1075--1091},
url={ http://dx.doi.org/10.1137/S0895479892236921 },
}
%got
@article{GilbertNgPeyton97,
author={Gilbert, J. R. and Ng, E. G. and Peyton, B. W.},
title={Separators and structure prediction in sparse orthogonal factorization},
journal=LAA,
year={1997},
volume={262},
pages={83--97},
}
%got
@article{GilbertPeierls88,
author={Gilbert, J. R. and Peierls, T.},
year={1988},
title={Sparse Partial Pivoting in Time Proportional to Arithmetic Operations},
journal=SISC,
volume={9},
number={5},
pages={862--874},
keywords={partial pivoting, graph algorithms},
url={ http://dx.doi.org/10.1137/0909058 },
}
%got
@article{GilbertSchreiber92,
author={Gilbert, J. R. and Schreiber, R.},
title={Highly parallel sparse {Cholesky} factorization},
journal=SISC,
year={1992},
volume={13},
number={5},
pages={1151--1172},
url={ http://dx.doi.org/10.1137/0913067 },
}
%got
@article{GilbertTarjan87,
author={Gilbert, J. R. and Tarjan, R. E.},
title={The Analysis of a Nested Dissection Algorithm},
journal=NUMERMATH,
year={1987},
volume={50},
number={4},
pages={377--404},
url={ http://dx.doi.org/10.1007/BF01396660 },
}
%got
@article{GilbertZmijewski87,
author={Gilbert, J. R. and Zmijewski, E.},
title={A Parallel Graph Partitioning Algorithm for a Message-passing Multiprocessor},
journal=IJPP,
year={1987},
volume={16},
number={6},
pages={427--449},
annote={edge cut, but discusses how to convert that to node cut and
a nested-dissection fill-reducing permutation }
}
%%G continued ------------------------------------------------------------------
%got
@article{GillespieOlesky95,
author={Gillespie, M. I. and Olesky, D. D.},
title={Ordering {Givens} rotations for sparse {QR} factorization},
journal=SISC,
year={1995},
volume={16},
number={3},
pages={1024--1041},
url={ http://dx.doi.org/10.1137/S1064827593253343 },
}
%got (Tim owns hardcopy of book)
@book{GolubVanLoan2012,
author={Golub, G. H. and {Van Loan}, C. F.},
title={Matrix Computations},
edition={Fourth},
series={Johns Hopkins Studies in the Mathematical Sciences},
publisher={The Johns Hopkins University Press},
address={Baltimore, London},
year={2012},
}
%got
@article{GonzalezCabaleiroPena00,
author={Gonz\'alez, P. and Cabaleiro, J. C. and Pena, T. F.},
title={On parallel solvers for sparse triangular systems },
journal=JSA,
volume={46},
number={8},
pages={675--685},
year={2000},
url={ http://dx.doi.org/10.1016/S1383-7621(99)00036-3 },
keywords={Triangular systems, Level scheduling, Data driven,
Distributed-memory multiprocessors, Message-Passing Interface (MPI)},
abstract={In this paper we describe and compare two different methods
for solving general sparse triangular systems in distributed memory
multiprocessor architectures. The two methods involve some
preprocessing overheads so they are primarily of interest in solving
many systems with the same coefficient matrix. Both algorithms start
off from the idea of the classical substitution method. The first
algorithm we present introduces a concept of data driven flow and makes
use of non-blocking communications in order to dynamically extract the
inherent parallelism of sparse systems. The second algorithm uses a
reordering technique for the unknowns, so the final system can be
grouped in variable blocksizes where the rows are independent and can
be solved in parallel. This latter technique is called level scheduling
because of the way it is represented in the adjacency graph. These
methods have been tested in the Fujitsu {AP1000} and the Cray {T3D}
and {T3E} multicomputers. The performance has been analysed using
matrices from the Harwell-Boeing collection. }
}
%got
@article{GotoVanDeGeijn08,
author={Goto, K. and {van de Geijn}, R.},
title={High Performance Implementation of the Level-3 {BLAS}},
journal=TOMS,
volume={35},
number={1},
month=jul,
year={2008},
pages={4:1--4:14},
annote={Article 4, 14 pages},
url={ http://dx.doi.org/10.1145/1377603.1377607 },
abstract={A simple but highly effective approach for transforming
high-performance implementations on cache-based architectures of
matrix-matrix multiplication into implementations of other commonly
used matrix-matrix computations (the level-3 BLAS) is presented.
Exceptional performance is demonstrated on various architectures.}
}
%got
@article{GouldScott04,
author={Gould, N. I. M. and Scott, J. A.},
title={A Numerical Evaluation of {HSL} Packages for the Direct Solution of Large Sparse, Symmetric Linear Systems of Equations},
journal=TOMS,
volume={30},
number={3},
month=sep,
year={2004},
pages={300--325},
url={ http://dx.doi.org/10.1145/1024074.1024077 },
abstract={In recent years a number of new direct solvers for the
solution of large sparse, symmetric linear systems of equations have
been added to the mathematical software library HSL. These include
solvers that are designed for the solution of positive-definite systems
as well as solvers that are principally intended for solving indefinite
problems. The available choice can make it difficult for users to know
which solver is the most appropriate for their use. In this study, we
use performance profiles as a tool for evaluating and comparing the
performance of the HSL solvers on an extensive set of test problems
taken from a range of practical applications.},
}
%got
@article{GouldScottHu07,
author={Gould, N. I. M. and Scott, J. A. and Hu, Y.},
title={A Numerical Evaluation of Sparse Solvers for Symmetric Systems},
journal=TOMS,
volume={33},
number={2},
month=jun,
year={2007},
pages={10:1--10:32},
url={ http://dx.doi.org/10.1145/1236463.1236465 },
abstract={In recent years a number of solvers for the direct
solution of large sparse symmetric linear systems of equations have
been developed. These include solvers that are designed for the
solution of positive definite systems as well as those that are
principally intended for solving indefinite problems. In this study, we
use performance profiles as a tool for evaluating and comparing the
performance of serial sparse direct solvers on an extensive set of
symmetric test problems taken from a range of practical applications.},
}
%got
@article{GrigoriBomanDonfackDavis08,
author={Grigori, L. and Boman, E. and Donfack, S. and Davis, T. A.},
title={Hypergraph-based unsymmetric nested dissection ordering for sparse {LU} factorization},
journal=SISC,
year={2010},
volume=32,
number=6,
pages={3426--3446},
url={ http://dx.doi.org/10.1137/080720395 }
}
%got
@article{GrigoriCosnardNg07,
author={Grigori, L. and Cosnard, M. and Ng, E. G.},
title={On the row merge tree for sparse {LU} factorization with partial pivoting},
journal=BIT,
volume=47,
number=1,
year={2007},
url={ http://dx.doi.org/10.1007/s10543-007-0116-1 },
pages={45--76},
}
%got
@article{GrigoriDemmelLi07,
author={Grigori, L. and Demmel, J. W. and Li, X. S.},
title={Parallel symbolic factorization for sparse {LU} with static pivoting},
journal=SISC,
volume=29,
number=3,
pages={1289--1314},
year={2007},
url={ http://dx.doi.org/10.1137/050638102 },
}
%got
@article{GrigoriGilbertCosnard09,
author={Grigori, L. and Gilbert, J. R. and Cosnard, M.},
title={Symbolic and Exact Structure Prediction for Sparse {Gaussian} Elimination with Partial Pivoting},
year={2009},
journal=SIMAX,
volume={30},
number={4},
pages={1520--1545},
keywords={sparse {LU} factorization; partial pivoting; structure prediction; characterization of fill},
url={ http://dx.doi.org/10.1137/050629343 }
}
%got
@article{GrigoriLi07,
author={Grigori, L. and Li, X. S.},
title={Towards an accurate performance modelling of parallel sparse factorization},
journal=AAECC,
year={2007},
volume=18,
number=3,
pages={241--261},
}
%got
@article{GrimesPierceSimon90,
author={Grimes, R. G. and Pierce, D. J. and Simon, H. D.},
title={A new algorithm for finding a pseudoperipheral node in a graph},
journal=SIMAX,
year={1990},
volume={11},
number={2},
pages={323--334},
}
%got
@article{GuermoucheLExcellent06,
author={Guermouche, A. and L'Excellent, J.-Y.},
title={Constructing memory-minimizing schedules for multifrontal methods},
journal=TOMS,
volume={32},
number={1},
pages={17--32},
month=mar,
year={2006},
url={ http://dx.doi.org/10.1145/1132973.1132975 },
abstract={We are interested in the memory usage of multifrontal
methods. Starting from the algorithms introduced by Liu, we propose new
schedules to allocate and process tasks that improve memory usage. This
generalizes two existing factorization and memory-allocation schedules
by allowing a more flexible task allocation together with a specific
tree traversal. We present optimal algorithms for this new class of
schedules, and demonstrate experimentally their benefit for some
real-world matrices from sparse matrix collections where either the
active memory or the total memory is minimized.},
}
%got
@article{GuermoucheLExcellentUtard03,
title={Impact of reordering on the memory of a multifrontal solver},
journal=PC,
volume={29},
number={9},
pages={1191--1218},
year={2003},
annote={Parallel Matrix Algorithms and Applications},
url={ http://dx.doi.org/10.1016/S0167-8191(03)00099-1 },
author={Guermouche, A. and L'Excellent, J.-Y. and Utard, G.},
keywords={Sparse direct solvers, Parallel multifrontal method,
Reordering, Assembly tree, Memory}
}
%got
@article{GunnelsGustavsonHenryVanDeGeijn01,
author={Gunnels, J. A. and Gustavson, F. G. and Henry, G. M. and {van de Geijn}, R. A.},
title={FLAME: Formal Linear Algebra Methods Environment},
journal=TOMS,
volume={27},
number={4},
month=dec,
year={2001},
pages={422--455},
url={ http://dx.doi.org/10.1145/504210.504213 },
keywords={Formal derivation, libraries, linear algebra, performance},
abstract={Since the advent of high-performance distributed-memory
parallel computing, the need for intelligible code has become ever
greater. The development and maintenance of libraries for these
architectures is simply too complex to be amenable to conventional
approaches to implementation. Attempts to employ traditional
methodology have led, in our opinion, to the production of an abundance
of anfractuous code that is difficult to maintain and almost impossible
to upgrade. Having struggled with these issues for more than a decade,
we have concluded that a solution is to apply a technique from
theoretical computer science, formal derivation, to the development of
high-performance linear algebra libraries. We think the resulting
approach results in aesthetically pleasing, coherent code that greatly
facilitates intelligent modularity and high performance while enhancing
confidence in its correctness. Since the technique is
language-independent, it lends itself equally well to a wide spectrum
of programming languages (and paradigms) ranging from C and Fortran to
C++ and Java. In this paper, we illustrate our observations by looking
at the Formal Linear Algebra Methods Environment (FLAME), a framework
that facilitates the derivation and implementation of linear algebra
algorithms on sequential architectures. This environment demonstrates
that lessons learned in the distributed-memory world can guide us
toward better approaches even in the sequential world. We present
performance experiments on the Intel (R) Pentium (R) III processor that
demonstrate that high performance can be attained by coding at a high
level of abstraction.},
}
%%GUPTA ------------------------------------------------------------------------
%got (Tim has hardcopy)
@techreport{Gupta96,
author={Gupta, A.},
title={Fast and effective algorithms for graph partitioning and sparse matrix ordering},
institution={IBM Research Division},
year={1996},
number={RC 20496 (90799)},
address={Yorktown Heights, NY}
}
%got (Tim has hardcopy)
@techreport{Gupta96b,
author={Gupta, A.},
title={{WGPP:} {Watson} graph partitioning},
institution={IBM Research Division},
year={1996},
number={RC 20453 (90427)},
address={Yorktown Heights, NY},
month=may
}
%got
@article{Gupta02,
author={Gupta, A.},
title={Improved Symbolic and Numerical Factorization Algorithms for Unsymmetric Sparse Matrices},
journal=SIMAX,
year={2002},
volume={24},
pages={529--552},
annote={WSMP}
}
%got
@article{Gupta02b,
author={Gupta, A.},
title={Recent Advances in Direct Methods for Solving Unsymmetric Sparse Systems of Linear Equations},
journal=TOMS,
volume={28},
number={3},
pages={301--324},
month=sep,
year={2002},
url={ http://dx.doi.org/10.1145/569147.569149 },
abstract={During the past few years, algorithmic improvements
alone have reduced the time required for the direct solution of
unsymmetric sparse systems of linear equations by almost an order of
magnitude. This paper compares the performance of some well-known
software packages for solving general sparse systems. In particular, it
demonstrates the consistently high level of performance achieved by
WSMP---the most recent of such solvers. It compares the various
algorithmic components of these solvers and discusses their impact on
solver performance. Our experiments show that the algorithmic choices
made in WSMP enable it to run more than twice as fast as the best among
similar solvers and that WSMP can factor some of the largest sparse
matrices available from real applications in only a few seconds on a
4-CPU workstation. Thus, the combination of advances in hardware and
algorithms makes it possible to solve those general sparse linear
systems quickly and easily that might have been considered too large
until recently.},
annote={was tech report Gupta01a}
}
%got
@article{Gupta07,
author={Gupta, A.},
title={A shared- and distributed-memory parallel general sparse direct solver},
journal=AAECC,
year={2007},
volume={18},
number={3},
pages={263--277},
annote={about WSMP}
}
%got
@article{GuptaKarypisKumar97,
author={Gupta, A. and Karypis, G. and Kumar, V.},
title={Highly Scalable Parallel Algorithms for Sparse Matrix Factorization},
journal=IEEETPDS,
year={1997},
volume={8},
number={5},
pages={502--520},
url={ http://dx.doi.org/10.1109/71.598277 },
annote={PSPASES http://www.cs.umn.edu/$\sim$mjoshi/pspases},
abstract={In this paper, we describe scalable parallel algorithms for
symmetric sparse matrix factorization, analyze their performance and
scalability, and present experimental results for up to 1,024
processors on a Cray T3D parallel computer. Through our analysis and
experimental results, we demonstrate that our algorithms substantially
improve the state of the art in parallel direct solution of sparse
linear systems-both in terms of scalability and overall performance. It
is a well known fact that dense matrix factorization scales well and
can be implemented efficiently on parallel computers. In this paper, we
present the first algorithms to factor a wide class of sparse matrices
(including those arising from two- and three-dimensional finite element
problems) that are asymptotically as scalable as dense matrix
factorization algorithms on a variety of parallel architectures. Our
algorithms incur less communication overhead and are more scalable than
any previously known parallel formulation of sparse matrix
factorization. Although, in this paper, we discuss Cholesky
factorization of symmetric positive definite matrices, the algorithms
can be adapted for solving sparse linear least squares problems and for
Gaussian elimination of diagonally dominant matrices that are almost
symmetric in structure. An implementation of one of our sparse Cholesky
factorization algorithms delivers up to 20 GFlops on a Cray T3D for
medium-size structural engineering and linear programming problems. To
the best of our knowledge, this is the highest performance ever
obtained for sparse Cholesky factorization on any supercomputer},
}
%%GUSTAVSON --------------------------------------------------------------------
%got
@incollection{Gustavson72,
author={Gustavson, F. G.},
pages={41--52},
title={Some Basic Techniques for Solving Sparse Systems of Linear Equations},
editor={Rose, D. J. and Willoughby, R. A.},
booktitle={Sparse Matrices and Their Applications},
address={New York},
publisher={Plenum Press},
year={1972},
url={ http://link.springer.com/book/10.1007%2F978-1-4615-8675-3 },
}
%got (Tim has book from TAMU library)
@incollection{Gustavson76,
author={Gustavson, F. G.},
year={1976},
title={Finding the Block Lower Triangular Form of a Sparse Matrix},
editor={Bunch, J. R. and Rose, D. J.},
booktitle={Sparse Matrix Computations},
address={New York},
publisher={Academic Press},
pages={275--290},
keywords={ordering block triangular form}
}
%got
@article{Gustavson78,
author={Gustavson, F. G.},
year={1978},
title={Two Fast Algorithms for Sparse Matrices: Multiplication and Permuted Transposition},
journal=TOMS,
volume={4},
number={3},
pages={250--269},
url={ http://dx.doi.org/10.1145/355791.355796 },
}
%got
@article{GustavsonLinigerWilloughby70,
author={Gustavson, F. G. and Liniger, W. M. and Willoughby, R. A.},
year={1970},
title={Symbolic Generation of an Optimal {Crout} Algorithm for Sparse Systems of Linear Equations},
journal=JACM,
volume={17},
pages={87--109},
url={ http://dx.doi.org/10.1145/321556.321565 },
keywords={Crout algorithm GNSO program code generation,
loop-free code generation}
}
%%H ---------------------------------------------------------------------------
%got
@article{HachtelBraytonGustavson71,
author={Hachtel, G.D. and Brayton, R.K. and Gustavson, F.G.},
journal=IEEETCT,
title={The Sparse Tableau Approach to Network Analysis and Design},
year={1971},
volume={18},
number={1},
pages={101--113},
abstract={The tableau approach to automated network design optimization
via implicit, variable order, variable time-step integration, and
adjoint sensitivity computation is described. In this approach, the
only matrix operation required is that of repeatedly solving linear
algebraic equations of fixed sparsity structure. Required partial
derivatives and numerical integration is done at the branch level
leading to a simple input language, complete generality and maximum
sparsity of the characteristic coefficient matrix. The bulk of
computation and program complexity is thus located in the sparse matrix
routines; described herein are the routines OPTORD and 1-2-3 GNSO.
These routines account for variability type of the matrix elements in
producing a machine code for solution of Ax=b in nested iterations for
which a weighted sum of total operations count and round-off error
incurred in the optimization is minimized.},
keywords={Computer-aided circuit design;Nonlinear networks;Optimization
techniques;Sparse-matrix methods;Capacitance;Computational
modeling;Computer networks;Design automation;Design
optimization;Differential equations;Nonlinear equations;Roundoff
errors;Sparse matrices;Voltage;loop-free code generation},
url={ http://dx.doi.org/10.1109/TCT.1971.1083223 },
month=jan
}
%got
@inproceedings{HadfieldDavis94,
author={Hadfield, S. M. and Davis, T. A.},
year={1994},
title={Potential and Achievable Parallelism in the Unsymmetric-Pattern Multifrontal {LU} Factorization Method for Sparse Matrices},
booktitle={Proceedings of the Fifth SIAM Conf. on Applied Linear Algebra},
address={Snowbird, Utah},
organization={SIAM},
pages={387--391},
}
%got
@article{HadfieldDavis95,
author={Hadfield, S. M. and Davis, T. A.},
title={The use of graph theory in a parallel multifrontal method for sequences of unsymmetric pattern sparse matrices},
journal=CONGNUM,
year={1995},
volume={108},
pages={43--52},
}
%got
@article{Hager02,
author={Hager, W. W.},
title={Minimizing the profile of a symmetric matrix},
journal=SISC,
year={2002},
volume={23},
number={5},
pages={1799--1816},
}
%got
@article{HareJohnsonOleskyVanDenDriessche93,
author={Hare, D. R. and Johnson, C. R. and Olesky, D. D. and {van den Driessche}, P.},
title={Sparsity analysis of the {QR} factorization},
journal=SIMAX,
year={1993},
volume={14},
number={3},
pages={665--669},
url={ http://dx.doi.org/10.1137/0614046 },
}
%got
@article{HeTanWangShi15,
author={He, K. and Tan, S. X.-D. and Wang, H. and Shi, G.},
journal=IEEETVLSI,
title={{GPU}-Accelerated Parallel Sparse {LU} Factorization Method for Fast Circuit Analysis},
year={2015},
volume={PP},
number={99},
abstract={Lower upper (LU) factorization for sparse matrices is the
most important computing step for circuit simulation problems. However,
parallelizing LU factorization on the graphic processing units (GPUs)
turns out to be a difficult problem due to intrinsic data dependence
and irregular memory access, which diminish GPU computing power. In
this paper, we propose a new sparse LU solver on GPUs for circuit
simulation and more general scientific computing. The new method, which
is called GPU accelerated LU factorization (GLU) solver (for GPU LU),
is based on a hybrid right-looking LU factorization algorithm for
sparse matrices. We show that more concurrency can be exploited in the
right-looking method than the left-looking method, which is more
popular for circuit analysis, on GPU platforms. At the same time, the
GLU also preserves the benefit of column-based left-looking LU method,
such as symbolic analysis and column-level concurrency. We show that
the resulting new parallel GPU LU solver allows the parallelization of
all three loops in the LU factorization on GPUs. While in contrast, the
existing GPU-based left-looking LU factorization approach can only
allow parallelization of two loops. Experimental results show that the
proposed GLU solver can deliver 5.71x and 1.46x speedup over the
single-threaded and the 16-threaded PARDISO solvers, respectively,
19.56x speedup over the KLU solver, 47.13x over the UMFPACK solver, and
1.47x speedup over a recently proposed GPU-based left-looking LU solver
on the set of typical circuit matrices from the University of Florida
(UFL) sparse matrix collection. Furthermore, we also compare the
proposed GLU solver on a set of general matrices from the UFL, GLU
achieves 6.38x and 1.12x speedup over the single-threaded and the
16-threaded PARDISO solvers, respectively, 39.39x speedup over the KLU
solver, 24.04x over the UMFPACK solver, and 2.35x speedup over the same
GPU-based left-looking LU solver. In addition, comparison on
self-generated $RLC$ mesh networks shows a similar trend, which further
validates the advantage of the proposed method over the existing sparse
LU solvers.},
keywords={Algorithm design and analysis;Concurrent computing;Graphics processing units;Instruction sets;Parallel processing;Programming;Sparse matrices;Circuit simulation and analysis;graphic processing unit (GPU) parallelization;sparse LU factorization.},
url={ http://dx.doi.org/10.1109/TVLSI.2015.2421287 },
}
%%HEATH ------------------------------------------------------------------------
%got
@article{Heath82,
author={Heath, M. T.},
title={Some Extensions of an Algorithm for Sparse Linear Least Squares Problems},
publisher={SIAM},
year={1982},
journal=SISC,
volume={3},
number={2},
pages={223--237},
url={ http://dx.doi.org/10.1137/0903014 },
}
%got
@article{Heath84,
author={Heath, M. T.},
title={Numerical methods for large sparse linear least squares problems},
journal=SISC,
year={1984},
volume={5},
number={3},
pages={497--513},
url={ http://dx.doi.org/10.1137/0905037 },
}
%got
@article{HeathNgPeyton91,
author={Heath, M. T. and Ng, E. G. and Peyton, B. W.},
title={Parallel algorithms for sparse linear systems},
journal=SIREV,
year={1991},
volume={33},
number={3},
pages={420--460},
annote={also ch 2 in Parallel Algo. for Matrix Computations, SIAM 1990},
url={ http://dx.doi.org/10.1137/1033099 },
}
%got
@article{HeathRaghavan95b,
author={Heath, M. T. and Raghavan, P.},
title={A {Cartesian} parallel nested dissection algorithm},
journal=SIMAX,
year={1995},
volume={16},
number={1},
pages={235--253},
url={ http://dx.doi.org/10.1137/S0895479892238270 },
}
%got
@article{HeathRaghavan97,
author={Heath, M. T. and Raghavan, P.},
title={Performance of a fully parallel sparse solver},
journal=IJSAHPC,
year={1997},
volume={11},
number={1},
pages={49--64},
annote={CAPSS},
url={ http://hpc.sagepub.com/content/11/1/49.abstract },
abstract={The performance of a fully parallel direct solver for large
sparse-symmetric positive definite systems of linear equations is
demonstrated. The solver is designed for distributed-memory,
message-passing parallel computer systems. All phases of the
computation, including symbolic processing as well as numeric
factorization and triangular solution, are performed in parallel. A
parallel Cartesian-nested dissection algorithm is used to compute a
fill- reducing ordering for the matrix and an appropriate partitioning
of the problem across the processors. The separator tree resulting from
nested dissection is used to identify and exploit large-grain
parallelism in the remaining steps of the computation. The parallel
performance of the solver is reported for a series of test problems on
the Thinking Machines CM-5 and the Intel Touchstone Delta. The parallel
efficiency, scalability, and absolute performance of the solver, as
well as the relative importance of the various phases of the
computation, are investigated empirically. }
}
%got
@article{HeathSorensen86,
author={Heath, M. T. and Sorensen, D. C.},
year={1986},
title={A Pipelined {Givens} Method for Computing the {QR} Factorization of a Sparse Matrix},
journal=LAA,
volume={77},
pages={189--203},
keywords={Givens transformations {QR} factorization},
annote={Report ANL/MCS-TM-47, Argonne National Laboratory (which I
have), Mathematics and Computer Science Division, Argonne, Illinois,
Feb. 1985},
url={ http://dx.doi.org/10.1016/0024-3795(86)90168-0 },
abstract={This paper discusses an extension of the pipelined Givens
method for computing the QR factorization of a real mxn matrix to the
case in which the matrix is sparse. When restricted to one process, the
algorithm performs the same computation as the serial sparse Givens
algorithm of George and Heath. Our implementation is compatible with
the data structures used in SPARSPAK. The pipelined algorithm is well
suited to parallel computers having globally shared memory and
low-overhead synchronization primitives, such as the Denelcor HEP, for
which computational results are presented. We point out certain
synchronization problems that arise in the adaptation to the sparse
setting and discuss the effect on parallel speedup of accessing a
serial data file. }
}
%%H continued ------------------------------------------------------------------
%got
@article{HeggernesPeyton08,
author={Heggernes, P. and Peyton, B. W.},
title={Fast Computation of Minimal Fill Inside A Given Elimination Ordering},
publisher={SIAM},
year={2008},
journal=SIMAX,
volume={30},
number={4},
pages={1424--1444},
keywords={sparse matrix computations; minimal fill; elimination
trees; composite tree rotations; maximum cardinality search (MCS);
minimal triangulation},
url={ http://dx.doi.org/10.1137/070680680 },
}
%got
@article{HellermanRarick71,
author={Hellerman, E. and Rarick, D. C.},
year={1971},
title={Reinversion with the Preassigned Pivot Procedure},
journal=MATHPROG,
volume={1},
number={1},
pages={195--216},
keywords={ordering P3 algorithm},
url={ http://dx.doi.org/10.1007/BF01584086 },
}
%got
@incollection{HellermanRarick72,
author={Hellerman, E. and Rarick, D. C.},
title={The Partitioned Preassigned Pivot Procedure ({P4})},
pages={67--76},
editor={Rose, D. J. and Willoughby, R. A.},
booktitle={Sparse Matrices and Their Applications},
address={New York},
publisher={Plenum Press},
year={1972},
url={ http://link.springer.com/book/10.1007%2F978-1-4615-8675-3 },
}
%%HENDRICKSON -----------------------------------------------------------------
%got
@article{HendricksonLeland95a,
author={Hendrickson, B. and Leland, R.},
title={An improved spectral graph partitioning algorithm for mapping parallel computations},
journal=SISC,
year={1995},
volume={16},
number={2},
pages={452--469},
annote={edge cut. Chaco. does not discuss node separators, nor sparse matrix orderings},
url={ http://dx.doi.org/10.1137/0916028 },
}
%GET
@inproceedings{HendricksonLeland95b,
author={Hendrickson, B. and Leland, R.},
title={A multi-level algorithm for partitioning graphs},
booktitle={Supercomputing '95: Proc. 1995 ACM/IEEE Conf. on Supercomputing},
year={1995},
pages={28},
url={ http://doi.ieeecomputersociety.org/10.1109/SUPERC.1995.3 },
publisher={IEEE Computer Society},
address={Los Alamitos, CA, USA},
}
%GET
@techreport{HendricksonLeland95c,
author={Hendrickson, B. and Leland, R.},
title={The {Chaco} user's guide: Version 2.0},
year={1995},
number={SAND95-2344},
institution={Sandia National Laboratories}
}
%got
@article{HendricksonRothberg98,
author={Hendrickson, B. and Rothberg, E.},
title={Improving the runtime and quality of nested dissection ordering},
journal=SISC,
year={1998},
volume={20},
number={2},
pages={468--489},
annote={Chaco},
url={ http://dx.doi.org/10.1137/S1064827596300656 },
}
%%H continued ------------------------------------------------------------------
%got
@article{HenonRametRoman02,
author={H\'enon, P. and Ramet, P. and Roman, J.},
title={{PaStiX}: A High-Performance Parallel Direct Solver for Sparse Symmetric Definite Systems},
journal=PC,
year={2002},
volume={28},
number={2},
pages={301--321},
url={ http://dx.doi.org/10.1016/S0167-8191(01)00141-7 },
abstract={Solving large sparse symmetric positive definite systems of
linear equations is a crucial and time-consuming step, arising in many
scientific and engineering applications. This paper considers the block
partitioning and scheduling problem for sparse parallel factorization
without pivoting. There are two major aims to this study: the
scalability of the parallel solver, and the compromise between memory
overhead and efficiency. Parallel experiments on a large collection of
irregular industrial problems validate our approach.}
}
%got
@article{HoLee90,
author={Ho, C.-W. and Lee, R. C. T.},
title={A Parallel Algorithm for Solving Sparse Triangular Systems},
journal=IEEETC,
year={1990},
volume={39},
number={6},
pages={848--852},
abstract={A fast parallel algorithm, which is generalized from the
parallel algorithms for solving banded linear systems, is proposed to
solve sparse triangular systems. The original problem is transformed
into a directed graph. The solving procedure then consists of
eliminating edges in this graph. The worst-case time-complexity of this
parallel algorithm is $O(\log^2n)$ where n is the size of the coefficient
matrix. When the coefficient matrix is a triangular banded matrix with
bandwidth m, then the time-complexity of the algorithm is
O(log(m)log(n)).}
}
%%HOGG -------------------------------------------------------------------------
%got
@article{HoggOvtchinnikovScott16,
author={Hogg, J. D. and Ovtchinnikov, E. and Scott, J. A.},
title={A sparse symmetric indefinite direct solver for {GPU} architectures },
journal=TOMS,
volume={42},
pages={1:1--1:25},
year={2016},
url={ http://doi.acm.org/10.1145/2756548 },
abstract={In recent years, there has been considerable interest in
the potential for graphics processing units (GPUs) to speed up the
performance of sparse direct linear solvers. Efforts have focused on
symmetric positive definite systems for which no pivoting is required
while little progress has been reported for the much harder indefinite
case. We address this challenge by designing and developing a sparse
symmetric indefinite solver SSIDS. This new library-quality solver is
designed for use on GPU architectures and incorporates threshold
partial pivoting within a multifrontal approach. Both the factorize and
the solve phases are performed using the GPU. Another important feature
is that the solver produces bit-compatible results. Numerical results
for indefinite problems arising from a range of practical applications
demonstrate that, for large problems, SSIDS achieves performance
improvements of up to a factor of 7 compared with a state-of-the-art
multifrontal solver on a multicore CPU.},
keywords={direct solver , bit compatibility , indefinite symmetric
systems , sparse linear systems , multifrontal , GPU},
}
%got
@article{HoggReidScott10,
author={Hogg, J. D. and Reid, J. K. and Scott, J. A.},
title={Design of a multicore sparse {Cholesky} factorization using {DAGs}},
journal=SISC,
volume={32},
number={6},
pages={3627--3649},
annote={was HoggReidScott09 tech report},
year={2010},
}
%got
@article{HoggScott13d,
author={Hogg, J. D. and Scott, J. A.},
title={An efficient analyse phase for element problems},
journal=NLAA,
volume={20},
number={3},
pages={397--412},
year={2013},
}
%got
@article{HoggScott13,
author={Hogg, J. D. and Scott, J. A.},
title={New Parallel Sparse Direct Solvers for Multicore Architectures},
journal=ALGO,
volume={6},
number={4},
pages={702--725},
year={2013},
abstract={ At the heart of many computations in science and engineering
lies the need to efficiently and accurately solve large sparse linear
systems of equations. Direct methods are frequently the method of
choice because of their robustness, accuracy and potential for use as
black-box solvers. In the last few years, there have been many new
developments, and a number of new modern parallel general-purpose
sparse solvers have been written for inclusion within the HSL
mathematical software library. In this paper, we introduce and briefly
review these solvers for symmetric sparse systems. We describe the
algorithms used, highlight key features (including bit-compatibility
and out-of-core working) and then, using problems arising from a range
of practical applications, we illustrate and compare their
performances. We demonstrate that modern direct solvers are able to
accurately solve systems of order $10^6$ in less than 3 minutes on a
16-core machine. },
}
%got
@article{HoggScott13c,
author={Hogg, J. D. and Scott, J. A.},
title={Optimal weighted matchings for rank-deficient sparse matrices},
journal=SIMAX,
volume={34},
number={4},
pages={1431--1447},
year={2013},
}
%got
@article{HoggScott13b,
author={Hogg, J. D. and Scott, J. A.},
title={Pivoting strategies for tough sparse indefinite systems},
journal=TOMS,
volume={40},
number={1},
year={2013},
pages={4:1--4:19},
url={ http://dx.doi.org/10.1145/2513109.2513113 },
abstract={The performance of a sparse direct solver is dependent
upon the pivot sequence that is chosen before the factorization begins.
In the case of symmetric indefinite systems, it may be necessary to
modify this sequence during the factorization to ensure numerical
stability. These modifications can have serious consequences in terms
of time as well as the memory and flops required for the factorization
and subsequent solves. This study focuses on hard-to-solve sparse
symmetric indefinite problems for which standard threshold partial
pivoting leads to significant modifications. We perform a detailed
review of pivoting strategies that are aimed at reducing the
modifications without compromising numerical stability. Extensive
numerical experiments are performed on a set of tough problems arising
from practical applications. Based on our findings, we make
recommendations on which strategy to use and, in particular, a
matching-based approach is recommended for numerically challenging
problems.},
}
%got
@article{HoggScott14,
author={Hogg, J. D. and Scott, J. A.},
title={Compressed threshold pivoting for sparse symmetric indefinite systems},
journal=SIMAX,
volume={35},
number={2},
pages={783--817},
year={2014},
}
%%H continued ------------------------------------------------------------------
%got
@article{HoitWilson83,
author={Hoit, M. and Wilson, E. L.},
year={1983},
title={An Equation Numbering Algorithm Based on a Minimum Front Criteria},
journal=CAS,
volume={16},
pages={225--239},
number={1-4},
url={ http://dx.doi.org/10.1016/0045-7949(83)90163-3 },
abstract={An algorithm is presented which defines the optimum
equation numbering sequence for the Gauss Elimination solution of
finite element systems. The equation numbering sequence generated is
identical to the equation solving sequence used in the well-known
'Frontal Method', in which the next equation to be eliminated is
determined by the minimum front criteria. After the numbering sequence
is determined, the equations are stored in profile form. Therefore, the
new algorithm is named the 'Profile-Front Minimization' or 'PFM'
algorithm. In this paper, existing methods for equation renumbering are
discussed. The theoretical development and the computer implementation
of the {PFM} method are then presented. Solution times and storage
requirements for several examples are summarized and the {PFM}
algorithm is compared to other techniques. In all the examples studied,
the {PFM} algorithm produced equation numbering sequences which
resulted in the minimum solution times. In addition, the {PFM}
method, for higher order finite element systems, requires significantly
less computer storage than other methods. The Fortran listing of the
subroutines for the {PFM} algorithm are given. These can be used
directly by Profile or Frontal solvers since both nodal and element
numbering sequences are produced. These subroutines have proven to be
extremely effective when used on microcomputers.}
}
%got
@article{Hood76,
author={Hood, P.},
year={1976},
title={Frontal Solution Program for Unsymmetric Matrices},
journal=IJNME,
volume={10},
number={2},
pages={379--400},
url={ http://dx.doi.org/10.1002/nme.1620100209 },
keywords={frontal methods unsymmetric matrices},
abstract={A frontal solution program is presented which may be used for
the solution of unsymmetric matrix equations arising in certain
applications of the finite element method to boundary value problems.
Based on the Gaussian elimination algorithm, it has advantages over
band matrix methods in that core requirements and computation times may
be considerably reduced; furthermore numbering of the finite element
mesh may be completed in an arbitrary manner. The program is written in
FORTRAN and a glossary of terms is provided. }
}
%got
@article{HopcroftKarp73,
author={Hopcroft, J. E. and Karp, R. M.},
title={An $n^{5/2}$ algorithm for maximum matchings in bipartite graphs},
journal=SICOMP,
year={1973},
pages={225--231},
volume={2},
url={ http://dx.doi.org/10.1137/0202019 },
}
%got
@article{HuangWing79,
author={Huang, J. W. and Wing, O.},
year={1979},
title={Optimal Parallel Triangulation of a Sparse Matrix},
journal=IEEETCS,
volume={CAS-26},
number={9},
pages={726--732},
keywords={parallel}
}
%got
@article{HulbertZmijewski91,
author={Hulbert, L. and Zmijewski, E.},
title={Limiting Communication in Parallel Sparse {Cholesky} Factorization},
journal=SISC,
year={1991},
volume={12},
number={5},
pages={1184--1197},
url={ http://dx.doi.org/10.1137/0912063 },
abstract={A new parallel algorithm for computing the Cholesky
factorization of a large sparse positive-definite matrix on a
message-passing multiprocessor is developed. The algorithm attempts to
reduce the communication overhead by redistributing the computational
load and by repeatedly combining the effect of many messages into a
single message. It is demonstrated experimentally that, for problems
ordered and partitioned among the processors using nested dissection,
the new algorithm communicates significantly fewer messages than a more
straightforward approach. Because of this reduction in communication,
for the test problems on an Intel iPSC/2 hypercube, the new algorithm
is typically at least 20 percent faster. Theoretically, it is shown
that in factoring a $k \times k$ grid on p processors, the new
algorithm sends $\Theta (pk\log_2 p)$ messages, compared to
$\Theta (pk\log _2 k)$ messages for the straightforward algorithm. }
}
%%I ---------------------------------------------------------------------------
%got
@article{Igual12,
title={The {FLAME} approach: From dense linear algebra algorithms to high-performance multi-accelerator implementations},
journal=JPDC,
volume={72},
number={9},
pages={1134--1143},
year={2012},
annote={Accelerators for High-Performance Computing },
url={ http://dx.doi.org/10.1016/j.jpdc.2011.10.014 },
author={Igual, F. D. and Chan, E. and Quintana-Ort{\'\i}, E. S. and Quintana-Ort{\'\i}, G. and {van de Geijn}, R. A. and {Van Zee}, F. G.},
keywords={Dense linear algebra libraries, Graphics processors, Runtime systems, High performance computing},
abstract={Parallel accelerators are playing an increasingly important
role in scientific computing. However, it is perceived that their
weakness nowadays is their reduced 'programmability' in comparison with
traditional general-purpose CPUs. For the domain of dense linear
algebra, we demonstrate that this is not necessarily the case. We show
how the libflame library carefully layers routines and abstracts
details related to storage and computation, so that extending it to
take advantage of multiple accelerators is achievable without
introducing platform specific complexity into the library code base. We
focus on the experience of the library developer as he develops a
library routine for a new operation, reduction of a generalized
Hermitian positive definite eigenvalue problem to a standard Hermitian
form, and configures the library to target a multi-GPU platform. It
becomes obvious that the library developer does not need to know about
the parallelization or the details of the multi-accelerator platform.
Excellent performance on a system with four NVIDIA Tesla C2050
GPUs is reported. This makes libflame the first library to be
released that incorporates multi-GPU functionality for dense matrix
computations, setting a new standard for performance. }
}
%got
@article{Irons70,
author={Irons, B. M.},
year={1970},
title={A Frontal Solution Program for Finite Element Analysis},
journal=IJNME,
volume={2},
pages={5--32},
keywords={frontal methods},
url={ http://dx.doi.org/10.1002/nme.1620020104 },
abstract={The program given here assembles and solves symmetric
positive-definite equations as met in finite element applications. The
technique is more involved than the standard band-matrix algorithms,
but it is more efficient in the important case when two-dimensional or
three-dimensional elements have other than corner nodes. Artifices are
included to improve efficiency when there are many right hand sides, as
in automated design. The organization of the program is described with
reference to diagrams, full notation, specimen input data and
supplementary comments on the ASA FORTRAN print-out.}
}
%got
@article{IronyShklarskiToledo04,
author={Irony, D. and Shklarski, G. and Toledo, S.},
title={Parallel and fully recursive multifrontal sparse {Cholesky}},
journal=FGCS,
volume={20},
number={3},
pages={425--440},
year={2004},
url={ http://dx.doi.org/10.1016/j.future.2003.07.007 }
}
%%J ---------------------------------------------------------------------------
%got
@article{Jennings66,
author={Jennings, A.},
title={A Compact Storage Scheme for the Solution of Symmetric Linear Simultaneous Equations},
journal=CJ,
volume={9},
number={3},
pages={281--285},
year={1966},
url={ http://dx.doi.org/10.1093/comjnl/9.3.281 },
abstract={A method is presented for the computer solution of symmetric
linear simultaneous equations which takes advantage of the presence of
zero elements away from the leading diagonal but which is more flexible
than diagonal band storage. The equations are solved by a form of
compact elimination. A routine based on the method has been programmed
in Atlas Autocode and used in some analyses of building frameworks. It
is found to be economical both in use of store and computing time.},
}
%got
@article{JessKees82,
author={Jess, J. A. G. and Kees, H. G. M.},
year={1982},
title={A Data Structure for Parallel {LU} Decomposition},
journal=IEEETC,
volume={C-31},
number={3},
pages={231--239},
keywords={parallel},
abstract={Some new results are presented concerning the
pivoting of large systems of linear equations with respect to parallel
processing techniques. It will be assumed that the processing of a
pivot takes one time slot. The pivoting problem is studied by means of
an associated graph model. Given a triangulated graph a set of label
classes is established. Class k contains all pivots which may be
processed in parallel during the kth time slot. The label classes are
used to establish the elimination-tree (e-tree). The e-tree is a
spanning tree for the given graph. The critical path in the e-tree
indicates the minimum number of time slots necessary to complete the
L/U-decomposition. Furthermore, the earliest and latest admissible time
slot for the processing of every pivot may be derived, such that the
critical path is not affected. The e-tree can be seen as a data
structure to guide parallel processing based on sparsity.}
}
%got
@article{JohnsonDulmageMendelsohn62,
author={Johnson, D. M. and Dulmage, A. L. and Mendelsohn, N. S.},
title={Connectivity and reducibility of graphs},
journal=CANMATH,
volume={14},
year={1962},
pages={529--539},
url={ http://dx.doi.org/10.4153/CJM-1962-044-0 }
}
%got
@incollection{GuptaGustavsonJoshiKarypisKumar99b,
author={Joshi, M. and Karypis, G. and Kumar, V. and Gupta, A. and Gustavson, F.},
title={{PSPASES}: an efficient and scalable parallel sparse direct solver},
booktitle={Kluwer Intl. Series in Engineering and Science},
publisher={Kluwer},
volume={515},
year={1999},
editor={Yang, Tianruo},
url={ http://www-users.cs.umn.edu/~mjoshi/pspases/ }
}
%%KARYPIS ----------------------------------------------------------------------
%GET
@article{KarypisAggarwalKumarShekar99,
author={Karypis, G. and Aggarwal, R. and Kumar, V. and Shekhar, S.},
title={Multilevel hypergraph partitioning: applications in {VLSI} domain},
journal=IEEETVLSI,
volume={7},
number={1},
pages={69--79},
year={1999},
publisher={IEEE}
}
%got
@article{KarypisKumar98e,
author={Karypis, G. and Kumar, V.},
title={A fast and high quality multilevel scheme for partitioning irregular graphs},
journal=SISC,
year={1998},
volume={20},
pages={359--392},
url={ http://dx.doi.org/10.1137/S1064827595287997 },
abstract={Recently, a number of researchers have investigated a class
of graph partitioning algorithms that reduce the size of the graph by
collapsing vertices and edges, partition the smaller graph, and then
uncoarsen it to construct a partition for the original graph [Bui and
Jones, Proc. of the 6th SIAM Conference on Parallel Processing for
Scientific Computing, 1993, 445--452; Hendrickson and Leland, A
Multilevel Algorithm for Partitioning Graphs, Tech. report SAND
93-1301, Sandia National Laboratories, Albuquerque, NM, 1993]. From the
early work it was clear that multilevel techniques held great promise;
however, it was not known if they can be made to consistently produce
high quality partitions for graphs arising in a wide range of
application domains. We investigate the effectiveness of many different
choices for all three phases: coarsening, partition of the coarsest
graph, and refinement. In particular, we present a new coarsening
heuristic (called heavy-edge heuristic) for which the size of the
partition of the coarse graph is within a small factor of the size of
the final partition obtained after multilevel refinement. We also
present a much faster variation of the Kernighan--Lin (KL) algorithm
for refining during uncoarsening. We test our scheme on a large number
of graphs arising in various domains including finite element methods,
linear programming, VLSI, and transportation. Our experiments show that
our scheme produces partitions that are consistently better than those
produced by spectral partitioning schemes in substantially smaller
time. Also, when our scheme is used to compute fill-reducing orderings
for sparse matrices, it produces orderings that have substantially
smaller fill than the widely used multiple minimum degree algorithm.},
annote={Metis}
}
%got
@article{KarypisKumar98,
author={Karypis, G. and Kumar, V.},
title={A parallel algorithm for multilevel graph partitioning and sparse matrix ordering},
journal=JPDC,
year={1998},
volume={48},
number={1},
pages={71--95},
annote={see also tech report TR-95-036},
url={ http://dx.doi.org/10.1006/jpdc.1997.1403 },
abstract={In this paper we present a parallel formulation of the
multilevel graph partitioning and sparse matrix ordering algorithm. A
key feature of our parallel formulation (that distinguishes it from
other proposed parallel formulations of multilevel algorithms) is that
it partitions the vertices of the graph into sqrt(p) parts while
distributing the overall adjacency matrix of the graph among
all p processors. This mapping results in substantially smaller
communication than one-dimensional distribution for graphs with
relatively high degree, especially if the graph is randomly distributed
among the processors. We also present a parallel algorithm for
computing a minimal cover of a bipartite graph which is a key operation
for obtaining a small vertex separator that is useful for computing the
fill reducing ordering of sparse matrices. Our parallel algorithm
achieves a speedup of up to 56 on 128 processors for moderate size
problems, further reducing the already moderate serial run time of
multilevel schemes. Furthermore, the quality of the produced partitions
and orderings are comparable to those produced by the serial multilevel
algorithm that has been shown to outperform both spectral partitioning
and multiple minimum degree.},
annote={ParMetis}
}
%GET
@techreport{KarypisKumar98f,
title={{hMETIS} 1.5: A hypergraph partitioning package},
author={Karypis, G. and Kumar, V.},
year={1998},
note={Department of Computer Science, University of Minnesota},
}
%got
@article{KarypisKumar00,
author={Karypis, G. and Kumar, V.},
title={Multilevel k-way Hypergraph Partitioning},
volume={11},
pages={285--300},
journal=VLSI,
year={2000},
annote={hMetis}
}
%%K continued -----------------------------------------------------------------
%got
@article{KayaaslanPinarCatalyurekAykanat12,
author={Kayaaslan, E. and Pinar, A. and \c{C}ataly\"urek, \"U. V. and Aykanat, C.},
title={Partitioning Hypergraphs in Scientific Computing Applications through Vertex Separators on Graphs},
journal=SISC,
volume={34},
number={2},
pages={A970--A992},
year={2012},
url={ http://dx.doi.org/10.1137/100810022 },
abstract={The modeling flexibility provided by hypergraphs has drawn a
lot of interest from the combinatorial scientific community, leading to
novel models and algorithms, their applications, and development of
associated tools. Hypergraphs are now a standard tool in combinatorial
scientific computing. The modeling flexibility of hypergraphs, however,
comes at a cost: algorithms on hypergraphs are inherently more
complicated than those on graphs, which sometimes translates to
nontrivial increases in processing times. Neither the modeling
flexibility of hypergraphs nor the runtime efficiency of graph
algorithms can be overlooked. Therefore, the new research thrust should
be how to cleverly trade off between the two. This work addresses one
method for this trade-off by solving the hypergraph partitioning
problem by finding vertex separators on graphs. Specifically, we
investigate how to solve the hypergraph partitioning problem by seeking
a vertex separator on its net intersection graph (NIG), where each net
of the hypergraph is represented by a vertex, and two vertices share an
edge if their nets have a common vertex. We propose a vertex-weighting
scheme to attain good node-balanced hypergraphs, since the NIG model
cannot preserve node-balancing information. Vertex-removal and
vertex-splitting techniques are described to optimize cut-net and
connectivity metrics, respectively, under the recursive bipartitioning
paradigm. We also developed implementations of our proposed hypergraph
partitioning formulations by adopting and modifying a state-of-the-art
graph partitioning by vertex separator tool onmetis. Experiments
conducted on a large collection of sparse matrices demonstrate the
effectiveness of our proposed techniques. }
}
%got
@article{KernighanLin70,
author={Kernighan, B. W. and Lin, S.},
title={An efficient heuristic procedure for partitioning graphs},
journal=BELL,
year={1970},
volume={49},
number=2,
pages={291-307},
}
%got
@article{KimEijkhout14,
author={Kim, Kyungjoo and Eijkhout, Victor},
title={A Parallel Sparse Direct Solver via Hierarchical {DAG} Scheduling},
journal=TOMS,
volume=41,
number=1,
year={2014},
pages={3:1--3:27},
abstract={We present a parallel sparse direct solver for multi-core
architectures based on Directed Acyclic Graph (DAG) scheduling.
Recently, DAG scheduling has become popular in advanced Dense Linear
Algebra libraries due to its efficient asynchronous parallel execution
of tasks. However, its application to sparse matrix problems is more
challenging as it has to deal with an enormous number of highly
irregular tasks. This typically results in substantial scheduling
overhead both in time and space, which causes overall parallel
performance to be suboptimal. We describe a parallel solver based on
two-level task parallelism: tasks are first generated from a parallel
tree traversal on the assembly tree; next, those tasks are further
refined by using algorithms-by-blocks to gain fine-grained parallelism.
The resulting fine-grained tasks are asynchronously executed after
their dependencies are analyzed. Our approach is distinct from others
in that we adopt two-level task scheduling to mirror the two-level
parallelism. As a result we reduce scheduling overhead, and increase
efficiency and flexibility. The proposed parallel sparse direct solver
is evaluated for the particular problems arising from the hp-Finite
Element Method where conventional sparse direct solvers do not scale
well.},
}
%got
@article{King70,
author={King, I. P.},
year={1970},
title={An Automatic Reordering Scheme for Simultaneous Equations Derived from Network Systems},
journal=IJNME,
volume={2},
pages={523--533},
url={ http://dx.doi.org/10.1002/nme.1620020406 },
abstract={A procedure is presented for the automatic renumbering of
network type equations prior to solution by sparse matrix techniques.
The method takes an arbitrary input sequence and creates an order which
makes possible solution of complex network systems by reducing demand
for computer storage and time. Examples are presented to demonstrate
the procedure in action and show that, even in cases where the system
has been carefully numbered, considerable improvement can be achieved.
The procedure is also applicable to band matrix solution techniques.}
}
%got
@article{Knuth72,
author={Knuth, D. E.},
title={{George} {Forsythe} and the development of computer science},
journal=CACM,
volume={15},
number={8},
pages={721--726},
year={1972},
}
%got
@inproceedings{KosterBisseling94,
author={Koster, J. and Bisseling, R. H.},
title={An improved algorithm for parallel sparse {LU} decomposition on a distributed-memory multiprocessor},
booktitle={Proc. Fifth SIAM Conference on Applied Linear Algebra},
year={1994},
pages={397--401},
publisher={SIAM},
address={Snowbird, Utah},
month={June}
}
%got
@article{Kratzer92,
author={Kratzer, S. G.},
title={Sparse {QR} factorization on a massively parallel computer},
journal=JSUPER,
volume=6,
pages={237--255},
number={3-4},
year={1992},
url={ http://dx.doi.org/10.1007/BF00155801 },
}
%got
@incollection{KratzerCleary93,
author={Kratzer, S. G. and Cleary, A. J.},
title={Sparse matrix factorization on {SIMD} parallel computers},
pages={211--228},
booktitle={Graph Theory and Sparse Matrix Computation},
series={IMA Volumes in Applied Mathematics},
year={1993},
publisher={Springer-Verlag},
address={New York},
editor={George, A. and Gilbert, J. R. and Liu, J. W. H.},
volume={56},
}
%got
@inproceedings{KrawezikPoole09,
author={Krawezik, G. and Poole, G.},
title={Accelerating the {ANSYS} Direct Sparse Solver with {GPUs}},
year={2009},
publisher={NCSA},
address={Urbana-Champaign, IL},
booktitle={Proc. Symposium on Application Accelerators in High Performance Computing ({SAAHPC})},
url={ http://saahpc.ncsa.illinois.edu/09/papers/Krawezik_paper.pdf }
}
%got
@article{KruskalRudolphSnir89,
author={Kruskal, C. P. and Rudolph, L. and Snir, M.},
title={Techniques for parallel manipulation of sparse matrices},
journal=TCS,
year={1989},
volume={64},
number={2},
pages={135--157},
url={ http://dx.doi.org/10.1016/0304-3975(89)90058-3 },
abstract={New techniques are presented for the manipulation of sparse
matrices on parallel {MIMD} computers. We consider the following
problems: matrix addition, matrix multiplication, row and column
permutation, matrix transpose, matrix vector multiplication, and
Gaussian elimination. }
}
%got
@inproceedings{KumarEswarSadayappanHuang94,
author={Kumar, B. and Eswar, K. and Sadayappan, P. and Huang, C.-H.},
booktitle={Proceedings of the Scalable High-Performance Computing Conference, 1994},
title={A reordering and mapping algorithm for parallel sparse {Cholesky} factorization},
year={1994},
month={May},
pages={803--810},
abstract={A judiciously chosen symmetric permutation can significantly
reduce the amount of storage and computation for the Cholesky
factorization of sparse matrices. On distributed memory machines, the
issue of mapping data and computation onto processors is also
important. Previous research on ordering for parallelism has focussed
on idealized measures like execution time on an unbounded number of
processors, with zero communication costs. In this paper, we propose an
ordering and mapping algorithm that attempts to minimize communication
and performs load balancing of work among the processors. Performance
results on an Intel iPSC/860 hypercube are presented to demonstrate its
effectiveness},
url={ http://dx.doi.org/10.1109/SHPCC.1994.296723 },
}
%got
@article{KumarEswarSadayappanHuang95,
author={Kumar, B. and Eswar, K. and Sadayappan, P. and Huang, C.-H.},
title={A Clustering Algorithm for Parallel Sparse {Cholesky} Factorization},
journal=PARALETTERS,
volume={5},
pages={685--696},
year={1995},
url={ http://dx.doi.org/10.1142/S0129626495000606 },
abstract={This paper presents an integrated approach to two issues
relevant to efficient parallel sparse Cholesky factorization: 1) matrix
reordering for parallelism, and, 2) mapping of data to processors. A
clustering heuristic is proposed to perform a fill-preserving
reordering and mapping of data onto a fixed number of processors.
Performance results on a Cray T3D are presented to demonstrate its
effectiveness.}
}
%got
@article{KumarKumarBasu92,
author={Kumar, P. S. and Kumar, M. K. and Basu, A.},
title={A parallel algorithm for elimination tree computation and symbolic factorization },
journal=PC,
volume={18},
number={8},
pages={849--856},
year={1992},
url={ http://dx.doi.org/10.1016/0167-8191(92)90031-2 },
keywords={Sparse Cholesky decomposition, symbolic factorization, elimination tree, local memory multiprocessor},
abstract={The notion of an elimination tree plays a very important
role in the parallel algorithms for sparse Cholesky decomposition,
symbolic factorization and in determining the mapping of columns of the
matrix to processors. In this paper, we present a parallel algorithm to
compute the elimination tree and simultaneously carry out symbolic
factorization on a local memory multiprocessor. An existing parallel
algorithm for symbolic factorization [5] requires the computation of
elimination tree separately. In our algorithm, we use a tree defined on
the given matrix, called false elimination tree, and convert it into
the actual elimination tree. In the process, we also compute the
structure of the columns of the factor matrix. Using the new parallel
algorithm on grid problems, we found that it performs 2 to 3 times
faster compared to the total time taken for sequential computation of
the elimination tree and the parallel computation of symbolic
factorization using [5]. Also, our algorithm is the first parallel
algorithm for elimination tree computation that gives a speed-up. }
}
%got
@article{KumarKumarBasu93,
author={Kumar, P. S. and Kumar, M. K. and Basu, A.},
title={Parallel algorithms for sparse triangular system solution},
journal=PC,
year={1993},
volume={19},
pages={187--196},
number={2},
url={ http://dx.doi.org/10.1016/0167-8191(93)90048-P },
keywords={Local-memory multiprocessor systems, parallel algorithms, sparse Cholesky factorization, sparse triangular systems, elimination tree},
abstract={We present new local-memory multiprocessor algorithms for
solving sparse triangular systems of equations that arise in the
context of Cholesky factorization. Unlike in the existing algorithms,
we use the notion of the elimination tree and achieve significant
improvement in the performance of both the forward and backward
substitution phases. Our algorithms also incorporate the generalization
of an important technique of Li and Coleman that gave rise to the best
performance for dense triangular system solution. }
}
%got
@article{KumfertPothen97,
author={Kumfert, G. K. and Pothen, A.},
title={Two improved algorithms for reducing the envelope and wavefront},
journal=BIT,
volume={37},
number={3},
pages={559--590},
year={1997},
}
%got
@incollection{Kundert86,
author={Kundert, K. S.},
year={1986},
title={Sparse Matrix Techniques and Their Applications to Circuit Simulation},
editor={Ruehli, A. E.},
booktitle={Circuit Analysis, Simulation and Design},
publisher={North-Holland},
address={New York},
keywords={application circuit simulations survey}
}
%%L ---------------------------------------------------------------------------
%got
@techreport{LacosteRametFavergeIchitaroDongarra12,
title={Sparse direct solvers with accelerators over {DAG} runtimes},
author={Lacoste, X. and Ramet, P. and Faverge, M. and Yamazaki, I. and Dongarra, J.},
pages={11},
institution={INRIA},
number={RR-7972},
year={2012},
url={ http://hal.inria.fr/hal-00700066 },
abstract={The current trend in the high performance computing shows
a dramatic increase in the number of cores on the shared memory compute
nodes. Algorithms, especially those related to linear algebra, need to
be adapted to these new computer architectures in order to be
efficient. PASTIX is a sparse parallel direct solver, that incorporates
a dynamic scheduler for strongly hierarchical modern architectures. In
this paper, we study the replacement of this internal highly integrated
scheduling strategy by two generic runtime frameworks: DAGUE and
STARPU. Those runtimes will give the opportunity to execute the
factorization tasks graph on emerging computers equipped with
accelerators. As for previous work done in dense linear algebra, we
present the kernels used for GPU computations inspired by the MAGMA
library and the DAG algorithm used with those two runtimes. A
comparative study of the performances of the supernodal solver with the
three different schedulers is performed on manycore architectures and
the improvements obtained with accelerators are presented with the
STARPU runtime. These results demonstrate that these DAG runtimes
provide uniform programming interfaces to obtain high performance on
different architectures on irregular problems as sparse direct
factorizations. },
address={Bordeaux, France},
}
%got
@article{Law85,
author={Law, K. H.},
title={Sparse matrix factor modification in structural reanalysis},
journal=IJNME,
year={1985},
volume={21},
number={1},
pages={37--63},
publisher={John Wiley \& Sons, Ltd},
url={ http://dx.doi.org/10.1002/nme.1620210106 },
abstract={Structural reanalysis problems, such as in nonlinear finite
element analysis or optimum design, involve progressive changes in the
global stiffness matrix and its matrix factors. Although many studies
have been devoted to the subject of matrix factor modification, most
investigations have dealt with the problem separately from sparse
matrix methods. This paper introduces a graph-theoretic model for the
forward solution procedure which is applicable for identifying the
modified entries of the matrix factors due to changes in the original
matrix. Applications of this graph-theoretic model to existing
refactorization methods are presented. The relation between
substructuring and sparse matrix ordering strategies, and their effects
on reanalysis are discussed. Modification of a sparse matrix associated
with an n x n finite element grid ordered by the nested dissection
scheme is analysed.}
}
%got
% Law (1989), IJNME: updating the structure of sparse matrix factors via an ordered-tree model.
@article{Law89,
author={Law, K. H.},
title={On updating the structure of sparse matrix factors},
journal=IJNME,
year={1989},
volume={28},
number={10},
pages={2339--2360},
publisher={John Wiley \& Sons, Ltd},
url={ http://dx.doi.org/10.1002/nme.1620281010 },
abstract={Structural analysis often involves the solution of a sparse
system of linear equations using matrix factorization. For structural
reanalysis problems, the system of equations is progressively changing
and matrix factorization is required at each iteration step. In this
paper, we study the problem of updating the structure of sparse matrix
factors using an ordered-tree model. The ordered-tree model provides
the information needed to symbolically compute the non-zero structure
of the matrix factor of a sparse matrix. Furthermore, for matrix factor
modification problems, this tree model can be used to determine the
numerically modified entries in the matrix factor. Examples for
modification of a structure and h-adaptive refinement of a finite
element model are given to illustrate the potential application of the
procedures developed in this study.}
}
%got
% Law and Fenves (1986), TOMS: node-addition model for symbolic factorization.
@article{LawFenves86,
  author  = {Law, K. H. and Fenves, S. J.},
  title   = {A node-addition model for symbolic factorization},
  journal = TOMS,
  volume  = {12},
  number  = {1},
  pages   = {37--50},
  year    = {1986},
}
%got
% Law and Mackay (1993), IJNME: parallel row-oriented LDL^T solver for finite element analysis.
@article{LawMackay93,
author={Law, K. H. and Mackay, D. R.},
title={A parallel row-oriented sparse solution method for finite element structural analysis},
journal=IJNME,
volume={36},
number={17},
publisher={John Wiley \& Sons, Ltd},
pages={2895--2919},
year={1993},
url={ http://dx.doi.org/10.1002/nme.1620361704 },
abstract={This paper describes a parallel implementation of LDLT
factorization on a distributed-memory parallel computer. Specifically,
the parallel LDLT factorization procedure is based on a row-oriented
sparse storage scheme. In addition, a strategy is proposed for the
parallel solution of a triangular system of equations. The strategy is
to compute the inverses of the dense principal diagonal block
submatrices of the factor L, stored in a row-oriented structure.
Experimental results for a number of finite element models are
presented to illustrate the effectiveness of the parallel solution
schemes.}
}
%got
% Lee, Kim, Hong, Lee (2003), Parallel Computing: task scheduling with a block dependency DAG.
@article{LeeKimHongLee03,
author={Lee, H. and Kim, J. and Hong, S. J. and Lee, S.},
title={Task scheduling using a block dependency {DAG} for block-oriented sparse {Cholesky} factorization},
journal=PC,
volume={29},
number={1},
pages={135--159},
year={2003},
url={ http://dx.doi.org/10.1016/S0167-8191(02)00220-X },
keywords={Task scheduling},
keywords={Parallel sparse matrix factorization},
keywords={Block-oriented Cholesky factorization},
keywords={Directed acyclic graph },
abstract={Block-oriented sparse Cholesky factorization decomposes a
sparse matrix into rectangular subblocks; each block can then be
handled as a computational unit in order to increase data reuse in a
hierarchical memory system. Also, the factorization method increases
the degree of concurrency and reduces the overall communication volume
so that it performs more efficiently on a distributed-memory
multiprocessor system than the customary column-oriented factorization
method. But until now, mapping of blocks to processors has been
designed for load balance with restricted communication patterns. In
this paper, we represent tasks using a block dependency {DAG} that
represents the execution behavior of block sparse Cholesky
factorization in a distributed-memory system. Since the characteristics
of tasks for block Cholesky factorization are different from those of
the conventional parallel task model, we propose a new task scheduling
algorithm using a block dependency DAG. The proposed algorithm consists
of two stages: early-start clustering, and affined cluster mapping
(ACM). The early-start clustering stage is used to cluster tasks while
preserving the earliest start time of a task without limiting
parallelism. After task clustering, the {ACM} stage allocates clusters
to processors considering both communication cost and load balance.
Experimental results on a Myrinet cluster system show that the proposed
task scheduling approach outperforms other processor mapping methods. }
}
%got
% Leuze (1989), Parallel Computing: independent set orderings for parallel Gaussian elimination.
@article{Leuze89,
author={Leuze, M.},
year={1989},
title={Independent Set Orderings for Parallel Matrix Factorization by {Gaussian} Elimination},
journal=PC,
volume={10},
number={2},
pages={177--191},
url={ http://dx.doi.org/10.1016/0167-8191(89)90016-1 },
keywords={parallel compatible set independent set},
keywords={Gaussian elimination},
keywords={matrix ordering techniques},
keywords={fill minimizing },
abstract={Commonly used matrix ordering techniques are designed to
minimize fill, i.e., they are designed to minimize the number of zero
elements which become nonzero during matrix factorization by Gaussian
elimination. If Gaussian elimination is to be implemented on a parallel
machine, however, minimum fill orderings are not necessarily optimal.
Rather, the primary concern is to order a matrix so as to minimize the
time required to complete its factorization. An ordering heuristic
which appears to perform well with respect to parallel factorization
time is one based on finding independent sets of vertices in the matrix
adjacency graph. }
}
%GET
% Levy (1971): resequencing the structural stiffness matrix.
@article{Levy71,
  author  = {Levy, R.},
  title   = {Resequencing of the structural stiffness matrix to improve computational efficiency},
  journal = {Quarterly Technical Review},
  volume  = {1},
  number  = {2},
  pages   = {61--70},
  year    = {1971},
}
%%LEWIS ------------------------------------------------------------------------
%got
% Lewis (1982), TOMS: Algorithm 582 (GPS and Gibbs-King reordering codes).
@article{Lewis82,
author={Lewis, J. G.},
month=jun,
year={1982},
title={Algorithm 582: The {Gibbs-Poole-Stockmeyer} and {Gibbs-King} Algorithms for Reordering Sparse Matrices},
journal=TOMS,
volume={8},
number={2},
pages={190--194},
url={ http://dx.doi.org/10.1145/355993.355999 },
keywords={ordering profile reduction Gibbs-Poole-Stockmeyer algorithm Gibbs-King Algorithm}
}
%got
% Lewis (1982), TOMS: companion implementation paper for Algorithm 582.
@article{Lewis82c,
author={Lewis, J. G.},
month=jun,
year={1982},
title={Implementation of the {Gibbs-Poole-Stockmeyer} and {Gibbs-King} Algorithms},
journal=TOMS,
volume={8},
number={2},
pages={180--189},
url={ http://dx.doi.org/10.1145/355993.355998 },
annote={discusses Algorithm 582},
keywords={ordering matrix bandwidth profile reduction Gibbs-Poole-Stockmeyer algorithm Gibbs-King algorithm}
}
%got
% Lewis, Peyton, Pothen (1989), SISSC: fast reordering for parallel factorization.
@article{LewisPeytonPothen89,
author={Lewis, J. G. and Peyton, B. W. and Pothen, A.},
title={A Fast Algorithm for Reordering Sparse Matrices for Parallel Factorization},
journal=SISC,
year={1989},
volume={10},
number={6},
pages={1146--1173},
url={ http://dx.doi.org/10.1137/0910070 },
}
%got
% Lewis and Simon (1988), SISSC: impact of hardware gather/scatter on sparse elimination.
@article{LewisSimon88,
author={Lewis, J. G. and Simon, H. D.},
month=mar,
year={1988},
title={The Impact of Hardware Gather/Scatter on Sparse {Gaussian} Elimination},
journal=SISC,
volume={9},
number={2},
pages={304--311},
keywords={architecture performance},
url={ http://dx.doi.org/10.1137/0909019 },
annote={Supercomputing Forum, Boeing Computer Services, 1986, vol. 1, no. 2, pp. 9-11, have the slides for this},
abstract={Recent vector supercomputers provide vector memory access to
'randomly' indexed vectors, whereas early vector supercomputers
required contiguously or regularly indexed vectors. This additional
capability, known as 'hardware gather/scatter,' can be used to great
effect in general sparse Gaussian elimination. In this note we present
some examples that show the impact of this change in hardware on the
choice of algorithms for sparse Gaussian elimination. Common folk
wisdom holds that general sparse Gaussian elimination algorithms do not
perform well on vector computers. Our numerical results demonstrate
that hardware gather/scatter allows general sparse elimination
algorithms to outperform algorithms based on a band, envelope, or block
structure on such computers. }
}
%%LEXCELLENT -------------------------------------------------------------------
%got
% L'Excellent and Sid-Lakhdar (2014), Parallel Computing: shared-memory parallelism in MUMPS-style multifrontal solver.
@article{LExcellentSidLakhdar14,
author={L'Excellent, J.-Y. and Sid-Lakhdar, W. M.},
title={Introduction of shared-memory parallelism in a distributed-memory multifrontal solver},
journal=PC,
volume={40},
number={3--4},
pages={34--46},
year={2014},
url={ http://dx.doi.org/10.1016/j.parco.2014.02.003 },
keywords={Shared-memory},
keywords={Multi-core},
keywords={NUMA},
keywords={LU factorization},
keywords={Sparse matrix},
keywords={Multifrontal method },
abstract={We introduce shared-memory parallelism in a
parallel distributed-memory solver, targeting multi-core architectures.
Our concern in this paper is pure shared-memory parallelism, although
the work will also impact distributed-memory parallelism. Our approach
avoids a deep redesign and fully benefits from the numerical kernels
and features of the original code. We use performance models to exploit
coarse-grain parallelism in an OpenMP environment while, at the same
time, also relying on third-party optimized multithreaded libraries. In
this context, we propose simple approaches to take advantage of
{NUMA} architectures, and original optimizations to limit thread
synchronization costs. The performance gains are analyzed in detail on
test problems from various application areas. Although the studied code
is a direct solver for sparse systems of linear equations, the
contributions of this paper are more general and could be useful in a
wider range of situations. }
}
%%LI ---------------------------------------------------------------------------
%got
% Li (2005), TOMS: overview of SuperLU (escaped LaTeX-special underscore in abstract).
@article{Li05,
author={Li, X. S.},
title={An Overview of {SuperLU}: Algorithms, Implementation, and User Interface},
journal=TOMS,
volume={31},
number={3},
month=sep,
year={2005},
pages={302--325},
url={ http://dx.doi.org/10.1145/1089014.1089017 },
abstract={We give an overview of the algorithms, design
philosophy, and implementation techniques in the software SuperLU, for
solving sparse unsymmetric linear systems. In particular, we highlight
the differences between the sequential SuperLU (including its
multithreaded extension) and parallel SuperLU\_DIST. These include the
numerical pivoting strategy, the ordering strategy for preserving
sparsity, the ordering in which the updating tasks are performed, the
numerical kernel, and the parallelization strategy. Because of the
scalability concern, the parallel code is drastically different from
the sequential one. We describe the user interfaces of the libraries,
and illustrate how to use the libraries most efficiently depending on
some matrix characteristics. Finally, we give some examples of how the
solver has been used in large-scale scientific applications, and the
performance.},
}
%got
% Li (2008): SuperLU on multicore architectures (number holds the article id).
@article{Li08,
  author  = {Li, X. S.},
  title   = {Evaluation of {SuperLU} on multicore architectures},
  journal = JPHYS,
  volume  = {125},
  number  = {012079},
  year    = {2008},
}
%got
% Li (2013), LBNL report: survey of direct solvers for sparse matrices.
@techreport{Li13,
  author      = {Li, X. S.},
  title       = {Direct solvers for sparse matrices},
  institution = {Lawrence Berkeley National Lab},
  address     = {Berkeley, CA},
  year        = {2013},
  note        = { \newline http://crd-legacy.lbl.gov/$\sim$xiaoye/SuperLU/SparseDirectSurvey.pdf },
}
%got
% Li and Demmel (2003), TOMS: SuperLU_DIST (fixed abstract typos "strayegy", "Choelesky"; escaped underscore).
@article{LiDemmel03,
author={Li, X. S. and Demmel, J. W.},
title={{SuperLU\_DIST}: A scalable distributed-memory sparse direct solver for unsymmetric linear systems},
journal=TOMS,
volume={29},
number={2},
pages={110--140},
month=jun,
year={2003},
url={ http://dx.doi.org/10.1145/779359.779361 },
abstract={In this paper we present the main algorithmic features
in the software package SuperLU\_DIST, a distributed-memory sparse
direct solver for large sets of linear equations. We give in detail our
parallelization strategies, with focus on scalability issues, and
demonstrate its parallel performance and scalability on current
machines. The solver is based on sparse Gaussian elimination, with an
innovative static pivoting strategy proposed earlier by the authors.
The main advantage of static pivoting over classical partial pivoting
is that it permits a priori determination of data structures and
communication patterns, which lets us exploit techniques used in
parallel sparse Cholesky algorithms to better parallelize both LU
decomposition and triangular solve on large scale distributed
machines.},
keywords={Sparse direct solver, distributed-memory
computers, parallelism, scalability, supernodal factorization}
}
%%L continued ------------------------------------------------------------------
%got
% Lin and Mah (1977), Math. Programming: hierarchical partition pivoting algorithm.
@article{LinMah77,
author={Lin, T. D. and Mah, R. S. H.},
year={1977},
title={Hierarchical Partition - A New Optimal Pivoting Algorithm},
journal=MATHPROG,
volume={12},
number={1},
pages={260--278},
keywords={ordering hierarchical partition algorithm},
url={ http://dx.doi.org/10.1007/BF01593792 },
}
%got
% Lin and Chen (1999), Parallel Computing: minimum communication cost reordering.
@article{LinChen99,
author={Lin, W.-Y. and Chen, C.-L.},
title={Minimum communication cost reordering for parallel sparse {Cholesky} factorization},
journal=PC,
volume={25},
number={8},
pages={943--967},
year={1999},
url={ http://dx.doi.org/10.1016/S0167-8191(99)00027-7 },
keywords={Communication cost},
keywords={Distributed-memory multiprocessor},
keywords={Sparse matrix},
keywords={Parallel factorization},
keywords={Equivalent reordering},
keywords={Elimination tree },
abstract={In this paper, we consider the problem of reducing the
communication cost for the parallel factorization of a sparse symmetric
positive definite matrix on a distributed-memory multiprocessor. We
define a parallel communication cost function and show that, with a
contrived example, simply minimizing the height of the elimination tree
is ineffective for exploiting minimum communication cost and the
discrepancy may grow infinitely. We propose an algorithm to find an
ordering such that the communication cost to complete the parallel
Cholesky factorization is minimum among all equivalent reorderings. Our
algorithm consumes O(nlogn+m) in time, where n is the number of nodes
and m the sum of all maximal clique sizes in the filled graph. }
}
%got
% Lin and Chen (2000), IJCM: evaluating elimination-tree-based parallel Cholesky.
@article{LinChen00,
author={Lin, W.-Y. and Chen, C.-L.},
title={On evaluating elimination tree based parallel sparse {Cholesky} factorizations},
journal=IJCM,
volume={74},
number={3},
pages={361--377},
year={2000},
url={ http://dx.doi.org/10.1080/00207160008804948 }
}
%got
% Lin and Chen (2005), SIMAX: optimal reorderings for parallel Cholesky.
@article{LinChen05,
  author  = {Lin, W.-Y. and Chen, C.-L.},
  title   = {On optimal reorderings of sparse matrices for parallel {Cholesky} factorizations},
  journal = SIMAX,
  volume  = {27},
  number  = {1},
  pages   = {24--45},
  year    = {2005},
}
%got
% Lipton, Rose, Tarjan (1979), SINUM: generalized nested dissection
% (reconstructed the garbled time/space bounds in the abstract; confirm against the published abstract).
@article{LiptonRoseTarjan79,
author={Lipton, R. J. and Rose, D. J. and Tarjan, R. E.},
year={1979},
title={Generalized Nested Dissection},
journal=SINUM,
volume={16},
number={2},
pages={346--358},
url={ http://dx.doi.org/10.1137/0716027 },
keywords={ordering nested dissection},
abstract={J. A. George has discovered a method, called nested
dissection, for solving a system of linear equations defined on an $n=
k \times k$ square grid in $O(n\log n)$ space and $O(n^{3/2})$
time. We generalize this method without degrading the time and space
bounds so that it applies to any system of equations defined on a
planar or almost-planar graph. Such systems arise in the solution of
two-dimensional finite element problems. Our method uses the fact that
planar graphs have good separators. More generally, we show that
sparse Gaussian elimination is efficient for any class of graphs which
have good separators, and conversely that graphs without good
separators (including 'almost all' sparse graphs) are not amenable to
sparse Gaussian elimination. }
}
%got
% Lipton and Tarjan (1979), SIAM J. Appl. Math.: planar separator theorem.
@article{LiptonTarjan79,
author={Lipton, R. J. and Tarjan, R. E.},
title={A separator theorem for planar graphs},
journal=SIAMJAM,
year={1979},
volume={36},
number={2},
pages={177--189},
url={ http://dx.doi.org/10.1137/0136016 },
abstract={Let G be any n-vertex planar graph. We prove that the
vertices of G can be partitioned into three sets A, B, C such that no
edge joins a vertex in A with a vertex in B, neither A nor B contains
more than ${2n / 3}$ vertices, and C contains no more than
$2\sqrt 2 \sqrt n $ vertices. We exhibit an algorithm which finds such
a partition A, B, C in $O( n )$ time. }
}
%%LIU --------------------------------------------------------------------------
%got
% Liu (1985), TOMS: multiple elimination (MMD) modification of minimum degree; introduces external degree.
@article{Liu85,
author={Liu, J. W. H.},
year={1985},
title={Modification of the Minimum-Degree Algorithm by Multiple Elimination},
journal=TOMS,
volume={11},
number={2},
pages={141--153},
url={ http://dx.doi.org/10.1145/214392.214398 },
annote={'On Multiple Elimination in the Minimum Degree Algorithm,'
Report CS-83-03, Dept. of Computer Science, York University (1983)},
abstract={The most widely used ordering scheme to reduce fills
and operations in sparse matrix computation is the minimum-degree
algorithm. The notion of {\em multiple elimination} is introduced here
as a modification to the conventional scheme. The motivation is
discussed using the $k$-by-$k$ grid model problem. Experimental results
indicate that the modified version retains the fill-reducing property
of (and is often better than) the original ordering algorithm and yet
requires less computer time. The reduction in ordering time is problem
dependent, and for some problems the modified algorithm can run a few
times faster than existing implementations of the minimum-degree
algorithm. The use of {\em external degree} in the algorithm is also
introduced.},
}
%got
% Liu (1986), TOMS: compact row storage for Cholesky factors via elimination trees.
@article{Liu86c,
  author  = {Liu, J. W. H.},
  title   = {A Compact Row Storage Scheme for {Cholesky} Factors Using Elimination Trees},
  journal = TOMS,
  volume  = {12},
  number  = {2},
  pages   = {127--148},
  year    = {1986},
  url     = { http://dx.doi.org/10.1145/6497.6499 },
  annote  = {Report CS-84-02, Dept. Computer Science, York University (1984)},
}
%got
% Liu (1986), Parallel Computing: task models and scheduling for parallel Cholesky.
@article{Liu86d,
author={Liu, J. W. H.},
year={1986},
title={Computational Models and Task Scheduling for Parallel Sparse {Cholesky} Factorization},
journal=PC,
volume={3},
number={4},
pages={327--342},
annote={Report CS-85-01, Dept. of Computer Science York University, Ontario},
url={ http://dx.doi.org/10.1016/0167-8191(86)90014-1 },
keywords={Sparse matrix},
keywords={parallel factorization},
keywords={graph models},
keywords={task scheduling},
keywords={critical path scheduling },
abstract={In this paper, a systematic and unified treatment of
computational task models for parallel sparse Cholesky factorization is
presented. They are classified as fine-, medium-, and large-grained
graph models. In particular, a new medium-grained model based on
column-oriented tasks is introduced, and it is shown to correspond
structurally to the filled graph of the given sparse matrix. The task
scheduling problem for the various task graphs is also discussed. A
practical algorithm to schedule the column tasks of the medium-grained
model for multiple processors is described. It is based on a heuristic
critical path scheduling method. This will give an overall scheme for
parallel sparse Cholesky factorization, appropriate for parallel
machines with shared-memory architecture like the Denelcor HEP. }
}
%got
% Liu (1986), SISSC: general row merging schemes for sparse Givens transformations.
@article{Liu86b,
author={Liu, J. W. H.},
year={1986},
title={On General Row Merging Schemes for Sparse {Givens} Transformations},
journal=SISC,
volume={7},
number={4},
pages={1190--1211},
url={ http://dx.doi.org/10.1137/0907081 },
keywords={orthogonal decomposition, Givens rotation, row/submatrix merging, variable row pivoting},
annote={Report CS-83-04, Dept. Computer Science, York University (1983)}
}
%got
% Liu (1986), TOMS: storage requirement of the out-of-core multifrontal method.
@article{Liu86,
  author   = {Liu, J. W. H.},
  title    = {On the Storage Requirement in the Out-of-Core Multifrontal Method for Sparse Factorization},
  journal  = TOMS,
  volume   = {12},
  number   = {3},
  pages    = {249--264},
  year     = {1986},
  keywords = {algorithms {Cholesky} factorization multifrontal out-of-core},
}
%got
% Liu (1987), SISSC: adaptive out-of-core Cholesky factorization.
@article{Liu87,
author={Liu, J. W. H.},
month=jul,
year={1987},
title={An Adaptive General Sparse Out-of-Core {Cholesky} Factorization Scheme},
journal=SISC,
volume={8},
number={4},
pages={585--599},
url={ http://dx.doi.org/10.1137/0908053 },
keywords={out-of-core, elimination tree, ordering, postordering, Cholesky factorization},
annote={CS-85-05, Dept. of Computer Science, York Univ. (1985)}
}
%got
% Liu (1987), SIAM J. Alg. Disc. Meth.: generalized tree pebbling for sparse factorization.
@article{Liu87c,
author={Liu, J. W. H.},
month=jul,
year={1987},
title={An Application of Generalized Tree Pebbling to Sparse Matrix Factorization},
journal=SIAMJADM,
volume={8},
number={3},
pages={375--395},
url={ http://dx.doi.org/10.1137/0608031 },
keywords={out-of-core, elimination tree, tree pebbling, symmetric matrices},
annote={Report CS-86-02, Dept. of Computer Science, York Univ., 1986},
}
%got
% Liu (1987), SISSC: sparse factorization in a paging environment.
@article{Liu87e,
author={Liu, J. W. H.},
month=nov,
year={1987},
title={A Note on Sparse Factorization in a Paging Environment},
journal=SISC,
volume={8},
number={6},
pages={1085--1088},
url={ http://dx.doi.org/10.1137/0908087 },
keywords={Cholesky factorization, paging},
abstract={The impact of reordering on the Cholesky factorization of a
sparse matrix in a paging environment is examined. We show
experimentally that an equivalent reordering, if appropriately chosen,
can reduce the CPU time and elapsed time for sparse factorization. }
}
%got
% Liu (1987), TOMS: threshold pivoting in the multifrontal method for indefinite systems.
@article{Liu87f,
  author   = {Liu, J. W. H.},
  title    = {On Threshold Pivoting in the Multifrontal Method for Sparse Indefinite Systems},
  journal  = TOMS,
  volume   = {13},
  number   = {3},
  pages    = {250--261},
  month    = sep,
  year     = {1987},
  url      = { http://dx.doi.org/10.1145/29380.31331 },
  keywords = {algorithms; measurement; performance; theory},
}
%got
% Liu (1987), TOMS: partial pivoting strategy for sparse symmetric decomposition.
@article{Liu87d,
author={Liu, J. W. H.},
month=jun,
year={1987},
title={A Partial Pivoting Strategy for Sparse Symmetric Matrix Decomposition},
journal=TOMS,
volume={13},
number={2},
pages={173--182},
keywords={diagonal pivoting, indefinite matrices, symmetric matrices, multifrontal, threshold partial pivoting, 2x2 pivots}
}
%got
% Liu (1988), SISSC: equivalent reorderings by elimination tree rotations.
@article{Liu88,
author={Liu, J. W. H.},
year={1988},
title={Equivalent Sparse Matrix Reordering by Elimination Tree Rotations},
journal=SISC,
volume={9},
number={3},
pages={424--444},
url={ http://dx.doi.org/10.1137/0909029 },
keywords={multifrontal, elimination tree, rotations}
}
%got
% Liu (1988), SIMAX: tree model for symmetric indefinite factorization (issue number not given in original).
@article{Liu88b,
author={Liu, J. W. H.},
month=jan,
year={1988},
title={A Tree Model for Sparse Symmetric Indefinite Matrix Factorization},
journal=SIMAX,
volume={9},
pages={26--39},
keywords={2x2 pivots, indefinite symmetric, elimination tree}
}
%got
% Liu (1989), TOMS: graph partitioning by node separators.
@article{Liu89d,
  author  = {Liu, J. W. H.},
  title   = {A graph partitioning algorithm by node separators},
  journal = TOMS,
  volume  = {15},
  number  = {3},
  pages   = {198--219},
  year    = {1989},
}
%got
% Liu (1989), SISSC: constrained minimum degree (incomplete nested dissection hybrid).
@article{Liu89b,
author={Liu, J. W. H.},
title={The minimum degree ordering with constraints},
journal=SISC,
year={1989},
volume={10},
number={6},
pages={1136--1145},
url={ http://dx.doi.org/10.1137/0910069 },
abstract={A hybrid scheme for ordering sparse symmetric matrices is
considered. It is based on a combined use of the top-down nested
dissection and the bottom-up minimum degree ordering schemes. A
separator set is first determined by some form of incomplete nested
dissection. The minimum degree ordering is then applied subject to the
constraint that the separator nodes must be ordered last. It is shown
experimentally that the quality of the resulting ordering from this
constrained scheme exhibits less sensitivity to the initial matrix
ordering than that of the original minimum degree ordering. An
important application of this approach to find orderings suitable for
parallel elimination is also illustrated. }
}
%got
% Liu (1989), TOMS: multifrontal method and paging in sparse Cholesky.
@article{Liu89c,
  author  = {Liu, J. W. H.},
  title   = {The multifrontal method and paging in sparse {Cholesky} factorization},
  journal = TOMS,
  volume  = {15},
  number  = {4},
  pages   = {310--325},
  year    = {1989},
}
%got
% Liu (1989), Parallel Computing: reordering for parallel elimination; tree rotations vs Jess-Kees.
@article{Liu89a,
author={Liu, J. W. H.},
title={Reordering Sparse Matrices for Parallel Elimination},
journal=PC,
year={1989},
volume={11},
number={1},
pages={73--91},
url={ http://dx.doi.org/10.1016/0167-8191(89)90064-1 },
keywords={Sparse matrix},
keywords={parallel elimination},
keywords={reordering},
keywords={elimination tree},
keywords={height},
keywords={rotation },
abstract={We consider the problem of finding equivalent reorderings
of a sparse matrix so that the reordered matrix is suitable for
parallel Gaussian elimination. The elimination tree structure is used
as our parallel model. We show that the reordering scheme by Jess and
Kees generates an elimination tree with minimum height among all such
trees from the class of equivalent reorderings. A new height-reducing
algorithm based on elimination tree rotation is also introduced.
Experimental results are provided to compare these two approaches. The
new reordering algorithm using rotation is shown to produce trees with
minimum or near-minimum height. Yet, it requires significantly less
reordering time. }
}
%got
% Liu (1990), SIMAX: survey of the role of elimination trees in sparse factorization.
@article{Liu90a,
author={Liu, J. W. H.},
title={The Role of Elimination Trees in Sparse Factorization},
journal=SIMAX,
year={1990},
volume={11},
number={1},
pages={134--172},
}
%got
% Liu (1991), TOMS: generalized envelope method for row-oriented sparse factorization.
@article{Liu91,
author={Liu, J. W. H.},
title={A Generalized Envelope Method for Sparse Factorization by Rows},
journal=TOMS,
volume={17},
number={1},
pages={112--129},
month=mar,
year={1991},
url={ http://dx.doi.org/10.1145/103147.103159 },
abstract={A generalized form of the envelope method is proposed
for the solution of large sparse symmetric and positive definite
matrices by rows. The method is demonstrated to have practical
advantages over the conventional column-oriented factorization using
compressed column storage or the multifrontal method using full frontal
submatrices.},
}
%got
% Liu (1992), SIAM Review: the classic multifrontal survey.
@article{Liu92,
  author  = {Liu, J. W. H.},
  title   = {The multifrontal method for sparse matrix solution: theory and practice},
  journal = SIREV,
  volume  = {34},
  number  = {1},
  pages   = {82--109},
  year    = {1992},
  url     = { http://dx.doi.org/10.1137/1034004 },
}
%got
% Liu and Mirzaian (1989), SIAM J. Disc. Math.: linear-time reordering for parallel pivoting of chordal graphs.
@article{LiuMirzaian89,
author={Liu, J. W. H. and Mirzaian, A.},
title={A linear reordering algorithm for parallel pivoting of chordal graphs},
journal=SIAMJDM,
year={1989},
volume={2},
pages={100--107},
abstract={This paper provides an efficient algorithm for generating an
ordering suitable for the parallel elimination of nodes in chordal
graphs. The time complexity of the reordering algorithm is shown to be
linear in the size of the chordal graph. The basic parallel pivoting
strategy is originally by Jess and Kees [IEEE Trans. Comput., C-31
(1982), pp. 231-239]. The relevance of the reordering to parallel
factorization of sparse matrices (not necessarily chordal) is also
discussed.},
url={ http://dx.doi.org/10.1137/0402011 },
annote={implements Jess-Kees' method},
}
%got
% Liu, Ng, Peyton (1993), SIMAX: finding supernodes for sparse matrix computations.
@article{LiuNgPeyton93,
author={Liu, J. W. H. and Ng, E. G. and Peyton, B. W.},
title={On finding supernodes for sparse matrix computations},
journal=SIMAX,
year={1993},
volume={14},
number={1},
pages={242--252},
url={ http://dx.doi.org/10.1137/0614019 }
}
%got
% Liu and Sherman (1976), SINUM: CM vs reverse CM ordering comparison.
@article{LiuSherman76,
author={Liu, J. W. H. and Sherman, A. H.},
year={1976},
title={Comparative Analysis of the {Cuthill}-{McKee} and the Reverse {Cuthill}-{McKee} Ordering Algorithms for Sparse Matrices},
journal=SINUM,
volume={13},
number={2},
pages={198--213},
abstract={In this paper we examine the Cuthill-McKee algorithm for
ordering the unknowns and equations in systems of linear equations,
$A{\bf x}={\bf b}$, where A is sparse, symmetric, and positive
definite. This algorithm is designed to produce a permutation matrix P
such that $PAP^T $ has a small bandwidth. If we wish to exploit zeros
in the band of A which occur before the first nonzero in each row and
column, it has been experimentally observed that reversing the ordering
produced by the Cuthill-McKee algorithm is often very much better than
the original ordering in terms of the amount of storage and work
required to factor A. We prove that for band elimination methods, the
two orderings are equivalent and that, surprisingly, the reverse
ordering is always at least as good as the original one when envelope
elimination techniques are used. We give a condition on the matrix A
under which the reverse ordering is strictly better than the original
one, and we include several numerical experiments and analyses of
practical examples to illustrate our results. }
}
%%L continued ------------------------------------------------------------------
%got
% Lu and Barlow (1996), SIMAX: multifrontal computation with orthogonal factors.
@article{LuBarlow96,
  author  = {Lu, S. M. and Barlow, J. L.},
  title   = {Multifrontal computation with the orthogonal factors of sparse matrices},
  journal = SIMAX,
  volume  = {17},
  number  = {3},
  pages   = {658--679},
  year    = {1996},
}
%got
% Lucas, Blank, Tiemann (1987), IEEE TCAD: distributed multifrontal decomposition on the iPSC.
@article{LucasBlankTiemann87,
author={Lucas, R. F. and Blank, T. and Tiemann, J. J.},
title={A Parallel Solution Method for Large Sparse Systems of Equations},
journal=IEEETCAD,
year={1987},
month=nov,
volume={6},
number={6},
pages={981--991},
url={ http://dx.doi.org/10.1109/TCAD.1987.1270339 },
abstract={This paper presents a new distributed multifrontal sparse
matrix decomposition algorithm suitable for message passing parallel
processors. The algorithm uses a nested dissection ordering and a
multifrontal distribution of the matrix to minimize interprocessor data
dependencies and overcome the communication bottleneck previously
reported for sparse matrix decomposition [1]. Distributed multifrontal
forward elimination and back substitution algorithms are also provided.
Results of an implementation on the Intel iPSC are presented. Up to 16
processors are used to solve systems with as many as 7225 equations.
With 16 processors, speedups of 10.2 are observed and the decomposition
is shown to achieve 67 percent processor utilization. This work was
motivated by the need to reduce the computational bottleneck in the
Stanford PISCES [2] device simulator; however, it should be applicable
to a wide range of scientific and engineering problems. }
}
%got
% Lucas et al. (2010), VECPAR: multifrontal factorization on GPUs and multicore hosts
% (moved the paper URL from note to the url field, matching the rest of the file).
@inproceedings{LucasWagenbrethDavisGrimes10,
author={Lucas, R. F. and Wagenbreth, G. and Davis, D. and Grimes, R. G.},
title={Multifrontal Computations on {GPUs} and Their Multi-core Hosts},
booktitle={VECPAR'10: Proc. 9th Intl. Meeting for High Performance Computing for Computational Science},
year={2010},
url={ http://vecpar.fe.up.pt/2010/papers/5.php },
abstract={The use of GPUs to accelerate the factoring of large sparse
symmetric indefinite matrices shows the potential of yielding important
benefits to a large group of widely used applications. This paper
examines how a multifrontal sparse solver performs when exploiting both
the GPU and its multi-core host. It demonstrates that the GPU can
dramatically accelerate the solver relative to one host CPU.
Furthermore, the solver can profitably exploit both the GPU to factor
its larger frontal matrices and multiple threads on the host to handle
the smaller frontal matrices.}
}
%got
% Luce and Ng (2014), SIMAX: the minimum FLOPs problem in sparse Cholesky.
@article{LuceNg14,
  author  = {Luce, R. and Ng, E. G.},
  title   = {On the minimum {FLOPs} problem in the sparse {Cholesky} factorization},
  journal = SIMAX,
  volume  = {35},
  number  = {1},
  pages   = {1--21},
  year    = {2014},
  url     = { http://dx.doi.org/10.1137/130912438 },
}
%%M ---------------------------------------------------------------------------
%got
% Manne and Hafsteinsson (1995), SISC: sparse Cholesky on a massively parallel SIMD machine
% (fixed surname spelling "Haffsteinsson" -> "Hafsteinsson", consistent with the citation key).
@article{ManneHafsteinsson95,
author={Manne, F. and Hafsteinsson, H.},
title={Efficient sparse {Cholesky} factorization on a massively parallel {SIMD} computer},
journal=SISC,
year={1995},
volume={16},
number={4},
pages={934--950},
url={ http://dx.doi.org/10.1137/0916054 },
abstract={We investigate the effect of load balancing when performing
Cholesky factorization on a massively parallel SIMD computer. In
particular we describe a supernodal algorithm for performing sparse
Cholesky factorization. The way the matrix is mapped onto the
processors has significant effect on its efficiency. We show that this
assignment problem can be modeled as a graph coloring problem in a
weighted graph. By a simple greedy algorithm, we obtain substantial
speedup compared with previously suggested data mapping schemes.
Experimental runs have been made on a 16K processor MasPar MP-2
parallel computer using symmetric test matrices with irregular sparsity
structure. On these problems our implementation achieves performance
rates of well above 200 Mflops in double precision arithmetic.}
}
%got
@article{Markowitz57,
author={Markowitz, H. M.},
year={1957},
title={The Elimination Form of the Inverse and Its Application to Linear Programming},
journal=MSCI,
volume={3},
number={3},
pages={255--269},
url={ http://dx.doi.org/10.1287/mnsc.3.3.255 },
keywords={ordering Markowitz ordering},
abstract={It is common for matrices in industrial applications of
linear programming to have a large proportion of zero coefficients.
While every item (raw material, intermediate material, end item,
equipment item) in, say, a petroleum refinery may be indirectly related
to every other, any particular process uses few of these. Thus the
matrix describing petroleum technology has a small percentage of
non-zeros. If spacial or temporal distinctions are introduced into the
model the percentage of non-zeros generally falls further. The present
paper discusses a form of inverse which is especially convenient to
obtain and use for matrices with a high percentage of zeros. The
application of this form of inverse in linear programming is also
discussed.}
}
%got
@article{Marro86,
author={Marro, L.},
month=oct,
year={1986},
title={A Linear Time Implementation of Profile Reduction Algorithms for Sparse Matrices},
journal=SISC,
volume={7},
number={4},
pages={1212--1231},
keywords={ordering profile reduction method},
url={ http://dx.doi.org/10.1137/0907082 },
abstract={The profile reduction method is intended for time and
storage reduction in solving a linear system of equations $Mx=b$
using direct methods. A Frontal Increase Minimization strategy (FIM
strategy) is a generalization of the so-called King's numbering
criterion. This class of strategy is used in some other classical
algorithms (Levy's, Snay's, Gibbs's algorithms). Although efficient,
these algorithms are far greater time consumers in their original
implementation than other classical profile reduction algorithms (e.g.,
Reverse Cuthill McKee algorithm). In this paper we first apply the
principles given by the authors to propose a unified 'classical'
implementation of the above mentioned algorithms. Then we provide some
time complexity estimates for this implementation. Secondly, we
describe an improved implementation of the FIM strategy algorithms
using a new insight into the numbering process and best appropriate
data structures. This implementation is proven linear in time
complexity with respect to the number of nonzeros in M for all the
above-mentioned algorithms. Finally, we provide practical execution
times on a collection of test problems, highlighting the improvement
achieved by the new implementation and its efficiency for small
problems. The evaluation of the performance/cost ratio of the FIM
strategy algorithms in the new implementation shows that they are
competitive compared to other classical profile reduction algorithms.}
}
%got
@article{Matstoms94,
author={Matstoms, P.},
title={Sparse {QR} factorization in {MATLAB}},
journal=TOMS,
year={1994},
volume={20},
number={1},
pages={136--159},
}
%got
@article{Matstoms95,
author={Matstoms, P.},
title={Parallel sparse {QR} factorization on shared memory architectures},
journal=PC,
volume=21,
number=3,
year={1995},
pages={473--486},
url={ http://dx.doi.org/10.1016/0167-8191(94)00092-O },
keywords={Orthogonal decomposition},
keywords={Sparse matrix},
keywords={Parallel programming},
keywords={Multifrontal method},
keywords={Shared memory multiprocess system },
abstract={We discuss a parallel shared memory implementation of
multifrontal {QR} factorization. To achieve high performance for
general large and sparse matrices, a combination of tree and node level
parallelism is used. Acceptable load balancing is obtained by the use
of a pool-of-tasks approach. For the storage of frontal and update
matrices, we use a buddy system based on Fibonacci blocks. It turns out
to be more efficient than blocks of size 2i, as proposed by other
authors. Also the order in which memory space for update and frontal
matrices are allocated is shown to be of importance. An implementation
of the proposed algorithm on the {CRAY} X-MP/416 (four processors),
gives speedups of about three with about 20\% of extra real memory space
required. }
}
%got
@article{Mayer09,
author={Mayer, J.},
title={Parallel algorithms for solving linear systems with sparse triangular matrices},
year={2009},
journal=COMP,
volume={86},
number={4},
url={ http://dx.doi.org/10.1007/s00607-009-0066-3 },
publisher={Springer Vienna},
keywords={Preconditioning; Iterative methods; Sparse linear systems; Parallelization; 65F10; 65F50; 65Y05},
pages={291--312},
abstract={In this article, we present two new algorithms for solving
given triangular systems in parallel on a shared memory architecture.
Multilevel incomplete LU factorization based preconditioners, which
have been very successful for solving linear systems iteratively,
require these triangular solves. Hence, the algorithms presented here
can be seen as parallelizing the application of these preconditioners.
The first algorithm solves the triangular matrix by block
anti-diagonals. The drawback of this approach is that it can be
difficult to choose an appropriate block structure. On the other hand,
if a good block partition can be found, this algorithm can be quite
effective. The second algorithm takes a hybrid approach by solving the
triangular system by block columns and anti-diagonals. It is usually as
effective as the first algorithm, but the block structure can be chosen
in a nearly optimal manner. Although numerical results indicate that
the speed-up can be fairly good, systems with matrices having a strong
diagonal structure or narrow bandwidth cannot be solved effectively in
parallel. Hence, for these matrices, the results are disappointing. On
the other hand, the results are better for matrices having a more
uniform distribution of non-zero elements. Although not discussed in
this article, these algorithms can possibly be adapted for distributed
memory architectures. }
}
%got
@article{McNamee71,
author={McNamee, J. M.},
title={{ACM Algorithm 408}: {A} Sparse Matrix Package (Part {I})},
journal=CACM,
volume={14},
number={4},
pages={265--273},
month=apr,
year={1971},
url={ http://dx.doi.org/10.1145/362575.362584 },
}
%got
@article{McNamee83b,
author={McNamee, J. M.},
title={{Algorithm 601}: A Sparse-Matrix Package -- Part {II}: Special Cases},
journal=TOMS,
volume={9},
number={3},
pages={344--345},
month=sep,
year={1983},
url={ http://dx.doi.org/10.1145/356044.356050 },
}
%got
@article{McNamee83,
author={McNamee, J. M.},
title={A Sparse Matrix Package -- Part {II}: Special Cases},
journal=TOMS,
volume={9},
number={3},
pages={340--343},
month=sep,
year={1983},
url={ http://dx.doi.org/10.1145/356044.356049 },
}
%got
@article{Melhem88,
author={Melhem, R. G.},
month=mar,
year={1988},
title={A Modified Frontal Technique Suitable for Parallel Systems},
journal=SISC,
volume={9},
number={2},
pages={289--303},
}
%got
@article{MesharIronyToledo06,
author={Meshar, O. and Irony, D. and Toledo, S.},
title={An out-of-core sparse symmetric-indefinite factorization method},
journal=TOMS,
volume={32},
number={3},
pages={445--471},
month=sep,
year={2006},
url={ http://dx.doi.org/10.1145/1163641.1163645 },
abstract={We present a new out-of-core sparse
symmetric-indefinite factorization algorithm. The most significant
innovation of the new algorithm is a dynamic partitioning method for
the sparse factor. This partitioning method results in very low
input/output traffic and allows the algorithm to run at high
computational rates even though the factor is stored on a slow disk.
Our implementation of the new code compares well with both
high-performance in-core sparse symmetric-indefinite codes and with a
high-performance out-of-core sparse Cholesky code.},
}
%got
@inproceedings{MillerTengThurstonVavasis93,
author={Miller, G. L. and Teng, S. H. and Thurston, W. and Vavasis, S. A.},
title={Automatic mesh partitioning},
pages={57--84},
booktitle={Graph Theory and Sparse Matrix Computation},
series={IMA Volumes in Applied Mathematics},
year={1993},
publisher={Springer-Verlag},
address={New York},
editor={George, A. and Gilbert, J. R. and Liu, J. W. H.},
volume={56},
}
%got
@article{MillerTengThurstonVavasis98,
author={Miller, G. L. and Teng, S. H. and Thurston, W. and Vavasis, S. A.},
title={Geometric separators for finite-element meshes},
journal=SISC,
year={1998},
volume={19},
number={2},
pages={364--386},
url={ http://dx.doi.org/10.1137/S1064827594262613 },
abstract={We propose a class of graphs that would occur naturally in
finite-element and finite-difference problems and we prove a bound on
separators for this class of graphs. Graphs in this class are embedded
in d-dimensional space in a certain manner. For d-dimensional graphs
our separator bound is $O(n^{(d-1)/d})$, which is the best possible
bound. We also propose a simple randomized algorithm to find this
separator in O(n) time. This separator algorithm can be used to
partition the mesh among processors of a parallel computer and can also
be used for the nested dissection sparse elimination algorithm. }
}
%%N ---------------------------------------------------------------------------
%got
@article{NakhlaSinghalVlach74,
author={Nakhla, M. and Singhal, K. and Vlach, J.},
month=mar,
year={1974},
title={An Optimal Pivoting Order for the Solution of Sparse Systems of Equations},
journal=IEEETCS,
volume={CAS-21},
number={2},
pages={222--225},
url={ http://dx.doi.org/10.1109/TCS.1974.1083827 },
abstract={Analytic expressions for finding fill-in, the number of
nonzero elements that change in value, and the number of long
operations during each step of the LU decomposition are given. A new
optimal pivot ordering algorithm is proposed which leads to a reduction
of the overall fill-in and long operation count. Comparison is made
with two other known algorithms.}
}
%%NG --------------------------------------------------------------------------
%got
@article{Ng91,
author={Ng, E. G.},
title={A Scheme for Handling Rank-Deficiency in the Solution of Sparse Linear Least Squares Problems},
journal=SISC,
year={1991},
volume={12},
number={5},
pages={1173--1183},
url={ http://dx.doi.org/10.1137/0912062 },
abstract={Several schemes for computing sparse orthogonal
factorizations using static data structures have been proposed
recently. One novel feature of some of these schemes is that the data
structures are large enough to store both the orthogonal
transformations and upper triangular factor explicitly. However, in
order to make use of the static storage schemes, the orthogonal
factorization has to be computed without column interchanges, which is
sufficient when the observation matrix has full rank. When the least
squares matrix is rank-deficient, computing the minimum-norm solution
to the least squares problem requires the knowledge of the rank of the
matrix, which may be difficult to get if the factorization is computed
without column pivoting. In this paper an algorithm is developed that
makes use of the resulting factorization to solve rank-deficient least
squares problems. The new algorithm has three major features. First, it
uses the original triangular factor. Second, this factor is not
altered. Third, the rank decision is based on the singular value
decomposition and not on the diagonal elements of the triangular
factor. The techniques used are similar to those employed by Bj\"orck.}
}
%got
@article{Ng93,
author={Ng, E. G.},
title={Supernodal symbolic {Cholesky} factorization on a local-memory multiprocessor},
journal=PC,
volume={19},
number={2},
pages={153--162},
year={1993},
url={ http://dx.doi.org/10.1016/0167-8191(93)90045-M },
keywords={Linear algebra},
keywords={Cholesky factorization},
keywords={sparse matrices},
keywords={distributed-memory multiprocessor},
keywords={symbolic factorization },
abstract={In this paper, we consider the symbolic factorization step
in computing the Cholesky factorization of a sparse symmetric positive
definite matrix on distributed-memory multiprocessor systems. By
exploiting the supernodal structure in the Cholesky factor, the
performance of a previous parallel symbolic factorization algorithm is
improved. Empirical tests demonstrate that there can be drastic
reduction in the execution time required by the new algorithm on an
Intel iPSC/2 hypercube. },
annote={was Ng91b, ORNL/TM-11836 tech report}
}
%got
@incollection{Ng13,
author={Ng, E. G.},
title={Sparse matrix methods},
chapter={53},
pages={931--951},
url={ http://dx.doi.org/10.1201/b16113-62 },
booktitle={Handbook of Linear Algebra, Second Edition},
publisher={Chapman and Hall/CRC},
year={2013},
}
%got
@article{NgPeyton92,
author={Ng, E. G. and Peyton, B. W.},
title={A Tight and Explicit Representation of {Q} in Sparse {QR} Factorization},
journal=IMAPREPRINT,
year={1992},
volume={981},
url={ https://www.ima.umn.edu/preprints/pp1992/may1992.html },
annote={also tech report ORNL/TM-12059. Did this ever appear anywhere? },
}
%got
@article{NgPeyton93b,
author={Ng, E. G. and Peyton, B. W.},
title={Block sparse {Cholesky} algorithms on advanced uniprocessor computers},
journal=SISC,
year={1993},
volume={14},
pages={1034--1056},
number={5},
}
%got
@article{NgPeyton93,
author={Ng, E. G. and Peyton, B. W.},
title={A supernodal {Cholesky} factorization algorithm for shared-memory multiprocessors},
journal=SISC,
year={1993},
volume={14},
pages={761--769},
url={ http://dx.doi.org/10.1137/0914048 },
abstract={This paper presents a parallel sparse Cholesky factorization
algorithm for shared-memory MIMD multiprocessors. The algorithm is
particularly well suited for vector supercomputers with multiple
processors, such as the Cray Y-MP. The new algorithm is a
straightforward parallelization of the left-looking supernodal sparse
Cholesky factorization algorithm. Like its sequential predecessor, it
improves performance by reducing indirect addressing and memory
traffic. Experimental results on a Cray Y-MP demonstrate the
effectiveness of the new algorithm. On eight processors of a Cray Y-MP,
the new routine performs the factorization at rates exceeding one Gflop
for several test problems from the Harwell-Boeing sparse matrix
collection. }
}
%got
@article{NgPeyton96,
author={Ng, E. G. and Peyton, B. W.},
title={Some results on structure prediction in sparse {QR} factorization},
journal=SIMAX,
year={1996},
volume={17},
number={2},
pages={443--459},
url={ http://dx.doi.org/10.1137/S0895479892230973 },
abstract={In $QR$ factorization of an $m \times n$ matrix $A\, ( m
\geq n )$, the orthogonal factor Q is often stored implicitly as an $m
\times n$ lower trapezoidal matrix W, known as the Householder matrix.
When the sparsity of A is to be exploited, the factorization is often
preceded by a symbolic factorization step, which computes a data
structure in which the nonzero entries of W and R are computed and
stored. This is achieved by computing an upper bound on the nonzero
structure of these factors, based solely on the nonzero structure of A.
In this paper we use a well-known upper bound on the nonzero structure
of W to obtain an upper bound on the nonzero structure of Q. Let U be
the matrix consisting of the first n columns of Q. One interesting
feature of the new bound is that the bound on W's structure is
identical to the lower trapezoidal part of the bound on U's structure.
We show that if A is strong Hall and has no zero entry on its main
diagonal, then the bounds on the nonzero structures of W and U are the
smallest possible based solely on the nonzero structure of A. We then
use this result to obtain corresponding smallest upper bounds in the
case where A is weak Hall, is in block upper triangular form, and has
no zero entry on its main diagonal. Finally, we show that one can
always reorder a weak Hall matrix into block upper triangular form so
that there is no increase in the fill incurred by the $QR$
factorization. }
}
%got
@article{NgRaghavan99,
author={Ng, E. G. and Raghavan, P.},
title={Performance of greedy ordering heuristics for sparse {Cholesky} factorization},
journal=SIMAX,
year={1999},
volume={20},
number={4},
pages={902--914},
url={ http://dx.doi.org/10.1137/S0895479897319313 },
abstract={Greedy algorithms for ordering sparse matrices for Cholesky
factorization can be based on different metrics. Minimum degree, a
popular and effective greedy ordering scheme, minimizes the number of
nonzero entries in the rank-1 update (degree) at each step of the
factorization. Alternatively, minimum deficiency minimizes the number
of nonzero entries introduced (deficiency) at each step of the
factorization. In this paper we develop two new heuristics: modified
minimum deficiency (MMDF) and modified multiple minimum degree (MMMD).
The former uses a metric similar to deficiency while the latter uses a
degree-like metric. Our experiments reveal that on the average, MMDF
orderings result in 21\% fewer operations to factor than minimum degree;
MMMD orderings result in 15\% fewer operations to factor than minimum
degree. MMMD requires on the average 7--13\% more time than minimum
degree, while MMDF requires on the average 33--34\% more time than
minimum degree. }
}
%%N continued -----------------------------------------------------------------
%got
@article{NorinPottle71,
author={Norin, R. S. and Pottle, C.},
month=jan,
year={1971},
title={Effective Ordering of Sparse Matrices Arising from Nonlinear Electrical Networks},
journal=IEEETCT,
volume={CT-18},
pages={139--145},
url={ http://dx.doi.org/10.1109/TCT.1971.1083236 },
abstract={Hachtel et al. [1], [2] have recently proposed sparse matrix
methods for nonlinear analysis incorporating an algorithm that
generates symbolic code which, when executed, solves a system of linear
equations of arbitrary, but particular, sparseness structure. They
point out that the execution time and storage requirements of this code
are critically dependent upon the ordering selected for processing the
network equations and variables, and have themselves developed ordering
methods. An efficient ordering algorithm is presented which tends to
minimize the length and execution time of this symbolic code. Although
the algorithm takes full advantage of the unique character of the
sparse system that arises from a certain nonlinear circuit analysis
representation, it is flexible enough to be used efficiently for
ordering sparse matrices with different characteristics. In particular,
it is especially appropriate when solving repetitively the large sparse
systems which appear in circuit analysis in general, nonlinear
differential and discrete system analysis, and in systems of linear or
nonlinear algebraic equations. These problems are often part of larger
problems or simulations. The algorithm contains parameters that may be
easily adjusted to vary the tradeoff between ordering time and ordering
efficiency. The method can (and should) be generalized to include some
pivoting for numerical accuracy. Results for a typical nonlinear
network indicate considerable improvement over previously published
ordering schemes. },
keywords={loop-free code generation}
}
%%O ---------------------------------------------------------------------------
%got
@article{Oliveira01,
author={Oliveira, S.},
title={Exact prediction of {QR} fill-in by row-merge trees},
journal=SISC,
year={2001},
volume={22},
number={6},
pages={1962--1973},
}
%GET
@article{OlschowkaNeumaier96,
author={Olschowka, M. and Neumaier, A.},
year={1996},
title={A new pivoting strategy for {Gaussian} elimination},
journal=LAA,
volume={240},
pages={131--151},
}
%got
@article{Ong87,
author={Ong, J. H.},
title={An algorithm for frontwidth reduction},
journal=JSC,
year={1987},
volume={2},
number={2},
pages={159--173},
url={ http://dx.doi.org/10.1007/BF01061484 },
keywords={Element resequencing; frontwidth minimization; frontal technique},
abstract={This paper presents an algorithm for the automatic
renumbering of elements prior to the solution by the frontal technique
in finite element analysis. The method is based on a modified King's
algorithm and is shown to be extremely efficient and consistent in
obtaining a small frontwidth.}
}
%got (Tim owns hardcopy of book)
@book{OsterbyZlatev83,
author={{\O}sterby, O. and Zlatev, Z.},
year={1983},
title={Direct Methods for Sparse Matrices, Lecture Notes in Computer Science 157},
publisher={Berlin: Springer-Verlag},
note={Review by Eisenstat at http://dx.doi.org/10.1137/1028128 }
}
%got
@article{OstromskyHansenZlatev98,
author={Ostromsky, T. and Hansen, P. C. and Zlatev, Z.},
title={A coarse-grained parallel {QR}-factorization algorithm for sparse least squares problems},
journal=PC,
volume={24},
number={5-6},
pages={937--964},
year={1998},
url={ http://dx.doi.org/10.1016/S0167-8191(98)00034-9 },
keywords={Coarse-grained parallelism},
keywords={Least squares problem},
keywords={QR-factorization},
keywords={General sparse matrix},
keywords={Drop-tolerance},
keywords={Reordering},
keywords={Partitioning},
keywords={Block algorithm},
abstract={A sparse QR-factorization algorithm SPARQR for
coarse-grained parallel computations is described. The coefficient
matrix, which is assumed to be general sparse, is reordered in an
attempt to bring as many zero elements in the lower left corner as
possible. The reordered matrix is then partitioned into block rows, and
Givens plane rotations are applied in each block-row. These are
independent tasks and can be done in parallel. Row and column
permutations are carried out within the diagonal blocks in an attempt
to preserve better the sparsity of the matrix. The algorithm can be
used for solving least squares problems either directly or combined
with an iterative method (preconditioned conjugate gradients are used).
Small non-zero elements can optionally be dropped in the latter case.
This leads to a better preservation of the sparsity and, therefore, to
a faster factorization. The price which has to be paid is some loss of
accuracy. The iterative method is used to regain the accuracy lost
during the factorization. Numerical results from several experiments
with matrices from the well-known Harwell-Boeing collection as well as
with some larger sparse matrices are presented in this work. An SGI
Power Challenge computer with 16 processors has been used in the
experiments.},
}
%got
@article{Ostrouchov87,
author={Ostrouchov, G.},
title={Symbolic {Givens} reduction and row-ordering in large sparse least squares problems},
journal=SISC,
year={1987},
volume={8},
number={3},
pages={248--264},
url={ http://dx.doi.org/10.1137/0908033 },
abstract={In the solution of large sparse least squares problems by
Givens factorization, a preliminary symbolic step that determines a
good processing order and a data structure for the matrix factor is
used. In this paper, it is shown that a processing order equivalent to
sequential processing by rows can be as good as any processing order
using a single pivot row in each column. A notion of local
acceptability in row-ordering is introduced and shown to reduce fill
globally. Row-orderings satisfying this notion are essentially
equivalent to sequential processing by rows and the nature of
intermediate fill produced makes implicit representation of fill
possible. This forms the basis for a symbolic Givens reduction
algorithm that operates in a fixed data structure. }
}
%%P ---------------------------------------------------------------------------
%got
@article{PadminiMadanJain98,
author={Padmini, M. V. and Madan, B. B. and Jain, B. N.},
year={1998},
title={Reordering for parallelism},
journal=IJCM,
volume={67},
number={3-4},
pages={373--390},
url={ http://dx.doi.org/10.1080/00207169808804670 },
abstract={The proposed ordering scheme is the fusion of Jess and Kees
method and the Minimum degree ordering, that operates on a non-chordal
graph. The method produces a fill preserving ordering for all the test
problems selected from the Boeing-Harwell Sparse matrix collection. The
extent of parallelism extracted is nearly the same as that obtained by
using Liu's tree rotation heuristic. }
}
%got
@article{PaigeSaunders82,
author={Paige, C. C. and Saunders, M. A.},
title={{LSQR}: an algorithm for sparse linear equations and sparse least squares},
journal=TOMS,
year={1982},
volume={8},
pages={43--71},
url={ http://dx.doi.org/10.1145/355993.356000 },
}
%got
@article{Papadimitriou76,
author={Papadimitriou, Ch. H.},
title={The {NP}-Completeness of the bandwidth minimization problem},
year={1976},
journal=COMP,
volume={16},
number={3},
url={ http://dx.doi.org/10.1007/BF02280884 },
publisher={Springer-Verlag},
pages={263--270},
}
@article{Parter61,
author={Parter, S. V.},
year={1961},
title={The Use of Linear Graphs in {Gauss} Elimination},
journal=SIREV,
volume={3},
pages={119--130},
url={ http://dx.doi.org/10.1137/1003021 }
}
%got
@article{Pellegrini97,
author={Pellegrini, F.},
title={Graph partitioning based methods and tools for scientific computing},
journal=PC,
year={1997},
volume={23},
number={6-8},
pages={153--164},
annote={SCOTCH 3.1},
}
%got
@incollection{Pellegrini12,
author={Pellegrini, F.},
title={{Scotch} and {PT-Scotch} Graph Partitioning Software},
year={2012},
booktitle={Combinatorial Scientific Computing},
editor={Schenk, O.},
pages={373--406},
chapter={14},
url={ http://dx.doi.org/10.1201/b11644-15 },
publisher={Chapman and Hall/CRC Computational Science},
}
%got
@article{PellegriniRomanAmestoy00,
author={Pellegrini, F. and Roman, J. and Amestoy, P. R.},
title={Hybridizing nested dissection and halo approximate minimum degree for efficient sparse matrix ordering},
journal=CPE,
year={2000},
volume={12},
number={2-3},
pages={69--84},
url={ http://dx.doi.org/10.1002/(SICI)1096-9128(200002/03)12:2/3<69::AID-CPE472>3.0.CO;2-W },
keywords={sparse matrix ordering, nested dissection, approximate minimum degree, halo},
abstract={Minimum degree and nested dissection are the two most
popular reordering schemes used to reduce fill-in and operation count
when factoring and solving sparse matrices. Most of the
state-of-the-art ordering packages hybridize these methods by
performing incomplete nested dissection and ordering by minimum degree
the subgraphs associated with the leaves of the separation tree, but
most often only loose couplings have been achieved, resulting in poorer
performance than could have been expected. This paper presents a tight
coupling of the nested dissection and halo approximate minimum degree
algorithms, which allows the minimum degree algorithm to use exact
degrees on the boundaries of the subgraphs passed to it and to yield
back not only the ordering of the nodes of the subgraph, but also the
amalgamated assembly subtrees, for efficient block
computations.Experimental results show the performance improvement of
this hybridization, both in terms of fill-in reduction and increase of
concurrency on a parallel sparse block symmetric solver. }
}
%got
@article{Peters84,
author={Peters, F. J.},
year={1984},
title={Parallel Pivoting Algorithms for Sparse Symmetric Matrices},
journal=PC,
volume={1},
number={1},
pages={99--110},
keywords={parallel ordering symmetric matrices},
url={ http://dx.doi.org/10.1016/S0167-8191(84)90446-0 },
keywords={Sparse matrix},
keywords={LU-decomposition},
keywords={minimum-degree ordering},
keywords={MIMD computer },
abstract={In this paper it is investigated which pivots may be
processed simultaneously when solving a set of linear equations. It is
shown that for dense sets of equations all the pivots must necessarily
be processed one at a time; only if the set is sufficiently sparse,
some pivots may be processed simultaneously. We present parallel
pivoting algorithms for {MIMD} computers with sufficiently many
processors and a common memory. Moreover we present algorithms for
{MIMD} computers with an arbitrary, but fixed number of processors.
For both types of computers algorithms embodying an ordering strategy
are given. }
}
%got (Tim has book from TAMU library)
@incollection{Peters85,
author={Peters, F. J.},
year={1985},
title={Parallelism and Sparse Linear Equations},
editor={Evans, D. J.},
booktitle={Sparsity and Its Applications},
publisher={Cambridge, United Kingdom: Cambridge University Press},
pages={285--301},
keywords={parallel linear equations}
}
%got
@article{PetersWilkinson70,
author={Peters, G. and Wilkinson, J. H.},
title={The least squares problem and pseudo-inverses},
journal=CJ,
volume=13,
year={1970},
pages={309--316},
url={ http://dx.doi.org/10.1093/comjnl/13.3.309 },
abstract={ This paper presents a number of the most efficient methods
for computing the pseudo-inverse of an $m \times n$ matrix, developing them
from a uniform standpoint. It shows that these are the natural
extensions of the more common methods for inverting an $n \times n$ matrix.
},
}
%got
@article{Peyton01,
author={Peyton, B. W.},
title={Minimal orderings revisited},
journal=SIMAX,
year={2001},
volume={23},
number={1},
pages={271--294},
url={ http://dx.doi.org/10.1137/S089547989936443X },
abstract={When minimum orderings proved too difficult to deal with,
Rose, Tarjan, and Lueker instead studied minimal orderings and how to
compute them [SIAM J. Comput., 5 (1976), pp. 266--283]. This paper
introduces an algorithm that is capable of computing much better
minimal orderings much more efficiently than the algorithm of Rose,
Tarjan, and Lueker. The new insight is a way to use certain structures
and concepts from modern sparse Cholesky solvers to reexpress one of
the basic results of Rose, Tarjan, and Lueker. The new algorithm begins
with any initial ordering and then refines it until a minimal ordering
is obtained. It is simple to obtain high-quality low-cost minimal
orderings by using fill-reducing heuristic orderings as initial
orderings for the algorithm. We examine several such initial orderings
in some detail. Our results here and previous work by others indicate
that the improvements obtained over the initial heuristic orderings are
relatively small because the initial orderings are minimal or nearly
minimal. Nested dissection orderings provide some significant
exceptions to this rule.}
}
%got
@article{PeytonPothenYuan93,
author={Peyton, B. W. and Pothen, A. and Yuan, X.},
title={Partitioning a chordal graph into transitive subgraphs for parallel sparse triangular solution},
journal=LAA,
year={1993},
volume={192},
pages={329--354},
}
%got
@article{PeytonPothenYuan95,
author={Peyton, B. W. and Pothen, A. and Yuan, X.},
title={A clique tree algorithm for partitioning a chordal graph into transitive subgraphs},
journal=LAA,
volume={223/224},
pages={553--588},
year={1995},
annote={was PeytonPothenYuan93b, tech report CS-93-27, Univ. Waterloo}
}
%got
@inproceedings{PierceHungLiuTsaiWangYu09,
author={Pierce, D. J. and Hung, Y. and Liu, C.-C. and Tsai, Y.-H. and Wang, W. and Yu, D.},
booktitle={Workshop on {GPU} Supercomputing},
publisher={National Taiwan University},
address={Taipei},
month=jan,
title={Sparse multifrontal performance gains via {NVIDIA} {GPU}},
year={2009},
note={ http://cqse.ntu.edu.tw/cqse/gpu2009.html }
}
%got
@article{PierceLewis97,
author={Pierce, D. J. and Lewis, J. G.},
title={Sparse Multifrontal Rank Revealing {QR} Factorization},
year={1997},
journal=SIMAX,
volume={18},
number={1},
pages={159--180},
}
%got
@article{Pina81,
author={Pina, H. L. G.},
year={1981},
title={An Algorithm for Frontwidth Reduction},
journal=IJNME,
publisher={John Wiley \& Sons, Ltd},
volume={17},
number={10},
pages={1539--1546},
url={ http://dx.doi.org/10.1002/nme.1620171008 },
abstract={This paper presents an algorithm for obtaining a small
frontwidth. This feature is of interest when employing the frontal
technique for solution of systems of linear equations in the finite
element method. The performance of the algorithm is assessed by several
examples at the end of the paper.},
}
%got (Tim has book from TAMU library)
@book{Pissanetsky84,
author={Pissanetsky, S.},
title={Sparse Matrix Technology},
publisher={London: Academic Press},
year={1984},
address={London}
}
%%POTHEN ----------------------------------------------------------------------
%got
@article{Pothen93,
author={Pothen, A.},
title={Predicting the structure of sparse orthogonal factors},
journal=LAA,
volume={194},
pages={183--204},
year={1993},
}
%got
@incollection{Pothen96,
author={Pothen, A.},
title={Graph partitioning algorithms with applications to scientific computing},
booktitle={Parallel Numerical Algorithms},
publisher={Kluwer Academic Publishers},
year={1996},
pages={323--368},
editor={Keyes, D. E. and Sameh, A. H. and Venkatakrishnan, V.},
}
%got
@article{PothenAlvarado92,
author={Pothen, A. and Alvarado, F. L.},
title={A Fast Reordering Algorithm for Parallel Sparse Triangular Solution},
journal=SISC,
year={1992},
volume={13},
number={2},
pages={645--653},
url={ http://dx.doi.org/10.1137/0913036 },
}
%got
@article{PothenFan90,
author={Pothen, A. and Fan, C.},
title={Computing the Block Triangular Form of a Sparse Matrix},
journal=TOMS,
year={1990},
month=dec,
volume={16},
number={4},
pages={303--324},
url={ http://dx.doi.org/10.1145/98267.98287 },
abstract={We consider the problem of permuting the rows and
columns of a rectangular or square, unsymmetric sparse matrix to
compute its block triangular form. This block triangular form is based
on a canonical decomposition of bipartite graphs induced by a maximum
matching and was discovered by Dulmage and Mendelsohn. We describe
implementations of algorithms to compute the block triangular form and
provide computational results on sparse matrices from test collections.
Several applications of the block triangular form are also included.},
}
%got
@article{PothenSimonLiou90,
author={Pothen, A. and Simon, H. D. and Liou, K.},
title={Partitioning Sparse Matrices with Eigenvectors of Graphs},
journal=SIMAX,
volume={11},
number={3},
pages={430--452},
year={1990},
url={ http://dx.doi.org/10.1137/0611030 },
}
%got
@incollection{PothenSun90,
author={Pothen, A. and Sun, C.},
title={Compact Clique Tree Data Structures in Sparse Matrix Factorizations},
editor={Coleman, T. F. and Li, Y.},
booktitle={Large Scale Numerical Optimization},
chapter={12},
publisher={SIAM},
year={1990},
}
%got
@article{PothenSun93,
author={Pothen, A. and Sun, C.},
title={A Mapping Algorithm for Parallel Sparse {Cholesky} Factorization},
journal=SISC,
year={1993},
volume={14},
number={5},
month=sep,
pages={1253--1257},
url={ http://dx.doi.org/10.1137/0914074 },
abstract={A task-to-processor mapping algorithm is described for
computing the parallel multifrontal Cholesky factorization of irregular
sparse problems on distributed-memory multiprocessors. The performance
of the mapping algorithm is compared with the only general mapping
algorithm previously reported. Using this mapping, the distributed
multifrontal algorithm is nearly as efficient on a collection of
problems with irregular sparsity structure as it is for the regular
grid problems.}
}
%got
@incollection{PothenToledo04,
author={Pothen, A. and Toledo, S.},
title={Elimination structures in scientific computing},
booktitle={Handbook on Data Structures and Applications},
chapter={59},
editor={Mehta, D. and Sahni, S.},
publisher={Chapman and Hall/CRC},
year={2004},
url={ http://dx.doi.org/10.1201/9781420035179.ch59 }
}
%GET
@techreport{PouransariCoulierDarve15,
author={Pouransari, H. and Coulier, P. and Darve, E.},
title={Fast hierarchical solvers for sparse matrices},
institution={Dept. of Mechanical Engineering, Stanford University, and Dept. of Civil Engineering, KU Leuven},
year={2015},
number={arXiv:1510.07363},
month=oct
}
%%RAGHAVAN --------------------------------------------------------------------
%got
@article{Raghavan95,
author={Raghavan, P.},
title={Distributed sparse {Gaussian} elimination and orthogonal factorization},
journal=SISC,
year={1995},
volume={16},
number={6},
pages={1462--1477},
url={ http://dx.doi.org/10.1137/0916085 },
annote={was Raghavan93b, TR UIUCDCS-R-93-1818},
abstract={A unified framework is presented for a fully parallel
solution of large, sparse nonsymmetric linear systems on distributed
memory multiprocessors. Unlike earlier work, both symbolic and numeric
steps are parallelized. Parallel Cartesian nested dissection is used to
compute a fill-reducing ordering of A using a compact representation of
the column intersection graph, and the resulting separator tree is used
to estimate the structure of the factor and to distribute data and
perform multifrontal numeric computations. When the matrix is
nonsymmetric but square, the numeric computations involve Gaussian
elimination with partial pivoting; when the matrix is overdetermined,
row-oriented Householder transforms are applied to compute the
triangular factor of an orthogonal factorization. Extensive empirical
results are provided to demonstrate that the approach is effective both
in preserving sparsity and achieving good parallel performance on an
Intel iPSC/860. }
}
%got
@article{Raghavan97,
author={Raghavan, P.},
title={Parallel ordering using edge contraction},
journal=PC,
volume={23},
number={8},
pages={1045--1067},
year={1997},
url={ http://dx.doi.org/10.1016/S0167-8191(97)00018-5 },
keywords={Parallel algorithms, Sparse linear systems, Fill-in, Ordering,
Sparse matrix factorization, Nested dissection, Parallel nested dissection},
abstract={Computing a fill-reducing ordering of a sparse matrix is a
central problem in the solution of sparse linear systems using direct
methods. In recent years, there has been significant research in
developing a sparse direct solver suitable for message-passing
multiprocessors. However, computing the ordering step in parallel
remains a challenge and there are very few methods available. This
paper describes a new scheme called parallel contracted ordering which
is a combination of a new parallel nested dissection heuristic and any
serial ordering method. The new nested dissection heuristic called
Shrink-Split {ND} (SSND) is based on parallel graph contraction. For
a system with N unknowns, the complexity of {SSND} is O((N/P)log P)
using P processors in a hypercube; the overall complexity is O((N/P)log
N) when the serial ordering method chosen is graph exploration based
nested dissection. We provide extensive empirical results on the
quality of the ordering. We also report on the parallel performance of
a preliminary implementation on three different message passing
multiprocessors. }
}
%got
@article{Raghavan98,
author={Raghavan, P.},
title={Efficient Parallel Sparse Triangular Solution Using Selective Inversion},
journal=PARALETTERS,
volume={8},
number={1},
pages={29--40},
year={1998},
url={ http://dx.doi.org/10.1142/S0129626498000067 },
abstract={In a fully parallel sparse direct solver the matrix factors
are first computed and then used in forward and back substitution steps
to compute the solution. On message passing multiprocessors these
substitution steps are not performed at high efficiency and pose a
performance bottleneck for applications in which many systems with the
same matrix are solved. We present a ``selective inversion'' scheme (SI)
which computes inverses of a sequence of submatrices of the factor and
uses these to replace substitution steps by more efficient distributed
matrix-vector multiplications. Experiments on the Intel Paragon and the
IBM-SP2 demonstrate that the scheme has ideal scaled efficiency for
1--128 processors and is significantly faster than the traditional
approach. The cost of selective inversion is a small fraction of the
factorization cost; it is approximately 6\% of the factorization cost
for the model, two and three dimensional five-point, finite-difference
grids. On message-passing multiprocessors with high communication
latency, the extra inversion cost is easily offset by reduced
triangular solution time even for a relatively small number of
right-hand-side vectors.},
}
%got
@techreport{Raghavan02,
author={Raghavan, P.},
title={{DSCPACK}: Domain-separator codes for the parallel solution of sparse linear systems},
institution={Penn State University},
address={State College, PA},
number={CSE-02-004},
year={2002},
url={ http://www.cse.psu.edu/~pxr3/software.html },
note={ http://www.cse.psu.edu/$\sim$pxr3/software.html },
}
%%R continued -----------------------------------------------------------------
%got
@article{RauberRungerScholtes99,
author={Rauber, T. and R\"unger, G. and Scholtes, C.},
title={Scalability of sparse {Cholesky} factorization},
journal=IJHSC,
volume={10},
number={1},
pages={19--52},
year={1999},
url={ http://dx.doi.org/10.1142/S012905339900003X }
}
%got
@article{Razzaque80,
author={Razzaque, A.},
title={Automatic reduction of frontwidth for finite element analysis},
journal=IJNME,
publisher={John Wiley \& Sons, Ltd},
volume={15},
number={9},
year={1980},
pages={1315--1324},
url={ http://dx.doi.org/10.1002/nme.1620150904 },
abstract={An algorithm is presented for reducing the frontwidth of
finite element meshes. The technique takes an arbitrary input scheme
and reorders the elements so as to reduce the frontwidth. A number of
examples are presented to demonstrate the reliability and effectiveness
of the method.},
}
%%REID ------------------------------------------------------------------------
%get (Iain has book)
@book{Reid71b,
editor={Reid, J. K.},
title={Large Sparse Sets of Linear Equations},
publisher={New York: Academic Press},
year={1971},
note={Proc. Oxford Conf. Organized by the Inst. of Mathematics and its Applications (April 1970)},
url={ https://epubs.stfc.ac.uk/work/39906 },
annote={review by Parlett at http://dx.doi.org/10.1137/1016066 },
}
%got (Tim has book from TAMU library)
@incollection{Reid74,
author={Reid, J. K.},
year={1974},
title={Direct Methods for Sparse Matrices},
editor={Evans, D. J.},
booktitle={Software for Numerical Mathematics},
publisher={New York: Academic Press},
pages={29--48},
}
%got
@incollection{Reid77b,
author={Reid, J. K.},
year={1977},
title={Solution of Linear Systems of Equations: Direct Methods (general)},
booktitle={Sparse Matrix Techniques},
series={Lecture Notes in Mathematics},
volume={572},
publisher={Berlin: Springer-Verlag},
editor={Barker, V. A.},
pages={102--129},
}
%got (Tim has book from TAMU library)
@incollection{Reid77,
author={Reid, J. K.},
year={1977},
title={Sparse Matrices},
editor={Jacobs, D. A. H.},
booktitle={The State of the Art in Numerical Analysis},
publisher={New York: Academic Press},
pages={85--146},
}
%got (Tim has book from TAMU library)
@incollection{Reid81,
author={Reid, J. K.},
year={1981},
title={Frontal Methods for Solving Finite-Element Systems of Linear Equations},
editor={Duff, I. S.},
booktitle={Sparse Matrices and Their Uses},
publisher={New York: Academic Press},
pages={265--281},
keywords={frontal methods finite element method}
}
%got
@article{Reid82,
author={Reid, J. K.},
title={A sparsity-exploiting variant of the {Bartels-Golub} decomposition for linear programming bases},
year={1982},
journal=MATHPROG,
volume={24},
number={1},
url={ http://dx.doi.org/10.1007/BF01585094 },
keywords={Linear Programming; Sparse LU Decomposition; Updating Ip Basis Factorizations; Bartels-Golub Decomposition},
pages={55--69},
}
%got
@article{ReidScott99,
author={Reid, J. K. and Scott, J. A.},
title={Ordering symmetric sparse matrices for small profile and wavefront},
journal=IJNME,
publisher={John Wiley \& Sons, Ltd.},
volume={45},
number={12},
pages={1737--1755},
year={1999},
url={ http://dx.doi.org/10.1002/(SICI)1097-0207(19990830)45:12<1737::AID-NME652>3.0.CO;2-T },
keywords={sparse matrices, symmetric pattern, profile reduction,
Sloan algorithm, reverse Cuthill-McKee algorithm, spectral method},
abstract={The ordering of large sparse symmetric matrices for small
profile and wavefront or for small bandwidth is important for the
efficiency of frontal and variable-band solvers. In this paper, we look
at the computation of pseudoperipheral nodes and compare the
effectiveness of using an algorithm based on level-set structures with
using the spectral method as the basis of the Reverse Cuthill-McKee
algorithm for bandwidth reduction. We also consider a number of ways of
improving the performance and efficiency of Sloan's algorithm for
profile and wavefront reduction, including the use of different
weights, the use of supervariables, and implementing the priority queue
as a binary heap. We also examine the use of the spectral ordering in
combination with Sloan's algorithm. The design of software to implement
the reverse Cuthill-McKee algorithm and a modified Sloan's algorithm is
discussed. Extensive numerical experiments that justify our choice of
algorithm are reported on.}
}
%got
@article{ReidScott01,
author={Reid, J. K. and Scott, J. A.},
title={Reversing the row order for the row-by-row frontal method},
journal=NLAA,
publisher={John Wiley \& Sons, Ltd.},
volume={8},
number={1},
pages={1--6},
year={2001},
url={ http://dx.doi.org/10.1002/1099-1506(200101/02)8:1<1::AID-NLA223>3.0.CO;2-I },
keywords={ordering rows, frontal method, sparse unsymmetric matrices},
abstract={The efficiency of the row-by-row frontal method for the
solution of unsymmetric sparse linear systems of equations is dependent
on the row ordering used. Numerical experience has shown us that it can
be advantageous to reverse a given row ordering. We present two results
on invariances under the reversal of the ordering and use real
applications to illustrate the variations that can take place upon row
reversal.}
}
%got
@article{ReidScott02,
author={Reid, J. K. and Scott, J. A.},
title={Implementing {Hager}'s exchange methods for matrix profile reduction},
journal=TOMS,
year={2002},
month=dec,
volume={28},
number={4},
pages={377--391},
url={ http://dx.doi.org/10.1145/592843.592844 },
abstract={Hager recently introduced down and up exchange methods
for reducing the profile of a sparse matrix with a symmetric sparsity
pattern. The methods are particularly useful for refining orderings
that have been obtained using a standard profile reduction algorithm,
such as the Sloan method. The running times for the exchange algorithms
reported by Hager suggested their cost could be prohibitive for
practical applications. We examine how to implement the exchange
algorithms efficiently. For a range of real test problems, it is shown
that the cost of running our new implementation does not add a
prohibitive overhead to the cost of the original reordering.},
}
%got
@article{ReidScott09b,
author={Reid, J. K. and Scott, J. A.},
title={An efficient out-of-core multifrontal solver for large-scale unsymmetric element problems},
journal=IJNME,
publisher={John Wiley \& Sons, Ltd.},
volume={77},
number={7},
pages={901--921},
year={2009},
url={ http://dx.doi.org/10.1002/nme.2437 },
keywords={large sparse unsymmetric linear systems, element problems,
out-of-core solver, multifrontal, rook pivoting, partial pivoting,
Fortran 95, HSL_MA74},
abstract={In many applications where the efficient solution of large
sparse linear systems of equations is required, a direct method is
frequently the method of choice. Unfortunately, direct methods have a
potentially severe limitation: as the problem size grows, the memory
needed generally increases rapidly. However, the in-core memory
requirements can be limited by storing the matrix and its factors
externally, allowing the solver to be used for very large problems. We
have designed a new out-of-core package for the large sparse
unsymmetric systems that arise from finite-element problems. The code,
which is called HSL_MA78, implements a multifrontal algorithm and
achieves efficiency through the use of specially designed code for
handling the input/output operations and efficient dense linear algebra
kernels. These kernels, which are available as a separate package
called HSL_MA74, use high-level BLAS to perform the partial
factorization of the frontal matrices and offer both threshold partial
and rook pivoting. In this paper, we describe the design of HSL_MA78
and explain its user interface and the options it offers. We also
describe the algorithms used by HSL_MA74 and illustrate the performance
of our new codes using problems from a range of practical applications.}
}
%got
@article{ReidScott09,
author={Reid, J. K. and Scott, J. A.},
title={An Out-of-core Sparse {Cholesky} Solver},
journal=TOMS,
year={2009},
month=mar,
volume={36},
number={2},
pages={9:1--9:33},
url={ http://dx.doi.org/10.1145/1499096.1499098 },
abstract={Direct methods for solving large sparse linear systems
of equations are popular because of their generality and robustness.
Their main weakness is that the memory they require usually increases
rapidly with problem size. We discuss the design and development of the
first release of a new symmetric direct solver that aims to circumvent
this limitation by allowing the system matrix, intermediate data, and
the matrix factors to be stored externally. The code, which is written
in Fortran and called {\tt HSL\_MA77}, implements a multifrontal
algorithm. The first release is for positive-definite systems and
performs a Cholesky factorization. Special attention is paid to the use
of efficient dense linear algebra kernel codes that handle the
full-matrix operations on the frontal matrix and to the input/output
operations. The input/output operations are performed using a separate
package that provides a virtual-memory system and allows the data to be
spread over many files; for very large problems these may be held on
more than one device. Numerical results are presented for a
collection of 30 large real-world problems, all of which were solved
successfully.},
}
%%R continued -----------------------------------------------------------------
%got
@article{Reiszig07,
author={Rei{\ss}ig, G.},
title={Local fill reduction techniques for sparse symmetric linear systems},
journal=EE,
year={2007},
volume={89},
number={8},
pages={639--652},
month=sep,
url={ http://dx.doi.org/10.1007/s00202-006-0042-2 },
abstract={Local algorithms for obtaining pivot orderings for sparse
symmetric coefficient matrices are reviewed together with their
mathematical background, appropriate data structures and details of
efficient implementation. Heuristics that go beyond the classical
Minimum Degree and Minimum Local Fill scoring functions are discussed,
illustrated, improved and extensively tested on a test suite of
matrices from various applications. Our tests indicate that the
presented techniques have the potential of accelerating circuit
simulation significantly. }
}
%got
@inproceedings{RennichStosicDavis14,
author={Rennich, S. C. and Stosic, D. and Davis, T. A.},
title={Accelerating Sparse {Cholesky} Factorization on {GPUs}},
year={2014},
booktitle={Proc. IA3 Workshop on Irregular Applications: Architectures and Algorithms},
address={New Orleans, LA},
note={Held in conjunction with SC14},
pages={9--16},
}
%got
@article{RobeySulsky94,
author={Robey, T. H. and Sulsky, D. L.},
title={Row orderings for a sparse {QR} decomposition},
journal=SIMAX,
year={1994},
volume={15},
number={4},
pages={1208--1225},
url={ http://dx.doi.org/10.1137/S0895479890185641 },
abstract={A new row ordering strategy based on pairing rows to
minimize local fill-in is presented. The row ordering can be combined
with most column ordering strategies to reduce computation, maintain
sparsity, and solve rank deficient problems. Comparison of the new row
pairing algorithm with Duff's fixed pivot row ordering on a collection
of sparse matrix test problems shows a median 47-71\% reduction,
depending on the column ordering, in floating point operations (flops)
required for the QR decomposition. On a finite element application
using nested domain decomposition for the column ordering, the new row
ordering is competitive with the row ordering from nested domain
decomposition. }
}
%%ROSE ------------------------------------------------------------------------
%got (Tim has book from TAMU library)
@incollection{Rose72,
author={Rose, D. J.},
year={1972},
title={A Graph-Theoretic Study of the Numerical Solution of Sparse Positive Definite Systems of Linear Equations},
editor={Read, R. C.},
booktitle={Graph Theory and Computing},
publisher={New York: Academic Press},
pages={183--217},
url={ https://books.google.com/books?id=ja7iBQAAQBAJ },
keywords={ordering graph theory positive definite matrices},
}
%got
@incollection{RoseBunch72,
author={Rose, D. J. and Bunch, J. R.},
title={The Role of Partitioning in the Numerical Solution of Sparse Systems},
pages={177--187},
editor={Rose, D. J. and Willoughby, R. A.},
booktitle={Sparse Matrices and Their Applications},
address={New York},
publisher={New York: Plenum Press},
year={1972},
url={ http://link.springer.com/book/10.1007%2F978-1-4615-8675-3 },
}
%got
@article{RoseTarjan78,
author={Rose, D. J. and Tarjan, R. E.},
year={1978},
title={Algorithmic Aspects of Vertex Elimination on Directed Graphs},
journal=SIAMJAM,
volume={34},
number={1},
pages={176--197},
annote={in Proc. 7th Annual Symp. on Theory of Computing, 1975},
url={ http://dx.doi.org/10.1137/0134014 },
abstract={We consider a graph-theoretic elimination process which
models Gaussian elimination on sparse systems of linear equations. We
describe this process by constructive theoretical results which lead to
efficient algorithms to 1) compute the fill-in produced by any
elimination ordering; 2) find a perfect elimination ordering if one
exists; and 3) reduce any fill-in to a minimal fill-in. We relate the
complexity of these elimination problems to other well-studied problems
by showing that tasks 1) and 2) are at least as time-consuming as
testing whether a directed graph is transitive, and that the problem of
finding a minimum ordering is NP-complete. }
}
%got
@article{RoseTarjanLueker76,
author={Rose, D. J. and Tarjan, R. E. and Lueker, G. S.},
year={1976},
title={Algorithmic Aspects of Vertex Elimination on Graphs},
journal=SICOMP,
volume={5},
pages={266--283},
url={ http://dx.doi.org/10.1137/0205021 },
abstract={We consider a graph-theoretic elimination process which is
related to performing Gaussian elimination on sparse symmetric positive
definite systems of linear equations. We give a new linear-time
algorithm to calculate the fill-in produced by any elimination
ordering, and we give two new related algorithms for finding orderings
with special properties. One algorithm, based on breadth-first search,
finds a perfect elimination ordering, if any exists, in O(n+e)
time, if the problem graph has n vertices and e edges. An extension of
this algorithm finds a minimal (but not necessarily minimum) ordering
in O(ne) time. We conjecture that the problem of finding a minimum
ordering is NP-complete. }
}
%got
@article{RoseWhittenShermanTarjan80,
author={Rose, D. J. and Whitten, G. G. and Sherman, A. H. and Tarjan, R. E.},
year={1980},
title={Algorithms and Software for In-Core Factorization of Sparse Symmetric Positive Definite Matrices},
journal=CAS,
volume={11},
number={6},
pages={597--608},
url={ http://dx.doi.org/10.1016/0045-7949(80)90066-8 },
abstract={This paper surveys the current state-of-the-art in the
solution by Gaussian elimination of sparse, symmetric, and positive
definite systems of linear equations. The most important in-core
implementations of LDLt factorization are described and compared based
on the results of numerical experiments.}
}
%got
@book{RoseWilloughby72,
editor={Rose, D. J. and Willoughby, R. A.},
title={Sparse Matrices and Their Applications},
publisher={New York: Plenum Press},
address={New York},
year={1972},
url={ http://link.springer.com/book/10.1007%2F978-1-4615-8675-3 },
annote={review by Parlett at http://dx.doi.org/10.1137/1016066 },
}
%%ROTHBERG ---------------------------------------------------------------------
%got
@article{Rothberg95,
author={Rothberg, E.},
title={Alternatives for solving sparse triangular systems on distributed-memory computers},
journal=PC,
year={1995},
volume={21},
pages={1121--1136},
url={ http://dx.doi.org/10.1016/0167-8191(95)00003-7 },
abstract={ The solution of sparse triangular systems plays an important
role in the direct solution of sparse linear equations. This paper
evaluates two alternative approaches to performing this computation on
distributed-memory multiprocessors. First we consider a traditional
approach, which assumes a column distribution of the triangular system
among the processors. We find the performance of this approach to be
extremely low, for a variety of reasons. We then consider an
alternative approach, which assumes a block-column or panel
distribution of the triangular system. This alternative is found to
provide several important advantages over a column approach. Overall,
the panel approach provides as much as a ten-fold parallel performance
improvement. Performance results are presented from highly optimized
implementations on the iPSC 860 system. },
}
%got
@article{Rothberg96,
author={Rothberg, E.},
title={Performance of Panel and Block Approaches to Sparse {Cholesky} Factorization on the {iPSC/860} and {Paragon} Multicomputers},
publisher={SIAM},
year={1996},
journal=SISC,
volume={17},
number={3},
pages={699--713},
keywords={sparse Cholesky factorization; parallel machines; sparse matrices; scalability},
url={ http://dx.doi.org/10.1137/S106482759426715X },
abstract={Sparse Cholesky factorization has historically achieved
extremely low performance on distributed-memory multiprocessors. We
believe that three issues must be addressed to improve this situation:
(1) parallel factorization methods must be based on more efficient
sequential methods; (2) parallel machines must provide higher
interprocessor communication bandwidth; and (3) the sparse matrices
used to evaluate parallel sparse factorization performance should be
more representative of the sizes of matrices people would factor on
large parallel machines. This paper demonstrates that all three of
these issues have in fact already been addressed. Specifically, (1)
single node performance can be improved by moving from a
column-oriented approach, where the computational kernel is level 1
BLAS, to either a panel- or block-oriented approach, where the
computational kernel is level 3 BLAS; (2) communication hardware has
improved dramatically, with new parallel computers (the Intel Paragon
system) providing one to two orders of magnitude higher communication
bandwidth than previous parallel computers (the Intel iPSC/860 system);
and (3) several larger benchmark matrices are now available, and newer
parallel machines offer sufficient memory per node to factor these
larger matrices. The result of addressing these three issues is
extremely high performance on moderately parallel machines. This paper
demonstrates performance levels of 650 double-precision Mflops on 32
nodes of the Intel Paragon system, 1 Gflop on 64 nodes, and 1.7 Gflops
on 128 nodes. This paper also does a direct performance comparison
between the iPSC/860 and Paragon systems, as well as a comparison
between panel- and block-oriented approaches to parallel
factorization.}
}
%got
@article{RothbergEisenstat98,
author={Rothberg, E. and Eisenstat, S. C.},
title={Node selection strategies for bottom-up sparse matrix orderings},
journal=SIMAX,
year={1998},
volume={19},
number={3},
pages={682--695},
url={ http://dx.doi.org/10.1137/S0895479896302692 },
abstract={The minimum degree and minimum local fill algorithms are two
bottom-up heuristics for reordering a sparse matrix prior to
factorization. Minimum degree chooses a node of least degree to
eliminate next; minimum local fill chooses a node whose elimination
creates the least fill. Contrary to popular belief, we find that
minimum local fill produces significantly better orderings than minimum
degree, albeit at a greatly increased runtime. We describe two simple
modifications to this strategy that further improve ordering quality.
We also describe a simple modification to minimum degree, which we term
approximate minimum mean local fill, that reduces factorization work by
roughly 25\% with only a small increase in runtime. }
}
%got
@article{RothbergGupta91,
author={Rothberg, E. and Gupta, A.},
title={Efficient sparse matrix factorization on high-performance workstations - Exploiting the memory hierarchy},
journal=TOMS,
volume={17},
number={3},
pages={313--334},
year={1991},
url={ http://dx.doi.org/10.1145/114697.116809 },
}
%got
@article{RothbergGupta93,
author={Rothberg, E. and Gupta, A.},
title={An evaluation of left-looking, right-looking, and multifrontal approaches to sparse {Cholesky} factorization on hierarchical-memory machines},
journal=IJHSC,
volume={5},
number={4},
month=nov,
year={1993},
pages={537--593},
url={ http://dx.doi.org/10.1142/S0129053393000232 },
abstract={In this paper we present a comprehensive analysis of the
performance of a variety of sparse Cholesky factorization methods on
hierarchical-memory machines. We investigate methods that vary along
two different axes. Along the first axis, we consider three different
high-level approaches to sparse factorization: left-looking,
right-looking and multifrontal. Along the second axis, we consider the
implementation of each of these high-level approaches using different
sets of primitives. The primitives vary based on the structures they
manipulate. One important structure in sparse Cholesky factorization is
a single column of the matrix. We first consider primitives that
manipulate single columns. These are the most commonly used primitives
for expressing the sparse Cholesky computation. Another important
structure is the supernode, a set of columns with identical non-zero
structures. We consider sets of primitives that exploit the supernodal
structure of the matrix to varying degrees. We find that primitives
that manipulate larger structures greatly increase the amount of
exploitable data reuse, thus leading to dramatically higher performance
on hierarchical-memory machines. We observe performance increases of
two to three times when comparing methods based on primitives that make
extensive use of the supernodal structure to methods based on
primitives that manipulate columns. We also find that the overall
approach (left-looking, right-looking, or multifrontal) is less
important for performance than the particular set of primitives used to
implement the approach. },
}
%got
@article{RothbergGupta94,
author={Rothberg, E. and Gupta, A.},
title={An Efficient Block-Oriented Approach to Parallel Sparse {Cholesky} Factorization},
journal=SISC,
publisher={SIAM},
volume={15},
number={6},
pages={1413--1439},
year={1994},
url={ http://dx.doi.org/10.1137/0915085 },
keywords={sparse Cholesky factorization; systems of linear equations; parallel computing; supernodes; scalability},
abstract={This paper explores the use of a subblock decomposition
strategy for parallel sparse Cholesky factorization in which the sparse
matrix is decomposed into rectangular blocks. Such a strategy has
enormous theoretical scalability advantages over more traditional
column-oriented and panel-oriented decompositions. However, little
progress has been made in producing a practical subblock method. This
paper describes and evaluates an approach that is simple to implement,
provides slightly higher performance than column (and panel) methods on
small parallel machines, and has the potential to provide much higher
performance on large parallel machines. }
}
%got
@inproceedings{RothbergSchreiber94,
author={Rothberg, E. and Schreiber, R.},
title={Improved load distribution in parallel sparse {Cholesky} factorization},
year={1994},
booktitle={Proc. Supercomputing '94},
publisher={IEEE},
pages={783--792},
url={ http://dx.doi.org/10.1109/SUPERC.1994.344344 },
abstract={Compared to the customary column oriented approaches, block
oriented, distributed memory sparse Cholesky factorization benefits
from an asymptotic reduction in interprocessor communication volume and
an asymptotic increase in the amount of concurrency that is exposed in
the problem. Unfortunately, block oriented approaches (specifically,
the block fan out method) have suffered from poor balance of the
computational load. As a result, achieved performance can be quite low.
The paper investigates the reasons for this load imbalance and proposes
simple block mapping heuristics that dramatically improve it. The
result is a roughly 20\% increase in realized parallel factorization
performance, as demonstrated by performance results from an Intel
Paragon system. We have achieved performance of nearly 3.2 billion
floating point operations per second with this technique on a 196 node
Paragon system.},
}
%got
@article{RothbergSchreiber99,
author={Rothberg, E. and Schreiber, R.},
title={Efficient methods for out-of-core sparse {Cholesky} factorization},
journal=SISC,
volume={21},
number={1},
pages={129--144},
year={1999},
url={ http://dx.doi.org/10.1137/S1064827597322975 },
abstract={We consider the problem of sparse Cholesky factorization with
limited main memory. The goal is to efficiently factor matrices whose
Cholesky factors essentially fill the available disk storage, using
very little memory (as little as 16 Megabytes (MBytes)). This would
enable very large industrial problems to be solved with workstations of
very modest cost. We consider three candidate algorithms. Each is
based on a partitioning of the matrix into panels. The first is a
robust, out-of-core multifrontal method that keeps the factor, the
stack, and the large frontal matrices on disk. The others are
left-looking methods. We find that straightforward implementations of
all of them suffer from excessive disk I/O for large problems that
arise in interior-point algorithms for linear programming. We introduce
several improvements to these simple out-of-core methods and find that
a left-looking method that nevertheless uses the multifrontal algorithm
for portions of the matrix (subtrees of the supernodal elimination tree
whose multifrontal stack fits in memory) is very effective. With 32
Mbytes of main memory, it achieves over 77\% of its in-core performance
on all but one of our 12 test matrices (67\% in that one case), even
though the size of the factor is, in all cases, hundreds of millions or
even billions of bytes. }
}
%%R continued ------------------------------------------------------------------
%got
@article{RotkinToledo04,
author={Rotkin, V. and Toledo, S.},
title={The design and implementation of a new out-of-core sparse {Cholesky} factorization method},
journal=TOMS,
year={2004},
volume={30},
number={1},
pages={19--46},
}
%GET
@techreport{RouetLiGhyselsNapov15,
author={Rouet, F.-H. and Li, X. S. and Ghysels, P. and Napov, A.},
title={A distributed-memory package for dense Hierarchically Semi-Separable matrix computations using randomization},
institution={Lawrence Berkeley National Laboratory, Berkeley},
year={2015},
number={arXiv:1503.05464},
month={March}
}
%got
@article{RozinToledo05,
author={Rozin, E. and Toledo, S.},
title={Locality of reference in sparse {Cholesky} methods},
journal=ETNA,
volume={21},
pages={81--106},
year={2005},
url={ http://etna.mcs.kent.edu/volumes/2001-2010/vol21/abstract.php?vol=21&pages=81-106 },
abstract={This paper analyzes the cache efficiency of two
high-performance sparse Cholesky factorization algorithms: the
multifrontal algorithm and the left-looking algorithm. These two are
essentially the only two algorithms that are used in current codes;
generalizations of these algorithms are used in general-symmetric and
general-unsymmetric sparse triangular factorization codes. Our
theoretical analysis shows that while both algorithms sometimes enjoy a
high level of data reuse in the cache, they are incomparable: there are
matrices on which one is cache efficient and the other is not, and vice
versa. The theoretical analysis is backed up by detailed experimental
evidence, which shows that our theoretical analyses do predict
cache-miss rates and performance in practice, even though the theory
uses a fairly simple cache model. We also show, experimentally, that on
matrices arising from finite-element structural analysis, the
left-looking algorithm consistently outperforms the multifrontal
algorithm. Direct cache-miss measurements indicate that the difference
in performance is largely due to differences in the number of level-2
cache misses that the two algorithms generate. Finally, we also show
that there are matrices where the multifrontal algorithm may require
significantly more memory than the left-looking algorithm. On the other
hand, the left-looking algorithm never uses more memory than the
multifrontal one.}
}
%%S ---------------------------------------------------------------------------
%got
@article{SadayappanVisvanathan88,
author={Sadayappan, P. and Visvanathan, V.},
journal=IEEETC,
title={Circuit simulation on shared-memory multiprocessors},
year={1988},
month={Dec},
volume={37},
number={12},
pages={1634--1642},
url={ http://dx.doi.org/10.1109/12.9740 },
abstract={Reports the parallelization on a shared-memory vector
multiprocessor of the computationally intensive components of a circuit
simulator-matrix assembly (including device model evaluation) and the
unstructured sparse linear system solution. A theoretical model is used
to predict the performance of the lock-synchronized parallel matrix
assembly, and the results are compared to experimental measurements.
Alternate approaches to efficient sparse matrix solution are
contrasted, highlighting the impact of the matrix representation/access
strategy on achievable performance, and medium-grained approach with
superior performance is introduced. The techniques developed have been
incorporated into a prototype parallel implementation of the production
circuit simulator ADVICE on the Alliant FX/8 multiprocessor},
}
%got
@article{SadayappanVisvanathan89,
author={Sadayappan, P. and Visvanathan, V.},
title={Efficient sparse matrix factorization for circuit simulation on vector supercomputers},
journal=IEEETCAD,
volume={8},
number={12},
pages={1276--1285},
year={1989},
url={ http://dx.doi.org/10.1109/43.44508 },
abstract={An efficient approach to sparse matrix factorization on
vector supercomputers is described. The approach is suitable for
application domains like circuit simulation that require the repeated
direct solution of sparse linear systems of equations with identical
zero-nonzero structures. An overlap-scatter data structure is used to
represent the sparse matrix, enabling the use of multiple operand
access modes to achieve higher performance than earlier proposed
approaches. The superior performance of the new solver is demonstrated
using a number of matrices derived from circuit simulation runs},
}
%got
@article{SalaStanleyHeroux08,
author={Sala, M. and Stanley, K. S. and Heroux, M. A.},
title={On the Design of Interfaces to Sparse Direct Solvers},
journal=TOMS,
volume={34},
number={2},
month=mar,
year={2008},
pages={9:1--9:22},
url={ http://dx.doi.org/10.1145/1326548.1326551 },
abstract={We discuss the design of general, flexible, consistent,
reusable and efficient interfaces to software libraries for the direct
solution of systems of linear equations on both serial and distributed
memory architectures. We introduce a set of abstract classes to access
the linear system matrix elements and their distribution, access vector
elements, and control the solution of the linear system. We describe a
concrete implementation of the proposed interfaces, and report examples
of applications and numerical results showing that the overhead induced
by the object-oriented design is negligible under typical conditions of
usage. We include examples of applications, and we comment on the
advantages and limitations of the design.},
}
%got
@article{Saltz90,
author={Saltz, J. H.},
title={Aggregation Methods for Solving Sparse Triangular Systems on Multiprocessors},
journal=SISC,
year={1990},
volume={11},
number={1},
pages={123--144},
url={ http://dx.doi.org/10.1137/0911008 },
}
%got
@inproceedings{SaoLiuVuducLi15,
author={Sao, P. and Liu, X. and Vuduc, R. and Li, X. S.},
title={A Sparse Direct Solver for Distributed Memory {Xeon} {Phi}-accelerated Systems},
booktitle={29th IEEE Intl. Parallel \& Distributed Processing Symposium (IPDPS)},
month={May},
year={2015},
address={Hyderabad, India},
abstract={This paper presents the first sparse direct solver for
distributed memory systems comprising hybrid multicore CPU and Intel
Xeon Phi co-processors. It builds on the algorithmic approach of
SUPERLU_DIST, which is right-looking and statically pivoted. Our
contribution is a novel algorithm, called the HALO. The name is
shorthand for highly asynchronous lazy offload; it refers to the way
the algorithm combines highly aggressive use of asynchrony with
accelerated offload, lazy updates, and data shadowing (a la halo or
ghost zones), all of which serve to hide and reduce communication,
whether to local memory, across the network, or over PCIe. We further
augment HALO with a model driven autotuning heuristic that chooses the
intra-node division of labor among CPU and Xeon Phi co-processor
components. When integrated into SUPERLU_DIST and evaluated on a
variety of realistic test problems in both single-node and multi-node
configurations, the resulting implementation achieves speedups of up to
2.5x over an already efficient multicore CPU implementation, and
achieves up to 83\% of a machine-specific upper-bound that we have
estimated. Our analysis quantifies how well our implementation performs
and allows us to speculate on the potential speedups that might come
from a variety of future improvements to the algorithm and system.}
}
%got
@inproceedings{SaoVuducLi14,
title={A distributed {CPU}-{GPU} sparse direct solver},
author={Sao, P. and Vuduc, R. and Li, X. S.},
booktitle={Proc. Euro-Par 2014 Parallel Processing},
address={Porto, Portugal},
month={August},
year={2014},
pages={487--498},
abstract={This paper presents the first hybrid MPI+OpenMP+CUDA
implementation of a distributed memory right-looking unsymmetric sparse
direct solver (i.e., sparse LU factorization) that uses static
pivoting. While BLAS calls can account for more than 40\% of the
overall factorization time, the difficulty is that small problem sizes
dominate the workload, making efficient GPU utilization challenging.
This fact motivates our approach, which is to find ways to aggregate
collections of small BLAS operations into larger ones; to schedule
operations to achieve load balance and hide long-latency operations,
such as PCIe transfer; and to exploit simultaneously all of a node's
available CPU cores and GPUs.},
series={Lecture Notes in Computer Science},
volume={8632},
editor={Silva, F. and Dutra, I. and {Santos Costa}, V.},
url={ http://dx.doi.org/10.1007/978-3-319-09873-9_41 },
publisher={Springer International Publishing},
}
%got
@article{SatoTinney63,
author={Sato, N. and Tinney, W. F.},
title={Techniques for Exploiting the Sparsity of the Network Admittance Matrix},
journal=IEEETPAS,
volume={82},
number={69},
pages={944--949},
year={1963},
url={ http://dx.doi.org/10.1109/TPAS.1963.291477 },
abstract={This paper describes some computer programing techniques for
taking advantage of the sparsity of the admittance matrix. The
techniques are based on two main ideas: (1) determination of a sequence
of operations which results in a near minimum of memory and computing,
(2) preservation of these operations for repetition. Use of these
techniques makes it possible to obtain significant reductions in memory
and processing time for many network analysis programs. Claims are
substantiated by actual results.},
}
%got
@article{SchenkGartner02,
author={Schenk, O. and G\"artner, K.},
title={Two-level dynamic scheduling in {PARDISO}: Improved scalability on shared memory multiprocessing systems},
journal=PC,
volume={28},
number={2},
pages={187--197},
year={2002},
url={ http://dx.doi.org/10.1016/S0167-8191(01)00135-1 },
keywords={Large sparse linear systems},
keywords={Sparse matrix factorization},
keywords={Sparse LU decomposition},
keywords={Multiprocessor computers},
keywords={Parallel sparse solvers},
abstract={The PARDISO package is a mathematical library of OpenMP
routines for the parallel direct solution of large sparse linear
systems of equations. One objective of PARDISO is to achieve a high
efficiency on shared memory multiprocessing systems. A new
parallelization strategy based on a dynamic two-level scheduling scheme
is therefore explored. The method aims at minimizing cache conflicts
and interprocessor communication costs and, at the same time,
maximizing processor load balance and Level-3 BLAS performance. The
synchronization events are reduced by one order of magnitude compared
with a one-level scheduling strategy. This results in an efficient
parallel sparse LU decomposition method. An overview of the two-level
scheduling algorithm and the key algorithmic features of the solver
PARDISO is given. Finally, numerical results and a comparison with
another software package demonstrate the performance.},
}
%got
@article{SchenkGartner04,
author={Schenk, O. and G\"artner, K.},
title={Solving unsymmetric sparse systems of linear equations with {PARDISO}},
journal=FGCS,
year={2004},
volume={20},
number={3},
pages={475--487},
url={ http://dx.doi.org/10.1016/j.future.2003.07.011 },
keywords={Computational sciences},
keywords={Numerical linear algebra},
keywords={Direct solver},
keywords={Unsymmetric linear systems },
abstract={Supernode partitioning for unsymmetric matrices together
with complete block diagonal supernode pivoting and asynchronous
computation can achieve high gigaflop rates for parallel sparse {LU}
factorization on shared memory parallel computers. The progress in
weighted graph matching algorithms helps to extend these concepts
further and unsymmetric prepermutation of rows is used to place large
matrix entries on the diagonal. Complete block diagonal supernode
pivoting allows dynamical interchanges of columns and rows during the
factorization process. The level-3 {BLAS} efficiency is retained and an
advanced two-level left-right looking scheduling scheme results in good
speedup on {SMP} machines. These algorithms have been integrated into
the recent unsymmetric version of the {PARDISO} solver. Experiments
demonstrate that a wide set of unsymmetric linear systems can be solved
and high performance is consistently achieved for large sparse
unsymmetric matrices from real world applications. }
}
%got
@article{SchenkGartner06,
author={Schenk, O. and G\"artner, K.},
title={On fast factorization pivoting methods for sparse symmetric indefinite systems},
journal=ETNA,
year={2006},
volume={23},
pages={158--179},
url={ http://etna.math.kent.edu/volumes/2001-2010/vol23/abstract.php?vol=23&pages=158-179 },
abstract={This paper discusses new pivoting factorization methods for
solving sparse symmetric indefinite systems. As opposed to many
existing pivoting methods, our Supernode-Bunch-Kaufman (SBK) pivoting
method dynamically selects 1x1 and 2x2 pivots and may be supplemented
by pivot perturbation techniques. We demonstrate the effectiveness and
the numerical accuracy of this algorithm and also show that a high
performance implementation is feasible. We will also show that
symmetric maximum-weighted matching strategies add an additional level
of reliability to SBK. These techniques can be seen as a complement to
the alternative idea of using more complete pivoting techniques during
the numerical factorization. Numerical experiments validate these
conclusions.}
}
%got
@article{SchenkGartnerFichtner00,
author={Schenk, O. and G\"artner, K. and Fichtner, W.},
title={Efficient sparse {LU} factorization with left-right looking strategy on shared memory multiprocessors},
journal=BIT,
year={2000},
volume={40},
number={1},
pages={158--176},
url={ http://dx.doi.org/10.1023/A%3A1022326604210 },
abstract={An efficient sparse LU factorization algorithm on popular
shared memory multi-processors is presented. Pipelining parallelism is
essential to achieve higher parallel efficiency and it is exploited
with a left-right looking algorithm. No global barrier is used and a
completely asynchronous scheduling scheme is one central point of the
implementation. The algorithm has been successfully tested on SUN
Enterprise, DEC AlphaServer, SGI Origin 2000 and Cray T90 and J90
parallel computers, delivering up to 2.3 GFlop/s on an eight processor
DEC AlphaServer for medium-size semiconductor device simulations and
structural engineering problems.}
}
%got
@article{SchenkGartnerFichtnerStricker01,
author={Schenk, O. and G\"artner, K. and Fichtner, W. and Stricker, A.},
title={{PARDISO}: A High-Performance Serial and Parallel Sparse Linear Solver in Semiconductor Device Simulation},
journal=FGCS,
year={2001},
volume={18},
number={1},
pages={69--78},
url={ http://dx.doi.org/10.1016/S0167-739X(00)00076-5 },
keywords={Parallel sparse {LU} factorization},
keywords={Pipelining parallelism},
keywords={Semiconductor device simulation },
abstract={The package PARDISO is a high-performance, robust and easy
to use software for solving large sparse symmetric or structurally
symmetric linear systems of equations on shared memory multiprocessors.
PARDISO uses a combination of left- and right-looking Level-3 BLAS
supernode techniques to exploit pipelining parallelism. It delivers up
to 960 Mflop/s on COMPAQ Alpha ES40 (667 MHz) for irregular problems
and sparse matrix factorization has been clocked up at a speedup of 7
on an 8-node SGI Origin 2000. The paper gives an overview of the
algorithm, performance results and the integration of the solver into
complex industrial simulation tools. Finally, an example is discussed
inherently (due to the design goal) producing linear systems close to
singularity. }
}
%got
@article{Schreiber82,
author={Schreiber, R.},
year={1982},
title={A New Implementation of Sparse {Gaussian} Elimination},
journal=TOMS,
volume={8},
number={3},
annote={First use of the etree},
url={ http://dx.doi.org/10.1145/356004.356006 },
pages={256--276},
}
%got
@incollection{Schreiber93,
author={Schreiber, R.},
title={Scalability of sparse direct solvers},
pages={191--209},
booktitle={Graph Theory and Sparse Matrix Computation},
series={IMA Volumes in Applied Mathematics},
year={1993},
publisher={Springer-Verlag},
address={New York},
editor={George, A. and Gilbert, J. R. and Liu, J. W. H.},
volume={56},
}
%got
@article{Schulze01,
author={Schulze, J.},
title={Towards a tighter coupling of bottom-up and top-down sparse matrix ordering methods},
journal=BIT,
volume={41},
number={4},
pages={800--841},
year={2001},
}
%%SCOTT ------------------------------------------------------------------------
%got
@article{Scott99b,
author={Scott, J. A.},
title={A new row ordering strategy for frontal solvers},
journal=NLAA,
volume={6},
number={3},
publisher={John Wiley \& Sons, Ltd.},
url={ http://dx.doi.org/10.1002/(SICI)1099-1506(199904/05)6:3<189::AID-NLA160>3.0.CO;2-C },
pages={189--211},
keywords={ordering rows, frontal method, row graphs, sparse unsymmetric matrices},
year={1999},
abstract={The frontal method is a variant of Gaussian elimination
that has been widely used since the mid 1970s. In the innermost loop of
the computation the method exploits dense linear algebra kernels, which
are straightforward to vectorize and parallelize. This makes the method
attractive for modern computer architectures. However, unless the
matrix can be ordered so that the front is never very large, frontal
methods can require many more floating-point operations for
factorization than other approaches. We are interested in matrices that
have a highly asymmetric structure. We use the idea of a row graph of
an unsymmetric matrix combined with a variant of Sloan's profile
reduction algorithm to reorder the rows. We also look at applying the
spectral method to the row graph. Numerical experiments performed on a
range of practical problems illustrate that our proposed MSRO and
hybrid MSRO row ordering algorithms yield substantial reductions in the
front sizes and, when used with a frontal solver, significantly enhance
its performance both in terms of the factorization time and storage
requirements.}
}
%got
@article{Scott99,
author={Scott, J. A.},
title={On ordering elements for a frontal solver},
journal=CNME,
volume={15},
number={5},
publisher={John Wiley \& Sons, Ltd.},
url={ http://dx.doi.org/10.1002/(SICI)1099-0887(199905)15:5<309::AID-CNM246>3.0.CO;2-F },
pages={309--324},
keywords={ordering finite elements, frontal method, Sloan algorithm, spectral method},
year={1999},
abstract={The efficiency of the frontal method for the solution of
finite-element problems depends on the order in which the elements are
assembled. This paper looks at using variants of Sloan's algorithm to
reorder the elements. Both direct and indirect reordering algorithms
are considered and are used in combination with spectral orderings.
Numerical experiments are performed on a range of practical problems
and, on the basis of the results, a hybrid Sloan element resequencing
algorithm is proposed for use with a frontal algorithm.}
}
%got
@article{Scott01c,
author={Scott, J. A.},
title={The design of a portable parallel frontal solver for chemical process engineering problems},
journal=CCE,
year={2001},
volume={25},
pages={1699--1709},
url={ http://dx.doi.org/10.1016/S0098-1354(01)00731-1 },
abstract={We report on the design and development of a parallel
frontal code HSL_MP43 for the numerical solution of the large sparse
highly unsymmetric linear systems of equations that arise in
industrial-scale chemical process engineering. HSL_MP43 has been
developed for the mathematical software library HSL 2000
(http://www.cse.clrc.ac.uk/Activity/HSL). The main design goals for
HSL_MP43 were: portability, ease of use, efficiency, and flexibility.
We discuss how each of these objectives is addressed within HSL_MP43
and illustrate the performance of the code using a range of large-scale
problems from chemical process simulation and optimisation.}
}
%got
@article{Scott01,
author={Scott, J. A.},
title={A parallel frontal solver for finite element applications},
journal=IJNME,
volume={50},
number={5},
publisher={John Wiley \& Sons, Ltd.},
url={ http://dx.doi.org/10.1002/1097-0207(20010220)50:5<1131::AID-NME68>3.0.CO;2-X },
pages={1131--1144},
keywords={finite-elements, unsymmetric linear systems, frontal method, parallel processing, Fortran 90, MPI},
year={2001},
abstract={In finite element simulations, the overall computing time
is dominated by the time needed to solve large sparse linear systems of
equations. We report on the design and development of a parallel
frontal code that can significantly reduce the wallclock time needed
for the solution of these systems. The algorithm used is based on
dividing the finite element domain into subdomains and applying the
frontal method to each subdomain in parallel. The so-called multiple
front approach is shown to reduce the amount of work and memory
required compared with the frontal method and, when run on a small
number of processes, achieves good speedups. The code, HSL_MP42, has
been developed for the Harwell Subroutine Library
(http://www.numerical.rl.ac.uk/hsl). It is written in Fortran 90 and,
by using MPI for message passing, achieves portability across a wide
range of modern computer architectures.}
}
%got
@article{Scott03,
author={Scott, J. A.},
title={Parallel frontal solvers for large sparse linear systems},
journal=TOMS,
volume={29},
number={4},
month=dec,
year={2003},
pages={395--417},
url={ http://dx.doi.org/10.1145/962437.962440 },
abstract={Many applications in science and engineering give rise
to large sparse linear systems of equations that need to be solved as
efficiently as possible. As the size of the problems of interest
increases, it can become necessary to consider exploiting
multiprocessors to solve these systems. We report on the design and
development of parallel frontal solvers for the numerical solution of
large sparse linear systems. Three codes have been developed for the
mathematical software library HSL (www.cse.clrc.ac.uk/Activity/HSL).
The first is for unsymmetric finite-element problems; the second is for
symmetric positive definite finite-element problems; and the third is
for highly unsymmetric linear systems such as those that arise in
chemical process engineering. In each case, the problem is subdivided
into a small number of loosely connected subproblems and a frontal
method is then applied to each of the subproblems in parallel. We
discuss how our software is designed to achieve the goals of
portability, ease of use, efficiency, and flexibility, and illustrate
the performance using problems arising from real applications.},
}
%got
@article{Scott06,
author={Scott, J. A.},
title={A frontal solver for the 21st century},
journal=CNME,
volume={22},
number={10},
publisher={John Wiley \& Sons, Ltd.},
url={ http://dx.doi.org/10.1002/cnm.870 },
pages={1015--1029},
keywords={large sparse linear systems, finite elements, frontal method, out-of-core, Fortran 95},
year={2006},
abstract={In recent years there have been a number of important
developments in frontal algorithms for solving the large sparse linear
systems of equations that arise from finite-element problems. We report
on the design of a new fully portable and efficient frontal solver for
large-scale real and complex unsymmetric linear systems from
finite-element problems that incorporates these developments. The new
package offers both a flexible reverse communication interface and a
simple to use all-in-one interface, which is designed to make the
package more accessible to new users. Other key features include
automatic element ordering using a state-of-the-art hybrid multilevel
spectral algorithm, minimal main memory requirements, the use of
high-level BLAS, and facilities to allow the solver to be used as part
of a parallel multiple front solver. The performance of the new solver,
which is written in Fortran 95, is illustrated using a range of
problems from practical applications. The solver is available as
package HSL_MA42_ELEMENT within the HSL mathematical software library
and, for element problems, supersedes the well-known MA42 package.}
}
%got
@article{Scott10,
author={Scott, J. A.},
title={Scaling and Pivoting in an Out-of-core Sparse Direct Solver},
journal=TOMS,
volume={37},
number={2},
year={2010},
month=apr,
pages={19:1--19:23},
url={ http://dx.doi.org/10.1145/1731022.1731029 },
abstract={Out-of-core sparse direct solvers reduce the amount of
main memory needed to factorize and solve large sparse linear systems
by holding the matrix data, computed factors and main work arrays in
files on disk. The efficiency of the factorization and solution phases
is dependent upon the number of entries in the factors. For a given
pivot sequence, the level of fill beyond that predicted using the
sparsity pattern depends on the number of pivots that are delayed (that
is, pivots used later than expected because of numerical stability
considerations). Our aim is to limit the number of delayed pivots,
while maintaining robustness and accuracy. In this paper, we consider a
new out-of-core multifrontal solver HSL_MA78 that is designed to solve
the unsymmetric sparse linear systems that arise from finite element
applications. We consider how equilibration can be built into the
solver without requiring the system matrix to be held in main memory.
We also examine the effects of different pivoting strategies, including
threshold partial pivoting, threshold rook pivoting and static
pivoting. Numerical experiments on problems arising from a range of
practical applications illustrate the importance of scaling and show
that rook pivoting can be more efficient than partial pivoting.},
}
%got
@article{ScottHu07,
author={Scott, J. A. and Hu, Y.},
title={Experiences of Sparse Direct Symmetric Solvers},
journal=TOMS,
volume={33},
number={3},
month=aug,
year={2007},
pages={18:1--18:28},
url={ http://dx.doi.org/10.1145/1268769.1268772 },
abstract={We recently carried out an extensive comparison of the
performance of state-of-the-art sparse direct solvers for the numerical
solution of symmetric linear systems of equations. Some of these
solvers were written primarily as research codes while others have been
developed for commercial use. Our experiences of using the different
packages to solve a wide range of problems arising from real
applications were mixed. In this paper, we highlight some of these
experiences with the aim of providing advice to both software
developers and users of sparse direct solvers. We discuss key features
that a direct solver should offer and conclude that while performance
is an essential factor to consider when choosing a code, there are
other features that a user should also consider looking for that vary
significantly between packages.},
}
%%S continued ------------------------------------------------------------------
%got
@article{ShenYangJiao00,
author={Shen, K. and Yang, T. and Jiao, X.},
title={{S+}: efficient {2D} sparse {LU} factorization on parallel machines},
journal=SIMAX,
year={2000},
volume={22},
number={1},
pages={282--305},
url={ http://dx.doi.org/10.1137/S0895479898337385 },
abstract={Static symbolic factorization coupled with supernode
partitioning and asynchronous computation scheduling can achieve high
gigaflop rates for parallel sparse LU factorization with partial
pivoting. This paper studies properties of elimination forests and uses
them to optimize supernode partitioning/amalgamation and execution
scheduling. It also proposes supernodal matrix multiplication to speed
up kernel computation by retaining the BLAS-3 level efficiency and
avoiding unnecessary arithmetic operations. The experiments show that
our new design with proper space optimization, called S+ , improves our
previous solution substantially and can achieve up to 10 GFLOPS on 128
Cray T3E 450MHz nodes. }
}
%got
@article{Sherman78b,
author={Sherman, A. H.},
year={1978},
title={Algorithm 533: {NSPIV}, a {Fortran} Subroutine for Sparse {Gaussian} Elimination with Partial Pivoting},
journal=TOMS,
volume={4},
number={4},
pages={391--398},
url={ http://dx.doi.org/10.1145/356502.356498 },
}
%got
@article{Sherman78,
author={Sherman, A. H.},
year={1978},
title={Algorithms for Sparse {Gaussian} Elimination with Partial Pivoting},
journal=TOMS,
volume={4},
number={4},
pages={330--338},
url={ http://dx.doi.org/10.1145/356502.356494 },
keywords={NSPIV ordering partial pivoting}
}
%got
@article{SilvesterAudaStone84,
author={Silvester, P. P. and Auda, H. A. and Stone, G. D.},
title={A memory-economic frontwidth reduction algorithm},
journal=IJNME,
volume={20},
number={4},
publisher={John Wiley \& Sons, Ltd},
url={ http://dx.doi.org/10.1002/nme.1620200411 },
pages={733--743},
year={1984},
abstract={A simple element numbering algorithm is described which
yields near-minimal frontwidths for two- and three-dimensional finite
element assemblies. Renumbering of E elements requires an
immediate-access computer memory size which is proportional to the
square root of E in two-dimensional problems, and to the two-thirds
power of E in three dimensions. This very small memory requirement
allows processing of large problems in minicomputers. When used in
large computers, page-swapping operations are minimized.},
}
%got
@article{Sloan86,
author={Sloan, S. W.},
title={An algorithm for profile and wavefront reduction of sparse matrices},
journal=IJNME,
year={1986},
volume={23},
number={2},
pages={239--251},
publisher={John Wiley \& Sons, Ltd},
url={ http://dx.doi.org/10.1002/nme.1620230208 },
abstract={An algorithm for reducing the profile and wavefront of a
sparse matrix is described. The scheme is applicable to any sparse
matrix which has a symmetric pattern of zeros and may be used to
generate efficient labellings for finite element grids. In particular,
it is suitable for generating efficient labellings for profile and
frontal solution schemes. Empirical evidence, obtained from analysis of
the 30 test problems collected by Everstine, suggests that the new
algorithm is superior to existing methods for profile and wavefront
reduction. It is fast, requires only a small amount of memory, and is
simple to program.},
}
%GET
@inproceedings{SlotaRajamanickamMadduri14,
author={Slota, G. M. and Rajamanickam, S. and Madduri, K.},
title={{BFS} and Coloring-Based Parallel Algorithms for Strongly Connected Components and Related Problems},
booktitle={Proceedings of the 2014 IEEE 28th International Parallel and Distributed Processing Symposium (IPDPS)},
year={2014},
pages={550--559},
url={ http://dx.doi.org/10.1109/IPDPS.2014.64 },
month=may,
}
%GET
@inproceedings{SlotaRajamanickamMadduri15,
author={Slota, G. M. and Rajamanickam, S. and Madduri, K.},
title={High-Performance Graph Analytics on Manycore Processors},
booktitle={Proceedings of the 2015 IEEE International Parallel and Distributed Processing Symposium (IPDPS)},
year={2015},
pages={17--27},
url={ http://dx.doi.org/10.1109/IPDPS.2015.54 },
month=may,
}
%got
@inproceedings{SmartWhite88,
author={Smart, D. and White, J.},
year={1988},
title={Reducing the Parallel Solution Time of Sparse Circuit Matrices Using Reordered {Gaussian} Elimination and Relaxation},
booktitle={Proceedings of the IEEE International Symposium on Circuits and Systems},
keywords={parallel unsymmetric reordering},
url={ http://dx.doi.org/10.1109/ISCAS.1988.15004 },
abstract={The authors examine two approaches for reducing parallel
sparse matrix solution time: the first based on pivot ordering
algorithms for Gaussian elimination, and the second based on relaxation
algorithms. A pivot ordering algorithm is presented which increases the
parallelism of Gaussian elimination compared to the commonly used
Markowitz method. The minimum number of parallel steps for the solution
of a tridiagonal matrix is derived, and it is shown that this optimum
is nearly achieved by the ordering heuristics which attempt to maximize
parallelism. Also presented is an optimality result about Gauss-Jacobi
over Gauss-Seidel relaxation on parallel processors. }
}
%got
@techreport{Snay79,
author={Snay, R. A.},
title={Reducing the profile of sparse symmetric matrices},
institution={National Oceanic and Atmospheric Administration},
year={1979},
number={NOS NGS-4},
url={ http://www.ngs.noaa.gov/PUBS_LIB/ReducingTheProfileOfSparseSymmetricMatrices_TM_NOS_NGS4.pdf },
abstract={An algorithm for improving the profile of a sparse
symmetric matrix is introduced. Tests on normal equation matrices
encountered in adjustments of geodetic networks by least squares
demonstrates that the algorithm produces significantly lower
profiles than the widely used reverse Cuthill-McKee algorithm.},
address={Washington, DC},
}
%GET
@techreport{Speelpenning78,
  author      = {Speelpenning, B.},
  title       = {The generalized element method},
  number      = {UIUC-DCS-R-78-946},
  institution = {Dept. of Computer Science, Univ. of Illinois},
  address     = {Urbana, Illinois},
  year        = {1978},
}
%got
@article{Srinivas83,
author={Srinivas, M. A.},
title={Optimal Parallel Scheduling of Gaussian Elimination {DAG}'s},
journal=IEEETC,
year={1983},
month=dec,
volume={C-32},
number={12},
pages={1109--1117},
abstract={A parallel algorithm for Gaussian elimination (GE) is
described, which solves a linear system of size n using m <= n parallel
processors and a shared random access memory. Converting the serial GE
algorithm to parallel form involves scheduling its computation DAG
(directed acyclic graph) on m processors. A lower bound for schedule
length is established for dense GE DAG's and it is proved that the
proposed algorithm produces schedules which achieve these bounds.
Finally, both the construction and execution of the schedule are
incorporated into a single concurrent program which is shown to run in
optimal time.},
keywords={Dense matrices;Gaussian elimination;directed acyclic
graphs;linear systems;parallel computation;scheduling;Computational
modeling;Concurrent computing;Linear systems;Matrix converters;Optimal
scheduling;Parallel processing;Processor scheduling;Random access
memory;Scheduling algorithm;Sparse matrices;Dense matrices;Gaussian
elimination;directed acyclic graphs;linear systems;parallel
computation;scheduling},
url={ http://dx.doi.org/10.1109/TC.1983.1676171 },
}
%got
@article{SuhlSuhl93,
author={Suhl, L. M. and Suhl, U. H.},
title={A fast {LU} update for linear programming},
year={1993},
journal=AOR,
volume={43},
number={1},
url={ http://dx.doi.org/10.1007/BF02025534 },
pages={33--47},
}
%got
@article{SuhlSuhl90,
author={Suhl, U. H. and Suhl, L. M.},
title={Computing Sparse {LU} Factorizations for Large-Scale Linear Programming Bases},
journal=ORSA,
year={1990},
volume={2},
number={4},
pages={325--335},
url={ http://dx.doi.org/10.1287/ijoc.2.4.325 },
abstract={This paper discusses the computation of LU factorizations
for large sparse matrices with emphasis on large-scale linear
programming bases. We present new implementation techniques which
reduce the computation times significantly. Numerical experiments with
large-scale real life test problems were conducted. The software is
compared with the basis factorization of MPSX/370, IBM's commercial LP
system.}
}
%got
@article{Sun96,
author={Sun, C.},
title={Parallel Sparse Orthogonal Factorization on Distributed-Memory Multiprocessors},
publisher={SIAM},
year={1996},
journal=SISC,
volume={17},
number={3},
pages={666--685},
keywords={parallel algorithms; sparse matrix; orthogonal factorization; multifrontal method; block partitioning scheme; distributed-memory multiprocessors},
url={ http://dx.doi.org/10.1137/S1064827593260449 },
}
%got
@article{Sun97,
author={Sun, C.},
title={Parallel solution of sparse linear least squares problems on distributed-memory multiprocessors},
journal=PC,
volume={23},
number={13},
pages={2075--2093},
year={1997},
url={ http://dx.doi.org/10.1016/S0167-8191(97)00064-1 },
keywords={Parallel algorithms; Sparse matrix; Orthogonal factorization;
Multifrontal method; Least squares problems; Triangular solution;
Distributed-memory multiprocessors},
abstract={This paper studies the parallel solution of large-scale
sparse linear least squares problems on distributed-memory
multiprocessors. The key components required for solving a sparse
linear least squares problem are sparse {QR} factorization and sparse
triangular solution. A block-oriented parallel algorithm for sparse
{QR} factorization has already been described in the literature. In
this paper, new block-oriented parallel algorithms for sparse
triangular solution are proposed. The arithmetic and communication
complexities of the new algorithms applied to regular grid problems are
analyzed. The proposed parallel sparse triangular solution algorithms
together with the block-oriented parallel sparse {QR} factorization
algorithm result in a highly efficient approach to the parallel
solution of sparse linear least squares problems. Performance results
obtained on an {IBM} Scalable {POWERparallel} system {SP2} are
presented. The largest least squares problem solved has over two
million rows and more than a quarter million columns. The execution
speed for the numerical factorization of this problem achieves over 3.7
gigaflops per second on an {IBM} {SP2} machine with 128 processors.}
}
%%T ---------------------------------------------------------------------------
%got
@article{Tarjan72,
author={Tarjan, R. E.},
title={Depth first search and linear graph algorithms},
journal=SICOMP,
year={1972},
volume={1},
number={2},
pages={146--160},
url={ http://dx.doi.org/10.1137/0201010 },
abstract={The value of depth-first search or 'backtracking' as a
technique for solving problems is illustrated by two examples. An
improved version of an algorithm for finding the strongly connected
components of a directed graph and an algorithm for finding the
biconnected components of an undirected graph are presented. The space
and time requirements of both algorithms are bounded by $k_1 V + k_2 E
+ k_3 $ for some constants $k_1, k_2 $, and $k_3 $, where V is the
number of vertices and E is the number of edges of the graph being
examined.}
}
%got
@article{Tarjan75,
author={Tarjan, R. E.},
title={Efficiency of a good but not linear set union algorithm},
journal=JACM,
year={1975},
volume={22},
number={2},
pages={215--225},
url={ http://dx.doi.org/10.1145/321879.321884 },
}
%got (Tim has book from TAMU library)
@incollection{Tarjan76,
author={Tarjan, R. E.},
year={1976},
title={Graph Theory and {Gaussian} Elimination},
editor={Bunch, J. R. and Rose, D. J.},
booktitle={Sparse Matrix Computations},
publisher={New York: Academic Press},
pages={3--22},
keywords={graph theory}
}
%%TEWARSON --------------------------------------------------------------------
%got
@article{Tewarson66,
  author  = {Tewarson, R. P.},
  title   = {On the product form of inverses of sparse matrices},
  journal = SIREV,
  volume  = {8},
  number  = {3},
  pages   = {336--342},
  year    = {1966},
  url     = { http://dx.doi.org/10.1137/1008066 },
}
%got
@article{Tewarson67,
  author  = {Tewarson, R. P.},
  title   = {The product form of inverses of sparse matrices and graph theory},
  journal = SIREV,
  volume  = {9},
  number  = {1},
  pages   = {91--99},
  year    = {1967},
  url     = { http://dx.doi.org/10.1137/1009004 },
}
%got
% Row/column permutations to block diagonal form and near-upper-triangular
% blocks, using graph theory, linear programming, and probabilistic arguments
% (see abstract).
@article{Tewarson67c,
author={Tewarson, R. P.},
title={Row-column permutation of sparse matrices},
journal=CJ,
volume={10},
number={3},
pages={300--305},
year={1967},
abstract={The problem of transforming a given sparse matrix A into a
block diagonal form (b.d.f.) and the subsequent transformation of each
of the block diagonal matrices into as nearly upper triangular form
(u.t.f.) as possible, by using only row and column permutations, is
discussed. It is shown how some of the results from Graph Theory can be
used to transform A to the b.d.f. In order to transform the block
diagonal matrices into the u.t.f.'s, two methods are described, one of
which makes use of linear programming while the other uses approximate
probabilistic arguments. The latter method, in relation to the
computational effort, yields significant results in practice. }
}
%got
% Early sparsity-preserving pivot-order rule for Gaussian elimination on
% sparse systems (see abstract).
@article{Tewarson67b,
author={Tewarson, R. P.},
title={Solution of a system of simultaneous linear equations with a sparse coefficient matrix by elimination methods},
journal=BIT,
volume={7},
pages={226--239},
year={1967},
abstract={A method is described for handling systems of linear
equations with a sparse coefficient matrix as economically as possible.
In particular, a convenient rule for the order of pivoting is given.
Numerical experiments indicating considerable savings have also been
performed. }
}
%got
@article{Tewarson68,
  author    = {Tewarson, R. P.},
  title     = {On the orthonormalization of sparse vectors},
  journal   = COMP,
  volume    = {3},
  number    = {4},
  pages     = {268--279},
  publisher = {Springer-Verlag},
  year      = {1968},
  url       = { http://dx.doi.org/10.1007/BF02235393 },
}
%got
@article{Tewarson70,
  author  = {Tewarson, R. P.},
  title   = {Computations with sparse matrices},
  journal = SIREV,
  volume  = {12},
  number  = {4},
  pages   = {527--544},
  year    = {1970},
  url     = { http://dx.doi.org/10.1137/1012103 },
}
%got
@article{Tewarson72,
author={Tewarson, R. P.},
title={On the {Gaussian} elimination method for inverting sparse matrices},
year={1972},
journal=COMP,
volume={9},
number={1},
url={ http://dx.doi.org/10.1007/BF02236371 },
publisher={Springer-Verlag},
pages={1--7},
}
%got (Tim has book from TAMU library)
@book{Tewarson73,
author={Tewarson, R. P.},
year={1973},
title={Sparse Matrices},
publisher={New York: Academic Press},
url={ http://www.sciencedirect.com/science/bookseries/00765392/99 },
series={Mathematics in Science and Engineering},
volume={99},
note={TAMU Evans library QA188 .T48},
}
%%T continued -----------------------------------------------------------------
%got
@article{ThompsonShimazaki80,
author={Thompson, E. and Shimazaki, Y.},
year={1980},
title={A Frontal Procedure Using Skyline Storage},
journal=IJNME,
volume={15},
number={6},
pages={889--910},
keywords={skyline storage methods frontal methods},
abstract={A frontal-skyline method is presented which allows for
compact skyline storage while using the frontal method. It is shown
that the method requires the same minimum core storage as the frontal
method and fewer transfers to and from disc than the blocked-skyline
method. Subroutines necessary for its implementation with finite
element codes are given.},
url={ http://dx.doi.org/10.1002/nme.1620150608 }
}
%got
@article{TinneyWalker67,
author={Tinney, W. F. and Walker, J. W.},
year={1967},
title={Direct Solutions of Sparse Network Equations by Optimally Ordered Triangular Factorization},
journal=PROCIEEE,
volume={55},
number={11},
pages={1801--1809},
keywords={minimum degree},
annote={first mention of minimum degree algorithm},
url={ http://dx.doi.org/10.1109/PROC.1967.6011 },
abstract={Matrix inversion is very inefficient for computing direct
solutions of the large sparse systems of linear equations that arise in
many network problems. Optimally ordered triangular factorization of
sparse matrices is more efficient and offers other important
computational advantages in some applications. With this method, direct
solutions are computed from sparse matrix factors instead of from a
full inverse matrix, thereby gaining a significant advantage in speed,
computer memory requirements, and reduced round-off error. Improvements
of ten to one or more in speed and problem size over present
applications of the inverse can be achieved in many cases. Details of
the method, numerical examples, and the results of a large problem are
given.},
}
%got
@incollection{Tomlin72,
author={Tomlin, J. A.},
title={Modifying triangular factors of the basis in the {Simplex} method},
pages={77--85},
editor={Rose, D. J. and Willoughby, R. A.},
booktitle={Sparse Matrices and Their Applications},
address={New York},
publisher={Plenum Press},
year={1972},
url={ http://link.springer.com/book/10.1007%2F978-1-4615-8675-3 },
}
%got
@article{TotoniHeathKale14,
author={Totoni, E. and Heath, M. T. and Kale, L. V.},
title={Structure-adaptive parallel solution of sparse triangular linear systems},
journal=PC,
volume={40},
number={9},
pages={454--470},
year={2014},
url={ http://dx.doi.org/10.1016/j.parco.2014.06.006 },
keywords={Triangular solver; Parallel algorithms; Sparse linear systems;
Distributed memory computers},
abstract={Solving sparse triangular systems of linear
equations is a performance bottleneck in many methods for solving more
general sparse systems. Both for direct methods and for many iterative
preconditioners, it is used to solve the system or improve an
approximate solution, often across many iterations. Solving triangular
systems is notoriously resistant to parallelism, however, and existing
parallel linear algebra packages appear to be ineffective in exploiting
significant parallelism for this problem. We develop a novel parallel
algorithm based on various heuristics that adapt to the structure of
the matrix and extract parallelism that is unexploited by conventional
methods. By analyzing and reordering operations, our algorithm can
often extract parallelism even for cases where most of the nonzero
matrix entries are near the diagonal. Our main parallelism strategies
are: (1) identify independent rows, (2) send data earlier to achieve
greater overlap, and (3) process dense off-diagonal regions in
parallel. We describe the implementation of our algorithm in Charm++
and {MPI} and present promising experimental results on up to 512
cores of BlueGene/P, using numerous sparse matrices from real
applications. }
}
%%U ---------------------------------------------------------------------------
%%V ---------------------------------------------------------------------------
%got
% Parallel sparse LU on a transputer mesh: grid distribution, simultaneous
% elimination of m compatible pivots with near-minimum Markowitz counts and a
% threshold stability criterion (see abstract).
@article{vanderStappenBisseling93,
author={{Van der Stappen}, A. F. and Bisseling, R. H. and van de Vorst, J. G. G.},
title={Parallel sparse {LU} decomposition on a mesh network of transputers},
journal=SIMAX,
year={1993},
volume={14},
number={3},
pages={853-879},
url={ http://dx.doi.org/10.1137/0614059 },
abstract={A parallel algorithm is presented for the LU decomposition
of a general sparse matrix on a distributed-memory MIMD multiprocessor
with a square mesh communication network. In the algorithm, matrix
elements are assigned to processors according to the grid distribution.
Each processor represents the nonzero elements of its part of the
matrix by a local, ordered, two-dimensional linked-list data structure.
The complexity of important operations on this data structure and on
several others is analysed. At each step of the algorithm, a parallel
search for a set of m compatible pivot elements is performed. The
Markowitz counts of the pivot elements are close to minimum, to
preserve the sparsity of the matrix. The pivot elements also satisfy a
threshold criterion, to ensure numerical stability. The compatibility
of the m pivots enables the simultaneous elimination of m pivot rows
and m pivot columns in a rank-m update of the reduced matrix.
Experimental results on a network of 400 transputers are presented for
a set of test matrices from the Harwell-Boeing sparse matrix
collection.},
}
%GET
@article{VastenhouwBisseling05,
author={Vastenhouw, B. and Bisseling, R. H.},
title={A two-dimensional data distribution method for parallel sparse matrix-vector multiplication},
journal=SIREV,
volume={47},
number={1},
pages={67--95},
year={2005},
publisher={SIAM},
url={ http://dx.doi.org/10.1137/S0036144502409019 },
}
%%W ---------------------------------------------------------------------------
%GET
@article{WangLiRouetJianlinDeHoop15,
author={Wang, S. and Li, X. S. and Rouet, F.-H. and Xia, J. and {De Hoop}, M. V.},
year={2015},
title={A Parallel Geometric Multifrontal Solver Using Hierarchically Semiseparable Structure},
journal=TOMS,
note={to appear}
}
%got
% Frontwidth reduction with a tunable time/memory trade-off: O(E^{1/2})
% memory gives O(E^{3/2}) time in two or three dimensions (see abstract).
@article{WebbFroncioni86,
author={Webb, J. P. and Froncioni, A.},
title={A time-memory trade-off frontwidth reduction algorithm for finite element analysis},
journal=IJNME,
volume={23},
number={10},
publisher={John Wiley \& Sons, Ltd},
url={ http://dx.doi.org/10.1002/nme.1620231009 },
pages={1905--1914},
year={1986},
abstract={A frontwidth reduction algorithm is presented with an
execution time which may be traded against its primary memory
requirement, making it possible to optimize the performance of the
algorithm on a particular computer. With an amount of primary memory
$O(E^{1/2})$, where E is the number of elements, the execution time of
the algorithm is $O(E^{3/2})$, in two or three dimensions. The
algorithm has two parts: first, new node-based data structures are
derived from the conventional element list, then these structures are
used to reorder the elements for reduced frontwidth.},
}
%got
@book{WilkinsonReinsch71,
  editor    = {Wilkinson, J. H. and Reinsch, C.},
  title     = {Handbook for Automatic Computation, Volume {II}: Linear Algebra},
  publisher = {Springer-Verlag},
  year      = {1971},
  url       = { http://dx.doi.org/10.1007/978-3-642-86940-2 },
}
%got
% DAG model of the parallel solution of Ax=b: minimum completion time and
% processor lower bounds; Hu's level scheduling applied to sparse examples
% (see abstract).
@article{WingHuang80,
author={Wing, O. and Huang, J. W.},
title={A Computation Model of Parallel Solution of Linear Equations},
journal=IEEETC,
volume={C-29},
number={7},
pages={632--638},
year={1980},
url={ http://dx.doi.org/10.1109/TC.1980.1675634 },
abstract={The solution process of Ax=b is modeled by an acyclic
directed graph in which the nodes represent the arithmetic operations
applied to the elements of A, and the arcs represent the precedence
relations that exist among the operations in the solution process.
Operations that can be done in parallel are identified in the model and
the absolute minimum completion time and lower bounds on the minimum
number of processors required to solve the equations in minimal time
can be found from it. Properties of the model are derived. Hu's level
scheduling strategy is applied to examples of sparse matrix equations
with surprisingly good results. Speed-up using parallel processing is
found to be proportional to the number of processors when it is 10-20
percent of the order of A.}
}
%%X ---------------------------------------------------------------------------
%got
@article{Xia13,
  author  = {Xia, J.},
  title   = {Efficient structured multifrontal factorization for general large sparse matrices},
  journal = SISC,
  volume  = {35},
  number  = {2},
  pages   = {A832--A860},
  year    = {2013},
}
%got
@article{Xia13b,
  author  = {Xia, J.},
  title   = {Randomized sparse direct solvers},
  journal = SIMAX,
  volume  = {34},
  number  = {1},
  pages   = {197--227},
  year    = {2013},
}
%got
@article{XiaChandrasekaranGuLi09,
author={Xia, J. and Chandrasekaran, S. and Gu, M. and Li, X. S.},
title={Superfast Multifrontal Method for Structured Linear Systems of Equations},
journal=SIMAX,
volume={31},
number={3},
pages={1382--1411},
year={2009},
}
%got
@article{XiaChandrasekaranGuLi10,
  author  = {Xia, J. and Chandrasekaran, S. and Gu, M. and Li, X. S.},
  title   = {Fast Algorithms for Hierarchically Semiseparable Matrices},
  journal = NLAA,
  volume  = {17},
  number  = {6},
  pages   = {953--976},
  year    = {2010},
  url     = { http://dx.doi.org/10.1002/nla.691 },
}
%%Y ---------------------------------------------------------------------------
%got
@article{Yannakakis81,
author={Yannakakis, M.},
year={1981},
title={Computing the Minimum Fill-In is {NP}-Complete},
journal=SIAMJADM,
volume={2},
number={1},
pages={77--79},
keywords={ordering minimum fill-in NP-complete},
annote={This refers to the symmetric problem. For unsymmetric
matrices, see RoseTarjanLueker76 and RoseTarjan78},
url={ http://dx.doi.org/10.1137/0602010 },
abstract={We show that the following problem is NP-complete. Given a
graph, find the minimum number of edges (fill-in) whose addition makes
the graph chordal. This problem arises in the solution of sparse
symmetric positive definite systems of linear equations by Gaussian
elimination.}
}
%got
@techreport{YeralanDavisRanka15,
title={Sparse {QR} factorization on the {GPU}},
author={Yeralan, S. N. and Davis, T. A. and Ranka, S. and Sid-Lakhdar, W. M.},
institution={Texas A\&M University},
year={2016},
url={ http://faculty.cse.tamu.edu/davis/publications.html },
}
%got
@article{YuWangPierce11,
author={Yu, C. D. and Wang, W. and Pierce, D.},
title={A {CPU-GPU} hybrid approach for the unsymmetric multifrontal method},
journal=PC,
volume={37},
number={12},
pages={759--770},
year={2011},
note={6th International Workshop on Parallel Matrix Algorithms and Applications (PMAA'10)},
url={ http://dx.doi.org/10.1016/j.parco.2011.09.002 },
keywords={Sparse and unsymmetric linear systems; Multifrontal;
CPU-GPU hybrid approach; Parallel computing},
abstract={Multifrontal is an efficient direct method for solving
large-scale sparse and unsymmetric linear systems. The method
transforms a large sparse matrix factorization process into a sequence
of factorizations involving smaller dense frontal matrices. Some of
these dense operations can be accelerated by using a graphic processing
unit (GPU). We analyze the unsymmetric multifrontal method from both an
algorithmic and implementational perspective to see how a GPU, in
particular the NVIDIA Tesla C2070, can be used to accelerate the
computations. Our main accelerating strategies include (i) performing
BLAS on both CPU and GPU, (ii) improving the communication efficiency
between the CPU and GPU by using page-locked memory, zero-copy memory,
and asynchronous memory copy, and (iii) a modified algorithm that
reuses the memory between different GPU tasks and sets thresholds to
determine whether certain tasks be performed on the GPU. The proposed
acceleration strategies are implemented by modifying UMFPACK, which is
an unsymmetric multifrontal linear system solver. Numerical results
show that the CPU-GPU hybrid approach can accelerate the unsymmetric
multifrontal solver, especially for computationally expensive problems.
},
}
%%Z ---------------------------------------------------------------------------
%got
@article{ZhangElman92,
author={Zhang, G. and Elman, H. C.},
title={Parallel sparse {Cholesky} factorization on a shared memory multiprocessor},
journal=PC,
volume={18},
number={9},
pages={1009--1022},
year={1992},
url={ http://dx.doi.org/10.1016/0167-8191(92)90014-X },
keywords={Parallel algorithms; sparse Cholesky factorization;
linear algebra; shared memory multiprocessor},
abstract={Parallel implementations of Cholesky factorization for sparse
symmetric positive definite matrices are considered on a shared memory
multiprocessor computer. Two column-oriented schemes, known as the
column-Cholesky algorithm and the fan-in algorithm, along with
enhancements of each, are implemented and discussed. High parallel
efficiency of the column-Cholesky algorithm and its enhancement is
demonstrated for test problems. A detailed investigation of the
performance of the fan-in algorithm and its enhancement, the
compute-ahead fan-in algorithm, is made to study the effects of
overhead associated with the fan-in based schemes.},
}
%%ZLATEV ----------------------------------------------------------------------
%got
@article{Zlatev80,
author={Zlatev, Z.},
year={1980},
title={On Some Pivotal Strategies in {Gaussian} Elimination by Sparse Technique},
journal=SINUM,
volume={17},
number={1},
pages={18--30},
url={ http://dx.doi.org/10.1137/0717003 },
abstract={Pivotal interchanges are commonly used in the solution of
large and sparse systems of linear algebraic equations by Gaussian
elimination (in order to preserve the sparsity of the matrix and to
prevent the appearance of large roundoff errors during the
computations). The Markowitz strategy (see [H. M. Markowitz, The
elimination form of inverse and its applications to linear programming,
Management Sci., 3 (1957), pp. 255-269]) is often used to determine the
pivotal sequence. An efficient implementation of this strategy is given
by Curtis and Reid (see [A. R. Curtis and J. K. Reid, Fortran
subroutines for the solution of sparse sets of linear equations,
A.E.R.E., Report R.6844, HMSO, London, 1971]) and improved by Duff (see
[I. S. Duff, MA28-a set of Fortran subroutines for sparse unsymmetric
matrices, A.E.R.E., Report R.8730, HMSO, London, 1977]). In this paper
it is shown how the classical Markowitz idea can be generalized.
Consider the following parameters: u -the stability factor and
$p(s)$-the number of rows that will be searched for a pivotal element
at stage s of the elimination ($1 \leqq s \leqq n - 1,n$-the number of
equations). Pivotal strategies depending on these two parameters are
defined. The choice of the parameters is discussed. A comparison
between the pivotal strategy used in [I. S. Duff, MA28-a set of Fortran
subroutines for sparse unsymmetric matrices, A.E.R.E., Report R.8730,
HMSO, London, 1977] (where $p(s)=n-s+1$) and a pivotal strategy
with $p(s) \leqq 3$ is carried out. The results indicate that pivotal
strategies with small $p(s)$ may be more profitable than the original
Markowitz strategy.}
}
%got
@article{Zlatev82,
author={Zlatev, Z.},
year={1982},
title={Comparison of Two Pivotal Strategies in Sparse Plane Rotations},
journal=CMA,
volume={8},
pages={119--135},
url={ http://dx.doi.org/10.1016/0898-1221(82)90051-7 },
abstract={Let the rectangular matrix A be large and sparse. Assume that
plane rotations are used to decompose A into QDR where $Q^TQ=I$, D is
diagonal and R is upper triangular. Both column and row
interchanges have to be used in order to preserve the sparsity of
matrix A during the decomposition. If the column interchanges are
fixed, then the number of non-zero elements in R does not depend on the
row interchanges used. However, this does not mean that the
computational work is also independent of the row interchanges. Two
pivotal strategies, where the same rule is used in the choice of
pivotal columns, are described and compared. It is verified (by many
numerical examples) that if matrix A is not very sparse, then one of
these strategies will often perform better than the other both with
regard to the storage and the computing time. The accuracy and the
robustness of the computations are also discussed. In the
implementation described in this paper positive values of a special
parameter, drop-tolerance, can optionally be used to remove all "small"
elements created during the decomposition. The accuracy lost by
dropping some non-zero elements is normally regained by iterative
refinement. The numerical results indicate that this approach is very
efficient for some matrices.}
}
%got (Tim has book from TAMU library)
@incollection{Zlatev85,
  author    = {Zlatev, Z.},
  title     = {Sparse Matrix Techniques for General Matrices with Real Elements: Pivotal Strategies, Decompositions and Applications in {ODE} Software},
  editor    = {Evans, D. J.},
  booktitle = {Sparsity and Its Applications},
  publisher = {Cambridge, United Kingdom: Cambridge University Press},
  pages     = {185--228},
  year      = {1985},
}
%got
@article{Zlatev87,
author={Zlatev, Z.},
title={A survey of the advances in the exploitation of the sparsity in the solution of large problems},
journal=JCAM,
volume={20},
pages={83--105},
year={1987},
url={ http://dx.doi.org/10.1016/0377-0427(87)90127-0 },
keywords={Sparsity; storage schemes; pivotal strategies;
Gaussian elimination; orthogonal methods; least squares},
abstract={The numerical treatment of many mathematical models (which
arise, for example, in physics, chemistry, biology or economics) leads
very often to huge algebraic problems, so that it is difficult (both
with regard to the storage needed and with regard to the computing time
spent) to handle these problems even on the big modern computers.
However, the matrices that occur in the algebraic problems are
fortunately sparse (many of their elements are equal to zero). The
exploitation of the sparsity leads to savings in both storage and
computer time, so that problems which can not be handled numerically
when the zero elements are stored in the computer memory and when the
arithmetic operations involving zero elements are performed become
tractable if the sparsity is exploited in a proper way. There are two
basic groups of storage schemes for exploiting the sparsity. If a
scheme of the first group is in use, then the non-zero elements have
permanent locations in the computer memory during the whole
computational process. Therefore the schemes of the first group are
called static. A non-zero element may be moved from one location to
another when a scheme from the second group is applied. Such schemes
are called dynamic. The advantages and the limitations of the schemes
from these two groups are discussed. The advances achieved after 1980
in the efforts to improve the performance of the schemes belonging to
both groups are given in a systematic way. Some questions that are
still open are briefly discussed. The advances achieved in some other
stages in the exploitation of the sparsity, which are not directly
connected with the storage schemes used, are outlined in the last
section. }
}
%got
@book{Zlatev91,
  author    = {Zlatev, Z.},
  title     = {Computational Methods for General Sparse Matrices},
  publisher = {Kluwer Academic Publishers},
  address   = {Dordrecht, Boston, London},
  year      = {1991},
  url       = { http://dx.doi.org/10.1007/978-94-017-1116-6 },
}
%got (Tim has book from TAMU library)
@incollection{ZlatevThomsen81,
author={Zlatev, Z. and Thomsen, P. G.},
year={1981},
title={Sparse Matrices - Efficient Decompositions and Applications},
editor={Duff, I. S.},
booktitle={Sparse Matrices and Their Uses},
publisher={New York: Academic Press},
pages={367--375},
keywords={pivoting, decomposition, large drop tolerance plus iterative refinement},
}
%got
@book{ZlatevWasniewskiSchaumburg81,
author={Zlatev, Z. and Wasniewski, J. and Schaumburg, K.},
year={1981},
title={Y12M: Solution of Large and Sparse Systems of Linear Algebraic Equations},
series={Lecture Notes in Computer Science},
volume={121},
publisher={Berlin: Springer-Verlag},
annote={ review at http://dx.doi.org/10.1002/zamm.19840640229 },
url={ http://dx.doi.org/10.1007/3-540-10874-2 },
}
%%Z continued -----------------------------------------------------------------
%got
@article{ZmijewskiGilbert88,
author={Zmijewski, E. and Gilbert, J. R.},
year={1988},
title={A Parallel Algorithm for Sparse Symbolic {Cholesky} Factorization on a Multiprocessor},
journal=PC,
number={2},
volume={7},
pages={199--210},
url={ http://dx.doi.org/10.1016/0167-8191(88)90039-7 },
keywords={symmetric positive definite; Linear algebra;
Cholesky factorization; sparse matrix computation;
message-passing multiprocessor; elimination forest},
abstract={We develop an algorithm for computing the symbolic Cholesky
factorization of a large sparse symmetric positive definite matrix. The
algorithm is intended for a message-passing multiprocessor system, such
as the hypercube, and is based on the concept of elimination forest. In
addition, we provide an algorithm for computing these forests along
with a discussion of the algorithm's complexity and a proof of its
correctness. },
}