Squashed 'third_party/boostorg/ublas/' content from commit e8607b3

Change-Id: Ia06afd642157a24e17fa9ddea28fb8601810b78e
git-subtree-dir: third_party/boostorg/ublas
git-subtree-split: e8607b3eea238e590eca93bfe498c21f470155c1
diff --git a/benchmarks/bench1/Jamfile.v2 b/benchmarks/bench1/Jamfile.v2
new file mode 100644
index 0000000..77b11c7
--- /dev/null
+++ b/benchmarks/bench1/Jamfile.v2
@@ -0,0 +1,10 @@
+# Copyright (c) 2004 Michael Stevens
+# Use, modification and distribution are subject to the
+# Boost Software License, Version 1.0. (See accompanying file
+# LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+# bench1 - measure the abstraction penalty of dense matrix and vector operations.
+
+exe bench1
+    : bench1.cpp bench11.cpp bench12.cpp bench13.cpp
+    ;
diff --git a/benchmarks/bench1/bench1.cpp b/benchmarks/bench1/bench1.cpp
new file mode 100644
index 0000000..87478e1
--- /dev/null
+++ b/benchmarks/bench1/bench1.cpp
@@ -0,0 +1,122 @@
+//
+//  Copyright (c) 2000-2002
+//  Joerg Walter, Mathias Koch
+//
+//  Distributed under the Boost Software License, Version 1.0. (See
+//  accompanying file LICENSE_1_0.txt or copy at
+//  http://www.boost.org/LICENSE_1_0.txt)
+//
+//  The authors gratefully acknowledge the support of
+//  GeNeSys mbH & Co. KG in producing this work.
+//
+
+#include "bench1.hpp"
+
+void header (std::string text) {   // writes one label line to stdout, then flushes
+    std::cout << text << '\n' << std::flush;
+}
+
+template<class T>
+struct peak_c_plus {   // baseline: a bare loop of scalar additions on T
+    typedef T value_type;
+    // Performs `runs` additions on a static accumulator and reports the timing through footer.
+    void operator () (int runs) const {
+        try {
+            static T s (0);
+            boost::timer t;
+            for (int i = 0; i < runs; ++ i) {
+                s += T (0);
+//                sink_scalar (s);
+            }
+            footer<value_type> () (0, 1, runs, t.elapsed ());   // 0 multiplies, 1 plus per run
+        }
+        catch (const std::exception &e) {   // const reference: e is only read
+            std::cout << e.what () << std::endl;
+        }
+    }
+};
+template<class T>
+struct peak_c_multiplies {   // baseline: a bare loop of scalar multiplications on T
+    typedef T value_type;
+    // Performs `runs` multiplications on a static accumulator and reports the timing through footer.
+    void operator () (int runs) const {
+        try {
+            static T s (1);
+            boost::timer t;
+            for (int i = 0; i < runs; ++ i) {
+                s *= T (1);
+//                sink_scalar (s);
+            }
+            footer<value_type> () (0, 1, runs, t.elapsed ());   // passed as 0 multiplies, 1 plus per run
+        }
+        catch (const std::exception &e) {   // const reference: e is only read
+            std::cout << e.what () << std::endl;
+        }
+    }
+};
+
+template<class T>
+void peak<T>::operator () (int runs) {
+    header ("peak");
+    // Scalar addition baseline.
+    header ("plus");
+    { peak_c_plus<T> add_loop; add_loop (runs); }
+    // Scalar multiplication baseline.
+    header ("multiplies");
+    { peak_c_multiplies<T> mul_loop; mul_loop (runs); }
+}
+
+
+template <typename scalar>
+void do_bench (std::string type_string, int scale)
+{   // Runs the peak baseline, then bench_1/2/3 at sizes 3, 10, 30 and 100 for one scalar type.
+    header (type_string);
+    peak<scalar> () (scale * 1000000);
+    // Size 3.
+    header (type_string + ", 3");
+    bench_1<scalar, 3> () (scale * 1000000);
+    bench_2<scalar, 3> () (scale * 300000);
+    bench_3<scalar, 3> () (scale * 100000);
+    // Size 10.
+    header (type_string + ", 10");
+    bench_1<scalar, 10> () (scale * 300000);
+    bench_2<scalar, 10> () (scale * 30000);
+    bench_3<scalar, 10> () (scale * 3000);
+    // Size 30.
+    header (type_string + ", 30");
+    bench_1<scalar, 30> () (scale * 100000);
+    bench_2<scalar, 30> () (scale * 3000);
+    bench_3<scalar, 30> () (scale * 100);
+    // Size 100.
+    header (type_string + ", 100");
+    bench_1<scalar, 100> () (scale * 30000);
+    bench_2<scalar, 100> () (scale * 300);
+    bench_3<scalar, 100> () (scale * 3);
+}
+
+int main (int argc, char *argv []) {
+    // Optional first argument: integer factor applied to every benchmark's run count.
+    int scale = 1;
+    if (argc > 1)
+        scale = std::atoi (argv [1]);
+    if (scale < 1) scale = 1;   // atoi gives 0 for non-numeric input; a non-positive factor would run no iterations
+#ifdef USE_FLOAT
+    do_bench<float> ("FLOAT", scale);
+#endif
+
+#ifdef USE_DOUBLE
+    do_bench<double> ("DOUBLE", scale);
+#endif
+
+#ifdef USE_STD_COMPLEX
+#ifdef USE_FLOAT
+    do_bench<std::complex<float> > ("COMPLEX<FLOAT>", scale);
+#endif
+
+#ifdef USE_DOUBLE
+    do_bench<std::complex<double> > ("COMPLEX<DOUBLE>", scale);
+#endif
+#endif
+
+    return 0;
+}
diff --git a/benchmarks/bench1/bench1.hpp b/benchmarks/bench1/bench1.hpp
new file mode 100644
index 0000000..d799463
--- /dev/null
+++ b/benchmarks/bench1/bench1.hpp
@@ -0,0 +1,159 @@
+//
+//  Copyright (c) 2000-2002
+//  Joerg Walter, Mathias Koch
+//
+//  Distributed under the Boost Software License, Version 1.0. (See
+//  accompanying file LICENSE_1_0.txt or copy at
+//  http://www.boost.org/LICENSE_1_0.txt)
+//
+//  The authors gratefully acknowledge the support of
+//  GeNeSys mbH & Co. KG in producing this work.
+//
+
+#ifndef BENCH1_H
+#define BENCH1_H
+
+#include <cstdlib>
+#include <exception>
+#include <iostream>
+#include <string>
+#include <valarray>
+
+#include <boost/numeric/ublas/vector.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/timer.hpp>
+
+#define BOOST_UBLAS_NOT_USED(x) (void)(x)
+
+
+namespace ublas = boost::numeric::ublas;
+
+void header (std::string text);
+
+template<class T>
+struct footer {   // prints elapsed seconds and throughput; multiplies/plus are operation counts per run
+    void operator () (int multiplies, int plus, int runs, double elapsed) {   // flop total is formed in double so large run counts cannot overflow
+        std::cout << "elapsed: " << elapsed << " s, "
+                  << (double (multiplies) * ublas::type_traits<T>::multiplies_complexity +
+                      double (plus) * ublas::type_traits<T>::plus_complexity) * runs /
+                     (1024 * 1024 * elapsed) << " Mflops" << std::endl;
+    }
+};
+
+template<class T, int N>
+struct c_vector_traits {   // names the built-in array type T [N]
+    typedef T type [N];
+};
+template<class T, int N, int M>
+struct c_matrix_traits {   // names the built-in two-dimensional array type T [N] [M]
+    typedef T type [N] [M];
+};
+
+template<class T, int N>
+struct initialize_c_vector {   // fills all N elements of a C array with pseudo-random values
+    void operator () (typename c_vector_traits<T, N>::type &v) {
+        for (int k = 0; k < N; ++k)
+            v [k] = std::rand () * 1.f;
+//          alternative fill: v [k] = 0.f;
+    }
+};
+template<class V>
+BOOST_UBLAS_INLINE
+void initialize_vector (V &v) {   // fills v [0 .. v.size ()) with pseudo-random values
+    const int count = v.size ();
+    for (int k = 0; k < count; ++k)
+        v [k] = std::rand () * 1.f;
+//      alternative fill: v [k] = 0.f;
+}
+
+template<class T, int N, int M>
+struct initialize_c_matrix {   // fills every element of an N x M C array with a pseudo-random value
+    void operator () (typename c_matrix_traits<T, N, M>::type &m) {
+        for (int row = 0; row < N; ++row)
+            for (int col = 0; col < M; ++col)
+                m [row] [col] = std::rand () * 1.f;
+//              alternative fill: m [row] [col] = 0.f;
+    }
+};
+template<class M>
+BOOST_UBLAS_INLINE
+void initialize_matrix (M &m) {   // fills m (r, c) for r < size1 (), c < size2 () with pseudo-random values
+    const int rows = m.size1 ();
+    const int cols = m.size2 ();
+    for (int r = 0; r < rows; ++r)
+        for (int c = 0; c < cols; ++c)
+            m (r, c) = std::rand () * 1.f;
+//          alternative fill: m (r, c) = 0.f;
+}
+
+template<class T>
+BOOST_UBLAS_INLINE
+void sink_scalar (const T &s) {   // copies s into a function-local static
+    static T g_s = s;   // static initializer: the copy happens on the first call only
+}
+
+template<class T, int N>
+struct sink_c_vector {   // copies all N elements of v into a function-local static array on every call
+    void operator () (const typename c_vector_traits<T, N>::type &v) {
+        static typename c_vector_traits<T, N>::type g_v;
+        for (int k = 0; k < N; ++k)
+            g_v [k] = v [k];
+    }
+};
+template<class V>
+BOOST_UBLAS_INLINE
+void sink_vector (const V &v) {   // copy-constructs a function-local static from v
+    static V g_v (v);   // static initializer: the copy happens on the first call only
+}
+
+template<class T, int N, int M>
+struct sink_c_matrix {   // copies all N x M elements of m into a function-local static array on every call
+    void operator () (const typename c_matrix_traits<T, N, M>::type &m) {
+        static typename c_matrix_traits<T, N, M>::type g_m;
+        for (int row = 0; row < N; ++row)
+            for (int col = 0; col < M; ++col)
+                g_m [row] [col] = m [row] [col];
+    }
+};
+template<class M>
+BOOST_UBLAS_INLINE
+void sink_matrix (const M &m) {   // copy-constructs a function-local static from m
+    static M g_m (m);   // static initializer: the copy happens on the first call only
+}
+
+template<class T>
+struct peak {   // scalar plus/multiplies baseline; operator () is defined in bench1.cpp
+    void operator () (int runs);
+};
+
+template<class T, int N>
+struct bench_1 {   // vector benchmarks of size N; operator () is defined and explicitly instantiated in bench11.cpp
+    void operator () (int runs);
+};
+
+template<class T, int N>
+struct bench_2 {   // matrix/vector benchmarks of size N; operator () is defined and explicitly instantiated in bench12.cpp
+    void operator () (int runs);
+};
+
+template<class T, int N>
+struct bench_3 {   // operator () is only declared here; presumably defined in bench13.cpp -- TODO confirm
+    void operator () (int runs);
+};
+
+struct safe_tag {};   // selects the benchmark overloads that use plain assignment (x = expr)
+struct fast_tag {};   // selects the benchmark overloads that use x.assign (expr)
+
+//#define USE_FLOAT
+#define USE_DOUBLE   // enables the do_bench<double> run and the double instantiations
+// #define USE_STD_COMPLEX
+// Storage variants compiled into bench_1 / bench_2 (the plain "C array" loops are always compiled):
+#define USE_C_ARRAY
+// #define USE_BOUNDED_ARRAY
+#define USE_UNBOUNDED_ARRAY
+// #define USE_STD_VALARRAY
+//#define USE_STD_VECTOR
+
+#endif
+
+
diff --git a/benchmarks/bench1/bench11.cpp b/benchmarks/bench1/bench11.cpp
new file mode 100644
index 0000000..806a422
--- /dev/null
+++ b/benchmarks/bench1/bench11.cpp
@@ -0,0 +1,287 @@
+//
+//  Copyright (c) 2000-2002
+//  Joerg Walter, Mathias Koch
+//
+//  Distributed under the Boost Software License, Version 1.0. (See
+//  accompanying file LICENSE_1_0.txt or copy at
+//  http://www.boost.org/LICENSE_1_0.txt)
+//
+//  The authors gratefully acknowledge the support of
+//  GeNeSys mbH & Co. KG in producing this work.
+//
+
+#include "bench1.hpp"
+
+template<class T, int N>
+struct bench_c_inner_prod {   // hand-written inner product over two C arrays of length N
+    typedef T value_type;
+    // Fills v1 and v2 with random values, times `runs` inner-product loops and prints the result.
+    void operator () (int runs) const {
+        try {
+            static typename c_vector_traits<T, N>::type v1, v2;
+            initialize_c_vector<T, N> () (v1);
+            initialize_c_vector<T, N> () (v2);
+            boost::timer t;
+            for (int i = 0; i < runs; ++ i) {
+                static value_type s (0);   // static: keeps accumulating across runs
+                for (int j = 0; j < N; ++ j) {
+                    s += v1 [j] * v2 [j];
+                }
+//                sink_scalar (s);
+            }
+            footer<value_type> () (N, N - 1, runs, t.elapsed ());
+        }
+        catch (const std::exception &e) {   // const reference: e is only read
+            std::cout << e.what () << std::endl;
+        }
+    }
+};
+template<class V, int N>
+struct bench_my_inner_prod {   // ublas::inner_prod on two vectors of type V and size N
+    typedef typename V::value_type value_type;
+    // Fills v1 and v2 with random values, times `runs` calls of ublas::inner_prod and prints the result.
+    void operator () (int runs) const {
+        try {
+            static V v1 (N), v2 (N);
+            initialize_vector (v1);
+            initialize_vector (v2);
+            boost::timer t;
+            for (int i = 0; i < runs; ++ i) {
+                static value_type s (0);
+                s = ublas::inner_prod (v1, v2);
+//                sink_scalar (s);
+                BOOST_UBLAS_NOT_USED(s);
+            }
+            footer<value_type> () (N, N - 1, runs, t.elapsed ());
+        }
+        catch (const std::exception &e) {   // const reference: e is only read
+            std::cout << e.what () << std::endl;
+        }
+    }
+};
+template<class V, int N>
+struct bench_cpp_inner_prod {   // inner product written as (v1 * v2).sum (); bench_1 uses it with std::valarray
+    typedef typename V::value_type value_type;
+    // Fills v1 and v2 with random values, times `runs` evaluations and prints the result.
+    void operator () (int runs) const {
+        try {
+            static V v1 (N), v2 (N);
+            initialize_vector (v1);
+            initialize_vector (v2);
+            boost::timer t;
+            for (int i = 0; i < runs; ++ i) {
+                static value_type s (0);
+                s = (v1 * v2).sum ();
+//                sink_scalar (s);
+            }
+            footer<value_type> () (N, N - 1, runs, t.elapsed ());
+        }
+        catch (const std::exception &e) {   // const reference: e is only read
+            std::cout << e.what () << std::endl;
+        }
+    }
+};
+
+template<class T, int N>
+struct bench_c_vector_add {   // hand-written v3 = - (v1 + v2) over C arrays of length N
+    typedef T value_type;
+    // Fills v1 and v2 with random values, times `runs` element-wise loops and prints the result.
+    void operator () (int runs) const {
+        try {
+            static typename c_vector_traits<T, N>::type v1, v2, v3;
+            initialize_c_vector<T, N> () (v1);
+            initialize_c_vector<T, N> () (v2);
+            boost::timer t;
+            for (int i = 0; i < runs; ++ i) {
+                for (int j = 0; j < N; ++ j) {
+                    v3 [j] = - (v1 [j] + v2 [j]);
+                }
+//                sink_c_vector<T, N> () (v3);
+                BOOST_UBLAS_NOT_USED(v3);
+            }
+            footer<value_type> () (0, 2 * N, runs, t.elapsed ());
+        }
+        catch (const std::exception &e) {   // const reference: e is only read
+            std::cout << e.what () << std::endl;
+        }
+    }
+};
+template<class V, int N>
+struct bench_my_vector_add {   // v3 = - (v1 + v2) on vectors of type V and size N
+    typedef typename V::value_type value_type;
+    // safe_tag overload: times `runs` evaluations using plain assignment.
+    void operator () (int runs, safe_tag) const {
+        try {
+            static V v1 (N), v2 (N), v3 (N);
+            initialize_vector (v1);
+            initialize_vector (v2);
+            boost::timer t;
+            for (int i = 0; i < runs; ++ i) {
+                v3 = - (v1 + v2);
+//                sink_vector (v3);
+            }
+            footer<value_type> () (0, 2 * N, runs, t.elapsed ());
+        }
+        catch (const std::exception &e) {   // const reference: e is only read
+            std::cout << e.what () << std::endl;
+        }
+    }
+    void operator () (int runs, fast_tag) const {   // fast_tag overload: same work through v3.assign (...)
+        try {
+            static V v1 (N), v2 (N), v3 (N);
+            initialize_vector (v1);
+            initialize_vector (v2);
+            boost::timer t;
+            for (int i = 0; i < runs; ++ i) {
+                v3.assign (- (v1 + v2));
+//                sink_vector (v3);
+            }
+            footer<value_type> () (0, 2 * N, runs, t.elapsed ());
+        }
+        catch (const std::exception &e) {   // const reference: e is only read
+            std::cout << e.what () << std::endl;
+        }
+    }
+};
+template<class V, int N>
+struct bench_cpp_vector_add {   // v3 = - (v1 + v2) with V's own operators; bench_1 uses it with std::valarray
+    typedef typename V::value_type value_type;
+    // Fills v1 and v2 with random values, times `runs` evaluations and prints the result.
+    void operator () (int runs) const {
+        try {
+            static V v1 (N), v2 (N), v3 (N);
+            initialize_vector (v1);
+            initialize_vector (v2);
+            boost::timer t;
+            for (int i = 0; i < runs; ++ i) {
+                v3 = - (v1 + v2);
+//                sink_vector (v3);
+            }
+            footer<value_type> () (0, 2 * N, runs, t.elapsed ());
+        }
+        catch (const std::exception &e) {   // const reference: e is only read
+            std::cout << e.what () << std::endl;
+        }
+    }
+};
+
+// Benchmark O (n): runs the inner-product and vector-addition loops, `runs` iterations each, for every enabled storage variant.
+template<class T, int N>
+void bench_1<T, N>::operator () (int runs) {
+    header ("bench_1");
+
+    header ("inner_prod");
+
+    header ("C array");
+    bench_c_inner_prod<T, N> () (runs);
+
+#ifdef USE_C_ARRAY
+    header ("c_vector");
+    bench_my_inner_prod<ublas::c_vector<T, N>, N> () (runs);
+#endif
+
+#ifdef USE_BOUNDED_ARRAY
+    header ("vector<bounded_array>");
+    bench_my_inner_prod<ublas::vector<T, ublas::bounded_array<T, N> >, N> () (runs);
+#endif
+
+#ifdef USE_UNBOUNDED_ARRAY
+    header ("vector<unbounded_array>");
+    bench_my_inner_prod<ublas::vector<T, ublas::unbounded_array<T> >, N> () (runs);
+#endif
+
+#ifdef USE_STD_VALARRAY
+    header ("vector<std::valarray>");
+    bench_my_inner_prod<ublas::vector<T, std::valarray<T> >, N> () (runs);
+#endif
+
+#ifdef USE_STD_VECTOR
+    header ("vector<std::vector>");
+    bench_my_inner_prod<ublas::vector<T, std::vector<T> >, N> () (runs);
+#endif
+
+#ifdef USE_STD_VALARRAY
+    header ("std::valarray");
+    bench_cpp_inner_prod<std::valarray<T>, N> () (runs);
+#endif
+
+    header ("vector + vector");
+
+    header ("C array");
+    bench_c_vector_add<T, N> () (runs);
+
+#ifdef USE_C_ARRAY
+    header ("c_vector safe");
+    bench_my_vector_add<ublas::c_vector<T, N>, N> () (runs, safe_tag ());
+
+    header ("c_vector fast");
+    bench_my_vector_add<ublas::c_vector<T, N>, N> () (runs, fast_tag ());
+#endif
+
+#ifdef USE_BOUNDED_ARRAY
+    header ("vector<bounded_array> safe");
+    bench_my_vector_add<ublas::vector<T, ublas::bounded_array<T, N> >, N> () (runs, safe_tag ());
+
+    header ("vector<bounded_array> fast");
+    bench_my_vector_add<ublas::vector<T, ublas::bounded_array<T, N> >, N> () (runs, fast_tag ());
+#endif
+
+#ifdef USE_UNBOUNDED_ARRAY
+    header ("vector<unbounded_array> safe");
+    bench_my_vector_add<ublas::vector<T, ublas::unbounded_array<T> >, N> () (runs, safe_tag ());
+
+    header ("vector<unbounded_array> fast");
+    bench_my_vector_add<ublas::vector<T, ublas::unbounded_array<T> >, N> () (runs, fast_tag ());
+#endif
+
+#ifdef USE_STD_VALARRAY
+    header ("vector<std::valarray> safe");
+    bench_my_vector_add<ublas::vector<T, std::valarray<T> >, N> () (runs, safe_tag ());
+
+    header ("vector<std::valarray> fast");
+    bench_my_vector_add<ublas::vector<T, std::valarray<T> >, N> () (runs, fast_tag ());
+#endif
+
+#ifdef USE_STD_VECTOR
+    header ("vector<std::vector> safe");
+    bench_my_vector_add<ublas::vector<T, std::vector<T> >, N> () (runs, safe_tag ());
+
+    header ("vector<std::vector> fast");
+    bench_my_vector_add<ublas::vector<T, std::vector<T> >, N> () (runs, fast_tag ());
+#endif
+
+#ifdef USE_STD_VALARRAY
+    header ("std::valarray");
+    bench_cpp_vector_add<std::valarray<T>, N> () (runs);
+#endif
+}
+
+#ifdef USE_FLOAT
+template struct bench_1<float, 3>;
+template struct bench_1<float, 10>;
+template struct bench_1<float, 30>;
+template struct bench_1<float, 100>;
+#endif
+
+#ifdef USE_DOUBLE
+template struct bench_1<double, 3>;
+template struct bench_1<double, 10>;
+template struct bench_1<double, 30>;
+template struct bench_1<double, 100>;
+#endif
+
+#ifdef USE_STD_COMPLEX
+#ifdef USE_FLOAT
+template struct bench_1<std::complex<float>, 3>;
+template struct bench_1<std::complex<float>, 10>;
+template struct bench_1<std::complex<float>, 30>;
+template struct bench_1<std::complex<float>, 100>;
+#endif
+
+#ifdef USE_DOUBLE
+template struct bench_1<std::complex<double>, 3>;
+template struct bench_1<std::complex<double>, 10>;
+template struct bench_1<std::complex<double>, 30>;
+template struct bench_1<std::complex<double>, 100>;
+#endif
+#endif
diff --git a/benchmarks/bench1/bench12.cpp b/benchmarks/bench1/bench12.cpp
new file mode 100644
index 0000000..439188f
--- /dev/null
+++ b/benchmarks/bench1/bench12.cpp
@@ -0,0 +1,491 @@
+//
+//  Copyright (c) 2000-2002
+//  Joerg Walter, Mathias Koch
+//
+//  Distributed under the Boost Software License, Version 1.0. (See
+//  accompanying file LICENSE_1_0.txt or copy at
+//  http://www.boost.org/LICENSE_1_0.txt)
+//
+//  The authors gratefully acknowledge the support of
+//  GeNeSys mbH & Co. KG in producing this work.
+//
+
+#include "bench1.hpp"
+
+template<class T, int N>
+struct bench_c_outer_prod {   // hand-written m = - v1 * v2^T over C arrays (N x N matrix, length-N vectors)
+    typedef T value_type;
+    // Fills v1 and v2 with random values, times `runs` outer-product loops and prints the result.
+    void operator () (int runs) const {
+        try {
+            static typename c_matrix_traits<T, N, N>::type m;
+            static typename c_vector_traits<T, N>::type v1, v2;
+            initialize_c_vector<T, N> () (v1);
+            initialize_c_vector<T, N> () (v2);
+            boost::timer t;
+            for (int i = 0; i < runs; ++ i) {
+                for (int j = 0; j < N; ++ j) {
+                    for (int k = 0; k < N; ++ k) {
+                        m [j] [k] = - v1 [j] * v2 [k];
+                    }
+                }
+//                sink_c_matrix<T, N, N> () (m);
+            }
+            BOOST_UBLAS_NOT_USED(m);
+
+            footer<value_type> () (N * N, N * N, runs, t.elapsed ());
+        }
+        catch (const std::exception &e) {   // const reference: e is only read
+            std::cout << e.what () << std::endl;
+        }
+    }
+};
+template<class M, class V, int N>
+struct bench_my_outer_prod {   // m = - ublas::outer_prod (v1, v2) with matrix type M and vector type V
+    typedef typename M::value_type value_type;
+    // safe_tag overload: times `runs` evaluations using plain assignment.
+    void operator () (int runs, safe_tag) const {
+        try {
+            static M m (N, N);
+            static V v1 (N), v2 (N);
+            initialize_vector (v1);
+            initialize_vector (v2);
+            boost::timer t;
+            for (int i = 0; i < runs; ++ i) {
+                m = - ublas::outer_prod (v1, v2);
+//                sink_matrix (m);
+            }
+            footer<value_type> () (N * N, N * N, runs, t.elapsed ());
+        }
+        catch (const std::exception &e) {   // const reference: e is only read
+            std::cout << e.what () << std::endl;
+        }
+    }
+    void operator () (int runs, fast_tag) const {   // fast_tag overload: same work through m.assign (...)
+        try {
+            static M m (N, N);
+            static V v1 (N), v2 (N);
+            initialize_vector (v1);
+            initialize_vector (v2);
+            boost::timer t;
+            for (int i = 0; i < runs; ++ i) {
+                m.assign (- ublas::outer_prod (v1, v2));
+//                sink_matrix (m);
+            }
+            footer<value_type> () (N * N, N * N, runs, t.elapsed ());
+        }
+        catch (const std::exception &e) {   // const reference: e is only read
+            std::cout << e.what () << std::endl;
+        }
+    }
+};
+template<class M, class V, int N>
+struct bench_cpp_outer_prod {   // outer product into a flat container of N * N elements, indexed as N * j + k
+    typedef typename M::value_type value_type;
+    // Fills v1 and v2 with random values, times `runs` outer-product loops and prints the result.
+    void operator () (int runs) const {
+        try {
+            static M m (N * N);
+            static V v1 (N), v2 (N);
+            initialize_vector (v1);
+            initialize_vector (v2);
+            boost::timer t;
+            for (int i = 0; i < runs; ++ i) {
+                for (int j = 0; j < N; ++ j) {
+                    for (int k = 0; k < N; ++ k) {
+                        m [N * j + k] = - v1 [j] * v2 [k];
+                    }
+                }
+//                sink_vector (m);
+            }
+            footer<value_type> () (N * N, N * N, runs, t.elapsed ());
+        }
+        catch (const std::exception &e) {   // const reference: e is only read
+            std::cout << e.what () << std::endl;
+        }
+    }
+};
+
+template<class T, int N>
+struct bench_c_matrix_vector_prod {   // hand-written v2 = m * v1 over C arrays (N x N matrix, length-N vectors)
+    typedef T value_type;
+    // Fills m and v1 with random values, times `runs` matrix-vector loops and prints the result.
+    void operator () (int runs) const {
+        try {
+            static typename c_matrix_traits<T, N, N>::type m;
+            static typename c_vector_traits<T, N>::type v1, v2;
+            initialize_c_matrix<T, N, N> () (m);
+            initialize_c_vector<T, N> () (v1);
+            boost::timer t;
+            for (int i = 0; i < runs; ++ i) {
+                for (int j = 0; j < N; ++ j) {
+                    v2 [j] = 0;
+                    for (int k = 0; k < N; ++ k) {
+                        v2 [j] += m [j] [k] * v1 [k];
+                    }
+                }
+//                sink_c_vector<T, N> () (v2);
+            }
+            footer<value_type> () (N * N, N * (N - 1), runs, t.elapsed ());
+        }
+        catch (const std::exception &e) {   // const reference: e is only read
+            std::cout << e.what () << std::endl;
+        }
+    }
+};
+template<class M, class V, int N>
+struct bench_my_matrix_vector_prod {   // v2 = ublas::prod (m, v1) with matrix type M and vector type V
+    typedef typename M::value_type value_type;
+    // safe_tag overload: times `runs` evaluations using plain assignment.
+    void operator () (int runs, safe_tag) const {
+        try {
+            static M m (N, N);
+            static V v1 (N), v2 (N);
+            initialize_matrix (m);
+            initialize_vector (v1);
+            boost::timer t;
+            for (int i = 0; i < runs; ++ i) {
+                v2 = ublas::prod (m, v1);
+//                sink_vector (v2);
+            }
+            footer<value_type> () (N * N, N * (N - 1), runs, t.elapsed ());
+        }
+        catch (const std::exception &e) {   // const reference: e is only read
+            std::cout << e.what () << std::endl;
+        }
+    }
+    void operator () (int runs, fast_tag) const {   // fast_tag overload: same work through v2.assign (...)
+        try {
+            static M m (N, N);
+            static V v1 (N), v2 (N);
+            initialize_matrix (m);
+            initialize_vector (v1);
+            boost::timer t;
+            for (int i = 0; i < runs; ++ i) {
+                v2.assign (ublas::prod (m, v1));
+//                sink_vector (v2);
+            }
+            footer<value_type> () (N * N, N * (N - 1), runs, t.elapsed ());
+        }
+        catch (const std::exception &e) {   // const reference: e is only read
+            std::cout << e.what () << std::endl;
+        }
+    }
+};
+template<class M, class V, int N>
+struct bench_cpp_matrix_vector_prod {   // matrix-vector product on a flat N * N container using std::slice rows
+    typedef typename M::value_type value_type;
+    // Fills m and v1 with random values, times `runs` row-by-row products and prints the result.
+    void operator () (int runs) const {
+        try {
+            static M m (N * N);
+            static V v1 (N), v2 (N);
+            initialize_vector (m);
+            initialize_vector (v1);
+            boost::timer t;
+            for (int i = 0; i < runs; ++ i) {
+                for (int j = 0; j < N; ++ j) {
+                    std::valarray<value_type> row (m [std::slice (N * j, N, 1)]);   // elements N * j .. N * j + N - 1
+                    v2 [j] = (row * v1).sum ();
+                }
+//                sink_vector (v2);
+            }
+            footer<value_type> () (N * N, N * (N - 1), runs, t.elapsed ());
+        }
+        catch (const std::exception &e) {   // const reference: e is only read
+            std::cout << e.what () << std::endl;
+        }
+    }
+};
+
+template<class T, int N>
+struct bench_c_matrix_add {   // hand-written m3 = - (m1 + m2) over N x N C arrays
+    typedef T value_type;
+    // Fills m1 and m2 with random values, times `runs` element-wise loops and prints the result.
+    void operator () (int runs) const {
+        try {
+            static typename c_matrix_traits<T, N, N>::type m1, m2, m3;
+            initialize_c_matrix<T, N, N> () (m1);
+            initialize_c_matrix<T, N, N> () (m2);
+            boost::timer t;
+            for (int i = 0; i < runs; ++ i) {
+                for (int j = 0; j < N; ++ j) {
+                    for (int k = 0; k < N; ++ k) {
+                        m3 [j] [k] = - (m1 [j] [k] + m2 [j] [k]);
+                    }
+                }
+//                sink_c_matrix<T, N, N> () (m3);
+            }
+            BOOST_UBLAS_NOT_USED(m3);
+
+            footer<value_type> () (0, 2 * N * N, runs, t.elapsed ());
+        }
+        catch (const std::exception &e) {   // const reference: e is only read
+            std::cout << e.what () << std::endl;
+        }
+    }
+};
+template<class M, int N>
+struct bench_my_matrix_add {   // m3 = - (m1 + m2) on N x N matrices of type M
+    typedef typename M::value_type value_type;
+    // safe_tag overload: times `runs` evaluations using plain assignment.
+    void operator () (int runs, safe_tag) const {
+        try {
+            static M m1 (N, N), m2 (N, N), m3 (N, N);
+            initialize_matrix (m1);
+            initialize_matrix (m2);
+            boost::timer t;
+            for (int i = 0; i < runs; ++ i) {
+                m3 = - (m1 + m2);
+//                sink_matrix (m3);
+            }
+            footer<value_type> () (0, 2 * N * N, runs, t.elapsed ());
+        }
+        catch (const std::exception &e) {   // const reference: e is only read
+            std::cout << e.what () << std::endl;
+        }
+    }
+    void operator () (int runs, fast_tag) const {   // fast_tag overload: same work through m3.assign (...)
+        try {
+            static M m1 (N, N), m2 (N, N), m3 (N, N);
+            initialize_matrix (m1);
+            initialize_matrix (m2);
+            boost::timer t;
+            for (int i = 0; i < runs; ++ i) {
+                m3.assign (- (m1 + m2));
+//                sink_matrix (m3);
+            }
+            footer<value_type> () (0, 2 * N * N, runs, t.elapsed ());
+        }
+        catch (const std::exception &e) {   // const reference: e is only read
+            std::cout << e.what () << std::endl;
+        }
+    }
+};
+template<class M, int N>
+struct bench_cpp_matrix_add {   // m3 = - (m1 + m2) on flat containers of N * N elements; bench_2 uses it with std::valarray
+    typedef typename M::value_type value_type;
+    // Fills m1 and m2 with random values, times `runs` evaluations and prints the result.
+    void operator () (int runs) const {
+        try {
+            static M m1 (N * N), m2 (N * N), m3 (N * N);
+            initialize_vector (m1);
+            initialize_vector (m2);
+            boost::timer t;
+            for (int i = 0; i < runs; ++ i) {
+                m3 = - (m1 + m2);
+//                sink_vector (m3);
+            }
+            footer<value_type> () (0, 2 * N * N, runs, t.elapsed ());
+        }
+        catch (const std::exception &e) {   // const reference: e is only read
+            std::cout << e.what () << std::endl;
+        }
+    }
+};
+
+// Benchmark O (n ^ 2): runs outer_prod, prod (matrix, vector) and matrix + matrix, `runs` iterations each, for every enabled storage variant.
+template<class T, int N>
+void bench_2<T, N>::operator () (int runs) {
+    header ("bench_2");
+
+    header ("outer_prod");
+
+    header ("C array");
+    bench_c_outer_prod<T, N> () (runs);
+
+#ifdef USE_C_ARRAY
+    header ("c_matrix, c_vector safe");
+    bench_my_outer_prod<ublas::c_matrix<T, N, N>,
+                        ublas::c_vector<T, N>, N> () (runs, safe_tag ());
+
+    header ("c_matrix, c_vector fast");
+    bench_my_outer_prod<ublas::c_matrix<T, N, N>,
+                        ublas::c_vector<T, N>, N> () (runs, fast_tag ());
+#endif
+
+#ifdef USE_BOUNDED_ARRAY
+    header ("matrix<bounded_array>, vector<bounded_array> safe");
+    bench_my_outer_prod<ublas::matrix<T, ublas::row_major, ublas::bounded_array<T, N * N> >,
+                        ublas::vector<T, ublas::bounded_array<T, N> >, N> () (runs, safe_tag ());
+
+    header ("matrix<bounded_array>, vector<bounded_array> fast");
+    bench_my_outer_prod<ublas::matrix<T, ublas::row_major, ublas::bounded_array<T, N * N> >,
+                        ublas::vector<T, ublas::bounded_array<T, N> >, N> () (runs, fast_tag ());
+#endif
+
+#ifdef USE_UNBOUNDED_ARRAY
+    header ("matrix<unbounded_array>, vector<unbounded_array> safe");
+    bench_my_outer_prod<ublas::matrix<T, ublas::row_major, ublas::unbounded_array<T> >,
+                        ublas::vector<T, ublas::unbounded_array<T> >, N> () (runs, safe_tag ());
+
+    header ("matrix<unbounded_array>, vector<unbounded_array> fast");
+    bench_my_outer_prod<ublas::matrix<T, ublas::row_major, ublas::unbounded_array<T> >,
+                        ublas::vector<T, ublas::unbounded_array<T> >, N> () (runs, fast_tag ());
+#endif
+
+#ifdef USE_STD_VALARRAY
+    header ("matrix<std::valarray>, vector<std::valarray> safe");
+    bench_my_outer_prod<ublas::matrix<T, ublas::row_major, std::valarray<T> >,
+                        ublas::vector<T, std::valarray<T> >, N> () (runs, safe_tag ());
+
+    header ("matrix<std::valarray>, vector<std::valarray> fast");
+    bench_my_outer_prod<ublas::matrix<T, ublas::row_major, std::valarray<T> >,
+                        ublas::vector<T, std::valarray<T> >, N> () (runs, fast_tag ());
+#endif
+
+#ifdef USE_STD_VECTOR
+    header ("matrix<std::vector>, vector<std::vector> safe");
+    bench_my_outer_prod<ublas::matrix<T, ublas::row_major, std::vector<T> >,
+                        ublas::vector<T, std::vector<T> >, N> () (runs, safe_tag ());
+
+    header ("matrix<std::vector>, vector<std::vector> fast");
+    bench_my_outer_prod<ublas::matrix<T, ublas::row_major, std::vector<T> >,
+                        ublas::vector<T, std::vector<T> >, N> () (runs, fast_tag ());
+#endif
+
+#ifdef USE_STD_VALARRAY
+    header ("std::valarray");
+    bench_cpp_outer_prod<std::valarray<T>, std::valarray<T>, N> () (runs);
+#endif
+
+    header ("prod (matrix, vector)");
+
+    header ("C array");
+    bench_c_matrix_vector_prod<T, N> () (runs);
+
+#ifdef USE_C_ARRAY
+    header ("c_matrix, c_vector safe");
+    bench_my_matrix_vector_prod<ublas::c_matrix<T, N, N>,
+                                ublas::c_vector<T, N>, N> () (runs, safe_tag ());
+
+    header ("c_matrix, c_vector fast");
+    bench_my_matrix_vector_prod<ublas::c_matrix<T, N, N>,
+                                ublas::c_vector<T, N>, N> () (runs, fast_tag ());
+#endif
+
+#ifdef USE_BOUNDED_ARRAY
+    header ("matrix<bounded_array>, vector<bounded_array> safe");
+    bench_my_matrix_vector_prod<ublas::matrix<T, ublas::row_major, ublas::bounded_array<T, N * N> >,
+                                ublas::vector<T, ublas::bounded_array<T, N> >, N> () (runs, safe_tag ());
+
+    header ("matrix<bounded_array>, vector<bounded_array> fast");
+    bench_my_matrix_vector_prod<ublas::matrix<T, ublas::row_major, ublas::bounded_array<T, N * N> >,
+                                ublas::vector<T, ublas::bounded_array<T, N> >, N> () (runs, fast_tag ());
+#endif
+
+#ifdef USE_UNBOUNDED_ARRAY
+    header ("matrix<unbounded_array>, vector<unbounded_array> safe");
+    bench_my_matrix_vector_prod<ublas::matrix<T, ublas::row_major, ublas::unbounded_array<T> >,
+                                ublas::vector<T, ublas::unbounded_array<T> >, N> () (runs, safe_tag ());
+
+    header ("matrix<unbounded_array>, vector<unbounded_array> fast");
+    bench_my_matrix_vector_prod<ublas::matrix<T, ublas::row_major, ublas::unbounded_array<T> >,
+                                ublas::vector<T, ublas::unbounded_array<T> >, N> () (runs, fast_tag ());
+#endif
+
+#ifdef USE_STD_VALARRAY
+    header ("matrix<std::valarray>, vector<std::valarray> safe");
+    bench_my_matrix_vector_prod<ublas::matrix<T, ublas::row_major, std::valarray<T> >,
+                                ublas::vector<T, std::valarray<T> >, N> () (runs, safe_tag ());
+
+    header ("matrix<std::valarray>, vector<std::valarray> fast");
+    bench_my_matrix_vector_prod<ublas::matrix<T, ublas::row_major, std::valarray<T> >,
+                                ublas::vector<T, std::valarray<T> >, N> () (runs, fast_tag ());
+#endif
+
+#ifdef USE_STD_VECTOR
+    header ("matrix<std::vector>, vector<std::vector> safe");
+    bench_my_matrix_vector_prod<ublas::matrix<T, ublas::row_major, std::vector<T> >,
+                                ublas::vector<T, std::vector<T> >, N> () (runs, safe_tag ());
+
+    header ("matrix<std::vector>, vector<std::vector> fast");
+    bench_my_matrix_vector_prod<ublas::matrix<T, ublas::row_major, std::vector<T> >,
+                                ublas::vector<T, std::vector<T> >, N> () (runs, fast_tag ());
+#endif
+
+#ifdef USE_STD_VALARRAY
+    header ("std::valarray");
+    bench_cpp_matrix_vector_prod<std::valarray<T>, std::valarray<T>, N> () (runs);
+#endif
+
+    header ("matrix + matrix");
+
+    header ("C array");
+    bench_c_matrix_add<T, N> () (runs);
+
+#ifdef USE_C_ARRAY
+    header ("c_matrix safe");
+    bench_my_matrix_add<ublas::c_matrix<T, N, N>, N> () (runs, safe_tag ());
+
+    header ("c_matrix fast");
+    bench_my_matrix_add<ublas::c_matrix<T, N, N>, N> () (runs, fast_tag ());
+#endif
+
+#ifdef USE_BOUNDED_ARRAY
+    header ("matrix<bounded_array> safe");
+    bench_my_matrix_add<ublas::matrix<T, ublas::row_major, ublas::bounded_array<T, N * N> >, N> () (runs, safe_tag ());
+
+    header ("matrix<bounded_array> fast");
+    bench_my_matrix_add<ublas::matrix<T, ublas::row_major, ublas::bounded_array<T, N * N> >, N> () (runs, fast_tag ());
+#endif
+
+#ifdef USE_UNBOUNDED_ARRAY
+    header ("matrix<unbounded_array> safe");
+    bench_my_matrix_add<ublas::matrix<T, ublas::row_major, ublas::unbounded_array<T> >, N> () (runs, safe_tag ());
+
+    header ("matrix<unbounded_array> fast");
+    bench_my_matrix_add<ublas::matrix<T, ublas::row_major, ublas::unbounded_array<T> >, N> () (runs, fast_tag ());
+#endif
+
+#ifdef USE_STD_VALARRAY
+    header ("matrix<std::valarray> safe");
+    bench_my_matrix_add<ublas::matrix<T, ublas::row_major, std::valarray<T> >, N> () (runs, safe_tag ());
+
+    header ("matrix<std::valarray> fast");
+    bench_my_matrix_add<ublas::matrix<T, ublas::row_major, std::valarray<T> >, N> () (runs, fast_tag ());
+#endif
+
+#ifdef USE_STD_VECTOR
+    header ("matrix<std::vector> safe");
+    bench_my_matrix_add<ublas::matrix<T, ublas::row_major, std::vector<T> >, N> () (runs, safe_tag ());
+
+    header ("matrix<std::vector> fast");
+    bench_my_matrix_add<ublas::matrix<T, ublas::row_major, std::vector<T> >, N> () (runs, fast_tag ());
+#endif
+
+#ifdef USE_STD_VALARRAY
+    header ("std::valarray");
+    bench_cpp_matrix_add<std::valarray<T>, N> () (runs);
+#endif
+}
+
+#ifdef USE_FLOAT
+template struct bench_2<float, 3>;
+template struct bench_2<float, 10>;
+template struct bench_2<float, 30>;
+template struct bench_2<float, 100>;
+#endif
+
+#ifdef USE_DOUBLE
+template struct bench_2<double, 3>;
+template struct bench_2<double, 10>;
+template struct bench_2<double, 30>;
+template struct bench_2<double, 100>;
+#endif
+
+#ifdef USE_STD_COMPLEX
+#ifdef USE_FLOAT
+template struct bench_2<std::complex<float>, 3>;
+template struct bench_2<std::complex<float>, 10>;
+template struct bench_2<std::complex<float>, 30>;
+template struct bench_2<std::complex<float>, 100>;
+#endif
+
+#ifdef USE_DOUBLE
+template struct bench_2<std::complex<double>, 3>;
+template struct bench_2<std::complex<double>, 10>;
+template struct bench_2<std::complex<double>, 30>;
+template struct bench_2<std::complex<double>, 100>;
+#endif
+#endif
diff --git a/benchmarks/bench1/bench13.cpp b/benchmarks/bench1/bench13.cpp
new file mode 100644
index 0000000..fadb0b6
--- /dev/null
+++ b/benchmarks/bench1/bench13.cpp
@@ -0,0 +1,192 @@
+//
+//  Copyright (c) 2000-2002
+//  Joerg Walter, Mathias Koch
+//
+//  Distributed under the Boost Software License, Version 1.0. (See
+//  accompanying file LICENSE_1_0.txt or copy at
+//  http://www.boost.org/LICENSE_1_0.txt)
+//
+//  The authors gratefully acknowledge the support of
+//  GeNeSys mbH & Co. KG in producing this work.
+//
+
+#include "bench1.hpp"
+
+// Times an N x N matrix product coded as explicit loops over c_matrix_traits storage.
+template<class T, int N>
+struct bench_c_matrix_prod {
+    typedef T value_type;
+    // Multiplies two initialized matrices `runs` times, then hands the elapsed time to footer.
+    void operator () (int runs) const {
+        try {
+            static typename c_matrix_traits<T, N, N>::type lhs, rhs, product;
+            initialize_c_matrix<T, N, N> () (lhs);
+            initialize_c_matrix<T, N, N> () (rhs);
+            boost::timer stopwatch;
+            for (int run = 0; run < runs; ++ run) {
+                for (int row = 0; row < N; ++ row) {
+                    for (int col = 0; col < N; ++ col) {
+                        product [row] [col] = 0;
+                        for (int inner = 0; inner < N; ++ inner) {
+                            product [row] [col] += lhs [row] [inner] * rhs [inner] [col];
+                        }
+                    }
+                }
+//                sink_c_matrix<T, N, N> () (product);
+            }
+            footer<value_type> () (N * N * N, N * N * (N - 1), runs, stopwatch.elapsed ());
+        } catch (const std::exception &error) {
+            std::cout << error.what () << std::endl;
+        }
+    }
+};
+// Times ublas::prod on N x N matrices of type M, once per assignment style.
+template<class M, int N>
+struct bench_my_matrix_prod {
+    typedef typename M::value_type value_type;
+    // safe_tag: the product is stored with plain assignment (product = prod (lhs, rhs)).
+    void operator () (int runs, safe_tag) const {
+        try {
+            static M lhs (N, N), rhs (N, N), product (N, N);
+            initialize_matrix (lhs);
+            initialize_matrix (rhs);
+            boost::timer stopwatch;
+            for (int run = 0; run < runs; ++ run) {
+                product = ublas::prod (lhs, rhs);
+//                sink_matrix (product);
+            }
+            footer<value_type> () (N * N * N, N * N * (N - 1), runs, stopwatch.elapsed ());
+        } catch (const std::exception &error) {
+            std::cout << error.what () << std::endl;
+        }
+    }
+    // fast_tag: the product is stored through product.assign (prod (lhs, rhs)).
+    void operator () (int runs, fast_tag) const {
+        try {
+            static M lhs (N, N), rhs (N, N), product (N, N);
+            initialize_matrix (lhs);
+            initialize_matrix (rhs);
+            boost::timer stopwatch;
+            for (int run = 0; run < runs; ++ run) {
+                product.assign (ublas::prod (lhs, rhs));
+//                sink_matrix (product);
+            }
+            footer<value_type> () (N * N * N, N * N * (N - 1), runs, stopwatch.elapsed ());
+        } catch (const std::exception &error) {
+            std::cout << error.what () << std::endl;
+        }
+    }
+};
+// Times an N x N product on flat row-major storage of N * N elements using std::slice views.
+template<class M, int N>
+struct bench_cpp_matrix_prod {
+    typedef typename M::value_type value_type;
+    // Each result element is the sum of (row of lhs) * (column of rhs); both are copied into valarrays.
+    void operator () (int runs) const {
+        try {
+            static M lhs (N * N), rhs (N * N), product (N * N);
+            initialize_vector (lhs);
+            initialize_vector (rhs);
+            boost::timer stopwatch;
+            for (int run = 0; run < runs; ++ run) {
+                for (int r = 0; r < N; ++ r) {
+                    std::valarray<value_type> lhs_row (lhs [std::slice (N * r, N, 1)]);
+                    for (int c = 0; c < N; ++ c) {
+                        std::valarray<value_type> rhs_column (rhs [std::slice (c, N, N)]);
+                        product [N * r + c] = (lhs_row * rhs_column).sum ();
+                    }
+                }
+//                sink_vector (product);
+            }
+            footer<value_type> () (N * N * N, N * N * (N - 1), runs, stopwatch.elapsed ());
+        } catch (const std::exception &error) {
+            std::cout << error.what () << std::endl;
+        }
+    }
+};
+
+// Benchmark O (n ^ 3): prod (matrix, matrix).
+// Runs the C array version, then each storage variant whose USE_* macro is defined.
+template<class T, int N>
+void bench_3<T, N>::operator () (int runs) {
+    header ("bench_3");
+    header ("prod (matrix, matrix)");
+
+    header ("C array");
+    bench_c_matrix_prod<T, N> () (runs);
+
+#ifdef USE_C_ARRAY
+    typedef ublas::c_matrix<T, N, N> c_matrix_type;
+    header ("c_matrix safe");
+    bench_my_matrix_prod<c_matrix_type, N> () (runs, safe_tag ());
+    header ("c_matrix fast");
+    bench_my_matrix_prod<c_matrix_type, N> () (runs, fast_tag ());
+#endif
+
+#ifdef USE_BOUNDED_ARRAY
+    typedef ublas::matrix<T, ublas::row_major, ublas::bounded_array<T, N * N> > bounded_matrix_type;
+    header ("matrix<bounded_array> safe");
+    bench_my_matrix_prod<bounded_matrix_type, N> () (runs, safe_tag ());
+    header ("matrix<bounded_array> fast");
+    bench_my_matrix_prod<bounded_matrix_type, N> () (runs, fast_tag ());
+#endif
+
+#ifdef USE_UNBOUNDED_ARRAY
+    typedef ublas::matrix<T, ublas::row_major, ublas::unbounded_array<T> > unbounded_matrix_type;
+    header ("matrix<unbounded_array> safe");
+    bench_my_matrix_prod<unbounded_matrix_type, N> () (runs, safe_tag ());
+    header ("matrix<unbounded_array> fast");
+    bench_my_matrix_prod<unbounded_matrix_type, N> () (runs, fast_tag ());
+#endif
+
+#ifdef USE_STD_VALARRAY
+    typedef ublas::matrix<T, ublas::row_major, std::valarray<T> > valarray_matrix_type;
+    header ("matrix<std::valarray> safe");
+    bench_my_matrix_prod<valarray_matrix_type, N> () (runs, safe_tag ());
+    header ("matrix<std::valarray> fast");
+    bench_my_matrix_prod<valarray_matrix_type, N> () (runs, fast_tag ());
+#endif
+
+#ifdef USE_STD_VECTOR
+    typedef ublas::matrix<T, ublas::row_major, std::vector<T> > vector_matrix_type;
+    header ("matrix<std::vector> safe");
+    bench_my_matrix_prod<vector_matrix_type, N> () (runs, safe_tag ());
+    header ("matrix<std::vector> fast");
+    bench_my_matrix_prod<vector_matrix_type, N> () (runs, fast_tag ());
+#endif
+
+#ifdef USE_STD_VALARRAY
+    header ("std::valarray");
+    bench_cpp_matrix_prod<std::valarray<T>, N> () (runs);
+#endif
+}
+
+#ifdef USE_FLOAT  // explicit instantiations of bench_3 for N = 3, 10, 30, 100
+template struct bench_3<float, 3>;
+template struct bench_3<float, 10>;
+template struct bench_3<float, 30>;
+template struct bench_3<float, 100>;
+#endif  // USE_FLOAT
+
+#ifdef USE_DOUBLE
+template struct bench_3<double, 3>;
+template struct bench_3<double, 10>;
+template struct bench_3<double, 30>;
+template struct bench_3<double, 100>;
+#endif  // USE_DOUBLE
+
+#ifdef USE_STD_COMPLEX  // complex element types, still gated by USE_FLOAT / USE_DOUBLE
+#ifdef USE_FLOAT
+template struct bench_3<std::complex<float>, 3>;
+template struct bench_3<std::complex<float>, 10>;
+template struct bench_3<std::complex<float>, 30>;
+template struct bench_3<std::complex<float>, 100>;
+#endif  // USE_FLOAT
+
+#ifdef USE_DOUBLE
+template struct bench_3<std::complex<double>, 3>;
+template struct bench_3<std::complex<double>, 10>;
+template struct bench_3<std::complex<double>, 30>;
+template struct bench_3<std::complex<double>, 100>;
+#endif  // USE_DOUBLE
+#endif  // USE_STD_COMPLEX