Squashed 'third_party/eigen/' changes from 61d72f6..cf794d3


Change-Id: I9b814151b01f49af6337a8605d0c42a3a1ed4c72
git-subtree-dir: third_party/eigen
git-subtree-split: cf794d3b741a6278df169e58461f8529f43bce5d
diff --git a/bench/perf_monitoring/gemm/changesets.txt b/bench/perf_monitoring/gemm/changesets.txt
new file mode 100644
index 0000000..af8eb9b
--- /dev/null
+++ b/bench/perf_monitoring/gemm/changesets.txt
@@ -0,0 +1,61 @@
+#3.0.1
+#3.1.1
+#3.2.0
+3.2.4
+#5745:37f59e65eb6c
+5891:d8652709345d  # introduce AVX
+#5893:24b4dc92c6d3  # merge
+5895:997c2ef9fc8b  # introduce FMA
+#5904:e1eafd14eaa1  # complex and AVX
+5908:f8ee3c721251  # improve packing with ptranspose
+#5921:ca808bb456b0  # merge
+#5927:8b1001f9e3ac
+5937:5a4ca1ad8c53  # New gebp kernel handling up to 3 packets x 4 register-level blocks
+#5949:f3488f4e45b2  # merge
+#5969:e09031dccfd9  # Disable 3pX4 kernel on Altivec
+#5992:4a429f5e0483  # merge
+before-evaluators
+#6334:f6a45e5b8b7c  # Implement evaluator for sparse outer products
+#6639:c9121c60b5c7
+#6655:06f163b5221f  # Properly detect FMA support on ARM
+#6677:700e023044e7   # FMA has been wrongly disabled
+#6681:11d31dafb0e3
+#6699:5e6e8e10aad1   # merge default to tensors
+#6726:ff2d2388e7b9   # merge default to tensors
+#6742:0cbd6195e829   # merge default to tensors
+#6747:853d2bafeb8f   # Generalized the gebp apis
+6765:71584fd55762   # Made the blocking computation aware of the l3 cache; Also optimized the blocking parameters to take into account the number of threads used for a computation
+#6781:9cc5a931b2c6   # generalized gemv
+#6792:f6e1daab600a   # ensured that contractions that can be reduced to a matrix vector product
+#6844:039efd86b75c   # merge tensor
+6845:7333ed40c6ef   # change prefetching in gebp
+#6856:b5be5e10eb7f   # merge index conversion
+#6893:c3a64aba7c70   # clean blocking size computation
+#6898:6fb31ebe6492   # rotating kernel for ARM
+6899:877facace746   # rotating kernel for ARM only
+#6904:c250623ae9fa   # result_of
+6921:915f1b1fc158   # fix prefetching change for ARM
+6923:9ff25f6dacc6   # prefetching
+6933:52572e60b5d3   # blocking size strategy
+6937:c8c042f286b2   # avoid redundant pack_rhs
+6981:7e5d6f78da59   # dynamic loop swapping
+6984:45f26866c091   # rm dynamic loop swapping, adjust lhs's micro panel height to fully exploit L1 cache
+6986:a675d05b6f8f   # blocking heuristic: block on the rhs in L1 if the lhs fit in L1.
+7013:f875e75f07e5   # organize a little our default cache sizes, and use a saner default L1 outside of x86 (10% faster on Nexus 5)
+7015:8aad8f35c955   # Refactor computeProductBlockingSizes to make room for the possibility of using lookup tables
+7016:a58d253e8c91   # Polish lookup tables generation
+7018:9b27294a8186   # actual_panel_rows computation should always be resilient to parameters not consistent with the known L1 cache size, see comment
+7019:c758b1e2c073   # Provide a empirical lookup table for blocking sizes measured on a Nexus 5. Only for float, only for Android on ARM 32bit for now.
+7085:627e039fba68   # Bug 986: add support for coefficient-based product with 0 depth.
+7098:b6f1db9cf9ec   # Bug 992: don't select a 3p GEMM path with non-vectorizable scalar types, this hits unsupported paths in symm/triangular products code
+7591:09a8e2186610   # 3.3-alpha1
+7650:b0f3c8f43025   # help clang inlining
+#8744:74b789ada92a   # Improved the matrix multiplication blocking in the case where mr is not a power of 2 (e.g on Haswell CPUs)
+8789:efcb912e4356   # Made the index type a template parameter to evaluateProductBlockingSizes. Use numext::mini and numext::maxi instead of std::min/std::max to compute blocking sizes
+8972:81d53c711775   # Don't optimize the processing of the last rows of a matrix matrix product in cases that violate the assumptions made by the optimized code path
+8985:d935df21a082   # Remove the rotating kernel.
+8988:6c2dc56e73b3   # Bug 256: enable vectorization with unaligned loads/stores.
+9148:b8b8c421e36c   # Relax mixing-type constraints for binary coefficient-wise operators
+9174:d228bc282ac9   # merge
+9212:c90098affa7b   # Fix performance regression introduced in changeset 8aad8f35c955
+9213:9f1c14e4694b   # Fix performance regression in dgemm introduced by changeset 81d53c711775
diff --git a/bench/perf_monitoring/gemm/gemm.cpp b/bench/perf_monitoring/gemm/gemm.cpp
new file mode 100644
index 0000000..614bd47
--- /dev/null
+++ b/bench/perf_monitoring/gemm/gemm.cpp
@@ -0,0 +1,67 @@
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <Eigen/Core>
+#include "../../BenchTimer.h"
+using namespace Eigen;
+
+#ifndef SCALAR
+#error SCALAR must be defined
+#endif
+
+typedef SCALAR Scalar;
+
+typedef Matrix<Scalar,Dynamic,Dynamic> Mat;
+
+EIGEN_DONT_INLINE  // presumably keeps this kernel out-of-line (macro comes from Eigen) - confirm
+void gemm(const Mat &A, const Mat &B, Mat &C)  // timed kernel: bench() runs it through the BENCH macro
+{
+  C.noalias() += A * B;  // accumulates the product A*B into C via C.noalias()
+}
+
+EIGEN_DONT_INLINE
+double bench(long m, long n, long k)
+{
+  // Times C += A*B with A (m x k), B (k x n) and C (m x n), and returns
+  // 1e-9 * rep * flops / best(), i.e. the best rate seen (GFLOP/s if best() is in seconds).
+  Mat lhs(m,k), rhs(k,n), acc(m,n);
+  lhs.setRandom();
+  rhs.setRandom();
+  acc.setZero();
+
+  // Flop budget used to size the loops; complex scalars get a quarter of it and fewer tries.
+  const bool is_complex = NumTraits<Scalar>::IsComplex;
+  double budget = 1e8*4/sizeof(Scalar);
+  if(is_complex) budget /= 4;
+  const double min_tries = is_complex ? 2 : 4;
+  const double max_tries = is_complex ? 4 : 10;
+
+  // A single product is counted as 2*m*n*k flops.
+  const double flops = 2. * m * n * k;
+  const double ratio = budget/flops;
+  // Repetitions are clamped to [1,100], tries to [min_tries,max_tries]; both truncate to long.
+  const long rep = std::max(1., std::min(100., ratio) );
+  const long tries = std::max(min_tries, std::min(max_tries, ratio) );
+
+  BenchTimer timer;
+  BENCH(timer, tries, rep, gemm(lhs,rhs,acc));
+
+  return 1e-9 * rep * flops / timer.best();
+}
+
+int main(int argc, char **argv)
+{
+  // Benchmarks every "m n k" triple listed in gemm_settings.txt and prints all rates as one row.
+  std::ifstream settings("gemm_settings.txt");
+  if(!settings.is_open()) { std::cerr << "Cannot open gemm_settings.txt" << std::endl; return 1; }  // no silent empty row
+  std::vector<double> results;
+  long m, n, k;
+  while(settings >> m >> n >> k)
+  {
+    //std::cerr << "  Testing " << m << " " << n << " " << k << std::endl;
+    results.push_back( bench(m, n, k) );
+  }
+
+  std::cout << RowVectorXd::Map(results.data(), results.size());
+  return 0;
+}
diff --git a/bench/perf_monitoring/gemm/gemm_settings.txt b/bench/perf_monitoring/gemm/gemm_settings.txt
new file mode 100644
index 0000000..5c43e1c
--- /dev/null
+++ b/bench/perf_monitoring/gemm/gemm_settings.txt
@@ -0,0 +1,15 @@
+8 8 8
+9 9 9
+24 24 24
+239 239 239
+240 240 240
+2400 24 24
+24 2400 24
+24 24 2400
+24 2400 2400
+2400 24 2400
+2400 2400 24
+2400 2400 64
+4800 23 160
+23 4800 160
+2400 2400 2400
diff --git a/bench/perf_monitoring/gemm/lazy_gemm.cpp b/bench/perf_monitoring/gemm/lazy_gemm.cpp
new file mode 100644
index 0000000..6dc3701
--- /dev/null
+++ b/bench/perf_monitoring/gemm/lazy_gemm.cpp
@@ -0,0 +1,98 @@
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <Eigen/Core>
+#include "../../BenchTimer.h"
+using namespace Eigen;
+
+#ifndef SCALAR
+#error SCALAR must be defined
+#endif
+
+typedef SCALAR Scalar;
+
+template<typename MatA, typename MatB, typename MatC>
+EIGEN_DONT_INLINE  // presumably keeps this kernel out-of-line (macro comes from Eigen) - confirm
+void lazy_gemm(const MatA &A, const MatB &B, MatC &C)  // timed kernel: bench() runs it through the BENCH macro
+{  // the escape() calls below are disabled; escape() is not defined in the visible source
+//   escape((void*)A.data());
+//   escape((void*)B.data());
+  C.noalias() += A.lazyProduct(B);  // accumulates A.lazyProduct(B) into C via C.noalias()
+//   escape((void*)C.data());
+}
+
+template<int m, int n, int k, int TA>
+EIGEN_DONT_INLINE
+double bench()
+{
+  // Fixed-size operands; TA is passed as the options argument of the left-hand matrix only.
+  typedef Matrix<Scalar,m,k,TA> Lhs;
+  typedef Matrix<Scalar,k,n> Rhs;
+  typedef Matrix<Scalar,m,n> Acc;
+
+  Lhs lhs(m,k);
+  Rhs rhs(k,n);
+  Acc acc(m,n);
+  lhs.setRandom();
+  rhs.setRandom();
+  acc.setZero();
+
+  // One product is counted as 2*m*n*k flops; repetitions are clamped to [10,10000], tries to [10,20].
+  const double flops = 2. * m * n * k;
+  const double ratio = (1e7*4/sizeof(Scalar)) / flops;
+  const long rep = std::max(10., std::min(10000., ratio) );
+  const long tries = std::max(10., std::min(20., ratio) );
+
+  BenchTimer timer;
+  // Returns 1e-9 * rep * flops / best(), i.e. the best rate seen (GFLOP/s if best() is in seconds).
+  BENCH(timer, tries, rep, lazy_gemm(lhs,rhs,acc));
+
+  return 1e-9 * rep * flops / timer.best();
+}
+
+template<int m, int n, int k>
+double bench_t(int t)
+{
+  // Maps the runtime flag t onto the compile-time option of the left operand:
+  // non-zero t benchmarks with RowMajor, zero with option value 0.
+  return (t != 0) ? bench<m,n,k,RowMajor>()
+                  : bench<m,n,k,0>();
+}
+
+EIGEN_DONT_INLINE
+double bench_mnk(int m, int n, int k, int t)
+{
+  if(m != n || n != k) return 0;  // only m == n == k is instantiated; a packed m*10000+n*100+k id could alias other triples
+  switch(m) {                     // dispatches the runtime size to the matching fixed-size benchmark (1..12)
+    case  1 : return bench_t< 1, 1, 1>(t);
+    case  2 : return bench_t< 2, 2, 2>(t);
+    case  3 : return bench_t< 3, 3, 3>(t);
+    case  4 : return bench_t< 4, 4, 4>(t);
+    case  5 : return bench_t< 5, 5, 5>(t);
+    case  6 : return bench_t< 6, 6, 6>(t);
+    case  7 : return bench_t< 7, 7, 7>(t);
+    case  8 : return bench_t< 8, 8, 8>(t);
+    case  9 : return bench_t< 9, 9, 9>(t);
+    case 10 : return bench_t<10,10,10>(t);
+    case 11 : return bench_t<11,11,11>(t);
+    case 12 : return bench_t<12,12,12>(t);
+  }
+  return 0;  // size outside 1..12: nothing is benchmarked and 0 is reported
+}
+
+int main(int argc, char **argv)
+{
+  // Benchmarks every "m n k t" entry listed in lazy_gemm_settings.txt and prints all rates as one row.
+  std::ifstream settings("lazy_gemm_settings.txt");
+  if(!settings.is_open()) { std::cerr << "Cannot open lazy_gemm_settings.txt" << std::endl; return 1; }  // no silent empty row
+  std::vector<double> results;
+  long m, n, k, t;
+  while(settings >> m >> n >> k >> t)
+  {
+    //std::cerr << "  Testing " << m << " " << n << " " << k << std::endl;
+    results.push_back( bench_mnk(m, n, k, t) );
+  }
+
+  std::cout << RowVectorXd::Map(results.data(), results.size());
+  return 0;
+}
diff --git a/bench/perf_monitoring/gemm/lazy_gemm_settings.txt b/bench/perf_monitoring/gemm/lazy_gemm_settings.txt
new file mode 100644
index 0000000..407d5d4
--- /dev/null
+++ b/bench/perf_monitoring/gemm/lazy_gemm_settings.txt
@@ -0,0 +1,15 @@
+1 1 1 0
+2 2 2 0
+3 3 3 0
+4 4 4 0
+4 4 4 1
+5 5 5 0
+6 6 6 0
+7 7 7 0
+7 7 7 1
+8 8 8 0
+9 9 9 0
+10 10 10 0
+11 11 11 0
+12 12 12 0
+12 12 12 1
diff --git a/bench/perf_monitoring/gemm/make_plot.sh b/bench/perf_monitoring/gemm/make_plot.sh
new file mode 100755
index 0000000..cd3214a
--- /dev/null
+++ b/bench/perf_monitoring/gemm/make_plot.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# $1: base name of the bench results; the script reads $1.out
+# and generates $1.pdf
+# $2: bench name; curve titles are taken from $2_settings.txt
+WHAT=$1
+bench=$2
+# Header row: "rev" plus one quoted title per non-empty settings line; ncols counts those titles.
+header="rev "
+ncols=0
+while read line
+do
+  if [ -n "$line" ]; then
+    header="$header  \"$line\""
+    ncols=$((ncols+1))
+  fi
+done < $bench"_settings.txt"
+
+echo $header > $WHAT.out.header
+cat $WHAT.out >> $WHAT.out.header
+
+echo "set title '$WHAT'" > $WHAT.gnuplot
+echo "set key autotitle columnhead outside " >> $WHAT.gnuplot
+echo "set xtics rotate 1" >> $WHAT.gnuplot
+
+echo "set term pdf color rounded enhanced fontscale 0.35 size 7in,5in" >> $WHAT.gnuplot
+echo set output "'"$WHAT.pdf"'" >> $WHAT.gnuplot
+
+echo "plot for [col=2:$ncols+1] '$WHAT.out.header' using 0:col:xticlabels(1) with lines" >> $WHAT.gnuplot
+echo " " >>  $WHAT.gnuplot
+
+gnuplot -persist < $WHAT.gnuplot
+
+# generate a png file
+# convert -background white -density 120 -rotate 90 -resize 800 +dither -colors 256 -quality 0 $WHAT.ps -background white -flatten  .$WHAT.png
+
+# clean
+rm $WHAT.out.header $WHAT.gnuplot
\ No newline at end of file
diff --git a/bench/perf_monitoring/gemm/run.sh b/bench/perf_monitoring/gemm/run.sh
new file mode 100755
index 0000000..9d6ee40
--- /dev/null
+++ b/bench/perf_monitoring/gemm/run.sh
@@ -0,0 +1,156 @@
+#!/bin/bash
+
+# ./run.sh gemm
+# ./run.sh lazy_gemm
+
+# Examples of environment variables to be set:
+#   PREFIX="haswell-fma-"
+#   CXX_FLAGS="-mfma"
+
+# Options:
+#   -up : enforce the recomputation of existing data, and keep best results as a merging strategy
+#   -s  : recompute selected changesets only and keep bests
+
+bench=$1
+
+# Option flags are detected as plain substrings of the whole argument string.
+# Shell patterns are used so no regex escape such as '\-up' has to be interpreted by grep.
+case "$*" in
+  *-up*) update=true ;;
+  *)     update=false ;;
+esac
+case "$*" in
+  *-s*) selected=true ;;
+  *)    selected=false ;;
+esac
+
+# Kept verbatim so test_current can search it for individual revisions.
+global_args="$*"
+
+if [ $selected == true ]; then
+ echo "Recompute selected changesets only and keep bests"
+elif [ $update == true ]; then
+ echo "(Re-)Compute all changesets and keep bests"
+else
+ echo "Skip previously computed changesets"
+fi
+
+
+
+# Fetch the Eigen sources on first use, otherwise update the existing clone.
+# NOTE(review): Bitbucket has retired Mercurial hosting - confirm this clone URL still resolves.
+if [ ! -d "eigen_src" ]; then
+  hg clone https://bitbucket.org/eigen/eigen eigen_src
+else
+  ( cd eigen_src && hg pull -u )  # subshell: a failed cd can no longer shift the script's working directory
+fi
+
+if [ -z "$CXX" ]; then  # fall back to g++ only when the caller did not provide CXX
+  CXX=g++
+fi
+
+function make_backup
+{
+  # Renames <base>.out to <base>.backup when a previous result table exists.
+  local base=$1
+  [ ! -f "$base.out" ] || mv "$base.out" "$base.backup"
+}
+
+function merge
+{
+  # Element-wise maximum of two whitespace-separated number lists ($1, $2);
+  # echoes $1 unchanged when the two lists differ in length.
+  n_new=$(echo $1 |  wc -w)
+  n_old=$(echo $2 |  wc -w)
+  if [ $n_new != $n_old ]; then
+    echo $1
+  else
+    new=( $1 ); old=( $2 )
+    best=""
+    for (( j=0 ; j<n_new ; j++ )); do
+      x=${new[$j]}; y=${old[$j]}
+      pick=$(echo "if ($x > $y) $x else $y " | bc -l)
+      best="$best $pick"
+    done
+    echo $best
+  fi
+}
+
+function test_current 
+{
+  rev=$1     # changeset id (callers pass the output of hg identify)
+  scalar=$2  # C++ scalar type, forwarded to the compiler as -DSCALAR
+  name=$3    # base name of the benchmark binary and of its .out/.backup files
+  # Fetch the values previously recorded for this rev; cut keeps the text from column 14 on (assumes a 12-char id plus a space - TODO confirm).
+  prev=""
+  if [ -e "$name.backup" ]; then
+    prev=`grep $rev "$name.backup" | cut -c 14-`
+  fi
+  res=$prev
+  count_rev=`echo $prev |  wc -w`  # number of values already recorded for this rev
+  count_ref=`cat $bench"_settings.txt" |  wc -l`  # number of settings lines, i.e. values expected
+  if echo "$global_args" | grep "$rev" > /dev/null; then  # does this rev appear in the command-line arguments?
+    rev_found=true
+  else
+    rev_found=false
+  fi
+#  echo $update et $selected et $rev_found because $rev et "$global_args"
+#  echo $count_rev et $count_ref
+  if [ $update == true ] || [ $count_rev != $count_ref ] || ([ $selected == true ] &&  [ $rev_found == true ]); then  # recompute when forced, incomplete, or selected
+    if $CXX -O2 -DNDEBUG -march=native $CXX_FLAGS -I eigen_src $bench.cpp -DSCALAR=$scalar -o $name; then
+      curr=`./$name`  # the benchmark prints one row of results on stdout
+      if [ $count_rev == $count_ref ]; then
+        echo "merge previous $prev"
+        echo "with new       $curr"
+      else
+        echo "got            $curr"
+      fi
+      res=`merge "$curr" "$prev"`  # merge keeps the larger value per column, or $curr when the counts differ
+#       echo $res
+      echo "$rev $res" >> $name.out
+    else
+      echo "Compilation failed, skip rev $rev"  # nothing is appended to $name.out for this rev
+    fi
+  else
+    echo "Skip existing results for $rev / $name"
+    echo "$rev $res" >> $name.out  # carry the previous values over unchanged
+  fi
+}
+
+# Move previous result tables aside; test_current reads them back from the .backup files.
+make_backup $PREFIX"s"$bench
+make_backup $PREFIX"d"$bench
+make_backup $PREFIX"c"$bench
+# For each changesets.txt line, the text before '#' is a revision; lines without alphanumerics are dropped.
+cut -f1 -d"#" < changesets.txt | grep -E '[[:alnum:]]' | while read rev
+do
+  if [ -n "$rev" ]; then  # double quotes so $rev is expanded and an empty revision is really skipped
+    echo "Testing rev $rev"
+    cd eigen_src
+    hg up -C $rev > /dev/null
+    actual_rev=`hg identify | cut -f1 -d' '`
+    cd ..
+    # Run the benchmark for float, double and complex<double> at this revision.
+    test_current $actual_rev float                  $PREFIX"s"$bench
+    test_current $actual_rev double                 $PREFIX"d"$bench
+    test_current $actual_rev "std::complex<double>" $PREFIX"c"$bench
+  fi
+done
+
+# Prints one result table: $1 = heading, $2 = scalar tag (s/d/c), $3 = trailing line.
+function show_results
+{
+  echo "$1"
+  cat $PREFIX"$2""$bench.out"
+  echo "$3"
+}
+show_results "Float:"   s " "
+show_results "Double:"  d ""
+show_results "Complex:" c ""
+
+# One plot per scalar type, in the same s/d/c order.
+for tag in s d c; do
+  ./make_plot.sh $PREFIX"$tag"$bench $bench
+done
+
+