From 231ce21535c7bc0a145b581f823c6da00da175a9 Mon Sep 17 00:00:00 2001
From: Ilya Tokar <tokarip@google.com>
Date: Fri, 12 Jun 2020 17:20:42 -0400
Subject: [PATCH] Run two independent chains when reducing tensors.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Running two independent accumulator chains exposes more instruction-level
parallelism, because both chains can execute at the same time.

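Results are a bit noisy, but for medium lengths we almost hit
the theoretical upper bound of 2x. The columns below are the time
before the change, the time after it, and the relative difference.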

BM_fullReduction_16T/3        [using 16 threads]       17.3ns ±11%        17.4ns ± 9%        ~           (p=0.178 n=18+19)
BM_fullReduction_16T/4        [using 16 threads]       17.6ns ±17%        17.0ns ±18%        ~           (p=0.835 n=20+19)
BM_fullReduction_16T/7        [using 16 threads]       18.9ns ±12%        18.2ns ±10%        ~           (p=0.756 n=20+18)
BM_fullReduction_16T/8        [using 16 threads]       19.8ns ±13%        19.4ns ±21%        ~           (p=0.512 n=20+20)
BM_fullReduction_16T/10       [using 16 threads]       23.5ns ±15%        20.8ns ±24%     -11.37%        (p=0.000 n=20+19)
BM_fullReduction_16T/15       [using 16 threads]       35.8ns ±21%        26.9ns ±17%     -24.76%        (p=0.000 n=20+19)
BM_fullReduction_16T/16       [using 16 threads]       38.7ns ±22%        27.7ns ±18%     -28.40%        (p=0.000 n=20+19)
BM_fullReduction_16T/31       [using 16 threads]        146ns ±17%          74ns ±11%     -49.05%        (p=0.000 n=20+18)
BM_fullReduction_16T/32       [using 16 threads]        154ns ±19%          84ns ±30%     -45.79%        (p=0.000 n=20+19)
BM_fullReduction_16T/64       [using 16 threads]        603ns ± 8%         308ns ±12%     -48.94%        (p=0.000 n=17+17)
BM_fullReduction_16T/128      [using 16 threads]       2.44µs ±13%        1.22µs ± 1%     -50.29%        (p=0.000 n=17+17)
BM_fullReduction_16T/256      [using 16 threads]       9.84µs ±14%        5.13µs ±30%     -47.82%        (p=0.000 n=19+19)
BM_fullReduction_16T/512      [using 16 threads]       78.0µs ± 9%        56.1µs ±17%     -28.02%        (p=0.000 n=18+20)
BM_fullReduction_16T/1k       [using 16 threads]        325µs ± 5%         263µs ± 4%     -19.00%        (p=0.000 n=20+16)
BM_fullReduction_16T/2k       [using 16 threads]       1.09ms ± 3%        0.99ms ± 1%      -9.04%        (p=0.000 n=20+20)
BM_fullReduction_16T/4k       [using 16 threads]       7.66ms ± 3%        7.57ms ± 3%      -1.24%        (p=0.017 n=20+20)
BM_fullReduction_16T/10k      [using 16 threads]       65.3ms ± 4%        65.0ms ± 3%        ~           (p=0.718 n=20+20)
---
 .../Eigen/CXX11/src/Tensor/TensorReduction.h       | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 8332a9ae0..af9b58816 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -242,14 +242,26 @@ struct InnerMostDimReducer<Self, Op, true, true> {
       }
       return reducer.finalize(accum);
     } else {
+      const typename Self::Index UnrollSize =
+          (numValuesToReduce / (2*packetSize)) * 2*packetSize;
       const typename Self::Index VectorizedSize =
           (numValuesToReduce / packetSize) * packetSize;
       typename Self::PacketReturnType paccum =
           reducer.template initializePacket<typename Self::PacketReturnType>();
-      for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
+      typename Self::PacketReturnType paccum2 =
+          reducer.template initializePacket<typename Self::PacketReturnType>();
+      for (typename Self::Index j = 0; j < UnrollSize; j += packetSize * 2) {
         reducer.reducePacket(
             self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
+        reducer.reducePacket(
+            self.m_impl.template packet<Unaligned>(firstIndex + j + packetSize),
+            &paccum2);
+      }
+      for (typename Self::Index j = UnrollSize; j < VectorizedSize; j+= packetSize) {
+        reducer.reducePacket(self.m_impl.template packet<Unaligned>(
+                                 firstIndex + j), &paccum);
       }
+      reducer.reducePacket(paccum2, &paccum);
       for (typename Self::Index j = VectorizedSize; j < numValuesToReduce;
            ++j) {
         reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
-- 
GitLab