-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathparallelize.patch
More file actions
94 lines (92 loc) · 4.79 KB
/
parallelize.patch
File metadata and controls
94 lines (92 loc) · 4.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index bc5ebc52ff..1caa860666 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -1478,11 +1478,12 @@ EIGEN_DONT_INLINE void gebp_kernel<LhsScalar, RhsScalar, Index, DataMapper, mr,
}
}
#endif
- for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
+ tbb::parallel_for(packet_cols8 / 4, packet_cols4 / 4, [&](Index j2) {  // one index per 4-column block; presumably both bounds are multiples of 4 - TODO confirm
+ j2 *= 4;  // map the block index back to its first column (the replaced loop stepped by 4)
for (Index i = i1; i < actual_panel_end; i += 2 * LhsProgress) {
micro_panel(fix<2>, fix<4>, traits, i, j2);
}
- }
+ });  // NOTE(review): assumes micro_panel calls with different j2 touch disjoint state - confirm before relying on this
for (Index j2 = packet_cols4; j2 < cols; j2++) {
for (Index i = i1; i < actual_panel_end; i += 2 * LhsProgress) {
micro_panel(fix<2>, fix<1>, traits, i, j2);
diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h
index 5f6c3958d7..657339d6c4 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -228,7 +228,21 @@ general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjugateLh
for (Index j2 = 0; j2 < cols; j2 += block_cols) {
Index jend = numext::mini(j2 + block_cols, cols);
Index i = 0;
- for (; i < n8; i += ResPacketSize * 8) process_rows<8>(i, j2, jend, lhs, rhs, res, palpha, pcj);
+ for (; i < n8; i += ResPacketSize * 8)
+ for (auto x = 0; x < 8; ++x) {  // handle the 8 result packets of this row block one at a time
+ const auto offset = i + ResPacketSize * x;
+ pstoreu(res + offset, pmadd(tbb::parallel_deterministic_reduce(  // stores pmadd(column sum, palpha, old res)
+ tbb::blocked_range<Index>(j2, jend), pzero(ResPacket{}),  // zero packet: identity of the packet sum
+ [&](const tbb::blocked_range<Index>& r, ResPacket running_total) {
+ for (auto j = r.begin(); j < r.end(); ++j)
+ running_total = pcj.pmadd(  // packet primitive, not operator+=: packet types need not define operators
+ lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * x, j),
+ pset1<RhsPacket>(rhs(j, 0)), running_total);
+ return running_total;
+ },
+ [](const ResPacket& a, const ResPacket& b) { return padd(a, b); }),  // join partial sums with padd, not std::plus
+ palpha, ploadu<ResPacket>(res + offset)));
+ }
if (i < n4) {
process_rows<4>(i, j2, jend, lhs, rhs, res, palpha, pcj);
i += ResPacketSize * 4;
diff --git a/Eigen/src/SVD/BDCSVDImpl.h b/Eigen/src/SVD/BDCSVDImpl.h
index 69e7156a45..a260fe380b 100644
--- a/Eigen/src/SVD/BDCSVDImpl.h
+++ b/Eigen/src/SVD/BDCSVDImpl.h
@@ -374,6 +374,16 @@ typename bdcsvd_impl<RealScalar_>::RealScalar bdcsvd_impl<RealScalar_>::secularE
const IndicesRef& perm,
const ArrayRef& diagShifted,
RealScalar shift) {
+ return RealScalar(1) + tbb::parallel_deterministic_reduce(  // the leading 1 is added once, outside the reduction
+ tbb::blocked_range<Index>(Index(0), perm.size()), RealScalar(0),  // identity must be 0: it seeds every sub-range
+ [&](const tbb::blocked_range<Index>& r, RealScalar running_total) {
+ for (auto i = r.begin(); i < r.end(); ++i) {
+ const Index j = perm(i);  // visit entries in the order given by perm
+ running_total += (col0(j) / (diagShifted(j) - mu)) * (col0(j) / (diag(j) + shift + mu));
+ }
+ return running_total;
+ },
+ std::plus<>());  // NOTE(review): the original serial loop that follows this return is now unreachable
Index m = perm.size();
RealScalar res = Literal(1);
for (Index i = 0; i < m; ++i) {
@@ -595,6 +605,8 @@ template <typename RealScalar_>
void bdcsvd_impl<RealScalar_>::computeSingVecs(const ArrayRef& zhat, const ArrayRef& diag, const IndicesRef& perm,
const VectorType& singVals, const ArrayRef& shifts, const ArrayRef& mus,
MatrixXr& U, MatrixXr& V) {
+ tbb::affinity_partitioner ap;  // per-call object: a function-local static would be mutable state shared by concurrent callers
+
Index n = zhat.size();
Index m = perm.size();
@@ -604,10 +616,13 @@ void bdcsvd_impl<RealScalar_>::computeSingVecs(const ArrayRef& zhat, const Array
if (m_compV) V.col(k) = VectorType::Unit(n, k);
} else {
U.col(k).setZero();
- for (Index l = 0; l < m; ++l) {
- Index i = perm(l);
- U(i, k) = zhat(i) / (((diag(i) - shifts(k)) - mus(k))) / ((diag(i) + singVals[k]));
- }
+ tbb::parallel_for(  // parallel replacement for the serial loop over l in [0, m)
+ Index(0), m,
+ [&](const Index l) {
+ const Index i = perm(l);  // row index comes from the permutation
+ U(i, k) = zhat(i) / (((diag(i) - shifts(k)) - mus(k))) / ((diag(i) + singVals[k]));
+ },  // NOTE(review): race-free only if perm holds distinct indices, so no two tasks write the same U(i, k) - confirm
+ ap);  // the tbb::affinity_partitioner declared at the top of computeSingVecs
U(n, k) = Literal(0);
U.col(k).normalize();