
some polydouble improvements

Nicolas Winkler 4 years ago
parent
commit
d960960b0e
2 changed files with 166 additions and 17 deletions
  1. libmandel/src/CpuGeneratorsAVX.cpp (+67 -7)
  2. libmandel/src/CpuGeneratorsAVXFMA.cpp (+99 -10)

libmandel/src/CpuGeneratorsAVX.cpp (+67 -7)

@@ -363,6 +363,24 @@ static inline VecPair twoProd(__m256d a, __m256d b)
     return { p, err };
 }
 
+static inline VecPair twoSq(__m256d a)
+{
+    __m256d p = _mm256_mul_pd(a, a);
+    auto[a_hi, a_lo] = split(a);
+    __m256d err =
+        _mm256_add_pd(
+            _mm256_add_pd(
+                _mm256_sub_pd(_mm256_mul_pd(a_hi, a_hi), p),
+                _mm256_add_pd(
+                    _mm256_mul_pd(a_hi, a_lo),
+                    _mm256_mul_pd(a_hi, a_lo)
+                )
+            ),
+        _mm256_mul_pd(a_lo, a_lo)
+        );
+    return { p, err };
+}
+
 
 struct AvxDoubleDouble
 {
@@ -402,6 +420,20 @@ struct AvxDoubleDouble
         auto[r1, r2] = quickTwoSum(p1, p2);
         return AvxDoubleDouble{ r1, r2 };
     }
+
+    inline AvxDoubleDouble sq(void) const
+    {
+        auto[p1, p2] = twoSq(x[0]);
+        __m256d x01_2 = _mm256_mul_pd(_mm256_add_pd(x[0], x[0]), x[1]);
+        p2 = _mm256_add_pd(p2, x01_2);
+        auto[r1, r2] = quickTwoSum(p1, p2);
+        return AvxDoubleDouble{ r1, r2 };
+    }
+
+    inline AvxDoubleDouble twice(void) const
+    {
+        return AvxDoubleDouble{ _mm256_add_pd(x[0], x[0]), _mm256_add_pd(x[1], x[1]) };
+    }
 };
 
 
@@ -467,6 +499,34 @@ struct AvxTripleDouble
         auto[re1, re2] = quickTwoSum(q2, t3);
         return { re0, re1, re2 };
     }
+
+    inline AvxTripleDouble sq(void) const
+    {
+        auto twox0 = _mm256_add_pd(x[0], x[0]);
+        auto[p1_0, p2_0] = twoProd(x[0], x[0]);
+        auto[p2_1, p3_0] = twoProd(twox0, x[1]);
+
+        auto[t2, tl3] = twoSum(p2_0, p2_1);
+        auto t3 =
+            _mm256_add_pd(
+                tl3,
+                _mm256_add_pd(
+                    p3_0,
+                    _mm256_add_pd(
+                        _mm256_mul_pd(x[1], x[1]),
+                        _mm256_mul_pd(x[2], twox0)
+                    )
+                )
+            );
+        auto[re0, q2] = quickTwoSum(p1_0, t2);
+        auto[re1, re2] = quickTwoSum(q2, t3);
+        return { re0, re1, re2 };
+    }
+
+    inline AvxTripleDouble twice(void) const
+    {
+        return AvxTripleDouble{ _mm256_add_pd(x[0], x[0]), _mm256_add_pd(x[1], x[1]), _mm256_add_pd(x[2], x[2]) };
+    }
 };
 
 } // namespace avx_private
@@ -530,9 +590,9 @@ void generateDoubleDoubleAvx(long width, long height, float* data, bool parallel
 
             __m256d cmp = _mm256_cmp_pd(threshold, threshold, _CMP_LE_OQ);
             for (int k = 0; k < maxIter; k++) {
-                AvxDoubleDouble aa = a * a;
-                AvxDoubleDouble bb = b * b;
-                AvxDoubleDouble abab = a * b; abab = abab + abab;
+                AvxDoubleDouble aa = a.sq();
+                AvxDoubleDouble bb = b.sq();
+                AvxDoubleDouble abab = a * b.twice();
                 a = aa - bb + cx;
                 b = abab + cy;
                 if (smooth) {
@@ -627,9 +687,9 @@ void generateTripleDoubleAvx(long width, long height, float* data, bool parallel
 
             __m256d cmp = _mm256_cmp_pd(threshold, threshold, _CMP_LE_OQ);
             for (int k = 0; k < maxIter; k++) {
-                AvxTripleDouble aa = a * a;
-                AvxTripleDouble bb = b * b;
-                AvxTripleDouble abab = a * b; abab = abab + abab;
+                AvxTripleDouble aa = a.sq();
+                AvxTripleDouble bb = b.sq();
+                AvxTripleDouble abab = a * b.twice();
                 a = aa - bb + cx;
                 b = abab + cy;
                 if (smooth) {
@@ -654,7 +714,7 @@ void generateTripleDoubleAvx(long width, long height, float* data, bool parallel
                 if (smooth)
                     data[i + k + j * width] = float(ftRes[k] < 0 ? maxIter :
                         ftRes[k] >= maxIter ? maxIter :
-                        ((float)ftRes[k]) + 1 - floatLog2(::floatLog(float(resa[k] * resa[k] + resb[k] * resb[k])) / 2));
+                        ((float)ftRes[k]) + 1 - floatLog2(::floatLog(float(resa[k] * resa[k] + resb[k] * resb[k])) * 0.5f));
                 else
                     data[i + k + j * width] = ftRes[k] >= 0 ? float(ftRes[k]) : maxIter;
             }
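The new twoSq and AvxDoubleDouble::sq routines above exploit the symmetry of squaring: for a double-double value hi + lo, (hi + lo)^2 = hi^2 + 2*hi*lo + lo^2, so one error-free square plus a single doubled cross term replaces the two cross products needed by the general multiply. A minimal scalar sketch of the same identity, with hypothetical helper names that are not part of this commit:

#include <utility>

// Error-free square of a double via Veltkamp splitting (no FMA),
// mirroring the vectorized twoSq above.
static std::pair<double, double> twoSqScalar(double a)
{
    const double splitter = 134217729.0;           // 2^27 + 1
    double t = splitter * a;
    double a_hi = t - (t - a);
    double a_lo = a - a_hi;
    double p = a * a;
    double err = ((a_hi * a_hi - p) + 2.0 * (a_hi * a_lo)) + a_lo * a_lo;
    return { p, err };                             // p + err equals a*a exactly
}

// Double-double square: square the high part error-free, fold in the
// doubled cross term 2*hi*lo, then renormalize with a quick two-sum.
static std::pair<double, double> ddSq(double hi, double lo)
{
    auto [p1, p2] = twoSqScalar(hi);
    p2 += 2.0 * hi * lo;
    double s = p1 + p2;
    double e = p2 - (s - p1);
    return { s, e };
}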

libmandel/src/CpuGeneratorsAVXFMA.cpp (+99 -10)

@@ -417,6 +417,21 @@ static inline VecTriple sixThreeSum(__m256d a, __m256d b, __m256d c,
     return { r0, r1, r2 };
 }
 
+static inline VecPair sixTwoSum(__m256d a, __m256d b, __m256d c,
+                                    __m256d d, __m256d e, __m256d f)
+{
+    auto[x0, x1, x2] = threeSum(a, b, c);
+    auto[y0, y1, y2] = threeSum(d, e, f);
+
+    auto[r0, t0] = twoSum(x0, y0);
+    auto[t1, t2] = twoSum(x1, y1);
+    auto[r1, t3] = twoSum(t0, t1);
+    r1 = _mm256_add_pd(r1, _mm256_add_pd(x2, y2));
+    r1 = _mm256_add_pd(r1, t3);
+
+    return { r0, r1 };
+}
+
 static inline VecPair addDD(const VecPair& a, const VecPair& b)
 {
     auto[s, e] = twoSum(a.a, b.a);
@@ -470,6 +485,34 @@ static inline VecQuadruple renormalize(__m256d x0, __m256d x1, __m256d x2, __m25
     return { b[0], b[1], b[2], b[3] };
 }
 
+
+static inline VecQuadruple renorm1(__m256d x0, __m256d x1, __m256d x2, __m256d x3, __m256d x4)
+{
+    auto [r0, t0] = quickTwoSum(x0, x1);
+    auto [r1, t1] = quickTwoSum(t0, x2);
+    auto [r2, t2] = quickTwoSum(t1, x3);
+    auto r3 = _mm256_add_pd(t2, x4);
+
+    return { r0, r1, r2, r3 };
+}
+
+
+static inline VecQuadruple renorm2(__m256d x0, __m256d x1, __m256d x2, __m256d x3, __m256d x4)
+{
+    auto [st1, t4] = quickTwoSum(x3, x4);
+    auto [st2, t3] = quickTwoSum(x2, st1);
+    auto [st3, t2] = quickTwoSum(x1, st2);
+    auto [t0, t1] = quickTwoSum(x0, st3);
+
+    __m256d e = t0;
+
+    auto [r0, e1] = quickTwoSum(e, t1);
+    auto [r1, e2] = quickTwoSum(e1, t2);
+    auto [r2, e3] = quickTwoSum(e2, t3);
+    auto r3 = _mm256_add_pd(e3, t4);
+    return { r0, r1, r2, r3 };
+}
+
 static inline VecPair twoProd(__m256d a, __m256d b)
 {
     __m256d p = _mm256_mul_pd(a, b);
@@ -515,6 +558,21 @@ struct AvxDoubleDouble
         auto[r1, r2] = quickTwoSum(p1, p2);
         return AvxDoubleDouble{ r1, r2 };
     }
+
+    inline AvxDoubleDouble sq(void) const
+    {
+        auto[p1, p2] = twoProd(x[0], x[0]);
+        __m256d x01 = _mm256_mul_pd(x[1], x[0]);
+        p2 = _mm256_add_pd(p2, _mm256_add_pd(x01, x01));
+        auto[r1, r2] = quickTwoSum(p1, p2);
+        return AvxDoubleDouble{ r1, r2 };
+    }
+
+    inline AvxDoubleDouble mul_pow2(double v) const
+    {
+        __m256d vv = _mm256_set1_pd(v);
+        return { _mm256_mul_pd(vv, x[0]), _mm256_mul_pd(vv, x[1]) };
+    }
 };
 
 
@@ -580,9 +638,9 @@ void CpuGenerator<mnd::DoubleDouble, mnd::X86_AVX_FMA, parallel>::generate(const
 
             __m256d cmp = _mm256_cmp_pd(threshold, threshold, _CMP_LE_OQ);
             for (int k = 0; k < info.maxIter; k++) {
-                AvxDoubleDouble aa = a * a;
-                AvxDoubleDouble bb = b * b;
-                AvxDoubleDouble abab = a * b; abab = abab + abab;
+                AvxDoubleDouble aa = a.sq();
+                AvxDoubleDouble bb = b.sq();
+                AvxDoubleDouble abab = a * b.mul_pow2(2.0);
                 a = aa - bb + cx;
                 b = abab + cy;
                 if (info.smooth) {
@@ -592,7 +650,7 @@ void CpuGenerator<mnd::DoubleDouble, mnd::X86_AVX_FMA, parallel>::generate(const
                 cmp = _mm256_cmp_pd(_mm256_add_pd(aa.x[0], bb.x[0]), threshold, _CMP_LE_OQ);
                 adder = _mm256_and_pd(adder, cmp);
                 counter = _mm256_add_pd(counter, adder);
-                if (_mm256_testz_si256(_mm256_castpd_si256(cmp), _mm256_castpd_si256(cmp)) != 0) {
+                if ((k & 0x7) && _mm256_testz_si256(_mm256_castpd_si256(cmp), _mm256_castpd_si256(cmp)) != 0) {
                     break;
                 }
             }
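The mul_pow2(2.0) call above replaces the earlier abab + abab doubling: multiplying each component of a double-double by an exact power of two only shifts the exponent, so both halves are scaled without rounding and the pair stays normalized with no renormalization step. A scalar illustration under that assumption, with hypothetical names not taken from this commit:

#include <utility>

// Scaling a double-double (hi, lo) by an exact power of two scales each
// component exactly, so the result needs no quickTwoSum afterwards.
static std::pair<double, double> ddMulPow2(const std::pair<double, double>& x, double v)
{
    return { v * x.first, v * x.second };          // exact when v is a power of two
}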
@@ -648,7 +706,7 @@ struct AvxQuadDouble
         auto [r3, t3, _t4] = threeSum(s3, e2, t1);
         auto [r4, _t5, _t6] = threeSum(e3, t3, t2);
 
-        auto [re0, re1, re2, re3] = renormalize(r0, r1, r2, r3, r4);
+        auto [re0, re1, re2, re3] = renorm1(r0, r1, r2, r3, r4);
         return { re0, re1, re2, re3 };
     }
 
@@ -665,7 +723,7 @@ struct AvxQuadDouble
         auto [r3, t3, _t4] = threeSum(s3, e2, t1);
         auto [r4, _t5, _t6] = threeSum(e3, t3, t2);
 
-        auto [re0, re1, re2, re3] = renormalize(r0, r1, r2, r3, r4);
+        auto [re0, re1, re2, re3] = renorm1(r0, r1, r2, r3, r4);
         return { re0, re1, re2, re3 };
     }
 
@@ -688,7 +746,38 @@ struct AvxQuadDouble
         auto[r3, e1] = nineTwoSum(d0, d1, d2, d3, d4, d5, d6, d7, d8);
         auto r4 = _mm256_add_pd(e0, e1);
 
-        auto [n0, n1, n2, n3] = renormalize(r0, r1, r2, r3, r4);
+        auto [n0, n1, n2, n3] = renorm2(r0, r1, r2, r3, r4);
+
+        return { n0, n1, n2, n3 };
+    }
+
+    inline AvxQuadDouble mul_pow2(double v) const
+    {
+        __m256d vv = _mm256_set1_pd(v);
+        return { _mm256_mul_pd(vv, x[0]), _mm256_mul_pd(vv, x[1]),
+                 _mm256_mul_pd(vv, x[2]), _mm256_mul_pd(vv, x[3]) };
+    }
+
+    inline AvxQuadDouble sq(void) const
+    {
+        auto[a0, b0] = twoProd(x[0], x[0]);
+        auto[b1, c0] = twoProd(x[0], x[1]);
+        //auto[b2, c1] = twoProd(x[0], x[1]); //
+        auto[c2, d0] = twoProd(x[0], x[2]);
+        auto[c3, d1] = twoProd(x[1], x[1]);
+        //auto[c4, d2] = twoProd(x[0], x[2]); //
+        auto d5 = _mm256_mul_pd(x[3], x[0]);
+        auto d6 = _mm256_mul_pd(x[2], x[1]);
+        //auto d7 = _mm256_mul_pd(x[1], x[2]); //
+        //auto d8 = _mm256_mul_pd(x[0], x[3]); //
+
+        auto r0 = a0;
+        auto[r1, c5] = twoSum(b0, _mm256_add_pd(b1, b1)); // d3
+        auto[r2, d4, e0] = sixThreeSum(_mm256_add_pd(c0, c0), /*c0*/ _mm256_set1_pd(0.0), c2, c3, c2, c5);
+        auto[r3, e1] = sixTwoSum(d0, d1, d0, d4, _mm256_add_pd(d5, d5), _mm256_add_pd(d6, d6));
+        auto r4 = _mm256_add_pd(e0, e1);
+
+        auto [n0, n1, n2, n3] = renorm2(r0, r1, r2, r3, r4);
 
         return { n0, n1, n2, n3 };
     }
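AvxQuadDouble::sq above mirrors the general multiply but folds the symmetric cross products (left in place as the commented-out twoProd calls) by doubling the surviving ones, grouped by order of magnitude. A rough scalar sketch of that term grouping, ignoring the error-free transforms and using hypothetical names:

#include <array>

// Which products feed each output order when squaring x0 + x1 + x2 + x3
// (components in decreasing magnitude); rounding-error terms are omitted.
static std::array<double, 4> quadSqTerms(const std::array<double, 4>& x)
{
    double o0 = x[0] * x[0];                            // -> r0
    double o1 = 2.0 * x[0] * x[1];                      // -> r1
    double o2 = 2.0 * x[0] * x[2] + x[1] * x[1];        // -> r2
    double o3 = 2.0 * x[0] * x[3] + 2.0 * x[1] * x[2];  // -> r3
    return { o0, o1, o2, o3 };
}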
@@ -779,9 +868,9 @@ void CpuGenerator<mnd::QuadDouble, mnd::X86_AVX_FMA, parallel>::generate(const m
 
             __m256d cmp = _mm256_cmp_pd(threshold, threshold, _CMP_LE_OQ);
             for (int k = 0; k < info.maxIter; k++) {
-                AvxQuadDouble aa = a * a;
-                AvxQuadDouble bb = b * b;
-                AvxQuadDouble abab = a * b; abab = abab + abab;
+                AvxQuadDouble aa = a.sq();
+                AvxQuadDouble bb = b.sq();
+                AvxQuadDouble abab = a * b.mul_pow2(2.0);
                 a = aa - bb + cx;
                 b = abab + cy;
                 if (info.smooth) {