5 年前 · c9e48a7eb1
--- a/libmandel/src/CpuGeneratorsAVXFMA.cpp
+++ b/libmandel/src/CpuGeneratorsAVXFMA.cpp
@@ -383,6 +383,14 @@ static inline VecTriple threeSum(__m256d a, __m256d b, __m256d c)
 
				 }
			
 
				 
			
 
				 
			
 
				+static inline VecPair threeTwoSum(__m256d a, __m256d b, __m256d c)
			
 
				+{
			
 
				+    auto[t, e1] = twoSum(a, b);
			
 
				+    auto[s, e2] = twoSum(t, c);
			
 
				+    return { s, _mm256_add_pd(e1, e2) };
			
 
				+}
			
 
				+
			
 
				+
			
 
				 static inline __m256d threeOneSum(__m256d a, __m256d b, __m256d c)
			
 
				 {
			
 
				     return _mm256_add_pd(a, _mm256_add_pd(b, c));
			
@@ -404,6 +412,31 @@ static inline VecTriple sixThreeSum(__m256d a, __m256d b, __m256d c,
 
				     return { r0, r1, r2 };
			
 
				 }
			
 
				 
			
 
				+static inline VecPair addDD(const VecPair& a, const VecPair& b)
			
 
				+{
			
 
				+    auto[s, e] = twoSum(a.a, b.a);
			
 
				+    e = _mm256_add_pd(e, _mm256_add_pd(a.b, b.b));
			
 
				+    auto[r1, r2] = quickTwoSum(s, e);
			
 
				+    return { r1, r2 };
			
 
				+}
			
 
				+
			
 
				+static inline VecPair nineTwoSum(__m256d a, __m256d b, __m256d c,
			
 
				+                                    __m256d d, __m256d e, __m256d f,
			
 
				+                                    __m256d g, __m256d h, __m256d i)
			
 
				+{
			
 
				+    auto[x1, x2] = twoSum(a, d);
			
 
				+    auto[y1, y2] = twoSum(b, c);
			
 
				+    auto[z1, z2] = twoSum(e, h);
			
 
				+    auto[u1, u2] = twoSum(f, g);
			
 
				+
			
 
				+    auto[t1, t2] = addDD({ x1, x2 }, { y1, y2 });
			
 
				+    auto[t3, t4] = addDD({ z1, z2 }, { u1, u2 });
			
 
				+
			
 
				+    auto[t5, t6] = addDD({ t1, t2 }, { t3, t4 });
			
 
				+
			
 
				+    return threeTwoSum(t5, t6, i);
			
 
				+}
			
 
				+
			
 
				 static inline VecQuadruple renormalize(__m256d x0, __m256d x1, __m256d x2, __m256d x3, __m256d x4)
			
 
				 {
			
 
				     auto [st1, t4] = quickTwoSum(x3, x4);
			
@@ -638,12 +671,18 @@ struct AvxQuadDouble
 
				         auto[c2, d0] = twoProd(x[0], sm.x[2]);
			
 
				         auto[c3, d1] = twoProd(x[1], sm.x[1]);
			
 
				         auto[c4, d2] = twoProd(x[2], sm.x[0]);
			
 
				+        auto d5 = _mm256_mul_pd(x[3], sm.x[0]);
			
 
				+        auto d6 = _mm256_mul_pd(x[2], sm.x[1]);
			
 
				+        auto d7 = _mm256_mul_pd(x[1], sm.x[2]);
			
 
				+        auto d8 = _mm256_mul_pd(x[0], sm.x[3]);
			
 
				 
			
 
				         auto r0 = a0;
			
 
				         auto[r1, c5, d3] = threeSum(b0, b1, b2);
			
 
				         auto[r2, d4, e0] = sixThreeSum(c0, c1, c2, c3, c4, c5);
			
 
				+        auto[r3, e1] = nineTwoSum(d0, d1, d2, d3, d4, d5, d6, d7, d8);
			
 
				+        auto r4 = _mm256_add_pd(e0, e1);
			
 
				 
			
 
				-        auto [n0, n1, n2, n3] = renormalize(r0, r1, r2, _mm256_set1_pd(0.0), _mm256_set1_pd(0.0));
			
 
				+        auto [n0, n1, n2, n3] = renormalize(r0, r1, r2, r3, r4);
			
 
				 
			
 
				         return { n0, n1, n2, n3 };
			
 
				     }
			
--- a/libmandel/src/Mandel.cpp
+++ b/libmandel/src/Mandel.cpp
@@ -219,6 +219,7 @@ std::unique_ptr<mnd::AdaptiveGenerator> MandelContext::createAdaptiveGenerator(v
 
				         floatGen = getCpuGenerator(GeneratorType::FLOAT_AVX_FMA);
			
 
				         doubleGen = getCpuGenerator(GeneratorType::DOUBLE_AVX_FMA);
			
 
				         doubleDoubleGen = getCpuGenerator(GeneratorType::DOUBLE_DOUBLE_AVX_FMA);
			
 
				+        quadDoubleGen = getCpuGenerator(GeneratorType::QUAD_DOUBLE_AVX_FMA);
			
 
				     }
			
 
				     if (cpuInfo.hasAvx512()) {
			
 
				         floatGen = getCpuGenerator(GeneratorType::FLOAT_AVX512);