浏览代码

even faster doubles

Nicolas Winkler 5 年之前
父节点
当前提交
8c2d55b29e
共有 1 个文件被更改，包括 8 次插入和 4 次删除
  1. 8 4
      libmandel/src/CpuGeneratorsAVXFMA.cpp

+ 8 - 4
libmandel/src/CpuGeneratorsAVXFMA.cpp

@@ -178,13 +178,17 @@ void CpuGenerator<double, mnd::X86_AVX_FMA, parallel>::generate(const mnd::Mande
 
             for (int k = 0; k < info.maxIter; k++) {
                 __m256d ab = _mm256_mul_pd(a, b);
+                __m256d bb = _mm256_mul_pd(b, b);
                 __m256d ab2 = _mm256_mul_pd(a2, b2);
-                a = _mm256_fmsub_pd(a, a, _mm256_fmsub_pd(b, b, cx));
-                a2 = _mm256_fmsub_pd(a2, a2, _mm256_fmsub_pd(b2, b2, cx2));
+                __m256d bb2 = _mm256_mul_pd(b2, b2);
+                a = _mm256_fmsub_pd(a, a, bb);
+                a = _mm256_add_pd(a, cx);
+                a2 = _mm256_fmsub_pd(a2, a2, bb2);
+                a2 = _mm256_add_pd(a2, cx2);
                 b = _mm256_fmadd_pd(two, ab, cy);
                 b2 = _mm256_fmadd_pd(two, ab2, cy);
-                __m256d cmp = _mm256_cmp_pd(_mm256_fmadd_pd(a, a, _mm256_mul_pd(b, b)), threshold, _CMP_LE_OQ);
-                __m256d cmp2 = _mm256_cmp_pd(_mm256_fmadd_pd(a2, a2, _mm256_mul_pd(b2, b2)), threshold, _CMP_LE_OQ);
+                __m256d cmp = _mm256_cmp_pd(_mm256_fmadd_pd(a, a, bb), threshold, _CMP_LE_OQ);
+                __m256d cmp2 = _mm256_cmp_pd(_mm256_fmadd_pd(a2, a2, bb2), threshold, _CMP_LE_OQ);
                 if (info.smooth) {
                     resultsa = _mm256_or_pd(_mm256_andnot_pd(cmp, resultsa), _mm256_and_pd(cmp, a));
                     resultsb = _mm256_or_pd(_mm256_andnot_pd(cmp, resultsb), _mm256_and_pd(cmp, b));