Browse Source

smooth avx

Nicolas Winkler 5 years ago
parent
commit
9c9bd61485
2 changed files with 74 additions and 40 deletions
  1. 54 31
      libmandel/src/CpuGeneratorsAVX.cpp
  2. 20 9
      libmandel/src/CpuGeneratorsAVXFMA.cpp

+ 54 - 31
libmandel/src/CpuGeneratorsAVX.cpp

@@ -36,22 +36,22 @@ void CpuGenerator<float, mnd::X86_AVX, parallel>::generate(const mnd::MandelInfo
         long i = 0;
         for (i; i < info.bWidth; i += 8) {
             __m256 xs = {
-                float(view.x + double(i) * view.width / info.bWidth),
-                float(view.x + double(i + 1) * view.width / info.bWidth),
-                float(view.x + double(i + 2) * view.width / info.bWidth),
-                float(view.x + double(i + 3) * view.width / info.bWidth),
-                float(view.x + double(i + 4) * view.width / info.bWidth),
-                float(view.x + double(i + 5) * view.width / info.bWidth),
-                float(view.x + double(i + 6) * view.width / info.bWidth),
-                float(view.x + double(i + 7) * view.width / info.bWidth)
+                float(view.x + float(i) * view.width / info.bWidth),
+                float(view.x + float(i + 1) * view.width / info.bWidth),
+                float(view.x + float(i + 2) * view.width / info.bWidth),
+                float(view.x + float(i + 3) * view.width / info.bWidth),
+                float(view.x + float(i + 4) * view.width / info.bWidth),
+                float(view.x + float(i + 5) * view.width / info.bWidth),
+                float(view.x + float(i + 6) * view.width / info.bWidth),
+                float(view.x + float(i + 7) * view.width / info.bWidth)
             };
 
-            __m256 counter = {0, 0, 0, 0, 0, 0, 0, 0};
-            __m256 adder = {1, 1, 1, 1, 1, 1, 1, 1};
-            __m256 resultsa = {0, 0, 0, 0, 0, 0, 0, 0};
-            __m256 resultsb = {0, 0, 0, 0, 0, 0, 0, 0};
+            __m256 counter = { 0, 0, 0, 0, 0, 0, 0, 0 };
+            __m256 adder = { 1, 1, 1, 1, 1, 1, 1, 1 };
+            __m256 resultsa = { 0, 0, 0, 0, 0, 0, 0, 0 };
+            __m256 resultsb = { 0, 0, 0, 0, 0, 0, 0, 0 };
 
-            __m256 threshold = {16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f};
+            __m256 threshold = { 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f };
 
             __m256 a = xs;
             __m256 b = ys;
@@ -129,6 +129,9 @@ void CpuGenerator<double, mnd::X86_AVX, parallel>::generate(const mnd::MandelInf
             __m256d counter = { 0, 0, 0, 0 };
             __m256d adder = { 1, 1, 1, 1 };
 
+            __m256d resultsa = { 0, 0, 0, 0 };
+            __m256d resultsb = { 0, 0, 0, 0 };
+
             __m256d a = xs;
             __m256d b = ys;
 
@@ -138,14 +141,14 @@ void CpuGenerator<double, mnd::X86_AVX, parallel>::generate(const mnd::MandelInf
                 __m256d abab = _mm256_mul_pd(a, b); abab = _mm256_add_pd(abab, abab);
                 a = _mm256_add_pd(_mm256_sub_pd(aa, bb), xs);
                 b = _mm256_add_pd(abab, ys);
-                __m256i cmp = _mm256_castpd_si256(_mm256_cmp_pd(_mm256_add_pd(aa, bb), threshold, _CMP_LE_OQ));
-                /*if (info.smooth) {
-                    resultsa = _mm256_or_pd(_mm256_andnot_ps(cmp, resultsa), _mm256_and_ps(cmp, a));
-                    resultsb = _mm256_or_ps(_mm256_andnot_ps(cmp, resultsb), _mm256_and_ps(cmp, b));
-                }*/
-                adder = _mm256_and_pd(adder, _mm256_castsi256_pd(cmp));
+                __m256d cmp = _mm256_cmp_pd(_mm256_add_pd(aa, bb), threshold, _CMP_LE_OQ);
+                if (info.smooth) {
+                    resultsa = _mm256_or_pd(_mm256_andnot_pd(cmp, resultsa), _mm256_and_pd(cmp, a));
+                    resultsb = _mm256_or_pd(_mm256_andnot_pd(cmp, resultsb), _mm256_and_pd(cmp, b));
+                }
+                adder = _mm256_and_pd(adder, cmp);
                 counter = _mm256_add_pd(counter, adder);
-                if ((k & 0x7) == 0 && _mm256_testz_si256(cmp, cmp) != 0) {
+                if ((k & 0x3) == 0 && _mm256_testz_si256(_mm256_castpd_si256(cmp), _mm256_castpd_si256(cmp)) != 0) {
                     break;
                 }
             }
@@ -159,9 +162,17 @@ void CpuGenerator<double, mnd::X86_AVX, parallel>::generate(const mnd::MandelInf
 
             double resData[8];
             double* ftRes = alignVec(resData);
+            double* resa = (double*) &resultsa;
+            double* resb = (double*) &resultsb;
             _mm256_store_pd(ftRes, counter);
-            for (int k = 0; k < 4 && i + k < info.bWidth; k++)
-                data[i + k + j * info.bWidth] = ftRes[k] > 0 ? float(ftRes[k]) : info.maxIter;
+            for (int k = 0; k < 4 && i + k < info.bWidth; k++) {
+                if (info.smooth)
+                    data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter :
+                        ftRes[k] >= info.maxIter ? info.maxIter :
+                        ((float)ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f);
+                else
+                    data[i + k + j * info.bWidth] = ftRes[k] > 0 ? float(ftRes[k]) : info.maxIter;
+            }
         }
     }
 }
@@ -323,20 +334,23 @@ void CpuGenerator<mnd::DoubleDouble, mnd::X86_AVX, parallel>::generate(const mnd
             AvxDoubleDouble a = xs;
             AvxDoubleDouble b = ys;
 
+            __m256d resultsa;
+            __m256d resultsb;
+
             for (int k = 0; k < info.maxIter; k++) {
                 AvxDoubleDouble aa = a * a;
                 AvxDoubleDouble bb = b * b;
                 AvxDoubleDouble abab = a * b; abab = abab + abab;
                 a = aa - bb + xs;
                 b = abab + ys;
-                __m256i cmp = _mm256_castpd_si256(_mm256_cmp_pd(_mm256_add_pd(aa.x[0], bb.x[0]), threshold, _CMP_LE_OQ));
-                /*if (info.smooth) {
-                    resultsa = _mm256_or_pd(_mm256_andnot_ps(cmp, resultsa), _mm256_and_ps(cmp, a));
-                    resultsb = _mm256_or_ps(_mm256_andnot_ps(cmp, resultsb), _mm256_and_ps(cmp, b));
-                }*/
-                adder = _mm256_and_pd(adder, _mm256_castsi256_pd(cmp));
+                __m256d cmp = _mm256_cmp_pd(_mm256_add_pd(aa.x[0], bb.x[0]), threshold, _CMP_LE_OQ);
+                if (info.smooth) {
+                    resultsa = _mm256_or_pd(_mm256_andnot_pd(cmp, resultsa), _mm256_and_pd(cmp, a.x[0]));
+                    resultsb = _mm256_or_pd(_mm256_andnot_pd(cmp, resultsb), _mm256_and_pd(cmp, b.x[0]));
+                }
+                adder = _mm256_and_pd(adder, cmp);
                 counter = _mm256_add_pd(counter, adder);
-                if (_mm256_testz_si256(cmp, cmp) != 0) {
+                if (_mm256_testz_si256(_mm256_castpd_si256(cmp), _mm256_castpd_si256(cmp)) != 0) {
                     break;
                 }
             }
@@ -350,9 +364,18 @@ void CpuGenerator<mnd::DoubleDouble, mnd::X86_AVX, parallel>::generate(const mnd
 
             double resData[8];
             double* ftRes = alignVec(resData);
+            double* resa = (double*) &resultsa;
+            double* resb = (double*) &resultsb;
             _mm256_store_pd(ftRes, counter);
-            for (int k = 0; k < 4 && i + k < info.bWidth; k++)
-                data[i + k + j * info.bWidth] = ftRes[k] > 0 ? float(ftRes[k]) : info.maxIter;
+
+            for (int k = 0; k < 4 && i + k < info.bWidth; k++) {
+                if (info.smooth)
+                    data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter :
+                        ftRes[k] >= info.maxIter ? info.maxIter :
+                        ((float)ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f);
+                else
+                    data[i + k + j * info.bWidth] = ftRes[k] > 0 ? float(ftRes[k]) : info.maxIter;
+            }
         }
     }
 }

+ 20 - 9
libmandel/src/CpuGeneratorsAVXFMA.cpp

@@ -164,20 +164,23 @@ void CpuGenerator<mnd::DoubleDouble, mnd::X86_AVX_FMA, parallel>::generate(const
             AvxDoubleDouble a = xs;
             AvxDoubleDouble b = ys;
 
+            __m256d resultsa;
+            __m256d resultsb;
+
             for (int k = 0; k < info.maxIter; k++) {
                 AvxDoubleDouble aa = a * a;
                 AvxDoubleDouble bb = b * b;
                 AvxDoubleDouble abab = a * b; abab = abab + abab;
                 a = aa - bb + xs;
                 b = abab + ys;
-                __m256i cmp = _mm256_castpd_si256(_mm256_cmp_pd(_mm256_add_pd(aa.x[0], bb.x[0]), threshold, _CMP_LE_OQ));
-                /*if (info.smooth) {
-                    resultsa = _mm256_or_pd(_mm256_andnot_ps(cmp, resultsa), _mm256_and_ps(cmp, a));
-                    resultsb = _mm256_or_ps(_mm256_andnot_ps(cmp, resultsb), _mm256_and_ps(cmp, b));
-                }*/
-                adder = _mm256_and_pd(adder, _mm256_castsi256_pd(cmp));
+                __m256d cmp = _mm256_cmp_pd(_mm256_add_pd(aa.x[0], bb.x[0]), threshold, _CMP_LE_OQ);
+                if (info.smooth) {
+                    resultsa = _mm256_or_pd(_mm256_andnot_pd(cmp, resultsa), _mm256_and_pd(cmp, a.x[0]));
+                    resultsb = _mm256_or_pd(_mm256_andnot_pd(cmp, resultsb), _mm256_and_pd(cmp, b.x[0]));
+                }
+                adder = _mm256_and_pd(adder, cmp);
                 counter = _mm256_add_pd(counter, adder);
-                if (_mm256_testz_si256(cmp, cmp) != 0) {
+                if (_mm256_testz_si256(_mm256_castpd_si256(cmp), _mm256_castpd_si256(cmp)) != 0) {
                     break;
                 }
             }
@@ -191,9 +194,17 @@ void CpuGenerator<mnd::DoubleDouble, mnd::X86_AVX_FMA, parallel>::generate(const
 
             double resData[8];
             double* ftRes = alignVec(resData);
+            double* resa = (double*) &resultsa;
+            double* resb = (double*) &resultsb;
             _mm256_store_pd(ftRes, counter);
-            for (int k = 0; k < 4 && i + k < info.bWidth; k++)
-                data[i + k + j * info.bWidth] = ftRes[k] > 0 ? float(ftRes[k]) : info.maxIter;
+            for (int k = 0; k < 4 && i + k < info.bWidth; k++) {
+                if (info.smooth)
+                    data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter :
+                        ftRes[k] >= info.maxIter ? info.maxIter :
+                        ((float)ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f);
+                else
+                    data[i + k + j * info.bWidth] = ftRes[k] > 0 ? float(ftRes[k]) : info.maxIter;
+            }
         }
     }
     return;