5 years ago · 9c9bd61485
--- a/libmandel/src/CpuGeneratorsAVX.cpp
+++ b/libmandel/src/CpuGeneratorsAVX.cpp
@@ -36,22 +36,22 @@ void CpuGenerator<float, mnd::X86_AVX, parallel>::generate(const mnd::MandelInfo
 
				         long i = 0;
			
 
				         for (i; i < info.bWidth; i += 8) {
			
 
				             __m256 xs = {
			
 
				-                float(view.x + double(i) * view.width / info.bWidth),
			
 
				-                float(view.x + double(i + 1) * view.width / info.bWidth),
			
 
				-                float(view.x + double(i + 2) * view.width / info.bWidth),
			
 
				-                float(view.x + double(i + 3) * view.width / info.bWidth),
			
 
				-                float(view.x + double(i + 4) * view.width / info.bWidth),
			
 
				-                float(view.x + double(i + 5) * view.width / info.bWidth),
			
 
				-                float(view.x + double(i + 6) * view.width / info.bWidth),
			
 
				-                float(view.x + double(i + 7) * view.width / info.bWidth)
			
 
				+                float(view.x + float(i) * view.width / info.bWidth),
			
 
				+                float(view.x + float(i + 1) * view.width / info.bWidth),
			
 
				+                float(view.x + float(i + 2) * view.width / info.bWidth),
			
 
				+                float(view.x + float(i + 3) * view.width / info.bWidth),
			
 
				+                float(view.x + float(i + 4) * view.width / info.bWidth),
			
 
				+                float(view.x + float(i + 5) * view.width / info.bWidth),
			
 
				+                float(view.x + float(i + 6) * view.width / info.bWidth),
			
 
				+                float(view.x + float(i + 7) * view.width / info.bWidth)
			
 
				             };
			
 
				 
			
 
				-            __m256 counter = {0, 0, 0, 0, 0, 0, 0, 0};
			
 
				-            __m256 adder = {1, 1, 1, 1, 1, 1, 1, 1};
			
 
				-            __m256 resultsa = {0, 0, 0, 0, 0, 0, 0, 0};
			
 
				-            __m256 resultsb = {0, 0, 0, 0, 0, 0, 0, 0};
			
 
				+            __m256 counter = { 0, 0, 0, 0, 0, 0, 0, 0 };
			
 
				+            __m256 adder = { 1, 1, 1, 1, 1, 1, 1, 1 };
			
 
				+            __m256 resultsa = { 0, 0, 0, 0, 0, 0, 0, 0 };
			
 
				+            __m256 resultsb = { 0, 0, 0, 0, 0, 0, 0, 0 };
			
 
				 
			
 
				-            __m256 threshold = {16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f};
			
 
				+            __m256 threshold = { 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f };
			
 
				 
			
 
				             __m256 a = xs;
			
 
				             __m256 b = ys;
			
@@ -129,6 +129,9 @@ void CpuGenerator<double, mnd::X86_AVX, parallel>::generate(const mnd::MandelInf
 
				             __m256d counter = { 0, 0, 0, 0 };
			
 
				             __m256d adder = { 1, 1, 1, 1 };
			
 
				 
			
 
				+            __m256d resultsa = { 0, 0, 0, 0 };
			
 
				+            __m256d resultsb = { 0, 0, 0, 0 };
			
 
				+
			
 
				             __m256d a = xs;
			
 
				             __m256d b = ys;
			
 
				 
			
@@ -138,14 +141,14 @@ void CpuGenerator<double, mnd::X86_AVX, parallel>::generate(const mnd::MandelInf
 
				                 __m256d abab = _mm256_mul_pd(a, b); abab = _mm256_add_pd(abab, abab);
			
 
				                 a = _mm256_add_pd(_mm256_sub_pd(aa, bb), xs);
			
 
				                 b = _mm256_add_pd(abab, ys);
			
 
				-                __m256i cmp = _mm256_castpd_si256(_mm256_cmp_pd(_mm256_add_pd(aa, bb), threshold, _CMP_LE_OQ));
			
 
				-                /*if (info.smooth) {
			
 
				-                    resultsa = _mm256_or_pd(_mm256_andnot_ps(cmp, resultsa), _mm256_and_ps(cmp, a));
			
 
				-                    resultsb = _mm256_or_ps(_mm256_andnot_ps(cmp, resultsb), _mm256_and_ps(cmp, b));
			
 
				-                }*/
			
 
				-                adder = _mm256_and_pd(adder, _mm256_castsi256_pd(cmp));
			
 
				+                __m256d cmp = _mm256_cmp_pd(_mm256_add_pd(aa, bb), threshold, _CMP_LE_OQ);
			
 
				+                if (info.smooth) {
			
 
				+                    resultsa = _mm256_or_pd(_mm256_andnot_pd(cmp, resultsa), _mm256_and_pd(cmp, a));
			
 
				+                    resultsb = _mm256_or_pd(_mm256_andnot_pd(cmp, resultsb), _mm256_and_pd(cmp, b));
			
 
				+                }
			
 
				+                adder = _mm256_and_pd(adder, cmp);
			
 
				                 counter = _mm256_add_pd(counter, adder);
			
 
				-                if ((k & 0x7) == 0 && _mm256_testz_si256(cmp, cmp) != 0) {
			
 
				+                if ((k & 0x3) == 0 && _mm256_testz_si256(_mm256_castpd_si256(cmp), _mm256_castpd_si256(cmp)) != 0) {
			
 
				                     break;
			
 
				                 }
			
 
				             }
			
@@ -159,9 +162,17 @@ void CpuGenerator<double, mnd::X86_AVX, parallel>::generate(const mnd::MandelInf
 
				 
			
 
				             double resData[8];
			
 
				             double* ftRes = alignVec(resData);
			
 
				+            double* resa = (double*) &resultsa;
			
 
				+            double* resb = (double*) &resultsb;
			
 
				             _mm256_store_pd(ftRes, counter);
			
 
				-            for (int k = 0; k < 4 && i + k < info.bWidth; k++)
			
 
				-                data[i + k + j * info.bWidth] = ftRes[k] > 0 ? float(ftRes[k]) : info.maxIter;
			
 
				+            for (int k = 0; k < 4 && i + k < info.bWidth; k++) {
			
 
				+                if (info.smooth)
			
 
				+                    data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter :
			
 
				+                        ftRes[k] >= info.maxIter ? info.maxIter :
			
 
				+                        ((float)ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f);
			
 
				+                else
			
 
				+                    data[i + k + j * info.bWidth] = ftRes[k] > 0 ? float(ftRes[k]) : info.maxIter;
			
 
				+            }
			
 
				         }
			
 
				     }
			
 
				 }
			
@@ -323,20 +334,23 @@ void CpuGenerator<mnd::DoubleDouble, mnd::X86_AVX, parallel>::generate(const mnd
 
				             AvxDoubleDouble a = xs;
			
 
				             AvxDoubleDouble b = ys;
			
 
				 
			
 
				+            __m256d resultsa;
			
 
				+            __m256d resultsb;
			
 
				+
			
 
				             for (int k = 0; k < info.maxIter; k++) {
			
 
				                 AvxDoubleDouble aa = a * a;
			
 
				                 AvxDoubleDouble bb = b * b;
			
 
				                 AvxDoubleDouble abab = a * b; abab = abab + abab;
			
 
				                 a = aa - bb + xs;
			
 
				                 b = abab + ys;
			
 
				-                __m256i cmp = _mm256_castpd_si256(_mm256_cmp_pd(_mm256_add_pd(aa.x[0], bb.x[0]), threshold, _CMP_LE_OQ));
			
 
				-                /*if (info.smooth) {
			
 
				-                    resultsa = _mm256_or_pd(_mm256_andnot_ps(cmp, resultsa), _mm256_and_ps(cmp, a));
			
 
				-                    resultsb = _mm256_or_ps(_mm256_andnot_ps(cmp, resultsb), _mm256_and_ps(cmp, b));
			
 
				-                }*/
			
 
				-                adder = _mm256_and_pd(adder, _mm256_castsi256_pd(cmp));
			
 
				+                __m256d cmp = _mm256_cmp_pd(_mm256_add_pd(aa.x[0], bb.x[0]), threshold, _CMP_LE_OQ);
			
 
				+                if (info.smooth) {
			
 
				+                    resultsa = _mm256_or_pd(_mm256_andnot_pd(cmp, resultsa), _mm256_and_pd(cmp, a.x[0]));
			
 
				+                    resultsb = _mm256_or_pd(_mm256_andnot_pd(cmp, resultsb), _mm256_and_pd(cmp, b.x[0]));
			
 
				+                }
			
 
				+                adder = _mm256_and_pd(adder, cmp);
			
 
				                 counter = _mm256_add_pd(counter, adder);
			
 
				-                if (_mm256_testz_si256(cmp, cmp) != 0) {
			
 
				+                if (_mm256_testz_si256(_mm256_castpd_si256(cmp), _mm256_castpd_si256(cmp)) != 0) {
			
 
				                     break;
			
 
				                 }
			
 
				             }
			
@@ -350,9 +364,18 @@ void CpuGenerator<mnd::DoubleDouble, mnd::X86_AVX, parallel>::generate(const mnd
 
				 
			
 
				             double resData[8];
			
 
				             double* ftRes = alignVec(resData);
			
 
				+            double* resa = (double*) &resultsa;
			
 
				+            double* resb = (double*) &resultsb;
			
 
				             _mm256_store_pd(ftRes, counter);
			
 
				-            for (int k = 0; k < 4 && i + k < info.bWidth; k++)
			
 
				-                data[i + k + j * info.bWidth] = ftRes[k] > 0 ? float(ftRes[k]) : info.maxIter;
			
 
				+
			
 
				+            for (int k = 0; k < 4 && i + k < info.bWidth; k++) {
			
 
				+                if (info.smooth)
			
 
				+                    data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter :
			
 
				+                        ftRes[k] >= info.maxIter ? info.maxIter :
			
 
				+                        ((float)ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f);
			
 
				+                else
			
 
				+                    data[i + k + j * info.bWidth] = ftRes[k] > 0 ? float(ftRes[k]) : info.maxIter;
			
 
				+            }
			
 
				         }
			
 
				     }
			
 
				 }
			
--- a/libmandel/src/CpuGeneratorsAVXFMA.cpp
+++ b/libmandel/src/CpuGeneratorsAVXFMA.cpp
@@ -164,20 +164,23 @@ void CpuGenerator<mnd::DoubleDouble, mnd::X86_AVX_FMA, parallel>::generate(const
 
				             AvxDoubleDouble a = xs;
			
 
				             AvxDoubleDouble b = ys;
			
 
				 
			
 
				+            __m256d resultsa;
			
 
				+            __m256d resultsb;
			
 
				+
			
 
				             for (int k = 0; k < info.maxIter; k++) {
			
 
				                 AvxDoubleDouble aa = a * a;
			
 
				                 AvxDoubleDouble bb = b * b;
			
 
				                 AvxDoubleDouble abab = a * b; abab = abab + abab;
			
 
				                 a = aa - bb + xs;
			
 
				                 b = abab + ys;
			
 
				-                __m256i cmp = _mm256_castpd_si256(_mm256_cmp_pd(_mm256_add_pd(aa.x[0], bb.x[0]), threshold, _CMP_LE_OQ));
			
 
				-                /*if (info.smooth) {
			
 
				-                    resultsa = _mm256_or_pd(_mm256_andnot_ps(cmp, resultsa), _mm256_and_ps(cmp, a));
			
 
				-                    resultsb = _mm256_or_ps(_mm256_andnot_ps(cmp, resultsb), _mm256_and_ps(cmp, b));
			
 
				-                }*/
			
 
				-                adder = _mm256_and_pd(adder, _mm256_castsi256_pd(cmp));
			
 
				+                __m256d cmp = _mm256_cmp_pd(_mm256_add_pd(aa.x[0], bb.x[0]), threshold, _CMP_LE_OQ);
			
 
				+                if (info.smooth) {
			
 
				+                    resultsa = _mm256_or_pd(_mm256_andnot_pd(cmp, resultsa), _mm256_and_pd(cmp, a.x[0]));
			
 
				+                    resultsb = _mm256_or_pd(_mm256_andnot_pd(cmp, resultsb), _mm256_and_pd(cmp, b.x[0]));
			
 
				+                }
			
 
				+                adder = _mm256_and_pd(adder, cmp);
			
 
				                 counter = _mm256_add_pd(counter, adder);
			
 
				-                if (_mm256_testz_si256(cmp, cmp) != 0) {
			
 
				+                if (_mm256_testz_si256(_mm256_castpd_si256(cmp), _mm256_castpd_si256(cmp)) != 0) {
			
 
				                     break;
			
 
				                 }
			
 
				             }
			
@@ -191,9 +194,17 @@ void CpuGenerator<mnd::DoubleDouble, mnd::X86_AVX_FMA, parallel>::generate(const
 
				 
			
 
				             double resData[8];
			
 
				             double* ftRes = alignVec(resData);
			
 
				+            double* resa = (double*) &resultsa;
			
 
				+            double* resb = (double*) &resultsb;
			
 
				             _mm256_store_pd(ftRes, counter);
			
 
				-            for (int k = 0; k < 4 && i + k < info.bWidth; k++)
			
 
				-                data[i + k + j * info.bWidth] = ftRes[k] > 0 ? float(ftRes[k]) : info.maxIter;
			
 
				+            for (int k = 0; k < 4 && i + k < info.bWidth; k++) {
			
 
				+                if (info.smooth)
			
 
				+                    data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter :
			
 
				+                        ftRes[k] >= info.maxIter ? info.maxIter :
			
 
				+                        ((float)ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f);
			
 
				+                else
			
 
				+                    data[i + k + j * info.bWidth] = ftRes[k] > 0 ? float(ftRes[k]) : info.maxIter;
			
 
				+            }
			
 
				         }
			
 
				     }
			
 
				     return;