7 years ago · 0ac6819cf7
--- a/libmandel/include/CpuGenerators.h
+++ b/libmandel/include/CpuGenerators.h
@@ -7,14 +7,17 @@ namespace mnd
 
				 {
			
 
				     enum CpuExtension
			
 
				     {
			
 
				+        NONE,
			
 
				         X86_SSE2,
			
 
				         X86_AVX,
			
 
				         ARM_NEON,
			
 
				     };
			
 
				 
			
 
				-    template<typename T, CpuExtension ex, bool parallel>
			
 
				+    template<typename T, mnd::CpuExtension ex = mnd::NONE, bool parallel = true, bool smooth = true>
			
 
				     class CpuGenerator;
			
 
				 
			
 
				+
			
 
				+    /*
			
 
				     class CpuGeneratorFloat;
			
 
				     class CpuGeneratorDouble;
			
 
				     class CpuGenerator128;
			
@@ -32,9 +35,66 @@ namespace mnd
 
				     class CpuGeneratorNeonFloat;
			
 
				     class CpuGeneratorNeonDouble;
			
 
				 #endif
			
 
				+*/
			
 
				 }
			
 
				 
			
 
				 
			
 
				+// Primary template for all CPU generators.
				+// The default template arguments (ex = mnd::NONE, parallel = true,
				+// smooth = true) are supplied once, on the forward declaration inside
				+// namespace mnd. The C++ standard does not allow a template parameter to
				+// receive a default argument in two declarations, so they are deliberately
				+// not repeated on this definition.
				+template<typename T, mnd::CpuExtension ex, bool parallel, bool smooth>
				+class mnd::CpuGenerator : public Generator
				+{
				+public:
				+    // Fills `data` for the image described by `info`.
				+    virtual void generate(const MandelInfo& info, float* data);
				+};
			
 
				+
			
 
				+
			
 
				+// Partial specialization for mnd::NONE (no SIMD extension), usable with any
				+// value type T. generate() is defined out of line in CpuGenerators.cpp.
				+template<typename T, bool parallel, bool smooth>
				+class mnd::CpuGenerator<T, mnd::NONE, parallel, smooth> : public Generator
				+{
				+public:
				+    // Fills `data` for the image described by `info`.
				+    virtual void generate(const MandelInfo& info, float* data);
				+};
			
 
				+
			
 
				+
			
 
				+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) 
			
 
				+// Partial specialization for float with mnd::X86_SSE2. Only declared when
				+// the enclosing #if detects an x86 target. generate() is defined out of
				+// line in CpuGeneratorsSSE2.cpp.
				+template<bool parallel, bool smooth>
				+class mnd::CpuGenerator<float, mnd::X86_SSE2, parallel, smooth> : public Generator
				+{
				+public:
				+    // Fills `data` for the image described by `info`.
				+    virtual void generate(const MandelInfo& info, float* data);
				+};
			
 
				+
			
 
				+// Partial specialization for double with mnd::X86_SSE2. Only declared when
				+// the enclosing #if detects an x86 target. generate() is defined out of
				+// line in CpuGeneratorsSSE2.cpp.
				+template<bool parallel, bool smooth>
				+class mnd::CpuGenerator<double, mnd::X86_SSE2, parallel, smooth> : public Generator
				+{
				+public:
				+    // Fills `data` for the image described by `info`.
				+    virtual void generate(const MandelInfo& info, float* data);
				+};
			
 
				+
			
 
				+
			
 
				+// Partial specialization for float with mnd::X86_AVX. Only declared when
				+// the enclosing #if detects an x86 target. generate() is defined out of
				+// line in CpuGeneratorsAVX.cpp.
				+template<bool parallel, bool smooth>
				+class mnd::CpuGenerator<float, mnd::X86_AVX, parallel, smooth> : public Generator
				+{
				+public:
				+    // Fills `data` for the image described by `info`.
				+    virtual void generate(const MandelInfo& info, float* data);
				+};
			
 
				+
			
 
				+// Partial specialization for double with mnd::X86_AVX. Only declared when
				+// the enclosing #if detects an x86 target. generate() is defined out of
				+// line in CpuGeneratorsAVX.cpp.
				+template<bool parallel, bool smooth>
				+class mnd::CpuGenerator<double, mnd::X86_AVX, parallel, smooth> : public Generator
				+{
				+public:
				+    // Fills `data` for the image described by `info`.
				+    virtual void generate(const MandelInfo& info, float* data);
				+};
			
 
				+
			
 
				+#elif defined(__arm__) || defined(__aarch64__) || defined(_M_ARM) 
			
 
				+// Partial specialization for mnd::ARM_NEON, for any value type T. Only
				+// declared when the enclosing #elif detects an ARM target.
				+// NOTE(review): the out-of-line definition of generate() is not part of
				+// this change set -- confirm where it lives.
				+template<typename T, bool parallel, bool smooth>
				+class mnd::CpuGenerator<T, mnd::ARM_NEON, parallel, smooth> : public Generator
				+{
				+public:
				+    // Fills `data` for the image described by `info`.
				+    virtual void generate(const MandelInfo& info, float* data);
				+};
			
 
				+#endif
			
 
				+
			
 
				+/*
			
 
				 class mnd::CpuGeneratorFloat : public Generator
			
 
				 {
			
 
				 public:
			
@@ -115,4 +175,6 @@ public:
 
				 };
			
 
				 #endif
			
 
				 
			
 
				+*/
			
 
				+
			
 
				 #endif // MANDEL_CPUGENERATORS_H
			
--- a/libmandel/src/CpuGenerators.cpp
+++ b/libmandel/src/CpuGenerators.cpp
@@ -5,42 +5,66 @@
 
				 
			
 
				 #include <memory>
			
 
				 
			
 
				-using mnd::CpuGeneratorFloat;
			
 
				-using mnd::CpuGeneratorDouble;
			
 
				-using mnd::CpuGenerator128;
			
 
				+using mnd::CpuGenerator;
			
 
				 
			
 
				+// Explicit instantiation definitions of the scalar (mnd::NONE) generator for
				+// float, double and Fixed128, in every (parallel, smooth) combination.
				+// NOTE(review): these precede the out-of-class definition of generate()
				+// further down in this file. An explicit instantiation definition of a class
				+// template specialization only covers members already defined at that point
				+// ([temp.explicit]); confirm every targeted compiler still emits generate(),
				+// or move these lines below the definition.
				+template class CpuGenerator<float, mnd::NONE, false, false>;
				+template class CpuGenerator<float, mnd::NONE, false, true>;
				+template class CpuGenerator<float, mnd::NONE, true, false>;
				+template class CpuGenerator<float, mnd::NONE, true, true>;
				 
				-void CpuGeneratorFloat::generate(const mnd::MandelInfo& info, float* data)
				+template class CpuGenerator<double, mnd::NONE, false, false>;
				+template class CpuGenerator<double, mnd::NONE, false, true>;
				+template class CpuGenerator<double, mnd::NONE, true, false>;
				+template class CpuGenerator<double, mnd::NONE, true, true>;
				+
				+template class CpuGenerator<Fixed128, mnd::NONE, false, false>;
				+template class CpuGenerator<Fixed128, mnd::NONE, false, true>;
				+template class CpuGenerator<Fixed128, mnd::NONE, true, false>;
				+template class CpuGenerator<Fixed128, mnd::NONE, true, true>;
			
 
				+
			
 
				+
			
 
				+// Scalar Mandelbrot kernel (no SIMD extension).
				+//
				+// For every pixel (i, j) of the info.bWidth x info.bHeight grid, maps the
				+// pixel into info.view, iterates a,b <- (a*a - b*b + x, 2*a*b + y) for at
				+// most info.maxIter steps, stopping once a*a + b*b exceeds 16, and writes
				+// one float to data[i + j * info.bWidth].
				+//
				+// Template flags:
				+//   parallel - when true, the row loop is run as an OpenMP parallel loop and
				+//              the thread count is set to twice the processor count.
				+//   smooth   - when true, pixels that escaped get a fractional iteration
				+//              value derived from the final a*a + b*b; pixels that reached
				+//              info.maxIter get info.maxIter. When false, the raw
				+//              iteration count k is stored.
				+template<typename T, bool parallel, bool smooth>
				+void CpuGenerator<T, mnd::NONE, parallel, smooth>::generate(const mnd::MandelInfo& info, float* data)
				 {
				     const MandelViewport& view = info.view;
				-    omp_set_num_threads(2 * omp_get_num_procs());
				-#pragma omp parallel for
				+
				+    if constexpr (parallel)
				+        omp_set_num_threads(2 * omp_get_num_procs());
				+    // The OpenMP if clause takes an ordinary scalar expression; "if constexpr"
				+    // is C++ statement syntax and is not valid inside the pragma.
				+#pragma omp parallel for if (parallel)
				     for (long j = 0; j < info.bHeight; j++) {
				-        float y = float(view.y) + float(j) * float(view.height / info.bHeight);
				+        T y = T(view.y) + T(j) * T(view.height / info.bHeight);
				         long i = 0;
				         for (i; i < info.bWidth; i++) {
				-            float x = float(view.x + double(i) * view.width / info.bWidth);
				+            T x = T(view.x + T(i) * T(view.width / info.bWidth));
				 
				-            float a = x;
				-            float b = y;
				+            T a = x;
				+            T b = y;
				 
				             int k = 0;
				             for (k = 0; k < info.maxIter; k++) {
				-                float aa = a * a;
				-                float bb = b * b;
				-                float ab = a * b;
				+                T aa = a * a;
				+                T bb = b * b;
				+                T ab = a * b;
				                 a = aa - bb + x;
				                 b = ab + ab + y;
				-                if (aa + bb > 16) {
				+                if (aa + bb > T(16)) {
				                     break;
				                 }
				             }
				-            data[i + j * info.bWidth] = k;
				+            if constexpr (smooth) {
				+                if (k >= info.maxIter)
				+                    data[i + j * info.bWidth] = info.maxIter;
				+                else
				+                    data[i + j * info.bWidth] = ((float) k) + 1 - ::log(::log(a * a + b * b) / 2) / ::log(2.0f);
				+            }
				+            else
				+                data[i + j * info.bWidth] = k;
				         }
				     }
				 }
			
 
				 
			
 
				 
			
 
				+/*
			
 
				 void CpuGeneratorDouble::generate(const mnd::MandelInfo& info, float* data)
			
 
				 {
			
 
				     const MandelViewport& view = info.view;
			
@@ -102,4 +126,4 @@ void CpuGenerator128::generate(const mnd::MandelInfo& info, float* data)
 
				         }
			
 
				     }
			
 
				 }
			
 
				-
			
 
				+*/
			
--- a/libmandel/src/CpuGeneratorsAVX.cpp
+++ b/libmandel/src/CpuGeneratorsAVX.cpp
@@ -5,16 +5,27 @@
 
				 
			
 
				 #include <memory>
			
 
				 
			
 
				-using mnd::CpuGeneratorAvxFloat;
			
 
				-using mnd::CpuGeneratorAvxDouble;
			
 
				+using mnd::CpuGenerator;
			
 
				 
			
 
				+// Explicit instantiation definitions of the AVX generator for float and
				+// double, in every (parallel, smooth) combination.
				+// NOTE(review): these precede the out-of-class definitions of generate()
				+// further down in this file. An explicit instantiation definition of a class
				+// template specialization only covers members already defined at that point
				+// ([temp.explicit]); confirm every targeted compiler still emits generate(),
				+// or move these lines below the definitions.
				+template class CpuGenerator<float, mnd::X86_AVX, false, false>;
				+template class CpuGenerator<float, mnd::X86_AVX, false, true>;
				+template class CpuGenerator<float, mnd::X86_AVX, true, false>;
				+template class CpuGenerator<float, mnd::X86_AVX, true, true>;
				 
				-void CpuGeneratorAvxFloat::generate(const mnd::MandelInfo& info, float* data)
				+template class CpuGenerator<double, mnd::X86_AVX, false, false>;
				+template class CpuGenerator<double, mnd::X86_AVX, false, true>;
				+template class CpuGenerator<double, mnd::X86_AVX, true, false>;
				+template class CpuGenerator<double, mnd::X86_AVX, true, true>;
			
 
				+
			
 
				+template<bool parallel, bool smooth>
			
 
				+void CpuGenerator<float, mnd::X86_AVX, parallel, smooth>::generate(const mnd::MandelInfo& info, float* data)
			
 
				 {
			
 
				     using T = float;
			
 
				     const MandelViewport& view = info.view;
			
 
				+
			
 
				+    if constexpr(parallel)
			
 
				     omp_set_num_threads(2 * omp_get_num_procs());
			
 
				-#pragma omp parallel for
			
 
				+#pragma omp parallel for if (parallel)
			
 
				     for (long j = 0; j < info.bHeight; j++) {
			
 
				         T y = T(view.y) + T(j) * T(view.height / info.bHeight);
			
 
				         long i = 0;
			
@@ -48,8 +59,10 @@ void CpuGeneratorAvxFloat::generate(const mnd::MandelInfo& info, float* data)
 
				                 a = _mm256_add_ps(_mm256_sub_ps(aa, bb), xs);
			
 
				                 b = _mm256_add_ps(abab, ys);
			
 
				                 __m256 cmp = _mm256_cmp_ps(_mm256_add_ps(aa, bb), threshold, _CMP_LE_OQ);
			
 
				-                resultsa = _mm256_or_ps(_mm256_andnot_ps(cmp, resultsa), _mm256_and_ps(cmp, a));
			
 
				-                resultsb = _mm256_or_ps(_mm256_andnot_ps(cmp, resultsb), _mm256_and_ps(cmp, b));
			
 
				+                if constexpr (smooth) {
			
 
				+                    resultsa = _mm256_or_ps(_mm256_andnot_ps(cmp, resultsa), _mm256_and_ps(cmp, a));
			
 
				+                    resultsb = _mm256_or_ps(_mm256_andnot_ps(cmp, resultsb), _mm256_and_ps(cmp, b));
			
 
				+                }
			
 
				                 adder = _mm256_and_ps(adder, cmp);
			
 
				                 counter = _mm256_add_ps(counter, adder);
			
 
				                 if ((k & 0x7) == 0 && _mm256_testz_ps(cmp, cmp) != 0) {
			
@@ -70,22 +83,30 @@ void CpuGeneratorAvxFloat::generate(const mnd::MandelInfo& info, float* data)
 
				             float* resb = (float*) &resultsb;
			
 
				 
			
 
				             _mm256_store_ps(ftRes, counter);
			
 
				-            for (int k = 0; k < 8 && i + k < info.bWidth; k++)
			
 
				-                data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter :
			
 
				-                                                ftRes[k] >= info.maxIter ? info.maxIter :
			
 
				-                ((float)ftRes[k]) + 1 - log(log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / log(2.0f);
			
 
				+            for (int k = 0; k < 8 && i + k < info.bWidth; k++) {
			
 
				+                if constexpr (smooth) {
			
 
				+                    data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter :
			
 
				+                        ftRes[k] >= info.maxIter ? info.maxIter :
			
 
				+                        ((float)ftRes[k]) + 1 - log(log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / log(2.0f);
			
 
				+                }
			
 
				+                else {
			
 
				+                    data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter : ftRes[k];
			
 
				+                }
			
 
				+            }
			
 
				         }
			
 
				     }
			
 
				 }
			
 
				 
			
 
				 
			
 
				-void CpuGeneratorAvxDouble::generate(const mnd::MandelInfo& info, float* data)
			
 
				+template<bool parallel, bool smooth>
			
 
				+void CpuGenerator<double, mnd::X86_AVX, parallel, smooth>::generate(const mnd::MandelInfo& info, float* data)
			
 
				 {
			
 
				     using T = double;
			
 
				     const MandelViewport& view = info.view;
			
 
				 
			
 
				-    omp_set_num_threads(2 * omp_get_num_procs());
			
 
				-#pragma omp parallel for
			
 
				+    if constexpr(parallel)
			
 
				+        omp_set_num_threads(2 * omp_get_num_procs());
			
 
				+    // Parallel execution is controlled by the 'parallel' template flag (the
				+    // same flag that guards omp_set_num_threads just above), not by 'smooth'.
				+#pragma omp parallel for if (parallel)
			
 
				     for (long j = 0; j < info.bHeight; j++) {
			
 
				         T y = T(view.y) + T(j) * view.height / info.bHeight;
			
 
				         long i = 0;
			
--- a/libmandel/src/CpuGeneratorsSSE2.cpp
+++ b/libmandel/src/CpuGeneratorsSSE2.cpp
@@ -5,16 +5,27 @@
 
				 
			
 
				 #include <memory>
			
 
				 
			
 
				-using mnd::CpuGeneratorSse2Float;
			
 
				-using mnd::CpuGeneratorSse2Double;
			
 
				+using mnd::CpuGenerator;
			
 
				 
			
 
				+// Explicit instantiation definitions of the SSE2 generator for float and
				+// double, in every (parallel, smooth) combination.
				+// NOTE(review): these precede the out-of-class definitions of generate()
				+// further down in this file. An explicit instantiation definition of a class
				+// template specialization only covers members already defined at that point
				+// ([temp.explicit]); confirm every targeted compiler still emits generate(),
				+// or move these lines below the definitions.
				+template class CpuGenerator<float, mnd::X86_SSE2, false, false>;
				+template class CpuGenerator<float, mnd::X86_SSE2, false, true>;
				+template class CpuGenerator<float, mnd::X86_SSE2, true, false>;
				+template class CpuGenerator<float, mnd::X86_SSE2, true, true>;
				 
				-void CpuGeneratorSse2Float::generate(const mnd::MandelInfo& info, float* data)
				+template class CpuGenerator<double, mnd::X86_SSE2, false, false>;
				+template class CpuGenerator<double, mnd::X86_SSE2, false, true>;
				+template class CpuGenerator<double, mnd::X86_SSE2, true, false>;
				+template class CpuGenerator<double, mnd::X86_SSE2, true, true>;
			
 
				+
			
 
				+template<bool parallel, bool smooth>
			
 
				+void CpuGenerator<float, mnd::X86_SSE2, parallel, smooth>::generate(const mnd::MandelInfo& info, float* data)
			
 
				 {
			
 
				     using T = float;
			
 
				     const MandelViewport& view = info.view;
			
 
				+
			
 
				+    if constexpr(parallel)
			
 
				     omp_set_num_threads(2 * omp_get_num_procs());
			
 
				-#pragma omp parallel for
			
 
				+#pragma omp parallel for if (parallel)
			
 
				     for (long j = 0; j < info.bHeight; j++) {
			
 
				         T y = T(view.y) + T(j) * T(view.height / info.bHeight);
			
 
				         long i = 0;
			
@@ -67,12 +78,14 @@ void CpuGeneratorSse2Float::generate(const mnd::MandelInfo& info, float* data)
 
				 }
			
 
				 
			
 
				 
			
 
				-void CpuGeneratorSse2Double::generate(const mnd::MandelInfo& info, float* data)
			
 
				+template<bool parallel, bool smooth>
			
 
				+void CpuGenerator<double, mnd::X86_SSE2, parallel, smooth>::generate(const mnd::MandelInfo& info, float* data)
			
 
				 {
			
 
				     using T = double;
			
 
				     const MandelViewport& view = info.view;
			
 
				-    omp_set_num_threads(2 * omp_get_num_procs());
			
 
				-#pragma omp parallel for
			
 
				+    if constexpr(parallel)
			
 
				+        omp_set_num_threads(2 * omp_get_num_procs());
			
 
				+#pragma omp parallel for if (parallel)
			
 
				     for (long j = 0; j < info.bHeight; j++) {
			
 
				         T y = T(view.y) + T(j) * T(view.height / info.bHeight);
			
 
				         long i = 0;
			
--- a/libmandel/src/mandel.cpp
+++ b/libmandel/src/mandel.cpp
@@ -1,4 +1,5 @@
 
				 #include "Mandel.h"
			
 
				+#include "Fixed.h"
			
 
				 
			
 
				 #include "CpuGenerators.h"
			
 
				 #include "ClGenerators.h"
			
@@ -55,27 +56,27 @@ MandelContext::MandelContext(void)
 
				 
			
 
				 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) 
			
 
				     if (cpuInfo.hasAvx()) {
			
 
				-        cpuGeneratorFloat = std::make_unique<CpuGeneratorAvxFloat>();
			
 
				-        cpuGeneratorDouble = std::make_unique<CpuGeneratorAvxDouble>();
			
 
				+        cpuGeneratorFloat = std::make_unique<CpuGenerator<float, mnd::X86_AVX>>();
			
 
				+        cpuGeneratorDouble = std::make_unique<CpuGenerator<double, mnd::X86_AVX>>();
			
 
				     }
			
 
				     else if (cpuInfo.hasSse2()) {
			
 
				-        cpuGeneratorFloat = std::make_unique<CpuGeneratorSse2Float>();
			
 
				-        cpuGeneratorDouble = std::make_unique<CpuGeneratorSse2Double>();
			
 
				+        cpuGeneratorFloat = std::make_unique<CpuGenerator<float, mnd::X86_SSE2>>();
			
 
				+        cpuGeneratorDouble = std::make_unique<CpuGenerator<double, mnd::X86_SSE2>>();
			
 
				     }
			
 
				     else
			
 
				 #elif defined(__aarch64__)
			
 
				     if (true) {
			
 
				-        cpuGeneratorFloat = std::make_unique<CpuGeneratorNeonFloat>();
			
 
				-        cpuGeneratorDouble = std::make_unique<CpuGeneratorNeonDouble>();
			
 
				+        cpuGeneratorFloat = std::make_unique<CpuGenerator<float, mnd::ARM_NEON>>();
			
 
				+        cpuGeneratorDouble = std::make_unique<CpuGenerator<double, mnd::ARM_NEON>>();
			
 
				     }
			
 
				     else
			
 
				 #endif
			
 
				     {
			
 
				-        cpuGeneratorFloat = std::make_unique<CpuGeneratorFloat>();
			
 
				-        cpuGeneratorDouble = std::make_unique<CpuGeneratorDouble>();
			
 
				+        cpuGeneratorFloat = std::make_unique<CpuGenerator<float>>();
			
 
				+        cpuGeneratorDouble = std::make_unique<CpuGenerator<double>>();
			
 
				     }
			
 
				 
			
 
				-    cpuGenerator128 = std::make_unique<CpuGenerator128>();
			
 
				+    cpuGenerator128 = std::make_unique<CpuGenerator<Fixed128>>();
			
 
				 
			
 
				     devices = createDevices();
			
 
				 }