Nicolas Winkler 5 tahun lalu
induk
melakukan
0ac6819cf7

+ 63 - 1
libmandel/include/CpuGenerators.h

@@ -7,14 +7,17 @@ namespace mnd
 {
     enum CpuExtension
     {
+        NONE,
         X86_SSE2,
         X86_AVX,
         ARM_NEON,
     };
 
-    template<typename T, CpuExtension ex, bool parallel>
+    template<typename T, mnd::CpuExtension ex = mnd::NONE, bool parallel = true, bool smooth = true>
     class CpuGenerator;
 
+
+    /*
     class CpuGeneratorFloat;
     class CpuGeneratorDouble;
     class CpuGenerator128;
@@ -32,9 +35,66 @@ namespace mnd
     class CpuGeneratorNeonFloat;
     class CpuGeneratorNeonDouble;
 #endif
+*/
 }
 
 
+template<typename T, mnd::CpuExtension ex = mnd::NONE, bool parallel = true, bool smooth = true>
+class mnd::CpuGenerator : public Generator
+{
+public:
+    virtual void generate(const MandelInfo& info, float* data);
+};
+
+
+template<typename T, bool parallel, bool smooth>
+class mnd::CpuGenerator<T, mnd::NONE, parallel, smooth> : public Generator
+{
+public:
+    virtual void generate(const MandelInfo& info, float* data);
+};
+
+
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) 
+template<bool parallel, bool smooth>
+class mnd::CpuGenerator<float, mnd::X86_SSE2, parallel, smooth> : public Generator
+{
+public:
+    virtual void generate(const MandelInfo& info, float* data);
+};
+
+template<bool parallel, bool smooth>
+class mnd::CpuGenerator<double, mnd::X86_SSE2, parallel, smooth> : public Generator
+{
+public:
+    virtual void generate(const MandelInfo& info, float* data);
+};
+
+
+template<bool parallel, bool smooth>
+class mnd::CpuGenerator<float, mnd::X86_AVX, parallel, smooth> : public Generator
+{
+public:
+    virtual void generate(const MandelInfo& info, float* data);
+};
+
+template<bool parallel, bool smooth>
+class mnd::CpuGenerator<double, mnd::X86_AVX, parallel, smooth> : public Generator
+{
+public:
+    virtual void generate(const MandelInfo& info, float* data);
+};
+
+#elif defined(__arm__) || defined(__aarch64__) || defined(_M_ARM) 
+template<typename T, bool parallel, bool smooth>
+class mnd::CpuGenerator<T, mnd::ARM_NEON, parallel, smooth> : public Generator
+{
+public:
+    virtual void generate(const MandelInfo& info, float* data);
+};
+#endif
+
+/*
 class mnd::CpuGeneratorFloat : public Generator
 {
 public:
@@ -115,4 +175,6 @@ public:
 };
 #endif
 
+*/
+
 #endif // MANDEL_CPUGENERATORS_H

+ 40 - 16
libmandel/src/CpuGenerators.cpp

@@ -5,42 +5,66 @@
 
 #include <memory>
 
-using mnd::CpuGeneratorFloat;
-using mnd::CpuGeneratorDouble;
-using mnd::CpuGenerator128;
+using mnd::CpuGenerator;
 
+template class CpuGenerator<float, mnd::NONE, false, false>;
+template class CpuGenerator<float, mnd::NONE, false, true>;
+template class CpuGenerator<float, mnd::NONE, true, false>;
+template class CpuGenerator<float, mnd::NONE, true, true>;
 
-void CpuGeneratorFloat::generate(const mnd::MandelInfo& info, float* data)
+template class CpuGenerator<double, mnd::NONE, false, false>;
+template class CpuGenerator<double, mnd::NONE, false, true>;
+template class CpuGenerator<double, mnd::NONE, true, false>;
+template class CpuGenerator<double, mnd::NONE, true, true>;
+
+template class CpuGenerator<Fixed128, mnd::NONE, false, false>;
+template class CpuGenerator<Fixed128, mnd::NONE, false, true>;
+template class CpuGenerator<Fixed128, mnd::NONE, true, false>;
+template class CpuGenerator<Fixed128, mnd::NONE, true, true>;
+
+
+template<typename T, bool parallel, bool smooth>
+void CpuGenerator<T, mnd::NONE, parallel, smooth>::generate(const mnd::MandelInfo& info, float* data)
 {
     const MandelViewport& view = info.view;
-    omp_set_num_threads(2 * omp_get_num_procs());
-#pragma omp parallel for
+
+    if constexpr (parallel)
+        omp_set_num_threads(2 * omp_get_num_procs());
+#pragma omp parallel for if constexpr (parallel)
     for (long j = 0; j < info.bHeight; j++) {
-        float y = float(view.y) + float(j) * float(view.height / info.bHeight);
+        T y = T(view.y) + T(j) * T(view.height / info.bHeight);
         long i = 0;
         for (i; i < info.bWidth; i++) {
-            float x = float(view.x + double(i) * view.width / info.bWidth);
+            T x = T(view.x + T(i) * T(view.width / info.bWidth));
 
-            float a = x;
-            float b = y;
+            T a = x;
+            T b = y;
 
             int k = 0;
             for (k = 0; k < info.maxIter; k++) {
-                float aa = a * a;
-                float bb = b * b;
-                float ab = a * b;
+                T aa = a * a;
+                T bb = b * b;
+                T ab = a * b;
                 a = aa - bb + x;
                 b = ab + ab + y;
-                if (aa + bb > 16) {
+                if (aa + bb > T(16)) {
                     break;
                 }
             }
-            data[i + j * info.bWidth] = k;
+            if constexpr (smooth) {
+                if (k >= info.maxIter)
+                    data[i + j * info.bWidth] = info.maxIter;
+                else
+                    data[i + j * info.bWidth] = ((float) k) + 1 - ::log(::log(a * a + b * b) / 2) / ::log(2.0f);
+            }
+            else
+                data[i + j * info.bWidth] = k;
         }
     }
 }
 
 
+/*
 void CpuGeneratorDouble::generate(const mnd::MandelInfo& info, float* data)
 {
     const MandelViewport& view = info.view;
@@ -102,4 +126,4 @@ void CpuGenerator128::generate(const mnd::MandelInfo& info, float* data)
         }
     }
 }
-
+*/

+ 34 - 13
libmandel/src/CpuGeneratorsAVX.cpp

@@ -5,16 +5,27 @@
 
 #include <memory>
 
-using mnd::CpuGeneratorAvxFloat;
-using mnd::CpuGeneratorAvxDouble;
+using mnd::CpuGenerator;
 
+template class CpuGenerator<float, mnd::X86_AVX, false, false>;
+template class CpuGenerator<float, mnd::X86_AVX, false, true>;
+template class CpuGenerator<float, mnd::X86_AVX, true, false>;
+template class CpuGenerator<float, mnd::X86_AVX, true, true>;
 
-void CpuGeneratorAvxFloat::generate(const mnd::MandelInfo& info, float* data)
+template class CpuGenerator<double, mnd::X86_AVX, false, false>;
+template class CpuGenerator<double, mnd::X86_AVX, false, true>;
+template class CpuGenerator<double, mnd::X86_AVX, true, false>;
+template class CpuGenerator<double, mnd::X86_AVX, true, true>;
+
+template<bool parallel, bool smooth>
+void CpuGenerator<float, mnd::X86_AVX, parallel, smooth>::generate(const mnd::MandelInfo& info, float* data)
 {
     using T = float;
     const MandelViewport& view = info.view;
+
+    if constexpr(parallel)
     omp_set_num_threads(2 * omp_get_num_procs());
-#pragma omp parallel for
+#pragma omp parallel for if (parallel)
     for (long j = 0; j < info.bHeight; j++) {
         T y = T(view.y) + T(j) * T(view.height / info.bHeight);
         long i = 0;
@@ -48,8 +59,10 @@ void CpuGeneratorAvxFloat::generate(const mnd::MandelInfo& info, float* data)
                 a = _mm256_add_ps(_mm256_sub_ps(aa, bb), xs);
                 b = _mm256_add_ps(abab, ys);
                 __m256 cmp = _mm256_cmp_ps(_mm256_add_ps(aa, bb), threshold, _CMP_LE_OQ);
-                resultsa = _mm256_or_ps(_mm256_andnot_ps(cmp, resultsa), _mm256_and_ps(cmp, a));
-                resultsb = _mm256_or_ps(_mm256_andnot_ps(cmp, resultsb), _mm256_and_ps(cmp, b));
+                if constexpr (smooth) {
+                    resultsa = _mm256_or_ps(_mm256_andnot_ps(cmp, resultsa), _mm256_and_ps(cmp, a));
+                    resultsb = _mm256_or_ps(_mm256_andnot_ps(cmp, resultsb), _mm256_and_ps(cmp, b));
+                }
                 adder = _mm256_and_ps(adder, cmp);
                 counter = _mm256_add_ps(counter, adder);
                 if ((k & 0x7) == 0 && _mm256_testz_ps(cmp, cmp) != 0) {
@@ -70,22 +83,30 @@ void CpuGeneratorAvxFloat::generate(const mnd::MandelInfo& info, float* data)
             float* resb = (float*) &resultsb;
 
             _mm256_store_ps(ftRes, counter);
-            for (int k = 0; k < 8 && i + k < info.bWidth; k++)
-                data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter :
-                                                ftRes[k] >= info.maxIter ? info.maxIter :
-                ((float)ftRes[k]) + 1 - log(log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / log(2.0f);
+            for (int k = 0; k < 8 && i + k < info.bWidth; k++) {
+                if constexpr (smooth) {
+                    data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter :
+                        ftRes[k] >= info.maxIter ? info.maxIter :
+                        ((float)ftRes[k]) + 1 - log(log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / log(2.0f);
+                }
+                else {
+                    data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter : ftRes[k];
+                }
+            }
         }
     }
 }
 
 
-void CpuGeneratorAvxDouble::generate(const mnd::MandelInfo& info, float* data)
+template<bool parallel, bool smooth>
+void CpuGenerator<double, mnd::X86_AVX, parallel, smooth>::generate(const mnd::MandelInfo& info, float* data)
 {
     using T = double;
     const MandelViewport& view = info.view;
 
-    omp_set_num_threads(2 * omp_get_num_procs());
-#pragma omp parallel for
+    if constexpr(parallel)
+        omp_set_num_threads(2 * omp_get_num_procs());
+#pragma omp parallel for if (smooth)
     for (long j = 0; j < info.bHeight; j++) {
         T y = T(view.y) + T(j) * view.height / info.bHeight;
         long i = 0;

+ 20 - 7
libmandel/src/CpuGeneratorsSSE2.cpp

@@ -5,16 +5,27 @@
 
 #include <memory>
 
-using mnd::CpuGeneratorSse2Float;
-using mnd::CpuGeneratorSse2Double;
+using mnd::CpuGenerator;
 
+template class CpuGenerator<float, mnd::X86_SSE2, false, false>;
+template class CpuGenerator<float, mnd::X86_SSE2, false, true>;
+template class CpuGenerator<float, mnd::X86_SSE2, true, false>;
+template class CpuGenerator<float, mnd::X86_SSE2, true, true>;
 
-void CpuGeneratorSse2Float::generate(const mnd::MandelInfo& info, float* data)
+template class CpuGenerator<double, mnd::X86_SSE2, false, false>;
+template class CpuGenerator<double, mnd::X86_SSE2, false, true>;
+template class CpuGenerator<double, mnd::X86_SSE2, true, false>;
+template class CpuGenerator<double, mnd::X86_SSE2, true, true>;
+
+template<bool parallel, bool smooth>
+void CpuGenerator<float, mnd::X86_SSE2, parallel, smooth>::generate(const mnd::MandelInfo& info, float* data)
 {
     using T = float;
     const MandelViewport& view = info.view;
+
+    if constexpr(parallel)
     omp_set_num_threads(2 * omp_get_num_procs());
-#pragma omp parallel for
+#pragma omp parallel for if (parallel)
     for (long j = 0; j < info.bHeight; j++) {
         T y = T(view.y) + T(j) * T(view.height / info.bHeight);
         long i = 0;
@@ -67,12 +78,14 @@ void CpuGeneratorSse2Float::generate(const mnd::MandelInfo& info, float* data)
 }
 
 
-void CpuGeneratorSse2Double::generate(const mnd::MandelInfo& info, float* data)
+template<bool parallel, bool smooth>
+void CpuGenerator<double, mnd::X86_SSE2, parallel, smooth>::generate(const mnd::MandelInfo& info, float* data)
 {
     using T = double;
     const MandelViewport& view = info.view;
-    omp_set_num_threads(2 * omp_get_num_procs());
-#pragma omp parallel for
+    if constexpr(parallel)
+        omp_set_num_threads(2 * omp_get_num_procs());
+#pragma omp parallel for if (parallel)
     for (long j = 0; j < info.bHeight; j++) {
         T y = T(view.y) + T(j) * T(view.height / info.bHeight);
         long i = 0;

+ 10 - 9
libmandel/src/mandel.cpp

@@ -1,4 +1,5 @@
 #include "Mandel.h"
+#include "Fixed.h"
 
 #include "CpuGenerators.h"
 #include "ClGenerators.h"
@@ -55,27 +56,27 @@ MandelContext::MandelContext(void)
 
 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) 
     if (cpuInfo.hasAvx()) {
-        cpuGeneratorFloat = std::make_unique<CpuGeneratorAvxFloat>();
-        cpuGeneratorDouble = std::make_unique<CpuGeneratorAvxDouble>();
+        cpuGeneratorFloat = std::make_unique<CpuGenerator<float, mnd::X86_AVX>>();
+        cpuGeneratorDouble = std::make_unique<CpuGenerator<double, mnd::X86_AVX>>();
     }
     else if (cpuInfo.hasSse2()) {
-        cpuGeneratorFloat = std::make_unique<CpuGeneratorSse2Float>();
-        cpuGeneratorDouble = std::make_unique<CpuGeneratorSse2Double>();
+        cpuGeneratorFloat = std::make_unique<CpuGenerator<float, mnd::X86_SSE2>>();
+        cpuGeneratorDouble = std::make_unique<CpuGenerator<double, mnd::X86_SSE2>>();
     }
     else
 #elif defined(__aarch64__)
     if (true) {
-        cpuGeneratorFloat = std::make_unique<CpuGeneratorNeonFloat>();
-        cpuGeneratorDouble = std::make_unique<CpuGeneratorNeonDouble>();
+        cpuGeneratorFloat = std::make_unique<CpuGenerator<float, mnd::ARM_NEON>>();
+        cpuGeneratorDouble = std::make_unique<CpuGenerator<double, mnd::ARM_NEON>>();
     }
     else
 #endif
     {
-        cpuGeneratorFloat = std::make_unique<CpuGeneratorFloat>();
-        cpuGeneratorDouble = std::make_unique<CpuGeneratorDouble>();
+        cpuGeneratorFloat = std::make_unique<CpuGenerator<float>>();
+        cpuGeneratorDouble = std::make_unique<CpuGenerator<double>>();
     }
 
-    cpuGenerator128 = std::make_unique<CpuGenerator128>();
+    cpuGenerator128 = std::make_unique<CpuGenerator<Fixed128>>();
 
     devices = createDevices();
 }