Nicolas Winkler 5 роки тому
батько
коміт
1717d444cb

+ 25 - 0
libmandel/include/CpuGenerators.h

@@ -124,6 +124,31 @@ public:
 };
 #endif
 
+
+template<bool parallel>
+class mnd::CpuGenerator<double, mnd::X86_AVX_FMA, parallel> : public Generator
+{
+public:
+    inline CpuGenerator(void) :
+        Generator{ mnd::getPrecision<double>() }
+    {
+    }
+    virtual void generate(const MandelInfo& info, float* data);
+};
+
+
+template<bool parallel>
+class mnd::CpuGenerator<float, mnd::X86_AVX_FMA, parallel> : public Generator
+{
+public:
+    inline CpuGenerator(void) :
+        Generator{ mnd::getPrecision<float>() }
+    {
+    }
+    virtual void generate(const MandelInfo& info, float* data);
+};
+
+
 template<bool parallel>
 class mnd::CpuGenerator<mnd::DoubleDouble, mnd::X86_AVX_FMA, parallel> : public Generator
 {

+ 2 - 0
libmandel/include/Mandel.h

@@ -31,12 +31,14 @@ enum class mnd::GeneratorType
     FLOAT,
     FLOAT_SSE2,
     FLOAT_AVX,
+    FLOAT_AVX_FMA,
     FLOAT_AVX512,
     FLOAT_NEON,
     DOUBLE_FLOAT,
     DOUBLE,
     DOUBLE_SSE2,
     DOUBLE_AVX,
+    DOUBLE_AVX_FMA,
     DOUBLE_AVX512,
     DOUBLE_NEON,
     DOUBLE_DOUBLE,

+ 45 - 6
libmandel/src/ClGenerators.cpp

@@ -110,7 +110,7 @@ void ClGenerator::generate(const mnd::MandelInfo& info, float* data)
     iterate.setArg(7, int(info.smooth ? 1 : 0));
 
     // TODO check for overflow
-    if (false && device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT>() == 4) {
+    if (true || device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT>() >= 4) {
         queue.enqueueNDRangeKernel(iterate, 0, NDRange(info.bWidth * info.bHeight / 4));
     } else {
         queue.enqueueNDRangeKernel(iterate, 0, NDRange(info.bWidth * info.bHeight));
@@ -143,6 +143,44 @@ std::string ClGeneratorFloat::getKernelCode(bool smooth) const
     return 
 //        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable"
         "__kernel void iterate(__global float* A, const int width, float xl, float yt, float pixelScaleX, float pixelScaleY, int max, int smooth) {"
+        "   int index = get_global_id(0) * 4;\n"
+        "   int x = index % width;"
+        "   int y = index / width;"
+        "   float4 a = (float4) (x * pixelScaleX + xl, (x + 1) * pixelScaleX + xl, (x + 2) * pixelScaleX + xl, (x + 3) * pixelScaleX + xl);"
+        "   float4 b = (float4) (y * pixelScaleY + yt);"
+        "   float4 ca = a;"
+        "   float4 cb = b;"
+        "   float4 resa = a;"
+        "   float4 resb = b;"
+        "   int4 count = (int4)(0);"
+        ""
+        "   int n = 0;"
+        "   while (n < max) {"
+//        "       float aa = a * a;"
+//        "       float bb = b * b;"
+        "       float4 ab = a * b;\n"
+        "       float4 cmpVal = fma(a, a, b * b);\n"
+        "       int4 cmp = isless(cmpVal, (float4)(16.0f));\n"
+        "       if (!any(cmp)) break;\n"
+        "       a = fma(a, a, -fma(b, b, -ca));\n"
+        "       b = fma(2, ab, cb);\n"
+        "       if (smooth) {\n"
+        "           resa = as_float4(as_int4(a) & cmp | (as_int4(resa) & ~cmp));"
+        "           resb = as_float4(as_int4(b) & cmp | (as_int4(resb) & ~cmp));"
+        "       }\n"
+        "       count += cmp & (int4)(1);\n"
+        "       n++;"
+        "   }\n"
+        "   for (int i = 0; i < 4 && i + x < width; i++) {"
+        "       if (smooth != 0)\n"
+        "           A[index + i] = ((float) count[i]) + 1 - log(log(fma(resa[i], resa[i], resb[i] * resb[i])) / 2) / log(2.0f);\n"
+        "      else\n"
+        "          A[index + i] = ((float) count[i]);\n"
+        "   }"
+        "}";
+        /*
+//        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable"
+        "__kernel void iterate(__global float* A, const int width, float xl, float yt, float pixelScaleX, float pixelScaleY, int max, int smooth) {"
         "   int index = get_global_id(0);\n"
         "   int x = index % width;"
         "   int y = index / width;"
@@ -153,12 +191,12 @@ std::string ClGeneratorFloat::getKernelCode(bool smooth) const
         ""
         "   int n = 0;"
         "   while (n < max - 1) {"
-        "       float aa = a * a;"
-        "       float bb = b * b;"
+//        "       float aa = a * a;"
+//        "       float bb = b * b;"
         "       float ab = a * b;"
-        "       if (aa + bb > 16) break;"
-        "       a = aa - bb + ca;"
-        "       b = ab + ab + cb;"
+        "       if (fma(a, a, b * b) > 16) break;"
+        "       a = fma(a, a, -fma(b, b, -ca));"
+        "       b = fma(2, ab, cb);"
         "       n++;"
         "   }\n"
         "   if (n >= max - 1)\n"
@@ -170,6 +208,7 @@ std::string ClGeneratorFloat::getKernelCode(bool smooth) const
         "           A[index] = ((float)n);\n"
         "   }"
         "}";
+        */
 }
 
 

+ 14 - 16
libmandel/src/CpuGeneratorsAVX.cpp

@@ -26,6 +26,10 @@ void CpuGenerator<float, mnd::X86_AVX, parallel>::generate(const mnd::MandelInfo
 {
     using T = float;
     const MandelViewport& view = info.view;
+    const float dppf = float(view.width / info.bWidth);
+    const float viewxf = float(view.x);
+    __m256 viewx = { viewxf, viewxf, viewxf, viewxf, viewxf, viewxf, viewxf, viewxf };
+    __m256 dpp = { dppf, dppf, dppf, dppf, dppf, dppf, dppf, dppf };
 
     if constexpr(parallel)
         omp_set_num_threads(omp_get_num_procs());
@@ -35,16 +39,9 @@ void CpuGenerator<float, mnd::X86_AVX, parallel>::generate(const mnd::MandelInfo
         __m256 ys = {y, y, y, y, y, y, y, y};
         long i = 0;
         for (i; i < info.bWidth; i += 8) {
-            __m256 xs = {
-                float(view.x + float(i) * view.width / info.bWidth),
-                float(view.x + float(i + 1) * view.width / info.bWidth),
-                float(view.x + float(i + 2) * view.width / info.bWidth),
-                float(view.x + float(i + 3) * view.width / info.bWidth),
-                float(view.x + float(i + 4) * view.width / info.bWidth),
-                float(view.x + float(i + 5) * view.width / info.bWidth),
-                float(view.x + float(i + 6) * view.width / info.bWidth),
-                float(view.x + float(i + 7) * view.width / info.bWidth)
-            };
+            __m256 pixc = { float(i), float(i + 1), float(i + 2), float(i + 3), float(i + 4), float(i + 5), float(i + 6), float(i + 7) };
+
+            __m256 xs = _mm256_add_ps(_mm256_mul_ps(dpp, pixc), viewx);
 
             __m256 counter = { 0, 0, 0, 0, 0, 0, 0, 0 };
             __m256 adder = { 1, 1, 1, 1, 1, 1, 1, 1 };
@@ -108,6 +105,11 @@ void CpuGenerator<double, mnd::X86_AVX, parallel>::generate(const mnd::MandelInf
     using T = double;
     const MandelViewport& view = info.view;
 
+    const double dppf = double(view.width / info.bWidth);
+    const double viewxf = double(view.x);
+    __m256d viewx = { viewxf, viewxf, viewxf, viewxf };
+    __m256d dpp = { dppf, dppf, dppf, dppf };
+
     if constexpr(parallel)
         omp_set_num_threads(omp_get_num_procs());
 #pragma omp parallel for schedule(static, 1) if (parallel)
@@ -116,12 +118,8 @@ void CpuGenerator<double, mnd::X86_AVX, parallel>::generate(const mnd::MandelInf
         __m256d ys = { y, y, y, y };
         long i = 0;
         for (i; i < info.bWidth; i += 4) {
-            __m256d xs = {
-                double(view.x + double(i) * view.width / info.bWidth),
-                double(view.x + double(i + 1) * view.width / info.bWidth),
-                double(view.x + double(i + 2) * view.width / info.bWidth),
-                double(view.x + double(i + 3) * view.width / info.bWidth)
-            };
+            __m256d pixc = { double(i), double(i + 1), double(i + 2), double(i + 3) };
+            __m256d xs = _mm256_add_pd(_mm256_mul_pd(dpp, pixc), viewx);
 
             int itRes[4] = { 0, 0, 0, 0 };
 

+ 175 - 0
libmandel/src/CpuGeneratorsAVXFMA.cpp

@@ -11,11 +11,186 @@ using mnd::CpuGenerator;
 
 namespace mnd
 {
+    template class CpuGenerator<float, mnd::X86_AVX_FMA, false>;
+    template class CpuGenerator<float, mnd::X86_AVX_FMA, true>;
+
+    template class CpuGenerator<double, mnd::X86_AVX_FMA, false>;
+    template class CpuGenerator<double, mnd::X86_AVX_FMA, true>;
+
     template class CpuGenerator<DoubleDouble, mnd::X86_AVX_FMA, false>;
     template class CpuGenerator<DoubleDouble, mnd::X86_AVX_FMA, true>;
 }
 
 
+template<bool parallel>
+void CpuGenerator<float, mnd::X86_AVX_FMA, parallel>::generate(const mnd::MandelInfo& info, float* data)
+{
+    using T = float;
+    const MandelViewport& view = info.view;
+    const float dppf = float(view.width / info.bWidth);
+    const float viewxf = float(view.x);
+    __m256 viewx = { viewxf, viewxf, viewxf, viewxf, viewxf, viewxf, viewxf, viewxf };
+    __m256 dpp = { dppf, dppf, dppf, dppf, dppf, dppf, dppf, dppf };
+
+    if constexpr(parallel)
+        omp_set_num_threads(omp_get_num_procs());
+#pragma omp parallel for schedule(static, 1) if (parallel)
+    for (long j = 0; j < info.bHeight; j++) {
+        T y = T(view.y) + T(j) * T(view.height / info.bHeight);
+        __m256 ys = {y, y, y, y, y, y, y, y};
+        long i = 0;
+        for (i; i < info.bWidth; i += 8) {
+            __m256 pixc = { float(i), float(i + 1), float(i + 2), float(i + 3), float(i + 4), float(i + 5), float(i + 6), float(i + 7) };
+            __m256 xs = _mm256_fmadd_ps(dpp, pixc, viewx);
+
+            __m256 counter = { 0, 0, 0, 0, 0, 0, 0, 0 };
+            __m256 adder = { 1, 1, 1, 1, 1, 1, 1, 1 };
+            __m256 two = { 2, 2, 2, 2, 2, 2, 2, 2 };
+            __m256 resultsa = { 0, 0, 0, 0, 0, 0, 0, 0 };
+            __m256 resultsb = { 0, 0, 0, 0, 0, 0, 0, 0 };
+
+            __m256 threshold = { 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f };
+
+            __m256 a = xs;
+            __m256 b = ys;
+
+            for (int k = 0; k < info.maxIter; k++) {
+                if ((k & 0xF) == 0) {
+                    __m256 bb = _mm256_mul_ps(b, b);
+                    __m256 abab = _mm256_mul_ps(a, b); //abab = _mm256_add_ps(abab, abab);
+                    a = _mm256_fmsub_ps(a, a, _mm256_fmsub_ps(b, b, xs));
+                    b = _mm256_fmadd_ps(two, abab, ys);
+                    __m256 cmp = _mm256_cmp_ps(_mm256_fmadd_ps(a, a, _mm256_mul_ps(b, b)), threshold, _CMP_LE_OQ);
+                    if (info.smooth) {
+                        resultsa = _mm256_or_ps(_mm256_andnot_ps(cmp, resultsa), _mm256_and_ps(cmp, a));
+                        resultsb = _mm256_or_ps(_mm256_andnot_ps(cmp, resultsb), _mm256_and_ps(cmp, b));
+                    }
+                    adder = _mm256_and_ps(adder, cmp);
+                    counter = _mm256_add_ps(counter, adder);
+                    if (_mm256_testz_ps(cmp, cmp) != 0) {
+                        break;
+                    }
+                }
+                else {
+                    //__m256 aa = _mm256_mul_ps(a, a);
+                    __m256 bb = _mm256_mul_ps(b, b);
+                    __m256 abab = _mm256_mul_ps(a, b); //abab = _mm256_add_ps(abab, abab);
+                    a = _mm256_fmsub_ps(a, a, _mm256_fmsub_ps(b, b, xs));
+                    b = _mm256_fmadd_ps(two, abab, ys);
+                    __m256 cmp = _mm256_cmp_ps(_mm256_fmadd_ps(a, a, _mm256_mul_ps(b, b)), threshold, _CMP_LE_OQ);
+                    if (info.smooth) {
+                        resultsa = _mm256_or_ps(_mm256_andnot_ps(cmp, resultsa), _mm256_and_ps(cmp, a));
+                        resultsb = _mm256_or_ps(_mm256_andnot_ps(cmp, resultsb), _mm256_and_ps(cmp, b));
+                    }
+                    adder = _mm256_and_ps(adder, cmp);
+                    counter = _mm256_add_ps(counter, adder);
+                }
+            }
+
+            auto alignVec = [](float* data) -> float* {
+                void* aligned = data;
+                ::size_t length = 64;
+                std::align(32, 8 * sizeof(float), aligned, length);
+                return static_cast<float*>(aligned);
+            };
+
+            float resData[16];
+            float* ftRes = alignVec(resData);
+            float* resa = (float*) &resultsa;
+            float* resb = (float*) &resultsb;
+
+            _mm256_store_ps(ftRes, counter);
+            for (int k = 0; k < 8 && i + k < info.bWidth; k++) {
+                if (info.smooth) {
+                    data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter :
+                        ftRes[k] >= info.maxIter ? info.maxIter :
+                        ((float)ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f);
+                }
+                else {
+                    data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter : ftRes[k];
+                }
+            }
+        }
+    }
+}
+
+
+template<bool parallel>
+void CpuGenerator<double, mnd::X86_AVX_FMA, parallel>::generate(const mnd::MandelInfo& info, float* data)
+{
+    using T = double;
+    const MandelViewport& view = info.view;
+
+    const double dppf = double(view.width / info.bWidth);
+    const double viewxf = double(view.x);
+    __m256d viewx = { viewxf, viewxf, viewxf, viewxf };
+    __m256d dpp = { dppf, dppf, dppf, dppf };
+
+    if constexpr(parallel)
+        omp_set_num_threads(omp_get_num_procs());
+#pragma omp parallel for schedule(static, 1) if (parallel)
+    for (long j = 0; j < info.bHeight; j++) {
+        T y = T(view.y + T(j) * view.height / info.bHeight);
+        __m256d ys = { y, y, y, y };
+        long i = 0;
+        for (i; i < info.bWidth; i += 4) {
+            __m256d pixc = { double(i), double(i + 1), double(i + 2), double(i + 3) };
+            __m256d xs = _mm256_fmadd_pd(dpp, pixc, viewx);
+
+            int itRes[4] = { 0, 0, 0, 0 };
+
+            __m256d threshold = { 16.0, 16.0, 16.0, 16.0 };
+            __m256d counter = { 0, 0, 0, 0 };
+            __m256d adder = { 1, 1, 1, 1 };
+            __m256d two = { 2, 2, 2, 2 };
+
+            __m256d resultsa = { 0, 0, 0, 0 };
+            __m256d resultsb = { 0, 0, 0, 0 };
+
+            __m256d a = xs;
+            __m256d b = ys;
+
+            for (int k = 0; k < info.maxIter; k++) {
+                __m256d ab = _mm256_mul_pd(a, b);
+                a = _mm256_fmsub_pd(a, a, _mm256_fmsub_pd(b, b, xs));
+                b = _mm256_fmadd_pd(two, ab, ys);
+                __m256d cmp = _mm256_cmp_pd(_mm256_fmadd_pd(a, a, _mm256_mul_pd(b, b)), threshold, _CMP_LE_OQ);
+                if (info.smooth) {
+                    resultsa = _mm256_or_pd(_mm256_andnot_pd(cmp, resultsa), _mm256_and_pd(cmp, a));
+                    resultsb = _mm256_or_pd(_mm256_andnot_pd(cmp, resultsb), _mm256_and_pd(cmp, b));
+                }
+                adder = _mm256_and_pd(adder, cmp);
+                counter = _mm256_add_pd(counter, adder);
+                if ((k & 0x7) == 0 && _mm256_testz_si256(_mm256_castpd_si256(cmp), _mm256_castpd_si256(cmp)) != 0) {
+                    break;
+                }
+            }
+
+            auto alignVec = [](double* data) -> double* {
+                void* aligned = data;
+                ::size_t length = 64;
+                std::align(32, 4 * sizeof(double), aligned, length);
+                return static_cast<double*>(aligned);
+            };
+
+            double resData[8];
+            double* ftRes = alignVec(resData);
+            double* resa = (double*) &resultsa;
+            double* resb = (double*) &resultsb;
+            _mm256_store_pd(ftRes, counter);
+            for (int k = 0; k < 4 && i + k < info.bWidth; k++) {
+                if (info.smooth)
+                    data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter :
+                        ftRes[k] >= info.maxIter ? info.maxIter :
+                        ((float)ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f);
+                else
+                    data[i + k + j * info.bWidth] = ftRes[k] > 0 ? float(ftRes[k]) : info.maxIter;
+            }
+        }
+    }
+}
+
+
 struct VecPair
 {
     __m256d a;

+ 6 - 0
libmandel/src/Mandel.cpp

@@ -27,12 +27,14 @@ static const std::map<mnd::GeneratorType, std::string> typeNames =
     { mnd::GeneratorType::FLOAT, "float" },
     { mnd::GeneratorType::FLOAT_SSE2, "float SSE2" },
     { mnd::GeneratorType::FLOAT_AVX, "float AVX" },
+    { mnd::GeneratorType::FLOAT_AVX_FMA, "float AVX+FMA" },
     { mnd::GeneratorType::FLOAT_AVX512, "float AVX512" },
     { mnd::GeneratorType::FLOAT_NEON, "float NEON" },
     { mnd::GeneratorType::DOUBLE_FLOAT, "double float" },
     { mnd::GeneratorType::DOUBLE, "double" },
     { mnd::GeneratorType::DOUBLE_SSE2, "double SSE2" },
     { mnd::GeneratorType::DOUBLE_AVX, "double AVX" },
+    { mnd::GeneratorType::DOUBLE_AVX_FMA, "double AVX+FMA" },
     { mnd::GeneratorType::DOUBLE_AVX512, "double AVX512" },
     { mnd::GeneratorType::DOUBLE_NEON, "double NEON" },
     { mnd::GeneratorType::DOUBLE_DOUBLE, "double double" },
@@ -111,7 +113,11 @@ MandelContext::MandelContext(void)
         cpuGenerators.insert({ GeneratorType::DOUBLE_AVX, std::move(db) });
         cpuGenerators.insert({ GeneratorType::DOUBLE_DOUBLE_AVX, std::move(ddb) });
         if (cpuInfo.hasFma()) {
+            auto favxfma = std::make_unique<CpuGenerator<float, mnd::X86_AVX_FMA, true>>();
+            auto davxfma = std::make_unique<CpuGenerator<double, mnd::X86_AVX_FMA, true>>();
             auto ddavx = std::make_unique<CpuGenerator<DoubleDouble, mnd::X86_AVX_FMA, true>>();
+            cpuGenerators.insert({ GeneratorType::FLOAT_AVX_FMA, std::move(favxfma) });
+            cpuGenerators.insert({ GeneratorType::DOUBLE_AVX_FMA, std::move(davxfma) });
             cpuGenerators.insert({ GeneratorType::DOUBLE_DOUBLE_AVX_FMA, std::move(ddavx) });
         }
     }