Ver código fonte

optimized for avx512

Nicolas Winkler 5 anos atrás
pai
commit
9abe4b0248

+ 39 - 39
choosegenerators.cpp

@@ -17,45 +17,45 @@ mnd::MandelViewport Benchmarker::benchViewport(void)
 
 
 const std::vector<mnd::MandelInfo> Benchmarker::benches {
-    mnd::MandelInfo{ benchViewport(), 25, 25, 15, false },
-    mnd::MandelInfo{ benchViewport(), 25, 25, 25, false },
-    mnd::MandelInfo{ benchViewport(), 25, 25, 75, false },
-    mnd::MandelInfo{ benchViewport(), 25, 25, 125, false },
-    mnd::MandelInfo{ benchViewport(), 25, 25, 250, false },
-    mnd::MandelInfo{ benchViewport(), 50, 25, 250, false },
-    mnd::MandelInfo{ benchViewport(), 50, 50, 250, false },
-    mnd::MandelInfo{ benchViewport(), 50, 50, 500, false },
-    mnd::MandelInfo{ benchViewport(), 50, 100, 500, false },
-    mnd::MandelInfo{ benchViewport(), 100, 100, 500, false },
-    mnd::MandelInfo{ benchViewport(), 100, 100, 1000, false },
-    mnd::MandelInfo{ benchViewport(), 100, 200, 1000, false },
-    mnd::MandelInfo{ benchViewport(), 200, 200, 1000, false },
-    mnd::MandelInfo{ benchViewport(), 200, 200, 2000, false },
-    mnd::MandelInfo{ benchViewport(), 200, 400, 2000, false },
-    mnd::MandelInfo{ benchViewport(), 400, 400, 2000, false },
-    mnd::MandelInfo{ benchViewport(), 400, 400, 4000, false },
-    mnd::MandelInfo{ benchViewport(), 400, 800, 4000, false },
-    mnd::MandelInfo{ benchViewport(), 800, 800, 4000, false },
-    mnd::MandelInfo{ benchViewport(), 800, 800, 8000, false },
-    mnd::MandelInfo{ benchViewport(), 800, 800, 16000, false },
-    mnd::MandelInfo{ benchViewport(), 800, 1600, 16000, false },
-    mnd::MandelInfo{ benchViewport(), 1600, 1600, 16000, false },
-    mnd::MandelInfo{ benchViewport(), 1600, 1600, 32000, false },
-    mnd::MandelInfo{ benchViewport(), 1600, 1600, 64000, false },
-    mnd::MandelInfo{ benchViewport(), 1600, 1600, 128000, false },
-    mnd::MandelInfo{ benchViewport(), 1600, 1600, 256000, false },
-    mnd::MandelInfo{ benchViewport(), 1600, 1600, 512000, false },
-    mnd::MandelInfo{ benchViewport(), 1600, 1600, 1024000, false },
-    mnd::MandelInfo{ benchViewport(), 1600, 1600, 4096000, false },
-    mnd::MandelInfo{ benchViewport(), 1600, 1600, 8192000, false },
-    mnd::MandelInfo{ benchViewport(), 1600, 1600, 16384000, false },
-    mnd::MandelInfo{ benchViewport(), 1600, 1600, 32768000, false },
-    mnd::MandelInfo{ benchViewport(), 1600, 1600, 65536000, false },
-    mnd::MandelInfo{ benchViewport(), 1600, 1600, 131072000, false },
-    mnd::MandelInfo{ benchViewport(), 1600, 1600, 262144000, false },
-    mnd::MandelInfo{ benchViewport(), 1600, 1600, 524288000, false },
-    mnd::MandelInfo{ benchViewport(), 1600, 1600, 1048576000, false },
-    mnd::MandelInfo{ benchViewport(), 1600, 1600, 2097152000, false },
+    mnd::MandelInfo{ benchViewport(), 32, 32, 15, false },
+    mnd::MandelInfo{ benchViewport(), 32, 32, 25, false },
+    mnd::MandelInfo{ benchViewport(), 32, 32, 75, false },
+    mnd::MandelInfo{ benchViewport(), 32, 32, 125, false },
+    mnd::MandelInfo{ benchViewport(), 32, 32, 250, false },
+    mnd::MandelInfo{ benchViewport(), 64, 32, 250, false },
+    mnd::MandelInfo{ benchViewport(), 64, 64, 250, false },
+    mnd::MandelInfo{ benchViewport(), 64, 64, 500, false },
+    mnd::MandelInfo{ benchViewport(), 64, 128, 500, false },
+    mnd::MandelInfo{ benchViewport(), 128, 128, 500, false },
+    mnd::MandelInfo{ benchViewport(), 128, 128, 1000, false },
+    mnd::MandelInfo{ benchViewport(), 128, 256, 1000, false },
+    mnd::MandelInfo{ benchViewport(), 256, 256, 1000, false },
+    mnd::MandelInfo{ benchViewport(), 256, 256, 2000, false },
+    mnd::MandelInfo{ benchViewport(), 256, 512, 2000, false },
+    mnd::MandelInfo{ benchViewport(), 512, 512, 2000, false },
+    mnd::MandelInfo{ benchViewport(), 512, 512, 4000, false },
+    mnd::MandelInfo{ benchViewport(), 512, 1024, 4000, false },
+    mnd::MandelInfo{ benchViewport(), 1024, 1024, 4000, false },
+    mnd::MandelInfo{ benchViewport(), 1024, 1024, 8000, false },
+    mnd::MandelInfo{ benchViewport(), 1024, 1024, 16000, false },
+    mnd::MandelInfo{ benchViewport(), 1024, 2048, 16000, false },
+    mnd::MandelInfo{ benchViewport(), 2048, 2048, 16000, false },
+    mnd::MandelInfo{ benchViewport(), 2048, 2048, 32000, false },
+    mnd::MandelInfo{ benchViewport(), 2048, 2048, 64000, false },
+    mnd::MandelInfo{ benchViewport(), 2048, 2048, 128000, false },
+    mnd::MandelInfo{ benchViewport(), 2048, 2048, 256000, false },
+    mnd::MandelInfo{ benchViewport(), 2048, 2048, 512000, false },
+    mnd::MandelInfo{ benchViewport(), 2048, 2048, 1024000, false },
+    mnd::MandelInfo{ benchViewport(), 2048, 2048, 4096000, false },
+    mnd::MandelInfo{ benchViewport(), 2048, 2048, 8192000, false },
+    mnd::MandelInfo{ benchViewport(), 2048, 2048, 16384000, false },
+    mnd::MandelInfo{ benchViewport(), 2048, 2048, 32768000, false },
+    mnd::MandelInfo{ benchViewport(), 2048, 2048, 65536000, false },
+    mnd::MandelInfo{ benchViewport(), 2048, 2048, 131072000, false },
+    mnd::MandelInfo{ benchViewport(), 2048, 2048, 262144000, false },
+    mnd::MandelInfo{ benchViewport(), 2048, 2048, 524288000, false },
+    mnd::MandelInfo{ benchViewport(), 2048, 2048, 1048576000, false },
+    mnd::MandelInfo{ benchViewport(), 2048, 2048, 2097152000, false },
 };
 
 

+ 1 - 1
libmandel/CMakeLists.txt

@@ -39,7 +39,7 @@ SET(MandelSources
 FILE(GLOB MandelHeaders include/*.h)
 
 if (ARCH STREQUAL "X86_64" OR ARCH STREQUAL "X86")
-    list(APPEND MandelSources src/CpuGeneratorsAVX512.cpp  src/CpuGeneratorsAVX.cpp src/CpuGeneratorsAVXFMA.cpp src/CpuGeneratorsSSE2.cpp)
+    list(APPEND MandelSources src/CpuGeneratorsAVX.cpp src/CpuGeneratorsAVXFMA.cpp src/CpuGeneratorsSSE2.cpp src/CpuGeneratorsAVX512.cpp)
 elseif(ARCH STREQUAL "ARM")
     list(APPEND MandelSources src/CpuGeneratorsNeon.cpp)
 endif()

+ 122 - 61
libmandel/src/CpuGeneratorsAVX512.cpp

@@ -43,106 +43,167 @@ void CpuGenerator<float, mnd::X86_AVX_512, parallel>::generate(const mnd::Mandel
 
             __m512 counter = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
             __m512 adder = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
+            __m512 two = { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2  };
+            __m512 resultsa = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+            __m512 resultsb = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
 
             __m512 threshold = { 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f,
                                  16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f };
 
-            __m512 ys = { y, y, y, y, y, y, y, y, y, y, y, y, y, y, y };
             __m512 a = xs;
             __m512 b = ys;
 
-            for (int k = 0; k < info.maxIter; k++) {
-                __m512 aa = _mm512_mul_ps(a, a);
-                __m512 bb = _mm512_mul_ps(b, b);
-                __m512 abab = _mm512_mul_ps(a, b); abab = _mm512_add_ps(abab, abab);
-                a = _mm512_add_ps(_mm512_sub_ps(aa, bb), xs);
-                b = _mm512_add_ps(abab, ys);
-                __mmask16 cmp = _mm512_cmp_ps_mask(_mm512_add_ps(aa, bb), threshold, _CMP_LE_OQ);
-                counter = _mm512_mask_add_ps(counter, cmp, counter, adder);
-                if (cmp == 0) {
-                    break;
+            if (info.smooth) {
+                for (int k = 0; k < info.maxIter; k++) {
+                    __m512 aa = _mm512_mul_ps(a, a);
+                    __m512 abab = _mm512_mul_ps(a, b);
+                    __mmask16 cmp = _mm512_cmp_ps_mask(_mm512_fmadd_ps(b, b, aa), threshold, _CMP_LE_OQ);
+                    a = _mm512_sub_ps(aa, _mm512_fmsub_ps(b, b, xs));
+                    b = _mm512_fmadd_ps(two, abab, ys);
+                    counter = _mm512_mask_add_ps(counter, cmp, counter, adder);
+                    resultsa = _mm512_mask_blend_ps(cmp, resultsa, a);
+                    resultsb = _mm512_mask_blend_ps(cmp, resultsb, b);
+                    if (cmp == 0) {
+                        break;
+                    }
+                }
+            }
+            else {
+                for (int k = 0; k < info.maxIter; k++) {
+                    __m512 aa = _mm512_mul_ps(a, a);
+                    __m512 abab = _mm512_mul_ps(a, b);
+                    __mmask16 cmp = _mm512_cmp_ps_mask(_mm512_fmadd_ps(b, b, aa), threshold, _CMP_LE_OQ);
+                    a = _mm512_sub_ps(aa, _mm512_fmsub_ps(b, b, xs));
+                    b = _mm512_fmadd_ps(two, abab, ys);
+                    counter = _mm512_mask_add_ps(counter, cmp, counter, adder);
+                    if (cmp == 0) {
+                        break;
+                    }
                 }
             }
 
             auto alignVec = [](float* data) -> float* {
                 void* aligned = data;
-                ::size_t length = 64;
-                std::align(32, 16 * sizeof(float), aligned, length);
+                ::size_t length = 64 * sizeof(float);
+                std::align(64, 48 * sizeof(float), aligned, length);
                 return static_cast<float*>(aligned);
             };
 
-            float resData[32];
+            float resData[64];
             float* ftRes = alignVec(resData);
-
+            float* resa = ftRes + 16;
+            float* resb = ftRes + 32;
             _mm512_store_ps(ftRes, counter);
-            for (int k = 0; k < 8 && i + k < info.bWidth; k++)
-                data[i + k + j * info.bWidth] = ftRes[k] > 0 ? ftRes[k] : info.maxIter;
+            if (info.smooth) {
+                _mm512_store_ps(resa, resultsa);
+                _mm512_store_ps(resb, resultsb);
+            }
+            for (int k = 0; k < 16 && i + k < info.bWidth; k++) {
+                if (info.smooth) {
+                    data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter :
+                        ftRes[k] >= info.maxIter ? info.maxIter :
+                        ((float)ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f);
+                }
+                else {
+                    data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter : ftRes[k];
+                }
+            }
         }
     }
 }
 
 
+
 template<bool parallel>
 void CpuGenerator<double, mnd::X86_AVX_512, parallel>::generate(const mnd::MandelInfo& info, float* data)
 {
-    using T = float;
+    using T = double;
     const MandelViewport& view = info.view;
 
-    const float dppf = float(view.width / info.bWidth);
-    const float viewxf = float(view.x);
-    __m512 viewx = { viewxf, viewxf, viewxf, viewxf, viewxf, viewxf, viewxf, viewxf,
-                     viewxf, viewxf, viewxf, viewxf, viewxf, viewxf, viewxf, viewxf };
-    __m512 dpp = { dppf, dppf, dppf, dppf, dppf, dppf, dppf, dppf,
-                   dppf, dppf, dppf, dppf, dppf, dppf, dppf, dppf };
+    const double dppf = double(view.width / info.bWidth);
+    const double viewxf = double(view.x);
+    __m512d viewx = { viewxf, viewxf, viewxf, viewxf, viewxf, viewxf, viewxf, viewxf };
+    __m512d dpp = { dppf, dppf, dppf, dppf, dppf, dppf, dppf, dppf };
 
     if constexpr(parallel)
         omp_set_num_threads(omp_get_num_procs());
 #pragma omp parallel for schedule(static, 1) if (parallel)
     for (long j = 0; j < info.bHeight; j++) {
         T y = T(view.y + double(j) * view.height / info.bHeight);
-        __m512 ys = { y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y };
+        __m512d ys = { y, y, y, y, y, y, y, y };
         long i = 0;
-        for (i; i < info.bWidth; i += 16) {
-            __m512 pixc = { float(i), float(i + 1), float(i + 2), float(i + 3), float(i + 4), float(i + 5), float(i + 6), float(i + 7),
-                            float(i + 8), float(i + 9), float(i + 10), float(i + 11), float(i + 12), float(i + 13), float(i + 14), float(i + 15) };
-            __m512 xs = _mm512_fmadd_ps(dpp, pixc, viewx);
-
-            __m512 counter = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
-            __m512 adder = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
-
-            __m512 threshold = { 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f,
-                                 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f };
-
-            __m512 ys = { y, y, y, y, y, y, y, y, y, y, y, y, y, y, y };
-            __m512 a = xs;
-            __m512 b = ys;
-
-            for (int k = 0; k < info.maxIter; k++) {
-                __m512 aa = _mm512_mul_ps(a, a);
-                __m512 bb = _mm512_mul_ps(b, b);
-                __m512 abab = _mm512_mul_ps(a, b); abab = _mm512_add_ps(abab, abab);
-                a = _mm512_add_ps(_mm512_sub_ps(aa, bb), xs);
-                b = _mm512_add_ps(abab, ys);
-                __mmask16 cmp = _mm512_cmp_ps_mask(_mm512_add_ps(aa, bb), threshold, _CMP_LE_OQ);
-                counter = _mm512_mask_add_ps(counter, cmp, counter, adder);
-                if (cmp == 0) {
-                    break;
+        for (i; i < info.bWidth; i += 8) {
+            __m512d pixc = { double(i), double(i + 1), double(i + 2), double(i + 3), double(i + 4), double(i + 5), double(i + 6), double(i + 7) };
+            __m512d xs = _mm512_fmadd_pd(dpp, pixc, viewx);
+
+            __m512d counter = { 0, 0, 0, 0, 0, 0, 0, 0 };
+            __m512d adder = { 1, 1, 1, 1, 1, 1, 1, 1 };
+            __m512d two = { 2, 2, 2, 2, 2, 2, 2, 2 };
+            __m512d resultsa = { 0, 0, 0, 0, 0, 0, 0, 0 };
+            __m512d resultsb = { 0, 0, 0, 0, 0, 0, 0, 0 };
+
+            __m512d threshold = { 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f };
+
+            __m512d a = xs;
+            __m512d b = ys;
+
+            if (info.smooth) {
+                for (int k = 0; k < info.maxIter; k++) {
+                    __m512d aa = _mm512_mul_pd(a, a);
+                    __m512d ab = _mm512_mul_pd(a, b);
+                    __mmask8 cmp = _mm512_cmp_pd_mask(_mm512_fmadd_pd(b, b, aa), threshold, _CMP_LE_OQ);
+                    a = _mm512_sub_pd(aa, _mm512_fmsub_pd(b, b, xs));
+                    b = _mm512_fmadd_pd(two, ab, ys);
+                    resultsa = _mm512_mask_blend_pd(cmp, resultsa, a);
+                    resultsb = _mm512_mask_blend_pd(cmp, resultsb, b);
+                    counter = _mm512_mask_add_pd(counter, cmp, counter, adder);
+                    if (cmp == 0) {
+                        break;
+                    }
+                }
+            }
+            else {
+                for (int k = 0; k < info.maxIter; k++) {
+                    __m512d aa = _mm512_mul_pd(a, a);
+                    __m512d ab = _mm512_mul_pd(a, b);
+                    __mmask8 cmp = _mm512_cmp_pd_mask(_mm512_fmadd_pd(b, b, aa), threshold, _CMP_LE_OQ);
+                    a = _mm512_sub_pd(aa, _mm512_fmsub_pd(b, b, xs));
+                    b = _mm512_fmadd_pd(two, ab, ys);
+                    counter = _mm512_mask_add_pd(counter, cmp, counter, adder);
+                    if (cmp == 0) {
+                        break;
+                    }
                 }
             }
 
-            auto alignVec = [](float* data) -> float* {
+            auto alignVec = [](double* data) -> double* {
                 void* aligned = data;
-                ::size_t length = 64;
-                std::align(32, 16 * sizeof(float), aligned, length);
-                return static_cast<float*>(aligned);
+                ::size_t length = 32 * sizeof(double);
+                std::align(64, 24 * sizeof(double), aligned, length);
+                return static_cast<double*>(aligned);
             };
 
-            float resData[32];
-            float* ftRes = alignVec(resData);
-
-            _mm512_store_ps(ftRes, counter);
-            for (int k = 0; k < 8 && i + k < info.bWidth; k++)
-                data[i + k + j * info.bWidth] = ftRes[k] > 0 ? ftRes[k] : info.maxIter;
+            double resData[64];
+            double* ftRes = alignVec(resData);
+            double* resa = ftRes + 8;
+            double* resb = ftRes + 16;
+            _mm512_store_pd(ftRes, counter);
+            if (info.smooth) {
+                _mm512_store_pd(resa, resultsa);
+                _mm512_store_pd(resb, resultsb);
+            }
+            for (int k = 0; k < 8 && i + k < info.bWidth; k++) {
+                if (info.smooth) {
+                    data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter :
+                        ftRes[k] >= info.maxIter ? info.maxIter :
+                        ((float)ftRes[k]) + 1 - ::log(::log((float) (resa[k] * resa[k] + resb[k] * resb[k])) / 2) / ::log(2.0f);
+                }
+                else {
+                    data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter : ftRes[k];
+                }
+            }
         }
     }
-}
+}
+
+

+ 10 - 0
libmandel/src/Mandel.cpp

@@ -105,6 +105,12 @@ MandelContext::MandelContext(void)
 {
 
 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) 
+    if (cpuInfo.hasAvx512()) {
+        auto fl = std::make_unique<CpuGenerator<float, mnd::X86_AVX_512, true>>();
+        auto db = std::make_unique<CpuGenerator<double, mnd::X86_AVX_512, true>>();
+        cpuGenerators.insert({ GeneratorType::FLOAT_AVX512, std::move(fl) });
+        cpuGenerators.insert({ GeneratorType::DOUBLE_AVX512, std::move(db) });
+    }
     if (cpuInfo.hasAvx()) {
         auto fl = std::make_unique<CpuGenerator<float, mnd::X86_AVX, true>>();
         auto db = std::make_unique<CpuGenerator<double, mnd::X86_AVX, true>>();
@@ -190,6 +196,10 @@ std::unique_ptr<mnd::AdaptiveGenerator> MandelContext::createAdaptiveGenerator(v
         doubleGen = getCpuGenerator(GeneratorType::DOUBLE_AVX_FMA);
         doubleDoubleGen = getCpuGenerator(GeneratorType::DOUBLE_DOUBLE_AVX_FMA);
     }
+    if (cpuInfo.hasAvx512()) {
+        floatGen = getCpuGenerator(GeneratorType::FLOAT_AVX512);
+        doubleGen = getCpuGenerator(GeneratorType::DOUBLE_AVX512);
+    }
 
     if (cpuInfo.hasNeon()) {
         floatGen = getCpuGenerator(GeneratorType::FLOAT_NEON);