Browse Source

Made the AVX-FMA generator kernels extern and added a HexDouble AVX-FMA generator

Nicolas Winkler 4 years ago
parent
commit
db5304cabc

+ 12 - 0
libmandel/include/CpuGenerators.h

@@ -155,6 +155,18 @@ public:
 
 
 
 
 template<bool parallel>
 template<bool parallel>
+// CPU generator specialization: HexDouble precision computed with AVX + FMA
+// instructions. The actual pixel loop lives in the externally compiled
+// generateHexDoubleAvxFma() kernel (CpuGeneratorsAVXFMA.cpp); this class only
+// adapts the MandelGenerator interface to it.
+class mnd::CpuGenerator<mnd::HexDouble, mnd::X86_AVX_FMA, parallel> : public MandelGenerator
+{
+public:
+    inline CpuGenerator(void) :
+        MandelGenerator{ mnd::Precision::HEX_DOUBLE, mnd::X86_AVX_FMA }
+    {
+    }
+    // Renders the fractal described by info into data, a row-major float
+    // buffer of info.bWidth * info.bHeight iteration counts.
+    virtual void generate(const MandelInfo& info, float* data);
+};
+
+
+template<bool parallel>
 class mnd::CpuGenerator<float, mnd::X86_AVX_512, parallel> : public MandelGenerator
 class mnd::CpuGenerator<float, mnd::X86_AVX_512, parallel> : public MandelGenerator
 {
 {
 public:
 public:

+ 153 - 0
libmandel/src/CpuGenerators.cpp

@@ -120,6 +120,11 @@ void CpuGenerator<T, mnd::NONE, parallel>::generate(const mnd::MandelInfo& info,
 }
 }
 
 
 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)
 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)
+
+// ===========================================
+// =================== AVX ===================
+// ===========================================
+
 namespace mnd
 namespace mnd
 {
 {
     template class CpuGenerator<float, mnd::X86_AVX, false>;
     template class CpuGenerator<float, mnd::X86_AVX, false>;
@@ -232,6 +237,154 @@ void CpuGenerator<mnd::TripleDouble, mnd::X86_AVX, parallel>::generate(const mnd
 }
 }
 
 
 
 
+
+// ===========================================
+// ================= AVX-FMA =================
+// ===========================================
+
+// Explicit instantiations of the AVX-FMA generator templates, one sequential
+// (false) and one parallel (true) variant per supported precision, so the
+// template definitions below are emitted in this translation unit.
+namespace mnd
+{
+    template class CpuGenerator<float, mnd::X86_AVX_FMA, false>;
+    template class CpuGenerator<float, mnd::X86_AVX_FMA, true>;
+
+    template class CpuGenerator<double, mnd::X86_AVX_FMA, false>;
+    template class CpuGenerator<double, mnd::X86_AVX_FMA, true>;
+
+    template class CpuGenerator<DoubleDouble, mnd::X86_AVX_FMA, false>;
+    template class CpuGenerator<DoubleDouble, mnd::X86_AVX_FMA, true>;
+
+    template class CpuGenerator<QuadDouble, mnd::X86_AVX_FMA, false>;
+    template class CpuGenerator<QuadDouble, mnd::X86_AVX_FMA, true>;
+
+    template class CpuGenerator<HexDouble, mnd::X86_AVX_FMA, false>;
+    template class CpuGenerator<HexDouble, mnd::X86_AVX_FMA, true>;
+}
+
+
+// Kernels implemented in CpuGeneratorsAVXFMA.cpp. They are declared extern
+// here (instead of through a shared header) so that only that one translation
+// unit has to be compiled with AVX/FMA code generation enabled.
+
+// Single-precision kernel; viewport given as origin (vx, vy) and size (vw, vh).
+extern void generateFloatAvxFma(long width, long height, float* data, bool parallel,
+    float vx, float vy, float vw, float vh, int maxIter, bool smooth,
+    bool julia, float jX, float jY);
+
+// Double-precision kernel; same parameter layout as the float version.
+extern void generateDoubleAvxFma(long width, long height, float* data, bool parallel,
+    double vx, double vy, double vw, double vh, int maxIter, bool smooth,
+    bool julia, double jX, double jY);
+
+// Double-double kernel: every coordinate is passed as its two limbs
+// (e.g. vx1 = high part, vx2 = low part of the viewport x).
+extern void generateDoubleDoubleAvxFma(long width, long height, float* data, bool parallel,
+    double vx1, double vx2, double vy1, double vy2, double vw1, double vw2, double vh1, double vh2, int maxIter, bool smooth,
+    bool julia, double jX1, double jX2, double jY1, double jY2);
+
+// Quad-double kernel: coordinates passed as pointers to limb arrays
+// (presumably 4 doubles each — confirm against QuadDouble's layout).
+extern void generateQuadDoubleAvxFma(long width, long height, float* data, bool parallel,
+    const double* vx, const double* vy,
+    const double* vw, const double* vh,
+    int maxIter, bool smooth, bool julia,
+    const double* jXp, const double* jYp);
+
+// Hex-double kernel: coordinates passed as pointers to limb arrays
+// (6 doubles each, matching AvxHexDouble's x[6]).
+extern void generateHexDoubleAvxFma(long width, long height, float* data, bool parallel,
+    const double* vx, const double* vy,
+    const double* vw, const double* vh,
+    int maxIter, bool smooth, bool julia,
+    const double* jXp, const double* jYp);
+
+
+// float/AVX-FMA wrapper: converts the arbitrary-precision viewport and Julia
+// parameters from MandelInfo down to float and forwards everything to the
+// externally compiled kernel.
+template<bool parallel>
+void CpuGenerator<float, mnd::X86_AVX_FMA, parallel>::generate(const mnd::MandelInfo& info, float* data)
+{
+    using T = float;
+    const MandelViewport& view = info.view;
+
+    // viewport origin and extent, narrowed to the kernel's precision
+    const T vx = mnd::convert<T>(view.x);
+    const T vy = mnd::convert<T>(view.y);
+    const T vw = mnd::convert<T>(view.width);
+    const T vh = mnd::convert<T>(view.height);
+
+    // Julia seed; only used by the kernel when info.julia is set
+    T jX = mnd::convert<T>(info.juliaX);
+    T jY = mnd::convert<T>(info.juliaY);
+
+    generateFloatAvxFma(info.bWidth, info.bHeight, data, parallel, vx, vy, vw, vh, info.maxIter, info.smooth, info.julia, jX, jY);
+}
+
+
+// double/AVX-FMA wrapper: converts viewport and Julia parameters to double
+// and forwards to the externally compiled kernel.
+template<bool parallel>
+void CpuGenerator<double, mnd::X86_AVX_FMA, parallel>::generate(const mnd::MandelInfo& info, float* data)
+{
+    using T = double;
+    const MandelViewport& view = info.view;
+
+    // viewport origin and extent, narrowed to the kernel's precision
+    const T vx = mnd::convert<T>(view.x);
+    const T vy = mnd::convert<T>(view.y);
+    const T vw = mnd::convert<T>(view.width);
+    const T vh = mnd::convert<T>(view.height);
+
+    // Julia seed; only used by the kernel when info.julia is set
+    T jX = mnd::convert<T>(info.juliaX);
+    T jY = mnd::convert<T>(info.juliaY);
+
+    generateDoubleAvxFma(info.bWidth, info.bHeight, data, parallel, vx, vy, vw, vh, info.maxIter, info.smooth, info.julia, jX, jY);
+}
+
+
+// DoubleDouble/AVX-FMA wrapper: converts viewport and Julia parameters to
+// double-double precision and passes each value to the kernel as its two
+// limbs (x[0] = high part, x[1] = low part).
+template<bool parallel>
+void CpuGenerator<mnd::DoubleDouble, mnd::X86_AVX_FMA, parallel>::generate(const mnd::MandelInfo& info, float* data)
+{
+    using T = mnd::DoubleDouble;
+    const MandelViewport& view = info.view;
+
+    // viewport origin and extent in double-double precision
+    const T vx = mnd::convert<T>(view.x);
+    const T vy = mnd::convert<T>(view.y);
+    const T vw = mnd::convert<T>(view.width);
+    const T vh = mnd::convert<T>(view.height);
+
+    // Julia seed; only used by the kernel when info.julia is set
+    T jX = mnd::convert<T>(info.juliaX);
+    T jY = mnd::convert<T>(info.juliaY);
+
+    generateDoubleDoubleAvxFma(info.bWidth, info.bHeight, data, parallel,
+        vx.x[0], vx.x[1], vy.x[0], vy.x[1], vw.x[0], vw.x[1], vh.x[0], vh.x[1],
+        info.maxIter, info.smooth, info.julia, jX.x[0], jX.x[1], jY.x[0], jY.x[1]);
+}
+
+
+// QuadDouble/AVX-FMA wrapper: converts viewport and Julia parameters to
+// quad-double precision and passes each value to the kernel as a pointer to
+// its limb array (T::x).
+template<bool parallel>
+void CpuGenerator<mnd::QuadDouble, mnd::X86_AVX_FMA, parallel>::generate(const mnd::MandelInfo& info, float* data)
+{
+    using T = mnd::QuadDouble;
+    const MandelViewport& view = info.view;
+
+    // viewport origin and extent in quad-double precision
+    const T vx = mnd::convert<T>(view.x);
+    const T vy = mnd::convert<T>(view.y);
+    const T vw = mnd::convert<T>(view.width);
+    const T vh = mnd::convert<T>(view.height);
+
+    // Julia seed; only used by the kernel when info.julia is set
+    T jX = mnd::convert<T>(info.juliaX);
+    T jY = mnd::convert<T>(info.juliaY);
+
+    generateQuadDoubleAvxFma(info.bWidth, info.bHeight, data, parallel,
+        vx.x, vy.x,
+        vw.x, vh.x,
+        info.maxIter, info.smooth, info.julia,
+        jX.x, jY.x);
+}
+
+
+// HexDouble/AVX-FMA wrapper: converts viewport and Julia parameters to
+// hex-double (six-limb) precision and passes each value to the kernel as a
+// pointer to its limb array (T::x). Mirrors the QuadDouble wrapper above.
+template<bool parallel>
+void CpuGenerator<mnd::HexDouble, mnd::X86_AVX_FMA, parallel>::generate(const mnd::MandelInfo& info, float* data)
+{
+    using T = mnd::HexDouble;
+    const MandelViewport& view = info.view;
+
+    // viewport origin and extent in hex-double precision
+    const T vx = mnd::convert<T>(view.x);
+    const T vy = mnd::convert<T>(view.y);
+    const T vw = mnd::convert<T>(view.width);
+    const T vh = mnd::convert<T>(view.height);
+
+    // Julia seed; only used by the kernel when info.julia is set
+    T jX = mnd::convert<T>(info.juliaX);
+    T jY = mnd::convert<T>(info.juliaY);
+
+    generateHexDoubleAvxFma(info.bWidth, info.bHeight, data, parallel,
+        vx.x, vy.x,
+        vw.x, vh.x,
+        info.maxIter, info.smooth, info.julia,
+        jX.x, jY.x);
+}
+
 #ifdef WITH_AVX512
 #ifdef WITH_AVX512
 
 
 namespace mnd
 namespace mnd

+ 2 - 1
libmandel/src/CpuGeneratorsAVX.cpp

@@ -625,6 +625,7 @@ void generateDoubleDoubleAvx(long width, long height, float* data, bool parallel
     }
     }
 }
 }
 
 
+
 void generateTripleDoubleAvx(long width, long height, float* data, bool parallel,
 void generateTripleDoubleAvx(long width, long height, float* data, bool parallel,
     double vx1, double vx2, double vx3, double vy1, double vy2, double vy3,
     double vx1, double vx2, double vx3, double vy1, double vy2, double vy3,
     double vw1, double vw2, double vw3, double vh1, double vh2, double vh3,
     double vw1, double vw2, double vw3, double vh1, double vh2, double vh3,
@@ -714,7 +715,7 @@ void generateTripleDoubleAvx(long width, long height, float* data, bool parallel
                 if (smooth)
                 if (smooth)
                     data[i + k + j * width] = float(ftRes[k] < 0 ? maxIter :
                     data[i + k + j * width] = float(ftRes[k] < 0 ? maxIter :
                         ftRes[k] >= maxIter ? maxIter :
                         ftRes[k] >= maxIter ? maxIter :
-                        ((float)ftRes[k]) + 1 - floatLog2(::floatLog(float(resa[k] * resa[k] + resb[k] * resb[k])) * 0.5f));
+                        ((float)ftRes[k]) + 1 - floatLog2(floatLog(float(resa[k] * resa[k] + resb[k] * resb[k])) * 0.5f));
                 else
                 else
                     data[i + k + j * width] = ftRes[k] >= 0 ? float(ftRes[k]) : maxIter;
                     data[i + k + j * width] = ftRes[k] >= 0 ? float(ftRes[k]) : maxIter;
             }
             }

+ 482 - 277
libmandel/src/CpuGeneratorsAVXFMA.cpp

@@ -1,54 +1,44 @@
-#include "CpuGenerators.h"
+#include "FloatLog.h"
 
 
 #include <immintrin.h>
 #include <immintrin.h>
 #include <omp.h>
 #include <omp.h>
-#include <cmath>
 
 
-#include <utility>
-#include <memory>
-
-using mnd::CpuGenerator;
-
-namespace mnd
+///
+/// \brief unique namespace just to be a little more sure we do not
+///        accidentally compile a function used somewhere else and use
+///        avx instructions in there.
+///
+namespace avxfma_private
 {
 {
-    template class CpuGenerator<float, mnd::X86_AVX_FMA, false>;
-    template class CpuGenerator<float, mnd::X86_AVX_FMA, true>;
-
-    template class CpuGenerator<double, mnd::X86_AVX_FMA, false>;
-    template class CpuGenerator<double, mnd::X86_AVX_FMA, true>;
-
-    template class CpuGenerator<DoubleDouble, mnd::X86_AVX_FMA, false>;
-    template class CpuGenerator<DoubleDouble, mnd::X86_AVX_FMA, true>;
-
-    template class CpuGenerator<QuadDouble, mnd::X86_AVX_FMA, false>;
-    template class CpuGenerator<QuadDouble, mnd::X86_AVX_FMA, true>;
+#include "LightDoubleDouble.h"
+#include "QuadDouble.h"
+#include "HexDouble.h"
 }
 }
 
 
 
 
-template<bool parallel>
-void CpuGenerator<float, mnd::X86_AVX_FMA, parallel>::generate(const mnd::MandelInfo& info, float* data)
+
+void generateFloatAvxFma(long width, long height, float* data, bool parallel,
+    float vx, float vy, float vw, float vh, int maxIter, bool smooth,
+    bool julia, float jX, float jY)
 {
 {
     using T = float;
     using T = float;
-    const MandelViewport& view = info.view;
-    const float dppf = float(view.width / info.bWidth);
-    const float viewxf = float(view.x);
+    const float dppf = float(vw / width);
+    const float viewxf = vx; 
     __m256 viewx = { viewxf, viewxf, viewxf, viewxf, viewxf, viewxf, viewxf, viewxf };
     __m256 viewx = { viewxf, viewxf, viewxf, viewxf, viewxf, viewxf, viewxf, viewxf };
     __m256 dpp = { dppf, dppf, dppf, dppf, dppf, dppf, dppf, dppf };
     __m256 dpp = { dppf, dppf, dppf, dppf, dppf, dppf, dppf, dppf };
 
 
-    T jX = mnd::convert<T>(info.juliaX);
-    T jY = mnd::convert<T>(info.juliaY);
     __m256 juliaX = { jX, jX, jX, jX, jX, jX, jX, jX };
     __m256 juliaX = { jX, jX, jX, jX, jX, jX, jX, jX };
     __m256 juliaY = { jY, jY, jY, jY, jY, jY, jY, jY };
     __m256 juliaY = { jY, jY, jY, jY, jY, jY, jY, jY };
 
 
 #if defined(_OPENMP)
 #if defined(_OPENMP)
-    if constexpr(parallel)
+    if (parallel)
         omp_set_num_threads(omp_get_num_procs());
         omp_set_num_threads(omp_get_num_procs());
 #   pragma omp parallel for schedule(static, 1) if (parallel)
 #   pragma omp parallel for schedule(static, 1) if (parallel)
 #endif
 #endif
-    for (long j = 0; j < info.bHeight; j++) {
-        T y = T(view.y) + T(j) * T(view.height / info.bHeight);
+    for (long j = 0; j < height; j++) {
+        T y = vy + T(j) * vh / height;
         __m256 ys = {y, y, y, y, y, y, y, y};
         __m256 ys = {y, y, y, y, y, y, y, y};
-        for (long i = 0; i < info.bWidth; i += 24) {
+        for (long i = 0; i < width; i += 24) {
             __m256 pixc = { float(i), float(i + 1), float(i + 2), float(i + 3), float(i + 4), float(i + 5), float(i + 6), float(i + 7) };
             __m256 pixc = { float(i), float(i + 1), float(i + 2), float(i + 3), float(i + 4), float(i + 5), float(i + 6), float(i + 7) };
             __m256 pixc2 = { float(i + 8), float(i + 9), float(i + 10), float(i + 11), float(i + 12), float(i + 13), float(i + 14), float(i + 15) };
             __m256 pixc2 = { float(i + 8), float(i + 9), float(i + 10), float(i + 11), float(i + 12), float(i + 13), float(i + 14), float(i + 15) };
             __m256 pixc3 = { float(i + 16), float(i + 17), float(i + 18), float(i + 19), float(i + 20), float(i + 21), float(i + 22), float(i + 23) };
             __m256 pixc3 = { float(i + 16), float(i + 17), float(i + 18), float(i + 19), float(i + 20), float(i + 21), float(i + 22), float(i + 23) };
@@ -82,16 +72,16 @@ void CpuGenerator<float, mnd::X86_AVX_FMA, parallel>::generate(const mnd::Mandel
             __m256 b2 = ys;
             __m256 b2 = ys;
             __m256 b3 = ys;
             __m256 b3 = ys;
 
 
-            __m256 cx = info.julia ? juliaX : xs;
-            __m256 cx2 = info.julia ? juliaX : xs2;
-            __m256 cx3 = info.julia ? juliaX : xs3;
-            __m256 cy = info.julia ? juliaY : ys;
+            __m256 cx = julia ? juliaX : xs;
+            __m256 cx2 = julia ? juliaX : xs2;
+            __m256 cx3 = julia ? juliaX : xs3;
+            __m256 cy = julia ? juliaY : ys;
 
 
-            if (info.smooth) {
+            if (smooth) {
                 __m256 cmp = _mm256_cmp_ps(threshold, threshold, _CMP_LE_OQ);
                 __m256 cmp = _mm256_cmp_ps(threshold, threshold, _CMP_LE_OQ);
                 __m256 cmp2 = _mm256_cmp_ps(threshold, threshold, _CMP_LE_OQ);
                 __m256 cmp2 = _mm256_cmp_ps(threshold, threshold, _CMP_LE_OQ);
                 __m256 cmp3 = _mm256_cmp_ps(threshold, threshold, _CMP_LE_OQ);
                 __m256 cmp3 = _mm256_cmp_ps(threshold, threshold, _CMP_LE_OQ);
-                for (int k = 0; k < info.maxIter; k++) {
+                for (int k = 0; k < maxIter; k++) {
                     __m256 bb = _mm256_mul_ps(b, b);
                     __m256 bb = _mm256_mul_ps(b, b);
                     __m256 bb2 = _mm256_mul_ps(b2, b2);
                     __m256 bb2 = _mm256_mul_ps(b2, b2);
                     __m256 bb3 = _mm256_mul_ps(b3, b3);
                     __m256 bb3 = _mm256_mul_ps(b3, b3);
@@ -134,7 +124,7 @@ void CpuGenerator<float, mnd::X86_AVX_FMA, parallel>::generate(const mnd::Mandel
                 }
                 }
             }
             }
             else {
             else {
-                for (int k = 0; k < info.maxIter; k++) {
+                for (int k = 0; k < maxIter; k++) {
                     __m256 bb = _mm256_mul_ps(b, b);
                     __m256 bb = _mm256_mul_ps(b, b);
                     __m256 bb2 = _mm256_mul_ps(b2, b2);
                     __m256 bb2 = _mm256_mul_ps(b2, b2);
                     __m256 bb3 = _mm256_mul_ps(b3, b3);
                     __m256 bb3 = _mm256_mul_ps(b3, b3);
@@ -162,36 +152,28 @@ void CpuGenerator<float, mnd::X86_AVX_FMA, parallel>::generate(const mnd::Mandel
                 }
                 }
             }
             }
 
 
-
-            auto alignVec = [](float* data) -> float* {
-                void* aligned = data;
-                ::size_t length = 64;
-                std::align(32, 8 * sizeof(float), aligned, length);
-                return static_cast<float*>(aligned);
-            };
-
             float resData[96];
             float resData[96];
-            float* ftRes = alignVec(resData);
+            float* ftRes = resData;
             float* resa = ftRes + 24;
             float* resa = ftRes + 24;
             float* resb = resa + 24;
             float* resb = resa + 24;
 
 
-            _mm256_store_ps(ftRes, counter);
-            _mm256_store_ps(ftRes + 8, counter2);
-            _mm256_store_ps(ftRes + 16, counter3);
-            _mm256_store_ps(resa, resultsa);
-            _mm256_store_ps(resa + 8, resultsa2);
-            _mm256_store_ps(resa + 16, resultsa3);
-            _mm256_store_ps(resb, resultsb);
-            _mm256_store_ps(resb + 8, resultsb2);
-            _mm256_store_ps(resb + 16, resultsb3);
-            for (int k = 0; k < 24 && i + k < info.bWidth; k++) {
-                if (info.smooth) {
-                    data[i + k + j * info.bWidth] = ftRes[k] < 0 ? info.maxIter :
-                        ftRes[k] >= info.maxIter ? info.maxIter :
-                        ((float)ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f);
+            _mm256_storeu_ps(ftRes, counter);
+            _mm256_storeu_ps(ftRes + 8, counter2);
+            _mm256_storeu_ps(ftRes + 16, counter3);
+            _mm256_storeu_ps(resa, resultsa);
+            _mm256_storeu_ps(resa + 8, resultsa2);
+            _mm256_storeu_ps(resa + 16, resultsa3);
+            _mm256_storeu_ps(resb, resultsb);
+            _mm256_storeu_ps(resb + 8, resultsb2);
+            _mm256_storeu_ps(resb + 16, resultsb3);
+            for (int k = 0; k < 24 && i + k < width; k++) {
+                if (smooth) {
+                    data[i + k + j * width] = ftRes[k] < 0 ? maxIter :
+                        ftRes[k] >= maxIter ? maxIter :
+                        ((float)ftRes[k]) + 1 - floatLog2(floatLog(resa[k] * resa[k] + resb[k] * resb[k]) / 2);
                 }
                 }
                 else {
                 else {
-                    data[i + k + j * info.bWidth] = ftRes[k] < 0 ? info.maxIter : ftRes[k];
+                    data[i + k + j * width] = ftRes[k] < 0 ? maxIter : ftRes[k];
                 }
                 }
             }
             }
         }
         }
@@ -199,32 +181,30 @@ void CpuGenerator<float, mnd::X86_AVX_FMA, parallel>::generate(const mnd::Mandel
 }
 }
 
 
 
 
-template<bool parallel>
-void CpuGenerator<double, mnd::X86_AVX_FMA, parallel>::generate(const mnd::MandelInfo& info, float* data)
+void generateDoubleAvxFma(long width, long height, float* data, bool parallel,
+    double vx, double vy, double vw, double vh, int maxIter, bool smooth,
+    bool julia, double jX, double jY)
 {
 {
     using T = double;
     using T = double;
-    const MandelViewport& view = info.view;
 
 
-    const double dppf = double(view.width / info.bWidth);
-    const double viewxf = double(view.x);
+    const double dppf = double(vw / width);
+    const double viewxf = double(vx);
     __m256d viewx = { viewxf, viewxf, viewxf, viewxf };
     __m256d viewx = { viewxf, viewxf, viewxf, viewxf };
     __m256d dpp = { dppf, dppf, dppf, dppf };
     __m256d dpp = { dppf, dppf, dppf, dppf };
 
 
-    T jX = mnd::convert<T>(info.juliaX);
-    T jY = mnd::convert<T>(info.juliaY);
     __m256d juliaX = { jX, jX, jX, jX };
     __m256d juliaX = { jX, jX, jX, jX };
     __m256d juliaY = { jY, jY, jY, jY };
     __m256d juliaY = { jY, jY, jY, jY };
 
 
 
 
 #if defined(_OPENMP)
 #if defined(_OPENMP)
-    if constexpr(parallel)
+    if (parallel)
         omp_set_num_threads(omp_get_num_procs());
         omp_set_num_threads(omp_get_num_procs());
 #   pragma omp parallel for schedule(static, 1) if (parallel)
 #   pragma omp parallel for schedule(static, 1) if (parallel)
 #endif
 #endif
-    for (long j = 0; j < info.bHeight; j++) {
-        T y = T(view.y + T(j) * view.height / info.bHeight);
+    for (long j = 0; j < height; j++) {
+        T y = vy + T(j) * vh / height;
         __m256d ys = { y, y, y, y };
         __m256d ys = { y, y, y, y };
-        for (long i = 0; i < info.bWidth; i += 8) {
+        for (long i = 0; i < width; i += 8) {
             __m256d pixc = { double(i), double(i + 1), double(i + 2), double(i + 3) };
             __m256d pixc = { double(i), double(i + 1), double(i + 2), double(i + 3) };
             __m256d pixc2 = { double(i + 4), double(i + 5), double(i + 6), double(i + 7) };
             __m256d pixc2 = { double(i + 4), double(i + 5), double(i + 6), double(i + 7) };
             __m256d xs = _mm256_fmadd_pd(dpp, pixc, viewx);
             __m256d xs = _mm256_fmadd_pd(dpp, pixc, viewx);
@@ -249,14 +229,14 @@ void CpuGenerator<double, mnd::X86_AVX_FMA, parallel>::generate(const mnd::Mande
             __m256d a2 = xs2;
             __m256d a2 = xs2;
             __m256d b2 = ys;
             __m256d b2 = ys;
 
 
-            __m256d cx = info.julia ? juliaX : xs;
-            __m256d cy = info.julia ? juliaY : ys;
-            __m256d cx2 = info.julia ? juliaX : xs2;
+            __m256d cx = julia ? juliaX : xs;
+            __m256d cy = julia ? juliaY : ys;
+            __m256d cx2 = julia ? juliaX : xs2;
             //__m256d cy2 = info.julia ? juliaY : ys;
             //__m256d cy2 = info.julia ? juliaY : ys;
 
 
             __m256d cmp = _mm256_cmp_pd(threshold, threshold, _CMP_LE_OQ);
             __m256d cmp = _mm256_cmp_pd(threshold, threshold, _CMP_LE_OQ);
             __m256d cmp2 = _mm256_cmp_pd(threshold, threshold, _CMP_LE_OQ);
             __m256d cmp2 = _mm256_cmp_pd(threshold, threshold, _CMP_LE_OQ);
-            for (int k = 0; k < info.maxIter; k++) {
+            for (int k = 0; k < maxIter; k++) {
                 __m256d aa = _mm256_mul_pd(a, a);
                 __m256d aa = _mm256_mul_pd(a, a);
                 __m256d ab = _mm256_mul_pd(a, b);
                 __m256d ab = _mm256_mul_pd(a, b);
                 __m256d bb = _mm256_mul_pd(b, b);
                 __m256d bb = _mm256_mul_pd(b, b);
@@ -269,7 +249,7 @@ void CpuGenerator<double, mnd::X86_AVX_FMA, parallel>::generate(const mnd::Mande
                 a2 = _mm256_add_pd(a2, cx2);
                 a2 = _mm256_add_pd(a2, cx2);
                 b = _mm256_fmadd_pd(two, ab, cy);
                 b = _mm256_fmadd_pd(two, ab, cy);
                 b2 = _mm256_fmadd_pd(two, ab2, cy);
                 b2 = _mm256_fmadd_pd(two, ab2, cy);
-                if (info.smooth) {
+                if (smooth) {
                     resultsa = _mm256_blendv_pd(resultsa, a, cmp);
                     resultsa = _mm256_blendv_pd(resultsa, a, cmp);
                     resultsb = _mm256_blendv_pd(resultsb, b, cmp);
                     resultsb = _mm256_blendv_pd(resultsb, b, cmp);
                     resultsa2 = _mm256_blendv_pd(resultsa2, a2, cmp2);
                     resultsa2 = _mm256_blendv_pd(resultsa2, a2, cmp2);
@@ -287,45 +267,33 @@ void CpuGenerator<double, mnd::X86_AVX_FMA, parallel>::generate(const mnd::Mande
                 }
                 }
             }
             }
 
 
-            auto alignVec = [](double* data) -> double* {
-                void* aligned = data;
-                ::size_t length = 64;
-                std::align(32, 4 * sizeof(double), aligned, length);
-                return static_cast<double*>(aligned);
-            };
-
-            double resData[8];
-            double* ftRes = alignVec(resData);
-            double* resa = (double*) &resultsa;
-            double* resb = (double*) &resultsb;
-            _mm256_store_pd(ftRes, counter);
-            for (int k = 0; k < 4 && i + k < info.bWidth; k++) {
-                if (info.smooth)
-                    data[i + k + j * info.bWidth] = ftRes[k] < 0 ? info.maxIter :
-                        ftRes[k] >= info.maxIter ? info.maxIter :
-                        ((float)ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f);
+            double resData[24];
+            double* ftRes = resData;
+            double* resa = ftRes + 8;
+            double* resb = ftRes + 16;
+            _mm256_storeu_pd(ftRes, counter);
+            _mm256_storeu_pd(ftRes + 4, counter2);
+            _mm256_storeu_pd(resa, resultsa);
+            _mm256_storeu_pd(resa + 4, resultsa2);
+            _mm256_storeu_pd(resb, resultsb);
+            _mm256_storeu_pd(resb + 4, resultsb2);
+            for (int k = 0; k < 8 && i + k < width; k++) {
+                if (smooth)
+                    data[i + k + j * width] = ftRes[k] < 0 ? maxIter :
+                        ftRes[k] >= maxIter ? maxIter :
+                        ((float)ftRes[k]) + 1 - floatLog2(floatLog(resa[k] * resa[k] + resb[k] * resb[k]) / 2);
                 else
                 else
-                    data[i + k + j * info.bWidth] = ftRes[k] < 0 ? info.maxIter : float(ftRes[k]);
+                    data[i + k + j * width] = ftRes[k] < 0 ? maxIter : float(ftRes[k]);
             }
             }
-
-            resa = (double*) &resultsa2;
-            resb = (double*) &resultsb2;
-            _mm256_store_pd(ftRes, counter2);
-            i += 4;
-            for (int k = 0; k < 4 && i + k < info.bWidth; k++) {
-                if (info.smooth)
-                    data[i + k + j * info.bWidth] = ftRes[k] < 0 ? info.maxIter :
-                        ftRes[k] >= info.maxIter ? info.maxIter :
-                        ((float)ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f);
-                else
-                    data[i + k + j * info.bWidth] = ftRes[k] < 0 ? info.maxIter : float(ftRes[k]);
-            }
-            i -= 4;
         }
         }
     }
     }
 }
 }
 
 
 
 
+namespace avxfma_private
+{
+
+
 struct VecPair
 struct VecPair
 {
 {
     __m256d a;
     __m256d a;
@@ -402,6 +370,25 @@ static inline __m256d threeOneSum(__m256d a, __m256d b, __m256d c)
 }
 }
 
 
 
 
+static inline VecQuadruple fourSum(__m256d a, __m256d b, __m256d c, __m256d d)
+{
+    auto[t1, t2] = twoSum(a, b);
+    auto[t3, t4] = twoSum(t1, c);
+    auto[r0, t5] = twoSum(t3, d);
+    auto[r1, r2, r3] = threeSum(t2, t4, t5);
+    return { r0, r1, r2, r3 };
+}
+
+static inline VecPair fiveTwoSum(__m256d a, __m256d b, __m256d c, __m256d d, __m256d e)
+{
+    auto[t1, t2] = twoSum(a, b);
+    auto[t3, t4] = twoSum(t1, c);
+    auto[t5, t6] = twoSum(t3, d);
+    auto[r0, t7] = twoSum(t5, e);
+    return { r0, _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(t2, t4), t6), t7) };
+}
+
+
 static inline VecTriple sixThreeSum(__m256d a, __m256d b, __m256d c,
 static inline VecTriple sixThreeSum(__m256d a, __m256d b, __m256d c,
                                     __m256d d, __m256d e, __m256d f)
                                     __m256d d, __m256d e, __m256d f)
 {
 {
@@ -457,6 +444,23 @@ static inline VecPair nineTwoSum(__m256d a, __m256d b, __m256d c,
     return threeTwoSum(t5, t6, i);
     return threeTwoSum(t5, t6, i);
 }
 }
 
 
+
+inline VecTriple nineThreeSum(__m256d a, __m256d b, __m256d c,
+                              __m256d d, __m256d e, __m256d f,
+                              __m256d g, __m256d h, __m256d i)
+{
+    auto[a1, a2, a3] = threeSum(a, b, c);
+    auto[b1, b2, b3] = threeSum(d, e, f);
+    auto[c1, c2, c3] = threeSum(g, h, i);
+
+    auto[r1, t1, t2] = threeSum(a1, b1, c1);
+    auto[r2, t3, t4, t5] = fourSum(a2, b2, c2, t1);
+    auto r3 = _mm256_add_pd(_mm256_add_pd(
+            _mm256_add_pd(_mm256_add_pd(a3, b3), _mm256_add_pd(c3, t2)),
+            _mm256_add_pd(t3, t4)), t5);
+    return { r1, r2, r3 };
+}
+
 static inline VecQuadruple renormalize(__m256d x0, __m256d x1, __m256d x2, __m256d x3, __m256d x4)
 static inline VecQuadruple renormalize(__m256d x0, __m256d x1, __m256d x2, __m256d x3, __m256d x4)
 {
 {
     auto [st1, t4] = quickTwoSum(x3, x4);
     auto [st1, t4] = quickTwoSum(x3, x4);
@@ -576,110 +580,6 @@ struct AvxDoubleDouble
 };
 };
 
 
 
 
-template<bool parallel>
-void CpuGenerator<mnd::DoubleDouble, mnd::X86_AVX_FMA, parallel>::generate(const mnd::MandelInfo& info, float* data)
-{
-    const MandelViewport& view = info.view;
-
-    using T = LightDoubleDouble;
-
-    T viewx = mnd::convert<T>(view.x);
-    T viewy = mnd::convert<T>(view.y);
-    T wpp = mnd::convert<T>(view.width / info.bWidth);
-    T hpp = mnd::convert<T>(view.height / info.bHeight);
-
-
-    T jX = mnd::convert<T>(info.juliaX);
-    T jY = mnd::convert<T>(info.juliaY);
-
-    AvxDoubleDouble juliaX = { jX[0], jX[1] };
-    AvxDoubleDouble juliaY = { jY[0], jY[1] };
-
-#if defined(_OPENMP)
-    if constexpr(parallel)
-        omp_set_num_threads(omp_get_num_procs());
-#   pragma omp parallel for schedule(static, 1) if (parallel)
-#endif
-    for (long j = 0; j < info.bHeight; j++) {
-        T y = viewy + T(double(j)) * hpp;
-        __m256d y0s = { y.x[0], y.x[0], y.x[0], y.x[0] };
-        __m256d y1s = { y.x[1], y.x[1], y.x[1], y.x[1] };
-        AvxDoubleDouble ys{ y0s, y1s };
-        for (long i = 0; i < info.bWidth; i += 4) {
-            T x1 = viewx + T(double(i)) * wpp;
-            T x2 = x1 + wpp;
-            T x3 = x2 + wpp;
-            T x4 = x3 + wpp;
-
-            __m256d x0s = {
-                x1[0], x2[0], x3[0], x4[0],
-            };
-
-            __m256d x1s = {
-                x1[1], x2[1], x3[1], x4[1],
-            };
-
-            AvxDoubleDouble xs{ x0s, x1s };
-
-            AvxDoubleDouble cx = info.julia ? juliaX : xs;
-            AvxDoubleDouble cy = info.julia ? juliaY : ys;
-
-            int itRes[4] = { 0, 0, 0, 0 };
-
-            __m256d threshold = { 16.0, 16.0, 16.0, 16.0 };
-            __m256d counter = { 0, 0, 0, 0 };
-            __m256d adder = { 1, 1, 1, 1 };
-
-            AvxDoubleDouble a = xs;
-            AvxDoubleDouble b = ys;
-
-            __m256d resultsa;
-            __m256d resultsb;
-
-            __m256d cmp = _mm256_cmp_pd(threshold, threshold, _CMP_LE_OQ);
-            for (int k = 0; k < info.maxIter; k++) {
-                AvxDoubleDouble aa = a.sq();
-                AvxDoubleDouble bb = b.sq();
-                AvxDoubleDouble abab = a * b.mul_pow2(2.0);
-                a = aa - bb + cx;
-                b = abab + cy;
-                if (info.smooth) {
-                    resultsa = _mm256_blendv_pd(resultsa, a.x[0], cmp);
-                    resultsb = _mm256_blendv_pd(resultsb, b.x[0], cmp);
-                }
-                cmp = _mm256_cmp_pd(_mm256_add_pd(aa.x[0], bb.x[0]), threshold, _CMP_LE_OQ);
-                adder = _mm256_and_pd(adder, cmp);
-                counter = _mm256_add_pd(counter, adder);
-                if ((k & 0x7) && _mm256_testz_si256(_mm256_castpd_si256(cmp), _mm256_castpd_si256(cmp)) != 0) {
-                    break;
-                }
-            }
-
-            auto alignVec = [](double* data) -> double* {
-                void* aligned = data;
-                ::size_t length = 64;
-                std::align(32, 4 * sizeof(double), aligned, length);
-                return static_cast<double*>(aligned);
-            };
-
-            double resData[8];
-            double* ftRes = alignVec(resData);
-            double* resa = (double*) &resultsa;
-            double* resb = (double*) &resultsb;
-            _mm256_store_pd(ftRes, counter);
-
-            for (int k = 0; k < 4 && i + k < info.bWidth; k++) {
-                if (info.smooth)
-                    data[i + k + j * info.bWidth] = ftRes[k] < 0 ? info.maxIter :
-                        ftRes[k] >= info.maxIter ? info.maxIter :
-                        ((float)ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f);
-                else
-                    data[i + k + j * info.bWidth] = ftRes[k] >= 0 ? float(ftRes[k]) : info.maxIter;
-            }
-        }
-    }
-}
-
 struct AvxQuadDouble
 struct AvxQuadDouble
 {
 {
     __m256d x[4];
     __m256d x[4];
@@ -784,66 +684,279 @@ struct AvxQuadDouble
 };
 };
 
 
 
 
-template<bool parallel>
-void CpuGenerator<mnd::QuadDouble, mnd::X86_AVX_FMA, parallel>::generate(const mnd::MandelInfo& info, float* data)
+struct AvxHexDouble
 {
 {
-    const MandelViewport& view = info.view;
+    __m256d x[6];
 
 
-    using T = mnd::Real;
+    inline AvxHexDouble(__m256d a, __m256d b, __m256d c, __m256d d, __m256d e, __m256d f) :
+        x{ a, b, c, d, e, f }
+    {}
 
 
-    T viewx = mnd::convert<T>(view.x);
-    T viewy = mnd::convert<T>(view.y);
-    T wpp = mnd::convert<T>(view.width / info.bWidth);
-    T hpp = mnd::convert<T>(view.height / info.bHeight);
+    inline AvxHexDouble(double a, double b, double c, double d, double e, double f) :
+        x{ _mm256_set1_pd(a), _mm256_set1_pd(b), _mm256_set1_pd(c),
+           _mm256_set1_pd(d), _mm256_set1_pd(e), _mm256_set1_pd(f) }
+    {}
 
 
 
 
-    T jX = mnd::convert<T>(info.juliaX);
-    T jY = mnd::convert<T>(info.juliaY);
+    inline AvxHexDouble operator + (const AvxHexDouble& sm) const
+    {
+        auto[a0, a1] = twoSum(x[0], sm.x[0]);
+        auto[b0, b1] = twoSum(x[1], sm.x[1]);
+        auto[c0, c1] = twoSum(x[2], sm.x[2]);
+        auto[d0, d1] = twoSum(x[3], sm.x[3]);
+        auto[e0, e1] = twoSum(x[4], sm.x[4]);
+
+        auto t0 = a0;
+        auto [t1, p1] = twoSum(a1, b0);
+        auto [t2, p2, p3] = threeSum(b1, c0, p1);
+        auto [t3, p4, p5, p6] = fourSum(c1, d0, p2, p3);
+        auto [t4, p7] = fiveTwoSum(d1, e0, p4, p5, p6);
+        auto t5 = _mm256_add_pd(_mm256_add_pd(x[5], sm.x[5]), _mm256_add_pd(e1, p7));
+
+        auto[re0, er1] = quickTwoSum(t0, t1);
+        auto[re1, e2] = quickTwoSum(er1, t2);
+        auto[re2, e3] = quickTwoSum(e2, t3);
+        auto[re3, e4] = quickTwoSum(e3, t4);
+        auto[re4, re5] = quickTwoSum(e4, t5);
+
+        return { re0, re1, re2, re3, re4, re5 };
+    }
 
 
+    inline AvxHexDouble operator - (const AvxHexDouble& sm) const
+    {
+        auto[a0, a1] = twoDiff(x[0], sm.x[0]);
+        auto[b0, b1] = twoDiff(x[1], sm.x[1]);
+        auto[c0, c1] = twoDiff(x[2], sm.x[2]);
+        auto[d0, d1] = twoDiff(x[3], sm.x[3]);
+        auto[e0, e1] = twoDiff(x[4], sm.x[4]);
+
+        auto t0 = a0;
+        auto [t1, p1] = twoSum(a1, b0);
+        auto [t2, p2, p3] = threeSum(b1, c0, p1);
+        auto [t3, p4, p5, p6] = fourSum(c1, d0, p2, p3);
+        auto [t4, p7] = fiveTwoSum(d1, e0, p4, p5, p6);
+        auto t5 = _mm256_add_pd(_mm256_add_pd(x[5], sm.x[5]), _mm256_add_pd(e1, p7));
+
+        auto[re0, er1] = quickTwoSum(t0, t1);
+        auto[re1, e2] = quickTwoSum(er1, t2);
+        auto[re2, e3] = quickTwoSum(e2, t3);
+        auto[re3, e4] = quickTwoSum(e3, t4);
+        auto[re4, re5] = quickTwoSum(e4, t5);
+
+        return { re0, re1, re2, re3, re4, re5 };
+    }
 
 
-    auto toQd = [] (const mnd::Real& x) -> std::tuple<double, double, double, double> {
-        double a = double(x);
-        mnd::Real rem = x - a;
-        double b = double(rem);
-        rem = rem - b;
-        double c = double(rem);
-        rem = rem - c;
-        double d = double(rem);
-        return { a, b, c, d };
-    };
+    inline AvxHexDouble operator * (const AvxHexDouble& sm) const
+    {
+        auto[p1_0, p2_0] = twoProd(x[0], sm.x[0]);
+
+        auto[p2_1, p3_0] = twoProd(x[0], sm.x[1]);
+        auto[p2_2, p3_1] = twoProd(x[1], sm.x[0]);
+
+        auto[p3_2, p4_0] = twoProd(x[2], sm.x[0]);
+        auto[p3_3, p4_1] = twoProd(x[1], sm.x[1]);
+        auto[p3_4, p4_2] = twoProd(x[0], sm.x[2]);
+
+        auto[p4_3, p5_0] = twoProd(x[3], sm.x[0]);
+        auto[p4_4, p5_1] = twoProd(x[2], sm.x[1]);
+        auto[p4_5, p5_2] = twoProd(x[1], sm.x[2]);
+        auto[p4_6, p5_3] = twoProd(x[0], sm.x[3]);
+
+        auto[p5_4, p6_0] = twoProd(x[4], sm.x[0]);
+        auto[p5_5, p6_1] = twoProd(x[3], sm.x[1]);
+        auto[p5_6, p6_2] = twoProd(x[2], sm.x[2]);
+        auto[p5_7, p6_3] = twoProd(x[1], sm.x[3]);
+        auto[p5_8, p6_4] = twoProd(x[0], sm.x[4]);
+
+        auto t1 = p1_0;
+        auto[t2, tl3, tl4] = threeSum(p2_0, p2_1, p2_2);
+        auto[t3, tl4_2, tl5] = sixThreeSum(p3_0, p3_1, p3_2, p3_3, p3_4, tl3);
+        auto[t4, tl5_2, tl6] = nineThreeSum(p4_0, p4_1, p4_2, p4_3, p4_4, p4_5, p4_6, tl4, tl4_2);
+        auto[x1, x2, x3] = nineThreeSum(p5_0, p5_1, p5_2, p5_3, p5_4, p5_5, p5_6, p5_7, p5_8);
+        auto[t5, tl6_1, tl7] = sixThreeSum(x1, x2, x3, tl5, tl5_2, _mm256_set1_pd(0.0));
+
+        auto t6 =
+            _mm256_add_pd(
+                _mm256_add_pd(
+                    _mm256_add_pd(
+                        _mm256_add_pd(tl6, tl6_1),
+                        _mm256_add_pd(tl7, p6_0)
+                    ),
+                    _mm256_add_pd(
+                        _mm256_add_pd(p6_1, p6_2),
+                        _mm256_add_pd(p6_3, p6_4)
+                    )
+                ),
+                _mm256_add_pd(
+                    _mm256_fmadd_pd(x[5], sm.x[0], _mm256_fmadd_pd(x[4], sm.x[1], _mm256_mul_pd(x[3], sm.x[2]))),
+                    _mm256_fmadd_pd(x[2], sm.x[3], _mm256_fmadd_pd(x[1], sm.x[4], _mm256_mul_pd(x[0], sm.x[5])))
+                )
+            );
+
+        auto[re0, e1] = quickTwoSum(t1, t2);
+        auto[re1, e2] = quickTwoSum(e1, t3);
+        auto[re2, e3] = quickTwoSum(e2, t4);
+        auto[re3, e4] = quickTwoSum(e3, t5);
+        auto[re4, re5] = quickTwoSum(e4, t6);
+
+        return { re0, re1, re2, re3, re4, re5 };
+    }
 
 
-    auto toAvxQuadDouble = [&toQd] (const mnd::Real& x) -> AvxQuadDouble {
-        auto [a, b, c, d] = toQd(x);
-        return AvxQuadDouble{ a, b, c, d };
-    };
+    inline AvxHexDouble mul_pow2(double v) const
+    {
+        __m256d vv = _mm256_set1_pd(v);
+        return { _mm256_mul_pd(vv, x[0]), _mm256_mul_pd(vv, x[1]),
+                 _mm256_mul_pd(vv, x[2]), _mm256_mul_pd(vv, x[3]),
+                 _mm256_mul_pd(vv, x[4]), _mm256_mul_pd(vv, x[5]) };
+    }
 
 
-    auto toAvxQuadDouble4 = [&toQd] (const mnd::Real& a, const mnd::Real& b,
-                            const mnd::Real& c, const mnd::Real& d) -> AvxQuadDouble {
-        auto [x0, y0, z0, u0] = toQd(a);
-        auto [x1, y1, z1, u1] = toQd(b);
-        auto [x2, y2, z2, u2] = toQd(c);
-        auto [x3, y3, z3, u3] = toQd(d);
+    inline AvxHexDouble sq(void) const
+    {
+        return operator*(*this);
+    }
+};
+
+} // namespace avxfma_private
+
+void generateDoubleDoubleAvxFma(long width, long height, float* data, bool parallel,
+    double vx1, double vx2, double vy1, double vy2, double vw1, double vw2, double vh1, double vh2, int maxIter, bool smooth,
+    bool julia, double jX1, double jX2, double jY1, double jY2)
+{
+    using namespace avxfma_private;
+    using T = mnd::LightDoubleDouble;
 
 
-        __m256d xs = { x0, x1, x2, x3 };
-        __m256d ys = { y0, y1, y2, y3 };
-        __m256d zs = { z0, z1, z2, z3 };
-        __m256d us = { u0, u1, u2, u3 };
+    T viewx{ vx1, vx2 };
+    T viewy{ vy1, vy2 };
+    T wpp = T{ vw1, vw2 } * T(1.0 / width);
+    T hpp = T{ vh1, vh2 } * T(1.0 / height);
+
+    T jX{ jX1, jX2 };
+    T jY{ jY1, jY2 };
+    AvxDoubleDouble juliaX = { jX[0], jX[1] };
+    AvxDoubleDouble juliaY = { jY[0], jY[1] };
+
+#if defined(_OPENMP)
+    if (parallel)
+        omp_set_num_threads(omp_get_num_procs());
+#   pragma omp parallel for schedule(static, 1) if (parallel)
+#endif
+    for (long j = 0; j < height; j++) {
+        T y = viewy + T(double(j)) * hpp;
+        __m256d y0s = { y.x[0], y.x[0], y.x[0], y.x[0] };
+        __m256d y1s = { y.x[1], y.x[1], y.x[1], y.x[1] };
+        AvxDoubleDouble ys{ y0s, y1s };
+        for (long i = 0; i < width; i += 4) {
+            T x1 = viewx + T(double(i)) * wpp;
+            T x2 = x1 + wpp;
+            T x3 = x2 + wpp;
+            T x4 = x3 + wpp;
+
+            __m256d x0s = {
+                x1[0], x2[0], x3[0], x4[0],
+            };
+
+            __m256d x1s = {
+                x1[1], x2[1], x3[1], x4[1],
+            };
+
+            AvxDoubleDouble xs{ x0s, x1s };
+
+            AvxDoubleDouble cx = julia ? juliaX : xs;
+            AvxDoubleDouble cy = julia ? juliaY : ys;
+
+            int itRes[4] = { 0, 0, 0, 0 };
+
+            __m256d threshold = { 16.0, 16.0, 16.0, 16.0 };
+            __m256d counter = { 0, 0, 0, 0 };
+            __m256d adder = { 1, 1, 1, 1 };
+
+            AvxDoubleDouble a = xs;
+            AvxDoubleDouble b = ys;
+
+            __m256d resultsa;
+            __m256d resultsb;
+
+            __m256d cmp = _mm256_cmp_pd(threshold, threshold, _CMP_LE_OQ);
+            for (int k = 0; k < maxIter; k++) {
+                AvxDoubleDouble aa = a.sq();
+                AvxDoubleDouble bb = b.sq();
+                AvxDoubleDouble abab = a * b.mul_pow2(2.0);
+                a = aa - bb + cx;
+                b = abab + cy;
+                if (smooth) {
+                    resultsa = _mm256_blendv_pd(resultsa, a.x[0], cmp);
+                    resultsb = _mm256_blendv_pd(resultsb, b.x[0], cmp);
+                }
+                cmp = _mm256_cmp_pd(_mm256_add_pd(aa.x[0], bb.x[0]), threshold, _CMP_LE_OQ);
+                adder = _mm256_and_pd(adder, cmp);
+                counter = _mm256_add_pd(counter, adder);
+                if ((k & 0x7) && _mm256_testz_si256(_mm256_castpd_si256(cmp), _mm256_castpd_si256(cmp)) != 0) {
+                    break;
+                }
+            }
+
+            double resData[12];
+            double* ftRes = resData;
+            double* resa = ftRes + 4;
+            double* resb = ftRes + 8;
+            _mm256_storeu_pd(ftRes, counter);
+            _mm256_storeu_pd(resa, resultsa);
+            _mm256_storeu_pd(resb, resultsb);
+
+            for (int k = 0; k < 4 && i + k < width; k++) {
+                if (smooth)
+                    data[i + k + j * width] = ftRes[k] < 0 ? maxIter :
+                        ftRes[k] >= maxIter ? maxIter :
+                        ((float)ftRes[k]) + 1 - floatLog2(floatLog(resa[k] * resa[k] + resb[k] * resb[k]) / 2);
+                else
+                    data[i + k + j * width] = ftRes[k] >= 0 ? float(ftRes[k]) : maxIter;
+            }
+        }
+    }
+}
+
+
+void generateQuadDoubleAvxFma(long width, long height, float* data, bool parallel,
+    const double* vx, const double* vy,
+    const double* vw, const double* vh,
+    int maxIter, bool smooth, bool julia,
+    const double* jXp, const double* jYp)
+{
+    using namespace avxfma_private;
+    using T = mnd::QuadDouble;
+
+    T viewx{ vx[0], vx[1], vx[2], vx[3] };
+    T viewy{ vy[0], vy[1], vy[2], vy[3] };
+    T wpp = T{ vw[0], vw[1], vw[2], vw[3] } * T(1.0 / width);
+    T hpp = T{ vh[0], vh[1], vh[2], vh[3] } * T(1.0 / height);
+
+
+    T jX{ jXp[0], jXp[1], jXp[2], jXp[3] };
+    T jY{ jYp[0], jYp[1], jYp[2], jYp[3] };
+
+
+    auto toAvxQuadDouble4 = [] (const T& a, const T& b,
+            const T& c, const T& d) -> AvxQuadDouble {
+        __m256d xs = { a[0], b[0], c[0], d[0] };
+        __m256d ys = { a[1], b[1], c[1], d[1] };
+        __m256d zs = { a[2], b[2], c[2], d[2] };
+        __m256d us = { a[3], b[3], c[3], d[3] };
 
 
         return AvxQuadDouble{ xs, ys, zs, us };
         return AvxQuadDouble{ xs, ys, zs, us };
     };
     };
 
 
-    AvxQuadDouble juliaX = toAvxQuadDouble(jX);
-    AvxQuadDouble juliaY = toAvxQuadDouble(jY);
+    AvxQuadDouble juliaX{ jX[0], jX[1], jX[2], jX[3] };
+    AvxQuadDouble juliaY{ jY[0], jY[1], jY[2], jY[3] };
 
 
 #if defined(_OPENMP)
 #if defined(_OPENMP)
-    if constexpr(parallel)
+    if (parallel)
         omp_set_num_threads(omp_get_num_procs());
         omp_set_num_threads(omp_get_num_procs());
 #   pragma omp parallel for schedule(static, 1) if (parallel)
 #   pragma omp parallel for schedule(static, 1) if (parallel)
 #endif
 #endif
-    for (long j = 0; j < info.bHeight; j++) {
+    for (long j = 0; j < height; j++) {
         T y = viewy + T(double(j)) * hpp;
         T y = viewy + T(double(j)) * hpp;
-        AvxQuadDouble ys = toAvxQuadDouble(y);
-        for (long i = 0; i < info.bWidth; i += 4) {
+        AvxQuadDouble ys{ y[0], y[1], y[2], y[3] };
+        for (long i = 0; i < width; i += 4) {
             T x1 = viewx + T(double(i)) * wpp;
             T x1 = viewx + T(double(i)) * wpp;
             T x2 = x1 + wpp;
             T x2 = x1 + wpp;
             T x3 = x2 + wpp;
             T x3 = x2 + wpp;
@@ -851,10 +964,8 @@ void CpuGenerator<mnd::QuadDouble, mnd::X86_AVX_FMA, parallel>::generate(const m
 
 
             AvxQuadDouble xs = toAvxQuadDouble4(x1, x2, x3, x4);
             AvxQuadDouble xs = toAvxQuadDouble4(x1, x2, x3, x4);
 
 
-            AvxQuadDouble cx = info.julia ? juliaX : xs;
-            AvxQuadDouble cy = info.julia ? juliaY : ys;
-
-            int itRes[4] = { 0, 0, 0, 0 };
+            AvxQuadDouble cx = julia ? juliaX : xs;
+            AvxQuadDouble cy = julia ? juliaY : ys;
 
 
             __m256d threshold = { 16.0, 16.0, 16.0, 16.0 };
             __m256d threshold = { 16.0, 16.0, 16.0, 16.0 };
             __m256d counter = { 0, 0, 0, 0 };
             __m256d counter = { 0, 0, 0, 0 };
@@ -867,13 +978,13 @@ void CpuGenerator<mnd::QuadDouble, mnd::X86_AVX_FMA, parallel>::generate(const m
             __m256d resultsb;
             __m256d resultsb;
 
 
             __m256d cmp = _mm256_cmp_pd(threshold, threshold, _CMP_LE_OQ);
             __m256d cmp = _mm256_cmp_pd(threshold, threshold, _CMP_LE_OQ);
-            for (int k = 0; k < info.maxIter; k++) {
+            for (int k = 0; k < maxIter; k++) {
                 AvxQuadDouble aa = a.sq();
                 AvxQuadDouble aa = a.sq();
                 AvxQuadDouble bb = b.sq();
                 AvxQuadDouble bb = b.sq();
                 AvxQuadDouble abab = a * b.mul_pow2(2.0);
                 AvxQuadDouble abab = a * b.mul_pow2(2.0);
                 a = aa - bb + cx;
                 a = aa - bb + cx;
                 b = abab + cy;
                 b = abab + cy;
-                if (info.smooth) {
+                if (smooth) {
                     resultsa = _mm256_blendv_pd(resultsa, a.x[0], cmp);
                     resultsa = _mm256_blendv_pd(resultsa, a.x[0], cmp);
                     resultsb = _mm256_blendv_pd(resultsb, b.x[0], cmp);
                     resultsb = _mm256_blendv_pd(resultsb, b.x[0], cmp);
                 }
                 }
@@ -885,27 +996,121 @@ void CpuGenerator<mnd::QuadDouble, mnd::X86_AVX_FMA, parallel>::generate(const m
                 }
                 }
             }
             }
 
 
-            auto alignVec = [](double* data) -> double* {
-                void* aligned = data;
-                ::size_t length = 64;
-                std::align(32, 4 * sizeof(double), aligned, length);
-                return static_cast<double*>(aligned);
-            };
+            double resData[12];
+            double* ftRes = resData;
+            double* resa = resData + 4;
+            double* resb = resData + 8;
+            _mm256_storeu_pd(ftRes, counter);
+            _mm256_storeu_pd(resa, resultsa);
+            _mm256_storeu_pd(resb, resultsb);
+
+            for (int k = 0; k < 4 && i + k < width; k++) {
+                if (smooth)
+                    data[i + k + j * width] = ftRes[k] < 0 ? maxIter :
+                        ftRes[k] >= maxIter ? maxIter :
+                        ((float)ftRes[k]) + 1 - floatLog2(floatLog(resa[k] * resa[k] + resb[k] * resb[k]) / 2);
+                else
+                    data[i + k + j * width] = ftRes[k] >= 0 ? float(ftRes[k]) : maxIter;
+            }
+        }
+    }
+}
+
+
+void generateHexDoubleAvxFma(long width, long height, float* data, bool parallel,
+    const double* vx, const double* vy,
+    const double* vw, const double* vh,
+    int maxIter, bool smooth, bool julia,
+    const double* jX, const double* jY)
+{
+    using namespace avxfma_private;
+    using T = mnd::HexDouble;
+
+    T viewx{ vx[0], vx[1], vx[2], vx[3], vx[4], vx[5] };
+    T viewy{ vy[0], vy[1], vy[2], vy[3] , vy[4], vy[5] };
+    T wpp = T{ vw[0], vw[1], vw[2], vw[3], vw[4], vw[5] } * T(1.0 / width);
+    T hpp = T{ vh[0], vh[1], vh[2], vh[3], vh[4], vh[5] } * T(1.0 / height);
+
+    auto toAvxHexDouble4 = [] (const T& a, const T& b,
+            const T& c, const T& d) -> AvxHexDouble {
+        __m256d xs = { a[0], b[0], c[0], d[0] };
+        __m256d ys = { a[1], b[1], c[1], d[1] };
+        __m256d zs = { a[2], b[2], c[2], d[2] };
+        __m256d us = { a[3], b[3], c[3], d[3] };
+        __m256d vs = { a[4], b[4], c[4], d[4] };
+        __m256d ws = { a[5], b[5], c[5], d[5] };
+
+        return AvxHexDouble{ xs, ys, zs, us, vs, ws };
+    };
 
 
-            double resData[8];
-            double* ftRes = alignVec(resData);
-            double* resa = (double*) &resultsa;
-            double* resb = (double*) &resultsb;
-            _mm256_store_pd(ftRes, counter);
-
-            for (int k = 0; k < 4 && i + k < info.bWidth; k++) {
-                if (info.smooth)
-                    data[i + k + j * info.bWidth] = ftRes[k] < 0 ? info.maxIter :
-                        ftRes[k] >= info.maxIter ? info.maxIter :
-                        ((float)ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f);
+    AvxHexDouble juliaX{ jX[0], jX[1], jX[2], jX[3], jX[4], jX[5] };
+    AvxHexDouble juliaY{ jY[0], jY[1], jY[2], jY[3], jY[4], jY[5] };
+
+#if defined(_OPENMP)
+    if (parallel)
+        omp_set_num_threads(omp_get_num_procs());
+#   pragma omp parallel for schedule(static, 1) if (parallel)
+#endif
+    for (long j = 0; j < height; j++) {
+        T y = viewy + T(double(j)) * hpp;
+        AvxHexDouble ys{ y[0], y[1], y[2], y[3], y[4], y[5] };
+        for (long i = 0; i < width; i += 4) {
+            T x1 = viewx + T(double(i)) * wpp;
+            T x2 = x1 + wpp;
+            T x3 = x2 + wpp;
+            T x4 = x3 + wpp;
+
+            AvxHexDouble xs = toAvxHexDouble4(x1, x2, x3, x4);
+
+            AvxHexDouble cx = julia ? juliaX : xs;
+            AvxHexDouble cy = julia ? juliaY : ys;
+
+            __m256d threshold = { 16.0, 16.0, 16.0, 16.0 };
+            __m256d counter = { 0, 0, 0, 0 };
+            __m256d adder = { 1, 1, 1, 1 };
+
+            AvxHexDouble a = xs;
+            AvxHexDouble b = ys;
+
+            __m256d resultsa;
+            __m256d resultsb;
+
+            __m256d cmp = _mm256_cmp_pd(threshold, threshold, _CMP_LE_OQ);
+            for (int k = 0; k < maxIter; k++) {
+                AvxHexDouble aa = a.sq();
+                AvxHexDouble bb = b.sq();
+                AvxHexDouble abab = a * b.mul_pow2(2.0);
+                a = aa - bb + cx;
+                b = abab + cy;
+                if (smooth) {
+                    resultsa = _mm256_blendv_pd(resultsa, a.x[0], cmp);
+                    resultsb = _mm256_blendv_pd(resultsb, b.x[0], cmp);
+                }
+                cmp = _mm256_cmp_pd(_mm256_add_pd(aa.x[0], bb.x[0]), threshold, _CMP_LE_OQ);
+                adder = _mm256_and_pd(adder, cmp);
+                counter = _mm256_add_pd(counter, adder);
+                if (_mm256_testz_si256(_mm256_castpd_si256(cmp), _mm256_castpd_si256(cmp)) != 0) {
+                    break;
+                }
+            }
+
+            double resData[12];
+            double* ftRes = resData;
+            double* resa = resData + 4;
+            double* resb = resData + 8;
+            _mm256_storeu_pd(ftRes, counter);
+            _mm256_storeu_pd(resa, resultsa);
+            _mm256_storeu_pd(resb, resultsb);
+
+            for (int k = 0; k < 4 && i + k < width; k++) {
+                if (smooth)
+                    data[i + k + j * width] = ftRes[k] < 0 ? maxIter :
+                        ftRes[k] >= maxIter ? maxIter :
+                        ((float)ftRes[k]) + 1 - floatLog2(floatLog(resa[k] * resa[k] + resb[k] * resb[k]) / 2);
                 else
                 else
-                    data[i + k + j * info.bWidth] = ftRes[k] >= 0 ? float(ftRes[k]) : info.maxIter;
+                    data[i + k + j * width] = ftRes[k] >= 0 ? float(ftRes[k]) : maxIter;
             }
             }
         }
         }
     }
     }
 }
 }
+

+ 2 - 0
libmandel/src/Mandel.cpp

@@ -93,10 +93,12 @@ MandelContext::MandelContext(void)
         auto davxfma = std::make_unique<CpuGenerator<double, mnd::X86_AVX_FMA, true>>();
         auto davxfma = std::make_unique<CpuGenerator<double, mnd::X86_AVX_FMA, true>>();
         auto ddavxfma = std::make_unique<CpuGenerator<DoubleDouble, mnd::X86_AVX_FMA, true>>();
         auto ddavxfma = std::make_unique<CpuGenerator<DoubleDouble, mnd::X86_AVX_FMA, true>>();
         auto qdavxfma = std::make_unique<CpuGenerator<QuadDouble, mnd::X86_AVX_FMA, true>>();
         auto qdavxfma = std::make_unique<CpuGenerator<QuadDouble, mnd::X86_AVX_FMA, true>>();
+        auto hdavxfma = std::make_unique<CpuGenerator<HexDouble, mnd::X86_AVX_FMA, true>>();
         cpuGenerators.insert({ std::pair{ Precision::FLOAT, CpuExtension::X86_AVX_FMA }, std::move(favxfma) });
         cpuGenerators.insert({ std::pair{ Precision::FLOAT, CpuExtension::X86_AVX_FMA }, std::move(favxfma) });
         cpuGenerators.insert({ std::pair{ Precision::DOUBLE, CpuExtension::X86_AVX_FMA }, std::move(davxfma) });
         cpuGenerators.insert({ std::pair{ Precision::DOUBLE, CpuExtension::X86_AVX_FMA }, std::move(davxfma) });
         cpuGenerators.insert({ std::pair{ Precision::DOUBLE_DOUBLE, CpuExtension::X86_AVX_FMA }, std::move(ddavxfma) });
         cpuGenerators.insert({ std::pair{ Precision::DOUBLE_DOUBLE, CpuExtension::X86_AVX_FMA }, std::move(ddavxfma) });
         cpuGenerators.insert({ std::pair{ Precision::QUAD_DOUBLE, CpuExtension::X86_AVX_FMA }, std::move(qdavxfma) });
         cpuGenerators.insert({ std::pair{ Precision::QUAD_DOUBLE, CpuExtension::X86_AVX_FMA }, std::move(qdavxfma) });
+        cpuGenerators.insert({ std::pair{ Precision::HEX_DOUBLE, CpuExtension::X86_AVX_FMA }, std::move(hdavxfma) });
     }
     }
     if (cpuInfo.hasSse2()) {
     if (cpuInfo.hasSse2()) {
         auto fl = std::make_unique<CpuGenerator<float, mnd::X86_SSE2, true>>();
         auto fl = std::make_unique<CpuGenerator<float, mnd::X86_SSE2, true>>();