Ver código fonte

Made the AVX-FMA generator kernels extern functions and added a hex-double AVX-FMA generator

Nicolas Winkler 4 anos atrás
pai
commit
db5304cabc

+ 12 - 0
libmandel/include/CpuGenerators.h

@@ -155,6 +155,18 @@ public:
 
 
 template<bool parallel>
+class mnd::CpuGenerator<mnd::HexDouble, mnd::X86_AVX_FMA, parallel> : public MandelGenerator
+{
+public:
+    inline CpuGenerator(void) :
+        MandelGenerator{ mnd::Precision::HEX_DOUBLE, mnd::X86_AVX_FMA }
+    {
+    }
+    virtual void generate(const MandelInfo& info, float* data);
+};
+
+
+template<bool parallel>
 class mnd::CpuGenerator<float, mnd::X86_AVX_512, parallel> : public MandelGenerator
 {
 public:

+ 153 - 0
libmandel/src/CpuGenerators.cpp

@@ -120,6 +120,11 @@ void CpuGenerator<T, mnd::NONE, parallel>::generate(const mnd::MandelInfo& info,
 }
 
 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)
+
+// ===========================================
+// =================== AVX ===================
+// ===========================================
+
 namespace mnd
 {
     template class CpuGenerator<float, mnd::X86_AVX, false>;
@@ -232,6 +237,154 @@ void CpuGenerator<mnd::TripleDouble, mnd::X86_AVX, parallel>::generate(const mnd
 }
 
 
+
+// ===========================================
+// ================= AVX-FMA =================
+// ===========================================
+
+namespace mnd
+{
+    template class CpuGenerator<float, mnd::X86_AVX_FMA, false>;
+    template class CpuGenerator<float, mnd::X86_AVX_FMA, true>;
+
+    template class CpuGenerator<double, mnd::X86_AVX_FMA, false>;
+    template class CpuGenerator<double, mnd::X86_AVX_FMA, true>;
+
+    template class CpuGenerator<DoubleDouble, mnd::X86_AVX_FMA, false>;
+    template class CpuGenerator<DoubleDouble, mnd::X86_AVX_FMA, true>;
+
+    template class CpuGenerator<QuadDouble, mnd::X86_AVX_FMA, false>;
+    template class CpuGenerator<QuadDouble, mnd::X86_AVX_FMA, true>;
+
+    template class CpuGenerator<HexDouble, mnd::X86_AVX_FMA, false>;
+    template class CpuGenerator<HexDouble, mnd::X86_AVX_FMA, true>;
+}
+
+
// Iteration kernels implemented in CpuGeneratorsAVXFMA.cpp. They are
// declared extern here so all AVX/FMA instructions stay confined to that
// translation unit (which is built with the matching target flags).
// Multi-component types are passed either split into individual doubles
// (double-double) or as pointers to their component arrays (quad-/hex-double).

extern void generateFloatAvxFma(long width, long height, float* data, bool parallel,
    float vx, float vy, float vw, float vh, int maxIter, bool smooth,
    bool julia, float jX, float jY);

extern void generateDoubleAvxFma(long width, long height, float* data, bool parallel,
    double vx, double vy, double vw, double vh, int maxIter, bool smooth,
    bool julia, double jX, double jY);

extern void generateDoubleDoubleAvxFma(long width, long height, float* data, bool parallel,
    double vx1, double vx2, double vy1, double vy2, double vw1, double vw2, double vh1, double vh2, int maxIter, bool smooth,
    bool julia, double jX1, double jX2, double jY1, double jY2);

extern void generateQuadDoubleAvxFma(long width, long height, float* data, bool parallel,
    const double* vx, const double* vy,
    const double* vw, const double* vh,
    int maxIter, bool smooth, bool julia,
    const double* jXp, const double* jYp);

extern void generateHexDoubleAvxFma(long width, long height, float* data, bool parallel,
    const double* vx, const double* vy,
    const double* vw, const double* vh,
    int maxIter, bool smooth, bool julia,
    const double* jXp, const double* jYp);
+
+
+template<bool parallel>
+void CpuGenerator<float, mnd::X86_AVX_FMA, parallel>::generate(const mnd::MandelInfo& info, float* data)
+{
+    using T = float;
+    const MandelViewport& view = info.view;
+
+    const T vx = mnd::convert<T>(view.x);
+    const T vy = mnd::convert<T>(view.y);
+    const T vw = mnd::convert<T>(view.width);
+    const T vh = mnd::convert<T>(view.height);
+
+    T jX = mnd::convert<T>(info.juliaX);
+    T jY = mnd::convert<T>(info.juliaY);
+
+    generateFloatAvxFma(info.bWidth, info.bHeight, data, parallel, vx, vy, vw, vh, info.maxIter, info.smooth, info.julia, jX, jY);
+}
+
+
+template<bool parallel>
+void CpuGenerator<double, mnd::X86_AVX_FMA, parallel>::generate(const mnd::MandelInfo& info, float* data)
+{
+    using T = double;
+    const MandelViewport& view = info.view;
+
+    const T vx = mnd::convert<T>(view.x);
+    const T vy = mnd::convert<T>(view.y);
+    const T vw = mnd::convert<T>(view.width);
+    const T vh = mnd::convert<T>(view.height);
+
+    T jX = mnd::convert<T>(info.juliaX);
+    T jY = mnd::convert<T>(info.juliaY);
+
+    generateDoubleAvxFma(info.bWidth, info.bHeight, data, parallel, vx, vy, vw, vh, info.maxIter, info.smooth, info.julia, jX, jY);
+}
+
+
+template<bool parallel>
+void CpuGenerator<mnd::DoubleDouble, mnd::X86_AVX_FMA, parallel>::generate(const mnd::MandelInfo& info, float* data)
+{
+    using T = mnd::DoubleDouble;
+    const MandelViewport& view = info.view;
+
+    const T vx = mnd::convert<T>(view.x);
+    const T vy = mnd::convert<T>(view.y);
+    const T vw = mnd::convert<T>(view.width);
+    const T vh = mnd::convert<T>(view.height);
+
+    T jX = mnd::convert<T>(info.juliaX);
+    T jY = mnd::convert<T>(info.juliaY);
+
+    generateDoubleDoubleAvxFma(info.bWidth, info.bHeight, data, parallel,
+        vx.x[0], vx.x[1], vy.x[0], vy.x[1], vw.x[0], vw.x[1], vh.x[0], vh.x[1],
+        info.maxIter, info.smooth, info.julia, jX.x[0], jX.x[1], jY.x[0], jY.x[1]);
+}
+
+
+template<bool parallel>
+void CpuGenerator<mnd::QuadDouble, mnd::X86_AVX_FMA, parallel>::generate(const mnd::MandelInfo& info, float* data)
+{
+    using T = mnd::QuadDouble;
+    const MandelViewport& view = info.view;
+
+    const T vx = mnd::convert<T>(view.x);
+    const T vy = mnd::convert<T>(view.y);
+    const T vw = mnd::convert<T>(view.width);
+    const T vh = mnd::convert<T>(view.height);
+
+    T jX = mnd::convert<T>(info.juliaX);
+    T jY = mnd::convert<T>(info.juliaY);
+
+    generateQuadDoubleAvxFma(info.bWidth, info.bHeight, data, parallel,
+        vx.x, vy.x,
+        vw.x, vh.x,
+        info.maxIter, info.smooth, info.julia,
+        jX.x, jY.x);
+}
+
+
+template<bool parallel>
+void CpuGenerator<mnd::HexDouble, mnd::X86_AVX_FMA, parallel>::generate(const mnd::MandelInfo& info, float* data)
+{
+    using T = mnd::HexDouble;
+    const MandelViewport& view = info.view;
+
+    const T vx = mnd::convert<T>(view.x);
+    const T vy = mnd::convert<T>(view.y);
+    const T vw = mnd::convert<T>(view.width);
+    const T vh = mnd::convert<T>(view.height);
+
+    T jX = mnd::convert<T>(info.juliaX);
+    T jY = mnd::convert<T>(info.juliaY);
+
+    generateHexDoubleAvxFma(info.bWidth, info.bHeight, data, parallel,
+        vx.x, vy.x,
+        vw.x, vh.x,
+        info.maxIter, info.smooth, info.julia,
+        jX.x, jY.x);
+}
+
 #ifdef WITH_AVX512
 
 namespace mnd

+ 2 - 1
libmandel/src/CpuGeneratorsAVX.cpp

@@ -625,6 +625,7 @@ void generateDoubleDoubleAvx(long width, long height, float* data, bool parallel
     }
 }
 
+
 void generateTripleDoubleAvx(long width, long height, float* data, bool parallel,
     double vx1, double vx2, double vx3, double vy1, double vy2, double vy3,
     double vw1, double vw2, double vw3, double vh1, double vh2, double vh3,
@@ -714,7 +715,7 @@ void generateTripleDoubleAvx(long width, long height, float* data, bool parallel
                 if (smooth)
                     data[i + k + j * width] = float(ftRes[k] < 0 ? maxIter :
                         ftRes[k] >= maxIter ? maxIter :
-                        ((float)ftRes[k]) + 1 - floatLog2(::floatLog(float(resa[k] * resa[k] + resb[k] * resb[k])) * 0.5f));
+                        ((float)ftRes[k]) + 1 - floatLog2(floatLog(float(resa[k] * resa[k] + resb[k] * resb[k])) * 0.5f));
                 else
                     data[i + k + j * width] = ftRes[k] >= 0 ? float(ftRes[k]) : maxIter;
             }

+ 482 - 277
libmandel/src/CpuGeneratorsAVXFMA.cpp

@@ -1,54 +1,44 @@
-#include "CpuGenerators.h"
+#include "FloatLog.h"
 
 #include <immintrin.h>
 #include <omp.h>
-#include <cmath>
 
-#include <utility>
-#include <memory>
-
-using mnd::CpuGenerator;
-
-namespace mnd
+///
+/// \brief unique namespace just to be a little more sure we do not
+///        accidentally compile a function used somewhere else and use
+///        avx instructions in there.
+///
+namespace avxfma_private
 {
-    template class CpuGenerator<float, mnd::X86_AVX_FMA, false>;
-    template class CpuGenerator<float, mnd::X86_AVX_FMA, true>;
-
-    template class CpuGenerator<double, mnd::X86_AVX_FMA, false>;
-    template class CpuGenerator<double, mnd::X86_AVX_FMA, true>;
-
-    template class CpuGenerator<DoubleDouble, mnd::X86_AVX_FMA, false>;
-    template class CpuGenerator<DoubleDouble, mnd::X86_AVX_FMA, true>;
-
-    template class CpuGenerator<QuadDouble, mnd::X86_AVX_FMA, false>;
-    template class CpuGenerator<QuadDouble, mnd::X86_AVX_FMA, true>;
+#include "LightDoubleDouble.h"
+#include "QuadDouble.h"
+#include "HexDouble.h"
 }
 
 
-template<bool parallel>
-void CpuGenerator<float, mnd::X86_AVX_FMA, parallel>::generate(const mnd::MandelInfo& info, float* data)
+
+void generateFloatAvxFma(long width, long height, float* data, bool parallel,
+    float vx, float vy, float vw, float vh, int maxIter, bool smooth,
+    bool julia, float jX, float jY)
 {
     using T = float;
-    const MandelViewport& view = info.view;
-    const float dppf = float(view.width / info.bWidth);
-    const float viewxf = float(view.x);
+    const float dppf = float(vw / width);
+    const float viewxf = vx; 
     __m256 viewx = { viewxf, viewxf, viewxf, viewxf, viewxf, viewxf, viewxf, viewxf };
     __m256 dpp = { dppf, dppf, dppf, dppf, dppf, dppf, dppf, dppf };
 
-    T jX = mnd::convert<T>(info.juliaX);
-    T jY = mnd::convert<T>(info.juliaY);
     __m256 juliaX = { jX, jX, jX, jX, jX, jX, jX, jX };
     __m256 juliaY = { jY, jY, jY, jY, jY, jY, jY, jY };
 
 #if defined(_OPENMP)
-    if constexpr(parallel)
+    if (parallel)
         omp_set_num_threads(omp_get_num_procs());
 #   pragma omp parallel for schedule(static, 1) if (parallel)
 #endif
-    for (long j = 0; j < info.bHeight; j++) {
-        T y = T(view.y) + T(j) * T(view.height / info.bHeight);
+    for (long j = 0; j < height; j++) {
+        T y = vy + T(j) * vh / height;
         __m256 ys = {y, y, y, y, y, y, y, y};
-        for (long i = 0; i < info.bWidth; i += 24) {
+        for (long i = 0; i < width; i += 24) {
             __m256 pixc = { float(i), float(i + 1), float(i + 2), float(i + 3), float(i + 4), float(i + 5), float(i + 6), float(i + 7) };
             __m256 pixc2 = { float(i + 8), float(i + 9), float(i + 10), float(i + 11), float(i + 12), float(i + 13), float(i + 14), float(i + 15) };
             __m256 pixc3 = { float(i + 16), float(i + 17), float(i + 18), float(i + 19), float(i + 20), float(i + 21), float(i + 22), float(i + 23) };
@@ -82,16 +72,16 @@ void CpuGenerator<float, mnd::X86_AVX_FMA, parallel>::generate(const mnd::Mandel
             __m256 b2 = ys;
             __m256 b3 = ys;
 
-            __m256 cx = info.julia ? juliaX : xs;
-            __m256 cx2 = info.julia ? juliaX : xs2;
-            __m256 cx3 = info.julia ? juliaX : xs3;
-            __m256 cy = info.julia ? juliaY : ys;
+            __m256 cx = julia ? juliaX : xs;
+            __m256 cx2 = julia ? juliaX : xs2;
+            __m256 cx3 = julia ? juliaX : xs3;
+            __m256 cy = julia ? juliaY : ys;
 
-            if (info.smooth) {
+            if (smooth) {
                 __m256 cmp = _mm256_cmp_ps(threshold, threshold, _CMP_LE_OQ);
                 __m256 cmp2 = _mm256_cmp_ps(threshold, threshold, _CMP_LE_OQ);
                 __m256 cmp3 = _mm256_cmp_ps(threshold, threshold, _CMP_LE_OQ);
-                for (int k = 0; k < info.maxIter; k++) {
+                for (int k = 0; k < maxIter; k++) {
                     __m256 bb = _mm256_mul_ps(b, b);
                     __m256 bb2 = _mm256_mul_ps(b2, b2);
                     __m256 bb3 = _mm256_mul_ps(b3, b3);
@@ -134,7 +124,7 @@ void CpuGenerator<float, mnd::X86_AVX_FMA, parallel>::generate(const mnd::Mandel
                 }
             }
             else {
-                for (int k = 0; k < info.maxIter; k++) {
+                for (int k = 0; k < maxIter; k++) {
                     __m256 bb = _mm256_mul_ps(b, b);
                     __m256 bb2 = _mm256_mul_ps(b2, b2);
                     __m256 bb3 = _mm256_mul_ps(b3, b3);
@@ -162,36 +152,28 @@ void CpuGenerator<float, mnd::X86_AVX_FMA, parallel>::generate(const mnd::Mandel
                 }
             }
 
-
-            auto alignVec = [](float* data) -> float* {
-                void* aligned = data;
-                ::size_t length = 64;
-                std::align(32, 8 * sizeof(float), aligned, length);
-                return static_cast<float*>(aligned);
-            };
-
             float resData[96];
-            float* ftRes = alignVec(resData);
+            float* ftRes = resData;
             float* resa = ftRes + 24;
             float* resb = resa + 24;
 
-            _mm256_store_ps(ftRes, counter);
-            _mm256_store_ps(ftRes + 8, counter2);
-            _mm256_store_ps(ftRes + 16, counter3);
-            _mm256_store_ps(resa, resultsa);
-            _mm256_store_ps(resa + 8, resultsa2);
-            _mm256_store_ps(resa + 16, resultsa3);
-            _mm256_store_ps(resb, resultsb);
-            _mm256_store_ps(resb + 8, resultsb2);
-            _mm256_store_ps(resb + 16, resultsb3);
-            for (int k = 0; k < 24 && i + k < info.bWidth; k++) {
-                if (info.smooth) {
-                    data[i + k + j * info.bWidth] = ftRes[k] < 0 ? info.maxIter :
-                        ftRes[k] >= info.maxIter ? info.maxIter :
-                        ((float)ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f);
+            _mm256_storeu_ps(ftRes, counter);
+            _mm256_storeu_ps(ftRes + 8, counter2);
+            _mm256_storeu_ps(ftRes + 16, counter3);
+            _mm256_storeu_ps(resa, resultsa);
+            _mm256_storeu_ps(resa + 8, resultsa2);
+            _mm256_storeu_ps(resa + 16, resultsa3);
+            _mm256_storeu_ps(resb, resultsb);
+            _mm256_storeu_ps(resb + 8, resultsb2);
+            _mm256_storeu_ps(resb + 16, resultsb3);
+            for (int k = 0; k < 24 && i + k < width; k++) {
+                if (smooth) {
+                    data[i + k + j * width] = ftRes[k] < 0 ? maxIter :
+                        ftRes[k] >= maxIter ? maxIter :
+                        ((float)ftRes[k]) + 1 - floatLog2(floatLog(resa[k] * resa[k] + resb[k] * resb[k]) / 2);
                 }
                 else {
-                    data[i + k + j * info.bWidth] = ftRes[k] < 0 ? info.maxIter : ftRes[k];
+                    data[i + k + j * width] = ftRes[k] < 0 ? maxIter : ftRes[k];
                 }
             }
         }
@@ -199,32 +181,30 @@ void CpuGenerator<float, mnd::X86_AVX_FMA, parallel>::generate(const mnd::Mandel
 }
 
 
-template<bool parallel>
-void CpuGenerator<double, mnd::X86_AVX_FMA, parallel>::generate(const mnd::MandelInfo& info, float* data)
+void generateDoubleAvxFma(long width, long height, float* data, bool parallel,
+    double vx, double vy, double vw, double vh, int maxIter, bool smooth,
+    bool julia, double jX, double jY)
 {
     using T = double;
-    const MandelViewport& view = info.view;
 
-    const double dppf = double(view.width / info.bWidth);
-    const double viewxf = double(view.x);
+    const double dppf = double(vw / width);
+    const double viewxf = double(vx);
     __m256d viewx = { viewxf, viewxf, viewxf, viewxf };
     __m256d dpp = { dppf, dppf, dppf, dppf };
 
-    T jX = mnd::convert<T>(info.juliaX);
-    T jY = mnd::convert<T>(info.juliaY);
     __m256d juliaX = { jX, jX, jX, jX };
     __m256d juliaY = { jY, jY, jY, jY };
 
 
 #if defined(_OPENMP)
-    if constexpr(parallel)
+    if (parallel)
         omp_set_num_threads(omp_get_num_procs());
 #   pragma omp parallel for schedule(static, 1) if (parallel)
 #endif
-    for (long j = 0; j < info.bHeight; j++) {
-        T y = T(view.y + T(j) * view.height / info.bHeight);
+    for (long j = 0; j < height; j++) {
+        T y = vy + T(j) * vh / height;
         __m256d ys = { y, y, y, y };
-        for (long i = 0; i < info.bWidth; i += 8) {
+        for (long i = 0; i < width; i += 8) {
             __m256d pixc = { double(i), double(i + 1), double(i + 2), double(i + 3) };
             __m256d pixc2 = { double(i + 4), double(i + 5), double(i + 6), double(i + 7) };
             __m256d xs = _mm256_fmadd_pd(dpp, pixc, viewx);
@@ -249,14 +229,14 @@ void CpuGenerator<double, mnd::X86_AVX_FMA, parallel>::generate(const mnd::Mande
             __m256d a2 = xs2;
             __m256d b2 = ys;
 
-            __m256d cx = info.julia ? juliaX : xs;
-            __m256d cy = info.julia ? juliaY : ys;
-            __m256d cx2 = info.julia ? juliaX : xs2;
+            __m256d cx = julia ? juliaX : xs;
+            __m256d cy = julia ? juliaY : ys;
+            __m256d cx2 = julia ? juliaX : xs2;
             //__m256d cy2 = info.julia ? juliaY : ys;
 
             __m256d cmp = _mm256_cmp_pd(threshold, threshold, _CMP_LE_OQ);
             __m256d cmp2 = _mm256_cmp_pd(threshold, threshold, _CMP_LE_OQ);
-            for (int k = 0; k < info.maxIter; k++) {
+            for (int k = 0; k < maxIter; k++) {
                 __m256d aa = _mm256_mul_pd(a, a);
                 __m256d ab = _mm256_mul_pd(a, b);
                 __m256d bb = _mm256_mul_pd(b, b);
@@ -269,7 +249,7 @@ void CpuGenerator<double, mnd::X86_AVX_FMA, parallel>::generate(const mnd::Mande
                 a2 = _mm256_add_pd(a2, cx2);
                 b = _mm256_fmadd_pd(two, ab, cy);
                 b2 = _mm256_fmadd_pd(two, ab2, cy);
-                if (info.smooth) {
+                if (smooth) {
                     resultsa = _mm256_blendv_pd(resultsa, a, cmp);
                     resultsb = _mm256_blendv_pd(resultsb, b, cmp);
                     resultsa2 = _mm256_blendv_pd(resultsa2, a2, cmp2);
@@ -287,45 +267,33 @@ void CpuGenerator<double, mnd::X86_AVX_FMA, parallel>::generate(const mnd::Mande
                 }
             }
 
-            auto alignVec = [](double* data) -> double* {
-                void* aligned = data;
-                ::size_t length = 64;
-                std::align(32, 4 * sizeof(double), aligned, length);
-                return static_cast<double*>(aligned);
-            };
-
-            double resData[8];
-            double* ftRes = alignVec(resData);
-            double* resa = (double*) &resultsa;
-            double* resb = (double*) &resultsb;
-            _mm256_store_pd(ftRes, counter);
-            for (int k = 0; k < 4 && i + k < info.bWidth; k++) {
-                if (info.smooth)
-                    data[i + k + j * info.bWidth] = ftRes[k] < 0 ? info.maxIter :
-                        ftRes[k] >= info.maxIter ? info.maxIter :
-                        ((float)ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f);
+            double resData[24];
+            double* ftRes = resData;
+            double* resa = ftRes + 8;
+            double* resb = ftRes + 16;
+            _mm256_storeu_pd(ftRes, counter);
+            _mm256_storeu_pd(ftRes + 4, counter2);
+            _mm256_storeu_pd(resa, resultsa);
+            _mm256_storeu_pd(resa + 4, resultsa2);
+            _mm256_storeu_pd(resb, resultsb);
+            _mm256_storeu_pd(resb + 4, resultsb2);
+            for (int k = 0; k < 8 && i + k < width; k++) {
+                if (smooth)
+                    data[i + k + j * width] = ftRes[k] < 0 ? maxIter :
+                        ftRes[k] >= maxIter ? maxIter :
+                        ((float)ftRes[k]) + 1 - floatLog2(floatLog(resa[k] * resa[k] + resb[k] * resb[k]) / 2);
                 else
-                    data[i + k + j * info.bWidth] = ftRes[k] < 0 ? info.maxIter : float(ftRes[k]);
+                    data[i + k + j * width] = ftRes[k] < 0 ? maxIter : float(ftRes[k]);
             }
-
-            resa = (double*) &resultsa2;
-            resb = (double*) &resultsb2;
-            _mm256_store_pd(ftRes, counter2);
-            i += 4;
-            for (int k = 0; k < 4 && i + k < info.bWidth; k++) {
-                if (info.smooth)
-                    data[i + k + j * info.bWidth] = ftRes[k] < 0 ? info.maxIter :
-                        ftRes[k] >= info.maxIter ? info.maxIter :
-                        ((float)ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f);
-                else
-                    data[i + k + j * info.bWidth] = ftRes[k] < 0 ? info.maxIter : float(ftRes[k]);
-            }
-            i -= 4;
         }
     }
 }
 
 
+namespace avxfma_private
+{
+
+
 struct VecPair
 {
     __m256d a;
@@ -402,6 +370,25 @@ static inline __m256d threeOneSum(__m256d a, __m256d b, __m256d c)
 }
 
 
+static inline VecQuadruple fourSum(__m256d a, __m256d b, __m256d c, __m256d d)
+{
+    auto[t1, t2] = twoSum(a, b);
+    auto[t3, t4] = twoSum(t1, c);
+    auto[r0, t5] = twoSum(t3, d);
+    auto[r1, r2, r3] = threeSum(t2, t4, t5);
+    return { r0, r1, r2, r3 };
+}
+
+static inline VecPair fiveTwoSum(__m256d a, __m256d b, __m256d c, __m256d d, __m256d e)
+{
+    auto[t1, t2] = twoSum(a, b);
+    auto[t3, t4] = twoSum(t1, c);
+    auto[t5, t6] = twoSum(t3, d);
+    auto[r0, t7] = twoSum(t5, e);
+    return { r0, _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(t2, t4), t6), t7) };
+}
+
+
 static inline VecTriple sixThreeSum(__m256d a, __m256d b, __m256d c,
                                     __m256d d, __m256d e, __m256d f)
 {
@@ -457,6 +444,23 @@ static inline VecPair nineTwoSum(__m256d a, __m256d b, __m256d c,
     return threeTwoSum(t5, t6, i);
 }
 
+
+inline VecTriple nineThreeSum(__m256d a, __m256d b, __m256d c,
+                              __m256d d, __m256d e, __m256d f,
+                              __m256d g, __m256d h, __m256d i)
+{
+    auto[a1, a2, a3] = threeSum(a, b, c);
+    auto[b1, b2, b3] = threeSum(d, e, f);
+    auto[c1, c2, c3] = threeSum(g, h, i);
+
+    auto[r1, t1, t2] = threeSum(a1, b1, c1);
+    auto[r2, t3, t4, t5] = fourSum(a2, b2, c2, t1);
+    auto r3 = _mm256_add_pd(_mm256_add_pd(
+            _mm256_add_pd(_mm256_add_pd(a3, b3), _mm256_add_pd(c3, t2)),
+            _mm256_add_pd(t3, t4)), t5);
+    return { r1, r2, r3 };
+}
+
 static inline VecQuadruple renormalize(__m256d x0, __m256d x1, __m256d x2, __m256d x3, __m256d x4)
 {
     auto [st1, t4] = quickTwoSum(x3, x4);
@@ -576,110 +580,6 @@ struct AvxDoubleDouble
 };
 
 
-template<bool parallel>
-void CpuGenerator<mnd::DoubleDouble, mnd::X86_AVX_FMA, parallel>::generate(const mnd::MandelInfo& info, float* data)
-{
-    const MandelViewport& view = info.view;
-
-    using T = LightDoubleDouble;
-
-    T viewx = mnd::convert<T>(view.x);
-    T viewy = mnd::convert<T>(view.y);
-    T wpp = mnd::convert<T>(view.width / info.bWidth);
-    T hpp = mnd::convert<T>(view.height / info.bHeight);
-
-
-    T jX = mnd::convert<T>(info.juliaX);
-    T jY = mnd::convert<T>(info.juliaY);
-
-    AvxDoubleDouble juliaX = { jX[0], jX[1] };
-    AvxDoubleDouble juliaY = { jY[0], jY[1] };
-
-#if defined(_OPENMP)
-    if constexpr(parallel)
-        omp_set_num_threads(omp_get_num_procs());
-#   pragma omp parallel for schedule(static, 1) if (parallel)
-#endif
-    for (long j = 0; j < info.bHeight; j++) {
-        T y = viewy + T(double(j)) * hpp;
-        __m256d y0s = { y.x[0], y.x[0], y.x[0], y.x[0] };
-        __m256d y1s = { y.x[1], y.x[1], y.x[1], y.x[1] };
-        AvxDoubleDouble ys{ y0s, y1s };
-        for (long i = 0; i < info.bWidth; i += 4) {
-            T x1 = viewx + T(double(i)) * wpp;
-            T x2 = x1 + wpp;
-            T x3 = x2 + wpp;
-            T x4 = x3 + wpp;
-
-            __m256d x0s = {
-                x1[0], x2[0], x3[0], x4[0],
-            };
-
-            __m256d x1s = {
-                x1[1], x2[1], x3[1], x4[1],
-            };
-
-            AvxDoubleDouble xs{ x0s, x1s };
-
-            AvxDoubleDouble cx = info.julia ? juliaX : xs;
-            AvxDoubleDouble cy = info.julia ? juliaY : ys;
-
-            int itRes[4] = { 0, 0, 0, 0 };
-
-            __m256d threshold = { 16.0, 16.0, 16.0, 16.0 };
-            __m256d counter = { 0, 0, 0, 0 };
-            __m256d adder = { 1, 1, 1, 1 };
-
-            AvxDoubleDouble a = xs;
-            AvxDoubleDouble b = ys;
-
-            __m256d resultsa;
-            __m256d resultsb;
-
-            __m256d cmp = _mm256_cmp_pd(threshold, threshold, _CMP_LE_OQ);
-            for (int k = 0; k < info.maxIter; k++) {
-                AvxDoubleDouble aa = a.sq();
-                AvxDoubleDouble bb = b.sq();
-                AvxDoubleDouble abab = a * b.mul_pow2(2.0);
-                a = aa - bb + cx;
-                b = abab + cy;
-                if (info.smooth) {
-                    resultsa = _mm256_blendv_pd(resultsa, a.x[0], cmp);
-                    resultsb = _mm256_blendv_pd(resultsb, b.x[0], cmp);
-                }
-                cmp = _mm256_cmp_pd(_mm256_add_pd(aa.x[0], bb.x[0]), threshold, _CMP_LE_OQ);
-                adder = _mm256_and_pd(adder, cmp);
-                counter = _mm256_add_pd(counter, adder);
-                if ((k & 0x7) && _mm256_testz_si256(_mm256_castpd_si256(cmp), _mm256_castpd_si256(cmp)) != 0) {
-                    break;
-                }
-            }
-
-            auto alignVec = [](double* data) -> double* {
-                void* aligned = data;
-                ::size_t length = 64;
-                std::align(32, 4 * sizeof(double), aligned, length);
-                return static_cast<double*>(aligned);
-            };
-
-            double resData[8];
-            double* ftRes = alignVec(resData);
-            double* resa = (double*) &resultsa;
-            double* resb = (double*) &resultsb;
-            _mm256_store_pd(ftRes, counter);
-
-            for (int k = 0; k < 4 && i + k < info.bWidth; k++) {
-                if (info.smooth)
-                    data[i + k + j * info.bWidth] = ftRes[k] < 0 ? info.maxIter :
-                        ftRes[k] >= info.maxIter ? info.maxIter :
-                        ((float)ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f);
-                else
-                    data[i + k + j * info.bWidth] = ftRes[k] >= 0 ? float(ftRes[k]) : info.maxIter;
-            }
-        }
-    }
-}
-
 struct AvxQuadDouble
 {
     __m256d x[4];
@@ -784,66 +684,279 @@ struct AvxQuadDouble
 };
 
 
-template<bool parallel>
-void CpuGenerator<mnd::QuadDouble, mnd::X86_AVX_FMA, parallel>::generate(const mnd::MandelInfo& info, float* data)
+struct AvxHexDouble
 {
-    const MandelViewport& view = info.view;
+    __m256d x[6];
 
-    using T = mnd::Real;
+    inline AvxHexDouble(__m256d a, __m256d b, __m256d c, __m256d d, __m256d e, __m256d f) :
+        x{ a, b, c, d, e, f }
+    {}
 
-    T viewx = mnd::convert<T>(view.x);
-    T viewy = mnd::convert<T>(view.y);
-    T wpp = mnd::convert<T>(view.width / info.bWidth);
-    T hpp = mnd::convert<T>(view.height / info.bHeight);
+    inline AvxHexDouble(double a, double b, double c, double d, double e, double f) :
+        x{ _mm256_set1_pd(a), _mm256_set1_pd(b), _mm256_set1_pd(c),
+           _mm256_set1_pd(d), _mm256_set1_pd(e), _mm256_set1_pd(f) }
+    {}
 
 
-    T jX = mnd::convert<T>(info.juliaX);
-    T jY = mnd::convert<T>(info.juliaY);
+    // Addition of two six-limb ("hex-double") expansions.
+    // The five most significant limb pairs are added with the error-free
+    // twoSum transformation (presumably the standard QD-style helpers
+    // defined earlier in this file); the rounding errors are cascaded
+    // down through sum networks, everything left over is folded into the
+    // lowest limb with plain adds, and quickTwoSum renormalizes the
+    // result into a canonical non-overlapping expansion.
+    inline AvxHexDouble operator + (const AvxHexDouble& sm) const
+    {
+        auto[a0, a1] = twoSum(x[0], sm.x[0]);
+        auto[b0, b1] = twoSum(x[1], sm.x[1]);
+        auto[c0, c1] = twoSum(x[2], sm.x[2]);
+        auto[d0, d1] = twoSum(x[3], sm.x[3]);
+        auto[e0, e1] = twoSum(x[4], sm.x[4]);
+
+        // cascade the error terms down the limbs
+        auto t0 = a0;
+        auto [t1, p1] = twoSum(a1, b0);
+        auto [t2, p2, p3] = threeSum(b1, c0, p1);
+        auto [t3, p4, p5, p6] = fourSum(c1, d0, p2, p3);
+        auto [t4, p7] = fiveTwoSum(d1, e0, p4, p5, p6);
+        // lowest limb absorbs all remaining error terms (below target precision)
+        auto t5 = _mm256_add_pd(_mm256_add_pd(x[5], sm.x[5]), _mm256_add_pd(e1, p7));
+
+        // renormalization pass
+        auto[re0, er1] = quickTwoSum(t0, t1);
+        auto[re1, e2] = quickTwoSum(er1, t2);
+        auto[re2, e3] = quickTwoSum(e2, t3);
+        auto[re3, e4] = quickTwoSum(e3, t4);
+        auto[re4, re5] = quickTwoSum(e4, t5);
+
+        return { re0, re1, re2, re3, re4, re5 };
+    }
 
+    // Subtraction of two six-limb ("hex-double") expansions.
+    // Mirrors operator+ but uses the error-free two-term difference
+    // (twoDiff) for the five most significant limb pairs; errors are
+    // cascaded through the same sum networks and the result is
+    // renormalized with quickTwoSum.
+    inline AvxHexDouble operator - (const AvxHexDouble& sm) const
+    {
+        auto[a0, a1] = twoDiff(x[0], sm.x[0]);
+        auto[b0, b1] = twoDiff(x[1], sm.x[1]);
+        auto[c0, c1] = twoDiff(x[2], sm.x[2]);
+        auto[d0, d1] = twoDiff(x[3], sm.x[3]);
+        auto[e0, e1] = twoDiff(x[4], sm.x[4]);
+
+        auto t0 = a0;
+        auto [t1, p1] = twoSum(a1, b0);
+        auto [t2, p2, p3] = threeSum(b1, c0, p1);
+        auto [t3, p4, p5, p6] = fourSum(c1, d0, p2, p3);
+        auto [t4, p7] = fiveTwoSum(d1, e0, p4, p5, p6);
+        // BUG FIX: the lowest limbs must be subtracted, not added.
+        // The previous code copied _mm256_add_pd(x[5], sm.x[5]) from
+        // operator+, which made a - b wrong by about 2*sm.x[5] in the
+        // least significant limb.
+        auto t5 = _mm256_add_pd(_mm256_sub_pd(x[5], sm.x[5]), _mm256_add_pd(e1, p7));
+
+        auto[re0, er1] = quickTwoSum(t0, t1);
+        auto[re1, e2] = quickTwoSum(er1, t2);
+        auto[re2, e3] = quickTwoSum(e2, t3);
+        auto[re3, e4] = quickTwoSum(e3, t4);
+        auto[re4, re5] = quickTwoSum(e4, t5);
+
+        return { re0, re1, re2, re3, re4, re5 };
+    }
 
-    auto toQd = [] (const mnd::Real& x) -> std::tuple<double, double, double, double> {
-        double a = double(x);
-        mnd::Real rem = x - a;
-        double b = double(rem);
-        rem = rem - b;
-        double c = double(rem);
-        rem = rem - c;
-        double d = double(rem);
-        return { a, b, c, d };
-    };
+    // Multiplication of two six-limb ("hex-double") expansions.
+    // Partial products x[i] * sm.x[j] are computed with the error-free
+    // twoProd transformation; the naming p{o}_{n} groups them by
+    // magnitude order o (i + j + 1), with n an index within the group.
+    // Products of combined order >= 6 only contribute below the target
+    // precision and are accumulated with ordinary fused multiply-adds.
+    inline AvxHexDouble operator * (const AvxHexDouble& sm) const
+    {
+        auto[p1_0, p2_0] = twoProd(x[0], sm.x[0]);
+
+        auto[p2_1, p3_0] = twoProd(x[0], sm.x[1]);
+        auto[p2_2, p3_1] = twoProd(x[1], sm.x[0]);
+
+        auto[p3_2, p4_0] = twoProd(x[2], sm.x[0]);
+        auto[p3_3, p4_1] = twoProd(x[1], sm.x[1]);
+        auto[p3_4, p4_2] = twoProd(x[0], sm.x[2]);
+
+        auto[p4_3, p5_0] = twoProd(x[3], sm.x[0]);
+        auto[p4_4, p5_1] = twoProd(x[2], sm.x[1]);
+        auto[p4_5, p5_2] = twoProd(x[1], sm.x[2]);
+        auto[p4_6, p5_3] = twoProd(x[0], sm.x[3]);
+
+        auto[p5_4, p6_0] = twoProd(x[4], sm.x[0]);
+        auto[p5_5, p6_1] = twoProd(x[3], sm.x[1]);
+        auto[p5_6, p6_2] = twoProd(x[2], sm.x[2]);
+        auto[p5_7, p6_3] = twoProd(x[1], sm.x[3]);
+        auto[p5_8, p6_4] = twoProd(x[0], sm.x[4]);
+
+        // merge partial products of equal order with sum networks,
+        // carrying the residual errors (tl*) into the next lower order
+        auto t1 = p1_0;
+        auto[t2, tl3, tl4] = threeSum(p2_0, p2_1, p2_2);
+        auto[t3, tl4_2, tl5] = sixThreeSum(p3_0, p3_1, p3_2, p3_3, p3_4, tl3);
+        auto[t4, tl5_2, tl6] = nineThreeSum(p4_0, p4_1, p4_2, p4_3, p4_4, p4_5, p4_6, tl4, tl4_2);
+        auto[x1, x2, x3] = nineThreeSum(p5_0, p5_1, p5_2, p5_3, p5_4, p5_5, p5_6, p5_7, p5_8);
+        auto[t5, tl6_1, tl7] = sixThreeSum(x1, x2, x3, tl5, tl5_2, _mm256_set1_pd(0.0));
+
+        // lowest limb: all remaining residuals and low words, plus the
+        // order-6 products folded in via FMA (no error tracking needed here)
+        auto t6 =
+            _mm256_add_pd(
+                _mm256_add_pd(
+                    _mm256_add_pd(
+                        _mm256_add_pd(tl6, tl6_1),
+                        _mm256_add_pd(tl7, p6_0)
+                    ),
+                    _mm256_add_pd(
+                        _mm256_add_pd(p6_1, p6_2),
+                        _mm256_add_pd(p6_3, p6_4)
+                    )
+                ),
+                _mm256_add_pd(
+                    _mm256_fmadd_pd(x[5], sm.x[0], _mm256_fmadd_pd(x[4], sm.x[1], _mm256_mul_pd(x[3], sm.x[2]))),
+                    _mm256_fmadd_pd(x[2], sm.x[3], _mm256_fmadd_pd(x[1], sm.x[4], _mm256_mul_pd(x[0], sm.x[5])))
+                )
+            );
+
+        // renormalization pass
+        auto[re0, e1] = quickTwoSum(t1, t2);
+        auto[re1, e2] = quickTwoSum(e1, t3);
+        auto[re2, e3] = quickTwoSum(e2, t4);
+        auto[re3, e4] = quickTwoSum(e3, t5);
+        auto[re4, re5] = quickTwoSum(e4, t6);
+
+        return { re0, re1, re2, re3, re4, re5 };
+    }
 
-    auto toAvxQuadDouble = [&toQd] (const mnd::Real& x) -> AvxQuadDouble {
-        auto [a, b, c, d] = toQd(x);
-        return AvxQuadDouble{ a, b, c, d };
-    };
+    // Scales every limb by v. Intended for power-of-two factors (as the
+    // name suggests), where each limb multiplication is exact and no
+    // renormalization is required.
+    inline AvxHexDouble mul_pow2(double v) const
+    {
+        const __m256d factor = _mm256_set1_pd(v);
+        return AvxHexDouble{
+            _mm256_mul_pd(x[0], factor), _mm256_mul_pd(x[1], factor),
+            _mm256_mul_pd(x[2], factor), _mm256_mul_pd(x[3], factor),
+            _mm256_mul_pd(x[4], factor), _mm256_mul_pd(x[5], factor)
+        };
+    }
 
-    auto toAvxQuadDouble4 = [&toQd] (const mnd::Real& a, const mnd::Real& b,
-                            const mnd::Real& c, const mnd::Real& d) -> AvxQuadDouble {
-        auto [x0, y0, z0, u0] = toQd(a);
-        auto [x1, y1, z1, u1] = toQd(b);
-        auto [x2, y2, z2, u2] = toQd(c);
-        auto [x3, y3, z3, u3] = toQd(d);
+    // Squares this value by delegating to the general product routine.
+    // (A dedicated squaring kernel could exploit the symmetric partial
+    // products, but this keeps the code simple.)
+    inline AvxHexDouble sq(void) const
+    {
+        return (*this) * (*this);
+    }
+};
+
+} // namespace avxfma_private
+
+// Computes Mandelbrot/Julia iteration counts for a width x height pixel
+// grid in double-double precision using AVX+FMA, four pixels per inner
+// step. The view origin (vx*, vy*), extent (vw*, vh*) and julia seed
+// (jX*, jY*) are each passed as (hi, lo) double-double pairs. Iteration
+// counts are written to data as floats; when smooth is set, a
+// fractional count for smooth coloring is produced.
+void generateDoubleDoubleAvxFma(long width, long height, float* data, bool parallel,
+    double vx1, double vx2, double vy1, double vy2, double vw1, double vw2, double vh1, double vh2, int maxIter, bool smooth,
+    bool julia, double jX1, double jX2, double jY1, double jY2)
+{
+    using namespace avxfma_private;
+    using T = mnd::LightDoubleDouble;
+
+    T viewx{ vx1, vx2 };
+    T viewy{ vy1, vy2 };
+    T wpp = T{ vw1, vw2 } * T(1.0 / width);   // view width per pixel
+    T hpp = T{ vh1, vh2 } * T(1.0 / height);  // view height per pixel
+
+    T jX{ jX1, jX2 };
+    T jY{ jY1, jY2 };
+    AvxDoubleDouble juliaX = { jX[0], jX[1] };
+    AvxDoubleDouble juliaY = { jY[0], jY[1] };
+
+#if defined(_OPENMP)
+    if (parallel)
+        omp_set_num_threads(omp_get_num_procs());
+#   pragma omp parallel for schedule(static, 1) if (parallel)
+#endif
+    for (long j = 0; j < height; j++) {
+        T y = viewy + T(double(j)) * hpp;
+        __m256d y0s = { y.x[0], y.x[0], y.x[0], y.x[0] };
+        __m256d y1s = { y.x[1], y.x[1], y.x[1], y.x[1] };
+        AvxDoubleDouble ys{ y0s, y1s };
+        for (long i = 0; i < width; i += 4) {
+            // x coordinates of the four pixels handled in this vector
+            T x1 = viewx + T(double(i)) * wpp;
+            T x2 = x1 + wpp;
+            T x3 = x2 + wpp;
+            T x4 = x3 + wpp;
+
+            __m256d x0s = {
+                x1[0], x2[0], x3[0], x4[0],
+            };
+
+            __m256d x1s = {
+                x1[1], x2[1], x3[1], x4[1],
+            };
+
+            AvxDoubleDouble xs{ x0s, x1s };
+
+            AvxDoubleDouble cx = julia ? juliaX : xs;
+            AvxDoubleDouble cy = julia ? juliaY : ys;
+
+            __m256d threshold = { 16.0, 16.0, 16.0, 16.0 };
+            __m256d counter = { 0, 0, 0, 0 };
+            __m256d adder = { 1, 1, 1, 1 };
+
+            AvxDoubleDouble a = xs;
+            AvxDoubleDouble b = ys;
+
+            // last z values before escape, used for smooth coloring.
+            // Zero-initialized so the unconditional stores below never
+            // read indeterminate values (previously UB when !smooth or
+            // maxIter == 0); also removed the unused itRes[4] array.
+            __m256d resultsa = _mm256_set1_pd(0.0);
+            __m256d resultsb = _mm256_set1_pd(0.0);
+
+            // starts all-true: every lane is still iterating
+            __m256d cmp = _mm256_cmp_pd(threshold, threshold, _CMP_LE_OQ);
+            for (int k = 0; k < maxIter; k++) {
+                AvxDoubleDouble aa = a.sq();
+                AvxDoubleDouble bb = b.sq();
+                AvxDoubleDouble abab = a * b.mul_pow2(2.0);
+                a = aa - bb + cx;
+                b = abab + cy;
+                if (smooth) {
+                    // update z only for lanes that have not escaped yet
+                    resultsa = _mm256_blendv_pd(resultsa, a.x[0], cmp);
+                    resultsb = _mm256_blendv_pd(resultsb, b.x[0], cmp);
+                }
+                cmp = _mm256_cmp_pd(_mm256_add_pd(aa.x[0], bb.x[0]), threshold, _CMP_LE_OQ);
+                adder = _mm256_and_pd(adder, cmp);
+                counter = _mm256_add_pd(counter, adder);
+                // test the all-lanes-escaped exit only every few iterations
+                if ((k & 0x7) && _mm256_testz_si256(_mm256_castpd_si256(cmp), _mm256_castpd_si256(cmp)) != 0) {
+                    break;
+                }
+            }
+
+            double resData[12];
+            double* ftRes = resData;
+            double* resa = ftRes + 4;
+            double* resb = ftRes + 8;
+            _mm256_storeu_pd(ftRes, counter);
+            _mm256_storeu_pd(resa, resultsa);
+            _mm256_storeu_pd(resb, resultsb);
+
+            for (int k = 0; k < 4 && i + k < width; k++) {
+                if (smooth)
+                    data[i + k + j * width] = ftRes[k] < 0 ? maxIter :
+                        ftRes[k] >= maxIter ? maxIter :
+                        ((float)ftRes[k]) + 1 - floatLog2(floatLog(resa[k] * resa[k] + resb[k] * resb[k]) / 2);
+                else
+                    data[i + k + j * width] = ftRes[k] >= 0 ? float(ftRes[k]) : maxIter;
+            }
+        }
+    }
+}
+
+
+void generateQuadDoubleAvxFma(long width, long height, float* data, bool parallel,
+    const double* vx, const double* vy,
+    const double* vw, const double* vh,
+    int maxIter, bool smooth, bool julia,
+    const double* jXp, const double* jYp)
+{
+    using namespace avxfma_private;
+    using T = mnd::QuadDouble;
+
+    T viewx{ vx[0], vx[1], vx[2], vx[3] };
+    T viewy{ vy[0], vy[1], vy[2], vy[3] };
+    T wpp = T{ vw[0], vw[1], vw[2], vw[3] } * T(1.0 / width);
+    T hpp = T{ vh[0], vh[1], vh[2], vh[3] } * T(1.0 / height);
+
+
+    T jX{ jXp[0], jXp[1], jXp[2], jXp[3] };
+    T jY{ jYp[0], jYp[1], jYp[2], jYp[3] };
+
+
+    auto toAvxQuadDouble4 = [] (const T& a, const T& b,
+            const T& c, const T& d) -> AvxQuadDouble {
+        __m256d xs = { a[0], b[0], c[0], d[0] };
+        __m256d ys = { a[1], b[1], c[1], d[1] };
+        __m256d zs = { a[2], b[2], c[2], d[2] };
+        __m256d us = { a[3], b[3], c[3], d[3] };
 
         return AvxQuadDouble{ xs, ys, zs, us };
     };
 
-    AvxQuadDouble juliaX = toAvxQuadDouble(jX);
-    AvxQuadDouble juliaY = toAvxQuadDouble(jY);
+    AvxQuadDouble juliaX{ jX[0], jX[1], jX[2], jX[3] };
+    AvxQuadDouble juliaY{ jY[0], jY[1], jY[2], jY[3] };
 
 #if defined(_OPENMP)
-    if constexpr(parallel)
+    if (parallel)
         omp_set_num_threads(omp_get_num_procs());
 #   pragma omp parallel for schedule(static, 1) if (parallel)
 #endif
-    for (long j = 0; j < info.bHeight; j++) {
+    for (long j = 0; j < height; j++) {
         T y = viewy + T(double(j)) * hpp;
-        AvxQuadDouble ys = toAvxQuadDouble(y);
-        for (long i = 0; i < info.bWidth; i += 4) {
+        AvxQuadDouble ys{ y[0], y[1], y[2], y[3] };
+        for (long i = 0; i < width; i += 4) {
             T x1 = viewx + T(double(i)) * wpp;
             T x2 = x1 + wpp;
             T x3 = x2 + wpp;
@@ -851,10 +964,8 @@ void CpuGenerator<mnd::QuadDouble, mnd::X86_AVX_FMA, parallel>::generate(const m
 
             AvxQuadDouble xs = toAvxQuadDouble4(x1, x2, x3, x4);
 
-            AvxQuadDouble cx = info.julia ? juliaX : xs;
-            AvxQuadDouble cy = info.julia ? juliaY : ys;
-
-            int itRes[4] = { 0, 0, 0, 0 };
+            AvxQuadDouble cx = julia ? juliaX : xs;
+            AvxQuadDouble cy = julia ? juliaY : ys;
 
             __m256d threshold = { 16.0, 16.0, 16.0, 16.0 };
             __m256d counter = { 0, 0, 0, 0 };
@@ -867,13 +978,13 @@ void CpuGenerator<mnd::QuadDouble, mnd::X86_AVX_FMA, parallel>::generate(const m
             __m256d resultsb;
 
             __m256d cmp = _mm256_cmp_pd(threshold, threshold, _CMP_LE_OQ);
-            for (int k = 0; k < info.maxIter; k++) {
+            for (int k = 0; k < maxIter; k++) {
                 AvxQuadDouble aa = a.sq();
                 AvxQuadDouble bb = b.sq();
                 AvxQuadDouble abab = a * b.mul_pow2(2.0);
                 a = aa - bb + cx;
                 b = abab + cy;
-                if (info.smooth) {
+                if (smooth) {
                     resultsa = _mm256_blendv_pd(resultsa, a.x[0], cmp);
                     resultsb = _mm256_blendv_pd(resultsb, b.x[0], cmp);
                 }
@@ -885,27 +996,121 @@ void CpuGenerator<mnd::QuadDouble, mnd::X86_AVX_FMA, parallel>::generate(const m
                 }
             }
 
-            auto alignVec = [](double* data) -> double* {
-                void* aligned = data;
-                ::size_t length = 64;
-                std::align(32, 4 * sizeof(double), aligned, length);
-                return static_cast<double*>(aligned);
-            };
+            double resData[12];
+            double* ftRes = resData;
+            double* resa = resData + 4;
+            double* resb = resData + 8;
+            _mm256_storeu_pd(ftRes, counter);
+            _mm256_storeu_pd(resa, resultsa);
+            _mm256_storeu_pd(resb, resultsb);
+
+            for (int k = 0; k < 4 && i + k < width; k++) {
+                if (smooth)
+                    data[i + k + j * width] = ftRes[k] < 0 ? maxIter :
+                        ftRes[k] >= maxIter ? maxIter :
+                        ((float)ftRes[k]) + 1 - floatLog2(floatLog(resa[k] * resa[k] + resb[k] * resb[k]) / 2);
+                else
+                    data[i + k + j * width] = ftRes[k] >= 0 ? float(ftRes[k]) : maxIter;
+            }
+        }
+    }
+}
+
+
+// Computes Mandelbrot/Julia iteration counts for a width x height pixel
+// grid in hex-double (six-limb) precision using AVX+FMA, four pixels
+// per inner step. vx/vy are the view origin and vw/vh the view extent,
+// each passed as the six limbs of an mnd::HexDouble; jX/jY are the
+// julia seed limbs (used when julia is true). Iteration counts are
+// written to data as floats; when smooth is set, a fractional count
+// for smooth coloring is produced.
+void generateHexDoubleAvxFma(long width, long height, float* data, bool parallel,
+    const double* vx, const double* vy,
+    const double* vw, const double* vh,
+    int maxIter, bool smooth, bool julia,
+    const double* jX, const double* jY)
+{
+    using namespace avxfma_private;
+    using T = mnd::HexDouble;
+
+    T viewx{ vx[0], vx[1], vx[2], vx[3], vx[4], vx[5] };
+    T viewy{ vy[0], vy[1], vy[2], vy[3] , vy[4], vy[5] };
+    // per-pixel step sizes
+    T wpp = T{ vw[0], vw[1], vw[2], vw[3], vw[4], vw[5] } * T(1.0 / width);
+    T hpp = T{ vh[0], vh[1], vh[2], vh[3], vh[4], vh[5] } * T(1.0 / height);
+
+    // packs four hex-double values into one AvxHexDouble (one SIMD lane each)
+    auto toAvxHexDouble4 = [] (const T& a, const T& b,
+            const T& c, const T& d) -> AvxHexDouble {
+        __m256d xs = { a[0], b[0], c[0], d[0] };
+        __m256d ys = { a[1], b[1], c[1], d[1] };
+        __m256d zs = { a[2], b[2], c[2], d[2] };
+        __m256d us = { a[3], b[3], c[3], d[3] };
+        __m256d vs = { a[4], b[4], c[4], d[4] };
+        __m256d ws = { a[5], b[5], c[5], d[5] };
+
+        return AvxHexDouble{ xs, ys, zs, us, vs, ws };
+    };
 
-            double resData[8];
-            double* ftRes = alignVec(resData);
-            double* resa = (double*) &resultsa;
-            double* resb = (double*) &resultsb;
-            _mm256_store_pd(ftRes, counter);
-
-            for (int k = 0; k < 4 && i + k < info.bWidth; k++) {
-                if (info.smooth)
-                    data[i + k + j * info.bWidth] = ftRes[k] < 0 ? info.maxIter :
-                        ftRes[k] >= info.maxIter ? info.maxIter :
-                        ((float)ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f);
+    AvxHexDouble juliaX{ jX[0], jX[1], jX[2], jX[3], jX[4], jX[5] };
+    AvxHexDouble juliaY{ jY[0], jY[1], jY[2], jY[3], jY[4], jY[5] };
+
+#if defined(_OPENMP)
+    if (parallel)
+        omp_set_num_threads(omp_get_num_procs());
+#   pragma omp parallel for schedule(static, 1) if (parallel)
+#endif
+    for (long j = 0; j < height; j++) {
+        T y = viewy + T(double(j)) * hpp;
+        AvxHexDouble ys{ y[0], y[1], y[2], y[3], y[4], y[5] };
+        for (long i = 0; i < width; i += 4) {
+            // x coordinates of the four pixels processed in this step
+            T x1 = viewx + T(double(i)) * wpp;
+            T x2 = x1 + wpp;
+            T x3 = x2 + wpp;
+            T x4 = x3 + wpp;
+
+            AvxHexDouble xs = toAvxHexDouble4(x1, x2, x3, x4);
+
+            AvxHexDouble cx = julia ? juliaX : xs;
+            AvxHexDouble cy = julia ? juliaY : ys;
+
+            __m256d threshold = { 16.0, 16.0, 16.0, 16.0 };
+            __m256d counter = { 0, 0, 0, 0 };
+            __m256d adder = { 1, 1, 1, 1 };
+
+            AvxHexDouble a = xs;
+            AvxHexDouble b = ys;
+
+            // last z values before escape (only read when smooth is set)
+            __m256d resultsa;
+            __m256d resultsb;
+
+            // starts all-true: every lane is still iterating
+            __m256d cmp = _mm256_cmp_pd(threshold, threshold, _CMP_LE_OQ);
+            for (int k = 0; k < maxIter; k++) {
+                AvxHexDouble aa = a.sq();
+                AvxHexDouble bb = b.sq();
+                AvxHexDouble abab = a * b.mul_pow2(2.0);
+                a = aa - bb + cx;
+                b = abab + cy;
+                if (smooth) {
+                    // update z only for lanes that have not escaped yet
+                    resultsa = _mm256_blendv_pd(resultsa, a.x[0], cmp);
+                    resultsb = _mm256_blendv_pd(resultsb, b.x[0], cmp);
+                }
+                cmp = _mm256_cmp_pd(_mm256_add_pd(aa.x[0], bb.x[0]), threshold, _CMP_LE_OQ);
+                adder = _mm256_and_pd(adder, cmp);
+                counter = _mm256_add_pd(counter, adder);
+                // stop once all four lanes have escaped (checked every
+                // iteration here, unlike the cheaper-precision variants)
+                if (_mm256_testz_si256(_mm256_castpd_si256(cmp), _mm256_castpd_si256(cmp)) != 0) {
+                    break;
+                }
+            }
+
+            double resData[12];
+            double* ftRes = resData;
+            double* resa = resData + 4;
+            double* resb = resData + 8;
+            _mm256_storeu_pd(ftRes, counter);
+            _mm256_storeu_pd(resa, resultsa);
+            _mm256_storeu_pd(resb, resultsb);
+
+            for (int k = 0; k < 4 && i + k < width; k++) {
+                if (smooth)
+                    data[i + k + j * width] = ftRes[k] < 0 ? maxIter :
+                        ftRes[k] >= maxIter ? maxIter :
+                        ((float)ftRes[k]) + 1 - floatLog2(floatLog(resa[k] * resa[k] + resb[k] * resb[k]) / 2);
                 else
-                    data[i + k + j * info.bWidth] = ftRes[k] >= 0 ? float(ftRes[k]) : info.maxIter;
+                    data[i + k + j * width] = ftRes[k] >= 0 ? float(ftRes[k]) : maxIter;
             }
         }
     }
 }
+

+ 2 - 0
libmandel/src/Mandel.cpp

@@ -93,10 +93,12 @@ MandelContext::MandelContext(void)
         auto davxfma = std::make_unique<CpuGenerator<double, mnd::X86_AVX_FMA, true>>();
         auto ddavxfma = std::make_unique<CpuGenerator<DoubleDouble, mnd::X86_AVX_FMA, true>>();
         auto qdavxfma = std::make_unique<CpuGenerator<QuadDouble, mnd::X86_AVX_FMA, true>>();
+        auto hdavxfma = std::make_unique<CpuGenerator<HexDouble, mnd::X86_AVX_FMA, true>>();
         cpuGenerators.insert({ std::pair{ Precision::FLOAT, CpuExtension::X86_AVX_FMA }, std::move(favxfma) });
         cpuGenerators.insert({ std::pair{ Precision::DOUBLE, CpuExtension::X86_AVX_FMA }, std::move(davxfma) });
         cpuGenerators.insert({ std::pair{ Precision::DOUBLE_DOUBLE, CpuExtension::X86_AVX_FMA }, std::move(ddavxfma) });
         cpuGenerators.insert({ std::pair{ Precision::QUAD_DOUBLE, CpuExtension::X86_AVX_FMA }, std::move(qdavxfma) });
+        cpuGenerators.insert({ std::pair{ Precision::HEX_DOUBLE, CpuExtension::X86_AVX_FMA }, std::move(hdavxfma) });
     }
     if (cpuInfo.hasSse2()) {
         auto fl = std::make_unique<CpuGenerator<float, mnd::X86_SSE2, true>>();