Forráskód Böngészése

add avx triple doubles #4

Nicolas Winkler 4 éve
szülő
commit
4a72f68fed

+ 1 - 0
libmandel/CMakeLists.txt

@@ -44,6 +44,7 @@ SET(MandelSources
     src/IterationCompiler.cpp
     src/IterationIR.cpp
     src/NaiveIRGenerator.cpp
+    src/FloatLog.cpp
 )
 FILE(GLOB MandelHeaders include/*.h)
 

+ 14 - 0
libmandel/include/CpuGenerators.h

@@ -68,6 +68,7 @@ public:
     virtual void generate(const MandelInfo& info, float* data);
 };
 
+
 template<bool parallel>
 class mnd::CpuGenerator<double, mnd::X86_AVX, parallel> : public MandelGenerator
 {
@@ -79,6 +80,7 @@ public:
     virtual void generate(const MandelInfo& info, float* data);
 };
 
+
 template<bool parallel>
 class mnd::CpuGenerator<mnd::DoubleDouble, mnd::X86_AVX, parallel> : public MandelGenerator
 {
@@ -92,6 +94,18 @@ public:
 
 
 template<bool parallel>
+class mnd::CpuGenerator<mnd::TripleDouble, mnd::X86_AVX, parallel> : public MandelGenerator
+{
+public:
+    inline CpuGenerator(void) :
+        MandelGenerator{ mnd::Precision::TRIPLE_DOUBLE, mnd::X86_AVX }
+    {
+    }
+    virtual void generate(const MandelInfo& info, float* data);
+};
+
+
+template<bool parallel>
 class mnd::CpuGenerator<float, mnd::X86_AVX_FMA, parallel> : public MandelGenerator
 {
 public:

+ 7 - 0
libmandel/include/FloatLog.h

@@ -0,0 +1,7 @@
+#ifndef MANDEL_FLOATLOG_H
+#define MANDEL_FLOATLOG_H
+
+float floatLog(float x);
+float floatLog2(float x);
+
+#endif // MANDEL_FLOATLOG_H

+ 1 - 0
libmandel/include/Generators.h

@@ -104,6 +104,7 @@ enum class mnd::GeneratorType : int
     DOUBLE_DOUBLE_AVX_FMA,
     DOUBLE_DOUBLE_NEON,
     TRIPLE_DOUBLE,
+    TRIPLE_DOUBLE_AVX,
     QUAD_DOUBLE,
     QUAD_DOUBLE_AVX_FMA,
     FLOAT128,

+ 5 - 3
libmandel/include/LightDoubleDouble.h

@@ -1,8 +1,6 @@
 #ifndef MANDEL_LIGHTDOUBLEDOUBLE_H
 #define MANDEL_LIGHTDOUBLEDOUBLE_H
 
-#include <utility>
-
 namespace mnd
 {
     struct LightDoubleDouble;
@@ -11,7 +9,11 @@ namespace mnd
     namespace ldd
     {
         template<typename T>
-        using Pair = std::pair<T, T>;
+        struct Pair {
+            T first;
+            T second;
+        };
+
         using DoublePair = Pair<double>;
         using FloatPair = Pair<float>;
 

+ 108 - 172
libmandel/src/CpuGenerators.cpp

@@ -112,213 +112,149 @@ void CpuGenerator<T, mnd::NONE, parallel>::generate(const mnd::MandelInfo& info,
     }
 }
 
+namespace mnd
+{
+    template class CpuGenerator<float, mnd::X86_AVX, false>;
+    template class CpuGenerator<float, mnd::X86_AVX, true>;
+
+    template class CpuGenerator<double, mnd::X86_AVX, false>;
+    template class CpuGenerator<double, mnd::X86_AVX, true>;
+
+    template class CpuGenerator<DoubleDouble, mnd::X86_AVX, false>;
+    template class CpuGenerator<DoubleDouble, mnd::X86_AVX, true>;
+
+    template class CpuGenerator<TripleDouble, mnd::X86_AVX, false>;
+    template class CpuGenerator<TripleDouble, mnd::X86_AVX, true>;
+}
+
+extern void generateFloatAvx(long width, long height, float* data, bool parallel,
+    float vx, float vy, float vw, float vh, int maxIter, bool smooth,
+    bool julia, float jX, float jY);
+
+extern void generateDoubleAvx(long width, long height, float* data, bool parallel,
+    double vx, double vy, double vw, double vh, int maxIter, bool smooth,
+    bool julia, double jX, double jY);
+
+extern void generateDoubleDoubleAvx(long width, long height, float* data, bool parallel,
+    double vx1, double vx2, double vy1, double vy2, double vw1, double vw2, double vh1, double vh2, int maxIter, bool smooth,
+    bool julia, double jX1, double jX2, double jY1, double jY2);
+
+extern void generateTripleDoubleAvx(long width, long height, float* data, bool parallel,
+    double vx1, double vx2, double vx3, double vy1, double vy2, double vy3,
+    double vw1, double vw2,  double vw3, double vh1, double vh2, double vh3,
+    int maxIter, bool smooth, bool julia,
+    double jX1, double jX2, double jX3, double jY1, double jY2, double jY3);
+
 
-/*
 template<bool parallel>
-void CpuGenerator<double, mnd::NONE, parallel>::generate(const mnd::MandelInfo& info, float* data)
+void CpuGenerator<float, mnd::X86_AVX, parallel>::generate(const mnd::MandelInfo& info, float* data)
 {
+    using T = float;
     const MandelViewport& view = info.view;
 
-    T viewx = mnd::convert<T>(view.x);
-    T viewy = mnd::convert<T>(view.y);
-    T wpp = mnd::convert<T>(view.width / info.bWidth);
-    T hpp = mnd::convert<T>(view.height / info.bHeight);
+    const T vx = mnd::convert<T>(view.x);
+    const T vy = mnd::convert<T>(view.y);
+    const T vw = mnd::convert<T>(view.width);
+    const T vh = mnd::convert<T>(view.height);
 
-    if constexpr (parallel)
-        omp_set_num_threads(omp_get_num_procs());
-#pragma omp parallel for schedule(static, 1) if (parallel)
-    for (long j = 0; j < info.bHeight; j++) {
-        T y = viewy + T(double(j)) * hpp;
-        long i = 0;
-        for (i; i < info.bWidth; i++) {
-            T x = viewx + T(double(i)) * wpp;
+    T jX = mnd::convert<T>(info.juliaX);
+    T jY = mnd::convert<T>(info.juliaY);
 
-            T a = x;
-            T b = y;
+    generateFloatAvx(info.bWidth, info.bHeight, data, parallel, vx, vy, vw, vh, info.maxIter, info.smooth, info.julia, jX, jY);
+}
 
-            int k = 0;
-            for (k = 0; k < info.maxIter; k++) {
-                T aa = a * a;
-                T bb = b * b;
-                T ab = a * b;
-                a = aa - bb + x;
-                b = ab + ab + y;
-                if (aa + bb > T(16.0)) {
-                    break;
-                }
-            }
-            if (info.smooth) {
-                if (k >= info.maxIter)
-                    data[i + j * info.bWidth] = float(info.maxIter);
-                else
-                    data[i + j * info.bWidth] = ((float) k) + 1 - ::logf(::logf(mnd::convert<float>(a * a + b * b)) / 2) / ::logf(2.0f);
-            }
-            else
-                data[i + j * info.bWidth] = k;
-        }
-    }
-}*/
 
-/*
-#if defined(WITH_BOOST) || 1
 template<bool parallel>
-void CpuGenerator<Fixed128, mnd::NONE, parallel>::generate(const mnd::MandelInfo& info, float* data)
+void CpuGenerator<double, mnd::X86_AVX, parallel>::generate(const mnd::MandelInfo& info, float* data)
 {
-    using T = Fixed128;
+    using T = double;
     const MandelViewport& view = info.view;
 
-    const auto fixedFromFloat = [] (const mnd::Float128& f) {
-        boost::multiprecision::int128_t frac = boost::multiprecision::int128_t(f * 4294967296.0 * 4294967296.0 * 4294967296.0);
-        std::vector<uint32_t> bits;
-        export_bits(frac, std::back_inserter(bits), 32);
-        bits.clear();
-        while (bits.size() < 4) bits.push_back(0);
-        return Fixed128{ bits[0], bits[1], bits[2], bits[3] };
-    };
+    const T vx = mnd::convert<T>(view.x);
+    const T vy = mnd::convert<T>(view.y);
+    const T vw = mnd::convert<T>(view.width);
+    const T vh = mnd::convert<T>(view.height);
 
-    if constexpr (parallel)
-        omp_set_num_threads(2 * omp_get_num_procs());
-#pragma omp parallel for if (parallel)
-    for (long j = 0; j < info.bHeight; j++) {
-        T y = fixedFromFloat(view.y + mnd::Real(j) * view.height / info.bHeight);
-        long i = 0;
-        for (i; i < info.bWidth; i++) {
-            T x = fixedFromFloat(view.x + mnd::Real(i) * view.width / info.bWidth);
+    T jX = mnd::convert<T>(info.juliaX);
+    T jY = mnd::convert<T>(info.juliaY);
 
-            T a = x;
-            T b = y;
-
-            int k = 0;
-            for (k = 0; k < info.maxIter; k++) {
-                T aa = a * a;
-                T bb = b * b;
-                T ab = a * b;
-                a = aa - bb + x;
-                b = ab + ab + y;
-                if (aa + bb > T(16)) {
-                    break;
-                }
-            }
-            if constexpr (smooth) {
-                if (k >= info.maxIter)
-                    data[i + j * info.bWidth] = info.maxIter;
-                else
-                    data[i + j * info.bWidth] = ((float) k) + 1 - ::logf(::logf(float(a * a + b * b)) / 2) / ::logf(2.0f);
-            }
-            else
-                data[i + j * info.bWidth] = k;
-        }
-    }
+    generateDoubleAvx(info.bWidth, info.bHeight, data, parallel, vx, vy, vw, vh, info.maxIter, info.smooth, info.julia, jX, jY);
 }
-#endif // WITH_BOOST
-*/
 
-#ifdef WITH_MPFR
-template<unsigned int bits, bool parallel>
-void CpuGenerator<mnd::MpfrFloat<bits>, mnd::NONE, parallel>::generate(const mnd::MandelInfo& info, float* data)
+
+template<bool parallel>
+void CpuGenerator<mnd::DoubleDouble, mnd::X86_AVX, parallel>::generate(const mnd::MandelInfo& info, float* data)
 {
+    using T = mnd::DoubleDouble;
     const MandelViewport& view = info.view;
-    using T = mnd::MpfrFloat<bits>;
 
-#if defined(_OPENMP)
-    if constexpr (parallel)
-        omp_set_num_threads(2 * omp_get_num_procs());
-#   pragma omp parallel for if (parallel)
-#endif
-    for (long j = 0; j < info.bHeight; j++) {
-        T y = T(view.y) + T(j) * T(view.height / info.bHeight);
-        long i = 0;
-        for (i; i < info.bWidth; i++) {
-            T x = T(view.x + T(i) * T(view.width / info.bWidth));
+    const T vx = mnd::convert<T>(view.x);
+    const T vy = mnd::convert<T>(view.y);
+    const T vw = mnd::convert<T>(view.width);
+    const T vh = mnd::convert<T>(view.height);
 
-            T a = x;
-            T b = y;
+    T jX = mnd::convert<T>(info.juliaX);
+    T jY = mnd::convert<T>(info.juliaY);
 
-            int k = 0;
-            for (k = 0; k < info.maxIter; k++) {
-                T aa = a * a;
-                T bb = b * b;
-                T ab = a * b;
-                a = aa - bb + x;
-                b = ab + ab + y;
-                if (aa + bb > T(16)) {
-                    break;
-                }
-            }
-            if (info.smooth) {
-                if (k >= info.maxIter)
-                    data[i + j * info.bWidth] = info.maxIter;
-                else
-                    data[i + j * info.bWidth] = ((float) k) + 1 - ::log(::log(a * a + b * b) / 2) / ::log(2.0f);
-            }
-            else
-                data[i + j * info.bWidth] = k;
-        }
-    }
+    generateDoubleDoubleAvx(info.bWidth, info.bHeight, data, parallel,
+        vx.x[0], vx.x[1], vy.x[0], vy.x[1], vw.x[0], vw.x[1], vh.x[0], vh.x[1],
+        info.maxIter, info.smooth, info.julia, jX.x[0], jX.x[1], jY.x[0], jY.x[1]);
 }
-#endif // WITH_MPFR
 
 
-/*
-void CpuGeneratorDouble::generate(const mnd::MandelInfo& info, float* data)
+template<bool parallel>
+void CpuGenerator<mnd::TripleDouble, mnd::X86_AVX, parallel>::generate(const mnd::MandelInfo& info, float* data)
 {
+    using T = mnd::TripleDouble;
     const MandelViewport& view = info.view;
-    omp_set_num_threads(2 * omp_get_num_procs());
-#pragma omp parallel for
-    for (long j = 0; j < info.bHeight; j++) {
-        double y = double(view.y) + double(j) * double(view.height / info.bHeight);
-        long i = 0;
-        for (i; i < info.bWidth; i++) {
-            double x = view.x + double(i) * view.width / info.bWidth;
 
-            double a = x;
-            double b = y;
+    const T vx = mnd::convert<T>(view.x);
+    const T vy = mnd::convert<T>(view.y);
+    const T vw = mnd::convert<T>(view.width);
+    const T vh = mnd::convert<T>(view.height);
 
-            int k = 0;
-            for (k = 0; k < info.maxIter; k++) {
-                double aa = a * a;
-                double bb = b * b;
-                double ab = a * b;
-                a = aa - bb + x;
-                b = ab + ab + y;
-                if (aa + bb > 16) {
-                    break;
-                }
-            }
-            data[i + j * info.bWidth] = k;
-        }
-    }
+    T jX = mnd::convert<T>(info.juliaX);
+    T jY = mnd::convert<T>(info.juliaY);
+
+    generateTripleDoubleAvx(info.bWidth, info.bHeight, data, parallel,
+        vx.x[0], vx.x[1], vx.x[2], vy.x[0], vy.x[1], vy.x[2],
+        vw.x[0], vw.x[1], vw.x[2], vh.x[0], vh.x[1], vh.x[2],
+        info.maxIter, info.smooth, info.julia,
+        jX.x[0], jX.x[1], jX.x[2], jY.x[0], jY.x[1], jY.x[2]);
 }
 
 
-void CpuGenerator128::generate(const mnd::MandelInfo& info, float* data)
+#ifdef WITH_AVX512
+
+namespace mnd
 {
+    template class CpuGenerator<float, mnd::X86_AVX_512, false>;
+    template class CpuGenerator<float, mnd::X86_AVX_512, true>;
+
+    //template class CpuGenerator<double, mnd::X86_AVX_512, false>;
+    //template class CpuGenerator<double, mnd::X86_AVX_512, true>;
+}
+
+extern void generateFloatAvx512(long width, long height, float* data, bool parallel,
+    float vx, float vy, float vw, float vh, int maxIter, bool smooth,
+    bool julia, float jX, float jY);
+
+template<bool parallel>
+void CpuGenerator<float, mnd::X86_AVX_512, parallel>::generate(const mnd::MandelInfo& info, float* data)
+{
+    using T = float;
     const MandelViewport& view = info.view;
-    omp_set_num_threads(2 * omp_get_num_procs());
-#pragma omp parallel for
-    for (long j = 0; j < info.bHeight; j++) {
-        Fixed128 y = Fixed128(view.y) + Fixed128(j) * Fixed128(view.height / info.bHeight);
-        long i = 0;
-        for (i; i < info.bWidth; i++) {
-            Fixed128 x = view.x + Fixed128(i) * Fixed128(view.width / info.bWidth);
 
-            Fixed128 a = x;
-            Fixed128 b = y;
+    const T vx = mnd::convert<T>(view.x);
+    const T vy = mnd::convert<T>(view.y);
+    const T vw = mnd::convert<T>(view.width);
+    const T vh = mnd::convert<T>(view.height);
 
-            int k = 0;
-            for (k = 0; k < info.maxIter; k++) {
-                Fixed128 aa = a * a;
-                Fixed128 bb = b * b;
-                Fixed128 ab = a * b;
-                a = aa - bb + x;
-                b = ab + ab + y;
-                if (aa + bb > Fixed128(16)) {
-                    break;
-                }
-            }
+    T jX = mnd::convert<T>(info.juliaX);
+    T jY = mnd::convert<T>(info.juliaY);
 
-            data[i + j * info.bWidth] = k;
-        }
-    }
+    generateFloatAvx512(info.bWidth, info.bHeight, data, parallel, vx, vy, vw, vh, info.maxIter, info.smooth, info.julia, jX, jY);
 }
-*/
+
+#endif // WITH_AVX512
+

+ 264 - 130
libmandel/src/CpuGeneratorsAVX.cpp

@@ -1,51 +1,41 @@
-#include "CpuGenerators.h"
-#include "LightDoubleDouble.h"
+#include "FloatLog.h"
 
 #include <immintrin.h>
 #include <omp.h>
-#include <cmath>
-
-#include <utility>
-#include <memory>
-
-using mnd::CpuGenerator;
 
-namespace mnd
+///
+/// \brief unique namespace just to be a little more sure we do not
+///        accidentally compile a function used somewhere else and use
+///        avx instructions in there.
+///
+namespace avx_private
 {
-    template class CpuGenerator<float, mnd::X86_AVX, false>;
-    template class CpuGenerator<float, mnd::X86_AVX, true>;
-
-    template class CpuGenerator<double, mnd::X86_AVX, false>;
-    template class CpuGenerator<double, mnd::X86_AVX, true>;
-    
-    template class CpuGenerator<DoubleDouble, mnd::X86_AVX, false>;
-    template class CpuGenerator<DoubleDouble, mnd::X86_AVX, true>;
+#include "LightDoubleDouble.h"
+#include "TripleDouble.h"
 }
 
-template<bool parallel>
-void CpuGenerator<float, mnd::X86_AVX, parallel>::generate(const mnd::MandelInfo& info, float* data)
+
+void generateFloatAvx(long width, long height, float* data, bool parallel,
+    float vx, float vy, float vw, float vh, int maxIter, bool smooth,
+    bool julia, float jX, float jY)
 {
     using T = float;
-    const MandelViewport& view = info.view;
-    const float dppf = float(view.width / info.bWidth);
-    const float viewxf = float(view.x);
-    __m256 viewx = _mm256_set1_ps(viewxf);
+    const float dppf = float(vw / width);
+    __m256 viewx = _mm256_set1_ps(vx);
     __m256 dpp = _mm256_set1_ps(dppf);
 
-    T jX = mnd::convert<T>(info.juliaX);
-    T jY = mnd::convert<T>(info.juliaY);
     __m256 juliaX = { jX, jX, jX, jX, jX, jX, jX, jX };
     __m256 juliaY = { jY, jY, jY, jY, jY, jY, jY, jY };
 
 #if defined(_OPENMP)
-    if constexpr(parallel)
+    if (parallel)
         omp_set_num_threads(omp_get_num_procs());
 #   pragma omp parallel for schedule(static, 1) if (parallel)
 #endif
-    for (long j = 0; j < info.bHeight; j++) {
-        T y = T(view.y) + T(j) * T(view.height / info.bHeight);
+    for (long j = 0; j < height; j++) {
+        T y = vy + T(j) * vh / height;
         __m256 ys = _mm256_set1_ps(y);
-        for (long i = 0; i < info.bWidth; i += 16) {
+        for (long i = 0; i < width; i += 16) {
             __m256 pixc = { float(i), float(i + 1), float(i + 2), float(i + 3), float(i + 4), float(i + 5), float(i + 6), float(i + 7) };
             __m256 pixc2 = { float(i + 8), float(i + 9), float(i + 10), float(i + 11), float(i + 12), float(i + 13), float(i + 14), float(i + 15) };
 
@@ -65,19 +55,19 @@ void CpuGenerator<float, mnd::X86_AVX, parallel>::generate(const mnd::MandelInfo
             __m256 b = ys;
             __m256 b2 = ys;
 
-            __m256 cx = info.julia ? juliaX : xs;
-            __m256 cx2 = info.julia ? juliaX : xs2;
-            __m256 cy = info.julia ? juliaY : ys;
+            __m256 cx = julia ? juliaX : xs;
+            __m256 cx2 = julia ? juliaX : xs2;
+            __m256 cy = julia ? juliaY : ys;
 
             __m256 resultsa = a;
             __m256 resultsb = b;
             __m256 resultsa2 = a2;
             __m256 resultsb2 = b2;
 
-            if (info.smooth) {
+            if (smooth) {
                 __m256 cmp = _mm256_cmp_ps(a, a, _CMP_LE_OQ);
                 __m256 cmp2 = _mm256_cmp_ps(a, a, _CMP_LE_OQ);
-                for (int k = 0; k < info.maxIter; k++) {
+                for (int k = 0; k < maxIter; k++) {
                     __m256 aa = _mm256_mul_ps(a, a);
                     __m256 aa2 = _mm256_mul_ps(a2, a2);
                     __m256 bb = _mm256_mul_ps(b, b);
@@ -104,7 +94,7 @@ void CpuGenerator<float, mnd::X86_AVX, parallel>::generate(const mnd::MandelInfo
                 }
             }
             else {
-                for (int k = 0; k < info.maxIter; k++) {
+                for (int k = 0; k < maxIter; k++) {
                     __m256 aa = _mm256_mul_ps(a, a);
                     __m256 aa2 = _mm256_mul_ps(a2, a2);
                     __m256 bb = _mm256_mul_ps(b, b);
@@ -127,33 +117,25 @@ void CpuGenerator<float, mnd::X86_AVX, parallel>::generate(const mnd::MandelInfo
                 }
             }
 
-
-            auto alignVec = [](float* data) -> float* {
-                void* aligned = data;
-                ::size_t length = 64;
-                std::align(32, 8 * sizeof(float), aligned, length);
-                return static_cast<float*>(aligned);
-            };
-
             float resData[64];
-            float* ftRes = alignVec(resData);
+            float* ftRes = resData;
             float* resa = ftRes + 16;
             float* resb = resa + 16;
 
-            _mm256_store_ps(ftRes, counter);
-            _mm256_store_ps(ftRes + 8, counter2);
-            _mm256_store_ps(resa, resultsa);
-            _mm256_store_ps(resa + 8, resultsa2);
-            _mm256_store_ps(resb, resultsb);
-            _mm256_store_ps(resb + 8, resultsb2);
-            for (int k = 0; k < 16 && i + k < info.bWidth; k++) {
-                if (info.smooth) {
-                    data[i + k + j * info.bWidth] = ftRes[k] < 0 ? info.maxIter :
-                        ftRes[k] >= info.maxIter ? info.maxIter :
-                        ((float)ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f);
+            _mm256_storeu_ps(ftRes, counter);
+            _mm256_storeu_ps(ftRes + 8, counter2);
+            _mm256_storeu_ps(resa, resultsa);
+            _mm256_storeu_ps(resa + 8, resultsa2);
+            _mm256_storeu_ps(resb, resultsb);
+            _mm256_storeu_ps(resb + 8, resultsb2);
+            for (int k = 0; k < 16 && i + k < width; k++) {
+                if (smooth) {
+                    data[i + k + j * width] = ftRes[k] < 0 ? maxIter :
+                        ftRes[k] >= maxIter ? maxIter :
+                        ((float)ftRes[k]) + 1 - floatLog2(floatLog(resa[k] * resa[k] + resb[k] * resb[k]) * 0.5f);
                 }
                 else {
-                    data[i + k + j * info.bWidth] = ftRes[k] < 0 ? info.maxIter : ftRes[k];
+                    data[i + k + j * width] = ftRes[k] < 0 ? maxIter : ftRes[k];
                 }
             }
         }
@@ -161,31 +143,28 @@ void CpuGenerator<float, mnd::X86_AVX, parallel>::generate(const mnd::MandelInfo
 }
 
 
-template<bool parallel>
-void CpuGenerator<double, mnd::X86_AVX, parallel>::generate(const mnd::MandelInfo& info, float* data)
+void generateDoubleAvx(long width, long height, float* data, bool parallel,
+    double vx, double vy, double vw, double vh, int maxIter, bool smooth,
+    bool julia, double jX, double jY)
 {
     using T = double;
-    const MandelViewport& view = info.view;
 
-    const double dppf = double(view.width / info.bWidth);
-    const double viewxf = double(view.x);
-    __m256d viewx = { viewxf, viewxf, viewxf, viewxf };
+    const double dppf = double(vw / width);
+    __m256d viewx = { vx, vx, vx, vx };
     __m256d dpp = { dppf, dppf, dppf, dppf };
 
-    T jX = mnd::convert<T>(info.juliaX);
-    T jY = mnd::convert<T>(info.juliaY);
     __m256d juliaX = { jX, jX, jX, jX };
     __m256d juliaY = { jY, jY, jY, jY };
 
 #if defined(_OPENMP)
-    if constexpr(parallel)
+    if (parallel)
         omp_set_num_threads(omp_get_num_procs());
 #   pragma omp parallel for schedule(static, 1) if (parallel)
 #endif
-    for (long j = 0; j < info.bHeight; j++) {
-        T y = T(view.y + T(j) * view.height / info.bHeight);
+    for (long j = 0; j < height; j++) {
+        T y = vy + T(j) * vh / height;
         __m256d ys = { y, y, y, y };
-        for (long i = 0; i < info.bWidth; i += 8) {
+        for (long i = 0; i < width; i += 8) {
             __m256d pixc = { double(i), double(i + 1), double(i + 2), double(i + 3) };
             __m256d pixc2 = { double(i + 4), double(i + 5), double(i + 6), double(i + 7) };
             __m256d xs = _mm256_add_pd(_mm256_mul_pd(dpp, pixc), viewx);
@@ -209,14 +188,14 @@ void CpuGenerator<double, mnd::X86_AVX, parallel>::generate(const mnd::MandelInf
             __m256d a2 = xs2;
             __m256d b2 = ys;
 
-            __m256d cx = info.julia ? juliaX : xs;
-            __m256d cx2 = info.julia ? juliaX : xs2;
-            __m256d cy = info.julia ? juliaY : ys;
+            __m256d cx = julia ? juliaX : xs;
+            __m256d cx2 = julia ? juliaX : xs2;
+            __m256d cy = julia ? juliaY : ys;
 
-            if (info.smooth) {
+            if (smooth) {
                 __m256d cmp = _mm256_cmp_pd(a, a, _CMP_LE_OQ);
                 __m256d cmp2 = _mm256_cmp_pd(a, a, _CMP_LE_OQ);
-                for (int k = 0; k < info.maxIter; k++) {
+                for (int k = 0; k < maxIter; k++) {
                     __m256d aa = _mm256_mul_pd(a, a);
                     __m256d aa2 = _mm256_mul_pd(a2, a2);
                     __m256d bb = _mm256_mul_pd(b, b);
@@ -244,7 +223,7 @@ void CpuGenerator<double, mnd::X86_AVX, parallel>::generate(const mnd::MandelInf
                 }
             }
             else {
-                for (int k = 0; k < info.maxIter; k++) {
+                for (int k = 0; k < maxIter; k++) {
                     __m256d aa = _mm256_mul_pd(a, a);
                     __m256d aa2 = _mm256_mul_pd(a2, a2);
                     __m256d bb = _mm256_mul_pd(b, b);
@@ -266,41 +245,33 @@ void CpuGenerator<double, mnd::X86_AVX, parallel>::generate(const mnd::MandelInf
                         break;
                     }
                 }
-
             }
 
-            auto alignVec = [](double* data) -> double* {
-                void* aligned = data;
-                ::size_t length = 64;
-                std::align(32, 4 * sizeof(double), aligned, length);
-                return static_cast<double*>(aligned);
-            };
-
             double resData[8];
-            double* ftRes = alignVec(resData);
+            double* ftRes = resData;
             double* resa = (double*) &resultsa;
             double* resb = (double*) &resultsb;
-            _mm256_store_pd(ftRes, counter);
-            for (int k = 0; k < 4 && i + k < info.bWidth; k++) {
-                if (info.smooth)
-                    data[i + k + j * info.bWidth] = ftRes[k] < 0 ? float(info.maxIter) :
-                        ftRes[k] >= info.maxIter ? float(info.maxIter) :
-                        float(((float)ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f));
+            _mm256_storeu_pd(ftRes, counter);
+            for (int k = 0; k < 4 && i + k < width; k++) {
+                if (smooth)
+                    data[i + k + j * width] = ftRes[k] < 0 ? float(maxIter) :
+                        ftRes[k] >= maxIter ? float(maxIter) :
+                        float(((float)ftRes[k]) + 1 - floatLog2(floatLog(float(resa[k] * resa[k] + resb[k] * resb[k])) / 2));
                 else
-                    data[i + k + j * info.bWidth] = ftRes[k] >= 0 ? float(ftRes[k]) : info.maxIter;
+                    data[i + k + j * width] = ftRes[k] >= 0 ? float(ftRes[k]) : maxIter;
             }
 
             resa = (double*) &resultsa2;
             resb = (double*) &resultsb2;
-            _mm256_store_pd(ftRes, counter2);
+            _mm256_storeu_pd(ftRes, counter2);
             i += 4;
-            for (int k = 0; k < 4 && i + k < info.bWidth; k++) {
-                if (info.smooth)
-                    data[i + k + j * info.bWidth] = ftRes[k] < 0 ? float(info.maxIter) :
-                        ftRes[k] >= info.maxIter ? float(info.maxIter) :
-                        float(((float)ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f));
+            for (int k = 0; k < 4 && i + k < width; k++) {
+                if (smooth)
+                    data[i + k + j * width] = ftRes[k] < 0 ? float(maxIter) :
+                        ftRes[k] >= maxIter ? float(maxIter) :
+                        float(((float)ftRes[k]) + 1 - floatLog2(floatLog(float(resa[k] * resa[k] + resb[k] * resb[k])) / 2));
                 else
-                    data[i + k + j * info.bWidth] = ftRes[k] >= 0 ? float(ftRes[k]) : info.maxIter;
+                    data[i + k + j * width] = ftRes[k] >= 0 ? float(ftRes[k]) : maxIter;
             }
             i -= 4;
         }
@@ -346,6 +317,14 @@ static inline VecPair twoDiff(__m256d a, __m256d b)
 }
 
 
+static inline VecPair threeTwoSum(__m256d a, __m256d b, __m256d c)
+{
+    auto[t1, t2] = twoSum(a, b);
+    auto[r0, t3] = twoSum(t1, c);
+    return { r0, _mm256_add_pd(t2, t3) };
+}
+
+
 static inline VecPair split(__m256d a)
 {
     /*
@@ -381,6 +360,7 @@ static inline VecPair twoProd(__m256d a, __m256d b)
     return { p, err };
 }
 
+
 struct AvxDoubleDouble
 {
     __m256d x[2];
@@ -421,33 +401,33 @@ struct AvxDoubleDouble
     }
 };
 
-template<bool parallel>
-void CpuGenerator<mnd::DoubleDouble, mnd::X86_AVX, parallel>::generate(const mnd::MandelInfo& info, float* data)
-{
-    const MandelViewport& view = info.view;
-
-    using T = LightDoubleDouble;
 
-    T viewx = mnd::convert<T>(view.x);
-    T viewy = mnd::convert<T>(view.y);
-    T wpp = mnd::convert<T>(view.width / info.bWidth);
-    T hpp = mnd::convert<T>(view.height / info.bHeight);
+void generateDoubleDoubleAvx(long width, long height, float* data, bool parallel,
+    double vx1, double vx2, double vy1, double vy2, double vw1, double vw2, double vh1, double vh2, int maxIter, bool smooth,
+    bool julia, double jX1, double jX2, double jY1, double jY2)
+{
+    using namespace avx_private;
+    using T = mnd::LightDoubleDouble;
 
+    T viewx{ vx1, vx2 };
+    T viewy{ vy1, vy2 };
+    T wpp = T{ vw1, vw2 } * T(1.0 / width);
+    T hpp = T{ vh1, vh2 } * T(1.0 / height);
 
-    T jX = mnd::convert<T>(info.juliaX);
-    T jY = mnd::convert<T>(info.juliaY);
+    T jX{ jX1, jX2 };
+    T jY{ jY1, jY2 };
     AvxDoubleDouble juliaX = { jX[0], jX[1] };
     AvxDoubleDouble juliaY = { jY[0], jY[1] };
 
 #if defined(_OPENMP)
-    if constexpr(parallel)
+    if (parallel)
         omp_set_num_threads(omp_get_num_procs());
 #   pragma omp parallel for schedule(static, 1) if (parallel)
 #endif
-    for (long j = 0; j < info.bHeight; j++) {
+    for (long j = 0; j < height; j++) {
         T y = viewy + T(double(j)) * hpp;
         AvxDoubleDouble ys{ y[0], y[1] };
-        for (long i = 0; i < info.bWidth; i += 4) {
+        for (long i = 0; i < width; i += 4) {
             T x1 = viewx + T(double(i)) * wpp;
             T x2 = x1 + wpp;
             T x3 = x2 + wpp;
@@ -463,8 +443,8 @@ void CpuGenerator<mnd::DoubleDouble, mnd::X86_AVX, parallel>::generate(const mnd
 
             AvxDoubleDouble xs{ x0s, x1s };
 
-            AvxDoubleDouble cx = info.julia ? juliaX : xs;
-            AvxDoubleDouble cy = info.julia ? juliaY : ys;
+            AvxDoubleDouble cx = julia ? juliaX : xs;
+            AvxDoubleDouble cy = julia ? juliaY : ys;
 
             int itRes[4] = { 0, 0, 0, 0 };
 
@@ -479,13 +459,13 @@ void CpuGenerator<mnd::DoubleDouble, mnd::X86_AVX, parallel>::generate(const mnd
             __m256d resultsb = _mm256_set1_pd(0);
 
             __m256d cmp = _mm256_cmp_pd(threshold, threshold, _CMP_LE_OQ);
-            for (int k = 0; k < info.maxIter; k++) {
+            for (int k = 0; k < maxIter; k++) {
                 AvxDoubleDouble aa = a * a;
                 AvxDoubleDouble bb = b * b;
                 AvxDoubleDouble abab = a * b; abab = abab + abab;
                 a = aa - bb + cx;
                 b = abab + cy;
-                if (info.smooth) {
+                if (smooth) {
                     resultsa = _mm256_or_pd(_mm256_andnot_pd(cmp, resultsa), _mm256_and_pd(cmp, a.x[0]));
                     resultsb = _mm256_or_pd(_mm256_andnot_pd(cmp, resultsb), _mm256_and_pd(cmp, b.x[0]));
                 }
@@ -497,30 +477,184 @@ void CpuGenerator<mnd::DoubleDouble, mnd::X86_AVX, parallel>::generate(const mnd
                 }
             }
 
-            auto alignVec = [](double* data) -> double* {
-                void* aligned = data;
-                ::size_t length = 64;
-                std::align(32, 4 * sizeof(double), aligned, length);
-                return static_cast<double*>(aligned);
+            double resData[8];
+            double* ftRes = resData;
+            double* resa = (double*) &resultsa;
+            double* resb = (double*) &resultsb;
+            _mm256_storeu_pd(ftRes, counter);
+
+            for (int k = 0; k < 4 && i + k < width; k++) {
+                if (smooth)
+                    data[i + k + j * width] = float(ftRes[k] < 0 ? maxIter :
+                        ftRes[k] >= maxIter ? maxIter :
+                        ((float)ftRes[k]) + 1 - floatLog2(::floatLog(float(resa[k] * resa[k] + resb[k] * resb[k])) / 2));
+                else
+                    data[i + k + j * width] = ftRes[k] >= 0 ? float(ftRes[k]) : maxIter;
+            }
+        }
+    }
+}
+
+// Four-lane AVX triple-double value: x[0..2] hold the high, middle and low
+// double components of four independent triple-double numbers (component
+// ordering inferred from the renormalization order below -- TODO confirm
+// against mnd::TripleDouble).  twoSum / quickTwoSum / twoDiff / twoProd /
+// threeTwoSum appear to be error-free transforms returning (result, roundoff)
+// pairs; they are provided by helpers not visible in this excerpt -- verify
+// their contracts there.
+struct AvxTripleDouble
+{
+    __m256d x[3];
+
+    // Wrap three already-populated component vectors.
+    inline AvxTripleDouble(__m256d a, __m256d b, __m256d c) :
+        x{ a, b, c }
+    {}
+
+    // Broadcast one scalar triple-double (a, b, c) across all four lanes.
+    inline AvxTripleDouble(double a, double b, double c) :
+        x{ _mm256_set1_pd(a), _mm256_set1_pd(b), _mm256_set1_pd(c) }
+    {}
+
+    // Addition: exact sums of the two leading component pairs; the roundoff
+    // terms and both low components are folded into r2, then the three parts
+    // are renormalized with two quickTwoSum passes.
+    inline AvxTripleDouble operator + (const AvxTripleDouble& b) const
+    {
+        const auto& a = *this;
+        auto[r0, t0] = twoSum(a.x[0], b.x[0]);
+        auto[t1, t2] = twoSum(a.x[1], b.x[1]);
+        auto[r1, t3] = twoSum(t0, t1);
+        auto r2 = _mm256_add_pd(_mm256_add_pd(t2, _mm256_add_pd(a.x[2], b.x[2])), t3);
+
+        auto[re1, t4] = quickTwoSum(r0, r1);
+        auto[re2, re3] = quickTwoSum(t4, r2);
+        return { re1, re2, re3 };
+    }
+
+    // Subtraction: same scheme as operator+, using exact differences for the
+    // two leading component pairs and a plain subtraction of the low parts.
+    inline AvxTripleDouble operator - (const AvxTripleDouble& b) const
+    {
+        const auto& a = *this;
+        auto[r0, t0] = twoDiff(a.x[0], b.x[0]);
+        auto[t1, t2] = twoDiff(a.x[1], b.x[1]);
+        auto[r1, t3] = twoSum(t0, t1);
+        auto r2 = _mm256_add_pd(_mm256_add_pd(t2, _mm256_sub_pd(a.x[2], b.x[2])), t3);
+
+        auto[re1, t4] = quickTwoSum(r0, r1);
+        auto[re2, re3] = quickTwoSum(t4, r2);
+        return { re1, re2, re3 };
+    }
+
+    // Multiplication: twoProd yields exact high/low halves of the partial
+    // products; the three second-order terms go through threeTwoSum, while
+    // the third-order terms (a1*b1, a2*b0, a0*b2) are only accumulated in
+    // plain double precision before the final renormalization.
+    inline AvxTripleDouble operator * (const AvxTripleDouble& b) const
+    {
+        const auto& a = *this;
+        auto[p1_0, p2_0] = twoProd(a.x[0], b.x[0]);
+        auto[p2_1, p3_0] = twoProd(a.x[0], b.x[1]);
+        auto[p2_2, p3_1] = twoProd(a.x[1], b.x[0]);
+
+        auto[t2, tl3] = threeTwoSum(p2_0, p2_1, p2_2);
+        auto t3 = _mm256_add_pd(tl3,
+            _mm256_add_pd(
+                _mm256_add_pd(p3_0, p3_1),
+                _mm256_add_pd(
+                    _mm256_mul_pd(a.x[1], b.x[1]),
+                    _mm256_add_pd(
+                        _mm256_mul_pd(a.x[2], b.x[0]),
+                        _mm256_mul_pd(a.x[0], b.x[2])
+                    )
+                )
+            )
+            );
+        auto[re0, q2] = quickTwoSum(p1_0, t2);
+        auto[re1, re2] = quickTwoSum(q2, t3);
+        return { re0, re1, re2 };
+    }
+};
+
+
+// Render a Mandelbrot/Julia iteration image at triple-double precision with
+// AVX, processing four pixels per pass.  The viewport origin (vx1..3, vy1..3),
+// extents (vw1..3, vh1..3) and the Julia seed (jX1..3, jY1..3) are each
+// passed as the three double components of a mnd::TripleDouble.  Iteration
+// counts (smoothed if requested) are written to data[j * width + i].
+void generateTripleDoubleAvx(long width, long height, float* data, bool parallel,
+    double vx1, double vx2, double vx3, double vy1, double vy2, double vy3,
+    double vw1, double vw2, double vw3, double vh1, double vh2, double vh3,
+    int maxIter, bool smooth, bool julia, double jX1,
+    double jX2, double jX3, double jY1, double jY2, double jY3)
+{
+    using namespace avx_private;
+    using T = mnd::TripleDouble;
+
+    T viewx{ vx1, vx2, vx3 };
+    T viewy{ vy1, vy2, vy3 }; // fixed: third component was vy2 (copy-paste bug)
+    T wpp = T{ vw1, vw2, vw3 } * T(1.0 / width);  // view width per pixel
+    T hpp = T{ vh1, vh2, vh3 } * T(1.0 / height); // view height per pixel
+
+    T jX{ jX1, jX2, jX3 };
+    T jY{ jY1, jY2, jY3 };
+    AvxTripleDouble juliaX = { jX[0], jX[1], jX[2] };
+    AvxTripleDouble juliaY = { jY[0], jY[1], jY[2] };
+
+#if defined(_OPENMP)
+    if (parallel)
+        omp_set_num_threads(omp_get_num_procs());
+#   pragma omp parallel for schedule(static, 1) if (parallel)
+#endif
+    for (long j = 0; j < height; j++) {
+        T y = viewy + T(double(j)) * hpp;
+        AvxTripleDouble ys{ y[0], y[1], y[2] };
+        for (long i = 0; i < width; i += 4) {
+            // x coordinates of the four pixels handled in this pass
+            T x1 = viewx + T(double(i)) * wpp;
+            T x2 = x1 + wpp;
+            T x3 = x2 + wpp;
+            T x4 = x3 + wpp;
+
+            __m256d x0s = {
+                x1[0], x2[0], x3[0], x4[0],
+            };
+            __m256d x1s = {
+                x1[1], x2[1], x3[1], x4[1],
             };
+            __m256d x2s = {
+                x1[2], x2[2], x3[2], x4[2],
+            };
+
+            AvxTripleDouble xs{ x0s, x1s, x2s };
+
+            AvxTripleDouble cx = julia ? juliaX : xs;
+            AvxTripleDouble cy = julia ? juliaY : ys;
+
+            __m256d threshold = { 16.0, 16.0, 16.0, 16.0 };
+            __m256d counter = { 0, 0, 0, 0 };
+            __m256d adder = { 1, 1, 1, 1 };
+
+            AvxTripleDouble a = xs;
+            AvxTripleDouble b = ys;
+
+            __m256d resultsa = _mm256_set1_pd(0);
+            __m256d resultsb = _mm256_set1_pd(0);
+
+            // cmp starts all-true; a lane's mask drops to zero once it
+            // escapes, freezing its counter (via adder) and its results.
+            __m256d cmp = _mm256_cmp_pd(threshold, threshold, _CMP_LE_OQ);
+            for (int k = 0; k < maxIter; k++) {
+                AvxTripleDouble aa = a * a;
+                AvxTripleDouble bb = b * b;
+                AvxTripleDouble abab = a * b; abab = abab + abab;
+                a = aa - bb + cx;
+                b = abab + cy;
+                if (smooth) {
+                    resultsa = _mm256_or_pd(_mm256_andnot_pd(cmp, resultsa), _mm256_and_pd(cmp, a.x[0]));
+                    resultsb = _mm256_or_pd(_mm256_andnot_pd(cmp, resultsb), _mm256_and_pd(cmp, b.x[0]));
+                }
+                // escape test only needs the leading components of a^2 + b^2
+                cmp = _mm256_cmp_pd(_mm256_add_pd(aa.x[0], bb.x[0]), threshold, _CMP_LE_OQ);
+                adder = _mm256_and_pd(adder, cmp);
+                counter = _mm256_add_pd(counter, adder);
+                if (_mm256_testz_si256(_mm256_castpd_si256(cmp), _mm256_castpd_si256(cmp)) != 0) {
+                    break; // all four lanes have escaped
+                }
+            }
 
             double resData[8];
-            double* ftRes = alignVec(resData);
+            double* ftRes = resData;
             double* resa = (double*) &resultsa;
             double* resb = (double*) &resultsb;
-            _mm256_store_pd(ftRes, counter);
+            _mm256_storeu_pd(ftRes, counter);
 
-            for (int k = 0; k < 4 && i + k < info.bWidth; k++) {
-                if (info.smooth)
-                    data[i + k + j * info.bWidth] = float(ftRes[k] < 0 ? info.maxIter :
-                        ftRes[k] >= info.maxIter ? info.maxIter :
-                        ((float)ftRes[k]) + 1 - ::log(::log(float(resa[k] * resa[k] + resb[k] * resb[k])) / 2) / ::log(2.0f));
+            for (int k = 0; k < 4 && i + k < width; k++) {
+                if (smooth)
+                    data[i + k + j * width] = float(ftRes[k] < 0 ? maxIter :
+                        ftRes[k] >= maxIter ? maxIter :
+                        ((float)ftRes[k]) + 1 - floatLog2(::floatLog(float(resa[k] * resa[k] + resb[k] * resb[k])) / 2));
                 else
-                    data[i + k + j * info.bWidth] = ftRes[k] >= 0 ? float(ftRes[k]) : info.maxIter;
+                    data[i + k + j * width] = ftRes[k] >= 0 ? float(ftRes[k]) : maxIter;
             }
         }
     }
 }
 
 
-

+ 36 - 57
libmandel/src/CpuGeneratorsAVX512.cpp

@@ -1,48 +1,34 @@
-#include "CpuGenerators.h"
-
 #include <immintrin.h>
 #include <omp.h>
 
-#include <memory>
+#include "FloatLog.h"
 
-using mnd::CpuGenerator;
 
-namespace mnd
-{
-    template class CpuGenerator<float, mnd::X86_AVX_512, false>;
-    template class CpuGenerator<float, mnd::X86_AVX_512, true>;
-
-    template class CpuGenerator<double, mnd::X86_AVX_512, false>;
-    template class CpuGenerator<double, mnd::X86_AVX_512, true>;
-}
-
-template<bool parallel>
-void CpuGenerator<float, mnd::X86_AVX_512, parallel>::generate(const mnd::MandelInfo& info, float* data)
+void generateFloatAvx512(long width, long height, float* data, bool parallel,
+                         float vx, float vy, float vw, float vh, int maxIter, bool smooth,
+                         bool julia, float jX, float jY)
 {
     using T = float;
-    const MandelViewport& view = info.view;
 
-    const float dppf = float(view.width / info.bWidth);
-    const float viewxf = float(view.x);
+    const float dppf = float(vw / width);
+    const float viewxf = float(vx);
     __m512 viewx = _mm512_set1_ps(viewxf);
     __m512 dpp = _mm512_set1_ps(dppf);
     __m512 enumerate = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
     __m512 two = _mm512_set1_ps(2);
 
-    T jX = mnd::convert<T>(info.juliaX);
-    T jY = mnd::convert<T>(info.juliaY);
     __m512 juliaX = _mm512_set1_ps(jX);
     __m512 juliaY = _mm512_set1_ps(jY);
 
 #if defined(_OPENMP)
-    if constexpr(parallel)
+    if (parallel)
         omp_set_num_threads(omp_get_num_procs());
 #pragma omp parallel for schedule(static, 1) if (parallel)
 #endif
-    for (long j = 0; j < info.bHeight; j++) {
-        T y = T(view.y + double(j) * view.height / info.bHeight);
+    for (long j = 0; j < height; j++) {
+        T y = vy + float(j) * vh / height;
         __m512 ys = _mm512_set1_ps(y);
-        for (long i = 0; i < info.bWidth; i += 2 * 16) {
+        for (long i = 0; i < width; i += 2 * 16) {
             __m512 pixc0 = _mm512_add_ps(_mm512_set1_ps(float(i)), enumerate);
             __m512 pixc1 = _mm512_add_ps(_mm512_set1_ps(float(i + 16)), enumerate);
             //__m512 pixc2 = _mm512_add_ps(_mm512_set1_ps(float(i + 32)), enumerate);
@@ -68,11 +54,11 @@ void CpuGenerator<float, mnd::X86_AVX_512, parallel>::generate(const mnd::Mandel
             __m512 cx0 = xs0;
             __m512 cx1 = xs1;
             __m512 cy = ys;
-	    if (info.julia) {
-		cx0 = juliaX;
-		cx1 = juliaX;
-		cy = juliaY;
-	    }
+	        if (julia) {
+		        cx0 = juliaX;
+		        cx1 = juliaX;
+		        cy = juliaY;
+	        }
 
             __m512 a0 = xs0;
             __m512 a1 = xs1;
@@ -81,10 +67,10 @@ void CpuGenerator<float, mnd::X86_AVX_512, parallel>::generate(const mnd::Mandel
             __m512 b1 = ys;
             //__m512 b2 = ys;
 
-            if (info.smooth) {
+            if (smooth) {
                 __mmask16 cmp0 = 0xFFFF;
                 __mmask16 cmp1 = 0xFFFF;
-                for (int k = 0; k < info.maxIter; k++) {
+                for (int k = 0; k < maxIter; k++) {
                     __m512 aa0 = _mm512_mul_ps(a0, a0);
                     __m512 aa1 = _mm512_mul_ps(a1, a1);
                     //__m512 aa2 = _mm512_mul_ps(a2, a2);
@@ -119,7 +105,7 @@ void CpuGenerator<float, mnd::X86_AVX_512, parallel>::generate(const mnd::Mandel
                 }
             }
             else {
-                for (int k = 0; k < info.maxIter; k++) {
+                for (int k = 0; k < maxIter; k++) {
                     __m512 aa0 = _mm512_mul_ps(a0, a0);
                     __m512 aa1 = _mm512_mul_ps(a1, a1);
                     //__m512 aa2 = _mm512_mul_ps(a2, a2);
@@ -144,43 +130,36 @@ void CpuGenerator<float, mnd::X86_AVX_512, parallel>::generate(const mnd::Mandel
                 }
             }
 
-            auto alignVec = [](float* data) -> float* {
-                void* aligned = data;
-                ::size_t length = 3 * 64 * sizeof(float);
-                std::align(64, 48 * sizeof(float), aligned, length);
-                return static_cast<float*>(aligned);
-            };
-
             float resData[3 * 64];
-            float* ftRes = alignVec(resData);
-            float* resa = ftRes + 3 * 16;
-            float* resb = ftRes + 6 * 16;
-            _mm512_store_ps(ftRes, counter0);
-            _mm512_store_ps(ftRes + 16, counter1);
+            float* ftRes = resData;
+            float* resa = resData + 3 * 16;
+            float* resb = resData + 6 * 16;
+            _mm512_storeu_ps(ftRes, counter0);
+            _mm512_storeu_ps(ftRes + 16, counter1);
             //_mm512_store_ps(ftRes + 32, counter2);
-            if (info.smooth) {
-                _mm512_store_ps(resa, resultsa0);
-                _mm512_store_ps(resa + 16, resultsa1);
+            if (smooth) {
+                _mm512_storeu_ps(resa, resultsa0);
+                _mm512_storeu_ps(resa + 16, resultsa1);
                 //_mm512_store_ps(resa + 32, resultsa2);
-                _mm512_store_ps(resb, resultsb0);
-                _mm512_store_ps(resb + 16, resultsb1);
+                _mm512_storeu_ps(resb, resultsb0);
+                _mm512_storeu_ps(resb + 16, resultsb1);
                 //_mm512_store_ps(resb + 32, resultsb2);
             }
-            for (int k = 0; k < 2 * 16 && i + k < info.bWidth; k++) {
-                if (info.smooth) {
-                    data[i + k + j * info.bWidth] = ftRes[k] < 0 ? info.maxIter :
-                        ftRes[k] >= info.maxIter ? info.maxIter :
-                        ((float)ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f);
+            for (int k = 0; k < 2 * 16 && i + k < width; k++) {
+                if (smooth) {
+                    data[i + k + j * width] = ftRes[k] < 0 ? maxIter :
+                        ftRes[k] >= maxIter ? maxIter :
+                        ((float)ftRes[k]) + 1 - floatLog2(floatLog(resa[k] * resa[k] + resb[k] * resb[k]) * 0.5);
                 }
                 else {
-                    data[i + k + j * info.bWidth] = ftRes[k] < 0 ? info.maxIter : ftRes[k];
+                    data[i + k + j * width] = ftRes[k] < 0 ? maxIter : ftRes[k];
                 }
             }
         }
     }
 }
 
-
+/*
 
 template<bool parallel>
 void CpuGenerator<double, mnd::X86_AVX_512, parallel>::generate(const mnd::MandelInfo& info, float* data)
@@ -286,5 +265,5 @@ void CpuGenerator<double, mnd::X86_AVX_512, parallel>::generate(const mnd::Mande
         }
     }
 }
-
+*/
 

+ 13 - 0
libmandel/src/FloatLog.cpp

@@ -0,0 +1,13 @@
+#include "FloatLog.h"
+#include <cmath>
+
+// Natural-log wrapper declared in FloatLog.h.  Kept in its own translation
+// unit, presumably so the SIMD generator files can call it without pulling
+// in <cmath> themselves -- TODO confirm intent.
+float floatLog(float x)
+{
+    return ::logf(x);
+}
+
+// Base-2 log wrapper declared in FloatLog.h; counterpart of floatLog.
+float floatLog2(float x)
+{
+    return ::log2f(x);
+}
+

+ 1 - 0
libmandel/src/Generators.cpp

@@ -212,6 +212,7 @@ namespace mnd
         case GeneratorType::DOUBLE_DOUBLE_AVX_FMA:
             return getPrecision<DoubleDouble>();
         case GeneratorType::TRIPLE_DOUBLE:
+        case GeneratorType::TRIPLE_DOUBLE_AVX:
             return getPrecision<TripleDouble>();
         case GeneratorType::QUAD_DOUBLE:
             return getPrecision<QuadDouble>();

+ 5 - 2
libmandel/src/Mandel.cpp

@@ -48,6 +48,7 @@ static const std::map<mnd::GeneratorType, std::string> typeNames =
     { mnd::GeneratorType::DOUBLE_DOUBLE_AVX_FMA, "double double AVX+FMA" },
     { mnd::GeneratorType::DOUBLE_DOUBLE_NEON, "double double NEON" },
     { mnd::GeneratorType::TRIPLE_DOUBLE, "triple double" },
+    { mnd::GeneratorType::TRIPLE_DOUBLE_AVX, "triple double AVX" },
     { mnd::GeneratorType::QUAD_DOUBLE, "quad double" },
     { mnd::GeneratorType::QUAD_DOUBLE_AVX_FMA, "quad double AVX+FMA" },
     { mnd::GeneratorType::FLOAT128, "float128" },
@@ -130,18 +131,20 @@ MandelContext::MandelContext(void)
 #   if defined(WITH_AVX512)
     if (cpuInfo.hasAvx512()) {
         auto fl = std::make_unique<CpuGenerator<float, mnd::X86_AVX_512, true>>();
-        auto db = std::make_unique<CpuGenerator<double, mnd::X86_AVX_512, true>>();
+        //auto db = std::make_unique<CpuGenerator<double, mnd::X86_AVX_512, true>>();
         cpuGenerators.insert({ GeneratorType::FLOAT_AVX512, std::move(fl) });
-        cpuGenerators.insert({ GeneratorType::DOUBLE_AVX512, std::move(db) });
+        //cpuGenerators.insert({ GeneratorType::DOUBLE_AVX512, std::move(db) });
     }
 #   endif
     if (cpuInfo.hasAvx()) {
         auto fl = std::make_unique<CpuGenerator<float, mnd::X86_AVX, true>>();
         auto db = std::make_unique<CpuGenerator<double, mnd::X86_AVX, true>>();
         auto ddb = std::make_unique<CpuGenerator<DoubleDouble, mnd::X86_AVX, true>>();
+        auto tdb = std::make_unique<CpuGenerator<TripleDouble, mnd::X86_AVX, true>>();
         cpuGenerators.insert({ GeneratorType::FLOAT_AVX, std::move(fl) });
         cpuGenerators.insert({ GeneratorType::DOUBLE_AVX, std::move(db) });
         cpuGenerators.insert({ GeneratorType::DOUBLE_DOUBLE_AVX, std::move(ddb) });
+        cpuGenerators.insert({ GeneratorType::TRIPLE_DOUBLE_AVX, std::move(tdb) });
     }
     if (cpuInfo.hasAvx2() && cpuInfo.hasFma()) {
         auto favxfma = std::make_unique<CpuGenerator<float, mnd::X86_AVX_FMA, true>>();