Browse Source

Merge branch 'master' of http://git.winfor.ch/nicolas/Almond

Nicolas Winkler 5 years ago
parent
commit
12dd7b2940

+ 2 - 2
CMakeLists.txt

@@ -16,7 +16,7 @@ find_package(Boost 1.65 REQUIRED)
 
 find_package(FFmpeg COMPONENTS AVCODEC AVDEVICE AVFORMAT AVUTIL SWSCALE REQUIRED)
 
-message( ${FFMPEG_INCLUDE_DIRS})
+#message(${FFMPEG_INCLUDE_DIRS})
 
 set(CMAKE_CXX_STANDARD 17)
 
@@ -34,7 +34,7 @@ add_subdirectory(libmandel)
 
 target_include_directories(Almond SYSTEM PUBLIC ${FFMPEG_INCLUDE_DIRS})
 
-target_link_libraries(Almond PUBLIC mandel asmjit qd)
+target_link_libraries(Almond PUBLIC mandel)
 target_link_libraries(Almond PUBLIC Qt5::Core Qt5::Widgets Qt5::OpenGL Qt5::Xml)
 target_link_libraries(Almond PUBLIC ${FFMPEG_LIBRARIES})
 target_link_libraries(Almond PUBLIC OpenGL::GL)

+ 2 - 1
MandelWidget.cpp

@@ -342,7 +342,8 @@ MandelView::MandelView(mnd::MandelGenerator* generator, MandelWidget& owner) :
 }
 
 
-int MandelView::getLevel(mnd::Real dpp) {
+int MandelView::getLevel(mnd::Real dpp)
+{
     return int(mnd::log2(dpp / chunkSize));
 }
 

+ 25 - 20
libmandel/CMakeLists.txt

@@ -1,6 +1,8 @@
 cmake_minimum_required(VERSION 3.12)
 
 set(ARCH "X86_64" CACHE STRING "Target Architecture")
+option(AVX512 "generate code that can make use of avx-512-instructions" ON)
+option(WITH_ASMJIT "use just-in-time-compilation capabilities of asmjit" ON)
 
 #message(CMAKE_SYSTEM_PROCESSOR)
 
@@ -36,13 +38,16 @@ SET(MandelSources
 FILE(GLOB MandelHeaders include/*.h)
 
 if (ARCH STREQUAL "X86_64" OR ARCH STREQUAL "X86")
-    list(APPEND MandelSources src/CpuGeneratorsAVX.cpp src/CpuGeneratorsAVXFMA.cpp src/CpuGeneratorsSSE2.cpp src/CpuGeneratorsAVX512.cpp)
+    list(APPEND MandelSources src/CpuGeneratorsAVX.cpp src/CpuGeneratorsAVXFMA.cpp src/CpuGeneratorsSSE2.cpp)
+    if (AVX512)
+        list(APPEND MandelSources src/CpuGeneratorsAVX512.cpp)
+    endif()
 elseif(ARCH STREQUAL "ARM")
     list(APPEND MandelSources src/CpuGeneratorsNeon.cpp)
 endif()
 
+
 #    message(${MandelSources})
-add_subdirectory(asmjit)
 
 add_library(mandel STATIC ${MandelSources})
 
@@ -62,7 +67,12 @@ target_include_directories(qd PUBLIC qd-2.3.22/include qd-2.3.22)
 #target_include_directories(mandel PUBLI#C qd-2.3.22/include)
 #target_include_directories(mandel PUBLI#C qd-2.3.22/include)
 target_link_libraries(mandel PUBLIC qd)
-target_link_libraries(mandel PUBLIC asmjit)
+
+if(WITH_ASMJIT)
+	add_subdirectory(asmjit)
+	target_compile_definitions(mandel PUBLIC WITH_ASMJIT)
+	target_link_libraries(mandel PUBLIC asmjit)
+endif(WITH_ASMJIT)
 
 include(CheckIPOSupported)
 check_ipo_supported(RESULT LTO_SUPPORTED)
@@ -95,34 +105,29 @@ if(Boost_FOUND)
 endif(Boost_FOUND)
 
 if (ARCH STREQUAL "X86_64" OR ARCH STREQUAL "X86")
-    if (MSVC)
-        set_source_files_properties(src/CpuGeneratorsAVX512.cpp PROPERTIES COMPILE_FLAGS /arch:AVX512F)
-    else()
-        set_source_files_properties(src/CpuGeneratorsAVX512.cpp PROPERTIES COMPILE_FLAGS -mavx512f)
-    endif(MSVC)
+    if (AVX512)
+        target_compile_definitions(mandel PUBLIC WITH_AVX512)
+        if (MSVC)
+            set_source_files_properties(src/CpuGeneratorsAVX512.cpp PROPERTIES COMPILE_FLAGS /arch:AVX512F)
+        else()
+            set_source_files_properties(src/CpuGeneratorsAVX512.cpp PROPERTIES COMPILE_FLAGS -mavx512f)
+        endif(MSVC)
+    endif()
 
     if (MSVC)
         set_source_files_properties(src/CpuGeneratorsAVX.cpp PROPERTIES COMPILE_FLAGS /arch:AVX)
         set_source_files_properties(src/CpuGeneratorsAVXFMA.cpp PROPERTIES COMPILE_FLAGS /arch:AVX)
+        set_source_files_properties(src/CpuGeneratorsAVXFMA.cpp PROPERTIES COMPILE_FLAGS /arch:FMA)
+        set_source_files_properties(src/CpuGeneratorsSSE2.cpp PROPERTIES COMPILE_FLAGS /arch:SSE2)
     else()
         set_source_files_properties(src/CpuGeneratorsAVX.cpp PROPERTIES COMPILE_FLAGS -mavx)
-        set_source_files_properties(src/JuliaGenerators.cpp PROPERTIES COMPILE_FLAGS -mavx)
         set_source_files_properties(src/CpuGeneratorsAVXFMA.cpp PROPERTIES COMPILE_FLAGS -mavx)
-    endif(MSVC)
-
-    if (MSVC)
-        set_source_files_properties(src/CpuGeneratorsAVXFMA.cpp PROPERTIES COMPILE_FLAGS /arch:FMA)
-    else()
         set_source_files_properties(src/CpuGeneratorsAVXFMA.cpp PROPERTIES COMPILE_FLAGS -mfma)
-    endif(MSVC)
-
-    if (MSVC)
-        set_source_files_properties(src/CpuGeneratorsSSE2.cpp PROPERTIES COMPILE_FLAGS /arch:SSE2)
-    else()
         set_source_files_properties(src/CpuGeneratorsSSE2.cpp PROPERTIES COMPILE_FLAGS -msse2)
     endif(MSVC)
+
 elseif(ARCH STREQUAL "ARM")
-    #set_source_files_properties(src/CpuGeneratorsNeon.cpp PROPERTIES COMPILE_FLAGS -mfpu=neon)
+    set_source_files_properties(src/CpuGeneratorsNeon.cpp PROPERTIES COMPILE_FLAGS -march=armv8-a+simd)
 endif()
 
 

+ 16 - 1
libmandel/include/CpuGenerators.h

@@ -90,12 +90,13 @@ public:
     virtual void generate(const MandelInfo& info, float* data);
 };
 
+
 template<bool parallel>
 class mnd::CpuGenerator<float, mnd::X86_AVX_FMA, parallel> : public MandelGenerator
 {
 public:
     inline CpuGenerator(void) :
-        MandelGenerator{ mnd::Precision::FLOAT, mnd::X86_AVX_FMA }
+        MandelGenerator{ mnd::Precision::DOUBLE, mnd::X86_AVX_FMA }
     {
     }
     virtual void generate(const MandelInfo& info, float* data);
@@ -174,6 +175,7 @@ public:
     virtual void generate(const MandelInfo& info, float* data);
 };
 
+
 template<bool parallel>
 class mnd::CpuGenerator<double, mnd::ARM_NEON, parallel> : public Generator
 {
@@ -184,6 +186,19 @@ public:
     }
     virtual void generate(const MandelInfo& info, float* data);
 };
+
+
+template<bool parallel>
+class mnd::CpuGenerator<DoubleDouble, mnd::ARM_NEON, parallel> : public Generator
+{
+public:
+    inline CpuGenerator(void) :
+        MandelGenerator{ mnd::Precision::DOUBLE_DOUBLE, mnd::ARM_NEON }
+    {
+    }
+    virtual void generate(const MandelInfo& info, float* data);
+};
+
 #endif
 
 

+ 1 - 0
libmandel/include/Generators.h

@@ -99,6 +99,7 @@ enum class mnd::GeneratorType : int
     DOUBLE_DOUBLE,
     DOUBLE_DOUBLE_AVX,
     DOUBLE_DOUBLE_AVX_FMA,
+    DOUBLE_DOUBLE_NEON,
     QUAD_DOUBLE,
     QUAD_DOUBLE_AVX_FMA,
     FLOAT128,

+ 2 - 0
libmandel/include/IterationGenerator.h

@@ -53,6 +53,7 @@ private:
 };
 
 
+#ifdef WITH_ASMJIT
 #if defined(__x86_64__) || defined(_M_X64)
 class mnd::CompiledGenerator : public mnd::MandelGenerator
 {
@@ -81,6 +82,7 @@ public:
     virtual void generate(const MandelInfo& info, float* data) override;
 };
 #endif
+#endif // WITH_ASMJIT
 
 
 #ifdef WITH_OPENCL

+ 5 - 0
libmandel/include/Mandel.h

@@ -5,6 +5,11 @@
 //#include <asmjit/asmjit.h>
 namespace asmjit { class JitRuntime; }
 
+#ifndef WITH_ASMJIT
+// if no asmjit, use dummy implementation
+namespace asmjit { class JitRuntime{}; }
+#endif // WITH_ASMJITH
+
 #include <vector>
 #include <map>
 #include <string>

+ 259 - 27
libmandel/src/CpuGeneratorsNeon.cpp

@@ -1,4 +1,5 @@
 #include "CpuGenerators.h"
+#include "LightDoubleDouble.h"
 
 #include <omp.h>
 #include <arm_neon.h>
@@ -13,6 +14,9 @@ namespace mnd
 
     template class CpuGenerator<double, mnd::ARM_NEON, false>;
     template class CpuGenerator<double, mnd::ARM_NEON, true>;
+
+    template class CpuGenerator<mnd::DoubleDouble, mnd::ARM_NEON, false>;
+    template class CpuGenerator<mnd::DoubleDouble, mnd::ARM_NEON, true>;
 }
 
 
@@ -21,9 +25,14 @@ void CpuGenerator<float, mnd::ARM_NEON, parallel>::generate(const mnd::MandelInf
 {
     using T = float;
     const MandelViewport& view = info.view;
+
+    float32x4_t juliaX = vmovq_n_f32(double(info.juliaX));
+    float32x4_t juliaY = vmovq_n_f32(double(info.juliaY));
+#if defined(_OPENMP)
     if constexpr(parallel)
         omp_set_num_threads(omp_get_num_procs());
-#pragma omp parallel for schedule(static, 1) if (parallel)
+#   pragma omp parallel for schedule(static, 1) if (parallel)
+#endif
     for (long j = 0; j < info.bHeight; j++) {
         T y = T(view.y) + T(j) * T(view.height / info.bHeight);
         long i = 0;
@@ -48,21 +57,24 @@ void CpuGenerator<float, mnd::ARM_NEON, parallel>::generate(const mnd::MandelInf
             float32x4_t a = xs;
             float32x4_t b = ys;
 
+            float32x4_t cx = info.julia ? juliaX : xs;
+            float32x4_t cy = info.julia ? juliaY : ys;
+
             for (int k = 0; k < info.maxIter; k++) {
                 float32x4_t aa = vmulq_f32(a, a);
                 float32x4_t bb = vmulq_f32(b, b);
                 float32x4_t abab = vmulq_f32(a, b); abab = vaddq_f32(abab, abab);
                 uint32x4_t cmp = vcleq_f32(vaddq_f32(aa, bb), threshold);
-		if (info.smooth) {
-                    float32x4_t tempa = vaddq_f32(vsubq_f32(aa, bb), xs);
-                    float32x4_t tempb = vaddq_f32(abab, ys);
-		    a = vreinterpretq_f32_u32(vorrq_u32(vandq_u32(cmp, vreinterpretq_u32_f32(tempa)), vandq_u32(vmvnq_u32(cmp), vreinterpretq_u32_f32(a))));
-		    b = vreinterpretq_f32_u32(vorrq_u32(vandq_u32(cmp, vreinterpretq_u32_f32(tempb)), vandq_u32(vmvnq_u32(cmp), vreinterpretq_u32_f32(b))));
-		}
-		else {
-                    a = vaddq_f32(vsubq_f32(aa, bb), xs);
-                    b = vaddq_f32(abab, ys);
-		}
+                if (info.smooth) {
+                    float32x4_t tempa = vaddq_f32(vsubq_f32(aa, bb), cx);
+                    float32x4_t tempb = vaddq_f32(abab, cy);
+                    a = vreinterpretq_f32_u32(vorrq_u32(vandq_u32(cmp, vreinterpretq_u32_f32(tempa)), vandq_u32(vmvnq_u32(cmp), vreinterpretq_u32_f32(a))));
+                    b = vreinterpretq_f32_u32(vorrq_u32(vandq_u32(cmp, vreinterpretq_u32_f32(tempb)), vandq_u32(vmvnq_u32(cmp), vreinterpretq_u32_f32(b))));
+                }
+                else {
+                    a = vaddq_f32(vsubq_f32(aa, bb), cx);
+                    b = vaddq_f32(abab, cy);
+                }
                 adder = vandq_u32(adder, cmp);
                 counter = vaddq_u32(counter, adder);
                 // checking for break criterion is possibly expensive, only do it every 8 iterations
@@ -105,9 +117,15 @@ void CpuGenerator<double, mnd::ARM_NEON, parallel>::generate(const mnd::MandelIn
 {
     using T = double;
     const MandelViewport& view = info.view;
+
+    float64x2_t juliaX = vmovq_n_f64(double(info.juliaX));
+    float64x2_t juliaY = vmovq_n_f64(double(info.juliaY));
+
+#if defined(_OPENMP)
     if constexpr(parallel)
         omp_set_num_threads(omp_get_num_procs());
-#pragma omp parallel for schedule(static, 1) if (parallel)
+#   pragma omp parallel for schedule(static, 1) if (parallel)
+#endif
     for (long j = 0; j < info.bHeight; j++) {
         T y = T(view.y) + T(j) * T(view.height / info.bHeight);
         long i = 0;
@@ -129,6 +147,8 @@ void CpuGenerator<double, mnd::ARM_NEON, parallel>::generate(const mnd::MandelIn
             float64x2_t ys = vmovq_n_f64(y);
             float64x2_t a = xs;
             float64x2_t b = ys;
+            float64x2_t cx = info.julia ? juliaX : xs;
+            float64x2_t cy = info.julia ? juliaY : ys;
 
             for (int k = 0; k < info.maxIter; k++) {
                 float64x2_t aa = vmulq_f64(a, a);
@@ -137,16 +157,16 @@ void CpuGenerator<double, mnd::ARM_NEON, parallel>::generate(const mnd::MandelIn
                 //a = vaddq_f64(vsubq_f64(aa, bb), xs);
                 //b = vaddq_f64(abab, ys);
                 uint64x2_t cmp = vcleq_f64(vaddq_f64(aa, bb), threshold);
-		if (info.smooth) {
-                    float64x2_t tempa = vaddq_f64(vsubq_f64(aa, bb), xs);
-                    float64x2_t tempb = vaddq_f64(abab, ys);
-		    a = vreinterpretq_f64_u64(vorrq_u64(vandq_u64(cmp, vreinterpretq_u64_f64(tempa)), vandq_u64(vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(cmp))), vreinterpretq_u64_f64(a))));
-		    b = vreinterpretq_f64_u64(vorrq_u64(vandq_u64(cmp, vreinterpretq_u64_f64(tempb)), vandq_u64(vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(cmp))), vreinterpretq_u64_f64(b))));
-		}
-		else {
-                    a = vaddq_f64(vsubq_f64(aa, bb), xs);
-                    b = vaddq_f64(abab, ys);
-		}
+                if (info.smooth) {
+                    float64x2_t tempa = vaddq_f64(vsubq_f64(aa, bb), cx);
+                    float64x2_t tempb = vaddq_f64(abab, cy);
+                    a = vreinterpretq_f64_u64(vorrq_u64(vandq_u64(cmp, vreinterpretq_u64_f64(tempa)), vandq_u64(vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(cmp))), vreinterpretq_u64_f64(a))));
+                    b = vreinterpretq_f64_u64(vorrq_u64(vandq_u64(cmp, vreinterpretq_u64_f64(tempb)), vandq_u64(vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(cmp))), vreinterpretq_u64_f64(b))));
+                }
+                else {
+                    a = vaddq_f64(vsubq_f64(aa, bb), cx);
+                    b = vaddq_f64(abab, cy);
+                }
                 adder = vandq_u64(adder, cmp);
                 counter = vaddq_u64(counter, adder);
                 // checking for break criterion is possibly expensive, only do it every 8 iterations
@@ -164,11 +184,6 @@ void CpuGenerator<double, mnd::ARM_NEON, parallel>::generate(const mnd::MandelIn
                 }
             }
 
-            /*uint64_t resData[2];
-            vst1q_u64(resData, counter);
-            for (int k = 0; k < 2 && i + k < info.bWidth; k++)
-                data[i + k + j * info.bWidth] = resData[k] > 0 ? resData[k] : info.maxIter;
-*/
             uint64_t resData[2];
             double resa[2];
             double resb[2];
@@ -187,3 +202,220 @@ void CpuGenerator<double, mnd::ARM_NEON, parallel>::generate(const mnd::MandelIn
         }
     }
 }
+
+
+
+struct VecPair
+{
+    float64x2_t a;
+    float64x2_t b;
+};
+
+
+static inline VecPair quickTwoSum(float64x2_t a, float64x2_t b)
+{
+    float64x2_t s = vaddq_f64(a, b);
+    float64x2_t e = vsubq_f64(b, vsubq_f64(s, a));
+    return { s, e };
+}
+
+static inline VecPair quickTwoDiff(float64x2_t a, float64x2_t b)
+{
+    float64x2_t s = vsubq_f64(a, b);
+    float64x2_t e = vsubq_f64(vsubq_f64(a, s), b);
+    return { s, e };
+}
+
+static inline VecPair twoSum(float64x2_t a, float64x2_t b)
+{
+    float64x2_t s = vaddq_f64(a, b);
+    float64x2_t bb = vsubq_f64(s, a);
+    float64x2_t e = vaddq_f64(vsubq_f64(a, vsubq_f64(s, bb)), vsubq_f64(b, bb));
+    return { s, e };
+}
+
+static inline VecPair twoDiff(float64x2_t a, float64x2_t b)
+{
+    float64x2_t s = vsubq_f64(a, b);
+    float64x2_t bb = vsubq_f64(s, a);
+    float64x2_t e = vsubq_f64(vsubq_f64(a, vsubq_f64(s, bb)), vaddq_f64(b, bb));
+    return { s, e };
+}
+
+
+static inline VecPair split(float64x2_t a)
+{
+    /*
+    // -- this should never happen when doing mandelbrot calculations,
+    //    so we omit this check.
+    if (a > _QD_SPLIT_THRESH || a < -_QD_SPLIT_THRESH) {
+        a *= 3.7252902984619140625e-09;  // 2^-28
+        temp = _QD_SPLITTER * a;
+        hi = temp - (temp - a);
+        lo = a - hi;
+        hi *= 268435456.0;          // 2^28
+        lo *= 268435456.0;          // 2^28
+    } else {
+        temp = _QD_SPLITTER * a;
+        hi = temp - (temp - a);
+        lo = a - hi;
+    }
+    */
+
+    static const float64x2_t SPLITTER = vmovq_n_f64(134217729.0);
+    float64x2_t temp = vmulq_f64(SPLITTER, a);
+    float64x2_t hi = vsubq_f64(temp, vsubq_f64(temp, a));
+    float64x2_t lo = vsubq_f64(a, hi);
+    return { hi, lo };
+}
+
+static inline VecPair twoProd(float64x2_t a, float64x2_t b)
+{
+    float64x2_t p = vmulq_f64(a, b);
+    auto[a_hi, a_lo] = split(a);
+    auto[b_hi, b_lo] = split(b);
+    float64x2_t err = vaddq_f64(vaddq_f64(vsubq_f64(vmulq_f64(a_hi, b_hi), p), vaddq_f64(vmulq_f64(a_hi, b_lo), vmulq_f64(a_lo, b_hi))), vmulq_f64(a_lo, b_lo));
+    return { p, err };
+}
+
+struct NeonDoubleDouble
+{
+    float64x2_t x[2];
+
+    inline NeonDoubleDouble(const float64x2_t& a, const float64x2_t& b) :
+        x{ a, b }
+    {}
+
+    inline NeonDoubleDouble(double a, double b) :
+        x{ vmovq_n_f64(a), vmovq_n_f64(b) }
+    {}
+
+
+    inline NeonDoubleDouble operator + (const NeonDoubleDouble& sm) const
+    {
+        auto[s, e] = twoSum(x[0], sm.x[0]);
+        e = vaddq_f64(e, vaddq_f64(x[1], sm.x[1]));
+        auto[r1, r2] = quickTwoSum(s, e);
+        return NeonDoubleDouble{ r1, r2 };
+    }
+
+    inline NeonDoubleDouble operator - (const NeonDoubleDouble& sm) const
+    {
+        auto[s, e] = twoDiff(x[0], sm.x[0]);
+        e = vaddq_f64(e, x[1]);
+        e = vsubq_f64(e, sm.x[1]);
+        auto[r1, r2] = quickTwoSum(s, e);
+        return NeonDoubleDouble{ r1, r2 };
+    }
+
+    inline NeonDoubleDouble operator * (const NeonDoubleDouble& sm) const
+    {
+        auto[p1, p2] = twoProd(this->x[0], sm.x[0]);
+        p2 = vaddq_f64(p2,
+            vaddq_f64(vmulq_f64(sm.x[1], x[0]), vmulq_f64(sm.x[0], x[1])) );
+        auto[r1, r2] = quickTwoSum(p1, p2);
+        return NeonDoubleDouble{ r1, r2 };
+    }
+};
+
+
+template<bool parallel>
+void CpuGenerator<mnd::DoubleDouble, mnd::ARM_NEON, parallel>::generate(const mnd::MandelInfo& info, float* data)
+{
+    const MandelViewport& view = info.view;
+
+    using T = LightDoubleDouble;
+
+    T viewx = mnd::convert<T>(view.x);
+    T viewy = mnd::convert<T>(view.y);
+    T wpp = mnd::convert<T>(view.width / info.bWidth);
+    T hpp = mnd::convert<T>(view.height / info.bHeight);
+
+    T jX = mnd::convert<T>(info.juliaX);
+    T jY = mnd::convert<T>(info.juliaY);
+    NeonDoubleDouble juliaX = { jX[0], jX[1] };
+    NeonDoubleDouble juliaY = { jY[0], jY[1] };
+
+#if defined(_OPENMP)
+    if constexpr(parallel)
+        omp_set_num_threads(omp_get_num_procs());
+#   pragma omp parallel for schedule(static, 1) if (parallel)
+#endif
+    for (long j = 0; j < info.bHeight; j++) {
+        T y = viewy + T(double(j)) * hpp;
+        NeonDoubleDouble ys{ y[0], y[1] };
+        for (long i = 0; i < info.bWidth; i += 2) {
+            T x1 = viewx + T(double(i)) * wpp;
+            T x2 = x1 + wpp;
+            double xarr1[] = { x1[0], x2[0] };
+            double xarr2[] = { x1[1], x2[1] };
+            float64x2_t x1s = vld1q_f64(xarr1);
+            float64x2_t x2s = vld1q_f64(xarr2);
+
+            NeonDoubleDouble xs{ x1s, x2s };
+
+            NeonDoubleDouble cx = info.julia ? juliaX : xs;
+            NeonDoubleDouble cy = info.julia ? juliaY : ys;
+
+            float64x2_t threshold = vmovq_n_f64(16.0);
+            uint64x2_t counter = vmovq_n_u64(0);
+            uint64x2_t adder = vmovq_n_u64(1);
+
+            NeonDoubleDouble a = xs;
+            NeonDoubleDouble b = ys;
+            float64x2_t resultA = a.x[0];
+            float64x2_t resultB = b.x[0];
+
+            float64x2_t resultsa = vmovq_n_f64(0);
+            float64x2_t resultsb = vmovq_n_f64(0);
+
+            for (int k = 0; k < info.maxIter; k++) {
+                NeonDoubleDouble aa = a * a;
+                NeonDoubleDouble bb = b * b;
+                NeonDoubleDouble abab = a * b; abab = abab + abab;
+                a = aa - bb + cx;
+                b = abab + cy;
+
+
+                uint64x2_t cmp = vcleq_f64(vaddq_f64(aa.x[0], bb.x[0]), threshold);
+                if (info.smooth) {
+                    resultA = vreinterpretq_f64_u64(vorrq_u64(vandq_u64(cmp, vreinterpretq_u64_f64(a.x[0])), vandq_u64(vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(cmp))), vreinterpretq_u64_f64(resultA))));
+                    resultB = vreinterpretq_f64_u64(vorrq_u64(vandq_u64(cmp, vreinterpretq_u64_f64(b.x[0])), vandq_u64(vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(cmp))), vreinterpretq_u64_f64(resultB))));
+                }
+                a = aa - bb + cx;
+                b = abab + cy;
+                adder = vandq_u64(adder, cmp);
+                counter = vaddq_u64(counter, adder);
+                // checking for break criterion is possibly expensive, only do it every 8 iterations
+                if ((k & 0x7) == 0) {
+                    /* // ARM-v7 method
+                    uint32x2_t allZero = vorr_u32(vget_low_u32(cmp), vget_high_u32(cmp));
+                    if (vget_lane_u32(vpmax_u32(allZero, allZero), 0) == 0) {
+                        break;
+                    }
+                    */
+                    uint64_t allZero = vaddvq_u64(cmp);
+                    if (allZero == 0) {
+                        break;
+                    }
+                }
+            }
+
+            uint64_t resData[2];
+            double resa[2];
+            double resb[2];
+            vst1q_u64(resData, counter);
+            vst1q_f64(resa, resultA);
+            vst1q_f64(resb, resultB);
+
+            for (int k = 0; k < 2 && i + k < info.bWidth; k++) {
+                if (info.smooth)
+                    data[i + k + j * info.bWidth] = resData[k] <= 0 ? info.maxIter :
+                    resData[k] >= info.maxIter ? info.maxIter :
+                    ((float) resData[k]) + 1 - ::logf(::logf(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::logf(2.0f);
+                else
+                    data[i + k + j * info.bWidth] = resData[k] > 0 ? float(resData[k]) : info.maxIter;
+            }
+        }
+    }
+}

+ 10 - 1
libmandel/src/IterationCompiler.cpp

@@ -1,8 +1,11 @@
 #include "IterationCompiler.h"
 #include "NaiveIRGenerator.h"
 
-#include "ExecData.h"
 #include "Mandel.h"
+#ifdef WITH_ASMJIT
+#include "ExecData.h"
+#endif // WITH_ASMJIT
+
 #include "OpenClInternal.h"
 #include "OpenClCode.h"
 
@@ -14,6 +17,7 @@
 using namespace std::string_literals;
 namespace mnd
 {
+#ifdef WITH_ASMJIT
     struct CompileVisitor
     {
         using Reg = asmjit::x86::Xmm;
@@ -560,6 +564,7 @@ namespace mnd
         return CompiledGeneratorVec{ std::move(ed) };
     }
 
+#endif // WITH_ASMJIT
 
     struct OpenClVisitor
     {
@@ -775,6 +780,9 @@ namespace mnd
 
         ir::Formula irf = mnd::expand(z0o, zio);
         irf.optimize();
+
+
+#ifdef WITH_ASMJIT
         printf("ir: %s\n", irf.toString().c_str()); fflush(stdout);
         auto dg = std::make_unique<CompiledGenerator>(compile(irf));
         printf("asm: %s\n", dg->dump().c_str()); fflush(stdout);
@@ -784,6 +792,7 @@ namespace mnd
             printf("asm avxvec: %s\n", dgavx->dump().c_str()); fflush(stdout);
             vec.push_back(std::move(dgavx));
         }
+#endif // WITH_ASMJIT
 
         //vec.push_back(std::make_unique<NaiveIRGenerator<mnd::DoubleDouble>>(irf));
         //vec.push_back(std::make_unique<NaiveIRGenerator<mnd::QuadDouble>>(irf));

+ 7 - 3
libmandel/src/IterationGenerator.cpp

@@ -1,5 +1,4 @@
 #include "IterationGenerator.h"
-#include "ExecData.h"
 #include "Mandel.h"
 
 #include "OpenClInternal.h"
@@ -125,10 +124,12 @@ std::complex<double> NaiveGenerator::calc(mnd::Expression& expr, std::complex<do
     return result;
 }
 
+#ifdef WITH_ASMJIT
+
+#include "ExecData.h"
+
 using mnd::CompiledGenerator;
 using mnd::CompiledGeneratorVec;
-using mnd::CompiledClGenerator;
-using mnd::CompiledClGeneratorDouble;
 
 
 CompiledGenerator::CompiledGenerator(std::unique_ptr<mnd::ExecData> execData,
@@ -236,8 +237,11 @@ void CompiledGeneratorVec::generate(const mnd::MandelInfo& info, float* data)
     }
 }
 
+#endif // WITH_ASMJIT
 
 #ifdef WITH_OPENCL
+using mnd::CompiledClGenerator;
+using mnd::CompiledClGeneratorDouble;
 CompiledClGenerator::CompiledClGenerator(mnd::MandelDevice& device, const std::string& code) :
     ClGeneratorFloat{ device, code }
 {

+ 12 - 2
libmandel/src/Mandel.cpp

@@ -6,7 +6,9 @@
 #include "OpenClInternal.h"
 #include "OpenClCode.h"
 
+#ifdef WITH_ASMJIT
 #include <asmjit/asmjit.h>
+#endif // WITH_ASMJIT
 
 #include <map>
 
@@ -44,6 +46,7 @@ static const std::map<mnd::GeneratorType, std::string> typeNames =
     { mnd::GeneratorType::DOUBLE_DOUBLE, "double double" },
     { mnd::GeneratorType::DOUBLE_DOUBLE_AVX, "double double AVX" },
     { mnd::GeneratorType::DOUBLE_DOUBLE_AVX_FMA, "double double AVX+FMA" },
+    { mnd::GeneratorType::DOUBLE_DOUBLE_NEON, "double double NEON" },
     { mnd::GeneratorType::QUAD_DOUBLE, "quad double" },
     { mnd::GeneratorType::QUAD_DOUBLE_AVX_FMA, "quad double AVX+FMA" },
     { mnd::GeneratorType::FLOAT128, "float128" },
@@ -113,17 +116,21 @@ bool MandelDevice::supportsDouble(void) const
 }
 
 
-MandelContext::MandelContext(void) :
-    jitRuntime{ std::make_unique<asmjit::JitRuntime>() }
+MandelContext::MandelContext(void)
+#ifdef WITH_ASMJIT
+    : jitRuntime{ std::make_unique<asmjit::JitRuntime>() }
+#endif // WITH_ASMJIT
 {
 
 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) 
+#   if defined(WITH_AVX512)
     if (cpuInfo.hasAvx512()) {
         auto fl = std::make_unique<CpuGenerator<float, mnd::X86_AVX_512, true>>();
         auto db = std::make_unique<CpuGenerator<double, mnd::X86_AVX_512, true>>();
         cpuGenerators.insert({ GeneratorType::FLOAT_AVX512, std::move(fl) });
         cpuGenerators.insert({ GeneratorType::DOUBLE_AVX512, std::move(db) });
     }
+#   endif
     if (cpuInfo.hasAvx()) {
         auto fl = std::make_unique<CpuGenerator<float, mnd::X86_AVX, true>>();
         auto db = std::make_unique<CpuGenerator<double, mnd::X86_AVX, true>>();
@@ -152,8 +159,10 @@ MandelContext::MandelContext(void) :
     if (cpuInfo.hasNeon()) {
         auto fl = std::make_unique<CpuGenerator<float, mnd::ARM_NEON, true>>();
         auto db = std::make_unique<CpuGenerator<double, mnd::ARM_NEON, true>>();
+        auto ddb = std::make_unique<CpuGenerator<mnd::DoubleDouble, mnd::ARM_NEON, true>>();
         cpuGenerators.insert({ GeneratorType::FLOAT_NEON, std::move(fl) });
         cpuGenerators.insert({ GeneratorType::DOUBLE_NEON, std::move(db) });
+        cpuGenerators.insert({ GeneratorType::DOUBLE_DOUBLE_NEON, std::move(ddb) });
     }
 #endif
     {
@@ -219,6 +228,7 @@ std::unique_ptr<mnd::AdaptiveGenerator> MandelContext::createAdaptiveGenerator(v
     if (cpuInfo.hasNeon()) {
         floatGen = getCpuGenerator(GeneratorType::FLOAT_NEON);
         doubleGen = getCpuGenerator(GeneratorType::DOUBLE_NEON);
+        doubleDoubleGen = getCpuGenerator(GeneratorType::DOUBLE_DOUBLE_NEON);
     }
 
     if (!devices.empty()) {