@@ -19,6 +19,9 @@ namespace mnd

    template class CpuGenerator<DoubleDouble, mnd::X86_AVX_FMA, false>;
    template class CpuGenerator<DoubleDouble, mnd::X86_AVX_FMA, true>;
+
+    template class CpuGenerator<QuadDouble, mnd::X86_AVX_FMA, false>;
+    template class CpuGenerator<QuadDouble, mnd::X86_AVX_FMA, true>;
}
@@ -324,6 +327,20 @@ struct VecPair
    __m256d b;
};

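+// Unevaluated sums of three and four doubles, one per AVX lane; these are
+// the intermediate expansions produced by the multi-term sums below.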
+struct VecTriple
+{
+    __m256d a;
+    __m256d b;
+    __m256d c;
+};
+
+struct VecQuadruple
+{
+    __m256d a;
+    __m256d b;
+    __m256d c;
+    __m256d d;
+};

static inline VecPair quickTwoSum(__m256d a, __m256d b)
{
@@ -356,6 +373,98 @@ static inline VecPair twoDiff(__m256d a, __m256d b)
}

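+// Error-free sum of three doubles: returns a three-term expansion with
+// r0 + r1 + r2 == a + b + c, built from chained twoSum steps.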
+static inline VecTriple threeSum(__m256d a, __m256d b, __m256d c)
+{
+    auto [s, e] = twoSum(a, b);
+    auto [r0, e2] = twoSum(s, c);
+    auto [r1, r2] = twoSum(e, e2);
+
+    return { r0, r1, r2 };
+}
+
+
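+// Compresses the sum of three doubles into two components; the two error
+// terms of the twoSum steps are merged with a single rounded addition.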
+static inline VecPair threeTwoSum(__m256d a, __m256d b, __m256d c)
+{
+    auto [t, e1] = twoSum(a, b);
+    auto [s, e2] = twoSum(t, c);
+    return { s, _mm256_add_pd(e1, e2) };
+}
+
+
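+// Plain sum of three doubles; rounding errors are discarded entirely.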
+static inline __m256d threeOneSum(__m256d a, __m256d b, __m256d c)
+{
+    return _mm256_add_pd(a, _mm256_add_pd(b, c));
+}
+
+
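+// Sums six doubles into a three-term expansion by adding two threeSum
+// results and propagating their carry terms.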
+static inline VecTriple sixThreeSum(__m256d a, __m256d b, __m256d c,
+                                    __m256d d, __m256d e, __m256d f)
+{
+    auto [x0, x1, x2] = threeSum(a, b, c);
+    auto [y0, y1, y2] = threeSum(d, e, f);
+
+    auto [r0, t0] = twoSum(x0, y0);
+    auto [t1, t2] = twoSum(x1, y1);
+    auto [r1, t3] = twoSum(t0, t1);
+    auto t4 = _mm256_add_pd(x2, y2);
+    auto r2 = threeOneSum(t2, t3, t4);
+
+    return { r0, r1, r2 };
+}
+
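+// Double-double addition: the low-order words are folded into the error
+// term, followed by a renormalizing quickTwoSum.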
+static inline VecPair addDD(const VecPair& a, const VecPair& b)
+{
+    auto [s, e] = twoSum(a.a, b.a);
+    e = _mm256_add_pd(e, _mm256_add_pd(a.b, b.b));
+    auto [r1, r2] = quickTwoSum(s, e);
+    return { r1, r2 };
+}
+
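+// Sums nine doubles into a two-term result via pairwise twoSums and
+// double-double additions, absorbing the ninth operand at the end.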
+static inline VecPair nineTwoSum(__m256d a, __m256d b, __m256d c,
+                                 __m256d d, __m256d e, __m256d f,
+                                 __m256d g, __m256d h, __m256d i)
+{
+    auto [x1, x2] = twoSum(a, d);
+    auto [y1, y2] = twoSum(b, c);
+    auto [z1, z2] = twoSum(e, h);
+    auto [u1, u2] = twoSum(f, g);
+
+    auto [t1, t2] = addDD({ x1, x2 }, { y1, y2 });
+    auto [t3, t4] = addDD({ z1, z2 }, { u1, u2 });
+
+    auto [t5, t6] = addDD({ t1, t2 }, { t3, t4 });
+
+    return threeTwoSum(t5, t6, i);
+}
+
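+// QD-style renormalization of a five-term expansion into four components.
+// The scalar algorithm advances k only when e != 0; that test would
+// diverge between SIMD lanes, so it runs unconditionally here, at the
+// cost of a possibly less tightly packed result.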
+static inline VecQuadruple renormalize(__m256d x0, __m256d x1, __m256d x2, __m256d x3, __m256d x4)
+{
+    auto [st1, t4] = quickTwoSum(x3, x4);
+    auto [st2, t3] = quickTwoSum(x2, st1);
+    auto [st3, t2] = quickTwoSum(x1, st2);
+    auto [t0, t1] = quickTwoSum(x0, st3);
+
+    __m256d s = t0;
+    __m256d e;
+
+    __m256d t[] = { t1, t2, t3, t4 };
+    __m256d b[4] = { 0, 0, 0, 0 };
+
+    int k = 0;
+    for (int i = 0; i < 4; i++) {
+        auto [st, et] = quickTwoSum(s, t[i]);
+        s = st; e = et;
+        //if (e != 0) {
+        b[k] = s;
+        s = e;
+        k = k + 1;
+        //}
+    }
+
+    return { b[0], b[1], b[2], b[3] };
+}
+
static inline VecPair twoProd(__m256d a, __m256d b)
{
    __m256d p = _mm256_mul_pd(a, b);
@@ -507,5 +616,200 @@ void CpuGenerator<mnd::DoubleDouble, mnd::X86_AVX_FMA, parallel>::generate(const
    }
}

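+// A quad-double (four-double expansion) value for each of the four AVX
+// lanes; x[0] is the most significant component, x[3] the least.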
+struct AvxQuadDouble
+{
+    __m256d x[4];
+
+    inline AvxQuadDouble(__m256d a, __m256d b, __m256d c, __m256d d) :
+        x{ a, b, c, d }
+    {}
+
+    inline AvxQuadDouble(double a, double b, double c, double d) :
+        x{ _mm256_set1_pd(a), _mm256_set1_pd(b), _mm256_set1_pd(c), _mm256_set1_pd(d) }
+    {}
+
+
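+    // Componentwise twoSum, then carry propagation and renormalization.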
+    inline AvxQuadDouble operator + (const AvxQuadDouble& sm) const
+    {
+        auto [s0, e0] = twoSum(x[0], sm.x[0]);
+        auto [s1, e1] = twoSum(x[1], sm.x[1]);
+        auto [s2, e2] = twoSum(x[2], sm.x[2]);
+        auto [s3, e3] = twoSum(x[3], sm.x[3]);
+        __m256d r0 = s0;
+
+        auto [r1, t0] = twoSum(s1, e0);
+        auto [r2, t1, t2] = threeSum(s2, e1, t0);
+        auto [r3, t3, _t4] = threeSum(s3, e2, t1);
+        auto [r4, _t5, _t6] = threeSum(e3, t3, t2);
+
+        auto [re0, re1, re2, re3] = renormalize(r0, r1, r2, r3, r4);
+        return { re0, re1, re2, re3 };
+    }
+
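+    // Same scheme as operator +, using twoDiff for the error-free difference.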
+    inline AvxQuadDouble operator - (const AvxQuadDouble& sm) const
+    {
+        auto [s0, e0] = twoDiff(x[0], sm.x[0]);
+        auto [s1, e1] = twoDiff(x[1], sm.x[1]);
+        auto [s2, e2] = twoDiff(x[2], sm.x[2]);
+        auto [s3, e3] = twoDiff(x[3], sm.x[3]);
+        __m256d r0 = s0;
+
+        auto [r1, t0] = twoSum(s1, e0);
+        auto [r2, t1, t2] = threeSum(s2, e1, t0);
+        auto [r3, t3, _t4] = threeSum(s3, e2, t1);
+        auto [r4, _t5, _t6] = threeSum(e3, t3, t2);
+
+        auto [re0, re1, re2, re3] = renormalize(r0, r1, r2, r3, r4);
+        return { re0, re1, re2, re3 };
+    }
+
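+    // All partial products x[i] * sm.x[j] are formed (exactly, via twoProd,
+    // where their error still matters), grouped by order of magnitude,
+    // summed with the multi-term primitives above, and renormalized.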
+    inline AvxQuadDouble operator * (const AvxQuadDouble& sm) const
+    {
+        auto [a0, b0] = twoProd(x[0], sm.x[0]);
+        auto [b1, c0] = twoProd(x[0], sm.x[1]);
+        auto [b2, c1] = twoProd(x[1], sm.x[0]);
+        auto [c2, d0] = twoProd(x[0], sm.x[2]);
+        auto [c3, d1] = twoProd(x[1], sm.x[1]);
+        auto [c4, d2] = twoProd(x[2], sm.x[0]);
+        auto d5 = _mm256_mul_pd(x[3], sm.x[0]);
+        auto d6 = _mm256_mul_pd(x[2], sm.x[1]);
+        auto d7 = _mm256_mul_pd(x[1], sm.x[2]);
+        auto d8 = _mm256_mul_pd(x[0], sm.x[3]);
+
+        auto r0 = a0;
+        auto [r1, c5, d3] = threeSum(b0, b1, b2);
+        auto [r2, d4, e0] = sixThreeSum(c0, c1, c2, c3, c4, c5);
+        auto [r3, e1] = nineTwoSum(d0, d1, d2, d3, d4, d5, d6, d7, d8);
+        auto r4 = _mm256_add_pd(e0, e1);
+
+        auto [n0, n1, n2, n3] = renormalize(r0, r1, r2, r3, r4);
+
+        return { n0, n1, n2, n3 };
+    }
+};
+
+
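+// Quad-double AVX+FMA generator: iterates z <- z^2 + c for four pixels at
+// a time, one pixel per SIMD lane.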
+template<bool parallel>
+void CpuGenerator<mnd::QuadDouble, mnd::X86_AVX_FMA, parallel>::generate(const mnd::MandelInfo& info, float* data)
+{
+    const MandelViewport& view = info.view;
+
+    using T = mnd::Float256;
+
+    T viewx = mnd::convert<T>(view.x);
+    T viewy = mnd::convert<T>(view.y);
+    T wpp = mnd::convert<T>(view.width / info.bWidth);
+    T hpp = mnd::convert<T>(view.height / info.bHeight);
+
+
+    T jX = mnd::convert<T>(info.juliaX);
+    T jY = mnd::convert<T>(info.juliaY);
+
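+    // Splits a Float256 into four doubles a + b + c + d by repeatedly
+    // rounding to double and subtracting the result.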
+    auto toQd = [] (const mnd::Float256& x) -> std::tuple<double, double, double, double> {
+        double a = double(x);
+        mnd::Float256 rem = x - a;
+        double b = double(rem);
+        rem = rem - b;
+        double c = double(rem);
+        rem = rem - c;
+        double d = double(rem);
+        return { a, b, c, d };
+    };
+
+    auto toAvxQuadDouble = [&toQd] (const mnd::Float256& x) -> AvxQuadDouble {
+        auto [a, b, c, d] = toQd(x);
+        return AvxQuadDouble{ a, b, c, d };
+    };
+
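+    // Packs four consecutive x coordinates into one AvxQuadDouble, one
+    // coordinate per lane.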
+    auto toAvxQuadDouble4 = [&toQd] (const mnd::Float256& a, const mnd::Float256& b,
+            const mnd::Float256& c, const mnd::Float256& d) -> AvxQuadDouble {
+        auto [x0, y0, z0, u0] = toQd(a);
+        auto [x1, y1, z1, u1] = toQd(b);
+        auto [x2, y2, z2, u2] = toQd(c);
+        auto [x3, y3, z3, u3] = toQd(d);
+
+        __m256d xs = { x0, x1, x2, x3 };
+        __m256d ys = { y0, y1, y2, y3 };
+        __m256d zs = { z0, z1, z2, z3 };
+        __m256d us = { u0, u1, u2, u3 };
+
+        return AvxQuadDouble{ xs, ys, zs, us };
+    };
+
+    AvxQuadDouble juliaX = toAvxQuadDouble(jX);
+    AvxQuadDouble juliaY = toAvxQuadDouble(jY);
+
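+    // Distribute rows round-robin across threads (only when parallel is set).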
+#if defined(_OPENMP)
+    if constexpr(parallel)
+        omp_set_num_threads(omp_get_num_procs());
+#   pragma omp parallel for schedule(static, 1) if (parallel)
+#endif
+    for (long j = 0; j < info.bHeight; j++) {
+        T y = viewy + T(double(j)) * hpp;
+        AvxQuadDouble ys = toAvxQuadDouble(y);
+        for (long i = 0; i < info.bWidth; i += 4) {
+            T x1 = viewx + T(double(i)) * wpp;
+            T x2 = x1 + wpp;
+            T x3 = x2 + wpp;
+            T x4 = x3 + wpp;
+
+            AvxQuadDouble xs = toAvxQuadDouble4(x1, x2, x3, x4);
+
+            AvxQuadDouble cx = info.julia ? juliaX : xs;
+            AvxQuadDouble cy = info.julia ? juliaY : ys;
+
+            __m256d threshold = { 16.0, 16.0, 16.0, 16.0 };
+            __m256d counter = { 0, 0, 0, 0 };
+            __m256d adder = { 1, 1, 1, 1 };
+
+            AvxQuadDouble a = xs;
+            AvxQuadDouble b = ys;
+
+            __m256d resultsa = { 0, 0, 0, 0 };
+            __m256d resultsb = { 0, 0, 0, 0 };
+
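+            // Escape-time loop: a lane stays active while |z|^2 <= 16,
+            // tested on the leading components only; counter accumulates
+            // the per-lane iteration count.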
+            for (int k = 0; k < info.maxIter; k++) {
+                AvxQuadDouble aa = a * a;
+                AvxQuadDouble bb = b * b;
+                AvxQuadDouble abab = a * b; abab = abab + abab;
+                a = aa - bb + cx;
+                b = abab + cy;
+                __m256d cmp = _mm256_cmp_pd(_mm256_add_pd(aa.x[0], bb.x[0]), threshold, _CMP_LE_OQ);
+                if (info.smooth) {
+                    resultsa = _mm256_blendv_pd(resultsa, a.x[0], cmp);
+                    resultsb = _mm256_blendv_pd(resultsb, b.x[0], cmp);
+                }
+                adder = _mm256_and_pd(adder, cmp);
+                counter = _mm256_add_pd(counter, adder);
+                if (_mm256_testz_si256(_mm256_castpd_si256(cmp), _mm256_castpd_si256(cmp)) != 0) {
+                    break;
+                }
+            }
+
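+            // _mm256_store_pd requires 32-byte alignment, so pick an
+            // aligned 4-double window inside the 64-byte resData buffer.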
+            auto alignVec = [](double* data) -> double* {
+                void* aligned = data;
+                ::size_t length = 64;
+                std::align(32, 4 * sizeof(double), aligned, length);
+                return static_cast<double*>(aligned);
+            };
+
+            double resData[8];
+            double* ftRes = alignVec(resData);
+            double* resa = (double*) &resultsa;
+            double* resb = (double*) &resultsb;
+            _mm256_store_pd(ftRes, counter);
+
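+            // Write back up to four pixels; with smoothing, a fractional
+            // count is derived from log(log(|z|^2) / 2) / log(2).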
+            for (int k = 0; k < 4 && i + k < info.bWidth; k++) {
+                if (info.smooth)
+                    data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter :
+                        ftRes[k] >= info.maxIter ? info.maxIter :
+                        ((float) ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f);
+                else
+                    data[i + k + j * info.bWidth] = ftRes[k] > 0 ? float(ftRes[k]) : info.maxIter;
+            }
+        }
+    }
+}