4 years ago · d2b3b8a525
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -49,6 +49,8 @@ ENDIF()
 
				 
			
 
				 target_include_directories(Almond PUBLIC include)
			
 
				 
			
 
				+# Directory that receives the runtime-loadable generator plugins.
				+set(MANDEL_PLUGIN_DIR "${CMAKE_BINARY_DIR}/plugins")
				+# Default ENABLE_EXPORTS to ON for every target created from here on
				+# (including targets created in the subdirectories added below).
				+set(CMAKE_ENABLE_EXPORTS ON)
				+# CMAKE_ENABLE_EXPORTS only initializes targets created *after* it is set.
				+# The Almond target already exists at this point (it is configured above),
				+# so the property has to be set on it explicitly; otherwise the host binary
				+# does not export the symbols the plugins resolve against when loaded.
				+set_target_properties(Almond PROPERTIES ENABLE_EXPORTS ON)
			
 
				 add_subdirectory(libalmond)
			
 
				 
			
 
				 target_include_directories(Almond SYSTEM PUBLIC ${FFMPEG_INCLUDE_DIRS})
			
--- a/libmandel/CMakeLists.txt
+++ b/libmandel/CMakeLists.txt
@@ -9,7 +9,7 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
 
				 else()
			
 
				     set(MANDEL_TARGET_ARCHITECTURE "x86_64" CACHE STRING "Target Architecture")
			
 
				 endif()
			
 
				-option(MANDEL_AVX512 "generate code that can make use of avx-512-instructions" ON)
			
 
				+option(MANDEL_AVX512 "generate plugin that can make use of avx-512-instructions" ON)
			
 
				 option(MANDEL_ASMJIT "use just-in-time-compilation library asmjit" ON)
			
 
				 option(MANDEL_OPENCL "use opencl to offload calculations on GPU devices" ON)
			
 
				 option(MANDEL_BUILD_NATIVE
			
@@ -44,6 +44,7 @@ SET(MandelSources
 
				     src/NaiveIRGenerator.cpp
			
 
				     src/FloatLog.cpp
			
 
				     src/Benchmark.cpp
			
 
				+    src/CalcPlugin.cpp
			
 
				 )
			
 
				 FILE(GLOB MandelHeaders include/*.h)
			
 
				 
			
@@ -59,7 +60,7 @@ elseif(MANDEL_TARGET_ARCHITECTURE STREQUAL "aarch64")
 
				 endif()
			
 
				 
			
 
				 
			
 
				-# use both flags just to be sure
			
 
				+# use both flags (mtune & march) just to be sure
			
 
				 CHECK_CXX_COMPILER_FLAG("-march=native" MARCH_NATIVE_SUPPORTED)
			
 
				 CHECK_CXX_COMPILER_FLAG("-mtune=native" MTUNE_NATIVE_SUPPORTED)
			
 
				 if(MARCH_NATIVE_SUPPORTED AND MANDEL_BUILD_NATIVE)
			
@@ -72,22 +73,39 @@ endif()
 
				 
			
 
				 add_executable(resourcec resourcec/resourcec.cpp)
			
 
				 add_custom_command(
			
 
				-    OUTPUT OpenClCode.cpp
			
 
				+    OUTPUT  ${CMAKE_CURRENT_BINARY_DIR}/OpenClCode.cpp
			
 
				     BYPRODUCTS ${CMAKE_CURRENT_BINARY_DIR}/OpenClCode.h
			
 
				-    COMMAND resourcec ARGS -n mnd::cl_src -d ${CMAKE_CURRENT_BINARY_DIR}/OpenClCode.h -o OpenClCode.cpp
			
 
				+    COMMAND resourcec ARGS -n mnd::cl_src -d ${CMAKE_CURRENT_BINARY_DIR}/OpenClCode.h -o ${CMAKE_CURRENT_BINARY_DIR}/OpenClCode.cpp
			
 
				     SOURCES ${MandelClSources}
			
 
				     WORKING_DIRECTORY ${CMAKE_PROJECT_DIR}
			
 
				     COMMENT "Packaging Opencl Resources"
			
 
				+    VERBATIM
			
 
				 )
			
 
				 
			
 
				 
			
 
				 
			
 
				+set(CMAKE_ENABLE_EXPORTS ON)
			
 
				 if(OPENCL_FOUND AND MANDEL_OPENCL)
			
 
				     add_library(mandel STATIC ${MandelSources} OpenClCode.cpp)
			
 
				 else()
			
 
				     add_library(mandel STATIC ${MandelSources})
			
 
				 endif()
			
 
				 
			
 
				+target_link_libraries(mandel PUBLIC ${CMAKE_DL_LIBS})
			
 
				+
			
 
				+
			
 
				+# Generator plugins are MODULE libraries: they are never linked against, only
				+# loaded at run time from MANDEL_PLUGIN_DIR.
				+#
				+# Fall back to a directory inside this build tree when the enclosing project
				+# did not provide MANDEL_PLUGIN_DIR; an empty, unquoted expansion would
				+# otherwise leave set_target_properties() with an incomplete property/value pair.
				+if(NOT MANDEL_PLUGIN_DIR)
				+    set(MANDEL_PLUGIN_DIR "${CMAKE_CURRENT_BINARY_DIR}/plugins")
				+endif()
				+
				+# avx+fma plugin
				+add_library(avxfma MODULE src/plugins/CpuGeneratorsAVXFMA.cpp)
				+set_target_properties(avxfma PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${MANDEL_PLUGIN_DIR}")
				+# PRIVATE: nothing can link to a MODULE library, so there are no consumers
				+# that could inherit the include path.
				+target_include_directories(avxfma PRIVATE "include")
				+# Build-order dependency only (no linking): building mandel also builds the plugin.
				+add_dependencies(mandel avxfma)
				+
				+# avx512 plugin
				+add_library(avx512 MODULE src/plugins/CpuGeneratorsAVX512.cpp)
				+set_target_properties(avx512 PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${MANDEL_PLUGIN_DIR}")
				+target_include_directories(avx512 PRIVATE "include")
				+# Build-order dependency only (no linking): building mandel also builds the plugin.
				+add_dependencies(mandel avx512)
				+
			
 
				+
			
 
				 target_include_directories(mandel PUBLIC "include")
			
 
				 target_include_directories(mandel PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
			
 
				 
			
@@ -104,9 +122,6 @@ if(OPENCL_FOUND AND MANDEL_OPENCL)
 
				     target_include_directories(mandel SYSTEM PUBLIC "include_cl")
			
 
				     link_directories(${OpenCL_LIBRARY})
			
 
				     target_link_libraries(mandel PUBLIC OpenCL::OpenCL)
			
 
				-
			
 
				-    #add_subdirectory(resourcec)
			
 
				-
			
 
				 else()
			
 
				 endif()
			
 
				 
			
@@ -117,6 +132,8 @@ endif()
 
				 
			
 
				 if(OpenMP_CXX_FOUND)
			
 
				     target_link_libraries(mandel PUBLIC OpenMP::OpenMP_CXX)
			
 
				+    target_link_libraries(avx512 PUBLIC OpenMP::OpenMP_CXX)
			
 
				+    target_link_libraries(avxfma PUBLIC OpenMP::OpenMP_CXX)
			
 
				 endif()
			
 
				 
			
 
				 if(Boost_FOUND)
			
@@ -130,18 +147,22 @@ if (MANDEL_TARGET_ARCHITECTURE STREQUAL "x86_64" OR MANDEL_TARGET_ARCHITECTURE S
 
				         target_compile_definitions(mandel PUBLIC WITH_AVX512)
			
 
				         if (MSVC)
			
 
				             set_source_files_properties(src/CpuGeneratorsAVX512.cpp PROPERTIES COMPILE_FLAGS /arch:AVX512F)
			
 
				+            set_source_files_properties(src/plugins/CpuGeneratorsAVX512.cpp PROPERTIES COMPILE_FLAGS /arch:AVX512F)
			
 
				         else()
			
 
				             set_source_files_properties(src/CpuGeneratorsAVX512.cpp PROPERTIES COMPILE_FLAGS -mavx512f)
			
 
				+            set_source_files_properties(src/plugins/CpuGeneratorsAVX512.cpp PROPERTIES COMPILE_FLAGS -mavx512f)
			
 
				         endif(MSVC)
			
 
				     endif()
			
 
				 
			
 
				     if (MSVC)
			
 
				         set_source_files_properties(src/CpuGeneratorsAVX.cpp PROPERTIES COMPILE_FLAGS /arch:AVX)
			
 
				         set_source_files_properties(src/CpuGeneratorsAVXFMA.cpp PROPERTIES COMPILE_FLAGS /arch:AVX2)
			
 
				+        set_source_files_properties(src/plugins/CpuGeneratorsAVXFMA.cpp PROPERTIES COMPILE_FLAGS /arch:AVX2)
			
 
				         set_source_files_properties(src/CpuGeneratorsSSE2.cpp PROPERTIES COMPILE_FLAGS /arch:SSE2)
			
 
				     else()
			
 
				         set_source_files_properties(src/CpuGeneratorsAVX.cpp PROPERTIES COMPILE_FLAGS -mavx)
			
 
				         set_source_files_properties(src/CpuGeneratorsAVXFMA.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mfma")
			
 
				+        set_source_files_properties(src/plugins/CpuGeneratorsAVXFMA.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mfma")
			
 
				         set_source_files_properties(src/CpuGeneratorsSSE2.cpp PROPERTIES COMPILE_FLAGS -msse2)
			
 
				     endif(MSVC)
			
 
				 
			
--- a/libmandel/include/CalcPlugin.h
+++ b/libmandel/include/CalcPlugin.h
@@ -0,0 +1,51 @@
 
				+#ifndef MANDEL_CALCPLUGIN_H
				+#define MANDEL_CALCPLUGIN_H
				+
				+#include <string>
				+#include <utility>
				+#include <vector>
				+
				+// Symbol visibility helpers for code shared between the host and its plugins.
				+#if defined(__GNUC__)
				+    #define MANDEL_EXPORT __attribute__((visibility("default")))
				+    #define MANDEL_IMPORT
				+#elif defined(_MSC_VER)
				+    #define MANDEL_EXPORT __declspec(dllexport)
				+    #define MANDEL_IMPORT __declspec(dllimport)
				+#else
				+    #define MANDEL_EXPORT
				+    #define MANDEL_IMPORT
				+#endif
				+
				+namespace mnd
				+{
				+    class CalcPlugin;
				+
				+    class MandelGenerator;
				+}
				+
				+
				+///
				+/// \brief move-only owner of the handle of one loaded generator plugin
				+///
				+/// The handle is released in the destructor. A moved-from object holds no
				+/// handle, so \ref isValid returns \c false for it and its destructor does
				+/// not release anything.
				+///
				+class MANDEL_EXPORT mnd::CalcPlugin
				+{
				+    /// handle of the loaded plugin, or \c nullptr if none is held
				+    void* handle;
				+public:
				+    ///
				+    /// \brief tries to load the plugin found at \p path
				+    ///
				+    /// Use \ref isValid afterwards to check whether loading succeeded.
				+    ///
				+    CalcPlugin(const std::string& path);
				+    CalcPlugin(const CalcPlugin&) = delete;
				+
				+    ///
				+    /// \brief takes over the handle of \p other
				+    ///
				+    /// \p other is left without a handle so that the handle is released
				+    /// exactly once.
				+    ///
				+    CalcPlugin(CalcPlugin&& other) noexcept :
				+        handle{ other.handle }
				+    {
				+        other.handle = nullptr;
				+    }
				+
				+    ~CalcPlugin(void);
				+    CalcPlugin& operator=(const CalcPlugin&) = delete;
				+
				+    ///
				+    /// \brief exchanges the handles of \c *this and \p other
				+    ///
				+    /// The handle previously held by \c *this is released when \p other
				+    /// is destroyed.
				+    ///
				+    CalcPlugin& operator=(CalcPlugin&& other) noexcept
				+    {
				+        std::swap(handle, other.handle);
				+        return *this;
				+    }
				+
				+    ///
				+    /// \brief gets the generators provided by this plugin
				+    /// \return a vector containing the generators provided by this
				+    ///         plugin. If no plugin was loaded, the vector is empty.
				+    ///
				+    /// \note The returned generators (vector incl.) have the same lifetime as
				+    ///       the plugin i.e. they are no longer valid should the plugin
				+    ///       object be destroyed.
				+    ///
				+    const std::vector<mnd::MandelGenerator*>& getGenerators(void);
				+
				+    /// \brief \c true if this object currently holds a plugin handle
				+    inline bool isValid(void) const { return handle != nullptr; }
				+};
				+
				+
				+#endif // MANDEL_CALCPLUGIN_H
			
--- a/libmandel/include/Generators.h
+++ b/libmandel/include/Generators.h
@@ -88,7 +88,7 @@ public:
 
				     {
			
 
				     }
			
 
				 
			
 
				-    virtual ~MandelGenerator(void);
			
 
				+    virtual ~MandelGenerator(void) = default;
			
 
				 
			
 
				 
			
 
				     MandelGenerator(const MandelGenerator&) = default;
			
--- a/libmandel/include/Mandel.h
+++ b/libmandel/include/Mandel.h
@@ -21,6 +21,7 @@ namespace asmjit { class JitRuntime{}; }
 
				 #include "IterationGenerator.h"
			
 
				 #include "CpuGenerators.h"
			
 
				 #include "Hardware.h"
			
 
				+#include "CalcPlugin.h"
			
 
				 
			
 
				 namespace mnd
			
 
				 {
			
@@ -77,7 +78,20 @@ private:
 
				     CpuInfo cpuInfo;
			
 
				     std::unique_ptr<asmjit::JitRuntime> jitRuntime;
			
 
				 
			
 
				-    std::map<GeneratorType, std::unique_ptr<MandelGenerator>> cpuGenerators;
			
 
				+    ///
			
 
				+    /// \brief list of standard mandel generators implemented in c++
			
 
				+    ///
			
 
				+    /// This is an owning list of Generators that can be used regardless of
			
 
				+    /// Cpu type as they are implemented in standard c++ and are integrated
			
 
				+    /// into libmandel.
			
 
				+    ///
			
 
				+    std::vector<std::unique_ptr<MandelGenerator>> defaultGenerators;
			
 
				+    std::vector<std::unique_ptr<CalcPlugin>> loadedPlugins;
			
 
				+
			
 
				+    ///
			
 
				+    /// \brief all cpu generators currently available
			
 
				+    ///
			
 
				+    std::map<GeneratorType, MandelGenerator*> cpuGenerators;
			
 
				 
			
 
				     std::unique_ptr<AdaptiveGenerator> adaptiveGenerator;
			
 
				 
			
@@ -94,6 +108,8 @@ public:
 
				     MandelContext& operator=(const MandelContext&) = delete;
			
 
				     MandelContext& operator=(MandelContext&&) = default;
			
 
				 
			
 
				+    void loadPlugin(std::unique_ptr<CalcPlugin> cp);
			
 
				+
			
 
				     AdaptiveGenerator& getDefaultGenerator(void);
			
 
				     std::vector<std::unique_ptr<mnd::MandelDevice>>& getDevices(void);
			
 
				 
			
--- a/libmandel/src/CalcPlugin.cpp
+++ b/libmandel/src/CalcPlugin.cpp
@@ -0,0 +1,40 @@
 
				+#include "CalcPlugin.h"
				+#include <dlfcn.h>
				+#include <stdexcept>
				+#include <vector>
				+
				+
				+using mnd::CalcPlugin;
				+
				+// Tries to open the shared object at `path`. On failure dlopen() yields a
				+// null handle, which isValid() reports as "not loaded".
				+CalcPlugin::CalcPlugin(const std::string& path) :
				+    handle{ dlopen(path.c_str(), RTLD_LAZY) }
				+{
				+}
				+
				+
				+// Releases the handle if one is held.
				+CalcPlugin::~CalcPlugin(void)
				+{
				+    if (handle != nullptr) {
				+         dlclose(handle);
				+    }
				+}
				+
				+
				+// Looks up the plugin's "mandel_get_generators" entry point and returns the
				+// list it provides. When no plugin is loaded, or the entry point is missing,
				+// a reference to a shared empty list is returned instead.
				+const std::vector<mnd::MandelGenerator*>& CalcPlugin::getGenerators(void)
				+{
				+    // Static storage: the function returns a reference, so the fallback
				+    // value must outlive the call.
				+    static const std::vector<mnd::MandelGenerator*> empty = {};
				+    if (!isValid()) {
				+        return empty;
				+    }
				+
				+    using GeneratorGetter =
				+        const std::vector<mnd::MandelGenerator*>& (*)(void);
				+    GeneratorGetter gg =
				+            reinterpret_cast<GeneratorGetter>(dlsym(handle, "mandel_get_generators"));
				+    if (gg == nullptr) {
				+        return empty;
				+    }
				+    return gg();
				+}
			
--- a/libmandel/src/Generators.cpp
+++ b/libmandel/src/Generators.cpp
@@ -42,12 +42,6 @@ namespace mnd
 
				 }
			
 
				 
			
 
				 
			
 
				-
			
 
				-MandelGenerator::~MandelGenerator(void)
			
 
				-{
			
 
				-}
			
 
				-
			
 
				-
			
 
				 mnd::MandelDevice* MandelGenerator::getDevice(void)
			
 
				 {
			
 
				     return nullptr;
			
--- a/libmandel/src/Mandel.cpp
+++ b/libmandel/src/Mandel.cpp
@@ -74,7 +74,8 @@ MandelContext::MandelContext(void)
 
				     if (cpuInfo.hasAvx512()) {
			
 
				         auto fl = std::make_unique<CpuGenerator<float, mnd::X86_AVX_512, true>>();
			
 
				         //auto db = std::make_unique<CpuGenerator<double, mnd::X86_AVX_512, true>>();
			
 
				-        cpuGenerators.insert({ std::pair{ Precision::FLOAT, HardwareFeature::X86_AVX_512 }, std::move(fl) });
			
 
				+        cpuGenerators.insert({ std::pair{ Precision::FLOAT, HardwareFeature::X86_AVX_512 }, fl.get() });
			
 
				+        defaultGenerators.push_back(std::move(fl));
			
 
				         //cpuGenerators.insert({ { Precision::DOUBLE, CpuExtension::X86_AVX_512 }, std::move(db) });
			
 
				     }
			
 
				 #   endif
			
@@ -83,12 +84,17 @@ MandelContext::MandelContext(void)
 
				         auto db = std::make_unique<CpuGenerator<double, mnd::X86_AVX, true>>();
			
 
				         auto ddb = std::make_unique<CpuGenerator<DoubleDouble, mnd::X86_AVX, true>>();
			
 
				         auto tdb = std::make_unique<CpuGenerator<TripleDouble, mnd::X86_AVX, true>>();
			
 
				-        cpuGenerators.insert({ std::pair{ Precision::FLOAT, HardwareFeature::X86_AVX }, std::move(fl) });
			
 
				-        cpuGenerators.insert({ std::pair{ Precision::DOUBLE, HardwareFeature::X86_AVX }, std::move(db) });
			
 
				-        cpuGenerators.insert({ std::pair{ Precision::DOUBLE_DOUBLE, HardwareFeature::X86_AVX }, std::move(ddb) });
			
 
				-        cpuGenerators.insert({ std::pair{ Precision::TRIPLE_DOUBLE, HardwareFeature::X86_AVX }, std::move(tdb) });
			
 
				+        cpuGenerators.insert({ std::pair{ Precision::FLOAT, HardwareFeature::X86_AVX }, fl.get() });
			
 
				+        cpuGenerators.insert({ std::pair{ Precision::DOUBLE, HardwareFeature::X86_AVX }, db.get() });
			
 
				+        cpuGenerators.insert({ std::pair{ Precision::DOUBLE_DOUBLE, HardwareFeature::X86_AVX }, ddb.get() });
			
 
				+        cpuGenerators.insert({ std::pair{ Precision::TRIPLE_DOUBLE, HardwareFeature::X86_AVX }, tdb.get() });
			
 
				+
			
 
				+        defaultGenerators.push_back(std::move(fl));
			
 
				+        defaultGenerators.push_back(std::move(db));
			
 
				+        defaultGenerators.push_back(std::move(ddb));
			
 
				+        defaultGenerators.push_back(std::move(tdb));
			
 
				     }
			
 
				-    if (cpuInfo.hasAvx2() && cpuInfo.hasFma()) {
			
 
				+    /*if (cpuInfo.hasAvx2() && cpuInfo.hasFma()) {
			
 
				         auto favxfma = std::make_unique<CpuGenerator<float, mnd::X86_AVX_FMA, true>>();
			
 
				         auto davxfma = std::make_unique<CpuGenerator<double, mnd::X86_AVX_FMA, true>>();
			
 
				         auto ddavxfma = std::make_unique<CpuGenerator<DoubleDouble, mnd::X86_AVX_FMA, true>>();
			
@@ -115,19 +121,26 @@ MandelContext::MandelContext(void)
 
				         cpuGenerators.insert({ std::pair{ Precision::DOUBLE, CpuExtension::ARM_NEON }, std::move(db) });
			
 
				         cpuGenerators.insert({ std::pair{ Precision::DOUBLE_DOUBLE, CpuExtension::ARM_NEON }, std::move(ddb) });
			
 
				     }
			
 
				+    */
			
 
				 #endif
			
 
				     {
			
 
				         auto fl = std::make_unique<CpuGenerator<float, mnd::NONE, true>>();
			
 
				         auto db = std::make_unique<CpuGenerator<double, mnd::NONE, true>>();
			
 
				-        cpuGenerators.insert({ std::pair{ Precision::FLOAT, HardwareFeature::NONE }, std::move(fl) });
			
 
				-        cpuGenerators.insert({ std::pair{ Precision::DOUBLE, HardwareFeature::NONE }, std::move(db) });
			
 
				+        cpuGenerators.insert({ std::pair{ Precision::FLOAT, HardwareFeature::NONE }, fl.get() });
			
 
				+        cpuGenerators.insert({ std::pair{ Precision::DOUBLE, HardwareFeature::NONE }, db.get() });
			
 
				 
			
 
				         auto fx64 = std::make_unique<CpuGenerator<Fixed64, mnd::NONE, true>>();
			
 
				         auto fx128 = std::make_unique<CpuGenerator<Fixed128, mnd::NONE, true>>();
			
 
				-        cpuGenerators.insert({ std::pair{ Precision::FIXED64, HardwareFeature::NONE }, std::move(fx64) });
			
 
				-        cpuGenerators.insert({ std::pair{ Precision::FIXED128, HardwareFeature::NONE }, std::move(fx128) });
			
 
				+        cpuGenerators.insert({ std::pair{ Precision::FIXED64, HardwareFeature::NONE }, fx64.get() });
			
 
				+        cpuGenerators.insert({ std::pair{ Precision::FIXED128, HardwareFeature::NONE }, fx128.get() });
			
 
				+
			
 
				+        defaultGenerators.push_back(std::move(fl));
			
 
				+        defaultGenerators.push_back(std::move(db));
			
 
				+        defaultGenerators.push_back(std::move(fx64));
			
 
				+        defaultGenerators.push_back(std::move(fx128));
			
 
				     }
			
 
				 
			
 
				+    /*
			
 
				 #ifdef WITH_BOOST
			
 
				     auto quad = std::make_unique<CpuGenerator<Float128, mnd::NONE, true>>();
			
 
				     auto oct = std::make_unique<CpuGenerator<Float256, mnd::NONE, true>>();
			
@@ -151,7 +164,7 @@ MandelContext::MandelContext(void)
 
				 
			
 
				     auto fix512 = std::make_unique<CpuGenerator<Fixed512, mnd::NONE, true>>();
			
 
				     cpuGenerators.insert({ std::pair{ Precision::FIXED512, HardwareFeature::NONE }, std::move(fix512) });
			
 
				-
			
 
				+    */
			
 
				     devices = createDevices();
			
 
				 
			
 
				     adaptiveGenerator = createAdaptiveGenerator();
			
@@ -335,6 +348,22 @@ MandelContext::~MandelContext(void)
 
				 }
			
 
				 
			
 
				 
			
 
				+// Registers every generator offered by the plugin `cp` and takes ownership of
				+// the plugin. A null plugin, or one that offers no generators, is rejected
				+// with a message and is not kept.
				+void MandelContext::loadPlugin(std::unique_ptr<CalcPlugin> cp)
				+{
				+    if (!cp) {
				+        printf("ouh nouh\n");
				+        return;
				+    }
				+
				+    const auto& gens = cp->getGenerators();
				+    if (gens.empty()) {
				+        printf("ouh nouh\n");
				+        return;
				+    }
				+
				+    for (auto* gen : gens) {
				+        if (gen == nullptr) {
				+            continue;
				+        }
				+        cpuGenerators.insert({ GeneratorType{ gen->getType(), gen->getExtension() }, gen });
				+        if (adaptiveGenerator) {
				+            adaptiveGenerator->addGenerator(*gen);
				+        }
				+    }
				+
				+    // Store the plugin exactly once, after all of its generators were
				+    // registered; cpuGenerators only holds non-owning pointers into it.
				+    loadedPlugins.push_back(std::move(cp));
				+}
				+
				+
			
 
				+
			
 
				+
			
 
				 AdaptiveGenerator& MandelContext::getDefaultGenerator(void)
			
 
				 {
			
 
				     return *adaptiveGenerator;
			
@@ -357,7 +386,7 @@ MandelGenerator* MandelContext::getCpuGenerator(mnd::Precision type, mnd::Hardwa
 
				 {
			
 
				     auto it = cpuGenerators.find({ type, ex });
			
 
				     if (it != cpuGenerators.end())
			
 
				-        return it->second.get();
			
 
				+        return it->second;
			
 
				     else
			
 
				         return nullptr;
			
 
				 }
			
@@ -378,7 +407,7 @@ std::vector<MandelGenerator*> MandelContext::getCpuGenerators(mnd::Precision pre
 
				     std::vector<MandelGenerator*> generators;
			
 
				     for (const auto& [type, gen] : cpuGenerators) {
			
 
				         if (type.first == prec)
			
 
				-            generators.push_back(gen.get());
			
 
				+            generators.push_back(gen);
			
 
				     }
			
 
				     return generators;
			
 
				 }
			
--- a/libmandel/src/plugins/CpuGeneratorsAVX512.cpp
+++ b/libmandel/src/plugins/CpuGeneratorsAVX512.cpp
@@ -0,0 +1,30 @@
 
				+#include <immintrin.h>
			
 
				+#include <omp.h>
			
 
				+
			
 
				+#include <vector>
			
 
				+#include <cmath>
			
 
				+#include "CpuGenerators.h"
			
 
				+
			
 
				+// Generator registered for float precision with the X86_AVX_512 hardware feature.
				+class CpuGeneratorFloatAVX512 : public mnd::MandelGenerator
				+{
				+public:
				+    CpuGeneratorFloatAVX512(void) :
				+        MandelGenerator{ mnd::Precision::FLOAT, mnd::X86_AVX_512 }
				+    {
				+    }
				+
				+    void generate(const mnd::MandelInfo& info, float* data) override;
				+};
				+
				+
				+// Currently a stub: the output buffer is left untouched.
				+void CpuGeneratorFloatAVX512::generate(const mnd::MandelInfo& /*info*/, float* /*data*/)
				+{
				+}
				+
				+
				+// Plugin entry point, looked up by name by the host. Returns a list with the
				+// single generator of this plugin; both objects are function-local statics.
				+extern "C" const std::vector<mnd::MandelGenerator*>& mandel_get_generators(void)
				+{
				+    static CpuGeneratorFloatAVX512 generator;
				+    static std::vector<mnd::MandelGenerator*> provided = { &generator };
				+    return provided;
				+}
			
--- a/libmandel/src/plugins/CpuGeneratorsAVXFMA.cpp
+++ b/libmandel/src/plugins/CpuGeneratorsAVXFMA.cpp
@@ -0,0 +1,204 @@
 
				+#include <immintrin.h>
			
 
				+#include <omp.h>
			
 
				+
			
 
				+#include <cmath>
			
 
				+#include <vector>
			
 
				+#include "CpuGenerators.h"
			
 
				+
			
 
				+#include "LightDoubleDouble.h"
			
 
				+#include "QuadDouble.h"
			
 
				+#include "HexDouble.h"
			
 
				+
			
 
				+// Generator registered for float precision with the X86_AVX_FMA hardware feature.
				+class CpuGeneratorFloatAVXFMA : public mnd::MandelGenerator
				+{
				+public:
				+    CpuGeneratorFloatAVXFMA(void) :
				+        MandelGenerator{ mnd::Precision::FLOAT, mnd::X86_AVX_FMA }
				+    {
				+    }
				+
				+    void generate(const mnd::MandelInfo& info, float* data) override;
				+};
				+
				+
				+// Plugin entry point, looked up by name by the host. Returns a list with the
				+// single generator of this plugin; both objects are function-local statics.
				+extern "C" const std::vector<mnd::MandelGenerator*>& mandel_get_generators(void)
				+{
				+    static CpuGeneratorFloatAVXFMA generator;
				+    static std::vector<mnd::MandelGenerator*> provided = { &generator };
				+    return provided;
				+}
			
 
				+
			
 
				+
			
 
				+// Fills `data` (row-major, info.bWidth * info.bHeight floats) with iteration
				+// counts for the view described by `info`.
				+//
				+// Each inner step handles 24 horizontally adjacent pixels as three 8-lane
				+// float vectors. A lane keeps counting while a*a + b*b <= 16; when
				+// info.smooth is set, the last a/b values are captured as well and used for
				+// the fractional correction when the result is written out.
				+void CpuGeneratorFloatAVXFMA::generate(const mnd::MandelInfo& info, float* data)
				+{
				+    const bool parallel = true;
				+
				+    using T = float;
				+
				+    const auto& view = info.view;
				+    const T vx = mnd::convert<T>(view.x);
				+    const T vy = mnd::convert<T>(view.y);
				+    const T vw = mnd::convert<T>(view.width);
				+    const T vh = mnd::convert<T>(view.height);
				+
				+    const T jX = mnd::convert<T>(info.juliaX);
				+    const T jY = mnd::convert<T>(info.juliaY);
				+
				+    // horizontal distance between two neighbouring pixels
				+    const float dppf = float(vw / info.bWidth);
				+    const float viewxf = vx;
				+    __m256 viewx = { viewxf, viewxf, viewxf, viewxf, viewxf, viewxf, viewxf, viewxf };
				+    __m256 dpp = { dppf, dppf, dppf, dppf, dppf, dppf, dppf, dppf };
				+
				+    __m256 juliaX = { jX, jX, jX, jX, jX, jX, jX, jX };
				+    __m256 juliaY = { jY, jY, jY, jY, jY, jY, jY, jY };
				+
				+#if defined(_OPENMP)
				+    if (parallel)
				+        omp_set_num_threads(omp_get_num_procs());
				+#   pragma omp parallel for schedule(static, 1) if (parallel)
				+#endif
				+    for (long j = 0; j < info.bHeight; j++) {
				+        // Rows step through the view *height*; the previous code used the
				+        // view width here and left vh unused.
				+        T y = vy + T(j) * vh / info.bHeight;
				+        __m256 ys = {y, y, y, y, y, y, y, y};
				+        for (long i = 0; i < info.bWidth; i += 24) {
				+            // pixel column indices of the three 8-lane groups
				+            __m256 pixc = { float(i), float(i + 1), float(i + 2), float(i + 3), float(i + 4), float(i + 5), float(i + 6), float(i + 7) };
				+            __m256 pixc2 = { float(i + 8), float(i + 9), float(i + 10), float(i + 11), float(i + 12), float(i + 13), float(i + 14), float(i + 15) };
				+            __m256 pixc3 = { float(i + 16), float(i + 17), float(i + 18), float(i + 19), float(i + 20), float(i + 21), float(i + 22), float(i + 23) };
				+
				+            __m256 xs = _mm256_add_ps(_mm256_mul_ps(dpp, pixc), viewx);
				+            __m256 xs2 = _mm256_add_ps(_mm256_mul_ps(dpp, pixc2), viewx);
				+            __m256 xs3 = _mm256_add_ps(_mm256_mul_ps(dpp, pixc3), viewx);
				+
				+            // counter: iterations per lane; adder: 1 while the lane still
				+            // counts, masked to 0 for good once its comparison fails.
				+            __m256 counter = { 0, 0, 0, 0, 0, 0, 0, 0 };
				+            __m256 adder = { 1, 1, 1, 1, 1, 1, 1, 1 };
				+            __m256 resultsa = { 0, 0, 0, 0, 0, 0, 0, 0 };
				+            __m256 resultsb = { 0, 0, 0, 0, 0, 0, 0, 0 };
				+
				+            __m256 counter2 = { 0, 0, 0, 0, 0, 0, 0, 0 };
				+            __m256 adder2 = { 1, 1, 1, 1, 1, 1, 1, 1 };
				+            __m256 resultsa2 = { 0, 0, 0, 0, 0, 0, 0, 0 };
				+            __m256 resultsb2 = { 0, 0, 0, 0, 0, 0, 0, 0 };
				+
				+            __m256 counter3 = { 0, 0, 0, 0, 0, 0, 0, 0 };
				+            __m256 adder3 = { 1, 1, 1, 1, 1, 1, 1, 1 };
				+            __m256 resultsa3 = { 0, 0, 0, 0, 0, 0, 0, 0 };
				+            __m256 resultsb3 = { 0, 0, 0, 0, 0, 0, 0, 0 };
				+
				+            __m256 threshold = { 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f };
				+            __m256 two = { 2, 2, 2, 2, 2, 2, 2, 2 };
				+
				+            __m256 a = xs;
				+            __m256 a2 = xs2;
				+            __m256 a3 = xs3;
				+            __m256 b = ys;
				+            __m256 b2 = ys;
				+            __m256 b3 = ys;
				+
				+            // constant added in every iteration: the julia parameter if
				+            // info.julia is set, the pixel's own coordinate otherwise
				+            __m256 cx = info.julia ? juliaX : xs;
				+            __m256 cx2 = info.julia ? juliaX : xs2;
				+            __m256 cx3 = info.julia ? juliaX : xs3;
				+            __m256 cy = info.julia ? juliaY : ys;
				+
				+            if (info.smooth) {
				+                // threshold <= threshold: the masks start out all-true
				+                __m256 cmp = _mm256_cmp_ps(threshold, threshold, _CMP_LE_OQ);
				+                __m256 cmp2 = _mm256_cmp_ps(threshold, threshold, _CMP_LE_OQ);
				+                __m256 cmp3 = _mm256_cmp_ps(threshold, threshold, _CMP_LE_OQ);
				+                for (int k = 0; k < info.maxIter; k++) {
				+                    __m256 bb = _mm256_mul_ps(b, b);
				+                    __m256 bb2 = _mm256_mul_ps(b2, b2);
				+                    __m256 bb3 = _mm256_mul_ps(b3, b3);
				+                    __m256 ab = _mm256_mul_ps(a, b);
				+                    __m256 ab2 = _mm256_mul_ps(a2, b2);
				+                    __m256 ab3 = _mm256_mul_ps(a3, b3);
				+                    __m256 olda = a;
				+                    __m256 olda2 = a2;
				+                    __m256 olda3 = a3;
				+                    a = _mm256_add_ps(_mm256_fmsub_ps(a, a, bb), cx);
				+                    a2 = _mm256_add_ps(_mm256_fmsub_ps(a2, a2, bb2), cx2);
				+                    a3 = _mm256_add_ps(_mm256_fmsub_ps(a3, a3, bb3), cx3);
				+                    b = _mm256_fmadd_ps(two, ab, cy);
				+                    b2 = _mm256_fmadd_ps(two, ab2, cy);
				+                    b3 = _mm256_fmadd_ps(two, ab3, cy);
				+                    // take over the new a/b only in lanes whose mask from
				+                    // the previous iteration is still set
				+                    resultsa = _mm256_blendv_ps(resultsa, a, cmp);
				+                    resultsb = _mm256_blendv_ps(resultsb, b, cmp);
				+                    resultsa2 = _mm256_blendv_ps(resultsa2, a2, cmp2);
				+                    resultsb2 = _mm256_blendv_ps(resultsb2, b2, cmp2);
				+                    resultsa3 = _mm256_blendv_ps(resultsa3, a3, cmp3);
				+                    resultsb3 = _mm256_blendv_ps(resultsb3, b3, cmp3);
				+                    cmp = _mm256_cmp_ps(_mm256_fmadd_ps(olda, olda, bb), threshold, _CMP_LE_OQ);
				+                    cmp2 = _mm256_cmp_ps(_mm256_fmadd_ps(olda2, olda2, bb2), threshold, _CMP_LE_OQ);
				+                    cmp3 = _mm256_cmp_ps(_mm256_fmadd_ps(olda3, olda3, bb3), threshold, _CMP_LE_OQ);
				+                    adder = _mm256_and_ps(adder, cmp);
				+                    counter = _mm256_add_ps(counter, adder);
				+                    adder2 = _mm256_and_ps(adder2, cmp2);
				+                    counter2 = _mm256_add_ps(counter2, adder2);
				+                    adder3 = _mm256_and_ps(adder3, cmp3);
				+                    counter3 = _mm256_add_ps(counter3, adder3);
				+                    // every 8th iteration: stop early once no lane in any of
				+                    // the three groups passes the threshold test any more
				+                    if ((k & 0x7) == 0 && _mm256_testz_ps(cmp, cmp) != 0 && _mm256_testz_ps(cmp2, cmp2) != 0 && _mm256_testz_ps(cmp3, cmp3) != 0) {
				+                        break;
				+                    }
				+                }
				+            }
				+            else {
				+                for (int k = 0; k < info.maxIter; k++) {
				+                    __m256 bb = _mm256_mul_ps(b, b);
				+                    __m256 bb2 = _mm256_mul_ps(b2, b2);
				+                    __m256 bb3 = _mm256_mul_ps(b3, b3);
				+                    __m256 ab = _mm256_mul_ps(a, b);
				+                    __m256 ab2 = _mm256_mul_ps(a2, b2);
				+                    __m256 ab3 = _mm256_mul_ps(a3, b3);
				+                    __m256 cmp = _mm256_cmp_ps(_mm256_fmadd_ps(a, a, bb), threshold, _CMP_LE_OQ);
				+                    __m256 cmp2 = _mm256_cmp_ps(_mm256_fmadd_ps(a2, a2, bb2), threshold, _CMP_LE_OQ);
				+                    __m256 cmp3 = _mm256_cmp_ps(_mm256_fmadd_ps(a3, a3, bb3), threshold, _CMP_LE_OQ);
				+                    a = _mm256_add_ps(_mm256_fmsub_ps(a, a, bb), cx);
				+                    a2 = _mm256_add_ps(_mm256_fmsub_ps(a2, a2, bb2), cx2);
				+                    a3 = _mm256_add_ps(_mm256_fmsub_ps(a3, a3, bb3), cx3);
				+                    b = _mm256_fmadd_ps(two, ab, cy);
				+                    b2 = _mm256_fmadd_ps(two, ab2, cy);
				+                    b3 = _mm256_fmadd_ps(two, ab3, cy);
				+                    adder = _mm256_and_ps(adder, cmp);
				+                    counter = _mm256_add_ps(counter, adder);
				+                    adder2 = _mm256_and_ps(adder2, cmp2);
				+                    counter2 = _mm256_add_ps(counter2, adder2);
				+                    adder3 = _mm256_and_ps(adder3, cmp3);
				+                    counter3 = _mm256_add_ps(counter3, adder3);
				+                    // every 8th iteration: stop early once no lane in any of
				+                    // the three groups passes the threshold test any more
				+                    if ((k & 0x7) == 0 && _mm256_testz_ps(cmp, cmp) != 0 && _mm256_testz_ps(cmp2, cmp2) != 0 && _mm256_testz_ps(cmp3, cmp3) != 0) {
				+                        break;
				+                    }
				+                }
				+            }
				+
				+            // scratch layout: [0, 24) counters, [24, 48) final a, [48, 72) final b
				+            float resData[96];
				+            float* ftRes = resData;
				+            float* resa = ftRes + 24;
				+            float* resb = resa + 24;
				+
				+            _mm256_storeu_ps(ftRes, counter);
				+            _mm256_storeu_ps(ftRes + 8, counter2);
				+            _mm256_storeu_ps(ftRes + 16, counter3);
				+            _mm256_storeu_ps(resa, resultsa);
				+            _mm256_storeu_ps(resa + 8, resultsa2);
				+            _mm256_storeu_ps(resa + 16, resultsa3);
				+            _mm256_storeu_ps(resb, resultsb);
				+            _mm256_storeu_ps(resb + 8, resultsb2);
				+            _mm256_storeu_ps(resb + 16, resultsb3);
				+            // the i + k bound drops lanes that lie beyond the end of the row
				+            for (int k = 0; k < 24 && i + k < info.bWidth; k++) {
				+                if (info.smooth) {
				+                    data[i + k + j * info.bWidth] = ftRes[k] < 0 ? info.maxIter :
				+                        ftRes[k] >= info.maxIter ? info.maxIter :
				+                        ((float)ftRes[k]) + 1 - ::log2(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2);
				+                }
				+                else {
				+                    data[i + k + j * info.bWidth] = ftRes[k] < 0 ? info.maxIter : ftRes[k];
				+                }
				+            }
				+        }
				+    }
				+}
			
 
				+
			
--- a/libmandel/src/plugins/avx512.so
+++ b/libmandel/src/plugins/avx512.so
--- a/src/Almond.cpp
+++ b/src/Almond.cpp
@@ -16,6 +16,15 @@ Almond::Almond(QWidget* parent) :
 
				     QMainWindow{ parent, Qt::WindowFlags() },
			
 
				     mandelContext{ mnd::initializeContext() }
			
 
				 {
			
 
				+    std::unique_ptr<mnd::CalcPlugin> cp =
			
 
				+            std::make_unique<mnd::CalcPlugin>("./plugins/libavxfma.so");
			
 
				+    if (!cp->isValid()) {
			
 
				+        //exit(1);
			
 
				+    }
			
 
				+    else {
			
 
				+        mandelContext.loadPlugin(std::move(cp));
			
 
				+    }
			
 
				+
			
 
				     ui.setupUi(this);
			
 
				     fractalWidget = new FractalWidget(this);
			
 
				     fractalWidget->setGenerator(&mandelContext.getDefaultGenerator());