Browse Source

adding plugins for cpu extensions

Nicolas Winkler 4 years ago
parent
commit
d2b3b8a525

+ 2 - 0
CMakeLists.txt

@@ -49,6 +49,8 @@ ENDIF()
 
 target_include_directories(Almond PUBLIC include)
 
+set(MANDEL_PLUGIN_DIR ${CMAKE_BINARY_DIR}/plugins)
+set(CMAKE_ENABLE_EXPORTS ON)
 add_subdirectory(libalmond)
 
 target_include_directories(Almond SYSTEM PUBLIC ${FFMPEG_INCLUDE_DIRS})

+ 28 - 7
libmandel/CMakeLists.txt

@@ -9,7 +9,7 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
 else()
     set(MANDEL_TARGET_ARCHITECTURE "x86_64" CACHE STRING "Target Architecture")
 endif()
-option(MANDEL_AVX512 "generate code that can make use of avx-512-instructions" ON)
+option(MANDEL_AVX512 "generate plugin that can make use of avx-512-instructions" ON)
 option(MANDEL_ASMJIT "use just-in-time-compilation library asmjit" ON)
 option(MANDEL_OPENCL "use opencl to offload calculations on GPU devices" ON)
 option(MANDEL_BUILD_NATIVE
@@ -44,6 +44,7 @@ SET(MandelSources
     src/NaiveIRGenerator.cpp
     src/FloatLog.cpp
     src/Benchmark.cpp
+    src/CalcPlugin.cpp
 )
 FILE(GLOB MandelHeaders include/*.h)
 
@@ -59,7 +60,7 @@ elseif(MANDEL_TARGET_ARCHITECTURE STREQUAL "aarch64")
 endif()
 
 
-# use both flags just to be sure
+# use both flags (mtune & march) just to be sure
 CHECK_CXX_COMPILER_FLAG("-march=native" MARCH_NATIVE_SUPPORTED)
 CHECK_CXX_COMPILER_FLAG("-mtune=native" MTUNE_NATIVE_SUPPORTED)
 if(MARCH_NATIVE_SUPPORTED AND MANDEL_BUILD_NATIVE)
@@ -72,22 +73,39 @@ endif()
 
 add_executable(resourcec resourcec/resourcec.cpp)
 add_custom_command(
-    OUTPUT OpenClCode.cpp
+    OUTPUT  ${CMAKE_CURRENT_BINARY_DIR}/OpenClCode.cpp
     BYPRODUCTS ${CMAKE_CURRENT_BINARY_DIR}/OpenClCode.h
-    COMMAND resourcec ARGS -n mnd::cl_src -d ${CMAKE_CURRENT_BINARY_DIR}/OpenClCode.h -o OpenClCode.cpp
+    COMMAND resourcec ARGS -n mnd::cl_src -d ${CMAKE_CURRENT_BINARY_DIR}/OpenClCode.h -o ${CMAKE_CURRENT_BINARY_DIR}/OpenClCode.cpp
     SOURCES ${MandelClSources}
     WORKING_DIRECTORY ${CMAKE_PROJECT_DIR}
     COMMENT "Packaging Opencl Resources"
+    VERBATIM
 )
 
 
 
+set(CMAKE_ENABLE_EXPORTS ON)
 if(OPENCL_FOUND AND MANDEL_OPENCL)
     add_library(mandel STATIC ${MandelSources} OpenClCode.cpp)
 else()
     add_library(mandel STATIC ${MandelSources})
 endif()
 
+target_link_libraries(mandel PUBLIC ${CMAKE_DL_LIBS})
+
+
+# CPU-extension plugins are MODULE libraries: they are opened at runtime
+# (see mnd::CalcPlugin) rather than linked, and are all written to
+# MANDEL_PLUGIN_DIR.
+#
+# MANDEL_PLUGIN_DIR is normally provided by the parent project. Fall back to
+# a directory inside this build tree when it is missing or empty; an empty
+# unquoted expansion would leave set_target_properties() with a property
+# name but no value and abort the configure step.
+if(NOT MANDEL_PLUGIN_DIR)
+    set(MANDEL_PLUGIN_DIR "${CMAKE_CURRENT_BINARY_DIR}/plugins")
+endif()
+
+# avx+fma plugin
+add_library(avxfma MODULE src/plugins/CpuGeneratorsAVXFMA.cpp)
+set_target_properties(avxfma PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${MANDEL_PLUGIN_DIR}")
+target_include_directories(avxfma PUBLIC "include")
+# build-order edge only: the plugin is not linked into mandel
+add_dependencies(mandel avxfma)
+
+# avx512 plugin
+add_library(avx512 MODULE src/plugins/CpuGeneratorsAVX512.cpp)
+set_target_properties(avx512 PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${MANDEL_PLUGIN_DIR}")
+target_include_directories(avx512 PUBLIC "include")
+# build-order edge only: the plugin is not linked into mandel
+add_dependencies(mandel avx512)
+
 target_include_directories(mandel PUBLIC "include")
 target_include_directories(mandel PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
 
@@ -104,9 +122,6 @@ if(OPENCL_FOUND AND MANDEL_OPENCL)
     target_include_directories(mandel SYSTEM PUBLIC "include_cl")
     link_directories(${OpenCL_LIBRARY})
     target_link_libraries(mandel PUBLIC OpenCL::OpenCL)
-
-    #add_subdirectory(resourcec)
-
 else()
 endif()
 
@@ -117,6 +132,8 @@ endif()
 
 if(OpenMP_CXX_FOUND)
     target_link_libraries(mandel PUBLIC OpenMP::OpenMP_CXX)
+    target_link_libraries(avx512 PUBLIC OpenMP::OpenMP_CXX)
+    target_link_libraries(avxfma PUBLIC OpenMP::OpenMP_CXX)
 endif()
 
 if(Boost_FOUND)
@@ -130,18 +147,22 @@ if (MANDEL_TARGET_ARCHITECTURE STREQUAL "x86_64" OR MANDEL_TARGET_ARCHITECTURE S
         target_compile_definitions(mandel PUBLIC WITH_AVX512)
         if (MSVC)
             set_source_files_properties(src/CpuGeneratorsAVX512.cpp PROPERTIES COMPILE_FLAGS /arch:AVX512F)
+            set_source_files_properties(src/plugins/CpuGeneratorsAVX512.cpp PROPERTIES COMPILE_FLAGS /arch:AVX512F)
         else()
             set_source_files_properties(src/CpuGeneratorsAVX512.cpp PROPERTIES COMPILE_FLAGS -mavx512f)
+            set_source_files_properties(src/plugins/CpuGeneratorsAVX512.cpp PROPERTIES COMPILE_FLAGS -mavx512f)
         endif(MSVC)
     endif()
 
     if (MSVC)
         set_source_files_properties(src/CpuGeneratorsAVX.cpp PROPERTIES COMPILE_FLAGS /arch:AVX)
         set_source_files_properties(src/CpuGeneratorsAVXFMA.cpp PROPERTIES COMPILE_FLAGS /arch:AVX2)
+        set_source_files_properties(src/plugins/CpuGeneratorsAVXFMA.cpp PROPERTIES COMPILE_FLAGS /arch:AVX2)
         set_source_files_properties(src/CpuGeneratorsSSE2.cpp PROPERTIES COMPILE_FLAGS /arch:SSE2)
     else()
         set_source_files_properties(src/CpuGeneratorsAVX.cpp PROPERTIES COMPILE_FLAGS -mavx)
         set_source_files_properties(src/CpuGeneratorsAVXFMA.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mfma")
+        set_source_files_properties(src/plugins/CpuGeneratorsAVXFMA.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mfma")
         set_source_files_properties(src/CpuGeneratorsSSE2.cpp PROPERTIES COMPILE_FLAGS -msse2)
     endif(MSVC)
 

+ 51 - 0
libmandel/include/CalcPlugin.h

@@ -0,0 +1,51 @@
+#ifndef MANDEL_CALCPLUGIN_H
+#define MANDEL_CALCPLUGIN_H
+
+#include <string>
+#include <vector>
+
+// Symbol visibility helpers, selected per compiler.
+#if defined(__GNUC__)
+    #define MANDEL_EXPORT __attribute__((visibility("default")))
+    #define MANDEL_IMPORT
+#elif defined(_MSC_VER)
+    #define MANDEL_EXPORT __declspec(dllexport)
+    #define MANDEL_IMPORT __declspec(dllimport)
+#else
+    #define MANDEL_EXPORT
+    #define MANDEL_IMPORT
+#endif
+
+namespace mnd
+{
+    class CalcPlugin;
+
+    class MandelGenerator;
+}
+
+
+///
+/// \brief owning wrapper around the handle of a dynamically loaded
+///        generator plugin
+///
+/// The handle is released by the destructor, so exactly one object may hold
+/// a given handle at any time: copying is disabled and moving transfers
+/// the handle instead of duplicating it.
+///
+class MANDEL_EXPORT mnd::CalcPlugin
+{
+    void* handle;
+public:
+    ///
+    /// \brief tries to load the plugin library located at \p path
+    ///
+    /// A failed load is not signalled here; query isValid() afterwards.
+    ///
+    CalcPlugin(const std::string& path);
+    CalcPlugin(const CalcPlugin&) = delete;
+
+    ///
+    /// \brief takes over the handle of \p other and leaves \p other invalid
+    ///
+    /// A member-wise move would leave both objects holding the same
+    /// handle, which would then be closed twice.
+    ///
+    CalcPlugin(CalcPlugin&& other) noexcept :
+        handle{ other.handle }
+    {
+        other.handle = nullptr;
+    }
+
+    ~CalcPlugin(void);
+    CalcPlugin& operator=(const CalcPlugin&) = delete;
+
+    ///
+    /// \brief exchanges the handles of both objects
+    ///
+    /// The handle previously held by this object ends up in \p other and
+    /// is released when \p other is destroyed.
+    ///
+    CalcPlugin& operator=(CalcPlugin&& other) noexcept
+    {
+        void* previous = handle;
+        handle = other.handle;
+        other.handle = previous;
+        return *this;
+    }
+
+    ///
+    /// \brief gets the generators provided by this plugin
+    /// \return a vector containing the generators provided by this
+    ///         plugin. If no plugin was loaded, the vector is empty.
+    ///
+    /// \note The returned generators (vector incl.) have the same lifetime as
+    ///       the plugin i.e. they are no longer valid should the plugin
+    ///       struct be destroyed.
+    ///
+    const std::vector<mnd::MandelGenerator*>& getGenerators(void);
+
+    /// \return true if a library handle is currently held
+    inline bool isValid(void) const { return handle != nullptr; }
+};
+
+
+#endif // MANDEL_CALCPLUGIN_H

+ 1 - 1
libmandel/include/Generators.h

@@ -88,7 +88,7 @@ public:
     {
     }
 
-    virtual ~MandelGenerator(void);
+    virtual ~MandelGenerator(void) = default;
 
 
     MandelGenerator(const MandelGenerator&) = default;

+ 17 - 1
libmandel/include/Mandel.h

@@ -21,6 +21,7 @@ namespace asmjit { class JitRuntime{}; }
 #include "IterationGenerator.h"
 #include "CpuGenerators.h"
 #include "Hardware.h"
+#include "CalcPlugin.h"
 
 namespace mnd
 {
@@ -77,7 +78,20 @@ private:
     CpuInfo cpuInfo;
     std::unique_ptr<asmjit::JitRuntime> jitRuntime;
 
-    std::map<GeneratorType, std::unique_ptr<MandelGenerator>> cpuGenerators;
+    ///
+    /// \brief list of standard mandel generators implemented in c++
+    ///
+    /// This is an owning list of Generators that can be used regardless of
+    /// Cpu type as they are implemented in standard c++ and are integrated
+    /// into libmandel.
+    ///
+    std::vector<std::unique_ptr<MandelGenerator>> defaultGenerators;
+    std::vector<std::unique_ptr<CalcPlugin>> loadedPlugins;
+
+    ///
+    /// \brief all cpu generators currently available
+    ///
+    std::map<GeneratorType, MandelGenerator*> cpuGenerators;
 
     std::unique_ptr<AdaptiveGenerator> adaptiveGenerator;
 
@@ -94,6 +108,8 @@ public:
     MandelContext& operator=(const MandelContext&) = delete;
     MandelContext& operator=(MandelContext&&) = default;
 
+    void loadPlugin(std::unique_ptr<CalcPlugin> cp);
+
     AdaptiveGenerator& getDefaultGenerator(void);
     std::vector<std::unique_ptr<mnd::MandelDevice>>& getDevices(void);
 

+ 40 - 0
libmandel/src/CalcPlugin.cpp

@@ -0,0 +1,40 @@
+#include "CalcPlugin.h"
+#include <dlfcn.h>
+#include <stdexcept>
+#include <vector>
+
+
+using mnd::CalcPlugin;
+
+///
+/// Opens the shared object at \p path with lazy symbol binding. The result
+/// of dlopen() is stored as-is, so a failed load leaves a null handle.
+///
+CalcPlugin::CalcPlugin(const std::string& path) :
+    handle{ dlopen(path.c_str(), RTLD_LAZY) }
+{
+}
+
+
+/// Closes the library handle, provided one is held.
+CalcPlugin::~CalcPlugin(void)
+{
+    if (handle == nullptr)
+        return;
+
+    dlclose(handle);
+}
+
+
+///
+/// Resolves the plugin's `mandel_get_generators` entry point and forwards
+/// whatever it returns.
+///
+/// \return the plugin's generator list, or a reference to an empty vector
+///         if no library is loaded or the entry point cannot be found
+///
+const std::vector<mnd::MandelGenerator*>& CalcPlugin::getGenerators(void)
+{
+    // The function returns a reference, so every failure path must hand out
+    // an object that outlives the call; a temporary or nullptr cannot serve.
+    static const std::vector<mnd::MandelGenerator*> empty = {};
+    if (!isValid()) {
+        return empty;
+    }
+
+    using GeneratorGetter =
+        const std::vector<mnd::MandelGenerator*>& (*)(void);
+    // dlsym() yields a void*; convert it to the entry point's signature
+    GeneratorGetter gg =
+            reinterpret_cast<GeneratorGetter>(dlsym(handle, "mandel_get_generators"));
+    if (gg != nullptr) {
+        return gg();
+    }
+    else {
+        return empty;
+    }
+}

+ 0 - 6
libmandel/src/Generators.cpp

@@ -42,12 +42,6 @@ namespace mnd
 }
 
 
-
-MandelGenerator::~MandelGenerator(void)
-{
-}
-
-
 mnd::MandelDevice* MandelGenerator::getDevice(void)
 {
     return nullptr;

+ 42 - 13
libmandel/src/Mandel.cpp

@@ -74,7 +74,8 @@ MandelContext::MandelContext(void)
     if (cpuInfo.hasAvx512()) {
         auto fl = std::make_unique<CpuGenerator<float, mnd::X86_AVX_512, true>>();
         //auto db = std::make_unique<CpuGenerator<double, mnd::X86_AVX_512, true>>();
-        cpuGenerators.insert({ std::pair{ Precision::FLOAT, HardwareFeature::X86_AVX_512 }, std::move(fl) });
+        cpuGenerators.insert({ std::pair{ Precision::FLOAT, HardwareFeature::X86_AVX_512 }, fl.get() });
+        defaultGenerators.push_back(std::move(fl));
         //cpuGenerators.insert({ { Precision::DOUBLE, CpuExtension::X86_AVX_512 }, std::move(db) });
     }
 #   endif
@@ -83,12 +84,17 @@ MandelContext::MandelContext(void)
         auto db = std::make_unique<CpuGenerator<double, mnd::X86_AVX, true>>();
         auto ddb = std::make_unique<CpuGenerator<DoubleDouble, mnd::X86_AVX, true>>();
         auto tdb = std::make_unique<CpuGenerator<TripleDouble, mnd::X86_AVX, true>>();
-        cpuGenerators.insert({ std::pair{ Precision::FLOAT, HardwareFeature::X86_AVX }, std::move(fl) });
-        cpuGenerators.insert({ std::pair{ Precision::DOUBLE, HardwareFeature::X86_AVX }, std::move(db) });
-        cpuGenerators.insert({ std::pair{ Precision::DOUBLE_DOUBLE, HardwareFeature::X86_AVX }, std::move(ddb) });
-        cpuGenerators.insert({ std::pair{ Precision::TRIPLE_DOUBLE, HardwareFeature::X86_AVX }, std::move(tdb) });
+        cpuGenerators.insert({ std::pair{ Precision::FLOAT, HardwareFeature::X86_AVX }, fl.get() });
+        cpuGenerators.insert({ std::pair{ Precision::DOUBLE, HardwareFeature::X86_AVX }, db.get() });
+        cpuGenerators.insert({ std::pair{ Precision::DOUBLE_DOUBLE, HardwareFeature::X86_AVX }, ddb.get() });
+        cpuGenerators.insert({ std::pair{ Precision::TRIPLE_DOUBLE, HardwareFeature::X86_AVX }, tdb.get() });
+
+        defaultGenerators.push_back(std::move(fl));
+        defaultGenerators.push_back(std::move(db));
+        defaultGenerators.push_back(std::move(ddb));
+        defaultGenerators.push_back(std::move(tdb));
     }
-    if (cpuInfo.hasAvx2() && cpuInfo.hasFma()) {
+    /*if (cpuInfo.hasAvx2() && cpuInfo.hasFma()) {
         auto favxfma = std::make_unique<CpuGenerator<float, mnd::X86_AVX_FMA, true>>();
         auto davxfma = std::make_unique<CpuGenerator<double, mnd::X86_AVX_FMA, true>>();
         auto ddavxfma = std::make_unique<CpuGenerator<DoubleDouble, mnd::X86_AVX_FMA, true>>();
@@ -115,19 +121,26 @@ MandelContext::MandelContext(void)
         cpuGenerators.insert({ std::pair{ Precision::DOUBLE, CpuExtension::ARM_NEON }, std::move(db) });
         cpuGenerators.insert({ std::pair{ Precision::DOUBLE_DOUBLE, CpuExtension::ARM_NEON }, std::move(ddb) });
     }
+    */
 #endif
     {
         auto fl = std::make_unique<CpuGenerator<float, mnd::NONE, true>>();
         auto db = std::make_unique<CpuGenerator<double, mnd::NONE, true>>();
-        cpuGenerators.insert({ std::pair{ Precision::FLOAT, HardwareFeature::NONE }, std::move(fl) });
-        cpuGenerators.insert({ std::pair{ Precision::DOUBLE, HardwareFeature::NONE }, std::move(db) });
+        cpuGenerators.insert({ std::pair{ Precision::FLOAT, HardwareFeature::NONE }, fl.get() });
+        cpuGenerators.insert({ std::pair{ Precision::DOUBLE, HardwareFeature::NONE }, db.get() });
 
         auto fx64 = std::make_unique<CpuGenerator<Fixed64, mnd::NONE, true>>();
         auto fx128 = std::make_unique<CpuGenerator<Fixed128, mnd::NONE, true>>();
-        cpuGenerators.insert({ std::pair{ Precision::FIXED64, HardwareFeature::NONE }, std::move(fx64) });
-        cpuGenerators.insert({ std::pair{ Precision::FIXED128, HardwareFeature::NONE }, std::move(fx128) });
+        cpuGenerators.insert({ std::pair{ Precision::FIXED64, HardwareFeature::NONE }, fx64.get() });
+        cpuGenerators.insert({ std::pair{ Precision::FIXED128, HardwareFeature::NONE }, fx128.get() });
+
+        defaultGenerators.push_back(std::move(fl));
+        defaultGenerators.push_back(std::move(db));
+        defaultGenerators.push_back(std::move(fx64));
+        defaultGenerators.push_back(std::move(fx128));
     }
 
+    /*
 #ifdef WITH_BOOST
     auto quad = std::make_unique<CpuGenerator<Float128, mnd::NONE, true>>();
     auto oct = std::make_unique<CpuGenerator<Float256, mnd::NONE, true>>();
@@ -151,7 +164,7 @@ MandelContext::MandelContext(void)
 
     auto fix512 = std::make_unique<CpuGenerator<Fixed512, mnd::NONE, true>>();
     cpuGenerators.insert({ std::pair{ Precision::FIXED512, HardwareFeature::NONE }, std::move(fix512) });
-
+    */
     devices = createDevices();
 
     adaptiveGenerator = createAdaptiveGenerator();
@@ -335,6 +348,22 @@ MandelContext::~MandelContext(void)
 }
 
 
+///
+/// \brief registers all generators offered by a plugin and takes ownership
+///        of the plugin
+///
+/// Every generator is entered into cpuGenerators and, if the adaptive
+/// generator already exists, added to it as well. The plugin is stored in
+/// loadedPlugins afterwards because the registered pointers refer to
+/// objects that live inside the plugin.
+///
+/// \param cp the plugin to load; a null pointer or a plugin that offers no
+///           generators is reported and discarded
+///
+void MandelContext::loadPlugin(std::unique_ptr<CalcPlugin> cp)
+{
+    static const std::vector<MandelGenerator*> none;
+    const std::vector<MandelGenerator*>& gens = cp ? cp->getGenerators() : none;
+
+    if (gens.empty()) {
+        printf("ouh nouh\n");
+        return;
+    }
+
+    for (auto* gen : gens) {
+        if (gen == nullptr) {
+            continue;
+        }
+        cpuGenerators.insert({ GeneratorType{ gen->getType(), gen->getExtension() }, gen });
+        if (adaptiveGenerator) {
+            adaptiveGenerator->addGenerator(*gen);
+        }
+    }
+
+    // hand over ownership exactly once, after the last use of `gens`
+    loadedPlugins.push_back(std::move(cp));
+}
+
+
 AdaptiveGenerator& MandelContext::getDefaultGenerator(void)
 {
     return *adaptiveGenerator;
@@ -357,7 +386,7 @@ MandelGenerator* MandelContext::getCpuGenerator(mnd::Precision type, mnd::Hardwa
 {
     auto it = cpuGenerators.find({ type, ex });
     if (it != cpuGenerators.end())
-        return it->second.get();
+        return it->second;
     else
         return nullptr;
 }
@@ -378,7 +407,7 @@ std::vector<MandelGenerator*> MandelContext::getCpuGenerators(mnd::Precision pre
     std::vector<MandelGenerator*> generators;
     for (const auto& [type, gen] : cpuGenerators) {
         if (type.first == prec)
-            generators.push_back(gen.get());
+            generators.push_back(gen);
     }
     return generators;
 }

+ 30 - 0
libmandel/src/plugins/CpuGeneratorsAVX512.cpp

@@ -0,0 +1,30 @@
+#include <immintrin.h>
+#include <omp.h>
+
+#include <vector>
+#include <cmath>
+#include "CpuGenerators.h"
+
+/// Generator registered under float precision and the AVX-512 hardware feature.
+class CpuGeneratorFloatAVX512 : public mnd::MandelGenerator
+{
+public:
+    CpuGeneratorFloatAVX512() :
+        MandelGenerator{ mnd::Precision::FLOAT, mnd::X86_AVX_512 }
+    {}
+
+    void generate(const mnd::MandelInfo& info, float* data) override;
+};
+
+
+/// Plugin entry point: returns the list of generators this library provides.
+/// Both the generator and the list are function-local statics, so the
+/// returned reference stays usable after the call returns.
+extern "C" const std::vector<mnd::MandelGenerator*>& mandel_get_generators(void)
+{
+    static CpuGeneratorFloatAVX512 generator;
+    static std::vector<mnd::MandelGenerator*> provided { &generator };
+    return provided;
+}
+
+
+void CpuGeneratorFloatAVX512::generate(const mnd::MandelInfo& info, float* data)
+{   // Stub: the body is empty, so `info` is not read and `data` is not written.
+}

+ 204 - 0
libmandel/src/plugins/CpuGeneratorsAVXFMA.cpp

@@ -0,0 +1,204 @@
+#include <immintrin.h>
+#include <omp.h>
+
+#include <cmath>
+#include <vector>
+#include "CpuGenerators.h"
+
+#include "LightDoubleDouble.h"
+#include "QuadDouble.h"
+#include "HexDouble.h"
+
+/// Generator registered under float precision and the AVX+FMA hardware feature.
+class CpuGeneratorFloatAVXFMA : public mnd::MandelGenerator
+{
+public:
+    CpuGeneratorFloatAVXFMA() :
+        MandelGenerator{ mnd::Precision::FLOAT, mnd::X86_AVX_FMA }
+    {}
+
+    void generate(const mnd::MandelInfo& info, float* data) override;
+};
+
+
+/// Plugin entry point: returns the list of generators this library provides.
+/// Both the generator and the list are function-local statics, so the
+/// returned reference stays usable after the call returns.
+extern "C" const std::vector<mnd::MandelGenerator*>& mandel_get_generators(void)
+{
+    static CpuGeneratorFloatAVXFMA generator;
+    static std::vector<mnd::MandelGenerator*> provided { &generator };
+    return provided;
+}
+
+
+///
+/// \brief renders the view described by \p info into \p data using
+///        single-precision AVX/FMA arithmetic
+///
+/// Rows are distributed over OpenMP threads when OpenMP is enabled. Within
+/// a row, each pass of the pixel loop iterates 24 horizontally adjacent
+/// points at once, held in three 8-lane registers.
+///
+/// \param info view rectangle, bitmap size, iteration limit and the
+///             julia/smooth switches
+/// \param data output buffer, written at index x + y * info.bWidth
+///             (presumably holds info.bWidth * info.bHeight floats;
+///             confirm against the caller)
+///
+void CpuGeneratorFloatAVXFMA::generate(const mnd::MandelInfo& info, float* data)
+{
+    const bool parallel = true;
+
+    using T = float;
+
+    const auto& view = info.view;
+    const T vx = mnd::convert<T>(view.x);
+    const T vy = mnd::convert<T>(view.y);
+    const T vw = mnd::convert<T>(view.width);
+    const T vh = mnd::convert<T>(view.height);
+
+    const T jX = mnd::convert<T>(info.juliaX);
+    const T jY = mnd::convert<T>(info.juliaY);
+
+    // horizontal distance between two neighbouring pixels
+    const float dppf = float(vw / info.bWidth);
+    const float viewxf = vx;
+    __m256 viewx = { viewxf, viewxf, viewxf, viewxf, viewxf, viewxf, viewxf, viewxf };
+    __m256 dpp = { dppf, dppf, dppf, dppf, dppf, dppf, dppf, dppf };
+
+    __m256 juliaX = { jX, jX, jX, jX, jX, jX, jX, jX };
+    __m256 juliaY = { jY, jY, jY, jY, jY, jY, jY, jY };
+
+#if defined(_OPENMP)
+    if (parallel)
+        omp_set_num_threads(omp_get_num_procs());
+#   pragma omp parallel for schedule(static, 1) if (parallel)
+#endif
+    for (long j = 0; j < info.bHeight; j++) {
+        // Rows advance by view height per bitmap row, mirroring the
+        // horizontal step of view width per bitmap column.
+        T y = vy + T(j) * vh / info.bHeight;
+        __m256 ys = {y, y, y, y, y, y, y, y};
+        for (long i = 0; i < info.bWidth; i += 24) {
+            __m256 pixc = { float(i), float(i + 1), float(i + 2), float(i + 3), float(i + 4), float(i + 5), float(i + 6), float(i + 7) };
+            __m256 pixc2 = { float(i + 8), float(i + 9), float(i + 10), float(i + 11), float(i + 12), float(i + 13), float(i + 14), float(i + 15) };
+            __m256 pixc3 = { float(i + 16), float(i + 17), float(i + 18), float(i + 19), float(i + 20), float(i + 21), float(i + 22), float(i + 23) };
+
+            __m256 xs = _mm256_add_ps(_mm256_mul_ps(dpp, pixc), viewx);
+            __m256 xs2 = _mm256_add_ps(_mm256_mul_ps(dpp, pixc2), viewx);
+            __m256 xs3 = _mm256_add_ps(_mm256_mul_ps(dpp, pixc3), viewx);
+
+            // counter*: per-lane iteration count; adder*: 1 while the lane is
+            // still inside the threshold, 0 afterwards; results*: last (a, b)
+            // recorded for the smooth colouring
+            __m256 counter = { 0, 0, 0, 0, 0, 0, 0, 0 };
+            __m256 adder = { 1, 1, 1, 1, 1, 1, 1, 1 };
+            __m256 resultsa = { 0, 0, 0, 0, 0, 0, 0, 0 };
+            __m256 resultsb = { 0, 0, 0, 0, 0, 0, 0, 0 };
+
+            __m256 counter2 = { 0, 0, 0, 0, 0, 0, 0, 0 };
+            __m256 adder2 = { 1, 1, 1, 1, 1, 1, 1, 1 };
+            __m256 resultsa2 = { 0, 0, 0, 0, 0, 0, 0, 0 };
+            __m256 resultsb2 = { 0, 0, 0, 0, 0, 0, 0, 0 };
+
+            __m256 counter3 = { 0, 0, 0, 0, 0, 0, 0, 0 };
+            __m256 adder3 = { 1, 1, 1, 1, 1, 1, 1, 1 };
+            __m256 resultsa3 = { 0, 0, 0, 0, 0, 0, 0, 0 };
+            __m256 resultsb3 = { 0, 0, 0, 0, 0, 0, 0, 0 };
+
+            __m256 threshold = { 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f };
+            __m256 two = { 2, 2, 2, 2, 2, 2, 2, 2 };
+
+            __m256 a = xs;
+            __m256 a2 = xs2;
+            __m256 a3 = xs3;
+            __m256 b = ys;
+            __m256 b2 = ys;
+            __m256 b3 = ys;
+
+            // constant term of the iteration: the julia parameter or the pixel itself
+            __m256 cx = info.julia ? juliaX : xs;
+            __m256 cx2 = info.julia ? juliaX : xs2;
+            __m256 cx3 = info.julia ? juliaX : xs3;
+            __m256 cy = info.julia ? juliaY : ys;
+
+            if (info.smooth) {
+                // threshold <= threshold: every lane starts with its mask set
+                __m256 cmp = _mm256_cmp_ps(threshold, threshold, _CMP_LE_OQ);
+                __m256 cmp2 = _mm256_cmp_ps(threshold, threshold, _CMP_LE_OQ);
+                __m256 cmp3 = _mm256_cmp_ps(threshold, threshold, _CMP_LE_OQ);
+                for (int k = 0; k < info.maxIter; k++) {
+                    __m256 bb = _mm256_mul_ps(b, b);
+                    __m256 bb2 = _mm256_mul_ps(b2, b2);
+                    __m256 bb3 = _mm256_mul_ps(b3, b3);
+                    __m256 ab = _mm256_mul_ps(a, b);
+                    __m256 ab2 = _mm256_mul_ps(a2, b2);
+                    __m256 ab3 = _mm256_mul_ps(a3, b3);
+                    __m256 olda = a;
+                    __m256 olda2 = a2;
+                    __m256 olda3 = a3;
+                    a = _mm256_add_ps(_mm256_fmsub_ps(a, a, bb), cx);
+                    a2 = _mm256_add_ps(_mm256_fmsub_ps(a2, a2, bb2), cx2);
+                    a3 = _mm256_add_ps(_mm256_fmsub_ps(a3, a3, bb3), cx3);
+                    b = _mm256_fmadd_ps(two, ab, cy);
+                    b2 = _mm256_fmadd_ps(two, ab2, cy);
+                    b3 = _mm256_fmadd_ps(two, ab3, cy);
+                    // take over the new values only in lanes whose mask from
+                    // the previous iteration is still set
+                    resultsa = _mm256_blendv_ps(resultsa, a, cmp);
+                    resultsb = _mm256_blendv_ps(resultsb, b, cmp);
+                    resultsa2 = _mm256_blendv_ps(resultsa2, a2, cmp2);
+                    resultsb2 = _mm256_blendv_ps(resultsb2, b2, cmp2);
+                    resultsa3 = _mm256_blendv_ps(resultsa3, a3, cmp3);
+                    resultsb3 = _mm256_blendv_ps(resultsb3, b3, cmp3);
+                    cmp = _mm256_cmp_ps(_mm256_fmadd_ps(olda, olda, bb), threshold, _CMP_LE_OQ);
+                    cmp2 = _mm256_cmp_ps(_mm256_fmadd_ps(olda2, olda2, bb2), threshold, _CMP_LE_OQ);
+                    cmp3 = _mm256_cmp_ps(_mm256_fmadd_ps(olda3, olda3, bb3), threshold, _CMP_LE_OQ);
+                    adder = _mm256_and_ps(adder, cmp);
+                    counter = _mm256_add_ps(counter, adder);
+                    adder2 = _mm256_and_ps(adder2, cmp2);
+                    counter2 = _mm256_add_ps(counter2, adder2);
+                    adder3 = _mm256_and_ps(adder3, cmp3);
+                    counter3 = _mm256_add_ps(counter3, adder3);
+                    // every 8th iteration: stop once no lane in any of the
+                    // three registers has its mask set
+                    if ((k & 0x7) == 0 && _mm256_testz_ps(cmp, cmp) != 0 && _mm256_testz_ps(cmp2, cmp2) != 0 && _mm256_testz_ps(cmp3, cmp3) != 0) {
+                        break;
+                    }
+                }
+            }
+            else {
+                for (int k = 0; k < info.maxIter; k++) {
+                    __m256 bb = _mm256_mul_ps(b, b);
+                    __m256 bb2 = _mm256_mul_ps(b2, b2);
+                    __m256 bb3 = _mm256_mul_ps(b3, b3);
+                    __m256 ab = _mm256_mul_ps(a, b);
+                    __m256 ab2 = _mm256_mul_ps(a2, b2);
+                    __m256 ab3 = _mm256_mul_ps(a3, b3);
+                    __m256 cmp = _mm256_cmp_ps(_mm256_fmadd_ps(a, a, bb), threshold, _CMP_LE_OQ);
+                    __m256 cmp2 = _mm256_cmp_ps(_mm256_fmadd_ps(a2, a2, bb2), threshold, _CMP_LE_OQ);
+                    __m256 cmp3 = _mm256_cmp_ps(_mm256_fmadd_ps(a3, a3, bb3), threshold, _CMP_LE_OQ);
+                    a = _mm256_add_ps(_mm256_fmsub_ps(a, a, bb), cx);
+                    a2 = _mm256_add_ps(_mm256_fmsub_ps(a2, a2, bb2), cx2);
+                    a3 = _mm256_add_ps(_mm256_fmsub_ps(a3, a3, bb3), cx3);
+                    b = _mm256_fmadd_ps(two, ab, cy);
+                    b2 = _mm256_fmadd_ps(two, ab2, cy);
+                    b3 = _mm256_fmadd_ps(two, ab3, cy);
+                    adder = _mm256_and_ps(adder, cmp);
+                    counter = _mm256_add_ps(counter, adder);
+                    adder2 = _mm256_and_ps(adder2, cmp2);
+                    counter2 = _mm256_add_ps(counter2, adder2);
+                    adder3 = _mm256_and_ps(adder3, cmp3);
+                    counter3 = _mm256_add_ps(counter3, adder3);
+                    // every 8th iteration: stop once no lane in any of the
+                    // three registers has its mask set
+                    if ((k & 0x7) == 0 && _mm256_testz_ps(cmp, cmp) != 0 && _mm256_testz_ps(cmp2, cmp2) != 0 && _mm256_testz_ps(cmp3, cmp3) != 0) {
+                        break;
+                    }
+                }
+            }
+
+            // scratch layout: [0, 24) iteration counts, [24, 48) last a,
+            // [48, 72) last b
+            float resData[96];
+            float* ftRes = resData;
+            float* resa = ftRes + 24;
+            float* resb = resa + 24;
+
+            _mm256_storeu_ps(ftRes, counter);
+            _mm256_storeu_ps(ftRes + 8, counter2);
+            _mm256_storeu_ps(ftRes + 16, counter3);
+            _mm256_storeu_ps(resa, resultsa);
+            _mm256_storeu_ps(resa + 8, resultsa2);
+            _mm256_storeu_ps(resa + 16, resultsa3);
+            _mm256_storeu_ps(resb, resultsb);
+            _mm256_storeu_ps(resb + 8, resultsb2);
+            _mm256_storeu_ps(resb + 16, resultsb3);
+            // the last group of a row may extend past the bitmap; only
+            // pixels inside the row are written
+            for (int k = 0; k < 24 && i + k < info.bWidth; k++) {
+                if (info.smooth) {
+                    data[i + k + j * info.bWidth] = ftRes[k] < 0 ? info.maxIter :
+                        ftRes[k] >= info.maxIter ? info.maxIter :
+                        ((float)ftRes[k]) + 1 - ::log2(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2);
+                }
+                else {
+                    data[i + k + j * info.bWidth] = ftRes[k] < 0 ? info.maxIter : ftRes[k];
+                }
+            }
+        }
+    }
+}
+

BIN
libmandel/src/plugins/avx512.so


+ 9 - 0
src/Almond.cpp

@@ -16,6 +16,15 @@ Almond::Almond(QWidget* parent) :
     QMainWindow{ parent, Qt::WindowFlags() },
     mandelContext{ mnd::initializeContext() }
 {
+    std::unique_ptr<mnd::CalcPlugin> cp =
+            std::make_unique<mnd::CalcPlugin>("./plugins/libavxfma.so");
+    if (!cp->isValid()) {
+        //exit(1);
+    }
+    else {
+        mandelContext.loadPlugin(std::move(cp));
+    }
+
     ui.setupUi(this);
     fractalWidget = new FractalWidget(this);
     fractalWidget->setGenerator(&mandelContext.getDefaultGenerator());