|
@@ -0,0 +1,132 @@
|
|
|
|
+#include "CpuGenerators.h"
|
|
|
|
+
|
|
|
|
+#include <omp.h>
|
|
|
|
+#include <arm_neon.h>
|
|
|
|
+#include <memory>
|
|
|
|
+
|
|
|
|
+using mnd::CpuGeneratorNeonFloat;
|
|
|
|
+using mnd::CpuGeneratorNeonDouble;
|
|
|
|
+
|
|
|
|
+
|
|
|
|
/// Renders Mandelbrot iteration counts for `info.view` into `data` using
/// 128-bit NEON vectors, four single-precision pixels at a time.
///
/// @param info  viewport rectangle, buffer dimensions and iteration limit
/// @param data  output buffer of info.bWidth * info.bHeight floats; each
///              element receives the lane's iteration count (maxIter for
///              points that never escape)
void CpuGeneratorNeonFloat::generate(const mnd::MandelInfo& info, float* data)
{
    using T = float;
    const MandelViewport& view = info.view;

    // Oversubscribe threads so rows with deep iteration counts do not leave
    // cores idle while cheap rows finish early.
    omp_set_num_threads(2 * omp_get_num_procs());

    // Loop-invariant horizontal pixel pitch, hoisted so the inner loop does
    // not redo the division for every pixel.
    const double dpp = view.width / info.bWidth;

#pragma omp parallel for
    for (long j = 0; j < info.bHeight; j++) {
        const T y = T(view.y) + T(j) * T(view.height / info.bHeight);
        for (long i = 0; i < info.bWidth; i += 4) {
            // Compute the four x coordinates in double, then narrow to float.
            const float xsvals[] = {
                float(view.x + double(i) * dpp),
                float(view.x + double(i + 1) * dpp),
                float(view.x + double(i + 2) * dpp),
                float(view.x + double(i + 3) * dpp)
            };
            float32x4_t xs = vld1q_f32(xsvals);

            // Per-lane iteration counter; `adder` drops to 0 in a lane once
            // that lane's point escapes, freezing its count thereafter.
            uint32x4_t counter = vmovq_n_u32(0);
            uint32x4_t adder = vmovq_n_u32(1);

            float32x4_t threshold = vmovq_n_f32(16);  // escape radius^2, i.e. |z| > 4

            float32x4_t ys = vmovq_n_f32(y);
            float32x4_t a = xs;  // real part of z
            float32x4_t b = ys;  // imaginary part of z

            for (int k = 0; k < info.maxIter; k++) {
                // z = z^2 + c across all four lanes
                float32x4_t aa = vmulq_f32(a, a);
                float32x4_t bb = vmulq_f32(b, b);
                float32x4_t abab = vmulq_f32(a, b); abab = vaddq_f32(abab, abab);
                a = vaddq_f32(vsubq_f32(aa, bb), xs);
                b = vaddq_f32(abab, ys);
                // cmp lane is all-ones while |z|^2 <= threshold (still iterating)
                uint32x4_t cmp = vcleq_f32(vaddq_f32(aa, bb), threshold);
                adder = vandq_u32(adder, cmp);
                counter = vaddq_u32(counter, adder);
                // checking for break criterion is possibly expensive, only do
                // it every 8 iterations
                if ((k & 7) == 0) {
                    /* // ARM-v7 method
                    uint32x2_t allZero = vorr_u32(vget_low_u32(cmp), vget_high_u32(cmp));
                    if (vget_lane_u32(vpmax_u32(allZero, allZero), 0) == 0) {
                        break;
                    }
                    */
                    // AArch64: horizontal add is 0 only when every lane escaped
                    uint32_t allZero = vaddvq_u32(cmp);
                    if (allZero == 0) {
                        break;
                    }
                }
            }

            uint32_t resData[4];
            vst1q_u32(resData, counter);
            // Guard against a partial vector at the right edge of the row.
            // NOTE(review): a count of 0 (escape before the first increment)
            // is mapped to maxIter — confirm this is the intended palette
            // convention shared with the scalar generators.
            for (int k = 0; k < 4 && i + k < info.bWidth; k++)
                data[i + k + j * info.bWidth] = resData[k] > 0 ? resData[k] : info.maxIter;
        }
    }
}
|
|
|
|
+
|
|
|
|
+
|
|
|
|
/// Renders Mandelbrot iteration counts for `info.view` into `data` using
/// 128-bit NEON vectors, two double-precision pixels at a time.
///
/// @param info  viewport rectangle, buffer dimensions and iteration limit
/// @param data  output buffer of info.bWidth * info.bHeight floats; each
///              element receives the lane's iteration count (maxIter for
///              points that never escape)
void CpuGeneratorNeonDouble::generate(const mnd::MandelInfo& info, float* data)
{
    using T = double;
    const MandelViewport& view = info.view;

    // Oversubscribe threads so rows with deep iteration counts do not leave
    // cores idle while cheap rows finish early.
    omp_set_num_threads(2 * omp_get_num_procs());

    // Loop-invariant horizontal pixel pitch, hoisted so the inner loop does
    // not redo the division for every pixel.
    const double dpp = view.width / info.bWidth;

#pragma omp parallel for
    for (long j = 0; j < info.bHeight; j++) {
        const T y = T(view.y) + T(j) * T(view.height / info.bHeight);
        for (long i = 0; i < info.bWidth; i += 2) {
            const double xsvals[] = {
                (view.x + double(i) * dpp),
                (view.x + double(i + 1) * dpp),
            };
            float64x2_t xs = vld1q_f64(xsvals);

            // Per-lane iteration counter; `adder` drops to 0 in a lane once
            // that lane's point escapes, freezing its count thereafter.
            uint64x2_t counter = vmovq_n_u64(0);
            uint64x2_t adder = vmovq_n_u64(1);

            float64x2_t threshold = vmovq_n_f64(16);  // escape radius^2, i.e. |z| > 4

            float64x2_t ys = vmovq_n_f64(y);
            float64x2_t a = xs;  // real part of z
            float64x2_t b = ys;  // imaginary part of z

            for (int k = 0; k < info.maxIter; k++) {
                // z = z^2 + c across both lanes
                float64x2_t aa = vmulq_f64(a, a);
                float64x2_t bb = vmulq_f64(b, b);
                float64x2_t abab = vmulq_f64(a, b); abab = vaddq_f64(abab, abab);
                a = vaddq_f64(vsubq_f64(aa, bb), xs);
                b = vaddq_f64(abab, ys);
                // cmp lane is all-ones while |z|^2 <= threshold (still iterating)
                uint64x2_t cmp = vcleq_f64(vaddq_f64(aa, bb), threshold);
                adder = vandq_u64(adder, cmp);
                counter = vaddq_u64(counter, adder);
                // checking for break criterion is possibly expensive, only do
                // it every 8 iterations
                if ((k & 7) == 0) {
                    // AArch64: horizontal add is 0 only when every lane escaped
                    // (the 32-bit ARM-v7 vorr/vpmax trick is not needed here)
                    uint64_t allZero = vaddvq_u64(cmp);
                    if (allZero == 0) {
                        break;
                    }
                }
            }

            uint64_t resData[2];
            vst1q_u64(resData, counter);
            // Guard against a partial vector at the right edge of the row.
            // NOTE(review): a count of 0 (escape before the first increment)
            // is mapped to maxIter — confirm this is the intended palette
            // convention shared with the scalar generators.
            for (int k = 0; k < 2 && i + k < info.bWidth; k++)
                data[i + k + j * info.bWidth] = resData[k] > 0 ? resData[k] : info.maxIter;
        }
    }
}
|