há 6 anos atrás · fa400aa147
--- a/libmandel/CMakeLists.txt
+++ b/libmandel/CMakeLists.txt
@@ -4,6 +4,7 @@ cmake_minimum_required(VERSION 3.9)
 
				 
			
 
				 set(ARCH "X86_64" CACHE STRING "Target Architecture")
			
 
				 
			
 
				+message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")  # expand the variable; the bare name only printed the literal text
			
 
				 
			
 
				 project(mandel VERSION 1.0.0 DESCRIPTION "library for mandelbrot calculations")
			
 
				 
			
@@ -25,15 +26,22 @@ FILE(GLOB MandelHeaders include/*.h)
 
				 
			
 
				 if (ARCH STREQUAL "X86_64" OR ARCH STREQUAL "X86")
			
 
				     list(APPEND MandelSources src/CpuGeneratorsAVX.cpp src/CpuGeneratorsSSE2.cpp)
			
 
				+elseif(ARCH STREQUAL "ARM")
			
 
				+    list(APPEND MandelSources src/CpuGeneratorsNeon.cpp)
			
 
				 endif()
			
 
				 
			
 
				 #    message(${MandelSources})
			
 
				 
			
 
				-include_directories(
			
 
				-    "include"
			
 
				-    ${OpenCL_INCLUDE_DIRS}
			
 
				-)
			
 
				-link_directories(${OpenCL_LIBRARY})
			
 
				+if(OpenCL_FOUND)  # same spelling as the APPLE/OpenCL check in this file; variable names are case-sensitive
			
 
				+    add_definitions(-DWITH_OPENCL)  # directory-scoped: target "mandel" does not exist until add_library() further down
			
 
				+    include_directories(
			
 
				+        "include"
			
 
				+        ${OpenCL_INCLUDE_DIRS}
			
 
				+    )
			
 
				+    link_directories(${OpenCL_LIBRARY})
			
 
				+else()  # no OpenCL: only the project's own headers are needed
			
 
				+    include_directories("include")
			
 
				+endif()
			
 
				 
			
 
				 if (APPLE AND OpenCL_FOUND)
			
 
				     SET(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -framework OpenCL")
			
@@ -52,6 +60,8 @@ if (ARCH STREQUAL "X86_64" OR ARCH STREQUAL "X86")
 
				     else()
			
 
				         set_source_files_properties(src/CpuGeneratorsSSE2.cpp PROPERTIES COMPILE_FLAGS -msse2)
			
 
				     endif(MSVC)
			
 
				+elseif(ARCH STREQUAL "ARM")
			
 
				+    #set_source_files_properties(src/CpuGeneratorsNeon.cpp PROPERTIES COMPILE_FLAGS -mfpu=neon)
			
 
				 endif()
			
 
				 
			
 
				 add_library(mandel STATIC ${MandelSources})
			
--- a/libmandel/include/ClGenerators.h
+++ b/libmandel/include/ClGenerators.h
@@ -1,6 +1,8 @@
 
				 #ifndef MANDEL_CLGENERATORS_H
			
 
				 #define MANDEL_CLGENERATORS_H
			
 
				 
			
 
				+#ifdef WITH_OPENCL
			
 
				+
			
 
				 #include "Generators.h"
			
 
				 
			
 
				 #ifdef __APPLE__
			
@@ -70,4 +72,6 @@ protected:
 
				     virtual std::string getKernelCode(void) const;
			
 
				 };
			
 
				 
			
 
				+#endif // WITH_OPENCL
			
 
				+
			
 
				 #endif // MANDEL_CLGENERATORS_H
			
--- a/libmandel/include/CpuGenerators.h
+++ b/libmandel/include/CpuGenerators.h
@@ -18,6 +18,9 @@ namespace mnd
 
				 
			
 
				     class CpuGeneratorAvx512Float;
			
 
				     class CpuGeneratorAvx512Double;
			
 
				+#elif defined(__arm__) || defined(__aarch64__) || defined(_M_ARM)
			
 
				+    class CpuGeneratorNeonFloat;
			
 
				+    class CpuGeneratorNeonDouble;
			
 
				 #endif
			
 
				 }
			
 
				 
			
@@ -85,6 +88,21 @@ class mnd::CpuGeneratorAvx512Double : public Generator
 
				 public:
			
 
				     virtual void generate(const MandelInfo& info, float* data);
			
 
				 };
			
 
				-#endif 
			
 
				+
			
 
				+#elif defined(__arm__) || defined(__aarch64__) || defined(_M_ARM)
			
 
				+
			
 
				+class mnd::CpuGeneratorNeonFloat : public Generator
			
 
				+{
			
 
				+public:
			
 
				+    virtual void generate(const MandelInfo& info, float* data);
			
 
				+};
			
 
				+
			
 
				+
			
 
				+class mnd::CpuGeneratorNeonDouble : public Generator
			
 
				+{
			
 
				+public:
			
 
				+    virtual void generate(const MandelInfo& info, float* data);
			
 
				+};
			
 
				+#endif
			
 
				 
			
 
				 #endif // MANDEL_CPUGENERATORS_H
			
--- a/libmandel/src/ClGenerators.cpp
+++ b/libmandel/src/ClGenerators.cpp
@@ -1,5 +1,7 @@
 
				 #include "ClGenerators.h"
			
 
				 
			
 
				+#ifdef WITH_OPENCL
			
 
				+
			
 
				 #include <iostream>
			
 
				 #include <iterator>
			
 
				 
			
@@ -330,4 +332,8 @@ std::string ClGenerator128::getKernelCode(void) const
 
				         std::istreambuf_iterator<char>());
			
 
				     //fprintf(stderr, "%s\n", str);
			
 
				     return str;
			
 
				-}
			
 
				+}
			
 
				+
			
 
				+
			
 
				+#endif // WITH_OPENCL
			
 
				+
			
--- a/libmandel/src/CpuGeneratorsNeon.cpp
+++ b/libmandel/src/CpuGeneratorsNeon.cpp
@@ -0,0 +1,132 @@
 
				+#include "CpuGenerators.h"
			
 
				+
			
 
				+#include <omp.h>
			
 
				+#include <arm_neon.h>
			
 
				+#include <memory>
			
 
				+
			
 
				+using mnd::CpuGeneratorNeonFloat;
			
 
				+using mnd::CpuGeneratorNeonDouble;
			
 
				+
			
 
				+
			
 
				+void CpuGeneratorNeonFloat::generate(const mnd::MandelInfo& info, float* data)
			
 
				+{
			
 
				+    using T = float;
			
 
				+    const MandelViewport& view = info.view;
			
 
				+    omp_set_num_threads(2 * omp_get_num_procs());
			
 
				+#pragma omp parallel for
			
 
				+    for (long j = 0; j < info.bHeight; j++) {
			
 
				+        T y = T(view.y) + T(j) * T(view.height / info.bHeight);
			
 
				+        long i = 0;
			
 
				+        for (i; i < info.bWidth; i += 4) {
			
 
				+            float xsvals[] = {
			
 
				+                float(view.x + double(i) * view.width / info.bWidth),
			
 
				+                float(view.x + double(i + 1) * view.width / info.bWidth),
			
 
				+                float(view.x + double(i + 2) * view.width / info.bWidth),
			
 
				+                float(view.x + double(i + 3) * view.width / info.bWidth)
			
 
				+            };
			
 
				+
			
 
				+            float32x4_t xs = vld1q_f32(xsvals);
			
 
				+
			
 
				+
			
 
				+            uint32x4_t counter = vmovq_n_u32(0);
			
 
				+            uint32x4_t adder = vmovq_n_u32(1);
			
 
				+            //uint32x4_t ones = vmovq_n_u32(1);
			
 
				+
			
 
				+            float32x4_t threshold = vmovq_n_f32(16);
			
 
				+
			
 
				+            float32x4_t ys = vmovq_n_f32(y);
			
 
				+            float32x4_t a = xs;
			
 
				+            float32x4_t b = ys;
			
 
				+
			
 
				+            for (int k = 0; k < info.maxIter; k++) {
			
 
				+                float32x4_t aa = vmulq_f32(a, a);
			
 
				+                float32x4_t bb = vmulq_f32(b, b);
			
 
				+                float32x4_t abab = vmulq_f32(a, b); abab = vaddq_f32(abab, abab);
			
 
				+                a = vaddq_f32(vsubq_f32(aa, bb), xs);
			
 
				+                b = vaddq_f32(abab, ys);
			
 
				+                uint32x4_t cmp = vcleq_f32(vaddq_f32(aa, bb), threshold);
			
 
				+                adder = vandq_u32(adder, cmp);
			
 
				+                counter = vaddq_u32(counter, adder);
			
 
				+                // checking for break criterion is possibly expensive, only do it every 8 iterations
			
 
				+                if ((k & 7) == 0) {
			
 
				+                    /* // ARM-v7 method
			
 
				+                    uint32x2_t allZero = vorr_u32(vget_low_u32(cmp), vget_high_u32(cmp));
			
 
				+                    if (vget_lane_u32(vpmax_u32(allZero, allZero), 0) == 0) {
			
 
				+                        break;
			
 
				+                    }
			
 
				+                    */
			
 
				+                    uint32_t allZero = vaddvq_u32(cmp);
			
 
				+                    if (allZero == 0) {
			
 
				+                        break;
			
 
				+                    }
			
 
				+                }
			
 
				+            }
			
 
				+
			
 
				+            uint32_t resData[4];
			
 
				+            vst1q_u32(resData, counter);
			
 
				+            for (int k = 0; k < 4 && i + k < info.bWidth; k++)
			
 
				+                data[i + k + j * info.bWidth] = resData[k] > 0 ? resData[k] : info.maxIter;
			
 
				+        }
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+
			
 
				+void CpuGeneratorNeonDouble::generate(const mnd::MandelInfo& info, float* data)
			
 
				+{
			
 
				+    using T = double;
			
 
				+    const MandelViewport& view = info.view;
			
 
				+    omp_set_num_threads(2 * omp_get_num_procs());
			
 
				+#pragma omp parallel for
			
 
				+    for (long j = 0; j < info.bHeight; j++) {
			
 
				+        T y = T(view.y) + T(j) * T(view.height / info.bHeight);
			
 
				+        long i = 0;
			
 
				+        for (i; i < info.bWidth; i += 2) {
			
 
				+            double xsvals[] = {
			
 
				+                (view.x + double(i) * view.width / info.bWidth),
			
 
				+                (view.x + double(i + 1) * view.width / info.bWidth),
			
 
				+            };
			
 
				+
			
 
				+            float64x2_t xs = vld1q_f64(xsvals);
			
 
				+
			
 
				+
			
 
				+            uint64x2_t counter = vmovq_n_u64(0);
			
 
				+            uint64x2_t adder = vmovq_n_u64(1);
			
 
				+            //uint32x4_t ones = vmovq_n_u32(1);
			
 
				+
			
 
				+            float64x2_t threshold = vmovq_n_f64(16);
			
 
				+
			
 
				+            float64x2_t ys = vmovq_n_f64(y);
			
 
				+            float64x2_t a = xs;
			
 
				+            float64x2_t b = ys;
			
 
				+
			
 
				+            for (int k = 0; k < info.maxIter; k++) {
			
 
				+                float64x2_t aa = vmulq_f64(a, a);
			
 
				+                float64x2_t bb = vmulq_f64(b, b);
			
 
				+                float64x2_t abab = vmulq_f64(a, b); abab = vaddq_f64(abab, abab);
			
 
				+                a = vaddq_f64(vsubq_f64(aa, bb), xs);
			
 
				+                b = vaddq_f64(abab, ys);
			
 
				+                uint64x2_t cmp = vcleq_f64(vaddq_f64(aa, bb), threshold);
			
 
				+                adder = vandq_u64(adder, cmp);
			
 
				+                counter = vaddq_u64(counter, adder);
			
 
				+                // checking for break criterion is possibly expensive, only do it every 8 iterations
			
 
				+                if ((k & 7) == 0) {
			
 
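				+                    /* // ARM-v7 method as copied from the float version: it uses 32-bit-lane intrinsics and does not apply to the uint64x2_t cmp as written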
			
 
				+                    uint32x2_t allZero = vorr_u32(vget_low_u32(cmp), vget_high_u32(cmp));
			
 
				+                    if (vget_lane_u32(vpmax_u32(allZero, allZero), 0) == 0) {
			
 
				+                        break;
			
 
				+                    }
			
 
				+                    */
			
 
				+                    uint64_t allZero = vaddvq_u64(cmp);
			
 
				+                    if (allZero == 0) {
			
 
				+                        break;
			
 
				+                    }
			
 
				+                }
			
 
				+            }
			
 
				+
			
 
				+            uint64_t resData[2];
			
 
				+            vst1q_u64(resData, counter);
			
 
				+            for (int k = 0; k < 2 && i + k < info.bWidth; k++)
			
 
				+                data[i + k + j * info.bWidth] = resData[k] > 0 ? resData[k] : info.maxIter;
			
 
				+        }
			
 
				+    }
			
 
				+}
			
--- a/libmandel/src/mandel.cpp
+++ b/libmandel/src/mandel.cpp
@@ -63,6 +63,12 @@ MandelContext::MandelContext(void)
 
				         cpuGeneratorDouble = std::make_unique<CpuGeneratorSse2Double>();
			
 
				     }
			
 
				     else
			
 
				+#elif defined(__aarch64__)
			
 
				+    if (true) {
			
 
				+        cpuGeneratorFloat = std::make_unique<CpuGeneratorNeonFloat>();
			
 
				+        cpuGeneratorDouble = std::make_unique<CpuGeneratorNeonDouble>();
			
 
				+    }
			
 
				+    else
			
 
				 #endif
			
 
				     {
			
 
				         cpuGeneratorFloat = std::make_unique<CpuGeneratorFloat>();
			
@@ -78,7 +84,7 @@ MandelContext::MandelContext(void)
 
				 std::vector<MandelDevice> MandelContext::createDevices(void)
			
 
				 {
			
 
				     std::vector<MandelDevice> mandelDevices;
			
 
				-
			
 
				+#ifdef WITH_OPENCL
			
 
				     std::vector<cl::Platform> platforms;
			
 
				     cl::Platform::get(&platforms);
			
 
				     platforms.erase(platforms.begin() + 1);
			
@@ -133,6 +139,7 @@ std::vector<MandelDevice> MandelContext::createDevices(void)
 
				             mandelDevices.push_back(std::move(md));
			
 
				         }
			
 
				     }
			
 
				+#endif // WITH_OPENCL
			
 
				     
			
 
				     return mandelDevices;
			
 
				 }