Ver Fonte

added neon support

Nicolas Winkler há 6 anos atrás
pai
commit
fa400aa147

+ 15 - 5
libmandel/CMakeLists.txt

@@ -4,6 +4,7 @@ cmake_minimum_required(VERSION 3.9)
 
 set(ARCH "X86_64" CACHE STRING "Target Architecture")
 
+message(CMAKE_SYSTEM_PROCESSOR)
 
 project(mandel VERSION 1.0.0 DESCRIPTION "library for mandelbrot calculations")
 
@@ -25,15 +26,22 @@ FILE(GLOB MandelHeaders include/*.h)
 
 if (ARCH STREQUAL "X86_64" OR ARCH STREQUAL "X86")
     list(APPEND MandelSources src/CpuGeneratorsAVX.cpp src/CpuGeneratorsSSE2.cpp)
+elseif(ARCH STREQUAL "ARM")
+    list(APPEND MandelSources src/CpuGeneratorsNeon.cpp)
 endif()
 
 #    message(${MandelSources})
 
-include_directories(
-    "include"
-    ${OpenCL_INCLUDE_DIRS}
-)
-link_directories(${OpenCL_LIBRARY})
+if(OPENCL_FOUND)
+    target_compile_definitions(mandel WITH_OPENCL)
+    include_directories(
+        "include"
+        ${OpenCL_INCLUDE_DIRS}
+    )
+    link_directories(${OpenCL_LIBRARY})
+else(OPENCL_FOUND)
+    include_directories("include")
+endif(OPENCL_FOUND)
 
 if (APPLE AND OpenCL_FOUND)
     SET(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -framework OpenCL")
@@ -52,6 +60,8 @@ if (ARCH STREQUAL "X86_64" OR ARCH STREQUAL "X86")
     else()
         set_source_files_properties(src/CpuGeneratorsSSE2.cpp PROPERTIES COMPILE_FLAGS -msse2)
     endif(MSVC)
+elseif(ARCH STREQUAL "ARM")
+    #set_source_files_properties(src/CpuGeneratorsNeon.cpp PROPERTIES COMPILE_FLAGS -mfpu=neon)
 endif()
 
 add_library(mandel STATIC ${MandelSources})

+ 4 - 0
libmandel/include/ClGenerators.h

@@ -1,6 +1,8 @@
 #ifndef MANDEL_CLGENERATORS_H
 #define MANDEL_CLGENERATORS_H
 
+#ifdef WITH_OPENCL
+
 #include "Generators.h"
 
 #ifdef __APPLE__
@@ -70,4 +72,6 @@ protected:
     virtual std::string getKernelCode(void) const;
 };
 
+#endif // WITH_OPENCL
+
 #endif // MANDEL_CLGENERATORS_H

+ 19 - 1
libmandel/include/CpuGenerators.h

@@ -18,6 +18,9 @@ namespace mnd
 
     class CpuGeneratorAvx512Float;
     class CpuGeneratorAvx512Double;
+#elif defined(__arm__) || defined(__aarch64__) || defined(_M_ARM)
+    class CpuGeneratorNeonFloat;
+    class CpuGeneratorNeonDouble;
 #endif
 }
 
@@ -85,6 +88,21 @@ class mnd::CpuGeneratorAvx512Double : public Generator
 public:
     virtual void generate(const MandelInfo& info, float* data);
 };
-#endif 
+
+#elif defined(__arm__) || defined(__aarch64__) || defined(_M_ARM)
+
+class mnd::CpuGeneratorNeonFloat : public Generator
+{
+public:
+    virtual void generate(const MandelInfo& info, float* data);
+};
+
+
+class mnd::CpuGeneratorNeonDouble : public Generator
+{
+public:
+    virtual void generate(const MandelInfo& info, float* data);
+};
+#endif
 
 #endif // MANDEL_CPUGENERATORS_H

+ 7 - 1
libmandel/src/ClGenerators.cpp

@@ -1,5 +1,7 @@
 #include "ClGenerators.h"
 
+#ifdef WITH_OPENCL
+
 #include <iostream>
 #include <iterator>
 
@@ -330,4 +332,8 @@ std::string ClGenerator128::getKernelCode(void) const
         std::istreambuf_iterator<char>());
     //fprintf(stderr, "%s\n", str);
     return str;
-}
+}
+
+
+#endif // WITH_OPENCL
+

+ 132 - 0
libmandel/src/CpuGeneratorsNeon.cpp

@@ -0,0 +1,132 @@
+#include "CpuGenerators.h"
+
+#include <omp.h>
+#include <arm_neon.h>
+#include <memory>
+
+using mnd::CpuGeneratorNeonFloat;
+using mnd::CpuGeneratorNeonDouble;
+
+
+void CpuGeneratorNeonFloat::generate(const mnd::MandelInfo& info, float* data)
+{
+    using T = float;
+    const MandelViewport& view = info.view;
+    omp_set_num_threads(2 * omp_get_num_procs());
+#pragma omp parallel for
+    for (long j = 0; j < info.bHeight; j++) {
+        T y = T(view.y) + T(j) * T(view.height / info.bHeight);
+        long i = 0;
+        for (i; i < info.bWidth; i += 4) {
+            float xsvals[] = {
+                float(view.x + double(i) * view.width / info.bWidth),
+                float(view.x + double(i + 1) * view.width / info.bWidth),
+                float(view.x + double(i + 2) * view.width / info.bWidth),
+                float(view.x + double(i + 3) * view.width / info.bWidth)
+            };
+
+            float32x4_t xs = vld1q_f32(xsvals);
+
+
+            uint32x4_t counter = vmovq_n_u32(0);
+            uint32x4_t adder = vmovq_n_u32(1);
+            //uint32x4_t ones = vmovq_n_u32(1);
+
+            float32x4_t threshold = vmovq_n_f32(16);
+
+            float32x4_t ys = vmovq_n_f32(y);
+            float32x4_t a = xs;
+            float32x4_t b = ys;
+
+            for (int k = 0; k < info.maxIter; k++) {
+                float32x4_t aa = vmulq_f32(a, a);
+                float32x4_t bb = vmulq_f32(b, b);
+                float32x4_t abab = vmulq_f32(a, b); abab = vaddq_f32(abab, abab);
+                a = vaddq_f32(vsubq_f32(aa, bb), xs);
+                b = vaddq_f32(abab, ys);
+                uint32x4_t cmp = vcleq_f32(vaddq_f32(aa, bb), threshold);
+                adder = vandq_u32(adder, cmp);
+                counter = vaddq_u32(counter, adder);
+                // checking for break criterion is possibly expensive, only do it every 8 iterations
+                if ((k & 7) == 0) {
+                    /* // ARM-v7 method
+                    uint32x2_t allZero = vorr_u32(vget_low_u32(cmp), vget_high_u32(cmp));
+                    if (vget_lane_u32(vpmax_u32(allZero, allZero), 0) == 0) {
+                        break;
+                    }
+                    */
+                    uint32_t allZero = vaddvq_u32(cmp);
+                    if (allZero == 0) {
+                        break;
+                    }
+                }
+            }
+
+            uint32_t resData[4];
+            vst1q_u32(resData, counter);
+            for (int k = 0; k < 4 && i + k < info.bWidth; k++)
+                data[i + k + j * info.bWidth] = resData[k] > 0 ? resData[k] : info.maxIter;
+        }
+    }
+}
+
+
+void CpuGeneratorNeonDouble::generate(const mnd::MandelInfo& info, float* data)
+{
+    using T = double;
+    const MandelViewport& view = info.view;
+    omp_set_num_threads(2 * omp_get_num_procs());
+#pragma omp parallel for
+    for (long j = 0; j < info.bHeight; j++) {
+        T y = T(view.y) + T(j) * T(view.height / info.bHeight);
+        long i = 0;
+        for (i; i < info.bWidth; i += 2) {
+            double xsvals[] = {
+                (view.x + double(i) * view.width / info.bWidth),
+                (view.x + double(i + 1) * view.width / info.bWidth),
+            };
+
+            float64x2_t xs = vld1q_f64(xsvals);
+
+
+            uint64x2_t counter = vmovq_n_u64(0);
+            uint64x2_t adder = vmovq_n_u64(1);
+            //uint32x4_t ones = vmovq_n_u32(1);
+
+            float64x2_t threshold = vmovq_n_f64(16);
+
+            float64x2_t ys = vmovq_n_f64(y);
+            float64x2_t a = xs;
+            float64x2_t b = ys;
+
+            for (int k = 0; k < info.maxIter; k++) {
+                float64x2_t aa = vmulq_f64(a, a);
+                float64x2_t bb = vmulq_f64(b, b);
+                float64x2_t abab = vmulq_f64(a, b); abab = vaddq_f64(abab, abab);
+                a = vaddq_f64(vsubq_f64(aa, bb), xs);
+                b = vaddq_f64(abab, ys);
+                uint64x2_t cmp = vcleq_f64(vaddq_f64(aa, bb), threshold);
+                adder = vandq_u64(adder, cmp);
+                counter = vaddq_u64(counter, adder);
+                // checking for break criterion is possibly expensive, only do it every 8 iterations
+                if ((k & 7) == 0) {
+                    /* // ARM-v7 method
+                    uint32x2_t allZero = vorr_u32(vget_low_u32(cmp), vget_high_u32(cmp));
+                    if (vget_lane_u32(vpmax_u32(allZero, allZero), 0) == 0) {
+                        break;
+                    }
+                    */
+                    uint64_t allZero = vaddvq_u64(cmp);
+                    if (allZero == 0) {
+                        break;
+                    }
+                }
+            }
+
+            uint64_t resData[2];
+            vst1q_u64(resData, counter);
+            for (int k = 0; k < 2 && i + k < info.bWidth; k++)
+                data[i + k + j * info.bWidth] = resData[k] > 0 ? resData[k] : info.maxIter;
+        }
+    }
+}

+ 8 - 1
libmandel/src/mandel.cpp

@@ -63,6 +63,12 @@ MandelContext::MandelContext(void)
         cpuGeneratorDouble = std::make_unique<CpuGeneratorSse2Double>();
     }
     else
+#elif defined(__aarch64__)
+    if (true) {
+        cpuGeneratorFloat = std::make_unique<CpuGeneratorNeonFloat>();
+        cpuGeneratorDouble = std::make_unique<CpuGeneratorNeonDouble>();
+    }
+    else
 #endif
     {
         cpuGeneratorFloat = std::make_unique<CpuGeneratorFloat>();
@@ -78,7 +84,7 @@ MandelContext::MandelContext(void)
 std::vector<MandelDevice> MandelContext::createDevices(void)
 {
     std::vector<MandelDevice> mandelDevices;
-
+#ifdef WITH_OPENCL
     std::vector<cl::Platform> platforms;
     cl::Platform::get(&platforms);
     platforms.erase(platforms.begin() + 1);
@@ -133,6 +139,7 @@ std::vector<MandelDevice> MandelContext::createDevices(void)
             mandelDevices.push_back(std::move(md));
         }
     }
+#endif // WITH_OPENCL
     
     return mandelDevices;
 }