1
0
فهرست منبع

added benchmark tool

Nicolas Winkler 6 سال پیش
والد
کامیت
21bca5cd66
5فایلهای تغییر یافته به همراه248 افزوده شده و 16 حذف شده
  1. 10 8
      libmandel/CMakeLists.txt
  2. 82 0
      libmandel/src/CpuGeneratorsAVX512.cpp
  3. 7 8
      libmandel/src/mandel.cpp
  4. 11 0
      mandelbench/CMakeLists.txt
  5. 138 0
      mandelbench/mandelbench.cpp

+ 10 - 8
libmandel/CMakeLists.txt

@@ -1,12 +1,16 @@
 cmake_minimum_required(VERSION 3.9)
 
-set(CMAKE_CXX_STANDARD 17)
-OPTION(ARCH "Target Architecture" X86_64)
+
+
+set(ARCH "X86_64" CACHE STRING "Target Architecture")
+
+
 project(mandel VERSION 1.0.0 DESCRIPTION "library for mandelbrot calculations")
 
 find_package(OpenCL)
 find_package(OpenMP)
 
+set(CMAKE_CXX_STANDARD 17)
 
 #FILE(GLOB MandelSources src/*.cpp)
 SET(MandelSources
@@ -14,18 +18,16 @@ SET(MandelSources
     src/CpuGenerators.cpp
     src/Generators.cpp
     src/mandel.cpp
-    src/CpuGeneratorsAVX.cpp
-    src/CpuGeneratorsSSE2.cpp
     src/Hardware.cpp
     src/MandelUtil.cpp
 )
 FILE(GLOB MandelHeaders include/*.h)
 
-if (NOT (ARCH EQUAL X86_64 OR ARCH EQUAL X86))
-    list(REMOVE_ITEM MandelSources src/CpuGeneratorsAVX.cpp src/CpuGeneratorsSSE2.cpp)
+if (ARCH STREQUAL "X86_64" OR ARCH STREQUAL "X86")
+    list(APPEND MandelSources src/CpuGeneratorsAVX.cpp src/CpuGeneratorsSSE2.cpp)
 endif()
 
-    message(${MandelSources})
+#    message(${MandelSources})
 
 include_directories(
     "include"
@@ -59,5 +61,5 @@ if(OpenMP_CXX_FOUND)
     target_link_libraries(mandel PUBLIC OpenMP::OpenMP_CXX)
 endif()
 if(OpenCL_FOUND)
-    target_link_libraries(mandel OpenCL::OpenCL)
+    target_link_libraries(mandel PUBLIC OpenCL::OpenCL)
 endif()

+ 82 - 0
libmandel/src/CpuGeneratorsAVX512.cpp

@@ -0,0 +1,82 @@
+#include "CpuGenerators.h"
+
+#include <immintrin.h>
+#include <omp.h>
+
+#include <memory>
+
+using mnd::CpuGeneratorAvx512Float;
+using mnd::CpuGeneratorAvx512Double;
+
+/// Computes Mandelbrot iteration counts for the view in `info` using AVX-512,
+/// handling 16 horizontally adjacent pixels per vector.
+///
+/// For every pixel the counter is incremented on each iteration whose
+/// pre-update |z|^2 is <= 16. A final count of 0 is stored as `info.maxIter`.
+///
+/// @param info  supplies the viewport (`view`), the bitmap size (`bWidth`,
+///              `bHeight`) and the iteration limit (`maxIter`).
+/// @param data  output buffer, written at `data[column + row * info.bWidth]`;
+///              it must hold at least `info.bWidth * info.bHeight` floats.
+void CpuGeneratorAvx512Float::generate(const mnd::MandelInfo& info, float* data)
+{
+    using T = float;
+    constexpr long lanes = 16; // number of floats in one 512-bit register
+    const MandelViewport& view = info.view;
+    // Requests two threads per processor reported by OpenMP.
+    omp_set_num_threads(2 * omp_get_num_procs());
+#pragma omp parallel for
+    for (long j = 0; j < info.bHeight; j++) {
+        T y = T(view.y) + T(j) * T(view.height / info.bHeight);
+        for (long i = 0; i < info.bWidth; i += lanes) {
+            // x coordinate of every pixel in this group: evaluated in double,
+            // then rounded to float once per lane.
+            alignas(64) float xCoords[lanes];
+            for (long lane = 0; lane < lanes; lane++) {
+                xCoords[lane] = float(view.x + double(i + lane) * view.width / info.bWidth);
+            }
+            const __m512 xs = _mm512_load_ps(xCoords);
+            // Broadcast y into all 16 lanes (the previous initializer list only
+            // had 15 entries, leaving the last lane at zero).
+            const __m512 ys = _mm512_set1_ps(y);
+
+            __m512 counter = _mm512_setzero_ps();
+            const __m512 adder = _mm512_set1_ps(1.0f);
+            const __m512 threshold = _mm512_set1_ps(16.0f);
+
+            __m512 a = xs;
+            __m512 b = ys;
+
+            for (int k = 0; k < info.maxIter; k++) {
+                __m512 aa = _mm512_mul_ps(a, a);
+                __m512 bb = _mm512_mul_ps(b, b);
+                __m512 abab = _mm512_mul_ps(a, b); abab = _mm512_add_ps(abab, abab);
+                a = _mm512_add_ps(_mm512_sub_ps(aa, bb), xs);
+                b = _mm512_add_ps(abab, ys);
+                // Mask bit set for each lane that is still within the threshold.
+                __mmask16 cmp = _mm512_cmp_ps_mask(_mm512_add_ps(aa, bb), threshold, _CMP_LE_OQ);
+                counter = _mm512_mask_add_ps(counter, cmp, counter, adder);
+                if (cmp == 0) {
+                    // Every lane has exceeded the threshold; nothing left to count.
+                    break;
+                }
+            }
+
+            // _mm512_store_ps requires a 64-byte aligned destination.
+            alignas(64) float ftRes[lanes];
+            _mm512_store_ps(ftRes, counter);
+
+            // Copy all 16 lanes back, clipping the group at the right image edge.
+            for (long k = 0; k < lanes && i + k < info.bWidth; k++)
+                data[i + k + j * info.bWidth] = ftRes[k] > 0 ? ftRes[k] : info.maxIter;
+        }
+    }
+}
+
+
+// Double-precision AVX-512 generator. The body is empty: `info` is ignored and
+// `data` is left unmodified.
+void CpuGeneratorAvx512Double::generate(const mnd::MandelInfo& info, float* data)
+{
+}

+ 7 - 8
libmandel/src/mandel.cpp

@@ -79,7 +79,6 @@ std::vector<MandelDevice> MandelContext::createDevices(void)
 {
     std::vector<MandelDevice> mandelDevices;
 
-    /*
     std::vector<cl::Platform> platforms;
     cl::Platform::get(&platforms);
     platforms.erase(platforms.begin() + 1);
@@ -88,9 +87,9 @@ std::vector<MandelDevice> MandelContext::createDevices(void)
         std::string name = platform.getInfo<CL_PLATFORM_NAME>();
         std::string profile = platform.getInfo<CL_PLATFORM_PROFILE>();
 
-       std::string ext = platform.getInfo<CL_PLATFORM_EXTENSIONS>();
-        printf("Platform extensions: %s\n", ext.c_str());
-        printf("Platform: %s, %s\n", name.c_str(), profile.c_str());
+        //std::string ext = platform.getInfo<CL_PLATFORM_EXTENSIONS>();
+        //printf("Platform extensions: %s\n", ext.c_str());
+        //printf("Platform: %s, %s\n", name.c_str(), profile.c_str());
 
         std::vector<cl::Device> devices;
         platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
@@ -102,10 +101,10 @@ std::vector<MandelDevice> MandelContext::createDevices(void)
             std::string extensions = device.getInfo<CL_DEVICE_EXTENSIONS>();
             auto supportsDouble = extensions.find("cl_khr_fp64") != std::string::npos;
 
-            printf("Device extensions: %s\n", ext.c_str());
+            //printf("Device extensions: %s\n", ext.c_str());
             MandelDevice md;
 
-            printf("clock: %d", device.getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>());
+            //printf("clock: %d", device.getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>());
 
             md.name = device.getInfo<CL_DEVICE_NAME>();
             md.vendor = device.getInfo<CL_DEVICE_VENDOR>();
@@ -128,13 +127,13 @@ std::vector<MandelDevice> MandelContext::createDevices(void)
                 md.generator128 = std::make_unique<ClGenerator128>(device);
             }
             catch (const std::string& err) {
-                fprintf(stderr, "error creating 128bit cl generator: %s\n", err.c_str());
+                //fprintf(stderr, "error creating 128bit cl generator: %s\n", err.c_str());
             }
 
             mandelDevices.push_back(std::move(md));
         }
     }
-    */
+    
     return mandelDevices;
 }
 

+ 11 - 0
mandelbench/CMakeLists.txt

@@ -0,0 +1,11 @@
+cmake_minimum_required(VERSION 3.9)
+
+# This file is used as a top-level CMakeLists.txt (it pulls in libmandel itself),
+# so it needs its own direct project() call.
+project(mandelbench)
+
+# Build the library from the sibling directory as part of this build tree.
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../libmandel ${CMAKE_CURRENT_BINARY_DIR}/libmandel)
+
+# mandelbench.cpp uses structured bindings and if-with-initializer (C++17).
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+add_executable(mandelbench mandelbench.cpp)
+
+# Nothing links against an executable, so its usage requirements stay private.
+target_include_directories(mandelbench PRIVATE ../libmandel/include)
+
+target_link_libraries(mandelbench PRIVATE mandel)
+

+ 138 - 0
mandelbench/mandelbench.cpp

@@ -0,0 +1,138 @@
+#include <chrono>
+#include <cmath>
+#include <cstddef>
+#include <functional>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include <Mandel.h>
+
+/// Returns the fixed viewport that every benchmark level renders.
+constexpr mnd::MandelViewport benchViewport()
+{
+    return mnd::MandelViewport{
+        -1.250000598933854152929,
+        0.0001879894057291665530,
+        0.0000003839916666666565,
+        0.0000003839916666666565
+    };
+}
+
+// Benchmark levels, tried in order by benchmark(). Every level uses the same
+// viewport; the three integers grow from one entry to the next.
+// NOTE(review): the integers are presumably bWidth, bHeight and maxIter, in
+// that order -- confirm against the declaration of mnd::MandelInfo.
+const std::vector<mnd::MandelInfo> benches {
+    mnd::MandelInfo{ benchViewport(), 100, 100, 1000 },
+    mnd::MandelInfo{ benchViewport(), 100, 200, 1000 },
+    mnd::MandelInfo{ benchViewport(), 200, 200, 1000 },
+    mnd::MandelInfo{ benchViewport(), 200, 200, 2000 },
+    mnd::MandelInfo{ benchViewport(), 200, 400, 2000 },
+    mnd::MandelInfo{ benchViewport(), 400, 400, 2000 },
+    mnd::MandelInfo{ benchViewport(), 400, 400, 4000 },
+    mnd::MandelInfo{ benchViewport(), 400, 800, 4000 },
+    mnd::MandelInfo{ benchViewport(), 800, 800, 4000 },
+    mnd::MandelInfo{ benchViewport(), 800, 800, 8000 },
+    mnd::MandelInfo{ benchViewport(), 800, 800, 16000 },
+    mnd::MandelInfo{ benchViewport(), 800, 1600, 16000 },
+    mnd::MandelInfo{ benchViewport(), 1600, 1600, 16000 },
+    mnd::MandelInfo{ benchViewport(), 1600, 1600, 32000 },
+    mnd::MandelInfo{ benchViewport(), 1600, 1600, 64000 },
+    mnd::MandelInfo{ benchViewport(), 1600, 3200, 64000 },
+    mnd::MandelInfo{ benchViewport(), 3200, 3200, 64000 },
+    mnd::MandelInfo{ benchViewport(), 3200, 3200, 128000 },
+    mnd::MandelInfo{ benchViewport(), 3200, 3200, 256000 },
+    mnd::MandelInfo{ benchViewport(), 3200, 3200, 512000 },
+    mnd::MandelInfo{ benchViewport(), 3200, 3200, 1024000 },
+    mnd::MandelInfo{ benchViewport(), 3200, 3200, 2048000 },
+    mnd::MandelInfo{ benchViewport(), 3200, 6400, 2048000 },
+    mnd::MandelInfo{ benchViewport(), 6400, 6400, 2048000 },
+    mnd::MandelInfo{ benchViewport(), 6400, 6400, 4096000 },
+    mnd::MandelInfo{ benchViewport(), 6400, 6400, 8192000 },
+    mnd::MandelInfo{ benchViewport(), 6400, 6400, 16384000 },
+    mnd::MandelInfo{ benchViewport(), 6400, 6400, 32768000 },
+    mnd::MandelInfo{ benchViewport(), 6400, 6400, 65536000 },
+    mnd::MandelInfo{ benchViewport(), 6400, 6400, 131072000 },
+    mnd::MandelInfo{ benchViewport(), 6400, 6400, 262144000 },
+    mnd::MandelInfo{ benchViewport(), 6400, 6400, 524288000 },
+    mnd::MandelInfo{ benchViewport(), 6400, 6400, 1048576000 },
+    mnd::MandelInfo{ benchViewport(), 6400, 6400, 2097152000 },
+};
+
+/// Runs `bench` once and reports how much work it produced and how long it took.
+///
+/// @param bench  callable returning a pointer to its float results together
+///               with the number of elements behind that pointer.
+/// @return pair of (sum of the floor of every result element, time spent inside
+///         `bench()` as measured with high_resolution_clock). Summing the
+///         results is not part of the measured time.
+std::pair<long long, std::chrono::nanoseconds> measureMips(const std::function<std::pair<float*, long>()>& bench)
+{
+    using namespace std::chrono;
+    const auto before = high_resolution_clock::now();
+    const auto [bitmap, length] = bench();
+    const auto after = high_resolution_clock::now();
+
+    long long sum = 0;
+    for (long i = 0; i < length; i++) {
+        // Convert each element to an integer before adding. Adding the float
+        // directly would perform the addition in float and silently drop
+        // precision once the running total exceeds 2^24.
+        sum += static_cast<long long>(std::floor(bitmap[i]));
+    }
+
+    return std::make_pair(sum, duration_cast<nanoseconds>(after - before));
+}
+
+
+/// Measures the throughput of `generator`.
+///
+/// The levels in `benches` are run in order until one takes longer than one
+/// second; the level two steps further on (clamped to the last level) is then
+/// run once more and used for the result. If no level exceeds one second, the
+/// last level is used.
+///
+/// @return summed result values per nanosecond, multiplied by 1000.
+double benchmark(mnd::Generator& generator)
+{
+    // Renders one level into a freshly allocated buffer and measures that run.
+    auto runLevel = [&generator](const mnd::MandelInfo& mi) {
+        auto data = std::make_unique<float[]>(mi.bWidth * mi.bHeight);
+        return measureMips([&generator, &mi, &data]() {
+            generator.generate(mi, data.get());
+            return std::make_pair(data.get(), mi.bWidth * mi.bHeight);
+        });
+    };
+
+    const std::size_t lastIndex = benches.size() - 1;
+
+    // Default to the hardest level so that a generator which finishes every
+    // level within a second is not measured on the smallest one.
+    std::size_t testIndex = lastIndex;
+
+    for (std::size_t i = 0; i < benches.size(); i++) {
+        const std::chrono::nanoseconds elapsed = runLevel(benches[i]).second;
+        if (elapsed > std::chrono::milliseconds(1000)) {
+            testIndex = (i + 2 > lastIndex) ? lastIndex : i + 2;
+            break;
+        }
+    }
+
+    const auto [iters, time] = runLevel(benches[testIndex]);
+
+    return double(iters) / time.count() * 1000;
+}
+
+// Prints one result row to std::cout: `name` in a 30-character column, then
+// `performance` right-aligned in a 10-character column with two decimals.
+// Both arguments are parenthesized so that arbitrary expressions can be passed.
+#define REPORT_PERFORMANCE(name, performance) \
+do { std::cout << std::setw(30) << (name) << std::setw(10) << std::right << std::showbase << std::fixed << std::setprecision(2) << (performance) << std::endl; } while(0)
+
+/// Benchmarks the three CPU generators, then, for every device of the context,
+/// each generator the device returns a non-null pointer for.
+int main()
+{
+    mnd::MandelContext mc = mnd::initializeContext();
+
+    std::cout << "Benchmarking CPU [" << mc.getCpuInfo().getBrand() << "]" << std::endl;
+
+    REPORT_PERFORMANCE("float [MIps]: ", benchmark(mc.getCpuGeneratorFloat()));
+    REPORT_PERFORMANCE("double [MIps]: ", benchmark(mc.getCpuGeneratorDouble()));
+    REPORT_PERFORMANCE("fixed-point 128 bit [MIps]: ", benchmark(mc.getCpuGenerator128()));
+
+    // Reports a generator only when the device actually has one.
+    auto reportIfPresent = [](const char* label, mnd::Generator* candidate) {
+        if (candidate) {
+            REPORT_PERFORMANCE(label, benchmark(*candidate));
+        }
+    };
+
+    for (auto& device : mc.getDevices()) {
+        std::cout << "Benchmarking Device [" << device.getName() << "]" << std::endl;
+
+        reportIfPresent("float [MIps]: ", device.getGeneratorFloat());
+        reportIfPresent("double [MIps]: ", device.getGeneratorDouble());
+        reportIfPresent("fixed-point 128 bit [MIps]: ", device.getGenerator128());
+    }
+}
+