소스 검색

improving benchmarks

Nicolas Winkler 5 년 전
부모
커밋
11732cff05
5개의 변경된 파일, 56개의 추가 및 58개의 삭제
  1. 37 29
      choosegenerators.cpp
  2. 0 1
      libmandel/CMakeLists.txt
  3. 7 3
      libmandel/src/ClGenerators.cpp
  4. 3 5
      libmandel/src/CpuGeneratorsAVX.cpp
  5. 9 20
      libmandel/src/CpuGeneratorsSSE2.cpp

+ 37 - 29
choosegenerators.cpp

@@ -11,12 +11,14 @@
 #include <QRegExpValidator>
 #include <QMessageBox>
 
+#include <cstring>
 
 
 // Returns the viewport handed out by the Benchmarker.
 // After this change the active return builds a MandelViewport from four
 // literals: 0, 0, and the value 0.0000003839916666666565 twice. The two
 // commented-out returns are earlier alternatives (a different literal view
 // and MandelViewport::centerView()).
 // NOTE(review): presumably the four fields are x, y, width, height -- confirm
 // against the mnd::MandelViewport declaration, which is not visible here.
 mnd::MandelViewport Benchmarker::benchViewport(void)
 {
     //return mnd::MandelViewport{ -1.250000598933854152929, 0.0001879894057291665530, 0.0000003839916666666565, 0.0000003839916666666565 };
-    return mnd::MandelViewport::centerView();
+    //return mnd::MandelViewport::centerView();
+    return mnd::MandelViewport{ 0, 0, 0.0000003839916666666565, 0.0000003839916666666565 };
 }
 
 
@@ -25,22 +27,19 @@ static std::vector<mnd::MandelInfo> createBenches()
     std::vector<mnd::MandelInfo> vec;
     for (int i = 0; i < 50; i++) {
         int expo = i + 14;
-        int whe = 5;
-
-        if (expo > 18)
-            whe = 6;
-        if (expo > 19)
-            whe = 7;
-        if (expo > 21)
-            whe = 8;
-        if (expo > 24)
-            whe = 9;
-        if (expo > 25)
-            whe = 10;
-
-        long wh = 1L << whe;
-        long iter = 1L << (expo - 2 * whe);
-        vec.push_back(mnd::MandelInfo{ mnd::MandelViewport::centerView(), wh, wh, iter, false, false, 0.0, 0.0 });
+        int w = 5;
+        int h = 5;
+
+        while (int(expo * 1) - w - h > 15 && w <= 10 && h <= 10) {
+            w++;
+            if (int(expo * 1) - w - h > 15)
+                h++;
+        }
+
+        long wi = 1L << w;
+        long he = 1L << h;
+        long iter = 1L << (expo - w - h);
+        vec.push_back(mnd::MandelInfo{ mnd::MandelViewport::centerView(), wi, he, iter, false, false, 0.0, 0.0 });
     }
     return vec;
 }
@@ -120,10 +119,11 @@ double Benchmarker::benchmarkResult(mnd::MandelGenerator& mg) const
             mg.generate(mi, bmp.pixels.get());
             return &bmp;
         });
-        if (time > std::chrono::milliseconds(500)) {
-            testIndex = i + 2;
-            //printf("testing index %d\n", testIndex);
-            fflush(stdout);
+        if (time > std::chrono::milliseconds(200)) {
+            testIndex = i + 4;
+            printf("testing index for generator %s: %d\n", (mnd::toString(mg.getType()) + ", " + mnd::toString(mg.getExtension())).c_str(), testIndex);
+            printf("    w: %d, h: %d, iter: %d\n", benches[testIndex].bWidth, benches[testIndex].bHeight, benches[testIndex].maxIter);
+            fflush(stdout);fflush(stdout);fflush(stdout);fflush(stdout);fflush(stdout);fflush(stdout);fflush(stdout);
             break;
         }
         else if (time < std::chrono::milliseconds(10)) {
@@ -131,15 +131,23 @@ double Benchmarker::benchmarkResult(mnd::MandelGenerator& mg) const
         }
     }
 
+    try {
+        const mnd::MandelInfo& mi = benches[(testIndex >= benches.size()) ? (benches.size() - 1) : testIndex];
+        Bitmap<float> bmp(mi.bWidth, mi.bHeight);
+        auto [iters, time] = measureMips([&mg, &mi, &bmp]() {
+            mg.generate(mi, bmp.pixels.get());
+            return &bmp;
+        });
 
-    const mnd::MandelInfo& mi = benches[(testIndex >= benches.size()) ? (benches.size() - 1) : testIndex];
-    Bitmap<float> bmp(mi.bWidth, mi.bHeight);
-    auto [iters, time] = measureMips([&mg, &mi, &bmp]() {
-        mg.generate(mi, bmp.pixels.get());
-        return &bmp;
-    });
-
-    return double(iters) / time.count() * 1000;
+        return double(iters) / time.count() * 1000;
+    }
+    catch(const std::string& c) {
+        printf("error benchmarking: %s\n", c.c_str());
+    }
+    catch(...) {
+        printf("error benchmarking\n");
+    }
+    return 0;
 }
 
 

+ 0 - 1
libmandel/CMakeLists.txt

@@ -42,7 +42,6 @@ endif()
 #    message(${MandelSources})
 
 add_library(mandel STATIC ${MandelSources})
-set_source_files_properties(${MandelSources} PROPERTIES COMPILE_FLAGS -march=native)
 
 FILE(GLOB QdSources qd-2.3.22/src/*.cpp)
 

+ 7 - 3
libmandel/src/ClGenerators.cpp

@@ -145,9 +145,10 @@ void ClGeneratorFloat::generate(const mnd::MandelInfo& info, float* data)
     } else {
         queue.enqueueNDRangeKernel(kernel, 0, NDRange(info.bWidth * info.bHeight));
     }
+    cl::Event event;
+    queue.enqueueReadBuffer(buffer_A, CL_FALSE, 0, bufferSize, data, nullptr, &event);
     queue.flush();
-    queue.finish();
-    queue.enqueueReadBuffer(buffer_A, CL_TRUE, 0, bufferSize, data);
+    event.wait();
 }
 
 
@@ -336,7 +337,10 @@ void ClGeneratorDouble::generate(const mnd::MandelInfo& info, float* data)
     kernel.setArg(10, double(info.juliaY));
 
     cl_int result = queue.enqueueNDRangeKernel(kernel, 0, NDRange(info.bWidth * info.bHeight));
-    queue.enqueueReadBuffer(buffer_A, CL_TRUE, 0, bufferSize, data);
+    cl::Event event;
+    queue.enqueueReadBuffer(buffer_A, CL_FALSE, 0, bufferSize, data, nullptr, &event);
+    queue.flush();
+    event.wait();
 }
 
 

+ 3 - 5
libmandel/src/CpuGeneratorsAVX.cpp

@@ -435,9 +435,7 @@ void CpuGenerator<mnd::DoubleDouble, mnd::X86_AVX, parallel>::generate(const mnd
 #pragma omp parallel for schedule(static, 1) if (parallel)
     for (long j = 0; j < info.bHeight; j++) {
         T y = viewy + T(double(j)) * hpp;
-        __m256d y0s = { y.x[0], y.x[0], y.x[0], y.x[0] };
-        __m256d y1s = { y.x[1], y.x[1], y.x[1], y.x[1] };
-        AvxDoubleDouble ys{ y0s, y1s };
+        AvxDoubleDouble ys{ y[0], y[1] };
         for (long i = 0; i < info.bWidth; i += 4) {
             T x1 = viewx + T(double(i)) * wpp;
             T x2 = x1 + wpp;
@@ -466,8 +464,8 @@ void CpuGenerator<mnd::DoubleDouble, mnd::X86_AVX, parallel>::generate(const mnd
             AvxDoubleDouble a = xs;
             AvxDoubleDouble b = ys;
 
-            __m256d resultsa;
-            __m256d resultsb;
+            __m256d resultsa = _mm256_set1_pd(0);
+            __m256d resultsb = _mm256_set1_pd(0);
 
             for (int k = 0; k < info.maxIter; k++) {
                 AvxDoubleDouble aa = a * a;

+ 9 - 20
libmandel/src/CpuGeneratorsSSE2.cpp

@@ -188,33 +188,22 @@ void CpuGenerator<double, mnd::X86_SSE2, parallel>::generate(const mnd::MandelIn
                 counter = _mm_add_pd(counter, adder);
                 adder2 = _mm_and_pd(adder2, cmp2);
                 counter2 = _mm_add_pd(counter2, adder2);
-                if ((k & 0x7 == 0) && _mm_movemask_epi8(_mm_castpd_si128(cmp)) == 0 &&
-                    _mm_movemask_epi8(_mm_castpd_si128(cmp)) == 0) {
+                if (((k & 0x7) == 0) && _mm_movemask_epi8(_mm_castpd_si128(cmp)) == 0 &&
+                    _mm_movemask_epi8(_mm_castpd_si128(cmp2)) == 0) {
                     break;
                 }
             }
 
-            auto alignVec = [](double* data) -> double* {
-                void* aligned = data;
-                ::size_t length = 64;
-                std::align(32, 4 * sizeof(double), aligned, length);
-                return static_cast<double*>(aligned);
-            };
-
-            double resData[24];
-            double* ftRes = alignVec(resData);
+            double ftRes[24];
             double* resa = ftRes + 4;
             double* resb = ftRes + 8;
 
-            _mm_store_pd(ftRes, counter);
-            _mm_store_pd(ftRes + 2, counter2);
-            _mm_store_pd(resa, resulta);
-            _mm_store_pd(resa + 2, resulta2);
-            _mm_store_pd(resb, resultb);
-            _mm_store_pd(resb + 2, resultb2);
-            //for (int k = 0; k < 2 && i + k < info.bWidth; k++)
-            //    data[i + k + j * info.bWidth] = ftRes[k] > 0 ? ftRes[k] : info.maxIter;
-
+            _mm_storeu_pd(ftRes, counter);
+            _mm_storeu_pd(ftRes + 2, counter2);
+            _mm_storeu_pd(resa, resulta);
+            _mm_storeu_pd(resa + 2, resulta2);
+            _mm_storeu_pd(resb, resultb);
+            _mm_storeu_pd(resb + 2, resultb2);
             for (int k = 0; k < 4 && i + k < info.bWidth; k++) {
                 if (info.smooth)
                     data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter :