@@ -1,4 +1,5 @@
 #include "CpuGenerators.h"
+#include "LightDoubleDouble.h"
 
 #include <omp.h>
 #include <arm_neon.h>
@@ -13,6 +14,9 @@ namespace mnd
 
     template class CpuGenerator<double, mnd::ARM_NEON, false>;
     template class CpuGenerator<double, mnd::ARM_NEON, true>;
+
+    template class CpuGenerator<mnd::DoubleDouble, mnd::ARM_NEON, false>;
+    template class CpuGenerator<mnd::DoubleDouble, mnd::ARM_NEON, true>;
 }
 
 
@@ -21,9 +25,14 @@ void CpuGenerator<float, mnd::ARM_NEON, parallel>::generate(const mnd::MandelInf
 {
     using T = float;
     const MandelViewport& view = info.view;
+
+    float32x4_t juliaX = vmovq_n_f32(double(info.juliaX));
+    float32x4_t juliaY = vmovq_n_f32(double(info.juliaY));
+#if defined(_OPENMP)
     if constexpr(parallel)
         omp_set_num_threads(omp_get_num_procs());
-#pragma omp parallel for schedule(static, 1) if (parallel)
+# pragma omp parallel for schedule(static, 1) if (parallel)
+#endif
     for (long j = 0; j < info.bHeight; j++) {
         T y = T(view.y) + T(j) * T(view.height / info.bHeight);
         long i = 0;
@@ -48,21 +57,24 @@ void CpuGenerator<float, mnd::ARM_NEON, parallel>::generate(const mnd::MandelInf
             float32x4_t a = xs;
             float32x4_t b = ys;
 
+            float32x4_t cx = info.julia ? juliaX : xs;
+            float32x4_t cy = info.julia ? juliaY : ys;
+
             for (int k = 0; k < info.maxIter; k++) {
                 float32x4_t aa = vmulq_f32(a, a);
                 float32x4_t bb = vmulq_f32(b, b);
                 float32x4_t abab = vmulq_f32(a, b); abab = vaddq_f32(abab, abab);
                 uint32x4_t cmp = vcleq_f32(vaddq_f32(aa, bb), threshold);
-                if (info.smooth) {
-                    float32x4_t tempa = vaddq_f32(vsubq_f32(aa, bb), xs);
-                    float32x4_t tempb = vaddq_f32(abab, ys);
-                    a = vreinterpretq_f32_u32(vorrq_u32(vandq_u32(cmp, vreinterpretq_u32_f32(tempa)), vandq_u32(vmvnq_u32(cmp), vreinterpretq_u32_f32(a))));
-                    b = vreinterpretq_f32_u32(vorrq_u32(vandq_u32(cmp, vreinterpretq_u32_f32(tempb)), vandq_u32(vmvnq_u32(cmp), vreinterpretq_u32_f32(b))));
-                }
-                else {
-                    a = vaddq_f32(vsubq_f32(aa, bb), xs);
-                    b = vaddq_f32(abab, ys);
-                }
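+                // Blend by mask: lanes that are still inside (cmp = all ones) take the
+                // updated value, lanes that already escaped keep their last z for the
+                // smooth-coloring formula.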
+                if (info.smooth) {
+                    float32x4_t tempa = vaddq_f32(vsubq_f32(aa, bb), cx);
+                    float32x4_t tempb = vaddq_f32(abab, cy);
+                    a = vreinterpretq_f32_u32(vorrq_u32(vandq_u32(cmp, vreinterpretq_u32_f32(tempa)), vandq_u32(vmvnq_u32(cmp), vreinterpretq_u32_f32(a))));
+                    b = vreinterpretq_f32_u32(vorrq_u32(vandq_u32(cmp, vreinterpretq_u32_f32(tempb)), vandq_u32(vmvnq_u32(cmp), vreinterpretq_u32_f32(b))));
+                }
+                else {
+                    a = vaddq_f32(vsubq_f32(aa, bb), cx);
+                    b = vaddq_f32(abab, cy);
+                }
                 adder = vandq_u32(adder, cmp);
                 counter = vaddq_u32(counter, adder);
                 // checking for break criterion is possibly expensive, only do it every 8 iterations
@@ -105,9 +117,15 @@ void CpuGenerator<double, mnd::ARM_NEON, parallel>::generate(const mnd::MandelIn
 {
     using T = double;
     const MandelViewport& view = info.view;
+
+    float64x2_t juliaX = vmovq_n_f64(double(info.juliaX));
+    float64x2_t juliaY = vmovq_n_f64(double(info.juliaY));
+
+#if defined(_OPENMP)
     if constexpr(parallel)
         omp_set_num_threads(omp_get_num_procs());
-#pragma omp parallel for schedule(static, 1) if (parallel)
+# pragma omp parallel for schedule(static, 1) if (parallel)
+#endif
     for (long j = 0; j < info.bHeight; j++) {
         T y = T(view.y) + T(j) * T(view.height / info.bHeight);
         long i = 0;
@@ -129,6 +147,8 @@ void CpuGenerator<double, mnd::ARM_NEON, parallel>::generate(const mnd::MandelIn
             float64x2_t ys = vmovq_n_f64(y);
             float64x2_t a = xs;
             float64x2_t b = ys;
+            float64x2_t cx = info.julia ? juliaX : xs;
+            float64x2_t cy = info.julia ? juliaY : ys;
 
             for (int k = 0; k < info.maxIter; k++) {
                 float64x2_t aa = vmulq_f64(a, a);
@@ -137,16 +157,16 @@ void CpuGenerator<double, mnd::ARM_NEON, parallel>::generate(const mnd::MandelIn
                 //a = vaddq_f64(vsubq_f64(aa, bb), xs);
                 //b = vaddq_f64(abab, ys);
                 uint64x2_t cmp = vcleq_f64(vaddq_f64(aa, bb), threshold);
-                if (info.smooth) {
-                    float64x2_t tempa = vaddq_f64(vsubq_f64(aa, bb), xs);
-                    float64x2_t tempb = vaddq_f64(abab, ys);
-                    a = vreinterpretq_f64_u64(vorrq_u64(vandq_u64(cmp, vreinterpretq_u64_f64(tempa)), vandq_u64(vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(cmp))), vreinterpretq_u64_f64(a))));
-                    b = vreinterpretq_f64_u64(vorrq_u64(vandq_u64(cmp, vreinterpretq_u64_f64(tempb)), vandq_u64(vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(cmp))), vreinterpretq_u64_f64(b))));
-                }
-                else {
-                    a = vaddq_f64(vsubq_f64(aa, bb), xs);
-                    b = vaddq_f64(abab, ys);
-                }
+                if (info.smooth) {
+                    float64x2_t tempa = vaddq_f64(vsubq_f64(aa, bb), cx);
+                    float64x2_t tempb = vaddq_f64(abab, cy);
+                    a = vreinterpretq_f64_u64(vorrq_u64(vandq_u64(cmp, vreinterpretq_u64_f64(tempa)), vandq_u64(vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(cmp))), vreinterpretq_u64_f64(a))));
+                    b = vreinterpretq_f64_u64(vorrq_u64(vandq_u64(cmp, vreinterpretq_u64_f64(tempb)), vandq_u64(vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(cmp))), vreinterpretq_u64_f64(b))));
+                }
+                else {
+                    a = vaddq_f64(vsubq_f64(aa, bb), cx);
+                    b = vaddq_f64(abab, cy);
+                }
                 adder = vandq_u64(adder, cmp);
                 counter = vaddq_u64(counter, adder);
                 // checking for break criterion is possibly expensive, only do it every 8 iterations
@@ -164,11 +184,6 @@ void CpuGenerator<double, mnd::ARM_NEON, parallel>::generate(const mnd::MandelIn
                 }
             }
 
-            /*uint64_t resData[2];
-            vst1q_u64(resData, counter);
-            for (int k = 0; k < 2 && i + k < info.bWidth; k++)
-                data[i + k + j * info.bWidth] = resData[k] > 0 ? resData[k] : info.maxIter;
-*/
             uint64_t resData[2];
             double resa[2];
             double resb[2];
@@ -187,3 +202,220 @@ void CpuGenerator<double, mnd::ARM_NEON, parallel>::generate(const mnd::MandelIn
         }
     }
 }
+
+
+
+struct VecPair
+{
+    float64x2_t a;
+    float64x2_t b;
+};
+
+
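+// Error-free transformations (Dekker/Knuth): each helper returns the rounded
+// result s plus the exact rounding error e, so a + b (or a - b) equals s + e
+// exactly. quickTwoSum/quickTwoDiff assume |a| >= |b|; twoSum/twoDiff do not.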
+static inline VecPair quickTwoSum(float64x2_t a, float64x2_t b)
+{
+    float64x2_t s = vaddq_f64(a, b);
+    float64x2_t e = vsubq_f64(b, vsubq_f64(s, a));
+    return { s, e };
+}
+
+static inline VecPair quickTwoDiff(float64x2_t a, float64x2_t b)
+{
+    float64x2_t s = vsubq_f64(a, b);
+    float64x2_t e = vsubq_f64(vsubq_f64(a, s), b);
+    return { s, e };
+}
+
+static inline VecPair twoSum(float64x2_t a, float64x2_t b)
+{
+    float64x2_t s = vaddq_f64(a, b);
+    float64x2_t bb = vsubq_f64(s, a);
+    float64x2_t e = vaddq_f64(vsubq_f64(a, vsubq_f64(s, bb)), vsubq_f64(b, bb));
+    return { s, e };
+}
+
+static inline VecPair twoDiff(float64x2_t a, float64x2_t b)
+{
+    float64x2_t s = vsubq_f64(a, b);
+    float64x2_t bb = vsubq_f64(s, a);
+    float64x2_t e = vsubq_f64(vsubq_f64(a, vsubq_f64(s, bb)), vaddq_f64(b, bb));
+    return { s, e };
+}
+
+
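+// Dekker's split with splitter 134217729 = 2^27 + 1: each double is cut into a
+// high and a low half of at most 27 significant bits, so products of the halves
+// are exact in double precision (used by twoProd below).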
+static inline VecPair split(float64x2_t a)
+{
+    /*
+    // -- this should never happen when doing mandelbrot calculations,
+    // so we omit this check.
+    if (a > _QD_SPLIT_THRESH || a < -_QD_SPLIT_THRESH) {
+        a *= 3.7252902984619140625e-09; // 2^-28
+        temp = _QD_SPLITTER * a;
+        hi = temp - (temp - a);
+        lo = a - hi;
+        hi *= 268435456.0; // 2^28
+        lo *= 268435456.0; // 2^28
+    } else {
+        temp = _QD_SPLITTER * a;
+        hi = temp - (temp - a);
+        lo = a - hi;
+    }
+    */
+
+    static const float64x2_t SPLITTER = vmovq_n_f64(134217729.0);
+    float64x2_t temp = vmulq_f64(SPLITTER, a);
+    float64x2_t hi = vsubq_f64(temp, vsubq_f64(temp, a));
+    float64x2_t lo = vsubq_f64(a, hi);
+    return { hi, lo };
+}
+
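+// twoProd: p = fl(a * b) plus the exact rounding error, reconstructed from the
+// split halves so that a * b == p + err exactly.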
+static inline VecPair twoProd(float64x2_t a, float64x2_t b)
+{
+    float64x2_t p = vmulq_f64(a, b);
+    auto[a_hi, a_lo] = split(a);
+    auto[b_hi, b_lo] = split(b);
+    float64x2_t err = vaddq_f64(vaddq_f64(vsubq_f64(vmulq_f64(a_hi, b_hi), p), vaddq_f64(vmulq_f64(a_hi, b_lo), vmulq_f64(a_lo, b_hi))), vmulq_f64(a_lo, b_lo));
+    return { p, err };
+}
+
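+// Two double-double numbers processed side by side: x[0] holds the two high
+// words, x[1] the two low words. operator+, - and * implement double-double
+// arithmetic lane-wise on top of the error-free transformations above.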
+struct NeonDoubleDouble
+{
+    float64x2_t x[2];
+
+    inline NeonDoubleDouble(const float64x2_t& a, const float64x2_t& b) :
+        x{ a, b }
+    {}
+
+    inline NeonDoubleDouble(double a, double b) :
+        x{ vmovq_n_f64(a), vmovq_n_f64(b) }
+    {}
+
+
+    inline NeonDoubleDouble operator + (const NeonDoubleDouble& sm) const
+    {
+        auto[s, e] = twoSum(x[0], sm.x[0]);
+        e = vaddq_f64(e, vaddq_f64(x[1], sm.x[1]));
+        auto[r1, r2] = quickTwoSum(s, e);
+        return NeonDoubleDouble{ r1, r2 };
+    }
+
+    inline NeonDoubleDouble operator - (const NeonDoubleDouble& sm) const
+    {
+        auto[s, e] = twoDiff(x[0], sm.x[0]);
+        e = vaddq_f64(e, x[1]);
+        e = vsubq_f64(e, sm.x[1]);
+        auto[r1, r2] = quickTwoSum(s, e);
+        return NeonDoubleDouble{ r1, r2 };
+    }
+
+    inline NeonDoubleDouble operator * (const NeonDoubleDouble& sm) const
+    {
+        auto[p1, p2] = twoProd(this->x[0], sm.x[0]);
+        p2 = vaddq_f64(p2,
+            vaddq_f64(vmulq_f64(sm.x[1], x[0]), vmulq_f64(sm.x[0], x[1])) );
+        auto[r1, r2] = quickTwoSum(p1, p2);
+        return NeonDoubleDouble{ r1, r2 };
+    }
+};
+
+
+template<bool parallel>
+void CpuGenerator<mnd::DoubleDouble, mnd::ARM_NEON, parallel>::generate(const mnd::MandelInfo& info, float* data)
+{
+    const MandelViewport& view = info.view;
+
+    using T = LightDoubleDouble;
+
+    T viewx = mnd::convert<T>(view.x);
+    T viewy = mnd::convert<T>(view.y);
+    T wpp = mnd::convert<T>(view.width / info.bWidth);
+    T hpp = mnd::convert<T>(view.height / info.bHeight);
+
+    T jX = mnd::convert<T>(info.juliaX);
+    T jY = mnd::convert<T>(info.juliaY);
+    NeonDoubleDouble juliaX = { jX[0], jX[1] };
+    NeonDoubleDouble juliaY = { jY[0], jY[1] };
+
+#if defined(_OPENMP)
+    if constexpr(parallel)
+        omp_set_num_threads(omp_get_num_procs());
+# pragma omp parallel for schedule(static, 1) if (parallel)
+#endif
+    for (long j = 0; j < info.bHeight; j++) {
+        T y = viewy + T(double(j)) * hpp;
+        NeonDoubleDouble ys{ y[0], y[1] };
+        for (long i = 0; i < info.bWidth; i += 2) {
+            T x1 = viewx + T(double(i)) * wpp;
+            T x2 = x1 + wpp;
+            double xarr1[] = { x1[0], x2[0] };
+            double xarr2[] = { x1[1], x2[1] };
+            float64x2_t x1s = vld1q_f64(xarr1);
+            float64x2_t x2s = vld1q_f64(xarr2);
+
+            NeonDoubleDouble xs{ x1s, x2s };
+
+            NeonDoubleDouble cx = info.julia ? juliaX : xs;
+            NeonDoubleDouble cy = info.julia ? juliaY : ys;
+
+            float64x2_t threshold = vmovq_n_f64(16.0);
+            uint64x2_t counter = vmovq_n_u64(0);
+            uint64x2_t adder = vmovq_n_u64(1);
+
+            NeonDoubleDouble a = xs;
+            NeonDoubleDouble b = ys;
+            float64x2_t resultA = a.x[0];
+            float64x2_t resultB = b.x[0];
+
+            for (int k = 0; k < info.maxIter; k++) {
+                NeonDoubleDouble aa = a * a;
+                NeonDoubleDouble bb = b * b;
+                NeonDoubleDouble abab = a * b; abab = abab + abab;
+                a = aa - bb + cx;
+                b = abab + cy;
+
+
+                uint64x2_t cmp = vcleq_f64(vaddq_f64(aa.x[0], bb.x[0]), threshold);
+                if (info.smooth) {
+                    resultA = vreinterpretq_f64_u64(vorrq_u64(vandq_u64(cmp, vreinterpretq_u64_f64(a.x[0])), vandq_u64(vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(cmp))), vreinterpretq_u64_f64(resultA))));
+                    resultB = vreinterpretq_f64_u64(vorrq_u64(vandq_u64(cmp, vreinterpretq_u64_f64(b.x[0])), vandq_u64(vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(cmp))), vreinterpretq_u64_f64(resultB))));
+                }
+                adder = vandq_u64(adder, cmp);
+                counter = vaddq_u64(counter, adder);
+                // checking for break criterion is possibly expensive, only do it every 8 iterations
+                if ((k & 0x7) == 0) {
+                    /* // ARM-v7 method
+                    uint32x2_t allZero = vorr_u32(vget_low_u32(cmp), vget_high_u32(cmp));
+                    if (vget_lane_u32(vpmax_u32(allZero, allZero), 0) == 0) {
+                        break;
+                    }
+                    */
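+                    // vaddvq_u64 adds both lanes; since each lane of cmp is either 0 or
+                    // all ones, a zero sum means every lane has escaped and we can stop.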
+                    uint64_t allZero = vaddvq_u64(cmp);
+                    if (allZero == 0) {
+                        break;
+                    }
+                }
+            }
+
+            uint64_t resData[2];
+            double resa[2];
+            double resb[2];
+            vst1q_u64(resData, counter);
+            vst1q_f64(resa, resultA);
+            vst1q_f64(resb, resultB);
+
+            for (int k = 0; k < 2 && i + k < info.bWidth; k++) {
+                if (info.smooth)
+                    data[i + k + j * info.bWidth] = resData[k] <= 0 ? info.maxIter :
+                        resData[k] >= info.maxIter ? info.maxIter :
+                        ((float) resData[k]) + 1 - ::logf(::logf(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::logf(2.0f);
+                else
+                    data[i + k + j * info.bWidth] = resData[k] > 0 ? float(resData[k]) : info.maxIter;
+            }
+        }
+    }
+}