|
@@ -21,17 +21,18 @@ void CpuGenerator<float, mnd::ARM_NEON, parallel>::generate(const mnd::MandelInf
|
|
|
{
|
|
|
using T = float;
|
|
|
const MandelViewport& view = info.view;
|
|
|
- omp_set_num_threads(2 * omp_get_num_procs());
|
|
|
-#pragma omp parallel for
|
|
|
+ if constexpr(parallel)
|
|
|
+ omp_set_num_threads(omp_get_num_procs());
|
|
|
+#pragma omp parallel for schedule(static, 1) if (parallel)
|
|
|
for (long j = 0; j < info.bHeight; j++) {
|
|
|
T y = T(view.y) + T(j) * T(view.height / info.bHeight);
|
|
|
long i = 0;
|
|
|
for (i; i < info.bWidth; i += 4) {
|
|
|
float xsvals[] = {
|
|
|
- float(view.x + double(i) * view.width / info.bWidth),
|
|
|
- float(view.x + double(i + 1) * view.width / info.bWidth),
|
|
|
- float(view.x + double(i + 2) * view.width / info.bWidth),
|
|
|
- float(view.x + double(i + 3) * view.width / info.bWidth)
|
|
|
+ float(view.x + float(i) * view.width / info.bWidth),
|
|
|
+ float(view.x + float(i + 1) * view.width / info.bWidth),
|
|
|
+ float(view.x + float(i + 2) * view.width / info.bWidth),
|
|
|
+ float(view.x + float(i + 3) * view.width / info.bWidth)
|
|
|
};
|
|
|
|
|
|
float32x4_t xs = vld1q_f32(xsvals);
|
|
@@ -51,13 +52,21 @@ void CpuGenerator<float, mnd::ARM_NEON, parallel>::generate(const mnd::MandelInf
|
|
|
float32x4_t aa = vmulq_f32(a, a);
|
|
|
float32x4_t bb = vmulq_f32(b, b);
|
|
|
float32x4_t abab = vmulq_f32(a, b); abab = vaddq_f32(abab, abab);
|
|
|
- a = vaddq_f32(vsubq_f32(aa, bb), xs);
|
|
|
- b = vaddq_f32(abab, ys);
|
|
|
uint32x4_t cmp = vcleq_f32(vaddq_f32(aa, bb), threshold);
|
|
|
+ if (info.smooth) {
|
|
|
+ float32x4_t tempa = vaddq_f32(vsubq_f32(aa, bb), xs);
|
|
|
+ float32x4_t tempb = vaddq_f32(abab, ys);
|
|
|
+ a = vreinterpretq_f32_u32(vorrq_u32(vandq_u32(cmp, vreinterpretq_u32_f32(tempa)), vandq_u32(vmvnq_u32(cmp), vreinterpretq_u32_f32(a))));
|
|
|
+ b = vreinterpretq_f32_u32(vorrq_u32(vandq_u32(cmp, vreinterpretq_u32_f32(tempb)), vandq_u32(vmvnq_u32(cmp), vreinterpretq_u32_f32(b))));
|
|
|
+ }
|
|
|
+ else {
|
|
|
+ a = vaddq_f32(vsubq_f32(aa, bb), xs);
|
|
|
+ b = vaddq_f32(abab, ys);
|
|
|
+ }
|
|
|
adder = vandq_u32(adder, cmp);
|
|
|
counter = vaddq_u32(counter, adder);
|
|
|
// checking for break criterion is possibly expensive, only do it every 8 iterations
|
|
|
- if ((k & 0xF) == 0) {
|
|
|
+ if ((k & 0x7) == 0) {
|
|
|
/* // ARM-v7 method
|
|
|
uint32x2_t allZero = vorr_u32(vget_low_u32(cmp), vget_high_u32(cmp));
|
|
|
if (vget_lane_u32(vpmax_u32(allZero, allZero), 0) == 0) {
|
|
@@ -72,9 +81,20 @@ void CpuGenerator<float, mnd::ARM_NEON, parallel>::generate(const mnd::MandelInf
|
|
|
}
|
|
|
|
|
|
uint32_t resData[4];
|
|
|
+ float resa[4];
|
|
|
+ float resb[4];
|
|
|
vst1q_u32(resData, counter);
|
|
|
- for (int k = 0; k < 4 && i + k < info.bWidth; k++)
|
|
|
- data[i + k + j * info.bWidth] = resData[k] > 0 ? resData[k] : info.maxIter;
|
|
|
+ vst1q_f32(resa, a);
|
|
|
+ vst1q_f32(resb, b);
|
|
|
+
|
|
|
+ for (int k = 0; k < 4 && i + k < info.bWidth; k++) {
|
|
|
+ if (info.smooth)
|
|
|
+ data[i + k + j * info.bWidth] = resData[k] <= 0 ? info.maxIter :
|
|
|
+ resData[k] >= info.maxIter ? info.maxIter :
|
|
|
+ ((float) resData[k]) + 1 - ::logf(::logf(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::logf(2.0f);
|
|
|
+ else
|
|
|
+ data[i + k + j * info.bWidth] = resData[k] > 0 ? float(resData[k]) : info.maxIter;
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -85,8 +105,9 @@ void CpuGenerator<double, mnd::ARM_NEON, parallel>::generate(const mnd::MandelIn
|
|
|
{
|
|
|
using T = double;
|
|
|
const MandelViewport& view = info.view;
|
|
|
- omp_set_num_threads(2 * omp_get_num_procs());
|
|
|
-#pragma omp parallel for
|
|
|
+ if constexpr(parallel)
|
|
|
+ omp_set_num_threads(omp_get_num_procs());
|
|
|
+#pragma omp parallel for schedule(static, 1) if (parallel)
|
|
|
for (long j = 0; j < info.bHeight; j++) {
|
|
|
T y = T(view.y) + T(j) * T(view.height / info.bHeight);
|
|
|
long i = 0;
|
|
@@ -113,13 +134,23 @@ void CpuGenerator<double, mnd::ARM_NEON, parallel>::generate(const mnd::MandelIn
|
|
|
float64x2_t aa = vmulq_f64(a, a);
|
|
|
float64x2_t bb = vmulq_f64(b, b);
|
|
|
float64x2_t abab = vmulq_f64(a, b); abab = vaddq_f64(abab, abab);
|
|
|
- a = vaddq_f64(vsubq_f64(aa, bb), xs);
|
|
|
- b = vaddq_f64(abab, ys);
|
|
|
+ //a = vaddq_f64(vsubq_f64(aa, bb), xs);
|
|
|
+ //b = vaddq_f64(abab, ys);
|
|
|
uint64x2_t cmp = vcleq_f64(vaddq_f64(aa, bb), threshold);
|
|
|
+ if (info.smooth) {
|
|
|
+ float64x2_t tempa = vaddq_f64(vsubq_f64(aa, bb), xs);
|
|
|
+ float64x2_t tempb = vaddq_f64(abab, ys);
|
|
|
+ a = vreinterpretq_f64_u64(vorrq_u64(vandq_u64(cmp, vreinterpretq_u64_f64(tempa)), vandq_u64(vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(cmp))), vreinterpretq_u64_f64(a))));
|
|
|
+ b = vreinterpretq_f64_u64(vorrq_u64(vandq_u64(cmp, vreinterpretq_u64_f64(tempb)), vandq_u64(vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(cmp))), vreinterpretq_u64_f64(b))));
|
|
|
+ }
|
|
|
+ else {
|
|
|
+ a = vaddq_f64(vsubq_f64(aa, bb), xs);
|
|
|
+ b = vaddq_f64(abab, ys);
|
|
|
+ }
|
|
|
adder = vandq_u64(adder, cmp);
|
|
|
counter = vaddq_u64(counter, adder);
|
|
|
// checking for break criterion is possibly expensive, only do it every 8 iterations
|
|
|
- if ((k & 0xF) == 0) {
|
|
|
+ if ((k & 0x7) == 0) {
|
|
|
/* // ARM-v7 method
|
|
|
uint32x2_t allZero = vorr_u32(vget_low_u32(cmp), vget_high_u32(cmp));
|
|
|
if (vget_lane_u32(vpmax_u32(allZero, allZero), 0) == 0) {
|
|
@@ -133,10 +164,26 @@ void CpuGenerator<double, mnd::ARM_NEON, parallel>::generate(const mnd::MandelIn
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- uint64_t resData[2];
|
|
|
+ /*uint64_t resData[2];
|
|
|
vst1q_u64(resData, counter);
|
|
|
for (int k = 0; k < 2 && i + k < info.bWidth; k++)
|
|
|
data[i + k + j * info.bWidth] = resData[k] > 0 ? resData[k] : info.maxIter;
|
|
|
+*/
|
|
|
+ uint64_t resData[2];
|
|
|
+ double resa[2];
|
|
|
+ double resb[2];
|
|
|
+ vst1q_u64(resData, counter);
|
|
|
+ vst1q_f64(resa, a);
|
|
|
+ vst1q_f64(resb, b);
|
|
|
+
|
|
|
+ for (int k = 0; k < 2 && i + k < info.bWidth; k++) {
|
|
|
+ if (info.smooth)
|
|
|
+ data[i + k + j * info.bWidth] = resData[k] <= 0 ? info.maxIter :
|
|
|
+ resData[k] >= info.maxIter ? info.maxIter :
|
|
|
+ ((float) resData[k]) + 1 - ::logf(::logf(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::logf(2.0f);
|
|
|
+ else
|
|
|
+ data[i + k + j * info.bWidth] = resData[k] > 0 ? float(resData[k]) : info.maxIter;
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
}
|