|
@@ -36,15 +36,18 @@ void CpuGenerator<float, mnd::X86_SSE2, parallel>::generate(const mnd::MandelInf
|
|
|
float(view.x + double(i + 3) * view.width / info.bWidth)
|
|
|
};
|
|
|
|
|
|
- __m128 counter = {0, 0, 0, 0};
|
|
|
- __m128 adder = {1, 1, 1, 1};
|
|
|
+ __m128 counter = { 0, 0, 0, 0 };
|
|
|
+ __m128 adder = { 1, 1, 1, 1 };
|
|
|
|
|
|
- __m128 threshold = {16.0f, 16.0f, 16.0f, 16.0f};
|
|
|
+ __m128 threshold = { 16.0f, 16.0f, 16.0f, 16.0f };
|
|
|
|
|
|
__m128 ys = {y, y, y, y};
|
|
|
__m128 a = xs;
|
|
|
__m128 b = ys;
|
|
|
|
|
|
+ __m128 resulta = { 0, 0, 0, 0 };
|
|
|
+ __m128 resultb = { 0, 0, 0, 0 };
|
|
|
+
|
|
|
for (int k = 0; k < info.maxIter; k++) {
|
|
|
__m128 aa = _mm_mul_ps(a, a);
|
|
|
__m128 bb = _mm_mul_ps(b, b);
|
|
@@ -52,6 +55,10 @@ void CpuGenerator<float, mnd::X86_SSE2, parallel>::generate(const mnd::MandelInf
|
|
|
a = _mm_add_ps(_mm_sub_ps(aa, bb), xs);
|
|
|
b = _mm_add_ps(abab, ys);
|
|
|
__m128 cmp = _mm_cmple_ps(_mm_add_ps(aa, bb), threshold);
|
|
|
+ if (info.smooth) {
|
|
|
+ resulta = _mm_or_ps(_mm_andnot_ps(cmp, resulta), _mm_and_ps(cmp, a));
|
|
|
+ resultb = _mm_or_ps(_mm_andnot_ps(cmp, resultb), _mm_and_ps(cmp, b));
|
|
|
+ }
|
|
|
adder = _mm_and_ps(adder, cmp);
|
|
|
counter = _mm_add_ps(counter, adder);
|
|
|
if (_mm_movemask_epi8(_mm_castps_si128(cmp)) == 0) {
|
|
@@ -66,12 +73,22 @@ void CpuGenerator<float, mnd::X86_SSE2, parallel>::generate(const mnd::MandelInf
|
|
|
return static_cast<float*>(aligned);
|
|
|
};
|
|
|
|
|
|
- float resData[16];
|
|
|
+ float resData[32];
|
|
|
float* ftRes = alignVec(resData);
|
|
|
+ float* resa = ftRes + 4;
|
|
|
+ float* resb = ftRes + 8;
|
|
|
|
|
|
_mm_store_ps(ftRes, counter);
|
|
|
- for (int k = 0; k < 4 && i + k < info.bWidth; k++)
|
|
|
- data[i + k + j * info.bWidth] = ftRes[k] > 0 ? ftRes[k] : info.maxIter;
|
|
|
+ _mm_store_ps(resa, resulta);
|
|
|
+ _mm_store_ps(resb, resultb);
|
|
|
+ for (int k = 0; k < 4 && i + k < info.bWidth; k++) {
|
|
|
+ if (info.smooth)
|
|
|
+ data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter :
|
|
|
+ ftRes[k] >= info.maxIter ? info.maxIter :
|
|
|
+ ((float)ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f);
|
|
|
+ else
|
|
|
+ data[i + k + j * info.bWidth] = ftRes[k] > 0 ? float(ftRes[k]) : info.maxIter;
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -94,14 +111,16 @@ void CpuGenerator<double, mnd::X86_SSE2, parallel>::generate(const mnd::MandelIn
|
|
|
double(view.x + double(i + 1) * view.width / info.bWidth)
|
|
|
};
|
|
|
|
|
|
- __m128d counter = {0, 0};
|
|
|
- __m128d adder = {1, 1};
|
|
|
+ __m128d counter = { 0, 0 };
|
|
|
+ __m128d adder = { 1, 1 };
|
|
|
|
|
|
- __m128d threshold = {16.0f, 16.0f};
|
|
|
+ __m128d threshold = { 16.0f, 16.0f };
|
|
|
|
|
|
- __m128d ys = {y, y};
|
|
|
+ __m128d ys = { y, y };
|
|
|
__m128d a = xs;
|
|
|
__m128d b = ys;
|
|
|
+ __m128d resulta = { 0, 0 };
|
|
|
+ __m128d resultb = { 0, 0 };
|
|
|
|
|
|
for (int k = 0; k < info.maxIter; k++) {
|
|
|
__m128d aa = _mm_mul_pd(a, a);
|
|
@@ -110,6 +129,10 @@ void CpuGenerator<double, mnd::X86_SSE2, parallel>::generate(const mnd::MandelIn
|
|
|
a = _mm_add_pd(_mm_sub_pd(aa, bb), xs);
|
|
|
b = _mm_add_pd(abab, ys);
|
|
|
__m128d cmp = _mm_cmple_pd(_mm_add_pd(aa, bb), threshold);
|
|
|
+ if (info.smooth) {
|
|
|
+ resulta = _mm_or_pd(_mm_andnot_pd(cmp, resulta), _mm_and_pd(cmp, a));
|
|
|
+ resultb = _mm_or_pd(_mm_andnot_pd(cmp, resultb), _mm_and_pd(cmp, b));
|
|
|
+ }
|
|
|
adder = _mm_and_pd(adder, cmp);
|
|
|
counter = _mm_add_pd(counter, adder);
|
|
|
if (_mm_movemask_epi8(_mm_castpd_si128(cmp)) == 0) {
|
|
@@ -124,12 +147,25 @@ void CpuGenerator<double, mnd::X86_SSE2, parallel>::generate(const mnd::MandelIn
|
|
|
return static_cast<double*>(aligned);
|
|
|
};
|
|
|
|
|
|
- double resData[8];
|
|
|
+ double resData[16];
|
|
|
double* ftRes = alignVec(resData);
|
|
|
+ double* resa = ftRes + 2;
|
|
|
+ double* resb = ftRes + 4;
|
|
|
|
|
|
_mm_store_pd(ftRes, counter);
|
|
|
+ _mm_store_pd(resa, resulta);
|
|
|
+ _mm_store_pd(resb, resultb);
|
|
|
for (int k = 0; k < 2 && i + k < info.bWidth; k++)
|
|
|
data[i + k + j * info.bWidth] = ftRes[k] > 0 ? ftRes[k] : info.maxIter;
|
|
|
+
|
|
|
+ for (int k = 0; k < 2 && i + k < info.bWidth; k++) {
|
|
|
+ if (info.smooth)
|
|
|
+ data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter :
|
|
|
+ ftRes[k] >= info.maxIter ? info.maxIter :
|
|
|
+ ((float)ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f);
|
|
|
+ else
|
|
|
+ data[i + k + j * info.bWidth] = ftRes[k] > 0 ? float(ftRes[k]) : info.maxIter;
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
}
|