CpuGeneratorsAVXFMA.cpp

#include "CpuGenerators.h"

#include <immintrin.h>
#include <omp.h>

#include <cstddef>
#include <cmath>
#include <utility>
#include <memory>

using mnd::CpuGenerator;

namespace mnd
{
    template class CpuGenerator<float, mnd::X86_AVX_FMA, false>;
    template class CpuGenerator<float, mnd::X86_AVX_FMA, true>;
    template class CpuGenerator<double, mnd::X86_AVX_FMA, false>;
    template class CpuGenerator<double, mnd::X86_AVX_FMA, true>;
    template class CpuGenerator<DoubleDouble, mnd::X86_AVX_FMA, false>;
    template class CpuGenerator<DoubleDouble, mnd::X86_AVX_FMA, true>;
}
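
// Escape-time kernel for single-precision floats using AVX + FMA. Each pass of
// the inner loop advances 24 pixels at once: three independent 8-lane __m256
// accumulator groups (a/b, a2/b2, a3/b3) are iterated together to keep the FMA
// units busy. Escaped lanes stop counting via the masked adder, and every 8th
// iteration the loop exits early once all 24 lanes have diverged.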
template<bool parallel>
void CpuGenerator<float, mnd::X86_AVX_FMA, parallel>::generate(const mnd::MandelInfo& info, float* data)
{
    using T = float;
    const MandelViewport& view = info.view;

    const float dppf = float(view.width / info.bWidth);
    const float viewxf = float(view.x);
    __m256 viewx = { viewxf, viewxf, viewxf, viewxf, viewxf, viewxf, viewxf, viewxf };
    __m256 dpp = { dppf, dppf, dppf, dppf, dppf, dppf, dppf, dppf };

    T jX = mnd::convert<T>(info.juliaX);
    T jY = mnd::convert<T>(info.juliaY);
    __m256 juliaX = { jX, jX, jX, jX, jX, jX, jX, jX };
    __m256 juliaY = { jY, jY, jY, jY, jY, jY, jY, jY };

    if constexpr(parallel)
        omp_set_num_threads(omp_get_num_procs());
#pragma omp parallel for schedule(static, 1) if (parallel)
    for (long j = 0; j < info.bHeight; j++) {
        T y = T(view.y) + T(j) * T(view.height / info.bHeight);
        __m256 ys = { y, y, y, y, y, y, y, y };
        for (long i = 0; i < info.bWidth; i += 24) {
            __m256 pixc = { float(i), float(i + 1), float(i + 2), float(i + 3), float(i + 4), float(i + 5), float(i + 6), float(i + 7) };
            __m256 pixc2 = { float(i + 8), float(i + 9), float(i + 10), float(i + 11), float(i + 12), float(i + 13), float(i + 14), float(i + 15) };
            __m256 pixc3 = { float(i + 16), float(i + 17), float(i + 18), float(i + 19), float(i + 20), float(i + 21), float(i + 22), float(i + 23) };

            __m256 xs = _mm256_add_ps(_mm256_mul_ps(dpp, pixc), viewx);
            __m256 xs2 = _mm256_add_ps(_mm256_mul_ps(dpp, pixc2), viewx);
            __m256 xs3 = _mm256_add_ps(_mm256_mul_ps(dpp, pixc3), viewx);

            __m256 counter = { 0, 0, 0, 0, 0, 0, 0, 0 };
            __m256 adder = { 1, 1, 1, 1, 1, 1, 1, 1 };
            __m256 resultsa = { 0, 0, 0, 0, 0, 0, 0, 0 };
            __m256 resultsb = { 0, 0, 0, 0, 0, 0, 0, 0 };

            __m256 counter2 = { 0, 0, 0, 0, 0, 0, 0, 0 };
            __m256 adder2 = { 1, 1, 1, 1, 1, 1, 1, 1 };
            __m256 resultsa2 = { 0, 0, 0, 0, 0, 0, 0, 0 };
            __m256 resultsb2 = { 0, 0, 0, 0, 0, 0, 0, 0 };

            __m256 counter3 = { 0, 0, 0, 0, 0, 0, 0, 0 };
            __m256 adder3 = { 1, 1, 1, 1, 1, 1, 1, 1 };
            __m256 resultsa3 = { 0, 0, 0, 0, 0, 0, 0, 0 };
            __m256 resultsb3 = { 0, 0, 0, 0, 0, 0, 0, 0 };

            __m256 threshold = { 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f };
            __m256 two = { 2, 2, 2, 2, 2, 2, 2, 2 };

            __m256 a = xs;
            __m256 a2 = xs2;
            __m256 a3 = xs3;
            __m256 b = ys;
            __m256 b2 = ys;
            __m256 b3 = ys;

            __m256 cx = info.julia ? juliaX : xs;
            __m256 cx2 = info.julia ? juliaX : xs2;
            __m256 cx3 = info.julia ? juliaX : xs3;
            __m256 cy = info.julia ? juliaY : ys;
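
            // With smooth coloring the loop also records, per lane, the last z value
            // that was still inside the escape radius (resultsa/resultsb); the plain
            // path only counts iterations.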
            if (info.smooth) {
                for (int k = 0; k < info.maxIter; k++) {
                    __m256 bb = _mm256_mul_ps(b, b);
                    __m256 bb2 = _mm256_mul_ps(b2, b2);
                    __m256 bb3 = _mm256_mul_ps(b3, b3);
                    __m256 ab = _mm256_mul_ps(a, b);
                    __m256 ab2 = _mm256_mul_ps(a2, b2);
                    __m256 ab3 = _mm256_mul_ps(a3, b3);
                    b = _mm256_fmadd_ps(two, ab, cy);
                    b2 = _mm256_fmadd_ps(two, ab2, cy);
                    b3 = _mm256_fmadd_ps(two, ab3, cy);
                    __m256 cmp = _mm256_cmp_ps(_mm256_fmadd_ps(a, a, bb), threshold, _CMP_LE_OQ);
                    __m256 cmp2 = _mm256_cmp_ps(_mm256_fmadd_ps(a2, a2, bb2), threshold, _CMP_LE_OQ);
                    __m256 cmp3 = _mm256_cmp_ps(_mm256_fmadd_ps(a3, a3, bb3), threshold, _CMP_LE_OQ);
                    a = _mm256_add_ps(_mm256_fmsub_ps(a, a, bb), cx);
                    a2 = _mm256_add_ps(_mm256_fmsub_ps(a2, a2, bb2), cx2);
                    a3 = _mm256_add_ps(_mm256_fmsub_ps(a3, a3, bb3), cx3);
                    // keep the last z = a + bi that was still inside the escape radius
                    resultsa = _mm256_blendv_ps(resultsa, a, cmp);
                    resultsb = _mm256_blendv_ps(resultsb, b, cmp);
                    resultsa2 = _mm256_blendv_ps(resultsa2, a2, cmp2);
                    resultsb2 = _mm256_blendv_ps(resultsb2, b2, cmp2);
                    resultsa3 = _mm256_blendv_ps(resultsa3, a3, cmp3);
                    resultsb3 = _mm256_blendv_ps(resultsb3, b3, cmp3);
                    adder = _mm256_and_ps(adder, cmp);
                    counter = _mm256_add_ps(counter, adder);
                    adder2 = _mm256_and_ps(adder2, cmp2);
                    counter2 = _mm256_add_ps(counter2, adder2);
                    adder3 = _mm256_and_ps(adder3, cmp3);
                    counter3 = _mm256_add_ps(counter3, adder3);
                    if ((k & 0x7) == 0 && _mm256_testz_ps(cmp, cmp) != 0 &&
                            _mm256_testz_ps(cmp2, cmp2) != 0 && _mm256_testz_ps(cmp3, cmp3) != 0) {
                        break;
                    }
                }
            }
            else {
                for (int k = 0; k < info.maxIter; k++) {
                    __m256 bb = _mm256_mul_ps(b, b);
                    __m256 bb2 = _mm256_mul_ps(b2, b2);
                    __m256 bb3 = _mm256_mul_ps(b3, b3);
                    __m256 ab = _mm256_mul_ps(a, b);
                    __m256 ab2 = _mm256_mul_ps(a2, b2);
                    __m256 ab3 = _mm256_mul_ps(a3, b3);
                    __m256 cmp = _mm256_cmp_ps(_mm256_fmadd_ps(a, a, bb), threshold, _CMP_LE_OQ);
                    __m256 cmp2 = _mm256_cmp_ps(_mm256_fmadd_ps(a2, a2, bb2), threshold, _CMP_LE_OQ);
                    __m256 cmp3 = _mm256_cmp_ps(_mm256_fmadd_ps(a3, a3, bb3), threshold, _CMP_LE_OQ);
                    a = _mm256_add_ps(_mm256_fmsub_ps(a, a, bb), cx);
                    a2 = _mm256_add_ps(_mm256_fmsub_ps(a2, a2, bb2), cx2);
                    a3 = _mm256_add_ps(_mm256_fmsub_ps(a3, a3, bb3), cx3);
                    b = _mm256_fmadd_ps(two, ab, cy);
                    b2 = _mm256_fmadd_ps(two, ab2, cy);
                    b3 = _mm256_fmadd_ps(two, ab3, cy);
                    adder = _mm256_and_ps(adder, cmp);
                    counter = _mm256_add_ps(counter, adder);
                    adder2 = _mm256_and_ps(adder2, cmp2);
                    counter2 = _mm256_add_ps(counter2, adder2);
                    adder3 = _mm256_and_ps(adder3, cmp3);
                    counter3 = _mm256_add_ps(counter3, adder3);
                    if ((k & 0x7) == 0 && _mm256_testz_ps(cmp, cmp) != 0 &&
                            _mm256_testz_ps(cmp2, cmp2) != 0 && _mm256_testz_ps(cmp3, cmp3) != 0) {
                        break;
                    }
                }
            }

            auto alignVec = [](float* data) -> float* {
                void* aligned = data;
                ::size_t length = 64;
                std::align(32, 8 * sizeof(float), aligned, length);
                return static_cast<float*>(aligned);
            };

            float resData[96];
            float* ftRes = alignVec(resData);
            float* resa = ftRes + 24;
            float* resb = resa + 24;

            _mm256_store_ps(ftRes, counter);
            _mm256_store_ps(ftRes + 8, counter2);
            _mm256_store_ps(ftRes + 16, counter3);
            _mm256_store_ps(resa, resultsa);
            _mm256_store_ps(resa + 8, resultsa2);
            _mm256_store_ps(resa + 16, resultsa3);
            _mm256_store_ps(resb, resultsb);
            _mm256_store_ps(resb + 8, resultsb2);
            _mm256_store_ps(resb + 16, resultsb3);
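
            // Write the 24 pixels back. Counters <= 0 or >= maxIter are clamped to
            // maxIter; otherwise the smooth value n + 1 - log(log(|z|^2) / 2) / log 2
            // gives a continuous iteration count.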
            for (int k = 0; k < 24 && i + k < info.bWidth; k++) {
                if (info.smooth) {
                    data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter :
                        ftRes[k] >= info.maxIter ? info.maxIter :
                        ((float) ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f);
                }
                else {
                    data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter : ftRes[k];
                }
            }
        }
    }
}
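
// The same escape-time kernel in double precision: two 4-lane __m256d groups
// are iterated per pass, i.e. 8 pixels per trip through the inner loop.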
template<bool parallel>
void CpuGenerator<double, mnd::X86_AVX_FMA, parallel>::generate(const mnd::MandelInfo& info, float* data)
{
    using T = double;
    const MandelViewport& view = info.view;

    const double dppf = double(view.width / info.bWidth);
    const double viewxf = double(view.x);
    __m256d viewx = { viewxf, viewxf, viewxf, viewxf };
    __m256d dpp = { dppf, dppf, dppf, dppf };

    T jX = mnd::convert<T>(info.juliaX);
    T jY = mnd::convert<T>(info.juliaY);
    __m256d juliaX = { jX, jX, jX, jX };
    __m256d juliaY = { jY, jY, jY, jY };

    if constexpr(parallel)
        omp_set_num_threads(omp_get_num_procs());
#pragma omp parallel for schedule(static, 1) if (parallel)
    for (long j = 0; j < info.bHeight; j++) {
        T y = T(view.y + T(j) * view.height / info.bHeight);
        __m256d ys = { y, y, y, y };
        for (long i = 0; i < info.bWidth; i += 8) {
            __m256d pixc = { double(i), double(i + 1), double(i + 2), double(i + 3) };
            __m256d pixc2 = { double(i + 4), double(i + 5), double(i + 6), double(i + 7) };
            __m256d xs = _mm256_fmadd_pd(dpp, pixc, viewx);
            __m256d xs2 = _mm256_fmadd_pd(dpp, pixc2, viewx);

            __m256d threshold = { 16.0, 16.0, 16.0, 16.0 };
            __m256d counter = { 0, 0, 0, 0 };
            __m256d adder = { 1, 1, 1, 1 };
            __m256d counter2 = { 0, 0, 0, 0 };
            __m256d adder2 = { 1, 1, 1, 1 };
            __m256d two = { 2, 2, 2, 2 };

            __m256d resultsa = { 0, 0, 0, 0 };
            __m256d resultsb = { 0, 0, 0, 0 };
            __m256d resultsa2 = { 0, 0, 0, 0 };
            __m256d resultsb2 = { 0, 0, 0, 0 };

            __m256d a = xs;
            __m256d b = ys;
            __m256d a2 = xs2;
            __m256d b2 = ys;

            __m256d cx = info.julia ? juliaX : xs;
            __m256d cy = info.julia ? juliaY : ys;
            __m256d cx2 = info.julia ? juliaX : xs2;
            // cy is shared by both lane groups: every pixel in a row has the same imaginary part.
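
            // Iterate z = z^2 + c on both lane groups. FMA is used for the escape
            // test (a*a + b*b via fmadd) and for the real-part update (a*a - b*b
            // via fmsub); the masked adder freezes the counters of escaped lanes.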
            for (int k = 0; k < info.maxIter; k++) {
                __m256d ab = _mm256_mul_pd(a, b);
                __m256d bb = _mm256_mul_pd(b, b);
                __m256d ab2 = _mm256_mul_pd(a2, b2);
                __m256d bb2 = _mm256_mul_pd(b2, b2);
                __m256d cmp = _mm256_cmp_pd(_mm256_fmadd_pd(a, a, bb), threshold, _CMP_LE_OQ);
                __m256d cmp2 = _mm256_cmp_pd(_mm256_fmadd_pd(a2, a2, bb2), threshold, _CMP_LE_OQ);
                a = _mm256_fmsub_pd(a, a, bb);
                a = _mm256_add_pd(a, cx);
                a2 = _mm256_fmsub_pd(a2, a2, bb2);
                a2 = _mm256_add_pd(a2, cx2);
                b = _mm256_fmadd_pd(two, ab, cy);
                b2 = _mm256_fmadd_pd(two, ab2, cy);
                if (info.smooth) {
                    // remember the last z value that was still inside the escape radius
                    resultsa = _mm256_blendv_pd(resultsa, a, cmp);
                    resultsb = _mm256_blendv_pd(resultsb, b, cmp);
                    resultsa2 = _mm256_blendv_pd(resultsa2, a2, cmp2);
                    resultsb2 = _mm256_blendv_pd(resultsb2, b2, cmp2);
                }
                adder = _mm256_and_pd(adder, cmp);
                adder2 = _mm256_and_pd(adder2, cmp2);
                counter = _mm256_add_pd(counter, adder);
                counter2 = _mm256_add_pd(counter2, adder2);
                if ((k & 0x7) == 0 && _mm256_testz_si256(_mm256_castpd_si256(cmp), _mm256_castpd_si256(cmp)) != 0 &&
                        _mm256_testz_si256(_mm256_castpd_si256(cmp2), _mm256_castpd_si256(cmp2)) != 0) {
                    break;
                }
            }
            auto alignVec = [](double* data) -> double* {
                void* aligned = data;
                ::size_t length = 64;
                std::align(32, 4 * sizeof(double), aligned, length);
                return static_cast<double*>(aligned);
            };

            double resData[8];
            double* ftRes = alignVec(resData);
            // read the smooth-coloring results directly out of the vector registers
            double* resa = (double*) &resultsa;
            double* resb = (double*) &resultsb;

            _mm256_store_pd(ftRes, counter);
            for (int k = 0; k < 4 && i + k < info.bWidth; k++) {
                if (info.smooth)
                    data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter :
                        ftRes[k] >= info.maxIter ? info.maxIter :
                        ((float) ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f);
                else
                    data[i + k + j * info.bWidth] = ftRes[k] > 0 ? float(ftRes[k]) : info.maxIter;
            }

            resa = (double*) &resultsa2;
            resb = (double*) &resultsb2;
            _mm256_store_pd(ftRes, counter2);
            i += 4;
            for (int k = 0; k < 4 && i + k < info.bWidth; k++) {
                if (info.smooth)
                    data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter :
                        ftRes[k] >= info.maxIter ? info.maxIter :
                        ((float) ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f);
                else
                    data[i + k + j * info.bWidth] = ftRes[k] > 0 ? float(ftRes[k]) : info.maxIter;
            }
            i -= 4;
        }
    }
}
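
// Double-double arithmetic helpers. The DoubleDouble kernel below represents each
// coordinate as an unevaluated sum of two doubles (a high and a low part), giving
// roughly twice the precision of a plain double. The error-free transforms that
// follow return a rounded result together with its exact rounding error.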
struct VecPair
{
    __m256d a;
    __m256d b;
};
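
// Fast two-sum/two-diff (Dekker): the error term is exact only if |a| >= |b| per lane.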
static inline VecPair quickTwoSum(__m256d a, __m256d b)
{
    __m256d s = _mm256_add_pd(a, b);
    __m256d e = _mm256_sub_pd(b, _mm256_sub_pd(s, a));
    return { s, e };
}

static inline VecPair quickTwoDiff(__m256d a, __m256d b)
{
    __m256d s = _mm256_sub_pd(a, b);
    __m256d e = _mm256_sub_pd(_mm256_sub_pd(a, s), b);
    return { s, e };
}
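
// Knuth's two-sum/two-diff: exact result and error without any assumption about
// the relative magnitudes of a and b.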
static inline VecPair twoSum(__m256d a, __m256d b)
{
    __m256d s = _mm256_add_pd(a, b);
    __m256d bb = _mm256_sub_pd(s, a);
    __m256d e = _mm256_add_pd(_mm256_sub_pd(a, _mm256_sub_pd(s, bb)), _mm256_sub_pd(b, bb));
    return { s, e };
}

static inline VecPair twoDiff(__m256d a, __m256d b)
{
    __m256d s = _mm256_sub_pd(a, b);
    __m256d bb = _mm256_sub_pd(s, a);
    __m256d e = _mm256_sub_pd(_mm256_sub_pd(a, _mm256_sub_pd(s, bb)), _mm256_add_pd(b, bb));
    return { s, e };
}
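
// Exact product via FMA: p = fl(a * b), and fmsub(a, b, p) = a*b - p recovers the
// rounding error in a single instruction.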
static inline VecPair twoProd(__m256d a, __m256d b)
{
    __m256d p = _mm256_mul_pd(a, b);
    __m256d e = _mm256_fmsub_pd(a, b, p);
    return { p, e };
}
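
// Four-lane vectorized double-double value: x[0] holds the high parts, x[1] the
// low parts. Only the operations needed by the iteration (+, -, *) are defined.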
struct AvxDoubleDouble
{
    __m256d x[2];

    inline AvxDoubleDouble(__m256d a, __m256d b) :
        x{ a, b }
    {}

    inline AvxDoubleDouble operator + (const AvxDoubleDouble& sm) const
    {
        auto [s, e] = twoSum(x[0], sm.x[0]);
        e = _mm256_add_pd(e, _mm256_add_pd(x[1], sm.x[1]));
        auto [r1, r2] = quickTwoSum(s, e);
        return AvxDoubleDouble{ r1, r2 };
    }

    inline AvxDoubleDouble operator - (const AvxDoubleDouble& sm) const
    {
        auto [s, e] = twoDiff(x[0], sm.x[0]);
        e = _mm256_add_pd(e, x[1]);
        e = _mm256_sub_pd(e, sm.x[1]);
        auto [r1, r2] = quickTwoSum(s, e);
        return AvxDoubleDouble{ r1, r2 };
    }

    inline AvxDoubleDouble operator * (const AvxDoubleDouble& sm) const
    {
        auto [p1, p2] = twoProd(x[0], sm.x[0]);
        p2 = _mm256_add_pd(p2,
            _mm256_add_pd(_mm256_mul_pd(sm.x[1], x[0]), _mm256_mul_pd(sm.x[0], x[1])));
        auto [r1, r2] = quickTwoSum(p1, p2);
        return AvxDoubleDouble{ r1, r2 };
    }
};
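
// Escape-time kernel using the vectorized double-double type above: four pixels
// per pass, one AvxDoubleDouble per coordinate. The escape test compares only the
// high parts of a^2 + b^2 against the threshold.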
template<bool parallel>
void CpuGenerator<mnd::DoubleDouble, mnd::X86_AVX_FMA, parallel>::generate(const mnd::MandelInfo& info, float* data)
{
    const MandelViewport& view = info.view;
    using T = DoubleDouble;

    T viewx = mnd::convert<T>(view.x);
    T viewy = mnd::convert<T>(view.y);
    T wpp = mnd::convert<T>(view.width / info.bWidth);
    T hpp = mnd::convert<T>(view.height / info.bHeight);

    T jX = mnd::convert<T>(info.juliaX);
    T jY = mnd::convert<T>(info.juliaY);
    AvxDoubleDouble juliaX = { __m256d{ jX.x[0], jX.x[0], jX.x[0], jX.x[0] }, __m256d{ jX.x[1], jX.x[1], jX.x[1], jX.x[1] } };
    AvxDoubleDouble juliaY = { __m256d{ jY.x[0], jY.x[0], jY.x[0], jY.x[0] }, __m256d{ jY.x[1], jY.x[1], jY.x[1], jY.x[1] } };

    if constexpr(parallel)
        omp_set_num_threads(omp_get_num_procs());
#pragma omp parallel for schedule(static, 1) if (parallel)
    for (long j = 0; j < info.bHeight; j++) {
        T y = viewy + T(double(j)) * hpp;
        __m256d y0s = { y.x[0], y.x[0], y.x[0], y.x[0] };
        __m256d y1s = { y.x[1], y.x[1], y.x[1], y.x[1] };
        AvxDoubleDouble ys{ y0s, y1s };
        for (long i = 0; i < info.bWidth; i += 4) {
            T x1 = viewx + T(double(i)) * wpp;
            T x2 = x1 + wpp;
            T x3 = x2 + wpp;
            T x4 = x3 + wpp;

            __m256d x0s = { x1.x[0], x2.x[0], x3.x[0], x4.x[0] };
            __m256d x1s = { x1.x[1], x2.x[1], x3.x[1], x4.x[1] };
            AvxDoubleDouble xs{ x0s, x1s };

            AvxDoubleDouble cx = info.julia ? juliaX : xs;
            AvxDoubleDouble cy = info.julia ? juliaY : ys;

            __m256d threshold = { 16.0, 16.0, 16.0, 16.0 };
            __m256d counter = { 0, 0, 0, 0 };
            __m256d adder = { 1, 1, 1, 1 };

            AvxDoubleDouble a = xs;
            AvxDoubleDouble b = ys;
            // last in-set z values for smooth coloring; zero-initialized so the
            // first blend does not read indeterminate data
            __m256d resultsa = { 0, 0, 0, 0 };
            __m256d resultsb = { 0, 0, 0, 0 };
            for (int k = 0; k < info.maxIter; k++) {
                AvxDoubleDouble aa = a * a;
                AvxDoubleDouble bb = b * b;
                AvxDoubleDouble abab = a * b;
                abab = abab + abab;
                a = aa - bb + cx;
                b = abab + cy;
                __m256d cmp = _mm256_cmp_pd(_mm256_add_pd(aa.x[0], bb.x[0]), threshold, _CMP_LE_OQ);
                if (info.smooth) {
                    resultsa = _mm256_blendv_pd(resultsa, a.x[0], cmp);
                    resultsb = _mm256_blendv_pd(resultsb, b.x[0], cmp);
                }
                adder = _mm256_and_pd(adder, cmp);
                counter = _mm256_add_pd(counter, adder);
                if (_mm256_testz_si256(_mm256_castpd_si256(cmp), _mm256_castpd_si256(cmp)) != 0) {
                    break;
                }
            }

            auto alignVec = [](double* data) -> double* {
                void* aligned = data;
                ::size_t length = 64;
                std::align(32, 4 * sizeof(double), aligned, length);
                return static_cast<double*>(aligned);
            };

            double resData[8];
            double* ftRes = alignVec(resData);
            double* resa = (double*) &resultsa;
            double* resb = (double*) &resultsb;
            _mm256_store_pd(ftRes, counter);

            for (int k = 0; k < 4 && i + k < info.bWidth; k++) {
                if (info.smooth)
                    data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter :
                        ftRes[k] >= info.maxIter ? info.maxIter :
                        ((float) ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f);
                else
                    data[i + k + j * info.bWidth] = ftRes[k] > 0 ? float(ftRes[k]) : info.maxIter;
            }
        }
    }
}