CpuGeneratorsAVX.cpp

#include "CpuGenerators.h"

#include <immintrin.h>
#include <omp.h>

#include <cmath>
#include <utility>
#include <memory>

using mnd::CpuGenerator;

namespace mnd
{
    template class CpuGenerator<float, mnd::X86_AVX, false>;
    template class CpuGenerator<float, mnd::X86_AVX, true>;
    template class CpuGenerator<double, mnd::X86_AVX, false>;
    template class CpuGenerator<double, mnd::X86_AVX, true>;
    template class CpuGenerator<DoubleDouble, mnd::X86_AVX, false>;
    template class CpuGenerator<DoubleDouble, mnd::X86_AVX, true>;
}
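
// AVX-vectorized Mandelbrot/Julia generators. This file provides three kernels:
// single precision (8 pixels per __m256), double precision (4 pixels per
// __m256d), and double-double precision built from pairs of __m256d vectors.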
template<bool parallel>
void CpuGenerator<float, mnd::X86_AVX, parallel>::generate(const mnd::MandelInfo& info, float* data)
{
    using T = float;
    const MandelViewport& view = info.view;

    const float dppf = float(view.width / info.bWidth);
    const float viewxf = float(view.x);
    __m256 viewx = { viewxf, viewxf, viewxf, viewxf, viewxf, viewxf, viewxf, viewxf };
    __m256 dpp = { dppf, dppf, dppf, dppf, dppf, dppf, dppf, dppf };

    T jX = mnd::convert<T>(info.juliaX);
    T jY = mnd::convert<T>(info.juliaY);
    __m256 juliaX = { jX, jX, jX, jX, jX, jX, jX, jX };
    __m256 juliaY = { jY, jY, jY, jY, jY, jY, jY, jY };

    if constexpr (parallel)
        omp_set_num_threads(omp_get_num_procs());
#pragma omp parallel for schedule(static, 1) if (parallel)
    for (long j = 0; j < info.bHeight; j++) {
        T y = T(view.y) + T(j) * T(view.height / info.bHeight);
        __m256 ys = { y, y, y, y, y, y, y, y };
        long i = 0;
        for (; i < info.bWidth; i += 8) {
            __m256 pixc = { float(i), float(i + 1), float(i + 2), float(i + 3), float(i + 4), float(i + 5), float(i + 6), float(i + 7) };
            __m256 xs = _mm256_add_ps(_mm256_mul_ps(dpp, pixc), viewx);

            __m256 counter = { 0, 0, 0, 0, 0, 0, 0, 0 };
            __m256 adder = { 1, 1, 1, 1, 1, 1, 1, 1 };
            __m256 resultsa = { 0, 0, 0, 0, 0, 0, 0, 0 };
            __m256 resultsb = { 0, 0, 0, 0, 0, 0, 0, 0 };

            __m256 threshold = { 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f };

            __m256 a = xs;
            __m256 b = ys;
            __m256 cx = info.julia ? juliaX : xs;
            __m256 cy = info.julia ? juliaY : ys;
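
            // Iterate z -> z^2 + c on all eight lanes at once. 'cmp' masks the
            // lanes that are still below the escape threshold (|z|^2 <= 16);
            // 'adder' is zeroed for escaped lanes so 'counter' stops advancing
            // for them. Every 8 iterations, bail out early if no lane is still
            // active. In the smooth variant, resultsa/resultsb remember the last
            // z of each lane before it escaped, for continuous coloring below.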
            if (info.smooth) {
                for (int k = 0; k < info.maxIter; k++) {
                    __m256 aa = _mm256_mul_ps(a, a);
                    __m256 bb = _mm256_mul_ps(b, b);
                    __m256 abab = _mm256_mul_ps(a, b); abab = _mm256_add_ps(abab, abab);
                    a = _mm256_add_ps(_mm256_sub_ps(aa, bb), cx);
                    b = _mm256_add_ps(abab, cy);
                    __m256 cmp = _mm256_cmp_ps(_mm256_add_ps(aa, bb), threshold, _CMP_LE_OQ);
                    resultsa = _mm256_or_ps(_mm256_andnot_ps(cmp, resultsa), _mm256_and_ps(cmp, a));
                    resultsb = _mm256_or_ps(_mm256_andnot_ps(cmp, resultsb), _mm256_and_ps(cmp, b));
                    adder = _mm256_and_ps(adder, cmp);
                    counter = _mm256_add_ps(counter, adder);
                    if ((k & 0x7) == 0 && _mm256_testz_ps(cmp, cmp) != 0) {
                        break;
                    }
                }
            }
            else {
                for (int k = 0; k < info.maxIter; k++) {
                    __m256 aa = _mm256_mul_ps(a, a);
                    __m256 bb = _mm256_mul_ps(b, b);
                    __m256 abab = _mm256_mul_ps(a, b); abab = _mm256_add_ps(abab, abab);
                    a = _mm256_add_ps(_mm256_sub_ps(aa, bb), cx);
                    b = _mm256_add_ps(abab, cy);
                    __m256 cmp = _mm256_cmp_ps(_mm256_add_ps(aa, bb), threshold, _CMP_LE_OQ);
                    adder = _mm256_and_ps(adder, cmp);
                    counter = _mm256_add_ps(counter, adder);
                    if ((k & 0x7) == 0 && _mm256_testz_ps(cmp, cmp) != 0) {
                        break;
                    }
                }
            }
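
            // Write back the per-lane iteration counts. For smooth coloring the
            // stored value is the continuous iteration count
            //     n + 1 - log(log(|z|^2) / 2) / log(2);
            // a count of 0 (or >= maxIter) means the point never escaped.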
            auto alignVec = [](float* data) -> float* {
                void* aligned = data;
                ::size_t length = 64;
                std::align(32, 8 * sizeof(float), aligned, length);
                return static_cast<float*>(aligned);
            };

            float resData[16];
            float* ftRes = alignVec(resData);
            float* resa = (float*) &resultsa;
            float* resb = (float*) &resultsb;
            _mm256_store_ps(ftRes, counter);
            for (int k = 0; k < 8 && i + k < info.bWidth; k++) {
                if (info.smooth) {
                    data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter :
                        ftRes[k] >= info.maxIter ? info.maxIter :
                        ((float) ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f);
                }
                else {
                    data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter : ftRes[k];
                }
            }
        }
    }
}
template<bool parallel>
void CpuGenerator<double, mnd::X86_AVX, parallel>::generate(const mnd::MandelInfo& info, float* data)
{
    using T = double;
    const MandelViewport& view = info.view;

    const double dppf = double(view.width / info.bWidth);
    const double viewxf = double(view.x);
    __m256d viewx = { viewxf, viewxf, viewxf, viewxf };
    __m256d dpp = { dppf, dppf, dppf, dppf };

    T jX = mnd::convert<T>(info.juliaX);
    T jY = mnd::convert<T>(info.juliaY);
    __m256d juliaX = { jX, jX, jX, jX };
    __m256d juliaY = { jY, jY, jY, jY };

    if constexpr (parallel)
        omp_set_num_threads(omp_get_num_procs());
#pragma omp parallel for schedule(static, 1) if (parallel)
    for (long j = 0; j < info.bHeight; j++) {
        T y = T(view.y + T(j) * view.height / info.bHeight);
        __m256d ys = { y, y, y, y };
        long i = 0;
        for (; i < info.bWidth; i += 4) {
            __m256d pixc = { double(i), double(i + 1), double(i + 2), double(i + 3) };
            __m256d xs = _mm256_add_pd(_mm256_mul_pd(dpp, pixc), viewx);

            int itRes[4] = { 0, 0, 0, 0 };

            __m256d threshold = { 16.0, 16.0, 16.0, 16.0 };
            __m256d counter = { 0, 0, 0, 0 };
            __m256d adder = { 1, 1, 1, 1 };
            __m256d resultsa = { 0, 0, 0, 0 };
            __m256d resultsb = { 0, 0, 0, 0 };

            __m256d a = xs;
            __m256d b = ys;
            __m256d cx = info.julia ? juliaX : xs;
            __m256d cy = info.julia ? juliaY : ys;
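
            // Same masked-iteration scheme as the float kernel, but with four
            // double-precision pixels per vector. The early-exit test reinterprets
            // the comparison mask as integers and uses _mm256_testz_si256 to check
            // whether all lanes have escaped; it runs every 4 iterations.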
            if (info.smooth) {
                for (int k = 0; k < info.maxIter; k++) {
                    __m256d aa = _mm256_mul_pd(a, a);
                    __m256d bb = _mm256_mul_pd(b, b);
                    __m256d abab = _mm256_mul_pd(a, b); abab = _mm256_add_pd(abab, abab);
                    a = _mm256_add_pd(_mm256_sub_pd(aa, bb), cx);
                    b = _mm256_add_pd(abab, cy);
                    __m256d cmp = _mm256_cmp_pd(_mm256_add_pd(aa, bb), threshold, _CMP_LE_OQ);
                    resultsa = _mm256_or_pd(_mm256_andnot_pd(cmp, resultsa), _mm256_and_pd(cmp, a));
                    resultsb = _mm256_or_pd(_mm256_andnot_pd(cmp, resultsb), _mm256_and_pd(cmp, b));
                    adder = _mm256_and_pd(adder, cmp);
                    counter = _mm256_add_pd(counter, adder);
                    if ((k & 0x3) == 0 && _mm256_testz_si256(_mm256_castpd_si256(cmp), _mm256_castpd_si256(cmp)) != 0) {
                        break;
                    }
                }
            }
            else {
                for (int k = 0; k < info.maxIter; k++) {
                    __m256d aa = _mm256_mul_pd(a, a);
                    __m256d bb = _mm256_mul_pd(b, b);
                    __m256d abab = _mm256_mul_pd(a, b); abab = _mm256_add_pd(abab, abab);
                    a = _mm256_add_pd(_mm256_sub_pd(aa, bb), cx);
                    b = _mm256_add_pd(abab, cy);
                    __m256d cmp = _mm256_cmp_pd(_mm256_add_pd(aa, bb), threshold, _CMP_LE_OQ);
                    adder = _mm256_and_pd(adder, cmp);
                    counter = _mm256_add_pd(counter, adder);
                    if ((k & 0x3) == 0 && _mm256_testz_si256(_mm256_castpd_si256(cmp), _mm256_castpd_si256(cmp)) != 0) {
                        break;
                    }
                }
            }
            auto alignVec = [](double* data) -> double* {
                void* aligned = data;
                ::size_t length = 64;
                std::align(32, 4 * sizeof(double), aligned, length);
                return static_cast<double*>(aligned);
            };

            double resData[8];
            double* ftRes = alignVec(resData);
            double* resa = (double*) &resultsa;
            double* resb = (double*) &resultsb;
            _mm256_store_pd(ftRes, counter);
            for (int k = 0; k < 4 && i + k < info.bWidth; k++) {
                if (info.smooth)
                    data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter :
                        ftRes[k] >= info.maxIter ? info.maxIter :
                        ((float) ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f);
                else
                    data[i + k + j * info.bWidth] = ftRes[k] > 0 ? float(ftRes[k]) : info.maxIter;
            }
        }
    }
}
struct VecPair
{
    __m256d a;
    __m256d b;
};
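
// Error-free transformations (Dekker/Knuth style) on whole AVX vectors: each
// returns an exact result as an unevaluated sum (s + e) of two doubles per
// lane. They are the building blocks for the double-double arithmetic below.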
static inline VecPair quickTwoSum(__m256d a, __m256d b)
{
    __m256d s = _mm256_add_pd(a, b);
    __m256d e = _mm256_sub_pd(b, _mm256_sub_pd(s, a));
    return { s, e };
}

static inline VecPair quickTwoDiff(__m256d a, __m256d b)
{
    __m256d s = _mm256_sub_pd(a, b);
    __m256d e = _mm256_sub_pd(_mm256_sub_pd(a, s), b);
    return { s, e };
}

static inline VecPair twoSum(__m256d a, __m256d b)
{
    __m256d s = _mm256_add_pd(a, b);
    __m256d bb = _mm256_sub_pd(s, a);
    __m256d e = _mm256_add_pd(_mm256_sub_pd(a, _mm256_sub_pd(s, bb)), _mm256_sub_pd(b, bb));
    return { s, e };
}

static inline VecPair twoDiff(__m256d a, __m256d b)
{
    __m256d s = _mm256_sub_pd(a, b);
    __m256d bb = _mm256_sub_pd(s, a);
    __m256d e = _mm256_sub_pd(_mm256_sub_pd(a, _mm256_sub_pd(s, bb)), _mm256_add_pd(b, bb));
    return { s, e };
}

static inline VecPair split(__m256d a)
{
    /*
    // -- this should never happen when doing mandelbrot calculations,
    // so we omit this check.
    if (a > _QD_SPLIT_THRESH || a < -_QD_SPLIT_THRESH) {
        a *= 3.7252902984619140625e-09;  // 2^-28
        temp = _QD_SPLITTER * a;
        hi = temp - (temp - a);
        lo = a - hi;
        hi *= 268435456.0;  // 2^28
        lo *= 268435456.0;  // 2^28
    } else {
        temp = _QD_SPLITTER * a;
        hi = temp - (temp - a);
        lo = a - hi;
    }
    */
    static const __m256d SPLITTER = { 134217729.0, 134217729.0, 134217729.0, 134217729.0 };
    __m256d temp = _mm256_mul_pd(SPLITTER, a);
    __m256d hi = _mm256_sub_pd(temp, _mm256_sub_pd(temp, a));
    __m256d lo = _mm256_sub_pd(a, hi);
    return { hi, lo };
}

static inline VecPair twoProd(__m256d a, __m256d b)
{
    __m256d p = _mm256_mul_pd(a, b);
    auto [a_hi, a_lo] = split(a);
    auto [b_hi, b_lo] = split(b);
    __m256d err = _mm256_add_pd(_mm256_add_pd(_mm256_sub_pd(_mm256_mul_pd(a_hi, b_hi), p), _mm256_add_pd(_mm256_mul_pd(a_hi, b_lo), _mm256_mul_pd(a_lo, b_hi))), _mm256_mul_pd(a_lo, b_lo));
    return { p, err };
}
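
// A double-double number per SIMD lane: x[0] holds the high part and x[1] the
// low part, so each lane's value is x[0] + x[1] with roughly twice the
// precision of a plain double. Only +, - and * are needed for the Mandelbrot
// iteration.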
struct AvxDoubleDouble
{
    __m256d x[2];

    inline AvxDoubleDouble(__m256d a, __m256d b) :
        x{ a, b }
    {}

    inline AvxDoubleDouble operator + (const AvxDoubleDouble& sm) const
    {
        auto [s, e] = twoSum(x[0], sm.x[0]);
        e = _mm256_add_pd(e, _mm256_add_pd(x[1], sm.x[1]));
        auto [r1, r2] = quickTwoSum(s, e);
        return AvxDoubleDouble{ r1, r2 };
    }

    inline AvxDoubleDouble operator - (const AvxDoubleDouble& sm) const
    {
        auto [s, e] = twoDiff(x[0], sm.x[0]);
        e = _mm256_add_pd(e, x[1]);
        e = _mm256_sub_pd(e, sm.x[1]);
        auto [r1, r2] = quickTwoSum(s, e);
        return AvxDoubleDouble{ r1, r2 };
    }

    inline AvxDoubleDouble operator * (const AvxDoubleDouble& sm) const
    {
        auto [p1, p2] = twoProd(this->x[0], sm.x[0]);
        p2 = _mm256_add_pd(p2,
            _mm256_add_pd(_mm256_mul_pd(sm.x[1], x[0]), _mm256_mul_pd(sm.x[0], x[1])));
        auto [r1, r2] = quickTwoSum(p1, p2);
        return AvxDoubleDouble{ r1, r2 };
    }
};
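
// Double-double generator: same structure as the other kernels, but each
// coordinate is carried as an AvxDoubleDouble (four lanes of hi/lo pairs), so
// it remains usable at zoom depths where plain doubles run out of precision.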
template<bool parallel>
void CpuGenerator<mnd::DoubleDouble, mnd::X86_AVX, parallel>::generate(const mnd::MandelInfo& info, float* data)
{
    const MandelViewport& view = info.view;

    using T = DoubleDouble;

    T viewx = mnd::convert<T>(view.x);
    T viewy = mnd::convert<T>(view.y);
    T wpp = mnd::convert<T>(view.width / info.bWidth);
    T hpp = mnd::convert<T>(view.height / info.bHeight);

    T jX = mnd::convert<T>(info.juliaX);
    T jY = mnd::convert<T>(info.juliaY);
    AvxDoubleDouble juliaX = { __m256d{ jX.x[0], jX.x[0], jX.x[0], jX.x[0] }, __m256d{ jX.x[1], jX.x[1], jX.x[1], jX.x[1] } };
    AvxDoubleDouble juliaY = { __m256d{ jY.x[0], jY.x[0], jY.x[0], jY.x[0] }, __m256d{ jY.x[1], jY.x[1], jY.x[1], jY.x[1] } };

    if constexpr (parallel)
        omp_set_num_threads(omp_get_num_procs());
#pragma omp parallel for schedule(static, 1) if (parallel)
    for (long j = 0; j < info.bHeight; j++) {
        T y = viewy + T(double(j)) * hpp;
        __m256d y0s = { y.x[0], y.x[0], y.x[0], y.x[0] };
        __m256d y1s = { y.x[1], y.x[1], y.x[1], y.x[1] };
        AvxDoubleDouble ys{ y0s, y1s };
        long i = 0;
        for (; i < info.bWidth; i += 4) {
            T x1 = viewx + T(double(i)) * wpp;
            T x2 = x1 + wpp;
            T x3 = x2 + wpp;
            T x4 = x3 + wpp;

            __m256d x0s = {
                x1.x[0], x2.x[0], x3.x[0], x4.x[0],
            };
            __m256d x1s = {
                x1.x[1], x2.x[1], x3.x[1], x4.x[1],
            };

            AvxDoubleDouble xs{ x0s, x1s };

            AvxDoubleDouble cx = info.julia ? juliaX : xs;
            AvxDoubleDouble cy = info.julia ? juliaY : ys;

            int itRes[4] = { 0, 0, 0, 0 };

            __m256d threshold = { 16.0, 16.0, 16.0, 16.0 };
            __m256d counter = { 0, 0, 0, 0 };
            __m256d adder = { 1, 1, 1, 1 };

            AvxDoubleDouble a = xs;
            AvxDoubleDouble b = ys;
            // Initialize to zero so escaped lanes read a defined value even if
            // they escape on the very first iteration.
            __m256d resultsa = _mm256_setzero_pd();
            __m256d resultsb = _mm256_setzero_pd();
            for (int k = 0; k < info.maxIter; k++) {
                AvxDoubleDouble aa = a * a;
                AvxDoubleDouble bb = b * b;
                AvxDoubleDouble abab = a * b; abab = abab + abab;
                a = aa - bb + cx;
                b = abab + cy;
                __m256d cmp = _mm256_cmp_pd(_mm256_add_pd(aa.x[0], bb.x[0]), threshold, _CMP_LE_OQ);
                if (info.smooth) {
                    resultsa = _mm256_or_pd(_mm256_andnot_pd(cmp, resultsa), _mm256_and_pd(cmp, a.x[0]));
                    resultsb = _mm256_or_pd(_mm256_andnot_pd(cmp, resultsb), _mm256_and_pd(cmp, b.x[0]));
                }
                adder = _mm256_and_pd(adder, cmp);
                counter = _mm256_add_pd(counter, adder);
                if (_mm256_testz_si256(_mm256_castpd_si256(cmp), _mm256_castpd_si256(cmp)) != 0) {
                    break;
                }
            }

            auto alignVec = [](double* data) -> double* {
                void* aligned = data;
                ::size_t length = 64;
                std::align(32, 4 * sizeof(double), aligned, length);
                return static_cast<double*>(aligned);
            };

            double resData[8];
            double* ftRes = alignVec(resData);
            double* resa = (double*) &resultsa;
            double* resb = (double*) &resultsb;
            _mm256_store_pd(ftRes, counter);

            for (int k = 0; k < 4 && i + k < info.bWidth; k++) {
                if (info.smooth)
                    data[i + k + j * info.bWidth] = ftRes[k] <= 0 ? info.maxIter :
                        ftRes[k] >= info.maxIter ? info.maxIter :
                        ((float) ftRes[k]) + 1 - ::log(::log(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::log(2.0f);
                else
                    data[i + k + j * info.bWidth] = ftRes[k] > 0 ? float(ftRes[k]) : info.maxIter;
            }
        }
    }
}