CpuGeneratorsNeon.cpp

#include "CpuGenerators.h"
#include "LightDoubleDouble.h"

#include <omp.h>
#include <arm_neon.h>
#include <memory>

using mnd::CpuGenerator;

namespace mnd
{
    template class CpuGenerator<float, mnd::ARM_NEON, false>;
    template class CpuGenerator<float, mnd::ARM_NEON, true>;
    template class CpuGenerator<double, mnd::ARM_NEON, false>;
    template class CpuGenerator<double, mnd::ARM_NEON, true>;
    template class CpuGenerator<mnd::DoubleDouble, mnd::ARM_NEON, false>;
    template class CpuGenerator<mnd::DoubleDouble, mnd::ARM_NEON, true>;
}

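// The explicit instantiations above ensure that the NEON generator specializations are
// emitted in this translation unit. How a generator is obtained by the rest of the
// project is not shown here; a minimal sketch (assuming a specialization can be
// constructed directly, which may not match the project's actual factory code):
//
//     mnd::CpuGenerator<float, mnd::ARM_NEON, true> gen;
//     std::vector<float> buf(info.bWidth * info.bHeight);
//     gen.generate(info, buf.data());
//
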
template<bool parallel>
void CpuGenerator<float, mnd::ARM_NEON, parallel>::generate(const mnd::MandelInfo& info, float* data)
{
    using T = float;
    const MandelViewport& view = info.view;

    float32x4_t juliaX = vmovq_n_f32(float(info.juliaX));
    float32x4_t juliaY = vmovq_n_f32(float(info.juliaY));
#if defined(_OPENMP)
    if constexpr(parallel)
        omp_set_num_threads(omp_get_num_procs());
    // rows can differ a lot in cost, so hand them out round-robin (chunk size 1) for better load balance
#   pragma omp parallel for schedule(static, 1) if (parallel)
#endif
    for (long j = 0; j < info.bHeight; j++) {
        T y = T(view.y) + T(j) * T(view.height / info.bHeight);
        for (long i = 0; i < info.bWidth; i += 4) {
            float xsvals[] = {
                float(view.x + float(i) * view.width / info.bWidth),
                float(view.x + float(i + 1) * view.width / info.bWidth),
                float(view.x + float(i + 2) * view.width / info.bWidth),
                float(view.x + float(i + 3) * view.width / info.bWidth)
            };

            float32x4_t xs = vld1q_f32(xsvals);

            uint32x4_t counter = vmovq_n_u32(0);
            uint32x4_t adder = vmovq_n_u32(1);
            //uint32x4_t ones = vmovq_n_u32(1);

            float32x4_t threshold = vmovq_n_f32(16);

            float32x4_t ys = vmovq_n_f32(y);
            float32x4_t a = xs;
            float32x4_t b = ys;
            float32x4_t cx = info.julia ? juliaX : xs;
            float32x4_t cy = info.julia ? juliaY : ys;

            for (int k = 0; k < info.maxIter; k++) {
                float32x4_t aa = vmulq_f32(a, a);
                float32x4_t bb = vmulq_f32(b, b);
                float32x4_t abab = vmulq_f32(a, b); abab = vaddq_f32(abab, abab);
                // lanes that have not yet escaped (|z|^2 <= threshold) are all-ones in cmp
                uint32x4_t cmp = vcleq_f32(vaddq_f32(aa, bb), threshold);
                if (info.smooth) {
                    // masked update: lanes that already escaped keep their old a/b,
                    // which the smoothing formula below needs
                    float32x4_t tempa = vaddq_f32(vsubq_f32(aa, bb), cx);
                    float32x4_t tempb = vaddq_f32(abab, cy);
                    a = vreinterpretq_f32_u32(vorrq_u32(vandq_u32(cmp, vreinterpretq_u32_f32(tempa)), vandq_u32(vmvnq_u32(cmp), vreinterpretq_u32_f32(a))));
                    b = vreinterpretq_f32_u32(vorrq_u32(vandq_u32(cmp, vreinterpretq_u32_f32(tempb)), vandq_u32(vmvnq_u32(cmp), vreinterpretq_u32_f32(b))));
                }
                else {
                    a = vaddq_f32(vsubq_f32(aa, bb), cx);
                    b = vaddq_f32(abab, cy);
                }
                // adder stays 1 only for lanes that are still iterating
                adder = vandq_u32(adder, cmp);
                counter = vaddq_u32(counter, adder);
                // checking the break criterion is possibly expensive, so only do it every 8 iterations
                if ((k & 0x7) == 0) {
                    /* // ARM-v7 method
                    uint32x2_t allZero = vorr_u32(vget_low_u32(cmp), vget_high_u32(cmp));
                    if (vget_lane_u32(vpmax_u32(allZero, allZero), 0) == 0) {
                        break;
                    }
                    */
                    uint32_t allZero = vaddvq_u32(cmp);
                    if (allZero == 0) {
                        break;
                    }
                }
            }

            uint32_t resData[4];
            float resa[4];
            float resb[4];
            vst1q_u32(resData, counter);
            vst1q_f32(resa, a);
            vst1q_f32(resb, b);

            for (int k = 0; k < 4 && i + k < info.bWidth; k++) {
                if (info.smooth)
                    // smooth iteration count: n + 1 - log2(log |z|)
                    data[i + k + j * info.bWidth] = resData[k] == 0 ? info.maxIter :
                        resData[k] >= info.maxIter ? info.maxIter :
                        ((float) resData[k]) + 1 - ::logf(::logf(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::logf(2.0f);
                else
                    data[i + k + j * info.bWidth] = resData[k] > 0 ? float(resData[k]) : info.maxIter;
            }
        }
    }
}

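/*
 * Rough scalar sketch (illustration only, not compiled) of what a single SIMD lane of
 * the generator above computes, with the same escape radius of 16:
 *
 *     float a = x, b = y;
 *     int n = 0;
 *     while (n < maxIter && a * a + b * b <= 16.0f) {
 *         float aa = a * a, bb = b * b, abab = 2.0f * a * b;
 *         a = aa - bb + cx;
 *         b = abab + cy;
 *         n++;
 *     }
 *
 * The vector code cannot break out of the loop per lane, so instead it masks the
 * per-lane "adder" to zero once a lane escapes and keeps iterating until all lanes
 * are finished (or maxIter is reached).
 */
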
template<bool parallel>
void CpuGenerator<double, mnd::ARM_NEON, parallel>::generate(const mnd::MandelInfo& info, float* data)
{
    using T = double;
    const MandelViewport& view = info.view;

    float64x2_t juliaX = vmovq_n_f64(double(info.juliaX));
    float64x2_t juliaY = vmovq_n_f64(double(info.juliaY));
#if defined(_OPENMP)
    if constexpr(parallel)
        omp_set_num_threads(omp_get_num_procs());
#   pragma omp parallel for schedule(static, 1) if (parallel)
#endif
    for (long j = 0; j < info.bHeight; j++) {
        T y = T(view.y) + T(j) * T(view.height / info.bHeight);
        for (long i = 0; i < info.bWidth; i += 2) {
            double xsvals[] = {
                double(view.x + double(i) * view.width / info.bWidth),
                double(view.x + double(i + 1) * view.width / info.bWidth),
            };

            float64x2_t xs = vld1q_f64(xsvals);

            uint64x2_t counter = vmovq_n_u64(0);
            uint64x2_t adder = vmovq_n_u64(1);
            //uint32x4_t ones = vmovq_n_u32(1);

            float64x2_t threshold = vmovq_n_f64(16);

            float64x2_t ys = vmovq_n_f64(y);
            float64x2_t a = xs;
            float64x2_t b = ys;
            float64x2_t cx = info.julia ? juliaX : xs;
            float64x2_t cy = info.julia ? juliaY : ys;

            for (int k = 0; k < info.maxIter; k++) {
                float64x2_t aa = vmulq_f64(a, a);
                float64x2_t bb = vmulq_f64(b, b);
                float64x2_t abab = vmulq_f64(a, b); abab = vaddq_f64(abab, abab);
                //a = vaddq_f64(vsubq_f64(aa, bb), xs);
                //b = vaddq_f64(abab, ys);
                uint64x2_t cmp = vcleq_f64(vaddq_f64(aa, bb), threshold);
                if (info.smooth) {
                    // masked update as in the float version; there is no vmvnq_u64,
                    // so the mask is inverted through a u32 reinterpretation
                    float64x2_t tempa = vaddq_f64(vsubq_f64(aa, bb), cx);
                    float64x2_t tempb = vaddq_f64(abab, cy);
                    a = vreinterpretq_f64_u64(vorrq_u64(vandq_u64(cmp, vreinterpretq_u64_f64(tempa)), vandq_u64(vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(cmp))), vreinterpretq_u64_f64(a))));
                    b = vreinterpretq_f64_u64(vorrq_u64(vandq_u64(cmp, vreinterpretq_u64_f64(tempb)), vandq_u64(vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(cmp))), vreinterpretq_u64_f64(b))));
                }
                else {
                    a = vaddq_f64(vsubq_f64(aa, bb), cx);
                    b = vaddq_f64(abab, cy);
                }
                adder = vandq_u64(adder, cmp);
                counter = vaddq_u64(counter, adder);
                // checking the break criterion is possibly expensive, so only do it every 8 iterations
                if ((k & 0x7) == 0) {
                    /* // ARM-v7 method
                    uint32x2_t allZero = vorr_u32(vget_low_u32(cmp), vget_high_u32(cmp));
                    if (vget_lane_u32(vpmax_u32(allZero, allZero), 0) == 0) {
                        break;
                    }
                    */
                    uint64_t allZero = vaddvq_u64(cmp);
                    if (allZero == 0) {
                        break;
                    }
                }
            }

            uint64_t resData[2];
            double resa[2];
            double resb[2];
            vst1q_u64(resData, counter);
            vst1q_f64(resa, a);
            vst1q_f64(resb, b);

            for (int k = 0; k < 2 && i + k < info.bWidth; k++) {
                if (info.smooth)
                    data[i + k + j * info.bWidth] = resData[k] == 0 ? info.maxIter :
                        resData[k] >= info.maxIter ? info.maxIter :
                        ((float) resData[k]) + 1 - ::logf(::logf(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::logf(2.0f);
                else
                    data[i + k + j * info.bWidth] = resData[k] > 0 ? float(resData[k]) : info.maxIter;
            }
        }
    }
}

struct VecPair
{
    float64x2_t a;
    float64x2_t b;
};

static inline VecPair quickTwoSum(float64x2_t a, float64x2_t b)
{
    float64x2_t s = vaddq_f64(a, b);
    float64x2_t e = vsubq_f64(b, vsubq_f64(s, a));
    return { s, e };
}

static inline VecPair quickTwoDiff(float64x2_t a, float64x2_t b)
{
    float64x2_t s = vsubq_f64(a, b);
    float64x2_t e = vsubq_f64(vsubq_f64(a, s), b);
    return { s, e };
}

static inline VecPair twoSum(float64x2_t a, float64x2_t b)
{
    float64x2_t s = vaddq_f64(a, b);
    float64x2_t bb = vsubq_f64(s, a);
    float64x2_t e = vaddq_f64(vsubq_f64(a, vsubq_f64(s, bb)), vsubq_f64(b, bb));
    return { s, e };
}

static inline VecPair twoDiff(float64x2_t a, float64x2_t b)
{
    float64x2_t s = vsubq_f64(a, b);
    float64x2_t bb = vsubq_f64(s, a);
    float64x2_t e = vsubq_f64(vsubq_f64(a, vsubq_f64(s, bb)), vaddq_f64(b, bb));
    return { s, e };
}

static inline VecPair split(float64x2_t a)
{
    /*
    // -- this should never happen when doing mandelbrot calculations,
    // so we omit this check.
    if (a > _QD_SPLIT_THRESH || a < -_QD_SPLIT_THRESH) {
        a *= 3.7252902984619140625e-09; // 2^-28
        temp = _QD_SPLITTER * a;
        hi = temp - (temp - a);
        lo = a - hi;
        hi *= 268435456.0; // 2^28
        lo *= 268435456.0; // 2^28
    } else {
        temp = _QD_SPLITTER * a;
        hi = temp - (temp - a);
        lo = a - hi;
    }
    */
    static const float64x2_t SPLITTER = vmovq_n_f64(134217729.0);
    float64x2_t temp = vmulq_f64(SPLITTER, a);
    float64x2_t hi = vsubq_f64(temp, vsubq_f64(temp, a));
    float64x2_t lo = vsubq_f64(a, hi);
    return { hi, lo };
}

static inline VecPair twoProd(float64x2_t a, float64x2_t b)
{
    float64x2_t p = vmulq_f64(a, b);
    auto [a_hi, a_lo] = split(a);
    auto [b_hi, b_lo] = split(b);
    float64x2_t err = vaddq_f64(vaddq_f64(vsubq_f64(vmulq_f64(a_hi, b_hi), p), vaddq_f64(vmulq_f64(a_hi, b_lo), vmulq_f64(a_lo, b_hi))), vmulq_f64(a_lo, b_lo));
    return { p, err };
}

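// twoSum/twoDiff/twoProd are the standard error-free transformations used for
// double-double arithmetic: for the pair (p, err) returned by twoProd, a * b == p + err
// holds exactly in double precision (barring overflow), because split() performs
// Dekker's splitting with SPLITTER = 2^27 + 1 = 134217729. A scalar sketch of the same
// identity (illustration only; assumes no intermediate over- or underflow):
//
//     double p = a * b;
//     double err = fma(a, b, -p);   // with a hardware FMA this yields the same error term
//
// The NEON version recomputes the error term via split() instead of relying on an FMA
// instruction being available.
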
struct NeonDoubleDouble
{
    float64x2_t x[2];

    inline NeonDoubleDouble(const float64x2_t& a, const float64x2_t& b) :
        x{ a, b }
    {}

    inline NeonDoubleDouble(double a, double b) :
        x{ vmovq_n_f64(a), vmovq_n_f64(b) }
    {}

    inline NeonDoubleDouble operator + (const NeonDoubleDouble& sm) const
    {
        auto [s, e] = twoSum(x[0], sm.x[0]);
        e = vaddq_f64(e, vaddq_f64(x[1], sm.x[1]));
        auto [r1, r2] = quickTwoSum(s, e);
        return NeonDoubleDouble{ r1, r2 };
    }

    inline NeonDoubleDouble operator - (const NeonDoubleDouble& sm) const
    {
        auto [s, e] = twoDiff(x[0], sm.x[0]);
        e = vaddq_f64(e, x[1]);
        e = vsubq_f64(e, sm.x[1]);
        auto [r1, r2] = quickTwoSum(s, e);
        return NeonDoubleDouble{ r1, r2 };
    }

    inline NeonDoubleDouble operator * (const NeonDoubleDouble& sm) const
    {
        auto [p1, p2] = twoProd(this->x[0], sm.x[0]);
        p2 = vaddq_f64(p2,
             vaddq_f64(vmulq_f64(sm.x[1], x[0]), vmulq_f64(sm.x[0], x[1])) );
        auto [r1, r2] = quickTwoSum(p1, p2);
        return NeonDoubleDouble{ r1, r2 };
    }
};

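// NeonDoubleDouble holds two double-double numbers, one per 64-bit lane: x[0] carries
// the high parts and x[1] the low parts, so each lane represents hi + lo with roughly
// twice the precision of a plain double (~106 significand bits). A scalar analogue of
// operator+ (illustration only, mirroring the vector helpers above):
//
//     auto [s, e] = twoSum(a.hi, b.hi);
//     e += a.lo + b.lo;
//     auto [hi, lo] = quickTwoSum(s, e);
//     return { hi, lo };
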
template<bool parallel>
void CpuGenerator<mnd::DoubleDouble, mnd::ARM_NEON, parallel>::generate(const mnd::MandelInfo& info, float* data)
{
    const MandelViewport& view = info.view;

    using T = LightDoubleDouble;

    T viewx = mnd::convert<T>(view.x);
    T viewy = mnd::convert<T>(view.y);
    T wpp = mnd::convert<T>(view.width / info.bWidth);
    T hpp = mnd::convert<T>(view.height / info.bHeight);

    T jX = mnd::convert<T>(info.juliaX);
    T jY = mnd::convert<T>(info.juliaY);
    NeonDoubleDouble juliaX = { jX[0], jX[1] };
    NeonDoubleDouble juliaY = { jY[0], jY[1] };
#if defined(_OPENMP)
    if constexpr(parallel)
        omp_set_num_threads(omp_get_num_procs());
#   pragma omp parallel for schedule(static, 1) if (parallel)
#endif
    for (long j = 0; j < info.bHeight; j++) {
        T y = viewy + T(double(j)) * hpp;
        NeonDoubleDouble ys{ y[0], y[1] };
        for (long i = 0; i < info.bWidth; i += 2) {
            T x1 = viewx + T(double(i)) * wpp;
            T x2 = x1 + wpp;
            double xarr1[] = { x1[0], x2[0] };
            double xarr2[] = { x1[1], x2[1] };
            float64x2_t x1s = vld1q_f64(xarr1);
            float64x2_t x2s = vld1q_f64(xarr2);

            NeonDoubleDouble xs{ x1s, x2s };
            NeonDoubleDouble cx = info.julia ? juliaX : xs;
            NeonDoubleDouble cy = info.julia ? juliaY : ys;

            float64x2_t threshold = vmovq_n_f64(16.0);
            uint64x2_t counter = vmovq_n_u64(0);
            uint64x2_t adder = vmovq_n_u64(1);

            NeonDoubleDouble a = xs;
            NeonDoubleDouble b = ys;
            float64x2_t resultA = a.x[0];
            float64x2_t resultB = b.x[0];

            for (int k = 0; k < info.maxIter; k++) {
                NeonDoubleDouble aa = a * a;
                NeonDoubleDouble bb = b * b;
                NeonDoubleDouble abab = a * b; abab = abab + abab;
                a = aa - bb + cx;
                b = abab + cy;
                // only the high parts are needed for the escape test
                uint64x2_t cmp = vcleq_f64(vaddq_f64(aa.x[0], bb.x[0]), threshold);
                if (info.smooth) {
                    // remember the latest a/b of lanes that are still iterating
                    resultA = vreinterpretq_f64_u64(vorrq_u64(vandq_u64(cmp, vreinterpretq_u64_f64(a.x[0])), vandq_u64(vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(cmp))), vreinterpretq_u64_f64(resultA))));
                    resultB = vreinterpretq_f64_u64(vorrq_u64(vandq_u64(cmp, vreinterpretq_u64_f64(b.x[0])), vandq_u64(vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(cmp))), vreinterpretq_u64_f64(resultB))));
                }
                adder = vandq_u64(adder, cmp);
                counter = vaddq_u64(counter, adder);
                // checking the break criterion is possibly expensive, so only do it every 8 iterations
                if ((k & 0x7) == 0) {
                    /* // ARM-v7 method
                    uint32x2_t allZero = vorr_u32(vget_low_u32(cmp), vget_high_u32(cmp));
                    if (vget_lane_u32(vpmax_u32(allZero, allZero), 0) == 0) {
                        break;
                    }
                    */
                    uint64_t allZero = vaddvq_u64(cmp);
                    if (allZero == 0) {
                        break;
                    }
                }
            }

            uint64_t resData[2];
            double resa[2];
            double resb[2];
            vst1q_u64(resData, counter);
            vst1q_f64(resa, resultA);
            vst1q_f64(resb, resultB);

            for (int k = 0; k < 2 && i + k < info.bWidth; k++) {
                if (info.smooth)
                    data[i + k + j * info.bWidth] = resData[k] == 0 ? info.maxIter :
                        resData[k] >= info.maxIter ? info.maxIter :
                        ((float) resData[k]) + 1 - ::logf(::logf(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::logf(2.0f);
                else
                    data[i + k + j * info.bWidth] = resData[k] > 0 ? float(resData[k]) : info.maxIter;
            }
        }
    }
}
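
// Note: the three generators trade speed for precision. The float path loses precision
// first, the double path at pixel spans somewhere around the double epsilon (~1e-16
// relative to the coordinates), and the double-double path extends usable zoom depth
// further since each value carries roughly 106 significand bits. Where the project
// switches between them is decided elsewhere, not in this file.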