// CpuGeneratorsNeon.cpp

#include "CpuGenerators.h"

#include <omp.h>
#include <arm_neon.h>
#include <cmath>   // for ::logf
#include <memory>

using mnd::CpuGenerator;

namespace mnd
{
    template class CpuGenerator<float, mnd::ARM_NEON, false>;
    template class CpuGenerator<float, mnd::ARM_NEON, true>;
    template class CpuGenerator<double, mnd::ARM_NEON, false>;
    template class CpuGenerator<double, mnd::ARM_NEON, true>;
}
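
// Generates the fractal into a float buffer, iterating four pixels at a
// time in one 128-bit NEON register. When instantiated with
// parallel = true, rows are distributed over all cores via OpenMP.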
template<bool parallel>
void CpuGenerator<float, mnd::ARM_NEON, parallel>::generate(const mnd::MandelInfo& info, float* data)
{
    using T = float;
    const MandelViewport& view = info.view;
    if constexpr (parallel)
        omp_set_num_threads(omp_get_num_procs());
#pragma omp parallel for schedule(static, 1) if (parallel)
    for (long j = 0; j < info.bHeight; j++) {
        T y = T(view.y) + T(j) * T(view.height / info.bHeight);
        long i = 0;
        for (; i < info.bWidth; i += 4) {
            float xsvals[] = {
                float(view.x + float(i) * view.width / info.bWidth),
                float(view.x + float(i + 1) * view.width / info.bWidth),
                float(view.x + float(i + 2) * view.width / info.bWidth),
                float(view.x + float(i + 3) * view.width / info.bWidth)
            };
            float32x4_t xs = vld1q_f32(xsvals);
            uint32x4_t counter = vmovq_n_u32(0);      // per-lane iteration count
            uint32x4_t adder = vmovq_n_u32(1);        // 1 while a lane still iterates, 0 after escape
            float32x4_t threshold = vmovq_n_f32(16);  // bailout on |z|^2 > 16
            float32x4_t ys = vmovq_n_f32(y);
            float32x4_t a = xs;                       // real/imaginary parts of z, initialized to c
            float32x4_t b = ys;
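
            // Iterate z <- z^2 + c on all four lanes at once; cmp holds an
            // all-ones mask in every lane that has not yet escaped.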
            for (int k = 0; k < info.maxIter; k++) {
                float32x4_t aa = vmulq_f32(a, a);
                float32x4_t bb = vmulq_f32(b, b);
                float32x4_t abab = vmulq_f32(a, b); abab = vaddq_f32(abab, abab);
                uint32x4_t cmp = vcleq_f32(vaddq_f32(aa, bb), threshold);
                if (info.smooth) {
                    // Smooth coloring needs the last z of each pixel, so escaped
                    // lanes must stop updating: blend the new value into the
                    // still-iterating lanes and keep the old value elsewhere.
                    float32x4_t tempa = vaddq_f32(vsubq_f32(aa, bb), xs);
                    float32x4_t tempb = vaddq_f32(abab, ys);
                    a = vreinterpretq_f32_u32(vorrq_u32(vandq_u32(cmp, vreinterpretq_u32_f32(tempa)), vandq_u32(vmvnq_u32(cmp), vreinterpretq_u32_f32(a))));
                    b = vreinterpretq_f32_u32(vorrq_u32(vandq_u32(cmp, vreinterpretq_u32_f32(tempb)), vandq_u32(vmvnq_u32(cmp), vreinterpretq_u32_f32(b))));
                }
                else {
                    a = vaddq_f32(vsubq_f32(aa, bb), xs);
                    b = vaddq_f32(abab, ys);
                }
                // stop counting escaped lanes: adder drops to 0 where cmp is 0
                adder = vandq_u32(adder, cmp);
                counter = vaddq_u32(counter, adder);
                // checking the break criterion is possibly expensive, so only do it every 8 iterations
                if ((k & 0x7) == 0) {
                    /* // ARM-v7 method
                    uint32x2_t allZero = vorr_u32(vget_low_u32(cmp), vget_high_u32(cmp));
                    if (vget_lane_u32(vpmax_u32(allZero, allZero), 0) == 0) {
                        break;
                    }
                    */
                    // AArch64: the horizontal add is 0 only if every lane's mask is 0
                    uint32_t allZero = vaddvq_u32(cmp);
                    if (allZero == 0) {
                        break;
                    }
                }
            }
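
            // Write the lane results back through scalar stores; the last,
            // possibly partial vector of a row is clipped by the i + k bound.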
            uint32_t resData[4];
            float resa[4];
            float resb[4];
            vst1q_u32(resData, counter);
            vst1q_f32(resa, a);
            vst1q_f32(resb, b);
            for (int k = 0; k < 4 && i + k < info.bWidth; k++) {
                if (info.smooth)
                    // continuous iteration count: n + 1 - log2(log|z_n|)
                    data[i + k + j * info.bWidth] = resData[k] == 0 ? info.maxIter :
                        resData[k] >= info.maxIter ? info.maxIter :
                        float(resData[k]) + 1 - ::logf(::logf(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::logf(2.0f);
                else
                    data[i + k + j * info.bWidth] = resData[k] > 0 ? float(resData[k]) : info.maxIter;
            }
        }
    }
}
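
// Same algorithm in double precision: two pixels per 128-bit vector. The
// result buffer stays float; only the iteration itself runs in double.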
template<bool parallel>
void CpuGenerator<double, mnd::ARM_NEON, parallel>::generate(const mnd::MandelInfo& info, float* data)
{
    using T = double;
    const MandelViewport& view = info.view;
    if constexpr (parallel)
        omp_set_num_threads(omp_get_num_procs());
#pragma omp parallel for schedule(static, 1) if (parallel)
    for (long j = 0; j < info.bHeight; j++) {
        T y = T(view.y) + T(j) * T(view.height / info.bHeight);
        long i = 0;
        for (; i < info.bWidth; i += 2) {
            double xsvals[] = {
                double(view.x + double(i) * view.width / info.bWidth),
                double(view.x + double(i + 1) * view.width / info.bWidth),
            };
            float64x2_t xs = vld1q_f64(xsvals);
            uint64x2_t counter = vmovq_n_u64(0);      // per-lane iteration count
            uint64x2_t adder = vmovq_n_u64(1);        // 1 while a lane still iterates, 0 after escape
            float64x2_t threshold = vmovq_n_f64(16);  // bailout on |z|^2 > 16
            float64x2_t ys = vmovq_n_f64(y);
            float64x2_t a = xs;                       // real/imaginary parts of z, initialized to c
            float64x2_t b = ys;
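
            // Same iteration as the float kernel. NEON has no 64-bit vmvn, so
            // the mask negation takes a round trip through a u32 reinterpret.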
            for (int k = 0; k < info.maxIter; k++) {
                float64x2_t aa = vmulq_f64(a, a);
                float64x2_t bb = vmulq_f64(b, b);
                float64x2_t abab = vmulq_f64(a, b); abab = vaddq_f64(abab, abab);
                uint64x2_t cmp = vcleq_f64(vaddq_f64(aa, bb), threshold);
                if (info.smooth) {
                    float64x2_t tempa = vaddq_f64(vsubq_f64(aa, bb), xs);
                    float64x2_t tempb = vaddq_f64(abab, ys);
                    a = vreinterpretq_f64_u64(vorrq_u64(vandq_u64(cmp, vreinterpretq_u64_f64(tempa)), vandq_u64(vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(cmp))), vreinterpretq_u64_f64(a))));
                    b = vreinterpretq_f64_u64(vorrq_u64(vandq_u64(cmp, vreinterpretq_u64_f64(tempb)), vandq_u64(vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(cmp))), vreinterpretq_u64_f64(b))));
                }
                else {
                    a = vaddq_f64(vsubq_f64(aa, bb), xs);
                    b = vaddq_f64(abab, ys);
                }
                // stop counting escaped lanes: adder drops to 0 where cmp is 0
                adder = vandq_u64(adder, cmp);
                counter = vaddq_u64(counter, adder);
                // checking the break criterion is possibly expensive, so only do it every 8 iterations
                if ((k & 0x7) == 0) {
                    // the horizontal add is 0 only if every lane's mask is 0
                    uint64_t allZero = vaddvq_u64(cmp);
                    if (allZero == 0) {
                        break;
                    }
                }
            }
            uint64_t resData[2];
            double resa[2];
            double resb[2];
            vst1q_u64(resData, counter);
            vst1q_f64(resa, a);
            vst1q_f64(resb, b);
            for (int k = 0; k < 2 && i + k < info.bWidth; k++) {
                if (info.smooth)
                    // continuous iteration count: n + 1 - log2(log|z_n|)
                    data[i + k + j * info.bWidth] = resData[k] == 0 ? info.maxIter :
                        resData[k] >= info.maxIter ? info.maxIter :
                        float(resData[k]) + 1 - ::logf(::logf(resa[k] * resa[k] + resb[k] * resb[k]) / 2) / ::logf(2.0f);
                else
                    data[i + k + j * info.bWidth] = resData[k] > 0 ? float(resData[k]) : info.maxIter;
            }
        }
    }
}
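
/*
 * Minimal usage sketch (hypothetical caller, not part of this file; it
 * assumes MandelInfo/MandelViewport expose exactly the fields read above
 * and that CpuGenerator is default-constructible):
 *
 *     mnd::MandelInfo info;
 *     info.view.x = -2.25; info.view.y = -1.5;
 *     info.view.width = 3.0; info.view.height = 3.0;
 *     info.bWidth = 1024;
 *     info.bHeight = 1024;
 *     info.maxIter = 500;
 *     info.smooth = true;
 *
 *     std::vector<float> buf(info.bWidth * info.bHeight);
 *     mnd::CpuGenerator<float, mnd::ARM_NEON, true> gen;
 *     gen.generate(info, buf.data());
 */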