quaddouble.cl 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124
  1. #pragma OPENCL EXTENSION cl_khr_fp64 : enable
  2. inline double2 twoSum(double a, double b) {
  3. double s = a + b;
  4. double bb = s - a;
  5. double e = (a - (s - bb)) + (b - bb);
  6. return (double2)(s, e);
  7. }
  8. inline double2 quickTwoSum(double a, double b) {
  9. double s = a + b;
  10. double e = b - (s - a);
  11. return (double2)(s, e);
  12. }
  13. inline double2 twoProd(double a, double b) {
  14. double p = a * b;
  15. double e = fma(a, b, -p);
  16. return (double2)(p, e);
  17. }
  18. inline void threeSum(double* a, double* b, double* c) {
  19. double2 t = twoSum(*a, *b);
  20. double2 at3 = twoSum(*c, t.s0);
  21. double2 bc = twoSum(t.s1, at3.s1);
  22. *a = at3.s0;
  23. *b = bc.s0;
  24. *c = bc.s1;
  25. }
  26. inline void threeSum2(double* a, double* b, double* c) {
  27. double2 t = twoSum(*a, *b);
  28. double2 at3 = twoSum(*c, t.s0);
  29. *a = at3.s0;
  30. *b = t.s1 + at3.s1;
  31. }
  32. inline double4 add(double4 a, double4 b) {
  33. double2 su0 = twoSum(a.s0, b.s0);
  34. double2 su1 = twoSum(a.s1, b.s1);
  35. double2 su2 = twoSum(a.s2, b.s2);
  36. double2 su3 = twoSum(a.s3, b.s3);
  37. double2 s1t0 = twoSum(su1.s0, su0.s1);
  38. threeSum(&su2.s0, &su0.s1, &su1.s1);
  39. threeSum2(&su3.s0, &su0.s1, &su2.s1);
  40. su0.s1 = su0.s1 + su1.s1 + su3.s1;
  41. renorm(su0.s0, su1.s0, su2.s0, su3.s0, su0.s1);
  42. return (double4)(su0.s0, su1.s0, su2.s0, su3.s0);
  43. }
  44. inline double2 mul(double2 a, double2 b) {
  45. double2 p = twoProd(a.s0, b.s0);
  46. p.s1 += (a.s0 * b.s1 + a.s1 * b.s0);
  47. return quickTwoSum(p.s0, p.s1);
  48. }
  49. inline double2 add(double2 a, double2 b) {
  50. double2 se = twoSum(a.s0, b.s0);
  51. se.s1 += a.s1 + b.s1;
  52. return quickTwoSum(se.s0, se.s1);
  53. }
  54. inline double2 mulDouble(double2 a, double b) {
  55. double2 p = twoProd(a.s0, b);
  56. p.s1 += a.s1 * b;
  57. return quickTwoSum(p.s0, p.s1);
  58. }
  59. __kernel void iterate(__global float* A, const int width,
  60. double x1, double x2, double y1, double y2,
  61. double pw1, double pw2, double ph1, double ph2, int max, int smooth) {
  62. int index = get_global_id(0);
  63. int px = index % width;
  64. int py = index / width;
  65. double2 xl = (double2)(x1, x2);
  66. double2 yt = (double2)(y1, y2);
  67. double2 pixelScaleX = (double2)(pw1, pw2);
  68. double2 pixelScaleY = (double2)(ph1, ph2);
  69. double2 a = add(mulDouble(pixelScaleX, (double) px), xl); // pixelScaleX * px + xl
  70. double2 b = add(mulDouble(pixelScaleY, (double) py), yt); // pixelScaleY * py + yt
  71. double2 ca = a;
  72. double2 cb = b;
  73. int n = 0;
  74. while (n < max - 1) {
  75. double2 aa = mul(a, a);
  76. double2 bb = mul(b, b);
  77. double2 ab = mul(a, b);
  78. if (aa.s0 + aa.s1 + bb.s0 + bb.s1 > 16) break;
  79. double2 minusbb = (double2)(-bb.s0, -bb.s1);
  80. a = add(add(aa, minusbb), ca);
  81. b = add(add(ab, ab), cb);
  82. n++;
  83. }
  84. // N + 1 - log (log |Z(N)|) / log 2
  85. if (n >= max - 1)
  86. A[index] = max;
  87. else {
  88. if (smooth != 0)
  89. A[index] = ((float) n) + 1 - log(log(a.s0 * a.s0 + b.s0 * b.s0) / 2) / log(2.0f);
  90. else
  91. A[index] = ((float)n);
  92. }
  93. // A[index] = ((float)n) + 1 - (a * a + b * b - 16) / (256 - 16);
  94. // A[get_global_id(0)] = 5;
  95. }