5 rokov pred · 3f68d029cc
--- a/libmandel/include/NaiveIRGenerator.h
+++ b/libmandel/include/NaiveIRGenerator.h
@@ -3,6 +3,7 @@
 
				 
			
 
				 #include "IterationIR.h"
			
 
				 #include "Generators.h"
			
 
				+#include <memory>
			
 
				 #include <variant>
			
 
				 
			
 
				 namespace mnd
			
@@ -42,6 +43,42 @@ namespace mnd
 
				             Ln
			
 
				         >;
			
 
				 
			
 
				+        // Leaf node holding only a slot `index` (presumably a read of that slot -- TODO confirm against the evaluator).
			
 
				+        struct Load { size_t index; };
			
 
				+        struct Store  // Node pairing a slot `index` with one owned child expression `v`.
			
 
				+        {
			
 
				+            size_t index;  // target slot (presumably where the value of `v` is written -- TODO confirm)
			
 
				+            std::unique_ptr<EvalNode> v;  // exclusively owned sub-expression
			
 
				+        };
			
 
				+        // Common base for nodes with two operands; each operand is an exclusively owned child EvalNode.
			
 
				+        struct BinaryOperation
			
 
				+        {
			
 
				+            std::unique_ptr<EvalNode> a;  // first operand
			
 
				+            std::unique_ptr<EvalNode> b;  // second operand
			
 
				+        };
			
 
				+        // Common base for nodes with a single operand, held as an exclusively owned child EvalNode.
			
 
				+        struct UnaryOperation
			
 
				+        {
			
 
				+            std::unique_ptr<EvalNode> a;  // the only operand
			
 
				+        };
			
 
				+        // Two-operand arithmetic nodes: each inherits `a` and `b` from BinaryOperation and adds no members, so they differ by type alone.
			
 
				+        struct Add : BinaryOperation {};
			
 
				+        struct Sub : BinaryOperation {};
			
 
				+        struct Mul : BinaryOperation {};
			
 
				+        struct Div : BinaryOperation {};
			
 
				+        // Single-operand node: inherits `a` from UnaryOperation and adds no members.
			
 
				+        struct Neg : UnaryOperation {};
			
 
				+    
			
 
				+        // Function-style nodes without extra members: Atan2 and Pow carry two operands, Cos/Sin/Exp/Ln carry one.
			
 
				+        struct Atan2 : BinaryOperation {};
			
 
				+        struct Pow : BinaryOperation {};
			
 
				+        struct Cos : UnaryOperation {};
			
 
				+        struct Sin : UnaryOperation {};
			
 
				+        struct Exp : UnaryOperation {};
			
 
				+        struct Ln : UnaryOperation {};
			
 
				+
			
 
				+
			
 
				+
			
 
				         template<typename T>
			
 
				         struct EvalStruct
			
 
				         {
			
--- a/libmandel/src/CpuGeneratorsAVX512.cpp
+++ b/libmandel/src/CpuGeneratorsAVX512.cpp
@@ -29,6 +29,11 @@ void CpuGenerator<float, mnd::X86_AVX_512, parallel>::generate(const mnd::Mandel
 
				     __m512 enumerate = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
			
 
				     __m512 two = _mm512_set1_ps(2);
			
 
				 
			
 
				+    T jX = mnd::convert<T>(info.juliaX);
			
 
				+    T jY = mnd::convert<T>(info.juliaY);
			
 
				+    __m512 juliaX = _mm512_set1_ps(jX);
			
 
				+    __m512 juliaY = _mm512_set1_ps(jY);
			
 
				+
			
 
				 #if defined(_OPENMP)
			
 
				     if constexpr(parallel)
			
 
				         omp_set_num_threads(omp_get_num_procs());
			
@@ -60,6 +65,15 @@ void CpuGenerator<float, mnd::X86_AVX_512, parallel>::generate(const mnd::Mandel
 
				 
			
 
				             __m512 threshold = _mm512_set1_ps(16);
			
 
				 
			
 
				+            __m512 cx0 = xs0;
			
 
				+            __m512 cx1 = xs1;
			
 
				+            __m512 cy = ys;
			
 
				+	    if (info.julia) {
			
 
				+		cx0 = juliaX;
			
 
				+		cx1 = juliaX;
			
 
				+		cy = juliaY;
			
 
				+	    }
			
 
				+
			
 
				             __m512 a0 = xs0;
			
 
				             __m512 a1 = xs1;
			
 
				             //__m512 a2 = xs2;
			
@@ -78,11 +92,11 @@ void CpuGenerator<float, mnd::X86_AVX_512, parallel>::generate(const mnd::Mandel
 
				                     __mmask16 cmp0 = _mm512_cmp_ps_mask(_mm512_fmadd_ps(b0, b0, aa0), threshold, _CMP_LE_OQ);
			
 
				                     __mmask16 cmp1 = _mm512_cmp_ps_mask(_mm512_fmadd_ps(b1, b1, aa1), threshold, _CMP_LE_OQ);
			
 
				                     //__mmask16 cmp2 = _mm512_cmp_ps_mask(_mm512_fmadd_ps(b2, b2, aa2), threshold, _CMP_LE_OQ);
			
 
				-                    a0 = _mm512_sub_ps(aa0, _mm512_fmsub_ps(b0, b0, xs0));
			
 
				-                    a1 = _mm512_sub_ps(aa1, _mm512_fmsub_ps(b1, b1, xs1));
			
 
				+                    a0 = _mm512_sub_ps(aa0, _mm512_fmsub_ps(b0, b0, cx0));
			
 
				+                    a1 = _mm512_sub_ps(aa1, _mm512_fmsub_ps(b1, b1, cx1));
			
 
				                     //a2 = _mm512_sub_ps(aa2, _mm512_fmsub_ps(b2, b2, xs2));
			
 
				-                    b0 = _mm512_fmadd_ps(two, abab0, ys);
			
 
				-                    b1 = _mm512_fmadd_ps(two, abab1, ys);
			
 
				+                    b0 = _mm512_fmadd_ps(two, abab0, cy);
			
 
				+                    b1 = _mm512_fmadd_ps(two, abab1, cy);
			
 
				                     //b2 = _mm512_fmadd_ps(two, abab2, ys);
			
 
				                     counter0 = _mm512_mask_add_ps(counter0, cmp0, counter0, adder0);
			
 
				                     counter1 = _mm512_mask_add_ps(counter1, cmp1, counter1, adder1);
			
@@ -109,11 +123,11 @@ void CpuGenerator<float, mnd::X86_AVX_512, parallel>::generate(const mnd::Mandel
 
				                     __mmask16 cmp0 = _mm512_cmp_ps_mask(_mm512_fmadd_ps(b0, b0, aa0), threshold, _CMP_LE_OQ);
			
 
				                     __mmask16 cmp1 = _mm512_cmp_ps_mask(_mm512_fmadd_ps(b1, b1, aa1), threshold, _CMP_LE_OQ);
			
 
				                     //__mmask16 cmp2 = _mm512_cmp_ps_mask(_mm512_fmadd_ps(b2, b2, aa2), threshold, _CMP_LE_OQ);
			
 
				-                    a0 = _mm512_sub_ps(aa0, _mm512_fmsub_ps(b0, b0, xs0));
			
 
				-                    a1 = _mm512_sub_ps(aa1, _mm512_fmsub_ps(b1, b1, xs1));
			
 
				+                    a0 = _mm512_sub_ps(aa0, _mm512_fmsub_ps(b0, b0, cx0));
			
 
				+                    a1 = _mm512_sub_ps(aa1, _mm512_fmsub_ps(b1, b1, cx1));
			
 
				                     //a2 = _mm512_sub_ps(aa2, _mm512_fmsub_ps(b2, b2, xs2));
			
 
				-                    b0 = _mm512_fmadd_ps(two, abab0, ys);
			
 
				-                    b1 = _mm512_fmadd_ps(two, abab1, ys);
			
 
				+                    b0 = _mm512_fmadd_ps(two, abab0, cy);
			
 
				+                    b1 = _mm512_fmadd_ps(two, abab1, cy);
			
 
				                     //b2 = _mm512_fmadd_ps(two, abab2, ys);
			
 
				                     counter0 = _mm512_mask_add_ps(counter0, cmp0, counter0, adder0);
			
 
				                     counter1 = _mm512_mask_add_ps(counter1, cmp1, counter1, adder1);
			
@@ -173,6 +187,11 @@ void CpuGenerator<double, mnd::X86_AVX_512, parallel>::generate(const mnd::Mande
 
				     __m512d viewx = { viewxf, viewxf, viewxf, viewxf, viewxf, viewxf, viewxf, viewxf };
			
 
				     __m512d dpp = { dppf, dppf, dppf, dppf, dppf, dppf, dppf, dppf };
			
 
				 
			
 
				+    T jX = mnd::convert<T>(info.juliaX);
			
 
				+    T jY = mnd::convert<T>(info.juliaY);
			
 
				+    __m512d juliaX = _mm512_set1_pd(jX);
			
 
				+    __m512d juliaY = _mm512_set1_pd(jY);
			
 
				+
			
 
				 #if defined(_OPENMP)
			
 
				     if constexpr(parallel)
			
 
				         omp_set_num_threads(omp_get_num_procs());
			
@@ -193,6 +212,12 @@ void CpuGenerator<double, mnd::X86_AVX_512, parallel>::generate(const mnd::Mande
 
				 
			
 
				             __m512d threshold = { 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f, 16.0f };
			
 
				 
			
 
				+            __m512d cx = xs;
			
 
				+            __m512d cy = ys;
			
 
				+	    if (info.julia) {
			
 
				+		cx = juliaX;
			
 
				+		cy = juliaY;
			
 
				+	    }
			
 
				             __m512d a = xs;
			
 
				             __m512d b = ys;
			
 
				 
			
@@ -201,8 +226,8 @@ void CpuGenerator<double, mnd::X86_AVX_512, parallel>::generate(const mnd::Mande
 
				                     __m512d aa = _mm512_mul_pd(a, a);
			
 
				                     __m512d ab = _mm512_mul_pd(a, b);
			
 
				                     __mmask8 cmp = _mm512_cmp_pd_mask(_mm512_fmadd_pd(b, b, aa), threshold, _CMP_LE_OQ);
			
 
				-                    a = _mm512_sub_pd(aa, _mm512_fmsub_pd(b, b, xs));
			
 
				-                    b = _mm512_fmadd_pd(two, ab, ys);
			
 
				+                    a = _mm512_sub_pd(aa, _mm512_fmsub_pd(b, b, cx));
			
 
				+                    b = _mm512_fmadd_pd(two, ab, cy);
			
 
				                     resultsa = _mm512_mask_blend_pd(cmp, resultsa, a);
			
 
				                     resultsb = _mm512_mask_blend_pd(cmp, resultsb, b);
			
 
				                     counter = _mm512_mask_add_pd(counter, cmp, counter, adder);
			
@@ -216,8 +241,8 @@ void CpuGenerator<double, mnd::X86_AVX_512, parallel>::generate(const mnd::Mande
 
				                     __m512d aa = _mm512_mul_pd(a, a);
			
 
				                     __m512d ab = _mm512_mul_pd(a, b);
			
 
				                     __mmask8 cmp = _mm512_cmp_pd_mask(_mm512_fmadd_pd(b, b, aa), threshold, _CMP_LE_OQ);
			
 
				-                    a = _mm512_sub_pd(aa, _mm512_fmsub_pd(b, b, xs));
			
 
				-                    b = _mm512_fmadd_pd(two, ab, ys);
			
 
				+                    a = _mm512_sub_pd(aa, _mm512_fmsub_pd(b, b, cx));
			
 
				+                    b = _mm512_fmadd_pd(two, ab, cy);
			
 
				                     counter = _mm512_mask_add_pd(counter, cmp, counter, adder);
			
 
				                     if (cmp == 0) {
			
 
				                         break;
			
--- a/libmandel/src/NaiveIRGenerator.cpp
+++ b/libmandel/src/NaiveIRGenerator.cpp
@@ -16,41 +16,6 @@ namespace mnd
 
				 namespace mnd::eval
			
 
				 {
			
 
				 
			
 
				-
			
 
				-    struct Load { size_t index; };
			
 
				-    struct Store
			
 
				-    {
			
 
				-        size_t index;
			
 
				-        std::unique_ptr<EvalNode> v;
			
 
				-    };
			
 
				-
			
 
				-    struct BinaryOperation
			
 
				-    {
			
 
				-        std::unique_ptr<EvalNode> a;
			
 
				-        std::unique_ptr<EvalNode> b;
			
 
				-    };
			
 
				-
			
 
				-    struct UnaryOperation
			
 
				-    {
			
 
				-        std::unique_ptr<EvalNode> a;
			
 
				-    };
			
 
				-
			
 
				-    struct Add : BinaryOperation {};
			
 
				-    struct Sub : BinaryOperation {};
			
 
				-    struct Mul : BinaryOperation {};
			
 
				-    struct Div : BinaryOperation {};
			
 
				-
			
 
				-    struct Neg : UnaryOperation {};
			
 
				-
			
 
				-
			
 
				-    struct Atan2 : BinaryOperation {};
			
 
				-    struct Pow : BinaryOperation {};
			
 
				-    struct Cos : UnaryOperation {};
			
 
				-    struct Sin : UnaryOperation {};
			
 
				-    struct Exp : UnaryOperation {};
			
 
				-    struct Ln : UnaryOperation {};
			
 
				-
			
 
				-
			
 
				     using namespace mnd;
			
 
				     using namespace mnd::ir;
			
 
				     template<typename T>