C# Vector<double>.CopyTo 几乎不比非 SIMD 版本快?
更新:之前提到的 Span 问题已在 .NET Core 2.1 版本(目前处于预览阶段)中修复。这些修复实际上使得 Span Vector 版本*比数组 Vector 版本更快*……
注意:测试是在“Intel Xeon E5-1660 v4”上进行的,CPU-Z 显示它支持“MMX,SSE,SSE2,SSE3,SSSE3,SSE4.1,SSE4.2,EM64T,VT-x,AES,AVX,AVX2,FMA3,RSX”指令集,所以硬件方面应该没问题……

在回答了一个与 Vector 相关的问题之后,我想尝试实现几个 BLAS 函数。我发现像点积(dot product)这样只做读取/求和的函数表现相当好,但需要把结果写回数组的函数表现很差——比非 SIMD 版本快,但快得非常有限。

是我哪里做错了,还是 JIT 还需要做更多的工作?

示例(假设 x.Length == y.Length、参数不为 null 等等):

public static void daxpy(double alpha,double[] x,double[] y)
{
    for (var i = 0; i < x.Length; ++i)
        y[i] = y[i] + x[i] * alpha;
}

向量化形式则变为:

public static void daxpy(double alpha,double[] x,double[] y)
{
    var i = 0;
    if (Vector.IsHardwareAccelerated)
    {
        var length = x.Length + 1 - Vector<double>.Count;
        for (; i < length; i += Vector<double>.Count)
        {
            var valpha = new Vector<double>(alpha);
            var vx = new Vector<double>(x,i);
            var vy = new Vector<double>(y,i);
            (vy + vx * valpha).CopyTo(y,i);
        }
    }
    for (; i < x.Length; ++i)
        y[i] = y[i] + x[i] * alpha;
}

另外,既然在试用 .NET Core 2.0,我想也试试 Span,同样分为朴素形式和向量化形式:

public static void daxpy(double alpha,Span<double> x,Span<double> y)
{
    for (var i = 0; i < x.Length; ++i)
        y[i] += x[i] * alpha;
}

以及向量化形式:

public static void daxpy(double alpha,Span<double> x,Span<double> y)
{
    if (Vector.IsHardwareAccelerated)
    {
        var vx = x.NonPortableCast<double,Vector<double>>();
        var vy = y.NonPortableCast<double,Vector<double>>();
        var valpha = new Vector<double>(alpha);
        for (var i = 0; i < vx.Length; ++i)
            vy[i] += vx[i] * valpha;
        x = x.Slice(Vector<double>.Count * vx.Length);
        y = y.Slice(Vector<double>.Count * vy.Length);
    }
    for (var i = 0; i < x.Length; ++i)
        y[i] += x[i] * alpha;
}

这些实现的相对耗时如下:

Naive 1.0
Vector 0.8
Span Naive 2.5 ==> 更新: Span Naive 1.1
Span Vector 0.9 ==> 更新: Span Vector 0.6

是我哪里做错了吗?我几乎想不出比这更简单的例子了,所以我觉得应该不是?

解决方法
你可能需要在 .NET Core 2.1 或更高版本上进行测试;
在我的笔记本电脑上(SIMD与我的桌面相比较差),我得到:

daxpy_naive x10000: 144ms
daxpy_arr_vector x10000: 77ms
daxpy_span x10000: 173ms
daxpy_vector x10000: 67ms
daxpy_vector_no_slice x10000: 67ms

使用代码:

using System;
using System.Diagnostics;
using System.Numerics;

/// <summary>
/// Micro-benchmark comparing five implementations of DAXPY (y[i] += alpha * x[i]):
/// scalar and Vector&lt;T&gt; over arrays, and scalar and Vector&lt;T&gt; over Span&lt;T&gt;.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
        double alpha = 0.5;
        double[] x = new double[16 * 1024], y = new double[x.Length];

        // Fixed seed so every run operates on the same input data.
        var rand = new Random(12345);
        for (int i = 0; i < x.Length; i++)
        {
            x[i] = rand.NextDouble();
        }

        // Single unlogged pass first (presumably a JIT warm-up), then the measured run.
        RunAll(alpha, x, y, 1, false);
        RunAll(alpha, x, y, 10000, true);
    }

    /// <summary>
    /// Runs every DAXPY variant <paramref name="loop"/> times and, when
    /// <paramref name="log"/> is true, prints the elapsed milliseconds per variant.
    /// </summary>
    /// <param name="alpha">Scalar multiplier applied to <paramref name="x"/>.</param>
    /// <param name="x">Input vector (read only).</param>
    /// <param name="y">Accumulator vector, updated in place by every variant.</param>
    /// <param name="loop">Number of repetitions per variant.</param>
    /// <param name="log">Whether to write the timing line to the console.</param>
    private static void RunAll(double alpha, double[] x, double[] y, int loop, bool log)
    {
        GC.Collect(GC.MaxGeneration);
        GC.WaitForPendingFinalizers();

        Measure(nameof(daxpy_naive), loop, log, () => daxpy_naive(alpha, x, y));
        Measure(nameof(daxpy_arr_vector), loop, log, () => daxpy_arr_vector(alpha, x, y));
        Measure(nameof(daxpy_span), loop, log, () => daxpy_span(alpha, x, y));
        Measure(nameof(daxpy_vector), loop, log, () => daxpy_vector(alpha, x, y));
        Measure(nameof(daxpy_vector_no_slice), loop, log, () => daxpy_vector_no_slice(alpha, x, y));
    }

    /// <summary>
    /// Times <paramref name="loop"/> invocations of <paramref name="body"/> and
    /// optionally reports the result as "<c>name xLOOP: Nms</c>".
    /// </summary>
    private static void Measure(string name, int loop, bool log, Action body)
    {
        var watch = Stopwatch.StartNew();
        for (int i = 0; i < loop; i++)
        {
            body();
        }
        watch.Stop();

        if (log)
        {
            Console.WriteLine($"{name} x{loop}: {watch.ElapsedMilliseconds}ms");
        }
    }

    /// <summary>Scalar DAXPY over arrays: y[i] = y[i] + x[i] * alpha.</summary>
    public static void daxpy_naive(double alpha, double[] x, double[] y)
    {
        for (var i = 0; i < x.Length; ++i)
        {
            y[i] = y[i] + x[i] * alpha;
        }
    }

    /// <summary>
    /// DAXPY over arrays using Vector&lt;double&gt; loads and CopyTo stores,
    /// with a scalar loop for the elements that do not fill a whole vector.
    /// </summary>
    public static void daxpy_arr_vector(double alpha, double[] x, double[] y)
    {
        var i = 0;
        if (Vector.IsHardwareAccelerated)
        {
            // Last start index at which a full vector still fits is length - 1.
            var length = x.Length + 1 - Vector<double>.Count;
            for (; i < length; i += Vector<double>.Count)
            {
                // Kept inside the loop on purpose: this is the code the quoted timings were measured with.
                var valpha = new Vector<double>(alpha);
                var vx = new Vector<double>(x, i);
                var vy = new Vector<double>(y, i);
                (vy + vx * valpha).CopyTo(y, i);
            }
        }

        // Scalar tail (or the whole array when SIMD is not accelerated).
        for (; i < x.Length; ++i)
        {
            y[i] = y[i] + x[i] * alpha;
        }
    }

    /// <summary>Scalar DAXPY over spans: y[i] += x[i] * alpha.</summary>
    public static void daxpy_span(double alpha, Span<double> x, Span<double> y)
    {
        for (var i = 0; i < x.Length; ++i)
        {
            y[i] += x[i] * alpha;
        }
    }

    /// <summary>
    /// DAXPY over spans: reinterprets both spans as spans of Vector&lt;double&gt;,
    /// processes whole vectors, then slices off the processed prefix and finishes
    /// the remainder with a scalar loop.
    /// </summary>
    public static void daxpy_vector(double alpha, Span<double> x, Span<double> y)
    {
        if (Vector.IsHardwareAccelerated)
        {
            // NOTE(review): NonPortableCast is the preview-era API used by the original answer;
            // released builds appear to expose this as MemoryMarshal.Cast - confirm for your target framework.
            var vx = x.NonPortableCast<double, Vector<double>>();
            var vy = y.NonPortableCast<double, Vector<double>>();
            var valpha = new Vector<double>(alpha);
            for (var i = 0; i < vx.Length; ++i)
            {
                vy[i] += vx[i] * valpha;
            }

            // Drop the elements already handled by the vector loop.
            x = x.Slice(Vector<double>.Count * vx.Length);
            y = y.Slice(Vector<double>.Count * vy.Length);
        }

        for (var i = 0; i < x.Length; ++i)
        {
            y[i] += x[i] * alpha;
        }
    }

    /// <summary>
    /// Same as <see cref="daxpy_vector"/> but continues the scalar tail from an
    /// index instead of slicing the spans.
    /// </summary>
    public static void daxpy_vector_no_slice(double alpha, Span<double> x, Span<double> y)
    {
        int i = 0;
        if (Vector.IsHardwareAccelerated)
        {
            var vx = x.NonPortableCast<double, Vector<double>>();
            // The original listing used vy without declaring it; it is the vector view of y.
            var vy = y.NonPortableCast<double, Vector<double>>();
            var valpha = new Vector<double>(alpha);
            for (i = 0; i < vx.Length; ++i)
            {
                vy[i] += vx[i] * valpha;
            }

            // Convert the vector count back to an element index for the scalar tail.
            i = Vector<double>.Count * vx.Length;
        }

        for (; i < x.Length; ++i)
        {
            y[i] += x[i] * alpha;
        }
    }
}

这是使用dotnet build -c Release和dotnet run -c Release,dotnet –version报告“2.2.0-preview1-008000”(不久之前的“每日”).

在我的桌面上,我希望差异会更好.

(编辑:李大同) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容! |