注意:测试是在“Intel Xeon E5-1660 v4”上进行的,CPU-Z显示它支持“MMX,SSE,SSE2,SSE3,SSSE3,SSE4.1,SSE4.2,EM64T,VT-x,AES,AVX,AVX2,FMA3,RSX”,所以硬件方面应该没问题……
在回答了一个关于Vector的问题之后,我想尝试实现一些BLAS函数。我发现像点积(dot product)这样只做读取/求和的函数效果很好,但需要把结果写回数组的函数效果很差——比非SIMD版本快,但快得非常有限。
是我做错了什么,还是JIT还需要做更多的优化工作?
示例(假设x.Length == y.Length,且两者都不为null,等等):
/// <summary>
/// Scalar DAXPY: updates <paramref name="y"/> in place so that
/// <c>y[i] = y[i] + x[i] * Alpha</c> for every index of <paramref name="x"/>.
/// </summary>
/// <param name="Alpha">Scalar multiplier applied to each element of <paramref name="x"/>.</param>
/// <param name="x">Input array; its length determines how many elements are processed.</param>
/// <param name="y">Array updated in place; it is indexed up to <c>x.Length - 1</c>, so it must be at least as long as <paramref name="x"/>.</param>
public static void daxpy(double Alpha, double[] x, double[] y)
{
    for (var i = 0; i < x.Length; ++i)
    {
        y[i] = y[i] + x[i] * Alpha;
    }
}
改写成向量形式后变为:
/// <summary>
/// DAXPY over arrays using <see cref="Vector{T}"/>: updates <paramref name="y"/> in place so that
/// <c>y[i] = y[i] + x[i] * Alpha</c> for every index of <paramref name="x"/>.
/// </summary>
/// <param name="Alpha">Scalar multiplier applied to each element of <paramref name="x"/>.</param>
/// <param name="x">Input array; its length determines how many elements are processed.</param>
/// <param name="y">Array updated in place; must be at least as long as <paramref name="x"/>.</param>
public static void daxpy(double Alpha, double[] x, double[] y)
{
    var i = 0;
    if (Vector.IsHardwareAccelerated)
    {
        // The broadcast multiplier does not change between iterations, so build it once.
        var vAlpha = new Vector<double>(Alpha);

        // Last start index at which a full vector still fits inside x.
        var lastFullBlock = x.Length - Vector<double>.Count;
        for (; i <= lastFullBlock; i += Vector<double>.Count)
        {
            var vx = new Vector<double>(x, i);
            var vy = new Vector<double>(y, i);
            (vy + vx * vAlpha).CopyTo(y, i);
        }
    }

    // Scalar loop: handles the tail, or the whole array when the vector path is skipped.
    for (; i < x.Length; ++i)
    {
        y[i] = y[i] + x[i] * Alpha;
    }
}
另外,在.NET Core 2.0中做试验时,我想也试试Span,包括朴素形式和向量形式:
/// <summary>
/// Scalar DAXPY over spans: updates <paramref name="y"/> in place so that
/// <c>y[i] += x[i] * Alpha</c> for every index of <paramref name="x"/>.
/// </summary>
/// <param name="Alpha">Scalar multiplier applied to each element of <paramref name="x"/>.</param>
/// <param name="x">Input span; its length determines how many elements are processed.</param>
/// <param name="y">Span updated in place; must be at least as long as <paramref name="x"/>.</param>
public static void daxpy(double Alpha, Span<double> x, Span<double> y)
{
    for (var i = 0; i < x.Length; ++i)
    {
        y[i] += x[i] * Alpha;
    }
}
以及向量形式:
/// <summary>
/// DAXPY over spans using <see cref="Vector{T}"/>: updates <paramref name="y"/> in place so that
/// <c>y[i] += x[i] * Alpha</c> for every index of <paramref name="x"/>.
/// </summary>
/// <param name="Alpha">Scalar multiplier applied to each element of <paramref name="x"/>.</param>
/// <param name="x">Input span; its length determines how many elements are processed.</param>
/// <param name="y">Span updated in place; must be at least as long as <paramref name="x"/>.</param>
public static void daxpy(double Alpha, Span<double> x, Span<double> y)
{
    if (Vector.IsHardwareAccelerated)
    {
        // Reinterpret both spans as spans of whole vectors; elements that do not
        // fill a complete vector are left out of the cast and handled below.
        // MemoryMarshal.Cast replaces the preview-only NonPortableCast extension.
        var vx = System.Runtime.InteropServices.MemoryMarshal.Cast<double, Vector<double>>(x);
        var vy = System.Runtime.InteropServices.MemoryMarshal.Cast<double, Vector<double>>(y);

        var vAlpha = new Vector<double>(Alpha);
        for (var i = 0; i < vx.Length; ++i)
        {
            vy[i] += vx[i] * vAlpha;
        }

        // Skip the same number of elements in both spans so the scalar tail stays
        // aligned even when y is longer than x.
        var processed = Vector<double>.Count * vx.Length;
        x = x.Slice(processed);
        y = y.Slice(processed);
    }

    for (var i = 0; i < x.Length; ++i)
    {
        y[i] += x[i] * Alpha;
    }
}
所以这些的相对时间是:
Naive       1.0
Vector      0.8
Span Naive  2.5 ==> Update: Span Naive 1.1
Span Vector 0.9 ==> Update: Span Vector 0.6
我做错了什么?我几乎无法想到一个更简单的例子,所以我不这么认为?
解决方法:你可能需要改用.NET Core 2.1或更高版本来测试;在我的笔记本电脑上(它的SIMD性能比我的台式机差),我得到的结果是:
daxpy_naive x10000: 144ms
daxpy_arr_vector x10000: 77ms
daxpy_span x10000: 173ms
daxpy_vector x10000: 67ms
daxpy_vector_no_slice x10000: 67ms
使用代码:
using System;
using System.Diagnostics;
using System.Numerics;
using System.Runtime.InteropServices;

/// <summary>
/// Console micro-benchmark that times five DAXPY implementations
/// (<c>y[i] += x[i] * Alpha</c>) over the same pair of arrays.
/// </summary>
class Program
{
    /// <summary>
    /// Fills <c>x</c> with seeded pseudo-random values, then runs every variant once
    /// without logging and 10000 times with logging.
    /// </summary>
    static void Main(string[] args)
    {
        double alpha = 0.5;
        double[] x = new double[16 * 1024], y = new double[x.Length];
        var rand = new Random(12345);
        for (int i = 0; i < x.Length; i++)
        {
            x[i] = rand.NextDouble();
        }

        // Single unlogged pass first (presumably to get every method JIT-compiled
        // before the timed pass).
        RunAll(alpha, x, y, 1, false);
        RunAll(alpha, x, y, 10000, true);
    }

    /// <summary>
    /// Times each DAXPY variant for <paramref name="loop"/> iterations on the same arrays.
    /// </summary>
    /// <param name="Alpha">Scalar multiplier passed to every variant.</param>
    /// <param name="x">Input array.</param>
    /// <param name="y">Array updated in place by every variant; it keeps accumulating across calls.</param>
    /// <param name="loop">Number of times each variant is invoked.</param>
    /// <param name="log">When true, the elapsed time of each variant is written to the console.</param>
    private static void RunAll(double Alpha, double[] x, double[] y, int loop, bool log)
    {
        GC.Collect(GC.MaxGeneration);
        GC.WaitForPendingFinalizers();

        Measure(nameof(daxpy_naive), loop, log, () => daxpy_naive(Alpha, x, y));
        Measure(nameof(daxpy_arr_vector), loop, log, () => daxpy_arr_vector(Alpha, x, y));
        Measure(nameof(daxpy_span), loop, log, () => daxpy_span(Alpha, x, y));
        Measure(nameof(daxpy_vector), loop, log, () => daxpy_vector(Alpha, x, y));
        Measure(nameof(daxpy_vector_no_slice), loop, log, () => daxpy_vector_no_slice(Alpha, x, y));
    }

    /// <summary>Runs <paramref name="body"/> <paramref name="loop"/> times under a stopwatch and optionally prints the elapsed milliseconds.</summary>
    private static void Measure(string name, int loop, bool log, Action body)
    {
        var watch = Stopwatch.StartNew();
        for (int i = 0; i < loop; i++)
        {
            body();
        }
        watch.Stop();

        if (log)
        {
            Console.WriteLine($"{name} x{loop}: {watch.ElapsedMilliseconds}ms");
        }
    }

    /// <summary>Scalar DAXPY over arrays: <c>y[i] = y[i] + x[i] * Alpha</c> for every index of <paramref name="x"/>.</summary>
    /// <param name="Alpha">Scalar multiplier.</param>
    /// <param name="x">Input array; its length determines how many elements are processed.</param>
    /// <param name="y">Array updated in place; must be at least as long as <paramref name="x"/>.</param>
    public static void daxpy_naive(double Alpha, double[] x, double[] y)
    {
        for (var i = 0; i < x.Length; ++i)
        {
            y[i] = y[i] + x[i] * Alpha;
        }
    }

    /// <summary>DAXPY over arrays that loads, combines and stores one <see cref="Vector{T}"/> at a time, with a scalar loop for the tail.</summary>
    /// <param name="Alpha">Scalar multiplier.</param>
    /// <param name="x">Input array; its length determines how many elements are processed.</param>
    /// <param name="y">Array updated in place; must be at least as long as <paramref name="x"/>.</param>
    public static void daxpy_arr_vector(double Alpha, double[] x, double[] y)
    {
        var i = 0;
        if (Vector.IsHardwareAccelerated)
        {
            // The broadcast multiplier does not change between iterations, so build it once.
            var vAlpha = new Vector<double>(Alpha);

            // Last start index at which a full vector still fits inside x.
            var lastFullBlock = x.Length - Vector<double>.Count;
            for (; i <= lastFullBlock; i += Vector<double>.Count)
            {
                var vx = new Vector<double>(x, i);
                var vy = new Vector<double>(y, i);
                (vy + vx * vAlpha).CopyTo(y, i);
            }
        }

        for (; i < x.Length; ++i)
        {
            y[i] = y[i] + x[i] * Alpha;
        }
    }

    /// <summary>Scalar DAXPY over spans: <c>y[i] += x[i] * Alpha</c> for every index of <paramref name="x"/>.</summary>
    /// <param name="Alpha">Scalar multiplier.</param>
    /// <param name="x">Input span; its length determines how many elements are processed.</param>
    /// <param name="y">Span updated in place; must be at least as long as <paramref name="x"/>.</param>
    public static void daxpy_span(double Alpha, Span<double> x, Span<double> y)
    {
        for (var i = 0; i < x.Length; ++i)
        {
            y[i] += x[i] * Alpha;
        }
    }

    /// <summary>DAXPY over spans reinterpreted as spans of <see cref="Vector{T}"/>; the leftover elements are processed after slicing both spans.</summary>
    /// <param name="Alpha">Scalar multiplier.</param>
    /// <param name="x">Input span; its length determines how many elements are processed.</param>
    /// <param name="y">Span updated in place; must be at least as long as <paramref name="x"/>.</param>
    public static void daxpy_vector(double Alpha, Span<double> x, Span<double> y)
    {
        if (Vector.IsHardwareAccelerated)
        {
            // MemoryMarshal.Cast replaces the preview-only NonPortableCast extension.
            var vx = MemoryMarshal.Cast<double, Vector<double>>(x);
            var vy = MemoryMarshal.Cast<double, Vector<double>>(y);

            var vAlpha = new Vector<double>(Alpha);
            for (var i = 0; i < vx.Length; ++i)
            {
                vy[i] += vx[i] * vAlpha;
            }

            // Skip the same number of elements in both spans so the scalar tail
            // stays aligned even when y is longer than x.
            var processed = Vector<double>.Count * vx.Length;
            x = x.Slice(processed);
            y = y.Slice(processed);
        }

        for (var i = 0; i < x.Length; ++i)
        {
            y[i] += x[i] * Alpha;
        }
    }

    /// <summary>Same as <see cref="daxpy_vector"/>, but continues the scalar tail from an index instead of slicing the spans.</summary>
    /// <param name="Alpha">Scalar multiplier.</param>
    /// <param name="x">Input span; its length determines how many elements are processed.</param>
    /// <param name="y">Span updated in place; must be at least as long as <paramref name="x"/>.</param>
    public static void daxpy_vector_no_slice(double Alpha, Span<double> x, Span<double> y)
    {
        int i = 0;
        if (Vector.IsHardwareAccelerated)
        {
            var vx = MemoryMarshal.Cast<double, Vector<double>>(x);
            var vy = MemoryMarshal.Cast<double, Vector<double>>(y);

            var vAlpha = new Vector<double>(Alpha);
            for (i = 0; i < vx.Length; ++i)
            {
                vy[i] += vx[i] * vAlpha;
            }

            // Convert the vector index back to the element index where the tail starts.
            i = Vector<double>.Count * vx.Length;
        }

        for (; i < x.Length; ++i)
        {
            y[i] += x[i] * Alpha;
        }
    }
}
以上结果是使用 dotnet build -c Release 和 dotnet run -c Release 得到的,dotnet --version 报告的版本为“2.2.0-preview1-008000”(不久前的一个每日构建版本)。
在我的台式机上,我预计差距会更明显。
总结:以上是内存溢出为你收集整理的“C# Vector<double>.CopyTo 只比非SIMD版本快一点?”的全部内容,希望文章能够帮你解决“C# Vector<double>.CopyTo 只比非SIMD版本快一点?”所遇到的程序开发问题。
如果觉得内存溢出网站内容还不错,欢迎将内存溢出网站推荐给程序员好友。
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)