c# – 为什么DoubleStruct相加比double相加慢得多?
假设
对于任何简单的操作,包含单个基元的只读结构应该与基元本身一样快. 测试 以下所有测试都是在Windows 7 x64上的.NET Core 2.2中运行的,并开启了代码优化.在.NET 4.7.2上测试时,我也得到了类似的结果. 测试:long 用long类型测试这个前提,它似乎成立: // =============== SETUP =================== public readonly struct LongStruct { public readonly long Primitive; public LongStruct(long value) => Primitive = value; [MethodImpl(MethodImplOptions.AggressiveInlining)] public static LongStruct Add(in LongStruct lhs,in LongStruct rhs) => new LongStruct(lhs.Primitive + rhs.Primitive); } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static long LongAdd(long lhs,long rhs) => lhs + rhs; // =============== TESTS =================== public static void TestLong(long a,long b,out long result) { var sw = Stopwatch.StartNew(); for (var i = 1000000000; i > 0; --i) { a = LongAdd(a,b); } sw.Stop(); result = a; return sw.ElapsedMilliseconds; } public static void TestLongStruct(LongStruct a,LongStruct b,out LongStruct result) { var sw = Stopwatch.StartNew(); for (var i = 1000000000; i > 0; --i) { a = LongStruct.Add(a,b); } sw.Stop(); result = a; return sw.ElapsedMilliseconds; } // ============= TEST LOOP ================= public static void RunTests() { var longStruct = new LongStruct(1); var count = 0; var longTime = 0L; var longStructTime = 0L; while (true) { count++; Console.WriteLine("Test #" + count); longTime += TestLong(1,1,out var longResult); var longMean = longTime / count; Console.WriteLine($"Long: value={longResult},Mean Time elapsed: {longMean} ms"); longStructTime += TestLongStruct(longStruct,longStruct,out var longStructResult); var longStructMean = longStructTime / count; Console.WriteLine($"LongStruct: value={longStructResult.Primitive},Mean Time elapsed: {longStructMean} ms"); Console.WriteLine(); } } 使用LongAdd是为了让两个测试循环保持一致 – 每次循环都调用一个方法来做加法,而不是在基元的情况下把加法直接写在循环里. 在我的机器上,这两个耗时稳定在彼此相差2%以内,足够接近,以至于我确信它们被优化成了几乎相同的代码. IL的差异相当小: >测试循环的代码相同,只是调用的方法不同(LongAdd vs LongStruct.Add). >多出一对ldfld指令,用于从结构中加载Primitive 所以要么JIT编译器(jitter)把这些指令优化掉了,要么它们基本上没有开销. 
测试:double 如果我采用上面的代码并把每个long替换为double,我会期望得到相同的结果(绝对耗时会慢一些,因为加法指令稍慢,但两者慢的幅度应该相同). 我实际看到的是DoubleStruct版本比double版本慢大约4.8倍(即480%). IL与long的情况相同(只是把int64和LongStruct换成了float64和DoubleStruct),但不知何故,运行时在DoubleStruct的情况下做了一些额外的工作,而这些工作在LongStruct的情况和double的情况中都不存在. 测试:其他类型 测试其他一些基元类型后,我看到float(465%)的表现与double相同,而short和int的表现与long相同,所以看起来是浮点类型导致某些优化没有被应用. 问题 为什么DoubleStruct和FloatStruct比double和float慢得多,而long,int和short的对应结构却没有这样的减速? 解决方法
这本身算不上一个答案,但它是在x86和x64上都做过的更严格的基准测试,希望能为其他能够解释这个问题的人提供更多信息.
我试图用BenchmarkDotNet复现它.我也想知道去掉in会有什么区别.我分别以x86和x64运行了它. x86(LegacyJIT) | Method | Mean | Error | StdDev | |----------------------- |---------:|---------:|---------:| | TestLong | 257.9 ms | 2.099 ms | 1.964 ms | | TestLongStruct | 529.3 ms | 4.977 ms | 4.412 ms | | TestLongStructWithIn | 526.2 ms | 6.722 ms | 6.288 ms | | TestDouble | 256.7 ms | 1.466 ms | 1.300 ms | | TestDoubleStruct | 342.5 ms | 5.189 ms | 4.600 ms | | TestDoubleStructWithIn | 338.7 ms | 3.808 ms | 3.376 ms | x64(RyuJIT) | Method | Mean | Error | StdDev | |----------------------- |-----------:|----------:|----------:| | TestLong | 269.8 ms | 5.359 ms | 9.099 ms | | TestLongStruct | 266.2 ms | 6.706 ms | 8.236 ms | | TestLongStructWithIn | 270.4 ms | 4.150 ms | 3.465 ms | | TestDouble | 270.4 ms | 5.336 ms | 6.748 ms | | TestDoubleStruct | 1,250.9 ms | 24.702 ms | 25.367 ms | | TestDoubleStructWithIn | 577.1 ms | 12.159 ms | 16.644 ms | 我可以在使用RyuJIT的x64上复现这个问题,但在使用LegacyJIT的x86上无法复现.这似乎是因为RyuJIT成功优化了long的情况却没有优化double的情况 – 而LegacyJIT对两者都没有做出优化. 我不知道为什么TestDoubleStruct在RyuJIT上会如此异常. 
码: public readonly struct LongStruct { public readonly long Primitive; public LongStruct(long value) => Primitive = value; public static LongStruct Add(LongStruct lhs,LongStruct rhs) => new LongStruct(lhs.Primitive + rhs.Primitive); public static LongStruct AddWithIn(in LongStruct lhs,in LongStruct rhs) => new LongStruct(lhs.Primitive + rhs.Primitive); } public readonly struct DoubleStruct { public readonly double Primitive; public DoubleStruct(double value) => Primitive = value; public static DoubleStruct Add(DoubleStruct lhs,DoubleStruct rhs) => new DoubleStruct(lhs.Primitive + rhs.Primitive); public static DoubleStruct AddWithIn(in DoubleStruct lhs,in DoubleStruct rhs) => new DoubleStruct(lhs.Primitive + rhs.Primitive); } public class Benchmark { [Benchmark] public void TestLong() { for (var i = 1000000000; i > 0; --i) { LongAdd(1,2); } } [Benchmark] public void TestLongStruct() { var a = new LongStruct(1); var b = new LongStruct(2); for (var i = 1000000000; i > 0; --i) { LongStruct.Add(a,b); } } [Benchmark] public void TestLongStructWithIn() { var a = new LongStruct(1); var b = new LongStruct(2); for (var i = 1000000000; i > 0; --i) { LongStruct.AddWithIn(a,b); } } [Benchmark] public void TestDouble() { for (var i = 1000000000; i > 0; --i) { DoubleAdd(1,2); } } [Benchmark] public void TestDoubleStruct() { var a = new DoubleStruct(1); var b = new DoubleStruct(2); for (var i = 1000000000; i > 0; --i) { DoubleStruct.Add(a,b); } } [Benchmark] public void TestDoubleStructWithIn() { var a = new DoubleStruct(1); var b = new DoubleStruct(2); for (var i = 1000000000; i > 0; --i) { DoubleStruct.AddWithIn(a,b); } } public static long LongAdd(long lhs,long rhs) => lhs + rhs; public static double DoubleAdd(double lhs,double rhs) => lhs + rhs; } class Program { static void Main(string[] args) { var summary = BenchmarkRunner.Run<Benchmark>(); Console.ReadLine(); } } 为了好玩,以下是两种情况下的x64程序集: 码 using System; public class C { public long AddLongs(long a,long b) { return a + b; } 
public LongStruct AddLongStructs(LongStruct a,LongStruct b) { return LongStruct.Add(a,b); } public LongStruct AddLongStructsWithIn(LongStruct a,LongStruct b) { return LongStruct.AddWithIn(a,b); } public double AddDoubles(double a,double b) { return a + b; } public DoubleStruct AddDoubleStructs(DoubleStruct a,DoubleStruct b) { return DoubleStruct.Add(a,b); } public DoubleStruct AddDoubleStructsWithIn(DoubleStruct a,DoubleStruct b) { return DoubleStruct.AddWithIn(a,b); } } public readonly struct LongStruct { public readonly long Primitive; public LongStruct(long value) => Primitive = value; public static LongStruct Add(LongStruct lhs,in LongStruct rhs) => new LongStruct(lhs.Primitive + rhs.Primitive); } public readonly struct DoubleStruct { public readonly double Primitive; public DoubleStruct(double value) => Primitive = value; public static DoubleStruct Add(DoubleStruct lhs,in DoubleStruct rhs) => new DoubleStruct(lhs.Primitive + rhs.Primitive); } x86汇编 C.AddLongs(Int64,Int64) L0000: mov eax,[esp+0xc] L0004: mov edx,[esp+0x10] L0008: add eax,[esp+0x4] L000c: adc edx,[esp+0x8] L0010: ret 0x10 C.AddLongStructs(LongStruct,LongStruct) L0000: push esi L0001: mov eax,[esp+0x10] L0005: mov esi,[esp+0x14] L0009: add eax,[esp+0x8] L000d: adc esi,[esp+0xc] L0011: mov [edx],eax L0013: mov [edx+0x4],esi L0016: pop esi L0017: ret 0x10 C.AddLongStructsWithIn(LongStruct,esi L0016: pop esi L0017: ret 0x10 C.AddDoubles(Double,Double) L0000: fld qword [esp+0xc] L0004: fadd qword [esp+0x4] L0008: ret 0x10 C.AddDoubleStructs(DoubleStruct,DoubleStruct) L0000: fld qword [esp+0xc] L0004: fld qword [esp+0x4] L0008: faddp st1,st0 L000a: fstp qword [edx] L000c: ret 0x10 C.AddDoubleStructsWithIn(DoubleStruct,DoubleStruct) L0000: fld qword [esp+0xc] L0004: fadd qword [esp+0x4] L0008: fstp qword [edx] L000a: ret 0x10 x64汇编 C..ctor() L0000: ret C.AddLongs(Int64,Int64) L0000: lea rax,[rdx+r8] L0004: ret C.AddLongStructs(LongStruct,LongStruct) L0000: lea rax,[rdx+r8] L0004: ret 
C.AddLongStructsWithIn(LongStruct,[rdx+r8] L0004: ret C.AddDoubles(Double,Double) L0000: vzeroupper L0003: vmovaps xmm0,xmm1 L0008: vaddsd xmm0,xmm0,xmm2 L000d: ret C.AddDoubleStructs(DoubleStruct,DoubleStruct) L0000: sub rsp,0x18 L0004: vzeroupper L0007: mov [rsp+0x28],rdx L000c: mov [rsp+0x30],r8 L0011: mov rax,[rsp+0x28] L0016: mov [rsp+0x10],rax L001b: mov rax,[rsp+0x30] L0020: mov [rsp+0x8],rax L0025: vmovsd xmm0,qword [rsp+0x10] L002c: vaddsd xmm0,[rsp+0x8] L0033: vmovsd [rsp],xmm0 L0039: mov rax,[rsp] L003d: add rsp,0x18 L0041: ret C.AddDoubleStructsWithIn(DoubleStruct,DoubleStruct) L0000: push rax L0001: vzeroupper L0004: mov [rsp+0x18],rdx L0009: mov [rsp+0x20],r8 L000e: vmovsd xmm0,qword [rsp+0x18] L0015: vaddsd xmm0,[rsp+0x20] L001c: vmovsd [rsp],xmm0 L0022: mov rax,[rsp] L0026: add rsp,0x8 L002a: ret SharpLab 如果你添加循环: 码 public class C { public void AddLongs(long a,long b) { for (var i = 1000000000; i > 0; --i) { long c = a + b; } } public void AddLongStructs(LongStruct a,LongStruct b) { for (var i = 1000000000; i > 0; --i) { a = LongStruct.Add(a,b); } } public void AddLongStructsWithIn(LongStruct a,LongStruct b) { for (var i = 1000000000; i > 0; --i) { a = LongStruct.AddWithIn(a,b); } } public void AddDoubles(double a,double b) { for (var i = 1000000000; i > 0; --i) { a = a + b; } } public void AddDoubleStructs(DoubleStruct a,DoubleStruct b) { for (var i = 1000000000; i > 0; --i) { a = DoubleStruct.Add(a,b); } } public void AddDoubleStructsWithIn(DoubleStruct a,DoubleStruct b) { for (var i = 1000000000; i > 0; --i) { a = DoubleStruct.AddWithIn(a,b); } } } public readonly struct LongStruct { public readonly long Primitive; public LongStruct(long value) => Primitive = value; public static LongStruct Add(LongStruct lhs,in DoubleStruct rhs) => new DoubleStruct(lhs.Primitive + rhs.Primitive); } 86 C.AddLongs(Int64,Int64) L0000: push ebp L0001: mov ebp,esp L0003: mov eax,0x3b9aca00 L0008: dec eax L0009: test eax,eax L000b: jg L0008 L000d: pop ebp L000e: ret 
0x10 C.AddLongStructs(LongStruct,LongStruct) L0000: push ebp L0001: mov ebp,esp L0003: push esi L0004: mov esi,0x3b9aca00 L0009: mov eax,[ebp+0x10] L000c: mov edx,[ebp+0x14] L000f: add eax,[ebp+0x8] L0012: adc edx,[ebp+0xc] L0015: mov [ebp+0x10],eax L0018: mov [ebp+0x14],edx L001b: dec esi L001c: test esi,esi L001e: jg L0009 L0020: pop esi L0021: pop ebp L0022: ret 0x10 C.AddLongStructsWithIn(LongStruct,esi L001e: jg L0009 L0020: pop esi L0021: pop ebp L0022: ret 0x10 C.AddDoubles(Double,Double) L0000: push ebp L0001: mov ebp,eax L000b: jg L0008 L000d: pop ebp L000e: ret 0x10 C.AddDoubleStructs(DoubleStruct,DoubleStruct) L0000: push ebp L0001: mov ebp,0x3b9aca00 L0008: fld qword [ebp+0x10] L000b: fld qword [ebp+0x8] L000e: faddp st1,st0 L0010: fstp qword [ebp+0x10] L0013: dec eax L0014: test eax,eax L0016: jg L0008 L0018: pop ebp L0019: ret 0x10 C.AddDoubleStructsWithIn(DoubleStruct,0x3b9aca00 L0008: fld qword [ebp+0x10] L000b: fadd qword [ebp+0x8] L000e: fstp qword [ebp+0x10] L0011: dec eax L0012: test eax,eax L0014: jg L0008 L0016: pop ebp L0017: ret 0x10 64位 C.AddLongs(Int64,0x3b9aca00 L0005: dec eax L0007: test eax,eax L0009: jg L0005 L000b: ret C.AddLongStructs(LongStruct,LongStruct) L0000: mov eax,0x3b9aca00 L0005: add rdx,r8 L0008: dec eax L000a: test eax,eax L000c: jg L0005 L000e: ret C.AddLongStructsWithIn(LongStruct,eax L000c: jg L0005 L000e: ret C.AddDoubles(Double,Double) L0000: vzeroupper L0003: mov eax,0x3b9aca00 L0008: vaddsd xmm1,xmm1,xmm2 L000d: dec eax L000f: test eax,eax L0011: jg L0008 L0013: ret C.AddDoubleStructs(DoubleStruct,r8 L0011: mov eax,0x3b9aca00 L0016: mov rdx,[rsp+0x28] L001b: mov [rsp+0x10],rdx L0020: mov rdx,[rsp+0x30] L0025: mov [rsp+0x8],rdx L002a: vmovsd xmm0,qword [rsp+0x10] L0031: vaddsd xmm0,[rsp+0x8] L0038: vmovsd [rsp],xmm0 L003e: mov rdx,[rsp] L0042: mov [rsp+0x28],rdx L0047: dec eax L0049: test eax,eax L004b: jg L0016 L004d: add rsp,0x18 L0051: ret C.AddDoubleStructsWithIn(DoubleStruct,r8 L000e: mov eax,0x3b9aca00 L0013: 
vmovsd xmm0,qword [rsp+0x20] L001a: vmovaps xmm1,xmm0 L001f: vaddsd xmm1,[rsp+0x18] L0026: vmovsd [rsp],xmm1 L002c: mov rdx,[rsp] L0030: mov [rsp+0x18],rdx L0035: dec eax L0037: test eax,eax L0039: jg L001a L003b: add rsp,0x8 L003f: ret SharpLab 我对汇编不够熟悉,无法解释它究竟在做什么,但很明显AddDoubleStructs所做的工作比AddLongStructs多. (编辑:李大同) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意中侵犯了您的权利,请及时联系站长删除相关内容! |