Skip to content

Instantly share code, notes, and snippets.

@20chan
Last active May 20, 2019 07:34
Show Gist options
  • Select an option

  • Save 20chan/d8b3d0bde93ebfdeefd94164a721059c to your computer and use it in GitHub Desktop.

Select an option

Save 20chan/d8b3d0bde93ebfdeefd94164a721059c to your computer and use it in GitHub Desktop.
c# parallel sum 퍼포먼스 비교

https://michaelscodingspot.com/array-iteration-vs-parallelism-in-c-net/?utm_source=csharpdigest&utm_medium=email&utm_campaign=featured

이 글을 보고 따라해봤다

총 10000번을 반복하고, 글처럼 아이템 개수를 바꿔가면서 테스트해봤다

length: 1000

Sum: 000005 ms
SumParallelForInterlocked: 000055 ms
SumParallelForWithLocalFinally: 000034 ms
SumSIMD: 000005 ms

length: 100000

Sum: 000618 ms
SumParallelForInterlocked: 000037 ms
SumParallelForWithLocalFinally: 000029 ms
SumSIMD: 000484 ms

length: 1000000

Sum: 005403 ms
SumParallelForInterlocked: 000046 ms
SumParallelForWithLocalFinally: 000031 ms
SumSIMD: 005104 ms

와 정말 쩐다

추가로 BenchmarkDotNet 을 사용해봤다

|                         Method | Length |         Mean |        Error |       StdDev |  Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated |
|------------------------------- |------- |-------------:|-------------:|-------------:|-------:|--------:|------:|------:|------:|----------:|
|                            Sum |   1000 |     583.5 ns |     49.69 ns |     32.87 ns |   1.00 |    0.00 |     - |     - |     - |         - |
|      SumParallelForInterlocked |   1000 |  52,022.5 ns | 39,320.37 ns | 26,007.99 ns |  90.47 |   48.20 |     - |     - |     - |    1944 B |
| SumParallelForWithLocalFinally |   1000 |  36,386.6 ns | 11,210.86 ns |  7,415.29 ns |  62.34 |   12.07 |     - |     - |     - |    2048 B |
|                        SumSIMD |   1000 | 524,825.3 ns | 16,917.59 ns |  8,848.23 ns | 888.92 |   42.91 |     - |     - |     - |         - |
|                                |        |              |              |              |        |         |       |       |       |           |
|                            Sum |  10000 |  13,953.5 ns | 13,970.95 ns |  9,240.92 ns |   1.00 |    0.00 |     - |     - |     - |         - |
|      SumParallelForInterlocked |  10000 |  48,726.4 ns | 33,407.44 ns | 22,096.95 ns |   5.36 |    4.56 |     - |     - |     - |    1944 B |
| SumParallelForWithLocalFinally |  10000 |  35,280.7 ns |  8,949.82 ns |  5,919.75 ns |   3.74 |    2.33 |     - |     - |     - |    2003 B |
|                        SumSIMD |  10000 | 545,092.6 ns | 42,120.09 ns | 25,064.99 ns |  61.84 |   33.05 |     - |     - |     - |         - |
|                                |        |              |              |              |        |         |       |       |       |           |
|                            Sum | 100000 |  55,138.5 ns |    716.09 ns |    374.53 ns |   1.00 |    0.00 |     - |     - |     - |         - |
|      SumParallelForInterlocked | 100000 |  33,248.6 ns | 13,561.80 ns |  8,070.41 ns |   0.62 |    0.14 |     - |     - |     - |    1944 B |
| SumParallelForWithLocalFinally | 100000 |  37,787.2 ns |  9,927.10 ns |  6,566.16 ns |   0.67 |    0.13 |     - |     - |     - |    2048 B |
|                        SumSIMD | 100000 | 548,687.0 ns | 27,351.00 ns | 18,090.99 ns |   9.95 |    0.28 |     - |     - |     - |         - |

결과가 다소 다르긴 하네

using System;
using System.Diagnostics;
using System.Linq;
using System.Numerics;
using System.Threading;
using System.Threading.Tasks;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;
namespace simd_test {
[SimpleJob(targetCount: 10, invocationCount: 3)]
[MemoryDiagnoser]
public class Program {
static void Main(string[] args) {
BenchmarkRunner.Run<Program>();
}
[Params(1000, 10000, 100000)]
public int Length;
public int[] data;
[GlobalSetup]
public void SetUp() {
var random = new Random();
data = new int[1000000];
for (int i = 0; i < data.Length; i++) {
data[i] = random.Next();
}
}
[Benchmark(Baseline = true)]
public void Sum() {
long result = 0;
for (int i = 0; i < Length; i++) {
result += data[i];
}
}
[Benchmark]
public void SumParallelForInterlocked() {
long result = 0;
Parallel.For(0, 8, i => {
for (int j = i * 8; j < (i + 1) * 8; j++) {
Interlocked.Add(ref result, data[j]);
}
});
}
[Benchmark]
public void SumParallelForWithLocalFinally() {
long result = 0;
Parallel.For(0, 8,
localInit: () => 0L,
localFinally: total => result += total,
body: (i, s, t) => {
for (int j = i * 8; j < (i + 1) * 8; j++) {
t += data[j];
}
return t;
});
}
[Benchmark]
public void SumSIMD() {
long result = 0;
var simdLength = Vector<int>.Count;
int i = 0;
int lastSafeVectorIndex = data.Length - simdLength;
for (i = 0; i <= lastSafeVectorIndex; i+= simdLength) {
result = result + data[i] + data[i + 1] + data[i + 2] + data[i + 3]
+ data[i + 4] + data[i + 5] + data[i + 6] + data[i + 7];
}
for (; i < data.Length; i++) {
result += data[i];
}
}
}
}
using System;
using System.Diagnostics;
using System.Linq;
using System.Numerics;
using System.Threading;
using System.Threading.Tasks;
namespace simd_test {
class Program {
private static int[] data;
static void Main(string[] args) {
var random = new Random();
data = new int[100000];
for (int i = 0; i < data.Length; i++) {
data[i] = random.Next();
}
Console.WriteLine($"IsHardwareAccelerated: {Vector.IsHardwareAccelerated}");
PerformanceTest(nameof(Sum), Sum);
PerformanceTest(nameof(SumParallelForInterlocked), SumParallelForInterlocked);
PerformanceTest(nameof(SumParallelForWithLocalFinally), SumParallelForWithLocalFinally);
PerformanceTest(nameof(SumSIMD), SumSIMD);
}
static void PerformanceTest(string name, Action<int[]> func) {
var stopwatch = new Stopwatch();
stopwatch.Start();
for (int i = 0; i < 10000; i++) {
func(data);
}
stopwatch.Stop();
Console.WriteLine($"{name}: {stopwatch.ElapsedMilliseconds:000000} ms");
}
static void Sum(int[] input) {
long result = 0;
var length = input.Length;
for (int i = 0; i < length; i++) {
result += input[i];
}
}
static void SumParallelForInterlocked(int[] input) {
long result = 0;
Parallel.For(0, 8, i => {
for (int j = i * 8; j < (i + 1) * 8; j++) {
Interlocked.Add(ref result, input[j]);
}
});
}
static void SumParallelForWithLocalFinally(int[] input) {
long result = 0;
Parallel.For(0, 8,
localInit: () => 0L,
localFinally: total => result += total,
body: (i, s, t) => {
for (int j = i * 8; j < (i + 1) * 8; j++) {
t += input[j];
}
return t;
});
}
static void SumSIMD(int[] input) {
long result = 0;
var simdLength = Vector<int>.Count;
int i = 0;
int lastSafeVectorIndex = input.Length - simdLength;
for (i = 0; i <= lastSafeVectorIndex; i+= simdLength) {
result = result + input[i] + input[i + 1] + input[i + 2] + input[i + 3]
+ input[i + 4] + input[i + 5] + input[i + 6] + input[i + 7];
}
for (; i < input.Length; i++) {
result += input[i];
}
result /= input.Length;
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment