lethern · November 6, 2018 04:56
diff --git a/thread_test.cpp b/thread_test.cpp
 // May work with compiler optimizations turned on, but the results are more reliable with them disabled

 #include <atomic>
 #include <iostream>
 #include <string>
 #include <thread>
 #include <Windows.h> // for Sleep()

 using uint64 = unsigned long long;
 
 // Number of worker threads started per test. More threads make each test run
 // longer (lower n to compensate); per the original note, 99 or more breaks the
 // slot layout of the shared array.
 constexpr int CPU_N = 4;
 // Number of additions every worker performs before it returns.
 constexpr int n = 100000000;

 // State shared by all worker threads of a single test run.
 struct shared_data {
 	// Start signal: set once by the launching thread while the workers spin on it.
 	// std::atomic (not volatile) so the cross-thread hand-off is well defined.
 	std::atomic<bool> start{ false };
 	// Counter slots. Kept as plain volatile ints on purpose: each addition has to
 	// be a real memory access, which is what the benchmark measures.
 	//   arr[i*100]          = increment value for thread i
 	//   arr[i*100 + 1]      = counter of thread i (different cache, per the original note)
 	//   arr[2], arr[3], ... = counters stored side by side (same cache)
 	volatile int arr[CPU_N * 101 + 5] ={ 0 };
 	// Seeds the increment slot of every thread with 1.
 	shared_data() {
 		for( int i=0; i < CPU_N; ++i )
 			arr[i * 100] = 1;
 	}
 };

 // Prints every non-zero counter of `shared` to `os` and returns `os`.
 // Spread-out counters (arr[i*100 + 1]) are labelled "i<thread>", packed
 // counters (arr[2..CPU_N]) are labelled "j<index>"; each value is followed by
 // its size in whole millions.
 std::ostream& operator <<( std::ostream& os, shared_data& shared ) {
 	for( int i=0; i < CPU_N; ++i ) {
 		const int val = shared.arr[i * 100 + 1];
 		// Write to the stream we were given, not unconditionally to std::cout.
 		if( val )  os << " i" << (i + 1) << " " << val << " (" << val / 1000000 << " M)\n";
 	}
 	// Starts at 2: arr[1] belongs to both layouts and was already printed as "i1".
 	for( int j=2; j < CPU_N + 1; ++j ) {
 		const int val = shared.arr[j];
 		if( val )  os << " j" << j << " " << val << " (" << val / 1000000 << " M)\n";
 	}
 	return os;
 }

 // Per-thread argument bundle handed to a worker function.
 struct thread_data {
 	// Default initializers keep a not-yet-initialized entry in a known state
 	// instead of holding indeterminate pointers.
 	shared_data* shared = nullptr;  // state shared with the other workers
 	int number = 0;                 // zero-based index of this worker
 	std::thread* t = nullptr;       // thread running this worker (not owned here)
 	// Stores the shared state pointer and this worker's index.
 	void init( shared_data* s, int n ) {
 		shared= s;
 		number= n;
 	}
 	// Stores the thread object that runs this worker.
 	void init_t( std::thread* _t ) {
 		t= _t;
 	}
 };


 // Worker 1: after the start flag is raised, increments this thread's own
 // counter at arr[number * 100 + 1] exactly n times.
 void parallelized_func_1( thread_data* s ) {
 	auto& state = *s->shared;
 	const int slot = 1 + 100 * s->number;
 	// Spin until the launcher raises the start flag.
 	while( !state.start ) {
 	}
 	for( int pass = 0; pass < n; ++pass )
 		state.arr[slot]++;
 }

 // Worker 2: after the start flag is raised, increments the counter at
 // arr[number + 1] exactly n times, so the counters of all workers sit in
 // neighbouring array elements.
 void parallelized_func_2( thread_data* s ) {
 	auto& state = *s->shared;
 	const int slot = 1 + s->number;
 	// Spin until the launcher raises the start flag.
 	while( !state.start ) {
 	}
 	for( int pass = 0; pass < n; ++pass )
 		state.arr[slot]++;
 }

 // Worker 3: after the start flag is raised, adds arr[0] to this thread's own
 // counter at arr[number * 100 + 1] exactly n times; every worker reads the
 // same source element arr[0].
 void parallelized_func_3( thread_data* s ) {
 	auto& state = *s->shared;
 	const int slot = 1 + 100 * s->number;
 	// Spin until the launcher raises the start flag.
 	while( !state.start ) {
 	}
 	for( int pass = 0; pass < n; ++pass )
 		state.arr[slot]+= state.arr[0];
 }

 // Worker 4: after the start flag is raised, adds this thread's own increment
 // value arr[number * 100] to its own counter arr[number * 100 + 1] exactly
 // n times.
 void parallelized_func_4( thread_data* s ) {
 	auto& state = *s->shared;
 	const int base = 100 * s->number;
 	const int slot = base + 1;
 	const int step_slot = base;
 	// Spin until the launcher raises the start flag.
 	while( !state.start ) {
 	}
 	for( int pass = 0; pass < n; ++pass )
 		state.arr[slot]+= state.arr[step_slot];
 }

 void test( void( *func )(thread_data* s), std::string description );

 void thread_test()
 {
 	test(parallelized_func_1, "Save on diff cache line");
 	test(parallelized_func_2, "Save on same cache line");
 	test(parallelized_func_3, "Save on diff cache line, read from same cache line");
 	test(parallelized_func_4, "Save on diff cache line, read from diff cache line");
 }

 // Runs one scenario: starts CPU_N threads executing `func`, releases them all
 // at once, prints `description` plus the counters reached after 50 ms, and
 // then waits for every worker to finish its full run.
 //   func        - worker entry point; receives a pointer to its thread_data
 //   description - heading printed above the counters
 void test( void( *func )(thread_data* s), std::string description )
 {
 	shared_data shared;
 	thread_data data[CPU_N];
 	// Thread objects are held by value: no manual new/delete to keep in sync.
 	std::thread workers[CPU_N];
 	for( int i=0; i < CPU_N; ++i ) {
 		data[i].init( &shared, i );
 		workers[i] = std::thread( func, &data[i] );
 		data[i].init_t( &workers[i] );
 	}
 	// Workers spin on this flag, so they all begin counting only now.
 	shared.start= true;
 	Sleep( 50 );
 	// Snapshot taken while the workers are still running.
 	std::cout << description << '\n' << shared << std::endl;
 	for( std::thread& worker : workers )
 		worker.join();
 }


 /*
 Each "number (number M)" is the running sum for the given thread, i.e. the additions it performed in the first 50 ms (M = millions)

 cpu=2
 Save on diff cache line
 i1 19610018 (19 M)
 i2 20091855 (20 M)
 Save on same cache line
 i1 7833202 (7 M)
 j2 8351601 (8 M)
 Save on diff cache line, read from same cache line
 i1 12648547 (12 M)
 i2 5045249 (5 M)
 Save on diff cache line, read from diff cache line
 i1 4268625 (4 M)
 i2 16202878 (16 M)


 cpu=4
 Save on diff cache line
 i1 8232565 (8 M)
 i2 8713713 (8 M)
 i3 24232903 (24 M)
 i4 19517495 (19 M)
 Save on same cache line
 i1 6144409 (6 M)
 j2 5420106 (5 M)
 j3 5421032 (5 M)
 j4 6614839 (6 M)
 Save on diff cache line, read from same cache line
 i1 2639208 (2 M)
 i2 9254534 (9 M)
 i3 19828949 (19 M)
 i4 12866018 (12 M)
 Save on diff cache line, read from diff cache line
 i1 23546340 (23 M)
 i2 14772822 (14 M)
 i3 16452606 (16 M)
 i4 24126207 (24 M)
 
 cpu=8
 Save on diff cache line
 i1 10473691 (10 M)
 i2 6352630 (6 M)
 i3 7592871 (7 M)
 i4 10344223 (10 M)
 i5 18681724 (18 M)
 i6 13409743 (13 M)
 i7 21532665 (21 M)
 i8 21089546 (21 M)
 Save on same cache line
 i1 3459366 (3 M)
 j2 2143212 (2 M)
 j3 3425390 (3 M)
 j4 5591536 (5 M)
 j5 6009022 (6 M)
 j6 2654168 (2 M)
 j7 6207111 (6 M)
 j8 4458036 (4 M)
 Save on diff cache line, read from same cache line
 i1 4561187 (4 M)
 i2 5034110 (5 M)
 i3 2119836 (2 M)
 i4 4495064 (4 M)
 i5 14529670 (14 M)
 i6 7462099 (7 M)
 i7 14228359 (14 M)
 i8 14074283 (14 M)
 Save on diff cache line, read from diff cache line
 i1 5470949 (5 M)
 i2 5348128 (5 M)
 i3 8305827 (8 M)
 i4 6489701 (6 M)
 i5 15874218 (15 M)
 i6 12931176 (12 M)
 i7 15900159 (15 M)
 i8 16156779 (16 M)

 */
	// May work with compiler optimizations turned on, but the results are more reliable with them disabled

	#include <thread>
	#include <iostream>
	#include <string>
	#include <Windows.h> // for Sleep()

	using uint64 = unsigned long long;

	// Number of worker threads started per test. More threads make each test run
	// longer (lower n to compensate); per the original note, 99 or more breaks the
	// slot layout of the shared array.
	constexpr int CPU_N = 4;
	// Number of additions every worker performs before it returns.
	constexpr int n = 100000000;

	// State shared by all worker threads of a single test run.
	struct shared_data {
		// Start signal: set once by the launching thread while the workers spin on it.
		// std::atomic (not volatile) so the cross-thread hand-off is well defined.
		std::atomic<bool> start{ false };
		// Counter slots. Kept as plain volatile ints on purpose: each addition has to
		// be a real memory access, which is what the benchmark measures.
		//   arr[i*100]          = increment value for thread i
		//   arr[i*100 + 1]      = counter of thread i (different cache, per the original note)
		//   arr[2], arr[3], ... = counters stored side by side (same cache)
		volatile int arr[CPU_N * 101 + 5] ={ 0 };
		// Seeds the increment slot of every thread with 1.
		shared_data() {
			for( int i=0; i < CPU_N; ++i )
				arr[i * 100] = 1;
		}
	};

	// Prints every non-zero counter of `shared` to `os` and returns `os`.
	// Spread-out counters (arr[i*100 + 1]) are labelled "i<thread>", packed
	// counters (arr[2..CPU_N]) are labelled "j<index>"; each value is followed by
	// its size in whole millions.
	std::ostream& operator <<( std::ostream& os, shared_data& shared ) {
		for( int i=0; i < CPU_N; ++i ) {
			const int val = shared.arr[i * 100 + 1];
			// Write to the stream we were given, not unconditionally to std::cout.
			if( val ) os << " i" << (i + 1) << " " << val << " (" << val / 1000000 << " M)\n";
		}
		// Starts at 2: arr[1] belongs to both layouts and was already printed as "i1".
		for( int j=2; j < CPU_N + 1; ++j ) {
			const int val = shared.arr[j];
			if( val ) os << " j" << j << " " << val << " (" << val / 1000000 << " M)\n";
		}
		return os;
	}

	// Per-thread argument bundle handed to a worker function.
	struct thread_data {
		// Default initializers keep a not-yet-initialized entry in a known state
		// instead of holding indeterminate pointers.
		shared_data* shared = nullptr;  // state shared with the other workers
		int number = 0;                 // zero-based index of this worker
		std::thread* t = nullptr;       // thread running this worker (not owned here)
		// Stores the shared state pointer and this worker's index.
		void init( shared_data* s, int n ) {
			shared= s;
			number= n;
		}
		// Stores the thread object that runs this worker.
		void init_t( std::thread* _t ) {
			t= _t;
		}
	};


	// Worker 1: after the start flag is raised, increments this thread's own
	// counter at arr[number * 100 + 1] exactly n times.
	void parallelized_func_1( thread_data* s ) {
		auto& state = *s->shared;
		const int slot = 1 + 100 * s->number;
		// Spin until the launcher raises the start flag.
		while( !state.start ) {
		}
		for( int pass = 0; pass < n; ++pass )
			state.arr[slot]++;
	}

	// Worker 2: after the start flag is raised, increments the counter at
	// arr[number + 1] exactly n times, so the counters of all workers sit in
	// neighbouring array elements.
	void parallelized_func_2( thread_data* s ) {
		auto& state = *s->shared;
		const int slot = 1 + s->number;
		// Spin until the launcher raises the start flag.
		while( !state.start ) {
		}
		for( int pass = 0; pass < n; ++pass )
			state.arr[slot]++;
	}

	// Worker 3: after the start flag is raised, adds arr[0] to this thread's own
	// counter at arr[number * 100 + 1] exactly n times; every worker reads the
	// same source element arr[0].
	void parallelized_func_3( thread_data* s ) {
		auto& state = *s->shared;
		const int slot = 1 + 100 * s->number;
		// Spin until the launcher raises the start flag.
		while( !state.start ) {
		}
		for( int pass = 0; pass < n; ++pass )
			state.arr[slot]+= state.arr[0];
	}

	// Worker 4: after the start flag is raised, adds this thread's own increment
	// value arr[number * 100] to its own counter arr[number * 100 + 1] exactly
	// n times.
	void parallelized_func_4( thread_data* s ) {
		auto& state = *s->shared;
		const int base = 100 * s->number;
		const int slot = base + 1;
		const int step_slot = base;
		// Spin until the launcher raises the start flag.
		while( !state.start ) {
		}
		for( int pass = 0; pass < n; ++pass )
			state.arr[slot]+= state.arr[step_slot];
	}

	void test( void( func )(thread_data s), std::string description );

	void thread_test()
	{
	test(parallelized_func_1, "Save on diff cache line");
	test(parallelized_func_2, "Save on same cache line");
	test(parallelized_func_3, "Save on diff cache line, read from same cache line");
	test(parallelized_func_4, "Save on diff cache line, read from diff cache line");
	}

	void test( void( func )(thread_data s), std::string description )
	{
	shared_data shared;

	thread_data data[CPU_N];
	std::thread* t[CPU_N];

	for( int i=0; i < CPU_N; ++i ) {
	data[i].init( &shared, i );
	t[i] = new std::thread( func, &data[i] );
	data[i].init_t( t[i] );
	}

	shared.start= true;
	Sleep( 50 );

	std::cout << description << '\n' << shared << std::endl;

	for( int i=0; i < CPU_N; ++i ) {
	t[i]->join();
	delete t[i];
	}
	}


	/*
	Each "number (number M)" is the running sum for the given thread, i.e. the additions it performed in the first 50 ms (M = millions)

	cpu=2
	Save on diff cache line
	i1 19610018 (19 M)
	i2 20091855 (20 M)
	Save on same cache line
	i1 7833202 (7 M)
	j2 8351601 (8 M)
	Save on diff cache line, read from same cache line
	i1 12648547 (12 M)
	i2 5045249 (5 M)
	Save on diff cache line, read from diff cache line
	i1 4268625 (4 M)
	i2 16202878 (16 M)


	cpu=4
	Save on diff cache line
	i1 8232565 (8 M)
	i2 8713713 (8 M)
	i3 24232903 (24 M)
	i4 19517495 (19 M)
	Save on same cache line
	i1 6144409 (6 M)
	j2 5420106 (5 M)
	j3 5421032 (5 M)
	j4 6614839 (6 M)
	Save on diff cache line, read from same cache line
	i1 2639208 (2 M)
	i2 9254534 (9 M)
	i3 19828949 (19 M)
	i4 12866018 (12 M)
	Save on diff cache line, read from diff cache line
	i1 23546340 (23 M)
	i2 14772822 (14 M)
	i3 16452606 (16 M)
	i4 24126207 (24 M)

	cpu=8
	Save on diff cache line
	i1 10473691 (10 M)
	i2 6352630 (6 M)
	i3 7592871 (7 M)
	i4 10344223 (10 M)
	i5 18681724 (18 M)
	i6 13409743 (13 M)
	i7 21532665 (21 M)
	i8 21089546 (21 M)
	Save on same cache line
	i1 3459366 (3 M)
	j2 2143212 (2 M)
	j3 3425390 (3 M)
	j4 5591536 (5 M)
	j5 6009022 (6 M)
	j6 2654168 (2 M)
	j7 6207111 (6 M)
	j8 4458036 (4 M)
	Save on diff cache line, read from same cache line
	i1 4561187 (4 M)
	i2 5034110 (5 M)
	i3 2119836 (2 M)
	i4 4495064 (4 M)
	i5 14529670 (14 M)
	i6 7462099 (7 M)
	i7 14228359 (14 M)
	i8 14074283 (14 M)
	Save on diff cache line, read from diff cache line
	i1 5470949 (5 M)
	i2 5348128 (5 M)
	i3 8305827 (8 M)
	i4 6489701 (6 M)
	i5 15874218 (15 M)
	i6 12931176 (12 M)
	i7 15900159 (15 M)
	i8 16156779 (16 M)

	*/
No results found