Last active
September 7, 2016 19:49
-
-
Save edefazio/c999d89ad7a6bf374884cc7c02be3367 to your computer and use it in GitHub Desktop.
SWAR Data Locality and Java Lambdas (Query performance of Lambdas verses SWAR binary packed rows)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package swar.subword; | |
import java.util.ArrayList; | |
import java.util.List; | |
import java.util.Random; | |
import java.util.concurrent.Callable; | |
import java.util.concurrent.ExecutorService; | |
import java.util.concurrent.Executors; | |
import java.util.concurrent.Future; | |
/** | |
* Illustrates that doing the SAME work | |
* (querying a large dataset of XY points and counting the number of | |
* XY points where X and Y coordinates are BOTH Odd) | |
* can have vastly different performance because of Data Representation. | |
* | |
* Two key take-aways are: | |
* <UL> | |
* <LI>Single threaded Lambdas perform worse - Because Java objects do not | |
* have locality the baseline performance of the( single threaded ) | |
* Lambda expression over the collection is (2-3x) | |
* worse than with SWAR (which is efficiently packing the data within the | |
* data cache without pointer indirection) | |
*<PRE> | |
* List Lambda <B>15943ms</B> for (10000x) scanned 100000 found 250050000 | |
* Compound SWAR Bit Mask <B>4253ms</B> for (10000x) scanned 100000 found 250050000 | |
*</PRE> | |
* <LI>Lambdas dont scale as well - When converting the Lambda | |
* from a stream to parallel stream (to utilize multiple threads) the | |
* we get a 1.5x speedup (on a 4 core machine) alternatively, | |
* SWAR gets about a 2.6x speedup using 4 threads | |
*<PRE> | |
* List Lambda 5460ms for (10x) scanned 10000000 found 24992440 | |
* List Lambda (Parallel) 3473ms for (10x) scanned 10000000 found 24992440 | |
* 1.5x scale for 4 threads | |
* | |
* SWAR 1244ms for (10x) scanned 10000000 found 24992440 | |
* SWAR (4 threads) 468ms for (10x) scanned 10000000 found 24992440 | |
* 2.6x scale for 4 threads | |
*</PRE> | |
*</UL> | |
* | |
* In addition, it shows that traditional Java Objects suffer | |
* | |
* SWAR (SIMD Within a Register) is a technique for packing more than | |
* one field into a register sized word | |
* | |
* @author M. Eric DeFazio [email protected] | |
*/ | |
public class SWARPacked_v_ListLambda_Perf | |
{ | |
public static class XY | |
{ | |
public final int x; | |
public final int y; | |
public XY( int x, int y ) | |
{ | |
this.x = x; | |
this.y = y; | |
} | |
public boolean bothOdd() | |
{ | |
return ( x & 1 ) == 1 && | |
( y & 1 ) == 1; | |
} | |
} | |
private static final int ROW_COUNT = 10000000; | |
private static final int SCAN_ITERATIONS = 10; | |
public static void main( String[] args ) | |
{ | |
/** Generate Random Test Data */ | |
long[] xyCompound = new long[ ROW_COUNT ]; | |
List<XY> xyList = new ArrayList<XY>( ROW_COUNT ); | |
Random r = new Random(); | |
for ( int i = 0; i < ROW_COUNT; i++ ) | |
{ | |
int x = r.nextInt(); | |
int y = r.nextInt(); | |
//we need this & MASK to avoid Java sign extension | |
xyCompound[ i ] = ( x + 0L << 32 ) | ( y & ( -1L >>> 32 ) ); | |
xyList.add( new XY( x, y ) ); | |
} | |
/** lets try a GC BEFORE running the timings*/ | |
System.gc(); | |
timeLambdaScan( xyList, SCAN_ITERATIONS ); | |
timeLambdaParallelScan( xyList, SCAN_ITERATIONS ); | |
timeSWARMask( xyCompound, SCAN_ITERATIONS ); | |
timeMultiThreadedSWARMask( xyCompound, 4, SCAN_ITERATIONS ); | |
} | |
public static void timeLambdaScan ( List<XY> xyList, int iterations ) | |
{ | |
int found = 0; | |
long start = System.currentTimeMillis(); | |
found = listLambdaScan( xyList, iterations ); | |
long end = System.currentTimeMillis(); | |
System.out.println ("List Lambda "+(end-start) | |
+"ms for ("+iterations+"x) scanned "+xyList.size()+" found "+found); | |
} | |
public static void timeLambdaParallelScan ( List<XY> xyList, int iterations ) | |
{ | |
int found = 0; | |
long start = System.currentTimeMillis(); | |
found = listLambdaParallelScan(xyList, iterations ); | |
long end = System.currentTimeMillis(); | |
System.out.println ("List Lambda (Parallel) "+(end-start)+"ms for ("+iterations+"x) scanned "+xyList.size()+" found "+found); | |
} | |
public static int listLambdaScan ( List<XY> xyList, int iterations ) | |
{ | |
int found = 0; | |
for (int it = 0; it < iterations; it++ ) | |
{ | |
found+= xyList.stream() | |
.filter(s -> s.bothOdd()) | |
.count(); | |
} | |
return found; | |
} | |
public static int listLambdaParallelScan ( List<XY> xyList, int iterations ) | |
{ | |
int found = 0; | |
for (int it = 0; it < iterations; it++ ) | |
{ | |
found+= xyList.parallelStream() | |
.filter(s -> s.bothOdd()) | |
.count(); | |
} | |
return found; | |
} | |
private static void timeSWARMask ( long[] xyCompound, int iterations ) | |
{ | |
long start = System.currentTimeMillis(); | |
int found = compoundMaskBothOdd( xyCompound, iterations ); | |
long end = System.currentTimeMillis(); | |
System.out.println( "Compound SWAR Bit Mask " + (end-start) | |
+ "ms for (" + iterations + "x) scanned " + xyCompound.length | |
+ " found " + found); | |
} | |
public static final long BOTH_ODD_MASK = ( 1L << 32 ) + 1; | |
public static int compoundMaskBothOdd( long[] xyCompound, int iterations ) | |
{ | |
int found = 0; | |
for( int it = 0; it < iterations; it++ ) | |
{ | |
for( int i = 0; i < xyCompound.length; i++ ) | |
{ | |
if ( ( xyCompound[ i ] & BOTH_ODD_MASK ) == BOTH_ODD_MASK ) | |
{ | |
found++; | |
} | |
} | |
} | |
return found; | |
} | |
public static int parallelSWAR( | |
long[] xyCompound, int threadCount, int iterations ) | |
{ | |
int gTotal = 0; | |
ExecutorService executorService = | |
Executors.newFixedThreadPool( threadCount ); | |
List<Callable<Integer>> tasks = new ArrayList<Callable<Integer>>(); | |
int elementsPer = xyCompound.length / threadCount; | |
for( int i = 0; i < threadCount; i++ ) | |
{ | |
if( i != threadCount -1 ) | |
{ | |
tasks.add( | |
new PartitionOddScan( | |
iterations, | |
xyCompound, | |
i * elementsPer, | |
elementsPer ) ); | |
} | |
else | |
{ | |
tasks.add( | |
new PartitionOddScan( | |
iterations, | |
xyCompound, | |
i * elementsPer + ( xyCompound.length % threadCount ), | |
elementsPer ) ); | |
} | |
} | |
try | |
{ | |
List<Future<Integer>> res = executorService.invokeAll( tasks ); | |
for( int i = 0; i < res.size(); i++ ) | |
{ | |
gTotal += res.get( i ).get(); | |
} | |
} | |
catch( Exception ex ) | |
{ | |
System.out.println("Exception " + ex ); | |
throw new RuntimeException( ex ); | |
} | |
//} | |
executorService.shutdownNow(); | |
return gTotal; | |
} | |
public static class PartitionOddScan | |
implements Callable<Integer> | |
{ | |
public final int iterations; | |
public final long[] compounds; | |
public final int startIndex; | |
public final int count; | |
public PartitionOddScan( | |
int iterations, | |
long[] xyCompound, | |
int startIndex, | |
int count ) | |
{ | |
this.iterations = iterations; | |
this.compounds = xyCompound; | |
this.startIndex = startIndex; | |
this.count = count; | |
} | |
@Override | |
public Integer call() throws Exception | |
{ | |
int found = 0; | |
for( int it = 0; it < iterations; it++ ) | |
{ | |
for( int i = startIndex; i < startIndex + count; i++ ) | |
{ | |
if ( ( compounds[ i ] & BOTH_ODD_MASK ) == BOTH_ODD_MASK ) | |
{ | |
found++; | |
} | |
} | |
} | |
return found; | |
} | |
} | |
/** | |
* Create a Divide and conquer Thread Pool to | |
* @param xyCompound | |
* @param threadCount number of Threads | |
* @param iterations | |
*/ | |
private static void timeMultiThreadedSWARMask( | |
long[] xyCompound, int threadCount, int iterations ) | |
{ | |
long start = System.currentTimeMillis(); | |
int found = parallelSWAR ( xyCompound, threadCount, iterations ); | |
long end = System.currentTimeMillis(); | |
System.out.println( "Parallel SWAR Bit Mask " + (end-start) | |
+ "ms for (" + iterations + "x) scanned " + xyCompound.length | |
+ " found " + found+" ...with "+ threadCount+" threads" ); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Some example test results on a 4 physical core machine
List Lambda 15943ms for (10000x) scanned 100000 found 250050000
List Lambda (Parallel) 10415ms for (10000x) scanned 100000 found 250050000
Compound SWAR Bit Mask 4253ms for (10000x) scanned 100000 found 250050000
Parallel SWAR Bit Mask 2511ms for (10000x) scanned 100000 found 250050000 ...with 4 threads
List Lambda 24649ms for (10000x) scanned 100000 found 249780000
List Lambda (Parallel) 20472ms for (10000x) scanned 100000 found 249780000
Compound SWAR Bit Mask 6234ms for (10000x) scanned 100000 found 249780000
Parallel SWAR Bit Mask 3400ms for (10000x) scanned 100000 found 249780000 ...with 2 threads
List Lambda 4849ms for (10x) scanned 10000000 found 25006830
List Lambda (Parallel) 2300ms for (10x) scanned 10000000 found 25006830
Compound SWAR Bit Mask 1871ms for (10x) scanned 10000000 found 25006830
Parallel SWAR Bit Mask 730ms for (10x) scanned 10000000 found 25006830 ...with 2 threads
List Lambda 5460ms for (10x) scanned 10000000 found 24992440
List Lambda (Parallel) 3473ms for (10x) scanned 10000000 found 24992440
Compound SWAR Bit Mask 1244ms for (10x) scanned 10000000 found 24992440
Parallel SWAR Bit Mask 468ms for (10x) scanned 10000000 found 24992440 ...with 4 threads
List Lambda 4361ms for (10x) scanned 10000000 found 25006790
List Lambda (Parallel) 3979ms for (10x) scanned 10000000 found 25006790
Compound SWAR Bit Mask 1398ms for (10x) scanned 10000000 found 25006790
Parallel SWAR Bit Mask 389ms for (10x) scanned 10000000 found 25006790 ...with 4 threads