Skip to content

Instantly share code, notes, and snippets.

@airbreather
Last active August 29, 2015 14:07
Show Gist options
  • Save airbreather/71208fcfbcdc07ca33fd to your computer and use it in GitHub Desktop.
Save airbreather/71208fcfbcdc07ca33fd to your computer and use it in GitHub Desktop.
using System;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Runtime;
using System.Threading.Tasks;
using GeoAPI.Geometries;
using NetTopologySuite.Geometries;
using NetTopologySuite.IO;
using NetTopologySuite.Operation.Union;
namespace ConsoleApplication1
{
internal static class Program
{
private static void Main()
{
// Set this to the directory where you've extracted all the TIGER shapefiles.
// I grabbed all the files from ftp://ftp2.census.gov/geo/tiger/TIGER2014/TRACT/
// and extracted them to this place.
const string TigerBaseDirectoryPath = @"C:\Users\Joe Amenta\Downloads\TIGER\";
// Set this to a writable location where we will write out the WKT of the result.
const string OutputWktPath = @"C:\Freedom\j.txt";
if (!GCSettings.IsServerGC)
{
// I've noticed MASSIVE differences in CPU utilization here.
throw new Exception(@"You should probably set this in your .exe.config file:
<configuration>
<runtime>
<gcServer enabled=""true"" />
</runtime>
</configuration>");
}
////int[] someSubset = { 01, 13, 45, 37, 47, 28, 21, 51, 54, 39, 18 };
var fac = new GeometryFactory(new MiniCoordinateSequenceFactory());
var allGeoms = Directory.EnumerateFiles(TigerBaseDirectoryPath, "*.shp", SearchOption.AllDirectories)
////.Where(shpName => someSubset.Any(id => shpName.Contains("tl_2014_" + id + "_tract")))
.AsParallel()
.SelectMany(shp => new ShapefileReader(shp, fac).Cast<IGeometry>())
.ToArray();
Console.WriteLine("started union");
Stopwatch sw = Stopwatch.StartNew();
IGeometry result = UnaryUnionOp.ParallelUnion(allGeoms, new ParallelOptions());
sw.Stop();
Console.WriteLine("{0:n0} seconds", sw.ElapsedTicks / (double)Stopwatch.Frequency);
File.WriteAllText(OutputWktPath, result.ToString());
Console.ReadLine();
}
private sealed class MiniCoordinateSequenceFactory : ICoordinateSequenceFactory
{
public ICoordinateSequence Create(Coordinate[] coordinates)
{
return new MiniCoordinateSequence(coordinates);
}
public ICoordinateSequence Create(ICoordinateSequence coordSeq)
{
MiniCoordinateSequence s = coordSeq as MiniCoordinateSequence;
if (s != null)
{
return s.Clone();
}
return new MiniCoordinateSequence(coordSeq.ToCoordinateArray());
}
public ICoordinateSequence Create(int size, int dimension)
{
if (dimension != 2)
{
throw new NotSupportedException("Unsupported dimension: " + dimension);
}
XYPair[] coords = new XYPair[size];
return new MiniCoordinateSequence(coords);
}
public ICoordinateSequence Create(int size, Ordinates ordinates)
{
if (ordinates != Ordinates.XY)
{
throw new NotSupportedException("Unsupported ordinates: " + ordinates);
}
XYPair[] coords = new XYPair[size];
return new MiniCoordinateSequence(coords);
}
public Ordinates Ordinates
{
get { return Ordinates.XY; }
}
}
private sealed class MiniCoordinateSequence : ICoordinateSequence
{
private readonly XYPair[] impl;
private readonly WeakReference<Coordinate[]> weakRef = new WeakReference<Coordinate[]>(null);
internal MiniCoordinateSequence(Coordinate[] coords)
{
this.impl = new XYPair[coords.Length];
Coordinate[] copyArray = new Coordinate[coords.Length];
for (int i = 0; i < coords.Length; i++)
{
Coordinate copy = new Coordinate(coords[i]);
copyArray[i] = copy;
this.impl[i].X = copy.X;
this.impl[i].Y = copy.Y;
}
weakRef.SetTarget(copyArray);
}
internal MiniCoordinateSequence(XYPair[] coords)
{
this.impl = coords;
}
public int Dimension
{
get { return 2; }
}
public Ordinates Ordinates
{
get { return Ordinates.XY; }
}
public int Count
{
get { return this.impl.Length; }
}
object ICloneable.Clone()
{
return this.Clone();
}
public MiniCoordinateSequence Clone()
{
XYPair[] copy = new XYPair[this.Count];
Array.Copy(this.impl, 0, copy, 0, copy.Length);
return new MiniCoordinateSequence(copy);
}
public Coordinate GetCoordinate(int i)
{
return this.GetCoordinateArray()[i];
}
public Coordinate GetCoordinateCopy(int i)
{
return new Coordinate(this.GetCoordinate(i));
}
public void GetCoordinate(int index, Coordinate coord)
{
XYPair pair = this.impl[index];
coord.X = pair.X;
coord.Y = pair.Y;
}
public double GetX(int index)
{
return this.impl[index].X;
}
public double GetY(int index)
{
return this.impl[index].Y;
}
public double GetOrdinate(int index, Ordinate ordinate)
{
switch (ordinate)
{
case Ordinate.X:
return this.GetX(index);
case Ordinate.Y:
return this.GetY(index);
default:
throw new NotSupportedException("Unsupported ordinate: " + ordinate);
}
}
public void SetOrdinate(int index, Ordinate ordinate, double value)
{
switch (ordinate)
{
case Ordinate.X:
this.impl[index].X = value;
break;
case Ordinate.Y:
this.impl[index].Y = value;
break;
default:
throw new NotSupportedException("Unsupported ordinate: " + ordinate);
}
}
public Coordinate[] ToCoordinateArray()
{
return this.GetCoordinateArray();
}
public Envelope ExpandEnvelope(Envelope env)
{
foreach (XYPair pair in this.impl)
{
env.ExpandToInclude(pair.X, pair.Y);
}
return env;
}
public ICoordinateSequence Reversed()
{
XYPair[] copy = new XYPair[this.Count];
Array.Copy(this.impl, 0, copy, 0, copy.Length);
Array.Reverse(copy);
return new MiniCoordinateSequence(copy);
}
private Coordinate[] GetCoordinateArray()
{
Coordinate[] target;
if (!this.weakRef.TryGetTarget(out target))
{
target = new Coordinate[this.Count];
for (int i = 0; i < this.Count; i++)
{
XYPair pair = this.impl[i];
target[i] = new Coordinate(pair.X, pair.Y);
}
this.weakRef.SetTarget(target);
}
return target;
}
}
private struct XYPair
{
internal double X;
internal double Y;
}
}
}
@airbreather
Copy link
Author

Unsurprisingly, this commit (+ this one) resulted in increased CPU time, but decreased elapsed time, when unioning the 48 contiguous US states + DC. What is a little surprising is how small of a difference it was.
Before: 6,254 elapsed / 4:11:11 CPU
After: 6,156 elapsed / 4:14:50 CPU

It's important to note that blindly making a change to the partitioner like that is almost certain to increase both CPU time and elapsed time. In theory, the workload has to look almost exactly like this in order to avoid increasing elapsed time: very significant punishment (relative to communication / synchronization overhead) for lumping tasks together more eagerly than the theoretically optimal partitioning. So unless I'm severely mistaken or my assumptions* are too faulty, the fact that I noticed any amount of decrease in elapsed time at all is evidence that strongly supports the hypothesis there are some workloads that can benefit significantly from this, even if it looks like a dinky improvement here.

*assumptions: I'm assuming the OS didn't do anything that gave "Before" too big of a disadvantage when compared like this.

@airbreather
Copy link
Author

...but when I fix what appears to be a derp in de89f7c6 that broke some of the wonderful optimizations in CascadedPolygonUnion, I can union the 48 contiguous US states + DC in 3,525 seconds elapsed / 3:11:22 CPU.

The fix is here.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment