Created
October 25, 2012 04:28
-
-
Save juanplopes/3950410 to your computer and use it in GitHub Desktop.
Naïve external sort in C#
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.Linq; | |
using System.Text; | |
using System.IO; | |
namespace DirtyMergeSort | |
{ | |
public static class Program | |
{ | |
public static void Main(string[] args) | |
{ | |
File.WriteAllLines(@"d:\sorted.txt", | |
File.ReadLines(@"d:\data.txt") | |
.Select(Progress("read")) | |
.Sorted(1000000) | |
.Select(Progress("write"))); | |
} | |
private static Func<string, int, string> Progress(string type) | |
{ | |
return (x, i) => | |
{ | |
if (i % 100000 == 0) Console.WriteLine("{0}: {1}", type, i); | |
return x; | |
}; | |
} | |
public static IEnumerable<string> Sorted(this IEnumerable<string> lines, int size) | |
{ | |
var files = lines.Partition(size) | |
.Select(x=>new {File = x, Lines = File.ReadLines(x).GetEnumerator()}) | |
.Where(x=>x.Lines.MoveNext()) | |
.ToList(); | |
while (files.Any()) | |
{ | |
var current = files.Min(x => x.Lines.Current); | |
yield return current; | |
files = files.Where(x => | |
{ | |
while (x.Lines.Current == current) | |
{ | |
if (!x.Lines.MoveNext()) | |
{ | |
File.Delete(x.File); | |
Console.WriteLine("Deleted file {0}", x.File); | |
return false; | |
} | |
} | |
return true; | |
}).ToList(); | |
} | |
} | |
public static IEnumerable<string> Partition(this IEnumerable<string> lines, int size) | |
{ | |
return lines.Batch(size) | |
.AsParallel() | |
.WithMergeOptions(ParallelMergeOptions.NotBuffered) | |
.Select(batch => | |
{ | |
var temp = Path.GetTempFileName(); | |
File.WriteAllLines(temp, batch.OrderBy(x=>x)); | |
Console.WriteLine("Wrote file {0}", temp); | |
return temp; | |
}); | |
} | |
public static IEnumerable<List<T>> Batch<T>(this IEnumerable<T> collection, int size) | |
{ | |
var batch = new List<T>(size); | |
foreach (T item in collection) | |
{ | |
batch.Add(item); | |
if (batch.Count == size) | |
{ | |
yield return batch; | |
batch = new List<T>(size); | |
} | |
} | |
if (batch.Any()) | |
yield return batch; | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment