Skip to content

Instantly share code, notes, and snippets.

@Beej126
Last active June 2, 2022 18:17
Show Gist options
  • Save Beej126/f7a4c65d0f3525d447c0d7af8d26ae20 to your computer and use it in GitHub Desktop.
Save Beej126/f7a4c65d0f3525d447c0d7af8d26ae20 to your computer and use it in GitHub Desktop.
Split big CSV quickly (multiple gigs < minute) based on column, powershell with embedded C#
# C# syntax works under both pwsh core and legacy Windows Powershell which is capped at C# 5.0 and .Net Framework vs dotnet core
Add-Type @"
using System;
using System.IO;
public static class BigCsvSplitter
{
public static void Run()
{
var lastmac = "";
var fileCount = 0;
var fileLineCount = 0;
var totalLineCount = 0;
using (var fs = new FileStream("$($pwd -replace "\\", "/")/prime.csv", FileMode.Open, FileAccess.Read, FileShare.None, 4096, FileOptions.SequentialScan))
using (var sr = new StreamReader(fs))
{
FileStream outfs = null;
// BufferedStream outbfs = null;
StreamWriter outsw = null;
sr.ReadLine(); //eat the first header line
while (sr.Peek() >= 0)
{
var line = sr.ReadLine();
var mac = (line == null ? null : line.Substring(0, line.IndexOf(",")));
if (mac != lastmac)
{
if (fileCount > 0) Console.WriteLine(" closing, total lines: " + fileLineCount);
lastmac = mac;
if (outsw != null)
{
outsw.Flush();
outsw.Close();
outsw.Dispose();
// outbfs?.Flush();
// outbfs?.Close();
// outbfs?.Dispose();
// outfs?.Flush();
// outfs?.Close();
outfs.Dispose();
}
outfs = new FileStream("$($pwd -replace "\\", "/")/" + mac + ".csv", FileMode.CreateNew, FileAccess.Write);
// outbfs = new BufferedStream(fs, 1024);
outsw = new StreamWriter(outfs);
fileCount++;
fileLineCount = 0;
Console.WriteLine("new file #" + fileCount + ": " + mac);
}
fileLineCount++;
totalLineCount++;
outsw.WriteLine(line);
if (fileLineCount % 1000 == 0) Console.WriteLine(" file #: " + fileCount + ", processed lines: " + fileLineCount);
}
if (outsw != null)
{
outsw.Flush();
outsw.Close();
outsw.Dispose();
// outbfs?.Flush();
// outbfs?.Close();
// outbfs?.Dispose();
// outfs?.Flush();
// outfs?.Close();
outfs.Dispose();
}
}
Console.WriteLine("DONE! " + fileCount + " total files, " + totalLineCount + " total lines.\r\n");
}
}
"@
$stopwatch = [System.Diagnostics.Stopwatch]::StartNew()
[BigCsvSplitter]::Run()
$stopwatch.Elapsed
MAC List gpi NDC MAC Price
MAC2751A1B 01100010102125 00003067371 16.32730
MAC2751A1B 01100010102125 00049052083 16.32730
MAC2751A1B 01100010102125 00209857422 16.32730
MAC2751A1C 01100010102125 54868348000 16.32730
MAC2751A1C 01100010102125 00364290638 16.32730
MAC2751A1C 01100010102125 00781613595 16.32730
MAC2751A1D 01100010102125 00781613594 16.32730
MAC2751A1D 01100010102125 00049052044 16.32730
MAC2751A1D 01100010102125 63323032320 16.32730
MAC2751A1D 01100010102125 00049052022 16.32730
MAC2751A1E 01100010102125 00049052084 16.32730
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment