Last active
June 2, 2022 18:17
-
-
Save Beej126/f7a4c65d0f3525d447c0d7af8d26ae20 to your computer and use it in GitHub Desktop.
Split big CSV quickly (multiple gigs < minute) based on column, powershell with embedded C#
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# C# syntax works under both pwsh core and legacy Windows Powershell which is capped at C# 5.0 and .Net Framework vs dotnet core | |
Add-Type @" | |
using System; | |
using System.IO; | |
public static class BigCsvSplitter | |
{ | |
public static void Run() | |
{ | |
var lastmac = ""; | |
var fileCount = 0; | |
var fileLineCount = 0; | |
var totalLineCount = 0; | |
using (var fs = new FileStream("$($pwd -replace "\\", "/")/prime.csv", FileMode.Open, FileAccess.Read, FileShare.None, 4096, FileOptions.SequentialScan)) | |
using (var sr = new StreamReader(fs)) | |
{ | |
FileStream outfs = null; | |
// BufferedStream outbfs = null; | |
StreamWriter outsw = null; | |
sr.ReadLine(); //eat the first header line | |
while (sr.Peek() >= 0) | |
{ | |
var line = sr.ReadLine(); | |
var mac = (line == null ? null : line.Substring(0, line.IndexOf(","))); | |
if (mac != lastmac) | |
{ | |
if (fileCount > 0) Console.WriteLine(" closing, total lines: " + fileLineCount); | |
lastmac = mac; | |
if (outsw != null) | |
{ | |
outsw.Flush(); | |
outsw.Close(); | |
outsw.Dispose(); | |
// outbfs?.Flush(); | |
// outbfs?.Close(); | |
// outbfs?.Dispose(); | |
// outfs?.Flush(); | |
// outfs?.Close(); | |
outfs.Dispose(); | |
} | |
outfs = new FileStream("$($pwd -replace "\\", "/")/" + mac + ".csv", FileMode.CreateNew, FileAccess.Write); | |
// outbfs = new BufferedStream(fs, 1024); | |
outsw = new StreamWriter(outfs); | |
fileCount++; | |
fileLineCount = 0; | |
Console.WriteLine("new file #" + fileCount + ": " + mac); | |
} | |
fileLineCount++; | |
totalLineCount++; | |
outsw.WriteLine(line); | |
if (fileLineCount % 1000 == 0) Console.WriteLine(" file #: " + fileCount + ", processed lines: " + fileLineCount); | |
} | |
if (outsw != null) | |
{ | |
outsw.Flush(); | |
outsw.Close(); | |
outsw.Dispose(); | |
// outbfs?.Flush(); | |
// outbfs?.Close(); | |
// outbfs?.Dispose(); | |
// outfs?.Flush(); | |
// outfs?.Close(); | |
outfs.Dispose(); | |
} | |
} | |
Console.WriteLine("DONE! " + fileCount + " total files, " + totalLineCount + " total lines.\r\n"); | |
} | |
} | |
"@ | |
$stopwatch = [System.Diagnostics.Stopwatch]::StartNew() | |
[BigCsvSplitter]::Run() | |
$stopwatch.Elapsed |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
MAC List | gpi | NDC | MAC Price | |
---|---|---|---|---|
MAC2751A1B | 01100010102125 | 00003067371 | 16.32730 | |
MAC2751A1B | 01100010102125 | 00049052083 | 16.32730 | |
MAC2751A1B | 01100010102125 | 00209857422 | 16.32730 | |
MAC2751A1C | 01100010102125 | 54868348000 | 16.32730 | |
MAC2751A1C | 01100010102125 | 00364290638 | 16.32730 | |
MAC2751A1C | 01100010102125 | 00781613595 | 16.32730 | |
MAC2751A1D | 01100010102125 | 00781613594 | 16.32730 | |
MAC2751A1D | 01100010102125 | 00049052044 | 16.32730 | |
MAC2751A1D | 01100010102125 | 63323032320 | 16.32730 | |
MAC2751A1D | 01100010102125 | 00049052022 | 16.32730 | |
MAC2751A1E | 01100010102125 | 00049052084 | 16.32730 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment