Created
August 22, 2022 02:55
-
-
Save altbodhi/690dfc2c8e70a579c15fe04f72b83bf5 to your computer and use it in GitHub Desktop.
Very simple code for sorting big text file in csharp
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
2_000_000 => 16 sec | |
5_ => 32 | |
10_ => 1:10 | |
20_ => 2:58 | |
40_ => 13:44 file size = 2.5 Gb | |
*/ | |
using System.IO; | |
using System.Diagnostics; | |
using static System.Console; | |
int count = args.Length > 0 && int.TryParse(args[0], out var val) ? val : 2_000_000; | |
var sw = Stopwatch.StartNew(); | |
var alf = get_alf(); | |
if (Directory.Exists(Constants.TMP)) | |
Directory.Delete(Constants.TMP, true); | |
Directory.CreateDirectory(Constants.TMP); | |
var comp = new SimpleComparer(); | |
do_it(count); | |
sw.Stop(); | |
WriteLine(sw.Elapsed); | |
char[] get_alf() | |
{ | |
var list = new List<char>(); | |
for (char x = 'A'; x <= 'Z'; x++) | |
list.Add(x); | |
for (char x = 'a'; x <= 'z'; x++) | |
list.Add(x); | |
list.Add(' '); | |
return list.ToArray(); | |
} | |
string gen_str() | |
{ | |
var range = Enumerable.Range(0, Random.Shared.Next(20, 100)); | |
var chars = range.Select(x => alf[Random.Shared.Next(0, alf.Length)]).ToArray(); | |
return new string(chars); | |
} | |
int gen_num() => Random.Shared.Next(0, 10_000); | |
void do_it(int count) | |
{ | |
var source = $"L{count}.txt"; | |
File.WriteAllLines(source, Enumerable.Range(0, count).Select(x => $"{gen_num()}.{gen_str()}")); | |
int num = 0; | |
foreach (var chunk in File.ReadAllLines(source).Chunk(200_000)) | |
File.WriteAllLines(Path.Combine($".\\{Constants.TMP}\\", $"{(++num):D5}.txt"), chunk.OrderBy(x => x, comp)); | |
SortFiles($"S{count}.txt"); | |
} | |
void SortFiles(string outputFile) | |
{ | |
var readers = Directory.GetFiles(Path.Combine($".\\{Constants.TMP}\\")).Select(x => new StreamReader(x)); | |
try | |
{ | |
var lines = readers.Select(x => new TxtReader | |
{ | |
Line = x.ReadLine(), | |
Reader = x | |
}).OrderBy(x => x.Line, comparer: comp).ToList(); | |
using var writer = new StreamWriter(outputFile); | |
Reorder(); | |
void Reorder() | |
{ | |
if (lines.Count == 1) | |
return; | |
int i = 0; | |
while (comp.Compare(lines[i].Line, lines[i + 1].Line) > 0) | |
{ | |
var t = lines[i]; | |
lines[i] = lines[i + 1]; | |
lines[i + 1] = t; | |
i++; | |
if (i + 1 == lines.Count) | |
return; | |
} | |
} | |
while (lines.Count > 0) | |
{ | |
var current = lines[0]; | |
writer.WriteLine(current.Line); | |
if (current.Reader.EndOfStream) | |
{ | |
lines.Remove(current); | |
WriteLine($"{DateTime.Now:s} : {lines.Count}"); | |
continue; | |
} | |
current.Line = current.Reader.ReadLine(); | |
Reorder(); | |
} | |
} | |
finally | |
{ | |
foreach (var r in readers) | |
r.Dispose(); | |
} | |
} | |
public class TxtReader | |
{ | |
public string Line; | |
public StreamReader Reader; | |
} | |
class SimpleComparer : IComparer<string> | |
{ | |
public int Compare(string s1, string s2) | |
{ | |
var p1 = s1.IndexOf('.'); | |
var v1 = s1.AsSpan(p1 + 2); | |
var p2 = s2.IndexOf('.'); | |
var v2 = s2.AsSpan(p2 + 2); | |
var vc = | |
v1.CompareTo(v2, StringComparison.Ordinal); | |
if (vc != 0) | |
return vc; | |
else | |
{ | |
var n1 = Int32.Parse(s1.AsSpan(0, p1)); | |
var n2 = Int32.Parse(s2.AsSpan(0, p2)); | |
return n1.CompareTo(n2); | |
} | |
} | |
} | |
public static class Constants | |
{ | |
public const string TMP = "tmp"; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment