Last active
April 26, 2023 01:57
-
-
Save kstrauss/bac26d6458603b634479cc990c12fb97 to your computer and use it in GitHub Desktop.
Simple samples of functions to depersonalize data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Script entry point: loads the name lists, dumps a few random integers, then
// dumps depersonalized replacements for the last 10 test names.
// NOTE(review): `async void` is kept so the entry-point signature is unchanged;
// if the host supports `async Task Main()`, prefer it so failures are observable.
async void Main()
{
    // Start all three reads up front so they can overlap.
    // baby first names https://github.com/hadley/data-baby-names
    var fnamesTask = ReadFileAsync(@"C:\temp\depersonal\bfnames.csv");
    // surnames https://github.com/fivethirtyeight/data/blob/master/most-common-name/surnames.csv
    var lnamesTask = ReadFileAsync(@"C:\temp\depersonal\surnamesClean.csv");
    var testNamesTask = ReadFileAsync(@"c:\temp\depersonal\nbaPlayers.csv");

    // Quick sanity output: a handful of random integers.
    const int Max = 3;
    var result = new int[Max];
    for (int i = 0; i < Max; i++)
    {
        result[i] = Random.Shared.Next();
    }
    result.Dump();

    // Await each list exactly once. The projection below is then synchronous,
    // so Dump() receives finished records rather than a sequence of tasks.
    var testNames = await testNamesTask;
    var fnames = await fnamesTask;
    var lnames = await lnamesTask;

    testNames.TakeLast(10)
        .Select(re =>
        {
            var (first, last) = GenerateName(re, fnames, lnames);
            return new { Original = re, DeFirst = first, DeLast = last };
        })
        .ToList()
        .Dump();
}
// You can define other methods, fields, classes and namespaces here | |
/// <summary>
/// Picks a replacement first name and last name from the supplied lists.
/// </summary>
/// <param name="original">The value being replaced. Only used to seed the generator when <paramref name="deterministic"/> is true.</param>
/// <param name="fnames">Candidate first names; must contain at least one entry.</param>
/// <param name="lnames">Candidate last names; must contain at least one entry.</param>
/// <param name="deterministic">
/// When true, the generator is seeded from the first four bytes of the MD5 hash of
/// <paramref name="original"/>; when false, <see cref="Random.Shared"/> is used.
/// </param>
/// <returns>A (first name, last name) pair drawn from the two lists.</returns>
/// <exception cref="ArgumentNullException">A name list is null.</exception>
/// <exception cref="ArgumentException">A name list is empty.</exception>
public static (string,string) GenerateName(string original, List<string> fnames, List<string> lnames, bool deterministic = false)
{
    ArgumentNullException.ThrowIfNull(fnames);
    ArgumentNullException.ThrowIfNull(lnames);
    if (fnames.Count == 0)
    {
        throw new ArgumentException("At least one first name is required.", nameof(fnames));
    }
    if (lnames.Count == 0)
    {
        throw new ArgumentException("At least one last name is required.", nameof(lnames));
    }

    // Seed from the hash of the original so the same seed is derived for the same input.
    Random random = deterministic
        ? new Random(BitConverter.ToInt32(CalculateMD5Hash(original), 0))
        : Random.Shared;

    // Draw order (first name, then last name) matters for the seeded case.
    string firstName = fnames[random.Next(0, fnames.Count)];
    string lastName = lnames[random.Next(0, lnames.Count)];
    return (firstName, lastName);
}
/// <summary>
/// Reads a text file line by line and returns the lines in file order.
/// </summary>
/// <param name="fname">Path of the file to read.</param>
/// <returns>One list entry per line; an empty list for an empty file.</returns>
async Task<List<string>> ReadFileAsync(string fname)
{
    var records = new List<string>();
    using (var reader = new StreamReader(fname))
    {
        // ReadLineAsync returns null at end of stream. Looping on that avoids
        // StreamReader.EndOfStream, which can block synchronously in async code.
        string line;
        while ((line = await reader.ReadLineAsync()) != null)
        {
            records.Add(line);
        }
    }
    return records;
}
/// <summary>
/// Computes the MD5 digest of the UTF-8 encoding of <paramref name="input"/>.
/// </summary>
/// <param name="input">Text to hash.</param>
/// <returns>The 16-byte MD5 hash.</returns>
public static byte[] CalculateMD5Hash(string input)
{
    byte[] utf8 = Encoding.UTF8.GetBytes(input);
    // One-shot static hash: no MD5 instance to create or dispose.
    return MD5.HashData(utf8);
}
/// <summary>
/// Reads every record of a CSV file into a list of <typeparamref name="T"/>.
/// </summary>
/// <param name="fname">Path of the CSV file.</param>
/// <returns>All records returned by the CSV reader, in a new list.</returns>
List<T> ReadCSV<T>(string fname)
{
    using var fileReader = new StreamReader(fname);
    using var csvReader = new CsvReader(fileReader, CultureInfo.InvariantCulture);

    // Copy into a list before the readers are disposed at the end of the method.
    return new List<T>(csvReader.GetRecords<T>());
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment