Last active
November 29, 2024 10:51
-
-
Save dhcgn/c24149b5b7454a8da0565d9b600fcf64 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Reads a tiktoken vocabulary file and prints four rankings of its longest
/// tokens: overall, ASCII-letter-only, Chinese-character-only, and tokens
/// that start with '&lt;' and end with '&gt;'.
/// </summary>
void Main()
{
    var path = @"C:\Users\...\Downloads\o200k_base.tiktoken";

    // Lines that cannot be parsed come back as null and are dropped here.
    var tokens = File.ReadAllLines(path)
        .Select(line => ParseTokenLine(line))
        .Where(token => token != null)
        .ToList();

    // The character class spells out both cases instead of relying on
    // RegexOptions.IgnoreCase: case-insensitive matching goes through
    // culture/Unicode case mapping, which can let non-ASCII characters
    // (e.g. the Kelvin sign U+212A) pass as "Latin" and can behave
    // differently depending on the current culture.
    var latinRegex = new Regex(@"^[A-Za-z]+$");
    var chineseRegex = new Regex(@"^[\u4e00-\u9fa5]+$");
    var bracketRegex = new Regex(@"^<.*>$");

    // Display the results
    DisplayResults(
        $"The {TopCount} longest tokens overall:",
        TopLongest(tokens, token => true));
    DisplayResults(
        $"The {TopCount} longest tokens with Latin alphabet (case insensitive):",
        TopLongest(tokens, token => latinRegex.IsMatch(token.Content)));
    DisplayResults(
        $"The {TopCount} longest tokens with Chinese characters:",
        TopLongest(tokens, token => chineseRegex.IsMatch(token.Content)));
    DisplayResults(
        $"The {TopCount} longest tokens starting with < and ending with >:",
        TopLongest(tokens, token => bracketRegex.IsMatch(token.Content)));
}

// Number of entries shown in each ranking.
const int TopCount = 25;

/// <summary>
/// Parses one vocabulary line of the form "&lt;base64&gt; &lt;id&gt;" into a Token.
/// </summary>
/// <param name="line">A single line from the vocabulary file.</param>
/// <returns>
/// The parsed token, or null when the line does not consist of exactly two
/// space-separated fields or its first field is not valid Base64.
/// </returns>
static Token ParseTokenLine(string line)
{
    var parts = line.Split(' ');
    if (parts.Length != 2)
    {
        return null;
    }

    byte[] rawBytes;
    try
    {
        rawBytes = Convert.FromBase64String(parts[0]);
    }
    catch (FormatException)
    {
        // Malformed Base64 is the only failure we expect and skip; anything
        // else should surface instead of being silently swallowed.
        return null;
    }

    // Encoding.UTF8 substitutes U+FFFD for byte sequences that are not valid
    // UTF-8 instead of throwing, so such tokens are kept with replacement
    // characters counted in their length.
    var decodedContent = Encoding.UTF8.GetString(rawBytes);

    // Remove non-printable (control) characters before measuring the length.
    var cleanedContent = new string(decodedContent.Where(c => !char.IsControl(c)).ToArray());

    return new Token
    {
        Content = cleanedContent,
        TokenId = parts[1],
        Length = cleanedContent.Length
    };
}

/// <summary>
/// Returns up to TopCount tokens accepted by the filter, longest first.
/// </summary>
/// <param name="tokens">Candidate tokens.</param>
/// <param name="filter">Predicate a token must satisfy to be ranked.</param>
/// <returns>A new list ordered by descending Length; ties keep input order.</returns>
static List<Token> TopLongest(IEnumerable<Token> tokens, Func<Token, bool> filter)
{
    return tokens
        .Where(filter)
        .OrderByDescending(token => token.Length)
        .Take(TopCount)
        .ToList();
}
/// <summary>
/// Writes a titled list of tokens to the console, one token per line,
/// followed by an empty line.
/// </summary>
/// <param name="title">Heading printed before the token lines.</param>
/// <param name="tokens">Tokens to print, in the order given.</param>
void DisplayResults(string title, List<Token> tokens)
{
    Console.WriteLine(title);

    for (var index = 0; index < tokens.Count; index++)
    {
        var entry = tokens[index];
        var line = $"{entry.Length} chars - Token ID: {entry.TokenId}, Content: '{entry.Content}'";
        Console.WriteLine(line);
    }

    // Blank separator between consecutive result sections.
    Console.WriteLine();
}
/// <summary>
/// One vocabulary entry: its text, its identifier, and the length of the text.
/// </summary>
class Token
{
    /// <summary>Text of the token.</summary>
    public string Content { get; set; }

    /// <summary>Identifier of the token, kept as a string.</summary>
    public string TokenId { get; set; }

    /// <summary>Length value used for ranking; set by whoever creates the token.</summary>
    public int Length { get; set; }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment