Skip to content

Instantly share code, notes, and snippets.

@dhcgn
Last active November 29, 2024 10:51
Show Gist options
  • Select an option

  • Save dhcgn/c24149b5b7454a8da0565d9b600fcf64 to your computer and use it in GitHub Desktop.

Select an option

Save dhcgn/c24149b5b7454a8da0565d9b600fcf64 to your computer and use it in GitHub Desktop.
// Reads a tiktoken vocabulary file (one "<base64 token bytes> <token id>" pair per line)
// and prints four rankings of the longest tokens: overall, ASCII letters only,
// CJK ideographs only, and tokens wrapped in angle brackets.
void Main()
{
    // Number of entries shown in each ranking; also interpolated into the headings.
    const int TopCount = 25;

    var path = @"C:\Users\...\Downloads\o200k_base.tiktoken";
    var lines = File.ReadAllLines(path);

    // Lines that cannot be parsed come back as null and are dropped here.
    var tokens = lines
        .Select(line => ParseTokenLine(line))
        .Where(token => token != null)
        .ToList();

    // Explicit upper/lower classes instead of [a-z] + RegexOptions.IgnoreCase:
    // case-insensitive matching in .NET is culture-aware and can also accept
    // non-ASCII case equivalents (e.g. U+212A KELVIN SIGN for 'k').
    var latinRegex = new Regex(@"^[A-Za-z]+$");

    // Whole CJK Unified Ideographs block (U+4E00..U+9FFF). The previous upper
    // bound U+9FA5 left out the ideographs added at the end of the block.
    var chineseRegex = new Regex(@"^[\u4e00-\u9fff]+$");

    // Anything that starts with '<' and ends with '>'.
    var bracketRegex = new Regex(@"^<.*>$");

    var longestTokens = TopLongest(tokens, TopCount);
    var longestLatinTokens = TopLongest(tokens.Where(token => latinRegex.IsMatch(token.Content)), TopCount);
    var longestChineseTokens = TopLongest(tokens.Where(token => chineseRegex.IsMatch(token.Content)), TopCount);
    var longestBracketTokens = TopLongest(tokens.Where(token => bracketRegex.IsMatch(token.Content)), TopCount);

    // Display the results
    DisplayResults($"The {TopCount} longest tokens overall:", longestTokens);
    DisplayResults($"The {TopCount} longest tokens with Latin alphabet (case insensitive):", longestLatinTokens);
    DisplayResults($"The {TopCount} longest tokens with Chinese characters:", longestChineseTokens);
    DisplayResults($"The {TopCount} longest tokens starting with < and ending with >:", longestBracketTokens);
}

// Parses one vocabulary line into a Token. Returns null when the line is not
// exactly two space-separated fields or when the first field is not valid Base64.
Token ParseTokenLine(string line)
{
    var parts = line.Split(' ');
    if (parts.Length != 2)
    {
        return null;
    }

    byte[] rawBytes;
    try
    {
        rawBytes = Convert.FromBase64String(parts[0]);
    }
    catch (FormatException)
    {
        // Malformed Base64: skip this line instead of aborting the whole run.
        // Any other exception is unexpected and is allowed to propagate.
        return null;
    }

    // NOTE: byte sequences that are not valid UTF-8 are decoded to U+FFFD
    // replacement characters (Encoding.UTF8 does not throw on invalid bytes),
    // so such tokens are kept and their replacement characters count toward Length.
    var decoded = Encoding.UTF8.GetString(rawBytes);

    // Remove non-printable (control) characters before measuring and printing.
    var cleaned = new string(decoded.Where(c => !char.IsControl(c)).ToArray());

    return new Token { Content = cleaned, TokenId = parts[1], Length = cleaned.Length };
}

// Returns up to `count` tokens from `candidates`, longest first. The sort is
// stable, so tokens of equal length keep their original file order.
List<Token> TopLongest(IEnumerable<Token> candidates, int count)
{
    return candidates
        .OrderByDescending(token => token.Length)
        .Take(count)
        .ToList();
}
// Writes one report section to the console: the heading, then one line per
// token (length, id, content), then an empty line as a separator.
void DisplayResults(string title, List<Token> tokens)
{
    Console.WriteLine(title);

    for (var index = 0; index < tokens.Count; index++)
    {
        var entry = tokens[index];
        Console.WriteLine($"{entry.Length} chars - Token ID: {entry.TokenId}, Content: '{entry.Content}'");
    }

    Console.WriteLine();
}
// A single vocabulary entry: plain mutable data holder with no behavior.
class Token
{
    // Text of the token as it should be displayed.
    public string Content { get; set; }

    // Identifier of the token, stored as text rather than as a number.
    public string TokenId { get; set; }

    // Length assigned by whoever creates the instance; it is stored as given
    // and is not derived from Content by this class.
    public int Length { get; set; }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment