Skip to content

Instantly share code, notes, and snippets.

@reinoldus
Created February 6, 2025 05:24
Show Gist options
  • Save reinoldus/c7d47feeacfc0a9e4bf0496365d8ac78 to your computer and use it in GitHub Desktop.
Save reinoldus/c7d47feeacfc0a9e4bf0496365d8ac78 to your computer and use it in GitHub Desktop.
import timeit
import unicodedata
import string
import sys
from typing import List
from statistics import mean, stdev
# Original functions (copied from above)
def sample_tokens_CURRENT(inputstring: str, length: int = 64) -> List[str]:
    """Baseline tokenizer: pick a sample of alphanumeric tokens from a text.

    The text is split on whitespace, ASCII punctuation is trimmed from both
    ends of every piece, and only pieces that are purely alphanumeric are
    kept. The minimum token length is then relaxed step by step (longer than
    4 characters down to longer than 0) until at least ``length / 2`` tokens
    qualify.

    Args:
        inputstring: Text to tokenize.
        length: Target size; the search stops once ``length / 2`` tokens
            pass the current length filter.

    Returns:
        The first sample that is large enough, otherwise every kept token.
    """
    trimmed = (piece.strip(string.punctuation) for piece in inputstring.split())
    words = [piece for piece in trimmed if piece.isalnum()]

    threshold = length / 2
    chosen = []
    # Try the strictest length filter first and loosen it one step at a time.
    for min_len in (4, 3, 2, 1, 0):
        chosen = [word for word in words if len(word) > min_len]
        if len(chosen) >= threshold:
            break
    return chosen
def sample_tokens_EASY_FIX(inputstring: str, length: int = 64) -> List[str]:
    """Tokenizer variant that also trims the ideographic full stop.

    Works like the baseline (whitespace split, trim punctuation from both
    ends, keep purely alphanumeric pieces) but the set of trimmed characters
    is ``string.punctuation`` plus "。".

    Args:
        inputstring: Text to tokenize.
        length: Target size; the search stops once ``length / 2`` tokens
            pass the current length filter.

    Returns:
        Tokens longer than the strictest minimum length (starting at more
        than 4 characters, relaxed down to more than 0) that yields at least
        ``length / 2`` tokens; all kept tokens if no filter yields enough.
    """
    kept = []
    for raw in inputstring.split():
        trimmed = raw.strip(string.punctuation + "。")
        if not trimmed.isalnum():
            continue
        kept.append(trimmed)

    # Start with the strictest filter and relax it until enough tokens pass
    # or the filter cannot be loosened any further.
    minimum = 4
    candidates = [word for word in kept if len(word) > minimum]
    while len(candidates) < length / 2 and minimum > 0:
        minimum -= 1
        candidates = [word for word in kept if len(word) > minimum]
    return candidates
def strip_all_punctuation(text: str) -> str:
    """Return *text* with every punctuation character replaced by a space.

    A character counts as punctuation when its Unicode general category
    starts with ``P``. The result has the same length as the input because
    each punctuation character is swapped for exactly one space.
    """
    return ''.join(
        ' ' if unicodedata.category(symbol).startswith('P') else symbol
        for symbol in text
    )
def sample_tokens_unicode_fix(inputstring: str, length: int = 64) -> List[str]:
    """Tokenizer variant that treats all Unicode punctuation as a separator.

    Punctuation is first replaced by spaces via ``strip_all_punctuation``,
    then the text is split on whitespace and only purely alphanumeric pieces
    are kept.

    Args:
        inputstring: Text to tokenize.
        length: Target size; the search stops once ``length / 2`` tokens
            pass the current length filter.

    Returns:
        Tokens longer than the strictest minimum length (starting at more
        than 4 characters, relaxed down to more than 0) that yields at least
        ``length / 2`` tokens; all kept tokens if no filter yields enough.
    """
    # The return annotation uses typing.List like the sibling tokenizers:
    # the builtin ``list[str]`` form is evaluated when the function is
    # defined and raises TypeError on interpreters older than Python 3.9.
    cleaned = strip_all_punctuation(inputstring)
    tokens = [token for token in cleaned.split() if token.isalnum()]

    sample = []
    for i in range(4, -1, -1):
        sample = [t for t in tokens if len(t) > i]
        if len(sample) >= length / 2:
            return sample
    return sample
# Translation table for ``str.translate``: maps every code point whose Unicode
# general category starts with "P" (punctuation) to a space.
# ``sys.maxunicode`` is itself the largest valid code point, so the range has
# to run to ``sys.maxunicode + 1`` to cover the full code space.
PUNCT_TBL = dict.fromkeys(
    (
        codepoint
        for codepoint in range(sys.maxunicode + 1)
        if unicodedata.category(chr(codepoint)).startswith('P')
    ),
    ord(' '),
)
def sample_tokens_translate(inputstring: str, length: int = 64) -> List[str]:
    """Tokenizer variant that cleans the text with one ``str.translate`` pass.

    The input is run through the module-level ``PUNCT_TBL`` translation
    table, split on whitespace, and reduced to purely alphanumeric pieces.

    Args:
        inputstring: Text to tokenize.
        length: Target size; the search stops once ``length / 2`` tokens
            pass the current length filter.

    Returns:
        Tokens longer than the strictest minimum length (starting at more
        than 4 characters, relaxed down to more than 0) that yields at least
        ``length / 2`` tokens; all kept tokens if no filter yields enough.
    """
    pieces = inputstring.translate(PUNCT_TBL).split()
    words = list(filter(str.isalnum, pieces))

    needed = length / 2
    picked = []
    # Loosen the minimum-length requirement until enough tokens qualify.
    for min_len in reversed(range(5)):
        picked = [word for word in words if len(word) > min_len]
        if len(picked) >= needed:
            break
    return picked
# New implementation using isidentifier()
def sample_tokens_identifier(inputstring: str, length: int = 64) -> List[str]:
    """Tokenizer variant that keeps only runs of identifier characters.

    Every character for which ``str.isidentifier()`` is false is treated as
    a token separator. Note that a single character only passes that check
    if it could *start* an identifier, so digits, for example, also act as
    separators here.

    Args:
        inputstring: Text to tokenize.
        length: Target size; the search stops once ``length / 2`` tokens
            pass the current length filter.

    Returns:
        Tokens longer than the strictest minimum length (starting at more
        than 4 characters, relaxed down to more than 0) that yields at least
        ``length / 2`` tokens; all tokens if no filter yields enough.
    """
    # Replace non-identifier characters with a space rather than deleting
    # them: deleting would also remove the whitespace, fusing the whole input
    # into a single token before it is split.
    cleaned = ''.join(c if c.isidentifier() else ' ' for c in inputstring)
    # split() without arguments never yields empty strings, so no extra
    # emptiness check is needed.
    tokens = cleaned.split()

    sample = []
    for i in range(4, -1, -1):
        sample = [t for t in tokens if len(t) > i]
        if len(sample) >= length / 2:
            return sample
    return sample
# Test data
# Benchmark inputs keyed by a short label. Three of the four texts are
# repeated many times so that a single call takes long enough to time.
test_cases = {
    # Several scripts and punctuation styles in one short string
    "mixed_languages": "Hello,World! こんにちは。नमस्ते! ¡Hola!",
    # Short sentence ending in an ideographic full stop, no whitespace
    "mandarin": "行政長官岑浩。" * 100,
    # Long sentence with punctuation between the clauses
    "chinese_text": """会谈后,两国元首共同签署《中华人民共和国和吉尔吉斯共和国关于深化新时代全面战略伙伴关系的联合声明》,见证签署共建"一带一路"合作规划以及外交、经贸、农业等领域多项合作文件。""" * 50,
    # Whitespace-separated ASCII text
    "english": "The quick brown fox jumps over the lazy dog! Multiple times..." * 100,
}
def run_benchmark(func, test_case, num_runs=1000):
    """Time ``func(test_case)`` repeatedly and summarise the durations.

    Each of the ``num_runs`` measurements times exactly one call.

    Args:
        func: Callable under test; it is called with ``test_case`` as its
            only argument.
        test_case: Input passed to ``func`` on every call.
        num_runs: Number of timed calls.

    Returns:
        A dict with the keys ``'mean'``, ``'stdev'``, ``'min'`` and
        ``'max'``, all in milliseconds. ``'stdev'`` is 0.0 when only one
        measurement was taken.

    Raises:
        statistics.StatisticsError: If ``num_runs`` yields no measurements.
    """
    def test_wrapper():
        return func(test_case)

    times = timeit.repeat(test_wrapper, number=1, repeat=num_runs)

    # stdev() needs at least two samples; a single run has no spread to
    # report, so use 0.0 instead of letting the call raise.
    spread = stdev(times) if len(times) > 1 else 0.0

    return {
        'mean': mean(times) * 1000,  # seconds -> milliseconds
        'stdev': spread * 1000,
        'min': min(times) * 1000,
        'max': max(times) * 1000,
    }
# Implementations to compare, keyed by the label shown in the report
functions = {
    'Current': sample_tokens_CURRENT,
    'Easy Fix': sample_tokens_EASY_FIX,
    'Unicode Fix': sample_tokens_unicode_fix,
    'Identifier': sample_tokens_identifier,
    'Translate': sample_tokens_translate,
}

# Report header
print("Benchmark Results (times in milliseconds)")
print("=" * 80)

# One section per input text, each with a table of timings per implementation
for test_name in test_cases:
    test_text = test_cases[test_name]
    print(f"\nTest Case: {test_name}")
    print("-" * 40)

    # The sub-heading is printed first, then every implementation is timed
    print("\nPerformance results:")
    results = {name: run_benchmark(func, test_text) for name, func in functions.items()}

    # Column headings followed by one row per implementation
    print(f"{'Function':<15} {'Mean':>10} {'StdDev':>10} {'Min':>10} {'Max':>10}")
    print("-" * 60)
    for name, stats in results.items():
        print(f"{name:<15} {stats['mean']:>10.3f} {stats['stdev']:>10.3f} {stats['min']:>10.3f} {stats['max']:>10.3f}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment