Last active
February 8, 2025 05:42
-
-
Save reinoldus/e950fb56cd937c130cb0bb36f9df5fcc to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import timeit | |
import unicodedata | |
import string | |
import sys | |
from typing import List | |
from statistics import mean, stdev, median | |
# Original functions (copied from above) | |
def sample_tokens_CURRENT(inputstring: str, length: int = 64) -> List[str]:
    """Return the alphanumeric tokens of *inputstring*, favouring long ones.

    The input is split on whitespace, ASCII punctuation
    (``string.punctuation``) is stripped from both ends of every piece, and
    only pieces for which ``str.isalnum()`` holds are kept.

    The kept tokens are then filtered by a minimum length that is relaxed
    step by step from 5 characters down to 1. The first filtered list holding
    at least ``length / 2`` tokens is returned; if no step reaches that size,
    the result of the last step (minimum length 1) is returned.
    """
    stripped = (word.strip(string.punctuation) for word in inputstring.split())
    candidates = [word for word in stripped if word.isalnum()]

    required = length / 2
    selected: List[str] = []
    for min_len in (5, 4, 3, 2, 1):
        selected = [word for word in candidates if len(word) >= min_len]
        if len(selected) >= required:
            break
    return selected
def sample_tokens_EASY_FIX(inputstring: str, length: int = 64) -> List[str]:
    """Tokenize like the baseline, additionally stripping the ideographic full stop.

    Each whitespace-separated piece has ASCII punctuation and "。" removed
    from both ends; pieces that are not purely alphanumeric afterwards are
    dropped.

    The minimum token length is then lowered from 5 to 1 until at least
    ``length / 2`` tokens qualify. That list is returned, or the list for
    minimum length 1 when the target is never reached.
    """
    strip_chars = string.punctuation + "。"
    words = []
    for piece in inputstring.split():
        core = piece.strip(strip_chars)
        if not core.isalnum():
            continue
        words.append(core)

    half = length / 2
    chosen: List[str] = []
    shortest_allowed = 5
    while shortest_allowed >= 1:
        chosen = [w for w in words if len(w) >= shortest_allowed]
        if len(chosen) >= half:
            return chosen
        shortest_allowed -= 1
    return chosen
def strip_all_punctuation(text: str) -> str:
    """Return *text* with every Unicode punctuation character replaced by a space.

    A character counts as punctuation when its ``unicodedata`` general
    category starts with ``'P'``. The replacement is one-for-one, so the
    result has the same length as the input.
    """
    return ''.join(
        ' ' if unicodedata.category(symbol).startswith('P') else symbol
        for symbol in text
    )
def sample_tokens_unicode_fix(inputstring: str, length: int = 64) -> list[str]:
    """Tokenize after blanking out punctuation with ``strip_all_punctuation``.

    The cleaned text is split on whitespace and only pieces that satisfy
    ``str.isalnum()`` are kept.

    The minimum token length is then relaxed from 5 down to 1; the first
    selection containing at least ``length / 2`` tokens is returned, and the
    selection for minimum length 1 is the fallback.
    """
    cleaned = strip_all_punctuation(inputstring)
    words = [piece for piece in cleaned.split() if piece.isalnum()]

    needed = length / 2
    picked = []
    for min_len in (5, 4, 3, 2, 1):
        picked = [word for word in words if len(word) >= min_len]
        if len(picked) >= needed:
            break
    return picked
# Table for ``str.translate``: maps each code point whose general category
# starts with 'P' (punctuation) to the ordinal of a space.
# NOTE(review): range(sys.maxunicode) stops one short of sys.maxunicode
# itself; presumably harmless because that last code point is not
# punctuation -- confirm before reusing this table elsewhere.
PUNCT_TBL = {
    codepoint: ord(' ')
    for codepoint in range(sys.maxunicode)
    if unicodedata.category(chr(codepoint)).startswith('P')
}
def sample_tokens_translate(inputstring: str, length: int = 64) -> List[str]:
    """Tokenize after translating the input through ``PUNCT_TBL``.

    ``str.translate`` applies the module-level table to the input in a single
    call; the translated text is then split on whitespace and only pieces
    that satisfy ``str.isalnum()`` are kept.

    The minimum token length is relaxed from 5 down to 1; the first selection
    containing at least ``length / 2`` tokens is returned, and the selection
    for minimum length 1 is the fallback.
    """
    spaced = inputstring.translate(PUNCT_TBL)
    words = [piece for piece in spaced.split() if piece.isalnum()]

    needed = length / 2
    picked: List[str] = []
    for min_len in (5, 4, 3, 2, 1):
        picked = [word for word in words if len(word) >= min_len]
        if len(picked) >= needed:
            break
    return picked
# Table for ``str.translate`` built with ``str.maketrans``: maps each code
# point whose general category starts with 'P' to a literal space character.
# NOTE(review): range(0x10FFFF) excludes U+10FFFF itself; presumably harmless
# because that code point is not punctuation -- confirm.
PUNCT_TBL_STATIC = str.maketrans(
    dict.fromkeys(
        (
            codepoint
            for codepoint in range(0x10FFFF)
            if unicodedata.category(chr(codepoint)).startswith('P')
        ),
        ' ',
    )
)
def sample_tokens_translate_static(inputstring: str, length: int = 64) -> List[str]:
    """Tokenize after translating the input through ``PUNCT_TBL_STATIC``.

    The translated text is split on whitespace and only pieces that satisfy
    ``str.isalnum()`` are kept.

    The minimum token length is relaxed from 5 down to 1; the first selection
    containing at least ``length / 2`` tokens is returned, and the selection
    for minimum length 1 is the fallback.
    """
    spaced = inputstring.translate(PUNCT_TBL_STATIC)
    words = list(filter(str.isalnum, spaced.split()))

    target = length / 2
    chosen: List[str] = []
    shortest_allowed = 5
    while shortest_allowed >= 1:
        chosen = [w for w in words if len(w) >= shortest_allowed]
        if len(chosen) >= target:
            return chosen
        shortest_allowed -= 1
    return chosen
# New implementation using isidentifier() | |
def sample_tokens_identifier(inputstring: str, length: int = 64) -> List[str]:
    """Tokenize by keeping only characters that are valid one-character identifiers.

    Every character for which ``str.isidentifier()`` is false is removed,
    except whitespace, which is kept so that the following ``split()`` still
    sees the original token boundaries. Characters removed from inside a
    token therefore join their neighbours (``"a,b"`` becomes ``"ab"``).
    Digits are not identifiers on their own, so they are removed as well.

    The minimum token length is then relaxed from 5 down to 1; the first
    selection containing at least ``length / 2`` tokens is returned, and the
    selection for minimum length 1 is the fallback.
    """
    # Whitespace is not an identifier character; filtering it out as well
    # would glue the entire input into a single token before split() runs.
    filtered = ''.join(
        char for char in inputstring
        if char.isidentifier() or char.isspace()
    )
    # split() never yields empty strings, so no emptiness check is needed.
    tokens = filtered.split()

    needed = length / 2
    sample: List[str] = []
    for i in range(4, -1, -1):
        sample = [t for t in tokens if len(t) > i]
        if len(sample) >= needed:
            return sample
    return sample
# Test data | |
# Benchmark inputs keyed by a short label; the longer texts are repeated so
# that a single call takes long enough to time.
test_cases = {
    "mixed_languages": "Hello,World! こんにちは。नमस्ते! ¡Hola!",
    # Multiplied for better timing measurement
    "mandarin": "行政長官岑浩。" * 100,
    "chinese_text": (
        """会谈后,两国元首共同签署《中华人民共和国和吉尔吉斯共和国关于深化新时代全面战略伙伴关系的联合声明》,见证签署共建"一带一路"合作规划以及外交、经贸、农业等领域多项合作文件。"""
        * 50
    ),
    "english": (
        "The quick brown fox jumps over the lazy dog! Multiple times..." * 100
    ),
}
def run_benchmark(func, test_case, num_runs=10000):
    """Time ``func(test_case)`` repeatedly and summarize the measurements.

    Args:
        func: Callable invoked with ``test_case`` as its only argument.
        test_case: Value passed to ``func`` on every call.
        num_runs: Number of timed calls; each call is measured on its own.

    Returns:
        A dict with the keys ``'mean'``, ``'stdev'``, ``'median'``, ``'min'``
        and ``'max'``, all in milliseconds. ``'stdev'`` is ``0.0`` when only
        one run was measured.
    """
    # number=1 makes every entry of `times` the duration of one single call,
    # in seconds.
    times = timeit.repeat(lambda: func(test_case), number=1, repeat=num_runs)

    # statistics.stdev() raises StatisticsError for fewer than two samples,
    # so a single run is reported with zero spread instead of failing.
    spread = stdev(times) if len(times) > 1 else 0.0

    return {
        'mean': mean(times) * 1000,  # Convert to milliseconds
        'stdev': spread * 1000,
        'median': median(times) * 1000,
        'min': min(times) * 1000,
        'max': max(times) * 1000,
    }
# Run benchmarks
# Display label -> tokenizer variant. Insertion order determines the order of
# the rows in every printed table.
functions = {
    'Current': sample_tokens_CURRENT,
    'Easy Fix': sample_tokens_EASY_FIX,
    'Unicode Fix': sample_tokens_unicode_fix,
    'Identifier': sample_tokens_identifier,
    'Translate': sample_tokens_translate,
    'Static Translate': sample_tokens_translate_static,
}

# Print results
print("Benchmark Results (times in milliseconds)")
print("=" * 80)

for test_name, test_text in test_cases.items():
    print(f"\nTest Case: {test_name}")
    print("-" * 40)

    # Then run performance tests
    print("\nPerformance results:")
    # Every variant is timed before any row is printed, so the table for one
    # test case comes out as a single uninterrupted group of lines.
    results = {
        name: run_benchmark(func, test_text)
        for name, func in functions.items()
    }

    # Print results in a table format
    print(f"{'Function':<15} {'Mean':>10} {'StdDev':>10} {'Median':>10} {'Min':>10} {'Max':>10}")
    print("-" * 60)
    for name, stats in results.items():
        print(f"{name:<15} {stats['mean']:>10.3f} {stats['stdev']:>10.3f} {stats['median']:>10.3f} {stats['min']:>10.3f} {stats['max']:>10.3f}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Results: