Skip to content

Instantly share code, notes, and snippets.

@reinoldus
Last active February 8, 2025 05:42
Show Gist options
  • Save reinoldus/e950fb56cd937c130cb0bb36f9df5fcc to your computer and use it in GitHub Desktop.
Save reinoldus/e950fb56cd937c130cb0bb36f9df5fcc to your computer and use it in GitHub Desktop.
import timeit
import unicodedata
import string
import sys
from typing import List
from statistics import mean, stdev, median
# Original functions (copied from above)
def sample_tokens_CURRENT(inputstring: str, length: int = 64) -> List[str]:
    """Baseline tokenizer: keep alphanumeric words, preferring longer ones.

    The text is split on whitespace, ASCII punctuation
    (``string.punctuation``) is stripped from both ends of each piece, and
    only pieces for which ``str.isalnum()`` is true are kept. The minimum
    word length is then relaxed from 5 down to 1 until at least
    ``length / 2`` words qualify.

    Args:
        inputstring: Text to tokenize.
        length: Target size; half of it is the number of words required to
            stop relaxing the length filter.

    Returns:
        The words passing the first length filter that yields enough words,
        or every kept word if no filter does.
    """
    trimmed = (piece.strip(string.punctuation) for piece in inputstring.split())
    words = [piece for piece in trimmed if piece.isalnum()]

    selected: List[str] = []
    min_len = 5
    while min_len >= 1:
        selected = [word for word in words if len(word) >= min_len]
        if len(selected) >= length / 2:
            break
        min_len -= 1
    return selected
def sample_tokens_EASY_FIX(inputstring: str, length: int = 64) -> List[str]:
    """Tokenizer variant that also strips the ideographic full stop.

    Works like the baseline sampler, except that the set of characters
    stripped from both ends of each whitespace-separated piece is
    ``string.punctuation`` plus "。".

    Args:
        inputstring: Text to tokenize.
        length: Target size; half of it is the number of words required to
            stop relaxing the length filter.

    Returns:
        The alphanumeric words longer than the first threshold (4, 3, 2, 1, 0)
        that yields at least ``length / 2`` words, or the result of the last
        threshold if none does.
    """
    strip_chars = string.punctuation + "。"
    candidates = [
        stripped
        for stripped in (raw.strip(strip_chars) for raw in inputstring.split())
        if stripped.isalnum()
    ]

    needed = length / 2
    chosen: List[str] = []
    for floor in reversed(range(5)):
        chosen = [word for word in candidates if len(word) > floor]
        if len(chosen) >= needed:
            return chosen
    return chosen
def strip_all_punctuation(text: str) -> str:
    """Return *text* with every Unicode punctuation character replaced by a space.

    A character counts as punctuation when its ``unicodedata.category`` starts
    with "P". The result has the same length as the input.
    """
    return ''.join(
        ' ' if unicodedata.category(symbol)[0] == 'P' else symbol
        for symbol in text
    )
def sample_tokens_unicode_fix(inputstring: str, length: int = 64) -> list[str]:
    """Tokenizer variant that blanks out punctuation via ``strip_all_punctuation``.

    The cleaned text is split on whitespace and only pieces for which
    ``str.isalnum()`` is true are kept. The length filter is then relaxed
    step by step (longer than 4, 3, 2, 1, 0 characters) until at least
    ``length / 2`` words qualify.

    Args:
        inputstring: Text to tokenize.
        length: Target size; half of it is the number of words required to
            stop relaxing the length filter.

    Returns:
        The words selected by the first sufficient filter, or by the last
        filter if none is sufficient.
    """
    cleaned = strip_all_punctuation(inputstring)
    words = list(filter(str.isalnum, cleaned.split()))

    picked = []
    for floor in (4, 3, 2, 1, 0):
        picked = [word for word in words if len(word) > floor]
        if len(picked) >= length / 2:
            break
    return picked
# Translation table for str.translate(): maps each code point whose Unicode
# general category starts with "P" (punctuation) to the ordinal of a space.
# NOTE(review): range() stops one short of sys.maxunicode itself; kept as in
# the original.
PUNCT_TBL = {
    codepoint: ord(' ')
    for codepoint in range(sys.maxunicode)
    if unicodedata.category(chr(codepoint))[0] == 'P'
}
def sample_tokens_translate(inputstring: str, length: int = 64) -> List[str]:
    """Tokenizer variant that blanks out punctuation with ``str.translate``.

    ``PUNCT_TBL`` is applied to the input, the result is split on whitespace,
    and only pieces for which ``str.isalnum()`` is true are kept. The length
    filter is then relaxed step by step (longer than 4, 3, 2, 1, 0
    characters) until at least ``length / 2`` words qualify.

    Args:
        inputstring: Text to tokenize.
        length: Target size; half of it is the number of words required to
            stop relaxing the length filter.

    Returns:
        The words selected by the first sufficient filter, or by the last
        filter if none is sufficient.
    """
    pieces = inputstring.translate(PUNCT_TBL).split()
    words = [piece for piece in pieces if piece.isalnum()]

    required = length / 2
    result: List[str] = []
    threshold = 4
    while threshold >= 0:
        result = [word for word in words if len(word) > threshold]
        if len(result) >= required:
            return result
        threshold -= 1
    return result
# Translation table built through str.maketrans(): each code point below
# 0x10FFFF whose Unicode general category starts with "P" maps to ' '.
PUNCT_TBL_STATIC = str.maketrans(
    dict.fromkeys(
        (
            codepoint
            for codepoint in range(0x10FFFF)
            if unicodedata.category(chr(codepoint))[0] == 'P'
        ),
        ' ',
    )
)
def sample_tokens_translate_static(inputstring: str, length: int = 64) -> List[str]:
    """Tokenizer variant using the ``str.maketrans``-built punctuation table.

    ``PUNCT_TBL_STATIC`` is applied to the input, the result is split on
    whitespace, and only pieces for which ``str.isalnum()`` is true are kept.
    The length filter is then relaxed step by step (longer than 4, 3, 2, 1, 0
    characters) until at least ``length / 2`` words qualify.

    Args:
        inputstring: Text to tokenize.
        length: Target size; half of it is the number of words required to
            stop relaxing the length filter.

    Returns:
        The words selected by the first sufficient filter, or by the last
        filter if none is sufficient.
    """
    without_punct = inputstring.translate(PUNCT_TBL_STATIC)
    kept = list(filter(str.isalnum, without_punct.split()))

    subset: List[str] = []
    for cutoff in range(4, -1, -1):
        subset = [entry for entry in kept if len(entry) > cutoff]
        if len(subset) < length / 2:
            continue
        break
    return subset
# New implementation using isidentifier()
def sample_tokens_identifier(inputstring: str, length: int = 64) -> List[str]:
    """Tokenizer variant that uses ``str.isidentifier()`` to find word characters.

    Every character that cannot appear inside an identifier (punctuation,
    symbols, whitespace) is replaced by a space, then the text is split on
    whitespace. Deleting those characters instead would also delete the
    whitespace and glue the whole input into a single token.

    Args:
        inputstring: Text to tokenize.
        length: Target size; half of it is the number of words required to
            stop relaxing the length filter.

    Returns:
        The words longer than the first threshold (4, 3, 2, 1, 0) that yields
        at least ``length / 2`` words, or all words if none does.
    """
    # Prefixing "_" asks whether the character may *continue* an identifier.
    # A bare ``char.isidentifier()`` asks whether it may *start* one, which
    # wrongly rejects digits and combining marks (e.g. Devanagari vowel signs).
    cleaned = ''.join(
        char if ('_' + char).isidentifier() else ' '
        for char in inputstring
    )
    tokens = cleaned.split()  # split() never yields empty strings

    sample: List[str] = []
    for min_exclusive in range(4, -1, -1):
        sample = [token for token in tokens if len(token) > min_exclusive]
        if len(sample) >= length / 2:
            return sample
    return sample
# Test data
# Benchmark inputs, keyed by a short label. The single-script samples are
# repeated so that each timed call does a measurable amount of work.
test_cases = {
    "mixed_languages": "Hello,World! こんにちは。नमस्ते! ¡Hola!",
    "mandarin": "行政長官岑浩。" * 100,  # Multiplied for better timing measurement
    "chinese_text": (
        """会谈后,两国元首共同签署《中华人民共和国和吉尔吉斯共和国关于深化新时代全面战略伙伴关系的联合声明》,见证签署共建"一带一路"合作规划以及外交、经贸、农业等领域多项合作文件。"""
        * 50
    ),
    "english": "The quick brown fox jumps over the lazy dog! Multiple times..." * 100,
}
def run_benchmark(func, test_case, num_runs=10000):
    """Time ``func(test_case)`` repeatedly and summarize the samples.

    Args:
        func: Callable taking the test input as its only argument.
        test_case: Input passed to ``func`` on every run.
        num_runs: Number of timing samples to collect; each sample times a
            single call.

    Returns:
        A dict with the keys 'mean', 'stdev', 'median', 'min' and 'max', all
        in milliseconds. 'stdev' is 0.0 when only one sample was collected.
    """
    # number=1 with repeat=num_runs gives one timing per call rather than one
    # aggregate timing, so the spread between calls can be reported.
    times = timeit.repeat(lambda: func(test_case), number=1, repeat=num_runs)

    # statistics.stdev() needs at least two data points; a single run has no
    # spread to report, so use 0.0 instead of raising.
    spread = stdev(times) if len(times) > 1 else 0.0

    return {
        'mean': mean(times) * 1000,  # Convert to milliseconds
        'stdev': spread * 1000,
        'median': median(times) * 1000,
        'min': min(times) * 1000,
        'max': max(times) * 1000,
    }
# Run benchmarks
# Display label -> implementation under test; insertion order is the row order
# of every printed table.
functions = {
    'Current': sample_tokens_CURRENT,
    'Easy Fix': sample_tokens_EASY_FIX,
    'Unicode Fix': sample_tokens_unicode_fix,
    'Identifier': sample_tokens_identifier,
    'Translate': sample_tokens_translate,
    'Static Translate': sample_tokens_translate_static
}

# Size the name column from the longest label. A fixed width of 15 is too
# narrow for 'Static Translate' (16 characters) and pushes that row out of
# alignment with the others.
name_width = max(len('Function'), *(len(name) for name in functions))
# Five right-aligned 10-character columns, each preceded by one space.
table_width = name_width + 5 * 11

# Print results
print("Benchmark Results (times in milliseconds)")
print("=" * 80)
for test_name, test_text in test_cases.items():
    print(f"\nTest Case: {test_name}")
    print("-" * 40)
    # Run every implementation against this input before printing its table.
    print("\nPerformance results:")
    results = {name: run_benchmark(func, test_text) for name, func in functions.items()}
    # Print results in a table format
    print(f"{'Function':<{name_width}} {'Mean':>10} {'StdDev':>10} {'Median':>10} {'Min':>10} {'Max':>10}")
    print("-" * table_width)
    for name, stats in results.items():
        print(f"{name:<{name_width}} {stats['mean']:>10.3f} {stats['stdev']:>10.3f} {stats['median']:>10.3f} {stats['min']:>10.3f} {stats['max']:>10.3f}")
@reinoldus
Copy link
Author

Results:

Test Case: mixed_languages
----------------------------------------

Performance results:
Function              Mean     StdDev     Median        Min        Max
------------------------------------------------------------
Current              0.001      0.000      0.001      0.001      0.010
Easy Fix             0.002      0.001      0.001      0.001      0.032
Unicode Fix          0.006      0.001      0.006      0.005      0.061
Identifier           0.003      0.000      0.003      0.002      0.016
Translate            0.003      0.001      0.003      0.002      0.049
Static Translate      0.003      0.000      0.003      0.002      0.015

Test Case: mandarin
----------------------------------------

Performance results:
Function              Mean     StdDev     Median        Min        Max
------------------------------------------------------------
Current              0.002      0.000      0.001      0.001      0.008
Easy Fix             0.002      0.000      0.002      0.002      0.011
Unicode Fix          0.082      0.011      0.080      0.074      0.843
Identifier           0.032      0.005      0.031      0.029      0.411
Translate            0.027      0.002      0.026      0.025      0.060
Static Translate      0.027      0.002      0.027      0.025      0.081

Test Case: chinese_text
----------------------------------------

Performance results:
Function              Mean     StdDev     Median        Min        Max
------------------------------------------------------------
Current              0.005      0.001      0.005      0.005      0.022
Easy Fix             0.005      0.001      0.005      0.005      0.030
Unicode Fix          0.472      0.031      0.466      0.442      1.253
Identifier           0.185      0.014      0.182      0.174      0.540
Translate            0.152      0.019      0.149      0.138      0.915
Static Translate      0.152      0.008      0.150      0.143      0.283

Test Case: english
----------------------------------------

Performance results:
Function              Mean     StdDev     Median        Min        Max
------------------------------------------------------------
Current              0.110      0.006      0.108      0.104      0.225
Easy Fix             0.152      0.009      0.150      0.145      0.287
Unicode Fix          0.624      0.049      0.617      0.579      2.451
Identifier           0.203      0.021      0.200      0.186      1.081
Translate            0.083      0.010      0.081      0.075      0.704
Static Translate      0.081      0.006      0.080      0.076      0.206

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment