reinoldus · February 6, 2025 05:24
diff --git a/benchmark.py b/benchmark.py
 import timeit
 import unicodedata
 import string
 import sys
 from typing import List
 from statistics import mean, stdev


 # Original functions (copied from above)
 def sample_tokens_CURRENT(inputstring: str, length: int = 64) -> List[str]:
    """Baseline sampler: keep alphanumeric tokens, preferring longer ones.

    The input is split on whitespace, ASCII punctuation
    (``string.punctuation``) is stripped from both ends of each piece, and
    only pieces for which ``str.isalnum()`` is true are kept.  The minimum
    token length is then relaxed from 5 characters down to 1 until at least
    ``length / 2`` tokens qualify.

    Args:
        inputstring: Text to tokenize.
        length: Target size; a sample is accepted once it holds at least
            half this many tokens.

    Returns:
        The first sample that meets the threshold, or every kept token when
        no length cut-off yields enough tokens.
    """
    stripped = (word.strip(string.punctuation) for word in inputstring.split())
    tokens = [word for word in stripped if word.isalnum()]
    sample = []
    # Try the strictest length requirement first, then loosen it step by step.
    for min_len in (5, 4, 3, 2, 1):
        sample = [tok for tok in tokens if len(tok) >= min_len]
        if len(sample) >= length / 2:
            break
    return sample


 def sample_tokens_EASY_FIX(inputstring: str, length: int = 64) -> List[str]:
    """Baseline sampler that additionally strips the ideographic full stop.

    Works like the ASCII-only variant, but the set of characters stripped
    from both ends of each whitespace-separated piece is
    ``string.punctuation`` plus ``"。"``.  Pieces that are not fully
    alphanumeric after stripping are discarded.

    Args:
        inputstring: Text to tokenize.
        length: Target size; a sample is accepted once it holds at least
            half this many tokens.

    Returns:
        The first sample that meets the threshold while the minimum token
        length is relaxed from 5 to 1, or every kept token otherwise.
    """
    strip_chars = string.punctuation + "。"
    tokens = []
    for piece in inputstring.split():
        candidate = piece.strip(strip_chars)
        if not candidate.isalnum():
            continue
        tokens.append(candidate)
    sample = []
    for min_len in (5, 4, 3, 2, 1):
        sample = [tok for tok in tokens if len(tok) >= min_len]
        if len(sample) >= length / 2:
            break
    return sample


 def strip_all_punctuation(text: str) -> str:
    """Return *text* with each Unicode punctuation character turned into a space.

    A character is treated as punctuation when its ``unicodedata.category``
    code starts with ``"P"``.  All other characters are copied unchanged, so
    the result has the same length as the input.
    """
    return ''.join(
        ' ' if unicodedata.category(ch).startswith('P') else ch
        for ch in text
    )


 def sample_tokens_unicode_fix(inputstring: str, length: int = 64) -> List[str]:
    """Sample alphanumeric tokens after blanking out all Unicode punctuation.

    The text is first passed through ``strip_all_punctuation`` and then split
    on whitespace; only pieces for which ``str.isalnum()`` is true are kept.
    The minimum token length is relaxed from 5 characters down to 1 until at
    least ``length / 2`` tokens qualify.

    The return annotation uses ``typing.List`` like the sibling samplers, so
    the definition also works on interpreters older than Python 3.9.

    Args:
        inputstring: Text to tokenize.
        length: Target size; a sample is accepted once it holds at least
            half this many tokens.

    Returns:
        The first sample that meets the threshold, or every kept token when
        no length cut-off yields enough tokens.
    """
    cleaned = strip_all_punctuation(inputstring)
    tokens = [piece for piece in cleaned.split() if piece.isalnum()]
    sample = []
    # Try the strictest length requirement first, then loosen it step by step.
    for min_len in (5, 4, 3, 2, 1):
        sample = [tok for tok in tokens if len(tok) >= min_len]
        if len(sample) >= length / 2:
            break
    return sample



 # Translation table for ``str.translate``: maps every code point whose Unicode
 # category starts with "P" (punctuation) to a space.  The range must run to
 # ``sys.maxunicode + 1`` so that the highest code point is examined as well.
 PUNCT_TBL = dict.fromkeys(
     (i for i in range(sys.maxunicode + 1)
      if unicodedata.category(chr(i)).startswith('P')),
     ord(' '),
 )


 def sample_tokens_translate(inputstring: str, length: int = 64) -> List[str]:
    """Sample alphanumeric tokens using a punctuation translation table.

    ``str.translate`` is applied with ``PUNCT_TBL`` so that punctuation
    becomes spaces, the result is split on whitespace, and only pieces for
    which ``str.isalnum()`` is true are kept.  The minimum token length is
    relaxed from 5 characters down to 1 until at least ``length / 2`` tokens
    qualify.

    Args:
        inputstring: Text to tokenize.
        length: Target size; a sample is accepted once it holds at least
            half this many tokens.

    Returns:
        The first sample that meets the threshold, or every kept token when
        no length cut-off yields enough tokens.
    """
    pieces = inputstring.translate(PUNCT_TBL).split()
    tokens = [piece for piece in pieces if piece.isalnum()]
    sample = []
    for min_len in (5, 4, 3, 2, 1):
        sample = [tok for tok in tokens if len(tok) >= min_len]
        if len(sample) >= length / 2:
            break
    return sample


 # New implementation using isidentifier()
 def sample_tokens_identifier(inputstring: str, length: int = 64) -> List[str]:
    """Sample tokens built from the identifier characters of each word.

    The text is split on whitespace first; within each piece only the
    characters for which ``str.isidentifier()`` is true are joined back
    together, and pieces that end up empty are dropped.  Filtering must
    happen per piece: filtering the whole string before splitting would also
    delete the whitespace and fuse all words into a single token.

    Note that a single character passes ``str.isidentifier()`` only if it can
    start an identifier, so digits are removed along with punctuation.

    Args:
        inputstring: Text to tokenize.
        length: Target size; a sample is accepted once it holds at least
            half this many tokens.

    Returns:
        The first sample that meets the threshold while the minimum token
        length is relaxed from 5 to 1, or every kept token otherwise.
    """
    tokens = []
    for piece in inputstring.split():
        # Join only identifier characters
        token = ''.join(c for c in piece if c.isidentifier())
        if token:
            tokens.append(token)
    sample = []
    for min_len in (5, 4, 3, 2, 1):
        sample = [tok for tok in tokens if len(tok) >= min_len]
        if len(sample) >= length / 2:
            break
    return sample


 # Test data: maps a case name to the text handed to every sampler.
 # Three of the four texts are repeated so that a single call does a
 # measurable amount of work; "mixed_languages" is used as-is.
 test_cases = {
    # Latin, Japanese, Devanagari and Spanish punctuation in one short string
    "mixed_languages": "Hello,World! こんにちは。नमस्ते! ¡Hola!",
    # One sentence ending in an ideographic full stop, with no whitespace at all
    "mandarin": "行政長官岑浩。" * 100,  # Multiplied for better timing measurement
    # Longer sentence containing several kinds of CJK punctuation
    "chinese_text": """会谈后，两国元首共同签署《中华人民共和国和吉尔吉斯共和国关于深化新时代全面战略伙伴关系的联合声明》，见证签署共建"一带一路"合作规划以及外交、经贸、农业等领域多项合作文件。""" * 50,
    # Plain ASCII text with ASCII punctuation only
    "english": "The quick brown fox jumps over the lazy dog! Multiple times..." * 100
 }


 def run_benchmark(func, test_case, num_runs=1000):
    """Time ``func(test_case)`` repeatedly and summarize the measurements.

    Each of the ``num_runs`` measurements times exactly one call
    (``timeit.repeat`` with ``number=1``).

    Args:
        func: Callable accepting ``test_case`` as its only argument.
        test_case: Value passed to ``func`` on every call.
        num_runs: Number of timed calls.  With a single run the standard
            deviation is reported as ``0.0`` because ``statistics.stdev``
            needs at least two data points.

    Returns:
        A dict with the keys ``'mean'``, ``'stdev'``, ``'min'`` and ``'max'``,
        all in milliseconds.

    Raises:
        statistics.StatisticsError: If ``num_runs`` yields no measurements.
    """
    # Create a closure for timeit
    def test_wrapper():
        return func(test_case)

    # Run the benchmark
    times = timeit.repeat(test_wrapper, number=1, repeat=num_runs)
    # stdev() raises for fewer than two samples; a single run has no spread.
    spread = stdev(times) if len(times) > 1 else 0.0
    return {
        'mean': mean(times) * 1000,  # Convert to milliseconds
        'stdev': spread * 1000,
        'min': min(times) * 1000,
        'max': max(times) * 1000
    }


 # Implementations to benchmark, keyed by the label shown in the report.
 # Dict order determines the row order of every results table.
 functions = {
    'Current': sample_tokens_CURRENT,
    'Easy Fix': sample_tokens_EASY_FIX,
    'Unicode Fix': sample_tokens_unicode_fix,
    'Identifier': sample_tokens_identifier,
    'Translate': sample_tokens_translate
 }

 # Print the report header
 print("Benchmark Results (times in milliseconds)")
 print("=" * 80)

 # One results table per test text
 for test_name, test_text in test_cases.items():
    print(f"\nTest Case: {test_name}")
    print("-" * 40)

    # Time every implementation on this text (default number of runs)
    print("\nPerformance results:")
    results = {name: run_benchmark(func, test_text) for name, func in functions.items()}

    # Print results in a table format: label left-aligned, statistics
    # right-aligned with three decimal places
    print(f"{'Function':<15} {'Mean':>10} {'StdDev':>10} {'Min':>10} {'Max':>10}")
    print("-" * 60)
    for name, stats in results.items():
        print(f"{name:<15} {stats['mean']:>10.3f} {stats['stdev']:>10.3f} {stats['min']:>10.3f} {stats['max']:>10.3f}")
	import timeit
	import unicodedata
	import string
	import sys
	from typing import List
	from statistics import mean, stdev


	# Original functions (copied from above)
	def sample_tokens_CURRENT(inputstring: str, length: int = 64) -> List[str]:
	    """Baseline sampler: keep alphanumeric tokens, preferring longer ones.

	    The input is split on whitespace, ASCII punctuation
	    (``string.punctuation``) is stripped from both ends of each piece, and
	    only pieces for which ``str.isalnum()`` is true are kept.  The minimum
	    token length is then relaxed from 5 characters down to 1 until at least
	    ``length / 2`` tokens qualify.

	    Args:
	        inputstring: Text to tokenize.
	        length: Target size; a sample is accepted once it holds at least
	            half this many tokens.

	    Returns:
	        The first sample that meets the threshold, or every kept token when
	        no length cut-off yields enough tokens.
	    """
	    stripped = (word.strip(string.punctuation) for word in inputstring.split())
	    tokens = [word for word in stripped if word.isalnum()]
	    sample = []
	    # Try the strictest length requirement first, then loosen it step by step.
	    for min_len in (5, 4, 3, 2, 1):
	        sample = [tok for tok in tokens if len(tok) >= min_len]
	        if len(sample) >= length / 2:
	            break
	    return sample


	def sample_tokens_EASY_FIX(inputstring: str, length: int = 64) -> List[str]:
	    """Baseline sampler that additionally strips the ideographic full stop.

	    The set of characters stripped from both ends of each
	    whitespace-separated piece is ``string.punctuation`` plus ``"。"``.
	    Pieces that are not fully alphanumeric after stripping are discarded.

	    Args:
	        inputstring: Text to tokenize.
	        length: Target size; a sample is accepted once it holds at least
	            half this many tokens.

	    Returns:
	        The first sample that meets the threshold while the minimum token
	        length is relaxed from 5 to 1, or every kept token otherwise.
	    """
	    strip_chars = string.punctuation + "。"
	    tokens = []
	    for piece in inputstring.split():
	        candidate = piece.strip(strip_chars)
	        if candidate.isalnum():
	            tokens.append(candidate)
	    sample = []
	    for min_len in (5, 4, 3, 2, 1):
	        sample = [tok for tok in tokens if len(tok) >= min_len]
	        if len(sample) >= length / 2:
	            break
	    return sample


	def strip_all_punctuation(text: str) -> str:
	    """Return *text* with each Unicode punctuation character turned into a space.

	    A character is treated as punctuation when its ``unicodedata.category``
	    code starts with ``"P"``.  All other characters are copied unchanged, so
	    the result has the same length as the input.
	    """
	    return ''.join(
	        ' ' if unicodedata.category(ch).startswith('P') else ch
	        for ch in text
	    )


	def sample_tokens_unicode_fix(inputstring: str, length: int = 64) -> List[str]:
	    """Sample alphanumeric tokens after blanking out all Unicode punctuation.

	    The text is first passed through ``strip_all_punctuation`` and then split
	    on whitespace; only pieces for which ``str.isalnum()`` is true are kept.
	    The minimum token length is relaxed from 5 characters down to 1 until at
	    least ``length / 2`` tokens qualify.

	    The return annotation uses ``typing.List`` like the sibling samplers, so
	    the definition also works on interpreters older than Python 3.9.

	    Args:
	        inputstring: Text to tokenize.
	        length: Target size; a sample is accepted once it holds at least
	            half this many tokens.

	    Returns:
	        The first sample that meets the threshold, or every kept token when
	        no length cut-off yields enough tokens.
	    """
	    cleaned = strip_all_punctuation(inputstring)
	    tokens = [piece for piece in cleaned.split() if piece.isalnum()]
	    sample = []
	    for min_len in (5, 4, 3, 2, 1):
	        sample = [tok for tok in tokens if len(tok) >= min_len]
	        if len(sample) >= length / 2:
	            break
	    return sample



	# Translation table for ``str.translate``: maps every code point whose Unicode
	# category starts with "P" (punctuation) to a space.  The range must run to
	# ``sys.maxunicode + 1`` so that the highest code point is examined as well.
	PUNCT_TBL = dict.fromkeys(
	    (i for i in range(sys.maxunicode + 1)
	     if unicodedata.category(chr(i)).startswith('P')),
	    ord(' '),
	)


	def sample_tokens_translate(inputstring: str, length: int = 64) -> List[str]:
	    """Sample alphanumeric tokens using a punctuation translation table.

	    ``str.translate`` is applied with ``PUNCT_TBL`` so that punctuation
	    becomes spaces, the result is split on whitespace, and only pieces for
	    which ``str.isalnum()`` is true are kept.  The minimum token length is
	    relaxed from 5 characters down to 1 until at least ``length / 2`` tokens
	    qualify.

	    Args:
	        inputstring: Text to tokenize.
	        length: Target size; a sample is accepted once it holds at least
	            half this many tokens.

	    Returns:
	        The first sample that meets the threshold, or every kept token when
	        no length cut-off yields enough tokens.
	    """
	    # Replace all punctuation with spaces using translation table
	    pieces = inputstring.translate(PUNCT_TBL).split()
	    tokens = [piece for piece in pieces if piece.isalnum()]
	    sample = []
	    for min_len in (5, 4, 3, 2, 1):
	        sample = [tok for tok in tokens if len(tok) >= min_len]
	        if len(sample) >= length / 2:
	            break
	    return sample


	# New implementation using isidentifier()
	def sample_tokens_identifier(inputstring: str, length: int = 64) -> List[str]:
	    """Sample tokens built from the identifier characters of each word.

	    The text is split on whitespace first; within each piece only the
	    characters for which ``str.isidentifier()`` is true are joined back
	    together, and pieces that end up empty are dropped.  Filtering must
	    happen per piece: filtering the whole string before splitting would also
	    delete the whitespace and fuse all words into a single token.

	    Note that a single character passes ``str.isidentifier()`` only if it can
	    start an identifier, so digits are removed along with punctuation.

	    Args:
	        inputstring: Text to tokenize.
	        length: Target size; a sample is accepted once it holds at least
	            half this many tokens.

	    Returns:
	        The first sample that meets the threshold while the minimum token
	        length is relaxed from 5 to 1, or every kept token otherwise.
	    """
	    tokens = []
	    for piece in inputstring.split():
	        # Join only identifier characters
	        token = ''.join(c for c in piece if c.isidentifier())
	        if token:
	            tokens.append(token)
	    sample = []
	    for min_len in (5, 4, 3, 2, 1):
	        sample = [tok for tok in tokens if len(tok) >= min_len]
	        if len(sample) >= length / 2:
	            break
	    return sample


	# Test data: maps a case name to the text handed to every sampler.
	# Three of the four texts are repeated so that a single call does a
	# measurable amount of work; "mixed_languages" is used as-is.
	test_cases = {
	"mixed_languages": "Hello,World! こんにちは。नमस्ते! ¡Hola!",
	"mandarin": "行政長官岑浩。" * 100, # Multiplied for better timing measurement
	"chinese_text": """会谈后，两国元首共同签署《中华人民共和国和吉尔吉斯共和国关于深化新时代全面战略伙伴关系的联合声明》，见证签署共建"一带一路"合作规划以及外交、经贸、农业等领域多项合作文件。""" * 50,
	"english": "The quick brown fox jumps over the lazy dog! Multiple times..." * 100
	}


	def run_benchmark(func, test_case, num_runs=1000):
	    """Time ``func(test_case)`` repeatedly and summarize the measurements.

	    Each of the ``num_runs`` measurements times exactly one call
	    (``timeit.repeat`` with ``number=1``).

	    Args:
	        func: Callable accepting ``test_case`` as its only argument.
	        test_case: Value passed to ``func`` on every call.
	        num_runs: Number of timed calls.  With a single run the standard
	            deviation is reported as ``0.0`` because ``statistics.stdev``
	            needs at least two data points.

	    Returns:
	        A dict with the keys ``'mean'``, ``'stdev'``, ``'min'`` and ``'max'``,
	        all in milliseconds.

	    Raises:
	        statistics.StatisticsError: If ``num_runs`` yields no measurements.
	    """
	    # Create a closure for timeit
	    def test_wrapper():
	        return func(test_case)

	    # Run the benchmark
	    times = timeit.repeat(test_wrapper, number=1, repeat=num_runs)
	    # stdev() raises for fewer than two samples; a single run has no spread.
	    spread = stdev(times) if len(times) > 1 else 0.0
	    return {
	        'mean': mean(times) * 1000,  # Convert to milliseconds
	        'stdev': spread * 1000,
	        'min': min(times) * 1000,
	        'max': max(times) * 1000
	    }


	# Implementations to benchmark, keyed by the label shown in the report.
	# Dict order determines the row order of every results table.
	functions = {
	    'Current': sample_tokens_CURRENT,
	    'Easy Fix': sample_tokens_EASY_FIX,
	    'Unicode Fix': sample_tokens_unicode_fix,
	    'Identifier': sample_tokens_identifier,
	    'Translate': sample_tokens_translate
	}

	# Print the report header
	print("Benchmark Results (times in milliseconds)")
	print("=" * 80)

	# One results table per test text
	for test_name, test_text in test_cases.items():
	    print(f"\nTest Case: {test_name}")
	    print("-" * 40)

	    # Time every implementation on this text (default number of runs)
	    print("\nPerformance results:")
	    results = {name: run_benchmark(func, test_text) for name, func in functions.items()}

	    # Print results in a table format: label left-aligned, statistics
	    # right-aligned with three decimal places
	    print(f"{'Function':<15} {'Mean':>10} {'StdDev':>10} {'Min':>10} {'Max':>10}")
	    print("-" * 60)
	    for name, stats in results.items():
	        print(f"{name:<15} {stats['mean']:>10.3f} {stats['stdev']:>10.3f} {stats['min']:>10.3f} {stats['max']:>10.3f}")