Last active
March 25, 2025 15:02
-
-
Save mmguero/1842c1b06f58503acb854f73befa9d78 to your computer and use it in GitHub Desktop.
Benchmarking different methods for getting text file line counts with Python (wc batched, wc thread pool, wc, and mmap)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import os | |
import sys | |
import mmap | |
import subprocess | |
import time | |
from multiprocessing import Pool | |
def count_lines_wc(file_path):
    """Count the lines of a single file by running ``wc -l`` on it.

    Args:
        file_path: Path of the file whose newline count is wanted.

    Returns:
        A ``(file_path, line_count)`` tuple. If ``wc`` cannot be run, exits
        with a non-zero status, or prints something that does not start with
        an integer, the error is reported on stderr and the count is 0.
    """
    try:
        # "--" ends option parsing, so a path that starts with "-" is treated
        # as a file operand rather than as a wc flag.
        # The output is left as bytes: only the leading count is parsed, so a
        # file name that is not valid in the locale encoding cannot cause a
        # decode error.
        result = subprocess.run(["wc", "-l", "--", file_path], capture_output=True, check=True)
        return file_path, int(result.stdout.split()[0])
    except Exception as e:
        # Deliberately best-effort: this also runs inside pool workers, where
        # one bad file should not abort the whole map.
        print(f"Error counting lines of {file_path}: {e}", file=sys.stderr)
        return file_path, 0
def count_lines_wc_batch(file_paths):
    """Count the lines of many files with a single ``wc -l`` invocation.

    Args:
        file_paths: Paths of the files to count.

    Returns:
        A list of ``(file_path, line_count)`` tuples in the same order as
        ``file_paths``. An empty input yields an empty list. If ``wc`` cannot
        be run, exits with a non-zero status (for example because one file is
        unreadable), or produces unexpected output, the error is reported on
        stderr and every file is given a count of 0.
    """
    file_paths = list(file_paths)
    if not file_paths:
        # With no operands wc would read stdin and block.
        return []
    try:
        # "--" keeps paths that start with "-" from being parsed as options.
        result = subprocess.run(["wc", "-l", "--", *file_paths], capture_output=True, check=True)
        lines = result.stdout.rstrip(b"\n").split(b"\n")
        # wc prints one line per operand, in order, plus a trailing "total"
        # line only when more than one operand was given.
        expected = len(file_paths) + (1 if len(file_paths) > 1 else 0)
        if len(lines) != expected:
            raise ValueError(f"expected {expected} lines of wc output, got {len(lines)}")
        # Pair counts with the inputs by position instead of parsing names
        # back out of the output, so a file that is itself called "total" or
        # whose name contains unusual whitespace is still reported correctly.
        return [
            (file_path, int(line.split(maxsplit=1)[0]))
            for file_path, line in zip(file_paths, lines)
        ]
    except Exception as e:
        print(f"Error counting lines of {file_paths}: {e}", file=sys.stderr)
        return [(file_path, 0) for file_path in file_paths]
def count_lines_mmap(file_path):
    """Count the newline bytes of a file through a read-only memory map.

    Args:
        file_path: Path of the file to count.

    Returns:
        A ``(file_path, line_count)`` tuple, where the count is the number of
        ``b"\\n"`` bytes in the file. An empty file counts as 0. If the file
        cannot be opened or mapped, the error is reported on stderr and the
        count is 0.
    """
    try:
        # Binary mode: only the file descriptor is used, no text decoding.
        with open(file_path, "rb") as f:
            if os.fstat(f.fileno()).st_size == 0:
                # mmap refuses to map a zero-length file; that is not an error here.
                return file_path, 0
            # The context manager closes the mapping instead of leaking it.
            with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
                # Read in bounded chunks so a huge file is never copied into
                # memory in one piece.
                chunk_size = 1 << 20
                newlines = 0
                while chunk := mm.read(chunk_size):
                    newlines += chunk.count(b"\n")
                return file_path, newlines
    except Exception as e:
        print(f"Error counting lines of {file_path}: {e}", file=sys.stderr)
        return file_path, 0
def _run_benchmark(title, label, produce_results):
    """Time one line-counting strategy and print its results.

    Announces the strategy on stderr, prints one "<path> <count>" line per
    result on stdout, then reports the elapsed wall-clock time on stderr.

    Args:
        title: Name used in the "Testing ..." banner.
        label: Name used in the "... time:" summary line.
        produce_results: Zero-argument callable returning an iterable of
            ``(file_path, line_count)`` pairs; it is called inside the timed
            region.
    """
    print(f"Testing {title}", file=sys.stderr)
    start_time = time.perf_counter()
    for path, count in produce_results():
        print(f"{path} {count}")
    end_time = time.perf_counter()
    print(f"{label} time: {end_time - start_time:.6f} seconds\n", file=sys.stderr)


def _count_with_pool(file_paths):
    """Run count_lines_wc over file_paths in a multiprocessing Pool."""
    with Pool(processes=os.cpu_count()) as pool:
        return pool.map(count_lines_wc, file_paths)


if __name__ == "__main__":
    # Only arguments that name existing regular files are benchmarked.
    file_paths = [arg for arg in sys.argv[1:] if os.path.isfile(arg)]
    if not file_paths:
        # Without file operands `wc -l` reads stdin, so the batch benchmark
        # would block forever; bail out with a usage hint instead.
        print(f"Usage: {sys.argv[0]} FILE [FILE ...]", file=sys.stderr)
        sys.exit(1)

    # One wc process per file, sequentially (map is lazy, so each result is
    # printed as soon as it is available).
    _run_benchmark("wc", "wc (single)", lambda: map(count_lines_wc, file_paths))

    # One wc process per file, fanned out over a pool of worker processes
    # (the output label says "threaded", but this is a multiprocessing Pool).
    _run_benchmark("wc (threaded)", "wc (threaded)", lambda: _count_with_pool(file_paths))

    # Pure-Python newline count over a memory map, sequentially.
    _run_benchmark("mmap", "mmap", lambda: map(count_lines_mmap, file_paths))

    # A single wc process given every file at once.
    _run_benchmark("wc (batch)", "wc (batch)", lambda: count_lines_wc_batch(file_paths))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment