Skip to content

Instantly share code, notes, and snippets.

@mmguero
Last active March 25, 2025 15:02
Show Gist options
  • Save mmguero/1842c1b06f58503acb854f73befa9d78 to your computer and use it in GitHub Desktop.
Save mmguero/1842c1b06f58503acb854f73befa9d78 to your computer and use it in GitHub Desktop.
Benchmarking different methods for getting text file line counts with Python (wc batched into one call, wc fanned out over a multiprocessing pool, wc one file at a time, and mmap)
#!/usr/bin/env python3
import os
import sys
import mmap
import subprocess
import time
from multiprocessing import Pool
def count_lines_wc(file_path):
    """Count the lines of one file by running ``wc -l`` on it.

    Args:
        file_path: Path of the file to count.

    Returns:
        A ``(file_path, line_count)`` tuple. If ``wc`` cannot be run, exits
        with a non-zero status, or prints something that cannot be parsed,
        the error is reported on stderr and the count is ``0``.
    """
    try:
        # "--" ends option parsing, so a path that starts with "-" is
        # treated as a file name instead of a wc option.
        result = subprocess.run(
            ["wc", "-l", "--", file_path],
            capture_output=True,
            text=True,
            check=True,
        )
        # wc prints "<count> <name>"; only the leading count is needed.
        return file_path, int(result.stdout.split(maxsplit=1)[0])
    except Exception as e:
        # Best effort: one bad file must not abort the whole run.
        print(f"Error counting lines of {file_path}: {e}", file=sys.stderr)
        return file_path, 0
def count_lines_wc_batch(file_paths):
    """Count the lines of several files with a single ``wc -l`` invocation.

    Args:
        file_paths: Iterable of file paths to count.

    Returns:
        A list of ``(file_path, line_count)`` tuples in the same order as
        *file_paths*. An empty input yields an empty list. If ``wc`` cannot
        be run, exits with a non-zero status, or prints unexpected output,
        the error is reported on stderr and every file gets a count of ``0``.
    """
    file_paths = list(file_paths)
    if not file_paths:
        # With no operands wc would read standard input and could block.
        return []
    try:
        # "--" ends option parsing, so paths starting with "-" are safe.
        result = subprocess.run(
            ["wc", "-l", "--", *file_paths],
            capture_output=True,
            text=True,
            check=True,
        )
        lines = result.stdout.rstrip("\n").split("\n")
        if len(file_paths) > 1:
            # wc appends a summary "total" line only when given several
            # files; drop it by position so that a real file that happens
            # to be called "total" is still reported.
            lines = lines[:-1]
        if len(lines) != len(file_paths):
            raise ValueError(
                f"expected {len(file_paths)} lines of wc output, got {len(lines)}"
            )
        # wc reports operands in the order given, so pair each count with
        # the caller's own path rather than re-parsing the printed name
        # (which would mangle names with leading/trailing whitespace).
        return [
            (path, int(line.split(maxsplit=1)[0]))
            for path, line in zip(file_paths, lines)
        ]
    except Exception as e:
        # Best effort: report the failure and fall back to zero counts.
        print(f"Error counting lines of {file_paths}: {e}", file=sys.stderr)
        return [(path, 0) for path in file_paths]
def count_lines_mmap(file_path):
    """Count the newline bytes of a file through a read-only memory map.

    Args:
        file_path: Path of the file to count.

    Returns:
        A ``(file_path, line_count)`` tuple where ``line_count`` is the
        number of ``b"\\n"`` bytes in the file. An empty file counts as
        ``0``. If the file cannot be opened or mapped, the error is reported
        on stderr and the count is ``0``.
    """
    chunk_size = 1 << 20  # scan 1 MiB at a time to keep memory use bounded
    try:
        # Binary mode: only the file descriptor is used, no text decoding.
        with open(file_path, "rb") as f:
            # mmap raises ValueError for a zero-length file; that is not an
            # error for line counting, the answer is simply 0.
            if os.fstat(f.fileno()).st_size == 0:
                return file_path, 0
            # The context manager closes the mapping when counting is done.
            with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mapped:
                newlines = 0
                while chunk := mapped.read(chunk_size):
                    newlines += chunk.count(b"\n")
                return file_path, newlines
    except Exception as e:
        # Best effort: one bad file must not abort the whole run.
        print(f"Error counting lines of {file_path}: {e}", file=sys.stderr)
        return file_path, 0
if __name__ == "__main__":
    # Benchmark driver: every strategy is run over the same set of files.
    # Per-file counts go to stdout; progress banners and timings go to
    # stderr so the two streams can be separated.

    # Keep only the command-line arguments that name existing regular files.
    file_paths = list(filter(os.path.isfile, sys.argv[1:]))

    # --- Strategy 1: one wc subprocess per file, sequentially -------------
    print("Testing wc", file=sys.stderr)
    started = time.perf_counter()
    for path in file_paths:
        entry = count_lines_wc(path)
        if entry:
            print(f"{entry[0]} {entry[1]}")
    elapsed = time.perf_counter() - started
    print(f"wc (single) time: {elapsed:.6f} seconds\n", file=sys.stderr)

    # --- Strategy 2: one wc subprocess per file, fanned out over a pool ---
    # NOTE: despite the "threaded" label this uses multiprocessing.Pool,
    # i.e. worker processes (one per CPU), not threads.
    print("Testing wc (threaded)", file=sys.stderr)
    started = time.perf_counter()
    with Pool(processes=os.cpu_count()) as pool:
        pooled = pool.map(count_lines_wc, file_paths)
        for entry in pooled:
            print(f"{entry[0]} {entry[1]}")
    elapsed = time.perf_counter() - started
    print(f"wc (threaded) time: {elapsed:.6f} seconds\n", file=sys.stderr)

    # --- Strategy 3: count newline bytes through mmap, sequentially -------
    print("Testing mmap", file=sys.stderr)
    started = time.perf_counter()
    for path in file_paths:
        entry = count_lines_mmap(path)
        if entry:
            print(f"{entry[0]} {entry[1]}")
    elapsed = time.perf_counter() - started
    print(f"mmap time: {elapsed:.6f} seconds\n", file=sys.stderr)

    # --- Strategy 4: a single wc subprocess for all files -----------------
    print("Testing wc (batch)", file=sys.stderr)
    started = time.perf_counter()
    batched = count_lines_wc_batch(file_paths)
    if batched:
        for entry in batched:
            print(f"{entry[0]} {entry[1]}")
    elapsed = time.perf_counter() - started
    print(f"wc (batch) time: {elapsed:.6f} seconds\n", file=sys.stderr)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment