Last active
October 1, 2022 15:08
-
-
Save koaning/19fd1ba63d79baa8bb38cf626b8c65c5 to your computer and use it in GitHub Desktop.
HTML parsing benchmark
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import timeit

import html_text
import justext
import requests
from rich.console import Console
from rich.table import Table
from selectolax.parser import HTMLParser

# Two inputs of very different size: a full real-world page fetched over the
# network, and a one-paragraph snippet.
# The timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() avoids silently benchmarking an HTTP error page.
response = requests.get("http://planet.python.org/", timeout=30)
response.raise_for_status()
html_long = response.content.decode("utf-8")
html_short = "<p><b>This</b> is just a small example.</p>"

# Run benchmark
NUM_RUNS = 500  # number of calls per extractor in each timing measurement
htmls = {"long": html_long, "short": html_short}
results = {}  # (html_name, method) -> total seconds taken by NUM_RUNS calls
for html_name, html in htmls.items():
    # The lambdas close over the loop variable `html`. That is safe here
    # because each one is timed inside the same iteration that created it.
    methods = {
        "justext": lambda: justext.justext(html, tuple()),
        "htmltext": lambda: html_text.extract_text(html, guess_layout=False),
        "selectolax": lambda: HTMLParser(html).text(),
    }
    for method, func in methods.items():
        # Progress line so long runs show which combination is being timed.
        print(method, html_name)
        results[(html_name, method)] = timeit.timeit(func, number=NUM_RUNS)

# Pretty Print
# Render one table row per (HTML variant, library) pair, in the order the
# measurements were inserted into `results`.
table = Table(title="Benchmark Results")
table.add_column("HTML Variant", style="cyan")
table.add_column("Library", justify="right", style="magenta")
table.add_column("Time (s)", style="green")
for (variant, library), speed in results.items():
    # The timing is a float; it is passed to the table as its string form.
    table.add_row(variant, library, str(speed))

console = Console()
console.print(table)

# Output recorded from one run of this script. It is kept as a bare string
# literal, so it has no effect at runtime and serves only as documentation.
"""
                 Benchmark Results
┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┓
┃ HTML Variant ┃    Library ┃ Time (s)             ┃
┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━┩
│ long         │    justext │ 0.49477454099996976  │
│ long         │   htmltext │ 10.072820916999945   │
│ long         │ selectolax │ 14.882792749999908   │
│ short        │    justext │ 0.025378625000030297 │
│ short        │   htmltext │ 0.012803834000010283 │
│ short        │ selectolax │ 0.004145417000017915 │
└──────────────┴────────────┴──────────────────────┘
"""
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment