Skip to content

Instantly share code, notes, and snippets.

@xflr6
xflr6 / xpath.py
Last active October 12, 2025 09:40
Use some advanced XPath features of lxml for scraping HTML/XML
"""Use advanced XPath features of lxml (see also scrapy parsel)."""
from collections.abc import Callable
import functools
from typing import Any, Self, overload
import urllib.request
import lxml.etree
import lxml.html
@xflr6
xflr6 / parse_url.py
Last active October 12, 2025 08:53
Compare different ways to get an HTML tree from a URL with streaming
"""Compare ways to return HTML tree streamed and parsed from a given URL."""
import contextlib
from typing import Literal, overload
import urllib.request
import xml.etree.ElementTree as etree
import certifi
import html5lib
import lxml.html
@xflr6
xflr6 / iso639p3.py
Last active October 12, 2025 09:50
Download and parse ISO 639-3 code tables from sil.org
"""Download and parse ISO 639-3 code tables from https://iso639-3.sil.org download link."""
from collections.abc import Callable, Iterable, Iterator, Mapping
import contextlib
import csv
import fnmatch
import functools
import html.parser
import http.client
import io
@xflr6
xflr6 / urlretrieve.py
Last active October 4, 2025 17:21
Replacement for urllib.urlretrieve(url, filename) using the requests library
"""Implement `urllib.urlretrieve(url, filename)` with requests library."""
import contextlib
import os
import urllib
import requests
def urlretrieve(url: str,
@xflr6
xflr6 / ethnologue.py
Last active October 12, 2025 09:50
Download and parse ethnologue.com language code tables
"""Download and parse code tables from https://www.ethnologue.com download link."""
from collections.abc import Callable, Iterable, Iterator, Mapping
import contextlib
import csv
import fnmatch
import functools
import html.parser
import http.client
import io
@xflr6
xflr6 / common_prefix.py
Last active October 4, 2025 17:50
Case-insensitive longest common prefix of two strings
"""Longest common prefix."""
import itertools
def common_prefix(left: str, right: str) -> str:
"""Return the case-insensitive longest common prefix of two strings.
>>> common_prefix('spam', 'spameggs')
'spam'
@xflr6
xflr6 / pearsonr.py
Last active October 12, 2025 09:57
Pure Python replacement for scipy.stats.pearsonr
"""Pure-python replacement for scipy.stats.pearsonr."""
from collections.abc import Sequence
import itertools
import math
import operator
def pearsonr(X: Sequence[int], Y: Sequence[int]) -> float:
"""Return the correlation coefficient between the variable sequences X and Y.
@xflr6
xflr6 / sedlike.py
Last active October 4, 2025 17:55
Make Python regex search/replace functions from sed-like s/old/new/g strings
"""Search/replace func from string roughly like 'sed -r s/old/new/g'."""
from collections.abc import Callable
import functools
def search_replace(cmd: str, *, _cache={}) -> Callable[[str], str]:
"""Return a callable from sed/Perl-style search-replace pattern string.
>>> search_replace('s/ham/spam/g')('ham-eggs-ham')
@xflr6
xflr6 / replace.py
Last active June 4, 2022 09:29
Apply multiple search-replace rules to a string with a combined regex
"""Apply search-replace rules with re."""
from collections.abc import Iterable
import re
class Replace:
"""Multiple search-replace with first-matching regex."""
def __init__(self, pairs: Iterable[tuple[str, str]]) -> None:
@xflr6
xflr6 / ordered.py
Last active October 4, 2025 17:57
Iterate over dictionaries with variable keys in predefined order
"""Iterate over dicts with variable keys in predefined order."""
from collections.abc import Iterable
from typing import Self
class KeyOrder(dict):
"""Key -> index mapping for iterating over dicts (unknown keys last)."""
@classmethod