Skip to content

Instantly share code, notes, and snippets.

@xflr6
Last active October 25, 2025 12:41
Show Gist options
  • Save xflr6/888d878abbae5298f25b939774e3382b to your computer and use it in GitHub Desktop.
Save xflr6/888d878abbae5298f25b939774e3382b to your computer and use it in GitHub Desktop.
Drop C1 control characters from Glottolog bibfiles
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "7ac9103f-50a3-4b07-b5ce-27f024588879",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'anla.bib',\n",
" 'asjp2010.bib',\n",
" 'bahasa.bib',\n",
" 'bibliolux.bib',\n",
" 'bowern.bib',\n",
" 'eballiso2009.bib',\n",
" 'fabreall2009ann.bib',\n",
" 'gilbertese.bib',\n",
" 'gj.bib',\n",
" 'goba.bib',\n",
" 'guldemann.bib',\n",
" 'haspelmath.bib',\n",
" 'hedvig-tirailleur.bib',\n",
" 'lapolla-tibeto-burman.bib',\n",
" 'lewinmanx.bib',\n",
" 'ludger-paschen-germanic.bib',\n",
" 'marctang.bib',\n",
" 'mpieva.bib',\n",
" 'otomanguean.bib',\n",
" 'ozbib.bib',\n",
" 'phoible.bib',\n",
" 'sala.bib',\n",
" 'schikowski_chintang.bib',\n",
" 'seifart.bib',\n",
" 'silpng.bib',\n",
" 'sn.bib',\n",
" 'stampe.bib',\n",
" 'weball.bib',\n",
" 'zorcpapers.bib',\n",
" 'zurich.bib'}"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\"\"\"https://en.wikipedia.org/wiki/Latin-1_Supplement_(Unicode_block)\"\"\"\n",
"\n",
"import collections\n",
"import configparser\n",
"import pathlib\n",
"import re\n",
"from typing import Self\n",
"\n",
"GLOTTOLOG = pathlib.Path('~/Desktop/glottolog').expanduser()\n",
"\n",
"REFERENCES = GLOTTOLOG / 'references'\n",
"\n",
"CONFIG = REFERENCES / 'BIBFILES.ini'\n",
"\n",
"BIBTEX = REFERENCES / 'bibtex'\n",
"\n",
"HH_BIB = BIBTEX / 'hh.bib'\n",
"\n",
"SKIP = {'degruyter.bib',\n",
" 'sil16.bib'}\n",
"\n",
"ENCODING = 'utf-8'\n",
"\n",
"class Config(configparser.ConfigParser):\n",
"\n",
"    @classmethod\n",
"    def from_path(cls, path: pathlib.Path, *, encoding: str = ENCODING) -> Self:\n",
"        inst = cls()\n",
"        with path.open(encoding=encoding) as f:\n",
"            inst.read_file(f)\n",
"        return inst\n",
"\n",
"config = Config.from_path(CONFIG)\n",
"editable = {filename for filename, section in config.items()\n",
" if 'enhancement of existing items possible' in section['curation'].lower()\n",
" and filename not in SKIP}\n",
"editable"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "09fe3cd1-02ae-4fb2-868b-1208b0bb20e6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Counter()"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"C1_CONTROL_CHAR = re.compile(r'[\\u0080-\\u009f]')\n",
"\n",
"def count_control_chars(path: pathlib.Path, /, *,\n",
"                        verbose: bool = True,\n",
"                        encoding: str = ENCODING) -> collections.Counter:\n",
"    result = collections.Counter()\n",
"    with path.open(encoding=encoding) as f:\n",
"        for i, line in enumerate(f, start=1):\n",
"            if (chars := C1_CONTROL_CHAR.findall(line)):\n",
"                codes = collections.Counter(f'U+{ord(c):04X}' for c in chars)\n",
"                if verbose:\n",
"                    stats = ', '.join(codepoint + ('*' * (count - 1) if count > 1 else '')\n",
"                                      for codepoint, count in codes.most_common())\n",
"                    print(path.name, i, stats)\n",
"                result += codes\n",
"    return result\n",
"\n",
"count_control_chars(HH_BIB)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b94b529b-e206-4960-988e-47eacdee31cd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"43"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bibfiles = sorted(BIBTEX.glob('*.bib'))\n",
"len(bibfiles)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "30ba69d2-85e9-421d-ad4b-681d28661cba",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"benjamins.bib 30480 U+0097**, U+0092\n",
"benjamins.bib 30519 U+0093, U+0094\n",
"benjamins.bib 187581 U+0092\n",
"benjamins.bib 187629 U+0092**, U+0091*\n",
"benjamins.bib 188005 U+009A\n",
"benjamins.bib 190517 U+009A\n",
"benjamins.bib 210878 U+0093, U+0092, U+0094\n",
"benjamins.bib 210904 U+0092\n",
"benjamins.bib 210910 U+0092*, U+0093, U+0094\n",
"benjamins.bib 210936 U+0097*\n",
"benjamins.bib 221732 U+0092**\n",
"benjamins.bib 228273 U+0092\n",
"benjamins.bib 228324 U+0092\n",
"benjamins.bib 245295 U+009A*\n",
"benjamins.bib 245823 U+009A*\n",
"benjamins.bib 246339 U+009A\n",
"benjamins.bib 246609 U+009A\n",
"benjamins.bib 246619 U+009A***********\n",
"benjamins.bib 265615 U+0092\n",
"benjamins.bib 265625 U+0092**\n",
"benjamins.bib 265657 U+0096\n",
"benjamins.bib 265753 U+0092*\n",
"benjamins.bib 265787 U+0092*\n",
"benjamins.bib 265868 U+0092*, U+0091\n",
"benjamins.bib 265885 U+0096\n",
"benjamins.bib 265958 U+0092\n",
"benjamins.bib 265992 U+0092\n",
"benjamins.bib 284135 U+009A\n",
"benjamins.bib 291394 U+009A\n",
"benjamins.bib 329617 U+0092\n",
"benjamins.bib 329634 U+0097***, U+0091, U+0092\n",
"benjamins.bib 329651 U+0092, U+0093, U+0094\n",
"benjamins.bib 329667 U+0093*, U+0094*, U+0092*, U+0097*\n",
"benjamins.bib 329683 U+0092**, U+0093, U+0094\n",
"benjamins.bib 339883 U+0092\n",
"benjamins.bib 339893 U+0093*, U+0094*\n",
"benjamins.bib 339925 U+0093, U+0094\n",
"benjamins.bib 455519 U+0092\n",
"benjamins.bib 455568 U+0093**, U+0094**\n",
"benjamins.bib 455585 U+0093, U+0094\n",
"benjamins.bib 455602 U+0097*\n",
"benjamins.bib 470937 U+008A\n",
"benjamins.bib 471033 U+008A\n",
"benjamins.bib 500076 U+0097***\n",
"benjamins.bib 704803 U+0091, U+0092\n",
"benjamins.bib 704813 U+0091**, U+0092**\n",
"benjamins.bib 704830 U+0092***, U+0091**\n",
"benjamins.bib 705181 U+0092*******, U+0093, U+0094\n",
"degruyter.bib 530160 U+0091\n",
"degruyter.bib 568722 U+0096\n",
"degruyter.bib 568750 U+0091, U+0092\n",
"degruyter.bib 569352 U+0092*, U+0091\n",
"degruyter.bib 569366 U+0091, U+0092\n",
"degruyter.bib 569422 U+0091, U+0092\n",
"degruyter.bib 569576 U+0091, U+0092\n",
"degruyter.bib 569744 U+0091, U+0092\n",
"degruyter.bib 569758 U+0091, U+0092\n",
"degruyter.bib 569954 U+0091, U+0092\n",
"degruyter.bib 570122 U+0096\n",
"degruyter.bib 570752 U+0091, U+0092\n",
"degruyter.bib 578214 U+0096\n",
"degruyter.bib 578508 U+0096\n",
"degruyter.bib 578886 U+0096\n",
"degruyter.bib 579880 U+0091, U+0092\n",
"degruyter.bib 580650 U+0084, U+0094\n",
"degruyter.bib 580804 U+0096\n",
"degruyter.bib 581406 U+0092\n",
"degruyter.bib 581700 U+0091, U+0092\n",
"degruyter.bib 581770 U+0091, U+0092\n",
"degruyter.bib 581784 U+0091, U+0092\n",
"degruyter.bib 581882 U+0084*, U+0094*\n",
"degruyter.bib 582120 U+0091, U+0092\n",
"degruyter.bib 582302 U+0093, U+0094\n",
"degruyter.bib 582330 U+0084, U+0093\n",
"degruyter.bib 582344 U+0092, U+0084, U+0094\n",
"degruyter.bib 582414 U+0094, U+0093\n",
"degruyter.bib 582470 U+0084, U+0093\n",
"degruyter.bib 585229 U+0096\n",
"degruyter.bib 586629 U+0092*\n",
"degruyter.bib 622167 U+0092*\n",
"degruyter.bib 622257 U+0093, U+0094\n",
"degruyter.bib 622272 U+0092\n",
"degruyter.bib 622407 U+0092, U+0096\n",
"degruyter.bib 622422 U+0092\n",
"degruyter.bib 697231 U+0092*\n",
"degruyter.bib 697246 U+0093****, U+0094****, U+0092\n",
"degruyter.bib 697261 U+0092\n",
"degruyter.bib 697320 U+0092**\n",
"degruyter.bib 697350 U+0092*\n",
"degruyter.bib 697365 U+0092\n",
"degruyter.bib 713347 U+0096\n",
"degruyter.bib 713515 U+0096*\n",
"degruyter.bib 713809 U+0096\n",
"degruyter.bib 713977 U+0096*\n",
"degruyter.bib 715588 U+0096\n",
"degruyter.bib 715854 U+0096\n",
"degruyter.bib 716063 U+0086\n",
"degruyter.bib 716120 U+0084, U+0094\n",
"degruyter.bib 716470 U+0084, U+0094\n",
"degruyter.bib 716568 U+0084*, U+0094*\n",
"degruyter.bib 716582 U+0084, U+0094\n",
"degruyter.bib 716596 U+0096\n",
"degruyter.bib 716680 U+0096*\n",
"degruyter.bib 717660 U+0096\n",
"degruyter.bib 804023 U+0084, U+0093\n",
"degruyter.bib 804039 U+0084, U+0093, U+0096\n",
"degruyter.bib 804055 U+009A*, U+0084, U+0093\n",
"degruyter.bib 804104 U+0096\n",
"degruyter.bib 804152 U+0084, U+0093\n",
"degruyter.bib 804295 U+0084, U+0093\n",
"degruyter.bib 804311 U+0096\n",
"degruyter.bib 804327 U+0084, U+009E, U+0093\n",
"degruyter.bib 804343 U+0096\n",
"degruyter.bib 804359 U+0096\n",
"degruyter.bib 804375 U+0096\n",
"degruyter.bib 804390 U+0084, U+0093, U+0096\n",
"degruyter.bib 804447 U+0096*\n",
"degruyter.bib 804533 U+0084, U+0093, U+0096\n",
"degruyter.bib 804558 U+0096\n",
"degruyter.bib 804879 U+0096\n",
"degruyter.bib 805198 U+009A*\n",
"degruyter.bib 805246 U+0096\n",
"degruyter.bib 805321 U+0096\n",
"degruyter.bib 805336 U+0084, U+0093, U+0096\n",
"degruyter.bib 805361 U+0084**, U+0093*, U+0096, U+0094\n",
"degruyter.bib 805461 U+0093, U+0094\n",
"degruyter.bib 805485 U+0084, U+0093, U+0096\n",
"degruyter.bib 805558 U+0096***, U+009A*\n",
"degruyter.bib 805757 U+0096\n",
"degruyter.bib 805882 U+0096\n",
"degruyter.bib 805912 U+0084, U+0096, U+0093\n",
"degruyter.bib 805927 U+0096\n",
"degruyter.bib 805952 U+009A, U+0096\n",
"degruyter.bib 805984 U+0096\n",
"degruyter.bib 806166 U+0096**\n",
"degruyter.bib 806198 U+009A\n",
"degruyter.bib 806905 U+0096\n",
"degruyter.bib 808589 U+0096\n",
"degruyter.bib 808841 U+0096\n",
"degruyter.bib 810227 U+0096*\n",
"degruyter.bib 810997 U+0096\n",
"degruyter.bib 811053 U+0084, U+0094\n",
"degruyter.bib 845233 U+0093, U+0094\n",
"degruyter.bib 845250 U+0096***\n",
"degruyter.bib 845335 U+0096*\n",
"degruyter.bib 845480 U+0096*\n",
"degruyter.bib 845582 U+0093, U+0094\n",
"degruyter.bib 845604 U+0096\n",
"degruyter.bib 845715 U+0096\n",
"degruyter.bib 846030 U+0096\n",
"evobib.bib 4486 U+0094*\n",
"evobib.bib 4691 U+0094*\n",
"evobib.bib 6288 U+0092\n",
"evobib.bib 7160 U+0092\n",
"evobib.bib 8587 U+0094\n",
"evobib.bib 10343 U+0092*\n",
"evobib.bib 11496 U+0094*\n",
"evobib.bib 14514 U+0092\n",
"evobib.bib 15045 U+0092\n",
"evobib.bib 15281 U+0094***\n",
"evobib.bib 15293 U+0094*\n",
"evobib.bib 16156 U+0088*\n",
"evobib.bib 18642 U+0094*\n",
"evobib.bib 18774 U+0094\n",
"evobib.bib 22670 U+0092\n",
"evobib.bib 23686 U+0094*\n",
"evobib.bib 24962 U+0094*\n",
"evobib.bib 25640 U+0094\n",
"evobib.bib 33191 U+0094\n",
"evobib.bib 34896 U+0092\n",
"evobib.bib 39165 U+0092\n",
"evobib.bib 39581 U+0094**\n",
"evobib.bib 40401 U+008C\n",
"evobib.bib 40599 U+0092\n",
"evobib.bib 41969 U+0094\n",
"evobib.bib 43554 U+0094\n",
"evobib.bib 43642 U+0094*\n",
"evobib.bib 46546 U+0094\n",
"evobib.bib 47101 U+0092\n",
"evobib.bib 47387 U+0092\n",
"evobib.bib 47481 U+0092\n",
"ldh.bib 12171 U+0092\n",
"ldh.bib 12817 U+0093, U+0094\n",
"ldh.bib 13861 U+0096\n",
"sil16.bib 75677 U+0080\n",
"sil16.bib 85046 U+0080\n",
"sil16.bib 124097 U+0090\n",
"sil16.bib 129034 U+0080*\n",
"sil16.bib 137779 U+0090\n",
"sil16.bib 143544 U+0080**\n",
"sil16.bib 146284 U+0080*\n"
]
}
],
"source": [
"total = collections.Counter()\n",
"\n",
"for path in bibfiles:\n",
"    total += count_control_chars(path)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "749dc209-c154-40ff-a9f1-e9ecacbe8900",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('U+0092', 100),\n",
" ('U+0096', 70),\n",
" ('U+0094', 68),\n",
" ('U+0093', 42),\n",
" ('U+009A', 30),\n",
" ('U+0091', 26),\n",
" ('U+0084', 26),\n",
" ('U+0097', 17),\n",
" ('U+0080', 9),\n",
" ('U+008A', 2),\n",
" ('U+0088', 2),\n",
" ('U+0090', 2),\n",
" ('U+0086', 1),\n",
" ('U+009E', 1),\n",
" ('U+008C', 1)]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"total.most_common()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "729d15df-e759-4a08-bf61-fe625a6b4778",
"metadata": {},
"outputs": [],
"source": [
"for path in bibfiles:\n",
"    if path.name not in editable:\n",
"        continue\n",
"\n",
"    old = path.read_text(encoding=ENCODING)\n",
"    (new, replaced) = C1_CONTROL_CHAR.subn('', old)\n",
"\n",
"    if replaced:\n",
"        print(f'{path.name} replaced {replaced} characters')\n",
"        path.write_text(new, encoding=ENCODING)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.14.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment