Last active
October 25, 2025 12:41
-
-
Save xflr6/888d878abbae5298f25b939774e3382b to your computer and use it in GitHub Desktop.
Detect C1 control characters (U+0080-U+009F) in Glottolog bibfiles and drop them from the editable ones
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "7ac9103f-50a3-4b07-b5ce-27f024588879", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "{'anla.bib',\n", | |
| " 'asjp2010.bib',\n", | |
| " 'bahasa.bib',\n", | |
| " 'bibliolux.bib',\n", | |
| " 'bowern.bib',\n", | |
| " 'eballiso2009.bib',\n", | |
| " 'fabreall2009ann.bib',\n", | |
| " 'gilbertese.bib',\n", | |
| " 'gj.bib',\n", | |
| " 'goba.bib',\n", | |
| " 'guldemann.bib',\n", | |
| " 'haspelmath.bib',\n", | |
| " 'hedvig-tirailleur.bib',\n", | |
| " 'lapolla-tibeto-burman.bib',\n", | |
| " 'lewinmanx.bib',\n", | |
| " 'ludger-paschen-germanic.bib',\n", | |
| " 'marctang.bib',\n", | |
| " 'mpieva.bib',\n", | |
| " 'otomanguean.bib',\n", | |
| " 'ozbib.bib',\n", | |
| " 'phoible.bib',\n", | |
| " 'sala.bib',\n", | |
| " 'schikowski_chintang.bib',\n", | |
| " 'seifart.bib',\n", | |
| " 'silpng.bib',\n", | |
| " 'sn.bib',\n", | |
| " 'stampe.bib',\n", | |
| " 'weball.bib',\n", | |
| " 'zorcpapers.bib',\n", | |
| " 'zurich.bib'}" | |
| ] | |
| }, | |
| "execution_count": 1, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "\"\"\"Count and strip C1 control characters (U+0080-U+009F) in Glottolog bibfiles; see https://en.wikipedia.org/wiki/Latin-1_Supplement_(Unicode_block)\"\"\"\n", | |
| "\n", | |
| "import collections\n", | |
| "import configparser\n", | |
| "import pathlib\n", | |
| "import re\n", | |
| "from typing import Self\n", | |
| "\n", | |
| "GLOTTOLOG = pathlib.Path('~/Desktop/glottolog').expanduser()\n", | |
| "\n", | |
| "REFERENCES = GLOTTOLOG / 'references'\n", | |
| "\n", | |
| "CONFIG = REFERENCES / 'BIBFILES.ini'\n", | |
| "\n", | |
| "BIBTEX = REFERENCES / 'bibtex'\n", | |
| "\n", | |
| "HH_BIB = BIBTEX / 'hh.bib'\n", | |
| "\n", | |
| "SKIP = {'degruyter.bib',\n", | |
| " 'sil16.bib'}\n", | |
| "\n", | |
| "ENCODING = 'utf-8'\n", | |
| "\n", | |
| "class Config(configparser.ConfigParser):\n", | |
| " \n", | |
| " @classmethod\n", | |
| " def from_path(cls, path: pathlib.Path, *, encoding: str = ENCODING) -> Self:\n", | |
| " inst = cls()\n", | |
| " with path.open(encoding=encoding) as f:\n", | |
| " inst.read_file(f)\n", | |
| " return inst\n", | |
| "\n", | |
| "config = Config.from_path(CONFIG)\n", | |
| "editable = {filename for filename, section in config.items()\n", | |
| " if 'enhancement of existing items possible' in section['curation'].lower()\n", | |
| " and filename not in SKIP}\n", | |
| "editable" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "09fe3cd1-02ae-4fb2-868b-1208b0bb20e6", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "Counter()" | |
| ] | |
| }, | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "C1_CONTROL_CHAR = re.compile(r'[\\u0080-\\u009f]')\n", | |
| "\n", | |
| "def count_control_chars(path: pathlib.Path, /, *,\n", | |
| " verbose: bool = True,\n", | |
| " encoding: str = ENCODING) -> collections.Counter:\n", | |
| " result = collections.Counter()\n", | |
| " with path.open(encoding=encoding) as f:\n", | |
| " for i, line in enumerate(f, start=1):\n", | |
| " if (chars := C1_CONTROL_CHAR.findall(line)):\n", | |
| " codes = collections.Counter(f'U+{ord(c):04X}' for c in chars)\n", | |
| " stats = ', '.join(codepoint + ('*' * (count - 1) if count > 1 else '')\n", | |
| " for codepoint, count in codes.most_common())\n", | |
| " if verbose: print(path.name, i, stats)\n", | |
| " result += codes\n", | |
| " return result\n", | |
| "\n", | |
| "count_control_chars(HH_BIB)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "b94b529b-e206-4960-988e-47eacdee31cd", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "43" | |
| ] | |
| }, | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "bibfiles = sorted(BIBTEX.glob('*.bib'))\n", | |
| "len(bibfiles)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "id": "30ba69d2-85e9-421d-ad4b-681d28661cba", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "benjamins.bib 30480 U+0097**, U+0092\n", | |
| "benjamins.bib 30519 U+0093, U+0094\n", | |
| "benjamins.bib 187581 U+0092\n", | |
| "benjamins.bib 187629 U+0092**, U+0091*\n", | |
| "benjamins.bib 188005 U+009A\n", | |
| "benjamins.bib 190517 U+009A\n", | |
| "benjamins.bib 210878 U+0093, U+0092, U+0094\n", | |
| "benjamins.bib 210904 U+0092\n", | |
| "benjamins.bib 210910 U+0092*, U+0093, U+0094\n", | |
| "benjamins.bib 210936 U+0097*\n", | |
| "benjamins.bib 221732 U+0092**\n", | |
| "benjamins.bib 228273 U+0092\n", | |
| "benjamins.bib 228324 U+0092\n", | |
| "benjamins.bib 245295 U+009A*\n", | |
| "benjamins.bib 245823 U+009A*\n", | |
| "benjamins.bib 246339 U+009A\n", | |
| "benjamins.bib 246609 U+009A\n", | |
| "benjamins.bib 246619 U+009A***********\n", | |
| "benjamins.bib 265615 U+0092\n", | |
| "benjamins.bib 265625 U+0092**\n", | |
| "benjamins.bib 265657 U+0096\n", | |
| "benjamins.bib 265753 U+0092*\n", | |
| "benjamins.bib 265787 U+0092*\n", | |
| "benjamins.bib 265868 U+0092*, U+0091\n", | |
| "benjamins.bib 265885 U+0096\n", | |
| "benjamins.bib 265958 U+0092\n", | |
| "benjamins.bib 265992 U+0092\n", | |
| "benjamins.bib 284135 U+009A\n", | |
| "benjamins.bib 291394 U+009A\n", | |
| "benjamins.bib 329617 U+0092\n", | |
| "benjamins.bib 329634 U+0097***, U+0091, U+0092\n", | |
| "benjamins.bib 329651 U+0092, U+0093, U+0094\n", | |
| "benjamins.bib 329667 U+0093*, U+0094*, U+0092*, U+0097*\n", | |
| "benjamins.bib 329683 U+0092**, U+0093, U+0094\n", | |
| "benjamins.bib 339883 U+0092\n", | |
| "benjamins.bib 339893 U+0093*, U+0094*\n", | |
| "benjamins.bib 339925 U+0093, U+0094\n", | |
| "benjamins.bib 455519 U+0092\n", | |
| "benjamins.bib 455568 U+0093**, U+0094**\n", | |
| "benjamins.bib 455585 U+0093, U+0094\n", | |
| "benjamins.bib 455602 U+0097*\n", | |
| "benjamins.bib 470937 U+008A\n", | |
| "benjamins.bib 471033 U+008A\n", | |
| "benjamins.bib 500076 U+0097***\n", | |
| "benjamins.bib 704803 U+0091, U+0092\n", | |
| "benjamins.bib 704813 U+0091**, U+0092**\n", | |
| "benjamins.bib 704830 U+0092***, U+0091**\n", | |
| "benjamins.bib 705181 U+0092*******, U+0093, U+0094\n", | |
| "degruyter.bib 530160 U+0091\n", | |
| "degruyter.bib 568722 U+0096\n", | |
| "degruyter.bib 568750 U+0091, U+0092\n", | |
| "degruyter.bib 569352 U+0092*, U+0091\n", | |
| "degruyter.bib 569366 U+0091, U+0092\n", | |
| "degruyter.bib 569422 U+0091, U+0092\n", | |
| "degruyter.bib 569576 U+0091, U+0092\n", | |
| "degruyter.bib 569744 U+0091, U+0092\n", | |
| "degruyter.bib 569758 U+0091, U+0092\n", | |
| "degruyter.bib 569954 U+0091, U+0092\n", | |
| "degruyter.bib 570122 U+0096\n", | |
| "degruyter.bib 570752 U+0091, U+0092\n", | |
| "degruyter.bib 578214 U+0096\n", | |
| "degruyter.bib 578508 U+0096\n", | |
| "degruyter.bib 578886 U+0096\n", | |
| "degruyter.bib 579880 U+0091, U+0092\n", | |
| "degruyter.bib 580650 U+0084, U+0094\n", | |
| "degruyter.bib 580804 U+0096\n", | |
| "degruyter.bib 581406 U+0092\n", | |
| "degruyter.bib 581700 U+0091, U+0092\n", | |
| "degruyter.bib 581770 U+0091, U+0092\n", | |
| "degruyter.bib 581784 U+0091, U+0092\n", | |
| "degruyter.bib 581882 U+0084*, U+0094*\n", | |
| "degruyter.bib 582120 U+0091, U+0092\n", | |
| "degruyter.bib 582302 U+0093, U+0094\n", | |
| "degruyter.bib 582330 U+0084, U+0093\n", | |
| "degruyter.bib 582344 U+0092, U+0084, U+0094\n", | |
| "degruyter.bib 582414 U+0094, U+0093\n", | |
| "degruyter.bib 582470 U+0084, U+0093\n", | |
| "degruyter.bib 585229 U+0096\n", | |
| "degruyter.bib 586629 U+0092*\n", | |
| "degruyter.bib 622167 U+0092*\n", | |
| "degruyter.bib 622257 U+0093, U+0094\n", | |
| "degruyter.bib 622272 U+0092\n", | |
| "degruyter.bib 622407 U+0092, U+0096\n", | |
| "degruyter.bib 622422 U+0092\n", | |
| "degruyter.bib 697231 U+0092*\n", | |
| "degruyter.bib 697246 U+0093****, U+0094****, U+0092\n", | |
| "degruyter.bib 697261 U+0092\n", | |
| "degruyter.bib 697320 U+0092**\n", | |
| "degruyter.bib 697350 U+0092*\n", | |
| "degruyter.bib 697365 U+0092\n", | |
| "degruyter.bib 713347 U+0096\n", | |
| "degruyter.bib 713515 U+0096*\n", | |
| "degruyter.bib 713809 U+0096\n", | |
| "degruyter.bib 713977 U+0096*\n", | |
| "degruyter.bib 715588 U+0096\n", | |
| "degruyter.bib 715854 U+0096\n", | |
| "degruyter.bib 716063 U+0086\n", | |
| "degruyter.bib 716120 U+0084, U+0094\n", | |
| "degruyter.bib 716470 U+0084, U+0094\n", | |
| "degruyter.bib 716568 U+0084*, U+0094*\n", | |
| "degruyter.bib 716582 U+0084, U+0094\n", | |
| "degruyter.bib 716596 U+0096\n", | |
| "degruyter.bib 716680 U+0096*\n", | |
| "degruyter.bib 717660 U+0096\n", | |
| "degruyter.bib 804023 U+0084, U+0093\n", | |
| "degruyter.bib 804039 U+0084, U+0093, U+0096\n", | |
| "degruyter.bib 804055 U+009A*, U+0084, U+0093\n", | |
| "degruyter.bib 804104 U+0096\n", | |
| "degruyter.bib 804152 U+0084, U+0093\n", | |
| "degruyter.bib 804295 U+0084, U+0093\n", | |
| "degruyter.bib 804311 U+0096\n", | |
| "degruyter.bib 804327 U+0084, U+009E, U+0093\n", | |
| "degruyter.bib 804343 U+0096\n", | |
| "degruyter.bib 804359 U+0096\n", | |
| "degruyter.bib 804375 U+0096\n", | |
| "degruyter.bib 804390 U+0084, U+0093, U+0096\n", | |
| "degruyter.bib 804447 U+0096*\n", | |
| "degruyter.bib 804533 U+0084, U+0093, U+0096\n", | |
| "degruyter.bib 804558 U+0096\n", | |
| "degruyter.bib 804879 U+0096\n", | |
| "degruyter.bib 805198 U+009A*\n", | |
| "degruyter.bib 805246 U+0096\n", | |
| "degruyter.bib 805321 U+0096\n", | |
| "degruyter.bib 805336 U+0084, U+0093, U+0096\n", | |
| "degruyter.bib 805361 U+0084**, U+0093*, U+0096, U+0094\n", | |
| "degruyter.bib 805461 U+0093, U+0094\n", | |
| "degruyter.bib 805485 U+0084, U+0093, U+0096\n", | |
| "degruyter.bib 805558 U+0096***, U+009A*\n", | |
| "degruyter.bib 805757 U+0096\n", | |
| "degruyter.bib 805882 U+0096\n", | |
| "degruyter.bib 805912 U+0084, U+0096, U+0093\n", | |
| "degruyter.bib 805927 U+0096\n", | |
| "degruyter.bib 805952 U+009A, U+0096\n", | |
| "degruyter.bib 805984 U+0096\n", | |
| "degruyter.bib 806166 U+0096**\n", | |
| "degruyter.bib 806198 U+009A\n", | |
| "degruyter.bib 806905 U+0096\n", | |
| "degruyter.bib 808589 U+0096\n", | |
| "degruyter.bib 808841 U+0096\n", | |
| "degruyter.bib 810227 U+0096*\n", | |
| "degruyter.bib 810997 U+0096\n", | |
| "degruyter.bib 811053 U+0084, U+0094\n", | |
| "degruyter.bib 845233 U+0093, U+0094\n", | |
| "degruyter.bib 845250 U+0096***\n", | |
| "degruyter.bib 845335 U+0096*\n", | |
| "degruyter.bib 845480 U+0096*\n", | |
| "degruyter.bib 845582 U+0093, U+0094\n", | |
| "degruyter.bib 845604 U+0096\n", | |
| "degruyter.bib 845715 U+0096\n", | |
| "degruyter.bib 846030 U+0096\n", | |
| "evobib.bib 4486 U+0094*\n", | |
| "evobib.bib 4691 U+0094*\n", | |
| "evobib.bib 6288 U+0092\n", | |
| "evobib.bib 7160 U+0092\n", | |
| "evobib.bib 8587 U+0094\n", | |
| "evobib.bib 10343 U+0092*\n", | |
| "evobib.bib 11496 U+0094*\n", | |
| "evobib.bib 14514 U+0092\n", | |
| "evobib.bib 15045 U+0092\n", | |
| "evobib.bib 15281 U+0094***\n", | |
| "evobib.bib 15293 U+0094*\n", | |
| "evobib.bib 16156 U+0088*\n", | |
| "evobib.bib 18642 U+0094*\n", | |
| "evobib.bib 18774 U+0094\n", | |
| "evobib.bib 22670 U+0092\n", | |
| "evobib.bib 23686 U+0094*\n", | |
| "evobib.bib 24962 U+0094*\n", | |
| "evobib.bib 25640 U+0094\n", | |
| "evobib.bib 33191 U+0094\n", | |
| "evobib.bib 34896 U+0092\n", | |
| "evobib.bib 39165 U+0092\n", | |
| "evobib.bib 39581 U+0094**\n", | |
| "evobib.bib 40401 U+008C\n", | |
| "evobib.bib 40599 U+0092\n", | |
| "evobib.bib 41969 U+0094\n", | |
| "evobib.bib 43554 U+0094\n", | |
| "evobib.bib 43642 U+0094*\n", | |
| "evobib.bib 46546 U+0094\n", | |
| "evobib.bib 47101 U+0092\n", | |
| "evobib.bib 47387 U+0092\n", | |
| "evobib.bib 47481 U+0092\n", | |
| "ldh.bib 12171 U+0092\n", | |
| "ldh.bib 12817 U+0093, U+0094\n", | |
| "ldh.bib 13861 U+0096\n", | |
| "sil16.bib 75677 U+0080\n", | |
| "sil16.bib 85046 U+0080\n", | |
| "sil16.bib 124097 U+0090\n", | |
| "sil16.bib 129034 U+0080*\n", | |
| "sil16.bib 137779 U+0090\n", | |
| "sil16.bib 143544 U+0080**\n", | |
| "sil16.bib 146284 U+0080*\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "total = collections.Counter()\n", | |
| "\n", | |
| "for path in bibfiles:\n", | |
| " total += count_control_chars(path)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "id": "749dc209-c154-40ff-a9f1-e9ecacbe8900", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[('U+0092', 100),\n", | |
| " ('U+0096', 70),\n", | |
| " ('U+0094', 68),\n", | |
| " ('U+0093', 42),\n", | |
| " ('U+009A', 30),\n", | |
| " ('U+0091', 26),\n", | |
| " ('U+0084', 26),\n", | |
| " ('U+0097', 17),\n", | |
| " ('U+0080', 9),\n", | |
| " ('U+008A', 2),\n", | |
| " ('U+0088', 2),\n", | |
| " ('U+0090', 2),\n", | |
| " ('U+0086', 1),\n", | |
| " ('U+009E', 1),\n", | |
| " ('U+008C', 1)]" | |
| ] | |
| }, | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "total.most_common()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "id": "729d15df-e759-4a08-bf61-fe625a6b4778", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "for path in bibfiles:\n", | |
| " if path.name not in editable:\n", | |
| " continue\n", | |
| "\n", | |
| " old = path.read_text(encoding=ENCODING)\n", | |
| " (new, replaced) = C1_CONTROL_CHAR.subn('', old)\n", | |
| "\n", | |
| " if replaced:\n", | |
| " print(f'{path.name} replaced {replaced} characters')\n", | |
| " path.write_text(new, encoding=ENCODING)" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.14.0" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment