Skip to content

Instantly share code, notes, and snippets.

@xflr6
Last active October 25, 2025 12:44
Show Gist options
  • Save xflr6/8d5b4d71e6464f21608bafc0675294a9 to your computer and use it in GitHub Desktop.
Save xflr6/8d5b4d71e6464f21608bafc0675294a9 to your computer and use it in GitHub Desktop.
Check Glottolog -> Wikidata mapping
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "599cf58f-bdd6-4da7-a81c-22b4321a8d58",
"metadata": {},
"outputs": [],
"source": [
"import functools\n",
"import logging\n",
"import pathlib\n",
"\n",
"from IPython.display import display\n",
"import pandas as pd\n",
"import rdflib\n",
"\n",
"# Log as '[LEVEL@logger] message' so each message can be attributed to its logger.\n",
"logging.basicConfig(format='[%(levelname)s@%(name)s] %(message)s', level=logging.INFO)\n",
"# INFO on 'sqlalchemy.engine' presumably echoes the SQL that pandas sends (see the stderr of the cells below).\n",
"logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "0a0f7ae6-3d8e-493f-8dec-ee8700559665",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[INFO@sqlalchemy.engine.Engine] select pg_catalog.version()\n",
"[INFO@sqlalchemy.engine.Engine] [raw sql] {}\n",
"[INFO@sqlalchemy.engine.Engine] select current_schema()\n",
"[INFO@sqlalchemy.engine.Engine] [raw sql] {}\n",
"[INFO@sqlalchemy.engine.Engine] show standard_conforming_strings\n",
"[INFO@sqlalchemy.engine.Engine] [raw sql] {}\n",
"[INFO@sqlalchemy.engine.Engine] BEGIN (implicit)\n",
"[INFO@sqlalchemy.engine.Engine] SELECT id, name, created FROM dataset\n",
"[INFO@sqlalchemy.engine.Engine] [raw sql] {}\n",
"[INFO@sqlalchemy.engine.Engine] ROLLBACK\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>created</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>glottolog</th>\n",
" <td>Glottolog 5.2</td>\n",
" <td>2025-05-27 09:27:58.292501+00:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name created\n",
"id \n",
"glottolog Glottolog 5.2 2025-05-27 09:27:58.292501+00:00"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Database URL handed to pandas as ``con`` for every Glottolog query.\n",
"ENGINE = 'postgresql://postgres@/glottolog3'\n",
"\n",
"# pd.read_sql_query() with the connection argument pre-bound.\n",
"read_glottolog = functools.partial(pd.read_sql_query, con=ENGINE)\n",
"\n",
"# Show which Glottolog release the database contains.\n",
"read_glottolog('SELECT id, name, created FROM dataset', index_col='id')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "fad09ef2-fad6-470a-b17c-e67f5f64ff7c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[INFO@sqlalchemy.engine.Engine] select pg_catalog.version()\n",
"[INFO@sqlalchemy.engine.Engine] [raw sql] {}\n",
"[INFO@sqlalchemy.engine.Engine] select current_schema()\n",
"[INFO@sqlalchemy.engine.Engine] [raw sql] {}\n",
"[INFO@sqlalchemy.engine.Engine] show standard_conforming_strings\n",
"[INFO@sqlalchemy.engine.Engine] [raw sql] {}\n",
"[INFO@sqlalchemy.engine.Engine] BEGIN (implicit)\n",
"[INFO@sqlalchemy.engine.Engine] SELECT\n",
" l.id AS glottocode,\n",
" l.name,\n",
" ll.level,\n",
" ll.category,\n",
" SUBSTRING(url.wikidata FROM '/([^/]+)$') AS qid,\n",
" SUBSTRING(url.wikipedia FROM '/([^/]+)$') AS title,\n",
" i.name AS iso639_3\n",
"FROM language AS l\n",
"JOIN languoid AS ll USING (pk)\n",
"JOIN JSON_TABLE(l.jsondata, '$.links' COLUMNS (\n",
" wikidata TEXT PATH '$[*].url ? (@ starts with \"https://www.wikidata.org/entity/\")' ERROR ON ERROR,\n",
" wikipedia TEXT PATH '$[*].url ? (@ starts with \"https://en.wikipedia.org/wiki/\")' ERROR ON ERROR)) AS url\n",
" ON url.wikidata IS NOT NULL\n",
"LEFT JOIN (\n",
" languageidentifier AS li\n",
" JOIN identifier AS i\n",
" ON li.identifier_pk = i.pk AND i.type = 'iso639-3'\n",
") ON li.language_pk = l.pk\n",
"ORDER BY l.id\n",
"[INFO@sqlalchemy.engine.Engine] [raw sql] {}\n",
"[INFO@sqlalchemy.engine.Engine] ROLLBACK\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 11458 entries, aant1238 to zyph1238\n",
"Data columns (total 6 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 name 11458 non-null string \n",
" 1 level 11458 non-null category\n",
" 2 category 11458 non-null category\n",
" 3 qid 11458 non-null string \n",
" 4 title 9126 non-null string \n",
" 5 iso639_3 7971 non-null string \n",
"dtypes: category(2), string(4)\n",
"memory usage: 3.2 MB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>level</th>\n",
" <th>category</th>\n",
" <th>qid</th>\n",
" <th>title</th>\n",
" <th>iso639_3</th>\n",
" </tr>\n",
" <tr>\n",
" <th>glottocode</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>aant1238</th>\n",
" <td>Aantantara</td>\n",
" <td>dialect</td>\n",
" <td>Dialect</td>\n",
" <td>Q31312216</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aari1238</th>\n",
" <td>Aari-Gayil</td>\n",
" <td>family</td>\n",
" <td>Family</td>\n",
" <td>Q85516014</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>aiz</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aari1239</th>\n",
" <td>Aari</td>\n",
" <td>language</td>\n",
" <td>Spoken L1 Language</td>\n",
" <td>Q7495</td>\n",
" <td>Aari_language</td>\n",
" <td>aiw</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aari1240</th>\n",
" <td>Aariya</td>\n",
" <td>language</td>\n",
" <td>Bookkeeping</td>\n",
" <td>Q4661732</td>\n",
" <td>Aariya_language</td>\n",
" <td>aay</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aasa1238</th>\n",
" <td>Aasax</td>\n",
" <td>language</td>\n",
" <td>Spoken L1 Language</td>\n",
" <td>Q56620</td>\n",
" <td>Asa_language</td>\n",
" <td>aas</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name level category qid \\\n",
"glottocode \n",
"aant1238 Aantantara dialect Dialect Q31312216 \n",
"aari1238 Aari-Gayil family Family Q85516014 \n",
"aari1239 Aari language Spoken L1 Language Q7495 \n",
"aari1240 Aariya language Bookkeeping Q4661732 \n",
"aasa1238 Aasax language Spoken L1 Language Q56620 \n",
"\n",
" title iso639_3 \n",
"glottocode \n",
"aant1238 <NA> <NA> \n",
"aari1238 <NA> aiz \n",
"aari1239 Aari_language aiw \n",
"aari1240 Aariya_language aay \n",
"aasa1238 Asa_language aas "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One row per languoid that has a Wikidata link in its jsondata 'links':\n",
"# qid and title are the last path segment of the Wikidata / English Wikipedia URL,\n",
"# iso639_3 is left-joined and hence missing for languoids without such an identifier.\n",
"# NOTE(review): JSON_TABLE presumably requires a recent PostgreSQL (17+) - confirm.\n",
"GLOTTOLOG_QUERY = '''\n",
"SELECT\n",
" l.id AS glottocode,\n",
" l.name,\n",
" ll.level,\n",
" ll.category,\n",
" SUBSTRING(url.wikidata FROM '/([^/]+)$') AS qid,\n",
" SUBSTRING(url.wikipedia FROM '/([^/]+)$') AS title,\n",
" i.name AS iso639_3\n",
"FROM language AS l\n",
"JOIN languoid AS ll USING (pk)\n",
"JOIN JSON_TABLE(l.jsondata, '$.links' COLUMNS (\n",
" wikidata TEXT PATH '$[*].url ? (@ starts with \"https://www.wikidata.org/entity/\")' ERROR ON ERROR,\n",
" wikipedia TEXT PATH '$[*].url ? (@ starts with \"https://en.wikipedia.org/wiki/\")' ERROR ON ERROR)) AS url\n",
" ON url.wikidata IS NOT NULL\n",
"LEFT JOIN (\n",
" languageidentifier AS li\n",
" JOIN identifier AS i\n",
" ON li.identifier_pk = i.pk AND i.type = 'iso639-3'\n",
") ON li.language_pk = l.pk\n",
"ORDER BY l.id\n",
"'''.strip()\n",
"\n",
"# Nullable strings for free text, categoricals for the two low-cardinality columns.\n",
"gf = read_glottolog(\n",
"    GLOTTOLOG_QUERY,\n",
"    index_col='glottocode',\n",
"    dtype=dict(glottocode='string',\n",
"               name='string',\n",
"               level='category',\n",
"               category='category',\n",
"               qid='string',\n",
"               title='string',\n",
"               iso639_3='string'),\n",
")\n",
"\n",
"gf.info(memory_usage='deep')\n",
"# Exactly one row per glottocode, in the order requested by ORDER BY.\n",
"assert gf.index.is_unique\n",
"assert gf.index.is_monotonic_increasing\n",
"gf.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "09bc107b-e4f4-4180-a3a1-6308ae308b77",
"metadata": {},
"outputs": [],
"source": [
"def iterrows(query, *, endpoint: str,\n",
"             identifier: str | None = None,\n",
"             limit: int | None = None,\n",
"             verbose: bool = False,\n",
"             per_request: int = 100_000):\n",
"    \"\"\"Yield the variable names, then the rows of a SPARQL SELECT query fetched in pages.\n",
"\n",
"    Each request sends ``query`` with ``OFFSET``/``LIMIT`` lines appended, so ``query``\n",
"    must not end with such clauses itself.\n",
"\n",
"    Args:\n",
"        query: SPARQL SELECT query text.\n",
"        endpoint: URL of the SPARQL endpoint that the graph is opened on.\n",
"        identifier: Identifier passed to ``rdflib.Graph``.\n",
"        limit: Maximal total number of rows to yield (``None``: no limit).\n",
"        verbose: Accepted for backward compatibility, currently without effect.\n",
"        per_request: Maximal number of rows requested per page.\n",
"\n",
"    Yields:\n",
"        The list of variable names (``str``) first, then one list of Python values\n",
"        per result row, with ``None`` for unbound variables.\n",
"\n",
"    Raises:\n",
"        ValueError: If ``per_request`` is not positive or ``limit`` is negative\n",
"            (raised on first iteration, as this is a generator).\n",
"    \"\"\"\n",
"    # A non-positive page size could never make progress (endless loop before).\n",
"    if per_request < 1:\n",
"        raise ValueError(f'per_request must be positive: {per_request!r}')\n",
"    if limit is not None and limit < 0:\n",
"        raise ValueError(f'limit must be non-negative: {limit!r}')\n",
"\n",
"    logging.info('endpoint: %r', endpoint)\n",
"    graph = rdflib.Graph('SPARQLStore', identifier=identifier, bind_namespaces='none')\n",
"    graph.open(endpoint)\n",
"    logging.info('graph: %s', graph)\n",
"    logging.debug('namespaces: %r', list(graph.namespaces()))\n",
"\n",
"    offset = 0\n",
"    # Always send at least one request, so the header is yielded even for limit=0.\n",
"    while True:\n",
"        if limit is None:\n",
"            request_limit = per_request\n",
"        else:\n",
"            request_limit = min(limit - offset, per_request)\n",
"        request_query = (f'{query}\\n'\n",
"                         f'OFFSET {offset:d}\\n'\n",
"                         f'LIMIT {request_limit:d}')\n",
"        logging.info(\"graph.query('''\\n%s\\n''')\", request_query)\n",
"        result = graph.query(request_query)\n",
"        if not offset:\n",
"            # Header row: only once, taken from the first page.\n",
"            yield list(map(str, result.vars))\n",
"        n = 0  # stays 0 if the page is empty\n",
"        for n, row in enumerate(result, start=1):\n",
"            yield [v.toPython() if v is not None else None for v in row]\n",
"        offset += n\n",
"        # A short page means the result is exhausted; otherwise stop once limit is reached.\n",
"        if n < request_limit or (limit is not None and offset >= limit):\n",
"            return\n",
"\n",
"\n",
"def read_sparql_query(query, /, *, endpoint: str,\n",
"                      identifier: str | None = None,\n",
"                      limit: int | None = None, **kwargs) -> pd.DataFrame:\n",
"    \"\"\"Run a SPARQL SELECT query via iterrows() and return the result as a DataFrame.\n",
"\n",
"    Remaining keyword arguments are passed on to ``pd.DataFrame.from_records()``.\n",
"    \"\"\"\n",
"    records = iterrows(query, endpoint=endpoint, identifier=identifier, limit=limit)\n",
"    # The first item is the list of variable names, the remaining ones are the rows.\n",
"    header = next(records)\n",
"    return pd.DataFrame.from_records(records, columns=header, **kwargs)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "52d240a3-f66b-4e27-9374-902ea159f82c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n",
"[INFO@root] graph: <http://www.wikidata.org> a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'SPARQLStore'].\n",
"[INFO@root] graph.query('''\n",
"SELECT\n",
" ?glottocode\n",
" (strafter(str(?languoid), str(wd:)) AS ?qid)\n",
" (?languoidLabel AS ?name)\n",
" (strafter(str(?siteLink), \"https://en.wikipedia.org/wiki/\") AS ?title)\n",
"WHERE {\n",
" ?languoid wdt:P1394 ?glottocode.\n",
" FILTER (REGEX(?glottocode, \"^[a-z0-9]{4}[0-9]{4}$\")).\n",
" OPTIONAL {\n",
" ?siteLink schema:about ?languoid;\n",
" schema:inLanguage \"en\";\n",
" schema:isPartOf <https://en.wikipedia.org/>.\n",
" }\n",
" SERVICE wikibase:label {\n",
" bd:serviceParam wikibase:language \"en\".\n",
" ?languoid rdfs:label ?languoidLabel.\n",
" }\n",
"}\n",
"ORDER BY\n",
" ?glottocode\n",
" xsd:integer(strafter(str(?languoid), str(wd:Q)))\n",
"OFFSET 0\n",
"LIMIT 100000\n",
"''')\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 11797 entries, aant1238 to zyph1238\n",
"Data columns (total 3 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 qid 11797 non-null string\n",
" 1 name 11797 non-null string\n",
" 2 title 9385 non-null string\n",
"dtypes: string(3)\n",
"memory usage: 2.7 MB\n",
"CPU times: total: 594 ms\n",
"Wall time: 5.9 s\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>qid</th>\n",
" <th>name</th>\n",
" <th>title</th>\n",
" </tr>\n",
" <tr>\n",
" <th>glottocode</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>aant1238</th>\n",
" <td>Q31312216</td>\n",
" <td>Aantantara</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aari1238</th>\n",
" <td>Q85516014</td>\n",
" <td>Aari-Gayil</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aari1239</th>\n",
" <td>Q7495</td>\n",
" <td>Aari</td>\n",
" <td>Aari_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aari1240</th>\n",
" <td>Q4661732</td>\n",
" <td>Aariya</td>\n",
" <td>Aariya_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aasa1238</th>\n",
" <td>Q56620</td>\n",
" <td>Asa</td>\n",
" <td>Asa_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aata1238</th>\n",
" <td>Q31314288</td>\n",
" <td>Aatasaara</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>abaa1238</th>\n",
" <td>Q31363054</td>\n",
" <td>Aba dialect</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>abab1239</th>\n",
" <td>Q17379636</td>\n",
" <td>Ababda</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>abab1240</th>\n",
" <td>Q4931250</td>\n",
" <td>Boan</td>\n",
" <td>Boan_languages</td>\n",
" </tr>\n",
" <tr>\n",
" <th>abad1240</th>\n",
" <td>Q20644975</td>\n",
" <td>Abzakh Adyghe</td>\n",
" <td>Abzakh_dialect</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" qid name title\n",
"glottocode \n",
"aant1238 Q31312216 Aantantara <NA>\n",
"aari1238 Q85516014 Aari-Gayil <NA>\n",
"aari1239 Q7495 Aari Aari_language\n",
"aari1240 Q4661732 Aariya Aariya_language\n",
"aasa1238 Q56620 Asa Asa_language\n",
"aata1238 Q31314288 Aatasaara <NA>\n",
"abaa1238 Q31363054 Aba dialect <NA>\n",
"abab1239 Q17379636 Ababda <NA>\n",
"abab1240 Q4931250 Boan Boan_languages\n",
"abad1240 Q20644975 Abzakh Adyghe Abzakh_dialect"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"\n",
"ENDPOINT = 'https://query.wikidata.org/sparql' # https://en.wikibooks.org/wiki/SPARQL/Prefixes\n",
"\n",
"# One row per (item, P1394 value) where the value is shaped like a glottocode,\n",
"# with the English label of the item and its English Wikipedia sitelink, if any.\n",
"# Ordered by glottocode, then by numeric QID; the query has no OFFSET/LIMIT of its own.\n",
"WIKIDATA_QUERY = '''\n",
"SELECT\n",
" ?glottocode\n",
" (strafter(str(?languoid), str(wd:)) AS ?qid)\n",
" (?languoidLabel AS ?name)\n",
" (strafter(str(?siteLink), \"https://en.wikipedia.org/wiki/\") AS ?title)\n",
"WHERE {\n",
" ?languoid wdt:P1394 ?glottocode.\n",
" FILTER (REGEX(?glottocode, \"^[a-z0-9]{4}[0-9]{4}$\")).\n",
" OPTIONAL {\n",
" ?siteLink schema:about ?languoid;\n",
" schema:inLanguage \"en\";\n",
" schema:isPartOf <https://en.wikipedia.org/>.\n",
" }\n",
" SERVICE wikibase:label {\n",
" bd:serviceParam wikibase:language \"en\".\n",
" ?languoid rdfs:label ?languoidLabel.\n",
" }\n",
"}\n",
"ORDER BY\n",
" ?glottocode\n",
" xsd:integer(strafter(str(?languoid), str(wd:Q)))\n",
"'''.strip()\n",
"\n",
"# Default location of the cached query result (relative to the working directory).\n",
"CSV_CACHE_PATH = pathlib.Path('wikidata.csv')\n",
"\n",
"\n",
"def read_wikidata(sparql: str, /, *, endpoint: str = ENDPOINT,\n",
"                  identifier: str | None = 'http://www.wikidata.org',\n",
"                  encoding: str = 'utf-8',\n",
"                  dtype='string',\n",
"                  cache_path: pathlib.Path = CSV_CACHE_PATH) -> pd.DataFrame:\n",
"    \"\"\"Return the result of the query indexed by glottocode, cached in a CSV file.\n",
"\n",
"    The endpoint is only queried if ``cache_path`` does not exist yet. The cache is\n",
"    not keyed on ``sparql``: delete the file after changing the query.\n",
"\n",
"    Args:\n",
"        sparql: SPARQL SELECT query with a ``glottocode`` variable.\n",
"        endpoint: URL of the SPARQL endpoint.\n",
"        identifier: Graph identifier passed on to read_sparql_query().\n",
"        encoding: Text encoding of the CSV cache file.\n",
"        dtype: Dtype that all columns are converted to.\n",
"        cache_path: Location of the CSV cache file.\n",
"\n",
"    Returns:\n",
"        The frame as loaded from the CSV cache, with ``glottocode`` as index.\n",
"    \"\"\"\n",
"    if not cache_path.exists():\n",
"        fetched = read_sparql_query(sparql, endpoint=endpoint, identifier=identifier,\n",
"                                    index='glottocode').astype(dtype)\n",
"        fetched.to_csv(cache_path, encoding=encoding)\n",
"    # Always load from the CSV, so a fresh fetch and a cache hit return the same frame.\n",
"    # Only the empty field counts as missing: with the default NA markers, labels or\n",
"    # titles spelled like 'NA', 'None', 'null' or 'nan' would silently become missing.\n",
"    return pd.read_csv(cache_path, index_col='glottocode', encoding=encoding,\n",
"                       keep_default_na=False, na_values=['']).astype(dtype)\n",
"\n",
"\n",
"wf = read_wikidata(WIKIDATA_QUERY)\n",
"\n",
"wf.info(memory_usage='deep')\n",
"# No uniqueness assert here: the same glottocode can occur on several Wikidata items.\n",
"assert wf.index.is_monotonic_increasing\n",
"wf.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "bad4f455-99ae-46f2-ab20-af765dc20a4f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>qid</th>\n",
" <th>name</th>\n",
" <th>title</th>\n",
" </tr>\n",
" <tr>\n",
" <th>glottocode</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>ache1244</th>\n",
" <td>Q383701</td>\n",
" <td>Aché</td>\n",
" <td>Ach%C3%A9_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ache1244</th>\n",
" <td>Q10949828</td>\n",
" <td>Ache Yi</td>\n",
" <td>Ache_Yi_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ainu1252</th>\n",
" <td>Q27969</td>\n",
" <td>Ainu</td>\n",
" <td>Ainu_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ainu1252</th>\n",
" <td>Q50111972</td>\n",
" <td>Ainu</td>\n",
" <td>Ainu_languages</td>\n",
" </tr>\n",
" <tr>\n",
" <th>amap1241</th>\n",
" <td>Q2523999</td>\n",
" <td>Karipúna Creole French</td>\n",
" <td>Karip%C3%BAna_French_Creole</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zeme1240</th>\n",
" <td>Q21491053</td>\n",
" <td>Zeme Naga</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zena1250</th>\n",
" <td>Q2293952</td>\n",
" <td>Zenati languages</td>\n",
" <td>Zenati_languages</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zena1250</th>\n",
" <td>Q2741732</td>\n",
" <td>Northern Berber languages</td>\n",
" <td>Northern_Berber_languages</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zhan1240</th>\n",
" <td>Q6674568</td>\n",
" <td>Longyan dialect</td>\n",
" <td>Longyan_dialect</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zhan1240</th>\n",
" <td>Q15937822</td>\n",
" <td>Zhangping dialect</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>168 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" qid name title\n",
"glottocode \n",
"ache1244 Q383701 Aché Ach%C3%A9_language\n",
"ache1244 Q10949828 Ache Yi Ache_Yi_language\n",
"ainu1252 Q27969 Ainu Ainu_language\n",
"ainu1252 Q50111972 Ainu Ainu_languages\n",
"amap1241 Q2523999 Karipúna Creole French Karip%C3%BAna_French_Creole\n",
"... ... ... ...\n",
"zeme1240 Q21491053 Zeme Naga <NA>\n",
"zena1250 Q2293952 Zenati languages Zenati_languages\n",
"zena1250 Q2741732 Northern Berber languages Northern_Berber_languages\n",
"zhan1240 Q6674568 Longyan dialect Longyan_dialect\n",
"zhan1240 Q15937822 Zhangping dialect <NA>\n",
"\n",
"[168 rows x 3 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# All rows whose glottocode occurs on more than one Wikidata item.\n",
"wf.loc[wf.index.duplicated(keep=False)]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "bd95e9b4-7f08-4f71-846b-feba88170dfa",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 11543 entries, aant1238 to zyph1238\n",
"Data columns (total 6 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 name 11543 non-null string \n",
" 1 level 11543 non-null category\n",
" 2 qid 11543 non-null string \n",
" 3 title 9164 non-null string \n",
" 4 qid_wd 11540 non-null string \n",
" 5 title_wd 9228 non-null string \n",
"dtypes: category(1), string(5)\n",
"memory usage: 4.0 MB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>level</th>\n",
" <th>qid</th>\n",
" <th>title</th>\n",
" <th>qid_wd</th>\n",
" <th>title_wd</th>\n",
" </tr>\n",
" <tr>\n",
" <th>glottocode</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>aant1238</th>\n",
" <td>Aantantara</td>\n",
" <td>dialect</td>\n",
" <td>Q31312216</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Q31312216</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aari1238</th>\n",
" <td>Aari-Gayil</td>\n",
" <td>family</td>\n",
" <td>Q85516014</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Q85516014</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aari1239</th>\n",
" <td>Aari</td>\n",
" <td>language</td>\n",
" <td>Q7495</td>\n",
" <td>Aari_language</td>\n",
" <td>Q7495</td>\n",
" <td>Aari_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aari1240</th>\n",
" <td>Aariya</td>\n",
" <td>language</td>\n",
" <td>Q4661732</td>\n",
" <td>Aariya_language</td>\n",
" <td>Q4661732</td>\n",
" <td>Aariya_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aasa1238</th>\n",
" <td>Aasax</td>\n",
" <td>language</td>\n",
" <td>Q56620</td>\n",
" <td>Asa_language</td>\n",
" <td>Q56620</td>\n",
" <td>Asa_language</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name level qid title qid_wd \\\n",
"glottocode \n",
"aant1238 Aantantara dialect Q31312216 <NA> Q31312216 \n",
"aari1238 Aari-Gayil family Q85516014 <NA> Q85516014 \n",
"aari1239 Aari language Q7495 Aari_language Q7495 \n",
"aari1240 Aariya language Q4661732 Aariya_language Q4661732 \n",
"aasa1238 Aasax language Q56620 Asa_language Q56620 \n",
"\n",
" title_wd \n",
"glottocode \n",
"aant1238 <NA> \n",
"aari1238 <NA> \n",
"aari1239 Aari_language \n",
"aari1240 Aariya_language \n",
"aasa1238 Asa_language "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Left join on the glottocode: overlapping Wikidata columns get the '_wd' suffix.\n",
"# A glottocode with several Wikidata items yields one row per item.\n",
"df = (gf.join(wf, on='glottocode', rsuffix='_wd')\n",
"      .drop(columns=['category', 'name_wd', 'iso639_3']))\n",
"\n",
"df.info(memory_usage='deep')\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "a5261e90-de74-4b54-8b29-38c69e86ddf4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>level</th>\n",
" <th>qid</th>\n",
" <th>title</th>\n",
" <th>qid_wd</th>\n",
" <th>title_wd</th>\n",
" </tr>\n",
" <tr>\n",
" <th>glottocode</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>ache1244</th>\n",
" <td>Ache</td>\n",
" <td>language</td>\n",
" <td>Q10949828</td>\n",
" <td>Ache_Yi_language</td>\n",
" <td>Q383701</td>\n",
" <td>Ach%C3%A9_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ache1244</th>\n",
" <td>Ache</td>\n",
" <td>language</td>\n",
" <td>Q10949828</td>\n",
" <td>Ache_Yi_language</td>\n",
" <td>Q10949828</td>\n",
" <td>Ache_Yi_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ainu1252</th>\n",
" <td>Ainu</td>\n",
" <td>family</td>\n",
" <td>Q27969</td>\n",
" <td>Ainu_language</td>\n",
" <td>Q27969</td>\n",
" <td>Ainu_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ainu1252</th>\n",
" <td>Ainu</td>\n",
" <td>family</td>\n",
" <td>Q27969</td>\n",
" <td>Ainu_language</td>\n",
" <td>Q50111972</td>\n",
" <td>Ainu_languages</td>\n",
" </tr>\n",
" <tr>\n",
" <th>amap1241</th>\n",
" <td>Amapá Creole</td>\n",
" <td>language</td>\n",
" <td>Q12626810</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Q2523999</td>\n",
" <td>Karip%C3%BAna_French_Creole</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zeme1240</th>\n",
" <td>Zeme Naga</td>\n",
" <td>language</td>\n",
" <td>Q21491053</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Q21491053</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zena1250</th>\n",
" <td>Zenatic</td>\n",
" <td>family</td>\n",
" <td>Q2741732</td>\n",
" <td>Northern_Berber_languages</td>\n",
" <td>Q2293952</td>\n",
" <td>Zenati_languages</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zena1250</th>\n",
" <td>Zenatic</td>\n",
" <td>family</td>\n",
" <td>Q2741732</td>\n",
" <td>Northern_Berber_languages</td>\n",
" <td>Q2741732</td>\n",
" <td>Northern_Berber_languages</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zhan1240</th>\n",
" <td>Zhangping-Longyan</td>\n",
" <td>dialect</td>\n",
" <td>Q15937822</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Q6674568</td>\n",
" <td>Longyan_dialect</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zhan1240</th>\n",
" <td>Zhangping-Longyan</td>\n",
" <td>dialect</td>\n",
" <td>Q15937822</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Q15937822</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>168 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" name level qid title \\\n",
"glottocode \n",
"ache1244 Ache language Q10949828 Ache_Yi_language \n",
"ache1244 Ache language Q10949828 Ache_Yi_language \n",
"ainu1252 Ainu family Q27969 Ainu_language \n",
"ainu1252 Ainu family Q27969 Ainu_language \n",
"amap1241 Amapá Creole language Q12626810 <NA> \n",
"... ... ... ... ... \n",
"zeme1240 Zeme Naga language Q21491053 <NA> \n",
"zena1250 Zenatic family Q2741732 Northern_Berber_languages \n",
"zena1250 Zenatic family Q2741732 Northern_Berber_languages \n",
"zhan1240 Zhangping-Longyan dialect Q15937822 <NA> \n",
"zhan1240 Zhangping-Longyan dialect Q15937822 <NA> \n",
"\n",
" qid_wd title_wd \n",
"glottocode \n",
"ache1244 Q383701 Ach%C3%A9_language \n",
"ache1244 Q10949828 Ache_Yi_language \n",
"ainu1252 Q27969 Ainu_language \n",
"ainu1252 Q50111972 Ainu_languages \n",
"amap1241 Q2523999 Karip%C3%BAna_French_Creole \n",
"... ... ... \n",
"zeme1240 Q21491053 <NA> \n",
"zena1250 Q2293952 Zenati_languages \n",
"zena1250 Q2741732 Northern_Berber_languages \n",
"zhan1240 Q6674568 Longyan_dialect \n",
"zhan1240 Q15937822 <NA> \n",
"\n",
"[168 rows x 6 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Joined rows of the glottocodes that matched more than one Wikidata item.\n",
"df.loc[df.index.duplicated(keep=False)]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "ad1703bd-0bc1-404c-9a03-24f3e316dcc4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 94 entries, ache1244 to zhan1240\n",
"Data columns (total 6 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 name 94 non-null string \n",
" 1 level 94 non-null category\n",
" 2 title 44 non-null string \n",
" 3 title_wd 85 non-null string \n",
" 4 qid 94 non-null string \n",
" 5 qid_wd 94 non-null string \n",
"dtypes: category(1), string(5)\n",
"memory usage: 33.5 KB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>level</th>\n",
" <th>title</th>\n",
" <th>title_wd</th>\n",
" <th>qid</th>\n",
" <th>qid_wd</th>\n",
" </tr>\n",
" <tr>\n",
" <th>glottocode</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>mala1544</th>\n",
" <td>Malabar-Sri Lanka Portuguese</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Sri_Lankan_Portuguese_creole</td>\n",
" <td>Q131549994</td>\n",
" <td>Q3537122</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mala1544</th>\n",
" <td>Malabar-Sri Lanka Portuguese</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Cannanore_Portuguese_Creole</td>\n",
" <td>Q131549994</td>\n",
" <td>Q14623791</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mose1249</th>\n",
" <td>Mosetén-Chimané</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Chimane_language</td>\n",
" <td>Q25395221</td>\n",
" <td>Q35950</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mose1249</th>\n",
" <td>Mosetén-Chimané</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Q25395221</td>\n",
" <td>Q12637318</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name level title \\\n",
"glottocode \n",
"mala1544 Malabar-Sri Lanka Portuguese language <NA> \n",
"mala1544 Malabar-Sri Lanka Portuguese language <NA> \n",
"mose1249 Mosetén-Chimané language <NA> \n",
"mose1249 Mosetén-Chimané language <NA> \n",
"\n",
" title_wd qid qid_wd \n",
"glottocode \n",
"mala1544 Sri_Lankan_Portuguese_creole Q131549994 Q3537122 \n",
"mala1544 Cannanore_Portuguese_Creole Q131549994 Q14623791 \n",
"mose1249 Chimane_language Q25395221 Q35950 \n",
"mose1249 <NA> Q25395221 Q12637318 "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Comparing with a missing qid_wd gives NA, and an NA mask entry selects nothing:\n",
"# rows without any Wikidata match are not reported here (fillna only makes that explicit).\n",
"mismatch = df.loc[df['qid'].ne(df['qid_wd']).fillna(False),\n",
"                  ['name', 'level', 'title', 'title_wd', 'qid', 'qid_wd']]\n",
"\n",
"mismatch.info(memory_usage='deep')\n",
"# Glottocodes for which more than one Wikidata item differs from the Glottolog link.\n",
"mismatch.loc[mismatch.index.duplicated(keep=False)]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "5103047c-6691-495d-a65f-b91982848f86",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>level</th>\n",
" <th>title</th>\n",
" <th>title_wd</th>\n",
" <th>qid</th>\n",
" <th>qid_wd</th>\n",
" </tr>\n",
" <tr>\n",
" <th>glottocode</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>ache1244</th>\n",
" <td>Ache</td>\n",
" <td>language</td>\n",
" <td>Ache_Yi_language</td>\n",
" <td>Ach%C3%A9_language</td>\n",
" <td>Q10949828</td>\n",
" <td>Q383701</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ainu1252</th>\n",
" <td>Ainu</td>\n",
" <td>family</td>\n",
" <td>Ainu_language</td>\n",
" <td>Ainu_languages</td>\n",
" <td>Q27969</td>\n",
" <td>Q50111972</td>\n",
" </tr>\n",
" <tr>\n",
" <th>amap1241</th>\n",
" <td>Amapá Creole</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Karip%C3%BAna_French_Creole</td>\n",
" <td>Q12626810</td>\n",
" <td>Q2523999</td>\n",
" </tr>\n",
" <tr>\n",
" <th>amer1254</th>\n",
" <td>Latin American Spanish</td>\n",
" <td>dialect</td>\n",
" <td>Spanish_language_in_the_Americas</td>\n",
" <td>Latin_American_Spanish</td>\n",
" <td>Q3058369</td>\n",
" <td>Q56649449</td>\n",
" </tr>\n",
" <tr>\n",
" <th>andr1246</th>\n",
" <td>Andro</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Andro_language</td>\n",
" <td>Q30301408</td>\n",
" <td>Q55603949</td>\n",
" </tr>\n",
" <tr>\n",
" <th>anga1293</th>\n",
" <td>Angal</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Angal_language</td>\n",
" <td>Q10951553</td>\n",
" <td>Q4761919</td>\n",
" </tr>\n",
" <tr>\n",
" <th>araf1243</th>\n",
" <td>Arafundi</td>\n",
" <td>family</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Arafundi_languages</td>\n",
" <td>Q11170629</td>\n",
" <td>Q4783702</td>\n",
" </tr>\n",
" <tr>\n",
" <th>arit1239</th>\n",
" <td>Arritinngithigh</td>\n",
" <td>language</td>\n",
" <td>Arritinngithigh_language</td>\n",
" <td>Adithinngithigh_language</td>\n",
" <td>Q4796002</td>\n",
" <td>Q4683034</td>\n",
" </tr>\n",
" <tr>\n",
" <th>arma1243</th>\n",
" <td>Arman</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Arman_language</td>\n",
" <td>Q30771931</td>\n",
" <td>Q132858961</td>\n",
" </tr>\n",
" <tr>\n",
" <th>arme1241</th>\n",
" <td>Armenic</td>\n",
" <td>family</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Armenian_language</td>\n",
" <td>Q12627235</td>\n",
" <td>Q8785</td>\n",
" </tr>\n",
" <tr>\n",
" <th>assy1241</th>\n",
" <td>Assyrian Neo-Aramaic</td>\n",
" <td>language</td>\n",
" <td>Suret_language</td>\n",
" <td>Ashurian_Aramaic</td>\n",
" <td>Q29440</td>\n",
" <td>Q24915992</td>\n",
" </tr>\n",
" <tr>\n",
" <th>astu1245</th>\n",
" <td>Asturian-Leonese-Cantabrian</td>\n",
" <td>language</td>\n",
" <td>Asturian_language</td>\n",
" <td>Asturleonese_language</td>\n",
" <td>Q29507</td>\n",
" <td>Q35390</td>\n",
" </tr>\n",
" <tr>\n",
" <th>azte1234</th>\n",
" <td>Aztec</td>\n",
" <td>family</td>\n",
" <td>Nahuatl</td>\n",
" <td>Nahuan_languages</td>\n",
" <td>Q13300</td>\n",
" <td>Q11965602</td>\n",
" </tr>\n",
" <tr>\n",
" <th>bang1364</th>\n",
" <td>Bangkalan</td>\n",
" <td>dialect</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Q123379651</td>\n",
" <td>Q123378773</td>\n",
" </tr>\n",
" <tr>\n",
" <th>bari1298</th>\n",
" <td>Barikewa</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Omati_language</td>\n",
" <td>Q63214981</td>\n",
" <td>Q7089905</td>\n",
" </tr>\n",
" <tr>\n",
" <th>basq1248</th>\n",
" <td>Basque</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Basque_language</td>\n",
" <td>Q129256519</td>\n",
" <td>Q8752</td>\n",
" </tr>\n",
" <tr>\n",
" <th>bori1243</th>\n",
" <td>Bori-Karko</td>\n",
" <td>language</td>\n",
" <td>Adi_languages</td>\n",
" <td>Bori_language</td>\n",
" <td>Q56440</td>\n",
" <td>Q4945106</td>\n",
" </tr>\n",
" <tr>\n",
" <th>cent2194</th>\n",
" <td>Central Moroccan Berber</td>\n",
" <td>language</td>\n",
" <td>Central_Atlas_Tamazight</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Q49741</td>\n",
" <td>Q30557579</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dagh1238</th>\n",
" <td>Daghestanian</td>\n",
" <td>family</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Q23307817</td>\n",
" <td>Q10465901</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dakh1244</th>\n",
" <td>Dakhini (Urdu)</td>\n",
" <td>dialect</td>\n",
" <td>Hyderabadi_Urdu</td>\n",
" <td>Deccani_language</td>\n",
" <td>Q13211705</td>\n",
" <td>Q669431</td>\n",
" </tr>\n",
" <tr>\n",
" <th>damu1236</th>\n",
" <td>Damu</td>\n",
" <td>language</td>\n",
" <td>Adi_languages</td>\n",
" <td>Damu_language</td>\n",
" <td>Q56440</td>\n",
" <td>Q17002115</td>\n",
" </tr>\n",
" <tr>\n",
" <th>east2709</th>\n",
" <td>East Chadic B.1</td>\n",
" <td>family</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Q20586134</td>\n",
" <td>Q16723063</td>\n",
" </tr>\n",
" <tr>\n",
" <th>esto1258</th>\n",
" <td>Estonian</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Estonian_language</td>\n",
" <td>Q12361545</td>\n",
" <td>Q9072</td>\n",
" </tr>\n",
" <tr>\n",
" <th>fuzh1239</th>\n",
" <td>Houguan</td>\n",
" <td>dialect</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Fuzhou_dialect</td>\n",
" <td>Q18943758</td>\n",
" <td>Q35571</td>\n",
" </tr>\n",
" <tr>\n",
" <th>hawu1234</th>\n",
" <td>Hawu-Dhao</td>\n",
" <td>family</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Savu_languages</td>\n",
" <td>Q30317289</td>\n",
" <td>Q7428455</td>\n",
" </tr>\n",
" <tr>\n",
" <th>kaur1271</th>\n",
" <td>Kaure-Narau</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Kaure_language</td>\n",
" <td>Q12634336</td>\n",
" <td>Q20526532</td>\n",
" </tr>\n",
" <tr>\n",
" <th>kawi1241</th>\n",
" <td>Kawi</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Old_Javanese</td>\n",
" <td>Q49340</td>\n",
" <td>Q49341</td>\n",
" </tr>\n",
" <tr>\n",
" <th>kham1285</th>\n",
" <td>Kham-Magar-Chepang</td>\n",
" <td>family</td>\n",
" <td>Greater_Magaric_languages</td>\n",
" <td>Magaric_languages</td>\n",
" <td>Q55612963</td>\n",
" <td>Q1064367</td>\n",
" </tr>\n",
" <tr>\n",
" <th>krik1239</th>\n",
" <td>Pykobjê</td>\n",
" <td>dialect</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Pykobj%C3%AA_dialect</td>\n",
" <td>Q3199710</td>\n",
" <td>Q98113977</td>\n",
" </tr>\n",
" <tr>\n",
" <th>kumi1248</th>\n",
" <td>Tipai</td>\n",
" <td>language</td>\n",
" <td>Tiipai_language</td>\n",
" <td>Kumeyaay_language</td>\n",
" <td>Q3027471</td>\n",
" <td>Q4910139</td>\n",
" </tr>\n",
" <tr>\n",
" <th>lush1256</th>\n",
" <td>Dokshi</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Dokshi_language</td>\n",
" <td>Q56199820</td>\n",
" <td>Q116450964</td>\n",
" </tr>\n",
" <tr>\n",
" <th>maca1264</th>\n",
" <td>Macao Pidgin Portuguese</td>\n",
" <td>language</td>\n",
" <td>Macanese_Portuguese</td>\n",
" <td>Macau_Pidgin_Portuguese</td>\n",
" <td>Q9290867</td>\n",
" <td>Q128804537</td>\n",
" </tr>\n",
" <tr>\n",
" <th>makr1243</th>\n",
" <td>Makrani</td>\n",
" <td>dialect</td>\n",
" <td>Makrani_dialect</td>\n",
" <td>Makrani_dialect</td>\n",
" <td>Q113554995</td>\n",
" <td>Q12634001</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mala1544</th>\n",
" <td>Malabar-Sri Lanka Portuguese</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Sri_Lankan_Portuguese_creole</td>\n",
" <td>Q131549994</td>\n",
" <td>Q3537122</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mala1544</th>\n",
" <td>Malabar-Sri Lanka Portuguese</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Cannanore_Portuguese_Creole</td>\n",
" <td>Q131549994</td>\n",
" <td>Q14623791</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mark1255</th>\n",
" <td>Markweeta</td>\n",
" <td>language</td>\n",
" <td>Markwet_language</td>\n",
" <td>Nandi%E2%80%93Markweta_languages</td>\n",
" <td>Q56874</td>\n",
" <td>Q11028135</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mesk1242</th>\n",
" <td>Meskwaki</td>\n",
" <td>language</td>\n",
" <td>Fox_language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Q1440172</td>\n",
" <td>Q134397009</td>\n",
" </tr>\n",
" <tr>\n",
" <th>miji1239</th>\n",
" <td>Miji</td>\n",
" <td>family</td>\n",
" <td>Miji_languages</td>\n",
" <td>Mijiic_languages</td>\n",
" <td>Q61965015</td>\n",
" <td>Q116482753</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mixe1286</th>\n",
" <td>Mixe</td>\n",
" <td>family</td>\n",
" <td>Mixean_languages</td>\n",
" <td>Mixe_languages</td>\n",
" <td>Q36225</td>\n",
" <td>Q3833010</td>\n",
" </tr>\n",
" <tr>\n",
" <th>moke1241</th>\n",
" <td>Moken-Moklen</td>\n",
" <td>family</td>\n",
" <td>Moklenic_languages</td>\n",
" <td>Moken_language</td>\n",
" <td>Q60787593</td>\n",
" <td>Q3217488</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mose1249</th>\n",
" <td>Mosetén-Chimané</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Chimane_language</td>\n",
" <td>Q25395221</td>\n",
" <td>Q35950</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mose1249</th>\n",
" <td>Mosetén-Chimané</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Q25395221</td>\n",
" <td>Q12637318</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mosi1247</th>\n",
" <td>Akie</td>\n",
" <td>language</td>\n",
" <td>Nandi%E2%80%93Markweta_languages</td>\n",
" <td>Mosiro_language</td>\n",
" <td>Q11028135</td>\n",
" <td>Q6916288</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mugo1238</th>\n",
" <td>Mugom</td>\n",
" <td>language</td>\n",
" <td>Mugom-Karmarong_language</td>\n",
" <td>Mugom_dialect</td>\n",
" <td>Q113245255</td>\n",
" <td>Q6932210</td>\n",
" </tr>\n",
" <tr>\n",
" <th>nata1254</th>\n",
" <td>Northern Amis</td>\n",
" <td>dialect</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Q42508148</td>\n",
" <td>Q2328886</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ncan1245</th>\n",
" <td>Ncane-Mungong</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Noni_language</td>\n",
" <td>Q11297920</td>\n",
" <td>Q36072</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ndyu1242</th>\n",
" <td>Aukan</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Ndyuka_language</td>\n",
" <td>Q2659044</td>\n",
" <td>Q35037</td>\n",
" </tr>\n",
" <tr>\n",
" <th>nisa1239</th>\n",
" <td>Nisa-Anasi</td>\n",
" <td>language</td>\n",
" <td>Nisa_language</td>\n",
" <td>Nisa-Anasi_language</td>\n",
" <td>Q13593518</td>\n",
" <td>Q4751795</td>\n",
" </tr>\n",
" <tr>\n",
" <th>noir1238</th>\n",
" <td>Noiri</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Bhilori_language</td>\n",
" <td>Q12953774</td>\n",
" <td>Q4901734</td>\n",
" </tr>\n",
" <tr>\n",
" <th>norm1245</th>\n",
" <td>Normand</td>\n",
" <td>dialect</td>\n",
" <td>Norman_language</td>\n",
" <td>Anglo-Norman_language</td>\n",
" <td>Q33850</td>\n",
" <td>Q35214</td>\n",
" </tr>\n",
" <tr>\n",
" <th>nort2724</th>\n",
" <td>Northern Bai</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Northern_Bai_language</td>\n",
" <td>Q12642165</td>\n",
" <td>Q122463830</td>\n",
" </tr>\n",
" <tr>\n",
" <th>nort2930</th>\n",
" <td>Northeast Kiwai</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Kiwai_language</td>\n",
" <td>Q11732324</td>\n",
" <td>Q6418846</td>\n",
" </tr>\n",
" <tr>\n",
" <th>nort2937</th>\n",
" <td>Northern Hill/Valley Yokuts</td>\n",
" <td>dialect</td>\n",
" <td>Northern_Valley_Yokuts</td>\n",
" <td>Kings_River_Yokuts</td>\n",
" <td>Q85789777</td>\n",
" <td>Q6413014</td>\n",
" </tr>\n",
" <tr>\n",
" <th>nucl1235</th>\n",
" <td>Eastern Armenian</td>\n",
" <td>language</td>\n",
" <td>Armenian_language</td>\n",
" <td>Eastern_Armenian</td>\n",
" <td>Q8785</td>\n",
" <td>Q181059</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ocot1243</th>\n",
" <td>Ocotepec Mixtec</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>%C3%91um%C3%AD_Mixtec</td>\n",
" <td>Q25559575</td>\n",
" <td>Q8078669</td>\n",
" </tr>\n",
" <tr>\n",
" <th>otom1276</th>\n",
" <td>Otomaco-Taparita</td>\n",
" <td>family</td>\n",
" <td>Otomaco_language</td>\n",
" <td>Otom%C3%A1koan_languages</td>\n",
" <td>Q16879234</td>\n",
" <td>Q3217503</td>\n",
" </tr>\n",
" <tr>\n",
" <th>panj1256</th>\n",
" <td>Eastern Panjabi</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Q113612554</td>\n",
" <td>Q112664216</td>\n",
" </tr>\n",
" <tr>\n",
" <th>peno1244</th>\n",
" <td>Peñoles Mixtec</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Estetla_Mixtec</td>\n",
" <td>Q42411307</td>\n",
" <td>Q5401071</td>\n",
" </tr>\n",
" <tr>\n",
" <th>poch1244</th>\n",
" <td>Pochutec</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Pochutec_language</td>\n",
" <td>Q42968898</td>\n",
" <td>Q2427341</td>\n",
" </tr>\n",
" <tr>\n",
" <th>puri1261</th>\n",
" <td>Puri-Coroado</td>\n",
" <td>family</td>\n",
" <td>Puri_language</td>\n",
" <td>Purian_languages</td>\n",
" <td>Q7261687</td>\n",
" <td>Q5684712</td>\n",
" </tr>\n",
" <tr>\n",
" <th>rian1260</th>\n",
" <td>Riang</td>\n",
" <td>family</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Riang_language</td>\n",
" <td>Q42353409</td>\n",
" <td>Q2741615</td>\n",
" </tr>\n",
" <tr>\n",
" <th>sanf1262</th>\n",
" <td>San Francisco Matlatzinca</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Matlatzinca_language</td>\n",
" <td>Q12953704</td>\n",
" <td>Q3832945</td>\n",
" </tr>\n",
" <tr>\n",
" <th>sanl1248</th>\n",
" <td>San Luís Temalacayuca Popoloca</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Northern_Popoloca_language</td>\n",
" <td>Q25559602</td>\n",
" <td>Q7058861</td>\n",
" </tr>\n",
" <tr>\n",
" <th>sant1454</th>\n",
" <td>Santa Inés Ahuatempan Popoloca</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Western_Popoloca_language</td>\n",
" <td>Q42365276</td>\n",
" <td>Q7988174</td>\n",
" </tr>\n",
" <tr>\n",
" <th>sate1242</th>\n",
" <td>Ems-Weser Frisian</td>\n",
" <td>language</td>\n",
" <td>Saterland_Frisian_language</td>\n",
" <td>East_Frisian_language</td>\n",
" <td>Q27154</td>\n",
" <td>Q494355</td>\n",
" </tr>\n",
" <tr>\n",
" <th>shap1240</th>\n",
" <td>Shapsug</td>\n",
" <td>dialect</td>\n",
" <td>Shapsug_dialect</td>\n",
" <td>Kfar_Kama_Adyghe_dialect</td>\n",
" <td>Q12813044</td>\n",
" <td>Q6398657</td>\n",
" </tr>\n",
" <tr>\n",
" <th>sini1245</th>\n",
" <td>Sinitic</td>\n",
" <td>family</td>\n",
" <td>Sinitic_languages</td>\n",
" <td>Chinese_language</td>\n",
" <td>Q33857</td>\n",
" <td>Q7850</td>\n",
" </tr>\n",
" <tr>\n",
" <th>sira1267</th>\n",
" <td>Sirayaic</td>\n",
" <td>language</td>\n",
" <td>Sirayaic_languages</td>\n",
" <td>Siraya_language</td>\n",
" <td>Q2107202</td>\n",
" <td>Q716604</td>\n",
" </tr>\n",
" <tr>\n",
" <th>soni1257</th>\n",
" <td>Soninke-Bozo</td>\n",
" <td>family</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Soninke%E2%80%93Bozo_languages</td>\n",
" <td>Q16111680</td>\n",
" <td>Q104835958</td>\n",
" </tr>\n",
" <tr>\n",
" <th>sout2808</th>\n",
" <td>Sumayela Ndebele</td>\n",
" <td>language</td>\n",
" <td>Southern_Ndebele_language</td>\n",
" <td>Sumayela_Ndebele_language</td>\n",
" <td>Q36785</td>\n",
" <td>Q16920700</td>\n",
" </tr>\n",
" <tr>\n",
" <th>sout2978</th>\n",
" <td>Southern East Cree</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>East_Cree</td>\n",
" <td>Q12953464</td>\n",
" <td>Q282011</td>\n",
" </tr>\n",
" <tr>\n",
" <th>sout2990</th>\n",
" <td>Southern Pastaza Quechua</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Lowland_Peruvian_Quechua</td>\n",
" <td>Q25559692</td>\n",
" <td>Q6694075</td>\n",
" </tr>\n",
" <tr>\n",
" <th>sout3212</th>\n",
" <td>Southeastern Ngwi</td>\n",
" <td>family</td>\n",
" <td>Nisoish_languages</td>\n",
" <td>Southeastern_Loloish_languages</td>\n",
" <td>Q56990</td>\n",
" <td>Q16111894</td>\n",
" </tr>\n",
" <tr>\n",
" <th>taih1245</th>\n",
" <td>Tai Pao (Retired)</td>\n",
" <td>language</td>\n",
" <td>Tai_Hang_Tong_language</td>\n",
" <td>Tai_Pao_language</td>\n",
" <td>Q7675753</td>\n",
" <td>Q7675795</td>\n",
" </tr>\n",
" <tr>\n",
" <th>tall1235</th>\n",
" <td>Tallán</td>\n",
" <td>language</td>\n",
" <td>Tall%C3%A1n_language</td>\n",
" <td>Catacaoan_languages</td>\n",
" <td>Q16910468</td>\n",
" <td>Q5051139</td>\n",
" </tr>\n",
" <tr>\n",
" <th>talu1238</th>\n",
" <td>Lavu-Yongsheng-Talu</td>\n",
" <td>language</td>\n",
" <td>Lavu_language</td>\n",
" <td>Talu_language</td>\n",
" <td>Q16999095</td>\n",
" <td>Q48769531</td>\n",
" </tr>\n",
" <tr>\n",
" <th>tati1242</th>\n",
" <td>Tati (Maithili)</td>\n",
" <td>dialect</td>\n",
" <td>Tati_language_(Iran)</td>\n",
" <td>Th%C4%93thi</td>\n",
" <td>Q34165</td>\n",
" <td>Q55635832</td>\n",
" </tr>\n",
" <tr>\n",
" <th>temb1276</th>\n",
" <td>Tenetehara</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Tenetehara_language</td>\n",
" <td>Q10322157</td>\n",
" <td>Q7699720</td>\n",
" </tr>\n",
" <tr>\n",
" <th>temn1245</th>\n",
" <td>Northern Mel</td>\n",
" <td>family</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Baga_language</td>\n",
" <td>Q16114535</td>\n",
" <td>Q35005</td>\n",
" </tr>\n",
" <tr>\n",
" <th>timo1237</th>\n",
" <td>Timote-Cuica</td>\n",
" <td>language</td>\n",
" <td>Timote_language</td>\n",
" <td>Timotean_languages</td>\n",
" <td>Q7806995</td>\n",
" <td>Q3217540</td>\n",
" </tr>\n",
" <tr>\n",
" <th>tuku1240</th>\n",
" <td>Tukumanféd</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Kagwahiva_language</td>\n",
" <td>Q42330115</td>\n",
" <td>Q6346712</td>\n",
" </tr>\n",
" <tr>\n",
" <th>wail1242</th>\n",
" <td>Ale</td>\n",
" <td>language</td>\n",
" <td>Wailapa_language</td>\n",
" <td>Aleut_language</td>\n",
" <td>Q7960062</td>\n",
" <td>Q27210</td>\n",
" </tr>\n",
" <tr>\n",
" <th>west2340</th>\n",
" <td>Western Aragonese</td>\n",
" <td>dialect</td>\n",
" <td>Ans%C3%B3_Aragonese</td>\n",
" <td>Western_Aragonese</td>\n",
" <td>Q3574358</td>\n",
" <td>Q3574028</td>\n",
" </tr>\n",
" <tr>\n",
" <th>west2488</th>\n",
" <td>Western Krahn</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Western_Krahn_language</td>\n",
" <td>Q35809</td>\n",
" <td>Q10975611</td>\n",
" </tr>\n",
" <tr>\n",
" <th>west2616</th>\n",
" <td>Western Aleut</td>\n",
" <td>dialect</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Q109496435</td>\n",
" <td>Q136388785</td>\n",
" </tr>\n",
" <tr>\n",
" <th>wudi1238</th>\n",
" <td>Wuding-Luquan Yi</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Nasu_language</td>\n",
" <td>Q25559456</td>\n",
" <td>Q56403</td>\n",
" </tr>\n",
" <tr>\n",
" <th>yaba1249</th>\n",
" <td>Yabaâna-Mainatari</td>\n",
" <td>language</td>\n",
" <td>Mainatari_language</td>\n",
" <td>Yaba%C3%A2na_language</td>\n",
" <td>Q97484838</td>\n",
" <td>Q3450534</td>\n",
" </tr>\n",
" <tr>\n",
" <th>yaga1256</th>\n",
" <td>Yagara-Jandai</td>\n",
" <td>language</td>\n",
" <td>Turrbal_language</td>\n",
" <td>Durubalic_languages</td>\n",
" <td>Q16979305</td>\n",
" <td>Q5316792</td>\n",
" </tr>\n",
" <tr>\n",
" <th>yare1249</th>\n",
" <td>Yareni Zapotec</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Ixtl%C3%A1n_Zapotec</td>\n",
" <td>Q12645368</td>\n",
" <td>Q6101185</td>\n",
" </tr>\n",
" <tr>\n",
" <th>yulp1239</th>\n",
" <td>Yulparija</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Yulparija</td>\n",
" <td>Q17319895</td>\n",
" <td>Q106554801</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zamu1245</th>\n",
" <td>Zamuco</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Ayoreo_language</td>\n",
" <td>Q12645537</td>\n",
" <td>Q56634</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zeme1240</th>\n",
" <td>Zeme Naga</td>\n",
" <td>language</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Zeme_language</td>\n",
" <td>Q21491053</td>\n",
" <td>Q56373</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zena1250</th>\n",
" <td>Zenatic</td>\n",
" <td>family</td>\n",
" <td>Northern_Berber_languages</td>\n",
" <td>Zenati_languages</td>\n",
" <td>Q2741732</td>\n",
" <td>Q2293952</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zhan1240</th>\n",
" <td>Zhangping-Longyan</td>\n",
" <td>dialect</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>Longyan_dialect</td>\n",
" <td>Q15937822</td>\n",
" <td>Q6674568</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name level \\\n",
"glottocode \n",
"ache1244 Ache language \n",
"ainu1252 Ainu family \n",
"amap1241 Amapá Creole language \n",
"amer1254 Latin American Spanish dialect \n",
"andr1246 Andro language \n",
"anga1293 Angal language \n",
"araf1243 Arafundi family \n",
"arit1239 Arritinngithigh language \n",
"arma1243 Arman language \n",
"arme1241 Armenic family \n",
"assy1241 Assyrian Neo-Aramaic language \n",
"astu1245 Asturian-Leonese-Cantabrian language \n",
"azte1234 Aztec family \n",
"bang1364 Bangkalan dialect \n",
"bari1298 Barikewa language \n",
"basq1248 Basque language \n",
"bori1243 Bori-Karko language \n",
"cent2194 Central Moroccan Berber language \n",
"dagh1238 Daghestanian family \n",
"dakh1244 Dakhini (Urdu) dialect \n",
"damu1236 Damu language \n",
"east2709 East Chadic B.1 family \n",
"esto1258 Estonian language \n",
"fuzh1239 Houguan dialect \n",
"hawu1234 Hawu-Dhao family \n",
"kaur1271 Kaure-Narau language \n",
"kawi1241 Kawi language \n",
"kham1285 Kham-Magar-Chepang family \n",
"krik1239 Pykobjê dialect \n",
"kumi1248 Tipai language \n",
"lush1256 Dokshi language \n",
"maca1264 Macao Pidgin Portuguese language \n",
"makr1243 Makrani dialect \n",
"mala1544 Malabar-Sri Lanka Portuguese language \n",
"mala1544 Malabar-Sri Lanka Portuguese language \n",
"mark1255 Markweeta language \n",
"mesk1242 Meskwaki language \n",
"miji1239 Miji family \n",
"mixe1286 Mixe family \n",
"moke1241 Moken-Moklen family \n",
"mose1249 Mosetén-Chimané language \n",
"mose1249 Mosetén-Chimané language \n",
"mosi1247 Akie language \n",
"mugo1238 Mugom language \n",
"nata1254 Northern Amis dialect \n",
"ncan1245 Ncane-Mungong language \n",
"ndyu1242 Aukan language \n",
"nisa1239 Nisa-Anasi language \n",
"noir1238 Noiri language \n",
"norm1245 Normand dialect \n",
"nort2724 Northern Bai language \n",
"nort2930 Northeast Kiwai language \n",
"nort2937 Northern Hill/Valley Yokuts dialect \n",
"nucl1235 Eastern Armenian language \n",
"ocot1243 Ocotepec Mixtec language \n",
"otom1276 Otomaco-Taparita family \n",
"panj1256 Eastern Panjabi language \n",
"peno1244 Peñoles Mixtec language \n",
"poch1244 Pochutec language \n",
"puri1261 Puri-Coroado family \n",
"rian1260 Riang family \n",
"sanf1262 San Francisco Matlatzinca language \n",
"sanl1248 San Luís Temalacayuca Popoloca language \n",
"sant1454 Santa Inés Ahuatempan Popoloca language \n",
"sate1242 Ems-Weser Frisian language \n",
"shap1240 Shapsug dialect \n",
"sini1245 Sinitic family \n",
"sira1267 Sirayaic language \n",
"soni1257 Soninke-Bozo family \n",
"sout2808 Sumayela Ndebele language \n",
"sout2978 Southern East Cree language \n",
"sout2990 Southern Pastaza Quechua language \n",
"sout3212 Southeastern Ngwi family \n",
"taih1245 Tai Pao (Retired) language \n",
"tall1235 Tallán language \n",
"talu1238 Lavu-Yongsheng-Talu language \n",
"tati1242 Tati (Maithili) dialect \n",
"temb1276 Tenetehara language \n",
"temn1245 Northern Mel family \n",
"timo1237 Timote-Cuica language \n",
"tuku1240 Tukumanféd language \n",
"wail1242 Ale language \n",
"west2340 Western Aragonese dialect \n",
"west2488 Western Krahn language \n",
"west2616 Western Aleut dialect \n",
"wudi1238 Wuding-Luquan Yi language \n",
"yaba1249 Yabaâna-Mainatari language \n",
"yaga1256 Yagara-Jandai language \n",
"yare1249 Yareni Zapotec language \n",
"yulp1239 Yulparija language \n",
"zamu1245 Zamuco language \n",
"zeme1240 Zeme Naga language \n",
"zena1250 Zenatic family \n",
"zhan1240 Zhangping-Longyan dialect \n",
"\n",
" title \\\n",
"glottocode \n",
"ache1244 Ache_Yi_language \n",
"ainu1252 Ainu_language \n",
"amap1241 <NA> \n",
"amer1254 Spanish_language_in_the_Americas \n",
"andr1246 <NA> \n",
"anga1293 <NA> \n",
"araf1243 <NA> \n",
"arit1239 Arritinngithigh_language \n",
"arma1243 <NA> \n",
"arme1241 <NA> \n",
"assy1241 Suret_language \n",
"astu1245 Asturian_language \n",
"azte1234 Nahuatl \n",
"bang1364 <NA> \n",
"bari1298 <NA> \n",
"basq1248 <NA> \n",
"bori1243 Adi_languages \n",
"cent2194 Central_Atlas_Tamazight \n",
"dagh1238 <NA> \n",
"dakh1244 Hyderabadi_Urdu \n",
"damu1236 Adi_languages \n",
"east2709 <NA> \n",
"esto1258 <NA> \n",
"fuzh1239 <NA> \n",
"hawu1234 <NA> \n",
"kaur1271 <NA> \n",
"kawi1241 <NA> \n",
"kham1285 Greater_Magaric_languages \n",
"krik1239 <NA> \n",
"kumi1248 Tiipai_language \n",
"lush1256 <NA> \n",
"maca1264 Macanese_Portuguese \n",
"makr1243 Makrani_dialect \n",
"mala1544 <NA> \n",
"mala1544 <NA> \n",
"mark1255 Markwet_language \n",
"mesk1242 Fox_language \n",
"miji1239 Miji_languages \n",
"mixe1286 Mixean_languages \n",
"moke1241 Moklenic_languages \n",
"mose1249 <NA> \n",
"mose1249 <NA> \n",
"mosi1247 Nandi%E2%80%93Markweta_languages \n",
"mugo1238 Mugom-Karmarong_language \n",
"nata1254 <NA> \n",
"ncan1245 <NA> \n",
"ndyu1242 <NA> \n",
"nisa1239 Nisa_language \n",
"noir1238 <NA> \n",
"norm1245 Norman_language \n",
"nort2724 <NA> \n",
"nort2930 <NA> \n",
"nort2937 Northern_Valley_Yokuts \n",
"nucl1235 Armenian_language \n",
"ocot1243 <NA> \n",
"otom1276 Otomaco_language \n",
"panj1256 <NA> \n",
"peno1244 <NA> \n",
"poch1244 <NA> \n",
"puri1261 Puri_language \n",
"rian1260 <NA> \n",
"sanf1262 <NA> \n",
"sanl1248 <NA> \n",
"sant1454 <NA> \n",
"sate1242 Saterland_Frisian_language \n",
"shap1240 Shapsug_dialect \n",
"sini1245 Sinitic_languages \n",
"sira1267 Sirayaic_languages \n",
"soni1257 <NA> \n",
"sout2808 Southern_Ndebele_language \n",
"sout2978 <NA> \n",
"sout2990 <NA> \n",
"sout3212 Nisoish_languages \n",
"taih1245 Tai_Hang_Tong_language \n",
"tall1235 Tall%C3%A1n_language \n",
"talu1238 Lavu_language \n",
"tati1242 Tati_language_(Iran) \n",
"temb1276 <NA> \n",
"temn1245 <NA> \n",
"timo1237 Timote_language \n",
"tuku1240 <NA> \n",
"wail1242 Wailapa_language \n",
"west2340 Ans%C3%B3_Aragonese \n",
"west2488 <NA> \n",
"west2616 <NA> \n",
"wudi1238 <NA> \n",
"yaba1249 Mainatari_language \n",
"yaga1256 Turrbal_language \n",
"yare1249 <NA> \n",
"yulp1239 <NA> \n",
"zamu1245 <NA> \n",
"zeme1240 <NA> \n",
"zena1250 Northern_Berber_languages \n",
"zhan1240 <NA> \n",
"\n",
" title_wd qid qid_wd \n",
"glottocode \n",
"ache1244 Ach%C3%A9_language Q10949828 Q383701 \n",
"ainu1252 Ainu_languages Q27969 Q50111972 \n",
"amap1241 Karip%C3%BAna_French_Creole Q12626810 Q2523999 \n",
"amer1254 Latin_American_Spanish Q3058369 Q56649449 \n",
"andr1246 Andro_language Q30301408 Q55603949 \n",
"anga1293 Angal_language Q10951553 Q4761919 \n",
"araf1243 Arafundi_languages Q11170629 Q4783702 \n",
"arit1239 Adithinngithigh_language Q4796002 Q4683034 \n",
"arma1243 Arman_language Q30771931 Q132858961 \n",
"arme1241 Armenian_language Q12627235 Q8785 \n",
"assy1241 Ashurian_Aramaic Q29440 Q24915992 \n",
"astu1245 Asturleonese_language Q29507 Q35390 \n",
"azte1234 Nahuan_languages Q13300 Q11965602 \n",
"bang1364 <NA> Q123379651 Q123378773 \n",
"bari1298 Omati_language Q63214981 Q7089905 \n",
"basq1248 Basque_language Q129256519 Q8752 \n",
"bori1243 Bori_language Q56440 Q4945106 \n",
"cent2194 <NA> Q49741 Q30557579 \n",
"dagh1238 <NA> Q23307817 Q10465901 \n",
"dakh1244 Deccani_language Q13211705 Q669431 \n",
"damu1236 Damu_language Q56440 Q17002115 \n",
"east2709 <NA> Q20586134 Q16723063 \n",
"esto1258 Estonian_language Q12361545 Q9072 \n",
"fuzh1239 Fuzhou_dialect Q18943758 Q35571 \n",
"hawu1234 Savu_languages Q30317289 Q7428455 \n",
"kaur1271 Kaure_language Q12634336 Q20526532 \n",
"kawi1241 Old_Javanese Q49340 Q49341 \n",
"kham1285 Magaric_languages Q55612963 Q1064367 \n",
"krik1239 Pykobj%C3%AA_dialect Q3199710 Q98113977 \n",
"kumi1248 Kumeyaay_language Q3027471 Q4910139 \n",
"lush1256 Dokshi_language Q56199820 Q116450964 \n",
"maca1264 Macau_Pidgin_Portuguese Q9290867 Q128804537 \n",
"makr1243 Makrani_dialect Q113554995 Q12634001 \n",
"mala1544 Sri_Lankan_Portuguese_creole Q131549994 Q3537122 \n",
"mala1544 Cannanore_Portuguese_Creole Q131549994 Q14623791 \n",
"mark1255 Nandi%E2%80%93Markweta_languages Q56874 Q11028135 \n",
"mesk1242 <NA> Q1440172 Q134397009 \n",
"miji1239 Mijiic_languages Q61965015 Q116482753 \n",
"mixe1286 Mixe_languages Q36225 Q3833010 \n",
"moke1241 Moken_language Q60787593 Q3217488 \n",
"mose1249 Chimane_language Q25395221 Q35950 \n",
"mose1249 <NA> Q25395221 Q12637318 \n",
"mosi1247 Mosiro_language Q11028135 Q6916288 \n",
"mugo1238 Mugom_dialect Q113245255 Q6932210 \n",
"nata1254 <NA> Q42508148 Q2328886 \n",
"ncan1245 Noni_language Q11297920 Q36072 \n",
"ndyu1242 Ndyuka_language Q2659044 Q35037 \n",
"nisa1239 Nisa-Anasi_language Q13593518 Q4751795 \n",
"noir1238 Bhilori_language Q12953774 Q4901734 \n",
"norm1245 Anglo-Norman_language Q33850 Q35214 \n",
"nort2724 Northern_Bai_language Q12642165 Q122463830 \n",
"nort2930 Kiwai_language Q11732324 Q6418846 \n",
"nort2937 Kings_River_Yokuts Q85789777 Q6413014 \n",
"nucl1235 Eastern_Armenian Q8785 Q181059 \n",
"ocot1243 %C3%91um%C3%AD_Mixtec Q25559575 Q8078669 \n",
"otom1276 Otom%C3%A1koan_languages Q16879234 Q3217503 \n",
"panj1256 <NA> Q113612554 Q112664216 \n",
"peno1244 Estetla_Mixtec Q42411307 Q5401071 \n",
"poch1244 Pochutec_language Q42968898 Q2427341 \n",
"puri1261 Purian_languages Q7261687 Q5684712 \n",
"rian1260 Riang_language Q42353409 Q2741615 \n",
"sanf1262 Matlatzinca_language Q12953704 Q3832945 \n",
"sanl1248 Northern_Popoloca_language Q25559602 Q7058861 \n",
"sant1454 Western_Popoloca_language Q42365276 Q7988174 \n",
"sate1242 East_Frisian_language Q27154 Q494355 \n",
"shap1240 Kfar_Kama_Adyghe_dialect Q12813044 Q6398657 \n",
"sini1245 Chinese_language Q33857 Q7850 \n",
"sira1267 Siraya_language Q2107202 Q716604 \n",
"soni1257 Soninke%E2%80%93Bozo_languages Q16111680 Q104835958 \n",
"sout2808 Sumayela_Ndebele_language Q36785 Q16920700 \n",
"sout2978 East_Cree Q12953464 Q282011 \n",
"sout2990 Lowland_Peruvian_Quechua Q25559692 Q6694075 \n",
"sout3212 Southeastern_Loloish_languages Q56990 Q16111894 \n",
"taih1245 Tai_Pao_language Q7675753 Q7675795 \n",
"tall1235 Catacaoan_languages Q16910468 Q5051139 \n",
"talu1238 Talu_language Q16999095 Q48769531 \n",
"tati1242 Th%C4%93thi Q34165 Q55635832 \n",
"temb1276 Tenetehara_language Q10322157 Q7699720 \n",
"temn1245 Baga_language Q16114535 Q35005 \n",
"timo1237 Timotean_languages Q7806995 Q3217540 \n",
"tuku1240 Kagwahiva_language Q42330115 Q6346712 \n",
"wail1242 Aleut_language Q7960062 Q27210 \n",
"west2340 Western_Aragonese Q3574358 Q3574028 \n",
"west2488 Western_Krahn_language Q35809 Q10975611 \n",
"west2616 <NA> Q109496435 Q136388785 \n",
"wudi1238 Nasu_language Q25559456 Q56403 \n",
"yaba1249 Yaba%C3%A2na_language Q97484838 Q3450534 \n",
"yaga1256 Durubalic_languages Q16979305 Q5316792 \n",
"yare1249 Ixtl%C3%A1n_Zapotec Q12645368 Q6101185 \n",
"yulp1239 Yulparija Q17319895 Q106554801 \n",
"zamu1245 Ayoreo_language Q12645537 Q56634 \n",
"zeme1240 Zeme_language Q21491053 Q56373 \n",
"zena1250 Zenati_languages Q2741732 Q2293952 \n",
"zhan1240 Longyan_dialect Q15937822 Q6674568 "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"with pd.option_context('display.max_rows', 300):\n",
" display(mismatch)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.14.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment