Created
August 4, 2021 04:14
-
-
Save dnk8n/27c315c2173480a8f6d4ba5cae44f17a to your computer and use it in GitHub Desktop.
Output of processing a filtered Wikipedia dump, including timings of long-running processes
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "8e6cbe27", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from pathlib import Path\n", | |
"\n", | |
"\n", | |
"# NOTE: the active command below expands templates, which is slower. There is no evidence that\n", | |
"# template expansion helps in most cases; switch to the `--no-templates` variant to skip it for speed.\n", | |
"wikiextractor_cmd_base = 'python -m wikiextractor.WikiExtractor --json -o -'\n", | |
"#wikiextractor_cmd_base = 'python -m wikiextractor.WikiExtractor --no-templates --json -o -'\n", | |
"\n", | |
"# Case-insensitive substrings: a page is kept when any of its categories contains one of them.\n", | |
"filter_categories_by_keywords = ['Flight', 'Travel', 'Tourism', 'Aerospace', 'Airlines', 'Airports', 'Airfields', 'Aviation', 'Transport']\n", | |
"\n", | |
"# Resolve data locations from the current user's home directory instead of one hard-coded account.\n", | |
"home_dir = Path.home()\n", | |
"csv_file = str(home_dir / 'Downloads' / 'travel-wiki-extract-full-templates-processed.csv')\n", | |
"\n", | |
"# exclude recombined gz and index files\n", | |
"#pattern = 'enwiki-20210720-pages-articles-multistream[0-9]*.xml*.bz2' ## For some reason selecting multiple files bugs out\n", | |
"pattern = 'enwiki-20210720-pages-articles-multistream.xml.bz2' ## Full wikipedia dump\n", | |
"#pattern = 'enwiki-20210720-pages-articles-multistream16.xml-p20460153p20570392.bz2' ## Smallest file to dev on\n", | |
"wiki_dir = home_dir / 'wikipedia'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "9addbb9f", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import subprocess\n", | |
"from typing import Pattern, Union\n", | |
"import xml.sax\n", | |
"from xml.sax import SAXParseException\n", | |
"from xml.sax.expatreader import ExpatParser" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "b76248cc", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import re\n", | |
"\n", | |
"# Raw string so the regex escapes are not treated as (invalid) string escapes.\n", | |
"# Captures the category name from a wiki link such as [[Category:Airports]].\n", | |
"RE_CATEGORY = re.compile(r'\\[\\[Category:([\\w\\-\\s]+)\\]\\]')\n", | |
"\n", | |
"\n", | |
"class WikiXmlHandler(xml.sax.handler.ContentHandler):\n", | |
"    \"\"\"SAX content handler that collects per-page metadata from wiki XML.\n", | |
"\n", | |
"    For every page element it records the revision id, the timestamp and the\n", | |
"    category names found in the page text. When the page element closes, the\n", | |
"    record is stored in ``matching_docs`` under the page id, provided it passes\n", | |
"    the optional category filter.\n", | |
"\n", | |
"    Args:\n", | |
"        filter_categories_by: Keywords matched case-insensitively as substrings of\n", | |
"            category names. When non-empty, only pages with at least one matching\n", | |
"            category are kept; when empty or None, every page is kept.\n", | |
"    \"\"\"\n", | |
"\n", | |
"    def __init__(self, filter_categories_by=None):\n", | |
"        super().__init__()\n", | |
"        self._current_tag = []  # stack of the element names that are currently open\n", | |
"        self._current_id = \"\"  # id of the most recently seen page\n", | |
"        self._temp_holding_dict = {}  # metadata gathered for the page being parsed\n", | |
"        self._filter_categories_by = filter_categories_by or []\n", | |
"        self.matching_docs = {}  # page id -> metadata dict of every kept page\n", | |
"\n", | |
"    def startElement(self, name, attrs):\n", | |
"        \"\"\"Opening tag of element: push its name onto the tag stack.\"\"\"\n", | |
"        self._current_tag.append(name)\n", | |
"\n", | |
"    def characters(self, content):\n", | |
"        \"\"\"Characters between opening and closing tags, routed by the enclosing element.\"\"\"\n", | |
"        if not self._current_tag:\n", | |
"            return\n", | |
"        tag = self._current_tag[-1]\n", | |
"        if tag == \"id\":\n", | |
"            # An id element occurs under both page and revision; the parent tells them apart.\n", | |
"            # An id without any parent is ignored instead of raising IndexError.\n", | |
"            outer_tag = self._current_tag[-2] if len(self._current_tag) > 1 else None\n", | |
"            if outer_tag == \"page\":\n", | |
"                self._current_id = content\n", | |
"            elif outer_tag == \"revision\":\n", | |
"                self._temp_holding_dict['revid'] = content\n", | |
"        elif tag == \"timestamp\":\n", | |
"            self._temp_holding_dict['timestamp'] = content\n", | |
"        elif tag == \"text\":\n", | |
"            # match() anchors at the start of the chunk, so only chunks that begin\n", | |
"            # with a category link contribute a category.\n", | |
"            category_match = RE_CATEGORY.match(content)\n", | |
"            if category_match:\n", | |
"                self._temp_holding_dict.setdefault('categories', []).append(category_match.group(1))\n", | |
"\n", | |
"    def endElement(self, name):\n", | |
"        \"\"\"Closing tag of element: pop it and, when a page ends, keep or drop its metadata.\"\"\"\n", | |
"        if not self._current_tag or name != self._current_tag[-1]:\n", | |
"            return\n", | |
"        self._current_tag.pop()\n", | |
"        if name != \"page\":\n", | |
"            return\n", | |
"        page_meta = self._temp_holding_dict\n", | |
"        self._temp_holding_dict = {}  # the next page always starts from an empty record\n", | |
"        wiki_categories = page_meta.get('categories')\n", | |
"        if self._filter_categories_by:\n", | |
"            # With a filter, pages without categories or without a keyword hit are dropped.\n", | |
"            if not wiki_categories:\n", | |
"                return\n", | |
"            if not any(substr.lower() in wiki_category.lower() for substr in self._filter_categories_by for wiki_category in wiki_categories):\n", | |
"                return\n", | |
"        if wiki_categories:\n", | |
"            # Stored as one newline-separated string rather than a list.\n", | |
"            page_meta['categories'] = '\\n'.join(wiki_categories)\n", | |
"        self.matching_docs[self._current_id] = page_meta" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "3e2829bb", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def manualextract_bz_dir_serial(bz_dir: Union[str, Path], pattern: Union[str, Pattern], parser: ExpatParser):\n", | |
"    \"\"\"Process every file in ``bz_dir`` whose name matches the glob ``pattern``, one after another.\n", | |
"\n", | |
"    Each match is handed to ``manualextract_bz_file``.\n", | |
"\n", | |
"    NOTE(review): ``parser`` is accepted but not referenced in this body, so it is\n", | |
"    not forwarded to ``manualextract_bz_file`` -- confirm which parser that helper uses.\n", | |
"    \"\"\"\n", | |
"    directory = Path(bz_dir)\n", | |
"    assert directory.is_dir()\n", | |
"    matching_files = directory.glob(pattern)\n", | |
"    for dump_file in matching_files:\n", | |
"        manualextract_bz_file(dump_file)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "854471c7", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def manualextract_bz_file(f: Union[str, Path]):\n", | |
"    \"\"\"Decompress one bz2 file with ``bzcat`` and feed it line by line to the module-level ``parser``.\n", | |
"\n", | |
"    Args:\n", | |
"        f: Location of the compressed file, as ``str`` or ``Path``.\n", | |
"\n", | |
"    Feeding stops at the first ``SAXParseException``; the error is printed, not raised.\n", | |
"    \"\"\"\n", | |
"    f_path = Path(f)\n", | |
"    print('Processing file: ', f)\n", | |
"    assert f_path.is_file()\n", | |
"    # Open through f_path so plain-string arguments work as well. The context managers\n", | |
"    # close the input file and reap the bzcat process even when the loop stops early.\n", | |
"    with f_path.open('rb') as compressed, subprocess.Popen(\n", | |
"        [\"bzcat\"],\n", | |
"        stdin=compressed,\n", | |
"        stdout=subprocess.PIPE,\n", | |
"    ) as bzcat:\n", | |
"        for line in bzcat.stdout:\n", | |
"            try:\n", | |
"                parser.feed(line)\n", | |
"            except SAXParseException as e:\n", | |
"                # Include the parser's message so the failure can be located.\n", | |
"                print('error with file: ', f, e)\n", | |
"                break\n", | |
"            except StopIteration:\n", | |
"                break" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "baba5f16", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import json\n", | |
"\n", | |
"def wikiextract_bz(bz_dir: Union[str, Path], pattern: Union[str, Pattern]):\n", | |
"    \"\"\"Run WikiExtractor on each matching file and yield its output one parsed JSON line at a time.\n", | |
"\n", | |
"    Args:\n", | |
"        bz_dir: Directory that is searched for input files.\n", | |
"        pattern: Glob pattern selecting the files inside ``bz_dir``.\n", | |
"\n", | |
"    Yields:\n", | |
"        The object decoded by ``json.loads`` from each line the command writes to stdout.\n", | |
"\n", | |
"    The command is ``wikiextractor_cmd_base`` split on whitespace plus the file path.\n", | |
"    \"\"\"\n", | |
"    bz_dir_path = Path(bz_dir)\n", | |
"    assert bz_dir_path.is_dir()\n", | |
"    for f in bz_dir_path.glob(pattern):\n", | |
"        wikiextractor_cmd = wikiextractor_cmd_base.split() + [f.as_posix()]\n", | |
"        print('wikiextractor_cmd: ', wikiextractor_cmd)\n", | |
"        # The context manager closes the pipe and reaps the child process.\n", | |
"        with subprocess.Popen(wikiextractor_cmd, stdout=subprocess.PIPE) as proc:\n", | |
"            drained = False\n", | |
"            try:\n", | |
"                for line in proc.stdout:\n", | |
"                    yield json.loads(line)\n", | |
"                drained = True\n", | |
"            finally:\n", | |
"                # The consumer may stop before the output is exhausted (or a line may\n", | |
"                # fail to decode); stop the child so the wait on exit cannot hang.\n", | |
"                if not drained:\n", | |
"                    proc.terminate()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "a828be0f", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def merge_docs(main_docs, meta_docs):\n", | |
"    \"\"\"Join extracted documents with their metadata, yielding one merged dict per match.\n", | |
"\n", | |
"    Args:\n", | |
"        main_docs: Iterable of dicts with 'id', 'url', 'title' and 'text' keys.\n", | |
"        meta_docs: Mapping of document id to a metadata dict. It is left unmodified,\n", | |
"            so the same mapping can be merged again later.\n", | |
"\n", | |
"    Yields:\n", | |
"        A dict with 'id', 'url', 'title' and 'text' plus every key of the metadata.\n", | |
"        '&oldid=' and the revid are appended to 'url' when the metadata has a 'revid'.\n", | |
"\n", | |
"    Documents without a non-empty metadata entry are skipped, and iteration over\n", | |
"    ``main_docs`` stops as soon as every metadata entry has been used.\n", | |
"    \"\"\"\n", | |
"    # Work on a shallow copy: popping from the caller's dict would throw away its contents.\n", | |
"    pending = dict(meta_docs)\n", | |
"    if not pending:\n", | |
"        return\n", | |
"    for doc in main_docs:\n", | |
"        id_ = doc['id']\n", | |
"        meta = pending.pop(id_, None)\n", | |
"        if meta:\n", | |
"            url = doc['url']\n", | |
"            revid = meta.get('revid')\n", | |
"            if revid is not None:\n", | |
"                # Metadata without a revision id keeps the plain URL instead of raising KeyError.\n", | |
"                url = url + '&oldid=' + revid\n", | |
"            yield {\n", | |
"                'id': id_,\n", | |
"                'url': url,\n", | |
"                'title': doc['title'],\n", | |
"                'text': doc['text'],\n", | |
"                **meta\n", | |
"            }\n", | |
"        if not pending:\n", | |
"            break" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "8125ad93", | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"handler = WikiXmlHandler(filter_categories_by_keywords)\n", | |
"parser = xml.sax.make_parser()\n", | |
"parser.setContentHandler(handler)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "3b67dcbf", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# %%time\n", | |
"# # Parallelised\n", | |
"# from multiprocessing import Pool\n", | |
"\n", | |
"# partitions = [f for f in wiki_dir.glob(pattern)]\n", | |
"# pool = Pool(processes = 3)\n", | |
"# pool.map(manualextract_bz_file, partitions)\n", | |
"# pool.close()\n", | |
"# pool.join()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "3b8bc526", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Processing file: /home/dnk8n/wikipedia/enwiki-20210720-pages-articles-multistream.xml.bz2\n", | |
"CPU times: user 1h 14min 15s, sys: 53.9 s, total: 1h 15min 9s\n", | |
"Wall time: 1h 31min 6s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"# Serial\n", | |
"manualextract_bz_dir_serial(wiki_dir, pattern, parser)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "5bcd05ff", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 4 µs, sys: 0 ns, total: 4 µs\n", | |
"Wall time: 5.96 µs\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"# Naturally parallelised\n", | |
"wikiextract_docs = wikiextract_bz(wiki_dir, pattern)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "06c3dea0", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"wikiextractor_cmd: ['python', '-m', 'wikiextractor.WikiExtractor', '--json', '-o', '-', '/home/dnk8n/wikipedia/enwiki-20210720-pages-articles-multistream.xml.bz2']\n", | |
"CPU times: user 5min 44s, sys: 18.8 s, total: 6min 3s\n", | |
"Wall time: 3h 10min 27s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"import csv\n", | |
"\n", | |
"csv_columns = ['id', 'url', 'title', 'text', 'revid', 'timestamp', 'categories']\n", | |
"try:\n", | |
"    # newline='' is what the csv module expects for its file objects (fields here contain line breaks);\n", | |
"    # an explicit UTF-8 encoding keeps non-ASCII text independent of the locale default.\n", | |
"    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:\n", | |
"        writer = csv.DictWriter(csvfile, fieldnames=csv_columns)\n", | |
"        writer.writeheader()\n", | |
"        for data in merge_docs(wikiextract_docs, handler.matching_docs):\n", | |
"            writer.writerow(data)\n", | |
"except IOError as e:\n", | |
"    # Report what actually failed instead of a bare message.\n", | |
"    print(\"I/O error\", e)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"id": "63e8cb03", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# TODO: Potentially locate category pages like below and traverse subcategories, thus expanding search of categories to filter by\n", | |
"# https://en.wikipedia.org/wiki/Category:Flight\n", | |
"# https://en.wikipedia.org/wiki/Category:Travel\n", | |
"# https://en.wikipedia.org/wiki/Category:Tourism\n", | |
"# https://en.wikipedia.org/wiki/Category:Aerospace\n", | |
"# https://en.wikipedia.org/wiki/Category:Airlines\n", | |
"# https://en.wikipedia.org/wiki/Category:Airports\n", | |
"# https://en.wikipedia.org/wiki/Category:Airfields\n", | |
"# https://en.wikipedia.org/wiki/Category:Aviation\n", | |
"# https://en.wikipedia.org/wiki/Category:Transport\n" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.10" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment