Last active
August 3, 2021 14:35
-
-
Save dnk8n/ee22a58c24aa4a4d62554334946a7841 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "7cb8c367", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
from pathlib import Path


# # No evidence that template expanding helps in most cases, so going to skip it in favour of speed
# wikiextractor_cmd_base = 'python -m wikiextractor.WikiExtractor --json -o -'
# Command prefix for streaming wikiextractor's JSON output to stdout ('-o -').
wikiextractor_cmd_base = 'python -m wikiextractor.WikiExtractor --no-templates --json -o -'
# Case-insensitive substrings; pages are kept when any keyword occurs in any of
# their category names (see WikiXmlHandler.endElement).
filter_categories_by_keywords = ['Flight', 'Travel', 'Tourism', 'Aerospace', 'Airlines', 'Airports', 'Airfields', 'Aviation', 'Transport']
# Destination for the merged article + metadata rows.
csv_file = "/home/dnk8n/Downloads/travel-wiki-extract.csv"

# exclude recombined gz and index files
#pattern = 'enwiki-20210720-pages-articles-multistream[0-9]*.xml*.bz2' ## For some reason selecting multiple files bugs out
#pattern = 'enwiki-20210720-pages-articles-multistream.xml.bz2' ## Full wikipedia dump
pattern = 'enwiki-20210720-pages-articles-multistream16.xml-p20460153p20570392.bz2' ## Smallest file to dev on
# Directory holding the downloaded dump partitions.
wiki_dir = Path('/home/dnk8n/wikipedia')
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "9addbb9f", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import subprocess\n", | |
"from typing import Pattern, Union\n", | |
"import xml.sax\n", | |
"from xml.sax import SAXParseException\n", | |
"from xml.sax.expatreader import ExpatParser" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "b76248cc", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
import re

# Raw string avoids invalid-escape warnings ('\[' in a plain string literal);
# captures the category name from links such as [[Category:Aviation]].
RE_CATEGORY = re.compile(r'\[\[Category:([\w\-\s]+)\]\]')


class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """Content handler for Wiki XML data using SAX.

    Collects, per <page>, the revision id, timestamp and category names.
    When *filter_categories_by* is given, only pages where some keyword is
    a case-insensitive substring of some category are kept. Results
    accumulate in ``matching_docs``, keyed by page id.
    """

    def __init__(self, filter_categories_by=None):
        super().__init__()
        self._current_tag = []          # stack of currently-open element names
        self._current_id = ""           # id of the <page> being parsed
        self._temp_holding_dict = {}    # metadata gathered for the current page
        self._filter_categories_by = filter_categories_by or []
        self.matching_docs = {}         # page id -> {'revid', 'timestamp', 'categories'}

    def startElement(self, name, attrs):
        """Opening tag of element: push it onto the tag stack."""
        self._current_tag.append(name)

    def characters(self, content):
        """Characters between opening and closing tags.

        NOTE(review): SAX may deliver one text node across several
        characters() calls; assumed harmless for the short id/timestamp
        values handled here — confirm against a full dump.
        """
        if not self._current_tag:
            return
        tag = self._current_tag[-1]
        if tag == "id":
            outer_tag = self._current_tag[-2]
            if outer_tag == "page":
                self._current_id = content
            elif outer_tag == "revision":
                self._temp_holding_dict['revid'] = content
        elif tag == "timestamp":
            self._temp_holding_dict['timestamp'] = content
        elif tag == "text":
            # Bug fix: the original used RE_CATEGORY.match(), which only
            # matched a link at the very start of the chunk and captured at
            # most one category. findall catches every link anywhere in it.
            for category in RE_CATEGORY.findall(content):
                self._temp_holding_dict.setdefault('categories', []).append(category)

    def endElement(self, name):
        """Closing tag of element: pop the stack; finalize on </page>."""
        if not self._current_tag or name != self._current_tag[-1]:
            return
        ended_tag = self._current_tag.pop()
        if ended_tag != "page":
            return
        wiki_categories = self._temp_holding_dict.get('categories')
        if wiki_categories:
            if self._filter_categories_by:
                # Drop the page unless some keyword is a case-insensitive
                # substring of some category name.
                if not any(
                    substr.lower() in wiki_category.lower()
                    for substr in self._filter_categories_by
                    for wiki_category in wiki_categories
                ):
                    self._temp_holding_dict = {}
            if self._temp_holding_dict:
                self._temp_holding_dict['categories'] = '\n'.join(
                    self._temp_holding_dict['categories'])
                self.matching_docs[self._current_id] = self._temp_holding_dict
            self._temp_holding_dict = {}
        else:
            # Page without categories: kept only when no filter is active.
            if self._filter_categories_by:
                self._temp_holding_dict = {}
            else:
                self.matching_docs[self._current_id] = self._temp_holding_dict
                self._temp_holding_dict = {}
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "3e2829bb", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
def manualextract_bz_dir_serial(bz_dir: Union[str, Path], pattern: Union[str, Pattern], parser: ExpatParser):
    """Run the manual SAX extraction serially over every dump file in
    *bz_dir* whose name matches glob *pattern*.

    NOTE(review): the *parser* argument is accepted but never used —
    manualextract_bz_file relies on the module-level ``parser`` instead.
    Confirm whether it should be threaded through to the worker.
    """
    bz_dir_path = Path(bz_dir)
    assert bz_dir_path.is_dir()  # fail fast on a bad directory path
    for f in bz_dir_path.glob(pattern):
        manualextract_bz_file(f)
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "854471c7", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
def manualextract_bz_file(f: Union[str, Path], sax_parser=None):
    """Decompress one .bz2 dump file through ``bzcat`` and feed every
    decompressed line to a SAX parser.

    Parameters:
        f: path (str or Path) to a .bz2 wikipedia dump partition.
        sax_parser: parser whose ``feed`` receives each line; defaults to
            the module-level ``parser`` (original behavior, kept for
            backward compatibility with existing callers).

    Stops (with a message) on the first XML parse error; a StopIteration
    raised from a handler also ends processing cleanly.
    """
    f_path = Path(f)
    print('Processing file: ', f)
    assert f_path.is_file()  # fail fast on a bad file path
    if sax_parser is None:
        sax_parser = parser  # fall back to the global parser
    # Bug fix: the original called f.open(), which raises AttributeError
    # when f is a plain str (allowed by the Union type) and leaked the
    # file handle. Open via f_path in binary mode and close it with the
    # context manager; bzcat reads the underlying descriptor either way.
    with f_path.open('rb') as compressed:
        proc = subprocess.Popen(
            ["bzcat"],
            stdin=compressed,
            stdout=subprocess.PIPE,
        )
        try:
            for line in proc.stdout:
                try:
                    sax_parser.feed(line)
                except SAXParseException:
                    print('error with file: ', f)
                    break
                except StopIteration:
                    break
        finally:
            # Close the pipe and reap the child so early exits don't leak
            # zombie bzcat processes or file descriptors.
            proc.stdout.close()
            proc.wait()
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "baba5f16", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
import json

def wikiextract_bz(bz_dir: Union[str, Path], pattern: Union[str, Pattern]):
    """Yield one parsed JSON document per article extracted by wikiextractor.

    Runs the command built from the module-level ``wikiextractor_cmd_base``
    over every dump file in *bz_dir* matching glob *pattern*, parsing each
    line of the subprocess's stdout as a JSON object.
    """
    bz_dir_path = Path(bz_dir)
    assert bz_dir_path.is_dir()  # fail fast on a bad directory path
    for f in bz_dir_path.glob(pattern):
        wikiextractor_cmd = wikiextractor_cmd_base.split() + [f.as_posix()]
        print('wikiextractor_cmd: ', wikiextractor_cmd)
        proc = subprocess.Popen(wikiextractor_cmd, stdout=subprocess.PIPE)
        try:
            for line in proc.stdout:
                yield json.loads(line)
        finally:
            # Bug fix: the original never closed the pipe or waited on the
            # child, leaking file descriptors and zombie processes when the
            # generator was abandoned before exhaustion.
            proc.stdout.close()
            proc.wait()
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "a828be0f", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
def merge_docs(main_docs, meta_docs):
    """Join wikiextractor documents with SAX-collected metadata by page id.

    Each yielded dict combines the article fields (id, url + oldid, title,
    text) with that page's metadata. Consumed entries are removed from
    *meta_docs*, and iteration stops as soon as no metadata remains, so the
    (potentially huge) *main_docs* stream is not drained needlessly.
    """
    for doc in main_docs:
        doc_id = doc['id']
        meta = meta_docs.pop(doc_id, None)
        if meta:
            merged = {
                'id': doc_id,
                'url': f"{doc['url']}&oldid={meta['revid']}",
                'title': doc['title'],
                'text': doc['text'],
            }
            merged.update(meta)
            yield merged
        if not meta_docs:
            break
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "8125ad93", | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
# Build one SAX parser wired to the category-filtering handler; the handler
# accumulates results in handler.matching_docs as dump files are fed to it.
handler = WikiXmlHandler(filter_categories_by_keywords)
parser = xml.sax.make_parser()
parser.setContentHandler(handler)
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "3b67dcbf", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# %%time\n", | |
"# # Paralised\n", | |
"# from multiprocessing import Pool\n", | |
"\n", | |
"# partitions = [f for f in wiki_dir.glob(pattern)]\n", | |
"# pool = Pool(processes = 3)\n", | |
"# pool.map(manualextract_bz_file, partitions)\n", | |
"# pool.close()\n", | |
"# pool.join()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "3b8bc526", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
%%time
# Serial
# NOTE(review): the parser argument is currently ignored inside
# manualextract_bz_dir_serial (the worker uses the module-level parser).
manualextract_bz_dir_serial(wiki_dir, pattern, parser)
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "5bcd05ff", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
%%time
# Naturally parallelised (wikiextractor manages its own worker processes).
# NOTE(review): wikiextract_bz is a generator, so this cell only creates it —
# the timed work actually happens when the CSV-writing cell consumes it.
wikiextract_docs = wikiextract_bz(wiki_dir, pattern)
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "06c3dea0", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%%time\n", | |
"import csv\n", | |
"\n", | |
"csv_columns = ['id', 'url', 'title','text', 'revid', 'timestamp', 'categories']\n", | |
"try:\n", | |
" with open(csv_file, 'w') as csvfile:\n", | |
" writer = csv.DictWriter(csvfile, fieldnames=csv_columns)\n", | |
" writer.writeheader()\n", | |
" for data in merge_docs(wikiextract_docs, handler.matching_docs):\n", | |
" writer.writerow(data)\n", | |
"except IOError:\n", | |
" print(\"I/O error\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "63e8cb03", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# TODO: Potentially locate category pages like below and traverse subcategories, thus expanding search of categories to filter by\n", | |
"# https://en.wikipedia.org/wiki/Category:Flight\n", | |
"# https://en.wikipedia.org/wiki/Category:Travel\n", | |
"# https://en.wikipedia.org/wiki/Category:Tourism\n", | |
"# https://en.wikipedia.org/wiki/Category:Aerospace\n", | |
"# https://en.wikipedia.org/wiki/Category:Airlines\n", | |
"# https://en.wikipedia.org/wiki/Category:Airports\n", | |
"# https://en.wikipedia.org/wiki/Category:Airfields\n", | |
"# https://en.wikipedia.org/wiki/Category:Aviation\n", | |
"# https://en.wikipedia.org/wiki/Category:Transport\n" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.10" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment