Last active
August 3, 2021 14:50
-
-
Save dnk8n/afcd8585865fa29abe625e8ecee94c68 to your computer and use it in GitHub Desktop.
Download Wiki Dumps
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "862aa30c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import requests\n", | |
"# Library for parsing HTML\n", | |
"from bs4 import BeautifulSoup\n", | |
"\n", | |
"base_url = 'https://dumps.wikimedia.org'\n", | |
"enwiki_url = base_url + '/enwiki'\n", | |
"\n", | |
"# Fetch the index page; stop on an HTTP error instead of parsing the error page\n", | |
"index_response = requests.get(enwiki_url, timeout=60)\n", | |
"index_response.raise_for_status()\n", | |
"index = index_response.text\n", | |
"soup_index = BeautifulSoup(index, 'html.parser')\n", | |
"\n", | |
"# Collect the href of every link on the page (anchors without an href are skipped)\n", | |
"dumps = [a['href'] for a in soup_index.find_all('a', href=True)]\n", | |
"dumps" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "f1a42fd4", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"dump_url = enwiki_url + '/20210720'\n", | |
"# Retrieve the html; stop on an HTTP error (for example a dump directory that\n", | |
"# does not exist) instead of parsing the error page, which would contain no\n", | |
"# file links and leave nothing to download or verify\n", | |
"dump_response = requests.get(dump_url, timeout=60)\n", | |
"dump_response.raise_for_status()\n", | |
"dump_html = dump_response.text\n", | |
"# Convert to a soup\n", | |
"soup_dump = BeautifulSoup(dump_html, 'html.parser')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "87b626ee", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import requests\n", | |
"from pathlib import Path\n", | |
"from tqdm import tqdm\n", | |
"\n", | |
"# Download directory under the user's home; created (with parents) when absent\n", | |
"wikipedia_dir = Path.home().joinpath('wikipedia-dev')\n", | |
"wikipedia_dir.mkdir(exist_ok=True, parents=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "6f20bc9a", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Take the link of every list element with the class 'file' whose markup\n", | |
"# mentions 'multistream'\n", | |
"targets = [\n", | |
"    item.a['href']\n", | |
"    for item in soup_dump.find_all('li', {'class': 'file'})\n", | |
"    if 'multistream' in str(item)\n", | |
"]\n", | |
"# The local file name is the last path segment of each link\n", | |
"destinations = [link.rsplit('/', 1)[-1] for link in targets]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "68e3a949", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Stream each selected dump file to disk with a per-file progress bar\n", | |
"block_size = 1024 * 1024  # 1 MiB per chunk: far fewer loop iterations than 1 KiB chunks\n", | |
"for target, destination in zip(targets, destinations):\n", | |
"    source_url = base_url + target\n", | |
"    destination_path = wikipedia_dir / destination\n", | |
"    print('target: ', source_url)\n", | |
"    print('destination: ', destination_path)\n", | |
"    # Using the response as a context manager releases the connection even on failure\n", | |
"    with requests.get(source_url, stream=True, timeout=60) as response:\n", | |
"        # Fail here rather than saving an HTTP error page under the dump's file name\n", | |
"        response.raise_for_status()\n", | |
"        # 0 means no content-length header was sent, so the size check below is skipped\n", | |
"        total_size_in_bytes = int(response.headers.get('content-length', 0))\n", | |
"        # tqdm as a context manager closes the bar even if the transfer raises\n", | |
"        with tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True) as progress_bar:\n", | |
"            with destination_path.open('wb') as f:\n", | |
"                for data in response.iter_content(block_size):\n", | |
"                    progress_bar.update(len(data))\n", | |
"                    f.write(data)\n", | |
"    # Compare the bytes counted by the bar with the size announced by the server\n", | |
"    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:\n", | |
"        print('ERROR, something went wrong')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "a1a80970", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from typing import Union\n", | |
"from pathlib import Path\n", | |
"from pprint import pprint\n", | |
"from hashlib import md5\n", | |
"\n", | |
"\n", | |
"def pairwise(iterable):\n", | |
"    \"\"\"Group consecutive items of `iterable` into non-overlapping 2-tuples.\n", | |
"\n", | |
"    The same iterator is handed to zip twice, so every tuple consumes two\n", | |
"    consecutive items; a trailing unpaired item is dropped.\n", | |
"    \"\"\"\n", | |
"    return zip(*[iter(iterable)] * 2)\n", | |
"\n", | |
"\n", | |
"def check_downloads(wiki_dir: Union[str, Path], destinations, md5sum_url):\n", | |
"    \"\"\"Verify downloaded files against a published MD5 checksum listing.\n", | |
"\n", | |
"    Prints a bold OK followed by the path for every file that matches.\n", | |
"\n", | |
"    Args:\n", | |
"        wiki_dir: Directory holding the downloaded files (str or Path).\n", | |
"        destinations: File names, relative to `wiki_dir`, to verify.\n", | |
"        md5sum_url: URL of a text listing made of whitespace-separated\n", | |
"            checksum / file-name pairs.\n", | |
"\n", | |
"    Raises:\n", | |
"        requests.HTTPError: If the checksum listing cannot be fetched.\n", | |
"        KeyError: If a file has no entry in the checksum listing.\n", | |
"        AssertionError: If a file is missing or its checksum differs.\n", | |
"    \"\"\"\n", | |
"    response = requests.get(md5sum_url, timeout=60)\n", | |
"    # An HTTP error page must not be parsed as if it were the checksum listing\n", | |
"    response.raise_for_status()\n", | |
"    # The listing alternates checksum, file name; index the checksums by file name\n", | |
"    md5sum_dict = {name: checksum for checksum, name in pairwise(response.text.split())}\n", | |
"    for dest in destinations:\n", | |
"        # Wrap wiki_dir first so a plain string directory is accepted as well\n", | |
"        dest_path = Path(wiki_dir) / dest\n", | |
"        assert dest_path.is_file(), f'missing download: {dest_path}'\n", | |
"        # Look the checksum up before hashing so a missing entry fails fast\n", | |
"        if dest not in md5sum_dict:\n", | |
"            raise KeyError(f'no md5sum listed for {dest} at {md5sum_url}')\n", | |
"        expected_md5sum = md5sum_dict[dest]\n", | |
"\n", | |
"        # Hash in chunks so the whole file is never held in memory\n", | |
"        file_hash = md5()\n", | |
"        with dest_path.open('rb') as f:\n", | |
"            while chunk := f.read(1024 * 1024):\n", | |
"                file_hash.update(chunk)\n", | |
"        actual_md5sum = file_hash.hexdigest()\n", | |
"        assert actual_md5sum == expected_md5sum, (\n", | |
"            f'md5 mismatch for {dest_path}: expected {expected_md5sum}, got {actual_md5sum}'\n", | |
"        )\n", | |
"        print('\\033[1m' + 'OK' + '\\033[0m', dest_path)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "6d148eab", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Check every downloaded file against the md5 listing published with the dump\n", | |
"check_downloads(\n", | |
"    wiki_dir=wikipedia_dir,\n", | |
"    destinations=destinations,\n", | |
"    md5sum_url='https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-md5sums.txt',\n", | |
")" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.10" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment