Created
June 15, 2019 10:55
-
-
Save njanakiev/0fe0900cd620ba0e54d92f03999b3f0a to your computer and use it in GitHub Desktop.
Download images from Wikimedia Commons categories
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
from urllib.parse import quote

import mwclient
import pandas as pd
import requests
# Download every file (namespace 6) that is a member of the listed Wikimedia
# Commons categories into one sub-folder per category below `folder`.

# Category titles as passed to site.Categories[...] below.
category_names = ['Computer_hardware', 'Data_centers', 'Servers',
                  'Control_rooms', 'Floppy_disk_drives', 'Hard_disks',
                  'Automobile_dashboards', 'Old_maps', 'Laboratory_equipment']
folder = '/mnt/cloud-volume/wikimedia-commons/'

# The client does not depend on the category, so build it once instead of
# once per loop iteration.
site = mwclient.Site('commons.wikimedia.org')

for category_name in category_names:
    category_folder = os.path.join(folder, category_name)
    # makedirs also creates a missing parent and tolerates an existing folder,
    # replacing the racy exists()/mkdir() pair.
    os.makedirs(category_folder, exist_ok=True)

    category = site.Categories[category_name]
    for member in category.members(namespace=6):
        image_name = member.page_title
        # Percent-encode the title: characters such as '?', '#' or '%' would
        # otherwise be parsed as URL syntax instead of as part of the name.
        image_url = ("https://commons.wikimedia.org/wiki/Special:FilePath/"
                     + quote(image_name, safe=''))
        # Defensive: a path separator inside a title would make the target
        # land outside category_folder (or in a non-existent sub-folder).
        image_path = os.path.join(category_folder,
                                  image_name.replace(os.sep, '_'))
        print(image_url)

        # NOTE(review): Wikimedia may reject the default requests User-Agent;
        # consider sending a descriptive one -- confirm against their policy.
        try:
            # The `with` block returns the connection to the pool even when
            # the body is not fully consumed; the timeout keeps one stalled
            # transfer from hanging the whole run.
            with requests.get(image_url, stream=True, timeout=60) as response:
                if response.status_code == 200:
                    with open(image_path, 'wb') as image:
                        # Iterating the response directly yields 128-byte
                        # pieces; ask for larger chunks to cut write calls.
                        for chunk in response.iter_content(chunk_size=64 * 1024):
                            image.write(chunk)
                else:
                    print('Status Code :', response.status_code)
        except requests.RequestException as error:
            # Best effort: report the failure and continue with the next file
            # (a partially written file may remain on disk).
            print('Request failed :', error)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment