Created
February 28, 2013 07:26
-
-
Save akfreas/5054929 to your computer and use it in GitHub Desktop.
Grabs profile pictures of a person/place/thing from Wikipedia. This function checks all images associated with an article's title and compares those images against the images inside the infobox (typically the box in the top right-hand corner of an article), returning the Wikimedia URLs of the images inside the infobox.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def _wiki_query(http, **params):
    """Run one ``action=query`` request and return its ``query.pages`` dict.

    An empty dict is returned when the response carries no ``query`` or
    ``pages`` member.
    """
    params.update(format="json", action="query")
    # Handing the arguments over as ``params`` lets the HTTP client
    # percent-encode them, so titles containing "&", "#", "+" etc. survive.
    response = http.get("https://en.wikipedia.org/w/api.php", params=params)
    return response.json().get("query", {}).get("pages", {})


def _lead_section_text(pages):
    """Return the revision text found in *pages* as a single string."""
    chunks = [
        revision["*"]
        for page in pages.values()
        for revision in page.get("revisions", ())
        if "*" in revision
    ]
    if chunks:
        return "\n".join(chunks)
    # No revision text under the expected key: fall back to searching the
    # stringified payload rather than silently matching nothing.
    return str(pages)


def _thumbnail_url(info, image_size):
    """Return the URL of an ``image_size``-pixel-wide rendition of an image.

    Uses the ``thumburl`` supplied by the API when there is one; otherwise
    derives the thumbnail location from the full-size ``url``.
    """
    if info.get("thumburl"):
        return info["thumburl"]
    full_url = info["url"]
    if "/commons/" not in full_url:
        # Unknown storage layout: a guessed thumbnail path would be broken,
        # so hand back the full-size image instead.
        return full_url
    # The last path segment is already percent-encoded, unlike the page title.
    file_name = full_url.rsplit("/", 1)[1]
    thumb_dir = full_url.replace("/commons/", "/commons/thumb/", 1)
    return "%s/%dpx-%s" % (thumb_dir, image_size, file_name)


def figure_wikipedia_pic(figure_name, image_size, session=None):
    """Find the lead-section image of a Wikipedia article and return its URL.

    The images attached to the article are compared against the text of the
    article's section 0; an image counts as a match when its file name
    (the title without the namespace prefix) occurs in that text.

    Args:
        figure_name: Title of the English Wikipedia article.
        image_size: Desired thumbnail width in pixels (an integer).
        session: Optional object with a ``requests``-compatible
            ``get(url, params=...)`` method, e.g. a ``requests.Session``.
            The module-level ``requests`` is used when omitted.

    Returns:
        A list of ``{'url': ..., 'title': ...}`` dicts for the matched image,
        where ``title`` is the file name with spaces replaced by underscores.
        ``None`` is returned when the article has no images or none of them
        appears in the lead section.
    """
    http = requests if session is None else session

    article_pages = _wiki_query(http, titles=figure_name, prop="images")
    # Only the first returned page is considered; a missing article or one
    # without images has no "images" member.
    article = next(iter(article_pages.values()), {})
    candidates = article.get("images", [])
    if not candidates:
        return None

    lead_text = _lead_section_text(
        _wiki_query(
            http,
            titles=figure_name,
            prop="revisions",
            rvprop="content",
            rvsection=0,
        )
    )

    # NOTE: the loop does not stop at the first hit, so the LAST matching
    # image wins. This mirrors the function's long-standing behaviour.
    matched = None
    for candidate in candidates:
        file_name = candidate["title"].partition(":")[2]
        if file_name and file_name in lead_text:
            matched = candidate
    if matched is None:
        return None

    file_pages = _wiki_query(
        http,
        titles=matched["title"],
        prop="imageinfo",
        iiprop="url",
        iiurlwidth=image_size,
    )

    image_urls = []
    for page in file_pages.values():
        if "title" not in page:
            continue
        image_title = page["title"].partition(":")[2].replace(" ", "_")
        for info in page.get("imageinfo", ()):
            if "url" not in info and "thumburl" not in info:
                continue
            image_urls.append(
                {"url": _thumbnail_url(info, image_size), "title": image_title}
            )
    return image_urls
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment