import itertools
import string

import requests
import logging as log

from constants import Constants

# Scratch example: a raw MediaWiki API query against the One Piece wiki, kept for reference.
"""
S = requests.Session()
PARAMS = {
    "action": "query",
    "format": "json",
    "generator": "allpages",
    "gtitles": "Luffy",
    "list": "allimages"
}
url = 'https://onepiece.fandom.com/api.php'
print(requests.get(url=url, params=PARAMS).content)
"""


class Fetcher:
    def __init__(self, wiki_name):
        self.constants = Constants()
        wiki_site = 'https://{wiki_name}.fandom.com'.format(wiki_name=wiki_name)
        # Wikia search endpoint: best-matching articles for a query string.
        self._querystartlink = wiki_site + '/api/v1/Search/List?query='
        self._queryendlink = '&limit=1&minArticleQuality=10&batch=1&namespaces=0%2C14'
        # imageserving extension: a representative image for a given page id.
        self._imagestartlink = wiki_site + '/api.php?format=json&action=imageserving&wisId='
        # Simplified-JSON article rendering, used for summaries.
        self._summarystartlink = wiki_site + "/api/v1/Articles/AsSimpleJson?id="
        # self._endlink = self._startlink + 'query&'
        # '&prop=info&inprop=url&generator=allpages&gapfromSS='

    def get_wiki_pages(self, names):
        """Fetch the best search hit for each name; names with no hit are skipped."""
        pages = []
        for name in names:
            try:
                pages.append(self.__fetch_page(name))
            except (KeyError, IndexError):
                pass  # no search result for this name
        log.info("Input names: " + ",".join(names))
        return pages

    def cleanName(self, name):
        """Ignore all special characters, numbers, whitespace and case."""
        return ''.join(c for c in name.lower() if c in string.ascii_lowercase)
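
    # Illustrative examples (names here are only samples): the cleaner reduces
    # a display title to a bare lowercase key, so differently punctuated or
    # cased spellings of the same name compare equal.
    # cleanName("Monkey D. Luffy") -> "monkeydluffy"
    # cleanName("Roronoa Zoro!!")  -> "roronoazoro"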

    def __get_correct_page(self, checked_name, all_pages):
        # Picks the page whose cleaned title exactly matches the cleaned
        # query; falls back to the first page returned.
        first_page = None
        log_string = ""
        clean_name = self.cleanName(checked_name)
        # Checks for any direct hits.
        # difflib.get_close_matches[0]
        for nr, page in enumerate(all_pages.values()):
            title = page['title']
            title_clean = self.cleanName(title)
            log_string += title + ","
            if title_clean == clean_name:
                log.info("Found direct match, page nr {}: {}".format(nr + 1, clean_name))
                first_page = page
                break
        # Get the first title containing the name:
        # if not first_page:
        #     pages = all_pages.values()
        #     pages_containing = [page for page in pages if checked_name in page['title'].lower()]
        #     if pages_containing:
        #         first_page = pages_containing[0]
        # Gets first entry
        if not first_page:
            first_page = next(iter(all_pages.values()))
        log.info("Input name: {}\nParsed titles were: {}.\nResult title was: {}".format(
            checked_name, log_string[:-1], first_page["title"]))
        return first_page

    def __fetch_page(self, name):
        # Returns the translated name, or the same name if no alias exists.
        checked_name = self.constants.translateAlt(name.lower())
        # First search hit for the name; _queryendlink caps the result at one
        # article of reasonable quality.
        fetch_json = requests.get(
            self._querystartlink + checked_name.title() + self._queryendlink).json()
        # 'Use "gapfilterredir=nonredirects" option instead of "redirects"
        # when using allpages as a generator'  # gaplimit=1
        # Gets the first page:
        # all_pages = fetch_json['query']['pages']
        # first_page = self.__get_correct_page(checked_name, all_pages)
        first_page = fetch_json["items"][0]
        return first_page
        # ASSUMES THE FIRST HIT IS CORRECT - MIGHT BE A REDIRECTION LINK!

    def check_title(self):
        pass

    def fetch_image_url(self, page_id):
        image_json = requests.get(self._imagestartlink + str(page_id)).json()
        try:
            image_url_dirty = image_json["image"]["imageserving"]
            # Drop the "/revision/..." suffix to get a stable image URL.
            image_url = image_url_dirty.split("/revision/")[0]
            return image_url
        except KeyError:
            log.info("Couldn't parse image url")
            return ""

    def fetch_summary(self, page_id):
        fetch_json = requests.get(self._summarystartlink + str(page_id)).json()
        # First paragraph of the article's first section.
        return fetch_json["sections"][0]["content"][0]["text"]
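

# Usage sketch, not exercised here: it needs network access, a real Fandom
# wiki name, and the local `constants` module providing Constants.translateAlt().
# The "id" field on the returned search item is an assumption about the
# Search/List response shape.
"""
fetcher = Fetcher("onepiece")
page = fetcher.get_wiki_pages(["luffy"])[0]
print(page["title"])
print(fetcher.fetch_image_url(page["id"]))
print(fetcher.fetch_summary(page["id"]))
"""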


class SpellChecker:
    """Find and fix simple spelling errors.

    Based on Peter Norvig's spell corrector:
    http://norvig.com/spell-correct.html
    """

    def __init__(self, names):
        self.model = set(names)

    def __known(self, words):
        # Returns the first candidate that is a known name, else None.
        for w in words:
            if w in self.model:
                return w
        return None

    def __edits(self, word):
        # Every candidate one edit away: deletions, adjacent transpositions,
        # single-letter replacements and insertions.
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = (a + b[1:] for a, b in splits if b)
        transposes = (a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1)
        replaces = (a + c + b[1:] for a, b in splits for c in string.ascii_lowercase if b)
        inserts = (a + c + b for a, b in splits for c in string.ascii_lowercase)
        return itertools.chain(deletes, transposes, replaces, inserts)

    def correct(self, word):
        """Returns the input word, or a fixed version if one is found."""
        return self.__known([word]) or self.__known(self.__edits(word)) or word


# Edit distance 2, from the original Norvig post:
"""
def known_edits2(word):
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)
"""

# Test scratch:
# print(requests.get(startlink + 'generator=allpages&gapfrom=Luffy&prop=info').content)  # prop=info&inprop=url
# image_json = requests.get(startlink + 'generator=allpages&gapfrom=Luffy&prop=images').json()
# print(image_json)
# test_output = image_json['query-continue']['']
# All images from the Monkey D. Luffy page:
# print(requests.get('https://onepiece.fandom.com/api.php?format=json&action=query&generator=images&titles=Monkey_D._Luffy&prop=imageinfo').content)