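"""Search Wikipedia for a keyword and write the URLs of the top result
pages to a text file."""
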
from typing import List
from pprint import pprint
import argparse

import wikipedia

def search_for_results(search: str, limit: int) -> List[str]:
    """Return up to `limit` Wikipedia page titles matching `search`."""
    wikipedia.set_lang("en")
    return wikipedia.search(query=search, results=limit)


def extract_urls_of_results(search_result_list: List[str]) -> List[str]:
    """Resolve each result title to its page URL, keeping first-seen order."""
    result_urls_list = []
    for result in search_result_list:
        try:
            page = wikipedia.page(result, auto_suggest=False)
        except wikipedia.DisambiguationError as err:
            # Ambiguous title: fall back to the first disambiguation option.
            page = wikipedia.page(err.options[0])
        url_of_page = page.url
        # We can't use a set here because that would not preserve order.
        if url_of_page not in result_urls_list:
            result_urls_list.append(url_of_page)
    return result_urls_list


def add_until_complete(
    search: str, result_urls: List[str], limit: int, i: int
) -> List[str]:
    """Widen the search window until `limit` unique URLs are collected.

    Disambiguation fallbacks can produce duplicate URLs, leaving the list
    short of `limit`; fetch the results beyond the original window and
    recurse until the quota is met.
    """
    total_urls = len(result_urls)
    if total_urls >= limit:
        return result_urls
    remain = limit - total_urls
    search_results = search_for_results(search, limit + remain + i)
    remain_to_fetch = search_results[limit:limit + remain + i]
    remain_urls = extract_urls_of_results(remain_to_fetch)
    result_urls += [url for url in remain_urls if url not in result_urls]
    i += remain
    return add_until_complete(search, result_urls, limit, i)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Search Wikipedia: python wikipedia-search.py --search 'new york' --limit 50"
    )
    parser.add_argument(
        "--search", required=True, type=str, help="Keyword you want to search for."
    )
    parser.add_argument(
        "--limit", type=int, help="Limit the number of result pages.", default=50
    )
    args = parser.parse_args()
    search = args.search
    limit = args.limit

    search_result_list = search_for_results(search, limit)
    result_urls = extract_urls_of_results(search_result_list)
    urls = add_until_complete(search, result_urls, limit, i=0)
    with open(f"urls-{search.replace(' ', '-')}.txt", "w") as file_:
        pprint(urls, stream=file_)
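
# Example invocation (the output filename is derived from --search):
#   python wikipedia-search.py --search "new york" --limit 50
# This writes the collected URLs to urls-new-york.txt.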