wikipedia-search.py (@yzdann, created June 3, 2020 13:29)
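A small command-line script that searches Wikipedia for a keyword and writes the unique URLs of the matching pages to a text file. Because disambiguation fallbacks can map several result titles onto the same page, a single pass may come up short of the requested limit, so the script keeps fetching extra results until it has enough unique URLs or Wikipedia runs out of results.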
from typing import List
from pprint import pprint
import argparse

import wikipedia


def search_for_results(search: str, limit: int) -> List[str]:
    """Return up to `limit` Wikipedia page titles matching the search term."""
    wikipedia.set_lang("en")
    return wikipedia.search(query=search, results=limit)


def extract_urls_of_results(search_result_list: List[str]) -> List[str]:
    """Resolve each result title to its page URL, keeping order and dropping duplicates."""
    result_urls_list = []
    for result in search_result_list:
        try:
            page = wikipedia.page(result, auto_suggest=False)
        except wikipedia.DisambiguationError as err:
            # Fall back to the first option the disambiguation page offers.
            page = wikipedia.page(err.options[0])
        url_of_page = page.url
        # A set would deduplicate too, but it would not preserve result order.
        if url_of_page not in result_urls_list:
            result_urls_list.append(url_of_page)
    return result_urls_list


def add_until_complete(
    search: str, result_urls: List[str], limit: int, i: int
) -> List[str]:
    """Top up `result_urls` until it holds `limit` unique URLs.

    Disambiguation fallbacks can collapse several titles onto one page, so
    the first pass may come up short; `i` tracks how many extra results have
    already been consumed by previous passes.
    """
    total_urls = len(result_urls)
    if total_urls >= limit:
        return result_urls
    remain = limit - total_urls
    search_results = search_for_results(search, limit + remain + i)
    # Slice off only the results that have not been processed yet.
    remain_to_fetch = search_results[limit + i:limit + remain + i]
    if not remain_to_fetch:
        # Wikipedia returned no further results; stop instead of recursing forever.
        return result_urls
    remain_urls = extract_urls_of_results(remain_to_fetch)
    result_urls += [url for url in remain_urls if url not in result_urls]
    i += remain
    return add_until_complete(search, result_urls, limit, i)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Search Wikipedia and save result page URLs, "
        "e.g.: python wikipedia-search.py --search 'new york' --limit 50"
    )
    parser.add_argument(
        "--search", required=True, type=str, help="Keyword you want to search."
    )
    parser.add_argument(
        "--limit", type=int, help="Limit the number of result pages.", default=50
    )
    args = parser.parse_args()
    search = args.search
    limit = args.limit

    search_result_list = search_for_results(search, limit)
    result_urls = extract_urls_of_results(search_result_list)
    urls = add_until_complete(search, result_urls, limit, i=0)
    with open(f"urls-{search.replace(' ', '-')}.txt", "w") as file_:
        pprint(urls, stream=file_)
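Usage, taken from the script's own argparse string (the script depends on the third-party `wikipedia` package, installable with `pip install wikipedia`):

    python wikipedia-search.py --search 'new york' --limit 50

This writes the collected URLs to urls-new-york.txt in the working directory.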