The greatest wiki scraper ever made.
#!/usr/bin/env python
#
# Wikipedia Sentence Scraper
#
# This script requires the wikipedia module to run (https://pypi.python.org/pypi/wikipedia/)
# If you have pip installed, just run
#   pip install wikipedia
#
import re
import threading
import time
import sys
import wikipedia
###
# Global Config
###
start_page_names = [
    "Engaged to the Unidentified"
]
target_sentence_count = 100000
output_file_name = "sentences.txt"

#!# Don't touch anything beyond this point #!#
sentences = []
thread_limit = 25
processed_pages = []
show_status = False
#!# End Config #!#
# Hack method because python < 3 sucks at outputting unicode to the console
def output(string):
    print string.encode('utf8', 'replace')
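# e.g. on a console (or pipe) whose encoding is ASCII, print u"caf\u00e9" can
# raise UnicodeEncodeError, while output(u"caf\u00e9") writes UTF-8 bytes instead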
# Method that handles displaying run status
def output_status():
    global sentences, target_sentence_count, processed_pages, show_status
    show_status = True
    print "[+] Scraping sentences..."
    while show_status:
        # Build status line
        percent = (len(sentences) / float(target_sentence_count)) * 100
        line = "[+] %d / %d (%.2f%%) (%d pages)" % (len(sentences), target_sentence_count, percent, len(processed_pages))
        # Output status line
        sys.stdout.write("\r" + line.encode('utf8', 'replace'))
        sys.stdout.flush()
        time.sleep(0.03)

    # Display one last time so we get it to 100%
    percent = (len(sentences) / float(target_sentence_count)) * 100
    line = "[+] %d / %d (%.2f%%) (%d pages)" % (len(sentences), target_sentence_count, percent, len(processed_pages))
    # Output status line
    output("\r" + line)
# Method that handles scraping a page for sentences and spawning other scrapers
def scrape_page(page_name):
    global sentences, target_sentence_count, processed_pages, thread_limit

    # Bail out if we have enough sentences or we've already done this page
    if len(sentences) >= target_sentence_count or page_name in processed_pages:
        return

    # Add page name to processed pages array so we don't rescrape it
    processed_pages.append(page_name)

    # Get page, skipping anything that errors out (disambiguation prompts, missing pages, etc.)
    try:
        page = wikipedia.page(page_name)
    except Exception:
        return
    # Process sentences out of page content
    content = page.content

    # Strip sub-headers and headers out of page content
    content = re.sub(r'====[^=]+====', '', content)
    content = re.sub(r'===[^=]+===', '', content)
    content = re.sub(r'==[^=]+==', '', content)
    content = re.sub(r'\n [^=]+==', '', content)
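    # page.content renders section headers in plain text, e.g. "== History ==" or
    # "=== Early life ===", which is what the patterns above strip out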
    # Break page content up into workable lines and parse those for sentences
    lines = map(lambda x: x.strip(), content.split("\n"))
    lines = filter(lambda x: x, lines)
    ending_punctuation = ["?", ".", "!"]
    for line in lines:
        sentence = ""
        for char in line:
            if len(sentences) >= target_sentence_count:
                break
            sentence += char
            if char in ending_punctuation:
                sentences.append(sentence.strip())
                sentence = ""
    # Spider out into all linked pages: spawn a scraper thread per link while
    # under the thread limit, otherwise scrape the link synchronously
    for link_title in page.links:
        if len(sentences) >= target_sentence_count:
            break
        if threading.activeCount() >= thread_limit:
            scrape_page(link_title)
        else:
            threading.Thread(target=scrape_page, args=(link_title, )).start()
if __name__ == '__main__':
    # Verify our config is set up properly
    if not start_page_names:
        print "ERROR: You must supply at least one start page name to start scraping sentences."
        raw_input("Press enter to exit...")
        sys.exit()
    elif target_sentence_count < 1:
        print "ERROR: You must configure the script to scrape at least 1 sentence."
        raw_input("Press enter to exit...")
        sys.exit()

    # Start output status display thread
    threading.Thread(target=output_status, args=()).start()

    # Start spidering and scraping start pages
    for page_name in start_page_names:
        while threading.activeCount() >= thread_limit:
            time.sleep(0.03)
        threading.Thread(target=scrape_page, args=(page_name, )).start()

    # Hold tight and wait for the scraper threads to wrap up
    # (the main thread and the status thread are the two that remain)
    while threading.activeCount() > 2:
        time.sleep(0.03)
    show_status = False
    while threading.activeCount() > 1:
        time.sleep(0.03)

    # Output sentences to file
    print "[+] Writing sentences to output file..."
    if len(sentences) < target_sentence_count:
        print "[+] WARNING: Was unable to scrape enough sentences to reach the target number."
    f = open(output_file_name, 'wb')
    f.write(u"\n".join(sentences).encode('utf8'))
    f.close()

    # Wrap it up
    print "[+] All done!"
    raw_input("[+] Press enter to exit...")