@jamespenguin
Created April 10, 2015 06:03
The greatest wiki scraper ever made.
#!/usr/bin/env python
#
# Wikipedia Sentence Scraper
#
# This script requires the Wikipedia module to run (https://pypi.python.org/pypi/wikipedia/)
# If you have pip installed, just run
# pip install wikipedia
#
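# Usage (the filename is illustrative; use whatever you saved this gist as):
#   pip install wikipedia
#   python wiki_scraper.py
# Scraped sentences are written to the file named by output_file_name below.
#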
import re
import threading
import time
import sys
import wikipedia
###
# Global Config
###
start_page_names = [
    "Engaged to the Unidentified"
]
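# Example (illustrative titles): seed the crawl from several pages at once:
# start_page_names = [
#     "Engaged to the Unidentified",
#     "Anime",
# ]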
target_sentence_count = 100000
output_file_name = "sentences.txt"
#!# Don't touch anything beyond this point #!#
sentences = []
thread_limit = 25
processed_pages = []
show_status = False
#!# End Config #!#
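# Note: sentences and processed_pages are shared across scraper threads with no
# locking; CPython's GIL makes list.append safe enough here, though the
# check-then-append in scrape_page can occasionally double-scrape a page.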
# Hack method because python < 3 sucks at outputting unicode to console
def output(string):
    print string.encode('utf8', 'replace')
# Method that handles displaying run status
def output_status():
    global sentences, target_sentence_count, processed_pages, show_status
    show_status = True
    print "[+] Scraping sentences..."
    while show_status:
        # Build status line
        percent = (len(sentences) / float(target_sentence_count)) * 100
        line = "[+] %d / %d (%.2f%%) (%d pages)" % (len(sentences), target_sentence_count, percent, len(processed_pages))
        # Redraw the status line in place via carriage return
        sys.stdout.write("\r" + line.encode('utf8', 'replace'))
        sys.stdout.flush()
        time.sleep(0.03)
    # Display one last time so we get it to 100%
    percent = (len(sentences) / float(target_sentence_count)) * 100
    line = "[+] %d / %d (%.2f%%) (%d pages)" % (len(sentences), target_sentence_count, percent, len(processed_pages))
    # Output final status line
    output("\r" + line)
# Method that handles scraping a page for sentences and spawning other scrapers
def scrape_page(page_name, use_threading=False):
    # NOTE: use_threading is accepted (and passed by the main block) but never used
    global sentences, target_sentence_count, processed_pages, thread_limit
    # Bail out if we have enough sentences or we've already done this page
    if len(sentences) >= target_sentence_count or page_name in processed_pages:
        return
    # Add page name to processed pages list so we don't rescrape it
    processed_pages.append(page_name)
    # Get page; the wikipedia module raises exceptions for missing or
    # ambiguous titles, so skip any page that fails to load
    try:
        page = wikipedia.page(page_name)
    except Exception:
        return
    # Process sentences out of page content
    content = page.content
    # Strip section headers (e.g. "== History ==", "=== Subsection ===") out of page content
    content = re.sub(r'====[^=]+====', '', content)
    content = re.sub(r'===[^=]+===', '', content)
    content = re.sub(r'==[^=]+==', '', content)
    content = re.sub('\n [^=]+==', '', content)
    # Break page content up into workable lines and parse those for sentences
    lines = map(lambda x: x.strip(), content.split("\n"))
    lines = filter(lambda x: x, lines)
    ending_punctuation = ["?", ".", "!"]
    for line in lines:
        sentence = ""
        for char in line:
            if len(sentences) >= target_sentence_count:
                break
            sentence += char
            if char in ending_punctuation:
                sentences.append(sentence.strip())
                sentence = ""
    # Launch scraper threads for all linked pages
    for link_title in page.links:
        if len(sentences) >= target_sentence_count:
            break
        if threading.activeCount() >= thread_limit:
            # Already at the thread cap, so scrape this link synchronously instead
            scrape_page(link_title)
        else:
            threading.Thread(target=scrape_page, args=(link_title, )).start()
if __name__ == '__main__':
    # Verify our config is set up properly
    if not start_page_names:
        print "ERROR: You must supply at least one start page name to start scraping sentences."
        raw_input("Press enter to exit...")
        exit()
    elif target_sentence_count < 1:
        print "ERROR: You must configure the script to scrape at least 1 sentence."
        raw_input("Press enter to exit...")
        exit()
    # Start output status display thread
    threading.Thread(target=output_status, args=()).start()
    # Start spidering and scraping start pages
    for page_name in start_page_names:
        while threading.activeCount() >= thread_limit:
            time.sleep(0.03)
        threading.Thread(target=scrape_page, args=(page_name, True)).start()
    # Hold tight and wait for all scraper threads to wrap up
    # (the main thread plus the status thread account for 2)
    while threading.activeCount() > 2:
        time.sleep(0.03)
    show_status = False
    # Wait for the status thread to exit as well
    while threading.activeCount() > 1:
        time.sleep(0.03)
    # Output sentences to file
    print "[+] Writing sentences to output file..."
    if len(sentences) < target_sentence_count:
        print "[+] WARNING: Unable to scrape enough sentences to reach the target count."
    f = open(output_file_name, 'wb')
    f.write(u"\n".join(sentences).encode('utf8'))
    f.close()
    # Wrap it up
    print "[+] All done!"
    raw_input("[+] Press enter to exit...")