The greatest wiki scraper ever made.
#!/usr/bin/env python
#
# Wikipedia Sentence Scraper
#
# This script requires the wikipedia module to run (https://pypi.python.org/pypi/wikipedia/)
# If you have pip installed, just run
#   pip install wikipedia
#
import re
import threading
import time
import sys
import wikipedia
###
# Global Config
###
start_page_names = [
    "Engaged to the Unidentified"
]
target_sentence_count = 100000
output_file_name = "sentences.txt"

#!# Don't touch anything beyond this point #!#
sentences = []
thread_limit = 25
processed_pages = []
show_status = False
#!# End Config #!#
# Hack method because python < 3 sucks at outputting unicode to the console
def output(string):
    print string.encode('utf8', 'replace')
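# e.g. on a console (or pipe) whose encoding is ASCII, print u"caf\u00e9" can
# raise UnicodeEncodeError, while output(u"caf\u00e9") writes UTF-8 bytes instead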
# Method that handles displaying run status
def output_status():
    global sentences, target_sentence_count, processed_pages, show_status
    show_status = True
    print "[+] Scraping sentences..."
    while show_status:
        # Build status line
        percent = (len(sentences) / float(target_sentence_count)) * 100
        line = "[+] %d / %d (%.2f%%) (%d pages)" % (len(sentences), target_sentence_count, percent, len(processed_pages))
        # Output status line
        sys.stdout.write("\r" + line.encode('utf8', 'replace'))
        sys.stdout.flush()
        time.sleep(0.03)

    # Display one last time so we get it to 100%
    percent = (len(sentences) / float(target_sentence_count)) * 100
    line = "[+] %d / %d (%.2f%%) (%d pages)" % (len(sentences), target_sentence_count, percent, len(processed_pages))
    # Output status line
    output("\r" + line)
# Method that handles scraping a page for sentences and spawning other scrapers
def scrape_page(page_name):
    global sentences, target_sentence_count, processed_pages, thread_limit

    # Bail out if we have enough sentences or we've already done this page
    if len(sentences) >= target_sentence_count or page_name in processed_pages:
        return

    # Add page name to processed pages array so we don't rescrape it
    processed_pages.append(page_name)

    # Get page, skipping anything that errors out (disambiguation prompts, missing pages, etc.)
    try:
        page = wikipedia.page(page_name)
    except Exception:
        return
    # Process sentences out of page content
    content = page.content

    # Strip sub-headers and headers out of page content
    content = re.sub(r'====[^=]+====', '', content)
    content = re.sub(r'===[^=]+===', '', content)
    content = re.sub(r'==[^=]+==', '', content)
    content = re.sub(r'\n [^=]+==', '', content)
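    # page.content renders section headers in plain text, e.g. "== History ==" or
    # "=== Early life ===", which is what the patterns above strip out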
    # Break page content up into workable lines and parse those for sentences
    lines = map(lambda x: x.strip(), content.split("\n"))
    lines = filter(lambda x: x, lines)
    ending_punctuation = ["?", ".", "!"]
    for line in lines:
        sentence = ""
        for char in line:
            if len(sentences) >= target_sentence_count:
                break
            sentence += char
            if char in ending_punctuation:
                sentences.append(sentence.strip())
                sentence = ""
    # Spider out into all linked pages: spawn a scraper thread per link while
    # under the thread limit, otherwise scrape the link synchronously
    for link_title in page.links:
        if len(sentences) >= target_sentence_count:
            break
        if threading.activeCount() >= thread_limit:
            scrape_page(link_title)
        else:
            threading.Thread(target=scrape_page, args=(link_title, )).start()
if __name__ == '__main__':
    # Verify our config is set up properly
    if not start_page_names:
        print "ERROR: You must supply at least one start page name to start scraping sentences."
        raw_input("Press enter to exit...")
        sys.exit()
    elif target_sentence_count < 1:
        print "ERROR: You must configure the script to scrape at least 1 sentence."
        raw_input("Press enter to exit...")
        sys.exit()

    # Start output status display thread
    threading.Thread(target=output_status, args=()).start()

    # Start spidering and scraping start pages
    for page_name in start_page_names:
        while threading.activeCount() >= thread_limit:
            time.sleep(0.03)
        threading.Thread(target=scrape_page, args=(page_name, )).start()

    # Hold tight and wait for the scraper threads to wrap up
    # (the main thread and the status thread are the two that remain)
    while threading.activeCount() > 2:
        time.sleep(0.03)
    show_status = False
    while threading.activeCount() > 1:
        time.sleep(0.03)

    # Output sentences to file
    print "[+] Writing sentences to output file..."
    if len(sentences) < target_sentence_count:
        print "[+] WARNING: Was unable to scrape enough sentences to reach the target number."
    f = open(output_file_name, 'wb')
    f.write(u"\n".join(sentences).encode('utf8'))
    f.close()

    # Wrap it up
    print "[+] All done!"
    raw_input("[+] Press enter to exit...")