komly · October 6, 2015 13:03
diff --git a/nuts.py b/nuts.py
 #!/usr/bin/env python3

 import re
 import os
 import time
 from bs4 import BeautifulSoup
 from urllib.request import urlopen, Request

 def get_wikipedia_page(name):
     """Download the raw HTML of an English Wikipedia article.

     name: the part of the URL that follows ``/wiki/``.  It is inserted
         verbatim, so it must already be in URL form (underscores instead
         of spaces, percent-encoding as found in a link's ``href``).

     Returns the response body as ``bytes``.  Errors raised by ``urlopen``
     (for example for a page that does not exist) propagate to the caller.
     """
     url = 'https://en.wikipedia.org/wiki/%s' % name
     # Send a browser-like user agent rather than urllib's default one.
     req = Request(url, headers={
         'user-agent': 'Mozilla/5.0 (Windows; Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46 Safari/536.5'
     })
     # Close the HTTP response deterministically instead of leaving the
     # connection open until the response object is garbage-collected.
     with urlopen(req) as response:
         return response.read()

 def get_page_content(html):
     """Parse *html* and return its ``<div id="content">`` element.

     html: the page markup, as ``str`` or ``bytes``.

     Returns the first ``div`` whose ``id`` is ``content``, or ``None`` when
     the document has no such element; callers must check before using it.
     """
     # Name the parser explicitly: without it BeautifulSoup picks whichever
     # parser happens to be installed (and warns about it), so the parsed
     # tree could differ between machines.  'html.parser' ships with Python.
     page = BeautifulSoup(html, 'html.parser')
     return page.find('div', id='content')

 def get_nuts_count(content):
     """Find the words in *content* that start with "nut", ignoring case.

     content: the text to search, as ``str``.

     Returns a ``(count, words)`` tuple: ``words`` lists every match in
     order of appearance (duplicates kept) and ``count`` is its length.
     """
     # \b anchors both ends on a word boundary, so whole words are returned
     # ("nut", "nuts", "nut's", "nutty") and "nut" buried inside another
     # word ("minutes") is ignored.  A trailing \B would do the opposite:
     # reject "nuts" outright and cut "nutty" down to "nutt".
     nuts = re.findall(r"\bnut[\w']*\b", content, re.IGNORECASE)
     return len(nuts), nuts


 # Step 1: download the seed article and save its vocabulary to words.txt.
 html = get_wikipedia_page('Squirrel')
 content = get_page_content(html)
 if content is None:
     # Without the content element there is neither text nor links to use.
     raise SystemExit('Squirrel: no <div id="content"> found in the page')
 words = set(re.findall(r'[a-zA-Z]+', content.text))
 # Text mode writes each '\n' as the platform's line separator.
 with open('words.txt', 'w', encoding='utf-8') as f:
     f.writelines(word + '\n' for word in sorted(words))


 # Step 2: visit every article linked from the seed page and report the
 # "nut" words found on each of them.
 links = content.find_all('a', href=True)
 seen = set()
 for link in links:
     href = link['href']
     # Follow only site-relative /wiki/ links whose href has no ':'
     # (this skips absolute URLs and 'Prefix:Title' style pages).
     if not href.startswith('/wiki/') or ':' in href:
         continue
     name = href.split('/wiki/')[1]
     # The same target is usually linked many times; download it once.
     if name in seen:
         continue
     seen.add(name)
     # A separate name keeps the seed page's `content` intact.
     page_content = get_page_content(get_wikipedia_page(name))
     if page_content is None:
         print('Page: %s, no content found' % name)
     else:
         count, nuts = get_nuts_count(page_content.text)
         print('Page: %s, nuts: count %d %s' % (name, count, ', '.join(nuts)))
     # Short pause between requests to go easy on the server.
     time.sleep(0.1)
	#!/usr/bin/env python3

	import re
	import os
	import time
	from bs4 import BeautifulSoup
	from urllib.request import urlopen, Request

	def get_wikipedia_page(name):
	    """Download the raw HTML of an English Wikipedia article.

	    name: the part of the URL that follows ``/wiki/``.  It is inserted
	        verbatim, so it must already be in URL form (underscores instead
	        of spaces, percent-encoding as found in a link's ``href``).

	    Returns the response body as ``bytes``.  Errors raised by ``urlopen``
	    (for example for a page that does not exist) propagate to the caller.
	    """
	    url = 'https://en.wikipedia.org/wiki/%s' % name
	    # Send a browser-like user agent rather than urllib's default one.
	    req = Request(url, headers={
	        'user-agent': 'Mozilla/5.0 (Windows; Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46 Safari/536.5'
	    })
	    # Close the HTTP response deterministically instead of leaving the
	    # connection open until the response object is garbage-collected.
	    with urlopen(req) as response:
	        return response.read()

	def get_page_content(html):
	    """Parse *html* and return its ``<div id="content">`` element.

	    html: the page markup, as ``str`` or ``bytes``.

	    Returns the first ``div`` whose ``id`` is ``content``, or ``None`` when
	    the document has no such element; callers must check before using it.
	    """
	    # Name the parser explicitly: without it BeautifulSoup picks whichever
	    # parser happens to be installed (and warns about it), so the parsed
	    # tree could differ between machines.  'html.parser' ships with Python.
	    page = BeautifulSoup(html, 'html.parser')
	    return page.find('div', id='content')

	def get_nuts_count(content):
	    """Find the words in *content* that start with "nut", ignoring case.

	    content: the text to search, as ``str``.

	    Returns a ``(count, words)`` tuple: ``words`` lists every match in
	    order of appearance (duplicates kept) and ``count`` is its length.
	    """
	    # \b anchors both ends on a word boundary, so whole words are returned
	    # ("nut", "nuts", "nut's", "nutty") and "nut" buried inside another
	    # word ("minutes") is ignored.  A trailing \B would do the opposite:
	    # reject "nuts" outright and cut "nutty" down to "nutt".
	    nuts = re.findall(r"\bnut[\w']*\b", content, re.IGNORECASE)
	    return len(nuts), nuts


	# Step 1: download the seed article and save its vocabulary to words.txt.
	html = get_wikipedia_page('Squirrel')
	content = get_page_content(html)
	if content is None:
	    # Without the content element there is neither text nor links to use.
	    raise SystemExit('Squirrel: no <div id="content"> found in the page')
	words = set(re.findall(r'[a-zA-Z]+', content.text))
	# Text mode writes each '\n' as the platform's line separator.
	with open('words.txt', 'w', encoding='utf-8') as f:
	    f.writelines(word + '\n' for word in sorted(words))


	# Step 2: visit every article linked from the seed page and report the
	# "nut" words found on each of them.
	links = content.find_all('a', href=True)
	seen = set()
	for link in links:
	    href = link['href']
	    # Follow only site-relative /wiki/ links whose href has no ':'
	    # (this skips absolute URLs and 'Prefix:Title' style pages).
	    if not href.startswith('/wiki/') or ':' in href:
	        continue
	    name = href.split('/wiki/')[1]
	    # The same target is usually linked many times; download it once.
	    if name in seen:
	        continue
	    seen.add(name)
	    # A separate name keeps the seed page's `content` intact.
	    page_content = get_page_content(get_wikipedia_page(name))
	    if page_content is None:
	        print('Page: %s, no content found' % name)
	    else:
	        count, nuts = get_nuts_count(page_content.text)
	        print('Page: %s, nuts: count %d %s' % (name, count, ', '.join(nuts)))
	    # Short pause between requests to go easy on the server.
	    time.sleep(0.1)