Skip to content

Instantly share code, notes, and snippets.

@komly
Created October 6, 2015 13:03
Show Gist options
  • Save komly/73ca4c7585260e63b301 to your computer and use it in GitHub Desktop.
Save komly/73ca4c7585260e63b301 to your computer and use it in GitHub Desktop.
nuts.py
#!/usr/bin/env python3
import re
import os
import time
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
def get_wikipedia_page(name):
    """Download the raw HTML of an English Wikipedia article.

    Args:
        name: Article title exactly as it appears after ``/wiki/`` in the
            URL. It is inserted into the URL verbatim, so it must already
            be URL-safe (percent-encoded where necessary).

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails (``HTTPError`` for non-success HTTP statuses).
    """
    url = 'https://en.wikipedia.org/wiki/%s' % name
    # A browser-like User-Agent is sent instead of urllib's default one
    # (presumably because the default is rejected -- confirm if changing).
    req = Request(url, headers={
        'user-agent': 'Mozilla/5.0 (Windows; Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46 Safari/536.5'
    })
    # Close the response deterministically: this function is called once per
    # crawled link, so leaving it to the garbage collector leaks sockets.
    with urlopen(req) as response:
        return response.read()
def get_page_content(html):
    """Parse an HTML document and return its ``<div id="content">`` element.

    Args:
        html: The page markup (``bytes`` or ``str``) to parse.

    Returns:
        The first ``div`` whose ``id`` is ``content``, or ``None`` when the
        document has no such element.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser library happens to be installed (and warns about it), so the
    # resulting tree could differ from machine to machine.
    page = BeautifulSoup(html, 'html.parser')
    return page.find('div', id='content')
def get_nuts_count(content):
    """Find every "nut..." token in a piece of text.

    A token is the letters ``nut`` (case-insensitive) followed by at least
    one word character or apostrophe, running up to the end of the word,
    e.g. ``nuts``, ``nutty``, ``Nutshell``. Because at least one trailing
    character is required, the bare word ``nut`` is not counted. The start
    of the token is not anchored, so ``nut`` may begin in the middle of a
    longer word.

    Args:
        content: The text to scan.

    Returns:
        A ``(count, matches)`` tuple where ``matches`` is the list of
        matched tokens in order of appearance and ``count`` is its length.
    """
    # The token must END at a word boundary (``\b``). The previous pattern
    # used ``\B`` (NOT a boundary), which rejected whole words such as
    # "nuts" and truncated longer ones ("nutty" -> "nutt").
    nuts = re.findall(r"nut[\w']+\b", content, re.IGNORECASE)
    return len(nuts), nuts
# --- Script body -----------------------------------------------------------
# 1. Fetch the seed article and write its vocabulary to words.txt.
# 2. Follow every article link on the seed page and report the "nut" tokens
#    found on each linked page.

# Seed article (the title was previously misspelled as 'Squirel').
html = get_wikipedia_page('Squirrel')
content = get_page_content(html)

# Every distinct run of ASCII letters in the seed article, one per line,
# sorted. Text mode translates '\n' to the platform line separator, so the
# file bytes match what writing os.linesep in binary mode produced.
words = set(re.findall(r'[a-zA-Z]+', content.text))
with open('words.txt', 'w', encoding='utf-8') as f:
    f.writelines(word + '\n' for word in sorted(words))

links = content.find_all('a', href=True)
seen = set()  # article titles already fetched, so each page is crawled once
for link in links:
    href = link['href']
    # Only site-relative article links; hrefs containing ':' are skipped
    # (presumably to exclude namespaced pages such as File: or Category:).
    if not href.startswith('/wiki/') or ':' in href:
        continue
    # Drop the '/wiki/' prefix and any '#section' fragment so that links to
    # different sections of one article map to the same title.
    name = href[len('/wiki/'):].split('#', 1)[0]
    if not name or name in seen:
        continue
    seen.add(name)
    page = get_wikipedia_page(name)
    # Use a separate name so the seed page's `content` element is not
    # overwritten by a plain string while iterating over its links.
    page_text = get_page_content(page).text
    count, nuts = get_nuts_count(page_text)
    print('Page: %s, nuts: count %d %s' % (name, count, ', '.join(nuts)))
    # Short pause between requests to avoid hammering the server.
    time.sleep(0.1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment