Last active
February 9, 2016 07:36
-
-
Save mikeboers/8277319 to your computer and use it in GitHub Desktop.
Solving a wordsearch by mining Wikipedia.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
LAHLERIRAHLEDDIKWKT | |
CNANOCIESRRAUNAFAOG | |
WKRGGVREGAASNSESLRB | |
TETEUAEGNLWOGNTKDET | |
IUHLUHTCITDSERIAAAA | |
RECKSUPERHEROEWSRRC | |
EATCTVNCEANRNETGTKK | |
NBAOAENNHRFAPMAKHEY | |
IKGPREDATORTARRLVRR | |
LSYSKCRVFHNSYYIAATU | |
BKGYERFFOJTEACASDRB | |
OTANDROIDENLYTKEEAD | |
GHXENOMORPHTDYBRRTA | |
TYOGSOTHOTHTWRIBUSR | |
YTOBORRRLSTARWARSLB | |
VAJLBRRULILBORIGKLL | |
TDRAZIWSAKTDIREWOLF | |
AROFWOTLESRAASEMINA | |
FGILEANRLFNATACOREI |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
# Command line: one puzzle file, plus any number of -w/--words word lists.
parser = argparse.ArgumentParser()
parser.add_argument(
    '-w', '--words', action='append',
    help='word list file, one word per line; may be given several times '
         '(default: /usr/share/dict/words)')
parser.add_argument('puzzle', help='text file containing the letter grid')
args = parser.parse_args()

# Every whitespace-separated token of the puzzle file becomes one row of
# single-character cells.  The context manager closes the file right away
# instead of leaving the handle to the garbage collector.
with open(args.puzzle) as puzzle_file:
    puzzle = [list(row) for row in puzzle_file.read().split()]

# Fail with a readable message rather than an IndexError further down.
if not puzzle:
    parser.error('puzzle file %r contains no letters' % args.puzzle)

rows = len(puzzle)
cols = len(puzzle[0])

# The scan indexes puzzle[y][x] for every x < cols, so all rows must be as
# wide as the first one.
if any(len(row) != cols for row in puzzle):
    parser.error('puzzle file %r has rows of differing lengths' % args.puzzle)

# Single-argument print(...) emits the same text under Python 2 and 3.
print('Puzzle is %d by %d' % (rows, cols))
# Trie of known words.  Each node is a dict mapping a letter to its child
# node; a node that completes a word additionally carries the FIN key, whose
# value counts how many times that word was added.
dictionary = {}
FIN = '*'


def add_word_to_dictionary(word, node=dictionary):
    """Insert *word* into the trie rooted at *node*.

    Args:
        word: Sequence of characters to insert.  An empty word is ignored.
        node: Trie node (a dict) to insert below.  The default is
            deliberately the shared module-level ``dictionary``, so the
            mutable default argument is intentional here.

    The descent is iterative, so very long words neither hit the recursion
    limit nor pay for a fresh slice of the remaining letters at every step.
    """
    if not word:
        return
    for char in word:
        node = node.setdefault(char, {})
    # Mark the end of the word and count repeated insertions.
    node[FIN] = node.get(FIN, 0) + 1
# Load every non-blank line of each word list into the trie, upper-cased.
# Falls back to the system word list when no -w/--words option was given.
word_count = 0
for path in args.words or ['/usr/share/dict/words']:
    print('Loading %s' % path)
    # The context manager closes each word list as soon as it has been read.
    with open(path) as word_file:
        for line in word_file:
            line = line.strip()
            if line:
                add_word_to_dictionary(line.upper())
                word_count += 1
print('%d words' % word_count)
# The eight compass steps as (label, dx, dy), clockwise starting at north.
# dy is applied to the row index, so "N" (dy = -1) moves towards the first
# row of the puzzle file.
directions = [
    ('N', 0, -1), ('NE', 1, -1),
    ('E', 1, 0), ('SE', 1, 1),
    ('S', 0, 1), ('SW', -1, 1),
    ('W', -1, 0), ('NW', -1, -1),
]
# Matches shorter than this many letters are found but not reported.
MIN_REPORTED_LENGTH = 4

# From every cell, walk in each of the eight directions while the letters
# read so far are still a prefix of some dictionary word, collecting every
# complete word passed on the way.
for sx in range(cols):
    for sy in range(rows):
        for direction, dx, dy in directions:
            found = []
            node = dictionary
            x, y = sx, sy
            letters = ''
            # Stop at the grid edge or once no dictionary word continues
            # with the letters read so far (node becomes None).
            while node and 0 <= x < cols and 0 <= y < rows:
                char = puzzle[y][x]
                letters += char
                node = node.get(char)
                if node and FIN in node:
                    found.append(letters)
                x += dx
                y += dy
            # Report the longest match first.
            for match in sorted(found, key=len, reverse=True):
                if len(match) >= MIN_REPORTED_LENGTH:
                    print('* %2d,%2d %2s: %s' % (sx, sy, direction, match))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import json | |
import os | |
import re | |
import requests | |
# MediaWiki API endpoint.  HTTPS is used so the request is not sent in
# plaintext and does not depend on being redirected from http://.
base_url = 'https://en.wikipedia.org/w/api.php'

# Query parameters shared by every request: fetch the content of the latest
# revision as JSON.  'titles' is a placeholder that get_content() overrides
# on a copy of this dict.
base_params = dict(
    format='json',
    action='query',
    titles='Main Page',
    prop='revisions',
    rvprop='content',
)
def get_content(title):
    """Return the wikitext of the Wikipedia page *title*.

    The raw API response is cached as ``wp_data/<title>.json``; the network
    is only used when that cache file does not exist yet.

    Args:
        title: Page title to look up.

    Returns:
        The content of the first revision of the first page in the response,
        or ``''`` when the response carries no such revision.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
    """
    cache_dir = 'wp_data'
    # "/" is legal in page titles (e.g. "AC/DC") but would be taken as a
    # directory separator, so it is escaped in the cache file name.
    path = os.path.join(cache_dir, title.replace('/', '%2F') + '.json')

    if not os.path.exists(path):
        params = base_params.copy()
        params['titles'] = title
        res = requests.get(base_url, params=params)
        # Never cache an error page as if it were the article.
        res.raise_for_status()
        if not os.path.isdir(cache_dir):
            os.makedirs(cache_dir)
        # Store the raw bytes so no text-encoding step can fail or differ
        # between interpreter versions.
        with open(path, 'wb') as fh:
            fh.write(res.content)

    with open(path, 'rb') as fh:
        data = json.loads(fh.read().decode('utf-8'))

    try:
        pages = data['query']['pages']
        # list() keeps the indexing valid where dict.values() is a view.
        return list(pages.values())[0]['revisions'][0]['*']
    except (KeyError, IndexError):
        # This only happens on "file" pages, which we don't care about.
        return ''
# Command line: one or more starting page titles, how many levels of links
# to follow from them, and an optional file to collect words in.
parser = argparse.ArgumentParser()
parser.add_argument('title', nargs='+',
                    help='title of a Wikipedia page to start from')
parser.add_argument('-d', '--depth', type=int, default=0,
                    help='levels of links to follow from each starting page '
                         '(default: 0, starting pages only)')
parser.add_argument('-o', '--output', nargs='?',
                    help='file to write each distinct lowercase word to, '
                         'one per line')
args = parser.parse_args()
# Captures the target of a [[wikilink]]: everything between "[[" and the
# first "|" or "]]".  Compiled once instead of on every walk() call.
_WIKILINK = re.compile(r'\[\[(.+?)(?:\||\]\])')


def walk(title, depth=0):
    """Yield ``(title, content)`` for *title* and the pages it links to.

    Args:
        title: Page title to start from.
        depth: How many levels of links to follow below *title*; with 0 (or
            less) only *title* itself is yielded.

    Yields:
        ``(title, content)`` tuples, the page itself first and then,
        depth-first, every page reached through its links.  A page linked
        more than once is yielded once per link.
    """
    content = get_content(title)
    yield title, content
    if depth <= 0:
        return
    for match in _WIKILINK.finditer(content):
        # "#" cannot be part of a page title; it starts a section anchor.
        # Dropping it avoids fetching and caching "Page#A" and "Page#B" as
        # if they were different pages.
        subtitle = match.group(1).split('#', 1)[0].strip()
        if not subtitle:
            # Same-page link such as [[#History]]: nothing new to fetch.
            continue
        for page in walk(subtitle, depth - 1):
            yield page
# Crawl from every starting title, reporting each visited page and, when an
# output file was requested, writing every lowercase word the first time it
# is seen.
output = open(args.output, 'w') if args.output else None
seen = set()
try:
    for starting_title in args.title:
        for title, content in walk(starting_title, args.depth):
            # Single-argument print(...) emits the same text under
            # Python 2 and 3.
            print('%s %d' % (title, len(content)))
            if not output:
                continue
            for word in re.findall(r'\b[a-z]+\b', content):
                if word not in seen:
                    seen.add(word)
                    output.write(word + '\n')
finally:
    # Close the word list even if the crawl is interrupted, so everything
    # collected so far is flushed to disk.
    if output:
        output.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment