Created
December 10, 2021 20:04
-
-
Save msullivan/17a6abba8281e5610e189db9d82b925c to your computer and use it in GitHub Desktop.
Scrape an advent of code problem description for inputs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
"""Script that tries to scrape all potential test inputs. | |
By default it writes all of them to files in the directory | |
<day>.tests. It also prints all of the contents out along | |
with the file names so that you can quickly inspect and | |
determine which you want to use. | |
Depends on advent-of-code-data 1.1.0 (later versions may | |
also work, but this depends on internals). | |
Also depends on beautifulsoup, but that should get picked | |
up by advent-of-code-data ;). | |
""" | |
from aocd.get import current_day, most_recent_year | |
from aocd.models import default_user, Puzzle | |
import bs4 | |
import bs4.element | |
import html | |
import os.path | |
import argparse | |
# adapted from aocd internals
def get_puzzle(session=None, day=None, year=None):
    """
    Get puzzle for day (1-25) and year (>= 2015)
    User's session cookie is needed (puzzle inputs differ by user)

    Args:
        session: Session token used to build the user. When ``None`` the
            user comes from ``default_user()``.
        day: Puzzle day. When ``None``, ``current_day()`` is used.
        year: Puzzle year. When ``None``, ``most_recent_year()`` is used.

    Returns:
        The ``Puzzle`` constructed for the resolved year, day and user.
    """
    if session is None:
        user = default_user()
    else:
        # ``User`` is not among the file-level imports, so an explicit
        # session previously failed with NameError; import it here.
        from aocd.models import User

        user = User(token=session)
    if day is None:
        day = current_day()
    if year is None:
        year = most_recent_year()
    return Puzzle(year=year, day=day, user=user)
def cleanup(el):
    """Reduce a child node of a ``<code>`` element to plain text if possible.

    ``<em>`` tags are stripped, since those appear in plenty of inputs.

    Args:
        el: A node taken from a tag's ``contents`` list.

    Returns:
        A ``str`` when ``el`` is a ``NavigableString`` or an ``<em>`` tag
        whose children all reduce to strings (an empty ``<em>`` yields
        ``''``). Any other node is returned unchanged, so callers can tell
        that the surrounding block is not plain text.
    """
    if isinstance(el, bs4.element.NavigableString):
        return str(el)
    if isinstance(el, bs4.element.Tag) and el.name == 'em':
        # Flatten every child rather than only the first: indexing
        # contents[0] crashed on an empty <em> and dropped trailing text
        # when the <em> had more than one child.
        parts = [cleanup(child) for child in el.contents]
        if all(isinstance(part, str) for part in parts):
            return ''.join(parts)
    return el
def slurp(soup):
    """Collect the text of every ``<pre><code>`` block that is plain text.

    Args:
        soup: Parsed document exposing ``find_all``.

    Returns:
        A list of strings, one per ``<pre>`` whose ``<code>`` child consists
        only of nodes that ``cleanup`` can turn into strings, in document
        order.
    """
    tests = []
    for pre in soup.find_all('pre'):
        code = pre.find('code')
        if not code:
            # <pre> without a <code> child is not a candidate input
            continue
        pieces = [cleanup(child) for child in code.contents]
        if any(not isinstance(piece, str) for piece in pieces):
            # some markup survived cleanup, so this is not a raw input
            continue
        tests.append(html.unescape(''.join(pieces)))
    return tests
def write(day, tests, dirname, dry=True):
    """Print each test and, unless this is a dry run, save it to a file.

    Args:
        day: Puzzle day, used to build the default directory name.
        tests: Sequence of test input strings.
        dirname: Target directory; a falsy value selects ``<day>.tests``.
        dry: When true, only print and do not touch the filesystem.

    Files are named ``1``, ``2``, ... inside the target directory.
    """
    target = dirname if dirname else f'{day}.tests'
    persist = not dry
    if persist:
        os.makedirs(target, exist_ok=True)

    for number, content in enumerate(tests, start=1):
        path = os.path.join(target, str(number))
        # echo the name and contents so they can be inspected quickly
        print(f'==== {path}')
        print(content)
        if persist:
            with open(path, 'w') as out:
                out.write(content)
# Command-line interface: optional positional day and year, plus options
# controlling whether and where files are written.
parser = argparse.ArgumentParser(description='AOC test input scraper')

# Both positionals may be omitted; they then parse as None.
parser.add_argument("day", nargs="?", type=int)
parser.add_argument("year", nargs="?", type=int)

parser.add_argument(
    '--dry', '-d',
    action='store_true',
    help='Do a dry run, without writing files',
)
parser.add_argument(
    '--dir',
    type=str,
    help='Override target directory',
)
def main():
    """Parse the command line, scrape the puzzle page and emit the tests."""
    options = parser.parse_args()
    puzzle = get_puzzle(day=options.day, year=options.year)
    # _soup() is an underscore-prefixed (private) method on the puzzle
    # object, so this call depends on aocd internals.
    examples = slurp(puzzle._soup())
    write(puzzle.day, examples, dirname=options.dir, dry=options.dry)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment