NHLStreamCheck
#!/usr/bin/env python
# Check reddit.com/r/nhlstreams for a game thread for a given team
#
# based on:
# https://gist.github.com/cheesinglee/49add283073a9a517771
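#
# Usage sketch (the .py filename is an assumption; the gist only names the file
# "NHLStreamCheck"):
#     python NHLStreamCheck.py "Maple Leafs" --num 25
# Requires PRAW 3.x (praw.helpers, get_subreddit, and get_new are PRAW 3 APIs).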

import argparse
import time

import praw


# adapted from https://gist.github.com/cheesinglee/49add283073a9a517771
# removed extraneous functionality
def process_post(post, keys):
    """Extract the requested attributes from a reddit post, plus thumbnail and comment info."""
    d = {}
    postdict = vars(post)
    for key in keys:
        val = postdict[key]
        try:
            # lower-case string attributes so later matching is case-insensitive
            val = val.lower()
        except AttributeError:
            pass
        d[key] = val
    d['has_thumbnail'] = (post.thumbnail != u'default') and (post.thumbnail != u'self')
    post.replace_more_comments(limit=None, threshold=0)
    comments = post.comments
    flat_comments = praw.helpers.flatten_tree(comments)
    d['n_comments'] = len(list(flat_comments))
    return d
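
# Example shape of the dict returned by process_post() (illustrative values only,
# not real data):
#     {'title': 'maple leafs @ bruins [game thread]',
#      'created_utc': 1452297600.0,
#      'has_thumbnail': False,
#      'n_comments': 37}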


### PARSE COMMAND LINE ARGS
def main():
    parser = argparse.ArgumentParser()
    helpstr = "Enter an NHL team name without location. Example: 'Maple Leafs', not 'Toronto Maple Leafs'."
    parser.add_argument("team", help=helpstr)
    parser.add_argument("--num", type=int, help="Number of posts to scrape")
    args = parser.parse_args()

    # Distinct keys for the two New York entries so both clubs appear in teams.values()
    teams = {
        "Boston": "Bruins",
        "Buffalo": "Sabres",
        "Detroit": "Red Wings",
        "Florida": "Panthers",
        "Montreal": "Canadiens",
        "Ottawa": "Senators",
        "Tampa Bay": "Lightning",
        "Toronto": "Maple Leafs",
        "Carolina": "Hurricanes",
        "Columbus": "Blue Jackets",
        "New Jersey": "Devils",
        "New York (Islanders)": "Islanders",
        "New York (Rangers)": "Rangers",
        "Philadelphia": "Flyers",
        "Pittsburgh": "Penguins",
        "Washington": "Capitals",
        "Anaheim": "Ducks",
        "Arizona": "Coyotes",
        "Calgary": "Flames",
        "Edmonton": "Oilers",
        "Los Angeles": "Kings",
        "San Jose": "Sharks",
        "Vancouver": "Canucks",
        "Chicago": "Blackhawks",
        "Colorado": "Avalanche",
        "Dallas": "Stars",
        "Minnesota": "Wild",
        "Nashville": "Predators",
        "St. Louis": "Blues",
        "Winnipeg": "Jets"
    }

    # validate command line args
    if args.team not in teams.values():
        print("Error: Team Not Found")
        print("Enter a valid team name")
        return
    print("Searching for stream for {0}".format(args.team))

    if args.num:
        num_posts = args.num
    else:
        num_posts = 10  # DEFAULT VALUE
    print('Number of Posts to scrape: {}'.format(num_posts))

    ### Scrape Section
    POST_KEYS = ['title', 'created_utc']  # reddit post attributes to store; created time currently unused
    r = praw.Reddit('Reddit Dataset builder')
    ids = []
    posts = []
    searched = []
    urls = []

    ts = time.time()
    print("Scraping subreddit: {0}".format('nhlstreams'))
    sub = r.get_subreddit('nhlstreams')
    print('Scraping Posts...')

    # Generate list of the newest submissions to /r/nhlstreams, up to num_posts
    for post in sub.get_new(limit=num_posts):
        if post.id not in ids:
            posts.append(process_post(post, POST_KEYS))
            urls.append(post.permalink)
            ids.append(post.id)
    print('Scraped {0} posts'.format(len(posts)))

    tf = time.time()
    print('Total time elapsed: {0} seconds.'.format(round(tf - ts, 2)))

    # Search the scraped posts for thread titles containing the team name.
    # This could be folded into the scraping loop to speed up the script,
    # i.e. only store matching posts instead of scraping and storing everything
    # and then searching for a match after the fact.
    for idx, post in enumerate(posts):
        # titles were lower-cased in process_post(), so compare the lower-cased team name
        if args.team.lower() in post['title']:
            searched.append((post['title'], urls[idx]))
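
    # A minimal sketch of the in-loop alternative described above, assuming only the
    # matching titles and URLs are needed (process_post() and its comment counts are skipped):
    #     for post in sub.get_new(limit=num_posts):
    #         if args.team.lower() in post.title.lower():
    #             searched.append((post.title, post.permalink))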

    ### RESULTS
    print('After search: ')
    if not searched:
        print('Failed to find Game Thread for {0} in newest {1} posts at /r/nhlstreams'.format(args.team, num_posts))
    else:
        print('Found Threads:')
        for match in searched:
            print("Thread: {0}\n URL: {1}".format(match[0], match[1]))


if __name__ == '__main__':
    main()