-
-
Save jon1scr/7b5a900d6993c166091534df8ba6be8a to your computer and use it in GitHub Desktop.
Scraping GitHub issues from https://github.com/xueyuanl/daily-hackernews/issues
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import re
from datetime import datetime, timedelta, timezone

import requests
from github import Github
# Scrape the open issues of REPO_NAME, look every listed Hacker News post up
# in the HN Firebase API, and write one CSV row per post to FILE_NAME.

REPO_NAME = 'xueyuanl/daily-hackernews'
FILE_NAME = "hn25.csv"
BASE_URL = 'https://hacker-news.firebaseio.com/v0/'
ITEM_URL = BASE_URL + 'item/'
USER_URL = BASE_URL + 'user/'
# An account created fewer than this many days before the submission date is
# flagged as new. (Identifier spelling kept unchanged for compatibility.)
NOOB_ACCOUNT_TRESHOLD = 15
# Upper bound, in seconds, for each HTTP request so a stalled connection
# cannot hang the whole scrape.
REQUEST_TIMEOUT_SECONDS = 30

# CSV header; build_row() emits its values in exactly this order.
fields = ['Date', 'Title', 'Link', 'userID', 'userID Age', 'New Account?',
          'Number of Points', 'Number of Comments', 'HN Link']

# Compiled once instead of on every line. IGNORECASE replaces the former
# ``line.lower()`` so the extracted URLs keep their original case.
_BRACKET_RE = re.compile(r"\[(.*?)\]")
_URL_RE = re.compile(r"https?://[^\s]+", re.IGNORECASE)


def parse_post_line(line):
    """Extract ``(title, link, hn_link)`` from one line of an issue body.

    The title is the first ``[...]`` group with surrounding ``*`` removed;
    ``link`` and ``hn_link`` are the first and second URLs on the line with
    surrounding ``)`` removed.
    NOTE(review): assumes each post line holds a bracketed title, then the
    article URL, then the HN discussion URL -- inferred from the indexing
    used here; confirm against real issue bodies.

    Returns:
        A 3-tuple of strings, or ``None`` when the line has no bracketed text
        or fewer than two URLs (e.g. blank or decorative lines).
    """
    bracketed = _BRACKET_RE.findall(line)
    urls = _URL_RE.findall(line)
    if not bracketed or len(urls) < 2:
        return None
    return bracketed[0].strip('*'), urls[0].strip(')'), urls[1].strip(')')


def hn_item_id(hn_link):
    """Return the text after the first ``=`` in *hn_link*, or ``''`` if none."""
    parts = hn_link.split('=')
    return parts[1] if len(parts) > 1 else ''


def is_new_account(created_ts, submission_date,
                   threshold_days=NOOB_ACCOUNT_TRESHOLD):
    """Tell whether an account was new when it submitted a post.

    Args:
        created_ts: Account creation time as a Unix timestamp.
        submission_date: Naive ``datetime`` of the submission.
        threshold_days: Accounts younger than this many days count as new.

    Returns:
        ``True`` when the account was created less than *threshold_days*
        before *submission_date* (or after it), otherwise ``False``.
    """
    # Convert in UTC so the result does not depend on the machine's local zone.
    created = datetime.fromtimestamp(created_ts, tz=timezone.utc)
    created = created.replace(tzinfo=None)
    return created > submission_date - timedelta(days=threshold_days)


def fetch_json(url):
    """GET *url* and return its decoded JSON body (may be ``None``).

    Raises:
        requests.RequestException: on connection errors, timeouts or an HTTP
            error status.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    return response.json()


def build_row(date, line):
    """Build one CSV row (ordered like ``fields``) for a single post line.

    Args:
        date: Date string taken from the issue title (``''`` if unavailable).
        line: One line of the issue body.

    Returns:
        A list of nine values, or ``None`` when *line* is not a post line.
    """
    parsed = parse_post_line(line)
    if parsed is None:
        return None
    title, link, hn_link = parsed

    # Start every row from blanks so nothing leaks over from a previous post.
    user_id = user_id_age = is_new = ''
    number_of_points = number_of_comments = ''

    item_id = hn_item_id(hn_link)
    item = fetch_json(ITEM_URL + item_id + '.json') if item_id else None
    if isinstance(item, dict):
        # Deleted items may lack these keys, hence .get() with a blank default.
        user_id = item.get('by', '')
        number_of_points = item.get('score', '')
        # 'descendants' is the HN API's total comment count for a story.
        number_of_comments = item.get('descendants', '')
    else:
        print('item not found: ', hn_link)

    if user_id:
        user = fetch_json(USER_URL + user_id + '.json')
        if isinstance(user, dict) and 'created' in user:
            user_id_age = user['created']
            try:
                submission_date = datetime.strptime(date, '%d-%m-%Y')
            except ValueError:
                print('date parsing issue: ', line)
            else:
                is_new = is_new_account(user_id_age, submission_date)
        else:
            # The API returned no usable record for this user.
            print('Automated submission.')

    return [date, title, link, user_id, user_id_age, is_new,
            number_of_points, number_of_comments, hn_link]


def collect_rows(issues):
    """Return the CSV rows for every post line found in *issues*.

    Each issue must expose ``title`` and ``body`` attributes. The date is the
    fourth space-separated token of the title.
    """
    rows = []
    for issue in issues:
        try:
            date = issue.title.split(' ')[3]
        except IndexError:
            print('bad title format: ', issue.title)
            date = ''
        # The first three elements of the body are skipped, as before; the
        # original script noted them as empty. A missing body yields no lines.
        for line in (issue.body or '').split('\n')[3:]:
            row = build_row(date, line)
            if row is not None:
                rows.append(row)
    return rows


def main():
    """Scrape the repository's open issues and write the CSV file."""
    g = Github("access_token")
    repo = g.get_repo(REPO_NAME)
    issues = repo.get_issues(state='open')
    rows = collect_rows(issues)
    # newline='' lets the csv module control line endings itself (otherwise
    # blank lines appear between rows on Windows).
    with open(FILE_NAME, 'w', encoding='utf-8-sig', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(fields)
        csvwriter.writerows(rows)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment