Last active
November 6, 2015 08:40
-
-
Save RobertPaulson90/e3d4ea8d111a08fd34c3 to your computer and use it in GitHub Desktop.
Super simple URL/link scraper for any Reddit comment thread, enjoy! Requires PRAW and BeautifulSoup4, only tested on Python 2.7
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Super simple URL/link scraper for any Reddit comment thread, enjoy!
# Requires PRAW and BeautifulSoup4, only tested on Python 2.7
#
# Fetches every comment of one Reddit submission, pulls the href of each
# <a> tag out of the comment HTML, and appends the links (one per line,
# CRLF-terminated, UTF-8) to "results-<threadID>.txt" in the current directory.
import codecs
import pprint
import HTMLParser

import praw
from bs4 import BeautifulSoup

threadID = "3rns3d"  # Change threadID to the one you wish to scrape (hint: the ID is in the url)

# A single client is enough; this is the user agent the script has always
# effectively used (an earlier, immediately-overwritten client was dropped).
r = praw.Reddit(user_agent='RedditComments-LinkScraper')
submission = r.get_submission(submission_id=threadID)
# limit=None / threshold=0: ask PRAW to replace all "more comments"
# placeholders so the flattened list below holds real comments only.
submission.replace_more_comments(limit=None, threshold=0)
comments = praw.helpers.flatten_tree(submission.comments)

# Collect the links first so the output file is opened once instead of once
# per link, and is not created at all when the thread contains no links.
links = []
for comment in comments:
    soup = BeautifulSoup(comment.body_html, 'html.parser')
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        # An <a> without an href yields None; concatenating it with the
        # line terminator would raise TypeError and abort the whole run.
        if href is not None:
            links.append(href)

if links:
    # Append mode: repeated runs keep adding to the same results file.
    with codecs.open("results-%s.txt" % threadID, "a", "utf-8") as fout:
        for href in links:
            fout.write(href + u"\r\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment