Last active
August 29, 2015 14:27
-
-
Save w0nk0/23e9f6ce80d7454ae610 to your computer and use it in GitHub Desktop.
A little helper class to convert a subreddit into plain, unstructured text.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Marker placed once at the very start of the flattened text.
BEGIN_TOKEN = "\n#b#"
# Set to True to print every fetched post and per-comment progress.
DEBUG = False
# Default for FlatSubreddit's ``max_posts`` argument.
# NOTE(review): this name was referenced but never defined in this file;
# 100 is an assumed value -- confirm or adjust.
MAX_POSTS = 100
import io
import os
import sys

import praw
class FlatSubreddit:
    """Convert the newest posts of a subreddit into one plain-text string.

    For every post, the post's self-text (when present) and the bodies of
    all its comments are concatenated; the whole text is prefixed once with
    ``BEGIN_TOKEN``.  Nothing is fetched until :meth:`text` is first called.

    Caching: if ``cache`` is left at True, the flattened text is stored in a
    sub-folder ``cache/`` (relative to the working directory) and re-used on
    later runs, so a subreddit can be re-read many times without worrying
    about API limits or access time.  Nothing expires from the cache; delete
    the files manually to force a refresh.

    The code runs on Python 2 and Python 3 and always hands back the
    interpreter's native ``str`` type (UTF-8 encoded bytes on Python 2).
    """

    def __init__(self, subreddit, max_posts=MAX_POSTS, cache=True):
        """Remember what to read; no network or file access happens here.

        subreddit -- name of the subreddit to flatten.
        max_posts -- how many of the newest posts to read (passed to PRAW as
                     ``limit``).
        cache     -- enable the on-disk cache described on the class.
        """
        self.sub = subreddit
        self.max_posts = max_posts
        self._text = ""
        self._cache = cache
        self.r = None  # praw.Reddit session, created by _flatten()

    def text(self):
        """Return the flattened text, building it on first use.

        Never returns an empty string: if nothing was collected the
        placeholder ``"."`` is returned instead.
        """
        if not self._text:
            self._flatten()
        return self._text or "."

    def _make_cache_name(self):
        """Return the cache file path for this subreddit / post limit."""
        # ``%d`` would raise TypeError for max_posts=None, so give that case
        # its own tag; integer limits produce the same name as before.
        limit = "all" if self.max_posts is None else "%d" % self.max_posts
        return "cache/rddt-%s-%s.cache" % (self.sub, limit)

    @staticmethod
    def _emit(message):
        """Write a progress/debug message to stdout without a newline."""
        sys.stdout.write(message)
        sys.stdout.flush()

    @staticmethod
    def _to_native(value):
        """Return text *value* as the interpreter's native ``str``.

        ``str(value)`` raises UnicodeEncodeError on Python 2 for non-ASCII
        text, so unicode is encoded as UTF-8 explicitly instead.
        """
        if isinstance(value, str):
            return value
        if isinstance(value, bytes):  # only reachable on Python 3
            return value.decode("utf-8", "replace")
        return value.encode("utf-8")  # Python 2 ``unicode``

    @staticmethod
    def _to_unicode(value):
        """Return *value* as a unicode text object, as ``io.open`` requires."""
        if isinstance(value, bytes):  # Python 2 native str
            return value.decode("utf-8", "replace")
        return value

    def _read_cache(self):
        """Return the cached text, or ``""`` if no usable cache file exists."""
        try:
            with io.open(self._make_cache_name(), "r", encoding="utf-8") as f:
                return self._to_native(f.read())
        except (IOError, OSError, UnicodeError):
            # Missing, unreadable or undecodable cache: fall back to fetching.
            return ""

    def _write_cache(self, text):
        """Store *text* in the cache file.

        The cache is only an optimisation, so a failed write is reported on
        stderr instead of discarding the text that was just fetched.
        """
        path = self._make_cache_name()
        try:
            folder = os.path.dirname(path)
            if folder and not os.path.isdir(folder):
                os.makedirs(folder)
            with io.open(path, "w", encoding="utf-8") as f:
                f.write(self._to_unicode(text))
        except (IOError, OSError) as err:
            sys.stderr.write("warning: could not write cache %s: %s\n"
                             % (path, err))

    def _flatten(self):
        """Fill ``self._text`` from the cache or from Reddit and return it."""
        if self._cache:
            cached = self._read_cache()
            if cached:
                self._text = cached
                return cached

        self._emit("Reading %s.." % self.sub)
        reddit = self.r = praw.Reddit("flatsubreddit")
        subreddit = reddit.get_subreddit(self.sub)

        # Collect the pieces and join once instead of repeated ``+=``.
        parts = [BEGIN_TOKEN + " "]
        posts = subreddit.get_new(limit=self.max_posts)
        for count, post in enumerate(posts, 1):
            if count % 10 == 0:
                self._emit(" %d" % count)  # progress tick every 10 posts
            if DEBUG:
                self._emit("%s\n" % (post,))
            parts.append(self._check_comments(post))

        result = "".join(parts)
        self._text = result
        self._emit(" done.\n")
        # Only touch the disk when the caller asked for caching.
        if self._cache:
            self._write_cache(result)
        return result

    def _check_comments(self, post):
        """Return the self-text and every comment body of *post* as one string.

        The submission is re-fetched by id through ``self.r`` and its comment
        tree is flattened.  Entries without a ``body`` attribute (presumably
        unexpanded "more comments" placeholders -- TODO confirm) contribute a
        single ``"."``.
        """
        submission = self.r.get_submission(submission_id=post.id)
        flat_comments = praw.helpers.flatten_tree(submission.comments)

        pieces = []
        # A submission's own text lives in ``selftext``; it has no ``body``.
        selftext = getattr(submission, "selftext", None)
        if selftext is not None:
            pieces.append(self._to_native(selftext) + ".")

        if DEBUG:
            self._emit("Parsing comments\n")
        for comment in flat_comments:
            if DEBUG:
                self._emit(". ")
            body = getattr(comment, "body", None)
            pieces.append("." if body is None else self._to_native(body))
        if DEBUG:
            self._emit("\n\n")
        return "".join(pieces)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment