Skip to content

Instantly share code, notes, and snippets.

@w0nk0
Last active August 29, 2015 14:27
Show Gist options
  • Save w0nk0/23e9f6ce80d7454ae610 to your computer and use it in GitHub Desktop.
Save w0nk0/23e9f6ce80d7454ae610 to your computer and use it in GitHub Desktop.
A little helper class to convert a subreddit into plain, unstructured text.
# Marker placed at the very start of every freshly flattened text.
BEGIN_TOKEN = "\n#b#"

# Set to True to print per-post and per-comment progress while fetching.
# (Was `FALSE`, which is not a Python name and raised NameError on import.)
DEBUG = False

# Default value for FlatSubreddit's `max_posts` argument.
# NOTE(review): this name is used as a default argument below but was never
# defined in the file; 100 is a placeholder -- confirm the intended value.
MAX_POSTS = 100
import io
import os
import sys

import praw
class FlatSubreddit:
    """Flatten a subreddit's newest posts and their comments into one string.

    The text is fetched lazily on the first call to ``text()`` and kept in
    memory afterwards. With ``cache`` enabled it is also stored in a very
    simple file cache under ``cache/`` so the same subreddit can be re-read
    many times without hitting the Reddit API. Nothing ever expires from
    that cache; stale files have to be deleted by hand.
    """

    def __init__(self, subreddit, max_posts=MAX_POSTS, cache=True):
        """Remember what to fetch; no network or disk access happens here.

        Args:
            subreddit: Subreddit name, later passed to ``get_subreddit``.
            max_posts: Passed as ``limit`` to ``get_new``; it is also part of
                the cache file name, so it must be an integer.
            cache: When true, read from and write to the ``cache/`` folder.
        """
        self.sub = subreddit
        self.max_posts = max_posts
        self.r = None  # praw.Reddit session, created on the first real fetch
        self._text = ""
        self._cache = cache

    def text(self):
        """Return the flattened text, fetching it first if necessary.

        Returns:
            The flattened text, or ``"."`` if nothing could be collected.
        """
        if not self._text:
            self._flatten()
        return self._text or "."

    def _make_cache_name(self):
        """Return the cache file path for this subreddit / post-limit pair."""
        return "cache/rddt-%s-%d.cache" % (self.sub, self.max_posts)

    @staticmethod
    def _emit(message):
        """Write progress output immediately, without an implicit newline."""
        sys.stdout.write(message)
        sys.stdout.flush()

    def _read_cache(self):
        """Return the cached text, or an empty string on any cache miss.

        A missing, unreadable or undecodable cache file counts as a miss so
        the caller simply falls back to fetching from Reddit.
        """
        try:
            # newline="" disables newline translation so the text round-trips
            # exactly as it was written.
            with io.open(self._make_cache_name(), "r",
                         encoding="utf-8", newline="") as handle:
                return handle.read()
        except (IOError, OSError, UnicodeError):
            return u""

    def _write_cache(self, text):
        """Store ``text`` in the cache file, creating ``cache/`` if needed.

        Raises:
            IOError / OSError: If the folder or file cannot be written.
        """
        path = self._make_cache_name()
        folder = os.path.dirname(path)
        if folder and not os.path.isdir(folder):
            os.makedirs(folder)
        with io.open(path, "w", encoding="utf-8", newline="") as handle:
            handle.write(text)

    def _flatten(self):
        """Load the text from the cache or build it from the Reddit API.

        Stores the result in ``self._text`` and, when caching is enabled,
        in the cache file.

        Returns:
            The flattened text.
        """
        if self._cache:
            cached = self._read_cache()
            if cached:
                self._text = cached
                return cached

        self._emit("Reading %s.." % self.sub)
        self.r = praw.Reddit("flatsubreddit")
        subreddit = self.r.get_subreddit(self.sub)

        # Collect pieces and join once instead of growing a string with +=.
        chunks = [BEGIN_TOKEN, " "]
        posts = subreddit.get_new(limit=self.max_posts)
        for count, post in enumerate(posts, 1):
            if count % 10 == 0:
                self._emit(" %d" % count)  # progress tick every ten posts
            if DEBUG:
                self._emit("%s\n" % (post,))
            chunks.append(self._check_comments(post))

        result = u"".join(chunks)
        self._text = result
        self._emit(" done.\n")

        # Only touch the disk when caching was requested; previously the file
        # was written unconditionally and failed if cache/ did not exist.
        if self._cache:
            self._write_cache(result)
        return result

    def _check_comments(self, post):
        """Return the text of one post followed by all of its comments.

        Args:
            post: An object with an ``id`` attribute, as yielded by
                ``get_new``. Requires ``self.r`` to be set by ``_flatten``.

        Returns:
            The post text (if any) plus ".", then every comment body
            concatenated without separators. Entries in the flattened comment
            tree that have no ``body`` contribute a single ".".
        """
        submission = self.r.get_submission(submission_id=post.id)
        flat_comments = praw.helpers.flatten_tree(submission.comments)

        pieces = []
        post_text = getattr(submission, "body", None)
        if post_text is None:
            # NOTE(review): PRAW submissions appear to expose their text as
            # `selftext`, not `body`, so the old `.body` lookup presumably
            # always failed and dropped the post text -- confirm.
            post_text = getattr(submission, "selftext", None) or None
        if post_text is not None:
            pieces.append(u"%s" % (post_text,))
            pieces.append(".")

        if DEBUG:
            self._emit("Parsing comments\n")
        for comment in flat_comments:
            if DEBUG:
                self._emit(".")
            body = getattr(comment, "body", None)
            if body is None:
                # No body on this entry (presumably a "load more comments"
                # placeholder -- TODO confirm); keep the "." stand-in.
                pieces.append(".")
            else:
                # u"%s" keeps non-ASCII text intact. str() on Python 2 raised
                # here and the whole comment was replaced by ".".
                pieces.append(u"%s" % (body,))
        if DEBUG:
            self._emit("\n")
        return u"".join(pieces)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment