Skip to content

Instantly share code, notes, and snippets.

@debanjum
Last active October 26, 2017 04:14
Show Gist options
  • Save debanjum/965598b4096b7cabbb55 to your computer and use it in GitHub Desktop.
Save debanjum/965598b4096b7cabbb55 to your computer and use it in GitHub Desktop.
Import All Subreddit Posts into your Local Folder
#!/usr/bin/env python3
'''
#**************************************************************************#
# SubReddit_Importer.py #
# by #
# Debanjum Singh Solanky #
# #
# Copyright (C) 2016-2017 Debanjum Singh Solanky #
# #
# This program is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
# (at your option) any later version. #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
#**************************************************************************#
#**************************************************************************#
# DEPENDENCIES #
# ------------------------------------------------------------------------ #
# Language: Python3, Module(s): Praw #
# pip3 install praw #
#**************************************************************************#
#**************************************************************************#
# RUN #
# ------------------------------------------------------------------------ #
# Arguments: reddit <username> and <password> to run the script, #
# <subreddit> to be imported and <format> of markup for local files; #
# this can be one of reStructuredText (rst) or (default) Markdown (md) #
# ./SubReddit_Importer.py <username> <password> <subreddit> [rst|md] #
#**************************************************************************#
'''
# Import Modules
import praw
import sys
from datetime import datetime as dt
def get_submissions(username, password, subredditname,
                    client_id="REPLACE_WITH_YOUR_REDDIT_APP_CLIENT_ID",
                    client_secret="REPLACE_WITH_YOUR_REDDIT_APP_CLIENT_SECRET",
                    limit=None):
    """Authenticate the crawler and return the subreddit's submissions.

    Args:
        username: Reddit account name used to log in.
        password: Password of that Reddit account.
        subredditname: Name of the subreddit to read from.
        client_id: Client id of the Reddit app. Defaults to the placeholder
            that must otherwise be replaced in this file.
        client_secret: Client secret of the Reddit app. Defaults to the
            placeholder that must otherwise be replaced in this file.
        limit: Passed through to ``subreddit.new``; ``None`` (the default)
            asks for all submissions.

    Returns:
        Whatever ``subreddit.new(limit=limit)`` returns: an iterable of
        submissions, newest first.
    """
    # create bot
    reddit = praw.Reddit(
        client_id=client_id,
        client_secret=client_secret,
        user_agent='subreddit-importer',
        username=username,
        password=password,
    )

    # handle on the subreddit the submissions are read from
    subreddit = reddit.subreddit(subredditname)

    # return submissions, ordered by newest first
    return subreddit.new(limit=limit)
def import2rst(username, password, subredditname):
    """Import subreddit posts and comments to reStructuredText files.

    Writes an overview file named ``<subredditname>.rst`` holding the title,
    id, author and creation time of every post, plus one ``<post.id>.rst``
    file per post that additionally holds the post body and its comments.
    All files are created in the current working directory.

    Args:
        username: Reddit account name passed to ``get_submissions``.
        password: Reddit account password passed to ``get_submissions``.
        subredditname: Subreddit to import; also names the overview file.
    """
    # authenticate, and get submissions
    submissions = get_submissions(username, password, subredditname)

    # open overview page to export subreddit data to; the context manager
    # closes it even if an error interrupts the import, and the explicit
    # encoding keeps non-ASCII post text from failing on non-UTF-8 locales
    with open("{0}.rst".format(subredditname), 'w', encoding='utf-8') as overview:
        print("[+] Importing Posts")

        # for each submission to subreddit
        for post in submissions:
            # extract title
            print(" [+] "+post.title)
            if post.url:
                message = "`"+post.title+" <"+post.url+">`_\n"  # rst format if url in title
            else:
                message = post.title + "\n"  # rst format if no url in title
            message += '=' * len(message)  # add markup for header

            # post.author is None when the author's account was deleted;
            # fall back to the same marker used for comments
            post_author = getattr(post.author, 'name', None) or "[unknown]"

            # extract post author and date created
            message += "\n`"+post.id+" <"+post.id+".rst>`_" + " by **" + post_author + "**"
            message += dt.fromtimestamp(float(post.created_utc)).strftime(" at *%H:%M:%S* on *%d-%m-%Y*\n\n")

            # write to landing/overview page to link to all imported subreddit posts
            overview.write(message)

            # extract post body, if it exists
            if post.selftext:
                message += "\n " + post.selftext
            # blank lines separating the post from its comments
            message += '\n\n\n'

            # extract all comments and their authors
            for comment in post.comments:
                # skip entries that carry no author attribute at all
                if not hasattr(comment, 'author'):
                    continue
                if hasattr(comment.author, 'name'):
                    message += " + **" + comment.author.name + "**"
                    message += dt.fromtimestamp(float(comment.created_utc)).strftime(" at *%H:%M:%S* on *%d-%m-%Y*:\n\n")
                    if comment.body:
                        message += " *" + comment.body + "*\n\n"
                    else:
                        message += "\n\n\n"
                elif comment.body:
                    # comment whose author has no name (e.g. deleted account)
                    message += " + *" + "[unknown]" + "*:\n"
                    message += " " + comment.body + "\n\n"

            # write post with comments to separate file
            with open("{0}.rst".format(post.id), 'w', encoding='utf-8') as postfile:
                postfile.write(message)

    print("[+] Import Completed!")
def import2md(username, password, subredditname):
    """Import subreddit posts and comments to Markdown files.

    Writes an overview file named ``<subredditname>.md`` holding the title,
    id, author and creation time of every post, plus one ``<post.id>.md``
    file per post that additionally holds the post body and its comments.
    All files are created in the current working directory.

    Args:
        username: Reddit account name passed to ``get_submissions``.
        password: Reddit account password passed to ``get_submissions``.
        subredditname: Subreddit to import; also names the overview file.
    """
    # authenticate, and get submissions
    submissions = get_submissions(username, password, subredditname)

    # open overview page to export subreddit data to; the context manager
    # closes it even if an error interrupts the import, and the explicit
    # encoding keeps non-ASCII post text from failing on non-UTF-8 locales
    with open("{0}.md".format(subredditname), 'w', encoding='utf-8') as overview:
        print("[+] Importing Posts")

        # for each submission to subreddit
        for post in submissions:
            # extract title
            print(" [+] "+post.title)
            if post.url:
                message = "["+post.title+"]("+post.url+")\n"  # md format if url in title
            else:
                message = post.title + "\n"  # md format if no url in title
            message += '=' * len(message)  # add markup for header

            # post.author is None when the author's account was deleted;
            # fall back to the same marker used for comments
            post_author = getattr(post.author, 'name', None) or "[unknown]"

            # extract post author and date created
            message += "\n[" + post.id + "](" + post.id + ".md)" + " by **" + post_author + "**"
            message += dt.fromtimestamp(float(post.created_utc)).strftime(" at *%H:%M:%S* on *%d-%m-%Y*\n\n")

            # write title, post_id, author, creation_date to overview file containing all post summaries
            overview.write(message)

            # extract post body, if it exists
            if post.selftext:
                message += "\n " + post.selftext
            # blank lines separating the post from its comments
            message += '\n\n\n'

            # extract all comments and their authors
            for comment in post.comments:
                # skip entries that carry no author attribute at all
                if not hasattr(comment, 'author'):
                    continue
                if hasattr(comment.author, 'name'):
                    message += " + **" + comment.author.name + "**"
                    message += dt.fromtimestamp(float(comment.created_utc)).strftime(" at *%H:%M:%S* on *%d-%m-%Y*:\n\n")
                    if comment.body:
                        message += " *" + comment.body + "*\n\n"
                    else:
                        message += "\n\n\n"
                elif comment.body:
                    # comment whose author has no name (e.g. deleted account)
                    message += " + *" + "[unknown]" + "*:\n"
                    message += " " + comment.body + "\n\n"

            # write post with comments to separate file
            with open("{0}.md".format(post.id), 'w', encoding='utf-8') as postfile:
                postfile.write(message)

    print("[+] Import Completed!")
if __name__ == "__main__":
    # Command-line entry point:
    #   <script> <username> <password> <subredditname> [rst|md]
    # NOTE(review): the password is taken from argv, so it may be visible to
    # other local users via the process list and shell history.

    # check arguments
    if not 4 <= len(sys.argv) <= 5:
        print("Incorrect Arguments! Correct Example:\n",
              "./subreddit <username> <password> <subredditname> [conversion_type(rst|md)]")
        # non-zero status so callers/scripts can detect the usage error
        sys.exit(1)

    username, password, subredditname = sys.argv[1:4]

    # if conversion format argument passed
    if len(sys.argv) == 5:
        # announcement and importer for each supported format
        importers = {
            "rst": ("Importing in format: reStructuredText", import2rst),
            "md": ("Importing in format: Markdown", import2md),
        }
        chosen = importers.get(sys.argv[4])
        if chosen is None:
            # inform user of correct argument for rst, md conversion
            print("Unknown conversion format")
            print("The last argument should be 'rst' for reStructured Text and or 'md' for Markdown")
            sys.exit(1)
        announcement, importer = chosen
        print(announcement)
        importer(username, password, subredditname)
    else:
        print("Defaulting to conversion format: Markdown(md)")
        import2md(username, password, subredditname)
@yarikoptic
Copy link

FWIW and FYI ''.join(['=' for i in message]) == '=' * len(message)

@debanjum
Copy link
Author

^ fixed. Hadn't seen your comment until now :/

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment