Last active
October 26, 2017 04:14
-
-
Save debanjum/965598b4096b7cabbb55 to your computer and use it in GitHub Desktop.
Import All Subreddit Posts into your Local Folder
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
''' | |
#**************************************************************************# | |
# SubReddit_Importer.py # | |
# by # | |
# Debanjum Singh Solanky # | |
# # | |
# Copyright (C) 2016-2017 Debanjum Singh Solanky # | |
# # | |
# This program is free software: you can redistribute it and/or modify # | |
# it under the terms of the GNU General Public License as published by # | |
# the Free Software Foundation, either version 3 of the License, or # | |
# (at your option) any later version. # | |
# # | |
# This program is distributed in the hope that it will be useful, # | |
# but WITHOUT ANY WARRANTY; without even the implied warranty of # | |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # | |
# GNU General Public License for more details. # | |
#**************************************************************************# | |
#**************************************************************************# | |
# DEPENDENCIES # | |
# ------------------------------------------------------------------------ # | |
# Language: Python3, Module(s): Praw # | |
# pip3 install praw # | |
#**************************************************************************# | |
#**************************************************************************# | |
# RUN # | |
# ------------------------------------------------------------------------ # | |
# Arguments reddit <username>, <password> to run script # | |
# <subreddit> to be imported and <format> for markup for local files, # | |
# this can be one of reStructuredText(rst) or (default)Markdown(md) # | |
# ./SubReddit_Importer.py <username> <password> <subreddit> [rst|md] # | |
#**************************************************************************# | |
''' | |
# Import Modules | |
import praw | |
import sys | |
from datetime import datetime as dt | |
def get_submissions(username, password, subredditname):
    """Log in to Reddit and return the submissions of one subreddit.

    A ``praw.Reddit`` client is built from the given account credentials
    plus the application id/secret placeholders below (these must be
    replaced with real Reddit app credentials before the script can run).

    Args:
        username: Reddit account name used to authenticate.
        password: Password of that Reddit account.
        subredditname: Name of the subreddit to read from.

    Returns:
        The result of ``subreddit.new(limit=None)``: the subreddit's
        submissions, newest first, with no cap on how many are requested.
    """
    # Everything the Reddit client needs to authenticate as a script app.
    client_settings = {
        "client_id": "REPLACE_WITH_YOUR_REDDIT_APP_CLIENT_ID",
        "client_secret": "REPLACE_WITH_YOUR_REDDIT_APP_CLIENT_SECRET",
        "user_agent": 'subreddit-importer',
        "username": username,
        "password": password,
    }
    reddit = praw.Reddit(**client_settings)
    # limit=None asks for every available submission rather than a page.
    return reddit.subreddit(subredditname).new(limit=None)
def _link(fmt, text, target):
    """Return an inline hyperlink to *target* labelled *text* in *fmt* markup."""
    if fmt == "rst":
        return "`" + text + " <" + target + ">`_"
    return "[" + text + "](" + target + ")"


def _timestamp(created_utc, tail):
    """Format an epoch timestamp as ' at *H:M:S* on *d-m-Y*' followed by *tail*.

    ``datetime.fromtimestamp`` is used without a timezone, so the rendered
    time is in the local timezone of the machine running the script.
    """
    moment = dt.fromtimestamp(float(created_utc))
    return moment.strftime(" at *%H:%M:%S* on *%d-%m-%Y*" + tail)


def _render_post_header(post, fmt):
    """Return the title, underline, id link, author and date for *post*."""
    if post.url:
        title = _link(fmt, post.title, post.url) + "\n"
    else:
        title = post.title + "\n"
    # The underline length counts the trailing newline, as it always has.
    header = title + '=' * len(title)
    # A post may have no author object (e.g. a removed account); fall back
    # to the same placeholder used for author-less comments instead of
    # crashing on ``None.name``.
    author = getattr(post.author, 'name', None) or "[unknown]"
    header += "\n" + _link(fmt, post.id, post.id + "." + fmt)
    header += " by **" + author + "**"
    header += _timestamp(post.created_utc, "\n\n")
    return header


def _render_comment(comment):
    """Return the markup for one top-level comment, or '' if it is skipped."""
    # Entries without an ``author`` attribute are ignored entirely
    # (presumably PRAW's "load more comments" placeholders -- verify).
    if not hasattr(comment, 'author'):
        return ""
    if hasattr(comment.author, 'name'):
        text = " + **" + comment.author.name + "**"
        text += _timestamp(comment.created_utc, ":\n\n")
        if comment.body:
            text += " *" + comment.body + "*\n\n"
        else:
            text += "\n\n\n"
        return text
    # Author object is missing: keep the text but mark the writer unknown.
    if comment.body:
        return " + *" + "[unknown]" + "*:\n" + " " + comment.body + "\n\n"
    return ""


def _import_subreddit(username, password, subredditname, fmt):
    """Export every post of a subreddit to local files in *fmt* markup.

    Writes ``<subredditname>.<fmt>`` as an overview that holds each post's
    header, and one ``<post id>.<fmt>`` file per post holding the header,
    the post body (if any) and its top-level comments.

    Files are opened as UTF-8 so posts with non-ASCII text do not fail on
    platforms whose default encoding is narrower, and through ``with`` so
    they are closed even if fetching or rendering raises part-way.
    """
    extension = "." + fmt
    # open overview page to export subreddit data to
    with open(subredditname + extension, 'w', encoding="utf-8") as overview:
        # authenticate, and get submissions
        submissions = get_submissions(username, password, subredditname)
        print("[+] Importing Posts")
        for post in submissions:
            print(" [+] "+post.title)
            header = _render_post_header(post, fmt)
            # the overview only links to the post; details go in its own file
            overview.write(header)
            parts = [header]
            if post.selftext:
                parts.append("\n " + post.selftext)
            parts.append('\n\n\n')
            parts.extend(_render_comment(comment) for comment in post.comments)
            # write post with comments to separate file
            with open(post.id + extension, 'w', encoding="utf-8") as postfile:
                postfile.write("".join(parts))
        print("[+] Import Completed!")


def import2rst(username, password, subredditname):
    """Import subreddit posts and comments into reStructuredText files.

    Args:
        username: Reddit account name used to authenticate.
        password: Password of that Reddit account.
        subredditname: Subreddit to export; also names the overview file
            ``<subredditname>.rst`` written to the current directory.
    """
    _import_subreddit(username, password, subredditname, "rst")


def import2md(username, password, subredditname):
    """Import subreddit posts and comments into Markdown files.

    Args:
        username: Reddit account name used to authenticate.
        password: Password of that Reddit account.
        subredditname: Subreddit to export; also names the overview file
            ``<subredditname>.md`` written to the current directory.
    """
    _import_subreddit(username, password, subredditname, "md")
def main(argv=None):
    """Parse command-line arguments and run the requested import.

    Expected arguments (after the program name):
    ``<username> <password> <subredditname> [rst|md]``. When the format is
    omitted, Markdown is used.

    Args:
        argv: Full argument vector including the program name. Defaults to
            ``sys.argv`` when not given.

    Returns:
        0 after a completed import, 2 when the arguments are invalid
        (wrong count or unknown format), so shells and callers can tell a
        usage error apart from success.
    """
    if argv is None:
        argv = sys.argv
    # check arguments
    if len(argv) < 4 or len(argv) > 5:
        print("Incorrect Arguments! Correct Example:\n",
              "./subreddit <username> <password> <subredditname> [conversion_type(rst|md)]")
        return 2
    username, password, subredditname = argv[1], argv[2], argv[3]
    # no conversion format argument passed
    if len(argv) == 4:
        print("Defaulting to conversion format: Markdown(md)")
        import2md(username, password, subredditname)
        return 0
    conversion = argv[4]
    if conversion == "rst":
        print("Importing in format: reStructuredText")
        import2rst(username, password, subredditname)
    elif conversion == "md":
        print("Importing in format: Markdown")
        import2md(username, password, subredditname)
    else:
        # inform user of the accepted values and report failure
        print("Unknown conversion format")
        print("The last argument should be 'rst' for reStructured Text and or 'md' for Markdown")
        return 2
    return 0


if __name__ == "__main__":
    sys.exit(main())
^ fixed. Hadn't seen your comment until now :/
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
FWIW and FYI
''.join(['=' for i in message])
=='=' * len(message)