Skip to content

Instantly share code, notes, and snippets.

@rounakdatta
Last active March 4, 2018 17:15
Show Gist options
  • Save rounakdatta/3046f13dc27352353c50a5c791048ec1 to your computer and use it in GitHub Desktop.
Save rounakdatta/3046f13dc27352353c50a5c791048ec1 to your computer and use it in GitHub Desktop.
Subreddit content scraper: collects post titles and self-text from r/DarkHumor into questions.txt and answers.txt.
import requests
import string
# Download the subreddit listing as JSON and build one URL per post.
# The listing is decoded as JSON rather than scanned token-by-token, so only
# real post permalinks are collected (not thumbnails, external link targets
# or URLs that merely appear inside a post's text).
link = 'https://www.reddit.com/r/DarkHumor/.json'
subreddit = requests.get(
    link,
    headers={'User-agent': 'nexttechbot'},
    timeout=30,  # do not hang forever on a stalled connection
)
# Fail loudly on HTTP errors instead of trying to parse an error page.
subreddit.raise_for_status()

# NOTE(review): assumes the usual listing layout
# {"data": {"children": [{"data": {"permalink": "/r/...", ...}}, ...]}}
# -- confirm against the Reddit API documentation.
posts = subreddit.json()['data']['children']

# Permalinks are site-relative and end with '/'; the trailing slash is
# stripped because getpost() appends '/.json' to each entry.
post_list = [
    'https://www.reddit.com' + post['data']['permalink'].rstrip('/')
    for post in posts
    if post.get('data', {}).get('permalink')
]
def getpost(link):
    """Fetch a single post and return its title and self-text.

    The response is decoded as JSON instead of being scanned as
    whitespace-separated tokens, so single-word or empty values and values
    containing escaped quotes are extracted correctly, and the returned
    strings no longer carry JSON quotes or trailing commas.

    Args:
        link: Post URL without a trailing slash; ``/.json`` is appended to it.

    Returns:
        A ``(title, selftext)`` tuple of strings. Runs of whitespace
        (including newlines) are collapsed to single spaces so that each
        value fits on one line of the output files.

    Raises:
        ValueError: If the response does not contain a post with ``title``
            and ``selftext`` fields.
        requests.RequestException: On network failures or HTTP error codes.
    """
    post_link = link + '/.json'
    post_page = requests.get(
        post_link,
        headers={'User-agent': 'nexttechbot'},
        timeout=30,  # do not hang forever on a stalled connection
    )
    post_page.raise_for_status()
    payload = post_page.json()

    try:
        # NOTE(review): assumes the post page is a JSON list whose first
        # element is the listing holding the post itself -- confirm against
        # the Reddit API documentation.
        post = payload[0]['data']['children'][0]['data']
        title = post['title']
        answer = post['selftext']
    except (KeyError, IndexError, TypeError) as err:
        # Keep ValueError as the failure type, matching the previous
        # list.index()-based lookup.
        raise ValueError('no post title/selftext found at ' + post_link) from err

    # split()/join() collapses every whitespace run into one space.
    return (' '.join(title.split()), ' '.join(answer.split()))
# Append one line per post to each output file: the title goes to
# questions.txt and the self-text to answers.txt.
# The encoding is pinned to UTF-8 so non-ASCII post text is written the same
# way on every platform instead of depending on the locale default.
with open("questions.txt", 'a', encoding="utf-8") as qfile, \
        open("answers.txt", 'a', encoding="utf-8") as afile:
    for link in post_list:
        question, answer = getpost(link)
        qfile.write(question + "\n")
        afile.write(answer + "\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment