Created
November 5, 2013 01:27
-
-
Save nikolak/7312370 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sqlite3 | |
import time | |
import json | |
import urllib2 | |
def get_submissions(): | |
url="http://www.reddit.com/r/all/new/.json" #URL of the page that we want to fetch | |
headers = { 'User-Agent' : 'fetching new submissions script' } # our script "identifier" | |
req = urllib2.Request(url, None, headers) # We create new request here to open the url with above set headers | |
data = urllib2.urlopen(req).read() # Open url and make data variable equal whatever we get | |
json_data=json.loads(data) # Load json from variable defined above as "data", it should be string/text that we get from reddit | |
return json_data["data"]["children"] # The JSON contains useless fields, the data we need is in data/children keys | |
if __name__=="__main__": | |
conn = sqlite3.connect('database.db') # Either create new database or connect to exiting one, sqlite module | |
# handles this for us so we don't have to worry about whether it exists or not | |
c = conn.cursor() # This is database cursor, basically what we will use to enter data into database | |
c.execute('''CREATE TABLE IF NOT EXISTS submissions | |
(id text, title text, author text, subreddit text, domain text, date real)''') | |
#Self explanatory but here we create new table named submissions, think of it like a sheet in excel | |
#but only if it doesn't already exist, also we create fields id, title etc which contain text. Only date is real i.e. number | |
user_choice=raw_input("1 - Gather data\n2 - Example stats\n>>>") # We ask user to provide input and make that input | |
# assigned to variable named user_choice | |
if user_choice=="1": # if user_choice is equal number one then we do this | |
while True: # this causes infinite loop, because True can never be false. | |
print "*"*50 #print * fifty times in a row | |
print "Getting new submissions..." | |
all_submissions=get_submissions() # we call above defined function which opens reddit and returns json list of submissions | |
for submission in all_submissions: # we go over that list and on each run the item we're going over is assigned to variable named submission | |
submission=submission["data"] # quick fix, we can either type submission["data"]["id"] etc or just make submission equal to key "data" | |
s=(submission["id"],) # we make a "variable" s with submission id | |
c.execute('SELECT * FROM submissions WHERE id=?', s) # we tell our database cursor to get us ANY item from submission table with id equal to variable s | |
if c.fetchone() is None: # then we fetch just one result from above execution, if there are no entries this item does not exist in our databse and we should write it | |
item=(submission["id"],submission["title"],submission["author"],submission["subreddit"], | |
submission["domain"],submission["created_utc"]) # this just creates a variable item with subission properties all in one place, we need those for our database | |
print "Submission ID {}, not in database - adding now.".format(submission["id"]) # not necessary, but so we know what we're doing | |
c.execute('INSERT INTO submissions VALUES (?,?,?,?,?,?)',item) # pretty much self explained, we enter 6 values into submission table, those values are taken from item variable | |
else: # we got some results from c.execute so our item already exist in database, skip it and print to user that it's skipped | |
print "Skipping submission ID {}, already in database".format(submission["id"]) | |
conn.commit() # commit changes to database file on hard drive | |
print "Changes saved sucessfuly, waiting 10 seconds..." | |
time.sleep(10) #sleep, time-out for 10 seconds before repating action again. Because as I said True is always true, and whatever is inside will just loop forever | |
elif user_choice=="2": # we want to get some stats | |
while True: #To enable user to enter new words all the time without having to restart script | |
user_input=raw_input("Enter a word to display all submissions that have it in title\n") #same as above, get user input - assign it to useR_input variable | |
for row in c.execute('SELECT * FROM submissions WHERE TITLE LIKE ("%" || ? || "%") ORDER BY date',(user_input,)): | |
# here we get all submissions from our database which have word user entered anywhere in their title, and we sort those by date | |
print row | |
else: | |
# the user input is neither 1 nor 2 so it's invalid. | |
print "Invalid input" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment