Last active
June 12, 2019 05:46
-
-
Save mashirozx/a8accbc6db88da06bee78346fc7e01b8 to your computer and use it in GitHub Desktop.
JSON API Crawler in Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import pypyodbc | |
import json | |
import time | |
import random | |
import os | |
##############
# Parameters #
# Endpoint fetched with a GET request; its JSON reply is read for the keys
# 'text', 'source' and 'catname'.
url = "https://example.com"
loop_times = 200000  # Set times of loops
sleep_time = False  # Do you need a sleep between two requests?
# Your database location, MS Access Database ONLY!
# Raw string so the backslashes of the Windows path are never treated as
# escape sequences (the value is unchanged).
db_location = r"c:\WorkDirectory\GitHub\py-wd\crawler_v1.0\hitokoto.accdb"
remove_old_db = False  # Do you want to remove the old database file? Make sure that contains nothing important!
create_new_db = False  # Do you want to create a new database?
table_name = "hitokoto_us"  # The table you are going to write!
create_new_table = False  # Do you want to create a new table (with the name above)?
is_first_time = False  # Is it the first time to write in this table?
##############
# Create table
def Create_Table():
    """Create the table named by ``table_name`` and commit the change.

    The column list includes ``catname`` so that it matches the columns
    written by ``Write_DB`` (id, hitokoto, source, catname, add_time);
    without it every insert into a freshly created table would fail.

    Uses the module-level cursor ``cur`` for both the statement and the
    commit instead of opening a second, never-closed cursor.
    """
    SQL = (
        'CREATE TABLE ' + table_name +
        ' (id int,hitokoto varchar(255),source varchar(255),'
        'catname varchar(255),add_time DateTime)'
    )
    cur.execute(SQL)
    cur.commit()
def Request_Job(index):
    """Fetch one record from ``url`` and store it under ``index`` if it is new.

    The JSON reply must provide the keys 'text', 'source' and 'catname'.
    The text is checked against the table with ``Check_Dup``; only when no
    row with the same text exists is the record written with ``Write_DB``.

    Args:
        index: id to assign to the record if it gets inserted.

    Raises:
        Whatever ``requests``, JSON decoding, a missing key or the database
        layer raises; nothing is caught here.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, timeout=30)
    data = response.json()
    print('Writing id = ' + str(index))
    print(data['text'])
    print(data['source'])
    print(data['catname'])
    # Check_Dup splices this value into its SQL text, so it has to be a valid
    # string literal: wrap it in single quotes and double every embedded one.
    # (The previous str.replace() calls discarded their results, leaving
    # quotes in the text unescaped.)
    hitokoto_query = "'" + data['text'].replace("'", "''") + "'"
    if Check_Dup(hitokoto_query):
        Write_DB(data, index)
    # Optional pause of 0.0-0.3 s between two requests.
    if sleep_time:
        time.sleep(random.randint(0, 3) / 10)
# Insert Data
def Write_DB(data,index):
    """Insert one fetched record into ``table_name`` and commit.

    Args:
        data: mapping providing the keys 'text', 'source' and 'catname'.
        index: value stored in the ``id`` column.

    The ``add_time`` column receives today's local date as 'YYYY-MM-DD'.
    All values are passed as query parameters through the module-level
    cursor ``cur``.
    """
    # strftime() without a time tuple formats the current local time.
    today = time.strftime("%Y-%m-%d")
    statement = "INSERT INTO {}(id,hitokoto,source,catname,add_time) VALUES(?,?,?,?,?)".format(table_name)
    row_values = (
        index,
        data['text'],
        data['source'],
        data['catname'],
        today,
    )
    cur.execute(statement, row_values)
    cur.commit()
# Check duplicate or not
def Check_Dup(check_hitokoto):
    """Tell whether no row in ``table_name`` has the given ``hitokoto`` value.

    Args:
        check_hitokoto: text appended verbatim after ``WHERE `hitokoto` =``.
            It is not quoted or escaped here, so the caller must pass a
            complete SQL string literal.

    Returns:
        True when the lookup finds no row (the text is new), False when at
        least one matching row exists.
    """
    lookup = "".join((
        "SELECT * FROM ",
        table_name,
        " WHERE `hitokoto` = ",
        check_hitokoto,
    ))
    cur.execute(lookup)
    if cur.fetchone():
        print ('Nothing Speciall!')
        return False
    print ('This is New!')
    return True
##############
# Main Start #
##############
# Remove old accdb (skipped quietly when the file is not there)
if remove_old_db and os.path.exists(db_location):
    os.remove(db_location)
# Create new database
if create_new_db:
    pypyodbc.win_create_mdb(db_location)
# Connect to accdb
# win_connect_mdb() takes the database path and assembles the ODBC connection
# string itself, so it must be given the path, not a ready-made string.
conn = pypyodbc.win_connect_mdb(db_location)
# Create a cursor
cur = conn.cursor()
try:
    # Create a TABLE
    if create_new_table:
        Create_Table()
    if is_first_time:
        Request_Job(1)  # Seeds id 1; leave is_first_time False on later runs.
    # Requests Start
    print('Begin!')
    SQL_max_id = 'SELECT MAX(id) FROM ' + table_name
    # 1..loop_times inclusive, so exactly loop_times requests are made and
    # the progress bar is full on the last pass.
    for x in range(1, loop_times + 1):
        try:
            cur.execute(SQL_max_id)
            # MAX(id) is NULL on an empty table; start counting from 0 then.
            max_id = cur.fetchone()[0] or 0
            Request_Job(max_id + 1)
            print("Info: Job Finished!")
        except Exception as error:
            # Best effort: report the failure and move on to the next request.
            # Catching Exception (not a bare except) keeps Ctrl-C working.
            print("Error: Operating Failed!")
            print(error)
        else:
            print("Succeed")
        # 50-character progress bar.
        times = round(x * (50 / loop_times))
        print(u"\u2588" * times + u"\u2592" * (50 - times))
    print('All Done!')
finally:
    # Release the database handles even when the run is interrupted.
    cur.close()
    conn.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment