Skip to content

Instantly share code, notes, and snippets.

@tokoroten
Created April 4, 2016 09:43
Show Gist options
  • Save tokoroten/03d5de146ff042acdb86688d8e40075a to your computer and use it in GitHub Desktop.
Save tokoroten/03d5de146ff042acdb86688d8e40075a to your computer and use it in GitHub Desktop.
kickstarter_scraping
#coding: utf-8
import urllib
import json
import datetime
import os
import time
# http://www.kbremner.com/2014/03/15/kickstarter-part1.html
# https://github.com/markolson/kickscraper/wiki/Project
# Download Kickstarter search results page by page and store every project
# as its own JSON file under ./result/<project id>.json.
keyword = ""  # search term; the empty string matches all projects
category_id = 16  # technology
project_count = 0  # number of projects fetched so far
total_hits = 0  # total result count as reported by the latest response
end_page = 500  # exclusive upper bound for the page numbers requested

# Create the output directory only when it is missing, so that genuine
# failures (e.g. a permission error) surface instead of being swallowed.
if not os.path.isdir("result"):
    os.mkdir("result")

for page_id in xrange(1, end_page):
    # quote_plus() keeps a keyword containing spaces, "&" or "#" from
    # corrupting the query string.
    query = "http://www.kickstarter.com/projects/search.json?term=%s&category_id=%d&page=%d&sort=most_funded" % (
        urllib.quote_plus(keyword), category_id, page_id)

    # Always release the connection, even when reading or parsing fails.
    response = urllib.urlopen(query)
    try:
        respons_json = json.loads(response.read())
    finally:
        response.close()

    projects = respons_json["projects"]
    if not projects:
        # An empty page means we ran past the last page of results.
        break

    project_count += len(projects)
    total_hits = respons_json["total_hits"]
    percent = round(float(project_count) / total_hits * 100, 2)
    print("progress %s / %s %s %%" % (project_count, total_hits, percent))

    # Each project is a dict; fields used downstream include "name",
    # "category"/"slug", "urls"/"web"/"project", "launched_at", "deadline",
    # "pledged", "goal", "backers_count" and "state".
    for project in projects:
        filepath = "result/%d.json" % project["id"]
        # The context manager closes the file even if serialisation raises.
        with open(filepath, "w") as fp:
            json.dump(project, fp, sort_keys=True, indent=2)

    if project_count >= total_hits:
        # Everything the server reported has been fetched.
        break

    # Pause between requests to avoid hammering the server.
    time.sleep(1)
#coding:utf-8
import os
import glob
import json
import datetime
import csv
# Flatten every result/*.json project file into one row of
# kickstarter_result.csv.
#
# The file is opened in binary mode because the Python 2 csv module writes
# byte strings; text fields are therefore encoded explicitly below.
with open("kickstarter_result.csv", "wb") as fp:
    cw = csv.writer(fp, lineterminator="\n")

    # Header row; the order must match the `items` list built per project.
    cw.writerow([
        "category",
        "pledged", "goal",
        "currency",
        "backers_count",
        "state",
        "name",
        "launched_at",
        "deadline",
        "blurb",
        "url"])

    for filename in glob.glob("result/*.json"):
        # Close each input file promptly instead of leaking the handle.
        with open(filename) as src:
            project = json.load(src)

        # Tolerate a missing/null blurb rather than aborting the whole export.
        blurb = project.get("blurb") or u""

        items = [
            project["category"]["slug"],
            project["pledged"],
            project["goal"],
            project["currency"],
            project["backers_count"],
            project["state"],
            project["name"],
            # fromtimestamp() yields a naive datetime in the local timezone.
            str(datetime.datetime.fromtimestamp(project["launched_at"])),
            str(datetime.datetime.fromtimestamp(project["deadline"])),
            # Line breaks and commas are replaced by spaces so the blurb
            # stays on one line and in one visual column.
            blurb.replace("\n", " ").replace("\r", " ").replace(",", " "),
            project["urls"]["web"]["project"],
        ]

        # Unicode text is encoded as Shift-JIS, silently dropping characters
        # that encoding cannot represent; everything else is stringified.
        out = [
            item.encode("shift-jis", "ignore") if isinstance(item, unicode) else str(item)
            for item in items
        ]
        cw.writerow(out)
@tokoroten
Copy link
Author

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment