Last active
December 15, 2015 18:48
-
-
Save bluele/5306015 to your computer and use it in GitHub Desktop.
Get stats of Kickstarter projects
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#-*- coding:utf-8 -*- | |
__author__ = 'bluele' | |
from BeautifulSoup import BeautifulSoup as bs | |
import urllib | |
import requests | |
import re | |
class Project(object):
    """Read-only view over the markup of a single project card.

    ``soup`` is the parsed node for one card (a BeautifulSoup tag, or any
    object exposing a compatible ``find`` whose results have ``find`` and
    ``text``).  Every field is extracted from the markup on first access
    and cached afterwards.
    """

    # The author label starts with a "by" line; the name is the rest of the
    # following line.  A plain raw string works on Python 2 and Python 3.
    _NAME_PATTERN = re.compile(r'^by\n(?P<name>.*)')

    def __init__(self, soup):
        self.soup = soup
        # Extracted values keyed by property name.  The previous
        # ``hasattr(self, '__name')`` test never matched the name-mangled
        # attribute ``_Project__name``, so nothing was actually cached.
        self._cache = {}

    def _cached(self, key, loader):
        """Return the cached value for *key*, computing it with *loader* once."""
        if key not in self._cache:
            self._cache[key] = loader()
        return self._cache[key]

    def _text(self, tag, css_class, child=None):
        """Return the text of the first *tag* with class *css_class*.

        When *child* is given, the text of the first *child* tag nested
        inside that element is returned instead.
        """
        node = self.soup.find(tag, {'class': css_class})
        if child is not None:
            node = node.find(child)
        return node.text

    def _parse_author(self, author_text):
        """Strip the leading "by" line from *author_text*.

        Raises:
            ValueError: if *author_text* does not start with that prefix;
                the offending text is the exception argument.
        """
        match = self._NAME_PATTERN.search(author_text)
        if not match:
            raise ValueError(author_text)
        return match.group('name')

    @property
    def name(self):
        """Project name."""
        return self._cached(
            'name', lambda: self._text('h2', 'bbcard_name', 'strong'))

    @property
    def author(self):
        """Project author, with the leading "by" line removed."""
        return self._cached(
            'author',
            lambda: self._parse_author(self._text('h2', 'bbcard_name', 'span')))

    @property
    def funded(self):
        """Amount collected relative to the funding goal, as displayed."""
        return self._cached(
            'funded', lambda: self._text('li', 'first funded', 'strong'))

    @property
    def pledged(self):
        """Amount collected, as displayed."""
        return self._cached(
            'pledged', lambda: self._text('li', 'pledged', 'strong'))

    @property
    def deadline(self):
        """Deadline date, as displayed."""
        return self._cached('deadline', lambda: self._text('div', 'deadline'))

    @property
    def url(self):
        """Project URL (not implemented yet).

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError
def get_base_url(category, separate):
    """Build the discover URL for *category* and the *separate* listing.

    *category* is percent-quoted before being inserted into the path;
    *separate* is inserted unchanged.  The resulting URL is printed and
    returned.
    """
    # ``quote`` lives in ``urllib`` on Python 2 and ``urllib.parse`` on
    # Python 3; importing it here keeps the function usable on both.
    try:
        from urllib import quote
    except ImportError:
        from urllib.parse import quote

    url = 'http://www.kickstarter.com/discover/categories/%s/%s' % (
        quote(category), separate)
    # Parenthesised single argument prints identically on Python 2 and 3.
    print('Analyze URL: %s' % url)
    return url
def get_projects(soup):
    """Return every ``div`` element of class ``project-card`` in *soup*."""
    card_filter = {'class': 'project-card'}
    return soup.findAll('div', card_filter)
def process_content(soup):
    """Wrap every project card found in *soup* in a ``Project``.

    Returns the list of ``Project`` objects in document order; the list is
    empty when the page contains no project cards.

    The earlier version asserted that the first project matched one
    specific page snapshot, which raised ``AssertionError`` on any other
    page and ``IndexError`` on a page without projects, and it discarded
    the list it built.
    """
    return [Project(card) for card in get_projects(soup)]
def get_page_number(soup):
    """Return the number of result pages advertised by *soup*.

    The count is read from the second-to-last ``a`` link inside the
    ``div`` of class ``pagination`` (presumably the last link is a "next"
    control -- verify against the live markup).

    Returns 1 when the pagination container is missing or holds fewer than
    two links, so callers treat such a page as the only one instead of
    failing with ``AttributeError`` / ``IndexError``.
    """
    pagination = soup.find('div', {
        'class': 'pagination'
    })
    if pagination is None:
        return 1
    links = pagination.findAll('a')
    if len(links) < 2:
        return 1
    return int(links[-2].text)
def process(base_url, max_pages=1):
    """Fetch the listing at *base_url* and process its result pages.

    Args:
        base_url: listing URL; the page index is sent as the ``page``
            query parameter.
        max_pages: upper bound on the number of pages to process.  The
            default of 1 keeps the previous behaviour, where a debug
            ``return`` stopped after the first page and left the
            pagination loop unreachable.  Pass ``None`` to process every
            page reported by ``get_page_number``.

    Raises:
        requests.HTTPError: if a page request returns an error status, so
            an error page is never parsed as if it were a listing.
    """
    def fetch_soup(page):
        # One place for the request/parse pair that used to be duplicated.
        resp = requests.get(
            base_url,
            params={
                'page': page
            }
        )
        resp.raise_for_status()
        return bs(resp.content)

    soup = fetch_soup(1)
    process_content(soup)
    if max_pages is not None and max_pages <= 1:
        return

    limit = get_page_number(soup)
    if max_pages is not None:
        limit = min(limit, max_pages)
    # ``range`` instead of ``xrange`` so the loop runs on Python 2 and 3.
    for page in range(2, limit + 1):
        process_content(fetch_soup(page))
def main():
    """Process every configured category/listing combination.

    The earlier version used ``return process(...)`` inside the nested
    loop, which stopped after the first combination; any additional
    category or listing would have been silently ignored.
    """
    categories = [
        'product design',
    ]
    separates = [
        'successful',
        # 'most-funded' is a further listing name, currently disabled.
    ]
    for category in categories:
        for separate in separates:
            process(get_base_url(category, separate))


# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment