Skip to content

Instantly share code, notes, and snippets.

@bluele
Last active December 15, 2015 18:48
Show Gist options
  • Save bluele/5306015 to your computer and use it in GitHub Desktop.
Save bluele/5306015 to your computer and use it in GitHub Desktop.
Get stats of Kickstarter projects
#-*- coding:utf-8 -*-
__author__ = 'bluele'
from BeautifulSoup import BeautifulSoup as bs
import urllib
import requests
import re
class Project(object):
    """Read-only view over the parsed HTML of one Kickstarter project card.

    Every property except ``url`` pulls its text out of the wrapped soup node
    on first access and returns the stored value on later accesses.
    """

    # Plain raw literal (no ``u`` prefix) so the class parses on Python 3 as
    # well as Python 2.  The author text is expected to be "by", a newline,
    # then the author's name.
    _AUTHOR_PATTERN = re.compile(r'^by\n(?P<name>.*)')

    def __init__(self, soup):
        """Wrap *soup*, the parsed element of a single project card."""
        self.soup = soup
        # Extracted values keyed by field name.  The earlier implementation
        # tested hasattr(self, '__name'), which never matched because the
        # assignment inside the class is name-mangled to _Project__name, so
        # nothing was ever served from the cache.
        self._cache = {}

    def _cached(self, key, extract):
        """Return the value stored under *key*, calling *extract* only once."""
        try:
            return self._cache[key]
        except KeyError:
            value = self._cache[key] = extract()
            return value

    def _title_node(self):
        """Return the <h2 class="bbcard_name"> element of the card."""
        return self.soup.find('h2', {'class': 'bbcard_name'})

    def _parse_author(self, author_text):
        """Strip the leading "by" line from *author_text*.

        Returns:
            The text following "by" and the newline, up to the end of
            that line.

        Raises:
            ValueError: if *author_text* does not start with "by" followed
                by a newline; the offending text is the exception argument.
        """
        match = self._AUTHOR_PATTERN.search(author_text)
        if match is None:
            raise ValueError(author_text)
        return match.group('name')

    @property
    def name(self):
        """Project name: text of the <strong> inside the title <h2>."""
        return self._cached(
            'name',
            lambda: self._title_node().find('strong').text,
        )

    @property
    def author(self):
        """Project author: the title's <span> text without its "by" line.

        Raises:
            ValueError: if the <span> text does not have the expected form.
        """
        return self._cached(
            'author',
            lambda: self._parse_author(self._title_node().find('span').text),
        )

    @property
    def funded(self):
        """Amount raised relative to the goal.

        Text of the <strong> inside <li class="first funded">.
        """
        return self._cached(
            'funded',
            lambda: self.soup.find(
                'li', {'class': 'first funded'}
            ).find('strong').text,
        )

    @property
    def pledged(self):
        """Amount raised: text of the <strong> inside <li class="pledged">."""
        return self._cached(
            'pledged',
            lambda: self.soup.find('li', {'class': 'pledged'}).find('strong').text,
        )

    @property
    def deadline(self):
        """Deadline: text of <div class="deadline">, returned unparsed."""
        return self._cached(
            'deadline',
            lambda: self.soup.find('div', {'class': 'deadline'}).text,
        )

    @property
    def url(self):
        """Project URL.  Not implemented yet.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError
def get_base_url(category, separate):
    """Build the Kickstarter discover URL for one category listing.

    Args:
        category: category name; it is percent-encoded before being put in
            the path (e.g. 'product design' becomes 'product%20design').
        separate: listing segment appended verbatim after the category
            (e.g. 'successful').

    Returns:
        The listing URL as a string.  The URL is also printed to stdout.
    """
    # Resolved locally so the function runs on Python 3 (urllib.parse.quote)
    # as well as Python 2 (urllib.quote) without touching the file's imports.
    try:
        from urllib.parse import quote
    except ImportError:
        from urllib import quote
    url = 'http://www.kickstarter.com/discover/categories/%s/%s' % (
        quote(category),
        separate,
    )
    # Parenthesised single argument: prints the same text under the Python 2
    # print statement and the Python 3 print function.
    print('Analyze URL: %s' % url)
    return url
def get_projects(soup):
    """Return every <div class="project-card"> element found in *soup*."""
    card_filter = {'class': 'project-card'}
    return soup.findAll('div', card_filter)
def process_content(soup):
    """Wrap every project card found in *soup* in a Project.

    Args:
        soup: parsed listing page, as accepted by get_projects().

    Returns:
        A list of Project objects in the order get_projects() yields the
        cards; empty when the page contains no cards.
    """
    # The earlier version asserted that the first card equalled one specific
    # project from an April 2013 listing.  That fails on any other page or
    # date and raises IndexError on a page without cards, so those debugging
    # assertions are gone and the parsed projects are returned instead.
    return [Project(card) for card in get_projects(soup)]


def get_page_number(soup):
    """Return the number of listing pages advertised by *soup*.

    The count is the text of the second-to-last <a> inside
    <div class="pagination"> (presumably the last link is a "next" control;
    verify against the live markup).

    Returns:
        The page count as an int.  1 is returned when the page has no
        pagination div or the div holds fewer than two links, on the
        assumption that such a listing fits on a single page.

    Raises:
        ValueError: if the link text is not an integer.
    """
    pagination = soup.find('div', {'class': 'pagination'})
    # Compare with None explicitly rather than relying on the element's
    # truthiness.
    if pagination is None:
        return 1
    links = pagination.findAll('a')
    if len(links) < 2:
        return 1
    return int(links[-2].text)


def _fetch_soup(base_url, page):
    """Download page number *page* of *base_url* and return it parsed."""
    resp = requests.get(base_url, params={'page': page})
    # Fail loudly on HTTP errors instead of parsing an error page.
    resp.raise_for_status()
    return bs(resp.content)


def process(base_url):
    """Scrape every page of the listing at *base_url*.

    The first page is fetched to learn the page count, then pages 2..N are
    fetched in order.  (A leftover debugging ``return`` used to stop after
    the first page, leaving the pagination loop unreachable.)

    Args:
        base_url: listing URL; the page number is sent as the ``page``
            query parameter.

    Returns:
        A list with the Project objects of all pages, in page order.
    """
    first_page = _fetch_soup(base_url, 1)
    projects = process_content(first_page)
    limit = get_page_number(first_page)
    # range() rather than xrange() so the loop runs on Python 2 and 3 alike.
    for page in range(2, limit + 1):
        projects.extend(process_content(_fetch_soup(base_url, page)))
    return projects
def main():
    """Scrape every configured category / listing combination."""
    categories = [
        'product design',
    ]
    separates = [
        'successful',
        # 'most-funded'
    ]
    for category in categories:
        for separate in separates:
            # No ``return`` here: returning from inside the loops stopped
            # after the first combination, so any further category or
            # listing added above would never have been processed.
            process(get_base_url(category, separate))
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment