Skip to content

Instantly share code, notes, and snippets.

@flyhigher139
Last active August 29, 2015 13:57
Show Gist options
  • Save flyhigher139/9757418 to your computer and use it in GitHub Desktop.
Save flyhigher139/9757418 to your computer and use it in GitHub Desktop.
抓取极客公园『产品观察』信息
# -*- coding: utf-8 -*-
import urllib2
from bs4 import BeautifulSoup
# Scrape the article entries listed on the GeekPark "read/seed" page and
# print the title, link, author and summary text of each one.

base_url = 'http://www.geekpark.net'
data = {}


def _tag_text(tag):
    """Return all text inside *tag*, or an empty string when *tag* is None.

    ``Tag.string`` is None whenever a tag has more than one child, and
    converting that to text yields the literal word "None".  ``get_text()``
    concatenates every nested string instead, so mixed content still works.
    """
    if tag is None:
        return u''
    return tag.get_text()


def _first_link(tag):
    """Return the first ``<a>`` element inside *tag*, or None if absent."""
    if tag is None:
        return None
    return tag.find('a')


# Fetch the listing page; close the HTTP response even if the read fails.
f = urllib2.urlopen('http://www.geekpark.net/read/seed')
try:
    src = f.read()
finally:
    f.close()

# Name the parser explicitly so the parse tree does not depend on which
# optional parser libraries happen to be installed on this machine.
soup = BeautifulSoup(src, 'html.parser')
data['title'] = soup.title

cast_div = soup.find(id='castlist-box')
if cast_div is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise SystemExit(
        "element with id 'castlist-box' not found; "
        "the page layout may have changed"
    )
entry = cast_div.find_all(class_='entry')

items = []
for item in entry:
    header_link = _first_link(item.find(class_='header'))
    author_link = _first_link(item.find(class_='meta'))
    main = item.find(class_='main')

    # A header without a link (or a link without href) yields an empty path
    # rather than raising, so one malformed entry does not abort the run.
    href = header_link.get('href', '') if header_link is not None else ''

    value_dict = {
        'title': _tag_text(header_link),
        'href': base_url + href,
        'author': _tag_text(author_link),
        'content': _tag_text(main),
    }
    items.append(value_dict)

# NOTE: no publication date is extracted above, so none is printed here.
# Parenthesized single-argument print produces the same output as the
# print statement while also being valid call syntax.
for v in items:
    print('title: %s' % v['title'])
    print('href: %s' % v['href'])
    print('author: %s' % v['author'])
    print('content: %s' % v['content'])
    print('----------')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment