Skip to content

Instantly share code, notes, and snippets.

@PyYoshi
Created January 9, 2012 08:15
Show Gist options
  • Select an option

  • Save PyYoshi/1581859 to your computer and use it in GitHub Desktop.

Select an option

Save PyYoshi/1581859 to your computer and use it in GitHub Desktop.
startups-japan.comから企業リンクのみ抽出
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from lxml.html import fromstring
import urllib
# Accumulator for scraped links (module-level; kept for backward
# compatibility — nothing below writes to it, but external code might).
result_links = []

# Host prefix shared by every page we scrape.
BASE_URL = 'http://www.startups-japan.com'

# Client listing pages 1..6. Generated instead of hand-writing six
# near-identical literals, so the page count is changed in one place.
urls = [BASE_URL + '/clients/index/page:%d/' % page for page in range(1, 7)]
def parent_scraper(urls):
    """Fetch each client-listing page and collect absolute links to client pages.

    urls -- iterable of listing-page URLs to download.
    Returns a list of absolute URLs, one per <a> matched by the XPath below.
    NOTE(review): Python 2 code (urllib.urlopen); performs network I/O.
    """
    # The XPath is loop-invariant; bind it once instead of per iteration.
    xpath = r'/html/body/div/div[2]/div/div/div/a'
    results = []
    for url in urls:
        html = urllib.urlopen(url).read()
        et = fromstring(html)
        for anchor in et.xpath(xpath):
            # hrefs on the listing pages are site-relative, so prefix the host.
            results.append('http://www.startups-japan.com' + anchor.attrib['href'])
    return results
def mainp_scraper(urls):
    """Fetch each client detail page and extract the company's external link.

    urls -- iterable of client-page URLs (as returned by parent_scraper).
    Returns a list of href values taken verbatim from the matched <a>
    elements — presumably already absolute URLs; TODO confirm against the site.
    NOTE(review): Python 2 code (urllib.urlopen); performs network I/O.
    """
    # Loop-invariant XPath hoisted out of the download loop.
    xpath = r'/html/body/div/div[2]/div/div/p[2]/a'
    results = []
    for url in urls:
        html = urllib.urlopen(url).read()
        et = fromstring(html)
        for anchor in et.xpath(xpath):
            results.append(anchor.attrib['href'])
    return results
def main():
ret1 = parent_scraper(urls)
result = mainp_scraper(ret1)
for result in results:
print result
if __name__ in '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment