Skip to content

Instantly share code, notes, and snippets.

@staybuzz
Last active December 13, 2015 17:05
Show Gist options
  • Save staybuzz/9f92cb933b48e7f2d107 to your computer and use it in GitHub Desktop.
Save staybuzz/9f92cb933b48e7f2d107 to your computer and use it in GitHub Desktop.
俺ガイル公式ページの最新ニュースを拾ってくるやつ
#!/usr/bin/env python2
# coding: utf-8
import re
import requests
import lxml.html
from selenium import webdriver
class oregairu_hook(object):
    """Fetch the newest news entry from the TBS "oregairu" anime page.

    The page is loaded through a PhantomJS webdriver and parsed with lxml.
    After ``main()`` runs, ``latest_data`` holds the keys ``'date'``,
    ``'title'`` and ``'url'`` for the first entry found on the page.

    The driver created in ``__init__`` is not shut down by this class;
    callers should call ``self.driver.quit()`` when they are finished.
    """

    # Digits separated by two literal dots (matches e.g. "2015.06.25").
    _DATE_PATTERN = re.compile(r'[0-9]*\.[0-9]*\.[0-9]*')

    # The title is taken as everything after the first 10 characters of the
    # headline text.  NOTE(review): this presumes the headline starts with a
    # 10-character date such as "YYYY.MM.DD" -- confirm against the live page.
    _DATE_WIDTH = 10

    def __init__(self):
        self.target_url = 'http://www.tbs.co.jp/anime/oregairu/'
        self.driver = webdriver.PhantomJS()
        self.latest_data = {}

    def get_html(self):
        """Load ``target_url`` in the webdriver and return the page source."""
        self.driver.get(self.target_url)
        # For debugging, self.driver.save_screenshot('oregairu.jpg') can be
        # called here to capture what the headless browser rendered.
        return self.driver.page_source

    @staticmethod
    def _split_headline(text):
        """Split headline text into a ``(date, title)`` tuple.

        ``date`` is the first dotted-number match in ``text``; ``title`` is
        ``text`` with its first ``_DATE_WIDTH`` characters removed.

        Raises:
            ValueError: if ``text`` contains no dotted-number date.
        """
        match = oregairu_hook._DATE_PATTERN.search(text)
        if match is None:
            raise ValueError('no date found in headline: %r' % (text,))
        return match.group(0), text[oregairu_hook._DATE_WIDTH:]

    def main(self):
        """Scrape the latest entry, store it in ``latest_data`` and print it.

        Raises:
            ValueError: if the page has no ``.cbox h4`` headline, no
                ``.cbox a`` link with an ``href``, or the headline carries
                no date.
        """
        page = self.get_html()
        root = lxml.html.fromstring(page)

        # Get the latest article's publish date and title.
        headlines = root.cssselect('.cbox h4')
        if not headlines:
            raise ValueError(
                "no '.cbox h4' headline found on %s" % self.target_url)
        date, title = self._split_headline(headlines[0].text_content())

        # The first link inside the box holds a URL relative to target_url.
        links = root.cssselect('.cbox a')
        if not links:
            raise ValueError(
                "no '.cbox a' link found on %s" % self.target_url)
        href = links[0].get('href')
        if href is None:
            raise ValueError('first .cbox link has no href attribute')
        url = self.target_url + href  # absolute url

        self.latest_data['date'] = date
        self.latest_data['title'] = title
        self.latest_data['url'] = url

        # Single-argument print calls behave the same on Python 2 and 3.
        print(self.latest_data['date'])
        print(self.latest_data['title'])
        print(self.latest_data['url'])
if __name__ == '__main__':
    # Script entry point: scrape once and print the latest news entry.
    oh = oregairu_hook()
    try:
        oh.main()
    finally:
        # Always shut the webdriver down so the headless browser process
        # does not outlive the script, even when scraping fails.
        oh.driver.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment