Skip to content

Instantly share code, notes, and snippets.

@regen100
Created August 5, 2014 22:49
Show Gist options
  • Save regen100/c974c9b583cabd831acf to your computer and use it in GitHub Desktop.
Save regen100/c974c9b583cabd831acf to your computer and use it in GitHub Desktop.
DISCAS scraping
# -*- coding: utf-8 -*-
import lxml.html
# URL template for a title's detail page; the title ID is substituted as a
# zero-padded 10-digit decimal number.
URLFORMAT = 'http://www.discas.net/netdvd/cd/goodsDetail.do?titleID=%010d'
# Compile the XPath expressions once so they can be reused for every page.
# find_title: string value of the og:title <meta> tag's content attribute.
find_title = lxml.etree.XPath(
'string(//head/meta[@property="og:title"]/@content)')
# is_available: true when a form named "UpdateWishListForm" exists under the
# element whose id is "sectionGoods".
is_available = lxml.etree.XPath(
'boolean(id("sectionGoods")//form[@name="UpdateWishListForm"])')
def getinfo(titleid):
    """Fetch a title's detail page and report its name and availability.

    Args:
        titleid: Integer title ID; substituted into ``URLFORMAT`` to build
            the page URL.

    Returns:
        A ``(title, available)`` pair: the result of ``find_title`` and the
        result of ``is_available`` evaluated on the parsed page.
    """
    # libxml2 does not recognize the "Windows-31J" charset name, so the
    # parser is told explicitly to decode the page as CP932.
    cp932_parser = lxml.html.HTMLParser(encoding='cp932')
    page = lxml.html.parse(URLFORMAT % titleid, cp932_parser)
    # The title comes from the page metadata; availability is signalled by
    # the presence of the "add to single-item list" form.
    return find_title(page), is_available(page)
if __name__ == '__main__':
    # Title IDs to look up.
    wishlist = [683928, 1209833, 1795844]
    for titleid in wishlist:
        title, available = getinfo(titleid)
        # 'o' marks a title that can currently be added, 'x' one that cannot.
        mark = 'o' if available else 'x'
        # Pass one pre-formatted argument so the line prints identically
        # whether ``print`` is a statement (Python 2) or a function
        # (Python 3). The old ``print (...)[available], title`` form raised
        # TypeError on Python 3 by subscripting print()'s None return value.
        print('%s %s' % (mark, title))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment