Created
December 27, 2014 08:02
-
-
Save bcdejp/67cb9321c296f510e0d2 to your computer and use it in GitHub Desktop.
Web(HTML)から情報を抽出する
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Extract information from a web page (HTML) using requests and lxml.

Fetches URL and prints, in order: the href of every link, the leading text
of every <h1>, the direct text of the element with id "content1", and the
complete text of the element with id "content".
"""
import requests
import lxml.html

URL = "http://make.bcde.jp/category/1/"

# Fetch the web page (HTML).
# The timeout keeps the script from blocking indefinitely on an unresponsive
# server; raise_for_status() aborts on 4xx/5xx instead of scraping an error
# page.
req = requests.get(URL, timeout=10)
req.raise_for_status()

# Parse the raw bytes rather than req.text so the HTML parser can pick the
# encoding up from the document itself; req.text depends on the charset that
# requests infers from the HTTP headers, which may not match the page.
root = lxml.html.fromstring(req.content)

# Extract the links written as <a href=""></a>.
anchors = root.xpath('//a')
for anchor in anchors:
    # Anchors without an href attribute (e.g. <a name="...">) are skipped;
    # indexing attrib['href'] directly would raise KeyError on them.
    href = anchor.get('href')
    if href is not None:
        print(href)

# Extract the contents of each h1.
h1s = root.xpath('//h1')
for h1 in h1s:
    # .text is only the text before the first child element, and is None
    # when the h1 starts with a child tag.
    print(h1.text)

# Look an element up by id and extract the text directly enclosed by the tag.
# get_element_by_id raises KeyError when no element has the given id.
content1 = root.get_element_by_id('content1').text
print(content1)

# Look an element up by id and extract all of the text inside the tag,
# including the text of nested elements.
content = root.get_element_by_id('content').text_content()
print(content)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment