Created
December 30, 2014 14:54
-
-
Save ntuaha/9defe6fdb3041517d68d to your computer and use it in GitHub Desktop.
抓連結的爬蟲
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import re | |
# Work around unicode vs. str conversion problems caused by the ASCII default codec
import sys | |
import os | |
import psycopg2 | |
import cookielib, urllib2,urllib | |
from lxml import html,etree | |
import StringIO | |
# Python 2 only: site.py deletes sys.setdefaultencoding at startup, and
# reload(sys) brings it back. Switching the default codec to UTF-8 stops
# implicit str <-> unicode conversions from failing on non-ASCII text.
reload(sys)
sys.setdefaultencoding('utf8')

if __name__ == "__main__":
    # Script entry point: download the cnyes rolling-news list page, parse it
    # as HTML, and print the text of one anchor selected by a fixed XPath.
    print("GG")

    # Download the page.
    url = 'http://news.cnyes.com/rollnews/list.shtml'
    response = urllib2.build_opener().open(url)
    try:
        the_page = response.read()
    finally:
        # Release the connection even when read() raises.
        response.close()

    # Turn the raw HTML into an element tree that can be queried.
    parser = etree.HTMLParser()
    root = etree.parse(StringIO.StringIO(the_page), parser)

    # Pick the anchor at the expected position. The path is purely
    # positional, so it matches nothing if the page layout differs.
    link_xpath = '//*[@id="container"]/div[4]/div[1]/div[2]/ul[2]/li[1]/a'
    anchors = root.xpath(link_xpath)
    if not anchors:
        # Report the failure clearly instead of dying with a bare IndexError.
        sys.stderr.write("No element matched XPath: %s\n" % link_xpath)
        sys.exit(1)
    print(anchors[0].text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment