Skip to content

Instantly share code, notes, and snippets.

@shihongzhi
Created May 20, 2012 12:04
Show Gist options
  • Save shihongzhi/2757840 to your computer and use it in GitHub Desktop.
Save shihongzhi/2757840 to your computer and use it in GitHub Desktop.
把cc98中包含的网址抓取下来,保存在tempurl2.txt中
import urllib2
from BeautifulSoup import BeautifulSoup
def isTable(url):
    """Report whether the page at *url* is an index page rather than a leaf listing.

    The page is downloaded and parsed, and its ``<td align="left" width="*">``
    cells are counted.  ``helper`` recurses into pages for which this returns
    True and harvests links from pages for which it returns False.

    Args:
        url: Address of the page to inspect.

    Returns:
        True when the page contains at most one matching cell, False otherwise.
    """
    page = urllib2.urlopen(url)
    try:
        # BeautifulSoup reads the whole response while constructing the tree,
        # so the connection can be released as soon as parsing returns.
        soup = BeautifulSoup(page)
    finally:
        page.close()
    tdTags = soup('td', align='left', width='*')
    return len(tdTags) <= 1
def helper(url, path, fileHandle):
    """Crawl depth-first from *url*, writing every leaf link to *fileHandle*.

    A page with more than one ``<td align="left" width="*">`` cell is treated
    as a leaf listing (the same test ``isTable`` applies, inverted): each cell
    after the first contributes one output line of the form
    ``<path> http://www.cc98.org/<href>`` and is echoed to stdout.  Any other
    page is treated as an index: each ``<tr valign="middle">`` row after the
    first is followed recursively, with the row's link text appended to *path*.

    Args:
        url: Address of the page to process.
        path: Space-separated names of the index entries followed so far.
        fileHandle: Writable file object that receives UTF-8 encoded lines.

    Returns:
        None.
    """
    # Fetch and parse once; the parsed tree serves both the leaf/index check
    # and the link extraction, so every page costs a single request.
    page = urllib2.urlopen(url)
    try:
        soup = BeautifulSoup(page)
    finally:
        page.close()

    tdTags = soup('td', align='left', width='*')
    if len(tdTags) > 1:
        # Leaf listing. The first matching cell is skipped (presumably a
        # header cell -- confirm against the live markup).
        for tdTag in tdTags[1:]:
            link = tdTag.contents[1]
            # .string is None when the link wraps nested markup; fall back to
            # an empty title so the URL is still recorded.
            title = link.string or u''
            print(path + " " + title)
            fileHandle.write(path.encode('utf-8') + ' http://www.cc98.org/'
                             + link['href'].encode('utf-8') + '\n')
        return

    # Index page. Child hrefs are appended to this URL minus its query string.
    # partition() leaves the URL intact when it has no '?', whereas slicing
    # with find() would chop off the last character in that case.
    baseUrl = url.partition('?')[0]
    # The first matching row is skipped (presumably a header row -- confirm).
    for trTag in soup('tr', valign='middle')[1:]:
        link = trTag.contents[2].contents[0]
        childUrl = baseUrl + link['href']
        name = link.string or u''
        helper(childUrl, path + ' ' + name, fileHandle)
if __name__ == '__main__':
    # Root listing page from which the crawl starts.
    url = 'http://www.cc98.org/list_best.asp?boardid=147'
    # The with-block closes (and flushes) the output file even when the crawl
    # aborts on a network or parsing error, so partial results are kept.
    with open('tempurl2.txt', 'w') as fileHandle:
        helper(url, '', fileHandle)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment