shihongzhi · May 20, 2012 12:04
diff --git a/grep98list.py b/grep98list.py
 import urllib2
 from BeautifulSoup import BeautifulSoup

 #判断是不是table，即是不是帖子目录了
 def isTable(url):
     """Report whether the page at *url* is an index page without post rows.

     The page is downloaded and parsed, and every
     ``<td align="left" width="*">`` cell is collected.  ``helper`` treats
     the cells after the first one as post entries, so a page with at most
     one such cell has nothing to list and is walked as an index instead.

     Args:
         url: Address of the page to download.

     Returns:
         True when the page has at most one matching cell, False otherwise.
     """
     page = urllib2.urlopen(url)
     try:
         soup = BeautifulSoup(page)
     finally:
         # Release the HTTP connection even when parsing fails.
         page.close()
     return len(soup('td', align='left', width='*')) <= 1

 #path 保存的是当前走过的路径。为深度遍历
 def helper(url, path, fileHandle):
     """Walk the pages reachable from *url* depth-first and record post links.

     A page with more than one ``<td align="left" width="*">`` cell is
     treated as a post list: for every cell after the first, the path and
     the link text are printed, and one line of the form
     ``<path> http://www.cc98.org/<href>`` is written to *fileHandle*.
     Any other page is treated as an index: each ``<tr valign="middle">``
     row after the first supplies a link that is followed recursively, with
     the link text appended to *path*.

     Args:
         url: Address of the page to visit.
         path: Space-separated names of the index entries followed so far.
         fileHandle: Writable file object that receives the UTF-8 encoded
             output lines.
     """
     # Download and parse the page once. The check below is the same
     # criterion isTable() applies, but reusing this parse avoids fetching
     # every page a second time.
     page = urllib2.urlopen(url)
     try:
         soup = BeautifulSoup(page)
     finally:
         # Release the HTTP connection even when parsing fails.
         page.close()

     tdTags = soup('td', align='left', width='*')
     if len(tdTags) > 1:
         # Post list: the first matching cell is skipped, the rest are posts.
         for tdTag in tdTags[1:]:
             link = tdTag.contents[1]
             # .string is None when the link wraps nested markup; fall back
             # to an empty title instead of failing on the concatenation.
             title = link.string or u''
             print(path + " " + title)
             record = (path.encode('utf-8') + ' http://www.cc98.org/'
                       + link['href'].encode('utf-8') + '\n')
             fileHandle.write(record)
         return

     # Index page: child links are relative to the script part of the URL.
     # split() leaves the URL intact when it has no '?', whereas slicing
     # with find() would drop its last character in that case.
     base = url.split('?', 1)[0]
     for trTag in soup('tr', valign='middle')[1:]:
         link = trTag.contents[2].contents[0]
         name = link.string or u''
         helper(base + link['href'], path + ' ' + name, fileHandle)

 if __name__ == '__main__':
     # Script entry point: crawl from the hard-coded start page and save
     # every discovered post link to tempurl2.txt.
     url = 'http://www.cc98.org/list_best.asp?boardid=147'
     # The with-block closes the output file even if the crawl raises.
     with open('tempurl2.txt', 'w') as fileHandle:
         helper(url, '', fileHandle)
	import urllib2
	from BeautifulSoup import BeautifulSoup

	#判断是不是table，即是不是帖子目录了
	def isTable(url):
		"""Report whether the page at *url* is an index page without post rows.

		The page is downloaded and parsed, and every
		``<td align="left" width="*">`` cell is collected.  ``helper`` treats
		the cells after the first one as post entries, so a page with at most
		one such cell has nothing to list and is walked as an index instead.

		Args:
			url: Address of the page to download.

		Returns:
			True when the page has at most one matching cell, False otherwise.
		"""
		page = urllib2.urlopen(url)
		try:
			soup = BeautifulSoup(page)
		finally:
			# Release the HTTP connection even when parsing fails.
			page.close()
		return len(soup('td', align='left', width='*')) <= 1

	#path 保存的是当前走过的路径。为深度遍历
	def helper(url, path, fileHandle):
		"""Walk the pages reachable from *url* depth-first and record post links.

		A page with more than one ``<td align="left" width="*">`` cell is
		treated as a post list: for every cell after the first, the path and
		the link text are printed, and one line of the form
		``<path> http://www.cc98.org/<href>`` is written to *fileHandle*.
		Any other page is treated as an index: each ``<tr valign="middle">``
		row after the first supplies a link that is followed recursively, with
		the link text appended to *path*.

		Args:
			url: Address of the page to visit.
			path: Space-separated names of the index entries followed so far.
			fileHandle: Writable file object that receives the UTF-8 encoded
				output lines.
		"""
		# Download and parse the page once. The check below is the same
		# criterion isTable() applies, but reusing this parse avoids fetching
		# every page a second time.
		page = urllib2.urlopen(url)
		try:
			soup = BeautifulSoup(page)
		finally:
			# Release the HTTP connection even when parsing fails.
			page.close()

		tdTags = soup('td', align='left', width='*')
		if len(tdTags) > 1:
			# Post list: the first matching cell is skipped, the rest are posts.
			for tdTag in tdTags[1:]:
				link = tdTag.contents[1]
				# .string is None when the link wraps nested markup; fall back
				# to an empty title instead of failing on the concatenation.
				title = link.string or u''
				print(path + " " + title)
				record = (path.encode('utf-8') + ' http://www.cc98.org/'
					+ link['href'].encode('utf-8') + '\n')
				fileHandle.write(record)
			return

		# Index page: child links are relative to the script part of the URL.
		# split() leaves the URL intact when it has no '?', whereas slicing
		# with find() would drop its last character in that case.
		base = url.split('?', 1)[0]
		for trTag in soup('tr', valign='middle')[1:]:
			link = trTag.contents[2].contents[0]
			name = link.string or u''
			helper(base + link['href'], path + ' ' + name, fileHandle)

	if __name__ == '__main__':
		# Script entry point: crawl from the hard-coded start page and save
		# every discovered post link to tempurl2.txt.
		url = 'http://www.cc98.org/list_best.asp?boardid=147'
		# The with-block closes the output file even if the crawl raises.
		with open('tempurl2.txt', 'w') as fileHandle:
			helper(url, '', fileHandle)