lmatt-bit · November 10, 2010 14:56
diff --git a/BeautifulSoup.py b/BeautifulSoup.py
 from BeautifulSoup import BeautifulSoup
 import re

 # Walkthrough of basic BeautifulSoup usage on a small HTML document:
 # build the markup, parse it, pretty-print it, then navigate and search the tree.
 # A "# ..." line directly below an expression shows the value the author
 # reports for that expression.
 # Note that the <p> and <body> tags are left unclosed in this input.
 doc = ['<html><head><title>Page title</title></head>',
        '<body><p id="firstpara" align="center">This is paragraph <b>one</b>.',
        '<p id="secondpara" align="blah">This is paragraph <b>two</b>.',
        '</html>']

 # Concatenate the fragments into a single HTML string before parsing.
 doc = ''.join(doc)
 soup = BeautifulSoup(doc)

 # Print the document in normalized form: tags that were left unclosed in the
 # input (e.g. a forgotten </b> after "<b>some thing") appear closed in the output.
 print soup.prettify()
 # The printed result is shown below.
 # <html>
 #  <head>
 #   <title>
 #    Page title
 #   </title>
 #  </head>
 #  <body>
 #   <p id="firstpara" align="center">
 #    This is paragraph
 #    <b>
 #     one
 #    </b>
 #    .
 #   </p>
 #   <p id="secondpara" align="blah">
 #    This is paragraph
 #    <b>
 #     two
 #    </b>
 #    .
 #   </p>
 #  </body>
 # </html>

 soup.contents[0].name  # Nodes can be reached with dot notation, which is very convenient.
 # u'html'

 soup.contents[0].contents[0].name
 # u'head'

 # Keep a handle on the <head> node and walk outward from it.
 head = soup.contents[0].contents[0]
 head.parent.name
 # u'html'

 head.next
 # <title>Page title</title>

 head.nextSibling.name
 # u'body'

 head.nextSibling.contents[0]
 # <p id="firstpara" align="center">This is paragraph <b>one</b>.</p>

 head.nextSibling.contents[0].nextSibling
 # <p id="secondpara" align="blah">This is paragraph <b>two</b>.</p>

 soup.findAll('p', align="center")  # Filters can be used, somewhat like CSS selectors.
 # [<p id="firstpara" align="center">This is paragraph <b>one</b>. </p>]

 # find returns the single matching tag rather than a list (compare the output above).
 soup.find('p', align="center")
 # <p id="firstpara" align="center">This is paragraph <b>one</b>. </p>

 # Calling the soup object directly is indexed here like the findAll result;
 # presumably it is shorthand for findAll -- confirm against the library docs.
 soup('p', align="center")[0]['id']
 # u'firstpara'

 # The attribute filter is a compiled regex: it selects the <p> whose align
 # value starts with "b" (align="blah").
 soup.find('p', align=re.compile('^b.*'))['id']
 # u'secondpara'

 soup.find('p').b.string
 # u'one'

 soup('p')[1].b.string
 # u'two'
	from BeautifulSoup import BeautifulSoup
	import re

	# Walkthrough of basic BeautifulSoup usage on a small HTML document:
	# build the markup, parse it, pretty-print it, then navigate and search the tree.
	# A "# ..." line directly below an expression shows the value the author
	# reports for that expression.
	# Note that the <p> and <body> tags are left unclosed in this input.
	doc = ['<html><head><title>Page title</title></head>',
	'<body><p id="firstpara" align="center">This is paragraph <b>one</b>.',
	'<p id="secondpara" align="blah">This is paragraph <b>two</b>.',
	'</html>']

	# Concatenate the fragments into a single HTML string before parsing.
	doc = ''.join(doc)
	soup = BeautifulSoup(doc)

	# Print the document in normalized form: tags that were left unclosed in the
	# input (e.g. a forgotten </b> after "<b>some thing") appear closed in the output.
	print soup.prettify()
	# The printed result is shown below.
	# <html>
	#  <head>
	#   <title>
	#    Page title
	#   </title>
	#  </head>
	#  <body>
	#   <p id="firstpara" align="center">
	#    This is paragraph
	#    <b>
	#     one
	#    </b>
	#    .
	#   </p>
	#   <p id="secondpara" align="blah">
	#    This is paragraph
	#    <b>
	#     two
	#    </b>
	#    .
	#   </p>
	#  </body>
	# </html>

	soup.contents[0].name  # Nodes can be reached with dot notation, which is very convenient.
	# u'html'

	soup.contents[0].contents[0].name
	# u'head'

	# Keep a handle on the <head> node and walk outward from it.
	head = soup.contents[0].contents[0]
	head.parent.name
	# u'html'

	head.next
	# <title>Page title</title>

	head.nextSibling.name
	# u'body'

	head.nextSibling.contents[0]
	# <p id="firstpara" align="center">This is paragraph <b>one</b>.</p>

	head.nextSibling.contents[0].nextSibling
	# <p id="secondpara" align="blah">This is paragraph <b>two</b>.</p>

	soup.findAll('p', align="center")  # Filters can be used, somewhat like CSS selectors.
	# [<p id="firstpara" align="center">This is paragraph <b>one</b>. </p>]

	# find returns the single matching tag rather than a list (compare the output above).
	soup.find('p', align="center")
	# <p id="firstpara" align="center">This is paragraph <b>one</b>. </p>

	# Calling the soup object directly is indexed here like the findAll result;
	# presumably it is shorthand for findAll -- confirm against the library docs.
	soup('p', align="center")[0]['id']
	# u'firstpara'

	# The attribute filter is a compiled regex: it selects the <p> whose align
	# value starts with "b" (align="blah").
	soup.find('p', align=re.compile('^b.*'))['id']
	# u'secondpara'

	soup.find('p').b.string
	# u'one'

	soup('p')[1].b.string
	# u'two'
No results found