Skip to content

Instantly share code, notes, and snippets.

@lmatt-bit
Created November 10, 2010 14:56
Show Gist options
  • Select an option

  • Save lmatt-bit/670947 to your computer and use it in GitHub Desktop.

Select an option

Save lmatt-bit/670947 to your computer and use it in GitHub Desktop.
BeautifulSoup示例
from BeautifulSoup import BeautifulSoup
import re
# BeautifulSoup walkthrough: build a small HTML document, parse it,
# pretty-print it, then navigate and search the resulting tree.
# NOTE: `print` is used as a statement below, i.e. Python 2 syntax.
# Apart from that `print`, the statements are bare expressions; the comment
# after each one is the value the author recorded for it (presumably from an
# interactive session -- run as a script, a bare expression prints nothing).

# Note that the sample markup is incomplete: neither <p> is closed and there
# is no </body>.
doc = ['<html><head><title>Page title</title></head>',
'<body><p id="firstpara" align="center">This is paragraph <b>one</b>.',
'<p id="secondpara" align="blah">This is paragraph <b>two</b>.',
'</html>']
doc = ''.join(doc)
soup = BeautifulSoup(doc)
# Printing the prettified soup effectively normalizes the page: the closing
# tags that the input left out (</p>, </body>) appear in the output below.
# The author notes that a forgotten </b> after "<b>some thing" would be
# filled in the same way.
print soup.prettify()
# The printed result follows.
# <html>
# <head>
# <title>
# Page title
# </title>
# </head>
# <body>
# <p id="firstpara" align="center">
# This is paragraph
# <b>
# one
# </b>
# .
# </p>
# <p id="secondpara" align="blah">
# This is paragraph
# <b>
# two
# </b>
# .
# </p>
# </body>
# </html>

# --- Navigating the tree ---
# Nodes can be reached with dot notation, which is very convenient.
soup.contents[0].name
# u'html'
soup.contents[0].contents[0].name
# u'head'
head = soup.contents[0].contents[0]
# .parent goes up to the enclosing tag.
head.parent.name
# u'html'
# .next gives the <title> element here.
head.next
# <title>Page title</title>
# .nextSibling is the node following <head> at the same level: <body>.
head.nextSibling.name
# u'body'
head.nextSibling.contents[0]
# <p id="firstpara" align="center">This is paragraph <b>one</b>.</p>
head.nextSibling.contents[0].nextSibling
# <p id="secondpara" align="blah">This is paragraph <b>two</b>.</p>

# --- Searching the tree ---
# Filters (tag name plus attribute values) can be used, a little like CSS
# selectors. findAll() gives back a list of matches.
soup.findAll('p', align="center")
# [<p id="firstpara" align="center">This is paragraph <b>one</b>. </p>]
# find() gives back a single match rather than a list.
soup.find('p', align="center")
# <p id="firstpara" align="center">This is paragraph <b>one</b>. </p>
# The soup object is called directly here, and the result is indexed like the
# list from findAll(); ['id'] then reads the tag's id attribute.
soup('p', align="center")[0]['id']
# u'firstpara'
# An attribute filter may also be a compiled regular expression; this one
# matches align values starting with "b".
soup.find('p', align=re.compile('^b.*'))['id']
# u'secondpara'
# A child tag can be reached by name (.b); .string is its text.
soup.find('p').b.string
# u'one'
soup('p')[1].b.string
# u'two'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment