Created
November 10, 2010 14:56
-
-
Save lmatt-bit/670947 to your computer and use it in GitHub Desktop.
BeautifulSoup示例
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Example: parse a small HTML document with BeautifulSoup, then navigate and search the resulting tree. | |
| from BeautifulSoup import BeautifulSoup | |
| import re | |
| doc = ['<html><head><title>Page title</title></head>', | |
| '<body><p id="firstpara" align="center">This is paragraph <b>one</b>.', | |
| '<p id="secondpara" align="blah">This is paragraph <b>two</b>.', | |
| '</html>'] | |
| doc = ''.join(doc) | |
| soup = BeautifulSoup(doc) | |
| print soup.prettify()# This effectively normalizes the page markup: tags left unclosed in the input (e.g. a forgotten </b> after <b>some thing) appear closed in the prettify output. | |
| # Printed output: | |
| # <html> | |
| # <head> | |
| # <title> | |
| # Page title | |
| # </title> | |
| # </head> | |
| # <body> | |
| # <p id="firstpara" align="center"> | |
| # This is paragraph | |
| # <b> | |
| # one | |
| # </b> | |
| # . | |
| # </p> | |
| # <p id="secondpara" align="blah"> | |
| # This is paragraph | |
| # <b> | |
| # two | |
| # </b> | |
| # . | |
| # </p> | |
| # </body> | |
| # </html> | |
| soup.contents[0].name# Nodes can be accessed with dot (attribute) notation, which is very convenient. | |
| # u'html' | |
| soup.contents[0].contents[0].name | |
| # u'head' | |
| head = soup.contents[0].contents[0] | |
| head.parent.name | |
| # u'html' | |
| head.next | |
| # <title>Page title</title> | |
| head.nextSibling.name | |
| # u'body' | |
| head.nextSibling.contents[0] | |
| # <p id="firstpara" align="center">This is paragraph <b>one</b>.</p> | |
| head.nextSibling.contents[0].nextSibling | |
| # <p id="secondpara" align="blah">This is paragraph <b>two</b>.</p> | |
| soup.findAll('p', align="center")# Filters can be used, somewhat like CSS selectors. | |
| # [<p id="firstpara" align="center">This is paragraph <b>one</b>. </p>] | |
| soup.find('p', align="center") | |
| # <p id="firstpara" align="center">This is paragraph <b>one</b>. </p> | |
| soup('p', align="center")[0]['id'] | |
| # u'firstpara' | |
| # An attribute filter can also be a compiled regex; here it selects the <p> whose align value starts with "b". | |
| soup.find('p', align=re.compile('^b.*'))['id'] | |
| # u'secondpara' | |
| soup.find('p').b.string | |
| # u'one' | |
| soup('p')[1].b.string | |
| # u'two' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment