Skip to content

Instantly share code, notes, and snippets.

@jinwei233
Created October 3, 2011 08:12
Show Gist options
  • Save jinwei233/1258682 to your computer and use it in GitHub Desktop.
Save jinwei233/1258682 to your computer and use it in GitHub Desktop.
beautifulsoup 分析html文档
#eg 1
import re
import urllib

from BeautifulSoup import BeautifulSoup
# Sample document: three of the four headings end in a "#<digits>" marker.
html_text = """
<h2>this is cool #12345678901</h2>
<h2>this is nothing</h2>
<h2>this is interesting #126666678901</h2>
<h2>this is blah #124445678901</h2>
"""
soup = BeautifulSoup(html_text)

# The pattern is " #" followed by 11 non-whitespace characters.  Each match
# is a node that sits inside its heading, so step up to .parent to print the
# whole <h2> element rather than just the matched text.
for elem in soup('h2', text=re.compile(r' #\S{11}')):
    # Parenthesised form prints identically on Python 2 and also parses on
    # Python 3.
    print(elem.parent)
#result
# <h2>this is cool #12345678901</h2>
# <h2>this is interesting #126666678901</h2>
# <h2>this is blah #124445678901</h2>
#eg2
# Download the page and close the response explicitly instead of leaving the
# connection open until the object happens to be garbage-collected.
response = urllib.urlopen('http://www.nytimes.com/2009/12/21/us/21storm.html')
try:
    html = response.read()
finally:
    response.close()

# BeautifulSoup is the class imported at the top of the file
# (`from BeautifulSoup import BeautifulSoup`), so it is called directly;
# `BeautifulSoup.BeautifulSoup` would be an attribute lookup on the class.
soup = BeautifulSoup(html)

# Collect the document's text nodes; these are filtered by visible() below.
texts = soup.findAll(text=True)
def visible(element):
    """Return True if *element* should be kept as visible page text.

    A node is rejected when the name of its parent is one of 'style',
    'script', '[document]', 'head' or 'title', or when its string form
    starts with an HTML comment (``<!-- ... -->``).

    element: a parsed text node exposing ``parent.name`` and a string form.
    Returns: bool.
    """
    if element.parent.name in ('style', 'script', '[document]', 'head', 'title'):
        return False
    # re.DOTALL lets "." cross newlines, so comments spanning several lines
    # are filtered out as well as single-line ones.
    # NOTE(review): this relies on str(element) keeping the <!-- --> delimiters
    # for comment nodes -- confirm for the BeautifulSoup version in use.
    if re.match('<!--.*-->', str(element), re.DOTALL):
        return False
    return True
# Keep only the text nodes for which visible() returns a true value.
visible_texts = filter(visible, texts)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment