jinwei233 · October 3, 2011 08:12
diff --git a/beatifulsoup_eg.py b/beatifulsoup_eg.py
 #eg 1

 from BeautifulSoup import BeautifulSoup
 import re

 # Sample markup: three of the four headings carry a ' #<id>' suffix.
 html_text = """
 <h2>this is cool #12345678901</h2>
 <h2>this is nothing</h2>
 <h2>this is interesting #126666678901</h2>
 <h2>this is blah #124445678901</h2>
 """

 soup = BeautifulSoup(html_text)

 # Look for strings containing ' #' followed by 11 non-space characters
 # (the sample output below shows longer runs matching as well), then print
 # the parent of every matching string.
 for elem in soup.findAll('h2', text=re.compile(r' #\S{11}')):
    print(elem.parent)

 # result (printed output of eg 1):
 # <h2>this is cool #12345678901</h2>
 # <h2>this is interesting #126666678901</h2>
 # <h2>this is blah #124445678901</h2>





 #eg2

 # urllib is not imported anywhere else in this file, so bring it in here.
 import urllib
 from contextlib import closing

 # Download the article; closing() releases the HTTP response even if read() fails.
 with closing(
    urllib.urlopen('http://www.nytimes.com/2009/12/21/us/21storm.html')
 ) as response:
    html = response.read()

 # BeautifulSoup is the class imported at the top of the file
 # (from BeautifulSoup import BeautifulSoup), so it is called directly.
 soup = BeautifulSoup(html)

 # text=True asks for the document's strings instead of its tags.
 texts = soup.findAll(text=True)

 def visible(element):
    """Return True if a parsed text node should count as visible page text.

    element must expose ``parent.name`` and be convertible with ``str()``
    (presumably a node from ``soup.findAll(text=True)``).

    Returns False when the node's parent is one of style, script,
    [document], head or title, or when the node's string form starts with
    ``<!--`` and contains a closing ``-->`` (an HTML comment, possibly
    spanning several lines). Returns True otherwise.
    """
    if element.parent.name in ('style', 'script', '[document]', 'head', 'title'):
        return False
    # DOTALL lets '.' cross newlines, so multi-line comments are rejected too.
    if re.match(r'<!--.*-->', str(element), re.DOTALL):
        return False
    return True

 # Keep only the text nodes that visible() accepts.
 visible_texts = filter(visible, texts)
	#eg 1

	from BeautifulSoup import BeautifulSoup
	import re

	# Sample markup: three of the four headings carry a ' #<id>' suffix.
	html_text = """
	<h2>this is cool #12345678901</h2>
	<h2>this is nothing</h2>
	<h2>this is interesting #126666678901</h2>
	<h2>this is blah #124445678901</h2>
	"""

	soup = BeautifulSoup(html_text)

	# Look for strings containing ' #' followed by 11 non-space characters
	# (the sample output below shows longer runs matching as well), then print
	# the parent of every matching string.
	for elem in soup.findAll('h2', text=re.compile(r' #\S{11}')):
	    print(elem.parent)

	# result (printed output of eg 1):
	# <h2>this is cool #12345678901</h2>
	# <h2>this is interesting #126666678901</h2>
	# <h2>this is blah #124445678901</h2>





	#eg2

	# urllib is not imported anywhere else in this file, so bring it in here.
	import urllib
	from contextlib import closing

	# Download the article; closing() releases the HTTP response even if read() fails.
	with closing(
	    urllib.urlopen('http://www.nytimes.com/2009/12/21/us/21storm.html')
	) as response:
	    html = response.read()

	# BeautifulSoup is the class imported at the top of the file
	# (from BeautifulSoup import BeautifulSoup), so it is called directly.
	soup = BeautifulSoup(html)

	# text=True asks for the document's strings instead of its tags.
	texts = soup.findAll(text=True)

	def visible(element):
	    """Return True if a parsed text node should count as visible page text.

	    element must expose ``parent.name`` and be convertible with ``str()``
	    (presumably a node from ``soup.findAll(text=True)``).

	    Returns False when the node's parent is one of style, script,
	    [document], head or title, or when the node's string form starts with
	    ``<!--`` and contains a closing ``-->`` (an HTML comment, possibly
	    spanning several lines). Returns True otherwise.
	    """
	    if element.parent.name in ('style', 'script', '[document]', 'head', 'title'):
	        return False
	    # DOTALL lets '.' cross newlines, so multi-line comments are rejected too.
	    if re.match(r'<!--.*-->', str(element), re.DOTALL):
	        return False
	    return True

	# Keep only the text nodes that visible() accepts.
	visible_texts = filter(visible, texts)