Last active
November 8, 2023 13:24
-
-
Save RichardBronosky/4060082 to your computer and use it in GitHub Desktop.
My solution to: http://stackoverflow.com/questions/9007653/how-to-find-tag-with-particular-text-with-beautiful-soup
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<tr>
<td class="pos">\n
"Some text:"\n
<br>\n
<strong>some value</strong>\n
</td>
</tr>
<tr>
<td class="pos">\n
"Fixed text:"\n
<br>\n
<strong>text I am looking for</strong>\n
</td>
</tr>
<tr>
<td class="pos">\n
"Some other text:"\n
<br>\n
<strong>some other value</strong>\n
</td>
</tr>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Taken from https://gist.github.com/4060082 | |
# If you have BeautifulSoup, you can test this locally via: | |
# curl https://gist.githubusercontent.com/RichardBronosky/4060082/raw/test.py | python | |
from BeautifulSoup import BeautifulSoup | |
from urllib2 import urlopen | |
from pprint import pprint | |
import re | |
soup = BeautifulSoup(urlopen('https://gist.githubusercontent.com/RichardBronosky/4060082/raw/test.html').read()) | |
# I'm going to assume that Peter knew that re.compile is meant to cache a computation result for a performance benefit. However, I'm going to do that explicitly here to be very clear. | |
pattern = re.compile('Fixed text') | |
# Peter's suggestion here returns a list of what appear to be strings | |
columns = soup.findAll('td', text=pattern, attrs={'class' : 'pos'}) | |
# ...but it is actually a BeautifulSoup.NavigableString | |
print type(columns[0]) | |
#>> <class 'BeautifulSoup.NavigableString'> | |
# you can reach the tag using one of the convenience attributes seen here | |
pprint(columns[0].__dict__) | |
#>> {'next': <br />, | |
#>> 'nextSibling': <br />, | |
#>> 'parent': <td class="pos">\n | |
#>> "Fixed text:"\n | |
#>> <br />\n | |
#>> <strong>text I am looking for</strong>\n | |
#>> </td>, | |
#>> 'previous': <td class="pos">\n | |
#>> "Fixed text:"\n | |
#>> <br />\n | |
#>> <strong>text I am looking for</strong>\n | |
#>> </td>, | |
#>> 'previousSibling': None} | |
# I feel that 'parent' is safer to use than 'previous' based on http://www.crummy.com/software/BeautifulSoup/bs4/doc/#method-names | |
# So, if you want to find the 'text' in the 'strong' element... | |
pprint([t.parent.find('strong').text for t in soup.findAll('td', text=pattern, attrs={'class' : 'pos'})]) | |
#>> [u'text I am looking for'] | |
# Here is what we have learned: | |
print soup.find('strong') | |
#>> <strong>some value</strong> | |
print soup.find('strong', text='some value') | |
#>> u'some value' | |
print soup.find('strong', text='some value').parent | |
#>> <strong>some value</strong> | |
print soup.find('strong', text='some value') == soup.find('strong') | |
#>> False | |
print soup.find('strong', text='some value') == soup.find('strong').text | |
#>> True | |
print soup.find('strong', text='some value').parent == soup.find('strong') | |
#>> True |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Update for Python 3.12 on Debian 12:
from bs4 import BeautifulSoup
import urllib.request
import re
# Raw test.html fixture from the gist; an empty URL makes urlopen() raise
# ValueError before anything is fetched.
url = 'https://gist.githubusercontent.com/RichardBronosky/4060082/raw/test.html'

# The response object is a context manager, so the `with` statement closes the
# connection as soon as the document has been parsed.
with urllib.request.urlopen(url) as response:
    soup = BeautifulSoup(response, 'html.parser')

pattern = re.compile('Fixed text')

# In bs4, find_all('td', string=...) only returns tags whose .string matches,
# and .string is None for a <td> that has several children. Search the text
# nodes instead and keep those sitting directly inside <td class="pos">.
columns = [
    text
    for text in soup.find_all(string=pattern)
    if text.parent.name == 'td' and 'pos' in (text.parent.get('class') or [])
]

if columns:
    print(type(columns[0]))
else:
    print("The 'columns' list is empty.")

# Each match is a text node; its parent <td> holds the <strong> value.
print([t.parent.find('strong').text for t in columns])

print(soup.find('strong'))

# Look the element up once instead of repeating the same search, and guard
# against None so a missing element cannot raise AttributeError on .parent.
strong = soup.find('strong', string='some value')
print(strong)
if strong is not None:
    print(strong.parent)
    if strong == soup.find('strong'):
        print("Both elements are the same.")
    if strong.parent == soup.find('strong'):
        print("Parent of 'some value' is the same as 'strong'.")