Skip to content

Instantly share code, notes, and snippets.

@rcackerman
Created July 3, 2012 22:42
Show Gist options
  • Select an option

  • Save rcackerman/3043895 to your computer and use it in GitHub Desktop.

Select an option

Save rcackerman/3043895 to your computer and use it in GitHub Desktop.
Finding a block of text using a string
import urllib2
from bs4 import BeautifulSoup
import re
def findBlockWithString(string, childtag, contentattrs, soup):
    """Find a tag inside a block of HTML, locating the block by a string.

    The lookup goes through three steps:

    1. find the ``childtag`` tag inside ``soup`` whose text is ``string``;
    2. step up to that tag's parent, which should contain both the text
       searched for and the tag you want;
    3. inside that parent, find the first tag matching ``contentattrs``.

    In practice, use this function on a block of HTML that has an
    identifying string in a tag next to the tag you want.

    Keyword arguments:
    string: the string that identifies the larger block of HTML
    childtag: the name of the tag that includes the string (e.g. "h2")
    contentattrs: the attribute filter (typically the class) of the tag
        we want; it is handed to ``find(attrs=...)`` unchanged
    soup: whatever part of the parsed HTML should be searched

    Returns:
    The ``.string`` of the wanted tag when every step succeeds.  When any
    step finds nothing, the tuple ``(string, "None found")`` is returned
    instead, so callers can tell which search string had no result.
    """
    try:
        label = soup.find(childtag, text=string)
        block = label.find_parent()
        wanted = block.find(attrs=contentattrs)
        return wanted.string
    except AttributeError:
        # A lookup that finds nothing yields None, so the next attribute
        # access on it raises AttributeError; treat that as "not found".
        return string, "None found"
def findAttrBlock(attrForBlock, attrForString, soup):
    """Find a tag inside a block of HTML, using attribute searches only.

    The lookup goes through two steps:

    1. find the first tag in ``soup`` matching ``attrForBlock``; this
       should be a tag that includes the tag you want;
    2. inside that block, find the first tag matching ``attrForString``.

    Keyword arguments:
    attrForBlock: the attribute filter (typically the class) of the
        enclosing block; handed to ``find(attrs=...)`` unchanged
    attrForString: the attribute filter (typically the class) of the tag
        we want inside that block; handed to ``find(attrs=...)`` unchanged
    soup: whatever part of the parsed HTML should be searched

    Returns:
    The ``.string`` of the wanted tag, or the literal string
    ``"None found"`` when either lookup finds nothing.
    """
    try:
        block = soup.find(attrs=attrForBlock)
        wanted = block.find(attrs=attrForString)
        return wanted.string
    except AttributeError:
        # A lookup that finds nothing yields None, so the next attribute
        # access on it raises AttributeError; treat that as "not found".
        return "None found"
def findNumPages(soup):
    """Return the page number carried by the pager's "last" link.

    The function looks up the tag matching ``"pager-last last"`` in
    ``soup``, takes the ``<a>`` tag inside it, and extracts the digits
    that follow ``page=`` in that link's markup.

    Keyword arguments:
    soup: the parsed listing page

    Returns:
    The page number as a string of digits (callers convert with ``int``).
    When the page has no "last" pager entry at all, ``"0"`` is returned,
    on the assumption that a listing without a pager has a single page.

    Raises:
    ValueError: a "last" pager entry exists but no ``page=<digits>`` could
        be found in its link.
    """
    pager_item = soup.find(attrs="pager-last last")
    if pager_item is None:
        # No "last" entry to read a number from.
        return "0"

    # The regex runs over the rendered markup of the link, so the number
    # is picked up from the href's query string.
    last_link = str(pager_item.find("a"))
    match = re.search(r".+page=([0-9]+)", last_link)
    if match is None:
        raise ValueError("no page number found in pager link: %s" % last_link)
    return match.group(1)
##################
## Start
##################
# Every link to a provider page found on the listing pages ends up here.
providerPages = []
# Starting with the first page: it is fetched only to read the pager and
# learn the number of the last page.
page = urllib2.urlopen("http://direct2food.org/taxonomy/term/14")
soup = BeautifulSoup(page)
numPages = int(findNumPages(soup))
# NOTE(review): the pager markup looks like Drupal's, whose "page" query
# parameter is zero-based (page=0 is the first page and the "last" link
# carries the index of the last page).  Starting at 1 skipped the first
# page's links, so iterate from 0 through numPages inclusive.
for i in range(numPages + 1):
    url = "http://direct2food.org/taxonomy/term/14?page=" + str(i)
    print(url)
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page)
    print(soup.original_encoding)
    # The links to the program information are located in the content-area.
    contentarea = soup.find("div", id="content-area")
    links = contentarea.find_all(href=re.compile("^/node/"))
    print(links)
    providerPages.extend(links)
# page = urllib2.urlopen("http://direct2food.org/node/3387")
# soup = BeautifulSoup(page)
#
# header = soup.find(id="content-header")
# centersection = soup.find(id="content-area")
#
# agency = header.find(attrs="title").string
# print agency
#
# address = centersection.find(attrs="street-address").string
# city = centersection.find(attrs="locality").string
# state = centersection.find(attrs="region").string
# zip = centersection.find(attrs="postal-code").string
# print address, city, state, zip
#
# phone = findAttrBlock(re.compile(".+phone-number"), "field-item odd", centersection)
# print phone
#
# operationhours = findAttrBlock(re.compile(".+operation-hours"), "field-item odd", centersection)
# servicesoffered = findBlockWithString("Services Provided", "h2", "field-content", centersection)
# populationserved = findBlockWithString("Population Served", "h2", "field-content", centersection)
# additionalreqs = findBlockWithString("Program Requirements", "h2", "field-content", centersection)
# additionalinfo = findAttrBlock(re.compile(".+additional-info"), "field-item odd", centersection)
#
# print "Operation hours:", operationhours, "\n", "Services offered:", servicesoffered, "\n", "Population served:", populationserved, "\n", "Additional Info:", additionalinfo, "\n", "Additonal Requirements", additionalreqs, "\n"
import urllib2
from bs4 import BeautifulSoup
import re
def findBlockWithString(string, childtag, contentattrs, soup):
    """Debugging variant: find a tag near a string and print each step.

    Finds the ``childtag`` tag in ``soup`` whose text is ``string``, steps
    up to its parent, and looks inside that parent for the first tag
    matching ``contentattrs``.  The tag name, the search string, and the
    parent block are printed along the way.

    Keyword arguments:
    string: the string that identifies the larger block of HTML
    childtag: the name of the tag that includes the string
    contentattrs: the attribute filter (typically the class) of the tag
        we want; handed to ``find(attrs=...)`` unchanged
    soup: whatever part of the parsed HTML should be searched

    Returns:
    Whatever the final ``find`` returns: the matching tag itself (not its
    text), or None when the block holds no such tag.  Nothing is caught
    here, so a failed first lookup raises AttributeError.
    """
    # Single-argument print(...) produces the same output under Python 2
    # and Python 3.
    print(childtag)
    print(string)
    block = soup.find(childtag, text=string).find_parent()
    print(block)
    return block.find(attrs=contentattrs)
page = urllib2.urlopen("http://direct2food.org/node/3386")
soup = BeautifulSoup(page)
# Pass the parsed document itself.  The quoted name "soup" is a plain str,
# and str.find() does not accept the text= keyword the function uses.
findBlockWithString("Program Requirements", "h2", "field-content", soup)
<div id="content-header">
<h2 class="agency-type">Senior Congregate Dining Site </h2>
<h1 class="title">Langley Apartments</h1>
</div>
<div id="content-area">
<div class="street-address">4930 South Langley</div>
<span class="locality">Chicago</span>,
<span class="region">IL</span>,
<span class="postal-code">60615</span></div></div>
</div>
</div></div> <!-- /node-inner, /node -->
</div>
</div>
<div class="panel-region-separator"> </div><div class="panel-pane pane-content-fieldgroup pane-program-profile-group-program-contact">
<div class="pane-content">
<div class="fieldgroup group-program-contact">
<div class="content">
<div class="field field-type-text field-field-phone-number">
<div class="field-label">Phone Number: </div>
<div class="field-items">
<div class="field-item odd">
312-744-4016 </div>
</div>
</div>
</div>
</div>
</div>
</div>
<div class="panel-region-separator"> </div><div class="panel-pane pane-custom pane-1">
<div class="pane-content">
<p><em>Please call agency to confirm hours of operation.</em></p>
</div>
</div>
<div class="panel-region-separator"> </div><div class="panel-pane pane-node-links">
<div class="pane-content">
<ul class="links"><li class="sharethis_link first last"><a class="sharethis-link" href="http://direct2food.org/node/3386" rel="nofollow" title="Langley Apartments">ShareThis</a></li>
</ul> </div>
</div>
</div>
</div>
<div class="b-edge"><div class="l"></div><div class="r"></div></div>
</div>
</div>
</div>
</div>
<div class="panel-col-last panel-panel">
<div class="inside"><div class="rounded-corner">
<div class="wrap-corner">
<div class="t-edge"><div class="l"></div><div class="r"></div></div>
<div class="l-edge">
<div class="r-edge clear-block">
<div class="panel-pane pane-views pane-d2f-taxonomy-list">
<h2 class="pane-title">Services Provided</h2>
<div class="pane-content">
<div class="view view-d2f-taxonomy-list view-id-d2f_taxonomy_list view-display-id-panel_pane_1 view-dom-id-1">
<div class="view-content">
<div class="views-row views-row-1 views-row-odd views-row-first views-row-last">
<div class="views-field-tid">
<span class="field-content">Senior Congregate Dining Site</span>
</div>
</div>
</div>
</div> </div>
</div>
<div class="panel-region-separator"> </div><div class="panel-pane pane-views pane-d2f-taxonomy-list">
<h2 class="pane-title">Population Served</h2>
<div class="pane-content">
<div class="view view-d2f-taxonomy-list view-id-d2f_taxonomy_list view-display-id-panel_pane_2 view-dom-id-2">
<div class="view-content">
<div class="views-row views-row-1 views-row-odd views-row-first views-row-last">
<div class="views-field-tid">
<span class="field-content">People age 60 and older</span>
</div>
</div>
</div>
</div> </div>
</div>
<div class="panel-region-separator"> </div><div class="panel-pane pane-views pane-d2f-taxonomy-list">
<h2 class="pane-title">Program Requirements</h2>
<div class="pane-content">
<div class="view view-d2f-taxonomy-list view-id-d2f_taxonomy_list view-display-id-panel_pane_3 view-dom-id-3">
<div class="view-content">
<div class="views-row views-row-1 views-row-odd views-row-first views-row-last">
<div class="views-field-tid">
<span class="field-content">There are no additional requirements.</span>
</div>
</div>
</div>
</div> </div>
</div>
</div>
</div>
<div class="b-edge"><div class="l"></div><div class="r"></div></div>
</div>
</div>
</div>
</div>
</div>
<div class="panel-col-bottom panel-panel">
<div class="inside"><div class="rounded-corner">
<div class="wrap-corner">
<div class="t-edge"><div class="l"></div><div class="r"></div></div>
<div class="l-edge">
<div class="r-edge clear-block">
<div class="panel-pane pane-block pane-gmap-location-0">
<h2 class="pane-title">Location Map</h2>
<div class="pane-content">
<div class="gmap-control gmap-gmap gmap gmap-map gmap-auto1map-gmap" id="gmap-auto1map-gmap0" style="width: 100%; height: 200px;">Javascript is required to view this map.</div> </div>
</div>
</div>
</div>
<div class="b-edge"><div class="l"></div><div class="r"></div></div>
</div>
</div>
</div>
</div>
</div>
</div>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment