Created
July 3, 2012 22:42
-
-
Save rcackerman/3043895 to your computer and use it in GitHub Desktop.
Finding a block of text using a string
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import urllib2 | |
| from bs4 import BeautifulSoup | |
| import re | |
def findBlockWithString(string, childtag, contentattrs, soup):
    """Return the text of a tag that sits next to an element holding `string`.

    Use this on HTML where the tag you want has no unique marker of its own
    but lives beside an element with identifying text. The lookup runs in
    three steps:
      1. find the `childtag` element in `soup` whose text matches `string`;
      2. step up to that element's parent, which is expected to wrap both
         the matched element and the tag we are after;
      3. inside that parent, find the tag matching `contentattrs`.

    Keyword arguments:
    string: the identifying text to search for
    childtag: name of the tag that contains `string` (e.g. "h2")
    contentattrs: attribute filter (typically the class) of the wanted tag
    soup: the parsed HTML, or any part of it, to search in

    Returns the `.string` of the wanted tag. If any step raises
    AttributeError (for example because a lookup returned None), the tuple
    (string, "None found") is returned instead.
    """
    try:
        matchedChild = soup.find(childtag, text=string)
        wantedTag = matchedChild.find_parent().find(attrs=contentattrs)
        wantedText = wantedTag.string
    except AttributeError:
        # Any missing link in the chain ends up here; report which search
        # string produced no result.
        return string, "None found"
    return wantedText
def findAttrBlock(attrForBlock, attrForString, soup):
    """Return the text of a tag nested inside an attribute-identified block.

    The lookup runs in two steps: find the tag in `soup` that matches
    `attrForBlock`, then find the tag matching `attrForString` inside it.
    In practice `attrForBlock` should select a tag that contains the tag
    you want.

    Keyword arguments:
    attrForBlock: attribute filter (typically the class) of the outer block
    attrForString: attribute filter (typically the class) of the wanted tag
    soup: the parsed HTML, or any part of it, to search in

    Returns the `.string` of the wanted tag, or "None found" if any step
    raises AttributeError (for example because a lookup returned None).
    """
    try:
        wantedText = soup.find(attrs=attrForBlock).find(attrs=attrForString).string
    except AttributeError:
        return "None found"
    return wantedText
def findNumPages(soup):
    """Return the page number carried by the pager's "last page" link.

    Looks up the element matching the attribute value "pager-last last",
    takes the <a> inside it and extracts the digits that follow "page=" in
    the link's markup.

    Keyword arguments:
    soup: the parsed HTML of a listing page

    Returns the number as a string; callers convert it with int(). Returns
    "0" when the page has no "last" pager item, the item has no link, or
    the link carries no "page=" parameter, so a listing without a pager
    yields 0 instead of raising AttributeError.
    """
    pagerItem = soup.find(attrs="pager-last last")
    if pagerItem is None:
        # No pager on the page at all.
        return "0"
    # str() of a missing link is "None", which simply fails the match below.
    lastPage = str(pagerItem.find("a"))
    match = re.search(".+page=([0-9]+)", lastPage)
    if match is None:
        return "0"
    return match.group(1)
| ################## | |
| ## Start | |
| ################## | |
# Collect the links to every provider page listed under taxonomy term 14.
providerPages = []
listingUrl = "http://direct2food.org/taxonomy/term/14"

# Fetch the first listing page once; its pager tells us how far to go.
page = urllib2.urlopen(listingUrl)
soup = BeautifulSoup(page)
numPages = int(findNumPages(soup))

# Walk pages 0..numPages inclusive. Starting at 1 would drop the links on
# the first listing page, which is fetched above but was never scanned.
# NOTE(review): assumes the pager's "page" parameter is zero-based (page=0 is
# the first page) - confirm against the site.
for i in range(numPages + 1):
    url = listingUrl + "?page=" + str(i)
    print(url)
    if i > 0:
        # Page 0 is the listing already fetched above; only the later
        # pages need a new request.
        page = urllib2.urlopen(url)
        soup = BeautifulSoup(page)
    print(soup.original_encoding)
    # The links to the program information are located in the content-area.
    contentarea = soup.find("div", id="content-area")
    links = contentarea.find_all(href=re.compile("^/node/"))
    print(links)
    providerPages.extend(links)
| # page = urllib2.urlopen("http://direct2food.org/node/3387") | |
| # soup = BeautifulSoup(page) | |
| # | |
| # header = soup.find(id="content-header") | |
| # centersection = soup.find(id="content-area") | |
| # | |
| # agency = header.find(attrs="title").string | |
| # print agency | |
| # | |
| # address = centersection.find(attrs="street-address").string | |
| # city = centersection.find(attrs="locality").string | |
| # state = centersection.find(attrs="region").string | |
| # zip = centersection.find(attrs="postal-code").string | |
| # print address, city, state, zip | |
| # | |
| # phone = findAttrBlock(re.compile(".+phone-number"), "field-item odd", centersection) | |
| # print phone | |
| # | |
| # operationhours = findAttrBlock(re.compile(".+operation-hours"), "field-item odd", centersection) | |
| # servicesoffered = findBlockWithString("Services Provided", "h2", "field-content", centersection) | |
| # populationserved = findBlockWithString("Population Served", "h2", "field-content", centersection) | |
| # additionalreqs = findBlockWithString("Program Requirements", "h2", "field-content", centersection) | |
| # additionalinfo = findAttrBlock(re.compile(".+additional-info"), "field-item odd", centersection) | |
| # | |
| # print "Operation hours:", operationhours, "\n", "Services offered:", servicesoffered, "\n", "Population served:", populationserved, "\n", "Additional Info:", additionalinfo, "\n", "Additonal Requirements", additionalreqs, "\n" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import urllib2 | |
| from bs4 import BeautifulSoup | |
| import re | |
def findBlockWithString(string, childtag, contentattrs, soup):
    """Debugging variant of the string-based block lookup.

    Finds the `childtag` element in `soup` whose text matches `string`,
    steps up to its parent and returns the tag inside that parent matching
    `contentattrs`. Along the way it prints `childtag`, `string` and the
    parent block so each step can be inspected.

    Keyword arguments:
    string: the identifying text to search for
    childtag: name of the tag that contains `string`
    contentattrs: attribute filter (typically the class) of the wanted tag
    soup: the parsed HTML, or any part of it, to search in

    Returns the matching tag itself (not its text). Nothing is caught
    here: a failed lookup surfaces as AttributeError.
    """
    print(childtag)
    print(string)
    matchedChild = soup.find(childtag, text=string)
    parentBlock = matchedChild.find_parent()
    print(parentBlock)
    return parentBlock.find(attrs=contentattrs)
# Fetch a single provider page and run the lookup against it.
page = urllib2.urlopen("http://direct2food.org/node/3386")
soup = BeautifulSoup(page)
# Pass the parsed document itself. The string literal "soup" would send the
# call to str.find, which rejects the text= keyword with a TypeError.
findBlockWithString("Program Requirements", "h2", "field-content", soup)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| <div id="content-header"> | |
| <h2 class="agency-type">Senior Congregate Dining Site </h2> | |
| <h1 class="title">Langley Apartments</h1> | |
| </div> | |
| <div id="content-area"> | |
| <div class="street-address">4930 South Langley</div> | |
| <span class="locality">Chicago</span>, | |
| <span class="region">IL</span>, | |
| <span class="postal-code">60615</span></div></div> | |
| </div> | |
| </div></div> <!-- /node-inner, /node --> | |
| </div> | |
| </div> | |
| <div class="panel-region-separator"> </div><div class="panel-pane pane-content-fieldgroup pane-program-profile-group-program-contact"> | |
| <div class="pane-content"> | |
| <div class="fieldgroup group-program-contact"> | |
| <div class="content"> | |
| <div class="field field-type-text field-field-phone-number"> | |
| <div class="field-label">Phone Number: </div> | |
| <div class="field-items"> | |
| <div class="field-item odd"> | |
| 312-744-4016 </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <div class="panel-region-separator"> </div><div class="panel-pane pane-custom pane-1"> | |
| <div class="pane-content"> | |
| <p><em>Please call agency to confirm hours of operation.</em></p> | |
| </div> | |
| </div> | |
| <div class="panel-region-separator"> </div><div class="panel-pane pane-node-links"> | |
| <div class="pane-content"> | |
| <ul class="links"><li class="sharethis_link first last"><a class="sharethis-link" href="http://direct2food.org/node/3386" rel="nofollow" title="Langley Apartments">ShareThis</a></li> | |
| </ul> </div> | |
| </div> | |
| </div> | |
| </div> | |
| <div class="b-edge"><div class="l"></div><div class="r"></div></div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <div class="panel-col-last panel-panel"> | |
| <div class="inside"><div class="rounded-corner"> | |
| <div class="wrap-corner"> | |
| <div class="t-edge"><div class="l"></div><div class="r"></div></div> | |
| <div class="l-edge"> | |
| <div class="r-edge clear-block"> | |
| <div class="panel-pane pane-views pane-d2f-taxonomy-list"> | |
| <h2 class="pane-title">Services Provided</h2> | |
| <div class="pane-content"> | |
| <div class="view view-d2f-taxonomy-list view-id-d2f_taxonomy_list view-display-id-panel_pane_1 view-dom-id-1"> | |
| <div class="view-content"> | |
| <div class="views-row views-row-1 views-row-odd views-row-first views-row-last"> | |
| <div class="views-field-tid"> | |
| <span class="field-content">Senior Congregate Dining Site</span> | |
| </div> | |
| </div> | |
| </div> | |
| </div> </div> | |
| </div> | |
| <div class="panel-region-separator"> </div><div class="panel-pane pane-views pane-d2f-taxonomy-list"> | |
| <h2 class="pane-title">Population Served</h2> | |
| <div class="pane-content"> | |
| <div class="view view-d2f-taxonomy-list view-id-d2f_taxonomy_list view-display-id-panel_pane_2 view-dom-id-2"> | |
| <div class="view-content"> | |
| <div class="views-row views-row-1 views-row-odd views-row-first views-row-last"> | |
| <div class="views-field-tid"> | |
| <span class="field-content">People age 60 and older</span> | |
| </div> | |
| </div> | |
| </div> | |
| </div> </div> | |
| </div> | |
| <div class="panel-region-separator"> </div><div class="panel-pane pane-views pane-d2f-taxonomy-list"> | |
| <h2 class="pane-title">Program Requirements</h2> | |
| <div class="pane-content"> | |
| <div class="view view-d2f-taxonomy-list view-id-d2f_taxonomy_list view-display-id-panel_pane_3 view-dom-id-3"> | |
| <div class="view-content"> | |
| <div class="views-row views-row-1 views-row-odd views-row-first views-row-last"> | |
| <div class="views-field-tid"> | |
| <span class="field-content">There are no additional requirements.</span> | |
| </div> | |
| </div> | |
| </div> | |
| </div> </div> | |
| </div> | |
| </div> | |
| </div> | |
| <div class="b-edge"><div class="l"></div><div class="r"></div></div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <div class="panel-col-bottom panel-panel"> | |
| <div class="inside"><div class="rounded-corner"> | |
| <div class="wrap-corner"> | |
| <div class="t-edge"><div class="l"></div><div class="r"></div></div> | |
| <div class="l-edge"> | |
| <div class="r-edge clear-block"> | |
| <div class="panel-pane pane-block pane-gmap-location-0"> | |
| <h2 class="pane-title">Location Map</h2> | |
| <div class="pane-content"> | |
| <div class="gmap-control gmap-gmap gmap gmap-map gmap-auto1map-gmap" id="gmap-auto1map-gmap0" style="width: 100%; height: 200px;">Javascript is required to view this map.</div> </div> | |
| </div> | |
| </div> | |
| </div> | |
| <div class="b-edge"><div class="l"></div><div class="r"></div></div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment