Last active
August 29, 2015 14:06
-
-
Save lovesh/1f5e413581ffdf45de08 to your computer and use it in GitHub Desktop.
playing with HTML DOM using lxml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import lxml.html | |
import urllib2 | |
class DOM(object):
    """Small convenience wrapper around an lxml HTML document.

    The document is parsed once in ``__init__``. When parsing fails,
    ``self.document`` is set to ``False`` and every query method returns
    ``False`` instead of raising.
    """

    # Characters trimmed from both ends of table-cell text.
    _CELL_STRIP_CHARS = ':\n\t\r '

    def __init__(self, url=None, html=None, utf8=False):
        """Parse HTML fetched from ``url`` or passed directly as ``html``.

        :param url: when truthy, the page is downloaded with ``urllib2`` and
            its body replaces whatever was passed as ``html``.
        :param html: raw HTML, used when ``url`` is not given.
        :param utf8: when true, the HTML is decoded as ISO-8859-1 and
            re-encoded as UTF-8 before parsing.
        """
        if url:
            page = urllib2.urlopen(url)
            try:
                html = page.read()
            finally:
                # Release the connection even if read() fails.
                page.close()
        # Skip transcoding when there is nothing to transcode; the parse
        # below then fails cleanly and leaves ``document`` as False.
        if utf8 and html is not None:
            html = html.decode('iso-8859-1').encode('utf8')
        self.html = html
        try:
            self.document = lxml.html.document_fromstring(self.html)
        except Exception:
            # Best effort: an unparsable (or missing) document is signalled
            # with False. KeyboardInterrupt/SystemExit are not swallowed.
            self.document = False

    def getNodesWithXpath(self, xpath):
        """Return a copy of the nodes matching ``xpath``.

        The raw result is also stored on ``self.nodes``. Returns ``False``
        when no document was parsed.
        """
        if self.document is False:
            return False
        self.nodes = self.document.xpath(xpath)
        return self.nodes[:]

    def getLinksWithXpath(self, xpath):
        """Return ``[text, href]`` pairs for the anchors matching ``xpath``.

        The matched nodes are stored on ``self.nodes``. Returns ``False``
        when no document was parsed.
        """
        if self.document is False:
            return False
        self.nodes = self.document.xpath(xpath)
        return [[anchor.text_content(), anchor.get('href')]
                for anchor in self.nodes]

    def getImgUrlWithXpath(self, xpath):
        """Return the ``src`` attribute of every node matching ``xpath``.

        The matched nodes are stored on ``self.nodes``. Returns ``False``
        when no document was parsed, consistent with the other query methods.
        """
        if self.document is False:
            return False
        self.nodes = self.document.xpath(xpath)
        return [img.get('src') for img in self.nodes]

    def parseTBody(self, tbody_xpath):
        """Parse the first ``<tbody>`` matching ``tbody_xpath`` into a dict.

        Each ``<tr>`` is expected to hold two ``<td>`` cells: the first
        becomes the key and the second the value (see ``parseTBodyNode``).

        Returns ``False`` when no document was parsed and an empty dict when
        the XPath matches nothing.
        """
        if self.document is False:
            return False
        matches = self.document.xpath(tbody_xpath)
        if not matches:
            return {}
        return self.parseTBodyNode(matches[0])

    def parseTBodyNode(self, tbody):
        """Parse a ``<tbody>`` node into a dict.

        For every ``<tr>`` child with at least two ``<td>`` cells, the first
        cell's text (trimmed, lower-cased, dots removed) is the key and the
        second cell's trimmed text is the value. Rows with fewer than two
        cells are skipped; later duplicate keys overwrite earlier ones.
        """
        data = {}
        strip_chars = self._CELL_STRIP_CHARS
        for tr in tbody.xpath('tr'):
            tds = tr.xpath('td')
            if len(tds) > 1:
                key = tds[0].text_content().strip(strip_chars)
                key = key.lower().replace('.', '')
                data[key] = tds[1].text_content().strip(strip_chars)
        return data
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment