lovesh · August 29, 2015 14:06
diff --git a/dom.py b/dom.py
 import lxml.html
 import urllib2


 class DOM(object):
    """Small wrapper around an lxml HTML document offering XPath helpers.

    The markup comes either from downloading ``url`` or from the ``html``
    argument.  When the markup cannot be parsed, ``self.document`` is set to
    ``False`` and the methods that query ``self.document`` return ``False``
    instead of raising.
    """

    def __init__(self, url=None, html=None, utf8=False):
        """Optionally fetch, then parse, an HTML document.

        Args:
            url: When truthy, the page is fetched with ``urllib2.urlopen``
                and its body replaces ``html``.
            html: Markup to parse when ``url`` is not given.
            utf8: When true, the markup is decoded as ISO-8859-1 and
                re-encoded as UTF-8 before parsing.
        """
        if url:
            page = urllib2.urlopen(url)
            try:
                html = page.read()
            finally:
                # Close the response even when reading it fails.
                page.close()
        # Skip the transcoding when there is no markup at all; the parse
        # below then fails and leaves ``self.document`` as False.
        if utf8 and html is not None:
            html = html.decode('iso-8859-1').encode('utf8')
        self.html = html
        try:
            self.document = lxml.html.document_fromstring(self.html)
        except Exception:
            # Deliberate best effort: an unparsable page is reported through
            # the False sentinel.  ``Exception`` (rather than a bare except)
            # lets KeyboardInterrupt and SystemExit propagate.
            self.document = False

    def getNodesWithXpath(self, xpath):
        """Return a copy of the nodes matching ``xpath``.

        The raw result is also stored on ``self.nodes``.  Returns ``False``
        when the document could not be parsed.
        """
        if self.document is False:
            return False
        self.nodes = self.document.xpath(xpath)
        return self.nodes[:]

    def getLinksWithXpath(self, xpath):
        """Return ``[text, href]`` pairs for the nodes matching ``xpath``.

        The matched nodes are also stored on ``self.nodes``.  Returns
        ``False`` when the document could not be parsed.
        """
        if self.document is False:
            return False
        self.nodes = self.document.xpath(xpath)
        return [[anchor.text_content(), anchor.get('href')] for anchor in self.nodes]

    def getImgUrlWithXpath(self, xpath):
        """Return the ``src`` attribute of every node matching ``xpath``.

        The matched nodes are also stored on ``self.nodes``.  Returns
        ``False`` when the document could not be parsed, matching the other
        XPath helpers of this class.
        """
        if self.document is False:
            return False
        self.nodes = self.document.xpath(xpath)
        return [img.get('src') for img in self.nodes]

    def parseTBody(self, tbody_xpath):
        """Parse the first <tbody> matched by ``tbody_xpath`` into a dict.

        Each <tr> is expected to hold at least two <td>; the first cell
        becomes the key and the second the value (see ``parseTBodyNode``).

        Returns:
            The dict of parsed rows, or ``False`` when the document could
            not be parsed.

        Raises:
            IndexError: if ``tbody_xpath`` matches nothing.
        """
        if self.document is False:
            return False
        tbody = self.document.xpath(tbody_xpath)[0]
        return self.parseTBodyNode(tbody)

    def parseTBodyNode(self, tbody):
        """Parse a <tbody> node into a dict of first-cell -> second-cell.

        Rows with fewer than two <td> are skipped.  Keys are stripped of
        surrounding colons/whitespace, lower-cased and have their dots
        removed; values are only stripped.  A later row with the same key
        overwrites an earlier one.
        """
        strip_chars = ':\n\t\r '
        data = {}
        for tr in tbody.xpath('tr'):
            tds = tr.xpath('td')
            if len(tds) < 2:
                continue
            key = tds[0].text_content().strip(strip_chars).lower().replace('.', '')
            data[key] = tds[1].text_content().strip(strip_chars)
        return data
	import lxml.html
	import urllib2


	class DOM(object):
	    """Small wrapper around an lxml HTML document offering XPath helpers.

	    The markup comes either from downloading ``url`` or from the ``html``
	    argument.  When the markup cannot be parsed, ``self.document`` is set
	    to ``False`` and the methods that query ``self.document`` return
	    ``False`` instead of raising.
	    """

	    def __init__(self, url=None, html=None, utf8=False):
	        """Optionally fetch, then parse, an HTML document.

	        Args:
	            url: When truthy, the page is fetched with ``urllib2.urlopen``
	                and its body replaces ``html``.
	            html: Markup to parse when ``url`` is not given.
	            utf8: When true, the markup is decoded as ISO-8859-1 and
	                re-encoded as UTF-8 before parsing.
	        """
	        if url:
	            page = urllib2.urlopen(url)
	            try:
	                html = page.read()
	            finally:
	                # Close the response even when reading it fails.
	                page.close()
	        # Skip the transcoding when there is no markup at all; the parse
	        # below then fails and leaves ``self.document`` as False.
	        if utf8 and html is not None:
	            html = html.decode('iso-8859-1').encode('utf8')
	        self.html = html
	        try:
	            self.document = lxml.html.document_fromstring(self.html)
	        except Exception:
	            # Deliberate best effort: an unparsable page is reported
	            # through the False sentinel.  ``Exception`` (rather than a
	            # bare except) lets KeyboardInterrupt/SystemExit propagate.
	            self.document = False

	    def getNodesWithXpath(self, xpath):
	        """Return a copy of the nodes matching ``xpath``.

	        The raw result is also stored on ``self.nodes``.  Returns
	        ``False`` when the document could not be parsed.
	        """
	        if self.document is False:
	            return False
	        self.nodes = self.document.xpath(xpath)
	        return self.nodes[:]

	    def getLinksWithXpath(self, xpath):
	        """Return ``[text, href]`` pairs for the nodes matching ``xpath``.

	        The matched nodes are also stored on ``self.nodes``.  Returns
	        ``False`` when the document could not be parsed.
	        """
	        if self.document is False:
	            return False
	        self.nodes = self.document.xpath(xpath)
	        return [[anchor.text_content(), anchor.get('href')] for anchor in self.nodes]

	    def getImgUrlWithXpath(self, xpath):
	        """Return the ``src`` attribute of every node matching ``xpath``.

	        The matched nodes are also stored on ``self.nodes``.  Returns
	        ``False`` when the document could not be parsed, matching the
	        other XPath helpers of this class.
	        """
	        if self.document is False:
	            return False
	        self.nodes = self.document.xpath(xpath)
	        return [img.get('src') for img in self.nodes]

	    def parseTBody(self, tbody_xpath):
	        """Parse the first <tbody> matched by ``tbody_xpath`` into a dict.

	        Each <tr> is expected to hold at least two <td>; the first cell
	        becomes the key and the second the value (see ``parseTBodyNode``).

	        Returns:
	            The dict of parsed rows, or ``False`` when the document could
	            not be parsed.

	        Raises:
	            IndexError: if ``tbody_xpath`` matches nothing.
	        """
	        if self.document is False:
	            return False
	        tbody = self.document.xpath(tbody_xpath)[0]
	        return self.parseTBodyNode(tbody)

	    def parseTBodyNode(self, tbody):
	        """Parse a <tbody> node into a dict of first-cell -> second-cell.

	        Rows with fewer than two <td> are skipped.  Keys are stripped of
	        surrounding colons/whitespace, lower-cased and have their dots
	        removed; values are only stripped.  A later row with the same key
	        overwrites an earlier one.
	        """
	        strip_chars = ':\n\t\r '
	        data = {}
	        for tr in tbody.xpath('tr'):
	            tds = tr.xpath('td')
	            if len(tds) < 2:
	                continue
	            key = tds[0].text_content().strip(strip_chars).lower().replace('.', '')
	            data[key] = tds[1].text_content().strip(strip_chars)
	        return data
No results found