-
-
Save EdwardIII/7994229 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import mechanize | |
from lxml import etree | |
from StringIO import StringIO | |
def get_text_recursive(node):
    """Return the concatenated text found in *node* and below it.

    The result is built from, in order: the node's own ``text``, the
    recursive result for each child (iterating the node yields its
    children), and finally the node's own ``tail``.  A ``text`` or ``tail``
    that is ``None`` (or otherwise falsy) contributes an empty string.
    """
    fragments = [node.text or '']
    for child in node:
        # Each child's contribution already ends with that child's tail.
        fragments.append(get_text_recursive(child))
    fragments.append(node.tail or '')
    return ''.join(fragments)
def browser_to_xml(browser):
    """Parse the HTML held by *browser* and return the resulting lxml tree.

    *browser* may be an object exposing ``response()`` (the body is then read
    from ``browser.response()``), or an object that is itself readable, in
    which case ``browser.read()`` is used instead.

    Returns the value of ``etree.parse`` run with an ``etree.HTMLParser``.
    """
    try:
        markup = browser.response().read()
    except AttributeError:
        # Any AttributeError raised above lands here; the argument is then
        # treated as an already-open response object and read directly.
        markup = browser.read()
    return etree.parse(StringIO(markup), etree.HTMLParser())
def search(term, **kwargs):
    """Submit *term* through a site's search form and parse the results page.

    Keyword arguments:
        base_url: Site root.  It is joined to ``search_page`` with exactly
            one slash, whether or not either side already carries one.
        search_page: Path of the page holding the search form (may be '').
        search_form_name: ``name`` of the form to select.  Takes precedence
            over ``search_form_number`` when both are supplied.
        search_form_number: Index handed to ``select_form(nr=...)``; used
            only when ``search_form_name`` is absent.
        search_input_name: Name of the form control that receives *term*.
        browser: Optional browser object to reuse.  When missing or falsy a
            new ``mechanize.Browser`` is created.

    Returns:
        Whatever ``browser_to_xml`` produces for the browser after the form
        has been submitted (the parsed search results page).

    Raises:
        KeyError: if ``base_url``, ``search_page`` or ``search_input_name``
            is not supplied.
        RuntimeError: if neither ``search_form_name`` nor
            ``search_form_number`` is supplied.  This is checked before any
            request is made, so a misconfigured call costs no network I/O.
    """
    browser = kwargs.get('browser') or mechanize.Browser()

    # Trim the slashes that meet at the join so that a base_url ending in '/'
    # does not yield 'http://host//page'.
    search_url = '%s/%s' % (
        kwargs['base_url'].rstrip('/'),
        kwargs['search_page'].lstrip('/'),
    )

    # Work out how to pick the form up front, so bad arguments fail fast.
    if 'search_form_name' in kwargs:
        form_selector = {'name': kwargs['search_form_name']}
    elif 'search_form_number' in kwargs:
        form_selector = {'nr': kwargs['search_form_number']}
    else:
        raise RuntimeError('Must specify either search_form_name or search_form_number in kwargs')

    browser.open(search_url)
    browser.select_form(**form_selector)
    browser[kwargs['search_input_name']] = term
    # NOTE(review): note carried over from the original author -- "we need the
    # action to be this: method=quickInventorySearch&invntrySearch=inventorySearch".
    # Nothing here sets the form action; confirm whether this is still needed.
    browser.submit()
    return browser_to_xml(browser)
def search_example(term):
    """Search www.example.com for *term* and scrape every product it lists.

    The search form is form number 0 on the site's front page and *term* is
    typed into its ``search_query`` input.  For each "more info" link found
    in the ``product_list`` element of the results, the link's href is
    printed, the product page is opened with the same browser, and a dict is
    built from that page.

    Returns a list of dicts with the keys ``description``, ``oem_part_no``,
    ``link``, ``location``, ``condition``, ``price`` and ``currency_code``
    (``location`` and ``currency_code`` are always ``None``).

    Each scraped field takes the last XPath match via ``.pop()``, so a
    product page lacking one of the expected elements raises ``IndexError``.
    """
    browser = mechanize.Browser()
    search_options = {
        'base_url': 'http://www.example.com/',
        'search_page': '',
        'search_form_number': 0,
        'search_input_name': 'search_query',
        'browser': browser,
    }
    results_page = search(term, **search_options)

    results = []
    for link in results_page.xpath('//*[@id="product_list"]/li/div[2]/a[2]'):
        # The hrefs on this site already include the base URL, so they are
        # opened and reported exactly as found.
        href = link.get('href')
        print(href)
        product_page = browser_to_xml(browser.open(href))

        description = product_page.xpath('//*[@id="primary_block"]/h1/text()').pop()
        oem_part_no = product_page.xpath('//*[@id="short_description_content"]/h1/text()').pop()
        condition = product_page.xpath('//ul/li/span/../text()').pop()
        price = product_page.xpath('//*[@id="our_price_display"]/text()').pop()

        results.append({
            'description': description.strip(),
            'oem_part_no': oem_part_no.strip(),
            'link': href,
            # The site gives no location (it does give a lead time).
            'location': None,
            'condition': condition.replace('STATUS', '').strip(),
            'price': price.strip(),
            # The site gives no currency code either.
            'currency_code': None,
        })
    return results
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment