Skip to content

Instantly share code, notes, and snippets.

@EdwardIII
Created December 16, 2013 20:54
Show Gist options
  • Save EdwardIII/7994229 to your computer and use it in GitHub Desktop.
Save EdwardIII/7994229 to your computer and use it in GitHub Desktop.
import mechanize
from lxml import etree
from StringIO import StringIO
def get_text_recursive(node):
    """Return all text found in *node* and its descendants, in document order.

    The result is the node's own ``text``, followed by the recursive text of
    each child, followed by the node's ``tail``.  Missing (``None``) text or
    tail values count as empty strings.  Note that the tail of *node* itself
    is part of the result, not only the tails of its descendants.
    """
    pieces = [node.text or '']
    for child in node:
        pieces.append(get_text_recursive(child))
    pieces.append(node.tail or '')
    return ''.join(pieces)
def browser_to_xml(browser):
    """Parse the HTML held by *browser* into an lxml element tree.

    *browser* may be an object exposing ``response()`` (whose result is then
    read), or an object that can be read directly.  The second form is used
    whenever the first attempt raises ``AttributeError``.

    Returns the tree produced by ``etree.parse`` with an ``HTMLParser``.
    """
    try:
        markup = browser.response().read()
    except AttributeError:
        # No usable response() here, so treat the argument itself as the
        # response object and read from it directly.
        markup = browser.read()
    return etree.parse(StringIO(markup), etree.HTMLParser())
def search(term, **kwargs):
    """Submit *term* through a site's search form and return the results page.

    Keyword arguments:
        base_url: root URL of the site (required).
        search_page: path of the page holding the search form, joined onto
            ``base_url`` with exactly one slash (required; may be '').
        search_form_name: name of the form to select, passed to
            ``select_form(name=...)``.
        search_form_number: position of the form to select, passed to
            ``select_form(nr=...)``.  Only used when ``search_form_name``
            is not given.
        search_input_name: name of the form control that receives *term*
            (required).
        browser: an existing browser to reuse; a new ``mechanize.Browser``
            is created when this is missing or falsy.

    Returns:
        The parsed results page, as produced by ``browser_to_xml``.

    Raises:
        RuntimeError: neither ``search_form_name`` nor ``search_form_number``
            was supplied.  This is checked before any page is opened.
        KeyError: one of the required keyword arguments is missing.
    """
    # Validate up front so a bad call fails before any network traffic.
    use_form_name = 'search_form_name' in kwargs
    if not use_form_name and 'search_form_number' not in kwargs:
        raise RuntimeError('Must specify either search_form_name or search_form_number in kwargs')

    browser = kwargs.get('browser') or mechanize.Browser()

    # Trim slashes on both sides of the join so a base_url ending in '/'
    # (or a search_page starting with one) does not produce '//' in the URL.
    search_url = '%s/%s' % (kwargs['base_url'].rstrip('/'),
                            kwargs['search_page'].lstrip('/'))
    browser.open(search_url)

    if use_form_name:
        browser.select_form(name=kwargs['search_form_name'])
    else:
        browser.select_form(nr=kwargs['search_form_number'])

    browser[kwargs['search_input_name']] = term
    # NOTE(review): an earlier note here said the form action needs to be
    # "method=quickInventorySearch&invntrySearch=inventorySearch"; nothing in
    # this function sets it -- confirm whether that is still required.
    browser.submit()
    return browser_to_xml(browser)
def _last_text(tree, expression):
    """Return the stripped last text node matching *expression*, or None if nothing matches."""
    matches = tree.xpath(expression)
    if not matches:
        return None
    return matches[-1].strip()


def search_example(term):
    """Search www.example.com for *term* and scrape each product page found.

    Runs the site search, follows every "more info" link in the product
    list, and collects one dict per product with the keys ``description``,
    ``oem_part_no``, ``link``, ``location``, ``condition``, ``price`` and
    ``currency_code``.  ``location`` and ``currency_code`` are always
    ``None``; any other scraped field is ``None`` when the product page has
    no match for it.  Links without an ``href`` are skipped.

    Each followed URL is printed to stdout.

    Returns:
        A list of result dicts, in the order the links appear on the
        results page.
    """
    # No trailing slash: search() joins base_url and search_page with '/',
    # so a trailing slash here would open 'http://www.example.com//'.
    base_url = 'http://www.example.com'
    browser = mechanize.Browser()
    root = search(
        term,
        base_url=base_url,
        search_page='',
        search_form_number=0,
        search_input_name='search_query',
        browser=browser,
    )
    more_info_link_nodes = root.xpath('//*[@id="product_list"]/li/div[2]/a[2]')

    results = []
    for node in more_info_link_nodes:
        # These guys already have the base url in their href
        prod_url = node.get('href')
        if not prod_url:
            # Nothing to follow for an anchor without a target.
            continue
        print(prod_url)
        prod_node = browser_to_xml(browser.open(prod_url))

        condition = _last_text(prod_node, '//ul/li/span/../text()')
        if condition is not None:
            # The text node carries a 'STATUS' label alongside the value.
            condition = condition.replace('STATUS', '').strip()

        results.append({
            'description': _last_text(prod_node, '//*[@id="primary_block"]/h1/text()'),
            'oem_part_no': _last_text(prod_node, '//*[@id="short_description_content"]/h1/text()'),
            'link': prod_url,
            'location': None,  # example doesn't provide a location (although they do provide a lead-time)
            'condition': condition,
            'price': _last_text(prod_node, '//*[@id="our_price_display"]/text()'),
            'currency_code': None,  # no currency code either
        })
    return results
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment