Skip to content

Instantly share code, notes, and snippets.

@bmease
Created March 15, 2013 12:34
Show Gist options
  • Save bmease/5169588 to your computer and use it in GitHub Desktop.
Save bmease/5169588 to your computer and use it in GitHub Desktop.
Example of scraping the Alice in Vapeland website.
import requests
from lxml import html
def get_list():
    """Find the URLs of all the liquids linked from the index page.

    Returns:
        A list of the unique ``href`` values of the anchors found inside
        ``#yak-gallery-ejuice``, in the order they first appear on the
        page. Anchors without an ``href`` are skipped.

    Raises:
        requests.exceptions.RequestException: if the request fails, times
            out, or the server answers with an HTTP error status.
    """
    # A timeout keeps the script from hanging forever on a stalled server
    response = requests.get('https://aliceinvapeland.com/e-juices', timeout=30)
    # Fail loudly on a 4xx/5xx answer instead of scraping an error page
    response.raise_for_status()

    # Parse the response body into an element tree
    page = html.fromstring(response.text)

    # Collect the links, dropping duplicates while keeping page order so
    # that the result is the same from one run to the next
    liquid_urls = []
    seen = set()
    for anchor in page.cssselect('#yak-gallery-ejuice a'):
        url = anchor.get('href')
        # Skip anchors without an href and links already collected
        if url is None or url in seen:
            continue
        seen.add(url)
        liquid_urls.append(url)
    return liquid_urls
def get_page(url):
    """Fetch the page of a liquid and return its name, description and price.

    Args:
        url: Address of the liquid page to fetch.

    Returns:
        A ``(name, description, price)`` tuple holding the text of the first
        ``.entry-title``, ``.entry-content strong`` and ``.yak_price``
        elements, or ``None`` when the name contains one of the sampler
        keywords (the page is then not an individual liquid).

    Raises:
        requests.exceptions.RequestException: if the request fails, times
            out, or the server answers with an HTTP error status.
        IndexError: if the page has no element matching one of the
            selectors above.
    """
    # Words that mark a page as a sampler rather than an individual liquid
    samplers = ('sample', 'everything')

    # A timeout keeps the script from hanging forever on a stalled server
    response = requests.get(url, timeout=30)
    # Fail loudly on a 4xx/5xx answer instead of scraping an error page
    response.raise_for_status()
    page = html.fromstring(response.text)

    # cssselect returns a list, but we know from looking at the html that
    # there is only one element even though it's a class
    name = page.cssselect('.entry-title')[0].text_content()

    lowered_name = name.lower()
    if any(word in lowered_name for word in samplers):
        return None  # not an individual liquid

    description = page.cssselect('.entry-content strong')[0].text_content()
    price = page.cssselect('.yak_price')[0].text_content()
    return (name, description, price)
def main():
    """Print the list of liquid URLs, then the details of one sample page.

    The sample is the entry at index 5 of the list returned by
    ``get_list()``; an ``IndexError`` is raised if the list holds fewer
    than six entries.
    """
    liquid_pages = get_list()
    # Single-argument print(...) behaves the same on Python 2 and Python 3,
    # whereas the bare ``print x`` statement is a syntax error on Python 3
    print(liquid_pages)
    print(get_page(liquid_pages[5]))


# Run the demo only when executed as a script, not when imported
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment