Created
March 15, 2013 12:34
-
-
Save bmease/5169588 to your computer and use it in GitHub Desktop.
Example of scraping the Alice in Vapeland website.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from lxml import html | |
def get_list(index_url='https://aliceinvapeland.com/e-juices', timeout=30):
    """Find the links to all liquids on the index page.

    Args:
        index_url: Address of the gallery page to scrape. Defaults to the
            e-juice index this script was written for.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of the unique ``href`` values of the anchors found inside
        ``#yak-gallery-ejuice``, in the order they first appear on the
        page. Anchors without an ``href`` attribute are skipped.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    # Make the request to the website; without a timeout a dead server
    # would block the script forever
    response = requests.get(index_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of scraping an error page
    response.raise_for_status()
    # Parse the response body into an lxml element tree
    page = html.fromstring(response.text)

    # Collect the hrefs, dropping missing ones and duplicates while keeping
    # document order (deduplicating through set() does not preserve order)
    seen = set()
    liquid_urls = []
    for anchor in page.cssselect('#yak-gallery-ejuice a'):
        href = anchor.get('href')
        if href is None or href in seen:
            continue
        seen.add(href)
        liquid_urls.append(href)
    return liquid_urls
def get_page(url, timeout=30):
    """Fetch the page of a liquid and return its title, description and price.

    Args:
        url: Address of the liquid's page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(name, description, price)`` tuple holding the text content of
        the first ``.entry-title``, ``.entry-content strong`` and
        ``.yak_price`` elements, or ``None`` when the title contains one of
        the sampler keywords (the page is then not an individual liquid).

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
        IndexError: If the page lacks one of the expected elements.
    """
    # Without a timeout a dead server would block the script forever
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of scraping an error page
    response.raise_for_status()
    page = html.fromstring(response.text)

    # cssselect returns a list, but we know from looking at the html that
    # there is only one element even though it's a class
    name = page.cssselect('.entry-title')[0].text_content()

    # Words that mark a page as a sampler rather than an individual liquid
    sampler_words = ('sample', 'everything')
    lowered_name = name.lower()
    if any(word in lowered_name for word in sampler_words):
        return None  # not an individual liquid

    description = page.cssselect('.entry-content strong')[0].text_content()
    price = page.cssselect('.yak_price')[0].text_content()
    return (name, description, price)
def main():
    """Print every liquid URL found, then the details of one sample liquid.

    The sample is the entry at index 5 of the list returned by
    ``get_list()``. When fewer entries were found, a short notice is
    printed instead of crashing with an IndexError.
    """
    liquid_pages = get_list()
    # Single-argument print(...) behaves the same on Python 2 and Python 3
    print(liquid_pages)

    sample_index = 5
    if len(liquid_pages) > sample_index:
        print(get_page(liquid_pages[sample_index]))
    else:
        print('Only %d liquid page(s) found; no page at index %d to fetch'
              % (len(liquid_pages), sample_index))


# Run the demo only when executed as a script, not when imported
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment