bmease · March 15, 2013 12:34
diff --git a/scrape.py b/scrape.py
 import requests                                                                
 from lxml import html                                                          
                                                                               
                                                                               
 def get_list():
    """Find all the liquids linked from the index page.

    Downloads the e-juices index page and collects the ``href`` of every
    anchor inside the ``#yak-gallery-ejuice`` element.

    Returns:
        A list of unique link targets, in the order they first appear on
        the page. Anchors without an ``href`` attribute are skipped.
    """
    # Make the request to the website
    response = requests.get('https://aliceinvapeland.com/e-juices')

    # Parse the response body into an lxml element tree
    page = html.fromstring(response.text)

    # Walk the links in document order, dropping anchors without an href
    # and repeats. A plain set() would also remove the duplicates, but it
    # hands the urls back in arbitrary order, which makes positional access
    # by callers unpredictable.
    seen = set()
    liquid_urls = []
    for anchor in page.cssselect('#yak-gallery-ejuice a'):
        url = anchor.get('href')
        if url is None or url in seen:
            continue
        seen.add(url)
        liquid_urls.append(url)

    return liquid_urls
                                                                               
                                                                               
 def get_page(url):
    """Fetch one liquid's page and pull out its name, description and price.

    Returns a ``(name, description, price)`` tuple, or ``None`` when the
    page title marks it as a sampler rather than an individual liquid.
    """
    response = requests.get(url)
    tree = html.fromstring(response.text)

    def first_text(selector):
        # cssselect always returns a list, even though from looking at the
        # html there is only one matching element; use the first match
        return tree.cssselect(selector)[0].text_content()

    name = first_text('.entry-title')

    # Words in the title that mark a sampler instead of an individual liquid
    sampler_words = ('sample', 'everything')
    lowered = name.lower()
    if any(word in lowered for word in sampler_words):
        return None  # not an individual liquid

    description = first_text('.entry-content strong')
    price = first_text('.yak_price')
    return (name, description, price)
                                                                               
                                                                               
 def main():
    """Print every liquid url found, then the details of one sample page."""
    liquid_pages = get_list()

    # print(x) with a single argument produces the same output under
    # Python 2's print statement and Python 3's print function
    print(liquid_pages)

    # Index 5 is just an arbitrary page to try out; skip it when the index
    # page yielded fewer links instead of failing with an IndexError
    if len(liquid_pages) > 5:
        print(get_page(liquid_pages[5]))


 if __name__ == "__main__":
    main()
	import requests
	from lxml import html


	def get_list():
	    """Find all the liquids linked from the index page.

	    Downloads the e-juices index page and collects the ``href`` of every
	    anchor inside the ``#yak-gallery-ejuice`` element.

	    Returns:
	        A list of unique link targets, in the order they first appear on
	        the page. Anchors without an ``href`` attribute are skipped.
	    """
	    # Make the request to the website
	    response = requests.get('https://aliceinvapeland.com/e-juices')

	    # Parse the response body into an lxml element tree
	    page = html.fromstring(response.text)

	    # Walk the links in document order, dropping anchors without an href
	    # and repeats. A plain set() would also remove the duplicates, but it
	    # hands the urls back in arbitrary order, which makes positional access
	    # by callers unpredictable.
	    seen = set()
	    liquid_urls = []
	    for anchor in page.cssselect('#yak-gallery-ejuice a'):
	        url = anchor.get('href')
	        if url is None or url in seen:
	            continue
	        seen.add(url)
	        liquid_urls.append(url)

	    return liquid_urls


	def get_page(url):
	    """Fetch one liquid's page and pull out its name, description and price.

	    Returns a ``(name, description, price)`` tuple, or ``None`` when the
	    page title marks it as a sampler rather than an individual liquid.
	    """
	    response = requests.get(url)
	    tree = html.fromstring(response.text)

	    def first_text(selector):
	        # cssselect always returns a list, even though from looking at the
	        # html there is only one matching element; use the first match
	        return tree.cssselect(selector)[0].text_content()

	    name = first_text('.entry-title')

	    # Words in the title that mark a sampler instead of an individual liquid
	    sampler_words = ('sample', 'everything')
	    lowered = name.lower()
	    if any(word in lowered for word in sampler_words):
	        return None  # not an individual liquid

	    description = first_text('.entry-content strong')
	    price = first_text('.yak_price')
	    return (name, description, price)


	def main():
	    """Print every liquid url found, then the details of one sample page."""
	    liquid_pages = get_list()

	    # print(x) with a single argument produces the same output under
	    # Python 2's print statement and Python 3's print function
	    print(liquid_pages)

	    # Index 5 is just an arbitrary page to try out; skip it when the index
	    # page yielded fewer links instead of failing with an IndexError
	    if len(liquid_pages) > 5:
	        print(get_page(liquid_pages[5]))


	if __name__ == "__main__":
	    main()