herrmendez · December 11, 2015 21:58
diff --git a/chillscraper.py b/chillscraper.py
 # Author: Federico Mendez
 #              _       _                 _     _  _        _                  
 #    ___  _ _ <_> ___ | |__  ___ ._ _  _| |  _| |<_> _ _ _| |_ _ _            
 #  / . || | || |/ | '| / / <_> || ' |/ . | / . || || '_> | | | | |           
 # \_  |`___||_|\_|_.|_\_\ <___||_|_|\___| \___||_||_|   |_| `_. |           
 #  ___| _    _  _  _                                        <___'           
 #   |  _>| |_ <_>| || |    ___  ___ ._ _ _   ___ ___  _ _  ___  ___  ___  _ _ 
 #  | <__| . || || || | _ / | '/ . \| ' ' | <_-</ | '| '_><_> || . \/ ._>| '_>
 # `___/|_|_||_||_||_|<_>\_|_.\___/|_|_|_| /__/\_|_.|_|  <___||  _/\___.|_|  
 #                                                            |_|            
 #
 # Usage: python chillscraper.py <username> <path_to_save_txt>
 # Example: python chillscraper.py someUser ./my_videos.txt
 #
 # Notes: it can only retrieve the urls of videos from youtube and vimeo, 
 # it doesn't work with vevo because of the way its URIs are formatted
 # (www.vevo.com/watch/<artist_name>/<song_name>/). Also note that this
 # script depends on selenium to retrieve the data, since chill.com generates
 # its content in the browser through the Dojo JS framework (making urllib worthless).
 #
 # Dependencies: selenium 2.29

 import sys
 import json
 import time
 from selenium import webdriver

 # Single browser session, started as soon as the module is loaded; every
 # function below drives this shared driver.
 wd = webdriver.Firefox()
 # Implicit wait of 3 for element lookups (seconds, per the Selenium API).
 wd.implicitly_wait(3)

 def get_page(url):
    """Navigate the shared browser session (``wd``) to *url*."""
    wd.get(url)

 def get_collections(username):
    """Map each collection shown on a user's chill.com page to its link.

    Opens ``http://www.chill.com/<username>`` in the shared browser and
    inspects every element with class ``collection-name-container``.

    Returns a dict whose keys are the containers' text and whose values are
    the ``href`` attribute of the ``name`` element found inside each one.
    """
    profile_url = 'http://www.chill.com/%s' % username
    get_page(profile_url)
    name_to_link = {}
    for box in wd.find_elements_by_class_name('collection-name-container'):
        link = box.find_element_by_class_name('name').get_attribute('href')
        name_to_link[box.text] = link
    return name_to_link

 def get_items(items, delay=5):
    """Scroll the current page until no further video items get loaded.

    The page is scrolled to the bottom, given ``delay`` seconds to load more
    content, and the ``watch-video-item`` elements are counted again; this
    repeats for as long as the count keeps growing.

    Args:
        items: the ``watch-video-item`` elements found before scrolling.
        delay: seconds to wait after each scroll before recounting
            (defaults to 5, the pause that used to be hard-coded).

    Returns:
        The largest list of elements seen; ``items`` itself when scrolling
        revealed nothing new.
    """
    scroll_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
    while True:
        wd.execute_script(scroll_to_bottom)
        # Wait *after* scrolling: recounting straight after the scroll would
        # see the old item count and end the loop before new items arrive.
        time.sleep(delay)
        # Look the elements up once per pass and reuse the result.
        loaded = wd.find_elements_by_class_name('watch-video-item')
        if len(loaded) <= len(items):
            return items
        items = loaded

 def download_links(username):
    """Collect the video links of every collection owned by *username*.

    For each collection returned by ``get_collections`` the collection page
    is opened, scrolled with ``get_items`` and every ``watch-video-item``
    element is read: its ``data-embeddata`` attribute is parsed as JSON and
    turned into a link according to the ``source`` field.

    Returns:
        A dict mapping collection name -> dict of {item title: link}.
        ``youtube`` and ``vimeo`` items get a URL built from ``videoId``;
        ``embedly`` items keep the ``videoId`` value as-is. Items from any
        other source, or without embed data, are left out.
    """
    collections = get_collections(username)
    # Build a separate result instead of overwriting the dict being iterated.
    result = {}
    # .items() exists on both Python 2 and 3 (iteritems is Python 2 only).
    for name, url in collections.items():
        get_page(url)
        links = {}
        div_items = get_items(wd.find_elements_by_class_name('watch-video-item'))
        for e in div_items:
            raw = e.get_attribute('data-embeddata')
            if not raw:
                # No embed data on this element: json.loads would raise on it.
                continue
            metadata = json.loads(raw)
            source = metadata.get('source')
            if source == 'embedly':
                link = metadata['videoId']
            elif source == 'youtube':
                link = 'http://www.youtu.be/%s' % metadata['videoId']
            elif source == 'vimeo':
                link = 'http://www.vimeo.com/%s' % metadata['videoId']
            else:
                # Unsupported source: skipped, so no need to look up its title.
                continue
            title = e.find_element_by_css_selector('div[style="overflow:hidden"]').text
            links[title] = link
        result[name] = links
    return result

 try:
    # The script body sits inside try/finally so that the browser opened at
    # module load is shut down even when scraping or writing fails.
    if __name__ == '__main__':
        import io  # local import: only the script entry point needs it

        if len(sys.argv) < 3:
            # Fail with the documented usage instead of a bare IndexError.
            sys.exit('Usage: python chillscraper.py <username> <path_to_save_txt>')
        collections = download_links(sys.argv[1])
        # A UTF-8 text file accepts non-ASCII collection names and titles on
        # both Python 2 and 3; the with-block closes the file in every case.
        with io.open(sys.argv[2], 'w', encoding='utf-8') as f:
            for k, ls in collections.items():
                f.write(u"Collection: %s (%d items) \n~~~~~~~~~~\n" % (k, len(ls)))
                for name, url in ls.items():
                    f.write(u"\t%s - %s\n" % (name, url))
                f.write(u"\n\n")
 finally:
    # Runs whether or not the module is executed as a script, as before.
    wd.quit()
diff --git a/requirements.txt b/requirements.txt
 selenium==2.29.0
 wsgiref==0.1.2
	# Author: Federico Mendez
	#              _       _                 _     _  _        _
	#    ___  _ _ <_> ___ | |__  ___ ._ _  _| |  _| |<_> _ _ _| |_ _ _
	#  / . || | || |/ | '| / / <_> || ' |/ . | / . || || '_> | | | | |
	# \_  |`___||_|\_|_.|_\_\ <___||_|_|\___| \___||_||_|   |_| `_. |
	#  ___| _    _  _  _                                        <___'
	#   |  _>| |_ <_>| || |    ___  ___ ._ _ _   ___ ___  _ _  ___  ___  ___  _ _
	#  | <__| . || || || | _ / | '/ . \| ' ' | <_-</ | '| '_><_> || . \/ ._>| '_>
	# `___/|_|_||_||_||_|<_>\_|_.\___/|_|_|_| /__/\_|_.|_|  <___||  _/\___.|_|
	#                                                            |_|
	#
	# Usage: python chillscraper.py <username> <path_to_save_txt>
	# Example: python chillscraper.py someUser ./my_videos.txt
	#
	# Notes: it can only retrieve the urls of videos from youtube and vimeo,
	# it doesn't work with vevo because of the way its URIs are formatted
	# (www.vevo.com/watch/<artist_name>/<song_name>/). Also note that this
	# script depends on selenium to retrieve the data, since chill.com generates
	# its content in the browser through the Dojo JS framework (making urllib worthless).
	#
	# Dependencies: selenium 2.29

	import sys
	import json
	import time
	from selenium import webdriver

	# Single browser session, started as soon as the module is loaded; every
	# function below drives this shared driver.
	wd = webdriver.Firefox()
	# Implicit wait of 3 for element lookups (seconds, per the Selenium API).
	wd.implicitly_wait(3)

	def get_page(url):
	    """Navigate the shared browser session (``wd``) to *url*."""
	    # Body re-indented under the def; it had lost its indentation.
	    wd.get(url)

	def get_collections(username):
	    """Map each collection shown on a user's chill.com page to its link.

	    Opens ``http://www.chill.com/<username>`` in the shared browser and
	    inspects every element with class ``collection-name-container``.

	    Returns a dict whose keys are the containers' text and whose values are
	    the ``href`` attribute of the ``name`` element found inside each one.
	    """
	    get_page('http://www.chill.com/%s' % username)
	    collections = {}
	    div_collection = wd.find_elements_by_class_name('collection-name-container')
	    for e in div_collection:
	        collections[e.text] = e.find_element_by_class_name('name').get_attribute('href')
	    return collections

	def get_items(items, delay=5):
	    """Scroll the current page until no further video items get loaded.

	    The page is scrolled to the bottom, given ``delay`` seconds to load more
	    content, and the ``watch-video-item`` elements are counted again; this
	    repeats for as long as the count keeps growing.

	    Args:
	        items: the ``watch-video-item`` elements found before scrolling.
	        delay: seconds to wait after each scroll before recounting
	            (defaults to 5, the pause that used to be hard-coded).

	    Returns:
	        The largest list of elements seen; ``items`` itself when scrolling
	        revealed nothing new.
	    """
	    scroll_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
	    while True:
	        wd.execute_script(scroll_to_bottom)
	        # Wait *after* scrolling: recounting straight after the scroll would
	        # see the old item count and end the loop before new items arrive.
	        time.sleep(delay)
	        # Look the elements up once per pass and reuse the result.
	        loaded = wd.find_elements_by_class_name('watch-video-item')
	        if len(loaded) <= len(items):
	            return items
	        items = loaded

	def download_links(username):
	    """Collect the video links of every collection owned by *username*.

	    For each collection returned by ``get_collections`` the collection page
	    is opened, scrolled with ``get_items`` and every ``watch-video-item``
	    element is read: its ``data-embeddata`` attribute is parsed as JSON and
	    turned into a link according to the ``source`` field.

	    Returns:
	        A dict mapping collection name -> dict of {item title: link}.
	        ``youtube`` and ``vimeo`` items get a URL built from ``videoId``;
	        ``embedly`` items keep the ``videoId`` value as-is. Items from any
	        other source, or without embed data, are left out.
	    """
	    collections = get_collections(username)
	    # Build a separate result instead of overwriting the dict being iterated.
	    result = {}
	    # .items() exists on both Python 2 and 3 (iteritems is Python 2 only).
	    for name, url in collections.items():
	        get_page(url)
	        links = {}
	        div_items = get_items(wd.find_elements_by_class_name('watch-video-item'))
	        for e in div_items:
	            raw = e.get_attribute('data-embeddata')
	            if not raw:
	                # No embed data on this element: json.loads would raise on it.
	                continue
	            metadata = json.loads(raw)
	            source = metadata.get('source')
	            if source == 'embedly':
	                link = metadata['videoId']
	            elif source == 'youtube':
	                link = 'http://www.youtu.be/%s' % metadata['videoId']
	            elif source == 'vimeo':
	                link = 'http://www.vimeo.com/%s' % metadata['videoId']
	            else:
	                # Unsupported source: skipped, so no need to look up its title.
	                continue
	            title = e.find_element_by_css_selector('div[style="overflow:hidden"]').text
	            links[title] = link
	        result[name] = links
	    return result

	try:
	    # The script body sits inside try/finally so that the browser opened at
	    # module load is shut down even when scraping or writing fails.
	    if __name__ == '__main__':
	        import io  # local import: only the script entry point needs it

	        if len(sys.argv) < 3:
	            # Fail with the documented usage instead of a bare IndexError.
	            sys.exit('Usage: python chillscraper.py <username> <path_to_save_txt>')
	        collections = download_links(sys.argv[1])
	        # A UTF-8 text file accepts non-ASCII collection names and titles on
	        # both Python 2 and 3; the with-block closes the file in every case.
	        with io.open(sys.argv[2], 'w', encoding='utf-8') as f:
	            for k, ls in collections.items():
	                f.write(u"Collection: %s (%d items) \n~~~~~~~~~~\n" % (k, len(ls)))
	                for name, url in ls.items():
	                    f.write(u"\t%s - %s\n" % (name, url))
	                f.write(u"\n\n")
	finally:
	    # Runs whether or not the module is executed as a script, as before.
	    wd.quit()
No results found