Quick and dirty chill.com scraper, for anyone who wants to keep a copy of their collection links
# Author: Federico Mendez
# quick and dirty chill.com scraper
#
# Usage: python chillscraper.py <username> <path_to_save_txt>
# Example: python chillscraper.py someUser ./my_videos.txt
#
# Notes: it can only retrieve the URLs of videos hosted on YouTube and Vimeo;
# it doesn't work with Vevo because of the way its URIs are formatted
# (www.vevo.com/watch/<artist_name>/<song_name>/). Also note that this
# script depends on Selenium to retrieve the data, since chill.com generates
# its content in the browser through the Dojo JS framework (making urllib worthless).
#
# Dependencies: selenium 2.29
import sys
import json
import time
from selenium import webdriver
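
# A single shared Firefox instance drives every page load; implicitly_wait(3)
# tells Selenium to retry element lookups for up to 3 seconds before failing.
# The old-style find_element(s)_by_* calls below match the selenium 2.x pin in
# the requirements; newer Selenium releases dropped that API.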
wd = webdriver.Firefox()
wd.implicitly_wait(3)


def get_page(url):
    wd.get(url)
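

# Build a {collection name: collection URL} mapping from the user's profile page.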
def get_collections(username):
    get_page('http://www.chill.com/%s' % username)
    collections = {}
    div_collection = wd.find_elements_by_class_name('collection-name-container')
    for e in div_collection:
        collections[e.text] = e.find_element_by_class_name('name').get_attribute('href')
    return collections
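

# chill.com lazy-loads items as you scroll, so keep scrolling to the bottom
# and waiting until the number of 'watch-video-item' elements stops growing.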
def get_items(items):
    wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)
    while len(wd.find_elements_by_class_name('watch-video-item')) > len(items):
        items = wd.find_elements_by_class_name('watch-video-item')
        time.sleep(5)
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    return items
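

# For each collection, load its page, read every item's 'data-embeddata'
# attribute (a JSON blob that appears to carry at least 'source' and 'videoId'
# keys) and turn it into a watchable URL where possible.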
def download_links(username):
    collections = get_collections(username)
    for name, url in collections.iteritems():
        get_page(url)
        links = {}
        div_items = get_items(wd.find_elements_by_class_name('watch-video-item'))
        for e in div_items:
            temp = e.get_attribute('data-embeddata')
            metadata = json.loads(temp)
            title = e.find_element_by_css_selector('div[style="overflow:hidden"]').text
            if metadata['source'] == 'embedly':
                links[title] = metadata['videoId']
            elif metadata['source'] == 'youtube':
                links[title] = 'http://www.youtu.be/%s' % metadata['videoId']
            elif metadata['source'] == 'vimeo':
                links[title] = 'http://www.vimeo.com/%s' % metadata['videoId']
        collections[name] = links
    return collections
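

# Entry point: scrape the given user's collections and dump them to a
# plain-text file, one block per collection.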
if __name__ == '__main__':
    collections = download_links(sys.argv[1])
    with open(sys.argv[2], 'w') as f:
        for k, ls in collections.iteritems():
            f.write("Collection: %s (%d items)\n~~~~~~~~~~\n" % (k.encode('utf-8'), len(ls)))
            for name, url in ls.iteritems():
                f.write("\t%s - %s\n" % (name.encode('utf-8'), url.encode('utf-8')))
            f.write("\n\n")
    wd.quit()

requirements.txt:
selenium==2.29.0
wsgiref==0.1.2