Last active
December 11, 2015 21:58
-
-
Save herrmendez/4666125 to your computer and use it in GitHub Desktop.
Quick and dirty chill.com scraper, for anyone who wants to keep a copy of the links in their collections
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Author: Federico Mendez | |
# _ _ _ _ _ _ | |
# ___ _ _ <_> ___ | |__ ___ ._ _ _| | _| |<_> _ _ _| |_ _ _ | |
# / . || | || |/ | '| / / <_> || ' |/ . | / . || || '_> | | | | | | |
# \_ |`___||_|\_|_.|_\_\ <___||_|_|\___| \___||_||_| |_| `_. | | |
# ___| _ _ _ _ <___' | |
# | _>| |_ <_>| || | ___ ___ ._ _ _ ___ ___ _ _ ___ ___ ___ _ _ | |
# | <__| . || || || | _ / | '/ . \| ' ' | <_-</ | '| '_><_> || . \/ ._>| '_> | |
# `___/|_|_||_||_||_|<_>\_|_.\___/|_|_|_| /__/\_|_.|_| <___|| _/\___.|_| | |
# |_| | |
# | |
# Usage: python chillscraper.py <username> <path_to_save_txt> | |
# Example: python chillscraper.py someUser ./my_videos.txt | |
# | |
# Notes: it can only retrieve the URLs of videos from YouTube and Vimeo;
# it doesn't work with Vevo because of the way its URIs are formatted
# (www.vevo.com/watch/<artist_name>/<song_name>/). Also note that this
# script depends on selenium to retrieve the data, since chill.com generates
# its content in the browser through the Dojo JS framework (making urllib worthless).
# | |
# Dependencies: selenium 2.29 | |
import sys | |
import json | |
import time | |
from selenium import webdriver | |
# Single Firefox session shared by every function below; it is started as
# soon as this module is executed.
wd = webdriver.Firefox()
# Implicit wait of 3 (seconds, per the Selenium API): element lookups keep
# retrying for up to that long before giving up.
wd.implicitly_wait(3)
def get_page(url):
    """Point the shared browser session (``wd``) at *url*."""
    wd.get(url)
def get_collections(username):
    """Return the collections listed on a user's chill.com profile.

    Loads ``http://www.chill.com/<username>`` in the shared browser and reads
    every element whose class is ``collection-name-container``.

    Returns a dict mapping each container's visible text to the ``href``
    attribute of the ``name`` element inside it.
    """
    get_page('http://www.chill.com/%s' % username)
    containers = wd.find_elements_by_class_name('collection-name-container')
    return dict(
        (box.text, box.find_element_by_class_name('name').get_attribute('href'))
        for box in containers
    )
def get_items(items, pause=5):
    """Scroll the current page until no more video items are loaded.

    The page is scrolled to the bottom, then we wait ``pause`` seconds and
    count the ``watch-video-item`` elements again. This repeats for as long
    as the count keeps growing.

    The wait always comes *after* the scroll and *before* the re-count.
    Counting immediately after scrolling would see the old number of items
    and stop paging too early.

    items -- the ``watch-video-item`` elements already found on the page.
    pause -- seconds to wait after each scroll for new items to show up.

    Returns the last, largest list of elements found, or ``items`` itself
    when scrolling did not reveal anything new.
    """
    scroll_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
    while True:
        wd.execute_script(scroll_to_bottom)
        time.sleep(pause)
        found = wd.find_elements_by_class_name('watch-video-item')
        if len(found) <= len(items):
            # Nothing new appeared after this scroll: we reached the end.
            return items
        items = found
def download_links(username):
    """Collect the video links of every collection owned by *username*.

    Each collection page is opened in the shared browser and scrolled with
    ``get_items``. For every ``watch-video-item`` element, the JSON in its
    ``data-embeddata`` attribute is read:

    * source ``embedly`` -- ``videoId`` is stored as-is
    * source ``youtube`` -- stored as ``http://www.youtu.be/<videoId>``
    * source ``vimeo``   -- stored as ``http://www.vimeo.com/<videoId>``
    * any other source   -- the item is skipped

    Returns a dict mapping collection name to a dict of {title: link}.
    """
    url_templates = {
        'youtube': 'http://www.youtu.be/%s',
        'vimeo': 'http://www.vimeo.com/%s',
    }
    collections = get_collections(username)
    for name, url in collections.iteritems():
        get_page(url)
        already_loaded = wd.find_elements_by_class_name('watch-video-item')
        links = {}
        for item in get_items(already_loaded):
            metadata = json.loads(item.get_attribute('data-embeddata'))
            title_box = item.find_element_by_css_selector('div[style="overflow:hidden"]')
            title = title_box.text
            source = metadata['source']
            if source == 'embedly':
                links[title] = metadata['videoId']
            elif source in url_templates:
                links[title] = url_templates[source] % metadata['videoId']
        # Replace the collection's URL with the links harvested from it.
        collections[name] = links
    return collections
if __name__ == '__main__':
    # Script entry point: scrape <username>'s collections and write them to
    # the text file given as the second argument.
    try:
        if len(sys.argv) < 3:
            sys.exit('Usage: python chillscraper.py <username> <path_to_save_txt>')
        collections = download_links(sys.argv[1])
        # ``with`` guarantees the report is flushed and closed even if a
        # write fails half-way through.
        with open(sys.argv[2], 'w') as f:
            for k, ls in collections.iteritems():
                # Collection names are encoded the same way as item names and
                # URLs below, so non-ASCII names do not break the write.
                f.write("Collection: %s (%d items) \n~~~~~~~~~~\n" % (k.encode('utf-8'), len(ls)))
                for name, url in ls.iteritems():
                    f.write("\t%s - %s\n" % (name.encode('utf-8'), url.encode('utf-8')))
                f.write("\n\n")
    finally:
        # Always shut the browser down, including when scraping raised.
        wd.quit()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
selenium==2.29.0 | |
wsgiref==0.1.2 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment