Skip to content

Instantly share code, notes, and snippets.

@tranch
Last active June 15, 2016 03:05
Show Gist options
  • Select an option

  • Save tranch/1e69afa2dc5b84b831f85e76a2546ba8 to your computer and use it in GitHub Desktop.

Select an option

Save tranch/1e69afa2dc5b84b831f85e76a2546ba8 to your computer and use it in GitHub Desktop.
CSS spider
import os
import urlparse
import tinycss
import requests
from bs4 import BeautifulSoup
def crawl(site_url):
    """Download the same-site stylesheets of a page and the assets they use.

    The page at *site_url* is fetched and every ``<link rel="stylesheet">``
    is resolved against it. Stylesheets served from the page's own host are
    saved to disk, then parsed, and every ``url(...)`` they reference is
    saved as well.

    Args:
        site_url: Absolute ``http``/``https`` URL of the page to crawl.

    Returns:
        None. Files are written by ``download_file``.
    """
    html = get_source(site_url)
    soup = BeautifulSoup(html, 'html.parser')
    site_host = urlparse.urlparse(site_url).netloc.lower()

    for css_tag in soup.findAll('link', rel='stylesheet'):
        href = css_tag.get('href')
        if not href:
            # A <link rel="stylesheet"> without href has nothing to fetch.
            continue

        css_url = urlparse.urljoin(site_url, href)
        # Compare hosts rather than testing whether site_url is a substring
        # of css_url: the substring test rejects same-host stylesheets
        # whenever site_url carries a path (e.g. ".../index.html").
        if urlparse.urlparse(css_url).netloc.lower() != site_host:
            continue

        download_file(css_url)
        css_content = get_source(css_url)
        for img_url in urls_from_css(css_content):
            full_img_url = urlparse.urljoin(css_url, img_url)
            # Inline "data:" URIs (and other non-HTTP schemes) are common in
            # CSS and cannot be requested; skip them instead of crashing.
            if urlparse.urlparse(full_img_url).scheme not in ('http', 'https'):
                continue
            download_file(full_img_url)
def get_source(url, timeout=3):
    """Fetch *url* and return the response body as text.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on an unresponsive
            host. The default matches the 3 seconds used by
            ``download_file``.

    Returns:
        The decoded response body when the server answers with HTTP 200,
        otherwise an empty string.

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        return response.text
    return ''
def _walk_css_rules(rules):
    """Yield every rule in *rules*, descending into nested rule lists.

    At-rules such as ``@media`` carry their own ``rules`` list; plain rule
    sets do not, hence the ``getattr`` default.
    """
    for rule in rules:
        yield rule
        for nested in _walk_css_rules(getattr(rule, 'rules', ())):
            yield nested


def urls_from_css(css):
    """Yield the target of every ``url(...)`` used in a stylesheet.

    Args:
        css: Stylesheet source as text.

    Yields:
        The value of each ``URI`` token found in a declaration, in document
        order. URLs are returned as written (possibly relative).
    """
    parser = tinycss.make_parser()
    stylesheet = parser.parse_stylesheet(css)
    for rule in _walk_css_rules(stylesheet.rules):
        # Not every rule has declarations (e.g. @import, @media); accessing
        # the attribute unconditionally raises AttributeError on such rules.
        for declaration in getattr(rule, 'declarations', ()):
            for token in declaration.value:
                if token.type == 'URI':
                    yield token.value
def download_file(url):
    """Save the resource at *url* under the current working directory.

    The URL's path is mirrored on disk: ``http://host/css/site.css`` is
    written to ``<cwd>/css/site.css``, creating directories as needed.
    Nothing is written when the URL names no file (path is empty or ends in
    ``/``), when the path would resolve outside the working directory, or
    when the server does not answer with HTTP 200.

    Args:
        url: Absolute URL of the resource to download.

    Returns:
        None.

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    # Drop the empty segment produced by the path's leading slash.
    segments = urlparse.urlparse(url).path.split('/')[1:]
    if not segments or not segments[-1]:
        # No file name component; opening the directory itself would fail.
        return

    root = os.getcwd()
    filename = os.path.normpath(os.path.join(root, *segments))
    # The path comes from remote content: refuse anything that, once ".."
    # segments are collapsed, would land outside the working directory.
    if not filename.startswith(os.path.join(root, '')):
        return

    response = requests.get(url, timeout=3)
    if response.status_code != 200:
        # Do not store an error page under the asset's name.
        return

    assets_dir = os.path.dirname(filename)
    if not os.path.isdir(assets_dir):
        os.makedirs(assets_dir)

    # The with-statement closes the file; no explicit close() is needed.
    with open(filename, 'wb') as fp:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                fp.write(chunk)
tinycss==0.3
bs4==0.0.1
requests==2.10.0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment