A script to crawl a page and download all linked files to local disk (http://pattapongj.com/2015/08/11/python-crawler-and-download/)
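As written, the script targets Python 2 (print statements, urlparse, and urllib.urlretrieve) and depends on the third-party requests and beautifulsoup4 packages being installed.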
# -*- coding: utf-8 -*-
import requests
import re
import urlparse
import urllib
import os
from bs4 import BeautifulSoup

base_url = "http://downloads.khinsider.com/game-soundtracks/album/patapon-2"
crawl_link_string = "Download"
download_link_string = "Click here to download"
download_local_path = r"Z:\Patapon"  # raw string so the backslash is not treated as an escape
# @url      : url of target page
# @a_string : a string in [<a>a_string</a>] to filter links
# return    : list of link urls
def getdownload(url, a_string):
    # try to open the url and retry on connection error
    try:
        req = requests.get(url)
    except requests.exceptions.ConnectionError as e:
        print e, "... Retry"
        return getdownload(url, a_string)
    # check response status
    if req.status_code != 200:
        return []  # empty list so the caller's download loop still works
    # parse page content
    soup = BeautifulSoup(req.text, 'html.parser')
    # capture all download links
    linklist = [l.get('href') for l in soup.find_all("a", string=a_string)]
    print "Get {} download links".format(len(linklist))
    return linklist
# get a page, then save every download link to a specific path
# @url               : url of target page
# @a_crawl_string    : a string in [<a>a_crawl_string</a>] to filter links to crawl to the next page
# @a_download_string : a string in [<a>a_download_string</a>] to filter download links
# @path              : path on local disk to save to
# return : None
def crawl(url, a_crawl_string, a_download_string, path):
    req = requests.get(url)
    if req.status_code != 200:
        return
    # fetch the list of pages to crawl
    soup = BeautifulSoup(req.text, 'html.parser')
    link_list = [l.get('href') for l in soup.find_all("a", string=a_crawl_string)]
    # loop over the pages
    for link in link_list:
        print "#### Link {}".format(link)
        # get the download links on the target download page
        download_list = getdownload(link, a_download_string)
        # download every link found on the download page
        for d in download_list:
            file_name = os.path.join(path, d.split('/')[-1])
            print "Download {} to {}".format(d, file_name),
            urllib.urlretrieve(d, file_name)
            print "... Done"

crawl(base_url, crawl_link_string, download_link_string, download_local_path)
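The hrefs collected above are passed straight to urllib.urlretrieve, so the script assumes they are absolute URLs. If a target site returned relative links instead, they could be resolved against the page URL first; a minimal sketch using urlparse.urljoin (the urlparse module is imported above but unused in the script as written) — resolve_links is a hypothetical helper, not part of the original script:

# hypothetical helper (not in the original script): resolve possibly-relative
# hrefs against the URL of the page they were scraped from.
# urljoin leaves already-absolute URLs unchanged, so this is safe either way.
import urlparse

def resolve_links(page_url, hrefs):
    return [urlparse.urljoin(page_url, h) for h in hrefs]

# possible usage inside crawl():
#   download_list = resolve_links(link, getdownload(link, a_download_string))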