Created
August 31, 2014 14:45
-
-
Save NamPNQ/4b7866b970676bb4f508 to your computer and use it in GitHub Desktop.
Python Scraping Tool
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import re | |
import os | |
from urlparse import urljoin | |
def save(resource_url, resource_dir):
    """Download ``resource_url`` and write the response body to a local file.

    Despite its name, ``resource_dir`` is the path of the destination *file*.
    Missing parent directories are created before the request is made. The
    file is written only when the server answers with HTTP 200; any other
    status code is ignored without raising.

    Args:
        resource_url: URL of the resource to fetch.
        resource_dir: Local file path the body is written to.
    """
    directories, _ = os.path.split(resource_dir)
    if directories and not os.path.exists(directories):
        os.makedirs(directories)

    r = requests.get(resource_url, stream=True)
    try:
        if r.status_code == 200:
            with open(resource_dir, 'wb') as f:
                # Stream in 1 KiB chunks so large files are never held
                # in memory all at once.
                for chunk in r.iter_content(1024):
                    f.write(chunk)
    finally:
        # With stream=True the connection stays checked out until the body
        # is fully consumed or the response is closed; dropping the name
        # with ``del`` does not release it (e.g. on a non-200 answer).
        r.close()
def _local_path(reference):
    """Map an HTML reference to a safe relative file path, or None to skip."""
    # The fragment and query string are not part of the file name on disk.
    path = reference.split('#', 1)[0].split('?', 1)[0]
    # Root-relative references ("/css/a.css") are stored below the working
    # directory instead of at the filesystem root.
    path = path.lstrip('/')
    if not path or path.endswith('/'):
        return None  # no file name to write to
    path = os.path.normpath(path)
    if path in (os.curdir, os.pardir) or os.path.isabs(path):
        return None
    if path.startswith(os.pardir + os.sep):
        return None  # would escape the working directory
    return path


def main():
    """Fetch the page at the global ``url`` and mirror its local resources.

    Every ``href``/``src`` attribute value found in the page is considered.
    References that are absolute (start with ``http`` or ``//``), are pure
    fragments (``#``), or contain a ``:`` (``mailto:``, ``javascript:``,
    ``data:`` ...) are skipped. Each remaining reference is resolved against
    ``url`` and downloaded once to a relative path under the current working
    directory. Finally the page itself is saved as ``index.html``.

    Nothing is downloaded when the page request does not return HTTP 200.
    """
    res = requests.get(url)
    if res.status_code != 200:
        return

    references = re.findall(r'href=[\'\"](.*?)[\'\"]', res.text)
    references += re.findall(r'src=[\'\"](.*?)[\'\"]', res.text)

    seen = set()
    for reference in references:
        # Same filter for href and src, so that e.g. "data:" URIs in src
        # attributes are not handed to requests as if they were paths.
        if reference.startswith(('http', '#', '//')) or ':' in reference:
            continue
        local_path = _local_path(reference)
        if local_path is None or local_path in seen:
            continue
        seen.add(local_path)
        save(urljoin(url, reference), local_path)

    save(url, "index.html")
if __name__ == '__main__':
    # Command-line entry point: read the target URL from ``-u`` and publish
    # it as the module-level ``url`` that main() reads. No ``global``
    # statement is needed here because this code already runs at module scope.
    import argparse

    parser = argparse.ArgumentParser(description='Scrap web.')
    parser.add_argument('-u', help='put url in here')
    args = parser.parse_args()
    url = args.u
    if url:
        main()
    else:
        # Single-argument call form prints the same text on Python 2 and 3.
        print('Please input url')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment