Created
January 9, 2025 08:59
-
-
Save nazarovsky/787c8abca621394e26a55dab71c62fcc to your computer and use it in GitHub Desktop.
get files from http directory crawler
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import re
import sys
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
def make_folder(out_path):
    """Create directory *out_path*, including missing parents, if needed.

    Args:
        out_path: Directory path to create. An empty string means "no
            directory component" and is ignored, so callers may pass the
            result of ``os.path.dirname`` on a bare filename.

    Raises:
        FileExistsError: If *out_path* exists but is not a directory.
    """
    if not out_path:
        # os.makedirs('') raises FileNotFoundError; there is nothing to create.
        return
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(out_path, exist_ok=True)
def get_path_to_filename(filename):
    """Return the directory portion of *filename*.

    Args:
        filename: A file path.

    Returns:
        Everything before the final path separator, or ``''`` when
        *filename* has no directory component.
    """
    return os.path.dirname(filename)
def save_file(fname, content):
    """Write *content* to *fname*, creating parent directories as needed.

    Prints the number of bytes written as a simple progress indicator.

    Args:
        fname: Destination file path.
        content: Bytes to write; the file is opened in binary mode.
    """
    folder = get_path_to_filename(fname)
    if folder:
        # A bare filename has no directory part; skip creation rather than
        # asking for the empty path to be created.
        make_folder(folder)
    with open(fname, 'wb') as f:
        f.write(content)
    print(len(content))
# Seconds to wait for the server before giving up on a single request.
_REQUEST_TIMEOUT = 30


def _child_path(base_url, href):
    """Return *href* as a path relative to *base_url*, or None to skip it.

    Only links that resolve to something strictly below *base_url* are
    accepted. This rejects missing hrefs, parent-directory links, links to
    the listing itself, other hosts/schemes, and query/fragment links such
    as the column-sort links some directory indexes emit.
    """
    if not href:
        return None
    target = urljoin(base_url, href)
    if not target.startswith(base_url):
        return None
    relative = target[len(base_url):]
    if not relative or '?' in relative or '#' in relative:
        return None
    return relative


def crawl_html(url, local_dir):
    """Recursively download everything below an HTTP directory listing.

    Fetches the HTML index at *url*, follows every ``<a href>`` that points
    below it, recurses into entries ending in ``/`` and saves every other
    entry under *local_dir*, mirroring the remote layout.

    Args:
        url: URL of the directory listing. A trailing ``/`` is appended if
            missing so that relative links resolve inside the directory.
        local_dir: Local directory that receives the downloaded files.

    Responses with an HTTP error status are reported and skipped instead of
    being parsed or written to disk.
    """
    if not url.endswith('/'):
        url += '/'

    listing = requests.get(url, allow_redirects=True, timeout=_REQUEST_TIMEOUT)
    if not listing.ok:
        print('skipping ', url, 'HTTP status', listing.status_code)
        return

    soup = BeautifulSoup(listing.text, "html.parser")
    seen = set()  # listings may link the same entry more than once
    for anchor in soup.find_all('a'):
        relative = _child_path(url, anchor.get('href'))
        if relative is None or relative in seen:
            continue
        seen.add(relative)

        remote = url + relative
        local_path = os.path.join(local_dir, relative)
        if relative.endswith('/'):
            print('catalog ', relative)
            crawl_html(remote, local_path)
            continue

        print('file ', remote, 'saving to', local_path)
        download = requests.get(
            remote, allow_redirects=True, timeout=_REQUEST_TIMEOUT
        )
        if not download.ok:
            # Do not store an error page as if it were the file's content.
            print('skipping ', remote, 'HTTP status', download.status_code)
            continue
        save_file(local_path, download.content)
# Root of the remote directory listing to mirror; must end with '/'.
urll = 'https://a320.the-cake-is-a-lie.com/midi/roland/'

# Mirror the remote tree into the local 'roland' directory.
crawl_html(urll, 'roland')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment