@nazarovsky
Created January 9, 2025 08:59
Get files from an HTTP directory listing (recursive crawler)
import os
import requests
from bs4 import BeautifulSoup


def make_folder(out_path):
    # Create the directory (and any parents) if it does not exist yet
    if not os.path.exists(out_path):
        os.makedirs(out_path)


def get_path_to_filename(filename):
    return os.path.dirname(filename)


def save_file(fname, content):
    # Ensure the target folder exists, then write the downloaded bytes
    make_folder(get_path_to_filename(fname))
    with open(fname, 'wb') as f:
        f.write(content)
    print(len(content))


def crawl_html(url, local_dir):
    # Fetch the directory listing at 'url' and walk every link in it.
    # Hrefs ending in '/' are subdirectories and are crawled recursively;
    # everything else is downloaded. Assumes 'url' ends with '/'.
    r = requests.get(url, allow_redirects=True)
    soup = BeautifulSoup(r.text, "html.parser")
    for link in soup.find_all('a'):
        href = link.get('href')
        if href == '../':
            continue  # skip the parent-directory link
        if href.endswith('/'):
            print('catalog', href)
            crawl_html(url + href, os.path.join(local_dir, href))
        else:
            file_url = url + href
            local_file = os.path.join(local_dir, href)
            print('file', file_url, 'saving to', local_file)
            r2 = requests.get(file_url, allow_redirects=True)
            save_file(local_file, r2.content)


base_url = 'https://a320.the-cake-is-a-lie.com/midi/roland/'
crawl_html(base_url, 'roland')
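
A minimal usage sketch, assuming the functions above are defined in the same script: point crawl_html at any auto-indexed (Apache/nginx-style) directory listing and give it a local folder to mirror into. The URL and folder names below are placeholders, not taken from the gist.

# Hypothetical example, not part of the original gist: retarget the crawler
# at another auto-indexed directory. The listing URL must end with '/' so
# that relative hrefs concatenate into valid file URLs.
other_url = 'https://example.com/samples/'   # placeholder listing URL
crawl_html(other_url, 'samples')             # mirrors the tree into ./samples/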