@nazarovsky
Created January 9, 2025 08:59
Get files from an HTTP directory listing (recursive crawler)
import os
import requests
from bs4 import BeautifulSoup


def make_folder(out_path):
    # Create the directory (and any parents) if it does not exist yet
    if not os.path.exists(out_path):
        os.makedirs(out_path)


def get_path_to_filename(filename):
    return os.path.dirname(filename)


def save_file(fname, content):
    # Ensure the target folder exists, then write the downloaded bytes
    make_folder(get_path_to_filename(fname))
    with open(fname, 'wb') as f:
        f.write(content)
    print(len(content))


def crawl_html(url, local_dir):
    # Fetch the directory listing at 'url' and walk every link in it.
    # Hrefs ending in '/' are subdirectories and are crawled recursively;
    # everything else is downloaded. Assumes 'url' ends with '/'.
    r = requests.get(url, allow_redirects=True)
    soup = BeautifulSoup(r.text, "html.parser")
    for link in soup.find_all('a'):
        href = link.get('href')
        if href == '../':
            continue  # skip the parent-directory link
        if href.endswith('/'):
            print('catalog', href)
            crawl_html(url + href, os.path.join(local_dir, href))
        else:
            file_url = url + href
            local_file = os.path.join(local_dir, href)
            print('file', file_url, 'saving to', local_file)
            r2 = requests.get(file_url, allow_redirects=True)
            save_file(local_file, r2.content)


base_url = 'https://a320.the-cake-is-a-lie.com/midi/roland/'
crawl_html(base_url, 'roland')
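
A minimal usage sketch, assuming the functions above are defined in the same script: point crawl_html at any auto-indexed (Apache/nginx-style) directory listing and give it a local folder to mirror into. The URL and folder names below are placeholders, not taken from the gist.

# Hypothetical example, not part of the original gist: retarget the crawler
# at another auto-indexed directory. The listing URL must end with '/' so
# that relative hrefs concatenate into valid file URLs.
other_url = 'https://example.com/samples/'   # placeholder listing URL
crawl_html(other_url, 'samples')             # mirrors the tree into ./samples/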