Yuffster · February 18, 2017 23:26
diff --git a/scrape_morse.py b/scrape_morse.py
 from bs4 import BeautifulSoup
 import requests
 import os

 def fetch(url):
 	"""Download ``url`` into the current directory unless it is already there.

 	The local file is named after the last ``/``-separated segment of the
 	URL. A URL that does not start with ``http`` is treated as a path on
 	``http://www.arrl.org``. Progress is printed to stdout and nothing is
 	returned; HTTP and network failures are reported as ``ERROR`` and skipped.

 	Args:
 		url: Absolute URL, or a site-relative path such as ``/files/x.mp3``.
 	"""
 	if not url.startswith("http"):
 		url = 'http://www.arrl.org' + url
 	fname = url.split("/")[-1]
 	print("Fetching [{}] from {}".format(fname, url))
 	if os.path.isfile(fname):
 		print("\tCached.")
 		return
 	# Download under a temporary name: an interrupted transfer must not leave
 	# a truncated file that later runs would report as "Cached.".
 	partial = fname + ".part"
 	try:
 		# Stream the body instead of buffering it all in memory, and bound the
 		# wait so a stalled server cannot hang the whole scrape.
 		res = requests.get(url, stream=True, timeout=30)
 		try:
 			if not res.ok:
 				print("\tERROR")
 				return
 			with open(partial, 'wb') as f:
 				for block in res.iter_content(1024):
 					f.write(block)
 		finally:
 			res.close()
 	except requests.RequestException:
 		# Same best-effort handling as a bad HTTP status: report and move on.
 		print("\tERROR")
 		return
 	# The target did not exist when checked above, so a plain rename suffices.
 	os.rename(partial, fname)

 def getSamples(speed):
 	"""Download each MP3 linked from one code-archive page, plus its transcript.

 	Requests ``http://www.arrl.org/<speed>-wpm-code-archive`` and, for every
 	anchor whose href ends in ``mp3``, fetches that file and then the href of
 	the first anchor inside the next sibling element of the anchor's
 	enclosing ``td``. Rows without such a link are reported and skipped.

 	Args:
 		speed: Value substituted into the archive URL, e.g. ``'5'`` or ``'7-5'``.
 	"""
 	# Bound the wait so a stalled server raises instead of hanging forever.
 	r = requests.get('http://www.arrl.org/{}-wpm-code-archive'.format(speed), timeout=30)
 	soup = BeautifulSoup(r.text, 'html.parser')
 	for a in soup.find_all('a'):
 		href = a.get('href')
 		if not href or not href.endswith('mp3'):
 			continue
 		fetch(href)
 		# Each lookup step may come back empty (link not inside a td, no
 		# following sibling, no anchor, no href); check explicitly so one
 		# malformed row does not abort the whole page.
 		cell = a.find_parent('td')
 		neighbour = cell.find_next_sibling() if cell is not None else None
 		link = neighbour.find('a') if neighbour is not None else None
 		text = link.get('href') if link is not None else None
 		if not text:
 			print("\tNo transcript link found.")
 			continue
 		fetch(text)


 # Scrape every archive page; each entry is substituted into the archive URL
 # by getSamples.
 for n in [
 	'5', '7-5', '10', '13', '15',
 	'20', '25', '30', '35', '40',
 ]:
 	getSamples(n)
	from bs4 import BeautifulSoup
	import requests
	import os

	def fetch(url):
		"""Download ``url`` into the current directory unless it is already there.

		The local file is named after the last ``/``-separated segment of the
		URL. A URL that does not start with ``http`` is treated as a path on
		``http://www.arrl.org``. Progress is printed to stdout and nothing is
		returned; HTTP and network failures are reported as ``ERROR`` and skipped.

		Args:
			url: Absolute URL, or a site-relative path such as ``/files/x.mp3``.
		"""
		if not url.startswith("http"):
			url = 'http://www.arrl.org' + url
		fname = url.split("/")[-1]
		print("Fetching [{}] from {}".format(fname, url))
		if os.path.isfile(fname):
			print("\tCached.")
			return
		# Download under a temporary name: an interrupted transfer must not leave
		# a truncated file that later runs would report as "Cached.".
		partial = fname + ".part"
		try:
			# Stream the body instead of buffering it all in memory, and bound the
			# wait so a stalled server cannot hang the whole scrape.
			res = requests.get(url, stream=True, timeout=30)
			try:
				if not res.ok:
					print("\tERROR")
					return
				with open(partial, 'wb') as f:
					for block in res.iter_content(1024):
						f.write(block)
			finally:
				res.close()
		except requests.RequestException:
			# Same best-effort handling as a bad HTTP status: report and move on.
			print("\tERROR")
			return
		# The target did not exist when checked above, so a plain rename suffices.
		os.rename(partial, fname)

	def getSamples(speed):
		"""Download each MP3 linked from one code-archive page, plus its transcript.

		Requests ``http://www.arrl.org/<speed>-wpm-code-archive`` and, for every
		anchor whose href ends in ``mp3``, fetches that file and then the href of
		the first anchor inside the next sibling element of the anchor's
		enclosing ``td``. Rows without such a link are reported and skipped.

		Args:
			speed: Value substituted into the archive URL, e.g. ``'5'`` or ``'7-5'``.
		"""
		# Bound the wait so a stalled server raises instead of hanging forever.
		r = requests.get('http://www.arrl.org/{}-wpm-code-archive'.format(speed), timeout=30)
		soup = BeautifulSoup(r.text, 'html.parser')
		for a in soup.find_all('a'):
			href = a.get('href')
			if not href or not href.endswith('mp3'):
				continue
			fetch(href)
			# Each lookup step may come back empty (link not inside a td, no
			# following sibling, no anchor, no href); check explicitly so one
			# malformed row does not abort the whole page.
			cell = a.find_parent('td')
			neighbour = cell.find_next_sibling() if cell is not None else None
			link = neighbour.find('a') if neighbour is not None else None
			text = link.get('href') if link is not None else None
			if not text:
				print("\tNo transcript link found.")
				continue
			fetch(text)


	# Scrape every archive page; each entry is substituted into the archive URL
	# by getSamples. The loop body is indented again so the statement parses.
	for n in ['5', '7-5', '10', '13', '15', '20', '25', '30', '35', '40']:
		getSamples(n)