Find youtube links in a webpage, and save them as mp3.
#!/usr/bin/python3
"""
Find YouTube links in a webpage, and save them as mp3.

Conclusion: finding a way to download mp3 audio from YouTube without installing
any additional library was the hardest part, since YouTube cares a great deal
about copyright...

Useful readings (but you will find these anyway):
https://docs.python.org/3/howto/urllib2.html
https://docs.python.org/3/howto/regex.html
https://docs.python.org/3/library/functions.html
https://docs.python.org/3/tutorial/datastructures.html
https://docs.python.org/3/library/json.html
https://docs.python.org/3/library/urllib.request.html

Useful tools:
https://repl.it/languages/python3
https://jupyter.org/try
https://youtubemp3.biz/
"""
import urllib.request
import re, json, unicodedata, shutil, os.path

# set some request headers so we don't get blocked
headers = {
    "Accept": "*/*",
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
}
url = "https://www.google.com/search?q=Cascada+-+Everytime+We+Touch" | |
url = "https://sites.google.com/a/palferi.hu/www/hanganyagok/5perc-palferi" | |
print(url) | |
req = urllib.request.Request(url, data=None, headers=headers) | |
html = urllib.request.urlopen(req).read().decode('utf-8') | |
# match <a ... href="...">...</a>, capturing the href and the anchor text
findlinks = re.compile(r'<a.*?href=\"([^\"]*)\".*?>(.*?)</a>')
# extract the 11-character video id from the common youtube URL shapes
youtube_url = re.compile(r'youtu(?:.*\/v\/|.*v\=|\.be\/)([A-Za-z0-9_\-]{11})')
# used to strip html tags from link titles
find_tags = re.compile(r'<.*?>')
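# sanity check of the id extractor on the three URL shapes the regex targets
# (the sample id below is made up for illustration, not taken from the page)
for _sample in ('https://www.youtube.com/watch?v=dQw4w9WgXcQ',
                'https://youtu.be/dQw4w9WgXcQ',
                'https://www.youtube.com/v/dQw4w9WgXcQ'):
    assert youtube_url.search(_sample).group(1) == 'dQw4w9WgXcQ'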
# find links, check whether each points to a youtube video, extract the video id
# and strip html tags from its title (note: youtube_url runs twice per link,
# once in the filter and once in the map)
links = list(
    map(
        lambda link: {'url': link[0], 'id': youtube_url.search(link[0]).group(1), 'title': find_tags.sub('', link[1])},
        [link.group(1,2) for link in findlinks.finditer(html) if youtube_url.search(link.group(1))]
    )
)
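# each entry now looks like this (illustrative values, not taken from the page):
#   {'url': 'https://youtu.be/dQw4w9WgXcQ', 'id': 'dQw4w9WgXcQ', 'title': 'Some video title'}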
print("found:",len(links),"links\n") | |
def slugify(value, allow_unicode=False):
    """
    From https://github.com/django/django/blob/b2bd08bb7a912a1504f5fb5018f5317e6b5423cd/django/utils/text.py#L394
    Convert to ASCII if 'allow_unicode' is False. Convert spaces to underscores.
    Remove characters that aren't alphanumerics, underscores, hyphens, or dots.
    Strip leading and trailing whitespace.
    """
    value = str(value)
    if allow_unicode:
        value = unicodedata.normalize('NFKC', value)
    else:
        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
    value = re.sub(r'[^\w\s\-.]', '', value).strip()
    return re.sub(r'[_\s]+', '_', value)
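# quick illustration of what slugify produces (assumed sample title):
assert slugify('Cascada - Everytime We Touch!') == 'Cascada_-_Everytime_We_Touch'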
for link in links:
    video_id = link["id"]
    url = 'https://youtubemp3.biz/a/itemInfo/?video_id=%s&nur=nazim&bf=false&t=akhon&r=1581412022235&adres=youtubemp3.biz' % video_id
    req = urllib.request.Request(url, data=None, headers=headers)
    resp = urllib.request.urlopen(req).read().decode('utf-8')
    try:
        info = json.loads(resp)
    except ValueError:
        print("error decoding response")
        print(resp)
        print("skipping", video_id, ": json decode error")
        continue
    if info["status"] != "serving":
        print("(status not serving) possible problem with", video_id, info["title"])
    file_name = slugify(info["title"]) + '.mp3'
    if os.path.exists(file_name):
        print("skipping", video_id, ": file already exists")
        continue
    print("saving", video_id, "to:", file_name)
    req = urllib.request.Request(info['dl_link'], data=None, headers=headers)
    with urllib.request.urlopen(req) as response, open(file_name, 'wb') as out_file:
        shutil.copyfileobj(response, out_file)
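# note: the youtubemp3.biz 'itemInfo' endpoint, its query parameters, and the
# 'status' / 'title' / 'dl_link' response fields are an undocumented third-party
# API as observed at the time of writing; if the service changes, this loop
# will need a different mp3-conversion backend.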