Find youtube links in a webpage, and save them as mp3.
#!/usr/bin/python3
"""
Find YouTube links in a webpage, and save them as mp3.

Conclusion: finding a way to download mp3 audio from YouTube without installing
any additional library was the hardest part, since YouTube cares a great deal
about copyright...

Useful readings (but you will find these anyway):
https://docs.python.org/3/howto/urllib2.html
https://docs.python.org/3/howto/regex.html
https://docs.python.org/3/library/functions.html
https://docs.python.org/3/tutorial/datastructures.html
https://docs.python.org/3/library/json.html
https://docs.python.org/3/library/urllib.request.html

Useful tools:
https://repl.it/languages/python3
https://jupyter.org/try
https://youtubemp3.biz/
"""
import urllib.request
import re, json, unicodedata, shutil, os.path

# set some request headers so we don't get blocked
headers = {
    "Accept": "*/*",
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
}
url = "https://www.google.com/search?q=Cascada+-+Everytime+We+Touch" | |
url = "https://sites.google.com/a/palferi.hu/www/hanganyagok/5perc-palferi" | |
print(url) | |
req = urllib.request.Request(url, data=None, headers=headers) | |
html = urllib.request.urlopen(req).read().decode('utf-8') | |
# match <a ... href="...">...</a>, capturing the href and the anchor text
findlinks = re.compile(r'<a.*?href=\"([^\"]*)\".*?>(.*?)</a>')
# extract the 11-character video id from the common youtube URL shapes
youtube_url = re.compile(r'youtu(?:.*\/v\/|.*v\=|\.be\/)([A-Za-z0-9_\-]{11})')
# used to strip html tags from link titles
find_tags = re.compile(r'<.*?>')
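# sanity check of the id extractor on the three URL shapes the regex targets
# (the sample id below is made up for illustration, not taken from the page)
for _sample in ('https://www.youtube.com/watch?v=dQw4w9WgXcQ',
                'https://youtu.be/dQw4w9WgXcQ',
                'https://www.youtube.com/v/dQw4w9WgXcQ'):
    assert youtube_url.search(_sample).group(1) == 'dQw4w9WgXcQ'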
# find links, check whether each points to a youtube video, extract the video id
# and strip html tags from its title (note: youtube_url runs twice per link,
# once in the filter and once in the map)
links = list(
    map(
        lambda link: {'url': link[0], 'id': youtube_url.search(link[0]).group(1), 'title': find_tags.sub('', link[1])},
        [link.group(1,2) for link in findlinks.finditer(html) if youtube_url.search(link.group(1))]
    )
)
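# each entry now looks like this (illustrative values, not taken from the page):
#   {'url': 'https://youtu.be/dQw4w9WgXcQ', 'id': 'dQw4w9WgXcQ', 'title': 'Some video title'}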
print("found:",len(links),"links\n") | |
def slugify(value, allow_unicode=False):
    """
    From https://github.com/django/django/blob/b2bd08bb7a912a1504f5fb5018f5317e6b5423cd/django/utils/text.py#L394
    Convert to ASCII if 'allow_unicode' is False. Convert spaces to underscores.
    Remove characters that aren't alphanumerics, underscores, hyphens, or dots.
    Strip leading and trailing whitespace.
    """
    value = str(value)
    if allow_unicode:
        value = unicodedata.normalize('NFKC', value)
    else:
        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
    value = re.sub(r'[^\w\s\-.]', '', value).strip()
    return re.sub(r'[_\s]+', '_', value)
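# quick illustration of what slugify produces (assumed sample title):
assert slugify('Cascada - Everytime We Touch!') == 'Cascada_-_Everytime_We_Touch'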
for link in links:
    video_id = link["id"]
    url = 'https://youtubemp3.biz/a/itemInfo/?video_id=%s&nur=nazim&bf=false&t=akhon&r=1581412022235&adres=youtubemp3.biz' % video_id
    req = urllib.request.Request(url, data=None, headers=headers)
    resp = urllib.request.urlopen(req).read().decode('utf-8')
    try:
        info = json.loads(resp)
    except ValueError:
        print("error decoding response")
        print(resp)
        print("skipping", video_id, ": json decode error")
        continue
    if info["status"] != "serving":
        print("(status not serving) possible problem with", video_id, info["title"])
    file_name = slugify(info["title"]) + '.mp3'
    if os.path.exists(file_name):
        print("skipping", video_id, ": file already exists")
        continue
    print("saving", video_id, "to:", file_name)
    req = urllib.request.Request(info['dl_link'], data=None, headers=headers)
    with urllib.request.urlopen(req) as response, open(file_name, 'wb') as out_file:
        shutil.copyfileobj(response, out_file)
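# note: the youtubemp3.biz 'itemInfo' endpoint, its query parameters, and the
# 'status' / 'title' / 'dl_link' response fields are an undocumented third-party
# API as observed at the time of writing; if the service changes, this loop
# will need a different mp3-conversion backend.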