Created
July 7, 2021 08:49
-
-
Save cyphar/42a3df666f4e934253e3a4fb09af4fae to your computer and use it in GitHub Desktop.
Forvo-based Audio Server for Yomichan
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# forvo-yomichan: Forvo-based Yomichan audio source | |
# Copyright (C) 2021 Aleksa Sarai <[email protected]> | |
# | |
# This program is free software: you can redistribute it and/or modify it under | |
# the terms of the GNU Lesser General Public License as published by the Free | |
# Software Foundation, either version 3 of the License, or (at your option) any | |
# later version. | |
# | |
# This program is distributed in the hope that it will be useful, but WITHOUT ANY | |
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A | |
# PARTICULAR PURPOSE. See the GNU General Public License for more details. | |
# | |
# You should have received a copy of the GNU Lesser General Public License along | |
# with this program. If not, see <https://www.gnu.org/licenses/>. | |
import base64
import os
import re
import urllib.parse

import flask
import requests
# Flask application that exposes the audio-source endpoint.
app = flask.Flask(__name__)

# Matches each "Play(...)" call found in the text of a Forvo search page.
PLAY_PATTERN = re.compile(r"Play\([^\)]+\)")
# Captures every single-quoted argument inside one "Play(...)" call.
SLUG_PATTERN = re.compile(r"'([^']+)'")

# Host the audio URLs are built against.
MEDIA_URL = "https://audio00.forvo.com"

# URL templates, positionally matched to the quoted slugs of a Play(...) call:
#   0: mp3 in /mp3           1: ogg in /ogg
#   2: mp3 in /audios/mp3    3: ogg in /audios/ogg
URLS = [
    MEDIA_URL + prefix + "/%(extension)s/%(unbase64_slug)s"
    for prefix in ("", "", "/audios", "/audios")
]

# Headers sent with every request to Forvo (a desktop Firefox User-Agent).
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0"
    ),
}
# Seconds to wait for Forvo before giving up, so a stalled connection cannot
# hang the request handler forever.
_FORVO_TIMEOUT = 10


def _forvo_sample_url(sample):
    """Return the preferred audio URL for one ``Play(...)`` snippet.

    Each quoted argument is treated as a base64-encoded media path and paired
    positionally with a template from ``URLS``. Arguments that are not valid
    base64/UTF-8, or whose decoded path has no file extension, are ignored.

    Returns the URL built from the last usable argument, or ``None`` when no
    argument was usable.
    """
    # Currently the order of these is (mp3, ogg, mp3, ogg), with the last two
    # audio samples appearing to be better quality.
    slugs = SLUG_PATTERN.findall(sample)[:len(URLS)]
    best = None
    for template, slug in zip(URLS, slugs):
        try:
            unbase64_slug = base64.b64decode(slug).decode("utf-8")
        except ValueError:
            # Both binascii.Error (bad padding) and UnicodeDecodeError are
            # ValueError subclasses; such an argument is not a media slug.
            continue
        extension = os.path.splitext(unbase64_slug)[1][1:]
        if not extension:
            # Adjacent empty arguments ('','') make SLUG_PATTERN capture ",",
            # which decodes to an empty path. Without an extension the
            # template would produce a bogus URL, so skip it.
            continue
        best = template % {
            "unbase64_slug": unbase64_slug,
            "extension": extension,
        }
    return best


def forvo_fetch(search_term):
    """Yield one audio URL per pronunciation found by a Forvo search.

    This is a generator. It requests the Forvo search page (the ``/ja/``
    path) for ``search_term``, finds every ``Play(...)`` call in the page text
    and yields the preferred URL for each one.

    Nothing is yielded when ``search_term`` is empty, the request fails or
    times out, or Forvo answers with an error status; the failure is printed
    instead of raised.
    """
    if not search_term:
        return
    # Quote the term so characters such as "?", "#" or "/" stay inside the
    # path segment instead of changing the structure of the URL.
    search_url = "https://forvo.com/search/%s/ja/" % (
        urllib.parse.quote(search_term, safe=""),
    )
    try:
        r = requests.get(search_url, headers=HEADERS, timeout=_FORVO_TIMEOUT)
    except requests.RequestException as exc:
        print("[!] Got error from request: %s" % (exc,))
        return
    # Read the body and release the connection before yielding, so the
    # response is not held open while the consumer is iterating.
    with r:
        if not r.ok:
            print("[!] Got error from request: %s" % (r,))
            return
        page = r.text
    for sample in PLAY_PATTERN.findall(page):
        url = _forvo_sample_url(sample)
        if url is None:
            print("[!] Got no slugs from '%s' search: %s" % (search_term, sample))
            continue
        yield url  # Last usable one is usually better quality.
def forvo_find_best(term, reading):
    """Return the list of Forvo audio URLs to offer for ``term``/``reading``.

    Forvo doesn't have reading-based search, so the term is searched first.
    If that gives exactly one result, it is returned. Otherwise the reading
    is searched; if that gives exactly one result, it is returned. Otherwise
    all URLs are returned (term URLs first, duplicates removed) so the
    Yomichan user can pick. An empty list is returned when neither search
    finds anything.
    """
    term_urls = list(forvo_fetch(term))
    if len(term_urls) == 1:
        # A unique term match is returned regardless of the reading results,
        # so the second lookup is skipped.
        return term_urls

    # When term and reading are the same string the second search would hit
    # the same page again and only produce duplicates.
    reading_urls = [] if reading == term else list(forvo_fetch(reading))

    if not (term_urls or reading_urls):
        # We didn't get anything.
        print("[!] Nothing from Forvo for %s(%s)." % (term, reading))
        return []
    if len(reading_urls) == 1:
        return reading_urls
    # dict.fromkeys drops repeated URLs while keeping first-seen order, which
    # preserves the preference for term URLs.
    return list(dict.fromkeys(term_urls + reading_urls))
@app.route("/<term>/<reading>")
def forvo(term, reading):
    """Serve the audio source list for ``term``/``reading`` as JSON.

    The response has ``"type": "audioSourceList"`` and one ``"Forvo"`` entry
    in ``"audioSources"`` for every URL returned by ``forvo_find_best``.
    """
    sources = []
    for audio_url in forvo_find_best(term, reading):
        sources.append({"name": "Forvo", "url": audio_url})
    payload = {
        "type": "audioSourceList",
        "audioSources": sources,
    }
    return flask.jsonify(payload)
if __name__ == "__main__":
    # Listen on the loopback interface only. The port is passed as an
    # integer, which is the type Flask.run() documents for it.
    app.run(host="127.0.0.1", port=50505)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment