Last active
October 31, 2019 22:24
-
-
Save xavvvier/b05db94c6bc3a6a1fee9d320a1ffcb91 to your computer and use it in GitHub Desktop.
Generates a text file ready to import into Anki, using a list of minimal pairs as input. Also downloads the matching audio files from wordreference.com.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Execute as: python3 anki_importer_generator.py < input.txt
# where input.txt lists minimal pairs, one pair per line, the two words separated by a space:
# it eat
# bit beat
import fileinput
import re
import urllib.parse
import urllib.request
def makeline(word1, word2, ipaword1, ipaword2, file1, file2):
    """Build one tab-separated import row for a pair of words.

    The fields appear in this order: word1, a ``[sound:...]`` tag for
    file1, ipaword1, word2, a ``[sound:...]`` tag for file2, ipaword2.
    The returned row is terminated by a newline.
    """
    fields = (
        word1,
        '[sound:' + file1 + ']',
        ipaword1,
        word2,
        '[sound:' + file2 + ']',
        ipaword2,
    )
    return '\t'.join(fields) + '\n'
def getContent(word):
    """Fetch the wordreference.com definition page for ``word``.

    Args:
        word: The word to look up. It is percent-encoded before being
            placed in the URL path, so spaces or non-ASCII characters do
            not produce an invalid request.

    Returns:
        The response body decoded as UTF-8 text.

    Errors raised by ``urlopen`` (network or HTTP failures) and decoding
    errors are not caught here and propagate to the caller.
    """
    url = 'https://www.wordreference.com/definition/' + urllib.parse.quote(word, safe='')
    req = urllib.request.Request(
        url,
        data=None,
        headers={
            # Browser-like agent string; presumably the site rejects
            # urllib's default agent -- TODO confirm.
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
        },
    )
    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(req) as response:
        return response.read().decode('utf-8')
def find_ipa(word):
    """Look up ``word`` and return the pair ``(ipa, content)``.

    ``content`` is the page text obtained from getContent; ``ipa`` is the
    value getIPAInContent extracts from that text.
    """
    page = getContent(word)
    return (getIPAInContent(page), page)
def getFileName(word, path):
    """Build a local file name for the audio located at ``path``.

    ``path`` is split on '/'. The components at index 2 and 3 are used as
    the language and accent, and the text from the last '.' of the final
    component onward is used as the extension.

    Args:
        word: The word the audio belongs to.
        path: Slash-separated path of the audio file.

    Returns:
        ``word-language-accent`` followed by the extension (including the
        dot), or by nothing when the final component has no dot.

    Raises:
        IndexError: If ``path`` has fewer than four '/'-separated parts.
    """
    chunks = path.split('/')
    language, accent = chunks[2], chunks[3]
    # rpartition yields an empty separator when there is no '.', so a
    # dot-less name gets no extension instead of its last character.
    _, dot, suffix = chunks[-1].rpartition('.')
    extension = dot + suffix if dot else ''
    return word + '-' + language + '-' + accent + extension
def downloadFile(path, fileName):
    """Download ``path`` from wordreference.com and store it as ``fileName``.

    The full response body is read into memory and then written to the
    target file in binary mode.
    """
    source = "https://www.wordreference.com" + path
    with urllib.request.urlopen(source) as remote, open(fileName, 'wb') as target:
        # read() returns the body as a bytes object
        target.write(remote.read())
def getFirstMatch(content, regex):
    """Return the first ``re.findall`` result for ``regex`` in ``content``.

    The value has whatever shape ``findall`` produces for the pattern
    (the whole match, a single group, or a tuple of groups). Returns
    None when the pattern does not match at all.
    """
    found = re.findall(regex, content)
    return found[0] if found else None
def getIPAInContent(content):
    """Extract the bracketed text of the ``pronWR`` span from ``content``.

    Returns the text found between ``[`` and ``]`` inside the first span
    whose id is ``pronWR``, or None when no such span is found.
    """
    # The capture is non-greedy so it stops at the first "]</span>"; a
    # greedy ".*" would run on to the last "]</span>" on the same line.
    regex = r"<span id=['\"]pronWR['\"].+?>\[(.*?)\]</span>"
    return getFirstMatch(content, regex)
def getAudio1InContent(content):
    """Return the MPEG ``src`` of the ``aud0`` audio element in ``content``.

    The result is the captured ``src`` attribute value of the first
    match, or None when the page text contains no matching element.
    """
    audio_source_pattern = r"<audio id=['\"]aud0['\"].+?><source src=['\"](.+?)['\"] type=['\"]audio/mpeg['\"]></audio>"
    return getFirstMatch(content, audio_source_pattern)
def process_input(file):
    """Convert lines of minimal pairs into the ``result.txt`` import file.

    For every line holding exactly two whitespace-separated words, the
    IPA of both words is looked up and the first audio file of each word
    is downloaded. One row is added to the output only when both IPA
    strings and both audio files are available.

    Args:
        file: An iterable of text lines, each expected to hold two words.

    Blank lines are ignored. Lines with any other number of words are
    reported and skipped rather than aborting the whole run. The rows are
    written to ``result.txt`` as UTF-8, because they contain IPA
    characters that a platform default encoding may not represent.
    """
    rows = []
    for line in file:
        words = line.split()
        if not words:
            # Blank line (for example a trailing newline): nothing to do.
            continue
        if len(words) != 2:
            print('Skipping malformed line:', line.strip())
            continue
        word1, word2 = words
        (ipa1, content1) = find_ipa(word1)
        (ipa2, content2) = find_ipa(word2)
        if ipa1 is None or ipa2 is None:
            print('IPA not found for:', word1, 'or', word2)
            continue
        # Download the first accent's audio for each word
        word1Audio = downloadAudio(content1, word1)
        word2Audio = downloadAudio(content2, word2)
        if word1Audio is None or word2Audio is None:
            print('Audio not found for:', word1, 'or', word2)
            continue
        rows.append(makeline(word1, word2, ipa1, ipa2, word1Audio, word2Audio))
    with open('result.txt', 'w', encoding='utf-8') as outFile:
        outFile.write(''.join(rows))
def downloadAudio(content, word):
    """Download the audio referenced in ``content`` for ``word``.

    Returns the local file name the audio was saved under, or None when
    no audio path is found in ``content`` (nothing is downloaded then).
    """
    audio_path = getAudio1InContent(content)
    if audio_path is None:
        return None
    local_name = getFileName(word, audio_path)
    downloadFile(audio_path, local_name)
    return local_name
# Feed the lines supplied via fileinput (stdin in the documented usage)
# to the pair processor.
with fileinput.input() as pair_lines:
    process_input(pair_lines)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment