Skip to content

Instantly share code, notes, and snippets.

@xavvvier
Last active October 31, 2019 22:24
Show Gist options
  • Save xavvvier/b05db94c6bc3a6a1fee9d320a1ffcb91 to your computer and use it in GitHub Desktop.
Save xavvvier/b05db94c6bc3a6a1fee9d320a1ffcb91 to your computer and use it in GitHub Desktop.
Generates a text file ready to import into Anki from a list of minimal pairs, and downloads the matching audio files from wordreference.com.
#execute as: python3 anki_importer_generator.py < input.txt
#where input.txt lists one minimal pair per line, the two words separated by a space:
#it eat
#bit beat
import fileinput
import urllib.request
import re
def makeline(word1, word2, ipaword1, ipaword2, file1, file2):
    """Build one tab-separated, newline-terminated row for a word pair.

    The columns are, in order: word1, a ``[sound:file1]`` tag, ipaword1,
    word2, a ``[sound:file2]`` tag and ipaword2.
    """
    columns = (
        word1,
        '[sound:' + file1 + ']',
        ipaword1,
        word2,
        '[sound:' + file2 + ']',
        ipaword2,
    )
    return '\t'.join(columns) + '\n'
def getContent(word):
    """Fetch the wordreference.com definition page for *word*.

    Returns the response body decoded as UTF-8 text. Errors raised by
    urllib (e.g. for an unreachable host or an HTTP error status) are not
    caught here and propagate to the caller.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote

    # Percent-encode the word so spaces or non-ASCII characters still
    # produce a valid URL instead of failing inside the HTTP client.
    url = 'https://www.wordreference.com/definition/' + quote(word, safe='')
    req = urllib.request.Request(
        url,
        data=None,
        headers={
            # Browser-like agent string; presumably the site rejects the
            # default urllib agent -- TODO confirm.
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
        }
    )
    # The context manager closes the connection even if reading/decoding fails.
    with urllib.request.urlopen(req) as response:
        return response.read().decode('utf-8')
def find_ipa(word):
    """Look up *word* online and return the pair ``(ipa, page)``.

    ``page`` is the text returned by getContent for the word and ``ipa`` is
    whatever getIPAInContent extracts from that page.
    """
    page = getContent(word)
    return (getIPAInContent(page), page)
def getFileName(word, path):
    """Derive a local audio file name from *word* and a '/'-separated *path*.

    The result is ``<word>-<language>-<accent><extension>`` where language
    and accent are the segments at index 2 and 3 of ``path.split('/')`` and
    the extension (dot included) is taken from the last segment. When the
    last segment contains no dot the extension is empty.

    Raises IndexError if *path* has fewer than four segments.
    """
    segments = path.split('/')
    language = segments[2]
    accent = segments[3]
    last = segments[-1]
    dot = last.rfind('.')
    # rfind returns -1 when there is no dot; slicing from -1 would wrongly
    # keep the final character of the name as its "extension".
    extension = last[dot:] if dot != -1 else ''
    return word + '-' + language + '-' + accent + extension
def downloadFile(path, fileName):
    """Download the site-relative *path* from wordreference.com into *fileName*.

    The whole response body is read and written to the file in binary mode.
    """
    source = "https://www.wordreference.com" + path
    with urllib.request.urlopen(source) as remote, open(fileName, 'wb') as target:
        target.write(remote.read())
def getFirstMatch(content, regex):
    """Return the first ``re.findall`` result of *regex* in *content*.

    Returns None when the pattern does not match anywhere.
    """
    found = re.findall(regex, content)
    return found[0] if found else None
def getIPAInContent(content):
    """Extract the bracketed pronunciation from a definition page.

    Looks for a ``<span id="pronWR" ...>[...]</span>`` element in *content*
    and returns the text between the square brackets, or None when no such
    element is found.
    """
    # The capture is lazy so it stops at the first "]</span>"; a greedy
    # ".*" would run to the last one on the line and swallow markup.
    regex = r"<span id=['\"]pronWR['\"].+?>\[(.*?)\]</span>"
    return getFirstMatch(content, regex)
def getAudio1InContent(content):
    """Return the ``src`` of the ``<audio id="aud0">`` MPEG source in *content*.

    Returns None when the page has no matching audio element.
    """
    # Same pattern as a single literal, split per tag for readability.
    pattern = (
        r"<audio id=['\"]aud0['\"].+?>"
        r"<source src=['\"](.+?)['\"] type=['\"]audio/mpeg['\"]>"
        r"</audio>"
    )
    return getFirstMatch(content, pattern)
def process_input(file):
    """Turn lines of word pairs into 'result.txt', downloading their audio.

    *file* is any iterable of text lines. Each line is expected to hold two
    whitespace-separated words. For every pair the IPA and the audio file of
    both words are looked up; a pair is written to the output only when all
    four were found. Blank lines are ignored and lines that do not contain
    exactly two words are reported and skipped.

    The output file 'result.txt' is written in the current directory, one
    makeline() row per successful pair.
    """
    rows = []
    for line in file:
        words = line.split()
        if not words:
            # Blank line (e.g. a trailing newline at the end of the input).
            continue
        if len(words) != 2:
            print('Skipping malformed line (expected two words):', line.strip())
            continue
        word1, word2 = words
        ipa1, content1 = find_ipa(word1)
        ipa2, content2 = find_ipa(word2)
        if ipa1 is None or ipa2 is None:
            print('IPA not found for:', word1, 'or', word2)
            continue
        # Download the audio for each word of the pair
        word1Audio = downloadAudio(content1, word1)
        word2Audio = downloadAudio(content2, word2)
        if word1Audio is None or word2Audio is None:
            # Report the dropped pair instead of skipping it silently.
            print('Audio not found for:', word1, 'or', word2)
            continue
        rows.append(makeline(word1, word2, ipa1, ipa2, word1Audio, word2Audio))
    # IPA symbols are non-ASCII: write UTF-8 explicitly so the result does
    # not depend on the platform's default encoding.
    with open('result.txt', 'w', encoding='utf-8') as outFile:
        outFile.write(''.join(rows))
def downloadAudio(content, word):
    """Download the audio referenced in *content* and return its local name.

    The audio path is taken from getAudio1InContent; when the page has
    none, nothing is downloaded and None is returned.
    """
    path = getAudio1InContent(content)
    if path is None:
        return None
    fileName = getFileName(word, path)
    downloadFile(path, fileName)
    return fileName
# Script entry point: feed stdin (or the files named on the command line)
# to process_input. Guarded so importing this module does not consume stdin.
if __name__ == '__main__':
    # Named `lines` rather than `input` to avoid shadowing the builtin.
    with fileinput.input() as lines:
        process_input(lines)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment