xavvvier · October 31, 2019 22:24
diff --git a/anki_importer_generator.py b/anki_importer_generator.py
 #execute as: python3 anki_importer_generator.py < input.txt
 #where input.txt lists one minimal pair per line, the two words separated by a space:
 #it eat
 #bit beat

 import fileinput
 import urllib.request
 import re

 def makeline(word1, word2, ipaword1, ipaword2, file1, file2):
    """Build one tab-separated output row for a pair of words.

    Each word contributes three fields, in order: the word itself, a
    ``[sound:<file>]`` tag naming its audio file, and its IPA
    transcription. The six fields are joined with tabs and the row is
    terminated by a newline.
    """
    fields = (
        word1, '[sound:' + file1 + ']', ipaword1,
        word2, '[sound:' + file2 + ']', ipaword2,
    )
    return '\t'.join(fields) + '\n'

 def getContent(word):
    """Fetch the WordReference definition page for ``word``.

    Requests ``https://www.wordreference.com/definition/<word>`` with a
    browser-like User-Agent header and returns the response body decoded
    as UTF-8.

    Raises whatever ``urllib.request.urlopen`` raises on network or HTTP
    errors, and ``UnicodeDecodeError`` if the body is not valid UTF-8.
    """
    # Imported locally so this function does not depend on the file's
    # top-level import list.
    from urllib.parse import quote

    # Percent-encode the word so non-ASCII or reserved characters still
    # produce a valid URL.
    url = 'https://www.wordreference.com/definition/' + quote(word)
    req = urllib.request.Request(
        url,
        data=None,
        headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
        },
    )
    # Use the response as a context manager so the connection is closed
    # even if reading or decoding fails.
    with urllib.request.urlopen(req) as response:
        return response.read().decode('utf-8')


 def find_ipa(word):
    """Download the page for ``word`` and extract its IPA transcription.

    Returns an ``(ipa, content)`` tuple: ``ipa`` is the result of
    ``getIPAInContent`` on the page (``None`` when nothing matches) and
    ``content`` is the page text from ``getContent``, handed back so the
    caller can reuse it without a second request.
    """
    page = getContent(word)
    return (getIPAInContent(page), page)

 def getFileName(word, path):
    """Derive a local file name for an audio file from its URL path.

    ``path`` is split on ``/``; the pieces at indexes 2 and 3 are used as
    the language and accent, and the extension is taken from the last
    piece. For example, word ``it`` with path
    ``/audio/en/us/us/en044786.mp3`` gives ``it-en-us.mp3``.

    Raises ``IndexError`` if ``path`` has fewer than four
    ``/``-separated pieces.
    """
    chunks = path.split('/')
    language = chunks[2]
    accent = chunks[3]
    # rpartition yields an empty separator when the last piece has no dot;
    # slicing from rfind()'s -1 would instead return the final character
    # as a bogus extension.
    _, dot, suffix = chunks[-1].rpartition('.')
    extension = dot + suffix if dot else ''
    return word + '-' + language + '-' + accent + extension

 def downloadFile(path, fileName):
    """Download ``path`` from www.wordreference.com into ``fileName``.

    ``path`` is appended to ``https://www.wordreference.com``; the whole
    response body is read and written to ``fileName`` in binary mode.
    """
    target = "https://www.wordreference.com" + path
    with urllib.request.urlopen(target) as remote:
        with open(fileName, 'wb') as sink:
            # read() returns the complete body as a bytes object
            sink.write(remote.read())

 def getFirstMatch(content, regex):
    """Return the first ``re.findall`` result for ``regex`` in ``content``.

    The value has whatever shape ``re.findall`` produces for the pattern
    (the captured text for a single-group pattern). Returns ``None`` when
    the pattern does not match anywhere.
    """
    found = re.findall(regex, content)
    return found[0] if found else None
    
 def getIPAInContent(content):
    """Extract the IPA transcription from a definition page.

    Looks for a ``<span>`` whose id is ``pronWR`` and returns the text
    between the square brackets inside it, or ``None`` if the page has no
    such span.
    """
    # The capture is non-greedy so it stops at the first ``]</span>``; a
    # greedy ``.*`` would run on to the last ``]</span>`` on the same line.
    regex = r"<span id=['\"]pronWR['\"].+?>\[(.*?)\]</span>"
    return getFirstMatch(content, regex)

 def getAudio1InContent(content):
    """Find the source path of the page's ``aud0`` audio element.

    Returns the ``src`` value of the ``audio/mpeg`` ``<source>`` that
    directly follows the opening ``<audio>`` tag whose id is ``aud0``, or
    ``None`` when the page has no such element.
    """
    return getFirstMatch(
        content,
        r"<audio id=['\"]aud0['\"].+?><source src=['\"](.+?)['\"] type=['\"]audio/mpeg['\"]></audio>",
    )

 def process_input(file):
    """Generate ``result.txt`` from lines of word pairs.

    ``file`` is an iterable of text lines, each holding two
    whitespace-separated words. For every pair the IPA of both words is
    looked up and the first audio file of each is downloaded; when all
    four are available a row built by ``makeline`` is added to the
    output. Blank lines are ignored; malformed lines and pairs with
    missing IPA or audio are reported on stdout and skipped. The rows are
    written to ``result.txt`` after all lines have been processed.
    """
    rows = []
    for line in file:
        words = line.split()
        if not words:
            # Blank lines (e.g. a trailing newline) carry no pair.
            continue
        if len(words) != 2:
            print('Expected two words, skipping line:', line.strip())
            continue
        word1, word2 = words
        ipa1, content1 = find_ipa(word1)
        ipa2, content2 = find_ipa(word2)
        if ipa1 is None or ipa2 is None:
            print('IPA not found for:', word1, 'or', word2)
            continue
        # Download the first accent's audio for each word.
        word1Audio = downloadAudio(content1, word1)
        word2Audio = downloadAudio(content2, word2)
        if word1Audio is None or word2Audio is None:
            # A row is only emitted when both audio files exist; otherwise
            # a stale row from the previous pair would be appended again
            # (or an unbound name raised on the very first pair).
            print('Audio not found for:', word1, 'or', word2)
            continue
        rows.append(makeline(word1, word2, ipa1, ipa2, word1Audio, word2Audio))
    # IPA symbols are non-ASCII, so do not rely on the platform's default
    # encoding for the output file.
    with open('result.txt', 'w', encoding='utf-8') as outFile:
        outFile.write(''.join(rows))

 def downloadAudio(content, word):
    """Download the audio referenced in ``content`` for ``word``.

    Returns the local file name the audio was saved under, or ``None``
    (without downloading anything) when ``getAudio1InContent`` finds no
    audio path in the page.
    """
    audio_path = getAudio1InContent(content)
    if audio_path is None:
        return None
    saved_as = getFileName(word, audio_path)
    downloadFile(audio_path, saved_as)
    return saved_as


 # Script entry: feed the lines from stdin (or from files named on the
 # command line) to process_input. The handle is not called `input` so the
 # builtin of that name is not shadowed.
 with fileinput.input() as lines:
    process_input(lines)
	#execute as: python3 anki_importer_generator.py < input.txt
	#where input.txt lists one minimal pair per line, the two words separated by a space:
	#it eat
	#bit beat

	import fileinput
	import urllib.request
	import re

	def makeline(word1, word2, ipaword1, ipaword2, file1, file2):
	    """Build one tab-separated output row for a pair of words.

	    Each word contributes three fields, in order: the word itself, a
	    ``[sound:<file>]`` tag naming its audio file, and its IPA
	    transcription. The six fields are joined with tabs and the row is
	    terminated by a newline.
	    """
	    fields = (
	        word1, '[sound:' + file1 + ']', ipaword1,
	        word2, '[sound:' + file2 + ']', ipaword2,
	    )
	    return '\t'.join(fields) + '\n'

	def getContent(word):
	    """Fetch the WordReference definition page for ``word``.

	    Requests ``https://www.wordreference.com/definition/<word>`` with a
	    browser-like User-Agent header and returns the response body decoded
	    as UTF-8.

	    Raises whatever ``urllib.request.urlopen`` raises on network or HTTP
	    errors, and ``UnicodeDecodeError`` if the body is not valid UTF-8.
	    """
	    # Imported locally so this function does not depend on the file's
	    # top-level import list.
	    from urllib.parse import quote

	    # Percent-encode the word so non-ASCII or reserved characters still
	    # produce a valid URL.
	    url = 'https://www.wordreference.com/definition/' + quote(word)
	    req = urllib.request.Request(
	        url,
	        data=None,
	        headers={
	            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
	        },
	    )
	    # Use the response as a context manager so the connection is closed
	    # even if reading or decoding fails.
	    with urllib.request.urlopen(req) as response:
	        return response.read().decode('utf-8')


	def find_ipa(word):
	    """Download the page for ``word`` and extract its IPA transcription.

	    Returns an ``(ipa, content)`` tuple: ``ipa`` is the result of
	    ``getIPAInContent`` on the page (``None`` when nothing matches) and
	    ``content`` is the page text from ``getContent``, handed back so the
	    caller can reuse it without a second request.
	    """
	    content = getContent(word)
	    ipa = getIPAInContent(content)
	    return (ipa, content)

	def getFileName(word, path):
	    """Derive a local file name for an audio file from its URL path.

	    ``path`` is split on ``/``; the pieces at indexes 2 and 3 are used as
	    the language and accent, and the extension is taken from the last
	    piece. For example, word ``it`` with path
	    ``/audio/en/us/us/en044786.mp3`` gives ``it-en-us.mp3``.

	    Raises ``IndexError`` if ``path`` has fewer than four
	    ``/``-separated pieces.
	    """
	    chunks = path.split('/')
	    language = chunks[2]
	    accent = chunks[3]
	    # rpartition yields an empty separator when the last piece has no dot;
	    # slicing from rfind()'s -1 would instead return the final character
	    # as a bogus extension.
	    _, dot, suffix = chunks[-1].rpartition('.')
	    extension = dot + suffix if dot else ''
	    return word + '-' + language + '-' + accent + extension

	def downloadFile(path, fileName):
	    """Download ``path`` from www.wordreference.com into ``fileName``.

	    ``path`` is appended to ``https://www.wordreference.com``; the whole
	    response body is read and written to ``fileName`` in binary mode.
	    """
	    url = "https://www.wordreference.com" + path
	    with urllib.request.urlopen(url) as response, open(fileName, 'wb') as outFile:
	        # read() returns the complete body as a bytes object
	        outFile.write(response.read())

	def getFirstMatch(content, regex):
	    """Return the first ``re.findall`` result for ``regex`` in ``content``.

	    The value has whatever shape ``re.findall`` produces for the pattern
	    (the captured text for a single-group pattern). Returns ``None`` when
	    the pattern does not match anywhere.
	    """
	    matches = re.findall(regex, content)
	    if matches:
	        return matches[0]
	    return None

	def getIPAInContent(content):
	    """Extract the IPA transcription from a definition page.

	    Looks for a ``<span>`` whose id is ``pronWR`` and returns the text
	    between the square brackets inside it, or ``None`` if the page has no
	    such span.
	    """
	    # The capture is non-greedy so it stops at the first ``]</span>``; a
	    # greedy ``.*`` would run on to the last ``]</span>`` on the same line.
	    regex = r"<span id=['\"]pronWR['\"].+?>\[(.*?)\]</span>"
	    return getFirstMatch(content, regex)

	def getAudio1InContent(content):
	    """Find the source path of the page's ``aud0`` audio element.

	    Returns the ``src`` value of the ``audio/mpeg`` ``<source>`` that
	    directly follows the opening ``<audio>`` tag whose id is ``aud0``, or
	    ``None`` when the page has no such element.
	    """
	    regex = r"<audio id=['\"]aud0['\"].+?><source src=['\"](.+?)['\"] type=['\"]audio/mpeg['\"]></audio>"
	    return getFirstMatch(content, regex)

	def process_input(file):
	    """Generate ``result.txt`` from lines of word pairs.

	    ``file`` is an iterable of text lines, each holding two
	    whitespace-separated words. For every pair the IPA of both words is
	    looked up and the first audio file of each is downloaded; when all
	    four are available a row built by ``makeline`` is added to the
	    output. Blank lines are ignored; malformed lines and pairs with
	    missing IPA or audio are reported on stdout and skipped. The rows are
	    written to ``result.txt`` after all lines have been processed.
	    """
	    rows = []
	    for line in file:
	        words = line.split()
	        if not words:
	            # Blank lines (e.g. a trailing newline) carry no pair.
	            continue
	        if len(words) != 2:
	            print('Expected two words, skipping line:', line.strip())
	            continue
	        word1, word2 = words
	        ipa1, content1 = find_ipa(word1)
	        ipa2, content2 = find_ipa(word2)
	        if ipa1 is None or ipa2 is None:
	            print('IPA not found for:', word1, 'or', word2)
	            continue
	        # Download the first accent's audio for each word.
	        word1Audio = downloadAudio(content1, word1)
	        word2Audio = downloadAudio(content2, word2)
	        if word1Audio is None or word2Audio is None:
	            # A row is only emitted when both audio files exist, so a
	            # pair without audio never reuses another pair's row.
	            print('Audio not found for:', word1, 'or', word2)
	            continue
	        rows.append(makeline(word1, word2, ipa1, ipa2, word1Audio, word2Audio))
	    # IPA symbols are non-ASCII, so do not rely on the platform's default
	    # encoding for the output file.
	    with open('result.txt', 'w', encoding='utf-8') as outFile:
	        outFile.write(''.join(rows))

	def downloadAudio(content, word):
	    """Download the audio referenced in ``content`` for ``word``.

	    Returns the local file name the audio was saved under, or ``None``
	    (without downloading anything) when ``getAudio1InContent`` finds no
	    audio path in the page.
	    """
	    path = getAudio1InContent(content)
	    if path is None:
	        return None
	    fileName = getFileName(word, path)
	    downloadFile(path, fileName)
	    return fileName


	# Script entry: feed the lines from stdin (or from files named on the
	# command line) to process_input. The handle is not called `input` so the
	# builtin of that name is not shadowed.
	with fileinput.input() as lines:
	    process_input(lines)