Last active
December 27, 2020 23:29
-
-
Save rraallvv/909e0e63b3fedc053a63332ba8f28de0 to your computer and use it in GitHub Desktop.
Lingorado IPA phonetic transcription
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import requests | |
from lxml import html | |
import re | |
from random import randint | |
from time import sleep | |
import sys | |
import random | |
# URL that the transcription form data is POSTed to.
base_url = 'http://lingorado.com/ipa/'

# HTTP proxies ("host:port") that requests are rotated through, one per
# attempt.  Kept as a mutable list because the script shuffles it in place.
proxy_list = [
    '89.187.217.114:80',
    '92.126.152.221:8080',
    '35.188.62.145:80',
    '171.255.199.3:3128',
    '178.62.91.24:8118',
    '203.74.4.4:80',
    '61.69.92.106:80',
    '94.177.180.226:80',
    '98.182.126.155:80',
    '203.74.4.0:80',
    '51.254.16.106:8080',
    '219.76.4.72:88',
    '203.74.4.6:80',
    '125.16.128.118:3128',
    '217.15.85.202:8080',
    '24.249.80.22:3128',
    '34.252.130.88:8080',
    '62.214.70.116:3128',
    '89.187.217.116:80',
    '219.76.4.12:88',
]

# Index into proxy_list of the proxy to use for the next request.
current_proxy = 0

# Sample url-encoded form body for the word "impress" (presumably captured
# from the site's own form -- TODO confirm); shows the fields the form has:
# text_to_transcribe=impress&submit=Show+transcription&output_dialect=am&output_style=only_tr&weak_forms=on&preBracket=&postBracket=&speech_support=1
def _clean_notes(notes):
    """Flatten the HTML fragment of a ``*_notes`` string into plain text."""
    cleaned = re.sub(r'\d+\. ', '', notes)          # drop "1. "-style numbering
    cleaned = re.sub(r'(<i>|</i>|:)', '', cleaned)  # drop italics tags and colons
    cleaned = re.sub(r'<br ?/>', ', ', cleaned)     # line breaks become separators
    return re.sub(r' ,', ',', cleaned)              # no space before a comma


def get_transcription(word, timeout=15):
    """Fetch the American-English IPA transcription of *word* from Lingorado.

    The form is POSTed through the proxies in ``proxy_list`` in round-robin
    order (``current_proxy`` is advanced on every attempt).  An attempt that
    fails to connect, times out, or returns a page with neither a
    ``transcribed_word`` nor a ``transcription_missing`` span is retried with
    the next proxy; there is no upper bound on the number of attempts.

    Args:
        word: Text to transcribe.  Surrounding whitespace (such as the
            newline left by iterating over a file) is stripped before sending.
        timeout: Seconds passed to ``requests.post`` as ``timeout`` so that an
            unresponsive proxy cannot block forever.

    Returns:
        * ``'[...]'`` -- the first text node of the transcription span, when
          the span has no link id to look variants up by;
        * the cleaned-up ``<id>_notes`` string from the page when it does;
        * ``"Transcription missing!"`` when the page reports no transcription
          or *word* is blank;
        * ``""`` when interrupted with Ctrl-C during the request or when the
          notes cannot be located in the page (an error is printed then).
    """
    global current_proxy

    text = word.strip()
    if not text:
        # A blank query can never yield either span, which would otherwise
        # make the retry loop below spin forever.
        return "Transcription missing!"

    payload = {
        'text_to_transcribe': text,
        'output_dialect': 'am',
        'output_style': 'only_tr',
        'weak_forms': 'on',
        'speech_support': 1,
        'submit': 'Show+transcription',
    }

    while True:
        proxy_dict = {'http': 'http://' + proxy_list[current_proxy]}
        current_proxy = (current_proxy + 1) % len(proxy_list)

        try:
            response = requests.post(base_url, data=payload,
                                     proxies=proxy_dict, timeout=timeout)
        except KeyboardInterrupt:
            # Lets the caller stop cleanly: an empty result ends the batch.
            return ""
        except requests.exceptions.RequestException:
            # Dead, slow or misbehaving proxy: move on to the next one.
            continue

        page = response.content
        if not page.strip():
            # lxml raises ParserError on an empty document; treat an empty
            # body like any other unusable proxy response.
            continue

        tree = html.fromstring(page)
        spans = tree.xpath("//span[@class='transcribed_word']")
        if not spans:
            if tree.xpath("//span[@class='transcription_missing']"):
                return "Transcription missing!"
            # Neither marker present (e.g. a proxy error page): retry.
            continue
        span = spans[0]

        link_ids = span.xpath(".//a/@id")
        if not link_ids or not link_ids[0]:
            return '[' + span.xpath(".//text()")[0] + ']'

        # The link id has the form "<stem>_<suffix>"; the page's inline
        # script holds the variants in a string assigned to "<stem>_notes".
        notes = None
        stem = re.search(r'(.*)_[^_]+', link_ids[0])
        if stem:
            # Decoded so the str pattern also works where content is bytes.
            # NOTE(review): assumes the page is UTF-8 encoded -- confirm.
            page_text = page.decode('utf-8', 'replace')
            notes = re.search(
                re.escape(stem.group(1)) + r'_notes = "([^"]+)"', page_text)
        if notes:
            return _clean_notes(notes.group(1))

        # print() with one argument behaves the same on Python 2 and 3.
        print('\033[91m' + "Error! Couldn't make transcription." + '\033[0m')
        return ''
# Randomise the proxy order in place so each run starts from a different proxy.
random.shuffle(proxy_list)

# Transcribe words-list.txt line by line, printing "<word> <TAB><transcription>"
# for each one.
with open('words-list.txt') as lines:
    for word in lines:
        if not word.strip():
            # Nothing to transcribe on a blank line.
            continue
        transcription = get_transcription(word)
        if not transcription:
            # An empty result means get_transcription was interrupted or
            # could not extract a transcription; stop the batch.
            break
        # Single-argument print() produces the same output on Python 2 and 3.
        print(word.replace('\n', ' ') + '\t' + transcription)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment