rraallvv · December 27, 2020 23:29
diff --git a/lingorado_ipa_transcription.py b/lingorado_ipa_transcription.py
 #!/usr/bin/python

 import requests
 from lxml import html
 import re
 from random import randint
 from time import sleep
 import sys
 import random

 # Endpoint that receives the transcription form POST.
 base_url = 'http://lingorado.com/ipa/'

 # HTTP proxies written as 'host:port'. get_transcription() walks this list
 # round-robin, using a different entry for every request it sends.
 proxy_list = [
 	'89.187.217.114:80',
 	'92.126.152.221:8080',
 	'35.188.62.145:80',
 	'171.255.199.3:3128',
 	'178.62.91.24:8118',
 	'203.74.4.4:80',
 	'61.69.92.106:80',
 	'94.177.180.226:80',
 	'98.182.126.155:80',
 	'203.74.4.0:80',
 	'51.254.16.106:8080',
 	'219.76.4.72:88',
 	'203.74.4.6:80',
 	'125.16.128.118:3128',
 	'217.15.85.202:8080',
 	'24.249.80.22:3128',
 	'34.252.130.88:8080',
 	'62.214.70.116:3128',
 	'89.187.217.116:80',
 	'219.76.4.12:88',
 ]

 # Index into proxy_list of the proxy the next request will go through.
 current_proxy = 0

 #text_to_transcribe=impress&submit=Show+transcription&output_dialect=am&output_style=only_tr&weak_forms=on&preBracket=&postBracket=&speech_support=1: undefined

 def get_transcription(word, timeout=30):
 	"""Fetch the transcription of ``word`` from the lingorado.com form.
 	The form is POSTed through the proxies in ``proxy_list``, one proxy per
 	attempt in round-robin order (the module-level ``current_proxy`` index is
 	advanced on every attempt). Attempts repeat, without an upper bound,
 	until a response contains a recognisable result.
 	Args:
 		word: Text sent as the ``text_to_transcribe`` form field.
 		timeout: Seconds to wait on a single proxy before moving on to the
 			next one. Without a timeout a dead proxy can block forever.
 	Returns:
 		* ``'[' + text + ']'`` when the result span carries no anchor id;
 		* the cleaned-up contents of the page's ``<id>_notes = "..."``
 		  string when it does (presumably the alternative transcriptions
 		  of the word -- confirm against the site);
 		* ``"Transcription missing!"`` when the page reports no result;
 		* ``""`` when interrupted with Ctrl-C or when the notes string
 		  cannot be found (the caller treats an empty string as "stop").
 	"""
 	global current_proxy
 	payload = {
 		'text_to_transcribe': word,
 		'output_dialect': 'am',
 		'output_style': 'only_tr',
 		'weak_forms': 'on',
 		'speech_support': 1,
 		# requests form-encodes the values itself, so the space is written
 		# literally here; a '+' would go out on the wire as '%2B'.
 		'submit': 'Show transcription',
 	}

 	while True:
 		proxy_dict = {'http': 'http://' + proxy_list[current_proxy]}
 		current_proxy = (current_proxy + 1) % len(proxy_list)
 		try:
 			response = requests.post(base_url, data=payload,
 				proxies=proxy_dict, timeout=timeout)
 		except KeyboardInterrupt:
 			return ""
 		except Exception:
 			# Best effort: any failure of this proxy just means "try the
 			# next one". Unlike a bare except, SystemExit still propagates.
 			continue

 		# NOTE(review): the str patterns searched in `page` below rely on
 		# response.content being a Python 2 byte string.
 		page = response.content
 		if not page.strip():
 			# An empty body cannot be parsed by lxml; treat it as a bad proxy.
 			continue

 		tree = html.fromstring(page)
 		spans = tree.xpath("//span[@class='transcribed_word']")
 		if not spans:
 			if tree.xpath("//span[@class='transcription_missing']"):
 				return "Transcription missing!"
 			# Neither marker present: not a real result page, retry.
 			continue
 		transcribed_word = spans[0]

 		anchor_ids = transcribed_word.xpath(".//a/@id")
 		anchor_id = anchor_ids[0] if anchor_ids else ''
 		if not anchor_id:
 			return '[' + transcribed_word.xpath(".//text()")[0] + ']'

 		# Drop the last '_suffix' of the id to get the name the page uses
 		# for the matching '<name>_notes = "..."' string.
 		prefix = re.search(r'(.*)_[^_]+', anchor_id)
 		if prefix:
 			anchor_id = prefix.group(1)
 		notes = re.search(re.escape(anchor_id) + r'_notes = "([^"]+)"', page)
 		if not notes:
 			# Reported on stderr so it does not mix with the tab-separated
 			# results the caller prints on stdout.
 			sys.stderr.write('\033[91m' + "Error! Couldn't make transcription." + '\033[0m' + '\n')
 			return ''

 		trans = re.sub(r'\d+\. ', '', notes.group(1))    # strip "1. ", "2. " numbering
 		trans = re.sub(r'(<i>|</i>|:)', '', trans)       # strip italics tags and colons
 		trans = re.sub(r'<br ?/>', ', ', trans)          # line breaks become separators
 		trans = re.sub(r' ,', ',', trans)
 		return trans

 # Shuffle once so that every run starts its round-robin from a different proxy.
 random.shuffle(proxy_list)

 # Print one "word<TAB>transcription" line for each entry of words-list.txt.
 with open('words-list.txt') as lines:
 	for word in lines:
 		# A blank line has nothing to transcribe; skip it instead of
 		# sending an empty query to the service.
 		if not word.strip():
 			continue
 		transcription = get_transcription(word)
 		if not transcription:
 			# get_transcription() returns '' when interrupted or when it
 			# could not build a transcription: stop processing the list.
 			break
 		# Parenthesised form is valid under both Python 2 and Python 3.
 		print(word.replace('\n', ' ') + '\t' + transcription)
	#!/usr/bin/python

	import requests
	from lxml import html
	import re
	from random import randint
	from time import sleep
	import sys
	import random

	# Endpoint that receives the transcription form POST.
	base_url = 'http://lingorado.com/ipa/'

	# HTTP proxies written as 'host:port'. get_transcription() walks this list
	# round-robin, using a different entry for every request it sends.
	proxy_list = [
		'89.187.217.114:80',
		'92.126.152.221:8080',
		'35.188.62.145:80',
		'171.255.199.3:3128',
		'178.62.91.24:8118',
		'203.74.4.4:80',
		'61.69.92.106:80',
		'94.177.180.226:80',
		'98.182.126.155:80',
		'203.74.4.0:80',
		'51.254.16.106:8080',
		'219.76.4.72:88',
		'203.74.4.6:80',
		'125.16.128.118:3128',
		'217.15.85.202:8080',
		'24.249.80.22:3128',
		'34.252.130.88:8080',
		'62.214.70.116:3128',
		'89.187.217.116:80',
		'219.76.4.12:88',
	]

	# Index into proxy_list of the proxy the next request will go through.
	current_proxy = 0

	#text_to_transcribe=impress&submit=Show+transcription&output_dialect=am&output_style=only_tr&weak_forms=on&preBracket=&postBracket=&speech_support=1: undefined

	def get_transcription(word, timeout=30):
		"""Fetch the transcription of ``word`` from the lingorado.com form.
		The form is POSTed through the proxies in ``proxy_list``, one proxy per
		attempt in round-robin order (the module-level ``current_proxy`` index is
		advanced on every attempt). Attempts repeat, without an upper bound,
		until a response contains a recognisable result.
		Args:
			word: Text sent as the ``text_to_transcribe`` form field.
			timeout: Seconds to wait on a single proxy before moving on to the
				next one. Without a timeout a dead proxy can block forever.
		Returns:
			* ``'[' + text + ']'`` when the result span carries no anchor id;
			* the cleaned-up contents of the page's ``<id>_notes = "..."``
			  string when it does (presumably the alternative transcriptions
			  of the word -- confirm against the site);
			* ``"Transcription missing!"`` when the page reports no result;
			* ``""`` when interrupted with Ctrl-C or when the notes string
			  cannot be found (the caller treats an empty string as "stop").
		"""
		global current_proxy
		payload = {
			'text_to_transcribe': word,
			'output_dialect': 'am',
			'output_style': 'only_tr',
			'weak_forms': 'on',
			'speech_support': 1,
			# requests form-encodes the values itself, so the space is written
			# literally here; a '+' would go out on the wire as '%2B'.
			'submit': 'Show transcription',
		}

		while True:
			proxy_dict = {'http': 'http://' + proxy_list[current_proxy]}
			current_proxy = (current_proxy + 1) % len(proxy_list)
			try:
				response = requests.post(base_url, data=payload,
					proxies=proxy_dict, timeout=timeout)
			except KeyboardInterrupt:
				return ""
			except Exception:
				# Best effort: any failure of this proxy just means "try the
				# next one". Unlike a bare except, SystemExit still propagates.
				continue

			# NOTE(review): the str patterns searched in `page` below rely on
			# response.content being a Python 2 byte string.
			page = response.content
			if not page.strip():
				# An empty body cannot be parsed by lxml; treat it as a bad proxy.
				continue

			tree = html.fromstring(page)
			spans = tree.xpath("//span[@class='transcribed_word']")
			if not spans:
				if tree.xpath("//span[@class='transcription_missing']"):
					return "Transcription missing!"
				# Neither marker present: not a real result page, retry.
				continue
			transcribed_word = spans[0]

			anchor_ids = transcribed_word.xpath(".//a/@id")
			anchor_id = anchor_ids[0] if anchor_ids else ''
			if not anchor_id:
				return '[' + transcribed_word.xpath(".//text()")[0] + ']'

			# Drop the last '_suffix' of the id to get the name the page uses
			# for the matching '<name>_notes = "..."' string.
			prefix = re.search(r'(.*)_[^_]+', anchor_id)
			if prefix:
				anchor_id = prefix.group(1)
			notes = re.search(re.escape(anchor_id) + r'_notes = "([^"]+)"', page)
			if not notes:
				# Reported on stderr so it does not mix with the tab-separated
				# results the caller prints on stdout.
				sys.stderr.write('\033[91m' + "Error! Couldn't make transcription." + '\033[0m' + '\n')
				return ''

			trans = re.sub(r'\d+\. ', '', notes.group(1))    # strip "1. ", "2. " numbering
			# Unescaped '|' so the three alternatives are matched separately;
			# '\|' would only match a literal pipe character.
			trans = re.sub(r'(<i>|</i>|:)', '', trans)       # strip italics tags and colons
			trans = re.sub(r'<br ?/>', ', ', trans)          # line breaks become separators
			trans = re.sub(r' ,', ',', trans)
			return trans

	# Shuffle once so that every run starts its round-robin from a different proxy.
	random.shuffle(proxy_list)

	# Print one "word<TAB>transcription" line for each entry of words-list.txt.
	with open('words-list.txt') as lines:
		for word in lines:
			# A blank line has nothing to transcribe; skip it instead of
			# sending an empty query to the service.
			if not word.strip():
				continue
			transcription = get_transcription(word)
			if not transcription:
				# get_transcription() returns '' when interrupted or when it
				# could not build a transcription: stop processing the list.
				break
			# Parenthesised form is valid under both Python 2 and Python 3.
			print(word.replace('\n', ' ') + '\t' + transcription)