Created
April 6, 2018 08:21
-
-
Save ishideo/3e77e32515dbe3b4bcaa331b3f39a66d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import sys | |
import re | |
import codecs | |
import time | |
import msmt | |
from BeautifulSoup import BeautifulStoneSoup | |
#sys.stdout = codecs.getwriter('utf-8')(sys.stdout) | |
def main():
    """Translate the first tab-separated field of every line in ./out.txt.

    Requests an access token with msmt.get_access_token, then for each
    input line passes the text before the first tab to msmt.translate
    together with the language arguments "fr" and "en" (which of the two
    is the source and which the target is defined by msmt -- TODO confirm).
    The response is parsed with BeautifulStoneSoup, anything that looks
    like a tag is removed, and the remaining text is printed.

    Lines whose first field is empty (blank lines, or lines starting with
    a tab) are skipped instead of being sent to the service.
    """
    token = msmt.get_access_token('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx', 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx')
    # Compiled once; the same pattern is applied to every response.
    tag_pattern = re.compile(r'<[^>]*>')
    source_path = './out.txt'
    count = 1
    # The with-block guarantees the input file is closed, even if a
    # translation request raises part-way through.
    with open(source_path, 'r') as source:
        for line in source:
            # Iterated lines always carry their terminator, so strip it
            # before splitting; otherwise a line without a tab would be
            # submitted with a trailing newline.
            source_text = line.rstrip('\r\n').split('\t')[0]
            if not source_text:
                continue
            # Pause for 80 seconds before every 4000th request
            # (presumably to stay under a service quota -- TODO confirm).
            if count % 4000 == 0:
                time.sleep(80)
            get_xml = msmt.translate(token, source_text, "fr", "en")
            soup = BeautifulStoneSoup(get_xml, convertEntities=BeautifulStoneSoup.XML_ENTITIES)
            text = tag_pattern.sub('', str(soup))
            # Parenthesised single-argument print behaves the same under
            # the Python 2 print statement and the Python 3 function.
            print(text)
            count += 1
# Run the translation job only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment