Skip to content

Instantly share code, notes, and snippets.

@Siedlerchr
Created October 14, 2017 15:42
Show Gist options
  • Save Siedlerchr/5017e15e53ab04d08141522d41b471b4 to your computer and use it in GitHub Desktop.
Save Siedlerchr/5017e15e53ab04d08141522d41b471b4 to your computer and use it in GitHub Desktop.
import codecs
import os
from nltk import word_tokenize
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.corpus import stopwords
from googletrans import Translator
from PyDictionary import PyDictionary
from googletrans import Translator
# Script: tokenize every ``.txt`` file found under ``corpusdir``, drop short,
# numeric, non-alphabetic and stop-word tokens, then print the surviving
# tokens followed by their English translation from googletrans.

# Single googletrans client, reused for every request made by this script.
translator = Translator()
# Sample request; prints the translation result for a Korean greeting.
print(translator.translate('안녕하세요.'))

corpusdir = 'test3/'
listaFicheros = PlaintextCorpusReader(corpusdir, '.*')
# The file ids yielded below are opened as paths relative to the corpus
# directory, so make it the working directory.
os.chdir(corpusdir)

# Union of the NLTK stop-word lists of every available language. It is built
# once up front instead of once per file; filtering against the union removes
# exactly the tokens that filtering against each language in turn would.
stopwords_set = set()
for language in stopwords.fileids():
    stopwords_set.update(stopwords.words(language))

for ficheros in listaFicheros.fileids():
    # Match on the extension; a plain substring test would also accept names
    # such as "txt_notes.csv" or "a.txt.bak".
    if not ficheros.endswith('.txt'):
        continue

    # The context manager closes the file even if reading raises.
    with codecs.open(ficheros, 'r', 'utf-8') as texto:
        contenido = texto.read()

    # NOTE(review): tokens are compared to the stop-word lists as-is (no
    # lower-casing), so capitalised stop words are presumably kept - confirm
    # whether that is intended.
    tokens = [
        palabra
        for palabra in word_tokenize(contenido)
        if len(palabra) > 1                 # drop single-character tokens
        and not palabra.isnumeric()         # drop numbers
        and palabra.isalpha()               # drop non-alphabetic tokens
        and palabra not in stopwords_set    # drop stop words
    ]
    print(tokens)

    # googletrans is handed the whole token list in a single call.
    traduccion = translator.translate(tokens, dest='en')
    print(traduccion)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment