Created
October 14, 2017 15:42
-
-
Save Siedlerchr/5017e15e53ab04d08141522d41b471b4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import codecs | |
import os | |
from nltk import word_tokenize | |
from nltk.corpus.reader.plaintext import PlaintextCorpusReader | |
from nltk.corpus import stopwords | |
from googletrans import Translator | |
from PyDictionary import PyDictionary | |
from googletrans import Translator | |
# Script: tokenize every ``.txt`` file in the corpus directory, drop short,
# non-alphabetic and stop-word tokens, then translate what is left to English.

# A single translator client is shared by the smoke test and the main loop.
translator = Translator()
print(translator.translate('안녕하세요.'))

corpusdir = 'test3/'
listaFicheros = PlaintextCorpusReader(corpusdir, '.*')
# File ids returned by the reader are relative to the corpus root, so the
# working directory is switched there before the files are opened by name.
os.chdir(corpusdir)

# Union of the stop words of every language shipped with the NLTK corpus.
# Built once up front: it does not depend on the file being processed, and
# filtering against the union is equivalent to filtering language by language.
stopwords_set = set()
for language in stopwords.fileids():
    stopwords_set.update(stopwords.words(language))

for ficheros in listaFicheros.fileids():
    # Only plain-text files are processed; match on the extension rather than
    # on any occurrence of "txt" somewhere in the file id.
    if not ficheros.endswith('.txt'):
        continue

    # The context manager guarantees the handle is closed even when
    # tokenizing or translating raises.
    with codecs.open(ficheros, 'r', 'utf-8') as texto:
        tokens = word_tokenize(texto.read())

    # Keep tokens that are longer than one character, purely alphabetic
    # (which also excludes numbers) and not a stop word in any language.
    tokens = [
        palabra for palabra in tokens
        if len(palabra) > 1
        and palabra.isalpha()
        and palabra not in stopwords_set
    ]
    print(tokens)

    traduccion = translator.translate(tokens, dest='en')
    print(traduccion)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment