Skip to content

Instantly share code, notes, and snippets.

View luismond's full-sized avatar
🎯
Focusing

Luis Mondragón luismond

🎯
Focusing
View GitHub Profile
@luismond
luismond / Sentiment Analysis - Inspecting important words from simple models.ipynb
Created March 22, 2019 16:21
Sentiment Analysis - Inspecting important words from simple models
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# Get bilingual data from the European Commission translation memories.
# https://ec.europa.eu/jrc/en/language-technologies/dgt-translation-memory#More%20details%20/%20Reference%20publication
# I needed to extract only the EN-ES bilingual data from the TMX files for my machine translation experiment.
# Their Java TM exporter was not working on my side,
# so I wrote this script to get the data.
import xmltodict
import pandas as pd
import os
import stanfordnlp
# Directory holding the stanfordnlp model files (hard-coded, machine-specific Windows path).
MODELS_DIR = 'C:\\Users\\user\\stanfordnlp_resources\\'
# Spanish pipeline restricted to the tokenize, pos and lemma processors.
# It is built once at module level and called by get_lemmas.
nlp = stanfordnlp.Pipeline(processors='tokenize,pos,lemma', models_dir=MODELS_DIR, lang='es')
def get_lemmas(line):
    """Return the lemmas of the adverbs, adjectives and verbs found in ``line``.

    The text is run through the module-level ``nlp`` pipeline. The lemma of
    every word whose ``pos`` is 'ADV', 'ADJ' or 'VERB' is kept, in document
    order, and the lemmas are joined with single spaces.
    """
    wanted_tags = ('ADV', 'ADJ', 'VERB')
    doc = nlp(line)
    lemmas = []
    for sentence in doc.sentences:
        for word in sentence.words:
            if word.pos in wanted_tags:
                lemmas.append(word.lemma)
    return ' '.join(lemmas)
@luismond
luismond / strip_punctuation.py
Created March 13, 2019 19:51
Strip punctuation function
def strip_punct(line):
    """Return ``line`` lowercased, with every non-letter character blanked out.

    ``line`` is first converted with ``str()``. Every character that is not
    alphabetic (``str.isalpha``) and is not a plain space is replaced by a
    single space, so the result has the same length as the input text. The
    result is then lowercased.

    Lowercasing happens once, after all replacements: doing it inside the
    replacement loop left punctuation-free input in its original case and
    made the outcome depend on set iteration order.
    """
    text = str(line)
    # One translation table for all offending characters: a single pass
    # instead of one str.replace call per distinct character.
    blank_out = {
        ord(ch): ' '
        for ch in set(text)
        if not ch.isalpha() and ch != ' '
    }
    return text.translate(blank_out).lower()
from faker import Faker
from translate import Translator
# Print ten fake job titles, each passed through a translator targeting Spanish.
fake = Faker('es_MX')
translator = Translator(to_lang="es")
for n in range(10):
    job_title = fake.job()
    print(translator.translate(job_title))
'''
from faker import Faker
fake = Faker('es_MX')
for n in range(10):
print(fake.job())
'''
Geologist, wellsite
Sports development officer
Telecommunications researcher
@luismond
luismond / faker_es.py
Created March 10, 2019 01:56
Faker test for Spanish locales
from faker import Faker
# Print ten fake personal names generated with the 'es_MX' locale.
fake = Faker('es_MX')
for n in range(10):
    full_name = fake.name()
    print(full_name)
'''
Humberto Menchaca Berríos
Lic. Irma Menchaca
Elisa Barrera
@luismond
luismond / download_vtt.py
Created March 10, 2019 00:59
Downloads .vtt subtitle file from youtube video
import youtube_dl
# Build youtube_dl options for the given subtitle language and open a
# YoutubeDL session with them.
# NOTE(review): the body of the `with` block is not visible in this excerpt,
# so what is done with `url` cannot be confirmed from here.
def download_vtt(url,lang):
# Options handed to youtube_dl.YoutubeDL below.
ydl_opts = {
'quiet': True,
# Restrict subtitles to the single requested language.
'subtitleslangs': [lang],
# NOTE(review): 'yes' is a non-empty string, presumably relied on as truthy;
# booleans would be clearer -- confirm against the youtube_dl option docs.
'writeautomaticsub': 'yes',
'skip_download': 'yes'
}
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
@luismond
luismond / save_author_names.py
Created March 10, 2019 00:49
save reddit author names
def save_author_names():
    """Write the author name of each fetched subreddit comment to a text file.

    Reads the module-level ``reddit`` client and ``sub`` subreddit name,
    fetches comments with ``limit=None`` and writes one author name per line
    to ``authors_mexico_<time.time()>.txt`` (UTF-8) in the working directory.

    Comments whose ``author`` is ``None`` are skipped. Assuming ``reddit`` is
    a PRAW client, that is how comments from deleted accounts are reported,
    and accessing ``.name`` on them would raise ``AttributeError``.
    """
    authors = [
        cmnt.author.name
        for cmnt in reddit.subreddit(sub).comments(limit=None)
        if cmnt.author is not None
    ]
    out_path = 'authors' + '_mexico_' + str(time.time()) + '.txt'
    with open(out_path, 'w', encoding='utf8') as f:
        # Single batched write instead of one write call per name.
        f.writelines(name + '\n' for name in authors)