Skip to content

Instantly share code, notes, and snippets.

View luismond's full-sized avatar
🎯
Focusing

Luis Mondragón luismond

🎯
Focusing
View GitHub Profile
from apiclient.discovery import build
# Build a client object for version 'v2' of the 'translate' API.
# NOTE(review): developerKey is an empty string here — presumably a placeholder for a real API key; confirm.
service = build('translate', 'v2', developerKey='')
@luismond
luismond / save_author_names.py
Created March 10, 2019 00:49
save reddit author names
def save_author_names():
    """Save the author name of each comment in the subreddit to a text file.

    Relies on names defined outside this function: ``reddit`` (presumably a
    PRAW ``Reddit`` instance -- confirm), ``sub`` (the subreddit name) and the
    ``time`` module.

    The names are written one per line, UTF-8 encoded, to a file named
    ``authors_mexico_<unix timestamp>.txt`` in the current directory.
    Comments without an author are skipped.
    """
    # PRAW reports ``author`` as None for comments whose account was deleted;
    # accessing ``.name`` on those would raise AttributeError and abort the run.
    authors = [
        cmnt.author.name
        for cmnt in reddit.subreddit(sub).comments(limit=None)
        if cmnt.author is not None
    ]
    filename = 'authors' + '_mexico_' + str(time.time()) + '.txt'
    with open(filename, 'w', encoding='utf8') as f:
        f.writelines(a + '\n' for a in authors)
@luismond
luismond / download_vtt.py
Created March 10, 2019 00:59
Downloads .vtt subtitle file from youtube video
import youtube_dl
def download_vtt(url,lang):
ydl_opts = {
'quiet': True,
'subtitleslangs': [lang],
'writeautomaticsub': 'yes',
'skip_download': 'yes'
}
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
@luismond
luismond / faker_es.py
Created March 10, 2019 01:56
Faker test for Spanish locales
from faker import Faker
fake = Faker('es_MX')
for n in range(10):
print(fake.name())
'''
Humberto Menchaca Berríos
Lic. Irma Menchaca
Elisa Barrera
from faker import Faker
fake = Faker('es_MX')
for n in range(10):
print(fake.job())
'''
Geologist, wellsite
Sports development officer
Telecommunications researcher
from faker import Faker
from translate import Translator
fake = Faker('es_MX')
translator= Translator(to_lang="es")
for n in range(10):
print(translator.translate(fake.job()))
'''
@luismond
luismond / strip_punctuation.py
Created March 13, 2019 19:51
Strip punctuation function
def strip_punct(line):
line = str(line)
charset = set()
for ch in line:
charset.update(ch)
punct = [ch for ch in charset if not ch.isalpha()]
if ' ' in punct:
punct.remove(' ')
for ch in punct:
line = line.replace(ch, ' ').lower()
import stanfordnlp
# Directory passed to the pipeline as its models_dir (hard-coded, machine-specific Windows path).
MODELS_DIR = 'C:\\Users\\user\\stanfordnlp_resources\\'
# Module-level pipeline for lang='es', configured with the tokenize, pos and lemma processors.
nlp = stanfordnlp.Pipeline(processors='tokenize,pos,lemma', models_dir=MODELS_DIR, lang='es')
def get_lemmas(line):
    """Return the lemmas of selected words in *line* as one space-separated string.

    The text is run through the module-level ``nlp`` pipeline; only words
    whose ``pos`` is 'ADV', 'ADJ' or 'VERB' contribute their lemma, in the
    order they appear across sentences.
    """
    wanted_tags = ('ADV', 'ADJ', 'VERB')
    doc = nlp(line)
    lemmas = []
    for sentence in doc.sentences:
        for word in sentence.words:
            if word.pos in wanted_tags:
                lemmas.append(word.lemma)
    return ' '.join(lemmas)
#Get bilingual data from the European Commission translation memories
#https://ec.europa.eu/jrc/en/language-technologies/dgt-translation-memory#More%20details%20/%20Reference%20publication
#I needed to extract just EN-ES bilingual data from the tmx files for my machine translation experiment.
#Their Java TM exporter was not working on my side.
#I wrote this script to get the data
import xmltodict
import pandas as pd
import os
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.