Denis gaphex
Moscow
gaphex / idisplay.py
Last active October 17, 2016 16:28
import random
import matplotlib.pyplot as plt
from IPython import display
"""
IPython Display rc0
Try:
dsp = IDisplay()
"""
"""
Sample usage:

    cli = pymongo.MongoClient()
    col = cli['wiki_answers']['gold']
    itr = WikianswersIterator(col, cache_size=2048)
    for minibatch in itr:
        process(minibatch)
"""
import random
"""
Sample usage:
itr = WikianswersIterator(col='gold', db='wiki_answers', cache_size=2048)
for minibatch in itr:
process(minibatch)
"""
import random
import pymongo
import numpy as np
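The class body itself is truncated in the preview. A minimal sketch consistent with the second docstring above, assuming minibatches are drawn from a shuffled in-memory cache (the batch_size parameter, its default, and the use of the default MongoDB host are assumptions):

class WikianswersIterator:
    """Yields shuffled minibatches from a MongoDB collection (sketch)."""

    def __init__(self, col='gold', db='wiki_answers',
                 cache_size=2048, batch_size=256):
        self._col = pymongo.MongoClient()[db][col]
        self.cache_size = cache_size
        self.batch_size = batch_size

    def __iter__(self):
        cache = []
        for doc in self._col.find():
            cache.append(doc)
            if len(cache) >= self.cache_size:
                # shuffle the cache, then emit it as fixed-size minibatches
                random.shuffle(cache)
                while len(cache) >= self.batch_size:
                    yield cache[:self.batch_size]
                    cache = cache[self.batch_size:]
        if cache:
            yield cache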
import json
import logging
import requests
import telegram
import coinmarketcap
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters
#from config import TOKEN, LOG_FILE
TOKEN = ""
gaphex / bert_environment.py
Last active May 9, 2019 17:55
Setting up a BERT learning environment
import os
import sys
import json
import nltk
import random
import logging
import tensorflow as tf
import sentencepiece as spm
from glob import glob
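Only the imports survive in this preview. A plausible continuation, assuming the usual Colab-style setup of logging and reproducible seeds (the seed value and log format are assumptions; tf.set_random_seed is the TF 1.x API, which matches the gist's date):

# configure logging and fix random seeds for reproducibility
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger(__name__)

random.seed(42)
tf.set_random_seed(42)  # tf.random.set_seed in TF 2.x

log.info("TensorFlow version: %s", tf.__version__)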
gaphex / download_training_data.py
Created May 9, 2019 14:52
Downloading the OPUS dataset
AVAILABLE = {'af','ar','bg','bn','br','bs','ca','cs',
'da','de','el','en','eo','es','et','eu',
'fa','fi','fr','gl','he','hi','hr','hu',
'hy','id','is','it','ja','ka','kk','ko',
'lt','lv','mk','ml','ms','nl','no','pl',
'pt','pt_br','ro','ru','si','sk','sl','sq',
'sr','sv','ta','te','th','tl','tr','uk',
'ur','vi','ze_en','ze_zh','zh','zh_cn',
'zh_en','zh_tw','zh_zh'}
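The download step itself is cut off after the language-code set. A sketch of fetching one language's corpus with requests, assuming a hypothetical OPUS OpenSubtitles URL pattern (the real endpoint and archive layout may differ; check the OPUS site):

import requests

LANGUAGE = 'en'  # must be one of the codes in AVAILABLE
assert LANGUAGE in AVAILABLE

# hypothetical URL pattern; verify against the OPUS download page
url = ("http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/"
       "mono/OpenSubtitles.raw.{}.gz".format(LANGUAGE))

response = requests.get(url, stream=True)
with open("dataset.txt.gz", "wb") as f:
    for chunk in response.iter_content(chunk_size=1 << 20):
        f.write(chunk)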
gaphex / truncate_dataset.py
Created May 9, 2019 14:54
Truncating the OPUS dataset
DEMO_MODE = True #@param {type:"boolean"}
if DEMO_MODE:
    CORPUS_SIZE = 1000000
else:
    CORPUS_SIZE = 100000000 #@param {type: "integer"}

!(head -n $CORPUS_SIZE dataset.txt) > subdataset.txt
!mv subdataset.txt dataset.txt
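The ! shell magics above only work inside a notebook; an equivalent truncation in plain Python, reusing the CORPUS_SIZE set above:

import itertools
import os

with open("dataset.txt", encoding="utf-8") as src, \
     open("subdataset.txt", "w", encoding="utf-8") as dst:
    # copy only the first CORPUS_SIZE lines
    dst.writelines(itertools.islice(src, CORPUS_SIZE))
os.replace("subdataset.txt", "dataset.txt")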
regex_tokenizer = nltk.RegexpTokenizer(r"\w+")

def normalize_text(text):
    # lowercase text
    text = str(text).lower()
    # remove non-UTF characters
    text = text.encode("utf-8", "ignore").decode()
    # remove punctuation symbols
    text = " ".join(regex_tokenizer.tokenize(text))
    return text
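For example, the function strips case and punctuation in one pass:

>>> normalize_text("Hello, World... it's 2019!")
'hello world it s 2019'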
RAW_DATA_FPATH = "dataset.txt" #@param {type: "string"}
PRC_DATA_FPATH = "proc_dataset.txt" #@param {type: "string"}

# apply normalization to the dataset
# this will take a minute or two
# count_lines and Progbar (tf.keras.utils.Progbar) are assumed to be
# defined/imported earlier in the notebook
total_lines = count_lines(RAW_DATA_FPATH)
bar = Progbar(total_lines)

with open(RAW_DATA_FPATH, encoding="utf-8") as fi:
    # the preview cuts off here; a plausible completion that writes
    # the normalized copy line by line
    with open(PRC_DATA_FPATH, "w", encoding="utf-8") as fo:
        for line in fi:
            fo.write(normalize_text(line) + "\n")
            bar.add(1)