Denis gaphex
Moscow
gaphex / idisplay.py
Last active October 17, 2016 16:28
import random
import matplotlib.pyplot as plt
from IPython import display
"""
IPython Display rc0
Try:
dsp = IDisplay()
"""
"""
Sample usage:

    cli = pymongo.MongoClient()
    col = cli['wiki_answers']['gold']
    itr = WikianswersIterator(col, cache_size=2048)
    for minibatch in itr:
        process(minibatch)
"""
import random
"""
Sample usage:
itr = WikianswersIterator(col='gold', db='wiki_answers', cache_size=2048)
for minibatch in itr:
process(minibatch)
"""
import random
import pymongo
import numpy as np
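The class body itself is truncated in the preview. A minimal sketch consistent with the second docstring above, assuming minibatches are drawn from a shuffled in-memory cache (the batch_size parameter, its default, and the use of the default MongoDB host are assumptions):

class WikianswersIterator:
    """Yields shuffled minibatches from a MongoDB collection (sketch)."""

    def __init__(self, col='gold', db='wiki_answers',
                 cache_size=2048, batch_size=256):
        self._col = pymongo.MongoClient()[db][col]
        self.cache_size = cache_size
        self.batch_size = batch_size

    def __iter__(self):
        cache = []
        for doc in self._col.find():
            cache.append(doc)
            if len(cache) >= self.cache_size:
                # shuffle the cache, then emit it as fixed-size minibatches
                random.shuffle(cache)
                while len(cache) >= self.batch_size:
                    yield cache[:self.batch_size]
                    cache = cache[self.batch_size:]
        if cache:
            yield cache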
import json
import logging
import requests
import telegram
import coinmarketcap
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters
#from config import TOKEN, LOG_FILE
TOKEN = ""
gaphex / bert_environment.py
Last active May 9, 2019 17:55
Setting up a BERT learning environment
import os
import sys
import json
import nltk
import random
import logging
import tensorflow as tf
import sentencepiece as spm
from glob import glob
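Only the imports survive in this preview. A plausible continuation, assuming the usual Colab-style setup of logging and reproducible seeds (the seed value and log format are assumptions; tf.set_random_seed is the TF 1.x API, which matches the gist's date):

# configure logging and fix random seeds for reproducibility
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger(__name__)

random.seed(42)
tf.set_random_seed(42)  # tf.random.set_seed in TF 2.x

log.info("TensorFlow version: %s", tf.__version__)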
gaphex / download_training_data.py
Created May 9, 2019 14:52
Downloading the OPUS dataset
AVAILABLE = {'af','ar','bg','bn','br','bs','ca','cs',
'da','de','el','en','eo','es','et','eu',
'fa','fi','fr','gl','he','hi','hr','hu',
'hy','id','is','it','ja','ka','kk','ko',
'lt','lv','mk','ml','ms','nl','no','pl',
'pt','pt_br','ro','ru','si','sk','sl','sq',
'sr','sv','ta','te','th','tl','tr','uk',
'ur','vi','ze_en','ze_zh','zh','zh_cn',
'zh_en','zh_tw','zh_zh'}
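The download step itself is cut off after the language-code set. A sketch of fetching one language's corpus with requests, assuming a hypothetical OPUS OpenSubtitles URL pattern (the real endpoint and archive layout may differ; check the OPUS site):

import requests

LANGUAGE = 'en'  # must be one of the codes in AVAILABLE
assert LANGUAGE in AVAILABLE

# hypothetical URL pattern; verify against the OPUS download page
url = ("http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/"
       "mono/OpenSubtitles.raw.{}.gz".format(LANGUAGE))

response = requests.get(url, stream=True)
with open("dataset.txt.gz", "wb") as f:
    for chunk in response.iter_content(chunk_size=1 << 20):
        f.write(chunk)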
gaphex / truncate_dataset.py
Created May 9, 2019 14:54
Truncating the OPUS dataset
DEMO_MODE = True #@param {type:"boolean"}
if DEMO_MODE:
    CORPUS_SIZE = 1000000
else:
    CORPUS_SIZE = 100000000 #@param {type: "integer"}

!(head -n $CORPUS_SIZE dataset.txt) > subdataset.txt
!mv subdataset.txt dataset.txt
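The ! shell magics above only work inside a notebook; an equivalent truncation in plain Python, reusing the CORPUS_SIZE set above:

import itertools
import os

with open("dataset.txt", encoding="utf-8") as src, \
     open("subdataset.txt", "w", encoding="utf-8") as dst:
    # copy only the first CORPUS_SIZE lines
    dst.writelines(itertools.islice(src, CORPUS_SIZE))
os.replace("subdataset.txt", "dataset.txt")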
regex_tokenizer = nltk.RegexpTokenizer(r"\w+")

def normalize_text(text):
    # lowercase text
    text = str(text).lower()
    # remove non-UTF characters
    text = text.encode("utf-8", "ignore").decode()
    # remove punctuation symbols
    text = " ".join(regex_tokenizer.tokenize(text))
    return text
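For example, the function strips case and punctuation in one pass:

>>> normalize_text("Hello, World... it's 2019!")
'hello world it s 2019'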
RAW_DATA_FPATH = "dataset.txt" #@param {type: "string"}
PRC_DATA_FPATH = "proc_dataset.txt" #@param {type: "string"}

# apply normalization to the dataset
# this will take a minute or two
# count_lines and Progbar (tf.keras.utils.Progbar) are assumed to be
# defined/imported earlier in the notebook
total_lines = count_lines(RAW_DATA_FPATH)
bar = Progbar(total_lines)

with open(RAW_DATA_FPATH, encoding="utf-8") as fi:
    # the preview cuts off here; a plausible completion that writes
    # the normalized copy line by line
    with open(PRC_DATA_FPATH, "w", encoding="utf-8") as fo:
        for line in fi:
            fo.write(normalize_text(line) + "\n")
            bar.add(1)