This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random | |
import matplotlib.pyplot as plt | |
from IPython import display | |
""" | |
IPython Display rc0 | |
Try: | |
dsp = IDisplay() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Sample usage: | |
cli = pymongo.MongoClient() | |
col = cli['wiki_answers']['gold'] | |
itr = WikianswersIterator(col, cache_size=2048) | |
for minibatch in itr: | |
process(minibatch) | |
""" | |
import random |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Sample usage: | |
itr = WikianswersIterator(col='gold', db='wiki_answers', cache_size=2048) | |
for minibatch in itr: | |
process(minibatch) | |
""" | |
import random | |
import pymongo | |
import numpy as np |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import logging | |
import requests | |
import telegram | |
import coinmarketcap | |
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters | |
#from config import TOKEN, LOG_FILE | |
TOKEN = "" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import json | |
import nltk | |
import random | |
import logging | |
import tensorflow as tf | |
import sentencepiece as spm | |
from glob import glob |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Language-code tags accepted downstream (65 entries).
# NOTE(review): mostly ISO-639-1 codes, but entries such as 'ze_en'/'ze_zh'
# and region variants like 'pt_br'/'zh_tw' are dataset-specific tags — confirm
# against the corpus this was written for before extending the set.
AVAILABLE = set(
    "af ar bg bn br bs ca cs da de el en eo es et eu "
    "fa fi fr gl he hi hr hu hy id is it ja ka kk ko "
    "lt lv mk ml ms nl no pl pt pt_br ro ru si sk sl sq "
    "sr sv ta te th tl tr uk ur vi ze_en ze_zh zh zh_cn "
    "zh_en zh_tw zh_zh".split()
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Colab "form" cell: DEMO_MODE caps the working corpus at 1M lines,
# otherwise 100M lines are kept. The #@param annotations are Colab form
# metadata, not executable code.
# NOTE(review): the trailing " | |" on these lines is scrape residue from the
# page this was copied from, not part of the original source.
DEMO_MODE = True #@param {type:"boolean"} | |
if DEMO_MODE: | |
CORPUS_SIZE = 1000000 | |
else: | |
CORPUS_SIZE = 100000000 #@param {type: "integer"} | |
# IPython shell escapes (only valid in a notebook): write the first
# CORPUS_SIZE lines of dataset.txt to a temp file, then replace the original.
!(head -n $CORPUS_SIZE dataset.txt) > subdataset.txt | |
!mv subdataset.txt dataset.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Colab "form" cell: DEMO_MODE caps the working corpus at 1M lines,
# otherwise 100M lines are kept. The #@param annotations are Colab form
# metadata, not executable code.
# NOTE(review): the trailing " | |" on these lines is scrape residue from the
# page this was copied from, not part of the original source.
DEMO_MODE = True #@param {type:"boolean"} | |
if DEMO_MODE: | |
CORPUS_SIZE = 1000000 | |
else: | |
CORPUS_SIZE = 100000000 #@param {type: "integer"} | |
# IPython shell escapes (only valid in a notebook): write the first
# CORPUS_SIZE lines of dataset.txt to a temp file, then replace the original.
!(head -n $CORPUS_SIZE dataset.txt) > subdataset.txt | |
!mv subdataset.txt dataset.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Tokenizer that extracts runs of word characters (letters, digits, underscore).
# Fix: use a raw string — "\w+" contains an unrecognized escape sequence, which
# raises a DeprecationWarning (and is a SyntaxWarning/error in newer Pythons);
# the runtime pattern value is unchanged.
regex_tokenizer = nltk.RegexpTokenizer(r"\w+")
def normalize_text(text):
    """Normalize *text* for downstream tokenization.

    Lowercases the input (coercing non-str values via ``str``), round-trips it
    through UTF-8, and keeps only ``\\w+`` word tokens joined by single spaces,
    which strips punctuation and collapses all whitespace.
    """
    # Stdlib replacement for nltk.RegexpTokenizer(r"\w+").tokenize(), which is
    # a thin wrapper around exactly this re.findall call — behavior unchanged,
    # but the function no longer needs the third-party module-level tokenizer.
    import re

    # lowercase text (str() also makes non-string input safe)
    text = str(text).lower()
    # NOTE(review): encoding str -> UTF-8 never fails, so "ignore" drops
    # nothing here; this round-trip is effectively a no-op. If the original
    # intent was to strip non-ASCII, that is not what this does — confirm.
    text = text.encode("utf-8", "ignore").decode()
    # remove punctuation symbols: keep word-character runs, space-joined
    text = " ".join(re.findall(r"\w+", text))
    return text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Colab form-cell constants: input corpus path and output path for the
# normalized copy. The #@param annotations are Colab form metadata.
# NOTE(review): trailing " | |" on these lines is scrape residue, not code.
RAW_DATA_FPATH = "dataset.txt" #@param {type: "string"} | |
PRC_DATA_FPATH = "proc_dataset.txt" #@param {type: "string"} | |
# apply normalization to the dataset | |
# this will take a minute or two | |
# count_lines and Progbar are defined elsewhere in the notebook — presumably a
# line counter and a Keras-style progress bar sized to the file; verify there.
total_lines = count_lines(RAW_DATA_FPATH) | |
bar = Progbar(total_lines) | |
with open(RAW_DATA_FPATH,encoding="utf-8") as fi: |
OlderNewer