This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | import random | |
| import matplotlib.pyplot as plt | |
| from IPython import display | |
| """ | |
| IPython Display rc0 | |
| Try: | |
| dsp = IDisplay() | 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | """ | |
| Sample usage: | |
| cli = pymongo.MongoClient() | |
| col = cli['wiki_answers']['gold'] | |
| itr = WikianswersIterator(col, cache_size=2048) | |
| for minibatch in itr: | |
| process(minibatch) | |
| """ | |
| import random | 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | """ | |
| Sample usage: | |
| itr = WikianswersIterator(col='gold', db='wiki_answers', cache_size=2048) | |
| for minibatch in itr: | |
| process(minibatch) | |
| """ | |
| import random | |
| import pymongo | |
| import numpy as np | 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | import json | |
| import logging | |
| import requests | |
| import telegram | |
| import coinmarketcap | |
| from telegram.ext import Updater, CommandHandler, MessageHandler, Filters | |
| #from config import TOKEN, LOG_FILE | |
| TOKEN = "" | 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | import os | |
| import sys | |
| import json | |
| import nltk | |
| import random | |
| import logging | |
| import tensorflow as tf | |
| import sentencepiece as spm | |
| from glob import glob | 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | AVAILABLE = {'af','ar','bg','bn','br','bs','ca','cs', | |
| 'da','de','el','en','eo','es','et','eu', | |
| 'fa','fi','fr','gl','he','hi','hr','hu', | |
| 'hy','id','is','it','ja','ka','kk','ko', | |
| 'lt','lv','mk','ml','ms','nl','no','pl', | |
| 'pt','pt_br','ro','ru','si','sk','sl','sq', | |
| 'sr','sv','ta','te','th','tl','tr','uk', | |
| 'ur','vi','ze_en','ze_zh','zh','zh_cn', | |
| 'zh_en','zh_tw','zh_zh'} | 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | DEMO_MODE = True #@param {type:"boolean"} | |
| if DEMO_MODE: | |
| CORPUS_SIZE = 1000000 | |
| else: | |
| CORPUS_SIZE = 100000000 #@param {type: "integer"} | |
| !(head -n $CORPUS_SIZE dataset.txt) > subdataset.txt | |
| !mv subdataset.txt dataset.txt | 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | DEMO_MODE = True #@param {type:"boolean"} | |
| if DEMO_MODE: | |
| CORPUS_SIZE = 1000000 | |
| else: | |
| CORPUS_SIZE = 100000000 #@param {type: "integer"} | |
| !(head -n $CORPUS_SIZE dataset.txt) > subdataset.txt | |
| !mv subdataset.txt dataset.txt | 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | regex_tokenizer = nltk.RegexpTokenizer("\w+") | |
| def normalize_text(text): | |
| # lowercase text | |
| text = str(text).lower() | |
| # remove non-UTF | |
| text = text.encode("utf-8", "ignore").decode() | |
| # remove punktuation symbols | |
| text = " ".join(regex_tokenizer.tokenize(text)) | |
| return text | 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | RAW_DATA_FPATH = "dataset.txt" #@param {type: "string"} | |
| PRC_DATA_FPATH = "proc_dataset.txt" #@param {type: "string"} | |
| # apply normalization to the dataset | |
| # this will take a minute or two | |
| total_lines = count_lines(RAW_DATA_FPATH) | |
| bar = Progbar(total_lines) | |
| with open(RAW_DATA_FPATH,encoding="utf-8") as fi: | 
OlderNewer