This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import pysrt | |
def download_youtube_srt(link): | |
video_id = link[link.find('?v=') + 3:] | |
import subprocess | |
subprocess.call(['youtube-dl', | |
# for sub | |
'--sub-lang', 'en', |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob | |
from bs4 import BeautifulSoup | |
from tqdm import tqdm | |
import json | |
if __name__ == '__main__': | |
paths = glob.glob('./corpus-utf8/*.txt') | |
print('len(paths):', len(paths)) | |
pair_count = 0 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def load_glove(glove_path):
    """Load word vectors from a GloVe-style text file.

    Each non-empty line is read as a token followed by its
    whitespace-separated float components.

    Args:
        glove_path: Path to the embeddings text file.

    Returns:
        A dict mapping each word to a NumPy array of its components.

    Raises:
        ValueError: If a component cannot be parsed as a float.
    """
    word2vec = {}
    # Read as UTF-8 explicitly so non-ASCII tokens do not depend on the
    # platform's default locale encoding.
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            split_line = line.split()
            if not split_line:
                # Blank line (e.g. a stray trailing newline): nothing to parse.
                continue
            word, *values = split_line
            word2vec[word] = np.array([float(val) for val in values])
    print("Done.",len(word2vec)," words loaded!")
    return word2vec
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import time | |
import sys | |
import datetime | |
import random | |
import json | |
import pickle | |
import numpy as np | |
from flask import Flask, session, g, request, render_template, redirect | |
from flask_mongoengine import MongoEngine |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"settings": { | |
"index": { | |
"analysis": { | |
"analyzer": { | |
"korean": { | |
"type": "custom", | |
"tokenizer": "seunjeon_default_tokenizer", | |
"min_token_length": 2, | |
"decompound_mode": "mixed", | |
"filter": ['cobak_synonym_filter', 'token_length_filter'], |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Convolutional neural network (two convolutional layers) | |
class ConvNet(nn.Module): | |
def __init__(self, hidden_size=200): | |
super(ConvNet, self).__init__() | |
self.hidden_size = hidden_size | |
self.layer1 = nn.Sequential( | |
nn.Conv1d(34, 68, kernel_size=5, stride=1, padding=2), | |
nn.BatchNorm1d(68), | |
nn.CELU(), |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class BertNet(nn.Module): | |
def __init__(self, finetuning=False, num_classes=3, hidden_size=50): | |
super().__init__() | |
self.bert = BertModel.from_pretrained('bert-base-uncased') | |
self.bert_output_size = 768 | |
self.hidden_size = hidden_size | |
self.rnn = nn.LSTM(input_size=self.bert_output_size, hidden_size=self.hidden_size, batch_first=True, bidirectional=True) | |
self.fc = nn.Linear(self.hidden_size * 2, num_classes) | |
self.finetuning = finetuning |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def draw(epoch, input_x, input_y, predicts, input_c_pos, id2label, id2word): | |
sents_visual_file = './visualization/{}.html'.format(epoch) | |
batch_size = len(input_y) | |
with open(sents_visual_file, "w") as html_file: | |
html_file.write('<!DOCTYPE html><html lang="ko"><head><meta charset="UTF-8"/></head>') | |
for i in range(batch_size): | |
if input_y[i] == predicts[i]: continue |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pymongo | |
# Module-wide client, created on the first get_db() call and reused afterwards.
_connection = None


def get_db():
    """Return the 'dbname' database from a lazily created, shared client.

    The first call constructs a ``pymongo.MongoClient`` for
    ``mongodb://127.0.0.1`` and caches it in the module-level
    ``_connection``; later calls reuse that cached client.

    Returns:
        The result of indexing the cached client with ``'dbname'``.
    """
    global _connection
    # Compare against None rather than relying on truthiness: "not created
    # yet" is the only state that should trigger construction, and the check
    # must not depend on how the cached object evaluates in a boolean context.
    if _connection is None:
        _connection = pymongo.MongoClient('mongodb://127.0.0.1')
    return _connection['dbname']
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import utils | |
import nltk, datetime | |
from pprint import pprint | |
import spacy | |
nlp = spacy.load('en_core_web_lg') | |
Newer Older