This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright (c) 2019-present, Thomas Wolf.
# All rights reserved. This source code is licensed under the MIT-style license.
""" A very small and self-contained gist to train a GPT-2 transformer model on wikitext-103 """
# Standard library
import os
from collections import namedtuple

# Third-party: progress bars, PyTorch core, and the Ignite training-loop framework
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from ignite.engine import Engine, Events
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%% Saved with string encoding Unicode (UTF-8)
@inproceedings{Gkotsis2014Content,
title={It's all in the content: state of the art best answer prediction based on discretisation of shallow linguistic features},
author={Gkotsis, George and Stepanyan, Karen and Pedrinaci, Carlos and Domingue, John and Liakata, Maria},
booktitle={Proceedings of the 2014 ACM Conference on Web Science (WebSci)},
pages={202--210},
year={2014},
organization={ACM}
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Brian Abelson @brianabelson
# Harmony Institute
# December 5, 2012
# lda is a wrapper for lda.collapsed.gibbs.sampler in the "lda" package
# it fits topic models using latent dirichlet allocation
# it provides arguments for cleaning the input text and tuning the parameters of the model
# it also returns a lot of useful information about the topics/documents in a format that you can easily join back to your original data
# this allows you to easily model outcomes based on the distribution of topics within a collection of texts