Shortlink: goo.gl/wSuuS9
The GitHub repository will soon be available at github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/wikisum
# train_grpo.py
#
# See https://github.com/willccbb/verifiers for ongoing developments
#
import re

import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig
from trl import GRPOConfig, GRPOTrainer

# helpers
def make_unit_length(x, epsilon=1e-6):
    """L2-normalize a tensor along its last dimension; epsilon guards against division by zero."""
    norm = x.norm(p=2, dim=-1, keepdim=True)
    return x.div(norm + epsilon)
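
# A minimal sketch of how these imports are typically wired together with
# TRL's GRPOTrainer; the model name, dataset, and reward function below are
# illustrative assumptions, not part of the original script.
def reward_len(completions, **kwargs):
    # toy reward: prefer completions near 50 characters (hypothetical)
    return [-abs(50 - len(c)) for c in completions]

trainer = GRPOTrainer(
    model="Qwen/Qwen2-0.5B-Instruct",            # placeholder model (assumption)
    reward_funcs=reward_len,
    args=GRPOConfig(output_dir="grpo-out"),
    train_dataset=load_dataset("trl-lib/tldr", split="train"),
    peft_config=LoraConfig(task_type="CAUSAL_LM"),
)
trainer.train()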
import random

class TicTacToe:
    def __init__(self, playerX, playerO):
        self.board = [' '] * 9  # positions 0-8, row-major
        self.playerX, self.playerO = playerX, playerO
        self.playerX_turn = random.choice([True, False])  # random side starts

    def play_game(self):
        # minimal completion: alternate turns until the board fills
        # (win detection omitted; players assumed to expose move(board) -> empty index)
        while ' ' in self.board:
            player, mark = (self.playerX, 'X') if self.playerX_turn else (self.playerO, 'O')
            self.board[player.move(self.board)] = mark
            self.playerX_turn = not self.playerX_turn
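
# Usage sketch. This RandomPlayer is illustrative, not from the source; it
# only satisfies the move(board) interface the class above assumes.
class RandomPlayer:
    def move(self, board):
        # pick a random empty square
        return random.choice([i for i, s in enumerate(board) if s == ' '])

if __name__ == '__main__':
    TicTacToe(RandomPlayer(), RandomPlayer()).play_game()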
""" Trains an agent with (stochastic) Policy Gradients on Pong. Uses OpenAI Gym. """ | |
import numpy as np | |
import cPickle as pickle | |
import gym | |
# hyperparameters | |
H = 200 # number of hidden layer neurons | |
batch_size = 10 # every how many episodes to do a param update? | |
learning_rate = 1e-4 | |
gamma = 0.99 # discount factor for reward |
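
# A sketch of how the discount factor gamma is applied in this style of
# policy-gradient code (mirrors the well-known discount_rewards helper;
# the reset at nonzero rewards reflects Pong's per-point game boundaries).
def discount_rewards(r):
    """Take a 1D float array of rewards and compute the discounted return."""
    discounted = np.zeros_like(r)
    running_add = 0.0
    for t in reversed(range(len(r))):
        if r[t] != 0:
            running_add = 0.0  # reset the sum at each game boundary
        running_add = running_add * gamma + r[t]
        discounted[t] = running_add
    return discounted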
#!/usr/bin/env python
# coding: utf-8
"""Sampling sequence data from a trained model."""
import numpy as np
import tensorflow as tf
import json
import pickle  # cPickle was Python 2 only
import itertools as it
from rnnlib import PTBModel
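
# A minimal sketch of the sampling step such a script typically performs:
# draw the next token id from the model's output distribution. The
# probability array and temperature handling here are assumptions, not
# taken from rnnlib's actual PTBModel API.
def sample_next(probs, temperature=1.0):
    """Sample a token id from a 1D array of output probabilities."""
    logits = np.log(np.asarray(probs, dtype=np.float64) + 1e-10) / temperature
    p = np.exp(logits - logits.max())
    p /= p.sum()
    return int(np.random.choice(len(p), p=p))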
# model dimensions pulled from the igor config object
# (theano.config.floatX below assumes `import theano` at module level)
B = self.igor.batch_size
R = self.igor.rnn_size
S = self.igor.max_sequence_len
V = self.igor.vocab_size
E = self.igor.embedding_size
# embedding matrix loaded from GloVe, cast to the Theano float type
emb_W = self.igor.embeddings.astype(theano.config.floatX)
# dropout parameters
p_emb = self.igor.p_emb_dropout
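
# A sketch of what the GloVe embedding lookup plus embedding dropout amounts
# to, in plain NumPy. Illustrative only: the real model builds these as
# network layers, which are not shown in the fragment above.
def embed_with_dropout(token_ids, emb_W, p_emb, rng=np.random):
    vectors = emb_W[token_ids]                        # (S, E) lookup
    mask = rng.binomial(1, 1.0 - p_emb, vectors.shape)
    return vectors * mask / (1.0 - p_emb)             # inverted dropout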