- Engine: text-davinci-003 (also works with text-davinci-002, but that model may need more explicit instructions to return valid JSON)
- Temperature: 0.7

You must extract the following information from the phone conversation below:
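A minimal sketch of how these settings map onto an API call, using the legacy OpenAI completions endpoint. The transcript and field-list placeholders are illustrative; the actual list of fields to extract is elided in the source, so it stays a placeholder here too.

import openai

transcript = "..."  # the phone conversation to analyze (placeholder)
fields = "..."      # the fields to extract (elided in the source)

prompt = (
    "You must extract the following information from the phone "
    "conversation below:\n\n" + fields + "\n\n" + transcript
)

response = openai.Completion.create(
    engine="text-davinci-003",
    prompt=prompt,
    temperature=0.7,
    max_tokens=256,  # illustrative; not specified in the source
)
print(response["choices"][0]["text"])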
# coding=utf-8
# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import torch

def AttentionMask(encoder_len, state_len, decoder_len, offset=0, near_decay=0, far_decay=0, device='cpu'):
    # Summing shifted lower-triangular matrices makes m[i, j] count the
    # distance from decoder position i back to each visible position j.
    m = -offset * np.tri(decoder_len, encoder_len + decoder_len + state_len, encoder_len)
    for i in range(encoder_len + decoder_len - 1):
        m += np.tri(decoder_len, encoder_len + decoder_len + state_len, encoder_len - i - 1)
    if state_len:
        # State tokens attend everywhere with no distance penalty.
        ms = np.zeros((state_len, encoder_len + decoder_len + state_len))
        m = np.concatenate([m, ms], axis=0)
    m = torch.tensor(m, dtype=torch.float32, device=device)
    # mx marks the strictly-future positions a causal decoder must not see.
    mx = 1 - np.tri(decoder_len, encoder_len + decoder_len, encoder_len)
    mx = np.concatenate([mx, np.zeros((decoder_len, state_len))], axis=1)
    # The source fragment ends here; returning both masks is an assumption,
    # and near_decay/far_decay are unused in this excerpt.
    mx = torch.tensor(mx, dtype=torch.float32, device=device)
    return m, mx
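# Example usage (assumes the reconstructed return above): masks for a
# 4-token encoder, 2 state slots, and a 6-token decoder.
m, mx = AttentionMask(encoder_len=4, state_len=2, decoder_len=6)
print(m.shape)   # torch.Size([8, 12]) -> (decoder+state, encoder+decoder+state)
print(mx.shape)  # torch.Size([6, 12]) -> rows for decoder positions only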
import torch
import torch.nn as nn

class FIR(nn.Module):
    def __init__(self, in_dim, out_dim=None, hidden_dim=None, segment_sizes=(1, 2, 4, 8), activation=nn.functional.gelu, device='cpu'):
        super().__init__()
        # Default the output and hidden widths to the input width.
        # (A tuple default for segment_sizes avoids the shared-mutable-default pitfall.)
        if not out_dim: out_dim = in_dim
        if not hidden_dim: hidden_dim = in_dim
        # cursor/nodes track segment boundary offsets derived from segment_sizes.
        cursor = 1
        nodes = [cursor]
import numpy as np
import jax.numpy as jnp

def apply_reshard(pytree_params_in, pytree_params_out, shards_in, shards_out):
    def override_dtype(x):
        # Checkpoints store bfloat16 as 2-byte raw ("V2") fields; relabel the
        # buffer as bfloat16 in place (same itemsize, so no copy is made).
        if x.dtype == np.dtype('V2'):
            x.dtype = jnp.bfloat16
        return x

    def is_leaf(x):
        # Treat plain numpy arrays as pytree leaves.
        return type(x) == np.ndarray
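# The same relabeling trick standalone (assumption: jnp.bfloat16 has a
# 2-byte itemsize matching 'V2', which is what lets numpy retag the buffer):
raw = np.zeros(4, dtype='V2')
raw.dtype = jnp.bfloat16
print(raw.dtype)  # bfloat16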
import torch.nn as nn

class CrossAttentionModConv2d(nn.Module):
    def __init__(self, state, ch, d_context, ch_q=None, d_v=None, n_head=1):
        super().__init__()
        # Channels must split evenly across attention heads.
        assert ch % n_head == 0
        self.state = state
        self.n_head = n_head
        self.ch = ch
        self.d_context = d_context
        # Default the query width to the channel count and the value width
        # to the context dimensionality when not given explicitly.
        self.ch_q = ch_q or self.ch
        self.d_v = d_v or self.d_context
# So now you want to finetune that GPT-J-6B on a 3090/TITAN GPU ... okay.
# More exploratory coding. It uses the HuggingFace model port and DeepSpeed, and reads all text/md files from a target directory.
# It is a fragment of a larger system with remote editing, but that's another story.
# This is the raw training tester. Things to look out for:
# - uses DeepSpeed and has a DS config
# - uses SGD instead of Adam to save space
# - uses gradient checkpointing
# - freezes 25% of the layers to fit (see the sketch below this comment block)
# Assumes you can already run https://gist.github.com/kinoc/2d636a68876cd3de7b6e9c9452b61089
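# A minimal sketch (assumed, not the gist's exact code) of the memory tricks
# the comments above describe: freeze the first 25% of the transformer
# blocks, enable gradient checkpointing, and use SGD instead of Adam (SGD
# keeps no per-parameter moment buffers, so optimizer state shrinks a lot).
import torch
from transformers import GPTJForCausalLM

model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", torch_dtype=torch.float16)
model.gradient_checkpointing_enable()  # recompute activations to save memory

blocks = model.transformer.h
for block in blocks[: len(blocks) // 4]:  # freeze the first 25% of layers
    for p in block.parameters():
        p.requires_grad = False

optimizer = torch.optim.SGD(
    (p for p in model.parameters() if p.requires_grad), lr=1e-5, momentum=0.9)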
# So you want to run GPT-J-6B using HuggingFace+FastAPI on a local rig (3090 or TITAN) ... tricky.
# Special help from the Kolob Colab server: https://colab.research.google.com/drive/1VFh5DOkCJjWIrQ6eB82lxGKKPgXmsO5D?usp=sharing#scrollTo=iCHgJvfL4alW
# Conversion to HF format (12.6 GB tar image) found at https://drive.google.com/u/0/uc?id=1NXP75l1Xa5s9K18yf3qLoZcR6p4Wced1&export=download
# Uses gdown to fetch the image.
# You will need 26 GB of space: 12+ GB for the tar and 12+ GB expanded (you can nuke the tar after expansion).
# Near-simplest language model API, with room to expand!
# Runs GPT-J-6B on a 3090 or TITAN and serves it using FastAPI (sketched below).
# Change "seq" (the context size) to adjust the memory footprint.
"""Complex momentum SGD and Adam. See https://arxiv.org/abs/2102.08431.""" | |
import math | |
import torch | |
from torch import optim | |
class ComplexSGD(optim.Optimizer): | |
def __init__(self, params, lr=1e-2, momentum=0.9, angle=math.pi / 8, weight_decay=0.): |
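    # A hedged sketch of the complex-momentum update the class name implies
    # (after https://arxiv.org/abs/2102.08431): keep a complex momentum
    # buffer, rotate it by momentum * e^{i*angle} each step, and apply only
    # its real part to the parameters. Illustrative, not necessarily the
    # author's exact implementation.
    @torch.no_grad()
    def step(self, closure=None):
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()
        for group in self.param_groups:
            rot = group['momentum'] * complex(math.cos(group['angle']), math.sin(group['angle']))
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad
                if group['weight_decay'] != 0:
                    grad = grad.add(p, alpha=group['weight_decay'])
                state = self.state[p]
                if 'momentum_buffer' not in state:
                    state['momentum_buffer'] = torch.zeros_like(p, dtype=torch.complex64)
                buf = state['momentum_buffer']
                buf.mul_(rot).add_(grad)              # complex momentum accumulation
                p.add_(buf.real, alpha=-group['lr'])  # descend along the real part
        return loss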