This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <torch/extension.h> | |
#include <cutlass/gemm/gemm.h> | |
#include <cutlass/epilogue/thread/linear_combination.h> | |
torch::Tensor bmm_fp16_fp16_f32(torch::Tensor A, torch::Tensor B, float alpha) { | |
int batch_size = A.size(0); | |
int M = A.size(1); | |
int N = B.size(1); | |
int K = A.size(2); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include "cutlass/cutlass.h" | |
#include "cutlass/gemm/device/gemm.h" | |
#include "cutlass/util/host_tensor.h" | |
#include "cutlass/util/reference/host/tensor_compare.h" | |
#include "cutlass/util/reference/host/tensor_copy.h" | |
#include "cutlass/util/reference/host/tensor_fill.h" | |
#include "cutlass/util/tensor_view_io.h" | |
#include "helper.h" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
from inference import CaiInferenceConfig, convert_to_ds_model, recover_from_ds_model | |
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer | |
from argparse import ArgumentParser | |
import time | |
import torch | |
from torch.profiler import profile, record_function, ProfilerActivity | |
parser = ArgumentParser() | |
parser.add_argument("--name", default="bigscience/bloom-560m", type=str, help="model_name") |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
import time | |
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer | |
from argparse import ArgumentParser | |
from transformers import LlamaForCausalLM, LlamaTokenizer | |
from inference import CaiInferenceConfig, convert_to_ds_model, recover_from_ds_model | |
from torch.profiler import profile, record_function, ProfilerActivity | |
from types import MethodType | |
from typing import Optional, Sequence, Tuple, Union | |
import torch |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pytest | |
import torch | |
import triton | |
import triton.language as tl | |
@triton.jit
def max_fn(x, y):
    # Thin wrapper decorated with triton.jit: forwards both arguments to
    # tl.math.max and returns its result unchanged.
    return tl.math.max(x, y)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
import time | |
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer | |
from argparse import ArgumentParser | |
from transformers import LlamaForCausalLM, LlamaTokenizer | |
from inference import CaiInferenceConfig, convert_to_ds_model, recover_from_ds_model | |
parser = ArgumentParser() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import torch | |
import numpy as np | |
from deepspeed.ops.transformer.inference.triton.attention import compute_attention as deepspeed_compute_attention | |
from inference.ops.self_attention import self_attention_compute_using_triton | |
def run_func(func, qkv): | |
func(qkv, |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <cmath> | |
#include <thread> | |
#include <future> | |
#include <functional> | |
using namespace std; | |
int f(int x, int y){ | |
return std::pow(x, y); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <future> | |
#include <iostream> | |
#include <thread> | |
using namespace std; | |
int factorial(std::future<int>& f){ | |
int N = f.get(); | |
int res = 1; | |
for(int i=2;i<=N;i++){ |