Skip to content

Instantly share code, notes, and snippets.

View tiandiao123's full-sized avatar

Cuiqing Li (李崔卿) tiandiao123

  • Shanghai, China
View GitHub Profile
#include <torch/extension.h>
#include <cutlass/gemm/gemm.h>
#include <cutlass/epilogue/thread/linear_combination.h>
torch::Tensor bmm_fp16_fp16_f32(torch::Tensor A, torch::Tensor B, float alpha) {
int batch_size = A.size(0);
int M = A.size(1);
int N = B.size(1);
int K = A.size(2);
#include <iostream>
#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/tensor_view_io.h"
#include "helper.h"
import torch
from inference import CaiInferenceConfig, convert_to_ds_model, recover_from_ds_model
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from argparse import ArgumentParser
import time
import torch
from torch.profiler import profile, record_function, ProfilerActivity
# Command-line interface: lets the caller pick which HuggingFace model to load.
parser = ArgumentParser()
parser.add_argument(
    "--name",
    type=str,
    default="bigscience/bloom-560m",
    help="model_name",
)
import torch
import time
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from argparse import ArgumentParser
from transformers import LlamaForCausalLM, LlamaTokenizer
from inference import CaiInferenceConfig, convert_to_ds_model, recover_from_ds_model
from torch.profiler import profile, record_function, ProfilerActivity
from types import MethodType
from typing import Optional, Sequence, Tuple, Union
import torch
import pytest
import torch
import triton
import triton.language as tl
@triton.jit
def max_fn(x, y):
    # Elementwise maximum of the two operands; a combine function
    # usable by Triton reductions/scans inside a jitted kernel.
    larger = tl.math.max(x, y)
    return larger
import torch
import time
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from argparse import ArgumentParser
from transformers import LlamaForCausalLM, LlamaTokenizer
from inference import CaiInferenceConfig, convert_to_ds_model, recover_from_ds_model
parser = ArgumentParser()
import os
import torch
import numpy as np
from deepspeed.ops.transformer.inference.triton.attention import compute_attention as deepspeed_compute_attention
from inference.ops.self_attention import self_attention_compute_using_triton
def run_func(func, qkv):
func(qkv,
#include <iostream>
#include <cmath>
#include <thread>
#include <future>
#include <functional>
using namespace std;
int f(int x, int y){
return std::pow(x, y);
#include <future>
#include <iostream>
#include <thread>
using namespace std;
int factorial(std::shared_future<int> f){
int N = f.get();
int res = 1;
for(int i=2;i<=N;i++){
#include <future>
#include <iostream>
#include <thread>
using namespace std;
int factorial(std::future<int>& f){
int N = f.get();
int res = 1;
for(int i=2;i<=N;i++){