Lianmin Zheng (merrymercy)
"""
Copyright 2023-2024 SGLang Team
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

model: InternLM-7B

python3 benchmark_serving.py --backend openai --host 127.0.0.1 --port 30000 --dataset ShareGPT_V3_unfiltered_cleaned_split.json --model internlm/internlm2-chat-7b --tokenizer internlm/internlm2-chat-7b --num-prompts 3000 --trust-remote-code

H100 w/o streaming

SGLang

Successful requests: 3000
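The benchmark command above assumes an SGLang server with an OpenAI-compatible endpoint is already listening on 127.0.0.1:30000. A typical way to start one (an assumption, not part of the original notes; flags may differ across SGLang versions):

python3 -m sglang.launch_server --model-path internlm/internlm2-chat-7b --port 30000 --trust-remote-code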
# mpirun -np 2 python p2p-nonblocking.py
import cupy as cp
import cupy.cuda.nccl as nccl
from mpi4py import MPI
import time
import os
import jax.numpy as jnp

nbytes = 1024 * 1024 * 32
data_type = cp.float32
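The gist preview cuts off after the imports. Below is a minimal sketch of how such a point-to-point benchmark might continue, using only CuPy's NCCL bindings and mpi4py; the rank bootstrap, warmup, iteration count, and timing loop are assumptions, not the original p2p-nonblocking.py, and the imports are repeated so the sketch is self-contained.

import cupy as cp
import cupy.cuda.nccl as nccl
from mpi4py import MPI
import time

nbytes = 1024 * 1024 * 32
data_type = cp.float32

mpi_comm = MPI.COMM_WORLD
rank = mpi_comm.Get_rank()
world_size = mpi_comm.Get_size()
assert world_size == 2, "run with: mpirun -np 2 python p2p-nonblocking.py"

# One GPU per MPI rank.
cp.cuda.Device(rank).use()

# Rank 0 creates the NCCL unique id and shares it over MPI.
uid = nccl.get_unique_id() if rank == 0 else None
uid = mpi_comm.bcast(uid, root=0)
comm = nccl.NcclCommunicator(world_size, uid, rank)

count = nbytes // cp.dtype(data_type).itemsize
buf = cp.arange(count, dtype=data_type) if rank == 0 else cp.empty(count, dtype=data_type)
stream = cp.cuda.Stream.null

def p2p_once():
    # NCCL send/recv are enqueued on the stream, so the host side does not block
    # until the explicit synchronize below.
    if rank == 0:
        comm.send(buf.data.ptr, count, nccl.NCCL_FLOAT32, 1, stream.ptr)
    else:
        comm.recv(buf.data.ptr, count, nccl.NCCL_FLOAT32, 0, stream.ptr)

# Warmup, then time a one-way transfer from rank 0 to rank 1.
for _ in range(5):
    p2p_once()
stream.synchronize()

n_iters = 20
tic = time.time()
for _ in range(n_iters):
    p2p_once()
stream.synchronize()
toc = time.time()

if rank == 0:
    avg_s = (toc - tic) / n_iters
    print(f"size {nbytes / 1e6:.1f} MB, avg {avg_s * 1e3:.3f} ms, "
          f"bandwidth {nbytes / avg_s / 1e9:.2f} GB/s")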
@merrymercy
merrymercy / redirect.py
Created March 21, 2023 00:57
Permanently redirect to another URL.
from fastapi import FastAPI
from starlette.responses import RedirectResponse

app = FastAPI()


@app.get("/")
async def redirect():
    response = RedirectResponse(url="https://alpa-projects.github.io/opt", status_code=301)
    return response
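To serve the redirect (an assumed invocation, not part of the gist), save the snippet as redirect.py and run it under any ASGI server, for example:

uvicorn redirect:app --host 0.0.0.0 --port 8000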
@merrymercy
merrymercy / test_pad.py
Last active August 21, 2022 18:24
test padding in the middle
"""Use huggingface/transformers interface and Alpa backend for distributed inference."""
from transformers import AutoTokenizer
from opt_serving.model.wrapper import get_model
import numpy as np
import torch
# Load the tokenizer. We have to use the 30B version because
# other versions have some issues. The 30B version works for all OPT models.
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-30b", use_fast=False)
tokenizer.add_bos_token = False
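The preview stops at the tokenizer setup. Given the gist title, one plausible continuation is building a batch whose pad tokens sit in the middle of a sequence rather than at either end. The sketch below shows only that batch construction with the tokenizer and torch; the prompts, the split point, and the final call into the model returned by get_model are assumptions, since get_model's arguments are not visible in the preview.

prompts = ["Paris is the capital city of", "Computer science is the study of"]
ids = [tokenizer(p).input_ids for p in prompts]
max_len = max(len(x) for x in ids)
pad_id = tokenizer.pad_token_id

padded, masks = [], []
for x in ids:
    n_pad = max_len - len(x)
    split = len(x) // 2
    # Insert the pad tokens in the middle of the sequence instead of at either end.
    padded.append(x[:split] + [pad_id] * n_pad + x[split:])
    masks.append([1] * split + [0] * n_pad + [1] * (len(x) - split))

input_ids = torch.tensor(padded)
attention_mask = torch.tensor(masks)
# These tensors would then be fed to the model returned by get_model(...)
# to check that mid-sequence padding is masked out correctly.
print(input_ids)
print(attention_mask)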