Simon Mo (simon-mo)
Simple in-memory job queue in Ray Serve

serve run queue_proxy:app
python client.py

from ray import serve

@serve.deployment
def app():
    return "hi"
" Sensible setup
set nocompatible
filetype plugin on
" Install Vim Plug
if empty(glob('~/.vim/autoload/plug.vim'))
silent !curl -fLo ~/.vim/autoload/plug.vim --create-dirs
\ https://raw.githubusercontent.com/junegunn/vim-plug/master/plug.vim
endif
kind: ConfigMap
apiVersion: v1
metadata:
  name: app-py-script
data:
  app.py: |
    from fastapi import FastAPI
    import asyncio

    app = FastAPI()

    from ray import serve

    @serve.deployment(route_prefix="/", num_replicas=20)
    def f():
        return "Hello"

    serve.start(detached=True, http_options={"host": "0.0.0.0"})
    f.deploy()
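Once this ConfigMap is mounted and app.py has been executed on the Ray cluster, the service should answer over HTTP. A minimal sanity check, assuming Serve's default port 8000 and that the route is reachable from wherever this runs (for example via kubectl port-forward):

# Hypothetical smoke test for the deployment above; the host/port are assumptions
# based on Ray Serve's defaults, not part of the original gist.
import requests

resp = requests.get("http://localhost:8000/")
print(resp.status_code, resp.text)  # expected: 200 Hello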

Simple Model Multiplexing with Ray Serve

This snippet implements a simple strategy for model multiplexing in Ray Serve.

  • It utilizes Serve's autoscaling capability to adjust the replica count with traffic.
  • It hosts an LRU cache of models per replica (sketched more fully after the code below).

Run it

  • pip install ray[serve]
  • Save the file as app.py
  • serve run app:entrypoint
from ray import serve
from ray.serve.drivers import DAGDriver
from ray.serve.dag import InputNode
from ray.serve.http_adapters import json_request

@serve.deployment
class A:
    def predict(self, inp):
        return inp
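The lines above are only the head of the gist. Below is a minimal sketch of the per-replica LRU cache the bullets describe, assuming each request carries a model_id and models are loaded lazily. It uses the plain deployment API rather than the DAGDriver imported above, and every name except entrypoint (which matches serve run app:entrypoint) is illustrative.

# Hypothetical sketch of per-replica model multiplexing with an LRU cache.
# The deployment autoscales on traffic; each replica keeps its own small cache of loaded models.
from collections import OrderedDict

from ray import serve
from starlette.requests import Request


@serve.deployment(autoscaling_config={"min_replicas": 1, "max_replicas": 8})
class Multiplexer:
    def __init__(self, cache_size: int = 4):
        self._cache_size = cache_size
        self._models = OrderedDict()  # model_id -> loaded model, ordered by recency

    def _load_model(self, model_id: str):
        # Stand-in for real model loading (e.g. from object storage or disk).
        return lambda x: f"{model_id}: {x}"

    def _get_model(self, model_id: str):
        if model_id in self._models:
            self._models.move_to_end(model_id)  # mark as most recently used
        else:
            if len(self._models) >= self._cache_size:
                self._models.popitem(last=False)  # evict the least recently used model
            self._models[model_id] = self._load_model(model_id)
        return self._models[model_id]

    async def __call__(self, request: Request):
        body = await request.json()
        model = self._get_model(body["model_id"])
        return {"output": model(body["input"])}


entrypoint = Multiplexer.bind()

Because the cache is per replica, autoscaling trades memory for throughput: more replicas means more duplicate copies of hot models, but also more concurrent capacity.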
import asyncio
import types
from scanner import _PyObjScanner

coroutine = types.CoroutineType

async def main():
    scanner = _PyObjScanner()

    async def f():
        pass
from io import BytesIO
import random
import time
from pydantic import BaseModel
from pprint import pprint
import threading
import requests
import torch
import torchvision.models as models