Skip to content

Instantly share code, notes, and snippets.

View chengscott's full-sized avatar

Scott Cheng chengscott

View GitHub Profile
@chengscott
chengscott / ipc.cc
Last active February 7, 2025 13:24
c++ multiprocessing Barrier/CondVar/SharedMemory using SysV IPC
#include "ipc.hpp"
#include <cassert>
#include <cstring>
#include <sys/ipc.h>
#include <sys/sem.h>
#include <sys/shm.h>
void *Shm::create(int key, size_t size, int shmflg) {
flag = shmflg;
shmid = shmget(key, size, shmflg);
@chengscott
chengscott / host_vector.cu
Created January 30, 2025 10:21
host_vector (a std::vector with a custom allocator based on cudaMallocHost)
#include "host_vector.hpp"
// Evaluates a CUDA runtime call and throws std::runtime_error if it does not
// return cudaSuccess. The message contains the stringified call, the text
// from GetLastErrorString(), and the file/line of the call site.
//
// The body is wrapped in do { ... } while (0) so the macro expands to exactly
// one statement: `if (c) DEVICE_CHECK(x); else ...` parses as intended and no
// else-branch can attach to the macro's internal `if`. Invoke it with a
// trailing semicolon, like a function call.
#define DEVICE_CHECK(call)                                                     \
  do {                                                                         \
    if ((call) != cudaSuccess) {                                               \
      throw std::runtime_error(#call " API call failed: " +                    \
                               GetLastErrorString() + " at " + __FILE__ +      \
                               ", line " + std::to_string(__LINE__));          \
    }                                                                          \
  } while (0)
// Returns the description of the most recent CUDA runtime error, as reported
// by cudaGetLastError() and rendered by cudaGetErrorString().
static std::string GetLastErrorString() {
  const char *description = cudaGetErrorString(cudaGetLastError());
  return description;
}
namespace details {
@chengscott
chengscott / client.cc
Last active January 22, 2025 13:08
cppzmq ROUTER DEALER & pyzmq DEALER
#include "client.hpp"
// Replaces `sock` with a new ROUTER socket created on `ctx`, sets its linger
// option to 0, and binds it to the given TCP port on all interfaces
// ("tcp://*:<port>").
void Sock::bind(size_t port) {
  sock = zmq::socket_t(ctx, zmq::socket_type::router);
  sock.set(zmq::sockopt::linger, 0);

  const std::string endpoint = "tcp://*:" + std::to_string(port);
  sock.bind(endpoint);
}
void Sock::close() {
sock.close();
@chengscott
chengscott / flat_array.cc
Last active January 8, 2025 01:38
flat_array_t and flat_tuple_t are automatically hashable type wrappers for std::array<T, N> and std::tuple<Ts...> that can be used as keys in map/unordered_map
#include <array>
#include <functional>
#include <iostream>
#include <tuple>
#include <unordered_map>
template <class T, size_t N> union flat_array_t {
using hash_t = std::tuple_element_t<
sizeof(T) - 1,
std::tuple<std::uint8_t, std::uint16_t, std::uint32_t, std::uint64_t>>;
@chengscott
chengscott / tmpl_range.cc
Last active January 7, 2025 16:42
Select template by range
#include <iostream>
// Empty tag types used for compile-time selection. The selected_t alias that
// follows resolves to case1_tag when 10 <= N && N <= 20.
// NOTE(review): the conditions mapping to case2_tag and default_tag are not
// visible in this excerpt — confirm against the full selected_t definition.
struct default_tag {};
struct case1_tag {};
struct case2_tag {};
template <size_t N> using selected_t = typename std::conditional<
10 <= N && N <= 20,
case1_tag,
typename std::conditional<
@chengscott
chengscott / README.md
Last active June 23, 2024 13:28
Run different jobs at specific time points on weekdays using a systemd template unit
systemctl daemon-reload
systemctl enable --now job@{08:00,12:00,21:00}.timer
#include <iostream>
#include <memory>
#include <mutex>
#include <thread>
class MeanTracker {
int total_ = 0;
float mean_ = 0.f;
public:
#include <iostream>
// Inclusive running sum of `val` across the lanes of a warp.
//
// At each step (distances 1, 2, 4, 8, 16) a lane reads the partial sum held
// by the lane `delta` positions below it via __shfl_up_sync and, if such a
// lane exists (lane >= delta), adds it to its own. On return each lane holds
// the sum of `val` over lanes 0..lane.
//
// The shuffle uses the full mask 0xffffffff, so this presumably expects all
// 32 lanes of the warp to call it together — confirm at call sites. The lane
// index is derived from threadIdx.x only.
__device__ int warpInclusiveScan(int val) {
  const int lane = threadIdx.x % warpSize;
  for (int delta = 1; delta < 32; delta *= 2) {
    const int upstream = __shfl_up_sync(0xffffffff, val, delta);
    // Lanes below `delta` have no source lane; they keep their value.
    val += (lane >= delta) ? upstream : 0;
  }
  return val;
}
diff --git a/python/aitemplate/backend/cuda/conv2d/common.py b/python/aitemplate/backend/cuda/conv2d/common.py
index 8cf7fb2..ca13a72 100644
--- a/python/aitemplate/backend/cuda/conv2d/common.py
+++ b/python/aitemplate/backend/cuda/conv2d/common.py
@@ -501,6 +501,7 @@ def emit_instance(op):
emiter = cutlass_lib.conv2d_operation.EmitConv2dWithBroadcastInstance()
else:
emiter = cutlass_lib.conv2d_operation.EmitConv2dInstance()
+ op.tile_description.stages = 2
op_def = emiter.emit(op)
static size_t GLOBAL_WORKSPACE_SIZE_DeviceConvFwdInstance_0 = 0;
#include <cstdio>
#include <stdexcept>
#include "cutlass/cutlass.h"
#include "cutlass/conv/kernel/default_conv2d_fprop.h"