Skip to content

Instantly share code, notes, and snippets.

View abcdabcd987's full-sized avatar
😄
( ´ ▽ ` )ノ

Lequn Chen abcdabcd987

😄
( ´ ▽ ` )ノ
View GitHub Profile
@abcdabcd987
abcdabcd987 / 2025-02-23-mla-flashinfer-vs-deepseek.py
Last active February 28, 2025 19:31
MLA Kernel Performance: FlashInfer vs DeepSeek FlashMLA
# Results: https://docs.google.com/spreadsheets/d/1t0Txa7Ph9u7Su9LyWpS24vqr9A5FB-FyL0EZNpYOqwg/edit?gid=0#gid=0
# FlashInfer: 28053ac54023fbf3fb552f7be015b0f90a37ed76
# FlashMLA : accc1695ee0ff996ec63eaf2ebcbf6874ed0e7df
import itertools
import torch
from flash_mla import flash_mla_with_kvcache, get_mla_metadata
from flashinfer import BatchMLAPagedAttentionWrapper
from triton.testing import do_bench # type: ignore[import]
# PyTorch 2.6 + Cuda 12.6 Segmentation Fault
# Good combinations:
# --no-early-bind --sync=no
# --no-early-bind --sync=sleep
# --no-early-bind --sync=barrier
# Bad combinations:
# --early-bind --sync=no
# Good on NCCL 2.21.5 (official PyTorch wheel) but segfaults on NCCL 2.25.1 (custom-built):
# --early-bind --sync=sleep
# --early-bind --sync=barrier
@abcdabcd987
abcdabcd987 / delay-fs.cc
Last active November 28, 2022 22:57
A filesystem from which reading a file incurs 50ms delay for each byte.
#if 0
set -e
binname=$(mktemp --suffix -delay-fs.bin)
g++ -o "$binname" -Wall -g -O2 "$0" -lfuse3 -lpthread
"$binname" $@
rm "$binname"
exit
#endif
// delay-fs:
// A filesystem from which reading a file incurs 50ms delay for each byte.
diff -ruN orig/kernel/src/allocator/bin.rs new/kernel/src/allocator/bin.rs
--- orig/kernel/src/allocator/bin.rs 2018-08-02 19:23:14.000000000 +0800
+++ new/kernel/src/allocator/bin.rs 2018-08-04 11:18:29.000000000 +0800
@@ -1,5 +1,6 @@
use std::fmt;
-use alloc::heap::{AllocErr, Layout};
+use core::alloc::{AllocErr, Layout};
+use core::ptr::NonNull;
use allocator::util::*;
// ==UserScript==
// @name PuTao without 4K
// @version 1.0.1
// @namespace https://abcdabcd987.com
// @homepageURL https://gist.github.com/abcdabcd987/9839987cf0b7cc13e2d9e4b030a188b5
// @author Lequn Chen
// @description Remove links to 4K videos in the torrent list.
// @run-at document-idle
// @match *://pt.sjtu.edu.cn/torrents.php*
// ==/UserScript==
diff -ruN orig/bootloader/Xargo.toml new/bootloader/Xargo.toml
--- orig/bootloader/Xargo.toml 2018-07-18 23:59:41.000000000 +0800
+++ new/bootloader/Xargo.toml 2018-07-19 22:31:03.000000000 +0800
@@ -2,10 +2,6 @@
core = {}
std_unicode = {}
-[dependencies.compiler_builtins]
-features = ["mem"]
-stage = 1
// g++ cache-prefetching.cc -Wall -std=c++11 -g -pthread -O2
#include <string>
#include <vector>
#include <mutex>
#include <random>
#include <chrono>
#include <thread>
#include <iomanip>
#include <iostream>
// g++-7 sharding.cc -Wall -std=c++17 -g -pthread -O2 -ltbb
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <string>
#include <mutex>
#include <shared_mutex>
#include <optional>
#include <random>
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
#!/usr/local/bin/python2
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function, division
import os
# os.environ['HTTP_PROXY'] = 'http://127.0.0.1:8081/'
# os.environ['HTTPS_PROXY'] = 'http://127.0.0.1:8081/'
import datetime
import hashlib
import requests
from pprint import pprint