May 4, 2025 07:50 · February 23, 2025 22:50 · November 28, 2022 22:57 · August 16, 2018 01:53 · August 7, 2018 04:08 · August 16, 2018 01:53
 # Results: https://docs.google.com/spreadsheets/d/1t0Txa7Ph9u7Su9LyWpS24vqr9A5FB-FyL0EZNpYOqwg/edit?gid=0#gid=0
 # FlashInfer: 28053ac54023fbf3fb552f7be015b0f90a37ed76
 # FlashMLA  : accc1695ee0ff996ec63eaf2ebcbf6874ed0e7df
 import itertools

 import torch
 from flash_mla import flash_mla_with_kvcache, get_mla_metadata
 from flashinfer import BatchMLAPagedAttentionWrapper
 from triton.testing import do_bench  # type: ignore[import]
 # PyTorch 2.6 + CUDA 12.6 segmentation fault
 # Good combinations:
 #   --no-early-bind --sync=no
 #   --no-early-bind --sync=sleep
 #   --no-early-bind --sync=barrier
 # Bad combinations:
 #   --early-bind --sync=no
 # Good on NCCL 2.21.5 (official PyTorch wheel) but segfault on NCCL 2.25.1 (custom built):
 #   --early-bind --sync=sleep
 #   --early-bind --sync=barrier
 #if 0
 set -e
 binname=$(mktemp --suffix -delay-fs.bin)
 g++ -o "$binname" -Wall -g -O2 "$0" -lfuse3 -lpthread
 "$binname" $@
 rm "$binname"
 exit
 #endif
 // delay-fs:
 //   A filesystem from which reading a file incurs 50ms delay for each byte.
 diff -ruN orig/kernel/src/allocator/bin.rs new/kernel/src/allocator/bin.rs
 --- orig/kernel/src/allocator/bin.rs	2018-08-02 19:23:14.000000000 +0800
 +++ new/kernel/src/allocator/bin.rs	2018-08-04 11:18:29.000000000 +0800
 @@ -1,5 +1,6 @@
 use std::fmt;
 -use alloc::heap::{AllocErr, Layout};
 +use core::alloc::{AllocErr, Layout};
 +use core::ptr::NonNull;
 
 use allocator::util::*;
 // ==UserScript==
 // @name        PuTao without 4K
 // @version     1.0.1
 // @namespace   https://abcdabcd987.com
 // @homepageURL https://gist.github.com/abcdabcd987/9839987cf0b7cc13e2d9e4b030a188b5
 // @author      Lequn Chen
 // @description Remove links to 4K videos in the torrent list.
 // @run-at      document-idle
 // @match       *://pt.sjtu.edu.cn/torrents.php*
 // ==/UserScript==
 diff -ruN orig/bootloader/Xargo.toml new/bootloader/Xargo.toml
 --- orig/bootloader/Xargo.toml	2018-07-18 23:59:41.000000000 +0800
 +++ new/bootloader/Xargo.toml	2018-07-19 22:31:03.000000000 +0800
 @@ -2,10 +2,6 @@
 core = {}
 std_unicode = {}
 
 -[dependencies.compiler_builtins]
 -features = ["mem"]
 -stage = 1
 // g++ cache-prefetching.cc -Wall -std=c++11 -g -pthread -O2

 #include <string>
 #include <vector>
 #include <mutex>
 #include <random>
 #include <chrono>
 #include <thread>
 #include <iomanip>
 #include <iostream>
 // g++-7 sharding.cc -Wall -std=c++17 -g -pthread -O2 -ltbb

 #include <cassert>
 #include <cstdlib>
 #include <cstring>
 #include <string>
 #include <mutex>
 #include <shared_mutex>
 #include <optional>
 #include <random>
 #!/usr/local/bin/python2
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals, print_function, division
 import os
 # os.environ['HTTP_PROXY'] = 'http://127.0.0.1:8081/'
 # os.environ['HTTPS_PROXY'] = 'http://127.0.0.1:8081/'
 import datetime
 import hashlib
 import requests
 from pprint import pprint
	# Results: https://docs.google.com/spreadsheets/d/1t0Txa7Ph9u7Su9LyWpS24vqr9A5FB-FyL0EZNpYOqwg/edit?gid=0#gid=0
	# FlashInfer: 28053ac54023fbf3fb552f7be015b0f90a37ed76
	# FlashMLA : accc1695ee0ff996ec63eaf2ebcbf6874ed0e7df
	import itertools

	import torch
	from flash_mla import flash_mla_with_kvcache, get_mla_metadata
	from flashinfer import BatchMLAPagedAttentionWrapper
	from triton.testing import do_bench # type: ignore[import]
	# PyTorch 2.6 + CUDA 12.6 segmentation fault
	# Good combinations:
	# --no-early-bind --sync=no
	# --no-early-bind --sync=sleep
	# --no-early-bind --sync=barrier
	# Bad combinations:
	# --early-bind --sync=no
	# Good on NCCL 2.21.5 (official PyTorch wheel) but segfault on NCCL 2.25.1 (custom built):
	# --early-bind --sync=sleep
	# --early-bind --sync=barrier
	#if 0
	set -e
	binname=$(mktemp --suffix -delay-fs.bin)
	g++ -o "$binname" -Wall -g -O2 "$0" -lfuse3 -lpthread
	"$binname" $@
	rm "$binname"
	exit
	#endif
	// delay-fs:
	// A filesystem from which reading a file incurs 50ms delay for each byte.
	diff -ruN orig/kernel/src/allocator/bin.rs new/kernel/src/allocator/bin.rs
	--- orig/kernel/src/allocator/bin.rs 2018-08-02 19:23:14.000000000 +0800
	+++ new/kernel/src/allocator/bin.rs 2018-08-04 11:18:29.000000000 +0800
	@@ -1,5 +1,6 @@
	use std::fmt;
	-use alloc::heap::{AllocErr, Layout};
	+use core::alloc::{AllocErr, Layout};
	+use core::ptr::NonNull;

	use allocator::util::*;
	// ==UserScript==
	// @name PuTao without 4K
	// @version 1.0.1
	// @namespace https://abcdabcd987.com
	// @homepageURL https://gist.github.com/abcdabcd987/9839987cf0b7cc13e2d9e4b030a188b5
	// @author Lequn Chen
	// @description Remove links to 4K videos in the torrent list.
	// @run-at document-idle
	// @match *://pt.sjtu.edu.cn/torrents.php*
	// ==/UserScript==
	diff -ruN orig/bootloader/Xargo.toml new/bootloader/Xargo.toml
	--- orig/bootloader/Xargo.toml 2018-07-18 23:59:41.000000000 +0800
	+++ new/bootloader/Xargo.toml 2018-07-19 22:31:03.000000000 +0800
	@@ -2,10 +2,6 @@
	core = {}
	std_unicode = {}

	-[dependencies.compiler_builtins]
	-features = ["mem"]
	-stage = 1
	// g++ cache-prefetching.cc -Wall -std=c++11 -g -pthread -O2

	#include <string>
	#include <vector>
	#include <mutex>
	#include <random>
	#include <chrono>
	#include <thread>
	#include <iomanip>
	#include <iostream>
	// g++-7 sharding.cc -Wall -std=c++17 -g -pthread -O2 -ltbb

	#include <cassert>
	#include <cstdlib>
	#include <cstring>
	#include <string>
	#include <mutex>
	#include <shared_mutex>
	#include <optional>
	#include <random>
	#!/usr/local/bin/python2
	# -*- coding: utf-8 -*-
	from __future__ import unicode_literals, print_function, division
	import os
	# os.environ['HTTP_PROXY'] = 'http://127.0.0.1:8081/'
	# os.environ['HTTPS_PROXY'] = 'http://127.0.0.1:8081/'
	import datetime
	import hashlib
	import requests
	from pprint import pprint