#include <stdio.h>
#include <chrono>

// Empty kernel, handy as a baseline for launch-overhead measurements.
__global__ void noop() { }

int main(int argc, const char *argv[]) {
  cudaDeviceProp prop;
  auto rc = cudaGetDeviceProperties(&prop, 0);
  if (rc != cudaSuccess) {
    return 1;
  }
  printf("Running on %s sm%d.%d multiProcessorCount = %d maxBlocksPerMultiProcessor = %d maxThreadsPerBlock = %d\n",
         prop.name, prop.major, prop.minor, prop.multiProcessorCount, prop.maxBlocksPerMultiProcessor, prop.maxThreadsPerBlock);
  return 0;
}
a_cpp = """#include <iostream>
namespace foo::bar {
inline namespace baz {
int inc(int x) {
std::cout << "do inc from lib_a" << std::endl;
return x + 1;
}
} // inline namespace baz
void do_a(int x) {
import torch
import sys
import timeit

def add_repeat(x, y, repeat=10):
    # Clone x, then apply `repeat` in-place additions of y.
    rc = x.clone()
    for i in range(repeat):
        rc += y
    return rc
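
The snippet imports `timeit` but is cut off before using it; a minimal sketch of how `add_repeat` might be timed (shapes and iteration counts below are illustrative assumptions, not part of the original):

```
x = torch.rand(1024, 1024)
y = torch.rand(1024, 1024)
# Average wall-clock time per call, using the function defined above.
per_call = timeit.timeit(lambda: add_repeat(x, y), number=100) / 100
print(f"add_repeat: {per_call * 1e3:.3f} ms per call")
```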
#include <stdio.h>
__global__ void print() {
  printf("Hello World of CUDA threadIdx.x=%d\n", threadIdx.x);
}
__global__ void noop() { }
int main(int argc, const char *argv[]) {
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);
  // Truncated in the original; a plausible completion launches the kernel:
  print<<<1, 4>>>();
  cudaDeviceSynchronize();
  return 0;
}
import torch
import os

os.environ["MTL_CAPTURE_ENABLED"] = "1"

# A row of more than 2**31 int8 elements, to exercise 64-bit indexing on MPS.
a = torch.ones(2, (1 << 31) + 5, dtype=torch.int8, device='mps')
index_0 = torch.tensor([0, -1, 0, 1], device=a.device)
index_1 = torch.tensor([-2, -1, 0, 1], device=a.device)
values = torch.tensor([12, 13, 10, 11], dtype=a.dtype, device=a.device)
with torch.mps.profiler.metal_capture("index_put"):
    a.index_put_((index_0, index_1), values, accumulate=True)
b = a[1, -2].cpu()  # copy one element back to the host
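
The snippet stops here; one plausible follow-up (an assumption, not part of the original) is to cross-check the result against the same operation on CPU:

```
# Hypothetical verification step; note the CPU copy needs ~4.3 GB of RAM.
a_cpu = torch.ones(2, (1 << 31) + 5, dtype=torch.int8)
a_cpu.index_put_((index_0.cpu(), index_1.cpu()), values.cpu(), accumulate=True)
assert b.item() == a_cpu[1, -2].item(), f"MPS={b.item()} CPU={a_cpu[1, -2].item()}"
```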
#!/usr/bin/env python3
import sys
import subprocess
import urllib.request
import json

def get_latest_version(package_name: str) -> str:
    """Get latest version from PyPI"""
    api_url = f"https://pypi.org/pypi/{package_name}/json"
    # Truncated in the original; the standard PyPI JSON API lookup would be:
    with urllib.request.urlopen(api_url) as resp:
        return json.load(resp)["info"]["version"]
With cudnn-9.10.2.21
```
$ CUDNN_LOGINFO_DBG=3 RUN_SLOW=1 python3 -m pytest -v tests/models/vit/test_modeling_vit.py::ViTModelTest::test_batching_equivalence
========================================================================================== test session starts ===========================================================================================
platform linux -- Python 3.10.12, pytest-8.4.1, pluggy-1.6.0 -- /home/ubuntu/py3.10-nightly/bin/python3
cachedir: .pytest_cache
rootdir: /home/ubuntu/transformers
configfile: pyproject.toml
plugins: xdist-3.8.0, asyncio-1.1.0, rerunfailures-15.1, order-1.3.0, timeout-2.4.0, rich-0.2.0
asyncio: mode=strict, asyncio_default_fixture_loop_scope=function, asyncio_default_test_loop_scope=function
```
import torch
import torch.utils.cpp_extension as _ce
import tempfile
import os
import subprocess

src = """#include <c10/util/BFloat16.h>
#include <iostream>
int main() {
  std::cout << c10::BFloat16(3.14) << std::endl;
  return 0;
}
"""
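
The snippet is cut off before the build step; given the `tempfile`/`subprocess` imports, a plausible continuation (compiler choice and flags are assumptions) writes `src` to disk, compiles it against PyTorch's bundled headers, and runs it:

```
with tempfile.TemporaryDirectory() as tmpdir:
    cpp = os.path.join(tmpdir, "bf16.cpp")
    exe = os.path.join(tmpdir, "bf16")
    with open(cpp, "w") as f:
        f.write(src)
    # include_paths() lists the ATen/c10 header dirs; BFloat16.h is header-only.
    flags = [f"-I{p}" for p in _ce.include_paths()]
    subprocess.check_call(["g++", "-std=c++17", cpp, "-o", exe] + flags)
    print(subprocess.check_output([exe], text=True).strip())
```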
import ctypes
import torch
import time

def nvrtc_compile(source: str) -> str:
    from ctypes import CDLL, c_void_p, c_char_p, c_size_t, byref, create_string_buffer
    libnvrtc = CDLL('libnvrtc.so')

    def get_error_string(result) -> str:
        # nvrtcGetErrorString takes a status code and returns a const char*
        libnvrtc.nvrtcGetErrorString.restype = c_char_p
        return libnvrtc.nvrtcGetErrorString(result).decode('ascii')
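
    # The original gist stops above. Below is a hedged sketch of the standard
    # NVRTC flow that would complete this helper; the entry points are real,
    # but their use here is an illustrative reconstruction.
    prog = c_void_p()
    result = libnvrtc.nvrtcCreateProgram(byref(prog), source.encode(), b"kernel.cu", 0, None, None)
    if result != 0:
        raise RuntimeError(f"nvrtcCreateProgram failed: {get_error_string(result)}")
    result = libnvrtc.nvrtcCompileProgram(prog, 0, None)
    if result != 0:
        raise RuntimeError(f"nvrtcCompileProgram failed: {get_error_string(result)}")
    ptx_size = c_size_t()
    libnvrtc.nvrtcGetPTXSize(prog, byref(ptx_size))
    ptx = create_string_buffer(ptx_size.value)
    libnvrtc.nvrtcGetPTX(prog, ptx)
    return ptx.value.decode('ascii')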
"""
Example showing how to use the no_header mode with a TensorBase CUDA extension.
This example creates a CUDA extension that directly includes ATen/core/TensorBase.h
instead of torch/extension.h, resulting in faster compilation with no_header=True.
"""
from datetime import datetime
import torch
import torch.utils.cpp_extension
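
The file is truncated after the imports. A minimal sketch of what the body might look like, assuming `no_header=True` is a `load_inline` keyword that skips the implicit `torch/extension.h` include as the docstring describes (a C++ source is used for brevity, and the module/function names are hypothetical):

```
# Hypothetical source: pull in only the lightweight TensorBase header.
cpp_src = """
#include <ATen/core/TensorBase.h>
int64_t answer() { return 42; }
"""

start = datetime.now()
ext = torch.utils.cpp_extension.load_inline(
    name="tensorbase_ext",
    cpp_sources=cpp_src,
    functions=["answer"],
    no_header=True,  # assumed flag name, taken from the docstring above
)
print(f"compile time: {datetime.now() - start}")
print(ext.answer())  # expected: 42
```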