This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <stdio.h> | |
| #include <chrono> | |
| __global__ void noop() { } | |
| int main(int argc, const char *argv[]) { | |
| cudaDeviceProp prop; | |
| auto rc = cudaGetDeviceProperties(&prop, 0); | |
| printf("Running on %s sm%d.%d multiProcessorCount = %d maxBlocksPerMultiProcessor = %d maxThreadsPerBlock = %d\n", | |
| prop.name, prop.major, prop.minor, prop.multiProcessorCount, prop.maxBlocksPerMultiProcessor, prop.maxThreadsPerBlock); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| a_cpp = """#include <iostream> | |
| namespace foo::bar { | |
| inline namespace baz { | |
| int inc(int x) { | |
| std::cout << "do inc from lib_a" << std::endl; | |
| return x + 1; | |
| } | |
| } // inline namespace baz | |
| void do_a(int x) { |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import torch | |
| import sys | |
| import timeit | |
| def add_repeat(x, y, repeat=10): | |
| rc = x.clone() | |
| for i in range(repeat): | |
| rc += y | |
| return rc |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <stdio.h> | |
| __global__ void print() { | |
| printf("Hello World of CUDA threadIdx.x=%d\n", threadIdx.x); | |
| } | |
| __global__ void noop() { } | |
| int main(int argc, const char *argv[]) { | |
| cudaDeviceProp prop; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import torch | |
| import os | |
| os.environ["MTL_CAPTURE_ENABLED"]="1" | |
| a = torch.ones(2, (1 << 31) + 5, dtype=torch.int8, device='mps') | |
| index_0 = torch.tensor([0, -1, 0, 1], device=a.device) | |
| index_1 = torch.tensor([-2, -1, 0, 1], device=a.device) | |
| values = torch.tensor([12, 13, 10, 11], dtype=a.dtype, device=a.device) | |
| with torch.mps.profiler.metal_capture("index_put"): | |
| a.index_put_((index_0, index_1), values, accumulate=True) | |
| b = a[1, -2].cpu() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| import sys | |
| import subprocess | |
| import urllib.request | |
| import json | |
| def get_latest_version(package_name: str) -> str: | |
| """Get latest version from PyPI""" | |
| api_url = f"https://pypi.org/pypi/{package_name}/json" |
This file has been truncated, but you can view the full file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| With cudnn-9.10.2.21 | |
| ``` | |
| $ CUDNN_LOGINFO_DBG=3 RUN_SLOW=1 python3 -m pytest -v tests/models/vit/test_modeling_vit.py::ViTModelTest::test_batching_equivalence | |
| ========================================================================================== test session starts =========================================================================================== | |
| platform linux -- Python 3.10.12, pytest-8.4.1, pluggy-1.6.0 -- /home/ubuntu/py3.10-nightly/bin/python3 | |
| cachedir: .pytest_cache | |
| rootdir: /home/ubuntu/transformers | |
| configfile: pyproject.toml | |
| plugins: xdist-3.8.0, asyncio-1.1.0, rerunfailures-15.1, order-1.3.0, timeout-2.4.0, rich-0.2.0 | |
| asyncio: mode=strict, asyncio_default_fixture_loop_scope=function, asyncio_default_test_loop_scope=function |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import torch | |
| import torch.utils.cpp_extension as _ce | |
| import tempfile | |
| import os | |
| import subprocess | |
| src = """#include <c10/util/BFloat16.h> | |
| #include <iostream> | |
| int main() { | |
| std::cout << c10::BFloat16(3.14) << std::endl; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import ctypes | |
| import torch | |
| import time | |
| def nvrtc_compile(source: str) -> str: | |
| from ctypes import CDLL, c_void_p, c_char_p, c_size_t, byref, create_string_buffer | |
| libnvrtc = CDLL('libnvrtc.so') | |
| def get_error_string() -> str: | |
| err_p = c_char_p() | |
| libnvrtc.nvrtcGetErrorString(result, byref(err_str)) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| Example showing how to use the no_header mode with a TensorBase CUDA extension | |
| This example creates a CUDA extension that directly includes ATen/core/TensorBase.h | |
| instead of torch/extension.h, resulting in faster compilation with no_header=True | |
| """ | |
| from datetime import datetime | |
| import torch | |
| import torch.utils.cpp_extension |
NewerOlder