Install
git clone https://github.com/openai/triton.git;
cd triton/python;
pip install cmake; # build time dependency
pip install -e .
pip uninstall pytorch-triton -y
Expected result (-0.1250)
| // | |
| // Generated by LLVM NVPTX Back-End | |
| // | |
| .version 8.0 | |
| .target sm_80 | |
| .address_size 64 | |
| // .globl triton__0d1d2d3d4d56d7d89d1011d1213d1415d1617d1819d2021d2223d2425d2627d2829d3031d3233d3435d3637d3839d4041d42d | |
| .extern .shared .align 1 .b8 global_smem[]; |
Install
git clone https://github.com/openai/triton.git;
cd triton/python;
pip install cmake; # build time dependency
pip install -e .
pip uninstall pytorch-triton -y
Expected result (-0.1250)
| // | |
| // Generated by LLVM NVPTX Back-End | |
| // | |
| .version 8.0 | |
| .target sm_80 | |
| .address_size 64 | |
| // .globl triton__0d1d2d3d | |
| .visible .entry triton__0d1d2d3d( | |
| .param .u64 triton__0d1d2d3d_param_0, | |
| .param .u64 triton__0d1d2d3d_param_1, |
| #include <dlfcn.h> | |
| #include "tool.h" | |
| int main() { | |
| //void *handle = dlopen("./tool.so", RTLD_NOW); | |
| print_t func = (print_t)dlsym(RTLD_NEXT, "print"); | |
| func(); | |
| return 0; | |
| } |
| waka |
How many operations are still pending?
How does a thread ensure that its activities are transferred before it exits? Because metrics are stored in thread-local maps, the thread must collect all of its activities and attribute them before it dies.
opencl-api.c
device_finalizer_register vs thread finalizer
pending_operations
| week |
| #include <iostream> | |
| #include <chrono> | |
| void __attribute__ ((noinline)) init(int *arr, size_t length) { | |
| for (auto i = 0; i < length; ++i) { | |
| arr[i] = 1; | |
| } | |
| } | |
| class NumMatrix { | |
| private: | |
| std::vector<std::vector<int> > tree; | |
| void columnInit(std::vector<int> &tree_column, std::vector<int> &matrix_column) { | |
| size_t columns = matrix_column.size(); | |
| for (size_t i = columns; i < tree_column.size(); ++i) { | |
| tree_column[i] = matrix_column[i - columns]; | |
| } | |
| for (size_t i = columns - 1; i > 0; --i) { |