This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# IMPORTANT: | |
# Install FFMPEG Static build from https://johnvansickle.com/ffmpeg/ | |
import numpy as np | |
import matplotlib | |
#matplotlib.use("Agg") | |
import matplotlib.pyplot as plt | |
from math import sin,cos,pi | |
import matplotlib.animation as manimation |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Working example of report_tensor_allocations_upon_oom with
# (TF1-style) TensorFlow and Keras.
# NOTE(review): uses the tf.compat.v1-era API (ConfigProto/Session);
# assumes `tf` is tensorflow and `K` is the Keras backend — imports are
# outside this snippet.
# Let GPU memory grow on demand instead of reserving it all up front.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
K.set_session(sess)
# Ask TF to dump the live tensor allocations into the error message when
# an OOM happens, so the offending tensors can be identified.
run_opts = tf.RunOptions(report_tensor_allocations_upon_oom=True)
run_metadata = tf.RunMetadata()
# ... (pass run_opts / run_metadata into session.run / model.fit options)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="utf-8"?> | |
<manifest xmlns:android="http://schemas.android.com/apk/res/android" package="com.myapp"> | |
<application android:allowBackup="true" android:icon="@mipmap/ic_launcher" android:label="@string/app_name"
android:roundIcon="@mipmap/ic_launcher_round" android:supportsRtl="true" android:theme="@style/AppTheme"> | |
<service android:name=".ForegroundService" android:enabled="true" android:exported="true"></service> | |
<activity | |
android:configChanges="orientation|keyboardHidden|keyboard|screenSize|locale" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Out-of-range shared or local address
========= at 0xbd0 in /home/klondenberg/github/pytorch/pytorch/third_party/cutlass/include/cutlass/arch/barrier.h:169:cutlass::arch::ClusterBarrier::init(const unsigned long *, unsigned int)
========= by thread (0,0,0) in block (0,1,0)
========= Device Frame:/home/klondenberg/github/pytorch/pytorch/third_party/cutlass/include/cutlass/arch/barrier.h:127:cutlass::arch::ClusterBarrier::init(unsigned int) const [0xb20]
========= Device Frame:/home/klondenberg/github/pytorch/pytorch/third_party/cutlass/include/cutlass/pipeline/sm90_pipeline.hpp:1073:cutlass::OrderedSequenceBarrier<(int)1, (int)2>::OrderedSequenceBarrier(cutlass::OrderedSequenceBarrier<(int)1, (int)2>::SharedStorage &, const cutlass::OrderedSequenceBarrier<(int)1, (int)2>::Params &) [0xb20]
========= Device Frame:/home/klondenberg/github/pytorch/pytorch/third_party/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp:382:cutlass::gemm::kernel::GemmUniversal<cute::tup |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Environment:
* Linux x64, NVIDIA H100 GPU
* CUDA 12.1
* Cutlass v3.3.0 (tagged release)
Command (example):
nvcc -t=0 -DCUTLASS_ENABLE_TENSOR_CORE_MMA=1 -w -gencode=arch=compute_90a,code=[sm_90a,compute_90a] -O1 -std=c++17 --expt-relaxed-constexpr -lineinfo -g -DCUTLASS_DEBUG_TRACE_LEVEL=1 -Xcompiler=-fPIC -Xcompiler=-fno-strict-aliasing -Xcompiler -fvisibility=hidden -Xcompiler=-Wconversion -I/home/klondenberg/github/pytorch/pytorch/third_party/cutlass/include -I/home/klondenberg/github/pytorch/pytorch/third_party/cutlass/tools/library/include -I/home/klondenberg/github/pytorch/pytorch/third_party/cutlass/tools/library/src -I/home/klondenberg/github/pytorch/pytorch/third_party/cutlass/tools/util/include -L/home/klondenberg/local/cuda121/lib64 -L/home/klondenberg/local/cuda121/lib64/stubs -lcuda -lcudart -DGENERATE_STANDALONE_RUNNER -o broken6 broken6.cu |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Environment:
* Linux x64, NVIDIA H100 GPU
* CUDA 12.1
* Cutlass v3.3.0 (tagged release) and Cutlass v3.2.2 (tagged release)
Command (example):
nvcc -t=0 -DCUTLASS_ENABLE_TENSOR_CORE_MMA=1 -w -gencode=arch=compute_90a,code=[sm_90a,compute_90a] -O1 -std=c++17 --expt-relaxed-constexpr -Xcompiler=-fPIC --use-fast-math -Xcompiler=-fno-strict-aliasing -Xcompiler -fvisibility=hidden -Xcompiler=-Wconversion -I/home/klondenberg/github/pytorch/pytorch/third_party/cutlass/include -I/home/klondenberg/github/pytorch/pytorch/third_party/cutlass/tools/library/include -I/home/klondenberg/github/pytorch/pytorch/third_party/cutlass/tools/library/src -I/home/klondenberg/github/pytorch/pytorch/third_party/cutlass/tools/util/include -L/home/klondenberg/local/cuda121/lib64 -L/home/klondenberg/local/cuda121/lib64/stubs -lcuda -lcudart -DGENERATE_STANDALONE_RUNNER -o performance_repro performance_repro.cu |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Change the environment variables to point to Cutlass and CUDA Toolkit and run this,
# passing one of the standalone repro_N.cu files as argument. It will compile and run
# the example.
set -x
export REPRO_CUTLASS_PATH=/home/klondenberg/github/pytorch/pytorch/third_party/cutlass
export REPRO_CUDA_PATH=/home/klondenberg/local/cuda121
# Use "$1", not "$@": with more than one argument, "${@}.exe" expands to several
# words and both the -o target and the final run command break.
SRC="$1"
if [ -z "$SRC" ]; then
    echo "usage: $0 repro_N.cu" >&2
    exit 1
fi
$REPRO_CUDA_PATH/bin/nvcc -t=0 -DCUTLASS_ENABLE_TENSOR_CORE_MMA=1 -w -gencode=arch=compute_90a,code=[sm_90a,compute_90a] -O1 -std=c++17 --expt-relaxed-constexpr -Xcompiler=-fPIC --use_fast_math -Xcompiler=-fno-strict-aliasing -Xcompiler -fvisibility=hidden -Xcompiler=-Wconversion -I${REPRO_CUTLASS_PATH}/include -I${REPRO_CUTLASS_PATH}/tools/library/include -I${REPRO_CUTLASS_PATH}/tools/library/src -I${REPRO_CUTLASS_PATH}/tools/util/include -L${REPRO_CUDA_PATH}/lib64 -L${REPRO_CUDA_PATH}/lib64/stubs -lcuda -lcudart -DGENERATE_STANDALONE_RUNNER -DNDEBUG -DCUTLASS_DEBUG_TRACE_LEVEL=1 -o "${SRC}.exe" "${SRC}"
"./${SRC}.exe"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def test_wrapper_codegen_statically_known_int_or_none(self) -> typing.List[CachingAutotuner]: | |
from torch._dynamo.utils import detect_fake_mode | |
from torch._inductor.codegen.common import boolean_ops | |
from torch._inductor.codegen.wrapper import WrapperCodeGen | |
from torch._inductor.compile_fx import _shape_env_from_inputs | |
from torch._inductor.debug import DebugContext | |
from torch._inductor.graph import GraphLowering | |
from torch._inductor.virtualized import V | |
from torch.fx.passes.fake_tensor_prop import FakeTensorProp |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cuda.h> | |
#include <iostream> | |
// Abort the process with the last CUDA error if the two values differ.
// Wrapped in do { } while (0) so the macro behaves as a single statement
// (safe after an unbraced if/else); arguments are parenthesized so
// expressions with low-precedence operators compare correctly.
#define ASSERT_EQ(a, b)                                                        \
    do {                                                                       \
        if ((a) != (b)) {                                                      \
            std::cerr << "Error" << std::endl                                  \
                      << " Last CUDA error: "                                  \
                      << cudaGetErrorName(cudaPeekAtLastError()) << ": "       \
                      << cudaGetErrorString(cudaPeekAtLastError())             \
                      << std::endl;                                            \
            exit(1);                                                           \
        }                                                                      \
    } while (0)
__global__ void set_array_value(float *data, size_t num_elements, float value) { | |
int idx = blockIdx.x*blockDim.x + threadIdx.x; | |
if (idx<num_elements) { |
OlderNewer