A tool to generate files for C extensions of CUDA-related libraries for CuPy. Currently covered are cuBLAS, cuSPARSE, and cuSOLVER, which have too many APIs for their extensions to be written by hand.
./gen.sh
| #include <cassert> | |
| #include <iostream> | |
| #include <thread> | |
| __global__ void vecAddOne(float *a, int n) { | |
| int id = blockIdx.x * blockDim.x + threadIdx.x; | |
| if (id < n) | |
| a[id] += 1.0f; | |
| } |
| diff --git a/cupyx/scipy/interpolate/_interpolate.py b/cupyx/scipy/interpolate/_interpolate.py | |
| index bab74671e..ec3c5bcac 100644 | |
| --- a/cupyx/scipy/interpolate/_interpolate.py | |
| +++ b/cupyx/scipy/interpolate/_interpolate.py | |
| @@ -22,7 +22,7 @@ INTERVAL_KERNEL = r''' | |
| extern "C" { | |
| __global__ void find_breakpoint_position( | |
| const double* breakpoints, const double* x, long long* out, | |
| - bool extrapolate, int total_x, int total_breakpoints, bool asc) { | |
| + bool extrapolate, int total_x, int total_breakpoints, const bool* pasc) { |
| { | |
| "_nodetype": "FileAST", | |
| "coord": null, | |
| "ext": [ | |
| { | |
| "_nodetype": "Pragma", | |
| "coord": "../utils/fake_libc_include/_fake_typedefs.h:56:9", | |
| "string": "GCC diagnostic ignored \"-Wunused-function\"" | |
| }, | |
| { |
| import multiprocessing | |
| import cupy | |
| from cupy import cuda | |
| from cupy.cuda import nccl | |
| from cupy import testing | |
| def f(n_devices, device, comm_id, rank): | |
| device.use() | |
| comm = nccl.NcclCommunicator(n_devices, comm_id, rank) |
| $ CHAINER_DTYPE=float16 python train_ptb.py -d 0 -e 10 | |
| #vocab = 10000 | |
| epoch iteration perplexity val_perplexity | |
| 0 500 326440 | |
| 0 1000 301342 | |
| 1 1500 298940 inf | |
| 1 2000 334369 | |
| 1 2500 334369 | |
| 2 3000 306202 inf | |
| 2 3500 339762 |
| $ CHAINER_DTYPE=float16 python postagging.py -d 0 | |
| [nltk_data] Downloading package brown to /home/ext- | |
| [nltk_data] mtakagi/nltk_data... | |
| [nltk_data] Package brown is already up-to-date! | |
| # of sentences: 57340 | |
| # of words: 56057 | |
| # of pos: 472 | |
| epoch main/loss validation/main/loss main/accuracy validation/main/accuracy elapsed_time | |
| 0 244.875 18.3736 | |
| 0 373.75 34.9924 |
| $ CHAINER_DTYPE=float16 python train_memnn.py tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_train.txt tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_test.txt -d 0 | |
| Training data: tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_train.txt: 2000 | |
| Test data: tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_test.txt: 200 | |
| epoch main/loss validation/main/loss main/accuracy validation/main/accuracy | |
| 1 nan nan 0.0017004 0 | |
| 2 nan nan 0 0 | |
| 3 nan nan 0 0 | |
| 4 nan nan 0 0 | |
| 5 nan nan 0 0 | |
| 6 nan nan 0 0 |
| $ CHAINER_DTYPE=float16 python train_mnist.py -d 0 | |
| Device: @cupy:0 | |
| # unit: 1000 | |
| # Minibatch-size: 100 | |
| # epoch: 20 | |
| epoch main/loss validation/main/loss main/accuracy validation/main/accuracy elapsed_time | |
| 1 nan nan 0.0994271 0.0980225 3.91818 | |
| 2 nan nan 0.0997917 0.0980225 6.22553 | |
| 3 nan nan 0.0995833 0.0980225 8.72424 |
| $ CHAINER_DTYPE=float16 python train_dcgan.py -d 0 | |
| Device: @cupy:0 | |
| # Minibatch-size: 50 | |
| # n_hidden: 100 | |
| # epoch: 1000 | |
| epoch iteration gen/loss dis/loss ................] 0.01% | |
| 0 100 nan nan | |
| 0 200 nan nan | |
| 0 300 nan nan |