@glyphx, created January 2, 2017 15:10
gpu flag set
C:\toolkits\anaconda2-4.2.0\python.exe C:/toolkits/cpu_gpu_test.py
1 #define _CUDA_NDARRAY_C
2
3 #include <Python.h>
4 #include <structmember.h>
5 #include "theano_mod_helper.h"
6
7 #include <numpy/arrayobject.h>
8 #include <iostream>
9
10 #include "cuda_ndarray.cuh"
11
12 #ifndef CNMEM_DLLEXPORT
13 #define CNMEM_DLLEXPORT
14 #endif
15
16 #include "cnmem.h"
17 #include "cnmem.cpp"
18
19 //If true, when there is a gpu malloc or free error, we print the size of allocated memory on the device.
20 #define COMPUTE_GPU_MEM_USED 0
21
22 //If true, we fill with NAN allocated device memory.
23 #define ALLOC_MEMSET 0
24
25 //If true, we print out when we free a device pointer, uninitialize a
26 //CudaNdarray, or allocate a device pointer
27 #define PRINT_FREE_MALLOC 0
28
29 //If true, we do error checking at the start of functions, to make sure there
30 //is not a pre-existing error when the function is called.
31 //You probably need to set the environment variable
32 //CUDA_LAUNCH_BLOCKING=1, and/or modify the CNDA_THREAD_SYNC
33 //preprocessor macro in cuda_ndarray.cuh
34 //if you want this to work.
35 #define PRECHECK_ERROR 0
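// A sketch of enabling this pre-check path (assumes a rebuild of this module;
// the CUDA_LAUNCH_BLOCKING step follows the comment above, written for the
// Windows shell used in the command line at the top of this log):
//
//   #define PRECHECK_ERROR 1       // flip the flag above, then rebuild
//   set CUDA_LAUNCH_BLOCKING=1     // in the shell, before starting Python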
36
37 cublasHandle_t handle = NULL;
38 int* err_var = NULL;
39
40 /////////////////////////
41 // Alloc and Free
42 /////////////////////////
43
44 static int g_gpu_context_active = 0;
45
46
47 PyObject *
48 CudaNdarray_Dimshuffle(PyObject* _unused, PyObject* args);
49 static PyObject *CudaNdarray_get_shape(CudaNdarray *self, void *closure);
50
51
52 /**
53 *
54 * In the test program I'm using, the _outstanding_mallocs decreases with every call.
55 * This suggests there are more free() calls being made than alloc(), but I can't figure out why.
56 *
57 */
58 int _outstanding_mallocs[] = {0,0};
59
60 #if COMPUTE_GPU_MEM_USED
61 size_t _allocated_size = 0;
62 size_t _max_allocated_size = 0;
63
64 const int TABLE_SIZE = 10000;
65 struct table_struct{
66 void* ptr;
67 size_t size;
68 };
69 table_struct _alloc_size_table[TABLE_SIZE];
70 #endif
71
72 void * device_malloc(size_t size)
73 {
74 return device_malloc(size, VERBOSE_DEVICE_MALLOC);
75 }
76
77 ///@TODO: thejaswi: link this option to a theano config variable?
78 static bool g_use_cnmem = false;
79 static const int g_max_devices = 8;
80 int initCnmem(int card_number_provided, int card_nb, size_t mem) {
81 static bool cnmemInitialized = false;
82 if(cnmemInitialized) {
83 return 0;
84 }
85 // On stderr to be at the same place as "Using gpu device..."
86 int numDevices = 0;
87 cnmemDevice_t devices[g_max_devices];
88 if(cudaGetDeviceCount(&numDevices) != cudaSuccess) {
89 PyErr_Format(PyExc_RuntimeError,
90 "initCnmem: 'cudaGetDeviceCount' failed! Reason=%s\n",
91 cudaGetErrorString(cudaGetLastError()));
92 return -1;
93 }
94 if(card_number_provided){
95 numDevices = 1;
96 int i = 0;
97 devices[i].device = card_nb;
98 devices[i].size = mem;
99 ///@TODO: thejaswi: add support for multiple streams
100 devices[i].numStreams = 0;
101 devices[i].streams = NULL;
102 devices[i].streamSizes = NULL;
103 }else{
104 for(int i=0;i<numDevices;++i) {
105 devices[i].device = i;
106 devices[i].size = mem;
107 ///@TODO: thejaswi: add support for multiple streams
108 devices[i].numStreams = 0;
109 devices[i].streams = NULL;
110 }
111 }
112
113 ///@TODO: thejaswi: passing custom cnmem flags?
114 cnmemStatus_t status = cnmemInit(numDevices, devices, CNMEM_FLAGS_DEFAULT);
115 if(status != CNMEM_STATUS_SUCCESS) {
116 PyErr_Format(PyExc_RuntimeError,
117 "initCnmem: cnmemInit call failed! Reason=%s. numdev=%d\n",
118 cnmemGetErrorString(status), numDevices);
119 return -1;
120 }
121 cnmemInitialized = true;
122 return 0;
123 }
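// A minimal call sketch for initCnmem (hypothetical call site; the real one
// lives in this module's initialization code, outside this excerpt). This
// would reserve roughly 1 GiB on device 0:
//
//   if (initCnmem(1 /*card_number_provided*/, 0 /*card_nb*/,
//                 (size_t)1 << 30 /*mem, in bytes*/) != 0) {
//       // a Python RuntimeError has already been set by initCnmem
//   }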
124
125 void * device_malloc(size_t size, int verbose)
126 {
127 #if PRECHECK_ERROR
128 cudaThreadSynchronize();
129 cudaError_t prevError = cudaGetLastError();
130 if (cudaSuccess != prevError)
131 {
132 fprintf(stderr,
133 "Error existed before calling device_malloc. %s\n",
134 cudaGetErrorString(prevError)
135 );
136 }
137 #endif
138 void * rval=NULL;
139 ///@TODO: thejaswi: support for multiple-streams?
140 if(g_use_cnmem) {
141 cnmemStatus_t status = CNMEM_STATUS_SUCCESS;
142 status = cnmemMalloc(&rval, size, NULL);
143 if(status != CNMEM_STATUS_SUCCESS) {
144 PyErr_Format(PyExc_MemoryError,
145 "Error allocating %llu bytes of device memory (%s).",
146 (unsigned long long)size, cnmemGetErrorString(status));
147 return NULL;
148 }
149 }
150 else {
151 cudaError_t err = cudaMalloc(&rval, size);
152 if (cudaSuccess != err)
153 {
154 // Clear the error flag, cudaMalloc doesn't do it.
155 // Currently this returns the same thing as err, but if in future
156 // it returns something else I still don't see why we should ignore
157 // it. All we want to do here is reset the flag.
158 cudaGetLastError();
159 if (verbose)
160 {
161 size_t free = 0, total = 0;
162 cudaError_t err2 = cudaMemGetInfo(&free, &total);
163 if (err2 != cudaSuccess){
164 cudaGetLastError();
165 fprintf(stderr,
166 "Error when trying to find the memory information"
167 " on the GPU: %s\n", cudaGetErrorString(err2));
168 }
169 #if COMPUTE_GPU_MEM_USED
170 fprintf(stderr,
171 "Error allocating %llu bytes of device memory (%s)."
172 " new total bytes allocated: %llu."
173 " Driver report %llu bytes free and %llu bytes total \n",
174 (unsigned long long)size, cudaGetErrorString(err), (unsigned long long)_allocated_size,
175 (unsigned long long)free, (unsigned long long)total);
176 #else
177 fprintf(stderr,
178 "Error allocating %llu bytes of device memory (%s)."
179 " Driver report %llu bytes free and %llu bytes total \n",
180 (unsigned long long)size, cudaGetErrorString(err), (unsigned long long)free, (unsigned long long)total);
181 #endif
182 }
183 PyErr_Format(PyExc_MemoryError,
184 "Error allocating %llu bytes of device memory (%s).",
185 (unsigned long long)size, cudaGetErrorString(err));
186 return NULL;
187 }
188 }
189 if (rval != NULL){
190 // Can it happen that cudaMalloc returns cudaSuccess, but gives back a NULL ptr?
191 // Could this be what happens if size is 0?
192 _outstanding_mallocs[0] += 1;
193
194 #if COMPUTE_GPU_MEM_USED
195 _allocated_size += size;
196 _max_allocated_size = std::max(_max_allocated_size, _allocated_size);
197 int i = 0;
198 for(;i<TABLE_SIZE;i++){
199 if(NULL==_alloc_size_table[i].ptr){
200 _alloc_size_table[i].ptr=rval;
201 _alloc_size_table[i].size=size;
202 break;
203 }
204 }
205 if (i == TABLE_SIZE){
206 fprintf(stderr,
207 "When tracking GPU malloc, our table size wasn't big enough."
208 " So we loose some tracking. Raise the value of TABLE_SIZE in the file cuda_ndarra.cu");
209 }
210 #endif
211 }
212 //fprintf(stderr,
213 //"allocated %li bytes of device memory (%s). new total bytes allocated: %d. ptr: %p\n",
214 //(long)size, cudaGetErrorString(err),_allocated_size,rval);
215
216 if(ALLOC_MEMSET){
217 //We initialize the memory to NaN so that we catch more bugs when debugging.
218 cudaMemset(rval, 0xFF, size);
219 //printf("MEMSET\n");
220 }
221 #if PRINT_FREE_MALLOC
222 fprintf(stderr, "device malloc %p of size %d\n", rval, size);
223 #endif
224 return rval;
225 }
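// A usage sketch pairing device_malloc with device_free (defined below);
// sizes are in bytes, and a NULL return means a Python MemoryError is set:
//
//   float* buf = (float*)device_malloc(n * sizeof(float));
//   if (!buf) return NULL;             // MemoryError already set
//   /* ... launch kernels writing to buf ... */
//   if (device_free(buf)) return NULL; // -1 on failure, error already set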
226
227 int device_free(void *ptr)
228 {
229 #if PRECHECK_ERROR
230 cudaThreadSynchronize();
231 cudaError_t prevError = cudaGetLastError();
232 if (cudaSuccess != prevError)
233 {
234 fprintf(stderr,
235 "Error existed before calling device_free. %s\n",
236 cudaGetErrorString(prevError)
237 );
238 }
239 #endif
240 #if PRINT_FREE_MALLOC
241 size_t free = 0, total = 0;
242 cudaError_t err2 = cudaMemGetInfo(&free, &total);
243 if (err2 != cudaSuccess){
244 cudaGetLastError();
245 fprintf(stderr,
246 "Error when tring to find the memory information"
247 " on the GPU: %s\n", cudaGetErrorString(err2));
248 }
249 #if COMPUTE_GPU_MEM_USED
250 {
251 int i = 0;
252 for(;i<TABLE_SIZE;i++)
253 if(_alloc_size_table[i].ptr==ptr){
254 break;
255 }
256 assert(i<TABLE_SIZE);
257 fprintf(stderr, "device_free %p of size %d."
258 " Driver report %d bytes free and %d bytes total \n",
259 ptr, _alloc_size_table[i].size, free, total);
260 }
261 #else
262 fprintf(stderr, "device_free %p."
263 " Driver report %d bytes free and %d bytes total \n",
264 ptr, free, total);
265 #endif
266 #endif
267
268 // if there is no gpu context, the call to cudaFree will fail; skip it entirely
269 if(!g_gpu_context_active) {
270 return 0;
271 }
272
273 ///@TODO: thejaswi: multi-stream support
274 if(g_use_cnmem) {
275 cnmemStatus_t status = cnmemFree(ptr, NULL);
276 if(status != CNMEM_STATUS_SUCCESS) {
277 fprintf(stderr, "device_free: cnmemFree call failed! Reason=%s\n",
278 cnmemGetErrorString(status));
279 }
280 }
281 else {
282 // We need to sync because Theano's GC could free intermediate variables that
283 // are still needed while GPU kernels are running or queued.
284 CNDA_BEGIN_ALLOW_THREADS
285 cudaThreadSynchronize();
286 CNDA_END_ALLOW_THREADS
287
288 cudaError_t err = cudaFree(ptr);
289 if (cudaSuccess != err)
290 {
291 // Clear the error flag, cudaFree doesn't do it.
292 // Currently this returns the same thing as err, but if in future
293 // it returns something else I still don't see why we should ignore
294 // it. All we want to do here is reset the flag.
295 cudaGetLastError();
296 size_t free = 0, total = 0;
297 cudaError_t err2 = cudaMemGetInfo(&free, &total);
298 if (err2 != cudaSuccess){
299 cudaGetLastError();
300 fprintf(stderr,
301 "Error when tring to find the memory information"
302 " on the GPU: %s\n", cudaGetErrorString(err2));
303 }
304 #if COMPUTE_GPU_MEM_USED
305 {
306 int i = 0;
307 for(;i<TABLE_SIZE;i++)
308 if(_alloc_size_table[i].ptr==ptr){
309 break;
310 }
311 assert(i<TABLE_SIZE);
312 fprintf(stderr,
313 "Error freeing device pointer %p (%s) of size %llu. %llu byte already allocated."
314 " Driver report %llu bytes free and %llu bytes total \n",
315 ptr, cudaGetErrorString(err),
316 (unsigned long long)_alloc_size_table[i].size, (unsigned long long)_allocated_size, (unsigned long long)free, (unsigned long long)total);
317 }
318 #else
319 fprintf(stderr,
320 "Error freeing device pointer %p (%s)."
321 " Driver report %llu bytes free and %llu bytes total \n",
322 ptr,
323 cudaGetErrorString(err), (unsigned long long)free, (unsigned long long)total);
324 #endif
325 if (NULL != PyErr_Occurred()){
326 fprintf(stderr,
327 "device_free: cudaFree() returned an error, but there is already an"
328 " Python error set. This happen during the clean up when there is a"
329 " first error and the CUDA driver is in a so bad state that it don't"
330 " work anymore. We keep the previous error set to help debugging it.");
331 return -1;
332 }
333 PyErr_Format(PyExc_MemoryError,
334 "error freeing device pointer %p (%s)",
335 ptr,
336 cudaGetErrorString(err));
337 return -1;
338 }
339 }
340 _outstanding_mallocs[0] -= (ptr != NULL);
341 #if COMPUTE_GPU_MEM_USED
342 int i=0;
343 size_t total_freed = 0;
344 for(;i<TABLE_SIZE;i++)
345 if(_alloc_size_table[i].ptr==ptr){
346 _allocated_size -= _alloc_size_table[i].size;
347 total_freed += _alloc_size_table[i].size;
348 _alloc_size_table[i].ptr=0;
349 _alloc_size_table[i].size=0;
350
351 break;
352 }
353 //if(i==TABLE_SIZE)
354 // printf("Unallocated unknow size!\n");
355 //fprintf(stderr, "freed %li bytes of device memory (%s). %d already allocated, ptr=%p\n", (long)total_freed, cudaGetErrorString(err),_allocated_size,ptr);
356 #endif
357 return 0;
358 }
359
360 static PyObject *
361 outstanding_mallocs(PyObject* self, PyObject * args)
362 {
363 return PyInt_FromLong(_outstanding_mallocs[0]);
364 }
365
366
367 static void *work_mem = NULL;
368 static size_t work_size = 0;
369
370 /*
371 * Returns a chunk of memory for temporary work inside of an op. You can only
372 * request a single chunk of memory at a time since it is reused.
373 */
374 void *get_work_mem(size_t sz) {
375 if (sz <= work_size)
376 return work_mem;
377 device_free(work_mem);
378 work_mem = device_malloc(sz);
379 work_size = sz;
380 if (work_mem == NULL)
381 work_size = 0;
382 return work_mem;
383 }
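// A sketch of the reuse behaviour documented above: repeated requests share
// a single buffer, so a later, larger request invalidates earlier pointers.
//
//   float* w1 = (float*)get_work_mem(1024);  // allocates 1024 bytes
//   float* w2 = (float*)get_work_mem(512);   // w2 == w1, no reallocation
//   float* w3 = (float*)get_work_mem(4096);  // frees w1, allocates afresh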
384
385 /////////////////////////
386 // Static helper methods
387 /////////////////////////
388
389 static void
390 CudaNdarray_null_init(CudaNdarray*self)
391 {
392 self->base = NULL;
393 self->nd = -1;
394 self->host_structure = NULL;
395 self->data_allocated = 0;
396 self->dev_structure_fresh = 1;
397 self->dev_structure = NULL;
398 self->devdata = NULL;
399 }
400
401 static int
402 CudaNdarray_uninit(CudaNdarray*self)
403 {
404 #if PRINT_FREE_MALLOC
405 fprintf(stderr, "CudaNdarray_uninit %p\n", self);
406 #endif
407 int rval = 0;
408 if (self->data_allocated) {
409 assert(self->devdata);
410 if (device_free(self->devdata))
411 {
412 fprintf(stderr,
413 "CudaNdarray_uninit: error freeing self->devdata. (self=%p, self->devata=%p)\n",
414 self, self->devdata);
415 rval = -1;
416 }
417 self->devdata = NULL;
418 self->data_allocated = 0;
419 }
420 if (self->dev_structure)
421 {
422 if (device_free(self->dev_structure))
423 {
424 fprintf(stderr,
425 "CudaNdarray_uninit: error freeing dev_structure memory %p (self=%p)\n",
426 self->dev_structure, self);
427 rval = -1;
428 }
429 self->dev_structure = NULL;
430 }
431 if (self->host_structure)
432 {
433 free(self->host_structure);
434 self->host_structure = NULL;
435 }
436 self->nd = -1;
437 Py_XDECREF(self->base);
438 self->base = NULL;
439 return rval;
440 }
441
442
443 //make the rightmost coords change fastest
444 //TODO: why does a downward for-loop not work????
445 //TODO: use the log2_dims and driver code to remove / and %
446 //TODO: skip the last division (when d == 0)
447 #define decl_k_elemwise_unary_rowmajor(name, F) \
448 __global__ void name (unsigned int numEls, \
449 unsigned int nd, \
450 const int * dim, \
451 const float * a_data, const int * a_str, \
452 float * z_data, const int * z_str) \
453 { \
454 const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; \
455 const unsigned int numThreads = blockDim.x * gridDim.x; \
456 \
457 for (unsigned int i = idx; i < numEls; i += numThreads) \
458 { \
459 unsigned int ii = i; \
460 const float * a_i = a_data; \
461 float * z_i = z_data; \
462 for (unsigned int _d = 0; _d < nd; ++_d) \
463 { \
464 unsigned int d = nd - _d-1; \
465 int i_d = ii % dim[d]; /* i_d is our position in the d'th dimension */ \
466 ii = ii / dim[d]; \
467 a_i += i_d * a_str[d]; /* increment our a and z pointers by i_d elements */ \
468 z_i += i_d * z_str[d]; \
469 } \
470 z_i[0] = F(a_i[0]); \
471 } \
472 }
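// A worked example of the index unravelling inside the macro above, for a
// row-major 2x3 array (dim = {2, 3}) and flat index i == 4:
//   d=1: i_d = 4 % 3 = 1, ii = 4 / 3 = 1   -> column 1
//   d=0: i_d = 1 % 2 = 1, ii = 1 / 2 = 0   -> row 1
// so element 4 is (row 1, col 1), and a_i/z_i each advance by
// 1*a_str[0] + 1*a_str[1] (resp. z_str) elements from their bases.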
473
474 template<typename T> __device__ T unary_copy(T a) { return a; }
475 decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy, unary_copy<float>)
476
477 template<typename T> __device__ T unary_exp(T a) { return exp(a); }
478 decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_exp, unary_exp<float>)
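// The same macro can stamp out further unary float kernels; a hypothetical
// example (not instantiated by this file):
//
//   template<typename T> __device__ T unary_sqr(T a) { return a * a; }
//   decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_sqr, unary_sqr<float>)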
479
480 /////////////////////////////
481 // Satisfying reqs to be Type
482 /////////////////////////////
483
484 //DON'T call this directly (if other CudaNdarrays point to it, that will cause problems)! Use Py_DECREF() instead.
485 static void
486 CudaNdarray_dealloc(CudaNdarray* self)
487 {
488 if (0) std::cerr << "CudaNdarray dealloc " << self << " " << self->devdata << '\n';
489 if(Py_REFCNT(self) > 1)
490 printf("WARNING:CudaNdarray_dealloc called when there is still active reference to it.\n");
491 CudaNdarray_uninit(self);
492 Py_TYPE(self)->tp_free((PyObject*)self);
493 --_outstanding_mallocs[1];
494 if (0)
495 {
496 fprintf(stderr, "device_malloc_counts: (device) %i (obj) %i\n",
497 _outstanding_mallocs[0],
498 _outstanding_mallocs[1]);
499 }
500 }
501
502 static PyObject *
503 CudaNdarray_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
504 {
505 CudaNdarray *self;
506
507 self = (CudaNdarray *)type->tp_alloc(type, 0);
508 if (self != NULL)
509 {
510 CudaNdarray_null_init(self);
511 ++_outstanding_mallocs[1];
512 }
513 return (PyObject *)self;
514 }
515 static int
516 CudaNdarray_init(CudaNdarray *self, PyObject *args, PyObject *kwds)
517 {
518 PyObject *arr=NULL;
519
520 if (! PyArg_ParseTuple(args, "O", &arr))
521 return -1;
522 if (! PyArray_Check(arr))
523 {
524 PyErr_SetString(PyExc_TypeError, "PyArray arg required");
525 return -1;
526 }
527 int rval = CudaNdarray_CopyFromArray(self, (PyArrayObject*)arr);
528 return rval;
529 }
530 static PyMemberDef CudaNdarray_members[] =
531 {
532 /*
533 {"first", T_OBJECT_EX, offsetof(CudaNdarray, first), 0,
534 "first name"},
535 {"last", T_OBJECT_EX, offsetof(CudaNdarray, last), 0,
536 "last name"},
537 {"number", T_INT, offsetof(CudaNdarray, number), 0,
538 "noddy number"},
539 */
540 {NULL} /* Sentinel */
541 };
542
543 PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self, PyObject *args)
544 {
545 PyObject * dtype = NULL;
546 if (args && !PyArg_ParseTuple(args, "|O", &dtype))
547 return NULL;
548 if (dtype) {
549 PyArray_Descr* dtype2;
550 // PyArray_DescrConverter tries to convert anything to a PyArray_Descr.
551 if(!PyArray_DescrConverter(dtype, &dtype2))
552 {
553 PyObject * str = PyObject_Repr(dtype);
554 PyErr_Format(PyExc_TypeError,
555 "CudaNdarray dtype parameter not understood: %s",
556 PyString_AsString(str)
557 );
558 Py_CLEAR(str);
559 return NULL;
560 }
561 int typeNum = dtype2->type_num;
562 Py_DECREF(dtype2);
563 if (typeNum != NPY_FLOAT32)
564 {
565 PyObject * str = PyObject_Repr(dtype);
566 PyErr_Format(PyExc_TypeError,
567 "CudaNdarray support only support float32 dtype, provided: %d",
568 typeNum
569 );
570 Py_CLEAR(str);
571 return NULL;
572 }
573 }
574
575 int verbose = 0;
576 if(self->nd>=0 && CudaNdarray_SIZE(self)==0){
577 npy_intp * npydims = (npy_intp*)malloc(self->nd * sizeof(npy_intp));
578 assert (npydims);
579 for (int i = 0; i < self->nd; ++i) npydims[i] = (npy_intp)(CudaNdarray_HOST_DIMS(self)[i]);
580 PyObject * rval = PyArray_SimpleNew(self->nd, npydims, REAL_TYPENUM);
581 free(npydims);
582 if (!rval){
583 return NULL;
584 }
585 assert (PyArray_ITEMSIZE((PyArrayObject *)rval) == sizeof(real));
586 return rval;
587 }
588 if ((self->nd < 0) || (self->devdata == 0))
589 {
590 PyErr_SetString(PyExc_ValueError, "can't copy from un-initialized CudaNdarray");
591 return NULL;
592 }
593 CudaNdarray * contiguous_self = NULL;
594 if (CudaNdarray_is_c_contiguous(self))
595 {
596 contiguous_self = self;
597 Py_INCREF(contiguous_self);
598 if (verbose) std::cerr << "CreateArrayObj already contiguous" << contiguous_self << '\n';
599 }
600 else
601 {
602 contiguous_self = (CudaNdarray*)CudaNdarray_Copy(self);
603 if (verbose) std::cerr << "CreateArrayObj created contiguous" << contiguous_self << '\n';
604 }
605 if (!contiguous_self)
606 {
607 return NULL;
608 }
609
610 npy_intp * npydims = (npy_intp*)malloc(self->nd * sizeof(npy_intp));
611 assert (npydims);
612 for (int i = 0; i < self->nd; ++i)
613 npydims[i] = (npy_intp)(CudaNdarray_HOST_DIMS(self)[i]);
614 PyArrayObject * rval = (PyArrayObject *) PyArray_SimpleNew(self->nd,
615 npydims,
616 REAL_TYPENUM);
617 free(npydims);
618 if (!rval)
619 {
620 Py_DECREF(contiguous_self);
621 return NULL;
622 }
623
624 assert (PyArray_ITEMSIZE(rval) == sizeof(real));
625
626 npy_intp rval_size = PyArray_SIZE(rval);
627 void *rval_data = PyArray_DATA(rval);
628 cudaError_t err;
629 CNDA_BEGIN_ALLOW_THREADS;
630
631 err = cudaMemcpy(rval_data, contiguous_self->devdata,
632 rval_size * sizeof(real),
633 cudaMemcpyDeviceToHost
634 );
635 //CNDA_THREAD_SYNC; // unneeded because cudaMemcpy is blocking anyway
636 CNDA_END_ALLOW_THREADS;
637
638 if (cudaSuccess != err)
639 {
640 PyErr_Format(PyExc_RuntimeError, "error (%s)copying data to host",
641 cudaGetErrorString(err));
642 Py_DECREF(rval);
643 rval = NULL;
644 }
645
646 Py_DECREF(contiguous_self);
647 return (PyObject *)rval;
648 }
649
650 // TODO-- we have two functions here, ZEROS and Zeros.
651 // ZEROS is meant to be called just from C code (you don't need to pass it PyObject * s)
652 // but this naming is very weird, makes it look like a macro
653 // we should figure out the correct convention and change to that
654 PyObject* CudaNdarray_ZEROS(int n, int * dims)
655 {
656
657 size_t total_elements = 1;
658
659 for(size_t i=0;i<n;i++){
660 // Detect overflow on unsigned integer
661 if (dims[i] != 0 && total_elements > (SIZE_MAX / dims[i])) {
662 PyErr_Format(PyExc_RuntimeError,
663 "Can't store in size_t for the bytes requested %llu * %llu",
664 (unsigned long long)total_elements,
665 (unsigned long long)dims[i]);
666 return NULL;
667 }
668 total_elements*=dims[i];
669 }
670
671 // total_elements now contains the size of the array, in reals
672 if (total_elements > (SIZE_MAX / sizeof(real))){
673 PyErr_Format(PyExc_RuntimeError,
674 "Can't store in size_t for the bytes requested %llu * 4",
675 (unsigned long long)total_elements);
676 return NULL;
677 }
678 size_t total_size = total_elements * sizeof(real);
679
680 CudaNdarray* rval = (CudaNdarray*)CudaNdarray_New();
681 if (!rval)
682 {
683 PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZEROS: call to New failed");
684 return NULL;
685 }
686
687 if (CudaNdarray_alloc_contiguous(rval, n, dims))
688 {
689 PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZEROS: allocation failed.");
690 Py_DECREF(rval);
691 return NULL;
692 }
693
694 // Fill with zeros
695 //fprintf(stdout, "Sizeof: %d\n", total_size);
696 if (cudaSuccess != cudaMemset(rval->devdata, 0, total_size))
697 {
698 PyErr_Format(PyExc_MemoryError,
699 "CudaNdarray_ZEROS: Error memsetting %llu bytes of device memory.",
700 (unsigned long long)total_size);
701 Py_DECREF(rval);
702 return NULL;
703 }
704
705 if (cnda_copy_structure_to_device(rval))
706 {
707 PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZEROS: syncing structure to device failed");
708 Py_DECREF(rval);
709 return NULL;
710 }
711 return (PyObject*) rval;
712 }
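// A worked example of the overflow guards above, assuming 64-bit size_t
// (SIZE_MAX == 2^64 - 1): with total_elements == 2^40 and dims[i] == 2^30,
// SIZE_MAX / dims[i] is about 2^34, so total_elements > SIZE_MAX / dims[i]
// catches the overflow before the multiply ever happens; the later
// sizeof(real) check guards the final *4 in the same way.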
713
714 // declared as a static method (hence 1st parameter is not used)
715 // Based on _Copy and _dimshuffle
716 PyObject* CudaNdarray_Zeros(PyObject* _unused, PyObject* shape)
717 {
718 if(!shape)
719 {
720 PyErr_SetString(PyExc_TypeError, "CudaNdarray_Zeros: function takes at least 1 argument (0 given)");
721 return NULL;
722 }
723 if(!PySequence_Check(shape))
724 {
725 PyErr_SetString(PyExc_TypeError, "shape argument must be a sequence");
726 return NULL;
727 }
728
729 int shplen = PySequence_Length(shape);
730
731 if (shplen == 0)
732 {
733 return CudaNdarray_ZEROS(0, NULL);
734 }
735
736 int* newdims = (int *)malloc(sizeof(int) * shplen);
737
738 if (!newdims)
739 {
740 PyErr_SetString(PyExc_MemoryError,
741 "CudaNdarray_Zeros: Failed to allocate temporary space");
742 return NULL;
743 }
744
745 // start from the end to compute strides
746 for (int i = shplen-1; i >= 0; --i)
747 {
748 PyObject* shp_el_obj = PySequence_GetItem(shape, i);
749 if(shp_el_obj == NULL)
750 {
751 // shouldn't happen since we checked length before...
752 PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_Zeros: Index out of bound in sequence");
753 free(newdims);
754 return NULL;
755 }
756
757 int shp_el = PyInt_AsLong(shp_el_obj);
758 Py_DECREF(shp_el_obj);
759
760 if (shp_el < 0)
761 {
762 PyErr_SetString(PyExc_ValueError, "CudaNdarray_Zeros: shape must contain only non-negative values for size of a dimension");
763 free(newdims);
764 return NULL;
765 }
766
767 newdims[i] = shp_el;
768 }
769
770 PyObject* rval = CudaNdarray_ZEROS(shplen,newdims);
771
772 free(newdims);
773
774 return (PyObject*)rval;
775 }
776
777
778
779
780
781 PyObject * CudaNdarray_Copy(const CudaNdarray * self)
782 {
783 PyObject * rval = CudaNdarray_New();
784 if ((!rval) || (-1 == self->nd))
785 {
786 return rval;
787 }
788 if (CudaNdarray_alloc_contiguous((CudaNdarray*)rval, self->nd, CudaNdarray_HOST_DIMS(self)))
789 {
790 Py_DECREF(rval);
791 return NULL;
792 }
793 if (CudaNdarray_CopyFromCudaNdarray((CudaNdarray*)rval, self))
794 {
795 Py_DECREF(rval);
796 return NULL;
797 }
798 return rval;
799 }
800 PyObject * CudaNdarray_DeepCopy(CudaNdarray * self, PyObject * memo)
801 {
802 assert(PyDict_Check(memo));
803 PyObject * selfkey = PyInt_FromLong((long)self);
804 assert(selfkey);
805 if (PyDict_Contains(memo, selfkey))
806 {
807 PyObject * rval = PyDict_GetItem(memo, selfkey);
808 Py_DECREF(selfkey);
809 Py_XINCREF(rval);
810 return rval;
811 }
812 else
813 {
814 PyObject * rval = CudaNdarray_Copy(self);
815 if (0) std::cerr << "DeepCopy created " << rval << " devdata " << ((CudaNdarray*)rval)->devdata << "\n";
816 if (NULL == rval)
817 {
818 Py_DECREF(selfkey);
819 return NULL;
820 }
821 if (PyDict_SetItem(memo, selfkey, rval))
822 {
823 Py_DECREF(rval);
824 Py_DECREF(selfkey);
825 return NULL;
826 }
827 Py_DECREF(selfkey);
828 return rval;
829 }
830 }
831 PyObject * CudaNdarray_ReduceSum(CudaNdarray * self, PyObject * py_reduce_mask)
832 {
833 if (!PySequence_Check(py_reduce_mask))
834 {
835 PyErr_SetString(PyExc_TypeError, "reduce_mask must be sequence of ints");
836 return NULL;
837 }
838 int len = PySequence_Length(py_reduce_mask);
839 if (len != self->nd)
840 {
841 PyErr_SetString(PyExc_TypeError, "length of reduce_mask must match self->nd");
842 return NULL;
843 }
844 CudaNdarray * self_sum = (CudaNdarray*)CudaNdarray_New();
845 if (!self_sum)
846 {
847 return NULL;
848 }
849 //TODO: allocate a fixed size dimshuffle_pattern_cache on the stack,
850 // and use it if it is big enough.
851 int * dimshuffle_pattern = (int*)malloc(len * 2 * sizeof(int));
852 int * sum_dims = dimshuffle_pattern + len;
853 int n_remaining_dims = 0;
854 if (!dimshuffle_pattern)
855 {
856 Py_DECREF(self_sum);
857 PyErr_SetString(PyExc_MemoryError, "failed to alloc internal storage");
858 return NULL;
859 }
860 for (int i = 0; i < len; ++i)
861 {
862 PyObject *o_i = PySequence_GetItem(py_reduce_mask, i);
863 int o_i_int = PyInt_AsLong(o_i);
864 Py_XDECREF(o_i);
865 if (PyErr_Occurred())
866 {
867 Py_DECREF(self_sum);
868 free(dimshuffle_pattern);
869 return NULL;
870 }
871 if (o_i_int) // this is a dimension over which we are reducing
872 {
873 sum_dims[i] = 1;
874 }
875 else
876 {
877 sum_dims[i] = CudaNdarray_HOST_DIMS(self)[i];
878 dimshuffle_pattern[n_remaining_dims++] = i;
879 }
880 }
881 if (0 || CudaNdarray_alloc_contiguous(self_sum, len, sum_dims)
882 || CudaNdarray_reduce_sum(self_sum, self)
883 || CudaNdarray_dimshuffle(self_sum, n_remaining_dims, dimshuffle_pattern))
884 {
885 Py_DECREF(self_sum);
886 free(dimshuffle_pattern);
887 return NULL;
888 }
889 free(dimshuffle_pattern);
890 return (PyObject*)self_sum;
891 }
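// A worked example of the bookkeeping above, for self->nd == 3 and
// reduce_mask == (1, 0, 1):
//   sum_dims           == {1, dims[1], 1}   // reduced axes collapse to 1
//   dimshuffle_pattern == {1}               // only axis 1 survives
//   n_remaining_dims   == 1
// so the returned array has shape (dims[1],).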
892
893 // Reshape self to the new shape given by the tuple shape.
894 //
895 // If self is c contiguous, this returns a view. Otherwise it always does a copy.
896 // TODO: make it return a view when the strides allow it even if it is not
897 // c contiguous
898 PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape)
899 {
900 if(!CudaNdarray_is_c_contiguous(self))
901 {
902 // allocate new space
903 //TODO: test to see if we can re-use old one and take a new param to
904 // use this
905 CudaNdarray* rval = (CudaNdarray*) CudaNdarray_Copy(self);
906 if (!rval)
907 {
908 return NULL;
909 }
910
911 CudaNdarray* ret = (CudaNdarray*) CudaNdarray_Reshape(rval, shape);
912 Py_XDECREF(rval);
913 return (PyObject*)ret;
914 }
915
916 // check shape tuple
917 unsigned int rval_nd;
918 unsigned int * rval_dims;
919 size_t rval_size = 1;
920
921 if (PyTuple_Check(shape)){
922 // copy shape to integer array
923 rval_nd = PyTuple_Size(shape);
924 }else if (PyInt_Check(shape)){
925 rval_nd = 1;
926 }else{
927 PyErr_SetString(PyExc_TypeError, "shape must be tuple of integers or an integer");
928 return NULL;
929 }
930 rval_dims = (unsigned int*)malloc(rval_nd * sizeof(int));
931
932 if(PyTuple_Check(shape)){
933 for (int i = 0; i < rval_nd; ++i)
934 {
935 rval_dims[i] = PyInt_AsLong(PyTuple_GetItem(shape, i)); //GetItem returns borrowed reference
936 if (PyErr_Occurred()) //error in AsLong
937 {
938 free(rval_dims);
939 return NULL;
940 }
941 if(rval_dims[i]<0){
942 PyErr_Format(PyExc_ValueError, "Reshape has invalid dimension %i (must be >=0)",rval_dims[i]);
943 free(rval_dims);
944 return NULL;
945 }
946 rval_size = rval_size * rval_dims[i];
947 }
948 }else{
949 rval_size = PyInt_AsLong(shape);
950 rval_dims[0] = rval_size;
951 }
952 // calculate new size, assert same as old size
953 if (rval_size != CudaNdarray_SIZE(self))
954 {
955 PyErr_Format(PyExc_ValueError, "size must remain unchanged, changed from %lld to %lld", CudaNdarray_SIZE(self), rval_size);
956 free(rval_dims);
957 return NULL;
958 }
959 if (rval_size==0)
960 {
961 PyObject * rval = CudaNdarray_NewDims(rval_nd, rval_dims);
962 free(rval_dims);
963 return rval;
964 }
965
966 //return a view, not a copy
967 //we can do this as we checked self is c_contiguous
968 CudaNdarray * rval = (CudaNdarray * )CudaNdarray_New(rval_nd);
969
970 if (!rval || 0 != rval->data_allocated
971 ||CudaNdarray_set_device_data(rval, CudaNdarray_DEV_DATA(self), self))
972 {
973 Py_XDECREF(rval);
974 free(rval_dims);
975 return NULL;
976 }
977 //set dim and stride
978 int size = 1;
979 for (int i = rval_nd-1; i >= 0; --i)
980 {
981 CudaNdarray_set_stride(rval, i, (rval_dims[i] == 1) ? 0 : size);
982 CudaNdarray_set_dim(rval, i, rval_dims[i]);
983 size = size * rval_dims[i];
984 }
985 free(rval_dims);
986 return (PyObject*)rval;
987 }
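// A worked example of the stride loop above, for rval_dims == {2, 3, 4}:
//   i=2: stride 1,  then size = 4
//   i=1: stride 4,  then size = 12
//   i=0: stride 12, then size = 24
// A dimension of length 1 would get stride 0 instead, which is this
// module's convention for marking a broadcastable dimension.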
988
989 PyObject * CudaNdarray_View(const CudaNdarray * self)
990 {
991 CudaNdarray * rval = (CudaNdarray*)CudaNdarray_New(self->nd);
992 if (!rval || CudaNdarray_set_device_data(rval, CudaNdarray_DEV_DATA(self), self))
993 {
994 Py_XDECREF(rval);
995 rval = NULL;
996 }
997 else
998 {
999 for (int i = 0; i < self->nd; ++i)
1000 {
1001 CudaNdarray_set_dim(rval, i, CudaNdarray_HOST_DIMS(self)[i]);
1002 CudaNdarray_set_stride(rval, i, CudaNdarray_HOST_STRIDES(self)[i]);
1003 }
1004 }
1005 return (PyObject*)rval;
1006 }
1007
1008 /*
1009 * d0,... are the output dims
1010 * indices are a list of indices to operate on
1011 * They are int64 viewed as float32.
1012 * a is the output
1013 * b is the input
1014 * dB0, the size of the source's leading dimension
1015 */
1016 template <int operator_num>
1017 __global__ void k_take_3(const int d0, const int d1, const int d2,
1018 const npy_int64* indices,
1019 float* a,
1020 const int sA0, const int sA1, const int sA2,
1021 const float* b, const int dB0,
1022 const int sB0, const int sB1, const int sB2,
1023 int* err){
1024 for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
1025 npy_int64 idx = indices[i0];
1026 if (idx<0)
1027 idx += dB0; // To allow negative indexing.
1028 if ((idx < 0) || (idx >= dB0)){
1029 // Any value other than 0 would probably work. But to be safer, I want
1030 // to change all bits to prevent problems with concurrent writes that
1031 // could cross a cache line. But this should not happen with the
1032 // current code and driver.
1033 *err = 0xFFFF;
1034 continue;
1035 }
1036 for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x){
1037 for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y){
1038 int a_idx = i0*sA0 + i1*sA1 + i2*sA2;
1039 int b_idx = idx*sB0 + i1*sB1 + i2*sB2;
1040 a[a_idx] = b[b_idx];
1041 }
1042 }
1043 }
1044 }
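// A host-side reference of what k_take_3 computes with operator_num == CPY
// (a sketch; hypothetical helper, not used by this file; strides are in
// elements, as in the kernel):
//
//   static void take3_reference(int d0, int d1, int d2,
//                               const npy_int64* indices,
//                               float* a, int sA0, int sA1, int sA2,
//                               const float* b, int dB0,
//                               int sB0, int sB1, int sB2, int* err)
//   {
//       for (int i0 = 0; i0 < d0; ++i0) {
//           npy_int64 idx = indices[i0];
//           if (idx < 0) idx += dB0;                       // negative indexing
//           if (idx < 0 || idx >= dB0) { *err = 0xFFFF; continue; }
//           for (int i1 = 0; i1 < d1; ++i1)
//               for (int i2 = 0; i2 < d2; ++i2)
//                   a[i0*sA0 + i1*sA1 + i2*sA2] = b[idx*sB0 + i1*sB1 + i2*sB2];
//       }
//   }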
1045
1046 // We try to be similar to the PyArray_TakeFrom function
1047 //http://docs.scipy.org/doc/numpy/reference/c-api.array.html
1048 //TODO: support clip modes other than raise (clip, wrap)
1049 //self is the input that we copy data from.
1050 //The indices that we receive MUST be a CudaNdarray(float32)
1051 // that is in fact a view to int64 indices
1052 PyObject*
1053 CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
1054 int verbose = 0;
1055 PyObject * indices_obj = NULL;
1056 //int axis; Default None, that mean the flattened array.
1057 PyObject * axis_obj = Py_None;
1058 PyObject * out_obj = Py_None;
1059 PyObject * clipmode_obj = NULL;
1060 int max_threads = 1; // max threads per blocks
1061
1062 if (! PyArg_ParseTuple(args, "O|OOOi", &indices_obj, &axis_obj,
1063 &out_obj, &clipmode_obj, &max_threads))
1064 return NULL;
1065
1066 //Check argument indices
1067 //TODO: if not a numpy.ndarray, convert to numpy.ndarray
1068 //TODO: If a CudaNdarray, accept it and assume the data is int32? Or float32 numbers holding ints?
1069 //TODO: Support ndarrays of dtypes other than int32
1070 //TODO: support list of indices that are not c_contiguous
1071 CudaNdarray * indices = NULL;
1072 if (CudaNdarray_Check(indices_obj)) {
1073 if (verbose) printf("cudandarray indices\n");
1074 indices = (CudaNdarray*) indices_obj;
1075 Py_INCREF(indices);
1076 } else if (PyArray_Check(indices_obj)) {
1077 if (verbose) printf("ndarray indices\n");
1078 if (PyArray_TYPE((PyArrayObject *)indices_obj) != NPY_INT64) {
1079 PyErr_SetString(PyExc_TypeError,
1080 "CudaNdarray_TakeFrom: need a ndarray for indices"
1081 " with dtype int64");
1082 return NULL;
1083 }
1084 if (PyArray_NDIM(((PyArrayObject*)indices_obj)) != 1) {
1085 PyErr_SetString(PyExc_TypeError,
1086 "CudaNdarray_TakeFrom: need a CudaNdarray of"
1087 " indices with only 1 dimensions");
1088 return NULL;
1089 }
1090 // We need indices_obj to be contiguous, in order to take a view
1091 // with a different dtype.
1092 if (!PyArray_IS_C_CONTIGUOUS((PyArrayObject*) indices_obj)) {
1093 PyObject* indices_obj_contig = PyArray_NewCopy((PyArrayObject*) indices_obj, NPY_CORDER);
1094 if (!indices_obj_contig)
1095 return NULL;
1096 indices_obj = indices_obj_contig;
1097 } else {
1098 // Keep the refcount consistent
1099 Py_INCREF(indices_obj);
1100 }
1101 PyArray_Descr* float32_descr = PyArray_DescrFromType(NPY_FLOAT32);
1102 PyObject * indices_float32 = NULL;
1103 indices_float32 = PyArray_View((PyArrayObject*)indices_obj,
1104 float32_descr, NULL);
1105 if (verbose) printf("ndarray indices\n");
1106 if (!indices_float32) {
1107 Py_DECREF(indices_obj);
1108 return NULL;
1109 }
1110
1111 indices = (CudaNdarray*) CudaNdarray_New();
1112 if (verbose) printf("\nndarray after new\n");
1113 if (! indices){
1114 Py_DECREF(indices_obj);
1115 Py_DECREF(indices_float32);
1116 return NULL;
1117 }
1118 if (CudaNdarray_CopyFromArray(indices,
1119 (PyArrayObject *)indices_float32)){
1120 Py_DECREF(indices_obj);
1121 Py_DECREF(indices_float32);
1122 return NULL;
1123 }
1124 Py_DECREF(indices_obj);
1125 Py_DECREF(indices_float32);
1126 } else {
1127 PyObject* py_s = PyObject_Str(indices_obj);
1128 const char* s = PyString_AsString(py_s);
1129 Py_DECREF(py_s);
1130 PyErr_Format(PyExc_TypeError,
1131 "CudaNdarray_TakeFrom: need an ndarray of int64 or a"
1132 " CudaNdarray(float32) that is a view from int64 data"
1133 " for indices. Got %s", s);
1134 return NULL;
1135 }
1136
1137 if (verbose) {
1138 printf("indices used on the gpu\n");
1139 fprint_CudaNdarray(stdout, indices);
1140 PyObject * used_indices = CudaNdarray_CreateArrayObj(indices);
1141 PyObject_Print(used_indices, stdout, 0);
1142 Py_DECREF(used_indices);
1143 }
1144 if (verbose) printf("after print of object\n");
1145 if(!CudaNdarray_is_c_contiguous(indices)) {
1146 PyErr_SetString(PyExc_NotImplementedError,
1147 "CudaNdarray_TakeFrom: The indices must be contiguous in memory.");
1148 Py_DECREF(indices);
1149 return NULL;
1150 }
1151 int nb_indices = CudaNdarray_SIZE((CudaNdarray *)indices) / 2;// int64 are 8 bytes, float32 are 4 bytes
1152
1153 //Check argument axis
1154 //TODO: implement the default and other axis
1155 long axis = PyInt_AsLong(axis_obj);
1156
1157 if (axis != 0) {
1158 PyErr_Format(PyExc_NotImplementedError,
1159 "CudaNdarray_TakeFrom: only axis=0 is currently supported."
1160 " Got %ld.", axis);
1161 Py_DECREF(indices);
1162 return NULL;
1163 }
1164
1165 //Check argument out_obj
1166 CudaNdarray * out = NULL;
1167 if (out_obj && CudaNdarray_Check(out_obj))
1168 out = (CudaNdarray*) out_obj;
1169 if (out && (out->nd != self->nd ||
1170 CudaNdarray_HOST_DIMS(out)[0] != nb_indices))
1171 out = NULL;
1172 int * dims = (int *)malloc(sizeof(int) * self->nd);
1173 dims[0] = nb_indices;
1174
1175 for (int i=1 ; i<self->nd ; i++) {
1176 dims[i] = CudaNdarray_HOST_DIMS(self)[i];
1177 if (out && CudaNdarray_HOST_DIMS(out)[i] != dims[i]) {
1178 out = NULL;
1179 }
1180 }
1181 if (!out) {
1182 out = (CudaNdarray*)CudaNdarray_New();
1183 if (!out){
1184 Py_DECREF(indices);
1185 free(dims);
1186 return NULL;
1187 }
1188 if (CudaNdarray_alloc_contiguous(out, self->nd, dims)) {
1189 Py_DECREF(out);
1190 Py_DECREF(indices);
1191 free(dims);
1192 return NULL;
1193 }
1194 }else {
1195 Py_INCREF(out);
1196 }
1197
1198 //Check argument clipmode
1199 if (clipmode_obj) {
1200 char * clipmode = PyString_AsString(clipmode_obj);
1201 if (! clipmode){
1202 Py_DECREF(indices);
1203 Py_DECREF(out);
1204 free(dims);
1205 return NULL;
1206 }
1207 if (strcmp(clipmode, "raise") != 0) {
1208 PyErr_Format(PyExc_NotImplementedError,
1209 "CudaNdarray_TakeFrom: only the raise mode is currently supported. Got '%s'",
1210 clipmode);
1211 Py_DECREF(indices);
1212 Py_DECREF(out);
1213 free(dims);
1214 return NULL;
1215 }
1216 }
1217 void (*k3)(const int, const int, const int,
1218 const npy_int64*,
1219 float*, const int, const int, const int,
1220 const float*, const int,
1221 const int, const int, const int,
1222 int*);
1223 k3 = k_take_3<CPY>;
1224
1225 // Create the memory place that will store the error information.
1226 if(init_err_var() != 0) return NULL;
1227
1228 dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(out)[0],65535),1,1);
1229 if(CudaNdarray_HOST_DIMS(out)[0] == 0){
1230 // We take 0 elements, so no need for the rest of the code.
1231 // This speeds up that case AND fixes a crash otherwise.
1232 free(dims);
1233 Py_DECREF(indices);
1234 return (PyObject *)out;
1235 }
1236
1237 switch (self->nd) {
1238 case 1:
1239 {
1240 dim3 n_threads(1, 1, 1);
1241 if (verbose)
1242 printf("cudaGetLastError=%d, nd=%d"
1243 " kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
1244 " n_threads.x=%i, n_threads.y=%i)\n",
1245 cudaGetLastError(), self->nd,
1246 n_blocks.x, n_blocks.y, n_threads.x, n_threads.y);
1247 k3<<<n_blocks, n_threads>>>(
1248 dims[0],
1249 1,
1250 1,
1251 (npy_int64*) CudaNdarray_DEV_DATA(indices),
1252 CudaNdarray_DEV_DATA(out),
1253 CudaNdarray_HOST_STRIDES(out)[0], //strides
1254 1,
1255 1,
1256 CudaNdarray_DEV_DATA(self),
1257 CudaNdarray_HOST_DIMS(self)[0], //For indices check
1258 CudaNdarray_HOST_STRIDES(self)[0], //strides
1259 1,
1260 1,
1261 err_var);
1262 }
1263 break;
1264 case 2:
1265 {
1266 dim3 n_threads(std::min(CudaNdarray_HOST_DIMS(out)[1], max_threads), 1, 1);
1267
1268 if (verbose)
1269 printf("cudaGetLastError=%d, nd=%d"
1270 " kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
1271 " n_threads.x=%i, n_threads.y=%i)\n",
1272 cudaGetLastError(), self->nd,
1273 n_blocks.x, n_blocks.y, n_threads.x, n_threads.y);
1274
1275 k3<<<n_blocks, n_threads>>>(
1276 dims[0], //dimensions
1277 dims[1],
1278 1,
1279 (npy_int64*) CudaNdarray_DEV_DATA(indices),
1280 CudaNdarray_DEV_DATA(out),
1281 CudaNdarray_HOST_STRIDES(out)[0], //strides
1282 CudaNdarray_HOST_STRIDES(out)[1],
1283 1,
1284 CudaNdarray_DEV_DATA(self),
1285 CudaNdarray_HOST_DIMS(self)[0], //For indices check
1286 CudaNdarray_HOST_STRIDES(self)[0], //strides
1287 CudaNdarray_HOST_STRIDES(self)[1],
1288 1,
1289 err_var);
1290 }
1291 break;
1292 case 3:
1293 {
1294 int ty = std::min(CudaNdarray_HOST_DIMS(out)[2], max_threads);
1295 int tx = std::min(CudaNdarray_HOST_DIMS(out)[1], max_threads / ty);
1296 dim3 n_threads(tx, ty, 1);
1297 if (verbose)
1298 printf("cudaGetLastError=%d, nd=%d"
1299 " kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
1300 " n_threads.x=%i, n_threads.y=%i)\n",
1301 cudaGetLastError(), self->nd,
1302 n_blocks.x, n_blocks.y, n_threads.x, n_threads.y);
1303 k3<<<n_blocks, n_threads>>>(
1304 dims[0], //dimensions
1305 dims[1],
1306 dims[2],
1307 (npy_int64*) CudaNdarray_DEV_DATA(indices),
1308 CudaNdarray_DEV_DATA(out),
1309 CudaNdarray_HOST_STRIDES(out)[0], //strides
1310 CudaNdarray_HOST_STRIDES(out)[1],
1311 CudaNdarray_HOST_STRIDES(out)[2],
1312 CudaNdarray_DEV_DATA(self),
1313 CudaNdarray_HOST_DIMS(self)[0], //For indices check
1314 CudaNdarray_HOST_STRIDES(self)[0], //strides
1315 CudaNdarray_HOST_STRIDES(self)[1],
1316 CudaNdarray_HOST_STRIDES(self)[2],
1317 err_var);
1318 }
1319 break;
1320 default:
1321 PyErr_SetString(PyExc_NotImplementedError,
1322 "CudaNdarray_TakeFrom: only input with 1, 2 or 3"
1323 " dimensions are currently supported");
1324
1325 }
1326 free(dims);
1327 CNDA_THREAD_SYNC;
1328 cudaError_t err = cudaGetLastError();
1329 if (cudaSuccess != err) {
1330 PyErr_Format(PyExc_RuntimeError,
1331 "Cuda error: %s: %s.\n",
1332 "CudaNdarray_TakeFrom",
1333 cudaGetErrorString(err));
1334 Py_DECREF(indices);
1335 Py_DECREF(out);
1336 return NULL;
1337 }
1338
1339 int index_err = check_err_var();
1340 Py_DECREF(indices);
1341 if (index_err != 0) {
1342 Py_DECREF(out);
1343 return NULL;
1344 }
1345
1346 if (verbose) printf("TAKE SUCCEDED\n");
1347 return (PyObject *)out;
1348 }
1349
1350
1351 PyObject * CudaNdarray_SetStride(CudaNdarray * self, PyObject *args)
1352 {
1353 int pos, stride;
1354 if (! PyArg_ParseTuple(args, "ii", &pos, &stride))
1355 return NULL;
1356 if ((pos < 0) || (pos >= self->nd))
1357 {
1358 PyErr_Format(PyExc_ValueError, "position argument out of legal range [0, %i)", self->nd);
1359 return NULL;
1360 }
1361 CudaNdarray_set_stride(self, pos, stride);
1362 if (cnda_copy_structure_to_device(self))
1363 {
1364 return NULL;
1365 }
1366 Py_INCREF(Py_None);
1367 return Py_None;
1368 }
1369 PyObject * CudaNdarray_SetShapeI(CudaNdarray * self, PyObject *args)
1370 {
1371 int pos, dim;
1372 if (! PyArg_ParseTuple(args, "ii", &pos, &dim))
1373 return NULL;
1374 if ((pos < 0) || (pos >= self->nd))
1375 {
1376 PyErr_Format(PyExc_ValueError, "position argument out of legal range [0, %i)", self->nd);
1377 return NULL;
1378 }
1379 CudaNdarray_set_dim(self, pos, dim);
1380 if (cnda_copy_structure_to_device(self))
1381 {
1382 return NULL;
1383 }
1384 Py_INCREF(Py_None);
1385 return Py_None;
1386 }
1387
1388 static PyObject *
1389 CudaNdarray_exp(CudaNdarray* self)
1390 {
1391 CudaNdarray * rval = (CudaNdarray *)CudaNdarray_New();
1392 if ((NULL == rval) || CudaNdarray_alloc_contiguous(rval, self->nd, CudaNdarray_HOST_DIMS(self)))
1393 {
1394 Py_XDECREF(rval);
1395 return NULL;
1396 }
1397 unsigned int size = 1;
1398 for (int i = 0; i < self->nd; i++)
1399 {
1400 size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
1401 }
1402 unsigned int threads_per_block = std::min(size, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
1403 unsigned int n_blocks = std::min(ceil_intdiv(size,threads_per_block), (unsigned int)NUM_VECTOR_OP_BLOCKS);
1404 k_elemwise_unary_rowmajor_exp<<<n_blocks,threads_per_block>>>(size, self->nd, CudaNdarray_DEV_DIMS(self),
1405 CudaNdarray_DEV_DATA(self), CudaNdarray_DEV_STRIDES(self),
1406 CudaNdarray_DEV_DATA(rval), CudaNdarray_DEV_STRIDES(rval));
1407
1408 //TODO: don't do this right away, do it when we need the result
1409 CNDA_THREAD_SYNC;
1410 cudaError_t err = cudaGetLastError();
1411 if( cudaSuccess != err)
1412 {
1413 Py_DECREF(rval);
1414 PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kExp", cudaGetErrorString(err));
1415 return NULL;
1416 }
1417
1418 return (PyObject*)rval;
1419 }
1420
1421 static PyMethodDef CudaNdarray_methods[] =
1422 {
1423 {"__array__",
1424 (PyCFunction)CudaNdarray_CreateArrayObj, METH_VARARGS,
1425 "Copy from the device to a numpy ndarray"},
1426 {"__copy__",
1427 (PyCFunction)CudaNdarray_View, METH_NOARGS,
1428 "Create a shallow copy of this object. used by module copy"},
1429 {"__deepcopy__",
1430 (PyCFunction)CudaNdarray_DeepCopy, METH_O,
1431 "Create a copy of this object"},
1432 {"zeros",
1433 (PyCFunction)CudaNdarray_Zeros, METH_STATIC | METH_O,
1434 "Create a new CudaNdarray with specified shape, filled with zeros."},
1435 {"copy",
1436 (PyCFunction)CudaNdarray_Copy, METH_NOARGS,
1437 "Create a copy of this object"},
1438 {"is_c_contiguous",
1439 (PyCFunction)CudaNdarray_IS_C_Contiguous, METH_NOARGS,
1440 "Return True is the object is c contiguous. False otherwise."},
1441 {"reduce_sum",
1442 (PyCFunction)CudaNdarray_ReduceSum, METH_O,
1443 "Reduce over the given dimensions by summation"},
1444 {"exp",
1445 (PyCFunction)CudaNdarray_exp, METH_NOARGS,
1446 "Return the exponential of all elements"},
1447 {"reshape",
1448 (PyCFunction)CudaNdarray_Reshape, METH_O,
1449 "Return a reshaped view (or copy) of this ndarray\n\
1450 The required argument is a tuple of integers specifying the shape of the new ndarray."},
1451 {"view",
1452 (PyCFunction)CudaNdarray_View, METH_NOARGS,
1453 "Return an alias of this ndarray"},
1454 {"_set_stride",
1455 (PyCFunction)CudaNdarray_SetStride, METH_VARARGS,
1456 "For integer arguments (i, s), set the 'i'th stride to 's'"},
1457 {"take",
1458 (PyCFunction)CudaNdarray_TakeFrom, METH_VARARGS,
1459 "Equivalent of numpy.take"},
1460 {"_set_shape_i",
1461 (PyCFunction)CudaNdarray_SetShapeI, METH_VARARGS,
1462 "For integer arguments (i, s), set the 'i'th shape to 's'"},
1463 {NULL, NULL, NULL, NULL} /* Sentinel */
1464 };
1465
1466
1467 ////////////////////
1468 // Number protocol
1469 ////////////////////
1470
1471 __global__ void kAdd_contiguous(float* a, float* b, float* dest, unsigned int numEls) {
1472 const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
1473 const unsigned int numThreads = blockDim.x * gridDim.x;
1474
1475 for (unsigned int i = idx; i < numEls; i += numThreads) {
1476 dest[i] = a[i] + b[i];
1477 }
1478 }
1479
1480 // Will be called by __add__ in Python
1481 static PyObject *
1482 CudaNdarray_add(PyObject* py_self, PyObject * py_other)
1483 {
1484 if (! CudaNdarray_Check(py_self)) {
1485 PyErr_SetString(PyExc_TypeError, "need a CudaNdarray on left");
1486 return NULL;
1487 }
1488 if (! CudaNdarray_Check(py_other)) {
1489 PyErr_SetString(PyExc_TypeError, "need a CudaNdarray on right");
1490 return NULL;
1491 }
1492 CudaNdarray * self = (CudaNdarray *)py_self;
1493 CudaNdarray * other = (CudaNdarray *)py_other;
1494 if(!CudaNdarray_is_c_contiguous(self) || !CudaNdarray_is_c_contiguous(other)){
1495 PyErr_SetString(PyExc_TypeError, "We have implementet only the c_contiguous version for now.");
1496 return NULL;
1497 }
1498
1499 //standard elemwise size checks
1500 if (self->nd != other->nd)
1501 {
1502 PyErr_SetString(PyExc_TypeError, "CudaNdarray_add: need same number of dims");
1503 return NULL;
1504 }
1505 //standard elemwise dim checks
1506 unsigned int size = 1;
1507 for (int i = 0; i< self->nd; ++i)
1508 {
1509 if (CudaNdarray_HOST_DIMS(self)[i] != CudaNdarray_HOST_DIMS(other)[i])
1510 {
1511 PyErr_SetString(PyExc_TypeError, "need same dimensions");
1512 return NULL;
1513 }
1514 size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
1515 }
1516 CudaNdarray * rval = (CudaNdarray *)CudaNdarray_New();
1517 if (!rval || CudaNdarray_alloc_contiguous(rval, self->nd, CudaNdarray_HOST_DIMS(self)))
1518 {
1519 Py_XDECREF(rval);
1520 return NULL;
1521 }
1522
1523 if(CudaNdarray_SIZE((CudaNdarray *)py_self)==0 && CudaNdarray_SIZE((CudaNdarray *)py_other)==0){
1524 return (PyObject *) rval;
1525 }
1526
1527 int threads_per_block = std::min(size, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
1528 int n_blocks = std::min(ceil_intdiv(size,(unsigned int)threads_per_block), (unsigned int)NUM_VECTOR_OP_BLOCKS);
1529 kAdd_contiguous<<<n_blocks,threads_per_block>>>(
1530 self->devdata, other->devdata, rval->devdata, size);
1531 CNDA_THREAD_SYNC;
1532 cudaError_t err = cudaGetLastError();
1533 if( cudaSuccess != err)
1534 {
1535 PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kAdd", cudaGetErrorString(err));
1536 Py_DECREF(rval);
1537 return NULL;
1538 }
1539 return (PyObject *) rval;
1540 }
1541
1542 template <int operator_num>
1543 __global__ void k_ielem_3(const int d0, const int d1, const int d2,
1544 float* a, const int sA0, const int sA1, const int sA2,
1545 const float* b, const int sB0, const int sB1, const int sB2){
1546 for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
1547 for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y){
1548 for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x){
1549 switch (operator_num)
1550 {
1551 case IADD:
1552 a[i0*sA0 + i1*sA1 + i2*sA2] += b[i0*sB0 + i1*sB1 + i2*sB2];
1553 break;
1554 case IDIV:
1555 a[i0*sA0 + i1*sA1 + i2*sA2] /= b[i0*sB0 + i1*sB1 + i2*sB2];
1556 break;
1557 case CPY:
1558 a[i0*sA0 + i1*sA1 + i2*sA2] = b[i0*sB0 + i1*sB1 + i2*sB2];
1559 break;
1560 }
1561 }
1562 }
1563 }
1564 }
1565
1566 template <int operator_num>
1567 __global__ void k_ielem_4(const int d0, const int d1, const int d2, const int d3,
1568 float* a, const int sA0, const int sA1,
1569 const int sA2, const int sA3,
1570 const float* b, const int sB0, const int sB1,
1571 const int sB2, const int sB3){
1572 for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
1573 for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y){
1574 for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x){
1575 for (int i3 = threadIdx.y; i3 < d3; i3 += blockDim.y){
1576 switch (operator_num) {
1577 case IADD:
1578 a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3]
1579 += b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3];
1580 break;
1581 case IDIV:
1582 a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3]
1583 /= b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3];
1584 break;
1585 case CPY:
1586 a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3]
1587 = b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3];
1588 break;
1589 }
1590 }
1591 }
1592 }
1593 }
1594 }
1595
1596 template <int operator_num>
1597 __global__ void k_ielem_6(const int d0, const int d1,
1598 const int d2, const int d3,
1599 const int d4, const int d5,
1600 float* a, const int sA0, const int sA1,
1601 const int sA2, const int sA3,
1602 const int sA4, const int sA5,
1603 const float* b, const int sB0, const int sB1,
1604 const int sB2, const int sB3,
1605 const int sB4, const int sB5
1606 ){
1607 for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
1608 for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y){
1609 for (int i2 = blockIdx.z; i2 < d2; i2 += gridDim.z){
1610 for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x){
1611 for (int i4 = threadIdx.y; i4 < d4; i4 += blockDim.y){
1612 for (int i5 = threadIdx.z; i5 < d5; i5 += blockDim.z){
1613 switch (operator_num) {
1614 case IADD:
1615 a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3 + i4*sA4 + i5*sA5]
1616 += b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3 + i4*sB4 + i5*sB5];
1617 break;
1618 case IDIV:
1619 a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3 + i4*sA4 + i5*sA5]
1620 /= b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3 + i4*sB4 + i5*sB5];
1621 break;
1622 case CPY:
1623 a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3 + i4*sA4 + i5*sA5]
1624 = b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3 + i4*sB4 + i5*sB5];
1625 break;
1626 }
1627 }
1628 }
1629 }
1630 }
1631 }
1632 }
1633 }
1634
1635 /*
1636 CudaNdarray_inplace_elemwise
1637 Compute elemwise, working inplace on A.
1638 Currently implemented A / B, A + B and A = B
1639 (the last is not tested and not used!)
1640
1641 py_self - the CudaNdarray that we'll modify (A)
1642 py_other - the other argument (B)
1643 fct_nb - which operation to perform (operator_t)
1644
1645 Returns 0 on success.
1646 Returns -1 on failure, and sets Python exception.
1647
1648 */
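// A call sketch (IADD is the operator_t member used for in-place addition in
// the switch below; A and B stand for two CudaNdarray* here):
//
//   if (CudaNdarray_inplace_elemwise((PyObject*)A, (PyObject*)B, IADD) == 0) {
//       // A now holds A + B, with B broadcast over its size-1 dimensions
//   }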
1649 int
1650 CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t fct_nb)
1651 {
1652 int verbose = 0;
1653 void (*k3)(const int, const int, const int,
1654 float*, const int, const int, const int,
1655 const float*, const int, const int, const int);
1656 void (*k4)(const int, const int, const int, const int,
1657 float*, const int, const int,
1658 const int, const int,
1659 const float*, const int, const int,
1660 const int, const int);
1661 void (*k6)(const int, const int,
1662 const int, const int,
1663 const int, const int,
1664 float*, const int, const int,
1665 const int, const int,
1666 const int, const int,
1667 const float*, const int, const int,
1668 const int, const int,
1669 const int, const int);
1670 switch (fct_nb)
1671 {
1672 case IADD:
1673 k3 = k_ielem_3<IADD>;
1674 k4 = k_ielem_4<IADD>;
1675 k6 = k_ielem_6<IADD>;
1676 break;
1677 case IDIV:
1678 k3 = k_ielem_3<IDIV>;
1679 k4 = k_ielem_4<IDIV>;
1680 k6 = k_ielem_6<IDIV>;
1681 break;
1682 case CPY:
1683 k3 = k_ielem_3<CPY>;
1684 k4 = k_ielem_4<CPY>;
1685 k6 = k_ielem_6<CPY>;
1686 break;
1687 default:
1688 assert (0);
1689 PyErr_Format(
1690 PyExc_TypeError,
1691 "CudaNdarray_inplace_elemwise invalid fct_nb (%i).",
1692 (int)fct_nb);
1693 return -1;
1694 }
1695 if (!CudaNdarray_Check(py_self)) {
1696 PyErr_SetString(
1697 PyExc_TypeError,
1698 "CudaNdarray_inplace_elemwise need a CudaNdarray on left");
1699 return -1;
1700 }
1701 CudaNdarray * new_other = NULL;
1702 if (!CudaNdarray_Check(py_other)) {
1703 new_other = (CudaNdarray*) CudaNdarray_New();
1704 if(!new_other)
1705 {
1706 return -1;
1707 }
1708 if(CudaNdarray_CopyFromArray(new_other, (PyArrayObject *) py_other))
1709 {
1710 Py_XDECREF(new_other);
1711 return -1;
1712 }
1713 py_other = (PyObject *) new_other;
1714 }
1715
1716 CudaNdarray * self = (CudaNdarray *)py_self;
1717 CudaNdarray * other = (CudaNdarray *)py_other;
1718
1719 if (verbose)
1720 {
1721 fprintf(stderr,
1722 "INPLACE ADD/DIV for self->nd=%d other->nd=%d\n",
1723 self->nd, other->nd);
1724 }
1725
1726 //standard elemwise nb dim checks
1727 if (self->nd < other->nd)
1728 {
1729 PyErr_Format(
1730 PyExc_TypeError,
1731 "CudaNdarray_inplace_elemwise: The destination need more or the"
1732 " same number of dimensions then the source. Got %d and %d.",
1733 self->nd, other->nd);
1734 Py_XDECREF(new_other);
1735 return -1;
1736 }
1737
1738 //broadcast to the same number of dimensions.
1739 int* other_dims = (int*) alloca(self->nd * sizeof(int));
1740 int* other_strides = (int*) alloca(self->nd * sizeof(int));
1741 int added_dims = self->nd - other->nd;
1742 // Add the added broadcasted dimensions
1743 for (int i = 0; i< added_dims; ++i)
1744 {
1745 other_dims[i] = 1;
1746 other_strides[i] = 0;
1747 }
1748 // Copy the existing dimensions
1749 for (int i = 0; i< other->nd; ++i)
1750 {
1751 other_dims[i+added_dims] = CudaNdarray_HOST_DIMS(other)[i];
1752 other_strides[i+added_dims] = CudaNdarray_HOST_STRIDES(other)[i];
1753 }
1754
1755 //standard elemwise dim checks
1756 unsigned int size = 1;
1757 for (int i = 0; i< self->nd; ++i)
1758 {
1759 if ((CudaNdarray_HOST_DIMS(self)[i] != other_dims[i])
1760 && (other_dims[i] != 1))
1761 {
1762 PyErr_SetString(
1763 PyExc_ValueError,
1764 "CudaNdarray_inplace_elemwise need same dimensions (or broadcastable dimension)");
1765 Py_XDECREF(new_other);
1766 return -1;
1767 }
1768 // if we're broadcasting other, then make sure it has stride 0
1769 assert ((CudaNdarray_HOST_DIMS(self)[i] == other_dims[i])
1770 || (other_strides[i] == 0));
1771 size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
1772 }
1773
1774 if (size==0)
1775 {
1776 int other_size = CudaNdarray_SIZE((CudaNdarray *)py_other);
1777 if (!(other_size == 0 || other_size == 1))
1778 {
1779 PyErr_SetString(
1780 PyExc_ValueError,
1781 "CudaNdarray_inplace_elemwise cannot work inplace on"
1782 " un-initialized array when the new value have more than"
1783 " 0 or 1 broadcastable dimensions");
1784 Py_XDECREF(new_other);
1785 return -1;
1786 }
1787 Py_XDECREF(new_other);
1788 return 0;
1789 }
1790
1791 switch(self->nd)
1792 {
1793 case 0:
1794 {
1795 dim3 n_blocks(1, 1, 1);
1796 dim3 n_threads(1);
1797 k3<<<n_blocks, n_threads>>>(
1798 1, //d0
1799 1, //d1
1800 1, //d2
1801 CudaNdarray_DEV_DATA(self),
1802 1, //strides
1803 1,
1804 1,
1805 CudaNdarray_DEV_DATA(other),
1806 1, //strides
1807 1,
1808 1);
1809 CNDA_THREAD_SYNC;
1810 cudaError_t err = cudaGetLastError();
1811 if (cudaSuccess != err)
1812 {
1813 PyErr_Format(
1814 PyExc_RuntimeError,
1815 "CudaNdarray_inplace_elemwise case0: Cuda error: %s: %s.\n",
1816 "k3",
1817 cudaGetErrorString(err));
1818 Py_XDECREF(new_other);
1819 return -1;
1820 }
1821 }
1822 break;
1823 case 1:
1824 {
1825 dim3 n_blocks(1, 1, 1);
1826 dim3 n_threads(
1827 std::min(
1828 CudaNdarray_HOST_DIMS(self)[0],
1829 NUM_VECTOR_OP_THREADS_PER_BLOCK));
1830 k3<<<n_blocks, n_threads>>>(
1831 1, //dimensions
1832 1,
1833 CudaNdarray_HOST_DIMS(self)[0],
1834 CudaNdarray_DEV_DATA(self),
1835 1, //strides
1836 1,
1837 CudaNdarray_HOST_STRIDES(self)[0],
1838 CudaNdarray_DEV_DATA(other),
1839 1, //strides
1840 1,
1841 other_strides[0]);
1842 CNDA_THREAD_SYNC;
1843 cudaError_t err = cudaGetLastError();
1844 if (cudaSuccess != err)
1845 {
1846 PyErr_Format(
1847 PyExc_RuntimeError,
1848 "CudaNdarray_inplace_elemwise case1: Cuda error: %s: %s.\n",
1849 "k3",
1850 cudaGetErrorString(err));
1851 Py_XDECREF(new_other);
1852 return -1;
1853 }
1854 }
1855 break;
1856 case 2:
1857 {
1858 //TODO: if both self and other are f-contiguous
1859 // Then flip the block and thread dimensions
1860 // to make contiguous reads & writes
1861 dim3 n_blocks(1,
1862 std::min(
1863 CudaNdarray_HOST_DIMS(self)[0],
1864 NUM_VECTOR_OP_BLOCKS));
1865 dim3 n_threads(
1866 std::min(
1867 CudaNdarray_HOST_DIMS(self)[1],
1868 NUM_VECTOR_OP_THREADS_PER_BLOCK));
1869 k3<<<n_blocks, n_threads>>>(1,
1870 CudaNdarray_HOST_DIMS(self)[0],
1871 CudaNdarray_HOST_DIMS(self)[1],
1872 CudaNdarray_DEV_DATA(self),
1873 1,
1874 CudaNdarray_HOST_STRIDES(self)[0],
1875 CudaNdarray_HOST_STRIDES(self)[1],
1876 CudaNdarray_DEV_DATA(other),
1877 1,
1878 other_strides[0],
1879 other_strides[1]);
1880 CNDA_THREAD_SYNC;
1881 cudaError_t err = cudaGetLastError();
1882 if (cudaSuccess != err)
1883 {
1884 PyErr_Format(
1885 PyExc_RuntimeError,
1886 "CudaNdarray_inplace_elemwise case2: Cuda error: %s: %s.\n",
1887 "k3",
1888 cudaGetErrorString(err));
1889 Py_XDECREF(new_other);
1890 return -1;
1891 }
1892 }
1893 break;
1894 case 3:
1895 {
1896 //TODO: Dimshuffle so that at least one of the arrays
1897 // has a contiguous dimension on the thread idx.
1898 dim3 n_blocks(
1899 std::min(
1900 CudaNdarray_HOST_DIMS(self)[0],
1901 NUM_VECTOR_OP_BLOCKS),
1902 CudaNdarray_HOST_DIMS(self)[1]);
1903 while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS)
1904 n_blocks.y /= 2;
1905 dim3 n_threads(
1906 std::min(
1907 CudaNdarray_HOST_DIMS(self)[2],
1908 NUM_VECTOR_OP_THREADS_PER_BLOCK));
1909 k3<<<n_blocks, n_threads>>>(
1910 CudaNdarray_HOST_DIMS(self)[0],
1911 CudaNdarray_HOST_DIMS(self)[1],
1912 CudaNdarray_HOST_DIMS(self)[2],
1913 CudaNdarray_DEV_DATA(self),
1914 CudaNdarray_HOST_STRIDES(self)[0],
1915 CudaNdarray_HOST_STRIDES(self)[1],
1916 CudaNdarray_HOST_STRIDES(self)[2],
1917 CudaNdarray_DEV_DATA(other),
1918 other_strides[0],
1919 other_strides[1],
1920 other_strides[2]);
1921 CNDA_THREAD_SYNC;
1922 cudaError_t err = cudaGetLastError();
1923 if (cudaSuccess != err)
1924 {
1925 PyErr_Format(
1926 PyExc_RuntimeError,
1927 "CudaNdarray_inplace_elemwise case3: Cuda error: %s: %s.\n",
1928 "k3",
1929 cudaGetErrorString(err));
1930 Py_XDECREF(new_other);
1931 return -1;
1932 }
1933 }
1934 break;
1935 case 4:
1936 {
1937 dim3 n_blocks(
1938 std::min(
1939 CudaNdarray_HOST_DIMS(self)[0],
1940 NUM_VECTOR_OP_BLOCKS),
1941 CudaNdarray_HOST_DIMS(self)[1]
1942 );
1943 while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS)
1944 n_blocks.y /= 2;
1945 dim3 n_threads(
1946 std::min(
1947 CudaNdarray_HOST_DIMS(self)[2],
1948 NUM_VECTOR_OP_THREADS_PER_BLOCK)
1949 //TODO: DON'T YOU NEED TO PUT DIMS[3] in here???
1950 );
1951 k4<<<n_blocks, n_threads>>>(
1952 CudaNdarray_HOST_DIMS(self)[0],
1953 CudaNdarray_HOST_DIMS(self)[1],
1954 CudaNdarray_HOST_DIMS(self)[2],
1955 CudaNdarray_HOST_DIMS(self)[3],
1956 CudaNdarray_DEV_DATA(self),
1957 CudaNdarray_HOST_STRIDES(self)[0],
1958 CudaNdarray_HOST_STRIDES(self)[1],
1959 CudaNdarray_HOST_STRIDES(self)[2],
1960 CudaNdarray_HOST_STRIDES(self)[3],
1961 CudaNdarray_DEV_DATA(other),
1962 other_strides[0],
1963 other_strides[1],
1964 other_strides[2],
1965 other_strides[3]);
1966 CNDA_THREAD_SYNC;
1967 cudaError_t err = cudaGetLastError();
1968 if (cudaSuccess != err)
1969 {
1970 PyErr_Format(
1971 PyExc_RuntimeError,
1972 "CudaNdarray_inplace_elemwise case4: Cuda error: %s: %s.\n",
1973 "k4",
1974 cudaGetErrorString(err));
1975 Py_XDECREF(new_other);
1976 return -1;
1977 }
1978 }
1979 break;
1980 case 5:
1981 {
1982 dim3 n_blocks(
1983 std::min(
1984 CudaNdarray_HOST_DIMS(self)[1],
1985 NUM_VECTOR_OP_BLOCKS),
1986 CudaNdarray_HOST_DIMS(self)[2]);
1987 while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS)
1988 n_blocks.y /= 2;
1989 dim3 n_threads(
1990 std::min(
1991 CudaNdarray_HOST_DIMS(self)[3],
1992 NUM_VECTOR_OP_THREADS_PER_BLOCK)
1993 //TODO: DON'T YOU NEED TO PUT DIMS[4] in here???
1994 );
1995 for (int i = 0; i < CudaNdarray_HOST_DIMS(self)[0]; ++i)
1996 {
1997 k4<<<n_blocks, n_threads>>>(
1998 CudaNdarray_HOST_DIMS(self)[1],
1999 CudaNdarray_HOST_DIMS(self)[2],
2000 CudaNdarray_HOST_DIMS(self)[3],
2001 CudaNdarray_HOST_DIMS(self)[4],
2002 CudaNdarray_DEV_DATA(self) + i * CudaNdarray_HOST_STRIDES(self)[0],
2003 CudaNdarray_HOST_STRIDES(self)[1],
2004 CudaNdarray_HOST_STRIDES(self)[2],
2005 CudaNdarray_HOST_STRIDES(self)[3],
2006 CudaNdarray_HOST_STRIDES(self)[4],
2007 CudaNdarray_DEV_DATA(other) + i * other_strides[0],
2008 other_strides[1],
2009 other_strides[2],
2010 other_strides[3],
2011 other_strides[4]);
2012 CNDA_THREAD_SYNC;
2013 cudaError_t err = cudaGetLastError();
2014 if( cudaSuccess != err)
2015 {
2016 PyErr_Format(
2017 PyExc_RuntimeError,
2018 "CudaNdarray_inplace_elemwise case5: Cuda error: %s: %s. n_block=(%ld,%ld) n_threads=%ld\n",
2019 "k5 with loop over k4",
2020 cudaGetErrorString(err),
2021 (long) n_blocks.x, (long) n_blocks.y, (long) n_threads.x);
2022 Py_XDECREF(new_other);
2023 return -1;
2024 }
2025 }
2026 }
2027 break;
2028 case 6:
2029 {
2030 dim3 n_blocks(
2031 std::min(
2032 CudaNdarray_HOST_DIMS(self)[0],
2033 NUM_VECTOR_OP_BLOCKS),
2034 CudaNdarray_HOST_DIMS(self)[1],
2035 CudaNdarray_HOST_DIMS(self)[2]
2036 );
2037 while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS)
2038 n_blocks.y /= 2;
2039 // GTX285 (compute capability 1.3) doesn't support n_blocks.z > 1;
2040 // compute capability 2.0 supports up to 65535 for n_blocks.z
2041 //while (n_blocks.x * n_blocks.y * n_blocks.z > NUM_VECTOR_OP_BLOCKS)
2042 // n_blocks.z /= 2;
2043 n_blocks.z = 1;
2044 dim3 n_threads(
2045 std::min(
2046 CudaNdarray_HOST_DIMS(self)[3],
2047 NUM_VECTOR_OP_THREADS_PER_BLOCK)
2048 //TODO: DON'T YOU NEED TO PUT DIMS[4] in here???
2049 //TODO: DON'T YOU NEED TO PUT DIMS[5] in here???
2050 );
2051 k6<<<n_blocks, n_threads>>>(
2052 CudaNdarray_HOST_DIMS(self)[0],
2053 CudaNdarray_HOST_DIMS(self)[1],
2054 CudaNdarray_HOST_DIMS(self)[2],
2055 CudaNdarray_HOST_DIMS(self)[3],
2056 CudaNdarray_HOST_DIMS(self)[4],
2057 CudaNdarray_HOST_DIMS(self)[5],
2058 CudaNdarray_DEV_DATA(self),
2059 CudaNdarray_HOST_STRIDES(self)[0],
2060 CudaNdarray_HOST_STRIDES(self)[1],
2061 CudaNdarray_HOST_STRIDES(self)[2],
2062 CudaNdarray_HOST_STRIDES(self)[3],
2063 CudaNdarray_HOST_STRIDES(self)[4],
2064 CudaNdarray_HOST_STRIDES(self)[5],
2065 CudaNdarray_DEV_DATA(other),
2066 other_strides[0],
2067 other_strides[1],
2068 other_strides[2],
2069 other_strides[3],
2070 other_strides[4],
2071 other_strides[5]);
2072 CNDA_THREAD_SYNC;
2073 cudaError_t err = cudaGetLastError();
2074 if (cudaSuccess != err)
2075 {
2076 PyErr_Format(
2077 PyExc_RuntimeError,
2078 "CudaNdarray_inplace_elemwise case6: Cuda error: %s: %s. n_blocks=(%ld, %ld, %ld) n_threads=(%ld)\n",
2079 "k6",
2080 cudaGetErrorString(err),
2081 (long) n_blocks.x, (long) n_blocks.y, (long) n_blocks.z,
2082 (long) n_threads.x);
2083 Py_XDECREF(new_other);
2084 return -1;
2085 }
2086 }
2087 break;
2088 default:
2089 {
2090 PyErr_Format(
2091 PyExc_NotImplementedError,
2092 "inplace_elemwise w nd=%i\n",
2093 self->nd);
2094 Py_XDECREF(new_other);
2095 return -1;
2096 }
2097 }
2098 if (verbose)
2099 fprintf(stderr, "INPLACE ADD/DIV end\n");
2100 Py_XDECREF(new_other);
2101 return 0;
2102 }
2103
2104 /*
2105 * We need this inplace Add to support IncSubTensor
2106 * It returns py_self on success with an additional reference. Else NULL.
2107 */
2108 // Will be called by __iadd__ in Python
2109 PyObject *
2110 CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other)
2111 {
2112 if (CudaNdarray_inplace_elemwise(py_self, py_other, IADD))
2113 {
2114 return NULL;
2115 }
2116 Py_INCREF(py_self);
2117 return py_self;
2118 }
2119
2120 /*
2121 * We need this inplace div for cuda/tests/test_basic_ops.py:test_shared_options
2122 * It returns py_self on success with an additional reference. Else NULL.
2123 */
2124 // Will be called by __idiv__ in Python
2125 static PyObject *
2126 CudaNdarray_inplace_div(PyObject* py_self, PyObject * py_other)
2127 {
2128 if (CudaNdarray_inplace_elemwise(py_self, py_other, IDIV))
2129 {
2130 return NULL;
2131 }
2132 Py_INCREF(py_self);
2133 return py_self;
2134 }
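
For reference, a minimal Python sketch of how these two slots are reached. This is an assumption-laden example: it presumes the built extension imports as cuda_ndarray, that a CUDA device is present, and it uses the module's own gpu_init() and filter() defined later in this file.

    import numpy
    import cuda_ndarray                      # hypothetical import of the built module

    cuda_ndarray.gpu_init(0)                 # select device 0, init cublas
    ones = numpy.ones((2, 3), dtype='float32')
    a = cuda_ndarray.filter(ones, (False, False), 0, None)
    b = cuda_ndarray.filter(ones * 2, (False, False), 0, None)
    a += b    # __iadd__ -> CudaNdarray_inplace_add -> inplace_elemwise(IADD)
    a /= b    # __idiv__/__itruediv__ -> CudaNdarray_inplace_div -> inplace_elemwise(IDIV)
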
2135
2136 // The PyNumberMethods struct layout changed in a non-trivial way from 2 to 3.
2137 #if PY_MAJOR_VERSION == 3
2138 static PyNumberMethods CudaNdarrayNumberMethods =
2139 {
2140 (binaryfunc)CudaNdarray_add, //binaryfunc nb_add; __add__
2141 0, //binaryfunc nb_subtract;
2142 0, //binaryfunc nb_multiply;
2143 0, //binaryfunc nb_remainder;
2144 0, //binaryfunc nb_divmod;
2145 0, //ternaryfunc nb_power;
2146 0, //unaryfunc nb_negative;
2147 0, //unaryfunc nb_positive;
2148 0, //unaryfunc nb_absolute;
2149 0, //inquiry nb_bool;
2150 0, //unaryfunc nb_invert;
2151 0, //binaryfunc nb_lshift;
2152 0, //binaryfunc nb_rshift;
2153 0, //binaryfunc nb_and;
2154 0, //binaryfunc nb_xor;
2155 0, //binaryfunc nb_or;
2156 0, //unaryfunc nb_int;
2157 0, //void *nb_reserved;
2158 0, //unaryfunc nb_float;
2159
2160 (binaryfunc)CudaNdarray_inplace_add, //binaryfunc nb_inplace_add; __iadd__
2161 0, //binaryfunc nb_inplace_subtract;
2162 0, //binaryfunc nb_inplace_multiply;
2163 0, //binaryfunc nb_inplace_remainder;
2164 0, //ternaryfunc nb_inplace_power;
2165 0, //binaryfunc nb_inplace_lshift;
2166 0, //binaryfunc nb_inplace_rshift;
2167 0, //binaryfunc nb_inplace_and;
2168 0, //binaryfunc nb_inplace_xor;
2169 0, //binaryfunc nb_inplace_or;
2170
2171 0, //binaryfunc nb_floor_divide;
2172 0, //binaryfunc nb_true_divide;
2173 0, //binaryfunc nb_inplace_floor_divide;
2174 (binaryfunc)CudaNdarray_inplace_div, //binaryfunc nb_inplace_true_divide; __idiv__
2175
2176 0, //unaryfunc nb_index
2177 };
2178 #else
2179 static PyNumberMethods CudaNdarrayNumberMethods =
2180 {
2181 (binaryfunc)CudaNdarray_add, //binaryfunc nb_add; __add__
2182 0, //binaryfunc nb_subtract; __sub__
2183 0, //binaryfunc nb_multiply; __mul__
2184 0, //binaryfunc nb_divide; __div__
2185 0, //binaryfunc nb_remainder; __mod__
2186 0, //binaryfunc nb_divmod; __divmod__
2187 0, //ternaryfunc nb_power; __pow__
2188 0, //unaryfunc nb_negative; __neg__
2189 0, //unaryfunc nb_positive; __pos__
2190 0, //unaryfunc nb_absolute; __abs__
2191 0, //inquiry nb_nonzero; __nonzero__ /* Used by PyObject_IsTrue */
2192 0, //unaryfunc nb_invert; __invert__
2193 0, //binaryfunc nb_lshift; __lshift__
2194 0, //binaryfunc nb_rshift; __rshift__
2195 0, //binaryfunc nb_and; __and__
2196 0, //binaryfunc nb_xor; __xor__
2197 0, //binaryfunc nb_or; __or__
2198 0, //coercion nb_coerce; __coerce__ /* Used by the coerce() function */
2199 0, //unaryfunc nb_int; __int__
2200 0, //unaryfunc nb_long; __long__
2201 0, //unaryfunc nb_float; __float__
2202 0, //unaryfunc nb_oct; __oct__
2203 0, //unaryfunc nb_hex; __hex__
2204
2205 /* Added in release 2.0 */
2206 (binaryfunc)CudaNdarray_inplace_add, //binaryfunc nb_inplace_add; __iadd__
2207 0, //binaryfunc nb_inplace_subtract; __isub__
2208 0, //binaryfunc nb_inplace_multiply; __imul__
2209 (binaryfunc)CudaNdarray_inplace_div, //binaryfunc nb_inplace_divide; __idiv__
2210 0, //binaryfunc nb_inplace_remainder; __imod__
2211 0, //ternaryfunc nb_inplace_power; __ipow__
2212 0, //binaryfunc nb_inplace_lshift; __ilshift__
2213 0, //binaryfunc nb_inplace_rshift; __irshift__
2214 0, //binaryfunc nb_inplace_and; __iand__
2215 0, //binaryfunc nb_inplace_xor; __ixor__
2216 0, //binaryfunc nb_inplace_or; __ior__
2217
2218 /* Added in release 2.2 */
2219 0, //binaryfunc nb_floor_divide; __floordiv__
2220 0, //binaryfunc nb_true_divide; __truediv__
2221 0, //binaryfunc nb_inplace_floor_divide; __ifloordiv__
2222 0, //binaryfunc nb_inplace_true_divide; __itruediv__
2223
2224 #if PY_MINOR_VERSION > 4
2225 /* Added in release 2.5 */
2226 0 //unaryfunc nb_index; __index__
2227 #endif
2228 };
2229 #endif
2230
2231
2232 /////////////////////
2233 // Mapping protocol
2234 /////////////////////
2235
2236 // Will be called by __len__ in Python
2237 static Py_ssize_t
2238 CudaNdarray_len(PyObject * py_self)
2239 {
2240 CudaNdarray * self = (CudaNdarray*) py_self;
2241 if (self->nd <= 0)
2242 {
2243 return (Py_ssize_t) 0;
2244 }
2245 else
2246 {
2247 return (Py_ssize_t) CudaNdarray_HOST_DIMS(self)[0];
2248 }
2249 }
2250
2251 // Will be called by __getitem__ in Python
2252 PyObject *
2253 CudaNdarray_Subscript(PyObject * py_self, PyObject * key)
2254 {
2255 int verbose = 0;
2256 if (verbose) fprintf(stderr, "Subscript .... \n");
2257 CudaNdarray * self = (CudaNdarray*) py_self;
2258 PyObject * py_rval = NULL;
2259 CudaNdarray * rval = NULL;
2260 PyObject * intobj = NULL;
2261
2262 //PyObject_Print(key, stderr, 0);
2263
2264 if (key == Py_Ellipsis)
2265 {
2266 Py_INCREF(py_self);
2267 return py_self;
2268 }
2269 if ((intobj=PyNumber_Int(key))) //INDEXING BY INTEGER
2270 //else if (PyInt_Check(key)) //INDEXING BY INTEGER
2271 {
2272 int d_idx = PyInt_AsLong(intobj);
2273 Py_DECREF(intobj); intobj=NULL;
2274 //int d_idx = PyInt_AsLong(key);
2275 if (self->nd == 0)
2276 {
2277 PyErr_SetString(PyExc_IndexError, "0-d arrays can't be indexed");
2278 return NULL;
2279 }
2280 int d_dim = CudaNdarray_HOST_DIMS(self)[0];
2281 int offset = 0;
2282
2283 if ((d_idx >= 0) && (d_idx < d_dim))
2284 {
2285 //normal indexing
2286 offset += d_idx * CudaNdarray_HOST_STRIDES(self)[0];
2287 }
2288 else if ((d_idx < 0) && (d_idx >= -d_dim))
2289 {
2290 //end-based indexing
2291 // d_idx is negative
2292 offset += (d_dim + d_idx) * CudaNdarray_HOST_STRIDES(self)[0];
2293 }
2294 else
2295 {
2296 PyErr_Format(PyExc_IndexError,
2297 "index out of bounds. Asked %d, but size of %d",
2298 d_idx, d_dim);
2299 return NULL;
2300 }
2301
2302 //allocate our subtensor view
2303 py_rval = CudaNdarray_new_nd(self->nd - 1);
2304 rval = (CudaNdarray*) py_rval;
2305 if (!rval) return NULL;
2306 assert (0 == rval->data_allocated);
2307
2308 //initialize the view's data pointer to our own.
2309 if (CudaNdarray_set_device_data(rval, CudaNdarray_DEV_DATA(self) + offset, self))
2310 {
2311 Py_DECREF(rval);
2312 return NULL;
2313 }
2314 for (int d = 1; d < self->nd; ++d)
2315 {
2316 CudaNdarray_set_stride(rval, d-1, CudaNdarray_HOST_STRIDES(self)[d]);
2317 CudaNdarray_set_dim(rval, d-1, CudaNdarray_HOST_DIMS(self)[d]);
2318 }
2319 }
2320 else
2321 {
2322 PyErr_Clear();
2323 }
2324 if (PySlice_Check(key)) //INDEXING BY SLICE
2325 {
2326 if (verbose) fprintf(stderr, "by slice\n");
2327 if (self->nd == 0)
2328 {
2329 PyErr_SetString(PyExc_ValueError, "cannot slice a 0-d array");
2330 return NULL;
2331 }
2332
2333 int d_dim = CudaNdarray_HOST_DIMS(self)[0];
2334 Py_ssize_t start, stop, step, slen;
2335 if (PySlice_GetIndicesEx(SLICE_CAST(key), d_dim, &start, &stop, &step, &slen))
2336 {
2337 if (verbose)
2338 fprintf(stderr, "PySlice_GetIndicesEx failed\n");
2339 return NULL;
2340 }
2341 if (verbose)
2342 {
2343 std::cerr << "start " << start << "\n";
2344 std::cerr << "stop " << stop << "\n";
2345 std::cerr << "step " << step << "\n";
2346 std::cerr << "slen " << slen << "\n";
2347 }
2348
2349 //allocate our subtensor view
2350 py_rval = CudaNdarray_new_nd(self->nd);
2351 rval = (CudaNdarray*) py_rval;
2352 if (!rval) return NULL;
2353 assert (0 == rval->data_allocated);
2354
2355
2356 //initialize the view's data pointer to our own.
2357 if (CudaNdarray_set_device_data(rval,
2358 CudaNdarray_DEV_DATA(self) + start * CudaNdarray_HOST_STRIDES(self)[0],
2359 self))
2360 {
2361 Py_DECREF(rval);
2362 return NULL;
2363 }
2364 //initialize dimension 0 of rval
2365 CudaNdarray_set_stride(rval, 0,
2366 (slen == 1) ? 0 : step * CudaNdarray_HOST_STRIDES(self)[0]);
2367 CudaNdarray_set_dim(rval, 0, slen);
2368 if (verbose) std::cerr << "rval stride " << CudaNdarray_HOST_STRIDES(rval)[0] << "\n";
2369 // initialize dimensions > 0 of rval
2370 for (int d = 1; d < self->nd; ++d)
2371 {
2372 CudaNdarray_set_stride(rval, d, CudaNdarray_HOST_STRIDES(self)[d]);
2373 CudaNdarray_set_dim(rval, d, CudaNdarray_HOST_DIMS(self)[d]);
2374 }
2375 }
2376 if (PyTuple_Check(key)) //INDEXING BY TUPLE
2377 {
2378 if (verbose) fprintf(stderr, "by tuple\n");
2379 //elements of the tuple can be either integers or slices
2380 //the dimensionality of the view we will return is diminished for each slice in the tuple
2381
2382 if (PyTuple_Size(key) > self->nd)
2383 {
2384 PyErr_SetString(PyExc_IndexError, "index error");
2385 return NULL;
2386 }
2387
2388 //calculate the number of dimensions in the return value
2389 int rval_nd = self->nd;
2390 for (int d = 0; d < PyTuple_Size(key); ++d)
2391 {
2392 //On some platforms PyInt_Check(<type 'numpy.int64'>) returns true; on others it returns false.
2393 //So we use PyArray_IsAnyScalar, which should cover everything.
2394 rval_nd -= PyArray_IsAnyScalar(PyTuple_GetItem(key, d));
2395 }
2396
2397 //allocate our subtensor view
2398 py_rval = CudaNdarray_new_nd(rval_nd);
2399 rval = (CudaNdarray*) py_rval;
2400 if (!rval) return NULL;
2401 assert (0 == rval->data_allocated);
2402
2403 //initialize the view's data pointer to our own.
2404 if (CudaNdarray_set_device_data(rval, CudaNdarray_DEV_DATA(self), self))
2405 {
2406 Py_DECREF(rval);
2407 return NULL;
2408 }
2409
2410 // rval_d will refer to the current dimension in the rval.
2411 // It will not be incremented for integer keys, but will be incremented for slice
2412 // keys
2413 int rval_d = 0;
2414
2415 for (int d = 0; d < self->nd; ++d)
2416 {
2417 // Keys can be shorter than self->nd.
2418 // When that happens, the remaining dimensions are treated as full slices.
2419 if (d >=PyTuple_Size(key))
2420 {
2421 CudaNdarray_set_stride(rval, rval_d, CudaNdarray_HOST_STRIDES(self)[d]);
2422 CudaNdarray_set_dim(rval, rval_d, CudaNdarray_HOST_DIMS(self)[d]);
2423 ++rval_d;
2424 }
2425 else
2426 {
2427 PyObject * key_d = PyTuple_GetItem(key, d);
2428
2429 if (PySlice_Check(key_d))
2430 {
2431 Py_ssize_t start, stop, step, slen;
2432 if (PySlice_GetIndicesEx(SLICE_CAST(key_d), CudaNdarray_HOST_DIMS(self)[d], &start, &stop, &step, &slen))
2433 {
2434 Py_DECREF(rval);
2435 return NULL;
2436 }
2437 rval->devdata += start * CudaNdarray_HOST_STRIDES(self)[d];
2438 CudaNdarray_set_stride(rval, rval_d,
2439 (slen == 1) ? 0 : step * CudaNdarray_HOST_STRIDES(self)[d]);
2440 CudaNdarray_set_dim(rval, rval_d, slen);
2441 if (0)
2442 {
2443 std::cerr << "start " << start << "\n";
2444 std::cerr << "stop " << stop << "\n";
2445 std::cerr << "step " << step << "\n";
2446 std::cerr << "slen " << slen << "\n";
2447 }
2448 ++rval_d;
2449 }
2450 else if ((intobj=PyNumber_Int(key_d)))
2451 {
2452 assert(PyArray_IsAnyScalar(key_d));
2453 int d_idx = PyInt_AsLong(intobj);
2454 Py_DECREF(intobj);
2455 intobj = NULL;
2456 int d_dim = CudaNdarray_HOST_DIMS(self)[d];
2457
2458 if ((d_idx >= 0) && (d_idx < d_dim))
2459 {
2460 //normal indexing
2461 rval->devdata += d_idx * CudaNdarray_HOST_STRIDES(self)[d];
2462 }
2463 else if ((d_idx < 0) && (d_idx >= -d_dim))
2464 {
2465 //end-based indexing
2466 rval->devdata += (d_dim + d_idx) * CudaNdarray_HOST_STRIDES(self)[d];
2467 }
2468 else
2469 {
2470 PyErr_Format(PyExc_IndexError,
2471 "index out of bounds. Asked %d for dimensions %d, but size of %d",
2472 d_idx, d, d_dim);
2473 Py_DECREF(rval);
2474 return NULL;
2475 }
2476 }
2477 else
2478 {
2479 PyErr_Clear(); // clear the error set by PyNumber_Int
2480 PyErr_SetString(PyExc_IndexError, "index must be either int or slice");
2481 Py_DECREF(rval);
2482 return NULL;
2483 }
2484 }
2485 }
2486 }
2487 if (py_rval)
2488 {
2489 if (verbose) fprint_CudaNdarray(stderr, self);
2490 if (verbose) fprint_CudaNdarray(stderr, rval);
2491 }
2492 else
2493 {
2494 PyErr_SetString(PyExc_NotImplementedError, "Unknown key type");
2495 return NULL;
2496 }
2497 return py_rval;
2498 }
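
A sketch of the key types the routine above accepts, under the same hypothetical setup as the earlier example (built module importable as cuda_ndarray, device initialized):

    import numpy
    import cuda_ndarray                      # hypothetical import of the built module

    cuda_ndarray.gpu_init(0)
    data = numpy.arange(12, dtype='float32').reshape(3, 4)
    a = cuda_ndarray.filter(data, (False, False), 0, None)
    row = a[1]       # integer key: one dimension dropped, shape (4,)
    rows = a[0:2]    # slice key: rank preserved, shape (2, 4)
    cell = a[2, 3]   # tuple key: one dimension dropped per integer, shape ()
    same = a[...]    # Ellipsis: the array itself, with an extra reference
    # each view shares device memory with 'a' and keeps it alive via .base
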
2499
2500 // Will be called by __setitem__ in Python
2501 // See http://docs.python.org/dev/py3k/c-api/object.html#PyObject_SetItem
2502 // Doesn't handle broadcasting, e.g. a[:] = 5
2503 // The right-hand side can only be a CudaNdarray,
2504 // an ndarray,
2505 // or the python scalar 0 when the destination is c contiguous.
2506 static int
2507 CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value)
2508 {
2509 int verbose = 0;
2510 if (verbose) fprintf(stderr, "CudaNdarray_setitem start\n");
2511 // We try to copy directly into this CudaNdarray from the ndarray
2512 CudaNdarray* rval = (CudaNdarray*)CudaNdarray_Subscript(o, key);
2513 CudaNdarray* new_value = NULL;
2514
2515 if(!rval){
2516 // CudaNdarray_Subscript failed and set the error msg.
2517 Py_XDECREF(rval);
2518 return -1;
2519 }
2520
2521 if(rval != (CudaNdarray*)o &&
2522 (rval->data_allocated ||
2523 // The new array should have a base
2524 !(((CudaNdarray*)rval)->base) ||
2525 // If the original array has no base, the base of the new
2526 // array should be the original one
2527 (!((CudaNdarray*)o)->base && ((CudaNdarray*)rval)->base != o) ||
2528 // Else, the two arrays should have the same base
2529 (((CudaNdarray*)o)->base && ((CudaNdarray*)rval)->base != ((CudaNdarray*)o)->base)))
2530 {
2531 // This case shouldn't happen, based on what I see in Subscript
2532 // but just in case it happens sometime in the future
2533
2534 PyErr_Format(PyExc_RuntimeError,
2535 "__getitem__ must return a CudaNdarray that refers to"
2536 " the original CudaNdarray, not a copy. rval.base=%p"
2537 " o.base=%p o=%p",
2538 (((CudaNdarray*)rval)->base), ((CudaNdarray*)o)->base, o);
2539 Py_DECREF(rval);
2540 return -1;
2541 }
2542
2543 PyObject * intobj = NULL;
2544 if (CudaNdarray_Check(o) && PyArray_Check(value)){
2545 if (verbose)
2546 fprintf(stderr,
2547 "CudaNdarray_setitem dest is a CudaNdarray and"
2548 " value is a ndarray\n");
2549 new_value = (CudaNdarray*) CudaNdarray_New();
2550 if(!new_value)
2551 {
2552 return -1;
2553 }
2554 if (CudaNdarray_CopyFromArray(new_value, (PyArrayObject *) value))
2555 {
2556 Py_XDECREF(new_value);
2557 Py_XDECREF(rval);
2558 return -1;
2559 }
2560 value = (PyObject *) new_value;
2561 }
2562 else if ((intobj=PyNumber_Int(value)))
2563 {
2564 if (verbose)
2565 fprintf(stderr,
2566 "CudaNdarray_setitem dest and value is a python number\n");
2567 if(! CudaNdarray_is_c_contiguous(rval)){
2568 PyErr_SetString(PyExc_NotImplementedError,
2569 "CudaNdarray.__setitem__: When the new value is a scalar"
2570 " of value 0 the part where we copy to must be c contiguous.");
2571 Py_XDECREF(rval);
2572 return -1;
2573 }
2574
2575 long val = PyInt_AsLong(intobj);
2576 Py_DECREF(intobj); intobj=NULL;
2577 if (val == 0)
2578 {
2579 cudaError_t err = cudaMemset(rval->devdata, 0,
2580 CudaNdarray_SIZE(rval) * sizeof(real));
2581 Py_XDECREF(rval);
2582 if (err)
2583 {
2584 // Clear the error flag, cudaMemset doesn't do it.
2585 // Currently this returns the same thing as err, but if in the future
2586 // it returns something else, I still don't see why we should ignore
2587 // it. All we want to do here is reset the flag.
2588 cudaGetLastError();
2589 PyErr_SetString(PyExc_RuntimeError,
2590 "CudaNdarray.__setitem__: cudaMemset failed");
2591 return -1;
2592 }
2593 return 0;
2594 } else {
2595 Py_XDECREF(rval);
2596 PyErr_SetString(PyExc_NotImplementedError,
2597 "CudaNdarray.__setitem__: we support setting only python"
2598 " scalar of value 0, numpy nd array and CudaNdarray.");
2599 return -1;
2600 }
2601 }
2602
2603 PyErr_Clear(); // clear PyNumber_Int error.
2604
2605 if(!CudaNdarray_Check(o) || !CudaNdarray_Check(value))
2606 {
2607 PyErr_SetString(PyExc_TypeError,
2608 "CudaNdarray.__setitem__: left must be a CudaNdarrays and right"
2609 " must be a CudaNdarrays, an ndarray or a python scalar of value 0.");
2610 Py_XDECREF(new_value);
2611 return -1;
2612 }
2613
2614 if (verbose)
2615 fprintf(stderr, "CudaNdarray_setitem dest and value are CudaNdarray\n");
2616
2617 if (cnda_copy_structure_to_device(rval))
2618 {
2619 PyErr_SetString(PyExc_RuntimeError,
2620 "CudaNdarray.__setitem__: syncing structure to device failed");
2621 Py_DECREF(rval);
2622 Py_XDECREF(new_value);
2623
2624 if (verbose)
2625 fprintf(stderr, "CudaNdarray_setitem error end\n");
2626 return -1;
2627 }
2628
2629 PyObject *baseSavedForComparison = rval->base;
2630
2631 if (CudaNdarray_CopyFromCudaNdarray(rval, (CudaNdarray*)value, true))
2632 {
2633 Py_DECREF((PyObject*)rval);
2634 Py_XDECREF(new_value);
2635
2636 if (verbose)
2637 fprintf(stderr, "CudaNdarray_setitem error end\n");
2638 return -1;
2639 }
2640
2641 assert (rval->base == baseSavedForComparison);
2642 assert (rval->dev_structure_fresh);
2643
2644 // Clean up locally-created references
2645 Py_DECREF(rval);
2646 Py_XDECREF(new_value);
2647
2648 return 0;
2649 }
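
The supported right-hand sides, as a sketch under the same hypothetical setup:

    import numpy
    import cuda_ndarray                      # hypothetical import of the built module

    cuda_ndarray.gpu_init(0)
    a = cuda_ndarray.filter(numpy.zeros((4, 4), dtype='float32'),
                            (False, False), 0, None)
    a[0] = numpy.arange(4, dtype='float32')  # ndarray: staged through a new CudaNdarray
    a[1] = a[0]                              # CudaNdarray: device-to-device copy
    a[:] = 0                                 # scalar 0 on a c-contiguous view: cudaMemset
    # a[:] = 5 raises NotImplementedError; only the scalar 0 is special-cased
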
2650
2651
2652 PyMappingMethods CudaNdarrayMappingMethods = {
2653 CudaNdarray_len, //lenfunc mp_length; __len__
2654 CudaNdarray_Subscript, //binaryfunc mp_subscript; __getitem__
2655 CudaNdarray_setitem //objobjargproc mp_ass_subscript; __setitem__
2656 };
2657
2658 ////////////////////
2659 //
2660 ////////////////////
2661
2662 static PyObject *
2663 CudaNdarray_get_shape(CudaNdarray *self, void *closure)
2664 {
2665 if (self->nd < 0)
2666 {
2667 PyErr_SetString(PyExc_ValueError, "CudaNdarray not initialized");
2668 return NULL;
2669 }
2670 PyObject * rval = PyTuple_New(self->nd);
2671 for (int i = 0; i < self->nd; ++i)
2672 {
2673 if (!rval || PyTuple_SetItem(rval, i, PyInt_FromLong(CudaNdarray_HOST_DIMS(self)[i])))
2674 {
2675 Py_XDECREF(rval);
2676 return NULL;
2677 }
2678
2679 }
2680 return rval;
2681 }
2682
2683 static int
2684 CudaNdarray_set_shape(CudaNdarray *self, PyObject *value, void *closure)
2685 {
2686 PyErr_SetString(PyExc_NotImplementedError, "TODO: call reshape");
2687 return -1;
2688 }
2689
2690 static PyObject *
2691 CudaNdarray_get_strides(CudaNdarray *self, void *closure)
2692 {
2693 if (self->nd < 0)
2694 {
2695 PyErr_SetString(PyExc_ValueError, "CudaNdarray not initialized");
2696 return NULL;
2697 }
2698 PyObject * rval = PyTuple_New(self->nd);
2699 for (int i = 0; i < self->nd; ++i)
2700 {
2701 if (!rval || PyTuple_SetItem(rval, i, PyInt_FromLong(CudaNdarray_HOST_STRIDES(self)[i])))
2702 {
2703 Py_XDECREF(rval);
2704 return NULL;
2705 }
2706
2707 }
2708 return rval;
2709 }
2710
2711 static int
2712 CudaNdarray_set_strides(CudaNdarray *self, PyObject *value, void *closure)
2713 {
2714 //npy_intp newstrides_bytes[PyTuple_Size(value)];
2715 if (PyTuple_Check(value)){
2716 if (PyTuple_Size(value) != CudaNdarray_NDIM(self)){
2717 PyErr_SetString(PyExc_ValueError,
2718 "The new strides tuple must have the same length"
2719 " as the number of dimensions");
2720 return -1;
2721 }
2722 }else if (PyList_Check(value)){
2723 if (PyList_Size(value) != CudaNdarray_NDIM(self)){
2724 PyErr_SetString(PyExc_ValueError,
2725 "The new strides list must have the same length"
2726 " as the number of dimensions");
2727 return -1;
2728 }
2729 }else{
2730 PyErr_SetString(PyExc_ValueError,
2731 "The new strides need to be encoded in a tuple or list");
2732 return -1;
2733 }
2734 npy_intp* newstrides = (npy_intp*) alloca(CudaNdarray_NDIM(self) * sizeof(npy_intp));
2735 if (PyTuple_Check(value)){
2736 for(int i=0; i < CudaNdarray_NDIM(self); i++){
2737 newstrides[i] = PyInt_AsLong(PyTuple_GetItem(value, Py_ssize_t(i)));
2738 //newstrides_bytes[i] = newstrides[i] * 4;
2739 }
2740 }else if (PyList_Check(value)){
2741 for(int i=0; i < CudaNdarray_NDIM(self); i++){
2742 newstrides[i] = PyInt_AsLong(PyList_GetItem(value, Py_ssize_t(i)));
2743 //newstrides_bytes[i] = newstrides[i] * 4;
2744 }
2745 }
2746 /*
2747 // Do not do this check, as ExtractDiag needs that, and NumPy does not seem
2748 // to do it.
2749 npy_intp dims[PyTuple_Size(value)];
2750 for(int i=0; i < CudaNdarray_NDIM(self); i++){
2751 dims[i] = CudaNdarray_HOST_DIMS(self)[i];
2752 }
2753 if (!PyArray_CheckStrides(4,
2754 CudaNdarray_NDIM(self),
2755 0, 0,
2756 dims,
2757 newstrides_bytes)){
2758 PyErr_SetString(PyExc_ValueError, "bad new strides");
2759 return -1;
2760 }
2761 */
2762 for(int i=0; i < CudaNdarray_NDIM(self); i++){
2763 CudaNdarray_set_stride(self, i, newstrides[i]);
2764 }
2765 return 0;
2766 }
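
Note that the stride getter/setter above work in elements, not bytes. A sketch under the same hypothetical setup:

    import numpy
    import cuda_ndarray                      # hypothetical import of the built module

    cuda_ndarray.gpu_init(0)
    a = cuda_ndarray.filter(numpy.zeros((2, 3), dtype='float32'),
                            (False, False), 0, None)
    print(a.shape)        # (2, 3)
    print(a._strides)     # (3, 1) for a c-contiguous array: element strides, not bytes
    a._strides = (3, 1)   # the setter accepts a tuple or list of length ndim
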
2767
2768 static PyObject *
2769 CudaNdarray_get_dev_data(CudaNdarray *self, void *closure)
2770 {
2771 float * p = CudaNdarray_DEV_DATA(self);
2772 //printf("get_dev_data %p %li \n", p, (long int)p );
2773 return PyInt_FromSize_t((size_t) CudaNdarray_DEV_DATA(self));
2774 }
2775
2776 static int
2777 CudaNdarray_set_dev_data(CudaNdarray *self, PyObject *value, void *closure)
2778 {
2779 Py_ssize_t newdevdata = PyInt_AsSsize_t(value);
2780 //printf("set_dev_data %p %li \n",(float*)newdevdata ,newdevdata);
2781 if (PyErr_Occurred())
2782 {
2783 return -1;
2784 }
2785 return CudaNdarray_set_device_data(self, (float*)newdevdata, (CudaNdarray*)self->base);
2786 }
2787
2788 static PyObject *
2789 CudaNdarray_get_dtype(CudaNdarray *self, void *closure)
2790 {
2791 return PyString_FromString("float32");
2792 }
2793
2794 static PyObject *
2795 CudaNdarray_get_ndim(CudaNdarray *self, void *closure)
2796 {
2797 return PyInt_FromLong(self->nd);
2798 }
2799
2800 static PyObject *
2801 CudaNdarray_get_base(CudaNdarray *self, void *closure)
2802 {
2803 PyObject * base = self->base;
2804 if (!base)
2805 {
2806 // We cannot return a NULL pointer, use None instead
2807 base = Py_None;
2808 }
2809 Py_INCREF(base);
2810 return base;
2811 }
2812
2813 void put_in_dict(PyObject * dict, const char * key, int val)
2814 {
2815 PyObject * k = PyString_FromString(key);
2816 PyObject * v = PyInt_FromLong(val);
2817 PyDict_SetItem(dict, k, v);
2818 Py_DECREF(k);
2819 Py_DECREF(v);
2820 }
2821
2822 PyObject *
2823 GetDeviceProperties(PyObject* _unused, PyObject* args)
2824 {
2825 int dev_id = -1;
2826 if (! PyArg_ParseTuple(args, "i", &dev_id))
2827 return NULL;
2828 cudaDeviceProp deviceProp;
2829 cudaGetDeviceProperties(&deviceProp, dev_id);
2830
2831 PyObject * dict = PyDict_New();
2832 PyObject * str= PyString_FromString("name");
2833 PyObject * i = PyString_FromString(deviceProp.name);
2834 PyDict_SetItem(dict, str, i);
2835 Py_DECREF(str);
2836 Py_DECREF(i);
2837
2838 put_in_dict(dict, "major", deviceProp.major);
2839 put_in_dict(dict, "minor", deviceProp.minor);
2840 #if CUDART_VERSION >= 2020
2841 int driverVersion = 0, runtimeVersion = 0;
2842 cudaDriverGetVersion(&driverVersion);
2843 cudaRuntimeGetVersion(&runtimeVersion);
2844 put_in_dict(dict, "driverVersion", driverVersion);
2845 put_in_dict(dict, "runtimeVersion", runtimeVersion);
2846 #endif
2847 #if CUDART_VERSION >= 2000
2848
2849 put_in_dict(dict, "multiProcessorCount", deviceProp.multiProcessorCount);
2850 //if ConvertSMVer2Cores is not defined in cuda_runtime_api.h, the runtime is too old.
2851 int sm_cores = -1;
2852 if(deviceProp.major==1)
2853 sm_cores = 32;
2854 else if(deviceProp.major==2 && deviceProp.minor==0)
2855 sm_cores = 32;
2856 else if(deviceProp.major==2 && deviceProp.minor==1)
2857 sm_cores = 48;
2858 put_in_dict(dict, "coresCount", sm_cores * deviceProp.multiProcessorCount);
2859 #endif
2860 put_in_dict(dict, "totalConstMem", deviceProp.totalConstMem);
2861 put_in_dict(dict, "sharedMemPerBlock", deviceProp.sharedMemPerBlock);
2862 put_in_dict(dict, "regsPerBlock", deviceProp.regsPerBlock);
2863 put_in_dict(dict, "warpSize", deviceProp.warpSize);
2864 put_in_dict(dict, "maxThreadsPerBlock", deviceProp.maxThreadsPerBlock);
2865 put_in_dict(dict, "maxThreadsDim0", deviceProp.maxThreadsDim[0]);
2866 put_in_dict(dict, "maxThreadsDim1", deviceProp.maxThreadsDim[1]);
2867 put_in_dict(dict, "maxThreadsDim2", deviceProp.maxThreadsDim[2]);
2868 put_in_dict(dict, "maxGridSize0", deviceProp.maxGridSize[0]);
2869 put_in_dict(dict, "maxGridSize1", deviceProp.maxGridSize[1]);
2870 put_in_dict(dict, "maxGridSize2", deviceProp.maxGridSize[2]);
2871 put_in_dict(dict, "memPitch", deviceProp.memPitch);
2872 put_in_dict(dict, "textureAlignment", deviceProp.textureAlignment);
2873 put_in_dict(dict, "clockRate", deviceProp.clockRate);
2874 #if CUDART_VERSION >= 2000
2875 put_in_dict(dict, "deviceOverlap", deviceProp.deviceOverlap);
2876 #endif
2877 #if CUDART_VERSION >= 2020
2878 put_in_dict(dict, "kernelExecTimeoutEnabled", deviceProp.kernelExecTimeoutEnabled);
2879 put_in_dict(dict, "integrated", deviceProp.integrated);
2880 put_in_dict(dict, "canMapHostMemory", deviceProp.canMapHostMemory);
2881 put_in_dict(dict, "computeMode", deviceProp.computeMode);
2882 //The documentation for this field says: 0 - normal mode, 1 - only 1 context, 2 - no context.
2883 #endif
2884 #if CUDART_VERSION >= 3000
2885 put_in_dict(dict, "concurrentKernels", deviceProp.concurrentKernels);
2886 #endif
2887 #if CUDART_VERSION >= 3010
2888 put_in_dict(dict, "ECCEnabled", deviceProp.ECCEnabled);
2889 #endif
2890 #if CUDART_VERSION >= 3020
2891 put_in_dict(dict, "tccDriver", deviceProp.tccDriver);
2892 #endif
2893
2894 return dict;
2895 }
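
A sketch of reading this dictionary from Python (which keys are present depends on the CUDART_VERSION branches above):

    import cuda_ndarray                      # hypothetical import of the built module

    props = cuda_ndarray.device_properties(0)
    print(props['name'], props['major'], props['minor'])
    print(props['multiProcessorCount'], props['clockRate'])
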
2896
2897 /*
2898  * Returns a tuple (free, total) with the free and total amount of memory available for allocation by the device, in bytes.
2899 */
2900 PyObject *
2901 GetDeviceMemInfo(PyObject* _unused, PyObject* dummy)
2902 {
2903 size_t free = 0, total = 0;
2904 if(g_gpu_context_active == 0){
2905 PyErr_Format(PyExc_RuntimeError, "No gpu device selected yet. Please make sure the gpu device was initialized by Theano before.");
2906 return NULL;
2907 }
2908
2909 cudaError_t err = cudaMemGetInfo(&free, &total);
2910 if (err != cudaSuccess){
2911 // Clear the error flag, cudaMemGetInfo doesn't do it.
2912         // Currently this returns the same thing as err, but if in the future
2913         // it returns something else, I still don't see why we should ignore
2914 // it. All we want to do here is reset the flag.
2915 cudaGetLastError();
2916 PyErr_Format(PyExc_RuntimeError,
2917 "Error while getting memory info about the gpu: %s",
2918 cudaGetErrorString(err));
2919 return NULL;
2920 }
2921 return PyTuple_Pack(2, PyLong_FromLong(free), PyLong_FromLong(total));
2922 }
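
A sketch of the Python side; the context check above means gpu_init must have run first:

    import cuda_ndarray                      # hypothetical import of the built module

    cuda_ndarray.gpu_init(0)
    free, total = cuda_ndarray.mem_info()
    print('%.1f%% of gpu memory free' % (100.0 * free / total))
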
2923
2924 /*
2925  * Synchronize with all the gpu device streams.
2926 */
2927 PyObject *
2928 CudaNdarray_synchronize(PyObject* _unused, PyObject* dummy)
2929 {
2930 CNDA_BEGIN_ALLOW_THREADS
2931 cudaThreadSynchronize();
2932 CNDA_END_ALLOW_THREADS
2933 Py_INCREF(Py_None);
2934 return Py_None;
2935 }
2936
2937 /*
2938  * Exists and returns true if we are linked with cublas v2.
2939 */
2940 PyObject *
2941 CudaNdarray_cublasv2(PyObject* _unused, PyObject* dummy)
2942 {
2943 Py_INCREF(Py_True);
2944 return Py_True;
2945 }
2946
2947 PyObject *
2948 CudaNdarray_select_a_gpu(PyObject* _unused, PyObject* dummy)
2949 {
2950 void * rval = NULL;
2951 cudaError_t err;
2952 int num_gpus = 0;
2953
2954 err = cudaGetDeviceCount(&num_gpus);
2955 if (cudaSuccess != err){
2956         printf("ERR!\n");
2957 PyErr_Format(PyExc_RuntimeError,
2958 "Not able to get number of GPUs (%s).",
2959 cudaGetErrorString(err));
2960 return NULL;
2961 }
2962
2963 for (int device = 0; device < num_gpus; device++) {
2964 cudaSetDevice(device);
2965 err = cudaDeviceSynchronize(); // << CUDA context gets created here.
2966 cudaGetLastError(); // reset the error state
2967 if (cudaSuccess == err)
2968 break;
2969 }
2970
2971 if (cudaSuccess != err){
2972         printf("ERR!\n");
2973 PyErr_Format(PyExc_RuntimeError,
2974 "Not able to select available GPU from %d cards (%s).",
2975 num_gpus, cudaGetErrorString(err));
2976 return NULL;
2977 }
2978
2979 Py_INCREF(Py_None);
2980 return Py_None;
2981 }
2982
2983 #if COMPUTE_GPU_MEM_USED
2984 /*
2985  * Return the size in bytes that Theano currently has allocated on the gpu.
2986 */
2987 PyObject *
2988 GetTheanoAllocInfo(PyObject* _unused, PyObject* dummy)
2989 {
2990 PyObject* a = PyLong_FromLong(_allocated_size);
2991 PyObject* b = PyLong_FromLong(_max_allocated_size);
2992
2993 PyObject* tuple = PyTuple_New(2);
2994 PyTuple_SetItem(tuple, 0, a);
2995 PyTuple_SetItem(tuple, 1, b);
2996 return tuple;
2997 }
2998 #endif
2999
3000 static PyGetSetDef CudaNdarray_getset[] = {
3001 {"shape",
3002 (getter)CudaNdarray_get_shape,
3003 (setter)CudaNdarray_set_shape,
3004 "shape of this ndarray (tuple)",
3005 NULL},
3006 {"_strides",
3007 (getter)CudaNdarray_get_strides,
3008 (setter)CudaNdarray_set_strides,
3009 "data pointer strides (in elements)",
3010 NULL},
3011 {"strides",
3012 (getter)CudaNdarray_get_strides,
3013 (setter)CudaNdarray_set_strides,
3014 "data pointer strides (in elements)",
3015 NULL},
3016     //gpudata is needed to allow calling pycuda functions with CudaNdarray input.
3017 {"gpudata",
3018 (getter)CudaNdarray_get_dev_data,
3019 NULL,
3020 "device data pointer",
3021 NULL},
3022 {"_dev_data",
3023 (getter)CudaNdarray_get_dev_data,
3024 (setter)CudaNdarray_set_dev_data,
3025 "device data pointer",
3026 NULL},
3027 {"dtype",
3028 (getter)CudaNdarray_get_dtype,
3029 NULL,
3030 "The dtype of the element. Now always float32",
3031 NULL},
3032 {"size",
3033 (getter)CudaNdarray_SIZE_Object,
3034 NULL,
3035 "The number of elements in this object.",
3036 NULL},
3037     //mem_size is needed for pycuda.elementwise.ElementwiseKernel. Why do they use size and mem_size with the same value?
3038 {"mem_size",
3039 (getter)CudaNdarray_SIZE_Object,
3040 NULL,
3041 "The number of elements in this object.",
3042 NULL},
3043 {"ndim",
3044 (getter)CudaNdarray_get_ndim,
3045 NULL,
3046 "The number of dimensions in this object.",
3047 NULL},
3048 {"base",
3049 (getter)CudaNdarray_get_base,
3050 NULL,
3051 "If this ndarray is a view, base is the original ndarray.",
3052 NULL},
3053
3054 {NULL, NULL, NULL, NULL} /* Sentinel */
3055 };
3056
3057 PyObject *CudaNdarray_repr(PyObject *self)
3058 {
3059 CudaNdarray *object = (CudaNdarray *)self;
3060 PyObject * np_object = CudaNdarray_CreateArrayObj(object);
3061 PyObject * str = PyObject_Str((PyObject *) np_object);
3062 char * cstr = PyString_AsString(str);
3063 PyObject * out = PyString_FromFormat("%s%s%s",
3064 "CudaNdarray(",
3065 cstr,
3066 ")");
3067 Py_DECREF(str);
3068 Py_DECREF(np_object);
3069 #if PY_MAJOR_VERSION >= 3
3070 // In Python 3 PyString_FromFormat return a Bytes object
3071 PyObject* out2 = PyObject_Str(out);
3072 Py_DECREF(out);
3073 return out2;
3074 #endif
3075 return out;
3076 }
3077
3078 static PyTypeObject CudaNdarrayType =
3079 {
3080 #if PY_MAJOR_VERSION >= 3
3081 PyVarObject_HEAD_INIT(NULL, 0)
3082 #else
3083 PyObject_HEAD_INIT(NULL)
3084 0, /*ob_size*/
3085 #endif
3086 "CudaNdarray", /*tp_name*/
3087 sizeof(CudaNdarray), /*tp_basicsize*/
3088 0, /*tp_itemsize*/
3089 (destructor)CudaNdarray_dealloc, /*tp_dealloc*/
3090 0, /*tp_print*/
3091 0, /*tp_getattr*/
3092 0, /*tp_setattr*/
3093 0, /*tp_compare*/
3094 CudaNdarray_repr, /*tp_repr*/
3095 &CudaNdarrayNumberMethods, /*tp_as_number*/
3096 0, /*tp_as_sequence*/
3097 &CudaNdarrayMappingMethods,/*tp_as_mapping*/
3098 0, /*tp_hash */
3099 0, /*tp_call*/
3100 0, /*tp_str*/
3101 0, /*tp_getattro*/
3102 0, /*tp_setattro*/
3103 0, /*tp_as_buffer*/
3104 #if PY_MAJOR_VERSION >= 3
3105 // Py_TPFLAGS_CHECKTYPES is always true and was removed in Python 3.
3106 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/
3107 #else
3108 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_CHECKTYPES, /*tp_flags*/
3109 #endif
3110 "CudaNdarray objects", /* tp_doc */
3111 0, /* tp_traverse */
3112 0, /* tp_clear */
3113 0, /* tp_richcompare */
3114 0, /* tp_weaklistoffset */
3115 0, /* tp_iter */
3116 0, /* tp_iternext */
3117 CudaNdarray_methods, /* tp_methods */
3118 CudaNdarray_members, /* tp_members */
3119 CudaNdarray_getset, /* tp_getset */
3120 0, /* tp_base */
3121 0, /* tp_dict */
3122 0, /* tp_descr_get */
3123 0, /* tp_descr_set */
3124 0, /* tp_dictoffset */
3125 (initproc)CudaNdarray_init,/* tp_init */
3126 0, /* tp_alloc */
3127 CudaNdarray_new, /* tp_new */
3128 };
3129
3130 static __global__ void get_gpu_ptr_size(int* dst)
3131 {
3132 dst[0] = sizeof(float*);
3133 dst[1] = sizeof(int);
3134 }
3135
3136 PyObject *
3137 CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
3138 {
3139 int *gpu_data = (int*)device_malloc(sizeof(int)*2);
3140 if(gpu_data == NULL){
3141 return NULL;
3142 }
3143 get_gpu_ptr_size<<<1,1>>>(gpu_data);
3144
3145 cudaError_t cudaErr = cudaGetLastError();
3146 if (cudaSuccess != cudaErr){
3147
3148 device_free(gpu_data);
3149 return PyErr_Format(PyExc_RuntimeError,
3150 "CudaNdarray_ptr_int_size: error when calling the gpu code. (%s)",
3151 cudaGetErrorString(cudaErr));
3152 }
3153
3154 // Transfer the result to cpu
3155 int gpu_sizes[] = {-1,-1};
3156 cublasStatus_t err;
3157 err = cublasGetVector(2, sizeof(int), gpu_data, 1, gpu_sizes, 1);
3158 device_free(gpu_data);
3159
3160 if (CUBLAS_STATUS_SUCCESS != err){
3161         PyErr_SetString(PyExc_RuntimeError, "error copying data from device to host memory");
3162 return NULL;
3163 }
3164 return Py_BuildValue("iiii", (int) gpu_sizes[0], (int)sizeof(float*),
3165 (int)sizeof(int), (int) gpu_sizes[1]);
3166 }
3167
3168 static int cublas_init();
3169 static void cublas_shutdown();
3170 // Initialize the gpu.
3171 // Takes two optional parameters: the device number and whether to use cnmem.
3172 // If the device number is provided, it sets that device to be the active device.
3173 // If not provided (usually just to test whether the gpu is available at all),
3174 // it does not set an active device.
3175 // Raises EnvironmentError or ValueError (as appropriate) if the initialization failed.
3176 // cnmem is treated like a bool: if it converts to 0, cnmem is not used; otherwise it is.
3177 PyObject *
3178 CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
3179 {
3180 int card_nb = 0;
3181 int card_number_provided = 1;
3182 float cnmem = 0; // Theano flag lib.cnmem
3183 // if we're given something wildly invalid, this will throw a TypeError
3184 if(!PyArg_ParseTuple(args, "|if", &card_nb, &cnmem))
3185 return NULL;
3186 if(cnmem)
3187 g_use_cnmem = true;
3188
3189 if(PyTuple_Size(args) == 0) {
3190 card_number_provided = 0;
3191 card_nb = 0;
3192 }
3193
3194 int deviceCount;
3195 cudaError err = cudaGetDeviceCount(&deviceCount);
3196 if(cudaSuccess != err) {
3197 return PyErr_Format(PyExc_EnvironmentError,
3198 "Unable to get the number of gpus available: %s",
3199 cudaGetErrorString(cudaGetLastError()));
3200 }
3201
3202 // as soon as the first successful call to a cuda* function is made, a
3203 // gpu context has been created
3204 g_gpu_context_active = 1;
3205
3206 if(deviceCount <= 0) {
3207 return PyErr_Format(PyExc_EnvironmentError,
3208 "Can't use the GPU, no devices support CUDA");
3209 }
3210 if(card_number_provided && (card_nb < 0 || card_nb > (deviceCount - 1))) {
3211 return PyErr_Format(PyExc_ValueError,
3212 "Bad device number %d. Only %d devices available.",
3213 card_nb,
3214 deviceCount);
3215 }
3216
3217 cudaDeviceProp deviceProp;
3218 err = cudaGetDeviceProperties(&deviceProp, card_nb);
3219 if(cudaSuccess != err) {
3220 return PyErr_Format(PyExc_EnvironmentError,
3221 "Unable to get properties of gpu %i: %s",
3222 card_nb,
3223 cudaGetErrorString(cudaGetLastError()));
3224 }
3225
3226 if(deviceProp.major == 9999 && deviceProp.minor == 9999 ){
3227 return PyErr_Format(PyExc_EnvironmentError,
3228 "There is no device that supports CUDA");
3229 }
3230
3231 if(card_number_provided) {
3232 err = cudaSetDevice(card_nb);
3233 if(cudaSuccess != err) {
3234 return PyErr_Format(PyExc_EnvironmentError,
3235 "Unable to set device %i: %s",
3236 card_nb,
3237 cudaGetErrorString(cudaGetLastError()));
3238 }
3239 if (cublas_init() == -1)
3240 return NULL;
3241 }
3242 if(card_number_provided && g_use_cnmem) {
3243 size_t mem = 0;
3244 if (cnmem > 1)
3245 mem = cnmem * 1024 * 1024;
3246 else{
3247             // Clip to 95% to leave memory for the driver.
3248             // 98% didn't work in some cases.
3249 if (cnmem > .95){
3250 cnmem = .95;
3251 }
3252 size_t free = 0, total = 0;
3253 cudaError_t err = cudaMemGetInfo(&free, &total);
3254 if (err != cudaSuccess){
3255 // Clear the error flag, cudaMemGetInfo doesn't do it.
3256                 // Currently this returns the same thing as err, but if in the future
3257                 // it returns something else, I still don't see why we should ignore
3258 // it. All we want to do here is reset the flag.
3259 cudaGetLastError();
3260 PyErr_Format(PyExc_RuntimeError,
3261 "Error while getting memory info about the gpu: %s",
3262 cudaGetErrorString(err));
3263 return NULL;
3264 }
3265 mem = total * cnmem;
3266 }
3267 if(initCnmem(card_number_provided, card_nb, mem) == -1){
3268 return NULL;
3269 }
3270 }
3271
3272 Py_INCREF(Py_None);
3273 return Py_None;
3274 }
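
The cnmem argument above is interpreted two ways: a value above 1 is a pool size in megabytes (mem = cnmem * 1024 * 1024), while a value in (0, 1] is a fraction of total device memory, clipped to 0.95. A sketch, assuming the built module imports as cuda_ndarray:

    import cuda_ndarray                  # hypothetical import of the built module

    cuda_ndarray.gpu_init(0)             # device 0, no cnmem pool
    # cuda_ndarray.gpu_init(0, 2048)     # a 2048 MB cnmem pool
    # cuda_ndarray.gpu_init(0, 0.5)      # a pool of 50% of total device memory
    # cuda_ndarray.gpu_init(0, 0.98)     # clipped to 95% before sizing the pool
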
3275
3276 PyObject *
3277 CudaNdarray_active_device_number(PyObject* _unused, PyObject* _unused_args) {
3278 // NB: No cuda error checking here; keeps things simple, and it's not
3279 // really necessary.
3280 int currentDevice;
3281 cudaGetDevice(&currentDevice);
3282 return PyInt_FromLong(currentDevice);
3283 }
3284
3285 PyObject *
3286 CudaNdarray_active_device_name(PyObject* _unused, PyObject* _unused_args) {
3287 // NB: No cuda error checking here; keeps things simple, and it's not
3288 // really necessary.
3289 int currentDevice;
3290 cudaGetDevice(&currentDevice);
3291
3292 cudaDeviceProp deviceProp;
3293 cudaGetDeviceProperties(&deviceProp, currentDevice);
3294 return PyString_FromString(deviceProp.name);
3295 }
3296
3297 PyObject *
3298 CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) {
3299 // Don't handle errors here
3300 cublas_shutdown();
3301 g_gpu_context_active = 0; // context has now been closed down
3302 if(g_use_cnmem) {
3303 cnmemStatus_t status = cnmemFinalize();
3304 if(status != CNMEM_STATUS_SUCCESS) {
3305 fprintf(stderr, "CudaNdarray_gpu_shutdown: cnmemFinalize failed! Reason=%s\n",
3306 cnmemGetErrorString(status));
3307 if(status == CNMEM_STATUS_CUDA_ERROR) {
3308 fprintf(stderr, " Cuda-Reason=%s\n",
3309 cudaGetErrorString(cudaGetLastError()));
3310 }
3311 }
3312 }
3313 cudaThreadExit();
3314
3315 Py_INCREF(Py_None);
3316 return Py_None;
3317 }
3318
3319 /*
3320 * This function is tested in theano/misc/test_pycuda_theano_simple.py
3321 */
3322 PyObject *
3323 CudaNdarray_from_gpu_pointer(PyObject* _unused, PyObject* args)
3324 {
3325 int verbose = 0;
3326 PyObject *gpu_ptr = NULL;
3327 PyObject *shapes = NULL;
3328 PyObject *strides = NULL;
3329 PyObject *base = NULL;
3330 PyObject *rval = NULL;
3331
3332     //args should consist of 4 python objects:
3333     //the first is the gpu ptr,
3334     //the second is the shape, the third is the strides,
3335     //and the fourth is the base that owns the memory
3336 if (! PyArg_ParseTuple(args, "OOOO", &gpu_ptr, &shapes, &strides, &base))
3337 return NULL;
3338
3339 if (verbose) printf("In CudaNdarray_from_gpu_pointer\n");
3340 if (!PyLong_Check(gpu_ptr))
3341 {
3342         PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: The gpu pointer is not a long");
3343 return NULL;
3344 }
3345
3346 Py_ssize_t nd = PyObject_Length(shapes);
3347 if (nd < 0)
3348 {
3349 PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: Couldn't get length of second argument");
3350 return NULL;
3351 }
3352 Py_ssize_t nd_stride = PyObject_Length(strides);
3353 if (nd_stride < 0)
3354 {
3355 PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: Couldn't get length of third argument");
3356 return NULL;
3357 }
3358
3359 if (nd != nd_stride)
3360 {
3361 PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: We need the same number of shapes and strides");
3362 return NULL;
3363 }
3364
3365 rval = CudaNdarray_New();
3366
3367 if (CudaNdarray_set_nd((CudaNdarray *)rval, nd))
3368 {
3369 //CudaNdarray_set_nd set the error msg
3370 return NULL;
3371 }
3372     // set gpu pointer
3373 assert(((CudaNdarray *)rval)->data_allocated == 0);
3374 if (CudaNdarray_set_device_data((CudaNdarray *)rval, (float *)PyInt_AsLong(gpu_ptr), base))
3375 {
3376         PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: Error while setting the gpu pointer");
3377 return NULL;
3378
3379 }
3380
3381 // Set dims and strides
3382 for (int i = nd-1; i >= 0; --i)
3383 {
3384 PyObject * idx = PyLong_FromLong(i);
3385 if (idx == NULL)
3386 {
3387 PyErr_SetString(PyExc_Exception, "CudaNdarray_from_gpu_pointer: Couldn't make long object to loop over list/tuple");
3388 return NULL;
3389 }
3390 PyObject* dim_ = PyObject_GetItem(shapes, idx);
3391 PyObject* strd_ = PyObject_GetItem(strides, idx);
3392 if (!PyInt_Check(dim_))
3393 {
3394 PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: shapes[%d] is not an int", i);
3395 return NULL;
3396 }
3397 if (!PyInt_Check(strd_))
3398 {
3399 PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: strides[%d] is not an int", i);
3400 return NULL;
3401 }
3402 int dim = PyInt_AsLong(dim_);
3403 int strd = PyInt_AsLong(strd_);
3404 CudaNdarray_set_stride((CudaNdarray *)rval, i, strd);
3405 CudaNdarray_set_dim((CudaNdarray *)rval, i, dim);
3406 Py_DECREF(idx);
3407 Py_DECREF(dim_);
3408 Py_DECREF(strd_);
3409 }
3410 if (verbose) printf("CudaNdarray_from_gpu_pointer normal return\n");
3411 return rval;
3412 }
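
A sketch of wrapping externally allocated device memory. The pycuda calls are an assumption suggested by the test file named above, and the shape/strides are hypothetical:

    import pycuda.autoinit               # creates a CUDA context
    import pycuda.driver as drv
    import cuda_ndarray                  # hypothetical import of the built module

    buf = drv.mem_alloc(2 * 3 * 4)       # room for a 2x3 float32 array
    ptr = long(int(buf))                 # a PyLong is required (a plain int on Python 3)
    a = cuda_ndarray.from_gpu_pointer(ptr, (2, 3), (3, 1), buf)
    # strides are in elements; 'buf' becomes a.base, keeping the allocation alive
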
3413
3414 PyObject *
3415 CudaNdarray_Dot(PyObject* _unused, PyObject* args)
3416 {
3417 PyObject *l=NULL;
3418 PyObject *r=NULL;
3419 PyObject * rval = NULL;
3420
3421 //args should consist of two python objects ("OO")
3422 if (! PyArg_ParseTuple(args, "OO", &l, &r))
3423 return NULL;
3424
3425 if (!CudaNdarray_Check(l) || !CudaNdarray_Check(r))
3426 {
3427 PyErr_SetString(PyExc_TypeError, "CudaNdarray arguments required ");
3428 goto CudaNdarray_dot_fail;
3429 }
3430 if (((CudaNdarray*)l)->nd != 2)
3431 {
3432 PyErr_SetString(PyExc_TypeError, "need 2d CudaNdarray arg for now");
3433 goto CudaNdarray_dot_fail;
3434 }
3435 if (((CudaNdarray*)r)->nd != 2)
3436 {
3437 PyErr_SetString(PyExc_TypeError, "need 2d CudaNdarray arg for now");
3438 goto CudaNdarray_dot_fail;
3439 }
3440 rval = CudaNdarray_New();
3441 if (!rval)
3442 {
3443 goto CudaNdarray_dot_fail;
3444 }
3445 int dims[2];
3446 dims[0] = CudaNdarray_HOST_DIMS((CudaNdarray*)l)[0];
3447 dims[1] = CudaNdarray_HOST_DIMS((CudaNdarray*)r)[1];
3448 if (CudaNdarray_alloc_contiguous((CudaNdarray*)rval, 2, dims))
3449 {
3450 goto CudaNdarray_dot_fail;
3451 }
3452 if (CudaNdarray_gemm(1.0, (CudaNdarray*)l, (CudaNdarray*)r, 0.0, (CudaNdarray*)rval))
3453 {
3454 goto CudaNdarray_dot_fail;
3455 }
3456
3457 return rval;
3458
3459 CudaNdarray_dot_fail:
3460 Py_XDECREF(rval);
3461 return NULL;
3462 }
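
A sketch of the 2d-only product under the same hypothetical setup:

    import numpy
    import cuda_ndarray                  # hypothetical import of the built module

    cuda_ndarray.gpu_init(0)
    a = cuda_ndarray.filter(numpy.ones((2, 3), dtype='float32'), (False, False), 0, None)
    b = cuda_ndarray.filter(numpy.ones((3, 4), dtype='float32'), (False, False), 0, None)
    c = cuda_ndarray.dot(a, b)           # shape (2, 4), computed through CudaNdarray_gemm
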
3463
3464 static PyObject *
3465 filter(PyObject* __unused_self, PyObject *args) // args = (data, broadcastable, strict, storage)
3466 {
3467 /*
3468  * TODO: DOC what this function should do in the various cases.
3469 * What is 'strict' supposed to mean in the context of this function?
3470 * What do we do with input that could be interpreted as matching the broadcastable pattern in strict vs. non-strict cases?
3471 *
3472 */
3473 PyObject *py_data=NULL;
3474 PyArrayObject * data = NULL;
3475 int strict = 0;
3476 PyObject * broadcastable=NULL;
3477 PyObject * storage=NULL;
3478 CudaNdarray * rval=NULL;
3479
3480 //Python object references which are provided to the caller are borrowed references
3481 if (!PyArg_ParseTuple(args, "OOiO", &py_data, &broadcastable, &strict, &storage)) return NULL;
3482
3483 if (!PyTuple_Check(broadcastable)){
3484 PyErr_SetString(PyExc_TypeError, "broadcastable arg should be a tuple of int.");
3485 return NULL;
3486 }
3487 Py_INCREF(py_data);
3488 Py_INCREF(broadcastable);
3489
3490 CudaNdarray * cnda = (CudaNdarray*)py_data;
3491
3492 if (strict || CudaNdarray_Check(py_data))
3493 {
3494 //TODO: support non-strict "casting" from a vt to the broadcastable/type/size that we need.
3495 if (!CudaNdarray_Check(py_data))
3496 {
3497 Py_DECREF(py_data);
3498 Py_DECREF(broadcastable);
3499 PyErr_SetString(PyExc_TypeError, "strict mode requires CudaNdarray");
3500 return NULL;
3501 }
3502 if (cnda->nd != PyTuple_Size(broadcastable))
3503 {
3504 Py_DECREF(py_data);
3505 Py_DECREF(broadcastable);
3506 PyErr_Format(PyExc_TypeError, "Wrong rank: %i vs %li", cnda->nd, (long)PyTuple_Size(broadcastable));
3507 return NULL;
3508 }
3509 for (int i = 0; i < cnda->nd; ++i)
3510 {
3511 if ((CudaNdarray_HOST_DIMS(cnda)[i] > 1) && PyInt_AsLong(PyTuple_GetItem(broadcastable, Py_ssize_t(i))))
3512 {
3513 PyErr_Format(PyExc_TypeError, "Non-unit size in broadcastable vt dimension %i", i);
3514 Py_DECREF(py_data);
3515 Py_DECREF(broadcastable);
3516 return NULL;
3517 }else if (CudaNdarray_HOST_DIMS(cnda)[i] == 1 && CudaNdarray_HOST_STRIDES(cnda)[i] != 0){
3518                 PyErr_Format(PyExc_TypeError, "Non-zero stride (%d) on dimension %d of size 1",
3519 CudaNdarray_HOST_STRIDES(cnda)[i], i);
3520 Py_DECREF(py_data);
3521 Py_DECREF(broadcastable);
3522 return NULL;
3523 }
3524 }
3525 Py_DECREF(broadcastable);
3526 return py_data;
3527 }
3528 else
3529 {
3530 data = (PyArrayObject*)PyArray_FromObject(py_data, REAL_TYPENUM, PyTuple_Size(broadcastable), PyTuple_Size(broadcastable));
3531 if (!data)
3532 {
3533 //err message already defined
3534 Py_DECREF(py_data);
3535 Py_DECREF(broadcastable);
3536 return NULL;
3537 }
3538 for (int i = 0; i < PyArray_NDIM(data); ++i)
3539 {
3540 if ((PyArray_DIMS(data)[i] > 1) && PyInt_AsLong(PyTuple_GetItem(broadcastable, Py_ssize_t(i))))
3541 {
3542 PyErr_Format(PyExc_TypeError, "Non-unit size in broadcastable dimension %i", i);
3543 Py_DECREF(data);
3544 Py_DECREF(py_data);
3545 Py_DECREF(broadcastable);
3546 return NULL;
3547 }
3548 }
3549 if (storage && CudaNdarray_Check(storage))
3550 {
3551 rval = (CudaNdarray*) storage;
3552 Py_INCREF(rval);
3553 }
3554 else
3555 {
3556 rval = (CudaNdarray*) CudaNdarray_New();
3557 }
3558 if (rval)
3559 {
3560 if (CudaNdarray_CopyFromArray(rval, data))
3561 {
3562 Py_DECREF(rval);
3563 rval = NULL;
3564 }
3565 }
3566 Py_DECREF(data);
3567 Py_DECREF(py_data);
3568 Py_DECREF(broadcastable);
3569 return (PyObject*)rval;
3570 }
3571 }
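
A sketch of the two modes of filter() under the same hypothetical setup (arguments: data, broadcastable tuple, strict flag, optional storage):

    import numpy
    import cuda_ndarray                  # hypothetical import of the built module

    cuda_ndarray.gpu_init(0)
    x = cuda_ndarray.filter(numpy.zeros((1, 5)),         # float64 input
                            (True, False), 0, None)      # non-strict: cast to float32
    # strict mode refuses anything that is not already a CudaNdarray:
    # cuda_ndarray.filter(numpy.zeros((1, 5)), (True, False), 1, None) -> TypeError
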
3572
3573 //TODO-- CudaNdarray_Dot and CudaNdarray_active_device_name follow different capitalization conventions.
3574 // Pick one and standardize it; this file is already annoying enough to grep through.
3575 static PyMethodDef module_methods[] = {
3576 {"dimshuffle", CudaNdarray_Dimshuffle, METH_VARARGS, "Returns the dimshuffle of a CudaNdarray."},
3577 {"dot", CudaNdarray_Dot, METH_VARARGS, "Returns the matrix product of two CudaNdarray arguments."},
3578 {"gpu_init", CudaNdarray_gpu_init, METH_VARARGS, "Select the gpu card to use; also usable to test whether CUDA is available."},
3579 {"select_a_gpu", CudaNdarray_select_a_gpu, METH_NOARGS, "Call this method if you want to select a GPU before gpu_init call and let the driver choose the GPU."},
3580 {"active_device_name", CudaNdarray_active_device_name, METH_VARARGS, "Get the name of the active device."},
3581 {"active_device_number", CudaNdarray_active_device_number, METH_VARARGS, "Get the number of the active device."},
3582 {"gpu_shutdown", CudaNdarray_gpu_shutdown, METH_VARARGS, "Shut down the gpu."},
3583 {"device_properties", GetDeviceProperties, METH_VARARGS, "Return a dictionary with the device properties."},
3584 {"mem_info", GetDeviceMemInfo, METH_NOARGS, "Return a tuple with the free and total memory on the gpu in bytes."},
3585 #if COMPUTE_GPU_MEM_USED
3586 {"theano_allocated", GetTheanoAllocInfo, METH_NOARGS, "Return the size in bytes of memory Theano currently have allocated on the gpu."},
3587 #endif
3588 {"ptr_int_size", CudaNdarray_ptr_int_size, METH_VARARGS, "Return a tuple with the size of gpu pointer, cpu pointer and int in bytes."},
3589 {"filter", filter, METH_VARARGS, "filter(obj, broadcastable, strict, storage) returns a CudaNdarray initialized to obj if it matches the constraints of broadcastable. strict=True prevents any numeric casting. If storage is a CudaNdarray it may be overwritten and used as the return value."},
3590 {"outstanding_mallocs", outstanding_mallocs, METH_VARARGS, "how many more mallocs have been called than free's"},
3591 {"from_gpu_pointer", CudaNdarray_from_gpu_pointer, METH_VARARGS, "Used to create a CudaNdarray from already allocated memory on the gpu.(example by pycuda)"},
3592 {"synchronize", CudaNdarray_synchronize, METH_NOARGS, "Used to synchronize the device"},
3593 {"cublas_v2", CudaNdarray_cublasv2, METH_NOARGS,
3594 "Used to know if this version of cuda_ndarray is linked with cublas v2."},
3595 {NULL, NULL, NULL, NULL} /* Sentinel */
3596 };
3597
3598 #define CNDA_MOD_NAME "cuda_ndarray"
3599 #define CNDA_DOCSTRING "CUDA implementation of a numpy ndarray-like object."
3600
3601 #if PY_MAJOR_VERSION == 3
3602 static struct PyModuleDef cuda_ndarray_moduledef =
3603 {
3604 PyModuleDef_HEAD_INIT,
3605 CNDA_MOD_NAME,
3606 CNDA_DOCSTRING,
3607 -1, /* size of per-interpreter state of the module,
3608 or -1 if the module keeps state in global variables. */
3609 module_methods
3610 };
3611
3612 PyMODINIT_FUNC
3613 PyInit_cuda_ndarray(void)
3614 #else
3615 PyMODINIT_FUNC
3616 initcuda_ndarray(void)
3617 #endif
3618 {
3619 import_array();
3620
3621 PyObject* m;
3622
3623 if (PyType_Ready(&CudaNdarrayType) < 0) {
3624 #if PY_MAJOR_VERSION == 3
3625 return NULL;
3626 #else
3627 return;
3628 #endif
3629 }
3630
3631 #if PY_MAJOR_VERSION == 3
3632 m = PyModule_Create(&cuda_ndarray_moduledef);
3633 #else
3634 m = Py_InitModule3(CNDA_MOD_NAME, module_methods, CNDA_DOCSTRING);
3635 #endif
3636
3637 if (m == NULL) {
3638 #if PY_MAJOR_VERSION == 3
3639 return NULL;
3640 #else
3641 return;
3642 #endif
3643 }
3644
3645 Py_INCREF(&CudaNdarrayType);
3646 PyModule_AddObject(m, "CudaNdarray", (PyObject *)&CudaNdarrayType);
3647 #if COMPUTE_GPU_MEM_USED
3648 for(int i=0;i<TABLE_SIZE;i++){
3649 _alloc_size_table[i].ptr=NULL;
3650 _alloc_size_table[i].size=0;
3651 }
3652 #endif
3653 // cublasInit();
3654 //if (0&&CUBLAS_STATUS_SUCCESS != cublasGetError())
3655 //{
3656 //std::cerr << "WARNING: initcuda_ndarray: error initializing device\n";
3657 //}
3658 if (0) //TODO: is this necessary?
3659 {
3660 int deviceId = 0; // TODO: what number goes here?
3661 cudaSetDevice(deviceId);
3662 cudaError_t err = cudaGetLastError();
3663 if( cudaSuccess != err)
3664 {
3665 std::cerr << "Error in SetDevice:" << cudaGetErrorString(err) << "\n";
3666 }
3667 }
3668
3669 #if PY_MAJOR_VERSION == 3
3670 return m;
3671 #endif
3672 }
3673
3674
3675 //////////////////////////////////////
3676 //
3677 // C API FOR CudaNdarray
3678 //
3679 //////////////////////////////////////
3680
3681 int
3682 CudaNdarray_Check(const PyObject * ob)
3683 {
3684 //TODO: doesn't work with inheritance
3685 return CudaNdarray_CheckExact(ob);
3686 }
3687 int
3688 CudaNdarray_CheckExact(const PyObject * ob)
3689 {
3690 return ((Py_TYPE(ob) == &CudaNdarrayType) ? 1 : 0);
3691 }
3692
3693 PyObject *
3694 CudaNdarray_New(int nd)
3695 {
3696 CudaNdarray *self = (CudaNdarray *)CudaNdarrayType.tp_alloc(&CudaNdarrayType, 0);
3697 if (self == NULL)
3698 {
3699 PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_New failed to allocate self");
3700 return NULL;
3701 }
3702 CudaNdarray_null_init(self);
3703
3704 if (nd == 0)
3705 {
3706 self->nd = 0;
3707 }
3708 else if (nd > 0)
3709 {
3710 if (CudaNdarray_set_nd(self, nd))
3711 {
3712 Py_DECREF(self);
3713 return NULL;
3714 }
3715 }
3716 ++_outstanding_mallocs[1];
3717 return (PyObject *)self;
3718 }
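// Usage sketch (illustrative, not from the original source): CudaNdarray_New(2)
// returns a 2-d array whose dims/strides are still unset; callers typically
// follow up with CudaNdarray_alloc_contiguous() or CudaNdarray_set_device_data()
// before touching the data.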
3719
3720
3721
3722 //////////////////////////////
3723 //
3724 // Published helper functions
3725 //
3726 //////////////////////////////
3727
3728 static int
3729 cublas_init()
3730 {
3731 cublasStatus_t err;
3732 err = cublasCreate(&handle);
3733 if (CUBLAS_STATUS_SUCCESS != err)
3734 {
3735 if(CUBLAS_STATUS_NOT_INITIALIZED == err)
3736 PyErr_SetString(PyExc_RuntimeError,
3737 "cublasCreate() returned this error "
3738 "'the CUDA Runtime initialization failed'");
3739 else if(CUBLAS_STATUS_ALLOC_FAILED == err)
3740 PyErr_SetString(PyExc_RuntimeError,
3741 "cublasCreate() returned this error "
3742 "'the resources could not be allocated'");
3743 else
3744 PyErr_SetString(PyExc_RuntimeError,
3745 "unknow error during returned by cublasCreate()");
3746 return -1;
3747 }
3748 // Set the default stream as the one to execute on (default)
3749 cublasSetStream(handle, NULL);
3750 // Pointer to scalars are on the host (also default)
3751 cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
3752 #if CUDA_VERSION >= 5000
3753 // atomics can be used in kernels to speed up operations (not default)
3754 // This may lead to a slight variance from run to run in some operations
3755 cublasSetAtomicsMode(handle, CUBLAS_ATOMICS_ALLOWED);
3756 #endif
3757 return 0;
3758 }
3759
3760 static void
3761 cublas_shutdown()
3762 {
3763 if (handle != NULL)
3764 cublasDestroy(handle);
3765 // No point in handling any errors here
3766 handle = NULL;
3767 }
3768
3769 int
3770 CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj)
3771 {
3772 int err = CudaNdarray_alloc_contiguous(self, PyArray_NDIM(obj),
3773 PyArray_DIMS(obj));
3774 if (err) {
3775 return err;
3776 }
3777
3778 int typenum = PyArray_TYPE(obj);
3779 if (typenum != REAL_TYPENUM)
3780 {
3781 PyErr_SetString(PyExc_TypeError, "can only copy from float arrays");
3782 return -1;
3783 }
3784 assert( 4 == PyArray_ITEMSIZE(obj));
3785 PyArrayObject * py_src = (PyArrayObject *)PyArray_ContiguousFromAny(
3786 (PyObject*)obj, typenum, self->nd, self->nd);
3787 if (!py_src) {
3788 return -1;
3789 }
3790 npy_intp py_src_size = PyArray_SIZE(py_src);
3791 void *py_src_data = PyArray_DATA(py_src);
3792 cudaError_t cerr;
3793 CNDA_BEGIN_ALLOW_THREADS;
3794 cerr = cudaMemcpy(self->devdata, py_src_data,
3795 py_src_size * sizeof(real),
3796 cudaMemcpyHostToDevice);
3797 //CNDA_THREAD_SYNC; // unneeded because cudaMemcpy is blocking anyway
3798 CNDA_END_ALLOW_THREADS;
3799 if (cudaSuccess != cerr)
3800 {
3801 PyErr_Format(PyExc_RuntimeError,
3802 "Cuda error '%s' while copying %lli data element"
3803 " to device memory",
3804 cudaGetErrorString(cerr),
3805 (long long)py_src_size);
3806 Py_DECREF(py_src);
3807 return -1;
3808 }
3809 Py_DECREF(py_src);
3810 return 0;
3811 }
3812
3813 PyObject *
3814 CudaNdarray_new_nd(int nd)
3815 {
3816 CudaNdarray * rval = (CudaNdarray*) CudaNdarray_New();
3817 if (!rval || CudaNdarray_set_nd(rval, nd))
3818 {
3819 Py_XDECREF(rval);
3820 rval = NULL;
3821 }
3822 return (PyObject *) rval;
3823 }
3824
3825
3826 /**
3827 * Initialize 'self' as a view of 'base', with memory storage 'data'
3828 */
3829
3830 int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base)
3831 {
3832 if (self->data_allocated)
3833 {
3834 assert(self->devdata);
3835 if (device_free(self->devdata))
3836 {
3837 self->devdata = NULL;
3838 self->data_allocated = 0;
3839 return -1;
3840 }
3841 }
3842 // Get the original base object (base.base.base...)
3843 PyObject * orig_base = base;
3844 // base is not always a CudaNdarray. It can be a GpuArray from pycuda, ...
3845 while (orig_base && CudaNdarray_Check(orig_base) && ((CudaNdarray*) orig_base)->base)
3846 {
3847 // base_base is itself a view
3848 orig_base = ((CudaNdarray*) orig_base)->base;
3849 }
3850 //N.B. XDECREF and XINCREF are no-ops for NULL pointers
3851 if (self->base != orig_base)
3852 {
3853 Py_XDECREF(self->base);
3854 self->base = orig_base;
3855 Py_XINCREF(self->base);
3856 }
3857 self->data_allocated = 0;
3858 self->devdata = data;
3859 return 0;
3860 }
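// Note: collapsing base.base.base... to the root owner means every view holds
// one reference to the object that actually owns the device memory, so a long
// chain of views does not keep intermediate views alive.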
3861
3862 static __global__ void k_copy_1d(const int N, const float * x, const int sx, float * y, const int sy)
3863 {
3864 for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < N; i += gridDim.x*blockDim.x)
3865 {
3866 y[i*sy] = x[i*sx];
3867 }
3868 }
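// k_copy_1d uses a grid-stride loop: each thread starts at element
// threadIdx.x + blockIdx.x*blockDim.x and advances by the total thread count
// (gridDim.x*blockDim.x), so the copy is correct for any N even when the
// launch uses fewer threads than elements.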
3869
3870 // N1 through N4 are the size of y
3871 static __global__ void k_copy_4d(const int N1,
3872 const int N2, const int N3, const int N4,
3873 const float * x, const int sx1, const int sx2, const int sx3,
3874 const int sx4, float * y, const int sy1, const int sy2,
3875 const int sy3, const int sy4)
3876 {
3877 // These must be made int instead of unsigned int due to a bug in nvcc
3878 int bx = blockIdx.x;
3879 int by = blockIdx.y;
3880
3881 for (int i = bx; i < N1; i += gridDim.x)
3882 {
3883 for (int j = by; j < N2; j += gridDim.y)
3884 {
3885 for (int k = threadIdx.x; k < N3; k += (int) blockDim.x)
3886 {
3887 for (int l = threadIdx.y; l < N4; l += (int) blockDim.y)
3888 {
3889 y[i * sy1 + j * sy2 + k * sy3 + l * sy4] =
3890 x[i * sx1 + j * sx2 + k * sx3 + l * sx4];
3891 }
3892 }
3893 }
3894 }
3895 }
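// Mapping used by k_copy_4d: the 2-d grid strides over the two outer axes
// (N1, N2) while the 2-d block strides over the two inner axes (N3, N4);
// every loop advances by the grid/block extent, so axes larger than the
// launch configuration are still fully covered.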
3896
3897 //copy from other into self
3898 int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self,
3899 const CudaNdarray * other,
3900 bool unbroadcast)
3901 {
3902 int verbose = 0;
3903 if (verbose>1) fprintf(stderr, "CudaNdarray_CopyFromCudaNdarray\n");
3904
3905 //standard elemwise size checks
3906 if (self->nd == -1)
3907 {
3908 PyErr_SetString(PyExc_TypeError,
3909 "can't copy into un-initialized CudaNdarray");
3910 return -1;
3911 }
3912 CudaNdarray * new_other = NULL;
3913
3914 if (self->nd < other->nd)
3915 {
3916 PyErr_Format(PyExc_NotImplementedError,
3917 "CudaNdarray_CopyFromCudaNdarray: The number of dimensions of the "
3918 "destination needs to be >= the number of dimensions of the "
3919 "source. Got %d and %d.", self->nd, other->nd);
3920 return -1;
3921 }
3922 else if (self->nd != other->nd)
3923 {
3924 new_other = (CudaNdarray *) CudaNdarray_View(other);
3925 int added_dims = self->nd - other->nd;
3926 int* pattern = (int*) alloca(self->nd * sizeof(int));
3927 for(int i = 0; i < added_dims; i++)
3928 pattern[i] = -1;
3929 for(int i = 0; i < other->nd; i++)
3930 pattern[i + added_dims] = i;
3931 CudaNdarray_dimshuffle(new_other, self->nd, pattern);
3932 other = new_other;
3933 }
3934 assert(self->nd == other->nd);
3935 //standard elemwise dim checks (also compute total size)
3936 unsigned int size = 1;
3937 unsigned int size_source = 1;
3938 for (int i = 0; i< self->nd; ++i)
3939 {
3940 if ((CudaNdarray_HOST_DIMS(self)[i] != CudaNdarray_HOST_DIMS(other)[i])
3941 && (1!=CudaNdarray_HOST_DIMS(other)[i] || !unbroadcast) )
3942 {
3943 PyErr_Format(PyExc_ValueError,
3944 "CudaNdarray_CopyFromCudaNdarray:"
3945 " need same dimensions for dim %d,"
3946 " destination=%d, source=%d",
3947 i, CudaNdarray_HOST_DIMS(self)[i],
3948 CudaNdarray_HOST_DIMS(other)[i]);
3949 Py_XDECREF(new_other);
3950 return -1;
3951 }
3952 size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
3953 size_source *= (unsigned int) CudaNdarray_HOST_DIMS(other)[i];
3954 }
3955 if (0 == size)
3956 {
3957 Py_XDECREF(new_other);
3958 return 0; //nothing to copy, we're done.
3959 }
3960 if (CudaNdarray_is_c_contiguous(self) &&
3961 CudaNdarray_is_c_contiguous(other) &&
3962 size == size_source)
3963 {
3964 if (verbose)
3965 fprintf(stderr, "Copying contiguous vector with cublasScopy\n");
3966
3967 cublasStatus_t err;
3968 err = cublasScopy(handle, size, CudaNdarray_DEV_DATA(other), 1,
3969 CudaNdarray_DEV_DATA(self), 1);
3970 CNDA_THREAD_SYNC;
3971 Py_XDECREF(new_other);
3972 if (CUBLAS_STATUS_SUCCESS != err)
3973 {
3974 PyErr_SetString(PyExc_RuntimeError, "Error copying memory");
3975 return -1;
3976 }
3977 return 0;
3978 }
3979 //TODO: rewrite these copy operations to be more efficient
3980 // See, for example the transpose example in the cuda_sdk.
3981 switch (self->nd)
3982 {
3983 case 0: // scalar
3984 {
3985 // THIS CASE SHOULD NEVER HAPPEN BECAUSE SCALARS ARE ALWAYS C CONTIGUOUS
3986 assert(0);
3987 }; break;
3988 case 1: // vector
3989 {
3990 if (verbose) fprintf(stderr, "Copying non-contiguous vector\n");
3991 if (verbose) fprint_CudaNdarray(stderr, other);
3992 unsigned int n_blocks = std::min(size,
3993 (unsigned int)NUM_VECTOR_OP_BLOCKS);
3994 unsigned int n_threads = std::min(ceil_intdiv(size, n_blocks),
3995 (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
3996 k_copy_1d<<<n_blocks, n_threads>>>(size,
3997 CudaNdarray_DEV_DATA(other),
3998 CudaNdarray_HOST_STRIDES(other)[0],
3999 CudaNdarray_DEV_DATA(self),
4000 CudaNdarray_HOST_STRIDES(self)[0]);
4001 CNDA_THREAD_SYNC;
4002 cudaError_t err = cudaGetLastError();
4003 if( cudaSuccess != err)
4004 {
4005 PyErr_Format(PyExc_RuntimeError,
4006 "Cuda error: %s: %s. (n_blocks=%i,"
4007 " n_threads_per_block=%i)\n", "k_copy_1d",
4008 cudaGetErrorString(err), n_blocks, n_threads);
4009 Py_XDECREF(new_other);
4010 return -1;
4011 }
4012 }; break;
4013 case 4: // 4-tensor
4014 {
4015 if (verbose)
4016 {
4017 if (0 != fprint_CudaNdarray(stderr, other))
4018 {
4019 Py_XDECREF(new_other);
4020 return -1;
4021 }
4022 }
4023
4024 // The blocks implement the looping over the first two axes so
4025 // this needs to be (N1, N2)
4026 dim3 n_blocks( std::min(CudaNdarray_HOST_DIMS(self)[0],
4027 NUM_VECTOR_OP_BLOCKS),
4028 std::min(CudaNdarray_HOST_DIMS(self)[1],
4029 NUM_VECTOR_OP_BLOCKS));
4030 // For the threads, just make as many as possible
4031 dim3 n_threads( std::min( (unsigned int) CudaNdarray_HOST_DIMS(self)[2],
4032 (unsigned int) NUM_VECTOR_OP_THREADS_PER_BLOCK),
4033 std::min( (unsigned int) CudaNdarray_HOST_DIMS(self)[3],
4034 (unsigned int) NUM_VECTOR_OP_THREADS_PER_BLOCK));
4035
4036 n_threads.x = std::min( (unsigned int) 32, (unsigned int) n_threads.x);
4037 n_threads.y = std::min( n_threads.y, NUM_VECTOR_OP_THREADS_PER_BLOCK / n_threads.x);
4038
4039 k_copy_4d<<<n_blocks, n_threads>>>(
4040 // size of y
4041 (unsigned int) CudaNdarray_HOST_DIMS(self)[0], // N1
4042 (unsigned int) CudaNdarray_HOST_DIMS(self)[1], // N2
4043 (unsigned int) CudaNdarray_HOST_DIMS(self)[2], // N3
4044 (unsigned int) CudaNdarray_HOST_DIMS(self)[3], // N4
4045 CudaNdarray_DEV_DATA(other), // x
4046 // x strides
4047 CudaNdarray_HOST_STRIDES(other)[0],
4048 CudaNdarray_HOST_STRIDES(other)[1],
4049 CudaNdarray_HOST_STRIDES(other)[2],
4050 CudaNdarray_HOST_STRIDES(other)[3],
4051 CudaNdarray_DEV_DATA(self), // y
4052 // y strides
4053 CudaNdarray_HOST_STRIDES(self)[0],
4054 CudaNdarray_HOST_STRIDES(self)[1],
4055 CudaNdarray_HOST_STRIDES(self)[2],
4056 CudaNdarray_HOST_STRIDES(self)[3]
4057 );
4058 CNDA_THREAD_SYNC;
4059 cudaError_t err = cudaGetLastError();
4060 if( cudaSuccess != err)
4061 {
4062 PyErr_Format(PyExc_RuntimeError,
4063 "Cuda error: %s: %s.",
4064 "k_copy_4d",
4065 cudaGetErrorString(err));
4066 Py_XDECREF(new_other);
4067 return -1;
4068 }
4069 }; break;
4070 default:
4071 {
4072 cudaError_t err = cudaGetLastError();
4073 if(cudaSuccess != err){
4074 PyErr_Format(PyExc_RuntimeError,
4075 "Unexpected Cuda error: %s: %s\n",
4076 "CudaNdarray_CopyFromCudaNdarray",
4077 cudaGetErrorString(err));
4078 Py_XDECREF(new_other);
4079 return -1;
4080 }
4081
4082 if (verbose)
4083 fprintf(stderr,
4084 "Copying with default version unbroadcast=%d\n",
4085 unbroadcast);
4086 // call worker routine
4087 unsigned int threads_per_block = std::min(size,
4088 (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
4089 unsigned int n_blocks = std::min(ceil_intdiv(size, threads_per_block),
4090 (unsigned int)NUM_VECTOR_OP_BLOCKS);
4091 const CudaNdarray * cuda_dims = other;
4092 if(unbroadcast)
4093 cuda_dims = self;
4094 //copy from other into self
4095 k_elemwise_unary_rowmajor_copy<<<n_blocks, threads_per_block>>>(
4096 size,
4097 (unsigned int)other->nd,
4098 (const int *)CudaNdarray_DEV_DIMS(cuda_dims),
4099 (const float*)CudaNdarray_DEV_DATA(other),
4100 (const int *)CudaNdarray_DEV_STRIDES(other),
4101 CudaNdarray_DEV_DATA(self),
4102 (const int *)CudaNdarray_DEV_STRIDES(self));
4103 CNDA_THREAD_SYNC;
4104 err = cudaGetLastError();
4105 if(verbose>1)
4106 fprintf(stderr,
4107 "INFO k_elemwise_unary_rowmaj (n_blocks=%i,"
4108 " n_threads_per_block=%i)\n",
4109 n_blocks, threads_per_block);
4110 if( cudaSuccess != err)
4111 {
4112 //fprint_CudaNdarray(stderr, self);
4113 //fprint_CudaNdarray(stderr, other);
4114 PyErr_Format(PyExc_RuntimeError,
4115 "Cuda error: %s: %s. (n_blocks=%i,"
4116 " n_threads_per_block=%i)\n",
4117 "k_elemwise_unary_rowmajor_copy",
4118 cudaGetErrorString(err), n_blocks,
4119 threads_per_block);
4120 Py_XDECREF(new_other);
4121 return -1;
4122 }
4123 }
4124 };
4125 Py_XDECREF(new_other);
4126 return 0;
4127 }
4128
4129 int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C)
4130 {
4131 if (A->nd != 2)
4132 {
4133 PyErr_SetString(PyExc_ValueError, "non-matrix arg A to gemm");
4134 return -1;
4135 }
4136 if (B->nd != 2)
4137 {
4138 PyErr_SetString(PyExc_ValueError, "non-matrix arg B to gemm");
4139 return -1;
4140 }
4141 if (C->nd != 2)
4142 {
4143 PyErr_SetString(PyExc_ValueError, "non-matrix arg C to gemm");
4144 return -1;
4145 }
4146
4147 // We must allow dimensions to be zeros.
4148 if ((CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(B)[0])
4149 || (CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(C)[0])
4150 || (CudaNdarray_HOST_DIMS(B)[1] != CudaNdarray_HOST_DIMS(C)[1]))
4151 {
4152 PyErr_Format(PyExc_ValueError, "dimension mismatch in args to gemm (%i,%i)x(%i,%i)->(%i,%i)",
4153 CudaNdarray_HOST_DIMS(A)[0],
4154 CudaNdarray_HOST_DIMS(A)[1],
4155 CudaNdarray_HOST_DIMS(B)[0],
4156 CudaNdarray_HOST_DIMS(B)[1],
4157 CudaNdarray_HOST_DIMS(C)[0],
4158 CudaNdarray_HOST_DIMS(C)[1]);
4159 return -1;
4160 }
4161
4162 // If matrix A or B has non-unit size and non-unit stride in both
4163 // dimensions, we can make a copy.
4164 CudaNdarray * A_new = NULL;
4165 CudaNdarray * B_new = NULL;
4166 if (((CudaNdarray_HOST_DIMS(A)[0] > 1)
4167 && (CudaNdarray_HOST_STRIDES(A)[0] != 1)
4168 && (CudaNdarray_HOST_DIMS(A)[1] > 1)
4169 && (CudaNdarray_HOST_STRIDES(A)[1] != 1))
4170 || (CudaNdarray_HOST_STRIDES(A)[0] < 0)
4171 || (CudaNdarray_HOST_STRIDES(A)[1] < 0))
4172 {
4173 A_new = (CudaNdarray*) CudaNdarray_Copy(A);
4174 if (!A_new)
4175 return -1;
4176 A = A_new;
4177 }
4178
4179 if (((CudaNdarray_HOST_DIMS(B)[0] > 1)
4180 && (CudaNdarray_HOST_STRIDES(B)[0] != 1)
4181 && (CudaNdarray_HOST_DIMS(B)[1] > 1)
4182 && (CudaNdarray_HOST_STRIDES(B)[1] != 1))
4183 || (CudaNdarray_HOST_STRIDES(B)[0] < 0)
4184 || (CudaNdarray_HOST_STRIDES(B)[1] < 0))
4185 {
4186 B_new = (CudaNdarray*) CudaNdarray_Copy(B);
4187 if (!B_new)
4188 {
4189             // If A_new is NULL (meaning A was not copied), Py_XDECREF is a no-op
4190 Py_XDECREF(A_new);
4191 return -1;
4192 }
4193 B = B_new;
4194 }
4195
4196 // If matrix C has non-unit size and non-unit stride in both
4197 // dimensions, or negative strides, we can't operate. We cannot copy
4198 // C either, because the calling code will expect the result to be
4199 // in the original C container.
4200 if (((CudaNdarray_HOST_DIMS(C)[0] > 1)
4201 && (CudaNdarray_HOST_STRIDES(C)[0] != 1)
4202 && (CudaNdarray_HOST_DIMS(C)[1] > 1)
4203 && (CudaNdarray_HOST_STRIDES(C)[1] != 1))
4204 || (CudaNdarray_HOST_STRIDES(C)[0] < 0)
4205 || (CudaNdarray_HOST_STRIDES(C)[1] < 0))
4206 {
4207 PyErr_Format(PyExc_AssertionError,
4208 "non-unit or negative stride in gemm arg C (%i,%i) of shape (%i,%i)",
4209 CudaNdarray_HOST_STRIDES(C)[0],
4210 CudaNdarray_HOST_STRIDES(C)[1],
4211 CudaNdarray_HOST_DIMS(C)[0],
4212 CudaNdarray_HOST_DIMS(C)[1]);
4213 Py_XDECREF(A_new);
4214 Py_XDECREF(B_new);
4215 return -1;
4216 }
4217
4218 // the unit integer is divided logically into three fields of 4 bits
4219 // the lowermost 4 bits encode the stride pattern of the output
4220 // the next higher 4 bits encode the B variable (or y)
4221 // the next higher 4 bits encode the C variable (or x)
4222 //
4223 // the stride pattern for each input is encoded as 0 for unit stride from col to col (Row major)
4224 // 1 for unit stride from row to row (Col major)
4225
4226 // a stride of 0 implies a dimension of 1 - so we can actually define
4227 // a stride of 0 as a 'unit' stride because gemm will never use it.
4228 // If a dimension is 0, its stride will not be used either, so we can
4229 // consider it a 'unit' stride too.
4230 int unit = 0;
4231 if (CudaNdarray_HOST_STRIDES(A)[1] == 1 || CudaNdarray_HOST_DIMS(A)[1] <= 1) {
4232 unit |= (0x0 << 8);
4233 } else if (CudaNdarray_HOST_STRIDES(A)[0] == 1 || CudaNdarray_HOST_DIMS(A)[0] <= 1) {
4234 unit |= (0x1 << 8);
4235 } else {
4236 unit |= (0x2 << 8);
4237 }
4238 if (CudaNdarray_HOST_STRIDES(B)[1] == 1 || CudaNdarray_HOST_DIMS(B)[1] <= 1) {
4239 unit |= (0x0 << 4);
4240 } else if (CudaNdarray_HOST_STRIDES(B)[0] == 1 || CudaNdarray_HOST_DIMS(B)[0] <= 1) {
4241 unit |= (0x1 << 4);
4242 } else {
4243 unit |= (0x2 << 4);
4244 }
4245 if (CudaNdarray_HOST_STRIDES(C)[1] == 1 || CudaNdarray_HOST_DIMS(C)[1] <= 1) {
4246 unit |= (0x0 << 0);
4247 } else if (CudaNdarray_HOST_STRIDES(C)[0] == 1 || CudaNdarray_HOST_DIMS(C)[0] <= 1) {
4248 unit |= (0x1 << 0);
4249 } else {
4250 unit |= (0x2 << 0);
4251 }
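// Worked example (illustrative, not part of the original source): if A, B and
// C are all C-contiguous (row-major), each operand has unit stride from column
// to column, so all three fields are 0x0 and unit == 0x000. If C alone were
// F-contiguous (unit stride from row to row), unit would be 0x001, selecting
// the case below that swaps the roles of A and B.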
4252
4253 /* create appropriate strides for malformed matrices that are row or column
4254 * vectors
4255 */
4256 int sa_0 = (CudaNdarray_HOST_DIMS(A)[0] > 1) ? CudaNdarray_HOST_STRIDES(A)[0] : CudaNdarray_HOST_DIMS(A)[1];
4257 int sa_1 = (CudaNdarray_HOST_DIMS(A)[1] > 1) ? CudaNdarray_HOST_STRIDES(A)[1] : CudaNdarray_HOST_DIMS(A)[0];
4258 int sb_0 = (CudaNdarray_HOST_DIMS(B)[0] > 1) ? CudaNdarray_HOST_STRIDES(B)[0] : CudaNdarray_HOST_DIMS(B)[1];
4259 int sb_1 = (CudaNdarray_HOST_DIMS(B)[1] > 1) ? CudaNdarray_HOST_STRIDES(B)[1] : CudaNdarray_HOST_DIMS(B)[0];
4260 int sc_0 = (CudaNdarray_HOST_DIMS(C)[0] > 1) ? CudaNdarray_HOST_STRIDES(C)[0] : CudaNdarray_HOST_DIMS(C)[1];
4261 int sc_1 = (CudaNdarray_HOST_DIMS(C)[1] > 1) ? CudaNdarray_HOST_STRIDES(C)[1] : CudaNdarray_HOST_DIMS(C)[0];
4262
4263 float* a = CudaNdarray_DEV_DATA(A);
4264 float* b = CudaNdarray_DEV_DATA(B);
4265 float* c = CudaNdarray_DEV_DATA(C);
4266 cublasOperation_t N = CUBLAS_OP_N;
4267 cublasOperation_t T = CUBLAS_OP_T;
4268 //std::cerr << (unit/256) MOD 16 << (unit / 16) MOD 16 << unit MOD 16<< '\\n';
4269 // There should be no negative stride at that point
4270 #define CHK_STRIDE_SGEMM(T0, T1, D0, D1, D2, a, x, sx, y, sy, b, z, sz) \
4271 if (sx == 0){sx = 1;}\
4272 if (sy == 0){sy = 1;}\
4273 if (sz == 0){sz = 1;}\
4274 if ((sx > 0) && (sy > 0) && (sz > 0)) { \
4275 err = cublasSgemm(handle, T0, T1, D0, D1, D2, &a, x, sx, y, sy, &b, z, sz); \
4276 } else { \
4277 PyErr_SetString(PyExc_AssertionError, "negative stride to sGemm");\
4278 Py_XDECREF(A_new);\
4279 Py_XDECREF(B_new);\
4280 return -1; \
4281 }
4282
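// Why the case table below swaps operands: cuBLAS is column-major, so a
// row-major product C = A*B is computed as the column-major product
// C^T = B^T * A^T. Hence the all-row-major case 0x000 passes b before a and
// the dimensions of C transposed, while the all-col-major case 0x111 passes
// a before b in the natural order.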
4283 cublasStatus_t err;
4284 switch(unit)
4285 {
4286 case 0x000: CHK_STRIDE_SGEMM(N, N, CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(A)[1], alpha, b, sb_0, a, sa_0, beta, c, sc_0); break;
4287 case 0x100: CHK_STRIDE_SGEMM(N, T, CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(A)[1], alpha, b, sb_0, a, sa_1, beta, c, sc_0); break;
4288 case 0x010: CHK_STRIDE_SGEMM(T, N, CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(A)[1], alpha, b, sb_1, a, sa_0, beta, c, sc_0); break;
4289 case 0x110: CHK_STRIDE_SGEMM(T, T, CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(A)[1], alpha, b, sb_1, a, sa_1, beta, c, sc_0); break;
4290 case 0x001: CHK_STRIDE_SGEMM(T, T, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_0, b, sb_0, beta, c, sc_1); break;
4291 case 0x101: CHK_STRIDE_SGEMM(N, T, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_1, b, sb_0, beta, c, sc_1); break;
4292 case 0x011: CHK_STRIDE_SGEMM(T, N, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_0, b, sb_1, beta, c, sc_1); break;
4293 case 0x111: CHK_STRIDE_SGEMM(N, N, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_1, b, sb_1, beta, c, sc_1); break;
4294 default: PyErr_Format(PyExc_ValueError, "some matrix has no unit stride (unit=%x)", unit);
4295 return -1;
4296 };
4297 CNDA_THREAD_SYNC;
4298 Py_XDECREF(A_new);
4299 Py_XDECREF(B_new);
4300
4301 if (CUBLAS_STATUS_SUCCESS != err)
4302 {
4303 PyErr_Format(PyExc_RuntimeError,
4304 "cublasSgemm failed (%i) %s\n"
4305 " unit=%x N=%d, c.dims=[%d %d], a.dim=[%d %d], alpha=%f, beta=%f, a=%p, b=%p, c=%p"
4306 " sa_0=%d, sa_1=%d, sb_0=%d, sb_1=%d, sc_0=%d, sc_1=%d",
4307 err, cublasGetErrorString(err),
4308 unit, N,
4309 CudaNdarray_HOST_DIMS(C)[0],
4310 CudaNdarray_HOST_DIMS(C)[1],
4311 CudaNdarray_HOST_DIMS(A)[0], CudaNdarray_HOST_DIMS(A)[1],
4312 alpha, beta, a, b, c, sa_0, sa_1, sb_0, sb_1, sc_0, sc_1);
4313
4314 return -1;
4315 }
4316 return 0;
4317 }
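// Usage sketch (illustrative): with A of shape (m, k), B of shape (k, n) and a
// preallocated C of shape (m, n), CudaNdarray_gemm(1.0f, A, B, 0.0f, C)
// overwrites C with the product A.B; passing beta = 1.0f would instead
// accumulate the product into the existing contents of C.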
4318
4319 int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C)
4320 {
4321 /**
4322 * C <- alpha A B + beta C
4323 * A : matrix
4324 * B, C: vector
4325 * alpha, beta: scalars
4326 */
4327 if (A->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg to gemv"); return -1; }
4328 if (B->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg to gemv"); return -1; }
4329 if (C->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg to gemv"); return -1; }
4330
4331 // We must allow dimensions to be zeros.
4332 if ((CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(B)[0])
4333 || (CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(C)[0]))
4334 {
4335 PyErr_Format(PyExc_ValueError, "dimension mismatch in args to gemv (%i,%i)x(%i)->(%i)",
4336 CudaNdarray_HOST_DIMS(A)[0],
4337 CudaNdarray_HOST_DIMS(A)[1],
4338 CudaNdarray_HOST_DIMS(B)[0],
4339 CudaNdarray_HOST_DIMS(C)[0]);
4340 return -1;
4341 }
4342
4343 // If matrix A has non-unit size and non-unit stride in both
4344 // dimensions, or negative strides, we cannot operate, but we can
4345 // make a copy.
4346 CudaNdarray * A_new = NULL;
4347 CudaNdarray * B_new = NULL;
4348 if (((CudaNdarray_HOST_DIMS(A)[0] > 1)
4349 && (CudaNdarray_HOST_STRIDES(A)[0] != 1)
4350 && (CudaNdarray_HOST_DIMS(A)[1] > 1)
4351 && (CudaNdarray_HOST_STRIDES(A)[1] != 1))
4352 || (CudaNdarray_HOST_STRIDES(A)[0] < 0)
4353 || (CudaNdarray_HOST_STRIDES(A)[1] < 0))
4354 {
4355 A_new = (CudaNdarray*) CudaNdarray_Copy(A);
4356 if (!A_new)
4357 return -1;
4358 A = A_new;
4359 }
4360
4361     // If vector B has a negative stride, we also have to make a copy.
4362 if (CudaNdarray_HOST_STRIDES(B)[0] < 0)
4363 {
4364 B_new = (CudaNdarray*) CudaNdarray_Copy(B);
4365 if (!B_new)
4366 {
4367 // If A was not copied, A_new is NULL, and Py_XDECREF does not
4368 // do anything
4369 Py_XDECREF(A_new);
4370 return -1;
4371 }
4372 B = B_new;
4373 }
4374
4375     // cuBLAS does not handle negative strides as expected
4376 if ( (CudaNdarray_HOST_STRIDES(A)[0] < 0)
4377 || (CudaNdarray_HOST_STRIDES(A)[1] < 0))
4378 {
4379 PyErr_Format(PyExc_ValueError, "illegal strides in args to gemv (%i,%i)",
4380 CudaNdarray_HOST_STRIDES(A)[0],
4381 CudaNdarray_HOST_STRIDES(A)[1]);
4382 Py_XDECREF(A_new);
4383 Py_XDECREF(B_new);
4384 return -1;
4385 }
4386
4387 /* create appropriate strides for malformed matrices that are row or column
4388 * vectors
4389 */
4390 int sa_0 = (CudaNdarray_HOST_DIMS(A)[0] > 1) ? CudaNdarray_HOST_STRIDES(A)[0] : CudaNdarray_HOST_DIMS(A)[1];
4391 int sa_1 = (CudaNdarray_HOST_DIMS(A)[1] > 1) ? CudaNdarray_HOST_STRIDES(A)[1] : CudaNdarray_HOST_DIMS(A)[0];
4392 int sb_0 = (CudaNdarray_HOST_DIMS(B)[0] > 1) ? CudaNdarray_HOST_STRIDES(B)[0] : 1;
4393 int sc_0 = (CudaNdarray_HOST_DIMS(C)[0] > 1) ? CudaNdarray_HOST_STRIDES(C)[0] : 1;
4394
4395 if (sa_0 == 0)
4396 sa_0 = 1;
4397 if (sa_1 == 0)
4398 sa_1 = 1;
4399
4400 // This is important because we can end up not calling Sgemv at all
4401 cublasStatus_t err = CUBLAS_STATUS_SUCCESS;
4402 if (CudaNdarray_SIZE(C)) {
4403 if ((CudaNdarray_HOST_DIMS(A)[0] <= 1)
4404 || ((CudaNdarray_HOST_STRIDES(A)[0] == 1)
4405 && (CudaNdarray_HOST_STRIDES(A)[1] > 0)))
4406 {
4407 err = cublasSgemv(handle, CUBLAS_OP_N,
4408 CudaNdarray_HOST_DIMS(A)[0], CudaNdarray_HOST_DIMS(A)[1],
4409 &alpha,
4410 CudaNdarray_DEV_DATA(A), sa_1,
4411 CudaNdarray_DEV_DATA(B), sb_0,
4412 &beta,
4413 CudaNdarray_DEV_DATA(C), sc_0);
4414 }
4415 else if ((CudaNdarray_HOST_DIMS(A)[1] <= 1)
4416 || ((CudaNdarray_HOST_STRIDES(A)[1] == 1)
4417 && (CudaNdarray_HOST_STRIDES(A)[0] > 0)))
4418 {
4419 err = cublasSgemv(handle, CUBLAS_OP_T,
4420 CudaNdarray_HOST_DIMS(A)[1], CudaNdarray_HOST_DIMS(A)[0],
4421 &alpha,
4422 CudaNdarray_DEV_DATA(A), sa_0,
4423 CudaNdarray_DEV_DATA(B), sb_0,
4424 &beta,
4425 CudaNdarray_DEV_DATA(C), sc_0);
4426 }
4427 else
4428 {
4429 PyErr_Format(PyExc_AssertionError,
4430 "Unexpected stride pattern in gemv: (%i, %i) x %i -> %i.\n"
4431 "Shapes are: (%i, %i) x %i -> %i\n",
4432 CudaNdarray_HOST_STRIDES(A)[0],
4433 CudaNdarray_HOST_STRIDES(A)[1],
4434 CudaNdarray_HOST_STRIDES(B)[0],
4435 CudaNdarray_HOST_STRIDES(C)[0],
4436 CudaNdarray_HOST_DIMS(A)[0],
4437 CudaNdarray_HOST_DIMS(A)[1],
4438 CudaNdarray_HOST_DIMS(B)[0],
4439 CudaNdarray_HOST_DIMS(C)[0]);
4440 Py_XDECREF(A_new);
4441 Py_XDECREF(B_new);
4442 return -1;
4443 }
4444 }
4445
4446 CNDA_THREAD_SYNC;
4447 Py_XDECREF(A_new);
4448 Py_XDECREF(B_new);
4449
4450 if (CUBLAS_STATUS_SUCCESS != err)
4451 {
4452 PyErr_Format(PyExc_RuntimeError,
4453 "cublasSgemv failed (%i)",
4454 err);
4455 return -1;
4456 }
4457 return 0;
4458 }
4459
4460 int CudaNdarray_sger(float alpha, const CudaNdarray * x, const CudaNdarray * y, CudaNdarray * A) {
4461 if (x->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg x to sger"); return -1; }
4462 if (y->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg y to sger"); return -1; }
4463 if (A->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg A to sger"); return -1; }
4464
4465 if ((CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(x)[0])
4466 || (CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(y)[0])) {
4467 PyErr_Format(PyExc_ValueError,
4468 "dimension mismatch in args to sger (%i)x(%i)->(%i,%i)",
4469 CudaNdarray_HOST_DIMS(x)[0],
4470 CudaNdarray_HOST_DIMS(y)[0],
4471 CudaNdarray_HOST_DIMS(A)[0],
4472 CudaNdarray_HOST_DIMS(A)[1]);
4473 return -1;
4474 }
4475
4476 int x_strides = CudaNdarray_HOST_STRIDES(x)[0];
4477 CudaNdarray * x_new = NULL;
4478 if(x_strides == 0){
4479 if(CudaNdarray_HOST_DIMS(x)[0] != 1){
4480 PyErr_Format(PyExc_RuntimeError,
4481 "CudaNdarray_sger: Invalid input x (should not happen)."
4482 " We received a CudaNdarray vector with a stride of 0"
4483 " that has more than 1 element!");
4484 return -1;
4485 }
4486 x_strides = 1;
4487 } else if(x_strides < 0){
4488 x_new = (CudaNdarray*) CudaNdarray_Copy(x);
4489 x = x_new;
4490 x_strides = CudaNdarray_HOST_STRIDES(x)[0];
4491 }
4492
4493 int y_strides = CudaNdarray_HOST_STRIDES(y)[0];
4494 CudaNdarray * y_new = NULL;
4495 if(y_strides == 0){
4496 if(CudaNdarray_HOST_DIMS(y)[0] != 1){
4497 PyErr_Format(PyExc_RuntimeError,
4498 "CudaNdarray_sger: Invalid input y (should not happen)."
4499 " We received a CudaNdarray vector with a stride of 0"
4500 " that has more than 1 elements!");
4501 Py_XDECREF(x_new);
4502 return -1;
4503 }
4504 y_strides = 1;
4505 } else if(y_strides < 0){
4506 y_new = (CudaNdarray*) CudaNdarray_Copy(y);
4507 y = y_new;
4508 y_strides = CudaNdarray_HOST_STRIDES(y)[0];
4509 }
4510
4511 // Create appropriate strides if A is a row or column vector
4512 int sa_0 = (CudaNdarray_HOST_DIMS(A)[0] > 1) ? CudaNdarray_HOST_STRIDES(A)[0]
4513 : CudaNdarray_HOST_DIMS(A)[1];
4514 int sa_1 = (CudaNdarray_HOST_DIMS(A)[1] > 1) ? CudaNdarray_HOST_STRIDES(A)[1]
4515 : CudaNdarray_HOST_DIMS(A)[0];
4516
4517 // This is important because we can end up not calling Sger at all
4518 cublasStatus_t err = CUBLAS_STATUS_SUCCESS;
4519 if(CudaNdarray_SIZE(A)){
4520 // If A is in col-major
4521 if ((CudaNdarray_HOST_DIMS(A)[0] <= 1)
4522 || ((CudaNdarray_HOST_STRIDES(A)[0] == 1)
4523 && (CudaNdarray_HOST_STRIDES(A)[1] > 0)))
4524 {
4525 err = cublasSger(handle, CudaNdarray_HOST_DIMS(x)[0], CudaNdarray_HOST_DIMS(y)[0], &alpha,
4526 CudaNdarray_DEV_DATA(x), x_strides,
4527 CudaNdarray_DEV_DATA(y), y_strides,
4528 CudaNdarray_DEV_DATA(A), sa_1);
4529 }
4530 // Since Sger expects A in col-major, we invert x and y to fake this.
4531 else if ((CudaNdarray_HOST_DIMS(A)[1] <= 1)
4532 || ((CudaNdarray_HOST_STRIDES(A)[1] == 1)
4533 && (CudaNdarray_HOST_STRIDES(A)[0] > 0)))
4534 {
4535 err = cublasSger(handle, CudaNdarray_HOST_DIMS(y)[0], CudaNdarray_HOST_DIMS(x)[0], &alpha,
4536 CudaNdarray_DEV_DATA(y), y_strides,
4537 CudaNdarray_DEV_DATA(x), x_strides,
4538 CudaNdarray_DEV_DATA(A), sa_0);
4539 }
4540 // A has to be either c- or f-contiguous, with no negative strides
4541 else
4542 {
4543 PyErr_SetString(PyExc_NotImplementedError,
4544 "non-contiguous A, or negative strides, in sger");
4545 Py_XDECREF(x_new);
4546 Py_XDECREF(y_new);
4547 return -1;
4548 }
4549 }
4550 CNDA_THREAD_SYNC;
4551 Py_XDECREF(x_new);
4552 Py_XDECREF(y_new);
4553
4554 if (CUBLAS_STATUS_SUCCESS != err)
4555 {
4556 PyErr_Format(PyExc_RuntimeError,
4557 "cublasSger failed (%i)",
4558 err);
4559 return -1;
4560 }
4561
4562 return 0;
4563 }
4564
4565 /**
4566 *
4567 * Precondition:
4568 * a->dim[d] == (dims_a[d]==0) ? (1 << log2_dims_a[d]) : dims_a[d]
4569 * z->dim[d] == (z_str[d]==0) ? 1 : dims_a[d];
4570 *
4571 * TODO: templatize this function to support other reductions.
4572 * All that needs to change is the initial value for sum, and the reduction operator.
4573 */
4574
4575 static __global__ void kernel_reduce_sum(const unsigned int size_z,
4576 const unsigned int nd,
4577 const int * dims_a,
4578 const int * log2_dims_a,
4579 const int * a_str,
4580 const float * a_data,
4581 const int * z_str,
4582 float * z_data)
4583 {
4584 const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
4585 const unsigned int numThreads = blockDim.x * gridDim.x;
4586
4587 //structure data contains the strides and dimensions of both a and z
4588 // a_dim[0], a_dim[1], ... a_dim[nd-1],
4589 // a_log2dim[0], a_log2dim[1], ... a_log2dim[nd-1],
4590 // a_str[0], ... a_str[nd-1],
4591 // z_str[0], ... z_str[nd-1]
4592 extern __shared__ int structure_data[];
4593 for (unsigned int i = threadIdx.x; i < nd; i += blockDim.x)
4594 {
4595 structure_data[i+0*nd] = dims_a[i];
4596 structure_data[i+1*nd] = log2_dims_a[i];
4597 structure_data[i+2*nd] = a_str[i];
4598 structure_data[i+3*nd] = z_str[i];
4599 }
4600 dims_a = structure_data;
4601 log2_dims_a = structure_data + nd;
4602 a_str = structure_data + 2*nd;
4603 z_str = structure_data + 3*nd;
4604
4605 __syncthreads(); //wait for all the shared structure to be loaded
4606
4607 for (unsigned int i = idx; i < size_z; i += numThreads)
4608 {
4609 unsigned int ii = i;
4610 const float * a_data_i = a_data;
4611 float * z_data_i = z_data;
4612 unsigned int n_reduce_elements = 1;
4613 unsigned int n_reduce_dims = 0;
4614 unsigned int reduce_dim0 = nd-1;
4615
4616
4617 //In this loop, we locate the initial element of the slice that we'd like to reduce with this thread
4618 // At the same time, we [re]calculate the size of that slice (n_reduce_elements)
4619 for (unsigned int d = 0; d < nd; ++d)
4620 {
4621 if (a_str[d] && (!z_str[d])) // this means 'd' is a dimension we are reducing over
4622 {
4623 n_reduce_elements *= dims_a[d];
4624 n_reduce_dims += 1;
4625 reduce_dim0 = (d < reduce_dim0) ? d : reduce_dim0;
4626 }
4627 else //'d' is not a dimension that we are reducing over
4628 {
4629 unsigned int pos_d;
4630 if (log2_dims_a[d]==-1) //TODO: when things are working, use this switch
4631 {
4632 // this branch is not preferred,
4633 // because the manual said that integer mod and div operations are slow on gpu
4634 pos_d = (ii % dims_a[d]);
4635 ii = (ii / dims_a[d]);
4636 }
4637 else
4638 {
4639 pos_d = (ii & ((1 << log2_dims_a[d])-1)); //take the lower log2_dims bits
4640 ii = (ii >> log2_dims_a[d]); //shift those lower log2_dims bits off of ii
4641 }
4642 a_data_i += pos_d * a_str[d];
4643 z_data_i += pos_d * z_str[d];
4644 }
4645 }
4646 // now we've got pointers a_data_i and z_data_i into element 0 of the slice over which we are reducing
4647 // do a similar loop
4648
4649 float sum = 0.0f;
4650 switch(n_reduce_dims)
4651 {
4652 case 0:
4653 {
4654 sum = a_data_i[0];
4655 }
4656 break;
4657 case 1:
4658 {
4659 const int stride = a_str[reduce_dim0];
4660 const float * a_data_i_max = a_data_i + dims_a[reduce_dim0] * stride;
4661 while (a_data_i != a_data_i_max)
4662 {
4663 sum += a_data_i[0];
4664 a_data_i += stride;
4665 }
4666 }
4667 break;
4668 case 2:
4669 {
4670 int rd = reduce_dim0+1;
4671 for (; rd < nd; ++rd)
4672 {
4673 if (a_str[rd] && (!z_str[rd])) // this means 'rd' is a dimension we are reducing over
4674 break;
4675 }
4676 const int stride0 = a_str[reduce_dim0];
4677 const int stride1 = a_str[rd];
4678 for (int ii = 0; ii < dims_a[rd]; ++ii)
4679 {
4680 const float * a_data_ri = a_data_i + ii * stride1;
4681 const float * a_data_ri_max = a_data_ri + dims_a[reduce_dim0] * stride0;
4682 while (a_data_ri != a_data_ri_max)
4683 {
4684 sum += a_data_ri[0];
4685 a_data_ri += stride0;
4686 }
4687 }
4688 };
4689 break;
4690 default:
4691 {
4692 for (unsigned int reduce_i = 0; reduce_i < n_reduce_elements; ++reduce_i)
4693 {
4694 //TODO: optimize this loop to work more like theano's Elemwise. It's serial code.
4695 unsigned int reduce_ii = reduce_i;
4696 const float * a_data_ri = a_data_i;
4697
4698 //This loop finds the element in the a slice to add.
4699 for (unsigned int rd = reduce_dim0; rd < nd; ++rd)
4700 {
4701 unsigned int pos_d;
4702 if (a_str[rd] && (!z_str[rd])) // this means 'd' is a dimension we are reducing over
4703 {
4704 if (log2_dims_a[rd]==-1)
4705 {
4706 // this branch is not preferred,
4707 // because the manual said that integer mod and div operations are slow on gpu
4708 pos_d = (reduce_ii % dims_a[rd]);
4709 reduce_ii = (reduce_ii / dims_a[rd]);
4710 }
4711 else
4712 {
4713 pos_d = (reduce_ii & ((1 << log2_dims_a[rd])-1)); //take the lower log2_dims bits
4714 reduce_ii = (reduce_ii >> log2_dims_a[rd]); //shift those lower log2_dims bits off of ii
4715 }
4716 a_data_ri += pos_d * a_str[rd];
4717 }
4718 }
4719 sum += a_data_ri[0];
4720 }
4721 }
4722 }
4723 z_data_i[0] = sum;
4724 }
4725 }
4726
4727 static __global__ void kernel_reduce_sum_1011(
4728 const unsigned int d0,
4729 const unsigned int d1,
4730 const unsigned int d2,
4731 const unsigned int d3,
4732 const float *A, const int sA0, const int sA1, const int sA2, const int sA3,
4733 float * Z, const int sZ0)
4734 {
4735 const int threadCount = blockDim.x * blockDim.y * blockDim.z;
4736 const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
4737 extern __shared__ float buf[];
4738 float mysum = 0.0f;
4739
4740 if (warpSize != 32)
4741 {
4742 return; //TODO: set error code
4743 }
4744
4745 for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z)
4746 {
4747 float Ai = A[i0 * sA0 + blockIdx.x * sA1 + threadIdx.y * sA2 + threadIdx.x * sA3];
4748 mysum += Ai;
4749 }
4750 buf[threadNum] = mysum;
4751 __syncthreads();
4752
4753 // rest of function is handled by one warp
4754 if (threadNum < warpSize)
4755 {
4756 for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
4757 {
4758 mysum += buf[i];
4759 }
4760 buf[threadNum] = mysum;
4761 if (threadNum < 16)
4762 {
4763 //reduce so that threadNum 0 has the sum of everything
4764 if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
4765 if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
4766 if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
4767 if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
4768 if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
4769 if (threadNum == 0)
4770 {
4771 Z[blockIdx.x*sZ0] = buf[0];
4772 }
4773 }
4774 }
4775 }
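// The strided adds above rely on implicit warp-synchronous execution
// (threads 0..31 of a warp advancing in lockstep), which held on the
// pre-Volta GPUs this code targets; post-Volta architectures would need
// __syncwarp() between the steps.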
4776 /**
4777 * Dimensions in which the self has size 1 and A has size > 1 are considered summing dimensions
4778 * Dimensions in which self has size > 1 and A has size > 1 are considered non-summing dimensions, and in this case their sizes must be equal.
4779 */
4780 int
4781 CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A)
4782 {
4783 int verbose = 0;
4784 //check input rank
4785 if (self->nd != A->nd)
4786 {
4787 PyErr_Format(PyExc_TypeError, "Rank mismatch in CudaNdarray_sum: %i vs %i", self->nd, A->nd);
4788 return -1;
4789 }
4790 for (int i = 0; i < self->nd; ++i)
4791 {
4792 if ((CudaNdarray_HOST_DIMS(self)[i] > 1) && (CudaNdarray_HOST_DIMS(self)[i] != CudaNdarray_HOST_DIMS(A)[i]))
4793 {
4794 PyErr_Format(PyExc_TypeError, "Dimension mismatch in CudaNdarray_sum: self->dim[%i] == %i , A->dim[%i] = %i",
4795 i, CudaNdarray_HOST_DIMS(self)[i], i, CudaNdarray_HOST_DIMS(A)[i]);
4796 return -1;
4797 }
4798 }
4799
4800 int n_summations = (unsigned int)CudaNdarray_SIZE(self);
4801 if (verbose)
4802 {
4803 std::cerr << "reduce_sum n_summations " << n_summations << '\n';
4804 std::cerr << "reduce_sum nd " << self->nd << '\n';
4805 fprint_CudaNdarray(stderr, A);
4806 fprint_CudaNdarray(stderr, self);
4807 }
4808 if (0 && (A->nd == 4) //check to see if kernel_reduce_sum_1011 applies
4809 && (CudaNdarray_HOST_DIMS(self)[0] == 1)
4810 && (CudaNdarray_HOST_DIMS(self)[2] == 1)
4811 && (CudaNdarray_HOST_DIMS(self)[3] == 1)
4812 )
4813 {
4814 dim3 n_threads(CudaNdarray_HOST_DIMS(A)[3], CudaNdarray_HOST_DIMS(A)[2]);
4815 dim3 n_blocks(CudaNdarray_HOST_DIMS(A)[1]);
4816 while (n_threads.x * n_threads.y * n_threads.z < NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.z;
4817 n_threads.z -= 1;
4818 if (n_threads.z > 64) n_threads.z = 64;
4819 if (n_threads.z)
4820 {
4821 if (verbose) printf("trying kernel_reduce_sum_1011\n");
4822 int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
4823 kernel_reduce_sum_1011<<<n_blocks, n_threads, n_shared>>>(
4824 CudaNdarray_HOST_DIMS(A)[0],
4825 CudaNdarray_HOST_DIMS(A)[1],
4826 CudaNdarray_HOST_DIMS(A)[2],
4827 CudaNdarray_HOST_DIMS(A)[3],
4828 CudaNdarray_DEV_DATA(A),
4829 CudaNdarray_HOST_STRIDES(A)[0],
4830 CudaNdarray_HOST_STRIDES(A)[1],
4831 CudaNdarray_HOST_STRIDES(A)[2],
4832 CudaNdarray_HOST_STRIDES(A)[3],
4833 CudaNdarray_DEV_DATA(self),
4834 CudaNdarray_HOST_STRIDES(self)[1]);
4835 CNDA_THREAD_SYNC;
4836 if (cudaSuccess == cudaGetLastError()) return 0;
4837 if (verbose) printf("failed, falling back to kernel_reduce_sum\n");
4838 }
4839 }
4840
4841 int n_threads_per_block = std::min(n_summations,
4842 NUM_VECTOR_OP_THREADS_PER_BLOCK);
4843 int n_blocks = std::min(ceil_intdiv(n_summations,n_threads_per_block),
4844 NUM_VECTOR_OP_BLOCKS);
4845 int n_structure_cache = self->nd * 4 * sizeof(int);
4846
4847 if (verbose)
4848 {
4849 std::cerr << "n_blocks, n_threads_per_block " << n_blocks << ' ' << n_threads_per_block << '\n';
4850 }
4851 assert (self->nd > 0);
4852 assert (self->nd == A->nd);
4853 kernel_reduce_sum<<<n_blocks, n_threads_per_block, n_structure_cache>>>(
4854 n_summations,
4855 self->nd,
4856 CudaNdarray_DEV_DIMS(A),
4857 CudaNdarray_DEV_LOG2DIMS(A),
4858 CudaNdarray_DEV_STRIDES(A),
4859 CudaNdarray_DEV_DATA(A),
4860 CudaNdarray_DEV_STRIDES(self),
4861 CudaNdarray_DEV_DATA(self));
4862 CNDA_THREAD_SYNC;
4863 cudaError_t err = cudaGetLastError();
4864 if (cudaSuccess != err)
4865 {
4866 PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kernel_reduce_sum", cudaGetErrorString(err));
4867 return -1;
4868 }
4869 return 0;
4870 }
4871 int
4872 CudaNdarray_reduce_prod(CudaNdarray * self, const CudaNdarray * A)
4873 {
4874 PyErr_SetString(PyExc_NotImplementedError, "");
4875 return -1;
4876 }
4877 int
4878 CudaNdarray_reduce_min(CudaNdarray * self, const CudaNdarray * A)
4879 {
4880 PyErr_SetString(PyExc_NotImplementedError, "");
4881 return -1;
4882 }
4883 int
4884 CudaNdarray_reduce_max(CudaNdarray * self, const CudaNdarray * A)
4885 {
4886 PyErr_SetString(PyExc_NotImplementedError, "");
4887 return -1;
4888 }
4889
4890
4891 /**
4892 *
4893 * pattern is a permutation of [0, 1, ... self->nd-1] with the following twists:
4894 * - an element 'd' of the permutation can be dropped if CudaNdarray_HOST_DIMS(self)[d] == 1
4895 * - any number of '-1' elements can be in the pattern, and they will cause new ranks (with dim==1) to be inserted.
4896 *
4897 * For example, if CudaNdarray_HOST_DIMS(self) == [4, 5, 1, 6], and pattern = [0,3,-1,-1, 1], then CudaNdarray_HOST_DIMS(self) would be modified to become:
4898  * [4, 6, 1, 1, 5] (we dropped the original dim[2]==1, and inserted two singleton dimensions with the -1s).
4899 */
4900 int
4901 CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pattern)
4902 {
4903 //TODO: pass a workspace pointer to avoid the internal malloc
4904     int * newdims = (int *)malloc(sizeof(int) * (len + len + self->nd)); //we tack the dims_taken buffer on here to avoid a second malloc
4905 int * newstrides = newdims + len;
4906 int * dims_taken = newstrides + len;
4907 if (!newdims)
4908 {
4909 PyErr_SetString(PyExc_MemoryError, "CudaNdarray_dimshuffle: Failed to allocate temporary space");
4910 return -1;
4911 }
4912 for (int i = 0; i < self->nd; ++i)
4913 {
4914 dims_taken[i] = 0;
4915 }
4916 for (int i = 0; i < len; ++i)
4917 {
4918 if (pattern[i] < 0)
4919 {
4920 newdims[i] = 1;
4921 newstrides[i] = 0;
4922 }
4923 else if(dims_taken[pattern[i]])
4924 {
4925             PyErr_Format(PyExc_ValueError, "CudaNdarray_dimshuffle: invalid pattern: dimension %d is used multiple times",
4926                          pattern[i]);
4927 free(newdims);
4928 return -1;
4929 }
4930         else if (pattern[i] >= self->nd)
4931         {
4932             PyErr_Format(PyExc_ValueError, "CudaNdarray_dimshuffle: invalid pattern: you asked for dimension %d, which does not exist in a %d-dimensional CudaNdarray",
4933                          pattern[i], self->nd);
4934 free(newdims);
4935 return -1;
4936 }
4937 else
4938 {
4939 newdims[i] = CudaNdarray_HOST_DIMS(self)[pattern[i]];
4940 newstrides[i] = CudaNdarray_HOST_STRIDES(self)[pattern[i]];
4941 dims_taken[pattern[i]] = 1;
4942 }
4943 }
4944     //Check that we did not drop any non-broadcastable dims
4945 for (int i = 0; i < self->nd; ++i)
4946 {
4947 if (dims_taken[i]==0 && CudaNdarray_HOST_DIMS(self)[i]!=1)
4948 {
4949             PyErr_SetString(PyExc_ValueError, "CudaNdarray_dimshuffle: You cannot drop a non-broadcastable dimension.");
4950 free(newdims);
4951 return -1;
4952 }
4953 }
4954 //swap this structure in for the one in self, and sync to the card
4955 if (CudaNdarray_set_nd(self, len))
4956 {
4957 free(newdims);
4958 return -1;
4959 }
4960 for (int i = 0; i < len; ++i)
4961 {
4962 CudaNdarray_set_dim(self, i, newdims[i]);
4963 CudaNdarray_set_stride(self, i, newstrides[i]);
4964 }
4965 if (cnda_copy_structure_to_device(self))
4966 {
4967 free(newdims);
4968 return -1;
4969 }
4970 free(newdims);
4971 return 0;
4972 }
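// A minimal usage sketch (illustrative; 'arr' is assumed to be the 4-d
// CudaNdarray of shape [4, 5, 1, 6] from the doc comment above):
//
//     int pattern[5] = {0, 3, -1, -1, 1};   // -1 inserts a new size-1 axis
//     if (CudaNdarray_dimshuffle(arr, 5, pattern))
//     {
//         /* a Python exception has been set; propagate the failure */
//     }
//     // arr's dims are now [4, 6, 1, 1, 5]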
4973
4974
4975
4976 /**
4977 *
4978  * This is the function that binds to Python.
4979 * See CudaNdarray_dimshuffle to call from C.
4980 * We use -1 to mean 'x' as in Tensor Dimshuffle.
4981 */
4982 PyObject *
4983 CudaNdarray_Dimshuffle(PyObject* _unused, PyObject* args)
4984 {
4985 PyObject * self = NULL;
4986 PyObject * pattern_object = NULL;
4987 int * pattern = NULL;
4988 PyObject * rval = NULL;
4989 int success = -1;
4990 //const int * dims = NULL;
4991
4992 //args should consist of two python objects ("OO")
4993 if (! PyArg_ParseTuple(args, "OO", &self, &pattern_object))
4994 return NULL;
4995
4996 if (!CudaNdarray_Check(self) )
4997 {
4998 PyErr_SetString(PyExc_TypeError, "First argument to cuda_ndarray.dimshuffle must be a CudaNdarray");
4999 return NULL;
5000 }
5001
5002 //parse pattern_object into int * pattern
5003
5004 Py_ssize_t pattern_dim = PyObject_Length(pattern_object);
5005
5006 if (pattern_dim < 0)
5007 {
5008         PyErr_SetString(PyExc_TypeError, "Couldn't get length of second argument to cuda_ndarray.dimshuffle");
5009 return NULL;
5010 }
5011
5012     pattern = (int *) malloc( pattern_dim * sizeof(int));
         if (pattern == NULL)
         {
             // malloc can fail; report and bail out before the loop below
             // dereferences the buffer
             PyErr_NoMemory();
             return NULL;
         }
5013
5014 for (Py_ssize_t i = 0; i < pattern_dim; i++)
5015 {
5016 PyObject * idx = PyLong_FromLong(i);
5017
5018 if (idx == NULL)
5019 {
5020 PyErr_SetString(PyExc_Exception, "Couldn't make long object to loop over list/tuple");
5021 goto CudaNdarray_dimshuffle_fail;
5022 }
5023
5024 long elem_value = 0;
5025
5026 PyObject * elem = PyObject_GetItem(pattern_object, idx);
5027
5028 if (elem == NULL)
5029 {
5030 Py_XDECREF( elem);
5031             PyErr_SetString(PyExc_ValueError, "Second argument to dimshuffle must be a list or tuple of integers");
5032 goto CudaNdarray_dimshuffle_fail;
5033 }
5034
5035 elem_value = PyInt_AsLong(elem);
5036
5037 if (elem_value == -1 && PyErr_Occurred() )
5038 {
5039 Py_XDECREF(elem);
5040             PyErr_SetString(PyExc_ValueError, "Second argument to dimshuffle must be a list or tuple of integers");
5041 goto CudaNdarray_dimshuffle_fail;
5042 }
5043
5044 pattern[i] = elem_value;
5045
5046 Py_XDECREF( elem );
5047 Py_XDECREF( idx );
5048 }
5049
5050 //allocate rval
5051 rval = (PyObject *) CudaNdarray_View((CudaNdarray *) self);
5052
5053 if (rval == NULL)
5054 {
5055         //CudaNdarray_View should have set the exception string
5056 goto CudaNdarray_dimshuffle_fail;
5057 }
5058
5059
5060 //printf("pattern_dim: %d\n",pattern_dim);
5061 //printf("pattern: %d %d\n",pattern[0],pattern[1]);
5062 //dims = CudaNdarray_HOST_DIMS( (CudaNdarray *) self);
5063 //printf("dims before: %d %d\n",dims[0],dims[1]);
5064
5065 success = CudaNdarray_dimshuffle((CudaNdarray *) rval, pattern_dim, pattern);
5066
5067 if (success != 0)
5068 {
5069 //Exception string should already be set by CudaNdarray_dimshuffle
5070 goto CudaNdarray_dimshuffle_fail;
5071 }
5072
5073 free(pattern);
5074
5075 return rval;
5076
5077 CudaNdarray_dimshuffle_fail:
5078
5079 if (pattern != NULL)
5080 free(pattern);
5081
5082 Py_XDECREF(rval);
5083 return NULL;
5084 }
5085
5086
5087 int
5088 cnda_structure_size(int nd)
5089 {
5090 // dim0, dim1, ...
5091 // str0, str1, ...
5092 // log2(dim0), log2(dim1), ...
5093 return nd + nd + nd;
5094 }
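// e.g. for nd == 2 the structure buffer is laid out as
// [dim0, dim1, str0, str1, log2(dim0), log2(dim1)], so
// cnda_structure_size(2) == 6; the accessors below index into the
// corresponding thirds of this buffer.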
5095
5096 const int *
5097 CudaNdarray_HOST_DIMS(const CudaNdarray * self)
5098 {
5099 return self->host_structure;
5100 }
5101
5102 const int *
5103 CudaNdarray_HOST_STRIDES(const CudaNdarray * self)
5104 {
5105 return self->host_structure + self->nd;
5106 }
5107 const int *
5108 CudaNdarray_HOST_LOG2DIMS(const CudaNdarray * self)
5109 {
5110 return self->host_structure + 2*self->nd;
5111 }
5112
5113 int
5114 CudaNdarray_EqualAndIgnore(CudaNdarray *cnda1, CudaNdarray *cnda2, int ignoreSync, int ignoreBase)
5115 {
5116 int verbose = 0;
5117
5118 if (!ignoreSync && cnda1->dev_structure_fresh != cnda2->dev_structure_fresh)
5119 {
5120 if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 1\n");
5121 return 0;
5122 }
5123
5124 if (cnda1->nd != cnda2->nd)
5125 {
5126 if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 2\n");
5127 return 0;
5128 }
5129
5130 for (int i=0; i < 2*cnda1->nd; i++)
5131 {
5132 if (cnda1->host_structure[i] != cnda2->host_structure[i])
5133 {
5134 if(verbose)
5135 fprintf(stdout, "CUDANDARRAY_EQUAL : host_structure : %d, %d, %d\n", i, cnda1->host_structure[i], cnda2->host_structure[i]);
5136 return 0;
5137 }
5138 }
5139
5140 if (!ignoreBase && cnda1->base != cnda2->base)
5141 {
5142 if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 4");
5143 return 0;
5144 }
5145 else if (cnda1->data_allocated != cnda2->data_allocated)
5146 {
5147 if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 5");
5148 return 0;
5149 }
5150 else if (cnda1->data_allocated && cnda1->devdata != cnda2->devdata)
5151 {
5152 if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 6");
5153 // no need to check devdata if data is not allocated
5154 return 0;
5155 }
5156
5157 return 1;
5158 }
5159
5160
5161 int
5162 CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2)
5163 {
5164 return CudaNdarray_EqualAndIgnore(cnda1, cnda2, 0, 0);
5165 }
5166
5167 int
5168 cnda_copy_structure_to_device(const CudaNdarray * self)
5169 {
5170     //If the device structure does not exist, create it.
5171     //We allocate it here because we do not need it often.
5172     //In fact, we need it so infrequently that we expect
5173     //most objects won't need it at all. Not allocating it up front
5174     //saves significant time when creating objects.
5175     //This sped up a benchmark by 8% with the gc.
5176 if (!self->dev_structure)
5177 {
5178 int struct_size = cnda_structure_size(self->nd);
5179 if (struct_size)
5180 {
5181 self->dev_structure = (int*)device_malloc(struct_size* sizeof(int));
5182 if (NULL == self->dev_structure)
5183 {
5184 return -1;
5185 }
5186 }
5187 }
5188 if (cublasSetVector(cnda_structure_size(self->nd),
5189 sizeof(int),
5190 self->host_structure,
5191 1,
5192 self->dev_structure,
5193 1) != CUBLAS_STATUS_SUCCESS)
5194 {
5195 PyErr_SetString(PyExc_RuntimeError, "error copying structure to device memory");
5196 return -1;
5197 }
5198 self->dev_structure_fresh = 1;
5199 return 0;
5200 }
5201
5202 const int *
5203 CudaNdarray_DEV_DIMS(const CudaNdarray * self)
5204 {
5205 if (!self->dev_structure_fresh)
5206 {
5207 if (cnda_copy_structure_to_device(self))
5208 return NULL;
5209 }
5210 return self->dev_structure;
5211 }
5212 const int *
5213 CudaNdarray_DEV_STRIDES(const CudaNdarray * self)
5214 {
5215 if (!self->dev_structure_fresh)
5216 {
5217 if (cnda_copy_structure_to_device(self))
5218 return NULL;
5219 }
5220 return self->dev_structure + self->nd;
5221 }
5222 const int *
5223 CudaNdarray_DEV_LOG2DIMS(const CudaNdarray * self)
5224 {
5225 if (!self->dev_structure_fresh)
5226 {
5227 if (cnda_copy_structure_to_device(self))
5228 return NULL;
5229 }
5230 return self->dev_structure + 2*self->nd;
5231 }
5232 float *
5233 CudaNdarray_DEV_DATA(const CudaNdarray * self)
5234 {
5235 return self->devdata;
5236 }
5237
5238 /**
5239 * Return the number of elements in the ndarray (product of the dimensions)
5240 */
5241 size_t
5242 CudaNdarray_SIZE(const CudaNdarray *self)
5243 {
5244 if (self->nd == -1) return 0;
5245 size_t size = 1;
5246 for (int i = 0; i < self->nd; ++i)
5247 {
5248 size *= CudaNdarray_HOST_DIMS(self)[i];
5249 }
5250 return size;
5251 }
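// e.g. a CudaNdarray with dims [4, 5, 1, 6] has CudaNdarray_SIZE == 120, while
// an uninitialized array (nd == -1) reports a size of 0.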
5252
5253 PyObject *
5254 CudaNdarray_SIZE_Object(const CudaNdarray *self, void *closure)
5255 {
5256 return PyInt_FromLong(CudaNdarray_SIZE(self));
5257 }
5258
5259 int CudaNdarray_set_device_data(CudaNdarray * self, float * data, const CudaNdarray * base)
5260 {
5261 return CudaNdarray_set_device_data(self, data, (PyObject *) base);
5262 }
5263
5264 PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self)
5265 {
5266 return PyBool_FromLong(CudaNdarray_is_c_contiguous(self));
5267 }
5268
5269 int fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
5270 {
5271 cudaError_t err = cudaGetLastError();
5272 if( cudaSuccess != err)
5273 {
5274 PyErr_Format(PyExc_RuntimeError,
5275 "Cuda error: %s: %s.",
5276 "fprint_CudaNdarray was called with an uncleared error",
5277 cudaGetErrorString(err));
5278 return -1;
5279 }
5280 fprintf(fd, "CudaNdarray <%p, %p> nd=%i dev_structure_fresh=%d data_allocated=%d\n",
5281 self, self->devdata, self->nd, self->dev_structure_fresh, self->data_allocated);
5282 fprintf(fd, "\tHOST_DIMS: ");
5283 for (int i = 0; i < self->nd; ++i)
5284 {
5285 fprintf(fd, "%i\t", CudaNdarray_HOST_DIMS(self)[i]);
5286 }
5287 fprintf(fd, "\n\tHOST_STRIDES: ");
5288 for (int i = 0; i < self->nd; ++i)
5289 {
5290 fprintf(fd, "%i\t", CudaNdarray_HOST_STRIDES(self)[i]);
5291 }
5292
5293 if (self->dev_structure)
5294 {
5295 int data=0;
5296 fprintf(fd, "\n\tDEV_DIMS: ");
5297         for (int i = 0; i < self->nd; ++i)
5298 {
5299 cublasGetVector(1, sizeof(int),
5300 self->dev_structure+i, 1,
5301 &data, 1);
5302 fprintf(fd, "%i\t", data);
5303 }
5304 fprintf(fd, "\n\tDEV_STRIDES: ");
5305 for (int i = 0; i < self->nd; ++i)
5306 {
5307 cublasGetVector(1, sizeof(int),
5308 self->dev_structure + self->nd+i, 1,
5309 &data, 1);
5310 fprintf(fd, "%i \t", data);
5311 }
5312 fprintf(fd, "\n");
5313 }
5314 else
5315 {
5316 fprintf(fd, "\n\tdev_structure not allocated\n");
5317 }
5318
5319 err = cudaGetLastError();
5320 if( cudaSuccess != err)
5321 {
5322 PyErr_Format(PyExc_RuntimeError,
5323 "Cuda error: %s: %s.",
5324 "fprint_CudaNdarray",
5325 cudaGetErrorString(err));
5326 return -1;
5327 }
5328 return 0;
5329 }
5330
5331
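The loops in fprint_CudaNdarray above pull single ints off the GPU one at a time with cublasGetVector. A minimal standalone sketch of that pattern follows; dev_ints is a hypothetical device pointer standing in for self->dev_structure and is not part of the dump:

#include <cublas_v2.h>
#include <cstdio>

// Print nd ints that live in device memory, one cublasGetVector call each,
// the same way fprint_CudaNdarray prints DEV_DIMS and DEV_STRIDES above.
void print_device_ints(FILE* fd, const int* dev_ints, int nd)
{
    for (int i = 0; i < nd; ++i)
    {
        int host_val = 0;
        // copy n=1 element of sizeof(int) bytes, unit stride on both sides
        cublasGetVector(1, sizeof(int), dev_ints + i, 1, &host_val, 1);
        fprintf(fd, "%i\t", host_val);
    }
    fprintf(fd, "\n");
}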
5332 int CudaNdarray_prep_output(CudaNdarray ** arr, int nd,
5333 const int * dims, int fortran)
5334 {
5335 bool allocated = false;
5336 if (*arr == NULL)
5337 {
5338 // This allocates the metadata but not the data
5339 *arr = (CudaNdarray *) CudaNdarray_new_nd(nd);
5340 if (*arr == NULL)
5341 return -1;
5342 allocated = true;
5343 }
5344
5345 if (CudaNdarray_alloc_contiguous(*arr, nd, dims, fortran))
5346 {
5347 if (allocated)
5348 {
5349 Py_DECREF(*arr);
5350 *arr = NULL;
5351 }
5352 return -1;
5353 }
5354 return 0;
5355 }
5356
5357
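CudaNdarray_prep_output above is a prepare-the-output helper: it reuses *arr when the caller passes one in, allocates metadata otherwise, and DECREFs its own allocation if the contiguous alloc fails. A hypothetical call site, sketching how such a helper is typically used (the shape and the kernel step are made up; CudaNdarray_DEV_DATA is assumed to be the device-data accessor from cuda_ndarray.cuh):

int example_make_result(CudaNdarray** out)
{
    const int dims[2] = {4, 8};                 // made-up output shape
    // fortran = 0 requests a C-contiguous result
    if (CudaNdarray_prep_output(out, 2, dims, 0) != 0)
        return -1;                              // Python error already set
    float* dev_data = CudaNdarray_DEV_DATA(*out);
    // ... launch a kernel that fills dev_data ...
    (void)dev_data;
    return 0;
}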
5358 /*
5359 Local Variables:
5360 mode:c++
5361 c-basic-offset:4
5362 c-file-style:"stroustrup"
5363 indent-tabs-mode:nil
5364 fill-column:79
5365 End:
5366 */
5367 // vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
5368
===============================
C:\toolkits\anaconda2-4.2.0\lib\site-packages\theano-0.8.2-py2.7.egg\theano\sandbox\cuda\cnmem.cpp(888): warning: variable "prev" was set but never used
C:\toolkits\anaconda2-4.2.0\lib\site-packages\theano-0.8.2-py2.7.egg\theano\sandbox\cuda\cnmem.cpp(912): warning: variable "result" was set but never used
mod.cu(803): warning: conversion from pointer to smaller integer
mod.cu(941): warning: pointless comparison of unsigned integer with zero
nvcc warning : The 'compute_20', 'sm_20', and 'sm_21' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).
mod.cu
C:\toolkits\anaconda2-4.2.0\lib\site-packages\theano-0.8.2-py2.7.egg\theano\sandbox\cuda\cnmem.cpp(668): warning C4477: 'fprintf' : format string '%lu' requires an argument of type 'unsigned long', but variadic argument 2 has type 'std::size_t'
C:\toolkits\anaconda2-4.2.0\lib\site-packages\theano-0.8.2-py2.7.egg\theano\sandbox\cuda\cnmem.cpp(668): note: consider using '%zu' in the format string
C:\toolkits\anaconda2-4.2.0\lib\site-packages\theano-0.8.2-py2.7.egg\theano\sandbox\cuda\cnmem.cpp(670): warning C4477: 'fprintf' : format string '%016lx' requires an argument of type 'unsigned long', but variadic argument 1 has type 'std::size_t'
C:\toolkits\anaconda2-4.2.0\lib\site-packages\theano-0.8.2-py2.7.egg\theano\sandbox\cuda\cnmem.cpp(670): note: consider using '%zx' in the format string
C:\toolkits\anaconda2-4.2.0\lib\site-packages\theano-0.8.2-py2.7.egg\theano\sandbox\cuda\cnmem.cpp(670): warning C4477: 'fprintf' : format string '%016lx' requires an argument of type 'unsigned long', but variadic argument 2 has type 'std::size_t'
C:\toolkits\anaconda2-4.2.0\lib\site-packages\theano-0.8.2-py2.7.egg\theano\sandbox\cuda\cnmem.cpp(670): note: consider using '%zx' in the format string
C:\toolkits\anaconda2-4.2.0\lib\site-packages\theano-0.8.2-py2.7.egg\theano\sandbox\cuda\cnmem.cpp(670): warning C4477: 'fprintf' : format string '%lu' requires an argument of type 'unsigned long', but variadic argument 3 has type 'std::size_t'
C:\toolkits\anaconda2-4.2.0\lib\site-packages\theano-0.8.2-py2.7.egg\theano\sandbox\cuda\cnmem.cpp(670): note: consider using '%zu' in the format string
C:\toolkits\anaconda2-4.2.0\lib\site-packages\theano-0.8.2-py2.7.egg\theano\sandbox\cuda\cnmem.cpp(670): warning C4477: 'fprintf' : format string '%016lx' requires an argument of type 'unsigned long', but variadic argument 4 has type 'std::size_t'
C:\toolkits\anaconda2-4.2.0\lib\site-packages\theano-0.8.2-py2.7.egg\theano\sandbox\cuda\cnmem.cpp(670): note: consider using '%zx' in the format string
C:\toolkits\anaconda2-4.2.0\lib\site-packages\theano-0.8.2-py2.7.egg\theano\sandbox\cuda\cnmem.cpp(670): warning C4477: 'fprintf' : format string '%2lu' requires an argument of type 'unsigned long', but variadic argument 5 has type 'std::size_t'
C:\toolkits\anaconda2-4.2.0\lib\site-packages\theano-0.8.2-py2.7.egg\theano\sandbox\cuda\cnmem.cpp(670): note: consider using '%zu' in the format string
C:\toolkits\anaconda2-4.2.0\lib\site-packages\theano-0.8.2-py2.7.egg\theano\sandbox\cuda\cnmem.cpp(690): warning C4477: 'fprintf' : format string '%016lx' requires an argument of type 'unsigned long', but variadic argument 3 has type 'std::size_t'
C:\toolkits\anaconda2-4.2.0\lib\site-packages\theano-0.8.2-py2.7.egg\theano\sandbox\cuda\cnmem.cpp(690): note: consider using '%zx' in the format string
C:\toolkits\anaconda2-4.2.0\lib\site-packages\theano-0.8.2-py2.7.egg\theano\sandbox\cuda\cnmem.cpp(690): warning C4477: 'fprintf' : format string '%lu' requires an argument of type 'unsigned long', but variadic argument 4 has type 'std::size_t'
C:\toolkits\anaconda2-4.2.0\lib\site-packages\theano-0.8.2-py2.7.egg\theano\sandbox\cuda\cnmem.cpp(690): note: consider using '%zu' in the format string
C:\toolkits\anaconda2-4.2.0\lib\site-packages\theano-0.8.2-py2.7.egg\theano\sandbox\cuda\cnmem.cpp(690): warning C4477: 'fprintf' : format string '%lu' requires an argument of type 'unsigned long', but variadic argument 5 has type 'std::size_t'
C:\toolkits\anaconda2-4.2.0\lib\site-packages\theano-0.8.2-py2.7.egg\theano\sandbox\cuda\cnmem.cpp(690): note: consider using '%zu' in the format string
mod.cu(803): warning C4311: 'type cast': pointer truncation from 'CudaNdarray *' to 'long'
mod.cu(3374): warning C4312: 'type cast': conversion from 'long' to 'float *' of greater size
LINK : fatal error LNK1104: cannot open file 'uuid.lib'
['nvcc', '-shared', '-O3', '-Xlinker', '/DEBUG', '-D HAVE_ROUND', '-m64', '-Xcompiler', '-DCUDA_NDARRAY_CUH=18715462c72ed6afcd7ca5d52813ce90,-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION,/Zi,/MD', '-IC:\\toolkits\\anaconda2-4.2.0\\lib\\site-packages\\theano-0.8.2-py2.7.egg\\theano\\sandbox\\cuda', '-IC:\\toolkits\\anaconda2-4.2.0\\lib\\site-packages\\numpy\\core\\include', '-IC:\\toolkits\\anaconda2-4.2.0\\include', '-IC:\\toolkits\\anaconda2-4.2.0\\lib\\site-packages\\theano-0.8.2-py2.7.egg\\theano\\gof', '-o', 'C:\\Users\\nobody\\AppData\\Local\\Theano\\compiledir_Windows-10-10.0.14393-Intel64_Family_6_Model_60_Stepping_3_GenuineIntel-2.7.13-64\\cuda_ndarray\\cuda_ndarray.pyd', 'mod.cu', '-LC:\\toolkits\\anaconda2-4.2.0\\libs', '-LC:\\toolkits\\anaconda2-4.2.0', '-lcublas', '-lpython27', '-lcudart']
ERROR (theano.sandbox.cuda): Failed to compile cuda_ndarray.cu: ('nvcc return status', 2, 'for cmd', 'nvcc -shared -O3 -Xlinker /DEBUG -D HAVE_ROUND -m64 -Xcompiler -DCUDA_NDARRAY_CUH=18715462c72ed6afcd7ca5d52813ce90,-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION,/Zi,/MD -IC:\\toolkits\\anaconda2-4.2.0\\lib\\site-packages\\theano-0.8.2-py2.7.egg\\theano\\sandbox\\cuda -IC:\\toolkits\\anaconda2-4.2.0\\lib\\site-packages\\numpy\\core\\include -IC:\\toolkits\\anaconda2-4.2.0\\include -IC:\\toolkits\\anaconda2-4.2.0\\lib\\site-packages\\theano-0.8.2-py2.7.egg\\theano\\gof -o C:\\Users\\nobody\\AppData\\Local\\Theano\\compiledir_Windows-10-10.0.14393-Intel64_Family_6_Model_60_Stepping_3_GenuineIntel-2.7.13-64\\cuda_ndarray\\cuda_ndarray.pyd mod.cu -LC:\\toolkits\\anaconda2-4.2.0\\libs -LC:\\toolkits\\anaconda2-4.2.0 -lcublas -lpython27 -lcudart')
WARNING (theano.sandbox.cuda): CUDA is installed, but device gpu is not available (error: cuda unavailable)
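For reference on the diagnostics above: the fatal LNK1104 is the actual cause of the failed build (uuid.lib ships with the Windows SDK, so the linker's LIB search path is presumably missing the SDK's library directory), while the C4477 and C4311/C4312 warnings are benign. A minimal standalone sketch, not taken from mod.cu or cnmem.cpp, of the portable fixes those warnings point at:

#include <cstddef>
#include <cstdint>
#include <cstdio>

int main()
{
    // C4477: on 64-bit MSVC, size_t is not 'unsigned long'; the '%zu'
    // length modifier matches size_t on every conforming platform.
    std::size_t allocated = 1024;
    std::fprintf(stdout, "allocated %zu bytes\n", allocated);

    // C4311/C4312: 'long' stays 32 bits on 64-bit Windows, so casting a
    // pointer through it truncates; intptr_t is guaranteed wide enough
    // to round-trip a pointer.
    int value = 0;
    std::intptr_t as_int = reinterpret_cast<std::intptr_t>(&value);
    int* round_trip = reinterpret_cast<int*>(as_int);
    return (round_trip == &value) ? 0 : 1;
}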
[Elemwise{exp,no_inplace}(<TensorType(float32, vector)>)]
Looping 1000 times took 0.016000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 0.016000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 0.032000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
[... the same three-line report repeats for several hundred further runs, every one ending in "Used the cpu"; the reported time for 1000 loops climbs steadily from 0.047000 to 4.179000 seconds, and the dump is cut off partway through the final "Result is" line ...]
1.62323284]
Used the cpu
Looping 1000 times took 4.194000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.210000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.210000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.226000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.241000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.241000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.257000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.273000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.273000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.288000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.304000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.304000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.319000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.335000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.335000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.351000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.366000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.366000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.382000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.398000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.398000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.413000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.413000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.429000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.444000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.444000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.460000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.476000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.476000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.507000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.523000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.538000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.554000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.569000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.569000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.585000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.601000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.601000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.616000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.632000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.632000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.648000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.663000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.663000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.679000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.694000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.694000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.710000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.710000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.726000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.741000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.741000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.757000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.773000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.773000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.788000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.804000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.804000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.819000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.835000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.835000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.851000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.866000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.866000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.882000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.898000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.913000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.913000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.929000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.944000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.944000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.960000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 4.976000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.007000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.038000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.054000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.069000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.085000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.085000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.101000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.116000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.116000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.132000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.148000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.148000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.163000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.179000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.179000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.194000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.210000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.210000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.226000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.241000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.241000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.257000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.273000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.273000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.288000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.304000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.319000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.319000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.335000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.351000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.351000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.366000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.382000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.382000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.398000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.413000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.413000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.429000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.444000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.444000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.460000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.476000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.476000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.491000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.507000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.507000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.523000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.538000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.538000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.554000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.569000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.569000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.585000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.601000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.616000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.616000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.632000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.648000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.648000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.663000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.679000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.679000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.694000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.710000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.710000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.726000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.741000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.757000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.757000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.773000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.773000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.788000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.804000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.804000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.819000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.835000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.835000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.851000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.866000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.866000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.882000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.898000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.898000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.913000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.913000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.929000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.944000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.944000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.960000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.976000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.976000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 5.991000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.007000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.007000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.023000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.038000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.054000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.054000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.069000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.085000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.085000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.101000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.116000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.116000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.132000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.148000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.148000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.163000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.163000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.179000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.194000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.194000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.210000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.226000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.226000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.241000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.257000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.257000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.273000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.288000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.288000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.304000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.319000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.319000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.335000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.335000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.351000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.366000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.366000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.382000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.398000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.398000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.413000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.429000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.429000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.444000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.460000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.460000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.476000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.491000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.491000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.507000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.523000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.523000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.538000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.554000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.554000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.569000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.585000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.585000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.601000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.616000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.616000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.632000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.648000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.648000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.663000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.663000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.679000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.694000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.694000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.710000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.726000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.726000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.741000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.741000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.757000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.773000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.773000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.788000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.804000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.804000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.819000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.835000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.851000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.866000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.882000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.898000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.898000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.913000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.929000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.944000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.944000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.960000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.976000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.991000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 6.991000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.007000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.023000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.023000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.038000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.054000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.054000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.069000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.085000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.085000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.101000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.116000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.132000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.132000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.148000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.163000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.163000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.179000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.194000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.194000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.210000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.226000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.226000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.241000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.257000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.257000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.273000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.290000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.293000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.309000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.309000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.324000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.340000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.340000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.356000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.371000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.371000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.387000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.402000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.402000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.418000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.434000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.434000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.449000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.465000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.465000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.481000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.496000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.496000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.512000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.527000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.527000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.543000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.559000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.559000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.574000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.574000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.590000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.606000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.606000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.621000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.637000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.637000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.652000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.668000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.668000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.684000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.699000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.699000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.715000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.731000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.731000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.746000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.746000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.762000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.777000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.777000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.793000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.809000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.809000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.824000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.840000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.840000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.856000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.856000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.871000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.887000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.887000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.902000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.918000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.918000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.934000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.949000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.949000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.965000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.981000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.981000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.996000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 7.996000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 8.012000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 8.027000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 8.027000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
1.62323284]
Used the cpu
Looping 1000 times took 8.043000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
 1.62323284]
Used the cpu

[... the same three-line block repeats for the remaining runs: every run prints "Used the cpu" with the identical result vector, while the reported loop time climbs steadily from 8.043000 up to 10.437000 seconds ...]

Looping 1000 times took 10.437000 seconds
Result is [ 1.23178029 1.61879337 1.52278066 ..., 2.20771813 2.29967761
 1.62323284]
Used the cpu
Process finished with exit code 0
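
For context: the "Looping ... / Result is ... / Used the cpu" lines match the output of Theano's standard GPU-test script from the documentation ("Testing Theano with GPU"). Below is a minimal sketch of that script, lightly adapted (unused import dropped, loop variable renamed), assuming cpu_gpu_test.py is based on the same template; the repeated reporting in the log above suggests the gist's version re-prints the summary in an outer loop, which is not reproduced here.

# Minimal sketch of Theano's documented GPU-test script (lightly adapted);
# cpu_gpu_test.py is assumed to follow this template.
from theano import function, config, shared
import theano.tensor as T
import numpy
import time

vlen = 10 * 30 * 768  # 10 x #cores x # threads per core
iters = 1000

rng = numpy.random.RandomState(22)
x = shared(numpy.asarray(rng.rand(vlen), config.floatX))
f = function([], T.exp(x))

t0 = time.time()
for i in range(iters):
    r = f()
t1 = time.time()

print("Looping %d times took %f seconds" % (iters, t1 - t0))
print("Result is %s" % (r,))
# If any op left in the compiled graph is a plain (CPU) Elemwise,
# the computation was not moved to the GPU.
if numpy.any([isinstance(node.op, T.Elemwise)
              for node in f.maker.fgraph.toposort()]):
    print('Used the cpu')
else:
    print('Used the gpu')

When the device=gpu flag takes effect, the elementwise exp is replaced by a GpuElemwise op and the script prints "Used the gpu"; the log above shows every run falling back to "Used the cpu", which is the problem this gist documents despite the gpu flag being set.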