Skip to content

Instantly share code, notes, and snippets.

@malfet
malfet / gist:23e71fbb909707166bbb8cf24de8b17e
Created May 26, 2021 22:40
Triggering a new pipeline on CircleCI and checking its status
% curl --request POST --url https://circleci.com/api/v2/project/gh/pytorch/pytorch/pipeline --data '{"branch":"pull/59020/head", "parameters": {"run_slow_gradcheck_build": true}}' --header 'content-type: application/json' --header 'Circle-Token: XXXXXX'
{
"number" : 328134,
"state" : "pending",
"id" : "d23f0239-1cd0-4d22-8965-d40f8c7bbd73",
"created_at" : "2021-05-26T22:37:55.955Z"
}
% curl https://circleci.com/api/v2/project/gh/pytorch/pytorch/pipeline/328134
{
"id" : "d23f0239-1cd0-4d22-8965-d40f8c7bbd73",
#!/usr/bin/env python3
# Results of recent runs:
# Mac Apple M1 in 50.3 sec
# Mac Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz in 61.1 sec
# Linux Intel(R) Xeon(R) W-2135 CPU @ 3.70GHz in 53.5 sec
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
#!/usr/bin/env python
import torch
from torch.autograd.profiler import profile as _profile
def workload():
s1 = torch.cuda.Stream(device="cuda")
s2 = torch.cuda.Stream(device="cuda")
with torch.cuda.stream(s1):
#!/usr/bin/env python3
from datetime import datetime
from typing import Any, Dict, List, Optional, Union
from urllib.request import urlopen, Request
import json
import enum
import os
#include <arm_neon.h>
#include <math.h>
#include <stdio.h>
void run_neon_reciproc(float data_in[4], float data_out[4]) {
float32x4_t input = vld1q_f32(data_in);
float32x4_t out = vrecpeq_f32(input);
//out = vmulq_f32(vrecpsq_f32(input, out), out);
//out = vmulq_f32(vrecpsq_f32(input, out), out);
[Inline Frame] torch_cuda.dll!std::_Default_allocator_traits<std::allocator<std::_Tree_node<unsigned int,void *>>>::deallocate(std::allocator<std::_Tree_node<unsigned int,void *>> &) Line 689 C++
[Inline Frame] torch_cuda.dll!std::_Tree_node<unsigned int,void *>::_Freenode0(std::allocator<std::_Tree_node<unsigned int,void *>> &) Line 373 C++
[Inline Frame] torch_cuda.dll!std::_Tree_val<std::_Tree_simple_types<unsigned int>>::_Erase_head(std::allocator<std::_Tree_node<unsigned int,void *>> &) Line 753 C++
[Inline Frame] torch_cuda.dll!std::_Tree<std::_Tset_traits<unsigned int,std::less<unsigned int>,std::allocator<unsigned int>,0>>::{dtor}() Line 1191 C++
> torch_cuda.dll!torch::jit::fuser::newForReduction(torch::jit::fuser::TensorView * tv, const std::vector<unsigned int,std::allocator<unsigned int>> & axes) Line 438 C++
torch_cuda.dll!torch::jit::fuser::reductionOp(torch::jit::fuser::BinaryOpType reduction_op_type, const std::vector<int,std::allocator<int>> & axes, torch::jit::fuser::Val * init, to
// nvcc -o hello hello.cu; ./hello
#include <stdio.h>
// Device kernel that prints a fixed greeting via device-side printf.
// Takes no arguments and returns nothing; its only effect is the output line.
// The launch in main() uses <<<1,1>>> (one block of one thread), so the
// greeting is emitted once per launch.
__global__ void kernel() {
printf("Hello World of CUDA\n");
}
int main() {
kernel<<<1,1>>>();
return cudaDeviceSynchronize();
@malfet
malfet / wrong-vmul-ps.c
Last active September 12, 2020 00:25
GCC masm=intel bug
// gcc -c -Os -mavx512f -masm=intel
#include <immintrin.h>
/*
 * Small AVX-512 test function belonging to the gist titled
 * "GCC masm=intel bug"; compile it with the flags given in the comment
 * above the #include.
 *
 * Loads floats from `con` under a 16-bit lane mask, multiplies every lane by
 * con[1], and returns lane 0 of the product, i.e. con[0] * con[1].
 *
 * con: pointer to the input floats; the masked load reads lanes 0-7 and
 *      con[1] is read again for the broadcast.
 *
 * NOTE(review): the statements are presumably kept in exactly this shape so
 * that the compiler emits the instruction the bug report is about; confirm
 * before refactoring, since a rewrite may change the generated assembly.
 */
float foo(float* con) {
/* Low 8 mask bits set: the zero-masking load fills lanes 0-7 from con and
   zeroes lanes 8-15. */
__mmask16 msk = 0x00ff;
__m512 a = _mm512_maskz_loadu_ps(msk, con);
/* Broadcast con[1] into all 16 lanes. */
__m512 b = _mm512_set1_ps(con[1]);
__m512 c = _mm512_mul_ps(a,b);
/* View the vector as a float array and return its first lane. */
return ((float *)&c)[0];
}
@malfet
malfet / hello.S
Created August 30, 2020 14:15
HelloWorld in x86_64 assembly
# as -o hello.o hello.S ; cc -o hello hello.o -nostdlib
.text
.globl _start
.type _start, @function
_start:
movl $1, %eax # sys_write(
movl $1, %edi # fd = stdout,
movl $.LC0, %esi # buf = LC0,
movl $12, %edx # 12);
syscall