This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%define ARCH_X86_64 1 | |
%define private_prefix asm | |
%include "x86inc.asm" | |
section .rodata align=16 | |
zeros_vec: dd 0, 0, 0, 0 | |
ones_vec: dd 1, 1, 1, 1 | |
neg_ones_vec: dd -1, -1, -1, -1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
void fg_blend_stripe_sse4(int16_t *dstSampleOffsetY, int16_t *srcSampleOffsetY, int32_t *grainStripe, uint32_t widthComp, uint32_t blockHeight, uint8_t bitDepth) | |
{ | |
uint32_t k, l; | |
// Prepare SIMD SSE4 ov_clip_uintp2 | |
__m128i mask = _mm_set1_epi32((1 << bitDepth)); | |
__m128i not_mask = _mm_xor_si128(mask, mask); | |
not_mask = _mm_sub_epi32(not_mask, mask); | |
mask = _mm_sub_epi32(mask, _mm_set1_epi32(1)); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
int16_t fg_compute_block_avg_sse4(int16_t *dstSampleBlk8, uint32_t widthComp, uint16_t *pNumSamples, | |
uint8_t ySize, uint8_t xSize, uint8_t bitDepth) | |
{ | |
uint16_t blockAvg = 0; | |
uint16_t numSamples = 0; | |
__m128i acc = _mm_setzero_si128(); | |
for (int i = 0; i < ySize; i+=1, numSamples+=8) | |
{ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
int16_t fg_compute_block_avg(int16_t *dstSampleBlk8, uint32_t widthComp, uint16_t *pNumSamples, | |
uint8_t ySize, uint8_t xSize, uint8_t bitDepth) | |
{ | |
uint32_t blockAvg = 0; | |
uint16_t numSamples = 0; | |
uint8_t k, l; | |
for (k = 0; k < ySize; k++) | |
{ | |
for (l = 0; l < xSize; l++) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Loop of 16x16 blocks */ | |
for (y = 0; y < heightComp[compCtr]; y += 16) | |
{ | |
... | |
for (x = 0; x < widthComp[compCtr]; x += 16) | |
{ | |
... | |
for (blkId = 0; blkId < 4; blkId++) | |
{ | |
yOffset8x8 = (blkId >> 1) * 8; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# To run | |
# mkdir build && cd build | |
# cmake .. | |
# make -j && ./bank conflict <offset> <is_debug> | |
cmake_minimum_required(VERSION 3.0) | |
set(CMAKE_CXX_FLAGS "-O3 -std=c++14") | |
set(CUDA_NVCC_FLAGS -arch=compute_52 -code=sm_75) |
- Problem: Some blocks are scheduled later than others, which implies that we won't have the "true max value" at the time we need it.
- Direction: We should find a way to wait for all threads of all blocks to finish
- Solution:
-
- Split into 2 kernels
-
-
- Use cooperative groups: https://numba.readthedocs.io/en/stable/cuda/cooperative_groups.html
- Matmul benchmark of group ordering vs. row-major ordering on A100 => No significant improvement over row-major ordering
matmul-performance:
M group_ordering row_major_ordering
0 256.0 3.640889 3.640889
1 384.0 11.059200 12.288000
2 512.0 23.831273 23.831273
3 640.0 39.384616 39.384616
4 768.0 58.982401 58.982401
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
from tqdm import tqdm | |
from transformers import AutoModelForCausalLM, AutoTokenizer | |
from datasets import load_dataset | |
# Model | |
device = "cpu" | |
device_map = { |