Skip to content

Instantly share code, notes, and snippets.

View vadimkantorov's full-sized avatar
💭
looking for an internship for summer/fall 2021

Vadim Kantorov vadimkantorov

💭
looking for an internship for summer/fall 2021
View GitHub Profile
@vadimkantorov
vadimkantorov / wpoptionimpex.sh
Last active November 6, 2024 11:54
Primer of batch-exporting and batch-importing options from WordPress using wp-cli
# https://github.com/wp-cli/entity-command/issues/512
# https://developer.wordpress.org/cli/commands/option/list/
# https://developer.wordpress.org/cli/commands/option/update/
# export options to a pretty-formatted JSON file
wp option list --format=json | jq '.' > wpoptionimpex.json
# [
# {
# "option_name": "name1",
# "option_value": "value1"
@vadimkantorov
vadimkantorov / basename.c
Created September 12, 2024 20:13
Demo of strrchr C function
// https://en.cppreference.com/w/c/string/byte/strrchr
#include <stdio.h>
#include <string.h>
int main(int argc, char* argv[])
{
if(argc < 2)
return -1;
@vadimkantorov
vadimkantorov / bpedetokenize.py
Last active September 15, 2024 20:40
Looks up variable-length UTF-8 byte tokens in a vocab and concatenates them in a pure-PyTorch, vectorized way without loops
# works only with a single, non-batched tensor of token ids
import torch
def bpedetokenize_loop(token_ids, token_utf8bytes, token_lens):
    """Concatenate the UTF-8 byte spans of the given tokens (reference loop version).

    Args:
        token_ids: 1-D integer tensor of token indices (single, non-batched sequence).
        token_utf8bytes: 1-D tensor holding the bytes of all vocab tokens back to back.
        token_lens: 1-D integer tensor; ``token_lens[k]`` is the byte length of token ``k``.

    Returns:
        A 1-D tensor (same dtype as ``token_utf8bytes``) with the bytes of every
        requested token in order; empty when ``token_ids`` is empty.
    """
    # inds[k] is where token k starts inside token_utf8bytes, inds[k + 1] is where it ends.
    inds = torch.cat((torch.zeros_like(token_lens[:1]), token_lens.cumsum(-1)))
    # Convert once to Python ints instead of indexing with 0-d tensors per token.
    offsets = inds.tolist()
    pieces = [token_utf8bytes[offsets[i]:offsets[i + 1]] for i in token_ids.tolist()]
    if not pieces:
        # torch.cat rejects an empty list; an empty id sequence decodes to no bytes.
        return token_utf8bytes[:0]
    return torch.cat(pieces)
def bpedetokenize_vec(token_ids, token_utf8bytes, token_lens):
inds_begin = torch.cat((torch.zeros_like(token_lens[:1]), token_lens[:-1].cumsum(-1)))
@vadimkantorov
vadimkantorov / captcha.py
Last active September 8, 2024 09:59
Example of running the captcha breaker from https://huggingface.co/spaces/docparser/Text_Captcha_breaker without PyTorch as a dependency
# before running download captcha.onnx from https://huggingface.co/spaces/docparser/Text_Captcha_breaker
# python -m pip install numpy pillow onnxruntime --user --break-system-packages
import argparse
import PIL.Image
import numpy
import onnxruntime
parser = argparse.ArgumentParser()
parser.add_argument('--model-path', default = 'captcha.onnx')
@vadimkantorov
vadimkantorov / log_file_access_dynamic.c
Last active September 5, 2024 15:23
Trace certain libc and stdio file access function calls / syscalls via LD_PRELOAD-based interception
// gcc -shared -fPIC log_file_access_dynamic.c -o log_file_access_dynamic.so -ldl; LD_PRELOAD=$PWD/log_file_access_dynamic.so /usr/bin/cat log_file_access_dynamic.c
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <dlfcn.h>
#include <sys/stat.h>
@vadimkantorov
vadimkantorov / fmti4.sh
Created August 30, 2024 10:54
Format integer string as bytes
# https://stackoverflow.com/questions/78931597/binary-integer-representation-printing-in-linux-shell?noredirect=1#comment139167055_78931597
alias fmti4struct="python -c 'import sys,struct;sys.stdout.buffer.write(struct.pack(\"<I\",int(sys.argv[1])))'"
alias fmti4tobytes="python -c 'import sys,struct;sys.stdout.buffer.write(int(sys.argv[1]).to_bytes(4,\"little\"))'"
# fmti4struct 2123 > foo.txt
# xxd foo.txt
## 00000000: 4b08 0000 K...
@vadimkantorov
vadimkantorov / texliveiso.py
Last active August 15, 2024 18:43
TexLive ISO file listing and extraction (Joliet)
# the texlive iso urls become unavailable as texlive replaces the releases
# this script also prints dd commands for extracting a file by offset
# docs at https://github.com/clalancette/pycdlib and https://clalancette.github.io/pycdlib/example-extracting-data-from-iso.html
# Usage:
# wget https://tug.ctan.org/systems/texlive/Images/texlive2024-20240312.iso
# python texliveiso.py texlive2024-20240312.iso > texliveiso.txt
import sys, io
import pycdlib
import re, unicodedata
def slugify(s, mode = '', space = '_', lower = True):
# https://jekyllrb.com/docs/liquid/filters/#slugify
# regex from https://github.com/Flet/github-slugger, see https://github.com/github/cmark-gfm/issues/361
regex_bad_chars = r'[\0-\x1F!-,\.\/:-@\[-\^`\{-\xA9\xAB-\xB4\xB6-\xB9\xBB-\xBF\xD7\xF7\u02C2-\u02C5\u02D2-\u02DF\u02E5-\u02EB\u02ED\u02EF-\u02FF\u0375\u0378\u0379\u037E\u0380-\u0385\u0387\u038B\u038D\u03A2\u03F6\u0482\u0530\u0557\u0558\u055A-\u055F\u0589-\u0590\u05BE\u05C0\u05C3\u05C6\u05C8-\u05CF\u05EB-\u05EE\u05F3-\u060F\u061B-\u061F\u066A-\u066D\u06D4\u06DD\u06DE\u06E9\u06FD\u06FE\u0700-\u070F\u074B\u074C\u07B2-\u07BF\u07F6-\u07F9\u07FB\u07FC\u07FE\u07FF\u082E-\u083F\u085C-\u085F\u086B-\u089F\u08B5\u08C8-\u08D2\u08E2\u0964\u0965\u0970\u0984\u098D\u098E\u0991\u0992\u09A9\u09B1\u09B3-\u09B5\u09BA\u09BB\u09C5\u09C6\u09C9\u09CA\u09CF-\u09D6\u09D8-\u09DB\u09DE\u09E4\u09E5\u09F2-\u09FB\u09FD\u09FF\u0A00\u0A04\u0A0B-\u0A0E\u0A11\u0A12\u0A29\u0A31\u0A34\u0A37\u
@vadimkantorov
vadimkantorov / cat.c
Last active July 1, 2024 16:42
Example cat program
// cc cat.c -o cat && ./cat cat.c
#include <stdio.h>
int main(int argc, char* argv[])
{
char buf[1024];
if(argc < 2) return 2;
FILE* f = fopen(argv[1], "r");
if(!f) return 1;