Skip to content

Instantly share code, notes, and snippets.

View vadimkantorov's full-sized avatar
💭
looking for an internship for summer/fall 2021

Vadim Kantorov vadimkantorov

💭
looking for an internship for summer/fall 2021
View GitHub Profile
@vadimkantorov
vadimkantorov / yaml_loads.js
Created March 13, 2025 11:24
JavaScript function for parsing simple YAML (supports only strings, lists, dicts)
// based on simplified version of Python snippet: https://gist.github.com/vadimkantorov/b26eda3645edb13feaa62b874a3e7f6f
function yaml_loads(frontamtter_str)
{
const procval = s => (s.length >= 2 && s[0] == '"' && s[s.length - 1] == '"') ? s.slice(1, s.length - 1) : (s.length >= 2 && s[0] == "'" && s[s.length - 1] == "'") ? s.slice(1, s.length - 1) : s;
for(const line of frontmatter_str.split('\n'))
{
const line_strip = line.trim();
const is_list_item = line_strip.startsWith('- ');
@vadimkantorov
vadimkantorov / svgdataurify.js
Created February 22, 2025 17:06
Conversion of SVG to data-uri format with prefix data:image/svg+xml - a primer in JavaScript
// based on https://github.com/tigt/mini-svg-data-uri/issues/24
// Usage: cat myicon.svg | node svgdataurify.js
let svg = "";
process.stdin.on("data", (chunk) => { svg += chunk; });
process.stdin.on("end", async () =>
{
const reWhitespace = /\s+/g, reUrlHexPairs = /%[\dA-F]{2}/g, hexDecode = {'%20': ' ', '%3D': '=', '%3A': ':', '%2F': '/'}, specialHexDecode = match => hexDecode[match] || match.toLowerCase();
if(svg.charCodeAt(0) === 0xfeff) svg = svg.slice(1);
svg = svg.trim().replace(reWhitespace, ' ').replaceAll('"', '\'');
@vadimkantorov
vadimkantorov / wslv1nodeinstall.sh
Last active February 13, 2025 20:00
Proper command for installing node/npm on WSLv1 Ubuntu
# Installs Node.js on Ubuntu under WSLv1 in two steps.
# from https://github.com/microsoft/WSL/issues/8151#issuecomment-2276363014
# Step 1: download the NodeSource setup script for the 22.x line and pipe it
# to bash running under sudo (-E keeps the caller's environment).
# NOTE(review): presumably this registers the NodeSource apt repository - confirm.
curl -fsSL https://deb.nodesource.com/setup_22.x | sudo -E bash -
# Step 2: install the nodejs package, answering yes to apt prompts (-y).
sudo apt-get install -y nodejs
@vadimkantorov
vadimkantorov / prependfrontmatter.sh
Created February 10, 2025 14:32
Sed script to prepend a Jekyll/Liquid front matter to a file
# Defines an alias that edits the given file in place (sed -i) and inserts
# an empty front matter block (two lines, each "---") before line 1.
# Usage:
# prependfrontmatter ./index.html
# NOTE(review): assumes GNU sed (one-line `i` form, `\n` in the inserted text,
# and -i without a backup suffix) - confirm on BSD/macOS.
# NOTE(review): the `1i` address presumably never matches an empty file,
# so nothing would be inserted there - confirm.
alias prependfrontmatter="sed -i '1i---\n---'"
# https://unix.stackexchange.com/questions/99350/how-to-insert-text-before-the-first-line-of-a-file
@vadimkantorov
vadimkantorov / citygeocoder.py
Last active February 12, 2025 00:12
Queries the WikiData SPARQL endpoint for the GPS coordinates of the world's 5000 most populous cities
# python citygeocoder.py > '~citygeocoder.json'
# https://www.wikidata.org/wiki/Wikidata:SPARQL_tutorial/en
# https://github.com/OSMNames/OSMNames, http://github.com/OSMNames/OSMNames/issues/208
# https://osmnames.org/download/
# https://stackoverflow.com/questions/74261733/how-to-fetch-gps-coordinates-of-worlds-largest-cities-from-wikidata-via-sparql
# FIXME: for some reason misses Helsinki
import sys
import json
import urllib.parse
@vadimkantorov
vadimkantorov / uneml.py
Created January 21, 2025 13:37
Extract all attachments from "*.eml" email files
# Usage: to extract all eml files in current directory into the current directory: python uneml.py *.eml
import os
import sys
import email
import email.policy
for input_path in sys.argv[1:]:
print('eml', repr(input_path))
eml = email.message_from_file(open(input_path), policy = email.policy.default)
for part in eml.walk():
@vadimkantorov
vadimkantorov / wpoptionimpex.sh
Last active January 21, 2025 12:28
Primer of batch-exporting and batch-importing options from WordPress using wp-cli
# https://github.com/wp-cli/entity-command/issues/512
# https://developer.wordpress.org/cli/commands/option/list/
# https://developer.wordpress.org/cli/commands/option/update/
# export options to a pretty-formatted JSON file
wp option list --format=json | jq '.' > wpoptionimpex.json
# [
# {
# "option_name": "name1",
# "option_value": "value1"
@vadimkantorov
vadimkantorov / basename.c
Created September 12, 2024 20:13
Demo of strrchr C function
// https://en.cppreference.com/w/c/string/byte/strrchr
#include <stdio.h>
#include <string.h>
int main(int argc, char* argv[])
{
if(argc < 2)
return -1;
@vadimkantorov
vadimkantorov / bpedetokenize.py
Last active September 15, 2024 20:40
Looks up variable-length UTF-8 byte tokens from a vocab and concatenates them in a pure-PyTorch, vectorized way without loops
# works only with a single, non-batched tensor of token ids
import torch
def bpedetokenize_loop(token_ids, token_utf8bytes, token_lens):
inds = torch.cat((torch.zeros_like(token_lens[:1]), token_lens.cumsum(-1)))
return torch.cat([token_utf8bytes[inds[i]:inds[i_p_1]] for i, i_p_1 in zip(token_ids, token_ids + 1)])
def bpedetokenize_vec(token_ids, token_utf8bytes, token_lens):
inds_begin = torch.cat((torch.zeros_like(token_lens[:1]), token_lens[:-1].cumsum(-1)))
@vadimkantorov
vadimkantorov / captcha.py
Last active September 8, 2024 09:59
Example of running the captcha breaker from https://huggingface.co/spaces/docparser/Text_Captcha_breaker without PyTorch as a dependency
# before running download captcha.onnx from https://huggingface.co/spaces/docparser/Text_Captcha_breaker
# python -m pip install numpy pillow onnxruntime --user --break-system-packages
import argparse
import PIL.Image
import numpy
import onnxruntime
parser = argparse.ArgumentParser()
parser.add_argument('--model-path', default = 'captcha.onnx')