alexeyev’s gists

alexeyev / simple_text_classification_distilbert.py

Created July 30, 2023 07:50

Binary classification with DistilBERT, minimal example

	import evaluate
	import numpy as np
	from datasets import load_dataset
	from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
	from transformers import AutoTokenizer
	from transformers import DataCollatorWithPadding

	id2label = {0: "NEGATIVE", 1: "POSITIVE"}
	label2id = {"NEGATIVE": 0, "POSITIVE": 1}

alexeyev / apertium_tokenizer.py

Last active July 26, 2023 17:41

Apertium-Kir-Based Tokenizer

	# coding: utf-8
	"""
	Tokenization as it is done in Apertium; may not be blazing fast,
	since a full-scale morphological analysis is carried out
	"""

	import apertium
	import re
	from typing import List, Tuple
	from streamparser import LexicalUnit, reading_to_string

alexeyev / LICENSE

Last active July 21, 2023 14:20

Converting a Doccano NER task export (JSONL file) to a CoNLL-2003-formatted file

	MIT License

	Copyright (c) 2023 Anton Alekseev

	Permission is hereby granted, free of charge, to any person obtaining a copy
	of this software and associated documentation files (the "Software"), to deal
	in the Software without restriction, including without limitation the rights
	to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	copies of the Software, and to permit persons to whom the Software is
	furnished to do so, subject to the following conditions:

alexeyev / sequence_to_sentences.py

Created October 16, 2022 08:43

	# coding: utf-8

	sentences = []

	for line in open("test.txt", "r", encoding="utf-8").readlines()[1:]:

	seq = line.strip().split(" ")

	if len(seq) == 1:
	sentences.append([])

alexeyev / onnx2pytorch.py

Created September 30, 2021 23:18 — forked from qinjian623/onnx2pytorch.py

ONNX file to Pytorch model

alexeyev / raspberry_pi_camera_telegram_bot.py

Last active August 31, 2021 18:13

Считаем ворон с помощью Raspberry Pi и Telegram API

	# coding: utf-8

	import configparser
	import logging
	import telebot
	from time import sleep
	from picamera import PiCamera

	logger = logging.getLogger("counting-crops")
	logger.setLevel(logging.DEBUG)

alexeyev / hogweed_photos_collector_bot.py

Created August 31, 2021 12:12

Telegram bot saving (hogweed) photos on disk

	# coding: utf-8

	import configparser
	import logging

	import telebot

	logger = logging.getLogger("hogweed-ground-level")
	logger.setLevel(logging.DEBUG)

alexeyev / gsdmm_example.py

Created April 17, 2021 16:30

	from functools import lru_cache

	from nltk import TweetTokenizer, WordNetLemmatizer
	from tqdm import tqdm
	from gsdmm import MovieGroupProcess
	from sklearn.datasets import fetch_20newsgroups
	from nltk.corpus import stopwords

	import pickle
	import nltk

alexeyev / texts_similarity_difflib.py

Created November 27, 2020 12:29

	# coding: utf-8

	from difflib import SequenceMatcher

	t0 = open("text0.txt", "r+").read().strip().replace("\n", " ").replace(" ", " ")
	t1 = open("text1.txt", "r+").read().strip().replace("\n", " ").replace(" ", " ")

	matcher = SequenceMatcher(a=t0, b=t1)
	ratio = matcher.ratio()
	mbs = matcher.get_matching_blocks()

alexeyev / gtranslate_selenium.py

Created September 20, 2020 17:28

	#!/usr/bin/env python3
	"""
	We do not recommend using this script for any purpose other than learning to use Selenium.
	For batched machine translation via Google Translate, the 'document' translation feature
	is arguably the most suitable option; for regular translations one should use the Cloud API.
	"""

	import time

	from selenium.common.exceptions import TimeoutException

Anton Alekseev alexeyev