alexeyev’s gists

alexeyev / gist:8073e9a8d2854af79800

Created May 29, 2015 22:08

Beautiful two-communities-graph layout

	import igraph

	# Number of communities to split the graph into. Defined once and reused for
	# both the detection call and the dendrogram cut so the two cannot drift apart.
	clusters_const = 2

	# Load an undirected graph from an "ncol"-format file, keeping vertex names.
	graph = igraph.read("2_big_graph_clusters_ncol.txt", format="ncol", directed=False, names=True)

	# Edge-betweenness community detection, then cut the result into
	# `clusters_const` clusters; `splitter` is the resulting membership list.
	clusters = graph.community_edge_betweenness(clusters=clusters_const, directed=False)
	splitter = clusters.as_clustering(clusters_const).membership
	vs = igraph.VertexSeq(graph)

alexeyev / SparkNaiveBayes20Newsgroup.java

Created November 13, 2015 18:28

Spark Naive Bayes 20_newsgroup classification MWE

	package ru.stachek66.mwe.ml.spark;

	import org.apache.spark.SparkConf;
	import org.apache.spark.SparkContext;
	import org.apache.spark.api.java.JavaPairRDD;
	import org.apache.spark.api.java.JavaRDD;
	import org.apache.spark.mllib.classification.NaiveBayes;
	import org.apache.spark.mllib.classification.NaiveBayesModel;
	import org.apache.spark.mllib.feature.HashingTF;
	import org.apache.spark.mllib.feature.IDF;

alexeyev / file_voting.py

Created November 14, 2015 12:33

Dumb voting w/o correlation estimation


	import os
	import collections
	import math

	map = {i: [] for i in xrange(50000)}

	for file in os.listdir("/home/alexeyev/projects/bee/"):
	# files filter
	if file.startswith("sol_0.7"):

alexeyev / museums_ner.py

Last active June 6, 2017 12:38

Извлечение именованных сущностей из текстов, уложенных в .docx

	# coding: utf-8
	"""
	Script for extracting people's names from a collection of texts about museums
	"""

	import os

	from docx import Document
	from natasha import Combinator
	from natasha.grammars import Person

alexeyev / simple_plagiarism_check.py

Created July 2, 2018 11:55

Comparing texts as sequences

	import difflib

	# Read both texts; the context managers close each file as soon as it is read
	# instead of leaving the handles open for the lifetime of the script.
	with open("1.txt", "r") as first_file:
		text_one = first_file.read()
	with open("2.txt", "r") as second_file:
		text_two = second_file.read()

	# Compare the two texts as character sequences (no junk heuristic) and
	# collect the matching blocks between them.
	sm = difflib.SequenceMatcher(isjunk=None, a=text_one, b=text_two)
	mbs = sm.get_matching_blocks()

	for mb in mbs:
	if mb.size > 10:

alexeyev / dependency_graph_draw.py

Created February 5, 2020 15:11

	# coding: utf-8
	"""
	The MIT License (MIT)

	Copyright (c) 2020 Anton Alekseev

	Permission is hereby granted, free of charge, to any person obtaining a copy
	of this software and associated documentation files (the "Software"), to deal
	in the Software without restriction, including without limitation the rights
	to use, copy, modify, merge, publish, distribute, sublicense, and/or sell

alexeyev / autosklearn_stuff.py

Created April 22, 2020 12:36


	# import autosklearn
	# from autosklearn.classification import AutoSklearnClassifier
	# # print("Available CLASSIFICATION metrics autosklearn.metrics.*:")
	# # print("\t" + "\n\t".join(autosklearn.metrics.CLASSIFICATION_METRICS))
	# print("Y labels:", set(y))
	#
	# automl = autosklearn.classification.AutoSklearnClassifier(
	# n_jobs=3,
	# ensemble_size=1,

alexeyev / ruexport.sh

Created May 3, 2020 18:57

alexeyev / gtranslate_selenium.py

Created September 20, 2020 17:28

	#!/usr/bin/env python3
	"""
	We do not recommend using this script for any purpose other than learning to use Selenium.
	For batched machine translation via Google Translate, the 'document' translation feature
	is arguably the most suitable option. For regular translations one should use the Cloud API.
	"""

	import time

	from selenium.common.exceptions import TimeoutException

alexeyev / texts_similarity_difflib.py

Created November 27, 2020 12:29

	# coding: utf-8

	from difflib import SequenceMatcher

	# Load each text flattened to a single line. The files are only read, so they
	# are opened read-only (the previous "r+" mode needlessly required write
	# permission) and closed right after reading.
	# NOTE(review): the final replace() maps a space to a space as shown here,
	# which is a no-op; it may originally have targeted a double space or a
	# non-breaking space -- confirm against the original script.
	with open("text0.txt", "r") as f0:
		t0 = f0.read().strip().replace("\n", " ").replace(" ", " ")
	with open("text1.txt", "r") as f1:
		t1 = f1.read().strip().replace("\n", " ").replace(" ", " ")

	# Character-level similarity ratio plus the list of matching blocks.
	matcher = SequenceMatcher(a=t0, b=t1)
	ratio = matcher.ratio()
	mbs = matcher.get_matching_blocks()

Anton Alekseev alexeyev