Skip to content

Instantly share code, notes, and snippets.

@alexeyev
alexeyev / gist:8073e9a8d2854af79800
Created May 29, 2015 22:08
Beautiful two-communities-graph layout
import igraph
# Number of communities the graph is split into.
clusters_const = 2
# Load an undirected graph from an NCOL-format file, keeping vertex names.
graph = igraph.read("2_big_graph_clusters_ncol.txt", format="ncol", directed=False, names=True)
# Edge-betweenness community detection. The constant is reused here so the
# requested cluster count always agrees with the cut taken on the next line.
clusters = graph.community_edge_betweenness(clusters=clusters_const, directed=False)
# Membership of the clustering obtained by cutting into clusters_const groups.
splitter = clusters.as_clustering(clusters_const).membership
vs = igraph.VertexSeq(graph)
@alexeyev
alexeyev / SparkNaiveBayes20Newsgroup.java
Created November 13, 2015 18:28
Spark Naive Bayes 20_newsgroup classification MWE
package ru.stachek66.mwe.ml.spark;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.classification.NaiveBayes;
import org.apache.spark.mllib.classification.NaiveBayesModel;
import org.apache.spark.mllib.feature.HashingTF;
import org.apache.spark.mllib.feature.IDF;
@alexeyev
alexeyev / file_voting.py
Created November 14, 2015 12:33
Dumb voting w/o correlation estimation
import os
import collections
import math
map = {i: [] for i in xrange(50000)}
for file in os.listdir("/home/alexeyev/projects/bee/"):
# files filter
if file.startswith("sol_0.7"):
@alexeyev
alexeyev / museums_ner.py
Last active June 6, 2017 12:38
Извлечение именованных сущностей из текстов, уложенных в .docx
# coding: utf-8
"""
Script for extracting people's names from a collection of texts about museums.
"""
import os
from docx import Document
from natasha import Combinator
from natasha.grammars import Person
@alexeyev
alexeyev / simple_plagiarism_check.py
Created July 2, 2018 11:55
Comparing texts as sequences
import difflib
# Read both texts completely. The context managers close each handle as soon
# as its content is in memory instead of leaving it open for the whole run.
with open("1.txt", "r") as file_one:
    text_one = file_one.read()
with open("2.txt", "r") as file_two:
    text_two = file_two.read()
# Compare the two texts as character sequences, with no junk filter.
sm = difflib.SequenceMatcher(isjunk=None, a=text_one, b=text_two)
# Matching blocks between the two texts; each one exposes a .size attribute.
mbs = sm.get_matching_blocks()
for mb in mbs:
if mb.size > 10:
# coding: utf-8
"""
The MIT License (MIT)
Copyright (c) 2020 Anton Alekseev
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# import autosklearn
# from autosklearn.classification import AutoSklearnClassifier
# # print("Available CLASSIFICATION metrics autosklearn.metrics.*:")
# # print("\t*" + "\n\t*".join(autosklearn.metrics.CLASSIFICATION_METRICS))
# print("Y labels:", set(y))
#
# automl = autosklearn.classification.AutoSklearnClassifier(
# n_jobs=3,
# ensemble_size=1,
#!/usr/bin/env bash
# Loop counter; echoed below as the company number being processed.
i=1
# Inclusive upper bound for the counter in the while loop.
END=100000
# Parent directory that receives one sub-directory per counter value.
mkdir export
while [ $i -le $END ]; do
echo "Dealing with company #"$i
mkdir export/$i
#!/usr/bin/env python3
"""
We do not recommend using this script for any purpose other than learning to use Selenium.
For batched machine translation via Google Translate, the 'document' translation feature
is arguably the most suitable option. For regular translations, one should use the Cloud API.
"""
import time
from selenium.common.exceptions import TimeoutException
# coding: utf-8
from difflib import SequenceMatcher
def _load_flat(path):
    """Return the stripped content of *path* with newlines turned into spaces.

    The file is opened read-only ("r" rather than "r+") because it is never
    written to, and it is closed as soon as its content has been read.
    """
    with open(path, "r") as handle:
        # NOTE(review): as written here the last replace swaps a space for a
        # space; it may originally have collapsed double spaces. Confirm.
        return handle.read().strip().replace("\n", " ").replace(" ", " ")


t0 = _load_flat("text0.txt")
t1 = _load_flat("text1.txt")
# Character-level comparison of the two flattened texts.
matcher = SequenceMatcher(a=t0, b=t1)
# Similarity ratio reported by SequenceMatcher.
ratio = matcher.ratio()
# Matching blocks shared by the two texts.
mbs = matcher.get_matching_blocks()