Skip to content

Instantly share code, notes, and snippets.

View semyont's full-sized avatar

Semyon semyont

View GitHub Profile
@semyont
semyont / gevent_concurrency_redis.py
Created April 12, 2017 20:53
gevent-based concurrency for redis-py
import logging
# Root logger setup: records are emitted at INFO and above, formatted as
# "<HH:MM:SS>,<milliseconds> (<calling function>) <message>".
log = logging.getLogger()
logging.basicConfig(
    datefmt='%H:%M:%S',
    format='%(asctime)s,%(msecs)05.1f (%(funcName)s) %(message)s',
)
log.setLevel(logging.INFO)
import threading
import os
import time
@semyont
semyont / csv_pandas_stream_elastic_upsert.py
Last active July 20, 2023 16:13
large timeseries csv streaming upsert bulk elasticsearch #index #pandas #bigdata #csv #upsert #elasticsearch #progressbar #example #bulk #stream #dataops #dataengineer #timeseries
import logging
import hashlib
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from tqdm import tqdm
class Storage:
@semyont
semyont / regex.py
Created August 3, 2016 10:48
Regex for extracting log data
from pyspark.sql.functions import split, regexp_extract
# Split each raw access-log line (the 'value' column of base_df) into
# structured columns. Every regexp_extract call returns capture group 1 of
# its pattern.
split_df = base_df.select(
    # Host: the first whitespace-delimited token. The delimiter is still
    # required but sits outside the capture group, so 'host' carries no
    # trailing whitespace.
    regexp_extract('value', r'^(\S+)\s', 1).alias('host'),
    # Timestamp: "dd/Mon/yyyy:HH:MM:SS ±zzzz" inside square brackets. Both
    # offset signs are accepted so lines logged east of UTC also match.
    regexp_extract('value', r'^.*\[(\d\d/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} [+-]\d{4})\]', 1).alias('timestamp'),
    # Path: the token between the request method and "HTTP" in the quoted
    # request string.
    regexp_extract('value', r'^.*"\w+\s+([^\s]+)\s+HTTP.*"', 1).alias('path'),
    # Status: the first token after the last double quote, cast to integer.
    regexp_extract('value', r'^.*"\s+([^\s]+)', 1).cast('integer').alias('status'),
    # Content size: the trailing run of digits on the line, cast to integer.
    # NOTE(review): lines ending in a non-numeric size (e.g. "-") will not
    # match this pattern — confirm how downstream code treats those rows.
    regexp_extract('value', r'^.*\s+(\d+)$', 1).cast('integer').alias('content_size'),
)
# Display the parsed rows without truncating long column values.
split_df.show(truncate=False)
@semyont
semyont / 0_reuse_code.js
Created December 30, 2015 13:40
Here are some things you can do with Gists in GistBox.
// Use Gists to store code snippets you would like to remember later on.
console.log(window); // Log the "window" object to the console.