Hakan İlter (hakanilter)

@hakanilter
hakanilter / wikipedia_sql_to_parquet.py
Created March 6, 2019 08:52
Convert Wikipedia Category SQL File to Parquet Files
from pyspark.sql import SparkSession
# init spark
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("anaconda") \
    .config("spark.sql.warehouse.dir", "file:///tmp/spark-warehouse") \
    .enableHiveSupport() \
    .getOrCreate()
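The preview stops after the session setup, before the conversion itself. Below is a minimal sketch of the general approach, assuming the standard MediaWiki category dump format, where each data line is a multi-row INSERT INTO ... VALUES (...),(...); statement with columns (cat_id, cat_title, cat_pages, cat_subcats, cat_files). The file path and regex are illustrative, not the gist's actual code.

import re

# Hypothetical input path; the real gist reads a Wikipedia category SQL dump
sql_file = "file:///tmp/enwiki-latest-category.sql"

# Keep only the INSERT statements that carry the data
inserts = spark.read.text(sql_file) \
    .filter("value LIKE 'INSERT INTO%'")

# Split each statement into individual value tuples: (id,'title',pages,subcats,files)
tuple_pattern = re.compile(r"\((\d+),'((?:[^'\\]|\\.)*)',(\d+),(\d+),(\d+)\)")

def parse_line(line):
    for m in tuple_pattern.finditer(line):
        yield (int(m.group(1)), m.group(2), int(m.group(3)), int(m.group(4)), int(m.group(5)))

categories = inserts.rdd.flatMap(lambda row: parse_line(row.value)) \
    .toDF(["cat_id", "cat_title", "cat_pages", "cat_subcats", "cat_files"])

categories.write.mode("overwrite").parquet("/tmp/wikipedia/category.parquet")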
@hakanilter
hakanilter / wikipedia_category_to_es.py
Last active March 7, 2019 17:23
Saving Wikipedia Categories in Elasticsearch using PySpark
# Download required library
#cd /opt/conda/lib/python3.6/site-packages/pyspark-2.4.0-py3.6.egg/pyspark/jars/
#wget http://central.maven.org/maven2/org/elasticsearch/elasticsearch-spark-20_2.11/6.6.1/elasticsearch-spark-20_2.11-6.6.1.jar
#ls -l *elastic*
# Initialize Spark
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .master("local[*]") \
    .getOrCreate()
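With the elasticsearch-spark jar on the classpath, the write itself typically goes through the org.elasticsearch.spark.sql data source. A minimal sketch follows; the host, index name, and the `categories` DataFrame are assumptions, not the gist's actual values.

# Assumes a DataFrame `categories` built from the Wikipedia dump
categories.write \
    .format("org.elasticsearch.spark.sql") \
    .option("es.nodes", "localhost") \
    .option("es.port", "9200") \
    .option("es.resource", "wikipedia/category") \
    .mode("overwrite") \
    .save()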
@hakanilter
hakanilter / poor_mans_text_clustering.py
Last active April 5, 2019 09:30
Poor Man's text clustering using cosine similarity
from scipy import spatial

# Pairwise cosine distances between all message embeddings
distances = spatial.distance.squareform(spatial.distance.pdist(message_embeddings, 'cosine'))

def progress(i):
    # Simple console spinner
    print('\r{} {}'.format('-\\|/'[i % 4], i), end='')

def cluster(items, distances, similarity_threshold=0.11):
    print('Clustering threshold:', similarity_threshold)
    clusters = list()
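The preview ends just as cluster begins. A plausible completion of this kind of threshold-based greedy clustering is sketched below; the assignment rule (compare each item to the first member of each existing cluster) is an assumption, not necessarily the gist's exact logic.

def cluster(items, distances, similarity_threshold=0.11):
    print('Clustering threshold:', similarity_threshold)
    clusters = list()
    for i, item in enumerate(items):
        progress(i)
        # Put the item into the first cluster whose representative is close enough
        for c in clusters:
            if distances[i][c[0]] < similarity_threshold:
                c.append(i)
                break
        else:
            # No cluster is similar enough: start a new one with i as representative
            clusters.append([i])
    print()
    return clusters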
@hakanilter
hakanilter / tensorflow_embeddings.py
Created April 5, 2019 09:32
TensorFlow Universal Sentence Encoder
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import os
import pandas as pd
from scipy import spatial
from operator import itemgetter
#module_url = "https://tfhub.dev/google/universal-sentence-encoder/2"
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3"
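With the TF1-era hub API that these imports imply, computing embeddings follows the standard Universal Sentence Encoder recipe. The `messages` list below is a stand-in for the gist's real input.

# Example sentences; stand-ins for the gist's real input
messages = [
    "How do I reset my password?",
    "The quick brown fox jumps over the lazy dog."
]

embed = hub.Module(module_url)

with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    message_embeddings = session.run(embed(messages))

# Each row is a 512-dimensional sentence vector
print(np.array(message_embeddings).shape)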
@hakanilter
hakanilter / spark_weird_csv.scala
Last active May 24, 2019 12:18
Creating a DataFrame from weird CSV files
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
def wcsv_to_df(
    fileName: String,
    tableName: String,
    columns: Array[String],
    fieldTerminator: String,
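The same Hadoop-input-format trick is available from PySpark. A rough Python rendering of the idea follows; the record delimiter, field terminator, columns, and path are illustrative, not the gist's actual parameters.

sc = spark.sparkContext  # assumes an existing SparkSession named `spark`

# Read records split on a custom delimiter instead of '\n'
conf = {"textinputformat.record.delimiter": "|\n"}

lines = sc.newAPIHadoopFile(
    "/tmp/weird.csv",
    "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
    "org.apache.hadoop.io.LongWritable",
    "org.apache.hadoop.io.Text",
    conf=conf
).map(lambda kv: kv[1])

columns = ["id", "name", "description"]
field_terminator = "\t"

df = lines \
    .map(lambda line: line.split(field_terminator)) \
    .filter(lambda fields: len(fields) == len(columns)) \
    .toDF(columns)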
@hakanilter
hakanilter / import-from-s3-to-postgres.sql
Created June 24, 2019 23:32
Import CSV from S3 to Postgres
-- https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/PostgreSQL.Procedural.Importing.html#USER_PostgreSQL.S3Import.table_import_from_s3
-- https://github.com/chimpler/postgres-aws-s3
CREATE EXTENSION aws_s3 CASCADE;
DROP TABLE nyse;
CREATE TABLE nyse (
    exchange VARCHAR(50),
    stock_symbol VARCHAR(50),
    stock_date DATE,
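Once the extension is installed, the import itself goes through aws_s3.table_import_from_s3. A sketch of the call from Python via psycopg2 follows; the connection string, bucket, key, and region are placeholders.

import psycopg2

# Connection parameters are placeholders
conn = psycopg2.connect("dbname=mydb user=postgres host=my-rds-host")

with conn, conn.cursor() as cur:
    # FORMAT text implies tab-delimited input; an empty column list
    # means the file's column order must match the table
    cur.execute("""
        SELECT aws_s3.table_import_from_s3(
            'nyse', '', '(FORMAT text)',
            aws_commons.create_s3_uri('my-bucket', 'nyse/NYSE-2000-2001.tsv', 'eu-west-1')
        )
    """)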
@hakanilter
hakanilter / s3_select.py
Created July 5, 2019 23:39
S3 Select Example
import boto3
import pandas as pd
s3 = boto3.client('s3', 'eu-west-1')
def execute_query(query):
    response = s3.select_object_content(
        Bucket='my-bucket',
        Key='nyse/NYSE-2000-2001.tsv.gz',
        ExpressionType='SQL',
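The call is cut off mid-argument. A complete version typically also passes the query, the input/output serialization, and then drains the event stream, roughly as below; the serialization settings assume a gzipped, tab-delimited file with a header row.

def execute_query(query):
    response = s3.select_object_content(
        Bucket='my-bucket',
        Key='nyse/NYSE-2000-2001.tsv.gz',
        ExpressionType='SQL',
        Expression=query,
        InputSerialization={
            'CSV': {'FileHeaderInfo': 'USE', 'FieldDelimiter': '\t'},
            'CompressionType': 'GZIP'
        },
        OutputSerialization={'CSV': {}}
    )
    # The result arrives as an event stream; collect the record chunks
    records = []
    for event in response['Payload']:
        if 'Records' in event:
            records.append(event['Records']['Payload'].decode('utf-8'))
    return ''.join(records)

# Example: count the rows in the object
print(execute_query("SELECT COUNT(*) FROM s3object s"))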
@hakanilter
hakanilter / auto-refresh.js
Created October 11, 2019 13:13
AWS UI auto-refresh JS scripts
// Cloudwatch
setInterval(function(){
    document.getElementsByClassName('cwdb-log-viewer-table-infinite-loader-bottom')[0].lastElementChild.click();
    document.getElementsByClassName('GIYU-ANBFDF')[0].scroll(0, document.body.scrollHeight);
}, 3000);

// EMR
setInterval(function(){
    document.getElementsByClassName('GAEMCWHGM')[14].click();
}, 5000);
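Both snippets are meant to be pasted into the browser's developer console while the relevant AWS page is open. The obfuscated class names ('GIYU-ANBFDF', 'GAEMCWHGM') appear to be generated (GWT-style) names, so they are likely to change between console releases and may need to be re-read from the DOM.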
@hakanilter
hakanilter / spark_helper.py
Created September 14, 2020 07:46
Default PySpark Settings
import os

from pyspark.sql import SparkSession

# Assumption: the full gist defines the warehouse path; a local default is used here
SPARK_WAREHOUSE = os.environ.get("SPARK_WAREHOUSE", "file:///tmp/spark-warehouse")

def get_spark(app_name):
    """
    Creates a Spark session with default parameters
    """
    spark = SparkSession.builder \
        .master(os.environ.get("SPARK_MASTER", "local[*]")) \
        .appName(app_name) \
        .config("spark.default.parallelism", 16) \
        .config("spark.sql.adaptive.enabled", True) \
        .config("spark.sql.warehouse.dir", SPARK_WAREHOUSE) \
        .getOrCreate()
    return spark
@hakanilter
hakanilter / default.conf
Created August 8, 2021 14:09
Nginx Proxy for Kibana + Basic Auth
server {
    listen 80;
    server_name localhost;

    auth_basic "Restricted Access";
    auth_basic_user_file /etc/nginx/htpasswd.users;

    location / {
        proxy_pass https://vpc-my-es-574vcxyz.eu-central-1.es.amazonaws.com/;
    }
}
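The auth_basic_user_file referenced above must already exist in htpasswd format; it is commonly generated with Apache's htpasswd utility (for example, htpasswd -c /etc/nginx/htpasswd.users kibana), after which nginx needs a reload to pick up the configuration.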