import numpy as np
import pandas as pd

# Shuffle dataframe rows (`cities` is an existing DataFrame)
cities.reindex(np.random.permutation(cities.index))
# Read data from Google Cloud Storage
california_housing_dataframe = pd.read_csv("https://storage.googleapis.com/mledu-datasets/california_housing_train.csv", sep=",")
# Convert pandas data into a dict of np arrays
# where `key` is column name.
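A minimal sketch of that conversion; the `features` name is illustrative:
# {column name: np.array of column values}
features = {key: np.array(value) for key, value in dict(california_housing_dataframe).items()}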
easypipe.py (forked from dannguyen/easypipe.py)
Using scikit-learn to classify NYT columnists
# some convenience functions here, nothing new
'''
# usage:
from easypipe import easy_pipeline
from easypipe import print_metrics
data_folder = "data-hold/20news"
p = easy_pipeline()
print_metrics(p, data_folder)
'''
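A minimal sketch of what easy_pipeline and print_metrics might look like, assuming a TF-IDF bag-of-words pipeline with a Naive Bayes classifier trained on a folder of labelled text files (one subdirectory per label); the original gist may use different components.

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


def easy_pipeline():
    # bag-of-words TF-IDF features feeding a Naive Bayes classifier
    return Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('clf', MultinomialNB()),
    ])


def print_metrics(pipeline, data_folder):
    # load_files expects one subdirectory per class label
    bunch = load_files(data_folder, encoding='utf-8', decode_error='replace')
    x_train, x_test, y_train, y_test = train_test_split(
        bunch.data, bunch.target, test_size=0.2, random_state=42)
    pipeline.fit(x_train, y_train)
    predictions = pipeline.predict(x_test)
    print(classification_report(y_test, predictions, target_names=bunch.target_names))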
# dump messages to stdout, uses old consumer api!
kafka-console-consumer --zookeeper localhost:2181 --topic my_topic --from-beginning
# describe topic
kafka-topics --zookeeper localhost:2181 --describe --topic my_topic
# alter topic, e.g. increase the partition count
kafka-topics --zookeeper localhost:2181 --alter --topic my_topic --partitions 8
from datetime import datetime, timedelta
from airflow import DAG
from airflow import utils
from airflow.operators import BashOperator, EmailOperator, DummyOperator
default_args = {
'owner': 'myowner',
'depends_on_past': False,
'start_date': datetime(year=2017, month=10, day=18, hour=0, minute=0),
'email': ['[email protected]'],
}
from datetime import datetime, timedelta
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
default_args = {
'owner': 'myowner',
'depends_on_past': False,
'start_date': datetime(year=2017, month=10, day=18, hour=0, minute=0),
'email': ['[email protected]'],
'email_on_failure': True,
}
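A minimal sketch of how these default_args would be wired into a DAG; the dag_id, schedule and bash command are illustrative.

dag = DAG(
    dag_id='my_dag',
    default_args=default_args,
    schedule_interval=timedelta(days=1),  # run once a day
)

print_date = BashOperator(
    task_id='print_date',
    bash_command='date',
    dag=dag,
)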
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>versions-maven-plugin</artifactId>
<version>2.3</version>
<configuration>
<rulesUri>file:///${project.basedir}/versions-maven-rules.xml</rulesUri>
</configuration>
<executions>
<execution>
<phase>compile</phase>
<goals>
<!-- goal is illustrative; substitute whichever versions goal you need -->
<goal>display-dependency-updates</goal>
</goals>
</execution>
</executions>
</plugin>
<?xml version="1.0" encoding="UTF-8"?>
<ruleset xmlns="http://mojo.codehaus.org/versions-maven-plugin/rule/2.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" comparisonMethod="maven" xsi:schemaLocation="http://mojo.codehaus.org/versions-maven-plugin/rule/2.0.0 http://mojo.codehaus.org/versions-maven-plugin/xsd/rule-2.0.0.xsd">
<ignoreVersions>
<!-- Ignore Alpha's, Beta's, release candidates and milestones -->
<ignoreVersion type="regex">(?i).*Alpha(?:-?\d+)?</ignoreVersion>
<ignoreVersion type="regex">(?i).*Beta(?:-?\d+)?</ignoreVersion>
<ignoreVersion type="regex">(?i).*-B(?:-?\d+)?</ignoreVersion>
<ignoreVersion type="regex">(?i).*RC(?:-?\d+)?</ignoreVersion>
<ignoreVersion type="regex">(?i).*CR(?:-?\d+)?</ignoreVersion>
<ignoreVersion type="regex">(?i).*M(?:-?\d+)?</ignoreVersion>
</ignoreVersions>
</ruleset>
#!/usr/bin/env bash
export SPARK_MAJOR_VERSION=2
/usr/hdp/current/spark2-client/bin/spark-submit --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
--files /path/to/log4j.properties \
--conf spark.yarn.executor.memoryOverhead=1024 \
--conf spark.port.maxRetries=64 \
--conf spark.driver.extraJavaOptions='-Dlog4j.debug -Dlog4j.configuration=file:/path/to/log4j.properties -Da=a1' \
--conf spark.executor.extraJavaOptions='-Dlog4j.debug -Dlog4j.configuration=log4j.properties' \
--master yarn \
--class com.example.MyApp \
/path/to/my-app.jar  # class name and jar path are placeholders
use mydb;
set @s='pqrs';
set @d=11.11;
set @pk=15605;
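-- insert the row only if no existing row has the same s and _fk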
INSERT INTO t1 (s,d,_fk) SELECT * FROM (SELECT @s, @d, @pk) AS tmp
WHERE NOT EXISTS (SELECT s FROM t1 WHERE s=@s and _fk=@pk) LIMIT 1;
commit;