import numpy as np
import pandas as pd

# Shuffle dataframe rows (`cities` is an existing DataFrame)
cities.reindex(np.random.permutation(cities.index))
# Read data from Google Cloud Storage
california_housing_dataframe = pd.read_csv("https://storage.googleapis.com/mledu-datasets/california_housing_train.csv", sep=",")
# Convert pandas data into a dict of np arrays
# where `key` is column name.
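A minimal sketch of that conversion; the `features` name is illustrative:
# {column name: np.array of column values}
features = {key: np.array(value) for key, value in dict(california_housing_dataframe).items()}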
easypipe.py (forked from dannguyen/easypipe.py)
Using scikit-learn to classify NYT columnists
# some convenience functions here, nothing new
'''
# usage:
from easypipe import easy_pipeline
from easypipe import print_metrics
data_folder = "data-hold/20news"
p = easy_pipeline()
print_metrics(p, data_folder)
'''
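A minimal sketch of what easy_pipeline and print_metrics might look like, assuming a TF-IDF bag-of-words pipeline with a Naive Bayes classifier trained on a folder of labelled text files (one subdirectory per label); the original gist may use different components.

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


def easy_pipeline():
    # bag-of-words TF-IDF features feeding a Naive Bayes classifier
    return Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('clf', MultinomialNB()),
    ])


def print_metrics(pipeline, data_folder):
    # load_files expects one subdirectory per class label
    bunch = load_files(data_folder, encoding='utf-8', decode_error='replace')
    x_train, x_test, y_train, y_test = train_test_split(
        bunch.data, bunch.target, test_size=0.2, random_state=42)
    pipeline.fit(x_train, y_train)
    predictions = pipeline.predict(x_test)
    print(classification_report(y_test, predictions, target_names=bunch.target_names))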
# dump messages to stdout, uses old consumer api!
kafka-console-consumer --zookeeper localhost:2181 --topic my_topic --from-beginning
# describe topic
kafka-topics --zookeeper localhost:2181 --describe --topic my_topic
# alter topic, e.g. increase the partition count
kafka-topics --zookeeper localhost:2181 --alter --topic my_topic --partitions 8
from datetime import datetime, timedelta
from airflow import DAG
from airflow import utils
from airflow.operators import BashOperator, EmailOperator, DummyOperator
default_args = {
'owner': 'myowner',
'depends_on_past': False,
'start_date': datetime(year=2017, month=10, day=18, hour=0, minute=0),
'email': ['[email protected]'],
}
from datetime import datetime, timedelta
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
default_args = {
'owner': 'myowner',
'depends_on_past': False,
'start_date': datetime(year=2017, month=10, day=18, hour=0, minute=0),
'email': ['[email protected]'],
'email_on_failure': True,
}
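A minimal sketch of how these default_args would be wired into a DAG; the dag_id, schedule and bash command are illustrative.

dag = DAG(
    dag_id='my_dag',
    default_args=default_args,
    schedule_interval=timedelta(days=1),  # run once a day
)

print_date = BashOperator(
    task_id='print_date',
    bash_command='date',
    dag=dag,
)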
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>versions-maven-plugin</artifactId>
<version>2.3</version>
<configuration>
<rulesUri>file:///${project.basedir}/versions-maven-rules.xml</rulesUri>
</configuration>
<executions>
<execution>
<phase>compile</phase>
<goals>
<!-- goal is illustrative; substitute whichever versions goal you need -->
<goal>display-dependency-updates</goal>
</goals>
</execution>
</executions>
</plugin>
<?xml version="1.0" encoding="UTF-8"?>
<ruleset xmlns="http://mojo.codehaus.org/versions-maven-plugin/rule/2.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" comparisonMethod="maven" xsi:schemaLocation="http://mojo.codehaus.org/versions-maven-plugin/rule/2.0.0 http://mojo.codehaus.org/versions-maven-plugin/xsd/rule-2.0.0.xsd">
<ignoreVersions>
<!-- Ignore Alpha's, Beta's, release candidates and milestones -->
<ignoreVersion type="regex">(?i).*Alpha(?:-?\d+)?</ignoreVersion>
<ignoreVersion type="regex">(?i).*Beta(?:-?\d+)?</ignoreVersion>
<ignoreVersion type="regex">(?i).*-B(?:-?\d+)?</ignoreVersion>
<ignoreVersion type="regex">(?i).*RC(?:-?\d+)?</ignoreVersion>
<ignoreVersion type="regex">(?i).*CR(?:-?\d+)?</ignoreVersion>
<ignoreVersion type="regex">(?i).*M(?:-?\d+)?</ignoreVersion>
</ignoreVersions>
</ruleset>
#!/usr/bin/env bash
export SPARK_MAJOR_VERSION=2
/usr/hdp/current/spark2-client/bin/spark-submit --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
--files /path/to/log4j.properties \
--conf spark.yarn.executor.memoryOverhead=1024 \
--conf spark.port.maxRetries=64 \
--conf spark.driver.extraJavaOptions='-Dlog4j.debug -Dlog4j.configuration=file:/path/to/log4j.properties -Da=a1' \
--conf spark.executor.extraJavaOptions='-Dlog4j.debug -Dlog4j.configuration=log4j.properties' \
--master yarn \
--class com.example.MyApp \
/path/to/my-app.jar  # class name and jar path are placeholders
use mydb;
set @s='pqrs';
set @d=11.11;
set @pk=15605;
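-- insert the row only if no existing row has the same s and _fk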
INSERT INTO t1 (s,d,_fk) SELECT * FROM (SELECT @s, @d, @pk) AS tmp
WHERE NOT EXISTS (SELECT s FROM t1 WHERE s=@s and _fk=@pk) LIMIT 1;
commit;