Skip to content

Instantly share code, notes, and snippets.

View myui's full-sized avatar

Makoto YUI myui

View GitHub Profile
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
-- Train a random forest classifier on the `train` table of the news20
-- database and store the rows emitted by the trainer in rf_model.
use news20;

-- NOTE(review): presumably the number of worker threads used by the
-- Smile-based trainer; confirm against the Hivemall documentation.
set hivemall.smile.nprocs=4;

-- IF EXISTS makes the drop a no-op when rf_model has not been created yet,
-- so the script can be re-run regardless of hive.exec.drop.ignorenonexistent.
drop table if exists rf_model;

-- The option string requests 50 trees and a fixed seed of 71.
-- NOTE(review): convert_label is defined elsewhere; its label mapping is not
-- visible here.
create table rf_model
as
select
  train_randomforest_classifier(features, convert_label(label), '-trees 50 -seed 71')
from
  train;
@myui
myui / latlon.md
Last active April 29, 2017 20:08
WITH data as (
  select 25.7724247 as lat, -80.1854473 as lon, 10 as zoom
  union all
  select 25.7724247 as lat, -80.1854473 as lon, 15 as zoom
)
select 
   map_url(lat,lon,zoom) as osm_url,
   map_url(lat,lon,zoom,'-type googlemaps') as gmap_url,
 tile(lat,lon,zoom) as tile_number
@myui
myui / lda.sql
Last active April 29, 2017 20:36
-- fitting
select
label, word, avg(lambda) as lambda
from (
select
train_lda(feature, "-topic 2 -iter 20")
as (label, word, lambda)
from
data
) t1
-- Example: hash the name part of each feature to an integer index; a
-- ":value" suffix, when present, is carried over unchanged.
select
  feature_hashing(
    array("userid#4505:3.3", "movieid#2331:4.999", "movieid#2331")
  );

["1828616:3.3","6238429:4.999","6238429"]

SELECT
  features_hashing(
 array_concat(
@myui
myui / limit_url_depth.md
Created June 16, 2017 08:46
Limit a URL to a path depth of 2
-- Truncate each URL to its scheme, host, and at most two leading path
-- segments.
-- Group index 0 (the whole match) is requested explicitly: Hive's
-- two-argument regexp_extract defaults to capture group 1, which for this
-- pattern is only the last matched "/segment", not the truncated URL.
-- NOTE(review): assumes the URL is the element at index 3 of recent1; confirm
-- against the table schema.
select
  recent1[3] as url,
  regexp_extract(
    recent1[3],
    '^https?://[^/]+(/[^/]+){0,2}',
    0
  ) as extracted
from
  pre_hivemall_ac
@myui
myui / titanic.dot
Created June 30, 2017 11:05
Graphviz output of Hivemall decision tree
digraph Tree {
node [shape=box, style="filled, rounded", color="black", fontname=helvetica];
edge [fontname=helvetica];
0 [label=<pclass = 1.0>, fillcolor="#00000000"];
1 [label=<sex = 0.0>, fillcolor="#00000000"];
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"];
2 [label=<age &le; 43.0>, fillcolor="#00000000"];
1 -> 2;
3 [label=<cabin = 0.0>, fillcolor="#00000000"];
2 -> 3;
@myui
myui / stratified-sampling.md
Created July 10, 2017 09:14
stratified sampling
SET sampleRate=0.1; -- in range (0.0-1.0]

select
    field1, field2, field3, ..., fieldN, state
from (
    select
        field1, field2, field3, ..., fieldN, state,
        count(*) over (partition by state) as state_cnt,
 rank() over (partition by state order by rand()) as state_rank
@myui
myui / criteo.md
Last active August 3, 2017 04:50
FFM on Criteo dataset

Data preparation

-- Hive session settings applied for the data-preparation step.
-- The split-size override below is left disabled.
-- set mapred.max.split.size=128000000;
-- Use CombineHiveInputFormat as the input format, both for the generic
-- setting and for the Tez-specific one.
set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
set hive.tez.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
-- NOTE(review): presumably the small-table size threshold for map joins, in
-- bytes (~30 MB); confirm units against the Hive configuration docs.
set hive.mapjoin.smalltable.filesize=30000000;
-- The S3 query optimization below is left disabled.
-- set hive.optimize.s3.query=true;
-- Dynamic partitioning: non-strict mode, with the sort-based dynamic
-- partition optimization turned off.
set hive.exec.dynamic.partition.mode=nonstrict; 
set hive.optimize.sort.dynamic.partition=false;
import numpy as np
from numpy import array
from scipy.sparse import coo_matrix
# Non-zero entries of the sparse matrix, one (row, col, value) triple each.
_entries = [
    (82, 80, 0.6805822),
    (8, 13, 0.23116356),
    (56, 51, 0.8527678),
    (86, 74, 0.3512172),
    (48, 16, 0.03575957),
    (20, 40, 0.2307719),
    (22, 64, 0.6499588),
    (98, 57, 0.66925836),
    (65, 40, 0.8698123),
    (98, 30, 0.041548133),
]

# Split the triples back into the three parallel arrays used by the COO format.
_rows, _cols, _vals = zip(*_entries)
row = np.array(_rows)
col = np.array(_cols)
data = np.array(_vals)

# 100 x 100 sparse matrix in coordinate format holding the ten entries above.
coo = coo_matrix((data, (row, col)), shape=(100, 100))