Skip to content

Instantly share code, notes, and snippets.

View davideanastasia's full-sized avatar

Davide Anastasia davideanastasia

View GitHub Profile
def dataframe_from_file(filename):
try:
taxi_data = pd.read_csv(filename, names=['taxi_id', 'ts', 'longitude', 'latitude'], parse_dates=['ts'])
if len(taxi_data.index) == 0:
print("skipping {} as empty".format(filename))
return None
taxi_data['geohash'] = np.vectorize(lambda longitude, latitude: pgh.encode(latitude, longitude, precision=6))(
taxi_data['longitude'],
-- Materialise the submission table for the logreg_0001 run.
-- Reads the prediction table, expands the predicted_is_attributed_probs
-- array into one row per array element, keeps the rows whose label is 1,
-- and exposes the matching prob under the column name is_attributed.
-- NOTE(review): presumably label = 1 is the positive ("attributed") class
-- and label/prob are fields of the unnested elements — confirm against the
-- schema of the prediction table.
-- NOTE(review): the ORDER BY may not guarantee the stored row order of the
-- created table — confirm before relying on it.
CREATE TABLE `kaggle_talkingdata_adtracking.dataset_test_submission_logreg_0001`
AS
SELECT
  click_id,
  prob AS is_attributed
FROM
  `kaggle_talkingdata_adtracking.dataset_test_with_prediction_logreg_0001`
  JOIN UNNEST(predicted_is_attributed_probs)
WHERE
  label = 1
ORDER BY
  click_id;
CREATE TABLE `kaggle-adfraud.kaggle_talkingdata_adtracking.dataset_test_with_prediction_logreg_0001`
AS
SELECT * FROM ML.PREDICT(MODEL `kaggle-adfraud.kaggle_talkingdata_adtracking.talkingdata_logreg_0001`,
(SELECT
click_id,
CAST(ip AS STRING) as ip,
CAST(app AS STRING) as app,
CAST(device AS STRING) as device,
CAST(os AS STRING) as os,
CAST(channel AS STRING) as channel,
CREATE MODEL `kaggle_talkingdata_adtracking.talkingdata_logreg_0001`
OPTIONS (
model_type='logistic_reg',
input_label_cols=['is_attributed'],
data_split_method='seq',
data_split_col='click_time'
) AS
SELECT CAST(ip AS STRING) as ip,
CAST(app AS STRING) as app,
CAST(device AS STRING) as device,
CREATE MODEL `kaggle_talkingdata_adtracking.talkingdata_logreg_0001`
OPTIONS (
model_type='logistic_reg',
input_label_cols=['is_attributed'],
data_split_method='seq',
data_split_col='click_time'
) AS
SELECT CAST(ip AS STRING) as ip,
CAST(app AS STRING) as app,
CAST(device AS STRING) as device,
CREATE MODEL `kaggle_talkingdata_adtracking.talkingdata_logreg_sample_0003`
OPTIONS (
model_type='logistic_reg',
input_label_cols=['is_attributed'],
data_split_method='seq',
data_split_col='click_time'
) AS
SELECT CAST(ip AS STRING) as ip,
CAST(app AS STRING) as app,
CAST(device AS STRING) as device,
CREATE MODEL `kaggle_talkingdata_adtracking.talkingdata_logreg_sample`
OPTIONS (
model_type='logistic_reg',
input_label_cols=['is_attributed']
) AS
SELECT ip,
app,
device,
os,
channel,
public class StopWordRemoveFnTest {
@Rule
public final transient TestPipeline pipeline = TestPipeline.create();
@Test
public void testDoFn_TestPipeline() throws Exception {
PCollection<KV<Empty, String>> input = pipeline.apply(Create.of(
KV.of(Empty.EMPTY, "be"), KV.of(Empty.EMPTY, "is"), KV.of(Empty.EMPTY, "night"), KV.of(Empty.EMPTY, "dream")
).withCoder(KvCoder.of(AvroCoder.of(Empty.class), StringUtf8Coder.of())));
public class StopWordRemoveFnTest {
static private class Empty {}
static private final Empty EMPTY = new Empty();
@Test
public void testDoFn() throws Exception {
StopWordRemoveFn<Empty> doFn = new StopWordRemoveFn<>();
DoFnTester<KV<Empty, String>, KV<Empty, String>> fnTester = DoFnTester.of(doFn);
@davideanastasia
davideanastasia / reducer.java
Created June 5, 2018 22:34
Apache Beam Getting Started - #4
public class ReduceFn extends Combine.CombineFn<Metadata, Index, Index> {
@Override
public Index createAccumulator() {
return new Index();
}
@Override
public Index addInput(Index accumulator, Metadata input) {
accumulator.add(input);