lakshmanok / etl_geojson.py (created July 17, 2019): How to load GeoJSON files to BigQuery
#!/usr/bin/env python3
# See: https://medium.com/@lakshmanok/how-to-load-geojson-files-into-bigquery-gis-9dc009802fb4
import json

with open('NUTS_BN_01M_2016_4326_LEVL_3.geojson', 'r') as ifp:
    with open('to_load.json', 'w') as ofp:
        features = json.load(ifp)['features']
        # write new-line-separated JSON, one feature per line
        schema = None
        for obj in features:
            # the excerpt is truncated here; a minimal loop body (an assumed
            # completion) writes each feature's properties as one JSON row,
            # keeping the geometry as a string column for BigQuery GIS
            props = obj['properties']
            props['geometry'] = json.dumps(obj['geometry'])
            json.dump(props, fp=ofp)
            print('', file=ofp)
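The resulting newline-delimited file can then be loaded into BigQuery, where the geometry string column can be converted at query time with ST_GeogFromGeoJSON. A minimal loading sketch with the BigQuery Python client, assuming a hypothetical demo_eu.nuts_boundaries destination table and schema autodetection:

# Sketch: load to_load.json into BigQuery with schema autodetection.
# 'demo_eu.nuts_boundaries' is a hypothetical destination table.
from google.cloud import bigquery

client = bigquery.Client()
job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
    autodetect=True,
)
with open('to_load.json', 'rb') as f:
    job = client.load_table_from_file(
        f, 'demo_eu.nuts_boundaries', job_config=job_config)
job.result()  # block until the load job completes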
lakshmanok / notebook_instance.sh (last active September 24, 2020): Launch DLVM using gcloud
# A1. Launch a notebook instance and get URL to Jupyter running on it
IMAGE=--image-family=tf-latest-cpu
INSTANCE_NAME=dlvm
[email protected] # CHANGE THIS
STARTUP_SCRIPT="git clone https://github.com/GoogleCloudPlatform/data-science-on-gcp"
echo "Launching $INSTANCE_NAME"
gcloud compute instances create ${INSTANCE_NAME} \
      --machine-type=n1-standard-2 \
      --scopes=https://www.googleapis.com/auth/cloud-platform,https://www.googleapis.com/auth/userinfo.email \
      ${IMAGE} --image-project=deeplearning-platform-release \
      --metadata="proxy-user-mail=${MAIL},startup-script=${STARTUP_SCRIPT}"
# NOTE: the excerpt cuts off after the --scopes flag; the last two lines of
# the command are an assumed completion typical of Deep Learning VM launch scripts
| Model type | Custom deep learning model | BigQuery ML | AutoML |
| --- | --- | --- | --- |
| How | Keras with a TensorFlow backend, trained on Cloud ML Engine | SQL in BigQuery for ML on structured data | AutoML uses neural architecture search and best-of-class model architectures for the specific problem |
| Best if you are a | ML engineer who knows Python and NLP techniques | Data analyst who can wrangle data with SQL | Developer who can create the dataset in the required format |
| How long it takes an experienced practitioner | A week to a month | About an hour | About a day |
| Most of this time is spent in | Coding Python and experimenting with ML | Writing SQL | Waiting for the job to finish |
| Cloud computing costs | Medium to high, depending on data size, number of experiments, etc. | Low | Medium |
| Accuracy | Low if you don't know what you are doing; extremely high if you employ appropriate architectures and have a large-enough dataset | Moderate to high, mostly depending … | … |
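To make the BigQuery ML column concrete: training happens entirely inside BigQuery via a CREATE MODEL statement. A minimal sketch, assuming hypothetical dataset, table, and column names, submitted through the Python client:

# Sketch: train a logistic regression model with BigQuery ML.
# Dataset/table/column names here are hypothetical.
from google.cloud import bigquery

client = bigquery.Client()
sql = """
CREATE OR REPLACE MODEL mydata.complaint_model
OPTIONS(model_type='logistic_reg', input_label_cols=['label']) AS
SELECT feature1, feature2, label
FROM mydata.training_table
"""
client.query(sql).result()  # training runs entirely inside BigQuery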
import os
import time

import tensorflow as tf  # TF 1.x APIs throughout these excerpts

# export similar to Cloud ML Engine / TF Serving convention
tf.logging.info('Starting to export model.')
estimator.export_savedmodel(
    export_dir_base=os.path.join(output_dir, 'export/exporter'),
    serving_input_receiver_fn=serving_input_fn)
# load last checkpoint and start from there
current_step = load_global_step_from_checkpoint_dir(output_dir)
steps_per_epoch = hparams['num_train_images'] // hparams['train_batch_size']
tf.logging.info('Training for %d steps (%.2f epochs in total). Current'
                ' step %d.',
                max_steps,
                max_steps / steps_per_epoch,
                current_step)
start_timestamp = time.time()  # This time will include compilation time
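The helper load_global_step_from_checkpoint_dir is not shown in these excerpts. A minimal sketch, following the convention used in Google's TPU sample code (returning 0 when no checkpoint exists yet is an assumption):

def load_global_step_from_checkpoint_dir(checkpoint_dir):
    """Read the saved global step from the latest checkpoint, or 0 if none."""
    try:
        checkpoint_reader = tf.train.NewCheckpointReader(
            tf.train.latest_checkpoint(checkpoint_dir))
        return checkpoint_reader.get_tensor(tf.GraphKeys.GLOBAL_STEP)
    except:  # no checkpoint written yet
        return 0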
def train_and_evaluate(output_dir, hparams):
    STEPS_PER_EVAL = 1000
    max_steps = hparams['train_steps']
    eval_batch_size = min(1024, hparams['num_eval_images'])
    eval_batch_size = eval_batch_size - eval_batch_size % 8  # divisible by num_cores
    tf.logging.info('train_batch_size=%d eval_batch_size=%d max_steps=%d',
                    hparams['train_batch_size'],
                    eval_batch_size,
                    max_steps)
def image_classifier(features, labels, mode, params):
    image = features
    if isinstance(features, dict):  # the serving input fn passes a dict
        image = features['image']
    ylogits, nclasses = cnn_model(image, mode, params)  # cnn_model is defined elsewhere in the gist
    probabilities = tf.nn.softmax(ylogits)
    class_int = tf.cast(tf.argmax(probabilities, 1), tf.int32)
    class_str = tf.gather(LIST_OF_LABELS, class_int)  # LIST_OF_LABELS: module-level constant
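    # The excerpt stops here. A minimal sketch of the PREDICT branch that
    # would typically follow in a model_fn (the prediction key names
    # 'probabilities'/'classid'/'class' are assumptions, not the gist's):
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'probabilities': probabilities,
            'classid': class_int,
            'class': class_str,
        }
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'classes': tf.estimator.export.PredictOutput(predictions)
            })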
def serving_input_fn():
    # Note: only handles one image at a time
    feature_placeholders = {
        'image_bytes': tf.placeholder(tf.string, shape=())
    }
    image, _ = read_and_preprocess(
        tf.squeeze(feature_placeholders['image_bytes']))
    features = {
        'image': tf.expand_dims(image, 0)
    }
    return tf.estimator.export.ServingInputReceiver(features, feature_placeholders)
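Once the model is exported through this serving function, a quick local smoke test is possible with TF 1.x's contrib predictor. A hedged sketch: the export directory and test image paths are hypothetical, and the output key assumes the PREDICT branch sketched above:

from tensorflow.contrib import predictor

# './export/exporter/1565092387' is a hypothetical timestamped export dir
predict_fn = predictor.from_saved_model('./export/exporter/1565092387')
with open('test.jpg', 'rb') as f:
    response = predict_fn({'image_bytes': f.read()})
print(response['probabilities'])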
def read_and_preprocess(example_data):
    parsed = tf.parse_single_example(example_data, {
        'image/encoded': tf.FixedLenFeature((), tf.string, ''),
        'image/class/label': tf.FixedLenFeature([], tf.int64, 1),
    })
    image_bytes = tf.reshape(parsed['image/encoded'], shape=[])
    label = tf.cast(
        tf.reshape(parsed['image/class/label'], shape=[]), dtype=tf.int32) - 1
    # decode and scale so we end up with pixel values that are in the -1, 1 range
    # (the last four lines are an assumed completion; the excerpt is truncated)
    image = tf.image.decode_jpeg(image_bytes, channels=3)
    image = tf.image.convert_image_dtype(image, dtype=tf.float32)  # [0, 1]
    image = (image - 0.5) * 2.0  # [-1, 1]
    return image, label
def make_input_fn(pattern, mode, num_cores=8, transpose_input=False):
    def _set_shapes(batch_size, images, labels):
        """Statically set the batch_size dimension."""
        if transpose_input:
            images.set_shape(images.get_shape().merge_with(
                tf.TensorShape([None, None, None, batch_size])))
            labels.set_shape(labels.get_shape().merge_with(
                tf.TensorShape([batch_size])))
        else:
            # the excerpt cuts off mid-statement; completed by symmetry
            # with the transpose branch above
            images.set_shape(images.get_shape().merge_with(
                tf.TensorShape([batch_size, None, None, None])))
            labels.set_shape(labels.get_shape().merge_with(
                tf.TensorShape([batch_size])))
        return images, labels
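    # The rest of make_input_fn is not in the excerpt. A minimal sketch of
    # the usual continuation under the TPUEstimator convention (params
    # carries batch_size); shuffle/interleave settings are assumptions:
    def _input_fn(params):
        batch_size = params['batch_size']  # injected by TPUEstimator
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        dataset = tf.data.Dataset.list_files(pattern, shuffle=is_training)
        dataset = dataset.apply(tf.data.experimental.parallel_interleave(
            tf.data.TFRecordDataset, cycle_length=num_cores))
        if is_training:
            dataset = dataset.shuffle(1024).repeat()
        dataset = dataset.map(read_and_preprocess, num_parallel_calls=num_cores)
        # drop_remainder=True keeps the batch dimension static for the TPU
        dataset = dataset.batch(batch_size, drop_remainder=True)
        dataset = dataset.map(
            lambda images, labels: _set_shapes(batch_size, images, labels))
        return dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return _input_fn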