Theodoros Ntakouris (ntakouris)

import tensorflow as tf
# beam imports
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
import tensorflow_transform as tft
import tensorflow_transform.beam as tft_beam
# orchestration
from tfx.orchestration import pipeline, metadata
import tensorflow_data_validation as tfdv
schema = tfdv.load_schema_text(schema_path)
print(schema)  # text output, like doing `cat` on the schema file
# Fields of the loaded Schema proto can be edited in place,
# e.g. via tfdv.get_feature (see the sketch after the proto below).
feature {
  name: "payment_type"
  value_count {
    min: 1
    max: 1
  }
  type: BYTES
  domain: "payment_type"
  presence {
    min_fraction: 1.0
  }
}
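A minimal round-trip sketch for editing a schema like the one above, assuming tfdv's schema utilities; the 'schema.pbtxt' path and the 0.9 fraction are placeholder values:

schema = tfdv.load_schema_text('schema.pbtxt')
# Relax the presence constraint of the feature shown above.
tfdv.get_feature(schema, 'payment_type').presence.min_fraction = 0.9
tfdv.write_schema_text(schema, 'schema.pbtxt')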
DATAFLOW_BEAM_PIPELINE_ARGS = [
    '--project=' + '<your project id>',
    '--runner=DataflowRunner',
    '--temp_location=' + 'gs://<bucket name>/tmp',
    '--staging_location=' + 'gs://<bucket name>/staging',
    '--region=' + 'us-central1',  # the region where our buckets live
    # '--disk_size_gb=50',  # no fine-tuning needed
    # If you are blocked by the IP address quota, a bigger machine_type
    # reduces the number of IPs needed.
    # '--machine_type=n1-standard-8',
]
from tfx.components import CsvExampleGen, StatisticsGen, SchemaGen
from tfx.proto import example_gen_pb2
from tfx.utils.dsl_utils import external_input

def create_pipeline():
    # Treat the entire input as a single train split (no eval split).
    no_eval_config = example_gen_pb2.Input(splits=[
        example_gen_pb2.Input.Split(name='train', pattern='taxi_dataset.csv'),
    ])
    example_gen = CsvExampleGen(input=external_input(
        'gs://<bucket name>/'), input_config=no_eval_config)
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'])
    return pipeline.Pipeline(
        pipeline_name='<pipeline name>',
        pipeline_root='gs://<bucket name>/pipeline_root',
        components=[example_gen, statistics_gen, schema_gen],
        beam_pipeline_args=DATAFLOW_BEAM_PIPELINE_ARGS)
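A minimal sketch of actually executing the pipeline with TFX's Beam-based runner; the DATAFLOW_BEAM_PIPELINE_ARGS above are forwarded through beam_pipeline_args, so the Beam steps run on Dataflow:

from tfx.orchestration.beam.beam_dag_runner import BeamDagRunner

BeamDagRunner().run(create_pipeline())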
import tensorflow_model_analysis as tfma

metrics_spec = tfma.MetricsSpec(
    # Any metrics added into the saved model (for example via
    # model.compile(..., metrics=[...]), etc.) will be computed
    # automatically.
    metrics=[
        tfma.MetricConfig(class_name='ExampleCount')
    ],
    # To add validation thresholds for metrics saved with the model,
    # add them keyed by metric name to the thresholds map.
    thresholds={})
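A hedged sketch of where this spec slots in: a full tfma.EvalConfig; the '<label key>' placeholder and the single overall slice are assumptions here:

eval_config = tfma.EvalConfig(
    model_specs=[tfma.ModelSpec(label_key='<label key>')],
    metrics_specs=[metrics_spec],
    slicing_specs=[tfma.SlicingSpec()])  # empty SlicingSpec = overall slice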
raw_dataset = (raw_data, RAW_DATA_METADATA)
transformed_dataset, transform_fn = (
    raw_dataset | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
transformed_data, transformed_metadata = transformed_dataset
transformed_data_coder = tft.coders.ExampleProtoCoder(
    transformed_metadata.schema)
_ = (
    transformed_data
    | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
    | 'WriteTrainData' >> beam.io.WriteToTFRecord('<output path>'))
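Note that AnalyzeAndTransformDataset needs a temporary location for the analyzers' intermediate results; a minimal sketch wrapping the transform in a tft_beam.Context:

import tempfile

with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    transformed_dataset, transform_fn = (
        raw_dataset | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))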
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    # Since we are modifying some features and leaving others unchanged, we
    # start by setting `outputs` to a copy of `inputs`.
    outputs = inputs.copy()
    # Scale numeric columns to have range [0, 1].
    for key in NUMERIC_FEATURE_KEYS:
        outputs[key] = tft.scale_to_0_1(outputs[key])
    return outputs
from tensorflow_transform.tf_metadata import dataset_metadata, dataset_schema

raw_data = [
    {'x': 1, 'y': 1, 's': 'hello'},
    {'x': 2, 'y': 2, 's': 'world'},
    {'x': 3, 'y': 3, 's': 'hello'}
]
raw_data_metadata = dataset_metadata.DatasetMetadata(
    dataset_schema.from_feature_spec({
        'y': tf.io.FixedLenFeature([], tf.float32),
        'x': tf.io.FixedLenFeature([], tf.float32),
        's': tf.io.FixedLenFeature([], tf.string),
    }))
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    x = inputs['x']
    y = inputs['y']
    s = inputs['s']
    x_centered = x - tft.mean(x)
    y_normalized = tft.scale_to_0_1(y)
    s_integerized = tft.compute_and_apply_vocabulary(s)
    x_centered_times_y_normalized = (x_centered * y_normalized)
    return {
        'x_centered': x_centered,
        'y_normalized': y_normalized,
        's_integerized': s_integerized,
        'x_centered_times_y_normalized': x_centered_times_y_normalized,
    }
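Tying the snippets together, a local end-to-end sketch using the raw_data and raw_data_metadata defined above; the expected values in the comments follow from tft.mean(x) == 2, the [0, 1] scaling of y, and the frequency-ordered vocabulary over s:

import tempfile

with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    (transformed_data, transformed_metadata), transform_fn = (
        (raw_data, raw_data_metadata)
        | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

print(transformed_data)
# x_centered: [-1.0, 0.0, 1.0]; y_normalized: [0.0, 0.5, 1.0]
# s_integerized: 'hello' -> 0, 'world' -> 1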