I hereby claim:
- I am mattfaus on github.
- I am mattfaus (https://keybase.io/mattfaus) on keybase.
- I have a public key whose fingerprint is 1CF5 6643 9369 2689 9402 2358 69E8 0354 58E5 E154
To claim this, I am signing this object:
import db_util

db_util.enable_db_protobuf_projection()
db_util.enable_ndb_protobuf_projection()
class BatchedGcsCsvShardFileWriter(object):
    """Writes CSV data into multiple output shards, grouping rows by keys.

    This class is a context manager, which closes all shards upon exit.

    Say you are writing a lot of CSV data, like:

        [0, "Bakery"],
        [2, "Francisco"],
        [3, "Matt"],
class SortedGcsCsvShardFileMergeReader(object):
    """Merges several sorted .csv files stored on GCS.

    This class is both an iterator and a context manager.

    Let's say there are 2 .csv files stored on GCS, with contents like:

    /bucket/file_1.csv:
        [0, "Matt"],
        [0, "Sam"],
class ParallelInMemorySortGcsCsvShardFiles(pipeline.Pipeline):

    def run(self, input_bucket, input_pattern, sort_columns,
            model_type, output_bucket, output_pattern):
        """Sorts each input file in-memory, then writes it to an output file.

        Arguments:
            input_bucket - The GCS bucket which contains the unsorted .csv
                files.
            input_pattern - A regular expression used to find files in the
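The per-file work that the pipeline fans out is not shown in this excerpt. Below is a minimal sketch of sorting a single shard in memory, assuming the shard fits in memory and that sort_columns is a list of column indexes; it is an illustration of the step the docstring describes, not the pipeline's actual implementation:

import csv

def sort_shard_in_memory(input_path, output_path, sort_columns):
    """Reads one .csv shard, sorts its rows in memory, writes them back out.

    Illustrative sketch only; assumes the whole shard fits in memory.
    """
    with open(input_path) as fin:
        rows = list(csv.reader(fin))
    rows.sort(key=lambda row: [row[i] for i in sort_columns])
    with open(output_path, "w") as fout:
        csv.writer(fout).writerows(rows)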
class DeterministicCompressedFeatures(CompressedFeatures):
    """Generates random components after seeding with the component_key.

    By using a known seed to generate the random components, we do not need
    to store or manage them. We can just recompute them whenever we need.
    """

    def __init__(self, num_features=RANDOM_FEATURE_LENGTH):
        super(DeterministicCompressedFeatures, self).__init__(num_features)
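The recompute-on-demand idea in the docstring can be illustrated with a seeded pseudo-random generator. The hashing scheme and the use of numpy below are assumptions for the sketch, not the class's actual code:

import hashlib

import numpy as np

def deterministic_random_component(component_key, num_features):
    """Derives a component's random vector from a seed that is a pure
    function of component_key, so it can be recomputed instead of stored.

    Illustrative sketch only; hashing scheme and numpy are assumptions.
    """
    seed = int(hashlib.md5(component_key.encode("utf-8")).hexdigest(),
               16) % (2 ** 32)
    rng = np.random.RandomState(seed)
    return rng.normal(size=num_features)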
{
    u'fields': [{
        u'type': u'STRING',
        u'name': u'playlists',
        u'mode': u'REPEATED'
    }, {
        u'type': u'STRING',
        u'name': u'source_table',
        u'mode': u'NULLABLE'
    }, {
def get_table_schema(dataset, table):
    """If the table exists, returns its schema. Otherwise, returns None."""
    table_service = BigQueryService.get_service().tables()
    try:
        get_result = table_service.get(
            projectId=BQ_PROJECT_ID,
            datasetId=dataset,
            tableId=table
        ).execute()
        return get_result['schema']
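A hedged usage sketch, assuming BQ_PROJECT_ID and BigQueryService are already configured in the surrounding module as in the excerpt:

# Hypothetical usage sketch; dataset and table names are made up.
schema = get_table_schema("my_dataset", "my_table")
if schema is None:
    print("Table does not exist yet")
else:
    for field in schema["fields"]:
        print("%s %s" % (field["name"], field["type"]))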
import collections
import jinja2
import logging
import os

import request_handler
import third_party.mapreduce
import third_party.mapreduce.input_readers
import third_party.mapreduce.output_writers
import third_party.mapreduce.lib.files
import third_party.mapreduce.operation
class TransformedVideoTranslationInfo(bq_property_transform.TransformedEntity):

    CUSTOM_SCHEMAS = {
        'translated_youtube_ids': {
            'name': 'translated_youtube_ids',
            'type': 'record',
            'mode': 'repeated',
            'fields': [
                {'name': 'language',
                 'type': 'string'},
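A repeated record field of this shape corresponds to rows like the example below; the values are made up, and only the 'language' sub-field visible in the excerpt is shown:

# Illustrative row matching the repeated-record schema above.
example_row = {
    'translated_youtube_ids': [
        {'language': 'es'},
        {'language': 'fr'},
    ],
}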