I hereby claim:
- I am andrewgross on github.
- I am andrewwgross (https://keybase.io/andrewwgross) on keybase.
- I have a public key ASDsj8ie3y_QBUpm4aBzm-ty7Hr9w_Y5PtWIcLZfQlt9JQo
To claim this, I am signing this object:
import os
import re
from math import ceil
from urllib.parse import urlparse

import pyspark.sql.types as T
from pyspark.sql.functions import desc, asc
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    LongType,
)


def repartition_for_writing(df):
    # Estimate a sensible output file count from the row count and
    # sampled string-column sizes, then repartition before writing.
    count = df.count()
    sampled_df = get_sampled_df(df, count=count)
    string_column_sizes = get_string_column_sizes(sampled_df)
    num_files = get_num_files(count, df.schema, string_column_sizes)
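The preview ends before the helpers are defined. A minimal sketch of what get_sampled_df, get_string_column_sizes, and get_num_files might look like, under assumed values (the 128 MB target, the 10,000-row sample, and the 8-byte fixed width are all guesses, not the original implementation):

# Hypothetical reconstruction; the original helpers are not shown in the preview.
import pyspark.sql.functions as F
import pyspark.sql.types as T
from math import ceil

TARGET_FILE_SIZE = 128 * 1024 * 1024  # assumed 128 MB target per output file


def get_sampled_df(df, count, sample_size=10000):
    # Sample a fixed number of rows so size estimation stays cheap.
    fraction = min(1.0, float(sample_size) / max(count, 1))
    return df.sample(withReplacement=False, fraction=fraction)


def get_string_column_sizes(sampled_df):
    # Average observed length of each string column in the sample.
    sizes = {}
    string_cols = [f.name for f in sampled_df.schema if isinstance(f.dataType, T.StringType)]
    for name in string_cols:
        row = sampled_df.select(F.avg(F.length(F.col(name)))).first()
        sizes[name] = row[0] or 0
    return sizes


def get_num_files(count, schema, string_column_sizes, fixed_width=8):
    # Estimate bytes per row: measured averages for string columns,
    # an assumed fixed width for everything else.
    row_size = 0
    for field in schema:
        if isinstance(field.dataType, T.StringType):
            row_size += string_column_sizes.get(field.name, fixed_width)
        else:
            row_size += fixed_width
    total_bytes = count * row_size
    return max(1, int(ceil(float(total_bytes) / TARGET_FILE_SIZE)))

With helpers along these lines, repartition_for_writing would presumably finish with something like return df.repartition(num_files).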
import datetime
import json

BUCKET_NAME = "<s3_bucket_name>"
INVENTORY_PREFIX = "<prefix_given_to_s3_inventory>"  # Should contain data/, hive/, and some dated folders
ACCOUNT_CUID = "<your_canonical_user_id_for_cross_account>"  # The account that does not own the S3 bucket but is trying to access it; controls ROLE_ARN
ROLE_ARN = "<role_in_cross_account_that_can_assume_to_main_account>"


def role_arn_to_session(role_arn):
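Only the signature survives in this preview. A plausible body, assuming the standard boto3 STS assume-role flow (the session name is an invented placeholder):

import boto3


def role_arn_to_session(role_arn):
    # Assume the cross-account role and wrap the temporary
    # credentials it returns in a boto3 Session.
    sts = boto3.client("sts")
    response = sts.assume_role(RoleArn=role_arn, RoleSessionName="inventory-reader")
    creds = response["Credentials"]
    return boto3.Session(
        aws_access_key_id=creds["AccessKeyId"],
        aws_secret_access_key=creds["SecretAccessKey"],
        aws_session_token=creds["SessionToken"],
    )

The returned Session can then hand out clients that act under the assumed role, e.g. session.client("s3") for reading the inventory in BUCKET_NAME.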
def get_files_per_partition(df, partition_key, file_type="parquet", compression="snappy", byte_array_size=256):
    rows = df.count()
    print("Dataset has {} rows".format(rows))
    schema = df.schema
    num_partitions = 1
    if partition_key is not None:
        num_partitions = df.select([partition_key]).distinct().count()
        print("Dataset has {} distinct partition keys".format(num_partitions))
        _df = df.drop(partition_key)
        schema = _df.schema
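The preview cuts off here. Presumably the function goes on to estimate bytes per row from the schema (using byte_array_size for string and binary columns) and divide by a target file size, much like the helpers sketched earlier. A hypothetical call, assuming it returns a per-partition file count (the column name and output path are made up):

files = get_files_per_partition(df, partition_key="event_date")
(df.repartition(files, "event_date")
   .write
   .partitionBy("event_date")
   .parquet("s3://some-bucket/output/", compression="snappy"))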
Hey,

This is going to be a bit of an esoteric ticket. I noticed some strange behavior recently when comparing Spectrum and Redshift results on the same dataset.

Redshift data: fare.txt
Parquet data: fare.parquet

The Parquet data was generated from fare.txt with PySpark, using convert.py on Spark 2.2.0.
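convert.py is not attached to this ticket; a minimal sketch of what such a conversion script might look like on Spark 2.2.0, assuming a delimited text input (the delimiter, header flag, and paths are guesses):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("convert").getOrCreate()

# Read the delimited text file and write it back out as Parquet.
df = spark.read.csv("fare.txt", header=True, inferSchema=True, sep="|")
df.write.parquet("fare.parquet")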
Redshift Table Schema:
import cmd
import re
import subprocess
import sys
from urllib.parse import urlparse


def write_to_clipboard(output):
    # Pipe the text into macOS pbcopy; the UTF-8 LANG setting keeps
    # non-ASCII characters intact.
    process = subprocess.Popen(
        'pbcopy', env={'LANG': 'en_US.UTF-8'}, stdin=subprocess.PIPE)
    process.communicate(output.encode('utf-8'))
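The reverse direction is the same pattern with pbpaste; a small companion sketch, not part of the original gist:

def read_from_clipboard():
    # pbpaste writes the clipboard contents to stdout.
    process = subprocess.Popen(
        'pbpaste', env={'LANG': 'en_US.UTF-8'}, stdout=subprocess.PIPE)
    stdout, _ = process.communicate()
    return stdout.decode('utf-8')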
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import base64
import json
import logging
from urlparse import parse_qs

import requests
// One hour ago: go back one calendar day, then add 23 hours.
// Equivalent (outside DST transitions) to time.Now().Add(-time.Hour).
one_hour_ago := time.Now().AddDate(0, 0, -1).Add(time.Hour * 23)
flip() {
  echo;
  echo -en "( º_º) ┬─┬ \r"; sleep .2;
  echo -en " ( º_º) ┬─┬ \r"; sleep .2;
  echo -en " ( ºДº)┬─┬ \r"; sleep .2;
  echo -en " (╯'Д')╯︵⊏ \r"; sleep .1;
  echo -en " (╯'□')╯︵ ⊏ \r"; sleep .1;
  echo " (╯°□°)╯︵ ┻━┻"; sleep .1;
}