janbenetka’s gists

janbenetka / partitioned_table.sql

Last active October 1, 2020 17:18

[BigQuery Partitioned Table DDL] #sql #bigquery

	CREATE TABLE dataset.new_table
	PARTITION BY DATE(timestamp_column) AS
	SELECT x, y, z, timestamp_column
	FROM dataset.existing_table

	CREATE OR REPLACE TABLE `uc-prox-core-dev.14_days_retention.home_od_flux_2020_aggregated_condensed`
	PARTITION BY minProcessingDate
	AS
	SELECT...

janbenetka / bar_chart_plotly.py

Last active August 20, 2021 05:16

[Bar chart in Plotly] #plotly #python

	import plotly.graph_objects as go

	countries = increase_per_country.country

	fig = go.Figure()
	fig.add_trace(go.Bar(
	x=countries,
	y=increase_per_country.identifier_count_new,
	name='New data',
	marker_color='#FF8000'

janbenetka / pandas_display_options.py

Created October 3, 2020 22:23

[Pandas settings/options] Setting display options in pandas #pandas #dataframes

	import pandas as pd
	pd.options.display.max_columns = 50 # None -> No Restrictions
	pd.options.display.max_rows = 200 # None -> Be careful with this
	pd.options.display.max_colwidth = 100
	pd.options.display.precision = 3

janbenetka / total_row_col_pandas.py

Created October 3, 2020 22:25

[Total sum row and column in Pandas] Add total row and coumn in Pandas dataframe #pandas #dataframes

	df = pd.DataFrame(dict(A=[2,6,3],
	B=[2,2,6],
	C=[3,2,3]))
	df['col_total'] = df.apply(lambda x: x.sum(), axis=1)
	df.loc['row_total'] = df.apply(lambda x: x.sum())

janbenetka / pandas_value_counts.py

Created October 3, 2020 22:30

[Pandas value counts] Counts number of observations for distinct values #pandas #dataframes

	# number of counts per value
	df['genre'].value_counts()

	# fraction per value (sums up to 1.0)
	df['genre'].value_counts(normalize=True)

janbenetka / country_codes.txt

Last active October 5, 2020 07:55

[Country codes] #geo

janbenetka / group_ratio.sql

Created October 7, 2020 05:53

[Ratio per group in BigQuery] #bigquery

	with per_os AS (
	SELECT
	osType,
	fips,
	count(*) count,
	any_value(geog) geog
	FROM `uc-puzzle-data.clusters_dwh.cluster_US` clusters
	, `uc-atlas.maps_us.census_tracts` tract
	WHERE
	localEventDate BETWEEN "2020-01-15" AND "2020-01-30"

janbenetka / hist_grouping.sql

Created October 14, 2020 00:48

[Historgram grouping in BigQuery] #bigquery #sql

	with count_per_identifier AS (
	SELECT
	identifier,
	COUNT(DISTINCT home_area_id) home_count
	FROM `uc-prox-core-dev.30_day_retention.home_identifier_weekly_8w_20d_100h`
	GROUP BY identifier
	)
	SELECT home_count, SUM(count) count FROM (SELECT 1 as count, home_count FROM count_per_identifier)
	GROUP BY home_count
	ORDER BY home_count

janbenetka / parallel_plot.py

Created October 15, 2020 16:16

[Parallel plot for ranked data] #plotly #ranking #pandas

	import plotly.express as px
	df = px.data.iris()
	fig = px.parallel_coordinates(df, color="species_id",
	dimensions=['sepal_width', 'sepal_length', 'petal_width',
	'petal_length'],
	color_continuous_scale=px.colors.diverging.Tealrose,
	color_continuous_midpoint=2)
	fig.show()

	#-----------------------------------

janbenetka / group_concat.sql

Created October 28, 2020 21:49

[BigQuery: Concat all text of grouped by rows] #bigquery #groupby

	SELECT url, COUNT(*) AS popularity, GROUP_CONCAT(tweet)
	FROM Table GROUP BY url ORDER BY popularity

Jan Benetka janbenetka