Alec Barrett-Wilsdon alecbw

Preface

In general, AWS services can be accessed using

I opted for the API library since it is

Don’t SELECT *; specify explicit column names (the data is in a columnar store, so only the columns you name have to be read)
Avoid large JOINs (filter each table first)
- In Presto, tables are joined in the order they are listed!
- Join small tables earlier in the plan and leave larger fact tables to the end
- Avoid cross joins and one-to-many joins, as these can degrade performance
ORDER BY and GROUP BY are expensive
- Only use ORDER BY in subqueries if it is really necessary
When using GROUP BY, order the columns from the highest cardinality (that is, the greatest number of unique values) to the lowest.

	#!/usr/bin/python
	# -*- coding: utf-8 -*-

	'''Exchange gzip-compressed files directly between a Python 3 application and S3.
	Python 2 version - https://gist.github.com/a-hisame/f90815f4fae695ad3f16cb48a81ec06e
	'''

	import io
	import gzip
	import json

	from selenium import webdriver
	from selenium.webdriver.common.proxy import Proxy
	from selenium.webdriver.common.keys import Keys
	from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
	from selenium.webdriver.chrome.options import Options
	import zipfile,os

	def proxy_chrome(PROXY_HOST,PROXY_PORT,PROXY_USER,PROXY_PASS):
	manifest_json = """
	{

	How to search in all countries but the US (or any other for that matter)?

	Linkedin Country codes: https://developer.linkedin.com/docs/reference/country-codes#

	LinkedIn faceted search URL format: %5B"ca%3A0"%2C"au%3A0"%2C"es%3A0"%5D
	Decoded URL: ["ca:0","au:0","es:0"]

	=> Complete list for injection in url (remove the country you want to exclude):

	["ae:0","ar:0","at:0","au:0","be:0","br:0","ca:0","ch:0","cl:0","cn:0","co:0","cz:0","de:0","dk:0","es:0","fi:0","fr:0","fx:0","gb:0","gr:0","hk:0","hr:0","hu:0","id:0","ie:0","il:0","in:0","is:0","it:0","jp:0","lb:0","lu:0","lv:0","ma:0","mc:0","mx:0","my:0","nl:0","no:0","nz:0","oo:0","pe:0","ph:0","pk:0","pl:0","pr:0","pt:0","py:0","qa:0","ro:0","ru:0","sa:0","se:0","sg:0","sk:0","th:0","tr:0","tw:0","ua:0","us:0","uy:0","ve:0","vn:0","yu:0","za:0"]

	# Fetch the page, pull out the .js files referenced by its src attributes
	# (rebuilt as https:// URLs), then download each script and print the
	# de-duplicated quoted URL/path strings and .get/.post/.ajax/.load targets in it.
	# NOTE(review): the `\|` sequences look like markdown-table escaping of plain
	# `|` pipes/alternations — confirm and unescape before running.
	curl -L -k -s https://www.example.com \| tac \| sed "s#\\\/#\/#g" \| egrep -o "src['\"]?\s[=:]\s['\"]?[^'\"]+.js[^'\"> ]" \| awk -F '//' '{if(length($2))print "https://"$2}' \| sort -fu \| xargs -I '%' sh -c "curl -k -s \"%\" \| sed \"s/[;}\)>]/\n/g\" \| grep -Po \"(['\\\"](https?:)?[/]{1,2}[^'\\\"> ]{5,})\|(\.(get\|post\|ajax\|load)\s\(\s*['\\\"](https?:)?[/]{1,2}[^'\\\"> ]{5,})\"" \| awk -F "['\"]" '{print $2}' \| sort -fu

	# using linkfinder
	# ejs URL
	#   Downloads the page at URL, extracts the .js files referenced by src
	#   attributes, resolves each one (absolute http(s) kept as-is, "//host/..."
	#   prefixed with "https:", anything else appended to URL + "/"), then for
	#   every unique script: prints a "##### <script>" header, downloads it with
	#   wget into the current directory and runs linkfinder.py in CLI output mode
	#   on the downloaded file.
	# NOTE(review): the `\|` sequences look like markdown-table escaping of plain
	# `|` pipes — confirm and unescape before running.
	function ejs() {
	# $1: root URL of the page to inspect; relative script paths are resolved against it.
	URL=$1;
	curl -Lks $URL \| tac \| sed "s#\\\/#\/#g" \| egrep -o "src['\"]?\s[=:]\s['\"]?[^'\"]+.js[^'\"> ]*" \| sed -r "s/^src['\"]?[=:]['\"]//g" \| awk -v url=$URL '{if(length($1)) if($1 ~/^http/) print $1; else if($1 ~/^\/\//) print "https:"$1; else print url"/"$1}' \| sort -fu \| xargs -I '%' sh -c "echo \"\n##### %\";wget --no-check-certificate --quiet \"%\"; basename \"%\" \| xargs -I \"#\" sh -c 'linkfinder.py -o cli -i #'"
	}

	# with file download (the new best one):
	# but there is a bug if you don't provide a root url

	# Load the Parquet files found under inputPath on S3 into a Glue DynamicFrame.
	input_dyf = glueContext.create_dynamic_frame_from_options(
		connection_type="s3",
		connection_options={
			"paths": [inputPath],
			"recurse": True,
			"groupFiles": "inPartition",
		},
		format="parquet",
	)

	# Repartition them as required

	https://jennamolby.com/how-to-use-cookies-to-capture-url-parameters/

	// Placeholder value: replace "YOUR_DOMAIN.TLD" with your own domain before use.
	// NOTE(review): the code that consumes this variable is not visible in this excerpt.
	let YOUR_DOMAIN = "YOUR_DOMAIN.TLD" // ex: scrapingbee.com

	/**
	 * Return the value of a query-string parameter from the current page URL.
	 *
	 * The lookup reads `location.search` and returns the first occurrence of
	 * the parameter.
	 *
	 * @param {string} name - Parameter name; it is matched literally.
	 * @returns {string} The URI-decoded value with "+" converted to a space,
	 *   or "" when the parameter is absent.
	 */
	function getParameterByName(name) {
		// Escape every regex metacharacter, globally. The previous version only
		// escaped the first "[" and the first "]", so names such as "a[0][1]"
		// or "a.b" could match a different parameter.
		var escapedName = name.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
		var regex = new RegExp("[\\?&]" + escapedName + "=([^&#]*)"),
			results = regex.exec(location.search);
		// "+" encodes a space in query strings and decodeURIComponent leaves it alone.
		return results === null ? "" : decodeURIComponent(results[1].replace(/\+/g, " "));
	}

	import geopandas as gpd
	import pandas as pd
	import fiona
	import sys
	import numpy as np
	import folium
	from folium.plugins import Search

	#data from https://frap.fire.ca.gov/mapping/gis-data/
	#create a list of layers within a file geodatabase