Skip to content

Instantly share code, notes, and snippets.

@mh0w
Last active May 6, 2025 14:34
Show Gist options
  • Save mh0w/9d7b5de30fa5da2f6826be5875a2d06b to your computer and use it in GitHub Desktop.
Save mh0w/9d7b5de30fa5da2f6826be5875a2d06b to your computer and use it in GitHub Desktop.
SQL intro
-- SQL: Structured Query Language for communicating with relational (tabular) databases
-- RDBMS: Relational Database Management Systems
-- CRUD: Create, Read, Update, and Delete
-- Databases contain tables, plus potentially Views and Queries too
-- Once connected to the server database, the following example snippets work
-- These snippets were written primarily with Hive and PySpark SQL in mind
------------------
-- Querying data -
------------------
-- All columns and rows from a table
SELECT *
FROM nspl24feb
-- Certain columns and all rows from a table, renaming one column via AS
SELECT
    forename AS first_name,
    last_name
FROM my_table
-- Conditionals: AND operator to specify multiple conditions (all must hold)
SELECT *
FROM my_table
WHERE height >= 170
  AND age >= 30
-- Conditional: = operator --> exact match
SELECT *
FROM my_table
WHERE last_name = 'Smith'
-- Conditional: LIKE with no wildcards --> exact match, same as =
SELECT *
FROM my_table
WHERE last_name LIKE 'Smith'
-- Conditional: % wildcard matches any run of characters (including none)
-- --> anything CONTAINING Smith (Smith, Smithson, McSmith, etc)
SELECT *
FROM my_table
WHERE last_name LIKE '%Smith%'
-- Conditional: % wildcard --> starting with A and ending with n (Aaron, Arton, etc)
SELECT *
FROM my_table
WHERE last_name LIKE 'A%n'
-- Conditional: _ wildcard matches exactly one character (Tim, Tom, etc)
SELECT *
FROM my_table
WHERE last_name LIKE 'T_m'
-- Conditional: exact match against a list of values
SELECT *
FROM my_table
WHERE last_name IN ('Sally', 'Ang', 'Bart')
-- Conditional: BETWEEN operator (inclusive at both ends: 30 and 50 both match)
SELECT *
FROM my_table
WHERE age BETWEEN 30 AND 50
-- Conditional: IS NULL / IS NOT NULL (never use = NULL; it yields UNKNOWN, not TRUE)
SELECT *
FROM my_table
WHERE age IS NULL
-- Order rows (ascending by default)
SELECT *
FROM my_table
ORDER BY age
-- Order rows (descending)
SELECT *
FROM my_table
ORDER BY age DESC
-- Append (INSERT INTO): add all rows of my_table_temp to the bottom of
-- my_table (assumes the column lists of the two tables align)
INSERT INTO my_db.my_table
SELECT *
FROM my_db.my_table_temp
-- Join, INNER: keep only rows whose join key appears in BOTH tables
SELECT
    my_table_1.name_column,
    my_table_1.age,
    my_table_2.weight
FROM my_table_1
INNER JOIN my_table_2
    ON my_table_1.name_column = my_table_2.name_column
-- Same join, with short table aliases (a and b) to cut repetition
SELECT
    a.name_column,
    a.age,
    b.weight
FROM my_table_1 a
INNER JOIN my_table_2 b
    ON a.name_column = b.name_column
-- Aggregation: GROUP BY with SUM, COUNT, AVG
-- Every non-aggregated SELECT column must appear in GROUP BY (ANSI rule;
-- grouping by full_name alone, as before, is rejected by Hive/Spark SQL
-- because age and height would then be ambiguous within a group).
SELECT
    a.full_name,
    a.age,
    b.height,
    SUM(b.wage) AS total_wages,
    COUNT(b.wage) AS qty_wages,  -- counts non-NULL wages only
    AVG(b.wage) AS avg_wage
FROM my_table_1 a
INNER JOIN my_table_2 b ON a.full_name = b.full_name
GROUP BY
    a.full_name,
    a.age,
    b.height
-- HAVING filters groups AFTER aggregation (WHERE filters rows before).
-- Repeat the aggregate expression here: ANSI SQL does not allow SELECT
-- aliases in HAVING, so SUM(b.wage) is the portable form.
HAVING SUM(b.wage) > 10000
ORDER BY total_wages DESC
-- Rename a table
ALTER TABLE table_name RENAME TO new_table_name;
-- Check partitions
-- NOTE: this next line is PySpark (Python), not SQL -- run it from a Python
-- session; the string it submits ("SHOW PARTITIONS ...") is the actual SQL
spark.sql("SHOW PARTITIONS hawkem_dev.table_1_toy_data")
-- Check tables in database
SHOW TABLES IN my_database_name
-- Drop table (the IF EXISTS form does not error when the table is absent)
DROP TABLE databasename.tablename
DROP TABLE IF EXISTS databasename.tablename
-- Create table as select (CTAS) - can be identical/backup or altered
-- Plain copy of table1
CREATE TABLE databasename.table2 AS SELECT * FROM databasename.table1
-- Copy, storing the new table in Parquet format
CREATE TABLE databasename.table2 STORED AS PARQUET AS SELECT * FROM databasename.table1
-- Filtered copy (LIKE with no wildcards is an exact match on surname)
CREATE TABLE databasename.table2 AS SELECT * FROM databasename.table1 WHERE surname LIKE 'Smith'
-- Distinct/unique values
SELECT DISTINCT surname FROM database.table
-- Count of distinct values
SELECT COUNT(DISTINCT surname) FROM database.table
-- 1) ROW FORMAT SERDE "org.apache.hadoop.hive.serde2.RegexSerDe" tells Hive to use this class to serialize
-- and deserialize the rows to/from the file. (Note: this is the BUILT-IN serde2.RegexSerDe; the next
-- example uses the older contrib.serde2.RegexSerDe instead -- the class named here must match the
-- ROW FORMAT SERDE line below.)
-- 2) input.regex is a property used by this class (RegexSerDe) to deserialize the rows read from the table
-- data. The regex pattern is applied to each row value read from the file to split it up into the
-- different columns defined in the metadata for this Hive table.
CREATE TABLE IF NOT EXISTS database.weblogs (
ip_address STRING,
client_id INT,
date_time STRING,
request STRING,
status INT,
bytes INT,
referer STRING,
user_agent STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
-- This regex pattern will parse eight columns. Each pair of parentheses () in the pattern captures a group,
-- and there are eight such groups in this pattern -- one per column declared above.
"input.regex" = "^([\\d.]+) \\S+ (\\d+) \\[(.+?)\\] \"(.+?)\" (\\d{3}) (\\d+) \"(.+?)\"\\se* \"(.+?)\".*$"
)
-- The LOCATION of the (external) data files to expose as this table
LOCATION '/spark-perf/ecommerce/weblogs/raw'
-- 1) ROW FORMAT SERDE "org.apache.hadoop.hive.contrib.serde2.RegexSerDe" tells Hive to use this class to
-- serialize and deserialize the rows to/from the file.
-- NOTE(review): this is the legacy contrib RegexSerDe (the previous example uses the built-in
-- serde2.RegexSerDe) -- confirm the contrib jar is available on the cluster before relying on this.
-- 2) input.regex is a property used by this class (RegexSerDe) to deserialize the rows read from the table
-- data. The regex pattern is applied to each row value read from the file to split it up into the
-- different columns defined in the metadata for this Hive table.
-- 3) output.format.string is a property used by this class (RegexSerDe) to serialize the rows being written
-- out to this table data. This value is used as a format to generate a row value (from its column values)
-- that is to be written back to the output file for this Hive table.
CREATE TABLE intermediate_access_logs (
ip STRING,
-- NOTE(review): "date" is a reserved word in many SQL dialects; consider renaming (e.g. log_date)
date STRING,
method STRING,
url STRING,
http_version STRING,
code1 STRING,
code2 STRING,
dash STRING,
user_agent STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
-- Nine capture groups, one per column declared above
'input.regex' = '([^ ]*) - - \\[([^\\]]*)\\] "([^\ ]*) ([^\ ]*) ([^\ ]*)" (\\d*) (\\d*) "([^"]*)" "([^"]*)"',
'output.format.string' = "%1$$s %2$$s %3$$s %4$$s %5$$s %6$$s %7$$s %8$$s %9$$s"
)
-- The LOCATION of the (external) data files to expose as this table
LOCATION '/spark-perf/ecommerce/weblogs/raw'
-- Rounded frequency table: bucket values to the nearest integer, then count
SELECT
    ROUND(column_name, 0) AS rounded_value,
    COUNT(*) AS frequency
FROM db_name.table_name
GROUP BY ROUND(column_name, 0)
ORDER BY frequency DESC;
-- Frequency table / value_counts
SELECT
    epoch,
    COUNT(*) AS count
FROM your_table_name
GROUP BY epoch;
@mh0w
Copy link
Author

mh0w commented Feb 21, 2025

DESCRIBE DB.TABLE
DESCRIBE EXTENDED
DESCRIBE FORMATTED

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment