Skip to content

Instantly share code, notes, and snippets.

@mh0w
Last active May 6, 2025 14:34
Show Gist options
  • Save mh0w/9d7b5de30fa5da2f6826be5875a2d06b to your computer and use it in GitHub Desktop.
Save mh0w/9d7b5de30fa5da2f6826be5875a2d06b to your computer and use it in GitHub Desktop.
SQL intro
-- SQL: Structured Query Language for communicating with relational (tabular) databases
-- RDBMS: Relational Database Management Systems
-- CRUD: Create, Read, Update, and Delete
-- Databases contain tables, plus potentially Views and Queries too
-- Once connected to the server database, the following example snippets work
-- These snippets were written primarily with Hive and PySpark SQL in mind
------------------
-- Querying data -
------------------
-- All columns and rows from a table
SELECT *
FROM nspl24feb
-- Certain columns and all rows from a table, renaming one column via AS
SELECT
    forename AS first_name,
    last_name
FROM my_table
-- Conditionals: AND operator to specify multiple conditions (all must hold)
SELECT *
FROM my_table
WHERE height >= 170
  AND age >= 30
-- Conditional: = operator --> exact match
SELECT *
FROM my_table
WHERE last_name = 'Smith'
-- Conditional: LIKE with no wildcards --> exact match, same as =
SELECT *
FROM my_table
WHERE last_name LIKE 'Smith'
-- Conditional: % wildcard matches any run of characters (including none)
-- --> anything CONTAINING Smith (Smith, Smithson, McSmith, etc)
SELECT *
FROM my_table
WHERE last_name LIKE '%Smith%'
-- Conditional: % wildcard --> starting with A and ending with n (Aaron, Arton, etc)
SELECT *
FROM my_table
WHERE last_name LIKE 'A%n'
-- Conditional: _ wildcard matches exactly one character (Tim, Tom, etc)
SELECT *
FROM my_table
WHERE last_name LIKE 'T_m'
-- Conditional: exact match against a list of values
SELECT *
FROM my_table
WHERE last_name IN ('Sally', 'Ang', 'Bart')
-- Conditional: BETWEEN operator (inclusive at both ends: 30 and 50 both match)
SELECT *
FROM my_table
WHERE age BETWEEN 30 AND 50
-- Conditional: IS NULL / IS NOT NULL (never use = NULL; it yields UNKNOWN, not TRUE)
SELECT *
FROM my_table
WHERE age IS NULL
-- Order rows (ascending by default)
SELECT *
FROM my_table
ORDER BY age
-- Order rows (descending)
SELECT *
FROM my_table
ORDER BY age DESC
-- Append (INSERT INTO): add all rows of my_table_temp to the bottom of
-- my_table (assumes the column lists of the two tables align)
INSERT INTO my_db.my_table
SELECT *
FROM my_db.my_table_temp
-- Join, INNER: keep only rows whose join key appears in BOTH tables
SELECT
    my_table_1.name_column,
    my_table_1.age,
    my_table_2.weight
FROM my_table_1
INNER JOIN my_table_2
    ON my_table_1.name_column = my_table_2.name_column
-- Same join, with short table aliases (a and b) to cut repetition
SELECT
    a.name_column,
    a.age,
    b.weight
FROM my_table_1 a
INNER JOIN my_table_2 b
    ON a.name_column = b.name_column
-- Aggregation: GROUP BY with SUM, COUNT, AVG
-- Every non-aggregated SELECT column must appear in GROUP BY (ANSI rule;
-- grouping by full_name alone, as before, is rejected by Hive/Spark SQL
-- because age and height would then be ambiguous within a group).
SELECT
    a.full_name,
    a.age,
    b.height,
    SUM(b.wage) AS total_wages,
    COUNT(b.wage) AS qty_wages,  -- counts non-NULL wages only
    AVG(b.wage) AS avg_wage
FROM my_table_1 a
INNER JOIN my_table_2 b ON a.full_name = b.full_name
GROUP BY
    a.full_name,
    a.age,
    b.height
-- HAVING filters groups AFTER aggregation (WHERE filters rows before).
-- Repeat the aggregate expression here: ANSI SQL does not allow SELECT
-- aliases in HAVING, so SUM(b.wage) is the portable form.
HAVING SUM(b.wage) > 10000
ORDER BY total_wages DESC
-- Rename a table
ALTER TABLE table_name RENAME TO new_table_name;
-- Check partitions
-- NOTE: this next line is PySpark (Python), not SQL -- run it from a Python
-- session; the string it submits ("SHOW PARTITIONS ...") is the actual SQL
spark.sql("SHOW PARTITIONS hawkem_dev.table_1_toy_data")
-- Check tables in database
SHOW TABLES IN my_database_name
-- Drop table (the IF EXISTS form does not error when the table is absent)
DROP TABLE databasename.tablename
DROP TABLE IF EXISTS databasename.tablename
-- Create table as select (CTAS) - can be identical/backup or altered
-- Plain copy of table1
CREATE TABLE databasename.table2 AS SELECT * FROM databasename.table1
-- Copy, storing the new table in Parquet format
CREATE TABLE databasename.table2 STORED AS PARQUET AS SELECT * FROM databasename.table1
-- Filtered copy (LIKE with no wildcards is an exact match on surname)
CREATE TABLE databasename.table2 AS SELECT * FROM databasename.table1 WHERE surname LIKE 'Smith'
-- Distinct/unique values
SELECT DISTINCT surname FROM database.table
-- Count of distinct values
SELECT COUNT(DISTINCT surname) FROM database.table
-- 1) ROW FORMAT SERDE "org.apache.hadoop.hive.serde2.RegexSerDe" tells Hive to use this class to serialize
-- and deserialize the rows to/from the file. (Note: this is the BUILT-IN serde2.RegexSerDe; the next
-- example uses the older contrib.serde2.RegexSerDe instead -- the class named here must match the
-- ROW FORMAT SERDE line below.)
-- 2) input.regex is a property used by this class (RegexSerDe) to deserialize the rows read from the table
-- data. The regex pattern is applied to each row value read from the file to split it up into the
-- different columns defined in the metadata for this Hive table.
CREATE TABLE IF NOT EXISTS database.weblogs (
ip_address STRING,
client_id INT,
date_time STRING,
request STRING,
status INT,
bytes INT,
referer STRING,
user_agent STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
-- This regex pattern will parse eight columns. Each pair of parentheses () in the pattern captures a group,
-- and there are eight such groups in this pattern -- one per column declared above.
"input.regex" = "^([\\d.]+) \\S+ (\\d+) \\[(.+?)\\] \"(.+?)\" (\\d{3}) (\\d+) \"(.+?)\"\\se* \"(.+?)\".*$"
)
-- The LOCATION of the (external) data files to expose as this table
LOCATION '/spark-perf/ecommerce/weblogs/raw'
-- 1) ROW FORMAT SERDE "org.apache.hadoop.hive.contrib.serde2.RegexSerDe" tells Hive to use this class to
-- serialize and deserialize the rows to/from the file.
-- NOTE(review): this is the legacy contrib RegexSerDe (the previous example uses the built-in
-- serde2.RegexSerDe) -- confirm the contrib jar is available on the cluster before relying on this.
-- 2) input.regex is a property used by this class (RegexSerDe) to deserialize the rows read from the table
-- data. The regex pattern is applied to each row value read from the file to split it up into the
-- different columns defined in the metadata for this Hive table.
-- 3) output.format.string is a property used by this class (RegexSerDe) to serialize the rows being written
-- out to this table data. This value is used as a format to generate a row value (from its column values)
-- that is to be written back to the output file for this Hive table.
CREATE TABLE intermediate_access_logs (
ip STRING,
-- NOTE(review): "date" is a reserved word in many SQL dialects; consider renaming (e.g. log_date)
date STRING,
method STRING,
url STRING,
http_version STRING,
code1 STRING,
code2 STRING,
dash STRING,
user_agent STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
-- Nine capture groups, one per column declared above
'input.regex' = '([^ ]*) - - \\[([^\\]]*)\\] "([^\ ]*) ([^\ ]*) ([^\ ]*)" (\\d*) (\\d*) "([^"]*)" "([^"]*)"',
'output.format.string' = "%1$$s %2$$s %3$$s %4$$s %5$$s %6$$s %7$$s %8$$s %9$$s"
)
-- The LOCATION of the (external) data files to expose as this table
LOCATION '/spark-perf/ecommerce/weblogs/raw'
-- Rounded frequency table: bucket values to the nearest integer, then count
SELECT
    ROUND(column_name, 0) AS rounded_value,
    COUNT(*) AS frequency
FROM db_name.table_name
GROUP BY ROUND(column_name, 0)
ORDER BY frequency DESC;
-- Frequency table / value_counts
SELECT
    epoch,
    COUNT(*) AS count
FROM your_table_name
GROUP BY epoch;
@mh0w
Copy link
Author

mh0w commented Feb 21, 2025

DESCRIBE DB.TABLE
DESCRIBE EXTENDED
DESCRIBE FORMATTED

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment