SQL intro

-- SQL: Structured Query Language for communicating with relational (tabular) databases
-- RDBMS: Relational Database Management Systems
-- CRUD: Create, Read, Update, and Delete
-- Databases contain tables, plus potentially Views and Queries too
-- Once connected to the server database, the following example snippets work
-- These snippets were written primarily with Hive and PySpark SQL in mind
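
-- For illustration, a minimal CRUD round trip on a hypothetical my_db.people table
-- (assumption: the engine supports UPDATE/DELETE - Hive only does on ACID/transactional
-- tables; plain Hive/Spark SQL tables are effectively append-only)
CREATE TABLE my_db.people (id INT, name STRING, age INT)  -- Create (table)
INSERT INTO my_db.people VALUES (1, 'Sam', 41)            -- Create (row)
SELECT * FROM my_db.people WHERE id = 1                   -- Read
UPDATE my_db.people SET age = 42 WHERE id = 1             -- Update
DELETE FROM my_db.people WHERE id = 1                     -- Delete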

------------------
-- Querying data -
------------------

-- All columns and rows from a table
SELECT
    *
FROM
    nspl24feb

-- Certain columns and all rows from a table, rename one column
SELECT
    forename AS first_name,
    last_name
FROM
    my_table

-- Conditionals: AND operator to specify multiple conditions
SELECT
    *
FROM
    my_table
WHERE
    height >= 170 AND age >= 30
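
-- Conditionals: OR operator keeps rows meeting either condition (a companion
-- sketch to the AND example above, using the same hypothetical columns)
SELECT
    *
FROM
    my_table
WHERE
    height >= 170 OR age >= 30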

-- Conditional: = operator --> exact match
SELECT
    *
FROM
    my_table
WHERE
    last_name = 'Smith'

-- Conditional: LIKE operator --> exact match when no wildcards are used
SELECT
    *
FROM
    my_table
WHERE
    last_name LIKE 'Smith'

-- Conditional: % operator as N-character wildcard --> any value containing Smith (Smith, Smithson, McSmith, etc.)
SELECT
    *
FROM
    my_table
WHERE
    last_name LIKE '%Smith%'

-- Conditional: % operator as N-character wildcard --> any starting with A and ending with n (Aaron, Arton, etc.)
SELECT
    *
FROM
    my_table
WHERE
    last_name LIKE 'A%n'

-- Conditional: _ operator as a single-character wildcard (Tim, Tom, etc.)
SELECT
    *
FROM
    my_table
WHERE
    last_name LIKE 'T_m'

-- Conditional: exact match from a list
SELECT
    *
FROM
    my_table
WHERE
    last_name IN ('Sally', 'Ang', 'Bart')

-- Conditional: BETWEEN operator (inclusive at both ends)
SELECT
    *
FROM
    my_table
WHERE
    age BETWEEN 30 AND 50

-- Conditional: IS NULL, IS NOT NULL
SELECT
    *
FROM
    my_table
WHERE
    age IS NULL
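
-- ...and the inverse, keeping only rows where age is populated
SELECT
    *
FROM
    my_table
WHERE
    age IS NOT NULL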

-- Order (ascending by default)
SELECT
    *
FROM
    my_table
ORDER BY
    age

-- Order (descending)
SELECT
    *
FROM
    my_table
ORDER BY
    age DESC
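
-- Order by several columns (a sketch: descending age first, ties broken by last_name ascending)
SELECT
    *
FROM
    my_table
ORDER BY
    age DESC,
    last_name ASC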

-- Append (INSERT INTO) - add my_table_temp to the bottom of my_table (assumes columns align)
INSERT INTO my_db.my_table SELECT * FROM my_db.my_table_temp
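
-- If the column order might not align, it is safer to name the columns explicitly
-- (hypothetical columns shown; assumes the engine supports a column list on INSERT,
-- which Hive does but some older Spark SQL versions may not)
INSERT INTO my_db.my_table (id, first_name, age) SELECT id, first_name, age FROM my_db.my_table_temp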

-- Join, INNER
SELECT
    my_table_1.name_column,
    my_table_1.age,
    my_table_2.weight
FROM
    my_table_1
INNER JOIN my_table_2 ON my_table_1.name_column = my_table_2.name_column

-- Join with aliases (a and b)
SELECT
    a.name_column,
    a.age,
    b.weight
FROM
    my_table_1 a
INNER JOIN my_table_2 b ON a.name_column = b.name_column
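
-- Join, LEFT (OUTER): keeps every row of my_table_1 even without a match in
-- my_table_2, filling the unmatched b columns with NULL (sketch, same tables as above)
SELECT
    a.name_column,
    a.age,
    b.weight
FROM
    my_table_1 a
LEFT JOIN my_table_2 b ON a.name_column = b.name_column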

-- Aggregation: GROUP BY with SUM, COUNT, AVG
-- (non-aggregated columns in the SELECT must also appear in the GROUP BY)
SELECT
    a.full_name,
    a.age,
    b.height,
    SUM(b.wage) AS total_wages,
    COUNT(b.wage) AS qty_wages,
    AVG(b.wage) AS avg_wage
FROM
    my_table_1 a
INNER JOIN my_table_2 b ON a.full_name = b.full_name
GROUP BY
    a.full_name,
    a.age,
    b.height
HAVING
    SUM(b.wage) > 10000
ORDER BY
    total_wages DESC

-- Rename a table
ALTER TABLE table_name RENAME TO new_table_name

-- Check partitions (here run from a PySpark session)
spark.sql("SHOW PARTITIONS hawkem_dev.table_1_toy_data")
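
-- The same check in plain SQL (hypothetical table; works only on partitioned tables)
SHOW PARTITIONS my_db.my_partitioned_table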

-- Check tables in a database
SHOW TABLES IN my_database_name

-- Drop a table
DROP TABLE databasename.tablename
DROP TABLE IF EXISTS databasename.tablename

-- Create table as select (CTAS) - the copy can be identical (a backup) or altered
CREATE TABLE databasename.table2 AS SELECT * FROM databasename.table1
CREATE TABLE databasename.table2 STORED AS PARQUET AS SELECT * FROM databasename.table1
CREATE TABLE databasename.table2 AS SELECT * FROM databasename.table1 WHERE surname LIKE 'Smith'
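
-- To copy only a table's schema (no rows), Hive and Spark SQL both support
-- CREATE TABLE ... LIKE (sketch; table2_empty is a hypothetical name)
CREATE TABLE databasename.table2_empty LIKE databasename.table1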

-- Distinct/unique values
SELECT DISTINCT surname FROM database.table
SELECT COUNT(DISTINCT surname) FROM database.table

-- 1) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' tells Hive to use this class to serialize
--    and deserialize the rows to/from the file.
-- 2) input.regex is a property used by this class (RegexSerDe) to deserialize the rows read from the table data.
--    The regex pattern is applied to each row value read from the file to split it up into the columns defined
--    in the metadata for this Hive table.
CREATE TABLE IF NOT EXISTS database.weblogs (
    ip_address STRING,
    client_id INT,
    date_time STRING,
    request STRING,
    status INT,
    bytes INT,
    referer STRING,
    user_agent STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
    -- This regex pattern will parse eight columns. Each pair of parentheses () in the pattern captures a group,
    -- and there are eight such groups in this pattern.
    "input.regex" = "^([\\d.]+) \\S+ (\\d+) \\[(.+?)\\] \"(.+?)\" (\\d{3}) (\\d+) \"(.+?)\"\\se* \"(.+?)\".*$"
)
-- The LOCATION of the data to turn into a table
LOCATION '/spark-perf/ecommerce/weblogs/raw'
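
-- A quick sanity check that the SerDe parsed the raw files as expected
-- (sketch: just peek at a few rows of the table defined above)
SELECT * FROM database.weblogs LIMIT 10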

-- 1) ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe' tells Hive to use this class to serialize
--    and deserialize the rows to/from the file.
-- 2) input.regex is a property used by this class (RegexSerDe) to deserialize the rows read from the table data.
--    The regex pattern is applied to each row value read from the file to split it up into the columns defined
--    in the metadata for this Hive table.
-- 3) output.format.string is a property used by this class (RegexSerDe) to serialize the rows being written out to
--    this table's data. This value is used as a format to generate a row value (from its column values) that is to be
--    written back to the output file for this Hive table.
CREATE TABLE intermediate_access_logs (
    ip STRING,
    date STRING,
    method STRING,
    url STRING,
    http_version STRING,
    code1 STRING,
    code2 STRING,
    dash STRING,
    user_agent STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
    'input.regex' = '([^ ]*) - - \\[([^\\]]*)\\] "([^\ ]*) ([^\ ]*) ([^\ ]*)" (\\d*) (\\d*) "([^"]*)" "([^"]*)"',
    'output.format.string' = '%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s'
)
-- The LOCATION of the data to turn into a table
LOCATION '/spark-perf/ecommerce/weblogs/raw'

-- Rounded frequency table
SELECT
    ROUND(column_name, 0) AS rounded_value,
    COUNT(*) AS frequency
FROM
    db_name.table_name
GROUP BY
    ROUND(column_name, 0)
ORDER BY
    frequency DESC

-- Frequency table / value_counts
SELECT epoch, COUNT(*) AS count
FROM your_table_name
GROUP BY epoch
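
-- Frequency table with a percentage column (a sketch; assumes window-function
-- support, which Hive and Spark SQL both have)
SELECT
    epoch,
    COUNT(*) AS count,
    ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER (), 1) AS pct
FROM
    your_table_name
GROUP BY
    epoch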

-- Inspect a table's schema and metadata (EXTENDED and FORMATTED add table
-- metadata; FORMATTED is the more readable layout)
DESCRIBE my_db.my_table
DESCRIBE EXTENDED my_db.my_table
DESCRIBE FORMATTED my_db.my_table