Skip to content

Instantly share code, notes, and snippets.

View deanjamesss's full-sized avatar

Dean McGrath deanjamesss

View GitHub Profile
@deanjamesss
deanjamesss / select_from_where_01.sql
Last active June 26, 2020 01:26
Simple Select-From-Where
select e.userid as employee_no,
e.dob as date_of_birth
from employee e
where e.first_name = 'Dean'
and e.surname = 'McGrath'
@deanjamesss
deanjamesss / select_from_where_02.sql
Last active June 26, 2020 02:52
Select-From-Where
select p.item_name as product_title,
p.cost as cost_price,
p.units as unit_count,
p.cost * p.units as inventory_cost
from product p
where p.type in ('storage', 'cables')
and p.item_name like '%USB%'
@deanjamesss
deanjamesss / getting_started_test_data.csv
Created September 9, 2020 04:50
Getting Started Test Data
first_name last_name gender birth_date
Tawanna Wunsch F 1973-08-14
Kayla Prohaska F 1977-10-18
Stuart Boyle F 1958-09-28
Benton Ernser M 1992-08-26
Stefani Haley F 1986-08-18
Tiffani Hickle M 1984-12-17
Marget Steuber M 1980-11-16
Luz Sauer M 1985-05-23
Ami Zemlak M 1990-03-16
import pandas as pd
def load_csv_file():
df = pd.read_csv('getting_started_test_data.csv')
print(df.info())
print(df.describe(include='all'))
print(df.head(5))
import numpy as np
import pandas as pd
import pandas_profiling
from faker import Faker
def build_fake_data(size=1):
fake = Faker()
output = [{'name': fake.name(),
'address': fake.address(),
number first_name last_name gender birth_date employment_status annual_salary tax_file_no
483 Lenord Kihn M 1994-07-01 64598
478 Palma Beahan F 1972-05-06 PE 124103
348 Hebert Muller M 1990-09-04 PE 100324
757 Virginia Ullrich M 1991-09-23 PE 144957
937 Roby Hudson M 1997-01-15 137659
264 Jaeda Effertz 1994-03-21 PE 131924
165 Jalissa Bogisich F 1974-01-07 PE 101133
211 Lexie Robel M 1997-09-05 106595
510 Billy Reilly M 1977-04-30 75300
import pandas as pd
if __name__ == '__main__':
# Override default pandas configuration
pd.options.display.width = 0
pd.options.display.max_rows = 10000
pd.options.display.max_info_columns = 10000
# Open example data.
df = pd.read_csv('employee_data.csv')
rows, columns = df.shape
cell_count = rows * columns
number_of_nulls = df.isnull().sum().sum()
percentage_of_missing = (number_of_nulls / cell_count) * 100
print(f'Percentage of missing values: {percentage_of_missing}%')
# Check to see if any rows have less than 2 elements.
under_threshold_removed = df.dropna(axis='index', thresh=2, inplace=False)
under_threshold_rows = df[~df.index.isin(under_threshold_removed.index)]
print(under_threshold_rows)
# Set a default category for missing genders.
df['gender'].cat.add_categories(new_categories=['U'], inplace=True)
df.fillna(value={'gender': 'U'}, inplace=True)
print(df.info())