Skip to content

Instantly share code, notes, and snippets.

@JonathanLoscalzo
Created April 26, 2019 17:16
Show Gist options
  • Select an option

  • Save JonathanLoscalzo/7d2b24e74ad66a4260ebf0a2bb7d459e to your computer and use it in GitHub Desktop.

Select an option

Save JonathanLoscalzo/7d2b24e74ad66a4260ebf0a2bb7d459e to your computer and use it in GitHub Desktop.
selecting, filtering, grouping data with sqlalchemy
# Import create_engine function
from sqlalchemy import create_engine
driver = 'mysql+pymysql://'
cred = 'student:datacamp'
url = '@courses.csrrinzqubik.us-east-1.rds.amazonaws.com:3306/'
port = 'census'
# Create an engine to the census database
engine = create_engine(driver + cred + url + port)
# Print the table names
print(engine.table_names())
#=========================================
# Build query to return state names by population difference from 2008 to 2000: stmt
stmt = select([census.columns.state, (census.columns.pop2008-census.columns.pop2000).label('pop_change')])
# Append group by for the state: stmt
stmt = stmt.group_by(census.columns.state)
# Append order by for pop_change descendingly: stmt
stmt = stmt.order_by(desc('pop_change'))
# Return only 5 results: stmt
stmt = stmt.limit(5)
# Use connection to execute the statement and fetch all results
results = connection.execute(stmt).fetchall()
# Print the state and population change for each record
for result in results:
print('{}:{}'.format(result.state, result.pop_change))
#=======================================
# import case, cast and Float from sqlalchemy
from sqlalchemy import case, cast, Float
# Build an expression to calculate female population in 2000
female_pop2000 = func.sum(
case([
(census.columns.sex == 'F', census.columns.pop2000)
], else_=0))
# Cast an expression to calculate total population in 2000 to Float
total_pop2000 = cast(func.sum(census.columns.pop2000), Float)
# Build a query to calculate the percentage of females in 2000: stmt
stmt = select([female_pop2000 / total_pop2000* 100])
# Execute the query and store the scalar result: percent_female
percent_female = connection.execute(stmt).scalar()
# Print the percentage
print(percent_female)
# Import create_engine function
from sqlalchemy import in_, and_, or_, any_, not_
states = ['New York', 'California', 'Texas']
# Create a query for the census table: stmt
stmt = select([census])
# Append a where clause to match all the states in_ the list states
stmt = stmt.where(in_(states))
# Loop over the ResultProxy and print the state and its population in 2000
for result in connection.execute(stmt):
print(result.state, result.pop2000)
# Build a query for the census table: stmt
stmt = select([census])
# Append a where clause to select only non-male records from California using and_
stmt = stmt.where(
# The state of California with a non-male sex
and_(census.columns.state == 'California',
census.columns.sex != 'M'
)
)
# Loop over the ResultProxy printing the age and sex
for result in connection.execute(stmt):
print(result.age, result.sex)
# Import create_engine function
from sqlalchemy import create_engine
driver = 'postgresql+psycopg2://'
cred = 'student:datacamp'
url = '@postgresql.csrrinzqubik.us-east-1.rds.amazonaws.com'
port = ':5432/census'
# Create an engine to the census database
engine = create_engine(driver + cred + url + port)
# Use the .table_names() method on the engine to print the table names
print(engine.table_names())
# ['census', 'state_fact', 'vrska', 'census1', 'data', 'data1', 'employees3', 'users', 'employees', 'employees_2']
# Create a select query: stmt
stmt = select([census])
# Add a where clause to filter the results to only those for New York
stmt = stmt.where(census.columns.state == 'New York')
# Execute the query to retrieve all the data returned: results
results = connection.execute(stmt).fetchall()
# Loop over the results and print the age, sex, and pop2008
for result in results:
print(result.age, result.sex, result.pop2008)
#<script.py> output:
# 0 M 128088
# 1 M 125649
# 2 M 121615
"""
Correlated Updates
You can also update records with data from a select statement.
This is called a correlated update.
It works by defining a select statement that returns the value you want to update
the record with and assigning that as the value in an update statement.
You'll be using a flat_census in this exercise as the target of your correlated update.
The flat_census table is a summarized copy of your census table
"""
# Build a statement to select name from state_fact: stmt
fips_stmt = select([state_fact.columns.name])
# Append a where clause to Match the fips_state to flat_census fips_code
fips_stmt = fips_stmt.where(
state_fact.columns.fips_state == flat_census.columns.fips_code)
# Build an update statement to set the name to fips_stmt: update_stmt
update_stmt = update(flat_census).values(state_name=fips_stmt)
# Execute update_stmt: results
results = connection.execute(update_stmt)
# Print rowcount
print(results.rowcount)
# Import func
from sqlalchemy import func
# Build a query to count the distinct states values: stmt
stmt = select([func.count(census.columns.state.distinct())])
# Execute the query and store the scalar result: distinct_state_count
distinct_state_count = connection.execute(stmt).scalar()
# Print the distinct_state_count
print(distinct_state_count)
#=============================
# Build a query to select the state and count of ages by state: stmt
stmt = select([census.columns.state, func.count(census.columns.age)])
# Group stmt by state
stmt = stmt.group_by(census.columns.state)
# Execute the statement and store all the records: results
results = connection.execute(stmt).fetchall()
# Print results
print(results)
# Print the keys/column names of the results returned
print(results[0].keys())
#=======================================
# Build an expression to calculate the sum of pop2008 labeled as population
pop2008_sum = func.sum(census.columns.pop2008).label('population')
# Build a query to select the state and sum of pop2008: stmt
stmt = select([census.columns.state, pop2008_sum])
# Group stmt by state
stmt = stmt.group_by(census.columns.state)
# Execute the statement and store all the records: results
results = connection.execute(stmt).fetchall()
# Print results
print(results)
# Print the keys/column names of the results returned
print(results[0].keys())
#[state, population]
# Import Table, Column, String, Integer, Float, Boolean from sqlalchemy
from sqlalchemy import Table, Column, Integer,Float, Boolean, String
# Define a new table with a name, count, amount, and valid column: data
data = Table('data', metadata,
Column('name', String(255)),
Column('count', Integer()),
Column('amount', Float()),
Column('valid', Boolean())
)
# Use the metadata to create the table
metadata.create_all(engine)
# Print table details
print(repr(data))
#================================
# Import Table, Column, String, Integer, Float, Boolean from sqlalchemy
from sqlalchemy import Table, Column, String, Integer, Float, Boolean
# Define a new table with a name, count, amount, and valid column: data
data = Table('data', metadata,
Column('name', String(255), unique=True),
Column('count', Integer(), default=1),
Column('amount', Float()),
Column('valid', Boolean(), default=False)
)
# Use the metadata to create the table
metadata.create_all(engine)
# Print the table details
print(repr(metadata.tables['data']))
# Make an alias of the employees table: managers
managers = employees.alias()
# Build a query to select manager's and their employees names: stmt
stmt = select(
[managers.columns.name.label('manager'),
employees.columns.name.label('employee')]
)
# Match managers id with employees mgr: stmt
stmt = stmt.where(managers.columns.id == employees.columns.mgr)
# Order the statement by the managers name: stmt
stmt = stmt.order_by(managers.columns.name)
# Execute statement: results
results = connection.execute(stmt).fetchall()
# Print records
for record in results:
print(record)
#==========================================
# Build a query to select managers and counts of their employees: stmt
stmt = select([managers.columns.name, func.count(employees.columns.id)])
# Append a where clause that ensures the manager id and employee mgr are equal
stmt = stmt.where(managers.columns.id == employees.columns.mgr)
# Group by Managers Name
stmt = stmt.group_by(managers.columns.name)
# Execute statement: results
results = connection.execute(stmt).fetchall()
# print manager
for record in results:
print(record)
#==========================================
# Create a insert statement for census: stmt
stmt = insert(census)
# Create an empty list and zeroed row count: values_list, total_rowcount
values_list = []
total_rowcount = 0
# Enumerate the rows of csv_reader
for idx, row in enumerate(csv_reader):
#create data and append to values_list
data = {'state': row[0], 'sex': row[1], 'age': row[2], 'pop2000':row[3],
'pop2008': row[4]}
values_list.append(data)
# Check to see if divisible by 51
if idx % 51 == 0:
results = connection.execute(stmt, values_list)
total_rowcount += results.rowcount
values_list = []
# Print total rowcount
print(total_rowcount)
# Import insert and select from sqlalchemy
from sqlalchemy import insert, select
# Build an insert statement to insert a record into the data table: stmt
stmt = insert(data).values(name='Anna', count=1, amount=1000.0, valid=True)
# Execute the statement via the connection: results
results = connection.execute(stmt)
# Print result rowcount
print(results.rowcount)
# Build a select statement to validate the insert
stmt = select([data]).where(data.columns.name == 'Anna')
# Print the result of executing the query.
print(connection.execute(stmt).first())
#======================================
# multiple insert or bulk insert
# Build a list of dictionaries: values_list
values_list = [
{'name': 'Anna', 'count': 1, 'amount': 1000.0, 'valid': True},
{'name': 'Taylor', 'count': 1, 'amount': 750.0, 'valid': False},
]
# Build an insert statement for the data table: stmt
stmt = insert(data)
# Execute stmt with the values_list: results
results = connection.execute(stmt, values_list)
# Print rowcount
print(results.rowcount)
# JOIN: with related tables
# Build a statement to join census and state_fact tables: stmt
stmt = select([census.columns.pop2000, state_fact.columns.abbreviation])
# Execute the statement and get the first result: result
result = connection.execute(stmt).first()
# Loop over the keys in the result object and print the key and value
for key in result.keys():
print(key, getattr(result, key))
#=====================
#build with join and select_from
# Build a statement to select the census and state_fact tables: stmt
stmt = select([census, state_fact])
# Add a select_from clause that wraps a join for the census and state_fact
# tables where the census state column and state_fact name column match
stmt = stmt.select_from(
census.join(state_fact, census.columns.state == state_fact.columns.name))
# Execute the statement and get the first result: result
result = connection.execute(stmt).first()
# Loop over the keys in the result object and print the key and value
for key in result.keys():
print(key, getattr(result, key))
#=====================
# Build a statement to select the state, sum of 2008 population and census
# division name: stmt
stmt = select([
census.columns.state,
func.sum(census.columns.pop2008),
state_fact.columns.census_division_name
])
# Append select_from to join the census and state_fact tables by the census state and state_fact name columns
stmt = stmt.select_from(
census.join(state_fact, census.columns.state == state_fact.columns.name)
)
# Append a group by for the state_fact name column
stmt = stmt.group_by(state_fact.columns.name)
# Execute the statement and get the results: results
results = connection.execute(stmt).fetchall()
# Loop over the the results object and print each record.
for record in results:
print(record)
#=====================================================
# Start a while loop checking for more results
while more_results:
# Fetch the first 50 results from the ResultProxy: partial_results
partial_results = results_proxy.fetchmany(50)
# if empty list, set more_results to False
if partial_results == []:
more_results = False
# Loop over the fetched records and increment the count for the state
for row in partial_results:
if row.state in state_count:
state_count[row.state] += 1
else:
state_count[row.state] = 1
# Close the ResultProxy, and thus the connection
results_proxy.close()
# Print the count by state
print(state_count)In
from sqlalchemy import order_by
# Build a query to select the state column: stmt
stmt = select([census.columns.state])
# Order stmt by the state column
stmt = stmt.order_by(census.columns.state)
# Execute the query and store the results: results
results = connection.execute(stmt).fetchall()
# Print the first 10 results
print(results[:10])
#==================================================
# Import desc
from sqlalchemy import desc
# Build a query to select the state column: stmt
stmt = select([census.columns.state])
# Order stmt by state in descending order: rev_stmt
rev_stmt = stmt.order_by(desc(census.columns.state))
# Execute the query and store the results: rev_results
rev_results = connection.execute(rev_stmt).fetchall()
# Print the first 10 rev_results
print(rev_results[:10])
#===========================================
# Build a query to select state and age: stmt
stmt = select([census.columns.state, census.columns.age])
# Append order by to ascend by state and descend by age
stmt = stmt.order_by(census.columns.state, desc(census.columns.age))
# Execute the statement and store all the records: results
results = connection.execute(stmt).fetchall()
# Print the first 20 results
print(results[:20])
# import pandas
import pandas as pd
# Create a DataFrame from the results: df
df = pd.DataFrame(results)
# Set column names
df.columns = results[0].keys()
# Print the Dataframe
print(df)
# ===================================
# Import pyplot as plt from matplotlib
import matplotlib.pyplot as plt
# Plot the DataFrame
df.plot.bar()
plt.show()
#====================================
# Build a select statement: select_stmt
select_stmt = select([state_fact]).where(state_fact.columns.name == 'New York')
# Print the results of executing the select_stmt
print(connection.execute(select_stmt).fetchall())
# Build a statement to update the fips_state to 36: stmt
stmt = update(state_fact).values(fips_state = 36)
# Append a where clause to limit it to records for New York state
stmt = stmt.where(state_fact.columns.name == 'New York')
# Execute the statement: results
results = connection.execute(stmt)
# Print rowcount
print(results.rowcount)
# Execute the select_stmt again to view the changes
print(connection.execute(select_stmt).fetchall())
#===========================================================
# Build a statement to update the notes to 'The Wild West': stmt
stmt = update(state_fact).values(notes='The Wild West')
# Append a where clause to match the West census region records
stmt = stmt.where(state_fact.columns.census_region_name == 'West')
# Execute the statement: results
results = connection.execute(stmt)
# Print rowcount
print(results.rowcount)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment