# Reading a text file
# Steps:
filename = 'huck_finn.txt'
file = open(filename, mode='r')
text = file.read()
print(text)
print(file.closed)  # returns a boolean indicating whether the file is closed
file.close()  # closes the file
# It is good practice to use a context manager, since it closes the file
# automatically:
with open('huck_finn.txt', 'r') as file:
    print(file.read())

# Read & print the first 3 lines
with open('moby_dick.txt') as file:
    print(file.readline())
    print(file.readline())
    print(file.readline())
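# A common alternative (not part of the original exercise): iterating over the
# file object itself reads lazily, one line at a time, which is useful for
# files too large to fit in memory. Assumes 'moby_dick.txt' is in the working
# directory, as above.
with open('moby_dick.txt') as file:
    for line in file:
        print(line, end='')  # each line keeps its trailing newline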
# Flat files are text files containing records, i.e. row-column data, such as .txt and .csv files.
# Relational databases are not flat files.
######################################### Using numpy to import flat files ############################################
# For an all-numeric dataset, we can use numpy's "loadtxt" function.
# The first argument is the filename.
# The second is the delimiter, which, in this case, is a comma.

# Import packages
import numpy as np
import matplotlib.pyplot as plt

# Assign filename to variable: file
file = 'digits.csv'

# Load file as array: digits
digits = np.loadtxt(file, delimiter=',')

# Print datatype of digits
print(type(digits))

# Select and reshape a row (column 0 holds the digit label; the rest are the
# 784 pixel values of a 28x28 image)
im = digits[21, 1:]
im_sq = np.reshape(im, (28, 28))

# Plot reshaped data
plt.imshow(im_sq, cmap='Greys', interpolation='nearest')
plt.show()
################################################ Customizing your NumPy import ##########################################
# What if there are rows, such as a header, that you don't want to import? What if your file has a
# delimiter other than a comma? What if you only wish to import particular columns?
# np.loadtxt() takes a number of arguments that you'll find useful:
#   - delimiter: changes the delimiter loadtxt() is expecting; for example, ',' for
#     comma-delimited and '\t' for tab-delimited files;
#   - skiprows: how many rows (not indices) you wish to skip;
#   - usecols: a list of the indices of the columns you wish to keep.
# Import numpy
import numpy as np

# Assign the filename: file
file = 'digits_header.txt'

# Load the data: data
data = np.loadtxt(file, delimiter='\t', skiprows=1, usecols=[0, 2])

# Print data
print(data)
########################################## Importing different datatypes ###############################################
# The file seaslug.txt has a text header consisting of strings, and is tab-delimited.
# The data consist of the percentage of sea slug larvae that had metamorphosed in a given time period. Read more here:
# http://www.stat.ucla.edu/projects/datasets/seaslug-explanation.html
# Due to the header, if you tried to import it as-is using np.loadtxt(), Python would throw a ValueError telling you
# that it could not convert string to float. There are two ways to deal with this: first, you can set the data type
# argument dtype equal to str (for string).
# Alternatively, you can skip the first row, as we have seen before, using the skiprows argument.
# Assign filename: file
file = 'seaslug.txt'

# Import file: data
data = np.loadtxt(file, delimiter='\t', dtype=str)

# Print the first element of data
print(data[0])

# Import data as floats and skip the first row: data_float
data_float = np.loadtxt(file, delimiter='\t', dtype='float', skiprows=1)

# Print the 10th element of data_float
print(data_float[9])

# Plot a scatterplot of the data
plt.scatter(data_float[:, 0], data_float[:, 1])
plt.xlabel('time (min.)')
plt.ylabel('percentage of larvae')
plt.show()
##################################### Working with mixed datatypes (1) #######################################
# Much of the time you will need to import datasets which have different datatypes in different columns; one column
# may contain strings and another floats, for example. The function np.loadtxt() will freak at this. There is another
# function, np.genfromtxt(), which can handle such structures. If we pass dtype=None to it, it will figure out what
# types each column should be.
# Import 'titanic.csv' using the function np.genfromtxt() as follows:
data = np.genfromtxt('titanic.csv', delimiter=',', names=True, dtype=None)
# Here, the first argument is the filename, the second specifies the delimiter ',' and the third argument names tells
# us there is a header. Because the data are of different types, data is an object called a structured array. Because
# numpy arrays have to contain elements that are all the same type, the structured array solves this by being a 1D
# array, where each element of the array is a row of the flat file imported. You can test this by checking out the
# array's shape in the shell by executing np.shape(data), as in the sketch below.
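# A quick check (a sketch, assuming titanic.csv was loaded as above): the shape
# of a structured array is one-dimensional, (n_rows,), even though the file has
# many columns.
print(np.shape(data))    # e.g. (891,) for the standard Titanic training set
print(data.dtype.names)  # the column names parsed from the header row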
# Sample shell interactions:
# >>> data['Fare'][5]
# 8.4582999999999995
# >>> data['Fare'][0:5]
# array([  7.25  ,  71.2833,   7.925 ,  53.1   ,   8.05  ])
# Accessing rows and columns of structured arrays is super-intuitive: to get the ith row, merely execute data[i], and
# to get the column with name 'Fare', execute data['Fare'].
# Print the entire column with name Survived to the shell. What are the last 4 values of this column?
print(data['Survived'][-4:])
############################### Working with mixed datatypes (2) ###############################
# You have just used np.genfromtxt() to import data containing mixed datatypes. There is also another function,
# np.recfromcsv(), that behaves similarly to np.genfromtxt(), except that its default dtype is None (its default
# delimiter is ',' and names defaults to True), so those arguments need not be passed explicitly.

# Assign the filename: file
file = 'titanic.csv'

# Import file using np.recfromcsv: d
d = np.recfromcsv(file)

# Print out first three entries of d
print(d[:3])
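# Note (outside the original exercise): np.recfromcsv() has been deprecated in
# recent NumPy releases. A roughly equivalent sketch using np.genfromtxt() on
# the same file (unlike recfromcsv, genfromtxt keeps the header's original
# capitalization in the field names):
d2 = np.genfromtxt(file, delimiter=',', names=True, dtype=None, encoding='utf-8')
print(d2[:3])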
#################################### Using pandas to import flat files as DataFrames #######################################
# In the last exercise, you were able to import flat files containing columns with different datatypes as numpy
# arrays. However, the DataFrame object in pandas is a more appropriate structure in which to store such data and,
# thankfully, we can easily import files of mixed datatypes as DataFrames using the pandas functions read_csv() and
# read_table().

# Import pandas as pd
import pandas as pd

# Assign the filename: file
file = 'titanic.csv'

# Read the file into a DataFrame: df
df = pd.read_csv(file)

# View the head of the DataFrame
print(df.head())
# In the last exercise, you were able to import flat files into a pandas DataFrame. As a bonus, it is then
# straightforward to retrieve the corresponding numpy array using the attribute values.
# You'll now have a chance to do this using the MNIST dataset, which is available as digits.csv.

# Assign the filename: file
file = 'digits.csv'

# Read the first 5 rows of the file into a DataFrame: data
data = pd.read_csv(file, nrows=5, header=None)

# Build a numpy array from the DataFrame: data_array
data_array = np.array(data.values)

# Print the datatype of data_array to the shell
print(type(data_array))
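# Note (not part of the original exercise): in modern pandas, the recommended
# way to get the underlying numpy array is DataFrame.to_numpy() rather than
# the .values attribute:
data_array = data.to_numpy()
print(type(data_array))  # <class 'numpy.ndarray'>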
################################################# Customizing your pandas import ##########################################
# The pandas package is also great at dealing with many of the issues you will encounter when importing data as a
# data scientist, such as comments occurring in flat files, empty lines and missing values. Note that missing values
# are also commonly referred to as NA or NaN. To wrap up this chapter, you're now going to import a slightly
# corrupted copy of the Titanic dataset, titanic_corrupt.txt, which:
#   - contains comments after the character '#';
#   - is tab-delimited;
#   - uses the string 'Nothing' for missing values.

# Import matplotlib.pyplot as plt
import matplotlib.pyplot as plt

# Assign filename: file
file = 'titanic_corrupt.txt'

# Import file: data
data = pd.read_csv(file, sep='\t', comment='#', na_values='Nothing')

# Print the head of the DataFrame
print(data.head())

# Plot 'Age' variable in a histogram
data[['Age']].hist()
plt.xlabel('Age (years)')
plt.ylabel('count')
plt.show()
######################################### Working with relational databases in Python ######################################
######## What are the tables in the database? #######
# In this exercise, you'll once again create an engine to connect to 'Chinook.sqlite'. Before you can get any data
# out of the database, however, you'll need to know what tables it contains!
# To this end, you'll save the table names to a list using the method table_names() on the engine, and then you will
# print the list.

# Import necessary module
from sqlalchemy import create_engine

# Create engine: engine
engine = create_engine('sqlite:///Chinook.sqlite')

# Save the table names to a list: table_names
table_names = engine.table_names()

# Print the table names to the shell
print(table_names)
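# Note (outside the original exercise): Engine.table_names() was deprecated in
# SQLAlchemy 1.4 and removed in 2.0. The current way to list tables is via the
# inspection interface; a minimal sketch, assuming the same engine:
from sqlalchemy import inspect
print(inspect(engine).get_table_names())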
############################## The Hello World of SQL Queries! ##############################
# Now, it's time for liftoff! In this exercise, you'll perform the Hello World of SQL queries, SELECT, in order to
# retrieve all columns of the table Album in the Chinook database. Recall that the query SELECT * selects all columns.

# Import packages
from sqlalchemy import create_engine
import pandas as pd

# Create engine: engine
engine = create_engine('sqlite:///Chinook.sqlite')

# Open engine connection: con
con = engine.connect()

# Perform query: rs
rs = con.execute('SELECT * FROM Album')

# Save results of the query to DataFrame: df
df = pd.DataFrame(rs.fetchall())

# Close connection
con.close()

# Print head of DataFrame df
print(df.head())
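# Note (outside the original exercise): SQLAlchemy 2.0 no longer accepts raw
# query strings in Connection.execute(); they must be wrapped in text(). A
# minimal sketch of the same query under 2.0-style usage:
from sqlalchemy import text
with engine.connect() as con:
    rs = con.execute(text('SELECT * FROM Album'))
    df = pd.DataFrame(rs.fetchall(), columns=rs.keys())
print(df.head())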
############################ Customizing the Hello World of SQL Queries ############################################
# Congratulations on executing your first SQL query! Now you're going to figure out how to customize your query in
# order to:
#   - select specified columns from a table;
#   - select a specified number of rows;
#   - import column names from the database table.
# Recall that Hugo performed a very similar query customization in the video.

# Open engine in context manager
# Perform query and save results to DataFrame: df
with engine.connect() as con:
    rs = con.execute('SELECT LastName, Title FROM Employee')
    df = pd.DataFrame(rs.fetchmany(size=3))
    df.columns = rs.keys()

# Print the length of the DataFrame df
print(len(df))

# Print the head of the DataFrame df
print(df.head())
############################### Filtering your database records using SQL's WHERE #############################
# You can now execute a basic SQL query to select records from any table in your database, and you can also perform
# simple query customizations to select particular columns and numbers of rows.
# There are a couple more standard SQL query chops that will aid you in your journey to becoming an SQL ninja.
# Let's say, for example, that you wanted to get all records from the Customer table of the Chinook database for
# which the Country is 'Canada'. You can do this very easily in SQL using a SELECT statement followed by a WHERE
# clause as follows:
#   SELECT * FROM Customer WHERE Country = 'Canada'
# In fact, you can filter any SELECT statement by any condition.

# Create engine: engine
engine = create_engine('sqlite:///Chinook.sqlite')

# Open engine in context manager
# Perform query and save results to DataFrame: df
with engine.connect() as con:
    rs = con.execute('SELECT * FROM Employee WHERE EmployeeId >= 6')
    df = pd.DataFrame(rs.fetchall())
    df.columns = rs.keys()

# Print the head of the DataFrame df
print(df.head())
######################################## Ordering your SQL records with ORDER BY ################################
# You can also order your SQL query results. For example, if you wanted to get all records from the Customer table
# of the Chinook database and order them in increasing order by the column SupportRepId, you could do so with the
# following query:
#   SELECT * FROM Customer ORDER BY SupportRepId
# In fact, you can order any SELECT statement by any column.
# In this interactive exercise, you'll select all records of the Employee table and order them in increasing order
# by the column BirthDate.
# Packages are already imported as follows:
#   import pandas as pd
#   from sqlalchemy import create_engine
# Get querying!

# Create engine: engine
engine = create_engine('sqlite:///Chinook.sqlite')

# Open engine in context manager
with engine.connect() as con:
    rs = con.execute('SELECT * FROM Employee ORDER BY BirthDate')
    df = pd.DataFrame(rs.fetchall())
    # Set the DataFrame's column names
    df.columns = rs.keys()

# Print head of DataFrame
print(df.head())
########################## Pandas and The Hello World of SQL Queries! #########################################
# Here, you'll take advantage of the power of pandas to write the results of your SQL query to a DataFrame in one
# swift line of Python code!
# You'll first import pandas and create the SQLite 'Chinook.sqlite' engine. Then you'll query the database to select
# all records from the Album table.
# Recall that to select all records from the Orders table in the Northwind database, Hugo executed the following
# command:
#   df = pd.read_sql_query("SELECT * FROM Orders", engine)

# Import packages
from sqlalchemy import create_engine
import pandas as pd

# Create engine: engine
engine = create_engine('sqlite:///Chinook.sqlite')

# Execute query and store records in DataFrame: df
df = pd.read_sql_query('SELECT * FROM Album', engine)

# Print head of DataFrame
print(df.head())

# Open engine in context manager
# Perform query and save results to DataFrame: df1
with engine.connect() as con:
    rs = con.execute('SELECT * FROM Album')
    df1 = pd.DataFrame(rs.fetchall())
    df1.columns = rs.keys()

# Confirm that both methods yield the same result: does df equal df1?
print(df.equals(df1))
########################################### Pandas for more complex querying ###############################################
# Here, you'll become more familiar with the pandas function read_sql_query() by using it to execute a more complex
# query: a SELECT statement followed by both a WHERE clause AND an ORDER BY clause.
# You'll build a DataFrame that contains the rows of the Employee table for which the EmployeeId is greater than or
# equal to 6, and you'll order these entries by BirthDate.

# Import packages
from sqlalchemy import create_engine
import pandas as pd

# Create engine: engine
engine = create_engine('sqlite:///Chinook.sqlite')

# Execute query and store records in DataFrame: df
df = pd.read_sql_query(
    'SELECT * FROM Employee WHERE EmployeeId >= 6 ORDER BY BirthDate', engine)

# Print head of DataFrame
print(df.head())
##################################### The power of SQL lies in relationships between tables: INNER JOIN ############################
# Here, you'll perform your first INNER JOIN! You'll be working with your favourite SQLite database, Chinook.sqlite.
# For each record in the Album table, you'll extract the Title along with the Name of the Artist. The latter will
# come from the Artist table, and so you will need to INNER JOIN these two tables on the ArtistID column of both.
# Recall that to INNER JOIN the Orders and Customers tables from the Northwind database, Hugo executed the following
# SQL query:
#   SELECT OrderID, CompanyName FROM Orders INNER JOIN Customers ON Orders.CustomerID = Customers.CustomerID
# The following code has already been executed to import the necessary packages and to create the engine:
#   import pandas as pd
#   from sqlalchemy import create_engine
#   engine = create_engine('sqlite:///Chinook.sqlite')

# Open engine in context manager
# Perform query and save results to DataFrame: df
with engine.connect() as con:
    rs = con.execute('SELECT Title, Name FROM Album INNER JOIN Artist ON Album.ArtistID = Artist.ArtistID')
    df = pd.DataFrame(rs.fetchall())
    df.columns = rs.keys()

# Print head of DataFrame df
print(df.head())
################################################### Filtering your INNER JOIN ###########################################
# Congrats on performing your first INNER JOIN! You're now going to finish this chapter with one final exercise in
# which you perform an INNER JOIN and filter the result using a WHERE clause.
# Recall that to INNER JOIN the Orders and Customers tables from the Northwind database, Hugo executed the following
# SQL query:
#   SELECT OrderID, CompanyName FROM Orders INNER JOIN Customers ON Orders.CustomerID = Customers.CustomerID
# The following code has already been executed to import the necessary packages and to create the engine:
#   import pandas as pd
#   from sqlalchemy import create_engine
#   engine = create_engine('sqlite:///Chinook.sqlite')

# Execute query and store records in DataFrame: df
df = pd.read_sql_query(
    'SELECT * FROM PlaylistTrack INNER JOIN Track ON PlaylistTrack.TrackId = Track.TrackId WHERE Milliseconds < 250000',
    engine)

# Print head of DataFrame
print(df.head())