Last active
May 15, 2020 23:04
-
-
Save hsleonis/4de5a00e59b663a48a61065611f35de4 to your computer and use it in GitHub Desktop.
Pandas
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Iterating over iterables: for loops, iter()/next(), and range objects.

# Create a list of strings: flash
flash = ['jay garrick', 'barry allen', 'wally west', 'bart allen']

# Print each list item in flash using a for loop
for item in flash:
    print(item)

# Create an iterator for flash: superhero
superhero = iter(flash)

# Print each item from the iterator; four calls consume all four names,
# so a fifth next(superhero) would raise StopIteration.
print(next(superhero))
print(next(superhero))
print(next(superhero))
print(next(superhero))

# Create an iterator for range(3): small_value
small_value = iter(range(3))

# Print the values in small_value
print(next(small_value))
print(next(small_value))
print(next(small_value))

# Loop over range(3) and print the values
for num in range(3):
    print(num)

# Create an iterator for range(10 ** 100): googol
# (range is lazy, so no list of that size is ever built)
googol = iter(range(10 ** 100))

# Print the first 5 values from googol
print(next(googol))
print(next(googol))
print(next(googol))
print(next(googol))
print(next(googol))

# Create a range object: values
values = range(10, 21)

# Print the range object (shows the range itself, not its items)
print(values)

# Create a list of integers: values_list
values_list = list(values)

# Print values_list
print(values_list)

# Get the sum of values: values_sum
values_sum = sum(values)

# Print values_sum
print(values_sum)
# Working with enumerate() and zip().

# Create a list of strings: mutants
mutants = [
    'charles xavier',
    'bobby drake',
    'kurt wagner',
    'max eisenhardt',
    'kitty pryde',
]

# NOTE(review): the original snippet used `aliases` and `powers` without
# defining them anywhere in this file (presumably they were preloaded by the
# exercise environment). The values below are stand-ins so the script runs on
# its own -- confirm against the intended data.
aliases = ['prof x', 'iceman', 'nightcrawler', 'magneto', 'shadowcat']
powers = [
    'telepathy',
    'thermokinesis',
    'teleportation',
    'magnetokinesis',
    'intangibility',
]

# Create a list of tuples: mutant_list
mutant_list = list(enumerate(mutants))

# Print the list of tuples
print(mutant_list)

# Unpack and print the tuple pairs
for index1, value1 in enumerate(mutants):
    print(index1, value1)

# Change the start index
for index2, value2 in enumerate(mutants, start=1):
    print(index2, value2)

# Create a list of tuples: mutant_data
mutant_data = list(zip(mutants, aliases, powers))

# Print the list of tuples
print(mutant_data)

# Create a zip object using the three lists: mutant_zip
mutant_zip = zip(mutants, aliases, powers)

# Print the zip object (shows the object, not its contents)
print(mutant_zip)

# Unpack the zip object and print the tuple values
for value1, value2, value3 in mutant_zip:
    print(value1, value2, value3)

# Create a zip object from mutants and powers: z1
z1 = zip(mutants, powers)

# Print the tuples in z1 by unpacking with * (this exhausts z1)
print(*z1)

# Re-create a zip object from mutants and powers: z1
z1 = zip(mutants, powers)

# 'Unzip' the tuples in z1 by unpacking with * and zip(): result1, result2
result1, result2 = zip(*z1)

# Check if unpacked tuples are equivalent to the original sequences.
# zip() yields tuples while mutants/powers are lists, so convert before
# comparing; a tuple never compares equal to a list.
print(result1 == tuple(mutants))
print(result2 == tuple(powers))
# Count how often each value of the 'lang' column occurs in tweets.csv,
# reading the file in chunks of 10 rows rather than all at once.
import pandas as pd

# Initialize an empty dictionary: counts_dict
counts_dict = {}

# Iterate over the file chunk by chunk
for chunk in pd.read_csv('tweets.csv', chunksize=10):
    # Iterate over the column in DataFrame
    for entry in chunk['lang']:
        # .get() supplies 0 for a value seen for the first time
        counts_dict[entry] = counts_dict.get(entry, 0) + 1

# Print the populated dictionary
print(counts_dict)
# Define count_entries()
import pandas as pd


def count_entries(csv_file, c_size, colname):
    """Return a dictionary with counts of occurrences as value for each key.

    Args:
        csv_file: Path or file-like object handed to ``pd.read_csv``.
        c_size: Number of rows per chunk (passed as ``chunksize``).
        colname: Name of the column whose values are counted.

    Returns:
        dict mapping each value found in ``colname`` to the number of
        times it occurs across all chunks.
    """
    # Initialize an empty dictionary: counts_dict
    counts_dict = {}

    # Iterate over the file chunk by chunk
    for chunk in pd.read_csv(csv_file, chunksize=c_size):
        # Iterate over the column in DataFrame
        for entry in chunk[colname]:
            # .get() supplies 0 for a value seen for the first time
            counts_dict[entry] = counts_dict.get(entry, 0) + 1

    # Return counts_dict
    return counts_dict
# Call count_entries(): tally the 'lang' column of tweets.csv, 10 rows at a time
result_counts = count_entries('tweets.csv', 10, 'lang')

# Print result_counts
print(result_counts)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# List / dict comprehensions and generator expressions.

# output: ['h', 'c', 'c', 't', 'w']
doctor = ['house', 'cuddy', 'chase', 'thirteen', 'wilson']
# Printed explicitly: a bare expression produces no output when run as a script.
print([doc[0] for doc in doctor])

# Create list comprehension: squares
squares = [i ** 2 for i in range(10)]

# Create a 5 x 5 matrix using a list of lists: matrix
# (each row is a fresh list holding the column indices 0..4)
matrix = [[col for col in range(5)] for _ in range(5)]

# Print the matrix
for row in matrix:
    print(row)

# Create a list of strings: fellowship
fellowship = ['frodo', 'samwise', 'merry', 'aragorn', 'legolas', 'boromir', 'gimli']

# Create list comprehension: new_fellowship with member with 7 or more characters
new_fellowship = [member for member in fellowship if len(member) >= 7]

# Print the new list
print(new_fellowship)

# Create list comprehension: new_fellowship with member with 7 or more characters else ''
new_fellowship = [member if len(member) >= 7 else '' for member in fellowship]

# Print the new list
print(new_fellowship)

# Create dict comprehension: new_fellowship (name -> length of name)
new_fellowship = {member: len(member) for member in fellowship}

# Print the new dictionary
print(new_fellowship)

# Create generator object: result
result = (num for num in range(31))

# Print the first 5 values
print(next(result))
print(next(result))
print(next(result))
print(next(result))
print(next(result))

# Print the rest of the values (the loop resumes where next() left off)
for value in result:
    print(value)

# Create a list of strings: lannister
lannister = ['cersei', 'jaime', 'tywin', 'tyrion', 'joffrey']

# Create a generator object: lengths
lengths = (len(person) for person in lannister)

# Iterate over and print the values in lengths
for value in lengths:
    print(value)
# Define generator function get_lengths
def get_lengths(input_list):
    """Generator function that yields the length of the strings in input_list.

    Args:
        input_list: Iterable of items that support ``len()``.

    Yields:
        The length of each item, in iteration order.
    """
    # Yield the length of a string
    for person in input_list:
        yield len(person)
# Print the values generated by get_lengths()
for value in get_lengths(lannister):
    print(value)

# NOTE(review): `df`, `feature_names` and `row_vals` are not defined anywhere
# before this point in the file; presumably they were preloaded by the
# exercise environment -- confirm and define/load them before running.

# Extract the created_at column from df: tweet_time
tweet_time = df['created_at']

# Extract the clock time: tweet_clock_time
# (characters 11..18 of each created_at string)
tweet_clock_time = [entry[11:19] for entry in tweet_time]

# Print the extracted times
print(tweet_clock_time)

# Extract the clock time again, keeping only entries whose characters 17..18
# (the last two characters of the clock-time slice) equal '19'
tweet_clock_time = [
    entry[11:19] for entry in tweet_time if entry[17:19] == '19'
]

# Print the extracted times
print(tweet_clock_time)

# Zip lists: zipped_lists
zipped_lists = zip(feature_names, row_vals)

# Create a dictionary: rs_dict (feature name -> row value)
rs_dict = dict(zipped_lists)

# Print the dictionary
print(rs_dict)
# Define lists2dict()
def lists2dict(list1, list2):
    """Return a dictionary where list1 provides the keys and list2 the values.

    Args:
        list1: Iterable of hashable keys.
        list2: Iterable of values, paired with ``list1`` by position.

    Returns:
        dict built from the positional pairs. Pairing stops at the shorter
        input (``zip`` semantics); a repeated key keeps its last value.
    """
    # Zip the two inputs and build the dictionary from the pairs
    return dict(zip(list1, list2))
# Import the pandas package
import pandas as pd

# NOTE(review): `feature_names`, `row_vals` and `row_lists` are not defined
# anywhere in this file; presumably preloaded by the exercise environment --
# confirm and define/load them before running.

# Call lists2dict: rs_fxn
rs_fxn = lists2dict(feature_names, row_vals)

# Print rs_fxn
print(rs_fxn)

# Print the first two lists in row_lists
print(row_lists[0])
print(row_lists[1])

# Turn list of lists into list of dicts: list_of_dicts
# (computed once; the original rebuilt the identical list a second time)
list_of_dicts = [lists2dict(feature_names, sublist) for sublist in row_lists]

# Print the first two dictionaries in list_of_dicts
print(list_of_dicts[0])
print(list_of_dicts[1])

# Turn list of dicts into a DataFrame: df
df = pd.DataFrame(list_of_dicts)

# Print the head of the DataFrame
print(df.head())
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Count occurrences of the first column's value over (at most) the first
# 1000 data rows of world_dev_ind.csv.

# Open a connection to the file
with open('world_dev_ind.csv') as file:
    # Skip the column names
    file.readline()

    # Initialize an empty dictionary: counts_dict
    counts_dict = {}

    # Process only the first 1000 rows
    for j in range(0, 1000):
        raw_line = file.readline()

        # readline() returns '' at end of file; stop instead of counting
        # the empty string once for every remaining iteration.
        if not raw_line:
            break

        # Split the current line into a list: line
        line = raw_line.split(',')

        # Get the value for the first column: first_col
        first_col = line[0]

        # Increment the count, starting from 0 for a value not seen before
        counts_dict[first_col] = counts_dict.get(first_col, 0) + 1

# Print the resulting dictionary
print(counts_dict)
# Define read_large_file()
def read_large_file(file_object):
    """A generator function to read a large file lazily.

    Args:
        file_object: Object exposing ``readline()`` (e.g. an open file).

    Yields:
        Each line as returned by ``readline()``, until it returns an empty
        (falsy) value, which marks the end of the file.
    """
    # Loop indefinitely until the end of the file
    while True:
        # Read a line from the file: data
        data = file_object.readline()

        # Break if this is the end of the file; a blank line inside the
        # file is '\n' (truthy), so it does not stop the loop.
        if not data:
            break

        # Yield the line of data
        yield data
# Open a connection to the file
with open('world_dev_ind.csv') as file:
    # Create a generator object for the file: gen_file
    gen_file = read_large_file(file)

    # Print the first three lines of the file
    print(next(gen_file))
    print(next(gen_file))
    print(next(gen_file))

# Initialize an empty dictionary: counts_dict
counts_dict = {}

# Open a connection to the file
with open('world_dev_ind.csv') as file:
    # Iterate over the generator from read_large_file().
    # The header line is not skipped here, so its first field is counted too.
    for line in read_large_file(file):
        row = line.split(',')
        first_col = row[0]

        # Increment the count, starting from 0 for a value not seen before
        counts_dict[first_col] = counts_dict.get(first_col, 0) + 1

# Print the resulting dictionary
print(counts_dict)
# Import the pandas package
import pandas as pd

# Initialize reader object: df_reader (yields DataFrames of 10 rows each)
df_reader = pd.read_csv('ind_pop.csv', chunksize=10)

# Print two chunks
print(next(df_reader))
print(next(df_reader))

# Initialize reader object: urb_pop_reader
urb_pop_reader = pd.read_csv('ind_pop_data.csv', chunksize=1000)

# Get the first DataFrame chunk: df_urb_pop
df_urb_pop = next(urb_pop_reader)

# Check out the head of the DataFrame
print(df_urb_pop.head())

# Check out specific country: df_pop_ceb
# .copy() makes this an independent frame so that adding a column below
# does not operate on a slice of df_urb_pop (SettingWithCopyWarning).
df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == 'CEB'].copy()

# Zip DataFrame columns of interest: pops
pops = zip(
    df_pop_ceb['Total Population'],
    df_pop_ceb['Urban population (% of total)'],
)

# Turn zip object into list: pops_list
pops_list = list(pops)

# Print pops_list
print(pops_list)

# Use list comprehension to create new DataFrame column 'Total Urban Population'
# (total * percentage * 0.01, truncated to an integer)
df_pop_ceb['Total Urban Population'] = [
    int(total * pct * 0.01) for total, pct in pops_list
]

# Plot urban population data
# NOTE(review): `plt` is not imported anywhere in this file; presumably
# matplotlib.pyplot -- confirm and import it before running.
df_pop_ceb.plot(kind='scatter', x='Year', y='Total Urban Population')
plt.show()
# Initialize reader object: urb_pop_reader
urb_pop_reader = pd.read_csv('ind_pop_data.csv', chunksize=1000)

# Collect the per-chunk results and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0; pd.concat replaces it and
# avoids re-copying the accumulated frame on every iteration.
ceb_chunks = []

# Iterate over each DataFrame chunk
for df_urb_pop in urb_pop_reader:
    # Check out specific country: df_pop_ceb
    # .copy() so the new column is added to an independent frame
    df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == 'CEB'].copy()

    # Chunks without any matching row contribute nothing
    if df_pop_ceb.empty:
        continue

    # Zip DataFrame columns of interest: pops
    pops = zip(
        df_pop_ceb['Total Population'],
        df_pop_ceb['Urban population (% of total)'],
    )

    # Turn zip object into list: pops_list
    pops_list = list(pops)

    # Use list comprehension to create new DataFrame column 'Total Urban Population'
    df_pop_ceb['Total Urban Population'] = [
        int(tup[0] * tup[1] * 0.01) for tup in pops_list
    ]

    # Keep this chunk's rows for the final concatenation
    ceb_chunks.append(df_pop_ceb)

# Combine the chunks: data (an empty DataFrame if nothing matched)
data = pd.concat(ceb_chunks) if ceb_chunks else pd.DataFrame()

# Plot urban population data
# NOTE(review): `plt` is not imported anywhere in this file; presumably
# matplotlib.pyplot -- confirm and import it before running.
data.plot(kind='scatter', x='Year', y='Total Urban Population')
plt.show()
# Define plot_pop()
def plot_pop(filename, country_code):
    """Plot total urban population per year for one country code.

    Reads ``filename`` in chunks of 1000 rows, keeps the rows whose
    'CountryCode' equals ``country_code``, derives a
    'Total Urban Population' column from 'Total Population' and
    'Urban population (% of total)', and draws a scatter plot of that
    column against 'Year'.

    Args:
        filename: CSV path passed to ``pd.read_csv``.
        country_code: Value to match in the 'CountryCode' column.

    Returns:
        None. The plot is shown via ``plt.show()``.
    """
    # Initialize reader object: urb_pop_reader
    urb_pop_reader = pd.read_csv(filename, chunksize=1000)

    # Collect the per-chunk results and concatenate once at the end.
    # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
    matching_chunks = []

    # Iterate over each DataFrame chunk
    for df_urb_pop in urb_pop_reader:
        # Rows for the requested country; .copy() so the new column is
        # added to an independent frame rather than a slice of the chunk
        df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == country_code].copy()

        # Chunks without any matching row contribute nothing
        if df_pop_ceb.empty:
            continue

        # Zip DataFrame columns of interest: pops
        pops = zip(
            df_pop_ceb['Total Population'],
            df_pop_ceb['Urban population (% of total)'],
        )

        # Turn zip object into list: pops_list
        pops_list = list(pops)

        # total * percentage * 0.01, truncated to an integer
        df_pop_ceb['Total Urban Population'] = [
            int(tup[0] * tup[1] * 0.01) for tup in pops_list
        ]

        # Keep this chunk's rows for the final concatenation
        matching_chunks.append(df_pop_ceb)

    # Combine the chunks (an empty DataFrame if nothing matched)
    data = pd.concat(matching_chunks) if matching_chunks else pd.DataFrame()

    # Plot urban population data
    # NOTE(review): `plt` is not imported anywhere in this file; presumably
    # matplotlib.pyplot -- confirm and import it at module level.
    data.plot(kind='scatter', x='Year', y='Total Urban Population')
    plt.show()
# Set the filename: fn
fn = 'ind_pop_data.csv'

# Call plot_pop for country code 'CEB'
plot_pop(fn, 'CEB')

# Call plot_pop for country code 'ARB'
plot_pop(fn, 'ARB')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment