# ================== Loading & Exploring JSON ================== #
# Import package
import json

# Load JSON: json_data
with open("a_movie.json") as json_file:
    json_data = json.load(json_file)

# Print each key-value pair in json_data
for k in json_data.keys():
    print(k + ': ', json_data[k])
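
# For reference, the inverse operation: json.dump serializes a dict back to
# disk. A minimal sketch; the output filename 'a_movie_copy.json' is made up.
with open("a_movie_copy.json", "w") as out_file:
    json.dump(json_data, out_file, indent=2)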
# ================== Query API & decode JSON data ================== #
# Import package
import requests

# Assign URL to variable: url
'''
API query string should have two arguments: apikey=ff21610b and t=social+network
'''
url = 'http://www.omdbapi.com/?apikey=ff21610b&t=social+network'

# Package the request, send the request and catch the response: r
r = requests.get(url)

# Decode the JSON data into a dictionary: json_data
json_data = r.json()

# Print each key-value pair in json_data
for k in json_data.keys():
    print(k + ': ', json_data[k])
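
# A sketch of the same request built more idiomatically: requests can encode
# the query string itself from a params dict (same apikey and title as above).
payload = {'apikey': 'ff21610b', 't': 'social network'}
r = requests.get('http://www.omdbapi.com/', params=payload)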
# ----------- A) Wikipedia API ------- #
# Assign URL to variable: url (CHECK URL FOR WIKI API)
url = 'https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&exintro=&titles=pizza'

# Package the request, send the request and catch the response: r
r = requests.get(url)

# Decode the JSON data into a dictionary: json_data
json_data = r.json()

# Print the Wikipedia page extract
pizza_extract = json_data['query']['pages']['24768']['extract']
print(pizza_extract)
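
# The page ID '24768' is specific to the pizza article. A more robust sketch
# iterates over whatever page IDs the API happens to return:
for page_id, page in json_data['query']['pages'].items():
    print(page['extract'])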
# ----------- B) Twitter API ------- #
# Import package
import tweepy

# -_-_- i) Handle Twitter API OAuth Authentication
# Store OAuth authentication credentials in relevant variables
access_token = "1092294848-aHN7DcRP9B4VMTQIhwqOYiB14YkW92fFO8k8EPy"
access_token_secret = "X4dHmhPfaksHcQ7SCbmZa2oYBBVSD2g8uIHXsp5CTaksx"
consumer_key = "nZ6EA0FxZ293SxGNg8g8aP0HM"
consumer_secret = "fJGEodwe3KiKUnsYJC3VRndj7jevVvXbK2D5EiJ2nehafRgA6i"

# Pass OAuth details to tweepy's OAuth handler
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# -_-_- ii) Stream Tweets
# Initialize Stream listener (CHECK Tweet_listener.py for the class definition)
l = MyStreamListener()

# Create your Stream object with authentication
stream = tweepy.Stream(auth, l)

# Filter Twitter Streams to capture data by the keywords:
stream.filter(track=['clinton', 'trump', 'sanders', 'cruz'])
# -_-_- iii) Load & Explore Twitter Data
# Twitter data saved to text file from previous step
# Import package
import json

# String of path to file: tweets_data_path
tweets_data_path = 'tweets.txt'

# Initialize empty list to store tweets: tweets_data
tweets_data = []

# Open connection to file
tweets_file = open(tweets_data_path, "r")

# Read in tweets and store in list: tweets_data
for line in tweets_file:
    tweet = json.loads(line)
    tweets_data.append(tweet)

# Close connection to file
tweets_file.close()

# Print the keys of the first tweet dict
print(tweets_data[0].keys())
# -_-_- iv) Twitter data to DataFrame
# Import package
import pandas as pd

# Build DataFrame of tweet texts and languages
df = pd.DataFrame(tweets_data, columns=['text', 'lang'])

# Print head of DataFrame
print(df.head())
# -_-_- v) Twitter Analysis
# Initialize counters for the number of tweets mentioning each candidate
[clinton, trump, sanders, cruz] = [0, 0, 0, 0]

import re

def word_in_text(word, tweet):
    """Return True if word occurs in tweet (case-insensitive)."""
    word = word.lower()
    text = tweet.lower()
    match = re.search(word, text)
    if match:
        return True
    return False

# Iterate through df, counting the number of tweets in which
# each candidate is mentioned
for index, row in df.iterrows():
    clinton += word_in_text('clinton', row['text'])
    trump += word_in_text('trump', row['text'])
    sanders += word_in_text('sanders', row['text'])
    cruz += word_in_text('cruz', row['text'])
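
# The same counts can be had without an explicit loop; a sketch using pandas'
# vectorized string matching (equivalent here because the keywords are plain words):
counts = {name: df['text'].str.lower().str.contains(name).sum()
          for name in ['clinton', 'trump', 'sanders', 'cruz']}
print(counts)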
# -_-_- vi) Plot Twitter Data
# Import packages
import seaborn as sns
import matplotlib.pyplot as plt

# Set seaborn style
sns.set(color_codes=True)

# Create a list of labels: cd
cd = ['clinton', 'trump', 'sanders', 'cruz']

# Plot bar chart of mention counts
ax = sns.barplot(x=cd, y=[clinton, trump, sanders, cruz])
ax.set(ylabel="count")
plt.show()
# ================== Importing Flat Files with NumPy ================== #
# Import packages
import numpy as np
import matplotlib.pyplot as plt

# Assign the filename: file
file = 'digits_header.txt'

# Load the data: data
'''
delimiter can be, for example, ',' or '\t' for comma-delimited and tab-delimited files respectively;
skiprows allows you to specify how many rows (not indices) you wish to skip;
usecols takes a list of the indices of the columns you wish to keep.
'''
data = np.loadtxt(file, delimiter='\t', skiprows=1, usecols=[0, 2])

# Print data
print(data)

# Import data as floats and skip the first row: data_float
data_float = np.loadtxt(file, delimiter='\t', dtype=float, skiprows=1)

# Print the 10th element of data_float
print(data_float[9])

# Plot a scatterplot of the data
plt.scatter(data_float[:, 0], data_float[:, 1])
plt.xlabel('time (min.)')
plt.ylabel('percentage of larvae')
plt.show()

# Import data containing mixed datatypes (i.e. one column of strings, another of floats)
data = np.genfromtxt('titanic.csv', delimiter=',', names=True, dtype=None)
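
# Related convenience function, for reference: np.recfromcsv behaves like the
# genfromtxt call above, with delimiter=',', names=True, dtype=None as defaults.
d = np.recfromcsv('titanic.csv')
print(d[:3])  # first three records of the resulting structured array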
# ================== Loading a pickled file ================== #
# Import pickle package
import pickle

# Open pickle file and load data: d
with open('data.pkl', 'rb') as file:
    d = pickle.load(file)

# Print d
print(d)

# Print datatype of d
print(type(d))
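
# The reverse, for reference: pickle.dump writes any picklable object back to
# a binary file. A minimal sketch; dict contents and filename are illustrative.
with open('data_copy.pkl', 'wb') as out_file:
    pickle.dump({'June': '69.4', 'Aug': '85'}, out_file)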
# ================== Loading sheets in Excel Files ================== #
# Import pandas to import Excel spreadsheets
import pandas as pd

# Assign spreadsheet filename: file
file = 'battledeath.xlsx'

# Load spreadsheet: xl
xl = pd.ExcelFile(file)

# Print sheet names
print(xl.sheet_names)

# ----------- A) Parse sheets into DataFrames ------- #
# Parse the first sheet and rename the columns: df1
df1 = xl.parse(0, skiprows=[0], names=['Country', 'AAM due to War (2002)'])

# Print the head of the DataFrame df1
print(df1.head())

# Parse the first column of the second sheet and rename the column: df2
# (note: newer pandas spells the parse_cols argument usecols)
df2 = xl.parse(1, parse_cols=[0], skiprows=[0], names=['Country'])

# Print the head of the DataFrame df2
print(df2.head())
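
# One-step alternative sketch: pd.read_excel parses a sheet directly, without
# the intermediate ExcelFile object (sheet_name is the modern spelling).
df1_direct = pd.read_excel(file, sheet_name=0, skiprows=[0],
                           names=['Country', 'AAM due to War (2002)'])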
# ================== Importing SAS/STATA Files ================== #
# Import sas7bdat package
from sas7bdat import SAS7BDAT

# Save file to a DataFrame: df_sas
with SAS7BDAT('sales.sas7bdat') as file:
    df_sas = file.to_data_frame()

# Print head of DataFrame
print(df_sas.head())

# Plot histogram of DataFrame features (pandas and pyplot already imported)
pd.DataFrame.hist(df_sas[['P']])
plt.ylabel('count')
plt.show()

# Load Stata file into a pandas DataFrame: df
df = pd.read_stata('disarea.dta')

# Print the head of the DataFrame df
print(df.head())

# Plot histogram of one column of the DataFrame
pd.DataFrame.hist(df[['disa10']])
plt.xlabel('Extent of disease')
plt.ylabel('Number of countries')
plt.show()
# ================== Importing HDF5 Files ================== #
# Import packages
import numpy as np
import h5py

# Assign filename: file
file = 'LIGO_data.hdf5'

# Load file: data
data = h5py.File(file, 'r')

# Print the datatype of the loaded file
print(type(data))

# Print the keys of the file
for key in data.keys():
    print(key)

# ----------- A) Extracting data from HDF5 File ------- #
# Get the HDF5 group: group
group = data['strain']

# Set variable equal to time series data: strain
# (.value was removed in h5py 3.x; data['strain']['Strain'][()] is the modern spelling)
strain = data['strain']['Strain'].value

# Set number of time points to sample: num_samples
num_samples = 10000

# Set time vector
time = np.arange(0, 1, 1/num_samples)

# Plot data
plt.plot(time, strain[:num_samples])
plt.xlabel('GPS Time (s)')
plt.ylabel('strain')
plt.show()
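
# HDF5 files are hierarchical. A sketch that walks the entire tree and prints
# every group/dataset path, using h5py's visititems callback:
def print_node(name, node):
    print(name, '->', type(node).__name__)

data.visititems(print_node)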
# ================== Importing MATLAB Files ================== #
# Import package
import scipy.io

# Load MATLAB file: mat
mat = scipy.io.loadmat('albeck_gene_expression.mat')

# Print the datatype of mat
print(type(mat))
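
# loadmat returns a dict mapping MATLAB variable names to numpy arrays. A
# sketch for a first look inside (the variable names depend on the .mat file):
for name, value in mat.items():
    if not name.startswith('__'):  # skip loadmat's '__header__'-style metadata
        print(name, getattr(value, 'shape', type(value)))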
# ================== Importing Flat Files with Pandas ================== #
# Import packages
import pandas as pd
import matplotlib.pyplot as plt

# Assign the filename: file
file = 'digits.csv'

# Read the first 5 rows of the file into a DataFrame: data
data = pd.read_csv(file, nrows=5, header=None)

# Build a numpy array from the DataFrame: data_array
data_array = data.values

# Print the datatype of data_array to the shell
print(type(data_array))

# ================== Customizing Pandas Import ================== #
# Import file: data (handle comments, empty lines, missing values)
data = pd.read_csv(file, sep='\t', comment='#', na_values='Nothing')

# Print the head of the DataFrame
print(data.head())

# Plot 'Age' variable in a histogram
pd.DataFrame.hist(data[['Age']])
plt.xlabel('Age (years)')
plt.ylabel('count')
plt.show()
# ================== SQL Query Databases ================== #
# Import packages
from sqlalchemy import create_engine
import pandas as pd

# Create engine: engine
engine = create_engine('sqlite:///Chinook.sqlite')

# Open engine connection: con
con = engine.connect()

# Perform query: rs
rs = con.execute('SELECT * FROM Album')

# Save results of the query to DataFrame: df
df = pd.DataFrame(rs.fetchall())

# Close connection
con.close()

# Print head of DataFrame df
print(df.head())
# ----------- A) Customize SQL query results ------- #
# Create engine: engine
engine = create_engine('sqlite:///Chinook.sqlite')

# Open engine in context manager
with engine.connect() as con:
    rs = con.execute('SELECT * FROM Employee ORDER BY BirthDate')
    df = pd.DataFrame(rs.fetchall())
    # Set the DataFrame's column names
    df.columns = rs.keys()

# Print head of DataFrame
print(df.head())
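
# Aside: when a query needs user-supplied values, bind parameters instead of
# pasting them into the SQL string. A sketch against the SQLAlchemy 1.x API:
from sqlalchemy import text
with engine.connect() as con:
    rs = con.execute(text("SELECT * FROM Album WHERE ArtistId = :aid"), aid=1)
    print(rs.fetchall())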
# ----------- B) Write results of SQL Query directly to DataFrame ------- #
# Create engine: engine
engine = create_engine('sqlite:///Chinook.sqlite')

# Execute query and store records in DataFrame: df
df = pd.read_sql_query("SELECT * FROM Album", engine)

# Print head of DataFrame
print(df.head())

# ----------- C) Relationships between tables: INNER JOIN ------- #
# Open engine in context manager
# Perform query and save results to DataFrame: df
with engine.connect() as con:
    rs = con.execute("SELECT Title, Name FROM Album INNER JOIN Artist ON Album.ArtistID = Artist.ArtistID")
    df = pd.DataFrame(rs.fetchall())
    df.columns = rs.keys()

# Print head of DataFrame df
print(df.head())
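
# The join in C) can also go through pd.read_sql_query, mirroring B); a
# one-call sketch with the same SQL and engine:
df = pd.read_sql_query(
    "SELECT Title, Name FROM Album "
    "INNER JOIN Artist ON Album.ArtistID = Artist.ArtistID", engine)
print(df.head())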
# ================== Files from Web ================== #
# ----------- A) Flat Files ------- #
# Import packages
import matplotlib.pyplot as plt
import pandas as pd

# Assign url of file: url
url = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1606/datasets/winequality-red.csv'

# Read file into a DataFrame: df
df = pd.read_csv(url, sep=';')

# ----------- B) Non-Flat Files ------- #
# Assign url of file: url
url = 'http://s3.amazonaws.com/assets.datacamp.com/course/importing_data_into_r/latitude.xls'

# Read in all sheets of Excel file: xl
# (sheetname=None returns a dict of DataFrames; newer pandas spells it sheet_name)
xl = pd.read_excel(url, sheetname=None)

# Print the sheet names to the shell
print(xl.keys())

# Print the head of the first sheet (using its name, NOT its index)
print(xl['1700'].head())
# ================== HTTP Requests ================== #
# Import package
import requests

# Specify the url: url
url = "http://www.datacamp.com/teach/documentation"

# Package the request, send the request and catch the response: r
r = requests.get(url)

# Extract the response: text
html_doc = r.text

# Print the HTML
print(html_doc)

# ----------- A) Parsing HTML with BeautifulSoup ------- #
# Import package
from bs4 import BeautifulSoup

# Create a BeautifulSoup object from the HTML: soup
soup = BeautifulSoup(html_doc, 'html.parser')

# Print the title of the webpage
print(soup.title)

# Find all 'a' tags (which define hyperlinks): a_tags
a_tags = soup.find_all('a')

# Print the URLs to the shell
for link in a_tags:
    print(link.get('href'))
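
# Two more BeautifulSoup accessors worth knowing, sketched on the same soup:
# prettify() re-indents the raw HTML; get_text() strips all of the tags.
print(soup.prettify()[:500])  # slice just to keep the output short
print(soup.get_text()[:500])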
What I learned:
- importing text files and flat files
- importing files in other formats
- writing SQL queries
- getting data from relational databases
- pulling data from the web
- pulling data from APIs
# Tweet_listener.py — stream listener used in section B) ii) above
# Import packages
import json
import tweepy

class MyStreamListener(tweepy.StreamListener):
    """Writes the first 100 streamed tweets to tweets.txt, one JSON object per line."""

    def __init__(self, api=None):
        super(MyStreamListener, self).__init__()
        self.num_tweets = 0
        self.file = open("tweets.txt", "w")

    def on_status(self, status):
        tweet = status._json
        self.file.write(json.dumps(tweet) + '\n')
        self.num_tweets += 1
        if self.num_tweets < 100:
            return True
        # After 100 tweets: close the file, then return False to disconnect
        self.file.close()
        return False

    def on_error(self, status):
        print(status)