Last active
February 21, 2024 09:30
-
-
Save mGalarnyk/8cd9a387b380f03c2ee9533f22558469 to your computer and use it in GitHub Desktop.
Getting and Clean Data Quiz 3 from John Hopkins Coursera for the blog post https://medium.com/@GalarnykMichael/getting-and-cleaning-data-jhu-coursera-course-3-c3635747858b#.270anhem0
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Getting and Cleaning Data, Quiz 3 JHU Coursera | |
# 1. Register an application with the Github API here https://github.com/settings/applications. | |
#The American Community Survey distributes downloadable data about United States communities. Download the 2006 microdata survey about housing for the state of Idaho using download.file() from here: | |
#https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Fss06hid.csv | |
#and load the data into R. The code book, describing the variable names is here: | |
#https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FPUMSDataDict06.pdf | |
#Create a logical vector that identifies the households on greater than 10 acres who sold more than $10,000 worth of agriculture products. Assign that logical vector to the variable agricultureLogical. Apply the which() function like this to identify the rows of the data frame where the logical vector is TRUE. | |
#which(agricultureLogical) | |
#What are the first 3 values that result? | |
# Download the file | |
download.file('https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Fss06hid.csv' | |
, 'ACS.csv' | |
, method='curl' ) | |
# Read data into data.frame | |
ACS <- read.csv('ACS.csv') | |
agricultureLogical <- ACS$ACR == 3 & ACS$AGS == 6 | |
head(which(agricultureLogical), 3) | |
# 2. Using the jpeg package read in the following picture of your instructor into R | |
# https://d396qusza40orc.cloudfront.net/getdata%2Fjeff.jpg | |
# Use the parameter native=TRUE. What are the 30th and 80th quantiles of the resulting data? (some Linux systems may produce an answer 638 different for the 30th quantile) | |
# write the file url and file destination to an object | |
# install.packages('jpeg') | |
library(jpeg) | |
# Download the file | |
download.file('https://d396qusza40orc.cloudfront.net/getdata%2Fjeff.jpg' | |
, 'jeff.jpg' | |
, mode='wb' ) | |
# Read the image | |
picture <- jpeg::readJPEG('jeff.jpg' | |
, native=TRUE) | |
# Get Sample Quantiles corressponding to given prob | |
quantile(picture, probs = c(0.3, 0.8) ) | |
# 3. Load the Gross Domestic Product data for the 190 ranked countries in this data set: | |
# https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FGDP.csv | |
# Load the educational data from this data set: | |
# https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FEDSTATS_Country.csv | |
# Match the data based on the country shortcode. How many of the IDs match? Sort the data frame in descending order by GDP rank (so United States is last). | |
# What is the 13th country in the resulting data frame? | |
# install.packages("data.table) | |
library("data.table") | |
# Download data and read FGDP data into data.table | |
FGDP <- data.table::fread('https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FGDP.csv' | |
, skip=4 | |
, nrows = 190 | |
, select = c(1, 2, 4, 5) | |
, col.names=c("CountryCode", "Rank", "Economy", "Total") | |
) | |
# Download data and read FGDP data into data.table | |
FEDSTATS_Country <- data.table::fread('https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FEDSTATS_Country.csv' | |
) | |
#data.table::merge(FGDP, FEDSTATS_Country, by = 'CountryCode') | |
mergedDT <- merge(FGDP, FEDSTATS_Country, by = 'CountryCode') | |
# How many of the IDs match? | |
nrow(mergedDT) | |
# Sort the data frame in descending order by GDP rank (so United States is last). | |
# What is the 13th country in the resulting data frame? | |
mergedDT[order(-Rank)][13,.(Economy)] | |
# 4. What is the average GDP ranking for the | |
# "High income: OECD" and "High income: nonOECD" group? | |
# "High income: OECD" | |
mergedDT[`Income Group` == "High income: OECD" | |
, lapply(.SD, mean) | |
, .SDcols = c("Rank") | |
, by = "Income Group"] | |
# "High income: nonOECD" | |
mergedDT[`Income Group` == "High income: nonOECD" | |
, lapply(.SD, mean) | |
, .SDcols = c("Rank") | |
, by = "Income Group"] | |
# 5. Cut the GDP ranking into 5 separate quantile groups. | |
# Make a table versus Income.Group. | |
# How many countries are Lower middle income but among the 38 nations with highest GDP? | |
install.packages('dplyr') | |
library('dplyr') | |
breaks <- quantile(mergedDT[, Rank], probs = seq(0, 1, 0.2), na.rm = TRUE) | |
mergedDT$quantileGDP <- cut(mergedDT[, Rank], breaks = breaks) | |
mergedDT[`Income Group` == "Lower middle income", .N, by = c("Income Group", "quantileGDP")] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@ uno online: Great! This is what I'm looking for.