Skip to content

Instantly share code, notes, and snippets.

@pedrobrantes
Created April 7, 2025 22:22
Show Gist options
  • Select an option

  • Save pedrobrantes/f10f7a83c5ce5366fc20b95d5a2be9bc to your computer and use it in GitHub Desktop.

Select an option

Save pedrobrantes/f10f7a83c5ce5366fc20b95d5a2be9bc to your computer and use it in GitHub Desktop.
This Gist contains R code exercises demonstrating data manipulation with the dplyr package using a dataset of US counties. It includes examples of selecting columns, filtering rows based on various criteria (population, state, poverty, unemployment, etc.), sorting data, and creating new columns through calculations and conditional logic.
# Exercises
library(dplyr)
counties <- read.csv(file = "Data_Manipulation_with_dplyr/datasets/counties.csv")
counties <- counties %>%
mutate(
census_id = as.character(census_id),state = as.character(state),county = as.character(county),region = as.character(region),metro = as.character(metro),population = as.numeric(population),men = as.numeric(men),women = as.numeric(women),hispanic = as.numeric(hispanic),white = as.numeric(white),black = as.numeric(black),native = as.numeric(native),asian = as.numeric(asian),pacific = as.numeric(pacific),citizens = as.numeric(citizens),income = as.numeric(income),income_err = as.numeric(income_err),income_per_cap = as.numeric(income_per_cap),income_per_cap_err = as.numeric(income_per_cap_err),poverty = as.numeric(poverty),child_poverty = as.numeric(child_poverty),professional = as.numeric(professional),service = as.numeric(service),office = as.numeric(office),construction = as.numeric(construction),production = as.numeric(production),drive = as.numeric(drive),carpool = as.numeric(carpool),transit = as.numeric(transit),walk = as.numeric(walk),other_transp = as.numeric(other_transp),work_at_home = as.numeric(work_at_home),mean_commute = as.numeric(mean_commute),employed = as.numeric(employed),private_work = as.numeric(private_work),public_work = as.numeric(public_work),self_employed = as.numeric(self_employed),family_work = as.numeric(family_work),unemployment = as.numeric(unemployment),land_area = as.numeric(land_area),row_sum = professional + service + office + construction + production,professional = professional / row_sum * 100,service = service / row_sum * 100,office = office / row_sum * 100,construction = construction / row_sum * 100,production = production / row_sum * 100) %>% select(-row_sum)
# Select the state, county, population and poverty variables
counties %>%
select(state, county, population, poverty)
# Add a verb to sort in descending order of public_work
counties_selected <- counties %>%
select(
state, county, population, private_work,
public_work, self_employed
) %>%
arrange(desc(public_work))
# Filter for counties with a population above one million
counties_selected %>%
filter(population > 10^6)
# and Califórnia too
counties_selected %>%
filter(
population > 1e+6,
state == "California"
)
# Filter for Texas and more than 10000 people and
# sort in descending order of private_work
counties_selected %>%
filter(state == "Texas", population > 1e+4) %>%
arrange(desc(private_work))
# Add a new column public_workers with the number of people
# employed in public work
counties_selected %>%
mutate(
public_workers = public_work * (population / 100)
)
# Select the columns state, county, population, men and women
# Calculate proportion_women as the fraction of the population
# made up of women
counties %>%
select(state, county, population, men, women) %>%
mutate(
proportion_women = women / population
)
# Select the state county, income and unemployment variables
counties %>%
select(state, county, income, unemployment)
# Select the state, county, men, women variables and calculate
# the difference between the number of men and women
counties %>%
select(state, county, men, women) %>%
mutate(
diff_men_women = men - women
)
# Filter for counties in the state of Florida with an
# unemployment rate lower than 7%
counties %>%
filter(
state == "Florida",
unemployment < 7
)
# Filter for counties that are in the "South" region and have
# a population greater than 500000
counties %>%
filter(
region == "South",
population > 5e+5
)
# Select the columns state, county, income_per_cap and poverty
# Order the results in ascending order by the poverty rate
counties %>%
select(state, county, income_per_cap, poverty) %>%
arrange(poverty)
# Create a new column called metro_status that contains "Yes" if
# the county belongs to a metropolitan area and "No" otherwise
counties %>%
mutate(
metro_status = ifelse(metro == "Metro", "Yes", "No")
)
# Select the columns state, county, white, black and asian
# Calculate the proportion of white people relative
# to the total population
counties %>%
mutate(
state, county, white, black, asian,
population_white = population * (white / 100)
)
# Filter for counties where the proportion of white people is greater
# than 80% and the income per capita is less than 25000
counties %>%
filter(
white > 80,
income_per_cap < 25^3
)
# Create a new column called commute_above_average that is TRUE if the
# mean commute time is greater than 25 minutes and FALSE otherwise
counties %>%
mutate(
commute_above_average =
ifelse(
mean_commute > 25,
TRUE,
FALSE
)
)
# Select the columns state, county, professional, service, office,
# construction and production
# Calculate the total number of workers in these sectors
counties %>%
select(
state, county, professional, service, office,
construction, production, employed
) %>%
mutate(
professional_workers = (professional / 100) * employed,
service_workers = (service / 100) * employed,
office_workers = (office / 100) * employed,
construction_workers = (construction / 100) * employed,
production_workers = (production / 100) * employed,
total_workers_selected = professional_workers +
service_workers + office_workers +
construction_workers + production_workers
)
# Filter for counties in the state of Texas that have a land_area
# greater than 1000 square miles and order them in descending
# order by land_area
counties %>%
filter(
state == "Texas",
land_area > 10^3
) %>%
arrange(desc(land_area))
# Select the columns state, county, child_poverty and poverty
# Create a new column to categorize the relationship between child
# poverty and overall poverty
counties %>%
mutate(
.keep = "none",
state, county, child_poverty, poverty,
poverty_ratio = child_poverty / poverty,
poverty_category = case_when(
poverty_ratio > 1 ~ "Child Higher",
poverty_ratio == 1 ~ "Proportional",
poverty_ratio < 1 ~ "Child Lower"
)
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment