pedrobrantes · April 7, 2025 22:22
diff --git a/Transforming_data_with_dplyr.R b/Transforming_data_with_dplyr.R
 # Exercises

 library(dplyr)
 counties <- read.csv(file = "Data_Manipulation_with_dplyr/datasets/counties.csv")
 counties <- counties %>%
  mutate(
    census_id = as.character(census_id),state = as.character(state),county = as.character(county),region = as.character(region),metro = as.character(metro),population = as.numeric(population),men = as.numeric(men),women = as.numeric(women),hispanic = as.numeric(hispanic),white = as.numeric(white),black = as.numeric(black),native = as.numeric(native),asian = as.numeric(asian),pacific = as.numeric(pacific),citizens = as.numeric(citizens),income = as.numeric(income),income_err = as.numeric(income_err),income_per_cap = as.numeric(income_per_cap),income_per_cap_err = as.numeric(income_per_cap_err),poverty = as.numeric(poverty),child_poverty = as.numeric(child_poverty),professional = as.numeric(professional),service = as.numeric(service),office = as.numeric(office),construction = as.numeric(construction),production = as.numeric(production),drive = as.numeric(drive),carpool = as.numeric(carpool),transit = as.numeric(transit),walk = as.numeric(walk),other_transp = as.numeric(other_transp),work_at_home = as.numeric(work_at_home),mean_commute = as.numeric(mean_commute),employed = as.numeric(employed),private_work = as.numeric(private_work),public_work = as.numeric(public_work),self_employed = as.numeric(self_employed),family_work = as.numeric(family_work),unemployment = as.numeric(unemployment),land_area = as.numeric(land_area),row_sum = professional + service + office + construction + production,professional = professional / row_sum * 100,service = service / row_sum * 100,office = office / row_sum * 100,construction = construction / row_sum * 100,production = production / row_sum * 100) %>% select(-row_sum)
 # Select the state, county, population and poverty variables
 counties %>% 
  select(state, county, population, poverty)

 # Add a verb to sort in descending order of public_work
 counties_selected <- counties %>% 
  select(
    state, county, population, private_work,
    public_work, self_employed
  ) %>% 
  arrange(desc(public_work))

 # Filter for counties with a population above one million
 counties_selected %>% 
  filter(population > 10^6)

 # and Califórnia too
 counties_selected %>% 
  filter(
    population > 1e+6,
    state == "California"
  )

 # Filter for Texas and more than 10000 people and
 # sort in descending order of private_work
 counties_selected %>% 
  filter(state == "Texas", population > 1e+4) %>% 
  arrange(desc(private_work))

 # Add a new column public_workers with the number of people
 # employed in public work
 counties_selected %>% 
  mutate(
    public_workers = public_work * (population / 100)
  )

 # Select the columns state, county, population, men and women
 # Calculate proportion_women as the fraction of the population
 # made up of women
 counties %>% 
  select(state, county, population, men, women) %>% 
  mutate(
    proportion_women = women / population
  )

 # Select the state county, income and unemployment variables
 counties %>% 
  select(state, county, income, unemployment)

 # Select the state, county, men, women variables and calculate
 # the difference between the number of men and women
 counties %>% 
  select(state, county, men, women) %>%
  mutate(
    diff_men_women = men - women
  )

 # Filter for counties in the state of Florida with an
 # unemployment rate lower than 7%
 counties %>% 
  filter(
    state == "Florida",
    unemployment < 7
  )

 # Filter for counties that are in the "South" region and have
 # a population greater than 500000
 counties %>% 
  filter(
    region == "South",
    population > 5e+5
  )

 # Select the columns state, county, income_per_cap and poverty
 # Order the results in ascending order by the poverty rate
 counties %>% 
  select(state, county, income_per_cap, poverty) %>% 
  arrange(poverty)

 # Create a new column called metro_status that contains "Yes" if
 # the county belongs to a metropolitan area and "No" otherwise
 counties %>% 
  mutate(
    metro_status = ifelse(metro == "Metro", "Yes", "No")
  )

 # Select the columns state, county, white, black and asian
 # Calculate the proportion of white people relative
 # to the total population
 counties %>% 
  mutate(
    state, county, white, black, asian,
    population_white = population * (white / 100)
  )

 # Filter for counties where the proportion of white people is greater
 # than 80% and the income per capita is less than 25000
 counties %>% 
  filter(
    white > 80,
    income_per_cap < 25^3
  )

 # Create a new column called commute_above_average that is TRUE if the
 # mean commute time is greater than 25 minutes and FALSE otherwise
 counties %>% 
  mutate(
    commute_above_average =
      ifelse(
        mean_commute > 25,
        TRUE,
        FALSE
      )
  )

 # Select the columns state, county, professional, service, office,
 # construction and production
 # Calculate the total number of workers in these sectors
 counties %>% 
  select(
    state, county, professional, service, office,
    construction, production, employed
  ) %>% 
  mutate(
    professional_workers = (professional / 100) * employed,
    service_workers = (service / 100) * employed,
    office_workers = (office / 100) * employed,
    construction_workers = (construction / 100) * employed,
    production_workers = (production / 100) * employed,
    total_workers_selected = professional_workers +
      service_workers + office_workers +
      construction_workers + production_workers
  )

 # Filter for counties in the state of Texas that have a land_area
 # greater than 1000 square miles and order them in descending
 # order by land_area
 counties %>% 
  filter(
    state == "Texas",
    land_area > 10^3
  ) %>%
  arrange(desc(land_area))

 # Select the columns state, county, child_poverty and poverty
 # Create a new column to categorize the relationship between child
 # poverty and overall poverty
 counties %>% 
  mutate(
    .keep = "none",
    state, county, child_poverty, poverty,
    poverty_ratio = child_poverty / poverty,
    poverty_category = case_when(
      poverty_ratio > 1 ~ "Child Higher",
      poverty_ratio == 1 ~ "Proportional",
      poverty_ratio < 1 ~ "Child Lower"
    )
  )
	# Exercises

	library(dplyr)
	counties <- read.csv(file = "Data_Manipulation_with_dplyr/datasets/counties.csv")
	counties <- counties %>%
	mutate(
	census_id = as.character(census_id),state = as.character(state),county = as.character(county),region = as.character(region),metro = as.character(metro),population = as.numeric(population),men = as.numeric(men),women = as.numeric(women),hispanic = as.numeric(hispanic),white = as.numeric(white),black = as.numeric(black),native = as.numeric(native),asian = as.numeric(asian),pacific = as.numeric(pacific),citizens = as.numeric(citizens),income = as.numeric(income),income_err = as.numeric(income_err),income_per_cap = as.numeric(income_per_cap),income_per_cap_err = as.numeric(income_per_cap_err),poverty = as.numeric(poverty),child_poverty = as.numeric(child_poverty),professional = as.numeric(professional),service = as.numeric(service),office = as.numeric(office),construction = as.numeric(construction),production = as.numeric(production),drive = as.numeric(drive),carpool = as.numeric(carpool),transit = as.numeric(transit),walk = as.numeric(walk),other_transp = as.numeric(other_transp),work_at_home = as.numeric(work_at_home),mean_commute = as.numeric(mean_commute),employed = as.numeric(employed),private_work = as.numeric(private_work),public_work = as.numeric(public_work),self_employed = as.numeric(self_employed),family_work = as.numeric(family_work),unemployment = as.numeric(unemployment),land_area = as.numeric(land_area),row_sum = professional + service + office + construction + production,professional = professional / row_sum * 100,service = service / row_sum * 100,office = office / row_sum * 100,construction = construction / row_sum * 100,production = production / row_sum * 100) %>% select(-row_sum)
	# Select the state, county, population and poverty variables
	counties %>%
	select(state, county, population, poverty)

	# Add a verb to sort in descending order of public_work
	counties_selected <- counties %>%
	select(
	state, county, population, private_work,
	public_work, self_employed
	) %>%
	arrange(desc(public_work))

	# Filter for counties with a population above one million
	counties_selected %>%
	filter(population > 10^6)

	# and Califórnia too
	counties_selected %>%
	filter(
	population > 1e+6,
	state == "California"
	)

	# Filter for Texas and more than 10000 people and
	# sort in descending order of private_work
	counties_selected %>%
	filter(state == "Texas", population > 1e+4) %>%
	arrange(desc(private_work))

	# Add a new column public_workers with the number of people
	# employed in public work
	counties_selected %>%
	mutate(
	public_workers = public_work * (population / 100)
	)

	# Select the columns state, county, population, men and women
	# Calculate proportion_women as the fraction of the population
	# made up of women
	counties %>%
	select(state, county, population, men, women) %>%
	mutate(
	proportion_women = women / population
	)

	# Select the state county, income and unemployment variables
	counties %>%
	select(state, county, income, unemployment)

	# Select the state, county, men, women variables and calculate
	# the difference between the number of men and women
	counties %>%
	select(state, county, men, women) %>%
	mutate(
	diff_men_women = men - women
	)

	# Filter for counties in the state of Florida with an
	# unemployment rate lower than 7%
	counties %>%
	filter(
	state == "Florida",
	unemployment < 7
	)

	# Filter for counties that are in the "South" region and have
	# a population greater than 500000
	counties %>%
	filter(
	region == "South",
	population > 5e+5
	)

	# Select the columns state, county, income_per_cap and poverty
	# Order the results in ascending order by the poverty rate
	counties %>%
	select(state, county, income_per_cap, poverty) %>%
	arrange(poverty)

	# Create a new column called metro_status that contains "Yes" if
	# the county belongs to a metropolitan area and "No" otherwise
	counties %>%
	mutate(
	metro_status = ifelse(metro == "Metro", "Yes", "No")
	)

	# Select the columns state, county, white, black and asian
	# Calculate the proportion of white people relative
	# to the total population
	counties %>%
	mutate(
	state, county, white, black, asian,
	population_white = population * (white / 100)
	)

	# Filter for counties where the proportion of white people is greater
	# than 80% and the income per capita is less than 25000
	counties %>%
	filter(
	white > 80,
	income_per_cap < 25^3
	)

	# Create a new column called commute_above_average that is TRUE if the
	# mean commute time is greater than 25 minutes and FALSE otherwise
	counties %>%
	mutate(
	commute_above_average =
	ifelse(
	mean_commute > 25,
	TRUE,
	FALSE
	)
	)

	# Select the columns state, county, professional, service, office,
	# construction and production
	# Calculate the total number of workers in these sectors
	counties %>%
	select(
	state, county, professional, service, office,
	construction, production, employed
	) %>%
	mutate(
	professional_workers = (professional / 100) * employed,
	service_workers = (service / 100) * employed,
	office_workers = (office / 100) * employed,
	construction_workers = (construction / 100) * employed,
	production_workers = (production / 100) * employed,
	total_workers_selected = professional_workers +
	service_workers + office_workers +
	construction_workers + production_workers
	)

	# Filter for counties in the state of Texas that have a land_area
	# greater than 1000 square miles and order them in descending
	# order by land_area
	counties %>%
	filter(
	state == "Texas",
	land_area > 10^3
	) %>%
	arrange(desc(land_area))

	# Select the columns state, county, child_poverty and poverty
	# Create a new column to categorize the relationship between child
	# poverty and overall poverty
	counties %>%
	mutate(
	.keep = "none",
	state, county, child_poverty, poverty,
	poverty_ratio = child_poverty / poverty,
	poverty_category = case_when(
	poverty_ratio > 1 ~ "Child Higher",
	poverty_ratio == 1 ~ "Proportional",
	poverty_ratio < 1 ~ "Child Lower"
	)
	)
No results found