Created
April 7, 2025 22:22
-
-
Save pedrobrantes/f10f7a83c5ce5366fc20b95d5a2be9bc to your computer and use it in GitHub Desktop.
This Gist contains R code exercises demonstrating data manipulation with the dplyr package using a dataset of US counties. It includes examples of selecting columns, filtering rows based on various criteria (population, state, poverty, unemployment, etc.), sorting data, and creating new columns through calculations and conditional logic.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Exercises | |
| library(dplyr) | |
| counties <- read.csv(file = "Data_Manipulation_with_dplyr/datasets/counties.csv") | |
| counties <- counties %>% | |
| mutate( | |
| census_id = as.character(census_id),state = as.character(state),county = as.character(county),region = as.character(region),metro = as.character(metro),population = as.numeric(population),men = as.numeric(men),women = as.numeric(women),hispanic = as.numeric(hispanic),white = as.numeric(white),black = as.numeric(black),native = as.numeric(native),asian = as.numeric(asian),pacific = as.numeric(pacific),citizens = as.numeric(citizens),income = as.numeric(income),income_err = as.numeric(income_err),income_per_cap = as.numeric(income_per_cap),income_per_cap_err = as.numeric(income_per_cap_err),poverty = as.numeric(poverty),child_poverty = as.numeric(child_poverty),professional = as.numeric(professional),service = as.numeric(service),office = as.numeric(office),construction = as.numeric(construction),production = as.numeric(production),drive = as.numeric(drive),carpool = as.numeric(carpool),transit = as.numeric(transit),walk = as.numeric(walk),other_transp = as.numeric(other_transp),work_at_home = as.numeric(work_at_home),mean_commute = as.numeric(mean_commute),employed = as.numeric(employed),private_work = as.numeric(private_work),public_work = as.numeric(public_work),self_employed = as.numeric(self_employed),family_work = as.numeric(family_work),unemployment = as.numeric(unemployment),land_area = as.numeric(land_area),row_sum = professional + service + office + construction + production,professional = professional / row_sum * 100,service = service / row_sum * 100,office = office / row_sum * 100,construction = construction / row_sum * 100,production = production / row_sum * 100) %>% select(-row_sum) | |
| # Select the state, county, population and poverty variables | |
| counties %>% | |
| select(state, county, population, poverty) | |
| # Add a verb to sort in descending order of public_work | |
| counties_selected <- counties %>% | |
| select( | |
| state, county, population, private_work, | |
| public_work, self_employed | |
| ) %>% | |
| arrange(desc(public_work)) | |
| # Filter for counties with a population above one million | |
| counties_selected %>% | |
| filter(population > 10^6) | |
| # and Califórnia too | |
| counties_selected %>% | |
| filter( | |
| population > 1e+6, | |
| state == "California" | |
| ) | |
| # Filter for Texas and more than 10000 people and | |
| # sort in descending order of private_work | |
| counties_selected %>% | |
| filter(state == "Texas", population > 1e+4) %>% | |
| arrange(desc(private_work)) | |
| # Add a new column public_workers with the number of people | |
| # employed in public work | |
| counties_selected %>% | |
| mutate( | |
| public_workers = public_work * (population / 100) | |
| ) | |
| # Select the columns state, county, population, men and women | |
| # Calculate proportion_women as the fraction of the population | |
| # made up of women | |
| counties %>% | |
| select(state, county, population, men, women) %>% | |
| mutate( | |
| proportion_women = women / population | |
| ) | |
| # Select the state county, income and unemployment variables | |
| counties %>% | |
| select(state, county, income, unemployment) | |
| # Select the state, county, men, women variables and calculate | |
| # the difference between the number of men and women | |
| counties %>% | |
| select(state, county, men, women) %>% | |
| mutate( | |
| diff_men_women = men - women | |
| ) | |
| # Filter for counties in the state of Florida with an | |
| # unemployment rate lower than 7% | |
| counties %>% | |
| filter( | |
| state == "Florida", | |
| unemployment < 7 | |
| ) | |
| # Filter for counties that are in the "South" region and have | |
| # a population greater than 500000 | |
| counties %>% | |
| filter( | |
| region == "South", | |
| population > 5e+5 | |
| ) | |
| # Select the columns state, county, income_per_cap and poverty | |
| # Order the results in ascending order by the poverty rate | |
| counties %>% | |
| select(state, county, income_per_cap, poverty) %>% | |
| arrange(poverty) | |
| # Create a new column called metro_status that contains "Yes" if | |
| # the county belongs to a metropolitan area and "No" otherwise | |
| counties %>% | |
| mutate( | |
| metro_status = ifelse(metro == "Metro", "Yes", "No") | |
| ) | |
| # Select the columns state, county, white, black and asian | |
| # Calculate the proportion of white people relative | |
| # to the total population | |
| counties %>% | |
| mutate( | |
| state, county, white, black, asian, | |
| population_white = population * (white / 100) | |
| ) | |
| # Filter for counties where the proportion of white people is greater | |
| # than 80% and the income per capita is less than 25000 | |
| counties %>% | |
| filter( | |
| white > 80, | |
| income_per_cap < 25^3 | |
| ) | |
| # Create a new column called commute_above_average that is TRUE if the | |
| # mean commute time is greater than 25 minutes and FALSE otherwise | |
| counties %>% | |
| mutate( | |
| commute_above_average = | |
| ifelse( | |
| mean_commute > 25, | |
| TRUE, | |
| FALSE | |
| ) | |
| ) | |
| # Select the columns state, county, professional, service, office, | |
| # construction and production | |
| # Calculate the total number of workers in these sectors | |
| counties %>% | |
| select( | |
| state, county, professional, service, office, | |
| construction, production, employed | |
| ) %>% | |
| mutate( | |
| professional_workers = (professional / 100) * employed, | |
| service_workers = (service / 100) * employed, | |
| office_workers = (office / 100) * employed, | |
| construction_workers = (construction / 100) * employed, | |
| production_workers = (production / 100) * employed, | |
| total_workers_selected = professional_workers + | |
| service_workers + office_workers + | |
| construction_workers + production_workers | |
| ) | |
| # Filter for counties in the state of Texas that have a land_area | |
| # greater than 1000 square miles and order them in descending | |
| # order by land_area | |
| counties %>% | |
| filter( | |
| state == "Texas", | |
| land_area > 10^3 | |
| ) %>% | |
| arrange(desc(land_area)) | |
| # Select the columns state, county, child_poverty and poverty | |
| # Create a new column to categorize the relationship between child | |
| # poverty and overall poverty | |
| counties %>% | |
| mutate( | |
| .keep = "none", | |
| state, county, child_poverty, poverty, | |
| poverty_ratio = child_poverty / poverty, | |
| poverty_category = case_when( | |
| poverty_ratio > 1 ~ "Child Higher", | |
| poverty_ratio == 1 ~ "Proportional", | |
| poverty_ratio < 1 ~ "Child Lower" | |
| ) | |
| ) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment