Created
April 4, 2017 15:22
-
-
Save Awuor87/8b520ac4ab46ba017030a8d83d4d55db to your computer and use it in GitHub Desktop.
Sample code for dplyr package
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## What is dplyr?
# dplyr is a grammar that makes data manipulation quick and easy.
# Install the dplyr package only if it is not already available, so that
# re-running this script does not reinstall it every time
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
# Load the dplyr library
library(dplyr)
# Install the nycflights13 package only if it is not already available
if (!requireNamespace("nycflights13", quietly = TRUE)) {
  install.packages("nycflights13")
}
# Load the nycflights13 library
library(nycflights13)
# Assign the flights data (copies the package data set into the workspace)
flights <- flights
# View the column names and types
str(flights)

#######################
# The select verb
#######################
# Select the carrier and origin columns
select(flights, carrier, origin)
# Select the dep_time and distance columns and assign the result to x
x <- select(flights, dep_time, distance)
# Use str() to view the data
str(x)
# Convert x to a normal data frame
x <- as.data.frame(x)
# Select all columns except flight
select(flights, -flight)
# Select all columns except flight and hour
select(flights, -flight, -hour)
# Look at the names of the columns
names(flights)
# Select the last three columns using ':'
select(flights, hour:time_hour)
# Use starts_with to select variables that start with "a"
# (the helper is spelled starts_with; start_with does not exist)
select(flights, starts_with("a"))
# Use ends_with to select variables that end with "y"
select(flights, ends_with("y"))
# Use contains to select variables that contain "u"
select(flights, contains("u"))
## Practice: select the columns that contain "time"
select(flights, contains("time"))
# Helpers can be combined: columns that contain "time" plus columns ending in "y"
select(flights, contains("time"), ends_with("y"))
## Select columns dep_delay through carrier
select(flights, dep_delay:carrier)
## Select all columns except "tailnum", "day", and "arr_delay"
select(flights, -tailnum, -arr_delay, -day)
#######################
# The filter verb
#######################
# filter() keeps the rows for which every condition is TRUE
# Filter the flights data for rows where the hour was greater than or equal to 5
filter(flights, hour >= 5)
# day is 1 or day is 5
# Each side of `|` must be a full comparison: `day == 1 | 5` keeps every row,
# because the bare 5 is coerced to TRUE
filter(flights, day == 1 | day == 5)
# Two column conditions: day is greater than 2 and hour is greater than 10
# (`&` and a comma between conditions are equivalent inside filter())
filter(flights, day > 2 & hour > 10)
filter(flights, day > 2, hour > 10)
# carrier %in% "B6"
filter(flights, carrier %in% "B6")
# carrier %in% "B6" or "UA"
filter(flights, carrier %in% c("B6", "UA"))
# carrier not in "B6" or "UA"
# Negate a single %in% test against both carriers; `!a | b` would only negate
# the first test and keep all "UA" rows
filter(flights, !carrier %in% c("B6", "UA"))
## Practice: filter flights that were in October and departed at 5
# Use the elementwise `&`; the scalar `&&` does not work row by row
# NOTE(review): dep_time appears to be stored as HHMM, so 5 would mean 00:05;
# use 500 if 5:00 was intended -- confirm
filter(flights, month == 10 & dep_time == 5)
## Select flights where air_time is less than 100.
filter(flights, air_time < 100)
# filter selects rows while select only selects columns
#######################
# The arrange verb
#######################
# arrange() reorders rows by one or more sort keys
# Arrange flights by month
arrange(flights, month)
# Arrange flights by month and dep_delay
# Sort keys are separated by commas; `month & dep_delay` would sort by a single
# TRUE/FALSE column instead
arrange(flights, month, dep_delay)
# Arrange in descending order by month
arrange(flights, desc(month))
# Arrange month in descending order and dep_delay in ascending order
# (for a numeric column, -month is equivalent to desc(month))
arrange(flights, desc(month), dep_delay)
arrange(flights, -month, dep_delay)
## Practice: arrange the flights by carrier and month and assign to x
x <- arrange(flights, carrier, month)
#######################
# The mutate verb
#######################
# mutate() adds new columns computed from existing ones
# Create the variable gain, arr_time minus dep_time
# NOTE(review): arr_time and dep_time appear to be HHMM clock values, so this
# difference is not a duration in minutes -- confirm the intended meaning
mutate(flights, gain = arr_time - dep_time)
## Divide distance by air_time to get speed
mutate(flights, speed = distance / air_time)
###################################
# Combining verbs and the pipe
###################################
# Select the columns year, month, day, and carrier.
# Then filter where month is 12.
filter(select(flights, year, month, day, carrier), month == 12)
# Select the columns year, month, day, and carrier.
# Then filter where month is 12 and arrange by day
arrange(filter(select(flights, year, month, day, carrier), month == 12), day)
# The pipe operator %>% is an alternative to the nested calls above: it passes
# the left-hand result as the first argument of the next verb
flights %>%
  select(year, month, day, carrier)
flights %>%
  select(year, month, day, carrier) %>%
  filter(month == 12) %>%
  arrange(day)
# Use the pipe and select the columns year, month, day, and carrier.
# Then filter where month is 12 and arrange by day
flights %>%
  select(year, month, day, carrier) %>%
  filter(month == 12) %>%
  arrange(day)
## Select carrier, distance, air_time, and month.
## Filter where month is 5
## Create variable mpm = distance/air_time
## Arrange by mpm in descending order
flights %>%
  select(carrier, distance, air_time, month) %>%
  filter(month == 5) %>%
  mutate(mpm = distance / air_time) %>%
  arrange(-mpm)
## Select origin, dest, dep_time, arr_time, carrier
## Filter carriers 'AA' and 'DL'
## Create travel_time, arr_time minus dep_time
## Arrange by travel_time
# The column is named carrier (not carriers)
flights %>%
  select(origin, dest, dep_time, arr_time, carrier) %>%
  filter(carrier == "AA" | carrier == "DL") %>%
  mutate(travel_time = arr_time - dep_time) %>%
  arrange(travel_time)
###################################
# The summarise verb
###################################
# summarise() collapses the rows into one row of summary statistics
# Find the mean, median, standard deviation, min, and max of distance
flights %>%
  summarise(
    mean = mean(distance),
    median = median(distance),
    sd = sd(distance),
    min = min(distance),
    max = max(distance)
  )
# Find the average of air_time
# na.rm = TRUE tells R to ignore the missing values; without it, mean() returns
# NA as soon as the column contains a missing value
flights %>%
  summarise(average_air_time = mean(air_time, na.rm = TRUE))
## Practice:
## Filter carrier by 'DL'
## Find the average distance and the median distance
flights %>%
  filter(carrier == "DL") %>%
  summarise(
    average_distance = mean(distance),
    median_distance = median(distance)
  )
###################################
# The Group By Verb
###################################
# group_by() makes the following summarise() work once per group
# Calculate the mean and median distance for each carrier.
flights %>%
  group_by(carrier) %>%
  summarise(
    average_distance = mean(distance),
    median_distance = median(distance)
  )
## Group by carrier and month
## Filter for month 9 and 10
## Calculate average, min, and max distance
flights %>%
  filter(month == 9 | month == 10) %>%
  group_by(carrier, month) %>%
  summarise(
    average_distance = mean(distance),
    min_distance = min(distance),
    max_distance = max(distance)
  ) %>%
  ungroup() # drop the leftover grouping so later verbs see a plain table
## Join two per-carrier summary tables on the carrier column
# inner_join() needs two data frames, so build each summary first
average_distance <- flights %>%
  group_by(carrier) %>%
  summarise(average_distance = mean(distance))
median_distance <- flights %>%
  group_by(carrier) %>%
  summarise(median_distance = median(distance))
inner_join(average_distance, median_distance, by = "carrier")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment