Skip to content

Instantly share code, notes, and snippets.

@Awuor87
Created April 4, 2017 15:22
Show Gist options
  • Save Awuor87/8b520ac4ab46ba017030a8d83d4d55db to your computer and use it in GitHub Desktop.
Save Awuor87/8b520ac4ab46ba017030a8d83d4d55db to your computer and use it in GitHub Desktop.
Sample code for dplyr package
## What is dplyr?
# dplyr is a grammar that makes data manipulation quick and easy.
# Install the dplyr package only when it is missing, so that re-running
# this script does not reinstall it every time
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
# Load the dplyr library
library(dplyr)
# Install the nycflights13 package only when it is missing
if (!requireNamespace("nycflights13", quietly = TRUE)) {
  install.packages("nycflights13")
}
# Load the nycflights13 library
library(nycflights13)
# Copy the flights data set from the package into the workspace
flights <- nycflights13::flights
# View the structure of the data (column names and types)
str(flights)
# Select the carrier and origin columns
select(flights, carrier, origin)
# Select the dep_time and distance columns and assign the result to x
x <- select(flights, dep_time, distance)
# Use str() to view the data
str(x)
# Convert x to a normal data frame
x <- as.data.frame(x)
# Select all columns except flight
select(flights, -flight)
# Select all columns except flight and hour
select(flights, -flight, -hour)
# Look at the names of the columns
names(flights)
# Select the range of columns from hour through time_hour using ':'
select(flights, hour:time_hour)
# Use starts_with to select variables that start with "a"
# (the helper is starts_with(), with an "s"; start_with() does not exist)
select(flights, starts_with("a"))
# Use ends_with to select variables that end with "y"
select(flights, ends_with("y"))
# Use contains to select variables that contain "u"
select(flights, contains("u"))
## Practice: select the columns that contain "time"
select(flights, contains("time"))
# Helpers can be combined: columns containing "time" plus columns ending in "y"
select(flights, contains("time"), ends_with("y"))
## Select columns dep_delay through carrier
select(flights, dep_delay:carrier)
## Select all columns except "tailnum", "day", and "arr_delay"
select(flights, -tailnum, -arr_delay, -day)
#######################
# The filter verb
#######################
# Filter the flights data for rows where the hour was greater than or equal to 5
filter(flights, hour >= 5)
# day is 1 or day is 5
# (`day == 1 | 5` would be TRUE for every row, because the bare 5 is
# coerced to TRUE; each side of `|` needs its own comparison, or use %in%)
filter(flights, day %in% c(1, 5))
# Two column conditions: day is greater than 2 and hour is greater than 10
filter(flights, day > 2 & hour > 10)
# Comma-separated conditions are combined with "and", so this is equivalent
filter(flights, day > 2, hour > 10)
# carrier %in% "B6"
filter(flights, carrier %in% "B6")
# carrier %in% "B6" or "UA"
filter(flights, carrier %in% c("B6", "UA"))
# carrier not in "B6" or "UA"
# (negate the whole membership test so both carriers are excluded)
filter(flights, !carrier %in% c("B6", "UA"))
## Practice: filter flights that were in October and departed at 5
# Use the elementwise `&` (or a comma), not the scalar `&&`, which errors
# on vectors longer than one in R >= 4.3
# NOTE(review): dep_time appears to be stored as HHMM in nycflights13, so
# 5 o'clock may need to be written as 500 - confirm the intended value
filter(flights, month == 10 & dep_time == 5)
## Select flights where air_time is less than 100.
filter(flights, air_time < 100)
# filter selects rows while select only selects columns
#######################
# The arrange verb
#######################
# Arrange flights by month
arrange(flights, month)
# Arrange flights by month and dep_delay
# (sort keys are separated by commas; `month & dep_delay` would sort by a
# single TRUE/FALSE vector instead of by the two columns)
arrange(flights, month, dep_delay)
# Arrange in descending order by month
arrange(flights, desc(month))
# Arrange month in descending order and dep_delay in ascending order
arrange(flights, desc(month), dep_delay)
# Negating a numeric column is a shorthand for desc()
arrange(flights, -month, dep_delay)
## Practice: arrange the flights by carrier and month and assign to x
x <- arrange(flights, carrier, month)
#######################
# The mutate verb
#######################
# Add a gain column: arr_time with dep_time subtracted from it
flights %>%
  mutate(gain = arr_time - dep_time)
## Add a speed column: distance divided by air_time
flights %>%
  mutate(speed = distance / air_time)
###################################
# Combining verbs and the pipe
###################################
# Select the columns year, month, day, and carrier.
# Then filter where month is 12.
# Nested calls are read from the inside out.
filter(select(flights, year, month, day, carrier), month == 12)
# Select the columns year, month, day, and carrier.
# Then filter where month is 12 and arrange by day
arrange(
  filter(select(flights, year, month, day, carrier), month == 12),
  day
)
# The pipe operator %>% is an alternative to the combinations above:
# it passes the left-hand result in as the first argument of the next call
flights %>%
  select(year, month, day, carrier)
flights %>%
  select(year, month, day, carrier) %>%
  filter(month == 12) %>%
  arrange(day)
# Use the pipe and select the columns year, month, day, and carrier.
# Then filter where month is 12 and arrange by day
flights %>%
  select(year, month, day, carrier) %>%
  filter(month == 12) %>%
  arrange(day)
## Select carrier, distance, air_time, and month.
## Filter where month is 5
## Create variable mpm = distance/air_time
## Arrange by mpm in descending order
flights %>%
  select(carrier, distance, air_time, month) %>%
  filter(month == 5) %>%
  mutate(mpm = distance / air_time) %>%
  arrange(desc(mpm))
## Select origin, dest, dep_time, arr_time, carrier
## Filter carriers 'AA' and 'DL'
## Create travel_time, arr_time minus dep_time
## Arrange by travel_time
# The column is named `carrier` (singular); the last two steps complete the
# pipeline described above
flights %>%
  select(origin, dest, dep_time, arr_time, carrier) %>%
  filter(carrier %in% c("AA", "DL")) %>%
  mutate(travel_time = arr_time - dep_time) %>%
  arrange(travel_time)
###################################
# The summarise verb
###################################
# Find the mean, median, standard deviation, min, and max of distance
# (each summary needs its function call: `mean = (distance)` would just
# return the distance column itself instead of its average)
flights %>%
  summarise(
    mean = mean(distance),
    median = median(distance),
    sd = sd(distance),
    min = min(distance),
    max = max(distance)
  )
# Find the average of air_time
flights %>%
  summarise(average_air_time = mean(air_time, na.rm = TRUE))
# na.rm = TRUE tells R to ignore missing values; without it, mean()
# returns NA whenever the column contains a missing value
## Practice:
## Filter carrier by 'DL'
## Find the average distance and the median distance
flights %>%
  filter(carrier == "DL") %>%
  summarise(
    average_distance = mean(distance),
    median_distance = median(distance)
  )
###################################
# The Group By Verb
###################################
# Calculate the mean and median distance for each carrier.
flights %>%
  group_by(carrier) %>%
  summarise(
    average_distance = mean(distance),
    median_distance = median(distance)
  )
## Filter for month 9 and 10
## Group by carrier and month
## Calculate average, min, and max distance
flights %>%
  filter(month %in% c(9, 10)) %>%
  group_by(carrier, month) %>%
  summarise(
    average_distance = mean(distance),
    min_distance = min(distance),
    max_distance = max(distance)
  ) %>%
  # summarise() only drops the last grouping level, so remove the remaining
  # grouping explicitly
  ungroup()
## Join two per-carrier summary tables on the carrier column
# Each table must exist before it is joined, so build one table with the
# average distance per carrier and one with the median distance per carrier
average_distance <- flights %>%
  group_by(carrier) %>%
  summarise(average_distance = mean(distance))
median_distance <- flights %>%
  group_by(carrier) %>%
  summarise(median_distance = median(distance))
inner_join(average_distance, median_distance, by = "carrier")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment