# Created
# April 19, 2025 23:49
# -
# -
# Save boooeee/16ca4f5b58910c10e11a1c4cc3d66960 to your computer and use it in GitHub Desktop.
# R code base analysis
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# This script analyzes every R script in a specified folder (sub-folders
# included) and then creates charts summarizing your coding trends over time.

# Load necessary libraries ----
library(tidyverse)
library(fs)
library(ggplot2)
library(lubridate)
library(dplyr)
library(scales)

# Configuration ----
# Folder that is searched for R scripts and that receives the PNG charts.
folder_path <- "~/" # Update this!
# Months before this date (ISO "YYYY-MM-DD" string) are left out of the charts.
cutoff_date <- "2012-01-01" # Update this to change where the analysis starts
#' Collect size and style metrics for every R script under a folder
#'
#' Recursively lists files matching the glob `*.R` below `folder_path`, reads
#' each one, and counts lines, characters, `$` signs, `function(` occurrences,
#' full-line comments, `%>%` pipes, and `library(` calls.
#'
#' A file that cannot be processed is skipped with a warning; it does not stop
#' the run.
#'
#' @param folder_path Directory to search (sub-folders included).
#' @return A data frame with one row per successfully read script and the
#'   columns `file_name`, `file_path`, `last_modified`, `size_bytes`,
#'   `num_lines`, `num_chars`, `num_dollar_signs`, `num_functions`,
#'   `num_comments`, `num_pipes`, and `num_libraries`. The frame has zero rows
#'   (same columns) when no script was found or readable.
analyze_r_files <- function(folder_path) {
  # Find all .R files in the folder and subfolders
  r_files <- fs::dir_ls(path = folder_path, recurse = TRUE, glob = "*.R")

  # Typed, zero-row result used when there is nothing to report
  empty_results <- data.frame(
    file_name = character(),
    file_path = character(),
    last_modified = as.POSIXct(character()),
    size_bytes = numeric(),
    num_lines = numeric(),
    num_chars = numeric(),
    num_dollar_signs = numeric(),
    num_functions = numeric(),
    num_comments = numeric(),
    num_pipes = numeric(),
    num_libraries = numeric(),
    stringsAsFactors = FALSE
  )

  # Build the one-row summary for a single script; NULL if it cannot be read.
  analyze_one <- function(file) {
    tryCatch(
      {
        file_info <- fs::file_info(file)
        content <- readLines(file, warn = FALSE)
        content_text <- paste(content, collapse = "\n")

        data.frame(
          file_name = basename(file),
          file_path = file,
          last_modified = file_info$modification_time,
          # fs reports sizes as `fs_bytes`; store a plain number so the column
          # has the same type as in the empty result above.
          size_bytes = as.numeric(file_info$size),
          num_lines = length(content),
          num_chars = nchar(content_text),
          num_dollar_signs = stringr::str_count(content_text, "\\$"),
          num_functions = stringr::str_count(content_text, "function\\s*\\("),
          # Only lines whose first non-blank character is `#` count as comments
          num_comments = sum(stringr::str_detect(content, "^\\s*#")),
          num_pipes = stringr::str_count(content_text, "%>%"),
          num_libraries = stringr::str_count(content_text, "library\\("),
          stringsAsFactors = FALSE
        )
      },
      error = function(e) {
        warning(
          "Error processing file: ", file, "\n", conditionMessage(e),
          call. = FALSE
        )
        NULL
      }
    )
  }

  # One row per file, bound once at the end instead of growing a data frame
  # with rbind() on every iteration.
  rows <- lapply(unname(as.character(r_files)), analyze_one)
  rows <- Filter(Negate(is.null), rows)
  if (length(rows) == 0) {
    return(empty_results)
  }

  results <- do.call(rbind, rows)
  rownames(results) <- NULL
  results
}
# Run the analysis ----
results <- analyze_r_files(folder_path)

# Summarize by date ----
# One row per calendar month of "last modified": totals across the scripts
# touched in that month, plus a few ratios derived from those totals.
date_summ <- results %>%
  mutate(date_floor = floor_date(last_modified, unit = "months")) %>%
  group_by(date_floor) %>%
  summarise(
    scripts = n(),
    lines = sum(num_lines),
    characters = sum(num_chars),
    dollars = sum(num_dollar_signs),
    pipes = sum(num_pipes),
    libraries = sum(num_libraries),
    comments = sum(num_comments),
    .groups = "drop"
  ) %>%
  mutate(
    # Ratios are NaN for a month whose scripts are all empty (0 / 0).
    dollar_pct = dollars / characters,
    pipe_pct = pipes / characters,
    packages_per_script = libraries / scripts,
    lines_per_script = lines / scripts,
    comments_per_line = comments / lines,
    # Plain Date (not POSIXct) so the charts can use scale_x_date()
    date_floor = as_date(date_floor)
  )
# Trend charts ----

#' Draw one monthly trend chart and save it as a PNG
#'
#' Plots one column of the monthly summary against `date_floor` as points plus
#' a smoothed trend line, writes the chart to `out_dir`, and returns the plot.
#'
#' @param data Monthly summary with a Date column `date_floor` and the column
#'   named by `y_var`.
#' @param y_var Name (string) of the column to put on the y axis.
#' @param title Chart title.
#' @param subtitle Subtitle text; the plotted year range is appended to it.
#' @param y_label Y-axis label.
#' @param file_name Name of the PNG file to create inside `out_dir`.
#' @param caption Caption shown under the chart.
#' @param percent_axis If `TRUE`, y-axis labels are formatted as percentages.
#' @param start_date First date to include; also the lower x-axis limit.
#'   Defaults to the script-level `cutoff_date`.
#' @param out_dir Folder the PNG is written to. Defaults to the script-level
#'   `folder_path`.
#' @return The ggplot object (returned visibly, so a top-level call also
#'   displays the chart).
save_trend_plot <- function(data, y_var, title, subtitle, y_label, file_name,
                            caption = "Source: My R code repository",
                            percent_axis = FALSE,
                            start_date = as_date(cutoff_date),
                            out_dir = folder_path) {
  background <- "#F7FBF9"
  plot_data <- dplyr::filter(data, .data$date_floor >= start_date)

  # The year range in the subtitle follows the cutoff and the newest month
  # plotted, so it cannot drift out of sync with `cutoff_date`.
  first_year <- format(start_date, "%Y")
  last_year <- if (nrow(plot_data) > 0) {
    format(max(plot_data$date_floor), "%Y")
  } else {
    first_year
  }

  p <- ggplot(plot_data, aes(x = date_floor, y = .data[[y_var]])) +
    geom_point(size = 2, alpha = 0.6, color = "#478873") +
    # A darker gray-green for contrast; `linewidth` replaces the deprecated
    # `size` argument for lines.
    geom_smooth(color = "#2F4F4F", linewidth = 1.2, se = FALSE) +
    labs(
      title = title,
      subtitle = paste0(subtitle, " (", first_year, "\u2013", last_year, ")"),
      x = "Last modified date",
      y = y_label,
      caption = caption
    ) +
    scale_x_date(
      date_breaks = "1 year",
      date_labels = "%Y",
      limits = c(start_date, NA),
      expand = c(0.01, 0.01)
    ) +
    theme_minimal(base_size = 16) +
    theme(
      plot.background = element_rect(fill = background, color = NA),
      panel.background = element_rect(fill = background, color = NA),
      panel.grid.major = element_line(color = "grey90"),
      panel.grid.minor = element_blank(),
      axis.text.x = element_text(angle = 45, hjust = 1),
      plot.title = element_text(face = "bold", size = 20),
      plot.subtitle = element_text(size = 14, margin = margin(b = 10)),
      plot.caption = element_text(size = 10, face = "italic", hjust = 1),
      axis.title.x = element_text(margin = margin(t = 10)),
      axis.title.y = element_text(margin = margin(r = 10))
    )

  if (percent_axis) {
    p <- p + scale_y_continuous(labels = label_percent(accuracy = 0.1))
  }

  # fs::path() inserts the separator itself, so `out_dir` works with or
  # without a trailing slash.
  ggsave(
    filename = as.character(fs::path(out_dir, file_name)),
    plot = p,
    width = 10,
    height = 6,
    dpi = 300,
    bg = background
  )

  p
}

# pipe operator usage ----
save_trend_plot(
  date_summ,
  y_var = "pipe_pct",
  title = "R Pipe Operator Usage Over Time",
  subtitle = "Frequency of `%>%` by last modified date",
  y_label = "Frequency of Pipe Operator",
  file_name = "pipe_operator_usage.png",
  percent_axis = TRUE
)

# dollar sign usage ----
# NOTE(review): this caption names a specific author while the other charts
# say "My R code repository" - confirm which one is intended.
save_trend_plot(
  date_summ,
  y_var = "dollar_pct",
  title = "Dollar Sign Usage in R Code Over Time",
  subtitle = "Frequency of '$' usage by last modified date",
  y_label = "Frequency of '$' Usage",
  file_name = "dollar_sign.png",
  caption = "Source: Mike Beuoy's R code repository",
  percent_axis = TRUE
)

# packages per script ----
save_trend_plot(
  date_summ,
  y_var = "packages_per_script",
  title = "Number of Packages Loaded Per R Script Over Time",
  subtitle = "Frequency of 'library' statement by last modified date",
  y_label = "Avg # of packages loaded",
  file_name = "packages_loaded.png"
)

# lines per script ----
save_trend_plot(
  date_summ,
  y_var = "lines_per_script",
  title = "Number of Lines Per R Script Over Time",
  subtitle = "Number of lines in each R script by last modified date",
  y_label = "Avg # of lines",
  file_name = "lines_per_script.png"
)

# comments per line ----
save_trend_plot(
  date_summ,
  y_var = "comments_per_line",
  title = "Number of Comments Per Line Over Time",
  subtitle = "Number of comments per line in each R script by last modified date",
  y_label = "Comments Per Line",
  file_name = "comments_per_line.png"
)
# Sign up for free
# to join this conversation on GitHub.
# Already have an account?
# Sign in to comment