Last active
November 11, 2023 00:25
-
-
Save MarkEdmondson1234/a7d6daedfa40ce2d6f27a1a5c56e9a50 to your computer and use it in GitHub Desktop.
Find pages with 0 pageviews by comparing the URLs in sitemap.xml with the pages that received Google Analytics visits.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(googleAnalyticsR) | |
library(xml2) | |
library(dplyr) | |
## authenticate with Google Analytics before any API call
ga_auth()

## reporting window for the URLs to test: the last 30 days up to today
dates <- Sys.Date() - c(30, 0)

## Google Analytics View ID (placeholder - replace with your own)
id <- 11111111
## function to get sitemap URLs | |
## Read a sitemap (or sitemap index) and return the text of one field
## from each of its entries.
##
## sitemap: URL or file path of the sitemap XML, passed to xml2::read_xml()
## field:   name of the child element to extract from every entry;
##          "loc" holds the URL in both <url> and <sitemap> entries
##
## Returns a character vector with one element per matching node
## (character(0) if there are none), or NULL - after emitting a message -
## when the sitemap cannot be downloaded or parsed.
get_sitemap <- function(sitemap, field = "loc") {
  tryCatch(
    {
      ## sitemaps declare a default namespace; strip it so the plain
      ## element name can be used in the XPath expression below
      doc <- xml2::xml_ns_strip(xml2::read_xml(sitemap))

      ## /root/entry/field, queried directly so the result does not depend
      ## on the nested-list layout that as_list() produces
      nodes <- xml2::xml_find_all(doc, paste0("/*/*/", field))

      trimws(xml2::xml_text(nodes))
    },
    error = function(e) {
      ## best effort: report the failing sitemap and let the caller carry on
      message("Problem with sitemap:", sitemap)
      NULL
    }
  )
}
## filter clause for Google organic search traffic:
## source is exactly "google" AND medium is exactly "organic"
google_seo <- filter_clause_ga4(
  filters = list(
    dim_filter(dimension = "source", operator = "EXACT", expressions = "google"),
    dim_filter(dimension = "medium", operator = "EXACT", expressions = "organic")
  ),
  operator = "AND"
)

## fetch pageviews and total events per page path for the date range,
## restricted to the filter above
pages <- google_analytics_4(
  viewId = id,
  date_range = dates,
  metrics = c("pageviews", "totalEvents"),
  dimensions = "pagePath",
  dim_filters = google_seo,
  max = -1,          # -1 requests all rows rather than a capped number
  anti_sample = TRUE # ask the library to work around GA sampling
)
## get the sitemap index file (it lists the individual sitemaps)
url_si <- "http://www.example.com/sitemap.xml"
sitemap_index <- get_sitemap(url_si)

## get all the sitemaps listed in the index
## (maybe you only need the call above if you have no sitemap index)
many_sitemaps <- lapply(sitemap_index, get_sitemap)

## all the urls in all the sitemaps; sitemaps that failed to load are NULL
## and drop out here, and a URL listed in several sitemaps is kept once
all_urls <- unique(unlist(many_sitemaps, use.names = FALSE))

## fail early with a clear message instead of an obscure error further down
if (length(all_urls) == 0) {
  stop("No URLs found in the sitemaps listed at ", url_si, call. = FALSE)
}

## Compare and get the URLs that are in XML but not in Google Analytics

## GA page paths start with "/", urltools::path() returns the path without it.
## A URL with no path component (the homepage) must map to "/" - guard the
## missing value so it does not become the literal string "/NA".
url_paths <- urltools::path(all_urls)
sitemap_urls <- tibble(
  all_urls = all_urls,
  path = ifelse(is.na(url_paths), "/", paste0("/", url_paths))
)

## keep only sitemap URLs whose path never appears as a GA pagePath
sitemap_not_in_ga <- anti_join(sitemap_urls, pages, by = c(path = "pagePath"))

## write out to CSV, creating the output folder first if it is missing
out_file <- "./data/sitemap_urls_not_in_ga.csv"
dir.create(dirname(out_file), showWarnings = FALSE, recursive = TRUE)
write.csv(sitemap_not_in_ga, file = out_file, row.names = FALSE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Sorry, I only just saw this. It won't work in WordPress, which runs on PHP. This is an R script, meant to be run in R, for example locally on your laptop.