-
-
Save admariner/8ad2e72d9aa7cf22ef7e4327acedaef2 to your computer and use it in GitHub Desktop.
Find pages with 0 pageviews by comparing the URLs listed in sitemap.xml with the pages recorded in Google Analytics.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(googleAnalyticsR)
library(xml2)
library(dplyr)

## authenticate with Google Analytics before any API call is made
ga_auth()

## date range of URLs to test (the last 30 days up to today)
dates <- c(Sys.Date() - 30, Sys.Date())

## GA View ID (placeholder value; replace with your own view)
id <- 11111111
## function to get sitemap URLs
##
## sitemap: URL or path of a sitemap (or sitemap index) XML document,
##          passed to xml2::read_xml().
## field:   name of the element to extract from each entry (default "loc").
##
## Returns a character vector with the text of every matching element, or
## NULL (after a message) when the sitemap cannot be read or contains no
## such element.
get_sitemap <- function(sitemap, field = "loc") {
  ## Download, parsing and extraction all run inside tryCatch() so that one
  ## broken sitemap is reported and skipped instead of aborting the script.
  out <- tryCatch(
    {
      doc <- read_xml(sitemap)
      ## root > entry (<url>/<sitemap>) > field. local-name() ignores the
      ## default sitemap namespace, and querying the document directly does
      ## not depend on how as_list() nests the root node.
      xpath <- sprintf("/*/*/*[local-name() = '%s']", field)
      xml_text(xml_find_all(doc, xpath), trim = TRUE)
    },
    error = function(e) NULL
  )
  if (is.null(out) || length(out) == 0) {
    message("Problem with sitemap:", sitemap)
    return(NULL)
  }
  out
}
## make google SEO filter: source is exactly "google" AND medium is
## exactly "organic"
google_seo <- filter_clause_ga4(
  list(
    dim_filter("source", "EXACT", "google"),
    dim_filter("medium", "EXACT", "organic")
  ),
  operator = "AND"
)
## get the pages: one row per pagePath seen in the date range, restricted
## to the Google organic filter defined in `google_seo`
pages <- google_analytics_4(
  id,
  date_range = dates,
  dimensions = "pagePath",
  metrics = c("pageviews", "totalEvents"),
  dim_filters = google_seo,
  max = -1,            # -1 requests all rows rather than the default cap
  anti_sample = TRUE   # split the request to avoid sampled results
)
## get the sitemap index file
url_si <- "http://www.example.com/sitemap.xml"
sitemap_index <- get_sitemap(url_si)

## get all the sitemaps (maybe you only need the call above if you have no sitemap index)
many_sitemaps <- lapply(sitemap_index, get_sitemap)

## all the urls in all the sitemaps; sitemaps that failed contribute NULL
## and are dropped by unlist(), duplicates across sitemaps are removed
all_urls <- unique(unlist(many_sitemaps, use.names = FALSE))

## an empty result would otherwise produce a misleading empty report
if (length(all_urls) == 0) {
  stop("No URLs could be read from the sitemaps listed in ", url_si,
       call. = FALSE)
}
## Compare and get the URLs that are in XML but not in Google Analytics
## dplyr transformations
sitemap_urls <- tibble(all_urls = all_urls) %>%
  mutate(
    path = urltools::path(all_urls),
    ## a URL without a path (e.g. the homepage) has no path component;
    ## map a missing value to "" so it becomes "/" rather than "/NA"
    path = paste0("/", if_else(is.na(path), "", path))
  )

## keep the sitemap rows whose path has no matching pagePath in `pages`
sitemap_not_in_ga <- anti_join(sitemap_urls, pages, by = c(path = "pagePath"))

## write out to CSV, creating the output directory if it does not exist yet
out_file <- "./data/sitemap_urls_not_in_ga.csv"
dir.create(dirname(out_file), showWarnings = FALSE, recursive = TRUE)
write.csv(sitemap_not_in_ga, file = out_file, row.names = FALSE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment