Created
November 2, 2021 14:26
-
-
Save coffee-mug/142d2dd29c7222de88ffd15cb5086846 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fs = require('fs'); | |
const path = require('path'); | |
// Usage: node main.js [optional har file path] [optional JSON dataLayer file path] | |
// Iterate over all files in $folderName, grouping them by name and then performing
// audit: hostname extraction, flagging dataLayer values observed in requests and exporting
// results as a csv file in the current folder.
// REQUIREMENTS: | |
// * files should have the same name: fnac_home.json and fnac_home.har | |
// Change folderName with the name of the folders hosting your .har and .json files | |
// Script entry point: run the audit on the "infos" folder. Calling main() before its
// definition works because main is a function declaration and is therefore hoisted.
main("infos"); | |
/**
 * Audit every HAR / dataLayer pair found in a folder.
 *
 * Files are paired by base name (`fnac_home.har` with `fnac_home.json`). Files with
 * another extension are ignored, and a `.har` or `.json` file without its counterpart
 * is reported and skipped instead of aborting the whole run. Each pair is handed to
 * compareRequestsAndDatalayer and the resulting rows are written to `audit.csv` in the
 * current working directory. Finally mergeDatalayerFiles is asked to consolidate the
 * JSON files of the same folder.
 *
 * @param {string} folderName - Folder hosting the .har and .json files.
 * @returns {void}
 */
function main(folderName) {
  console.log("Debug: ", folderName);

  const files = fs.readdirSync(folderName);
  if (files.length === 0) {
    console.log("No files to parse, exit.");
    return;
  }
  console.log(`${files.length} files to group`);

  const filesTuples = pairHarWithDataLayer(files);
  if (filesTuples.length === 0) {
    console.log("No matching .har/.json pairs to audit, exit.");
    return;
  }

  // The header row comes first so the exported file is readable on its own.
  const results = [['Parent page', 'Hit', 'Hit Hostname', "DataLayer values present in the hit"]];
  filesTuples.forEach(([harFile, dataLayerFile]) => {
    console.log(`Comparing ${harFile} with ${dataLayerFile}`);
    const output = compareRequestsAndDatalayer(
      path.join(folderName, harFile),
      path.join(folderName, dataLayerFile)
    );
    output.forEach(row => results.push(row));
  });

  // debug
  console.log(results.slice(0, 4));

  // Export a csv list of origin url, hit, hit domain, dataLayer values present in hit.
  // Fields are quoted when needed so that commas, quotes or line breaks found in a page
  // title or a dataLayer value cannot shift the columns.
  const csv = results.map(row => row.map(toCsvField).join(',')).join('\n');
  fs.writeFileSync("audit.csv", csv);

  // Consolidate the JSON files of the audited folder in a single JSON object.
  mergeDatalayerFiles(folderName);
}

/**
 * Build the [harFile, jsonFile] tuples for every base name that has both files.
 * Declared as a function (hoisted) because main() is invoked at the top of the script.
 *
 * @param {string[]} files - File names as returned by fs.readdirSync.
 * @returns {string[][]} Tuples sorted by file name.
 */
function pairHarWithDataLayer(files) {
  const knownFiles = new Set(files);
  const pairs = [];
  files.slice().sort().forEach(file => {
    const extension = path.extname(file);
    if (extension !== ".har" && extension !== ".json") {
      return; // unrelated file (.DS_Store, notes, ...)
    }
    const counterpart = path.basename(file, extension) + (extension === ".har" ? ".json" : ".har");
    if (!knownFiles.has(counterpart)) {
      console.log(`Skipping ${file}: no matching ${counterpart}`);
    } else if (extension === ".har") {
      // Only push from the .har side so each pair is emitted once.
      pairs.push([file, counterpart]);
    }
  });
  return pairs;
}

/**
 * Serialise one value as a CSV field, quoting it when it contains a comma,
 * a double quote or a line break (embedded quotes are doubled).
 *
 * @param {*} value - Cell value.
 * @returns {string} CSV-safe text.
 */
function toCsvField(value) {
  const text = String(value);
  if (!/[",\r\n]/.test(text)) {
    return text;
  }
  return `"${text.replace(/"/g, '""')}"`;
}
/**
 * Recursively flatten a value of arbitrary depth so that every primitive leaf ends up
 * at the root level of `output`, keyed by the name of the property that held it.
 * Useful to bring every digitalData value to the same level.
 *
 * When the same key is met several times (array items, or the same property name in
 * different branches) the values are accumulated in an array.
 *
 * Example:
 *   const nestedObject = { a: { b: 1 }, c: { d: { e: 3 } }, f: [1, 2, { g: "lol" }] };
 *   flattenObject(nestedObject, null, {});
 *   // { b: 1, e: 3, f: [1, 2], g: "lol" }
 *
 * @param {*} object - Value to flatten (object, array or primitive).
 * @param {?string} key - Property name under which `object` was found (null at the root).
 * @param {Object} output - Accumulator, mutated in place.
 * @returns {Object} The `output` accumulator.
 */
function flattenObject(object, key, output) {
  // An array? Recurse on each item, keeping the key of the array itself.
  if (Array.isArray(object)) {
    object.forEach(value => flattenObject(value, key, output));
    return output;
  }

  // A plain object? Recurse on each property. `null` is excluded explicitly because
  // `typeof null` is "object" and Object.keys(null) throws.
  if (object !== null && typeof object === "object") {
    Object.keys(object).forEach(property => flattenObject(object[property], property, output));
    return output;
  }

  // A primitive (or null) - Set, Map, ... are not handled for now.
  // Only own properties count as "already seen": a key such as "constructor" must not
  // be confused with the member inherited from Object.prototype.
  if (!Object.prototype.hasOwnProperty.call(output, key)) {
    output[key] = object;
  } else if (Array.isArray(output[key])) {
    output[key].push(object);
  } else {
    output[key] = [output[key], object];
  }
  return output;
}
/**
 * Merge every .json file of a folder into a single JSON document.
 *
 * Files are processed in lexicographic order and merged with Object.assign, so when
 * two files define the same key the value of the later file wins. Each file also sets
 * the AUDITED_PAGE key to its own name without extension, underscores turned into
 * spaces. Array values are serialised as JSON strings in the output file.
 *
 * @param {string} [folder="infos"] - Folder containing the dataLayer .json files.
 * @param {string} [outputFile="merged.json"] - Path of the file to write.
 * @returns {void}
 */
function mergeDatalayerFiles(folder = "infos", outputFile = "merged.json") {
  const merged = {};

  fs.readdirSync(folder)
    .filter(file => file.endsWith(".json"))
    // readdirSync gives no ordering guarantee; sort so the merge precedence is stable.
    .sort()
    .forEach(file => {
      const content = JSON.parse(fs.readFileSync(path.join(folder, file), "utf8"));
      // Strip the extension as a suffix only and replace every underscore, not just the first.
      content["AUDITED_PAGE"] = path.basename(file, ".json").replace(/_/g, ' ');
      Object.assign(merged, content);
    });

  const serialized = JSON.stringify(merged, (key, value) => {
    // Arrays are kept on a single line by storing them as their JSON text.
    if (Array.isArray(value)) {
      return JSON.stringify(value);
    }
    return value;
  }, 2);
  fs.writeFileSync(outputFile, serialized);
}
/**
 * Cross-reference the requests recorded in a HAR file with the values of a dataLayer
 * JSON file.
 *
 * Requests whose URL path ends with a static-resource extension (css, js, images,
 * fonts, documents, ...) are discarded. Every remaining request produces one row:
 *   [parent page title, decoded hit URL (commas replaced by '-'), hit hostname
 *    without a leading "www.", "key=value" dataLayer entries found in the URL joined by ';'].
 *
 * @param {string} [reqs] - Path of the HAR file; falls back to process.argv[2].
 * @param {string} [dL] - Path of the dataLayer JSON file; falls back to process.argv[3].
 *   When no path is available the last column of every row is empty.
 * @returns {Array<Array<string>>} One row per retained request.
 * @throws {TypeError} When no HAR file path is available.
 */
function compareRequestsAndDatalayer(reqs, dL) {
  const requestsPath = reqs || process.argv[2];
  const dataLayerPath = dL || process.argv[3];
  if (!requestsPath) {
    throw new TypeError("compareRequestsAndDatalayer: no HAR file path provided");
  }

  // The HAR file is parsed once and reused for the counters, the title and the rows.
  const parsedRequests = JSON.parse(fs.readFileSync(requestsPath, "utf8"));
  const dataLayer = dataLayerPath ? JSON.parse(fs.readFileSync(dataLayerPath, "utf8")) : null;

  // We're interested in PII or other juicy dataLayer values so filter out dumb data like
  // single digits or "true" "false" that may be visible in hits while not being taken
  // from the dataLayer.
  const dataLayerEntries = dataLayer
    ? Object.entries(dataLayer).filter(tuple => isAuditableDataLayerValue(tuple[1]))
    : [];

  const entries = parsedRequests.log.entries;
  // `pages` may be absent or empty in a HAR file: fall back to an empty title.
  const pages = parsedRequests.log.pages;
  const firstPage = Array.isArray(pages) ? pages[0] : undefined;
  const parentPage = firstPage && firstPage.title != null ? firstPage.title : "";

  console.log("Requests length before filtering out extensions\n", entries.length);
  const filtered = entries
    .map(entry => describeHitUrl(entry.request.url))
    // The extension is looked up in the URL path only: tracking hits routinely carry the
    // audited page URL (".../home.html") in their query string and must not be dropped.
    .filter(hit => !/\.(?:css|html|js|jpe?g|png|woff2|svg|pdf|docx?|php|json)$/.test(hit.pathname));
  console.log("Requests length after filtering out extensions\n", filtered.length);

  return filtered.map(hit => [
    // Parent page
    parentPage,
    // Hit URL
    hit.decodedUrl.replace(/,/g, '-'),
    // Hit hostname
    hit.hostname,
    // Datalayer values present in the hit
    dataLayerEntries
      .filter(tuple => hit.decodedUrl.includes(String(tuple[1])))
      .map(tuple => tuple.join('='))
      .join(';'),
  ]);
}

/**
 * Tell whether a dataLayer value is worth looking for in the hits: a string, number or
 * array longer than one character/item that is not exactly "true" or "false".
 * null, booleans and nested objects are rejected instead of crashing on `.length`.
 * Declared as a function (hoisted) because the script runs main() before this point.
 *
 * @param {*} value - Raw dataLayer value.
 * @returns {boolean} True when the value should be searched in request URLs.
 */
function isAuditableDataLayerValue(value) {
  const candidate = typeof value === "number" ? String(value) : value;
  if (typeof candidate !== "string" && !Array.isArray(candidate)) {
    return false;
  }
  return candidate.length > 1 && !/^(?:true|false)$/.test(candidate);
}

/**
 * Extract what the audit needs from a request URL without ever throwing on odd input.
 *
 * @param {string} rawUrl - URL as recorded in the HAR entry.
 * @returns {{decodedUrl: string, hostname: string, pathname: string}} Decoded URL (raw
 *   URL when it holds a malformed percent sequence), hostname without a leading "www."
 *   (empty when the URL cannot be parsed) and URL path.
 */
function describeHitUrl(rawUrl) {
  const url = String(rawUrl);

  let decodedUrl = url;
  try {
    decodedUrl = decodeURIComponent(url);
  } catch (error) {
    // A stray '%' makes decodeURIComponent throw URIError; keep the raw URL in that case.
    if (!(error instanceof URIError)) {
      throw error;
    }
  }

  let hostname = "";
  let pathname = url.split(/[?#]/)[0];
  try {
    const parsed = new URL(url);
    hostname = parsed.hostname.replace(/^www\./, '');
    pathname = parsed.pathname;
  } catch (error) {
    console.log(`Unparseable request URL, hostname left empty: ${url}`);
  }

  return { decodedUrl, hostname, pathname };
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment