Created
June 20, 2022 11:37
-
-
Save mvolfik/a0875528358835cdf041fc126f155591 to your computer and use it in GitHub Desktop.
For a given Shopify store, crawl the list of products from the API and from the sitemaps, then check whether both contain the same products.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { gotScraping } from 'got-scraping';
// Shared HTTP client used for every request in this script. It is configured
// with the proxy URL taken from the PROXY_URL environment variable and a
// retry limit of 3.
const fetcher = gotScraping.extend({
  proxyUrl: process.env.PROXY_URL,
  retry: { limit: 3 },
});
/**
 * Compare the product handles a Shopify store lists in its product sitemaps
 * with the handles returned by its `/products.json` API, then print which
 * handles are missing on either side and how many appear in both.
 *
 * The store URL is read from `process.argv[2]`. The process exits with code 1
 * when that argument is not a valid URL, or when `/robots.txt` does not
 * contain the Shopify marker comment.
 *
 * @returns {Promise<void>}
 */
async function main() {
  let baseUrl;
  try {
    baseUrl = new URL(process.argv[2]);
  } catch {
    console.error(
      'Provide a Shopify store URL as a command line argument (including `http(s)://`)',
    );
    process.exit(1);
  }
  // Only the origin matters; drop whatever path the user pasted.
  baseUrl.pathname = '/';

  const robotsResponse = await fetcher.get(new URL('/robots.txt', baseUrl));
  if (
    !robotsResponse.body.includes('# we use Shopify as our ecommerce platform')
  ) {
    console.error("This doesn't seem to be a Shopify store");
    process.exit(1);
  }

  const productsFromSitemap = await collectSitemapHandles(baseUrl);
  const productsFromAPI = await collectApiHandles(baseUrl);
  const { notInSitemap, notInAPI, inBoth } = compareHandles(
    productsFromSitemap,
    productsFromAPI,
  );

  console.log('Missing from sitemap:', notInSitemap);
  console.log('Missing from API:', notInAPI);
  console.log('In both:', inBoth);
}

/**
 * Extract the product-sitemap URLs from the root sitemap index.
 *
 * `[^<]*` keeps each match inside a single `<loc>` element even when the XML
 * is served on one line, and the XML entity `&amp;` is decoded so that URLs
 * with query strings can be requested as-is.
 *
 * @param {string} rootSitemap - Body of `/sitemap.xml`.
 * @returns {string[]} Product sitemap URLs in document order.
 */
function extractProductSitemapUrls(rootSitemap) {
  const locPattern = /<loc>(https?:\/\/[^/]+\/sitemap_products_[^<]*)<\/loc>/g;
  return [...rootSitemap.matchAll(locPattern)].map((match) =>
    match[1].replaceAll('&amp;', '&'),
  );
}

/**
 * Extract product handles (the part after `/products/`) from one product sitemap.
 *
 * @param {string} sitemap - Body of a product sitemap.
 * @returns {string[]} Handles in document order (may contain duplicates).
 */
function extractProductHandles(sitemap) {
  const locPattern = /<loc>https?:\/\/[^/]+\/products\/([^<]*)<\/loc>/g;
  return [...sitemap.matchAll(locPattern)].map((match) => match[1]);
}

/**
 * Download the root sitemap and every product sitemap it references (in
 * parallel) and collect all product handles found in them.
 *
 * @param {URL} baseUrl - Store root URL.
 * @returns {Promise<Set<string>>} Unique handles found in the sitemaps.
 */
async function collectSitemapHandles(baseUrl) {
  const rootSitemapResponse = await fetcher.get(
    new URL('/sitemap.xml', baseUrl),
  );
  const handles = new Set();
  await Promise.all(
    extractProductSitemapUrls(rootSitemapResponse.body).map(async (url) => {
      const sitemapResponse = await fetcher.get(url);
      console.log(`Fetched sitemap ${url}`);
      for (const handle of extractProductHandles(sitemapResponse.body)) {
        handles.add(handle);
      }
    }),
  );
  return handles;
}

/**
 * Fetch one page of `/products.json` and return the handles it lists.
 *
 * @param {URL} baseUrl - Store root URL.
 * @param {number} page - 1-based page number.
 * @returns {Promise<string[]>} Handles of the products on that page.
 */
async function fetchProductHandlesPage(baseUrl, page) {
  const pageUrl = new URL('/products.json', baseUrl);
  pageUrl.searchParams.set('page', String(page));
  const response = await fetcher.get(pageUrl);
  console.log(`Fetched products page ${response.requestUrl.toString()}`);
  const { products } = JSON.parse(response.body);
  return products.map((product) => product.handle);
}

/**
 * Page through `/products.json` in parallel batches and collect all handles.
 *
 * Each batch covers the pages from the current start up to (excluding)
 * `ceil(start * 1.3)`, so batches grow as paging progresses. Paging stops
 * after the first batch that contributes no new handle.
 *
 * @param {URL} baseUrl - Store root URL.
 * @returns {Promise<Set<string>>} Unique handles returned by the API.
 */
async function collectApiHandles(baseUrl) {
  const handles = new Set();
  let firstPage = 1;
  while (true) {
    const sizeBefore = handles.size;
    const nextFirstPage = Math.ceil(firstPage * 1.3);
    const pageNumbers = Array.from(
      { length: nextFirstPage - firstPage },
      (_, offset) => firstPage + offset,
    );
    const batches = await Promise.all(
      pageNumbers.map((page) => fetchProductHandlesPage(baseUrl, page)),
    );
    for (const handle of batches.flat()) {
      handles.add(handle);
    }
    // Blank line separates the log output of consecutive batches.
    console.log();
    firstPage = nextFirstPage;
    if (handles.size === sizeBefore) {
      break;
    }
  }
  return handles;
}

/**
 * Diff the two handle sets.
 *
 * @param {Set<string>} sitemapHandles - Handles found in the sitemaps.
 * @param {Set<string>} apiHandles - Handles returned by the API.
 * @returns {{ notInSitemap: string[], notInAPI: string[], inBoth: number }}
 *   Sorted handles missing on each side and the size of the intersection.
 */
function compareHandles(sitemapHandles, apiHandles) {
  const notInSitemap = [...apiHandles]
    .filter((handle) => !sitemapHandles.has(handle))
    .sort();
  const notInAPI = [...sitemapHandles]
    .filter((handle) => !apiHandles.has(handle))
    .sort();
  const inBoth = apiHandles.size - notInSitemap.length;
  return { notInSitemap, notInAPI, inBoth };
}
// Script entry point. Report any failure and exit non-zero instead of
// leaving the returned promise without a rejection handler.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment