Created
August 6, 2025 01:42
-
-
Save wlib/1ec790c2ee8367ad97cab4f499ad5eab to your computer and use it in GitHub Desktop.
Scrape all Amazon orders (run in the browser console on amazon.com). Waits one second after each request, so it takes a long time to run for some accounts.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Order-history landing page; per-year listing URLs are derived from it and
// pagination links are validated against it.
const ordersUrl = "https://www.amazon.com/your-orders/orders"
/**
 * Resolve after a delay. Used to pause between consecutive requests.
 *
 * @param {number} [ms=1000] Delay in milliseconds.
 * @returns {Promise<void>} Promise that fulfils with `undefined` once the delay elapses.
 */
const wait = (ms = 1_000) =>
  new Promise(resolve => {
    setTimeout(resolve, ms)
  })
/**
 * Fetch a URL and parse the response body as an HTML document.
 *
 * @param {string | URL} url Address to request.
 * @returns {Promise<Document | undefined>} The parsed document, or `undefined`
 *   when the response status is not OK or the body is empty.
 */
const fetchDocument = async url => {
  const response = await fetch(url)
  if (!response.ok)
    return undefined
  const html = await response.text()
  if (!html)
    return undefined
  return Document.parseHTMLUnsafe(html)
}
/**
 * Load a URL in a temporary iframe and return a detached copy of the
 * resulting document, re-parsed from its serialized HTML.
 *
 * @param {string} url Address to load (the iframe's `contentDocument` is only
 *   readable for same-origin pages).
 * @returns {Promise<Document>} Parsed copy of the loaded page. Rejects if the
 *   loaded document cannot be read.
 */
const fetchDocumentViaIframe = url =>
  new Promise((resolve, reject) => {
    const iframe = document.createElement("iframe")
    iframe.addEventListener("load", () => {
      try {
        const html = iframe.contentDocument.documentElement.outerHTML
        resolve(Document.parseHTMLUnsafe(html))
      }
      catch (error) {
        // Surface the failure instead of leaving the promise pending forever.
        reject(error)
      }
      finally {
        // The HTML has been copied out; don't leave one iframe per page behind.
        iframe.remove()
      }
    }, { once: true })
    iframe.src = url
    // An iframe only starts loading once it is attached to a document.
    document.body.append(iframe)
  })
// Fetch the order-history landing page and build one listing URL per
// "year-…" entry of its time-filter dropdown.
const ordersPage = await fetchDocument(ordersUrl)
const timeFilterSelect = ordersPage?.querySelector("select[name = timeFilter]")
if (!timeFilterSelect)
  // fetchDocument resolves to undefined on a non-OK response, and the page may
  // lack the dropdown; fail with a readable message instead of a TypeError.
  throw new Error(`Could not find the order time filter on ${ordersUrl}`)
const ordersUrlsByYear = [...timeFilterSelect.options]
  .filter(option => option.value.startsWith("year-"))
  .map(option => {
    const url = new URL(ordersUrl)
    url.searchParams.set("timeFilter", option.value)
    return url.href
  })
// Every order ID discovered so far; a Set so repeated links count once.
const orderIds = new Set()

/**
 * Collect order IDs from the invoice links of a document into `orderIds`.
 *
 * A link counts when, resolved against `baseUrl`, it points at the printable
 * order summary page and carries a non-empty `orderID` query parameter.
 *
 * @param {Document} doc Document whose `<a>` elements are scanned.
 * @param {string | URL} baseUrl Base used to resolve relative link targets.
 * @returns {void}
 */
const extractOrderIds = (doc, baseUrl) => {
  for (const a of doc.querySelectorAll("a")) {
    let url
    try {
      url = new URL(a.href, baseUrl)
    }
    catch {
      // Unparseable link target: not an invoice link, move on.
      continue
    }
    if (!url.href.startsWith("https://www.amazon.com/gp/css/summary/print.html"))
      continue
    const orderId = url.searchParams.get("orderID")
    if (orderId)
      orderIds.add(orderId)
  }
}
// Visit every per-year orders listing, follow its pagination, and collect the
// order IDs found on each page.
for (const ordersPageUrl of ordersUrlsByYear) {
  const ordersPage = await fetchDocumentViaIframe(ordersPageUrl)
  await wait()
  extractOrderIds(ordersPage, ordersPageUrl)

  // Pagination links carry a `startIndex` query parameter; the largest value
  // determines how many further pages this listing has.
  const startIndexes = [...ordersPage.querySelectorAll(".a-pagination a")]
    .flatMap(a => {
      let url
      try {
        url = new URL(a.href, ordersPageUrl)
      }
      catch {
        return []
      }
      if (!url.href.startsWith(ordersUrl))
        return []
      const startIndex = Number.parseInt(url.searchParams.get("startIndex"), 10)
      return Number.isNaN(startIndex) ? [] : [startIndex]
    })
  const maxStartIndex = Math.max(0, ...startIndexes)
  if (maxStartIndex === 0)
    continue

  // Pages are addressed in steps of 10 (startIndex = 10, 20, …).
  const ordersPerPage = 10
  const otherPageUrls = Array.from(
    { length: Math.floor(maxStartIndex / ordersPerPage) },
    (_, i) => {
      const url = new URL(ordersPageUrl)
      url.searchParams.set("startIndex", String((i + 1) * ordersPerPage))
      return url.href
    },
  )
  for (const otherPageUrl of otherPageUrls) {
    const otherPage = await fetchDocumentViaIframe(otherPageUrl)
    await wait()
    extractOrderIds(otherPage, otherPageUrl)
  }
}
/**
 * Read the order date and purchased items from an invoice page that uses
 * `data-component` attributes.
 *
 * Every field is best-effort: a value that cannot be found or parsed is
 * reported as `undefined` rather than aborting the extraction.
 *
 * @param {Document} invoicePage Parsed invoice document.
 * @param {string | URL} baseUrl Base used to resolve relative image and product links.
 * @returns {{ orderDate: string | undefined, items: Array<{
 *   image: string | undefined, title: string | undefined, url: string | undefined,
 *   unitPrice: string | undefined, quantity: number | undefined }> }}
 */
const extractModernInvoiceInfo = (invoicePage, baseUrl) => {
  const orderDate = invoicePage.querySelector("[data-component = orderDate]")?.textContent.trim()
  const items = [...invoicePage.querySelectorAll("[data-component = purchasedItems] .a-fixed-left-grid")]
    .map(purchasedItem => {
      const imageContainer = purchasedItem.querySelector("[data-component = itemImage]")

      // Only build a URL when there is a src: `new URL(undefined, base)` does
      // not throw, it resolves to ".../undefined".
      const imageSrc = imageContainer?.querySelector("img")?.getAttribute("src")
      let image
      if (imageSrc) {
        try {
          image = new URL(imageSrc, baseUrl).href
        }
        catch {
          // Malformed src: leave the image undefined.
        }
      }

      const parsedQuantity = Number.parseInt(
        imageContainer?.querySelector(".od-item-view-qty")?.textContent.trim(),
        10,
      )
      const quantity = Number.isNaN(parsedQuantity) ? undefined : parsedQuantity

      const titleLink = purchasedItem.querySelector("[data-component = itemTitle] a")
      const title = titleLink?.textContent.trim()
      let url
      if (titleLink) {
        try {
          const productUrl = new URL(titleLink.href, baseUrl)
          // Drop the query string from the product link.
          productUrl.search = ""
          url = productUrl.href
        }
        catch {
          // Malformed href: leave the url undefined.
        }
      }

      const unitPrice = purchasedItem.querySelector("[data-component = unitPrice] .a-offscreen")?.textContent.trim()
      return { image, title, url, unitPrice, quantity }
    })
  return { orderDate, items }
}
/**
 * Read the order date and purchased items from a table-based invoice page.
 *
 * The order date is taken from the innermost table cell containing
 * "Order Placed:". An item is any row that has both an `<i>` element (its text
 * is the title) and a right-aligned cell containing "$" (its text is the
 * price). The quantity is read from an "N of:" prefix in the node before the
 * `<i>` and defaults to 1. Rows sharing title and price are merged by summing
 * their quantities.
 *
 * @param {Document} invoicePage Parsed invoice document.
 * @param {string | URL} baseUrl Unused; kept so both extractors share a signature.
 * @returns {{ orderDate: string | undefined, items: Array<{
 *   title: string, unitPrice: string, quantity: number }> }}
 */
const extractLegacyInvoiceInfo = (invoicePage, baseUrl) => {
  const orderDateCell = [...invoicePage.querySelectorAll("td:not(:has(td))")]
    .find(td => td.textContent.includes("Order Placed:"))
  const orderDate = orderDateCell?.textContent.replace("Order Placed:", "").trim()

  const itemsByKey = new Map()
  for (const row of invoicePage.querySelectorAll("tr")) {
    const italicText = row.querySelector("i")
    const priceCell = row.querySelector("td[align = right]")
    if (!italicText || !priceCell || !priceCell.textContent.includes("$"))
      continue

    const parsedQuantity = Number.parseInt(
      italicText.previousSibling?.textContent.match(/(\d+)\s+of:/)?.[1],
      10,
    )
    const quantity = Number.isNaN(parsedQuantity) ? 1 : parsedQuantity
    const title = italicText.textContent.trim()
    const unitPrice = priceCell.textContent.trim()

    // NUL separates the two parts of the key.
    const key = `${title}\0${unitPrice}`
    const existing = itemsByKey.get(key)
    if (existing)
      existing.quantity += quantity
    else
      itemsByKey.set(key, { title, unitPrice, quantity })
  }

  const items = [...itemsByKey.values()]
  return { orderDate, items }
}
// Fetch the printable invoice of every collected order and extract its
// details, keyed by order ID. The result is logged once all orders are done.
const orderIdToInfo = {}
for (const orderId of orderIds) {
  const invoiceUrl = new URL("https://www.amazon.com/gp/css/summary/print.html")
  invoiceUrl.searchParams.set("orderID", orderId)
  const invoicePage = await fetchDocument(invoiceUrl)
  await wait()
  if (!invoicePage) {
    // fetchDocument resolves to undefined on a non-OK response; skip this
    // order rather than crash and lose everything gathered so far.
    console.warn(`Skipping order ${orderId}: invoice page could not be fetched`)
    continue
  }
  // Pages with an orderDate data-component use the modern layout; all others
  // are handled by the legacy extractor.
  const isModernLayout = invoicePage.querySelector("[data-component = orderDate]") !== null
  orderIdToInfo[orderId] = isModernLayout
    ? extractModernInvoiceInfo(invoicePage, invoiceUrl)
    : extractLegacyInvoiceInfo(invoicePage, invoiceUrl)
}
console.log(orderIdToInfo)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment