Skip to content

Instantly share code, notes, and snippets.

@wlib
Created August 6, 2025 01:42
Show Gist options
  • Save wlib/1ec790c2ee8367ad97cab4f499ad5eab to your computer and use it in GitHub Desktop.
Save wlib/1ec790c2ee8367ad97cab4f499ad5eab to your computer and use it in GitHub Desktop.
Scrape all Amazon orders (run in the browser console on amazon.com). Waits one second after each request. Takes a long time to run for some accounts...
// Base URL of the order-history page. The per-year filter (`timeFilter`) and the
// pagination offset (`startIndex`) are added to it as query parameters further down.
const ordersUrl = "https://www.amazon.com/your-orders/orders"
/**
 * Pauses for a while; used between requests as a simple rate limit.
 *
 * @param {number} [ms=1000] How long to wait, in milliseconds.
 * @returns {Promise<undefined>} Resolves (with no value) once the delay has elapsed.
 */
const wait = (ms = 1_000) => new Promise(done => setTimeout(() => done(), ms))
/**
 * Downloads `url` with `fetch` and parses the response body into a detached Document.
 *
 * @param {string | URL} url Page to download.
 * @returns {Promise<Document | undefined>} The parsed document, or `undefined` when the
 *   response status is not OK or the body is empty.
 */
const fetchDocument = async url => {
  const response = await fetch(url)
  // The body is only read for successful responses.
  const markup = response.ok ? await response.text() : undefined
  if (!markup) {
    return undefined
  }
  return Document.parseHTMLUnsafe(markup)
}
/**
 * Loads `url` in a temporary hidden iframe and returns a detached copy of the
 * document as it looks when the iframe's `load` event fires.
 *
 * The iframe has to be attached to the page: an iframe that is not connected to a
 * document never navigates, so its `load` event would never fire and the returned
 * promise would never settle. The iframe is removed again once it has been read.
 *
 * @param {string} url Page to load (must be readable from this page, i.e. same-origin).
 * @returns {Promise<Document>} Resolves with a document parsed from the iframe's
 *   serialized HTML; rejects if the iframe's content cannot be read.
 */
const fetchDocumentViaIframe = async url =>
  new Promise((resolve, reject) => {
    const iframe = document.createElement("iframe")
    iframe.style.display = "none"
    iframe.addEventListener("load", () => {
      try {
        const html = iframe.contentDocument.documentElement.outerHTML
        resolve(Document.parseHTMLUnsafe(html))
      }
      catch (error) {
        // contentDocument is null when the framed page is not accessible from here.
        reject(new Error(`Could not read iframe content for ${url}`, { cause: error }))
      }
      finally {
        iframe.remove()
      }
    }, { once: true })
    // Set src before attaching so the first load event belongs to `url`, not about:blank.
    iframe.src = url
    document.body.append(iframe)
  })
// Load the order-history landing page and derive one URL per year from the
// options of its time-filter dropdown (only options whose value starts with "year-").
const ordersPage = await fetchDocument(ordersUrl)
if (!ordersPage) {
  // fetchDocument yields undefined for a non-OK response or an empty body.
  throw new Error(`Could not load the orders page at ${ordersUrl}`)
}
const timeFilterSelect = ordersPage.querySelector("select[name = timeFilter]")
if (!timeFilterSelect) {
  throw new Error(`No time filter found on ${ordersUrl} (are you signed in?)`)
}
const ordersUrlsByYear =
  [...timeFilterSelect.options]
    .flatMap(option => {
      if (!option.value.startsWith("year-")) {
        return []
      }
      const url = new URL(ordersUrl)
      url.searchParams.set("timeFilter", option.value)
      return [url.href]
    })
// Every order ID found so far. A Set, so an order linked more than once is kept once.
const orderIds = new Set()

/**
 * Scans all links in `doc` and records the `orderID` query parameter of every link
 * that points at the printable invoice page into the shared `orderIds` set.
 *
 * @param {Document} doc Parsed page to scan.
 * @param {string | URL} baseUrl Base used to resolve relative link targets.
 * @returns {void}
 */
const extractOrderIds = (doc, baseUrl) => {
  const invoicePrefix = "https://www.amazon.com/gp/css/summary/print.html"

  // Links whose target cannot be parsed as a URL are ignored.
  const toUrl = href => {
    try {
      return new URL(href, baseUrl)
    }
    catch {
      return undefined
    }
  }

  Array.from(doc.querySelectorAll("a"))
    .map(link => toUrl(link.href))
    .filter(target => target?.href.startsWith(invoicePrefix))
    .map(target => target.searchParams.get("orderID"))
    .filter(Boolean)
    .forEach(id => orderIds.add(id))
}
// Walk through each year's order history: read the first page, work out from its
// pagination links how many further pages exist, then read those too. Every page
// is loaded through an iframe and followed by a pause.
for (const ordersPageUrl of ordersUrlsByYear) {
  const firstPage = await fetchDocumentViaIframe(ordersPageUrl)
  await wait()
  extractOrderIds(firstPage, ordersPageUrl)

  // Gather the startIndex offsets advertised by the pagination links.
  const startIndexes = []
  for (const link of firstPage.querySelectorAll(".a-pagination a")) {
    let target
    try {
      target = new URL(link.href, ordersPageUrl)
    }
    catch {
      continue
    }
    if (!target.href.startsWith(ordersUrl)) {
      continue
    }
    const startIndex = parseInt(target.searchParams.get("startIndex"))
    if (!Number.isNaN(startIndex)) {
      startIndexes.push(startIndex)
    }
  }

  const maxStartIndex = startIndexes.length > 0 ? Math.max(...startIndexes) : undefined
  if (!maxStartIndex) {
    // No pagination (or only offset 0): this year fits on a single page.
    continue
  }

  // Offsets advance in steps of 10, so request 10, 20, ... up to the largest one seen.
  const extraPageCount = Math.trunc(maxStartIndex / 10)
  for (let pageNumber = 1; pageNumber <= extraPageCount; pageNumber++) {
    const pageUrl = new URL(ordersPageUrl)
    pageUrl.searchParams.set("startIndex", pageNumber * 10)
    const page = await fetchDocumentViaIframe(pageUrl.href)
    await wait()
    extractOrderIds(page, ordersPageUrl)
  }
}
/**
 * Extracts the order date and purchased items from an invoice page that uses the
 * `data-component` markup.
 *
 * Any field whose element is missing comes back as `undefined`. In particular, a
 * missing image or link yields `undefined` rather than a URL built from the text
 * "undefined"/"null" (which `new URL(undefined, base)` would otherwise produce).
 *
 * @param {Document} invoicePage Parsed invoice page.
 * @param {string | URL} baseUrl Base used to resolve relative image and product URLs.
 * @returns {{ orderDate: string | undefined, items: Array<{ image: string | undefined,
 *   title: string | undefined, url: string | undefined, unitPrice: string | undefined,
 *   quantity: number | undefined }> }}
 */
const extractModernInvoiceInfo = (invoicePage, baseUrl) => {
  // Resolves a raw attribute value against baseUrl; absent, empty or unparseable values give undefined.
  const resolveUrl = raw => {
    if (!raw) {
      return undefined
    }
    try {
      return new URL(raw, baseUrl)
    }
    catch {
      return undefined
    }
  }

  const orderDate = invoicePage.querySelector("[data-component = orderDate]")?.textContent.trim()
  const items = [...invoicePage.querySelectorAll("[data-component = purchasedItems] .a-fixed-left-grid")]
    .map(purchasedItem => {
      const imageContainer = purchasedItem.querySelector("[data-component = itemImage]")
      const image = resolveUrl(imageContainer?.querySelector("img")?.getAttribute("src"))?.href

      const parsedQuantity = Number.parseInt(
        imageContainer?.querySelector(".od-item-view-qty")?.textContent.trim(),
        10
      )
      const quantity = Number.isNaN(parsedQuantity) ? undefined : parsedQuantity

      const titleLink = purchasedItem.querySelector("[data-component = itemTitle] a")
      const title = titleLink?.textContent.trim()
      const productUrl = resolveUrl(titleLink?.href)
      if (productUrl) {
        // Drop the query string so only the bare product URL is kept.
        productUrl.search = ""
      }
      const url = productUrl?.href

      const unitPrice = purchasedItem.querySelector("[data-component = unitPrice] .a-offscreen")?.textContent.trim()
      return { image, title, url, unitPrice, quantity }
    })
  return { orderDate, items }
}
/**
 * Extracts the order date and purchased items from an invoice page that uses the
 * table-based layout.
 *
 * The date is read from the first innermost table cell containing "Order Placed:".
 * An item is any table row that has italic text (taken as the title) and a
 * right-aligned cell containing "$" (taken as the price). The quantity comes from an
 * "N of:" prefix just before the title and defaults to 1. Rows with the same title
 * and price are merged by adding up their quantities.
 *
 * @param {Document} invoicePage Parsed invoice page.
 * @param {string | URL} baseUrl Unused; accepted so the signature matches the modern extractor.
 * @returns {{ orderDate: string | undefined,
 *   items: Array<{ title: string, unitPrice: string, quantity: number }> }}
 */
const extractLegacyInvoiceInfo = (invoicePage, baseUrl) => {
  const dateLabel = "Order Placed:"
  let orderDate
  for (const cell of invoicePage.querySelectorAll("td:not(:has(td))")) {
    if (cell.textContent.includes(dateLabel)) {
      orderDate = cell.textContent.replace(dateLabel, "").trim()
      break
    }
  }

  // Keyed by title + price (NUL-separated) so repeated rows collapse into one entry.
  const merged = new Map()
  for (const row of invoicePage.querySelectorAll("tr")) {
    const titleNode = row.querySelector("i")
    const priceNode = row.querySelector("td[align = right]")
    if (!titleNode || !priceNode || !priceNode.textContent.includes("$")) {
      continue
    }

    const countMatch = titleNode.previousSibling?.textContent.match(/(\d+)\s+of:/)
    const quantity = countMatch ? parseInt(countMatch[1], 10) : 1
    const title = titleNode.textContent.trim()
    const unitPrice = priceNode.textContent.trim()

    const key = `${title}\0${unitPrice}`
    const existing = merged.get(key)
    if (existing) {
      existing.quantity += quantity
    }
    else {
      merged.set(key, { title, unitPrice, quantity })
    }
  }

  return { orderDate, items: [...merged.values()] }
}
// Fetch the printable invoice of every collected order and extract its details,
// keyed by order ID. One invoice that cannot be fetched (request rejected, non-OK
// response, empty body) is reported and skipped rather than aborting the whole run
// and losing everything gathered so far.
const orderIdToInfo = {}
for (const orderId of orderIds) {
  const invoiceUrl = new URL("https://www.amazon.com/gp/css/summary/print.html")
  invoiceUrl.searchParams.set("orderID", orderId)

  let invoicePage
  let failure
  try {
    invoicePage = await fetchDocument(invoiceUrl)
  }
  catch (error) {
    failure = error
  }
  // Pause after every request, successful or not.
  await wait()

  if (!invoicePage) {
    console.warn(`Skipping order ${orderId}: invoice could not be fetched`, failure ?? "(non-OK or empty response)")
    continue
  }

  // Pages carrying an orderDate data-component use the modern layout; anything else is treated as legacy.
  orderIdToInfo[orderId] =
    invoicePage.querySelector("[data-component = orderDate]")
      ? extractModernInvoiceInfo(invoicePage, invoiceUrl)
      : extractLegacyInvoiceInfo(invoicePage, invoiceUrl)
}
console.log(orderIdToInfo)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment