Skip to content

Instantly share code, notes, and snippets.

@mpkocher
Last active August 6, 2024 01:18
Show Gist options
  • Save mpkocher/be674875e28c2e6ee5b853f2160a94a7 to your computer and use it in GitHub Desktop.
Save mpkocher/be674875e28c2e6ee5b853f2160a94a7 to your computer and use it in GitHub Desktop.
Gibson.com Scraper

Gibson

Mod collection details

For unclear reasons, the underlying call returns a lot of JSON: roughly 1.4 MB(!) compressed over the wire (about 7 MB uncompressed).

This is why the UI is sluggish on initial load.

The backend call is a POST to https://www.gibson.com/en-US/Collection/GetProductGridList.

curl 'https://www.gibson.com/en-US/Collection/GetProductGridList' --compressed -X POST -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:128.0) Gecko/20100101 Firefox/128.0' -H 'Accept: */*' -H 'Accept-Language: en-US,en;q=0.5' -H 'Accept-Encoding: gzip, deflate, br, zstd' -H 'Referer: https://www.gibson.com/en-US/Collection/gibson-mod?filter=iso' -H 'Content-Type: application/json' -H 'Origin: https://www.gibson.com' -H 'DNT: 1' -H 'Sec-GPC: 1' -H 'Connection: keep-alive' -H 'Sec-Fetch-Dest: empty' -H 'Sec-Fetch-Mode: cors' -H 'Sec-Fetch-Site: same-origin' -H 'Priority: u=4' -H 'TE: trailers' --data-raw '{"ComponentIds":["4Bmy2OdwhOY372zwmjPn16"],"InStockOnly":true}'

The useful headers are:

POST /en-US/Collection/GetProductGridList HTTP/2

User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:128.0) Gecko/20100101 Firefox/128.0
Accept: */*
Accept-Language: en-US,en;q=0.5
Accept-Encoding: gzip, deflate, br, zstd
Referer: https://www.gibson.com/en-US/Collection/gibson-mod?filter=iso
Content-Type: application/json

Gibson Custom Select

https://www.gibson.com/en-US/Collection/gibson-custom-select

CURL

curl 'https://www.gibson.com/en-US/Collection/GetProductGridList' --compressed -X POST -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:128.0) Gecko/20100101 Firefox/128.0' -H 'Accept: */*' -H 'Accept-Language: en-US,en;q=0.5' -H 'Accept-Encoding: gzip, deflate, br, zstd' -H 'Referer: https://www.gibson.com/en-US/Collection/gibson-custom-select' -H 'Content-Type: application/json' -H 'Origin: https://www.gibson.com' -H 'DNT: 1' -H 'Sec-GPC: 1' -H 'Connection: keep-alive' -H 'Sec-Fetch-Dest: empty' -H 'Sec-Fetch-Mode: cors' -H 'Sec-Fetch-Site: same-origin' -H 'Priority: u=4' -H 'TE: trailers' --data-raw '{"ComponentIds":["5RjxBwISl2NnlLZ34cVHou"],"InStockOnly":false}'

Headers for Select

POST /en-US/Collection/GetProductGridList HTTP/2
Host: www.gibson.com
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:128.0) Gecko/20100101 Firefox/128.0
Accept: */*
Accept-Language: en-US,en;q=0.5
Accept-Encoding: gzip, deflate, br, zstd
Referer: https://www.gibson.com/en-US/Collection/gibson-custom-select
Content-Type: application/json
Content-Length: 63
Origin: https://www.gibson.com
DNT: 1
Sec-GPC: 1
Connection: keep-alive
Priority: u=4
TE: trailers

Details of Each Listing

def g(jx):
    """Yield one flat tuple per SKU found in the first product grid of ``jx``.

    Every tuple has the form ``(slug,) + f(sku) + (description,)``.
    """
    first_grid = jx['ProductGridList'][0]
    for product in first_grid['Products']:
        # The slug is what the details URL is constructed from.
        slug = product['Slug']
        long_copy = product['MarketingCopy']['LongFormHTML']
        yield from ((slug,) + f(entry) + (long_copy,) for entry in product['Skus'])


def f(sx):
    """Flatten a single SKU record into a 5-tuple.

    Returns ``(SeoId, SeoName, Sys.CreatedAt, DefaultDirectPrices.USD, image URL)``,
    where the image URL is the stored ``DefaultImage.File.Url`` prefixed with ``https:``.
    """
    seo_id = sx["SeoId"]
    seo_name = sx["SeoName"]
    created_at = sx['Sys']['CreatedAt']
    usd_price = sx['DefaultDirectPrices']['USD']
    image_url = "https:" + sx['DefaultImage']['File']['Url']
    return seo_id, seo_name, created_at, usd_price, image_url
//> using jvm temurin:21
//> using scala 3.3.3
//> using dep "com.lihaoyi::os-lib:0.10.3"
//> using dep "com.lihaoyi::upickle:3.2.0"
//> using dep "com.lihaoyi::requests:0.9.0"
//> using dep "com.lihaoyi::mainargs:0.7.1"
//> using main-class Main
import java.time.{Instant, LocalDateTime, ZoneId, ZonedDateTime}
import java.time.format.DateTimeFormatter
import os.Source
import upickle.default._
import mainargs.{arg, ParserForMethods, TokensReader, main => mainer}
/** Fetches product-grid JSON from gibson.com, stores the raw payload on disk
  * and flattens it into one row per SKU.
  */
object Fetcher {
  /** Endpoint that every request is POSTed to. */
  final val GIBSON_URL =
    "https://www.gibson.com/en-US/Collection/GetProductGridList"

  /** Request body used for the "mod" collection scrape. */
  final val MOD =
    """{"ComponentIds":["4Bmy2OdwhOY372zwmjPn16"],"InStockOnly":true}"""

  /** Request body used for the "select" collection scrape. */
  final val SELECT =
    """{"ComponentIds":["5RjxBwISl2NnlLZ34cVHou"],"InStockOnly":false}"""

  /** Value sent in the `user-agent` request header. */
  final val USER_AGENT =
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Safari/605.1.1"

  /** POSTs `rawData` as a JSON body to `url` and parses the response text.
    *
    * @param url     endpoint to call
    * @param rawData request body, sent with `Content-Type: application/json`
    * @return the parsed response
    * @throws Exception when the response status code is not 200
    */
  def getter(url: String, rawData: String): ujson.Value = {
    val headers = Map(
      "user-agent" -> USER_AGENT,
      "Content-Type" -> "application/json"
    )
    val r = requests.post(url, headers = headers, data = rawData)
    r.statusCode match {
      case 200 => ujson.read(r.text())
      case _ =>
        throw new Exception(
          s"Failed request. Status code ${r.statusCode} for ${url}"
        )
    }
  }

  /** Writes `sx` to a new file inside `root`.
    *
    * The file name is the current instant rendered with `ISO_INSTANT`,
    * followed by `suffix`.
    */
  def writeResults(sx: os.Source, suffix: String, root: os.Path): Unit = {
    val now = Instant.now()
    val fmter = DateTimeFormatter.ISO_INSTANT.withZone(ZoneId.systemDefault())
    val filename: String = fmter.format(now) + suffix
    val output = root / filename
    println(s"Writing to file $output")
    os.write(output, sx)
  }

  /** Fetches one collection, prints its extracted rows and saves the raw JSON.
    *
    * @param url       endpoint to call
    * @param outputDir directory the raw JSON file is written into
    * @param ext       suffix appended to the timestamped file name
    * @param rawData   JSON request body
    */
  def scrape(
      url: String,
      outputDir: os.Path,
      ext: String,
      rawData: String
  ): Unit = {
    // Fetch exactly once: the payload is large, and the file written to disk
    // must be the same response the printed rows were derived from.
    val jx = getter(url, rawData)
    extractor(jx).foreach(println)
    writeResults(jx, ext, outputDir)
  }

  /** Flattens a response into one tuple per SKU:
    * `(slug, SeoId, SeoName, CreatedAt, USD price, image URL, description)`.
    *
    * Only the first entry of `ProductGridList` is read; an empty list yields
    * no rows.
    */
  def extractor(
      jx: ujson.Value
  ): List[(String, String, String, String, String, String, String)] =
    // NOTE(review): confirm the endpoint never returns more than one grid.
    jx("ProductGridList").arr.headOption.toList.flatMap { grid =>
      grid("Products").arr.toList.flatMap { p =>
        val slug = p("Slug").str
        val desc = p("MarketingCopy")("LongFormHTML").str
        p("Skus").arr.toList.map { sku =>
          (
            slug,
            sku("SeoId").str,
            sku("SeoName").str,
            sku("Sys")("CreatedAt").str,
            sku("DefaultDirectPrices")("USD").str,
            "https:" + sku("DefaultImage")("File")("Url").str,
            desc
          )
        }
      }
    }
}
/** Command-line entry point exposing the `scraper` and `extractor` sub-commands. */
object Main {

  /** Teaches mainargs how to turn a command-line token into an `os.Path`. */
  implicit object PathRead extends TokensReader.Simple[os.Path] {
    def shortName = "path"
    def read(strs: Seq[String]) = {
      // os.pwd is passed as the base for the token.
      val parsed = os.Path(strs.head, os.pwd)
      Right(parsed)
    }
  }

  /** Scrapes both collections and writes each raw response into `outputDir`. */
  @mainer
  def scraper(
      @arg(short = 'o', doc = "Output Directory")
      outputDir: os.Path
  ) = {
    println(s"Scraping ${Fetcher.GIBSON_URL}")
    // Request body paired with the suffix of the file it is saved under.
    val jobs = Seq(
      Fetcher.MOD -> "__gibson-mod.json",
      Fetcher.SELECT -> "__gibson-select.json"
    )
    for ((payload, suffix) <- jobs) {
      Fetcher.scrape(Fetcher.GIBSON_URL, outputDir, suffix, payload)
    }
    println("Completed scraping.")
  }

  /** Prints the extracted rows of every `.json` file directly inside `inputDir`.
    *
    * Printing could be swapped for CSV (or similar) output.
    */
  @mainer
  def extractor(
      @arg(short = 'i', doc = "Input Directory")
      inputDir: os.Path
  ) = {
    println(s"Extracting $inputDir")
    val jsonFiles = os.list(inputDir).filter(_.ext == "json")
    for (file <- jsonFiles) {
      val parsed = ujson.read(os.read(file))
      Fetcher.extractor(parsed).foreach(println)
    }
  }

  def main(args: Array[String]): Unit = ParserForMethods(this).runOrExit(args)
}
# Build helpers for gibson.scala, driven by scala-cli.
# No target is named after a file it produces, so every one of them is phony.
.PHONY: default compile fmt run extract package

default: compile ;

# Compile the script.
compile:
	scala-cli compile gibson.scala

# Format the sources in this directory.
fmt:
	scala-cli fmt .

# Run the scraper, writing output into the current directory.
# Uses the ./gibson launcher built by the `package` target.
run:
	./gibson scraper -o .

# Run the extractor over the current directory.
extract:
	./gibson extractor -i .

# Rebuild the ./gibson launcher. `rm -f` keeps the very first build, when no
# launcher exists yet, from aborting the target.
package:
	rm -f gibson
	scala-cli package --power gibson.scala -o gibson
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment