Skip to content

Instantly share code, notes, and snippets.

@alanshaw
Last active December 21, 2015 07:28
Show Gist options
  • Save alanshaw/6270911 to your computer and use it in GitHub Desktop.
Save alanshaw/6270911 to your computer and use it in GitHub Desktop.
Scrape the polestarglobal.com [where to buy page](http://web.polestarglobal.com/about/where-to-buy) for import into Django
var Browser = require("zombie")
, defaults = require("defaults")
// Region name, as read from a distributor item's `data-region` attribute,
// mapped to the numeric id stored in the record's `region` field.
// NOTE(review): the ids presumably match the region primary keys on the
// Django side - confirm before renumbering.
var regionMap = {
  "Africa": 1,
  "Asia/Pacific": 2,
  "Europe": 3,
  "Middle East": 4,
  "North America": 5,
  "South America": 6
}
// Product name, as listed in a distributor item's comma separated
// `data-products` attribute, mapped to the name of the boolean field that is
// switched on in the record when the distributor offers that product.
var prodMap = {
  "Fleet Management": "fleet_management",
  "Marine Asset Tracker": "marine_asset_tracker",
  "Ship Security (SSAS & SSRS)": "ship_security",
  "LRIT Conformance": "lrit_conformance",
  "Fisheries Vessel Monitoring": "fisheries_vessel_monitoring",
  "Approved Service Centres": "approved_service_centre"
}
// Baseline product flags for a distributor record: every product field starts
// out false and is only switched on for products the distributor lists.
// The list order is the order in which the flags appear in each record.
var defaultDist = [
  "fleet_management",
  "marine_asset_tracker",
  "ship_security",
  "lrit_conformance",
  "fisheries_vessel_monitoring",
  "approved_service_centre"
].reduce(function (flags, fieldName) {
  flags[fieldName] = false
  return flags
}, {})
var browser = new Browser()

// Load the "where to buy" page, turn every `#retailers > li` item into a
// Django-fixture-style record ({pk, model, fields}) and print the resulting
// array to stdout as JSON indented by two spaces. If the visit fails the
// error is reported with console.error and no records are printed.
browser.visit("http://web.polestarglobal.com/about/where-to-buy", {debug: true}, function (er) {
  if (er) return console.error(er)

  // Text of `selector` inside `context`; any falsy result (e.g. an empty
  // string) becomes null so that missing values are emitted as JSON null.
  function textOrNull (selector, context) {
    return browser.text(selector, context) || null
  }

  var distributors = []
    , pk = 0

  var distItems = browser.queryAll("#retailers > li")

  console.log("Found", distItems.length, "distributors")

  distItems.forEach(function (dist) {
    // Start from the product flags in defaultDist (all false); the flags for
    // the products this distributor lists are switched on below.
    var fields = defaults({}, defaultDist)

    fields.name = browser.text(".org", dist)
    fields.country = browser.text(".country-name", dist)
    fields.region = regionMap[dist.getAttribute("data-region")]

    // `data-products` is a comma separated list of product names. A missing
    // attribute is treated as an empty list, and names without an entry in
    // prodMap are reported and skipped instead of creating a bogus
    // "undefined" field in the record.
    var products = dist.getAttribute("data-products") || ""
    products.split(",").forEach(function (prod) {
      var productName = prod.trim()
      if (!productName) return
      if (!Object.prototype.hasOwnProperty.call(prodMap, productName)) {
        return console.error("Unknown product", JSON.stringify(productName), "for", fields.name)
      }
      fields[prodMap[productName]] = true
    })

    fields.rep = textOrNull(".fn", dist)

    // The street address is read as markup: each <div>...</div> child becomes
    // one line of plain text.
    var street = browser.html(".street-address > *", dist) || null
    if (street) {
      street = street
        .replace(/<div>/g, "")
        .replace(/<\/div>/g, "\n")
        // Match real tab characters (`\t`); `/\\t/` would only match a
        // backslash followed by the letter "t".
        .replace(/\t/g, "")
        .trim()
    }
    fields.street = street

    fields.city = textOrNull(".locality", dist)
    fields.county = textOrNull(".region", dist)
    fields.postcode = textOrNull(".postal-code", dist)

    // The first `.tel` element is stored as the phone number, the second
    // (when present) as the fax number.
    var tels = browser.queryAll(".tel", dist)
    fields.tel = tels[0] ? (browser.text(tels[0]) || null) : null
    fields.fax = tels[1] ? (browser.text(tels[1]) || null) : null

    fields.email = textOrNull(".email", dist)
    fields.website = textOrNull(".url", dist)

    // Coordinates come from the red marker parameter in the image URL
    // (`color:red%7C<lat>,<lng>&`) and are kept as strings. When there is no
    // image, or its URL does not match, latitude/longitude are left unset.
    var map = browser.query("img", dist)
    if (map) {
      var matches = /color:red%7C([0-9.-]+),([0-9.-]+)&/.exec(map.src)
      if (matches) {
        fields.latitude = matches[1]
        fields.longitude = matches[2]
      }
    }

    // pk is a running counter, so the first record gets pk 1.
    distributors.push({
      pk: ++pk,
      model: "content.distributor",
      fields: fields
    })
  })

  console.log(JSON.stringify(distributors, null, 2))
})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment