Last active
April 30, 2020 15:25
-
-
Save Jerska/f3aa5a7509625bb0c1671b687d861964 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Algolia Crawler configuration template.
 *
 * Every "FIXME" placeholder must be replaced before use. The single action
 * below emits one record per crawled page into the "all" index, built from
 * the page's title, keywords and description/body text.
 *
 * Configuration documentation: https://www.algolia.com/doc/api-reference/crawler/
 * Extracting data documentation: https://www.algolia.com/doc/tools/crawler/guides/extracting-data/
 */
new Crawler({
  appId: "FIXME",
  apiKey: "FIXME",
  indexPrefix: "crawler_FIXME_",
  rateLimit: 4,
  maxUrls: 100,
  startUrls: ["FIXME"],
  ignoreQueryParams: ["utm_medium", "utm_source", "utm_campaign", "utm_term"],
  actions: [
    {
      indexName: "all",
      pathsToMatch: ["https://FIXME/**"],
      /**
       * Build the list of records for one crawled page.
       *
       * `contentLength`, `fileType` and `helpers` are unused by this template;
       * they are kept in the signature as a hint of what is available.
       *
       * @param {object} page
       * @param {URL} page.url - URL of the crawled page (`href`, `host` and `pathname` are read).
       * @param {Function} page.$ - jQuery-like selector function over the page DOM
       *   (presumably Cheerio; only `attr`, `text`, `first` and `remove` are used).
       * @returns {object[]} Records to index for this page.
       */
      recordExtractor: ({ url, $, contentLength, fileType, helpers }) => {
        /* 1. Helpers */

        /**
         * Record promotion level.
         * Change this variable value to boost the ranking of the next added records.
         */
        let promote = 0;

        /**
         * Record current hierarchy level.
         * Change this variable value to change the hierarchy of the next added records.
         */
        let hierarchy = [url.host, url.pathname];

        /**
         * Transform the current `hierarchy` array into a hierarchy object.
         * Each level holds the path from the root down to that level.
         * @example
         * // with hierarchy = ['www.example.org', 'Questions']
         * buildHierarchyObj()
         * // => { lvl0: 'www.example.org', lvl1: 'www.example.org > Questions' }
         */
        function buildHierarchyObj() {
          const res = {};
          for (let i = 0; i < hierarchy.length; ++i) {
            res[`lvl${i}`] = hierarchy.slice(0, i + 1).join(" > ");
          }
          return res;
        }

        /**
         * Take the first truthy value in a list of args.
         * Calls any function passed to use its return value instead, and skips on throw.
         * Returns `null` when no candidate is truthy, so a falsy last argument
         * (such as `""`) is never returned.
         * @example
         * pickWithFallback(
         *   $('title').text(),
         *   () => $('meta[property="og:title"]').attr('content').trim()
         * )
         */
        function pickWithFallback(...values) {
          for (const candidate of values) {
            let val = candidate;
            if (typeof candidate === "function") {
              try {
                val = candidate();
              } catch (_e) {
                // Deliberate best-effort: a throwing candidate (e.g. a method
                // call on a missing attribute) just means "no value here".
                continue;
              }
            }
            if (val) return val;
          }
          return null;
        }

        /**
         * Collapse every run of whitespace into a single space and trim the result.
         * `null` and `undefined` are returned unchanged instead of being
         * stringified to "null" / "undefined".
         */
        function cleanWhitespaces(str) {
          if (str == null) return str;
          return String(str).replace(/\s+/g, " ").trim();
        }

        /**
         * Remove some useless nodes for crawling from the page.
         * This is usually useful to clean up `content`.
         */
        function removeSelectors(...selectors) {
          for (const selector of selectors) {
            $(selector).remove();
          }
        }

        const records = []; // Final records array
        let position = 1; // Position in the page

        /**
         * Add a record to the records list.
         * Attributes passed in override the default ones of the same name.
         */
        function addRecord(attributes) {
          records.push({
            objectID: `${url.href} ${position}`,
            url: url.href,
            // Copy so a later in-place change of `hierarchy` cannot rewrite
            // records that were already added.
            hierarchy: [...hierarchy],
            hierarchyObj: buildHierarchyObj(),
            promote,
            // Count only non-empty path segments so "/a" and "/a/" share the
            // same depth and "/" is shallower than "/a".
            urlDepth: url.pathname.split("/").filter(Boolean).length,
            position: position++,
            ...attributes,
          });
        }

        /* 2. Extraction */
        console.log(`Crawling "${url.href}"`);

        // Remove useless DOM nodes
        removeSelectors("header", "footer", "nav");

        addRecord({
          title: cleanWhitespaces(
            pickWithFallback(
              $('meta[property="og:title"]').attr("content"),
              $("head > title").text(),
              () => $("h1").first().text(),
              "No title"
            )
          ),
          keywords: pickWithFallback(
            () =>
              $("meta[name=keywords]")
                .attr("content")
                .split(",")
                .map(cleanWhitespaces)
                // Drop empty entries left by ",," or a trailing comma.
                .filter(Boolean),
            []
          ),
          // pickWithFallback yields null when neither source has text, so
          // default to "" before slicing instead of throwing on null.
          content: (
            cleanWhitespaces(
              pickWithFallback(
                $("meta[name=description]").attr("content"),
                $("body").text()
              )
            ) || ""
          ).slice(0, 10000),
        });

        return records;
      },
    },
  ],
  initialIndexSettings: {
    all: {
      searchableAttributes: [
        "unordered(keywords)",
        "unordered(title)",
        "unordered(hierarchy)",
        "unordered(content)",
        "url",
      ],
      customRanking: ["desc(promote)", "asc(urlDepth)", "asc(position)"],
      attributesForFaceting: ["hierarchyObj"],
      attributesToHighlight: ["url", "title", "keywords", "hierarchy"],
      attributesToSnippet: ["content"],
    },
  },
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment