Skip to content

Instantly share code, notes, and snippets.

@Jeff-Lewis
Forked from idkjs/SchemaOrgParser.js
Created September 8, 2020 01:02
Show Gist options
  • Save Jeff-Lewis/b4649f288e8b01adda5268e131d132a9 to your computer and use it in GitHub Desktop.
Save Jeff-Lewis/b4649f288e8b01adda5268e131d132a9 to your computer and use it in GitHub Desktop.
// originally from https://kb.apify.com/tips-and-tricks/scraping-data-from-websites-using-schemaorg-microdata
/**
 * Collects schema.org microdata items from the current document.
 *
 * Relies on a jQuery-compatible global `$` being in scope when called.
 * Each item is a plain object with:
 *   - `_type`: the element's `itemtype` attribute (undefined when absent),
 *   - one key per `itemprop` name; a name seen more than once holds an array,
 *   - `_value`: only when the item has no usable properties at all.
 * Nested items (an element carrying both `itemprop` and `itemscope`) become
 * nested objects. `itemref` is not followed.
 *
 * @returns {Array<Object>} the top-level items, in document order.
 */
function schemaOrgParser() {
  var hasOwn = Object.prototype.hasOwnProperty;
  // itemprop is a space-separated token list; split on HTML ASCII whitespace.
  var TOKEN_SEPARATOR = /[ \t\n\f\r]+/;

  // Value precedence: content attribute, then non-blank text, then src, then href.
  var extractValue = function(elem) {
    var $elem = $(elem);
    // Trim before testing: whitespace-only text (e.g. the newlines around an
    // <img> inside an <a>) is truthy and would otherwise hide src/href.
    var text = ($elem.text() || "").trim();
    return $elem.attr("content") || text
      || $elem.attr("src") || $elem.attr("href") || null;
  };

  // Stores value under propName; a repeated name is promoted to an array.
  var addProperty = function(item, propName, value) {
    var stored = typeof value === 'string' ? value.trim() : value;
    // Look at own properties only, so names such as "constructor" or
    // "toString" are not mistaken for values already collected.
    var existing = hasOwn.call(item, propName) ? item[propName] : undefined;
    if (Array.isArray(existing)) {
      existing.push(stored);
    } else if (typeof existing !== 'undefined') {
      item[propName] = [existing, stored];
    } else {
      // defineProperty (not assignment) so that a name like "__proto__"
      // becomes an ordinary own key instead of touching the prototype.
      Object.defineProperty(item, propName, {
        value: stored,
        writable: true,
        enumerable: true,
        configurable: true
      });
    }
  };

  var extractItem = function(elem) {
    var item = { _type: $(elem).attr("itemtype") };
    var count = 0;
    // iterate itemprops not nested in another itemscope
    $(elem).find("[itemprop]").filter(function() {
      return $(this).parentsUntil(elem, '[itemscope]').length === 0;
    }).each(function() {
      var names = ($(this).attr("itemprop") || "").split(TOKEN_SEPARATOR);
      var value;
      var hasValue = false;
      for (var i = 0; i < names.length; i++) {
        // skip empty tokens and repeated names within the same attribute
        if (names[i] === "" || names.indexOf(names[i]) !== i) {
          continue;
        }
        if (!hasValue) {
          // extracted once and shared by every name on this element
          value = $(this).is("[itemscope]") ? extractItem(this) : extractValue(this);
          hasValue = true;
        }
        addProperty(item, names[i], value);
      }
      if (hasValue) {
        count++;
      }
    });
    // special case - output at least something
    if (count === 0) {
      addProperty(item, "_value", extractValue(elem));
    }
    return item;
  };

  var extractAllItems = function() {
    var items = [];
    // An itemscope is top-level when it is not a property of another item:
    // either it has no itemprop, or no itemscope ancestor exists. parents()
    // (unlike parentsUntil("body")) also looks at <body> and <html>, so a
    // page-level itemscope no longer causes its nested items to be listed twice.
    $("[itemscope]").filter(function() {
      var $scope = $(this);
      return !$scope.is("[itemprop]") || $scope.parents("[itemscope]").length === 0;
    }).each(function() {
      items.push(extractItem(this));
    });
    return items;
  };

  return extractAllItems();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment