Created
March 5, 2020 07:35
-
-
Save Grawl/485ceafb7256920043350fc64e4fe8d4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* Return array of URLs found in HTML string | |
* | |
* If `<img>` wrapped into `<a>` uses `[href]` of `<a>` if it's an image | |
* | |
* @param {string} html | |
* @returns {string[]} | |
*/ | |
async ImageURLsFromHTML(html) { | |
try { | |
const allowedImageTypes = [ | |
'image/jpeg', | |
'image/jpg', | |
'image/png', | |
'image/webp', | |
] | |
const urls = [] | |
await posthtml() | |
.use(async tree => { | |
tree.walk(node => { | |
// TODO don't return img[src] if it's child of a[href] having image content-type | |
if (node.tag === 'a') { | |
const a = node | |
const src = a.attrs.href | |
// Node has <img> child | |
if (node.content.find(node => ( | |
typeof node === 'object' && | |
node.tag === 'img' && | |
node.attrs.src | |
))) { | |
// TODO ensure it works | |
(async function() { | |
const request = await Promise.resolve(fetch(src, { | |
method: 'HEAD', | |
})) | |
const headers = request.headers | |
const contentType = headers.get('Content-Type') | |
if (allowedImageTypes.includes(contentType)) { | |
urls.push(src) | |
} | |
}()) | |
} | |
} else if ( | |
node.tag === 'img' && | |
node.attrs.src | |
) { | |
urls.push(node.attrs.src) | |
} | |
return node | |
}) | |
return tree | |
}) | |
.process(html) | |
return urls | |
} catch (error) { | |
console.error(error) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment