Skip to content

Instantly share code, notes, and snippets.

@nberlette
Last active June 29, 2021 02:16
Show Gist options
  • Save nberlette/926b8cabf926b81318d396d00c0ac355 to your computer and use it in GitHub Desktop.
Save nberlette/926b8cabf926b81318d396d00c0ac355 to your computer and use it in GitHub Desktop.
Cheerio + HTMLRewriter Typescript Wrapper
/**
* Cheerio Wrapper HTMLRewriter.ts
*
* Cheerio is used here only to simulate a stream-oriented parser. It is slow!
* This TypeScript port hasn't been tested and probably doesn't work.
*/
import cheerio from 'cheerio'
/**
 * Parsed node as this class reads it: a cheerio element plus the `nodeType`
 * and `nodeValue` fields the walker relies on.
 */
type RewriterNode = cheerio.Element & {
  nodeType: number
  nodeValue: string | null
}

/** Cheerio selection for one element, extended with the helpers handed to `element` handlers. */
type RewriterElement = cheerio.Cheerio & {
  tagName: string | undefined
  hasAttribute: (name: string) => boolean
  getAttribute: (name: string) => string | undefined
  setAttribute: (name: string, value: string) => unknown
}

/** Cheerio selection for one text or comment node, as handed to `text` / `comments` handlers. */
type RewriterText = Omit<cheerio.Cheerio, 'text'> & {
  /** Snapshot of the node's value taken when the wrapper was created. */
  text: string | null
  replace: (content: string, options?: cheerio.CheerioParserOptions) => void
  /** Only set on text nodes (nodeType 3). */
  lastInTextNode?: boolean
}

/** Optional callbacks registered for a selector through `HTMLRewriter.on`. */
interface RewriterHandlers {
  element?: (element: RewriterElement) => void
  text?: (text: RewriterText) => void
  comments?: (comment: RewriterText) => void
}

/**
 * Cheerio-backed stand-in for a streaming HTML rewriter.
 *
 * The whole document is parsed up front and then walked depth-first, calling
 * the registered handlers along the way; nothing is actually streamed.
 */
class HTMLRewriter {
  /** Registered `[selector, handlers]` pairs, in registration order. */
  public selectors: Array<[string, RewriterHandlers]> = []

  /** Never assigned by this class; kept so the public shape is unchanged. */
  public response?: Response

  /**
   * Registers handlers for every element matching `element`.
   *
   * @param element - Selector tested against each visited element with `.is()`.
   * @param handler - Object with optional `element`, `text` and `comments` callbacks.
   * @returns This rewriter, so calls can be chained.
   */
  on (element: string, handler: RewriterHandlers): this {
    this.selectors.push([element, handler])
    return this
  }

  /**
   * Reads the response body, runs the registered handlers over the parsed
   * document and returns a new response built from the serialized result.
   *
   * @param response - Response whose body is HTML text; its body is consumed.
   * @returns A new `Response` created with the original response as its init argument.
   */
  async transform (response: Response): Promise<Response> {
    const markup = await response.text()
    const $ = cheerio.load(markup)
    // Simulate stream-based parser
    this.walk($, $.root())
    return new Response($.root().html(), response)
  }

  /**
   * Visits `node` and its descendants depth-first.
   *
   * Handlers whose selector matches `node` receive the wrapped element first,
   * then each direct text / comment child of that element. Child elements are
   * walked recursively and matched against the selectors on their own.
   *
   * @param $ - Cheerio root function for the document being walked.
   * @param node - Element to visit (`transform` passes the root selection).
   * @returns The wrapped selection for `node`.
   */
  walk ($: cheerio.Root, node: RewriterNode | cheerio.Cheerio): RewriterElement {
    const $node = this.wrapElement($, node)

    // Evaluate every selector before running any handler, so a handler that
    // mutates the element cannot change which handlers match it.
    const matchedHandlers = this.selectors
      .filter(([selector]) => $node.is(selector))
      .map(([, handler]) => handler)

    for (const handler of matchedHandlers) {
      if (handler.element) {
        handler.element($node)
      }
    }

    // Snapshot the children after the element handlers ran.
    const children = $node.contents().toArray() as RewriterNode[]
    for (const [i, child] of children.entries()) {
      if (child.nodeType === 1) { // element node
        this.walk($, child)
        continue
      }

      // True for the final child, or when the next sibling is a different kind of node.
      const next = children[i + 1]
      const lastInNode = next === undefined || next.nodeType !== child.nodeType
      const $child = this.wrapOther($, child, lastInNode)

      for (const handler of matchedHandlers) {
        if (child.nodeType === 3) { // text node
          if (handler.text) {
            handler.text($child)
          }
        } else if (child.nodeType === 8) { // comment node
          if (handler.comments) {
            handler.comments($child)
          }
        }
      }
    }

    return $node
  }

  /**
   * Wraps an element in a cheerio selection and attaches `tagName` plus the
   * `hasAttribute` / `getAttribute` / `setAttribute` helpers bound to it.
   *
   * @param $ - Cheerio root function for the document.
   * @param node - Element (or selection) to wrap.
   * @returns The augmented selection.
   */
  wrapElement ($: cheerio.Root, node: RewriterNode | cheerio.Cheerio): RewriterElement {
    const $node = $(node) as RewriterElement
    // A selection (such as the root passed by `transform`) has no `name`.
    $node.tagName = 'name' in node ? node.name : undefined
    $node.hasAttribute = hasAttribute.bind($node)
    $node.getAttribute = getAttribute.bind($node)
    $node.setAttribute = setAttribute.bind($node)
    return $node
  }

  /**
   * Wraps a non-element node in a cheerio selection and attaches its current
   * value as `text`, a bound `replace` helper and, for text nodes only,
   * `lastInTextNode`.
   *
   * Note that `text` replaces cheerio's own `text()` method on the returned object.
   *
   * @param $ - Cheerio root function for the document.
   * @param node - Text or comment node to wrap.
   * @param lastInNode - Value stored as `lastInTextNode` when `node` is a text node.
   * @returns The augmented selection.
   */
  wrapOther ($: cheerio.Root, node: RewriterNode, lastInNode: boolean): RewriterText {
    const $node = $(node)
    const wrapped: RewriterText = Object.assign($node, {
      text: node.nodeValue,
      replace: replace.bind($node)
    })
    if (node.nodeType === 3) {
      wrapped.lastInTextNode = lastInNode
    }
    return wrapped
  }
}
/**
 * Overwrites the value of the node held at index 0 of the receiver.
 *
 * `HTMLRewriter.wrapOther` binds this function to the selection wrapping a
 * single text or comment node and exposes it as `replace` on that wrapper.
 *
 * NOTE(review): `CheerioParserOptions` is kept unqualified as in the original
 * signature; this file declares it inside `namespace cheerio` — confirm it
 * resolves in the build.
 *
 * @param content - New value written to the node's `nodeValue`.
 * @param options - Accepted for signature compatibility; not used.
 */
function replace(
  this: { 0: { nodeValue: string | null } },
  content: string,
  options?: CheerioParserOptions
): void {
  this[0].nodeValue = content
}
/**
 * Reports whether the receiver has the named attribute.
 *
 * Delegates to the receiver's own `getAttribute`; any value other than
 * `undefined` (including an empty string) counts as present.
 *
 * @param name - Attribute name to look up.
 * @returns `true` unless `getAttribute(name)` yields `undefined`.
 */
function hasAttribute(
  this: { getAttribute(name: string): unknown },
  name: string
): boolean {
  return this.getAttribute(name) !== undefined
}
/**
 * Reads an attribute through the receiver's `attr(name)` method.
 *
 * @param name - Attribute name to read.
 * @returns Whatever `attr(name)` returns: the value, or `undefined` when absent.
 */
function getAttribute(
  this: { attr(name: string): string | undefined },
  name: string
): string | undefined {
  return this.attr(name)
}
/**
 * Writes an attribute through the receiver's `attr(name, value)` method.
 *
 * @param name - Attribute name to set.
 * @param value - Value to assign.
 * @returns The receiver, so calls can be chained. Callers that ignored the
 *   previous (always `undefined`) result are unaffected.
 */
function setAttribute<T extends { attr(name: string, value: string): unknown }>(
  this: T,
  name: string,
  value: string
): T {
  this.attr(name, value)
  return this
}
// Expose the rewriter class as this module's default export.
export default HTMLRewriter
// Type definitions for Cheerio v0.22.0
// Project: https://github.com/cheeriojs/cheerio
// Definitions by: Bret Little <https://github.com/blittle>
// VILIC VANE <http://vilic.info>
// Wayne Maurer <https://github.com/wmaurer>
// Umar Nizamani <https://github.com/umarniz>
// LiJinyao <https://github.com/LiJinyao>
// Chennakrishna <https://github.com/chennakrishna8>
// AzSiAz <https://github.com/AzSiAz>
// Ryo Ota <https://github.com/nwtgck>
// Hiroki Osame <https://github.com/privatenumber>
// Artishevskiy Alexey <https://github.com/dhvcc>
// Definitions: https://github.com/DefinitelyTyped/DefinitelyTyped
/// <reference types="node" />
// Empty `Document` interface; the cheerio declarations that follow reference
// `Document` in several signatures (add, append, prepend, after, before, wrap, parseHTML).
// NOTE(review): presumably a placeholder so those signatures resolve when the DOM lib is not loaded — confirm.
interface Document {}
// Ambient declarations for the cheerio API: node shapes, the selection
// interface, parser options, and the selector / loader entry points.
declare namespace cheerio {
// Union of the three node shapes declared in this namespace.
type Element = TextElement | TagElement | CommentElement;
// Node whose `type` discriminant is 'text'.
interface TextElement {
type: 'text';
// Sibling links (nullable) and parent link.
next: Element | null;
prev: Element | null;
parent: Element;
// Optional string payload of the node.
data?: string;
// Optional source offsets.
// NOTE(review): presumably populated only when the matching parser option is on — confirm.
startIndex?: number;
endIndex?: number;
}
// Node whose `type` discriminant is 'tag', 'script' or 'style'.
interface TagElement {
tagName: string;
type: 'tag' | 'script' | 'style';
name: string;
// Attribute name -> value map.
attribs: { [attr: string]: string };
// Per-attribute namespace and prefix maps, keyed like `attribs`.
'x-attribsNamespace': { [attr: string]: string };
'x-prefixNamespace': { [attr: string]: string };
// Child links.
children: Element[];
childNodes: Element[] | null;
lastChild: Element | null;
firstChild: Element | null;
// Sibling and parent links, declared under both naming styles.
// NOTE(review): `nextSibling` / `previousSibling` are non-nullable here while `next` / `prev` are nullable — confirm intended.
next: Element | null;
nextSibling: Element;
prev: Element | null;
previousSibling: Element;
parent: Element;
parentNode: Element;
nodeValue: string;
data?: string;
startIndex?: number;
endIndex?: number;
}
// Node whose `type` discriminant is 'comment'; same fields as TextElement otherwise.
interface CommentElement {
type: 'comment';
next: Element | null;
prev: Element | null;
parent: Element;
data?: string;
startIndex?: number;
endIndex?: number;
}
// Callback form accepted by `attr(name, value)`: receives the element, its index and the current value.
type AttrFunction = (el: Element, i: number, currentValue: string) => any;
// A selection of elements: iterable, index-addressable, with chainable methods.
interface Cheerio {
// Document References
// Cheerio https://github.com/cheeriojs/cheerio
// JQuery http://api.jquery.com
[Symbol.iterator](): IterableIterator<Element>;
[index: number]: Element;
cheerio: string;
length: number;
// Attributes
attr(): { [attr: string]: string };
attr(name: string): string | undefined;
attr(name: string, value: AttrFunction): Cheerio;
// `value` *can* be `any` here but:
// 1. That makes type-checking the function-type useless
// 2. It's converted to a string anyways
attr(name: string, value: string): Cheerio;
// The map's values *can* be `any` but they'll all be cast to strings
// regardless.
attr(map: { [key: string]: any }): Cheerio;
data(): any;
data(name: string): any;
data(name: string, value: any): any;
val(): string;
val(value: string): Cheerio;
removeAttr(name: string): Cheerio;
has(selector: string): Cheerio;
has(element: Element): Cheerio;
hasClass(className: string): boolean;
addClass(classNames: string): Cheerio;
removeClass(): Cheerio;
removeClass(className: string): Cheerio;
removeClass(func: (index: number, className: string) => string): Cheerio;
toggleClass(className: string): Cheerio;
toggleClass(className: string, toggleSwitch: boolean): Cheerio;
toggleClass(toggleSwitch?: boolean): Cheerio;
toggleClass(
func: (index: number, className: string, toggleSwitch: boolean) => string,
toggleSwitch?: boolean,
): Cheerio;
// `is` accepts a selector, element(s), another selection, or a predicate and returns a boolean.
is(selector: string): boolean;
is(element: Element): boolean;
is(element: Element[]): boolean;
is(selection: Cheerio): boolean;
is(func: (index: number, element: Element) => boolean): boolean;
// Form
serialize(): string;
serializeArray(): { name: string; value: string }[];
// Traversing
find(selector: string): Cheerio;
find(element: Cheerio): Cheerio;
parent(selector?: string): Cheerio;
parents(selector?: string): Cheerio;
parentsUntil(selector?: string, filter?: string): Cheerio;
parentsUntil(element: Element, filter?: string): Cheerio;
parentsUntil(element: Cheerio, filter?: string): Cheerio;
prop(name: string): any;
prop(name: string, value: any): Cheerio;
closest(): Cheerio;
closest(selector: string): Cheerio;
next(selector?: string): Cheerio;
nextAll(): Cheerio;
nextAll(selector: string): Cheerio;
nextUntil(selector?: string, filter?: string): Cheerio;
nextUntil(element: Element, filter?: string): Cheerio;
nextUntil(element: Cheerio, filter?: string): Cheerio;
prev(selector?: string): Cheerio;
prevAll(): Cheerio;
prevAll(selector: string): Cheerio;
prevUntil(selector?: string, filter?: string): Cheerio;
prevUntil(element: Element, filter?: string): Cheerio;
prevUntil(element: Cheerio, filter?: string): Cheerio;
slice(start: number, end?: number): Cheerio;
siblings(selector?: string): Cheerio;
children(selector?: string): Cheerio;
contents(): Cheerio;
each(func: (index: number, element: Element) => any): Cheerio;
map(func: (index: number, element: Element) => any): Cheerio;
filter(selector: string): Cheerio;
filter(selection: Cheerio): Cheerio;
filter(element: Element): Cheerio;
filter(elements: Element[]): Cheerio;
filter(func: (index: number, element: Element) => boolean): Cheerio;
not(selector: string): Cheerio;
not(selection: Cheerio): Cheerio;
not(element: Element): Cheerio;
not(func: (index: number, element: Element) => boolean): Cheerio;
first(): Cheerio;
last(): Cheerio;
eq(index: number): Cheerio;
get(): any[];
get(index: number): any;
index(): number;
index(selector: string): number;
index(selection: Cheerio): number;
end(): Cheerio;
add(selectorOrHtml: string): Cheerio;
add(selector: string, context: Document): Cheerio;
add(element: Element): Cheerio;
add(elements: Element[]): Cheerio;
add(selection: Cheerio): Cheerio;
addBack(): Cheerio;
addBack(filter: string): Cheerio;
// Manipulation
appendTo(target: Cheerio): Cheerio;
prependTo(target: Cheerio): Cheerio;
append(content: string, ...contents: any[]): Cheerio;
append(content: Document, ...contents: any[]): Cheerio;
append(content: Document[], ...contents: any[]): Cheerio;
append(content: Cheerio, ...contents: any[]): Cheerio;
prepend(content: string, ...contents: any[]): Cheerio;
prepend(content: Document, ...contents: any[]): Cheerio;
prepend(content: Document[], ...contents: any[]): Cheerio;
prepend(content: Cheerio, ...contents: any[]): Cheerio;
after(content: string, ...contents: any[]): Cheerio;
after(content: Document, ...contents: any[]): Cheerio;
after(content: Document[], ...contents: any[]): Cheerio;
after(content: Cheerio, ...contents: any[]): Cheerio;
insertAfter(content: string): Cheerio;
insertAfter(content: Document): Cheerio;
insertAfter(content: Cheerio): Cheerio;
before(content: string, ...contents: any[]): Cheerio;
before(content: Document, ...contents: any[]): Cheerio;
before(content: Document[], ...contents: any[]): Cheerio;
before(content: Cheerio, ...contents: any[]): Cheerio;
insertBefore(content: string): Cheerio;
insertBefore(content: Document): Cheerio;
insertBefore(content: Cheerio): Cheerio;
remove(selector?: string): Cheerio;
replaceWith(content: string): Cheerio;
replaceWith(content: Element): Cheerio;
replaceWith(content: Element[]): Cheerio;
replaceWith(content: Cheerio): Cheerio;
replaceWith(content: () => Cheerio): Cheerio;
empty(): Cheerio;
// Getter form may return null; setter form returns the selection.
html(): string | null;
html(html: string): Cheerio;
text(): string;
text(text: string): Cheerio;
wrap(content: string): Cheerio;
wrap(content: Document): Cheerio;
wrap(content: Cheerio): Cheerio;
css(propertyName: string): string;
css(propertyNames: string[]): string[];
css(propertyName: string, value: string): Cheerio;
css(propertyName: string, value: number): Cheerio;
css(propertyName: string, func: (index: number, value: string) => string): Cheerio;
css(propertyName: string, func: (index: number, value: string) => number): Cheerio;
// NOTE(review): `Object` is the boxed wrapper type; `object` or a record type is usually what is meant — confirm before changing vendored typings.
css(properties: Object): Cheerio;
// Rendering
// Miscellaneous
clone(): Cheerio;
// Not Documented
toArray(): Element[];
}
// Options accepted by `load` and the `html` helpers; every field is optional.
interface CheerioParserOptions {
// Document References
// Cheerio https://github.com/cheeriojs/cheerio
// HTMLParser2 https://github.com/fb55/htmlparser2/wiki/Parser-options
// DomHandler https://github.com/fb55/DomHandler
xmlMode?: boolean;
decodeEntities?: boolean;
lowerCaseTags?: boolean;
lowerCaseAttributeNames?: boolean;
recognizeCDATA?: boolean;
recognizeSelfClosing?: boolean;
normalizeWhitespace?: boolean;
withStartIndices?: boolean;
withEndIndices?: boolean;
ignoreWhitespace?: boolean;
_useHtmlParser2?: boolean;
}
// Call signatures of the selector function: a selector, optionally followed by a context and a root.
interface Selector {
(selector: string): Cheerio;
(selector: string, context: string): Cheerio;
(selector: string, context: Element): Cheerio;
(selector: string, context: Element[]): Cheerio;
(selector: string, context: Cheerio): Cheerio;
(selector: string, context: string, root: string): Cheerio;
(selector: string, context: Element, root: string): Cheerio;
(selector: string, context: Element[], root: string): Cheerio;
(selector: string, context: Cheerio, root: string): Cheerio;
// Catch-all overload accepting any value.
(selector: any): Cheerio;
}
// The selector function extended with document-level helpers.
interface Root extends Selector {
// Document References
// Cheerio https://github.com/cheeriojs/cheerio
// JQuery http://api.jquery.com
root(): Cheerio;
contains(container: Element, contained: Element): boolean;
parseHTML(data: string, context?: Document | null, keepScripts?: boolean): Document[];
html(options?: CheerioParserOptions): string;
html(dom: string | Cheerio | Element, options?: CheerioParserOptions): string;
xml(dom?: string | Cheerio | Element): string;
}
// Top-level API object: `Root` plus a `version` string and the `load` overloads, each returning a `Root`.
interface CheerioAPI extends Root {
version: string;
load(html: string | Buffer, options?: CheerioParserOptions): Root;
load(element: Element | Element[], options?: CheerioParserOptions): Root;
}
}
// Ambient declaration of the 'cheerio' module: it exports a single value typed as `cheerio.CheerioAPI`.
// NOTE(review): because this is an `export =` module, a default import (`import cheerio from 'cheerio'`)
// needs `esModuleInterop` or `allowSyntheticDefaultImports` — confirm the compiler options.
declare module 'cheerio' {
// The value the module exports.
const cheerioModule: cheerio.CheerioAPI;
// CommonJS-style export assignment.
export = cheerioModule;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment