Last active
May 23, 2023 00:54
-
-
Save gtrabanco/9c89a8f7be106290575ff55a40e405e0 to your computer and use it in GitHub Desktop.
Idea to parse document
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { genericMapToType } from './generic-map-to-type.ts'; | |
/** Minimal setter signature: records `value` under the name `property`. */
export type ScraperSetter = (property: string, value?: any) => void;

/**
 * Key type used by the scrape storages.
 *
 * NOTE: `Exclude<any, Function>` resolves to plain `any` (a conditional type
 * applied to `any` yields the union of both branches), so functions are NOT
 * rejected at compile time; the alias only documents the intent.
 */
type Key = Exclude<any, Function>;

/** Value type used by the scrape storages; resolves to `any`, see {@link Key}. */
type Value = Exclude<any, Function>;

/**
 * A CSS selector paired with the HTMLRewriter content handlers that should run
 * for the elements matching it.
 */
export type ScrapeHandler = {
  selector: string;
  handler: HTMLRewriterTypes.HTMLRewriterElementContentHandlers;
};

/**
 * Factory for a single handler. It receives the storage of the alias it is
 * registered under and returns the handler that writes into that storage.
 */
export type ScrapeHandleElement = (setter: Map<Key, Value>) => ScrapeHandler;

/**
 * Factory that may produce several handlers at once. It receives the same
 * flat per-alias storage as {@link ScrapeHandleElement}, so the parameter is
 * typed identically (the previous nested-map annotation did not describe the
 * value that is actually passed in).
 */
export type ScrapeHandleMultipleElements = (
  setter: Map<Key, Value>
) => Array<ScrapeHandler> | ScrapeHandler;
/**
 * Map-based key/value storage that scrape handlers write their findings into.
 *
 * Compared with a plain `Map`, `set` also accepts an updater function, which
 * lets a handler derive the new value from the one already stored (for
 * example to append text).
 */
export class KVScrapeStorage extends Map<Key, Value> {
  constructor() {
    super();
  }

  /**
   * Stores a value under `key`.
   *
   * @param key - Entry to write.
   * @param value - Either the value itself, or an updater that receives the
   *   currently stored value (`undefined` when absent) and returns the value
   *   to store.
   * @returns This storage, so calls can be chained.
   */
  set(key: Key, value: Value | ((prev?: Value) => Value)): this {
    // A stored `null` is handed to the updater as `undefined`.
    const current = this.get(key) ?? undefined;
    const next = typeof value === 'function' ? value(current) : value;
    return super.set(key, next);
  }

  /**
   * Reads the value stored under `key`.
   *
   * The signature mirrors `set`: any key is accepted (handlers also store
   * entries under numeric keys) and the result is whatever was stored, not
   * necessarily a `Map`.
   *
   * @param key - Entry to read.
   * @returns The stored value, or `undefined` when the key is absent.
   */
  get(key: Key): Value | undefined {
    return super.get(key);
  }
}
// Next class is for internal use in HTMLRewriterScrapeDocument
/**
 * Registry of per-alias storages: each string alias maps to the key/value map
 * that the handlers registered under that alias write into.
 *
 * The generic arguments state exactly that (string alias -> map) instead of
 * the previous `any` -> `any`, so misuse inside this module is caught by the
 * compiler. Runtime behaviour is unchanged.
 */
class DocumentKVStorage extends Map<string, Map<Key, Value>> {
  constructor() {
    super();
  }

  /**
   * Stores the storage for `key`.
   *
   * @param key - Alias to write.
   * @param value - Either the map itself, or an updater that receives the map
   *   currently registered for the alias (`undefined` when absent) and returns
   *   the map to register.
   * @returns This registry, so calls can be chained.
   */
  set(
    key: string,
    value: Map<Key, Value> | ((prev?: Map<Key, Value>) => Map<Key, Value>)
  ): this {
    const current = this.get(key) ?? undefined;
    const next = typeof value === 'function' ? value(current) : value;
    return super.set(key, next);
  }

  /**
   * @param key - Alias to read.
   * @returns The map registered for the alias, or `undefined` when absent.
   */
  get(key: string): Map<Key, Value> | undefined {
    return super.get(key);
  }
}
/** Converts the raw key/value map collected for one alias into its final shape. */
type MaptToTypeHandler = (result: Map<Key, Value>) => any;

/**
 * HTMLRewriter that, instead of rewriting, collects values from a document.
 *
 * Handlers are registered under an alias; every alias owns one key/value
 * storage that its handlers write into, and `scrape` returns those storages
 * (optionally converted) keyed by alias.
 */
export class HTMLRewriterScrapeDocument extends HTMLRewriter {
  /** Storage of every registered alias. */
  private kv = new DocumentKVStorage();

  /**
   * Optional converter per alias. A `Map` is used so that aliases such as
   * "toString" or "constructor" cannot resolve to `Object.prototype` members,
   * which a plain object lookup would do.
   */
  private mapToType = new Map<string, MaptToTypeHandler>();

  constructor() {
    super();
  }

  /**
   * Registers one or more handler factories under `alias`.
   *
   * @param alias - Property name under which the collected data is returned.
   * @param handlers - A handler factory, an array of factories, or a factory
   *   returning several handlers. Each factory receives the alias' storage.
   * @param mapToType - Optional converter applied to the alias' storage when
   *   the result is built.
   * @returns This instance, so calls can be chained.
   */
  public addHandlers(
    alias: string,
    handlers:
      | Array<ScrapeHandleElement>
      | ScrapeHandleElement
      | ScrapeHandleMultipleElements,
    mapToType?: MaptToTypeHandler
  ) {
    // Reuse the storage when the alias was registered before: replacing it
    // would orphan the map that the earlier handlers still write into.
    const existing = this.kv.get(alias);
    const setter = existing ?? new KVScrapeStorage();
    if (existing === undefined) {
      this.kv.set(alias, setter);
    }

    if (mapToType) {
      this.mapToType.set(alias, mapToType);
    }

    if (Array.isArray(handlers)) {
      for (const handle of handlers) {
        const { selector, handler } = handle(setter);
        super.on(selector, handler);
      }
    } else if (typeof handlers === 'function') {
      const produced = handlers(setter);
      const list = Array.isArray(produced) ? produced : [produced];
      for (const { selector, handler } of list) {
        super.on(selector, handler);
      }
    }

    return this;
  }

  /**
   * Runs the registered handlers over `response` and returns the collected
   * data keyed by alias.
   *
   * NOTE(review): the transformed response is discarded without reading its
   * body. HTMLRewriter implementations may only invoke handlers while that
   * body is being produced/consumed, in which case the storages can still be
   * empty when this method returns - confirm for the target runtime, or use
   * `scrapeAsync`.
   *
   * @param response - Document to scrape.
   * @returns One property per alias: the converter's output when a converter
   *   was registered, otherwise the alias' storage map itself.
   */
  scrape(response: Response) {
    super.transform(response);
    return this.collectResult();
  }

  /**
   * Same as `scrape`, but reads the transformed body to completion before the
   * result is built, so every handler has run by then.
   *
   * @param response - Document to scrape.
   * @returns Promise of the same structure `scrape` returns.
   */
  async scrapeAsync(response: Response): Promise<{ [key: string]: any }> {
    await super.transform(response).text();
    return this.collectResult();
  }

  /** Builds the alias -> data object from the current state of the storages. */
  private collectResult(): { [key: string]: any } {
    const result: { [key: string]: any } = {};
    for (const key of this.kv.keys()) {
      const value = this.kv.get(key);
      if (value) {
        const convert = this.mapToType.get(key);
        result[key] = convert ? convert(value) : value;
      }
    }
    return result;
  }
}
// HOW TO PARSE A PAGE
// Selectors
const currentSelectedFederationIdSelect = '#territorial > option[selected]';
const currentPageFederationHeaderAElementSelector =
  '#cabecera > div:nth-child(2) > a:nth-child(1)';
const currentPageFederationImgElementSelector =
  '#cabecera > div:nth-child(2) > a:nth-child(1) > img';

// Getting the info from those selectors in the page
/**
 * Builds the handlers that extract the currently selected federation.
 *
 * Keys written to `kv`:
 * - `rfebmId`: numeric `value` attribute of the selected option, or `-1` when
 *   the attribute is missing, blank or not a number.
 * - `name`: text of the selected option, trimmed.
 * - `url`: `href` of the header link (`null` when the attribute is absent).
 * - `shieldUrl`: `src` of the header image (`null` when the attribute is absent).
 *
 * @param kv - Storage the handlers write into.
 * @returns The selector/handler pairs to register on the rewriter.
 */
export const currentFederationInfo = (kv: KVScrapeStorage) => {
  // Untrimmed text received so far. Text may arrive in several chunks, so it
  // is accumulated raw and trimmed as a whole; trimming each chunk would drop
  // whitespace that falls on a chunk boundary.
  let rawName = '';

  return [
    {
      // Federation id and name
      selector: currentSelectedFederationIdSelect,
      handler: {
        element: (element: HTMLRewriterTypes.Element) => {
          const raw = element.getAttribute('value');
          // `Number(null)` is 0 and `Number('abc')` is NaN, neither of which
          // is nullish, so a `?? -1` fallback would never apply.
          const parsed = raw == null || raw.trim() === '' ? NaN : Number(raw);
          kv.set('rfebmId', Number.isNaN(parsed) ? -1 : parsed);
        },
        text: ({ text }: HTMLRewriterTypes.Text) => {
          rawName += text;
          kv.set('name', rawName.trim());
        },
      },
    },
    {
      // Url to the federation website
      selector: currentPageFederationHeaderAElementSelector,
      handler: {
        element: (element: HTMLRewriterTypes.Element) => {
          kv.set('url', element.getAttribute('href'));
        },
      },
    },
    {
      // Url to the federation shield
      selector: currentPageFederationImgElementSelector,
      handler: {
        element: (element: HTMLRewriterTypes.Element) => {
          kv.set('shieldUrl', element.getAttribute('src'));
        },
      },
    },
  ];
};
// Parse <select> of all federations
const federationsSelector = '#territorial > option';

/**
 * Builds the handler that lists every federation offered by the select.
 *
 * For each option, `kv` receives an entry whose key is the option's numeric
 * `value` attribute (`-1` when the attribute is missing, blank or not a
 * number) and whose value is the option's trimmed text. Options without any
 * non-blank text produce no entry.
 *
 * @param kv - Storage the handler writes into.
 * @returns The selector/handler pairs to register on the rewriter.
 */
function allFederationsInfo(kv: KVScrapeStorage) {
  // State of the option being processed; relies on the element handler
  // running before the text handler of the same option.
  let currentNumber = -1;
  let currentName = '';

  return [
    {
      selector: federationsSelector,
      handler: {
        element: (element: HTMLRewriterTypes.Element) => {
          const raw = element.getAttribute('value');
          // `Number(null)` is 0 and `Number('abc')` is NaN, neither of which
          // is nullish, so a `?? -1` fallback would never apply.
          const parsed = raw == null || raw.trim() === '' ? NaN : Number(raw);
          currentNumber = Number.isNaN(parsed) ? -1 : parsed;
          currentName = '';
        },
        text: ({ text }: HTMLRewriterTypes.Text) => {
          // Text may arrive in several chunks: append instead of letting a
          // later chunk overwrite the earlier ones, and trim the whole.
          currentName += text;
          const name = currentName.trim();
          if (name.length > 0) {
            kv.set(currentNumber, name);
          }
        },
      },
    },
  ];
}
// How to use all above: | |
// const rw = new HTMLRewriterScrapeDocument(); | |
// rw.addHandlers('currentFederation', currentFederationInfo);
// rw.addHandlers('allFederations', allFederationsInfo); | |
// const response = await fetch( | |
// 'https://www.rfebm.com/competiciones/competicion.php' | |
// ); | |
// const result = rw.scrape(response); | |
// console.log(result); | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
You can add multiple handlers, and the result will contain all the scraped data under the aliases you give.