Last active
May 23, 2023 00:54
-
-
Save gtrabanco/9c89a8f7be106290575ff55a40e405e0 to your computer and use it in GitHub Desktop.
Idea to parse document
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { genericMapToType } from './generic-map-to-type.ts';
/** Signature of a setter that receives a property name and an optional value. */
export type ScraperSetter = (property: string, value?: any) => void;

// `Exclude<any, Function>` resolves to `any`; the aliases only document that
// keys and values are not meant to be functions.
type Key = Exclude<any, Function>;
type Value = Exclude<any, Function>;

/** A CSS selector paired with the HTMLRewriter content handlers to run on it. */
export type ScrapeHandler = {
  selector: string;
  handler: HTMLRewriterTypes.HTMLRewriterElementContentHandlers;
};

/** Factory that receives the storage to write into and returns one handler. */
export type ScrapeHandleElement = (setter: Map<Key, Value>) => ScrapeHandler;

/** Factory that receives the storage to write into and returns one or many handlers. */
export type ScrapeHandleMultipleElements = (
  setter: Map<Key, Map<Key, Value>>
) => Array<ScrapeHandler> | ScrapeHandler;
/**
 * Map with a "functional update" flavoured `set`: when the value passed to
 * `set` is a function, it is called with the value currently stored under the
 * key and its return value is what gets stored.
 */
export class KVScrapeStorage extends Map<Key, Value> {
  constructor() {
    super();
  }

  /**
   * Stores `value` under `key`.
   *
   * @param key Any key accepted by `Map`.
   * @param value Either the value to store, or an updater called with the
   *   previous value (`undefined` when the key is absent or holds `null`).
   * @returns This storage, so calls can be chained.
   */
  set(key: Key, value: Value | ((prev?: Value) => Value)): this {
    // `null` is normalised to `undefined` so updaters can rely on `prev ?? default`.
    const current = this.get(key) ?? undefined;
    const newValue = value instanceof Function ? value(current) : value;
    return super.set(key, newValue);
  }

  /**
   * Reads the value stored under `key`.
   *
   * Handlers in this file store plain values (numbers, strings) under both
   * string and numeric keys, so the signature accepts any key and returns the
   * stored value as-is.
   */
  get(key: Key): Value | undefined {
    return super.get(key);
  }
}
// Next class is for internal use in HTMLRewriterScrapeDocument
/**
 * Storage keyed by alias. `set` accepts either a value or an updater function
 * that receives the value currently stored under the key.
 */
class DocumentKVStorage extends Map<Key, Value> {
  constructor() {
    super();
  }

  /**
   * Stores `value` under `key`.
   *
   * @param key Key to write.
   * @param value Either the value to store, or an updater called with the
   *   previous value (`undefined` when the key is absent or holds `null`).
   * @returns This storage, so calls can be chained.
   */
  set(key: Key, value: Value | ((prev?: Value) => Value)): this {
    // `null` is normalised to `undefined` before it reaches the updater.
    const current = this.get(key) ?? undefined;
    const newValue = value instanceof Function ? value(current) : value;
    return super.set(key, newValue);
  }

  /** Returns the entry stored under `key`, or `undefined` when absent. */
  get(key: string): Map<Key, Value> | undefined {
    return super.get(key);
  }
}
/** Converts the storage collected for one alias into the value exposed by `scrape`. */
type MaptToTypeHandler = (result: Map<Key, Value>) => any;

/**
 * HTMLRewriter subclass that registers scrape handlers under an alias and
 * exposes what they collected as a plain object keyed by alias.
 */
export class HTMLRewriterScrapeDocument extends HTMLRewriter {
  // One KVScrapeStorage per alias.
  private kv = new DocumentKVStorage();
  // Optional per-alias converter applied by `scrape`.
  private mapToType: {
    [key: string]: typeof genericMapToType;
  } = {};

  constructor() {
    super();
  }

  /**
   * Registers the handlers produced by `handlers` and binds them to a fresh
   * storage kept under `alias`.
   *
   * @param alias Key under which the collected data appears in the `scrape` result.
   *   Calling this again with the same alias replaces the previous storage.
   * @param handlers A handler factory, or an array of handler factories. Each
   *   factory receives the alias storage; a single factory may return one
   *   handler or an array of handlers.
   * @param mapToType Optional converter applied to the alias storage in `scrape`.
   * @returns This instance, so calls can be chained.
   */
  public addHandlers(
    alias: string,
    handlers:
      | Array<ScrapeHandleElement>
      | ScrapeHandleElement
      | ScrapeHandleMultipleElements,
    mapToType?: MaptToTypeHandler
  ) {
    this.kv.set(alias, new KVScrapeStorage());
    const setter = this.kv.get(alias)!;

    if (mapToType) {
      this.mapToType[alias] = mapToType;
    }

    const register = ({ selector, handler }: ScrapeHandler) => {
      super.on(selector, handler);
    };

    if (Array.isArray(handlers)) {
      for (const handle of handlers) {
        register(handle(setter));
      }
    } else if (typeof handlers === 'function') {
      const produced = handlers(setter);
      if (Array.isArray(produced)) {
        for (const entry of produced) {
          register(entry);
        }
      } else {
        register(produced);
      }
    }

    return this;
  }

  /**
   * Runs the registered handlers over `response` and returns the collected
   * data keyed by alias, passing each storage through its converter when one
   * was registered.
   *
   * NOTE(review): the Response returned by `transform` is discarded. On
   * runtimes where HTMLRewriter processes the body lazily, the handlers may
   * not have run by the time this method returns - confirm against the target
   * runtime.
   */
  scrape(response: Response) {
    super.transform(response);

    const result: { [key: string]: any } = {};
    for (const [alias, storage] of this.kv) {
      if (!storage) continue;
      // Own-property check so aliases such as 'constructor' or 'toString' do
      // not pick up members inherited from Object.prototype as converters.
      const hasMapper = Object.prototype.hasOwnProperty.call(
        this.mapToType,
        alias
      );
      result[alias] = hasMapper ? this.mapToType[alias](storage) : storage;
    }
    return result;
  }
}
// HOW TO PARSE A PAGE
// Selectors
const currentSelectedFederationIdSelect = '#territorial > option[selected]';
const currentPageFederationHeaderAElementSelector =
  '#cabecera > div:nth-child(2) > a:nth-child(1)';
const currentPageFederationImgElementSelector =
  '#cabecera > div:nth-child(2) > a:nth-child(1) > img';

// Getting the info from those selectors in the page
/**
 * Builds the handlers that collect information about the currently selected
 * federation into `kv`.
 *
 * Keys written: `rfebmId` (number, `-1` when the `value` attribute is missing,
 * blank or not numeric), `name`, `url` and `shieldUrl`.
 *
 * @param kv Storage whose `set` accepts either a value or an updater function.
 * @returns Three selector/handler pairs.
 */
export const currentFederationInfo = (kv: KVScrapeStorage) => [
  {
    // Federation id and name
    selector: currentSelectedFederationIdSelect,
    handler: {
      element: (element: HTMLRewriterTypes.Element) => {
        const key = 'rfebmId';
        const raw = element.getAttribute('value');
        // `Number(null)` is 0 and `Number('abc')` is NaN, neither of which is
        // nullish, so the fallback has to be decided explicitly.
        const parsed = raw == null || raw.trim() === '' ? NaN : Number(raw);
        const value = Number.isFinite(parsed) ? parsed : -1;
        kv.set(key, value);
      },
      text: ({ text }: HTMLRewriterTypes.Text) => {
        const key = 'name';
        // Each chunk is trimmed on its own and appended to what was collected so far.
        const value = text.trim();
        kv.set(key, (prev?: string) => (prev ?? '') + value);
      },
    },
  },
  {
    // Url to the federation website
    selector: currentPageFederationHeaderAElementSelector,
    handler: {
      element: (element: HTMLRewriterTypes.Element) => {
        const key = 'url';
        const urlString = element.getAttribute('href');
        kv.set(key, urlString);
      },
    },
  },
  {
    // Url to the federation shield
    selector: currentPageFederationImgElementSelector,
    handler: {
      element: (element: HTMLRewriterTypes.Element) => {
        const key = 'shieldUrl';
        const imgUrl = element.getAttribute('src');
        kv.set(key, imgUrl);
      },
    },
  },
];
// Parse <select> of all federations
const federationsSelector = '#territorial > option';

/**
 * Builds the handler that maps every federation `<option>` to its name,
 * keyed by the numeric `value` attribute of the option.
 *
 * Options whose `value` attribute is missing, blank or not numeric are stored
 * under the key `-1`. Text chunks that are empty after trimming are skipped.
 *
 * @param kv Storage receiving `id -> name` entries.
 * @returns A single-element array with the selector/handler pair.
 */
function allFederationsInfo(kv: KVScrapeStorage) {
  // Id of the option whose text is currently being received.
  let currentNumber = -1;

  return [
    {
      selector: federationsSelector,
      handler: {
        element: (element: HTMLRewriterTypes.Element) => {
          const raw = element.getAttribute('value');
          // `Number(null)` is 0 and `Number('abc')` is NaN, neither of which
          // is nullish, so the fallback has to be decided explicitly.
          const parsed = raw == null || raw.trim() === '' ? NaN : Number(raw);
          currentNumber = Number.isFinite(parsed) ? parsed : -1;
        },
        text: ({ text }: HTMLRewriterTypes.Text) => {
          const name = text.trim();
          if (name.length > 0) {
            kv.set(currentNumber, name);
          }
        },
      },
    },
  ];
}
// How to use all above:
// const rw = new HTMLRewriterScrapeDocument();
// rw.addHandlers('currentFederation', currentFederationInfo);
// rw.addHandlers('allFederations', allFederationsInfo);
// const response = await fetch(
//   'https://www.rfebm.com/competiciones/competicion.php'
// );
// const result = rw.scrape(response);
// console.log(result);
To parse a table, I wrote this function:
import { KVScrapeStorage } from 'libraries/html-rewriter-scrape-document.ts';
// A single attribute as a [key, value] pair.
type HTMLTagAttributeResult = [string, string]; // [key, value]
// Declared shape for the data collected for one tag: attribute pairs indexed
// by string key, plus the optional text content of the tag.
type HTMLTagResult = {
[key: string]: Array<HTMLTagAttributeResult>;
} & {
text?: string;
};
// Declared shape of a parsed table: row number -> (key -> tag data).
// NOTE(review): tableParserAsJSON in this file stores each row as a plain
// object of the form { [cell]: { [tagName]: [{ attrsEntries, text? }] } },
// which does not match this declaration - confirm which one is intended.
export type HTMLTableRowResult = Map<number, Map<string, HTMLTagResult>>;
// The selector must end with ' *' so that inner elements are parsed too:
//   - table with a tbody:    'table > tbody > *'
//   - table without a tbody: 'table > *'
// Whatever the selector is, it has to match tr, td and their sub-elements.
/**
 * Creates a handler factory that records, per table row, every element seen
 * and the text that follows it.
 *
 * Data is written to the storage under the row number (1 for the first `tr`).
 * Each row is a plain object indexed by cell number; each cell is indexed by
 * tag name and holds a list of `{ attrsEntries, text? }` records. The `tr`
 * element itself is recorded under cell 0, so the first `td` of a row is cell 1.
 *
 * @param tableRowSelector Selector passed through unchanged to the handler.
 * @returns A factory taking the storage and returning `{ selector, handler }`.
 */
export function tableParserAsJSON(tableRowSelector: string) {
  // Cursor shared by both handlers: where the next record or text chunk goes.
  let rowIndex = 0;
  let cellIndex = -1;
  let openTag = '';

  // Returns the record list for the current cell/tag, creating it when missing.
  const recordsAtCursor = (row: any): any[] => {
    const cell = (row[cellIndex] ??= {});
    return (cell[openTag] ??= []);
  };

  return (kvScrapeStorage: KVScrapeStorage) => {
    const onElement = (element: HTMLRewriterTypes.Element) => {
      openTag = element.tagName;
      switch (element.tagName) {
        case 'tr':
          // A new row starts: the row element itself occupies cell 0.
          rowIndex += 1;
          cellIndex = 0;
          break;
        case 'td':
          cellIndex += 1;
          break;
      }

      kvScrapeStorage.set(rowIndex, (prev: any) => {
        const row = prev ?? {};
        const records = recordsAtCursor(row);
        const attrsEntries = Object.entries(
          Object.fromEntries(element.attributes)
        );
        records.push({ attrsEntries });
        return row;
      });
    };

    const onText = ({ text }: HTMLRewriterTypes.Text) => {
      // Text received while the cursor is on the row element is not recorded.
      if (openTag === 'tr') return;

      kvScrapeStorage.set(rowIndex, (prev?: any) => {
        const row = prev ?? {};
        const records = recordsAtCursor(row);
        // Text is appended to the most recent record of the current tag.
        const latest = records[records.length - 1];
        latest['text'] ??= '';
        latest['text'] += text;
        return row;
      });
    };

    return {
      selector: tableRowSelector,
      handler: { element: onElement, text: onText },
    };
  };
}
/**
 * Creates a converter that turns the parsed table storage into an array of rows.
 *
 * Without `cellNames` each row becomes the array of its cell values. With
 * `cellNames` each row becomes an object whose keys are taken positionally
 * from `cellNames`; cells beyond the end of the list keep their numeric
 * position as key.
 *
 * @param cellNames Optional names applied to the cells by position.
 * @returns A function mapping the table storage to an array of rows.
 */
export function genericTableMapToTypedTable(cellNames?: Array<string>) {
  return function (table: HTMLTableRowResult) {
    return Array.from(table.values(), (row) => {
      const cells = Object.values(row);
      if (cellNames) {
        const namedEntries = [];
        for (let position = 0; position < cells.length; position += 1) {
          // Fall back to the position when no name exists for it.
          const label = cellNames?.[position] ?? position;
          namedEntries.push([label, cells[position]]);
        }
        return Object.fromEntries(namedEntries);
      }
      return cells;
    });
  };
}
Parsing a table example with output:
import {
genericTableMapToTypedTable,
tableParserAsJSON,
} from './src/scrape-pages/handlers/generic/table-as-json.ts';
import { HTMLRewriterScrapeDocument } from './src/libraries/html-rewriter-scrape-document.ts';
// Example: scrape the results table of the competition page.
const rw = new HTMLRewriterScrapeDocument();
const response = await fetch(
  'https://www.rfebm.com/competiciones/competicion.php'
);

// Matches every element inside the table body.
const tableRow = 'body > div.table-responsive > table > tbody *';

// Names applied to the parsed cells by position.
const tableColumnNames = [
  'local',
  'visitor',
  'teams',
  'result',
  'datetime',
  'court',
  'tv',
  'previous',
  'streaming',
  'live',
  'livedata',
  'stats',
  'status',
  'oficial_report',
];

const parseTable = tableParserAsJSON(tableRow);
const toNamedRows = genericTableMapToTypedTable(tableColumnNames);
rw.addHandlers('table', parseTable, toNamedRows);

// 'table' is the alias given in addHandlers
const { table: result } = rw.scrape(response);
console.log({ result });
Output example when parsing a table
{
result: [
{
"14": {
td: [
{
attrsEntries: [
[ "class", "negrita p-t-20" ]
]
}
],
a: [
{
attrsEntries: [
[ "href", "#" ], [ "onclick", "mostrarActaFormal(1291431)" ]
]
}
],
span: [
{
attrsEntries: [
[ "class", "label label-info" ]
],
text: "VER ACTA"
}
]
},
local: {
tr: [
{
attrsEntries: []
}
]
},
visitor: {
td: [
{
attrsEntries: [
[ "class", "celda_peque p-t-15" ]
]
}
],
a: [
{
attrsEntries: [
[ "href", "equipo.php?seleccion=0&id_equipo=209083&id=1017101" ]
]
}
],
img: [
{
attrsEntries: [
[ "src", "http://balonmano.isquad.es/images/afiliacion_clubs/857/square_653864366c64616b3361.jpg"
], [ "class", "escudo_tabla" ]
]
}
]
},
teams: {
td: [
{
attrsEntries: [
[ "class", "celda_peque p-t-15" ]
]
}
],
a: [
{
attrsEntries: [
[ "href", "equipo.php?seleccion=0&id_equipo=209035&id=1017101" ]
]
}
],
img: [
{
attrsEntries: [
[ "src", "http://balonmano.isquad.es/images/afiliacion_clubs/305/square_74396b6a72766a626963.jpg"
], [ "class", "escudo_tabla" ]
]
}
],
span: [
{
attrsEntries: [
[ "class", "cuchufleta" ], [ "style", "display: none;" ]
]
}
],
pre: [
{
attrsEntries: []
}
]
},
result: {
td: [
{
attrsEntries: [
[ "class", "p-t-20" ]
]
}
],
a: [
{
attrsEntries: [
[ "class", "" ], [ "href", "equipo.php?seleccion=0&id_equipo=209083&id=1017101"
]
],
text: "Prado Marianistas - "
}, {
attrsEntries: [
[ "class", "negrita" ], [ "href", "equipo.php?seleccion=0&id_equipo=209035&id=1017101"
]
],
text: "BM SANSE"
}
]
},
datetime: {
td: [
{
attrsEntries: [
[ "class", "centrado p-t-20" ]
]
}
],
span: [
{
attrsEntries: [
[ "class", "" ]
],
text: "19 -"
}, {
attrsEntries: [
[ "class", "negrita" ]
],
text: " 27"
}
]
},
court: {
td: [
{
attrsEntries: []
}
],
div: [
{
attrsEntries: [
[ "class", "negrita" ]
],
text: "21/05/2023"
}, {
attrsEntries: [],
text: "10:30 "
}
]
},
tv: {
td: [
{
attrsEntries: [
[ "class", "negrita p-t-20" ]
]
}
],
a: [
{
attrsEntries: [
[ "href", "https://www.google.com/maps/?q=42.6593719083639,-8.11761953439327"
], [ "target", "_blank" ]
]
}
],
span: [
{
attrsEntries: [
[ "class", "glyphicon glyphicon-map-marker" ], [ "aria-hidden", "true" ]
],
text: " PAVILLÓN LALÍN ARENA "
}
]
},
previous: {
td: [
{
attrsEntries: [
[ "class", "centrado p-t-20" ]
]
}
]
},
streaming: {
td: [
{
attrsEntries: [
[ "class", "p-t-20" ]
]
}
],
a: [
{
attrsEntries: [
[ "href", "#" ], [ "onclick", "mostrarPrevio(1291431)" ]
]
}
],
span: [
{
attrsEntries: [
[ "class", "label label-primary" ]
],
text: "PREVIO"
}
]
},
live: {
td: [
{
attrsEntries: [
[ "class", "p-t-20" ]
]
}
],
a: [
{
attrsEntries: [
[ "href", "https://youtube.com/live/4gLWI_3m3y4?feature=share" ], [ "target",
"_blank" ]
]
}
],
span: [
{
attrsEntries: [
[ "class", "label label-success" ]
],
text: "STREAMING"
}
]
},
livedata: {
td: [
{
attrsEntries: [
[ "class", "negrita p-t-20" ]
]
}
],
a: [
{
attrsEntries: [
[ "href", "#" ], [ "onclick", "mostrarActa(1291431)" ]
]
}
],
span: [
{
attrsEntries: [
[ "class", "label label-info" ]
],
text: "DIRECTO"
}
]
},
stats: {
td: [
{
attrsEntries: [
[ "class", "negrita p-t-20" ]
]
}
],
a: [
{
attrsEntries: [
[ "href", "#" ], [ "onclick", "window.open(\"//balonmano.misquad.es/estadisticas_pdc_pdf.php?id=1291431&token=4d3f95997750adbee8fadd82a7272efb&sec=0471022ebf9a7f52719c208c4da47557\");"
]
]
}
],
span: [
{
attrsEntries: [
[ "class", "label label-info" ]
],
text: "DATOS LIVE"
}
]
},
status: {
td: [
{
attrsEntries: [
[ "class", "negrita p-t-20" ]
],
text: " "
}
],
a: [
{
attrsEntries: [
[ "href", "#" ], [ "onclick", "window.open(\"//balonmano.misquad.es/estadisticas_pdc.php?id=1291431&id_equipo=209083&token=4d3f95997750adbee8fadd82a7272efb&sec=0471022ebf9a7f52719c208c4da47557\");"
]
]
}, {
attrsEntries: [
[ "href", "#" ], [ "onclick", "window.open(\"//balonmano.misquad.es/estadisticas_pdc.php?id=1291431&id_equipo=209035&token=4d3f95997750adbee8fadd82a7272efb&sec=0471022ebf9a7f52719c208c4da47557\");"
]
]
}
],
span: [
{
attrsEntries: [
[ "class", "label label-info" ]
],
text: "LOCAL"
}, {
attrsEntries: [
[ "class", "label label-info" ], [ "style", "margin-left: 5px;" ]
],
text: "VISIT."
}
]
},
oficial_report: {
td: [
{
attrsEntries: [
[ "class", "p-t-20" ]
]
}
],
span: [
{
attrsEntries: [
[ "class", "label label-success" ]
],
text: "FINALIZADO"
}
]
}
}, {
"14": {
td: [
{
attrsEntries: [
[ "class", "negrita p-t-20" ]
]
}
],
a: [
{
attrsEntries: [
[ "href", "#" ], [ "onclick", "mostrarActaFormal(1291432)" ]
]
}
],
span: [
{
attrsEntries: [
[ "class", "label label-info" ]
],
text: "VER ACTA"
}
]
},
local: {
tr: [
{
attrsEntries: []
}
]
},
visitor: {
td: [
{
attrsEntries: [
[ "class", "celda_peque p-t-15" ]
]
}
],
a: [
{
attrsEntries: [
[ "href", "equipo.php?seleccion=0&id_equipo=205087&id=1017101" ]
]
}
],
img: [
{
attrsEntries: [
[ "src", "http://balonmano.isquad.es/images/afiliacion_clubs/269/square_6e323338383761617937.jpg"
], [ "class", "escudo_tabla" ]
]
}
]
},
teams: {
td: [
{
attrsEntries: [
[ "class", "celda_peque p-t-15" ]
]
}
],
a: [
{
attrsEntries: [
[ "href", "equipo.php?seleccion=0&id_equipo=203695&id=1017101" ]
]
}
],
img: [
{
attrsEntries: [
[ "src", "http://balonmano.isquad.es/images/afiliacion_clubs/380/square_39753176353763756539.jpg"
], [ "class", "escudo_tabla" ]
]
}
],
span: [
{
attrsEntries: [
[ "class", "cuchufleta" ], [ "style", "display: none;" ]
]
}
],
pre: [
{
attrsEntries: []
}
]
},
result: {
td: [
{
attrsEntries: [
[ "class", "p-t-20" ]
]
}
],
a: [
{
attrsEntries: [
[ "class", "negrita" ], [ "href", "equipo.php?seleccion=0&id_equipo=205087&id=1017101"
]
],
text: "VAYRO LALÍN - "
}, {
attrsEntries: [
[ "class", "" ], [ "href", "equipo.php?seleccion=0&id_equipo=203695&id=1017101"
]
],
text: "C.BM. MISLATA"
}
]
},
datetime: {
td: [
{
attrsEntries: [
[ "class", "centrado p-t-20" ]
]
}
],
span: [
{
attrsEntries: [
[ "class", "negrita" ]
],
text: "30 -"
}, {
attrsEntries: [
[ "class", "" ]
],
text: " 25"
}
]
},
court: {
td: [
{
attrsEntries: []
}
],
div: [
{
attrsEntries: [
[ "class", "negrita" ]
],
text: "21/05/2023"
}, {
attrsEntries: [],
text: "12:30 "
}
]
},
tv: {
td: [
{
attrsEntries: [
[ "class", "negrita p-t-20" ]
]
}
],
a: [
{
attrsEntries: [
[ "href", "https://www.google.com/maps/?q=42.6593719083639,-8.11761953439327"
], [ "target", "_blank" ]
]
}
],
span: [
{
attrsEntries: [
[ "class", "glyphicon glyphicon-map-marker" ], [ "aria-hidden", "true" ]
],
text: " PAVILLÓN LALÍN ARENA "
}
]
},
previous: {
td: [
{
attrsEntries: [
[ "class", "centrado p-t-20" ]
]
}
]
},
streaming: {
td: [
{
attrsEntries: [
[ "class", "p-t-20" ]
]
}
],
a: [
{
attrsEntries: [
[ "href", "#" ], [ "onclick", "mostrarPrevio(1291432)" ]
]
}
],
span: [
{
attrsEntries: [
[ "class", "label label-primary" ]
],
text: "PREVIO"
}
]
},
live: {
td: [
{
attrsEntries: [
[ "class", "p-t-20" ]
]
}
],
a: [
{
attrsEntries: [
[ "href", "https://youtube.com/live/kXLfsx98PNc?feature=share" ], [ "target",
"_blank" ]
]
}
],
span: [
{
attrsEntries: [
[ "class", "label label-success" ]
],
text: "STREAMING"
}
]
},
livedata: {
td: [
{
attrsEntries: [
[ "class", "negrita p-t-20" ]
]
}
],
a: [
{
attrsEntries: [
[ "href", "#" ], [ "onclick", "mostrarActa(1291432)" ]
]
}
],
span: [
{
attrsEntries: [
[ "class", "label label-info" ]
],
text: "DIRECTO"
}
]
},
stats: {
td: [
{
attrsEntries: [
[ "class", "negrita p-t-20" ]
]
}
],
a: [
{
attrsEntries: [
[ "href", "#" ], [ "onclick", "window.open(\"//balonmano.misquad.es/estadisticas_pdc_pdf.php?id=1291432&token=6561e8bd54145760a7296abc49fd2d9d&sec=0471022ebf9a7f52719c208c4da47557\");"
]
]
}
],
span: [
{
attrsEntries: [
[ "class", "label label-info" ]
],
text: "DATOS LIVE"
}
]
},
status: {
td: [
{
attrsEntries: [
[ "class", "negrita p-t-20" ]
],
text: " "
}
],
a: [
{
attrsEntries: [
[ "href", "#" ], [ "onclick", "window.open(\"//balonmano.misquad.es/estadisticas_pdc.php?id=1291432&id_equipo=205087&token=6561e8bd54145760a7296abc49fd2d9d&sec=0471022ebf9a7f52719c208c4da47557\");"
]
]
}, {
attrsEntries: [
[ "href", "#" ], [ "onclick", "window.open(\"//balonmano.misquad.es/estadisticas_pdc.php?id=1291432&id_equipo=203695&token=6561e8bd54145760a7296abc49fd2d9d&sec=0471022ebf9a7f52719c208c4da47557\");"
]
]
}
],
span: [
{
attrsEntries: [
[ "class", "label label-info" ]
],
text: "LOCAL"
}, {
attrsEntries: [
[ "class", "label label-info" ], [ "style", "margin-left: 5px;" ]
],
text: "VISIT."
}
]
},
oficial_report: {
td: [
{
attrsEntries: [
[ "class", "p-t-20" ]
]
}
],
span: [
{
attrsEntries: [
[ "class", "label label-success" ]
],
text: "FINALIZADO"
}
]
}
}
]
}
You can add multiple handlers, and the result will provide all the data under the aliases you gave them.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Expected output