Created
December 3, 2023 20:39
-
-
Save HelgeSverre/20b309c8ed93a43961f75e94a1a33d34 to your computer and use it in GitHub Desktop.
Experimental approach to selector finding using AI
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
namespace App\Scraping; | |
use App\HtmlCompressor; | |
use Closure; | |
use OpenAI; | |
use Spatie\Fork\Fork; | |
/**
 * Experimental selector discovery: asks an OpenAI chat model to propose CSS
 * selectors for a fixed set of product fields, one request per HTML chunk.
 */
class SelectorFinder
{
    /**
     * Maximum number of characters of compressed HTML embedded in one prompt
     * (the original author's estimate is roughly 3000 tokens per chunk).
     */
    private const CHUNK_LENGTH = 8000;

    /** Placeholder in the prompt template that is swapped for the HTML chunk. */
    private const CHUNK_PLACEHOLDER = '[HTML_CHUNK]';

    /**
     * Find candidate CSS selectors for the product fields in the given HTML.
     *
     * The HTML is compressed, cut into chunks, and every chunk is sent to the
     * chat model in its own request. Requests are executed through Fork so
     * they run in parallel.
     *
     * @param string $html Raw HTML of the page to analyse.
     *
     * @return \Illuminate\Support\Collection One entry per chunk, in chunk order: the decoded
     *                                        JSON reply, or null when the reply could not be
     *                                        read/decoded. Empty when there is nothing to analyse.
     */
    public static function find($html)
    {
        // Removes elements and attributes that usually don't contain useful info for scraping (style, script tags etc).
        $compressed = HtmlCompressor::compress($html);

        // Split on characters, not bytes: a byte-wise split can cut a multi-byte
        // UTF-8 character in half and hand the API client a malformed string.
        $chunks = mb_str_split($compressed, self::CHUNK_LENGTH, 'UTF-8');

        // Nothing to analyse, so skip forking and calling the API altogether.
        if ($chunks === []) {
            return collect();
        }

        /** @var Closure[] $tasks */
        $tasks = [];
        foreach ($chunks as $chunk) {
            $prompt = self::buildPrompt($chunk);
            $tasks[] = static fn () => self::requestSelectors($prompt);
        }

        // Run all requests in parallel (it's faster, but not necessary).
        $results = Fork::new()->run(...$tasks);

        // Re-establish chunk order by key and reindex from zero.
        return collect($results)->sortKeys()->values();
    }

    /**
     * Build the user prompt for a single HTML chunk.
     *
     * The example structure is deliberately valid JSON (quoted keys, no
     * trailing comma) because the reply is parsed with json_decode().
     */
    private static function buildPrompt(string $chunk): string
    {
        return str_replace(
            search: self::CHUNK_PLACEHOLDER,
            replace: $chunk,
            subject: <<<PROMPT
            If the following html chunk contains any of the following data, make me a CSS selector that will select the appropriate value, leave it null if not.
            Fields:
            product_category_tree
            product_name
            product_brand
            product_image
            product_sku
            product_price
            Example structure
            {
            "field_1": {"selector": ".breadcrumb-items a", "attr": "innerText", "list": true},
            "field_2": {"selector": ".product-details > .product-title", "attr": "innerText", "list": false}
            }
            HTML chunk:
            [HTML_CHUNK]
            Output as JSON:
            PROMPT
        );
    }

    /**
     * Send one prompt to OpenAI and decode the selectors it returns.
     *
     * @return mixed Decoded JSON (associative arrays), or null when the reply
     *               cannot be read or is not valid JSON.
     */
    private static function requestSelectors(string $prompt)
    {
        // NOTE(review): placeholder key kept from the original; load the real
        // key from configuration/environment instead of hard-coding it.
        $response = OpenAI::client('sk-your-key-here')
            ->chat()
            ->create([
                'model' => 'gpt-3.5-turbo',
                'temperature' => 0.4,
                'messages' => [
                    ['role' => 'system', 'content' => 'You are an AI that figures out the correct CSS selectors to use for extracting the desired data from the html, you only provide selectors json, not the data itself.'],
                    ['role' => 'user', 'content' => $prompt],
                ],
            ]);

        // Best effort: a missing choice/message yields null without being reported.
        return rescue(
            static fn () => json_decode($response->choices[0]->message->content, true),
            report: false
        );
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment