Skip to content

Instantly share code, notes, and snippets.

@kidager
Created June 26, 2018 13:01
Show Gist options
  • Save kidager/34a4c2dfac116e88b2634bbc7eb3c796 to your computer and use it in GitHub Desktop.
Save kidager/34a4c2dfac116e88b2634bbc7eb3c796 to your computer and use it in GitHub Desktop.
scrap scrap
<?php
// Some Artisan stuff
Artisan::command('do:shit', function () {
// HTTP client rooted at seloger.com. The default headers carry a desktop
// Chrome User-Agent plus the usual browser Accept/Referer/cache headers, so
// every request made through this client is sent with them.
$guzzleClient = new GuzzleHttp\Client([
    'base_uri' => 'https://www.seloger.com',
    'headers' => [
        'Connection' => 'keep-alive',
        'Pragma' => 'no-cache',
        'Cache-Control' => 'no-cache',
        'Upgrade-Insecure-Requests' => '1',
        'DNT' => '1',
        'User-Agent' => 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
        'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Referer' => 'https://www.seloger.com',
        'Accept-Language' => 'fr,en;q=0.8,en;q=0.7',
    ],
    // 'proxy' => [ // Disable proxy for DIVA
    //     'http' => '',
    //     'https' => '',
    //     'no' => [],
    // ],
]);

// Fetch one page of the listing search (path is relative to base_uri).
$response = $guzzleClient->get(
    // 'list.htm?tri=initial&idtypebien=2,1&div=2238&idtt=1&naturebien=1,2,4&LISTING-LISTpg=2'
    'list.htm?tri=initial&idtypebien=2,1&div=2238&idtt=2,5&naturebien=1,2,4&LISTING-LISTpg=1'
);
$html = (string) $response->getBody();

// Collect libxml parse errors internally instead of emitting PHP warnings
// while the HTML is parsed. The previous setting is remembered so it can be
// put back afterwards: the flag is process-wide, and leaving it switched on
// (with a growing error buffer) would affect every later libxml user.
$internalErrors = libxml_use_internal_errors(true);
$dom = new DOMDocument('1.0', 'UTF-8');
try {
    // Non-ASCII characters are turned into HTML entities before parsing so
    // loadHTML() does not have to guess the encoding.
    // NOTE(review): the 'HTML-ENTITIES' target is deprecated as of PHP 8.2;
    // revisit when upgrading.
    $dom->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
} finally {
    // Drop the buffered parse errors and restore the caller's libxml mode,
    // even if loadHTML() throws.
    libxml_clear_errors();
    libxml_use_internal_errors($internalErrors);
}
// Guarded declaration: a named function declared inside the command closure
// is (re)declared every time the closure runs, so a second invocation in the
// same PHP process would otherwise die with "Cannot redeclare toDOM()".
if (!function_exists('toDOM')) {
    /**
     * Returns a DOMDocument for the given node.
     *
     * A DOMDocument is returned as-is (same instance). Any other node is
     * deep-copied into a brand-new, otherwise empty UTF-8 document, where
     * the copy becomes the only top-level child.
     *
     * @param DOMNode $element Document or node to wrap.
     *
     * @return DOMDocument The original document, or a new one holding a copy of $element.
     */
    function toDOM(DOMNode $element): DOMDocument
    {
        if ($element instanceof DOMDocument) {
            return $element;
        }

        $newDOM = new DOMDocument('1.0', 'UTF-8');
        // importNode(…, true) clones the node with its whole subtree into
        // the new document; the source document is left untouched.
        $newDOM->appendChild($newDOM->importNode($element, true));

        return $newDOM;
    }
}
// Guarded declaration: without it, running the enclosing command closure a
// second time in the same PHP process fails with "Cannot redeclare toHtml()".
if (!function_exists('toHtml')) {
    /**
     * Serialises a document or node to an HTML string.
     *
     * The argument is first passed through toDOM(), then the resulting
     * document is dumped with DOMDocument::saveHTML().
     *
     * @param DOMNode $element Document or node to serialise.
     *
     * @return string|false Whatever saveHTML() returns for the document.
     */
    function toHtml($element)
    {
        $document = toDOM($element);

        return $document->saveHTML();
    }
}
// Guarded declaration: without it, running the enclosing command closure a
// second time in the same PHP process fails with
// "Cannot redeclare getSubElements()".
if (!function_exists('getSubElements')) {
    /**
     * Runs an XPath query against a document or node and returns the matches
     * as a collection.
     *
     * The argument goes through toDOM() first, so a non-document node is
     * evaluated inside its own standalone copy: absolute expressions such as
     * "//div" only see that node's subtree, and the nodes returned belong to
     * the copy, not to the document the node originally came from.
     *
     * @param DOMNode $element Document or node to search in.
     * @param string  $query   XPath expression.
     *
     * @return \Illuminate\Support\Collection Matching nodes in query order; empty when
     *                                        nothing matches or the expression is rejected.
     */
    function getSubElements($element, $query)
    {
        $xpath = new DOMXPath(toDOM($element));
        $matches = $xpath->query($query);

        // DOMXPath::query() hands back false (not an empty list) when the
        // expression cannot be evaluated; treat that as "no results" rather
        // than iterating a boolean.
        if ($matches === false) {
            return collect([]);
        }

        $nodes = [];
        foreach ($matches as $match) {
            $nodes[] = $match;
        }

        return collect($nodes);
    }
}
// Every element whose id starts with "annonce-" is treated as one listing.
$arr = getSubElements($dom, '//*[starts-with(@id, "annonce-")]');

// Reads the "url" key from the JSON held in a node's data-lazy attribute.
// Yields null when the node is missing, has no such attribute, or the
// attribute is not valid JSON.
$lazyUrl = function ($node) {
    return data_get(json_decode((string) optional($node)->getAttribute('data-lazy')), 'url');
};

// Debug output: map every listing to a plain array, then dump one picked at
// random. Each lookup below is null-safe, so a listing that lacks a piece of
// markup produces null for that field instead of a fatal "call to a member
// function on null".
dd($arr->map(function ($element1) use ($lazyUrl) {
    $link = getSubElements($element1, '//a[@class="link_AB"]')->first();

    // One entry per slide of the listing's picture carousel.
    $pictures = getSubElements($element1, '//div[@class="slideContent"]')
        ->map(function ($element2) use ($lazyUrl) {
            $picture = getSubElements($element2, '//a[@class="link_AB"]//div')->first();

            return [
                'picture' => $lazyUrl($picture),
            ];
        });

    // The <em> criteria are read positionally: 0 => pieces, 1 => chambres,
    // 2 => surface.
    $surfaceInfo = getSubElements($element1, '//div[@class="c-pa-info"]//div[@class="c-pa-criterion"]//em');
    $piecesText = data_get($surfaceInfo->get(0), 'nodeValue');
    $chambresText = data_get($surfaceInfo->get(1), 'nodeValue');
    $surfaceText = data_get($surfaceInfo->get(2), 'nodeValue');

    $city = getSubElements($element1, '//div[@class="c-pa-city"]')->first();
    $pprice = data_get(getSubElements($element1, '//div[@class="c-pa-price"]//span[@class="c-pa-pprice"]')->first(), 'nodeValue');
    $cprice = data_get(getSubElements($element1, '//div[@class="c-pa-price"]//span[@class="c-pa-cprice"]')->first(), 'nodeValue');
    $sprice = data_get(getSubElements($element1, '//div[@class="c-pa-price"]//span[@class="c-pa-sprice"]')->first(), 'nodeValue');
    $phone = getSubElements($element1, '//div[@class="c-pa-actions"]//a[@data-ava-id="telAgence"]')->first();
    $agencyUrl = getSubElements($element1, '//div[@class="c-pa-agency"]//a')->first();
    $agencyInfo = getSubElements($element1, '//div[@class="c-pa-agency"]//a//div')->first();
    // dd(toHtml($element1));

    return [
        'listing_id' => $element1->getAttribute('data-listing-id'),
        'publication_id' => $element1->getAttribute('data-publication-id'),
        'link' => optional($link)->getAttribute('href'),
        'city' => data_get($city, 'nodeValue'),
        // The *_s keys keep the raw text; the sibling key is its numeric prefix.
        'pieces_s' => $piecesText,
        'pieces' => (int) $piecesText,
        'chambres_s' => $chambresText,
        'chambres' => (int) $chambresText,
        'surface_s' => $surfaceText,
        // A decimal comma ("45,5") would stop the float cast at the comma,
        // so it is normalised to a dot first.
        'surface' => (float) str_replace(',', '.', (string) $surfaceText),
        'price' => [
            // Cast before trimming: a missing price is null, and passing
            // null to trim() is deprecated on PHP 8.1+.
            'pprice' => trim((string) $pprice),
            'cprice' => trim((string) $cprice),
            'sprice' => trim((string) $sprice),
        ],
        'pictures' => $pictures,
        'agence' => [
            'name' => optional($agencyInfo)->getAttribute('alt'),
            'url' => optional($agencyUrl)->getAttribute('href'),
            'phone' => optional($phone)->getAttribute('data-tooltip-focus'),
            'picture' => $lazyUrl($agencyInfo),
        ],
    ];
})->shuffle()->first());
// foreach ($images as $image) {
//     $image->setAttribute('src', 'http://example.com/' . $image->getAttribute('src'));
// }
// $html = $dom->saveHTML();
})->describe('Do shit');
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment