Last active
May 23, 2023 13:56
-
-
Save Ashivudhi/16f90beb7b7c1f6c43f7791b4174f43c to your computer and use it in GitHub Desktop.
Script for crawling jobs on the NIEIS site
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
namespace App\Console\Commands; | |
use Illuminate\Console\Command; | |
use DOMDocument; | |
use DOMXPath; | |
use Illuminate\Support\Facades\Log; | |
/**
 * Artisan command that downloads the NIEIS job search results page, extracts
 * the title and description of every job post it finds, and writes them to a
 * JSON file.
 */
class CrawlJobs extends Command
{
    /** Console signature: run with `php artisan jobs:crawling`. */
    protected $signature = 'jobs:crawling';

    /** Description shown in the Artisan command list. */
    protected $description = 'Crawl job posts and save as JSON';

    /**
     * Fetch the results page, parse the job posts and save them to `jobs.json`
     * (relative to the current working directory).
     *
     * @return int self::SUCCESS when the file was written, self::FAILURE when
     *             the download, the JSON encoding or the file write failed.
     */
    public function handle()
    {
        // URL of the job search results page
        $url = 'https://nieis.namibiaatwork.gov.na/search-results-jobs';

        $html = $this->fetchHtml($url);
        if ($html === false || $html === '') {
            // DOMDocument::loadHTML() rejects an empty source, and writing an
            // empty result would clobber a previously saved jobs.json.
            $this->error('Failed to fetch job listings from ' . $url);
            return self::FAILURE;
        }

        $jobs = $this->extractJobs($html);

        $json = json_encode($jobs, JSON_PRETTY_PRINT);
        if ($json === false) {
            $this->error('Failed to encode job data as JSON: ' . json_last_error_msg());
            return self::FAILURE;
        }

        $file = 'jobs.json';
        if (file_put_contents($file, $json) === false) {
            $this->error('Failed to write job data to ' . $file);
            return self::FAILURE;
        }

        $this->info('Job data saved to ' . $file);
        return self::SUCCESS;
    }

    /**
     * Download the body of the given URL with cURL.
     *
     * This is a method rather than a function declared inside handle():
     * a nested named function is (re)declared every time the enclosing method
     * runs, which is a fatal error on the second call.
     *
     * @param string $url Absolute URL to request.
     * @return string|false The response body, or false when the request failed
     *                      (transport error, timeout, or HTTP status >= 400).
     */
    private function fetchHtml(string $url): string|false
    {
        $curl = curl_init();
        if ($curl === false) {
            Log::error('Unable to initialise cURL for ' . $url);
            return false;
        }

        curl_setopt_array($curl, [
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            // Treat HTTP error pages as failures instead of parsing them.
            CURLOPT_FAILONERROR => true,
            // Never let a stalled server hang the command indefinitely.
            CURLOPT_CONNECTTIMEOUT => 10,
            CURLOPT_TIMEOUT => 30,
        ]);

        $html = curl_exec($curl);
        if ($html === false) {
            Log::error('cURL request to ' . $url . ' failed: ' . curl_error($curl));
            return false;
        }

        // No curl_close(): since PHP 8.0 it has no effect; the handle is
        // released when $curl goes out of scope.
        Log::info('content returned'.$html);

        return $html;
    }

    /**
     * Parse the results page and collect one entry per job post.
     *
     * @param string $html Non-empty HTML document.
     * @return list<array{title: string, description: ?string}> Text content of
     *         each post's title link and description column; description is
     *         null when the column is absent.
     */
    private function extractJobs(string $html): array
    {
        $dom = new DOMDocument();

        // Real-world HTML is rarely valid; collect libxml errors internally
        // while parsing, then restore the caller's setting.
        $previous = libxml_use_internal_errors(true);
        $dom->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        $xpath = new DOMXPath($dom);
        $jobs = [];

        foreach ($xpath->query('//div[contains(@class, "panel-heading")]') as $jobPost) {
            $title = $xpath->query('.//h2/a', $jobPost)->item(0)?->textContent;
            if ($title === null) {
                // A panel heading without a title link is not a job post.
                continue;
            }

            $description = $xpath->query('.//div[@class="col-md-8"]', $jobPost)->item(0)?->textContent;

            $jobs[] = [
                'title' => $title,
                'description' => $description,
            ];
        }

        return $jobs;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment