Skip to content

Instantly share code, notes, and snippets.

@s3rgeym
Last active January 11, 2025 02:22
Show Gist options
  • Save s3rgeym/66424f6c1cad9ade8f1517022d1b88ae to your computer and use it in GitHub Desktop.
Save s3rgeym/66424f6c1cad9ade8f1517022d1b88ae to your computer and use it in GitHub Desktop.
#!/usr/bin/env php
<?php
// The crawl issues many sequential HTTP requests, so remove the execution time limit.
set_time_limit(0);
// Report every PHP error, warning and notice while the script runs.
error_reporting(E_ALL);
/**
 * Fetch a sitemap and return the URLs listed in its <loc> elements.
 *
 * The payload is gunzipped when it starts with the gzip magic bytes and is
 * used as-is otherwise, so both ".xml.gz" and plain ".xml" sitemaps work.
 * Whitespace around each URL is dropped and XML entities (e.g. "&amp;") are
 * decoded, because the sitemap protocol requires URLs to be entity-escaped.
 *
 * @param string $url URL or local path of the sitemap.
 * @return string[] URLs in document order; an empty array when the sitemap
 *                  cannot be read or decoded, or when it lists no URLs.
 */
function get_sitemap_urls($url) {
    $raw = file_get_contents($url);
    if ($raw === false) {
        // file_get_contents() has already raised a warning describing the failure.
        return [];
    }

    // A gzip stream always begins with the two bytes 0x1f 0x8b.
    if (strncmp($raw, "\x1f\x8b", 2) === 0) {
        $xml = gzdecode($raw);
        if ($xml === false) {
            // Truncated or corrupt archive: there is nothing to parse.
            return [];
        }
    } else {
        $xml = $raw;
    }

    // [^<] lets a URL span the newlines/indentation some generators put inside
    // <loc>, while never running past the closing tag.
    if (!preg_match_all('#<loc>\s*([^<]+?)\s*</loc>#', $xml, $matches)) {
        return [];
    }

    return array_map(
        static function ($loc) {
            return html_entity_decode($loc, ENT_QUOTES | ENT_XML1, 'UTF-8');
        },
        $matches[1]
    );
}
// The top-level sitemap is treated as an index: every URL it lists is itself
// fetched as a sitemap, and the entries of those child sitemaps are scanned.
// Progress goes to STDERR; STDOUT carries only the discovered site URLs, one per line.
$sitemap_urls = get_sitemap_urls("https://career.habr.com/assets/sitemap.xml.gz");
foreach ($sitemap_urls as $sitemap_url) {
    foreach (get_sitemap_urls($sitemap_url) as $company_url) {
        // Keep only URLs that end in "/companies/<slug>"; deeper company sub-pages are skipped.
        if (!preg_match('#/companies/[^/]+$#', $company_url)) {
            continue;
        }

        fwrite(STDERR, "Check $company_url\n");

        // "@" keeps PHP's own warning from being mixed into the output;
        // the failure is reported explicitly on STDERR instead.
        $contents = @file_get_contents($company_url);
        if ($contents === false) {
            fwrite(STDERR, "Failed to fetch $company_url\n");
            continue;
        }

        // The external site link is taken from the "company_site" block of the page.
        if (!preg_match('#<div class="company_site"><a rel="nofollow" href="([^"]+)#', $contents, $matches)) {
            continue;
        }

        // Attribute values are HTML-escaped in the page source ("&amp;" etc.); emit the real URL.
        $site_url = html_entity_decode($matches[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
        fwrite(STDERR, "Found " . $site_url . "\n");
        echo $site_url . "\n";
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment