Created
June 12, 2023 11:11
-
-
Save motebaya/f0e99dd799e46a5bc2f405a44cc94df7 to your computer and use it in GitHub Desktop.
Scrape readable words from websites with Guzzle and PHPHtmlParser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/* | |
@for collecting readable words to use as replacement function or variable names
@credit: t.me/dvinchii a.k.a michino | |
File: word.php | |
Size: 1966 Blocks: 8 IO Block: 4096 regular file | |
Device: 10304h/66308d Inode: 243109 Links: 1 | |
Access: (0777/-rwxrwxrwx) Uid: ( 1000/ mochino) Gid: ( 1000/ mochino) | |
Access: 2023-05-02 12:32:30.134484200 +0700 | |
Modify: 2023-03-27 21:16:44.640398400 +0700 | |
Change: 2023-03-27 21:16:44.640398400 +0700 | |
Birth: - | |
*/ | |
require_once '../vendor/autoload.php'; | |
use GuzzleHttp\Client; | |
use PHPHtmlParser\Dom; | |
/**
 * Scrapes animal and fruit names from public web pages and appends them,
 * normalised to lowercase words joined by underscores, to a caller-supplied
 * file handle. Each written word is also echoed with a running index.
 */
class getWord
{
    /** @var Client HTTP client used to download the pages. */
    private $client;

    /** @var array Request options (a browser-like User-Agent header) sent with every request. */
    private $options;

    /** @var Dom HTML parser; reloaded with each downloaded page. */
    private $soup;

    /** @var string[] The letters a-z, used to build the per-letter heading ids. */
    private $alpa;

    /**
     * @var int Number of words written so far; printed as the index of each word.
     *          Starts at 0 so the very first progress line shows a number
     *          instead of an empty string.
     */
    private $count = 0;

    public function __construct()
    {
        $this->client = new Client();
        $this->soup = new Dom;
        $this->options = [
            "headers" => [
                "User-Agent" => "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"
            ]
        ];
        $this->alpa = str_split("abcdefghijklmnopqrstuvwxyz");
    }

    /**
     * Turns a node's text into a lowercase word in which every run of
     * non-letter characters is replaced by a single underscore.
     *
     * @param object $string Node (or any object) exposing a `text` property.
     * @return string Normalised word; may start or end with an underscore.
     */
    private function clear_word($string)
    {
        // One pass is enough: "_" is itself a non-letter, so a run of mixed
        // punctuation, spaces and underscores collapses to a single "_".
        return strtolower(preg_replace('/[^A-Za-z]+/', '_', (string) $string->text));
    }

    /**
     * Downloads a page and loads its HTML into the parser.
     *
     * @param string $url Address of the page to fetch.
     * @return void
     */
    private function load_page($url)
    {
        $page = $this->client->get($url, $this->options);
        // getBody() yields a stream object; cast it so the parser receives a plain string.
        $this->soup->loadStr((string) $page->getBody());
    }

    /**
     * Normalises one node's text, appends it to the output file and prints
     * a progress line. Nodes whose text contains no letters are skipped so
     * the word list does not collect blank or "_"-only entries.
     *
     * @param resource $f    Writable file handle.
     * @param object   $node Node exposing a `text` property.
     * @return void
     */
    private function write_word($f, $node)
    {
        $word = $this->clear_word($node);
        if (trim($word, '_') === '') {
            return;
        }

        fwrite($f, $word . "\n");
        print " {$this->count} -> {$word} \n";
        $this->count++;
    }

    /**
     * Scrapes animal names from a-z-animals.com and writes them to $f.
     *
     * For every letter a-z, looks up the element with id
     * "h-animals-that-start-with-<letter>" and collects the text of each
     * link found in the element that follows it.
     *
     * @param resource $f Writable file handle.
     * @return void
     */
    public function get_animal($f)
    {
        $this->load_page('https://a-z-animals.com/animals/');

        foreach ($this->alpa as $alpha) {
            $heading = $this->soup->find("#h-animals-that-start-with-{$alpha}");
            if (count($heading) === 0) {
                // No section for this letter on the page.
                continue;
            }

            foreach ($heading->nextSibling()->find("a") as $a) {
                $this->write_word($f, $a);
            }
        }
    }

    /**
     * Scrapes fruit names from Wikipedia's list of culinary fruits and
     * writes them to $f.
     *
     * Collects the text of every <i> element inside every element with
     * the "wikitable" class.
     *
     * @param resource $f Writable file handle.
     * @return void
     */
    public function get_fruit($f)
    {
        $this->load_page("https://en.wikipedia.org/wiki/List_of_culinary_fruits");

        // Iterating an empty result is a no-op, so no separate emptiness check is needed.
        foreach ($this->soup->find(".wikitable") as $tab) {
            foreach ($tab->find("i") as $i) {
                $this->write_word($f, $i);
            }
        }
    }
}
// Entry point: append every scraped word to word.txt in the current directory.
$f = fopen("word.txt", "a");
if ($f === false) {
    // fopen() returns false on failure; stop here instead of handing a
    // non-resource to fwrite() for every scraped word.
    throw new RuntimeException("unable to open word.txt for appending");
}

try {
    $main = new getWord();
    $main->get_animal($f);
    $main->get_fruit($f);
} finally {
    // Close the handle even when a request or the HTML parsing throws.
    fclose($f);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment