Skip to content

Instantly share code, notes, and snippets.

@erm3nda
Last active January 2, 2017 08:09
Show Gist options
  • Save erm3nda/63e2a53f23287828306e18189807576a to your computer and use it in GitHub Desktop.
Save erm3nda/63e2a53f23287828306e18189807576a to your computer and use it in GitHub Desktop.
Blizzard user data scrapper like a thread from PHP CLI
<?php
/**
* database.php
*
* Database access
*
* @category Data scrapper
* @package Blizzard
* @author erm3nda
* @website http://erm3nda.github.io
* @copyright idk
* @license Persona natural de liza
* @version 0.0.1
* @see pthreads
* @link thebot.net/threads/battle-net-name-number-scraper.368348/#post-4088269
* @note This is NOT the right/best way to do this: spawning a bunch of processes is the old-style approach, but it works.
* @platform Linux
*/
// File that accumulates the scraped "name:number" results. Any writable path works,
// e.g. /tmp/8a9sf8asdf.txt.
$results_file = "results.txt";
// Download headers.
// NOTE(review): presumably these only matter if the script is ever served by a web server
// instead of being run from the CLI — confirm.
// Media-type parameters are separated by ";" (a "," made the charset part of an invalid type).
header("Content-Type: text/plain; charset=iso-8859-1");
// Quote the file name and strip any directory part so a path such as /tmp/x.txt is not leaked.
header('Content-Disposition: attachment; filename="' . basename($results_file) . '"');
/**
 * Count the lines currently stored in a results file, streaming it line by line.
 *
 * @param string|null $file Path of the file to inspect. When omitted, the global
 *                          $results_file is used, falling back to "results.txt".
 * @return int Number of lines in the file; 0 when the file is missing, unreadable or empty.
 */
function getCurrentSize($file = null){
    if ($file === null) {
        // Functions do not see script-level variables, so look the configured path up explicitly.
        $file = isset($GLOBALS['results_file']) ? $GLOBALS['results_file'] : 'results.txt';
    }
    if (!is_file($file) || !is_readable($file)) {
        return 0; // nothing has been written yet
    }
    $handle = fopen($file, "r");
    if ($handle === false) {
        return 0;
    }
    $linecount = 0;
    // Count successful reads only: a `while (!feof())` loop performs one extra, failed fgets()
    // after a trailing newline (and on an empty file) and therefore over-counts by one.
    while (fgets($handle) !== false) {
        $linecount++;
    }
    fclose($handle);
    return $linecount;
}
/**
 * Count the lines of the results file by loading it in one go.
 *
 * @param string|null $file Path of the file to inspect. When omitted, the global
 *                          $results_file is used, falling back to "results.txt".
 * @return int Number of lines in the file; 0 when the file is missing, unreadable or empty.
 */
function getCurrentLines($file = null){
    if ($file === null) {
        // Functions do not see script-level variables, so look the configured path up explicitly.
        $file = isset($GLOBALS['results_file']) ? $GLOBALS['results_file'] : 'results.txt';
    }
    if (!is_file($file) || !is_readable($file)) {
        return 0; // nothing has been written yet
    }
    // file() splits on line endings itself; this replaces split(), which was removed in PHP 7
    // and was being called without a pattern.
    $lines = file($file, FILE_IGNORE_NEW_LINES);
    return $lines === false ? 0 : count($lines);
}
/**
 * Check whether a process with the given PID is currently running.
 *
 * @param int|string $pid Process ID. Surrounding whitespace (such as the newline left by
 *                        `echo $!`) is ignored.
 * @return int|string|null The PID exactly as it was passed in when the process exists;
 *                         null when it does not, or when $pid is not a positive integer.
 */
function processExists($pid) {
    $normalized = trim((string) $pid);
    // Digits only: keeps arbitrary text away from the shell and rules out partial matches
    // (a grep for "12" over the `ps` output also hit PID 3123 and unrelated columns).
    if (!preg_match('/^[1-9][0-9]{0,9}$/', $normalized)) {
        return null;
    }
    // On Linux every live process has a /proc/<pid> directory; checking it avoids spawning a shell.
    if (is_dir('/proc/self')) {
        return is_dir('/proc/' . $normalized) ? $pid : null;
    }
    // Fallback without /proc: `ps -p` selects by exact PID and exits non-zero when nothing matches.
    $output = array();
    $status = 1;
    exec('ps -p ' . escapeshellarg($normalized) . ' 2>/dev/null', $output, $status);
    return $status === 0 ? $pid : null;
}
/**
 * Return a copy of $array with every entry equal to $element left out.
 *
 * The comparison is the one array_diff() performs (entries are compared by their string
 * representation), and the remaining entries keep their original keys.
 *
 * @param array $array   List to filter.
 * @param mixed $element Value to drop.
 * @return array The entries of $array that differ from $element.
 */
function removeItem($array, $element) {
    $unwanted = array($element);
    $remaining = array_diff($array, $unwanted);
    return $remaining;
}
# da loop
// Entry point. The script runs in one of two modes:
//   * child  (a topic path is given as first CLI argument): scrape that topic page and append
//     every "name:number" pair found to $results_file;
//   * parent (no argument): fetch the forum index, spawn one background child per topic link,
//     wait for the children to exit, then de-duplicate $results_file.
libxml_use_internal_errors(true); // collect HTML parse warnings instead of printing them (both modes)
if (isset($argv[1]) && $argv[1] !== '') {
    $topic_path = $argv[1];
    // The path comes from scraped markup: require a leading "/" so it cannot change the host
    // part of the URL it is appended to.
    $html2 = (strncmp($topic_path, '/', 1) === 0)
        ? file_get_contents('http://us.battle.net' . $topic_path)
        : false;
    if ($html2 !== false && $html2 !== '') {
        $doc2 = new DOMDocument();
        $doc2->loadHTML($html2);
        libxml_clear_errors();
        $xpath2 = new DOMXPath($doc2);
        $row2 = $xpath2->query('//a[@class="context-link d3-class-1"]/@href');

        // Entries already on disk, kept as a set (entry => true) for exact-match lookups.
        // Duplicates written concurrently by sibling processes are removed by the parent at the end.
        $known = array();
        if (is_file($results_file)) {
            $stored = file($results_file, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($stored !== false) {
                $known = array_fill_keys($stored, true);
            }
        }

        if ($row2 !== false && $row2->length > 0) {
            foreach ($row2 as $row) {
                // The fifth "/"-separated segment of the profile href holds "<name>-<number>".
                $cut_string = explode('/', $row->nodeValue);
                if (!isset($cut_string[4])) {
                    continue;
                }
                $conv_info = iconv("UTF-8", "ISO-8859-1//IGNORE", $cut_string[4]);
                if ($conv_info === false) {
                    continue;
                }
                $main_infos = explode('-', $conv_info);
                if (count($main_infos) < 2) {
                    continue;
                }
                $entry = $main_infos[0] . ":" . $main_infos[1];
                // save only if it's not currently present (whole-line comparison, so "Bob:12"
                // is not mistaken for "Bob:123" and names with regex characters are safe)
                if (!isset($known[$entry])) {
                    file_put_contents($results_file, $entry . "\n", FILE_APPEND | LOCK_EX);
                    $known[$entry] = true;
                } else {
                    echo "Skipping already saved result: $entry" . PHP_EOL;
                }
            }
        }
    }
} else {
    echo "Running Blizz ... ";
    $pid = array(); // PIDs of the spawned children that are still running
    $html1 = file_get_contents('http://us.battle.net/d3/en/forum/3354739/');
    if ($html1 !== false && $html1 !== '') {
        $doc1 = new DOMDocument();
        $doc1->loadHTML($html1);
        libxml_clear_errors();
        $xpath1 = new DOMXPath($doc1);
        $row1 = $xpath1->query('//a[@class="topic-title"]/@href');
        $requests = ($row1 === false) ? 0 : $row1->length;
        echo "found " . $requests . " links" . PHP_EOL;
        if ($requests > 0) {
            echo "Spawning a total of $requests processes, wait a few ..." . PHP_EOL;
            foreach ($row1 as $row) {
                // Run this same script in the background, one process per topic ("threaded way").
                // Both arguments are shell-escaped because the href is scraped, untrusted text.
                $command = 'php ' . escapeshellarg(__FILE__) . ' ' . escapeshellarg($row->nodeValue)
                    . ' > /dev/null 2>&1 & echo $!';
                // `echo $!` prints the child's PID followed by a newline.
                $spawned = trim((string) shell_exec($command));
                if (preg_match('/^[0-9]+$/', $spawned)) {
                    $pid[] = $spawned;
                }
                usleep(100);
            }
        }
    }
    // wait for spawned processes: poll until every child is gone, or give up after a generous
    // timeout so a stuck child cannot hang the parent forever
    $deadline = time() + 300;
    while (count($pid) > 0 && time() < $deadline) {
        foreach ($pid as $p) {
            if (!processExists($p)) {
                $pid = removeItem($pid, $p); // remove finished PIDs from current list
            }
        }
        if (count($pid) > 0) {
            usleep(100000);
        }
    }
    if (count($pid) > 0) {
        echo "Gave up waiting for " . count($pid) . " processes, continue ..." . PHP_EOL;
    } else {
        echo "Processes did finished, continue ..." . PHP_EOL;
    }
    // De-duplicate the results, ignoring blank lines so the reported count is exact.
    $raw = is_file($results_file) ? file_get_contents($results_file) : false;
    $content = array();
    if ($raw !== false) {
        $content = array_values(array_unique(array_filter(
            explode("\n", $raw),
            function ($line) {
                return $line !== '';
            }
        )));
    }
    echo "Found " . count($content) . " of unique results:" . PHP_EOL;
    // Keep the trailing newline so later appends start on a fresh line.
    file_put_contents($results_file, count($content) > 0 ? implode("\n", $content) . "\n" : '', LOCK_EX);
    exit();
}
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment