Last active
January 2, 2017 08:09
-
-
Save erm3nda/63e2a53f23287828306e18189807576a to your computer and use it in GitHub Desktop.
Blizzard user data scraper that runs thread-like worker processes from the PHP CLI
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/** | |
* database.php | |
* | |
* Database access | |
* | |
* @category Data scrapper | |
* @package Blizzard | |
* @author erm3nda | |
* @website http://erm3nda.github.io | |
* @copyright idk | |
* @license Persona natural de liza | |
* @version 0.0.1 | |
* @see pthreads | |
* @link thebot.net/threads/battle-net-name-number-scraper.368348/#post-4088269 | |
* @note This is NOT the right/best way to do this; it is the old-style approach of spawning a bunch of processes, but it works.
* @platform Linux | |
*/ | |
// File that collects the scraped "name:number" pairs, one per line.
$results_file = "results.txt"; // U can use anything here /tmp/8a9sf8asdf.txt or whatever.
// Response headers: media-type parameters are separated from the type by ";"
// (the previous "," produced a malformed Content-Type value).
header("Content-Type: text/plain; charset=iso-8859-1");
header("Content-Disposition: attachment; filename=$results_file");
/**
 * Counts the lines currently stored in a text file.
 *
 * @param string|null $file Path of the file to inspect. When omitted, the
 *                          global $results_file is used.
 * @return int Number of lines read; 0 when the file is missing or unreadable.
 */
function getCurrentSize($file = null) {
    if ($file === null) {
        // The function body cannot see script-level variables directly,
        // so the default path is fetched from the global scope.
        $file = isset($GLOBALS['results_file']) ? $GLOBALS['results_file'] : '';
    }
    if ($file === '' || !is_file($file) || !is_readable($file)) {
        return 0;
    }
    $handle = fopen($file, "r");
    if ($handle === false) {
        // Never loop on a failed handle: feof() on it would never become true.
        return 0;
    }
    $linecount = 0;
    // fgets() returns false at end of file, so a trailing newline does not
    // add a phantom extra line to the count.
    while (fgets($handle) !== false) {
        $linecount++;
    }
    fclose($handle);
    return $linecount;
}
/**
 * Returns how many lines the results file currently holds.
 *
 * @param string|null $file Path of the file to inspect. When omitted, the
 *                          global $results_file is used.
 * @return int Line count; 0 when the file is missing, unreadable or empty.
 */
function getCurrentLines($file = null) {
    if ($file === null) {
        // Script-level variables are not visible inside a function body.
        $file = isset($GLOBALS['results_file']) ? $GLOBALS['results_file'] : '';
    }
    if ($file === '' || !is_file($file) || !is_readable($file)) {
        return 0;
    }
    // file() splits on line endings itself; split() no longer exists in PHP 7+.
    $lines = file($file, FILE_IGNORE_NEW_LINES);
    return $lines === false ? 0 : count($lines);
}
/**
 * Checks whether a process with the given PID is currently running.
 *
 * @param int|string $pid Process id; surrounding whitespace (such as the
 *                        newline left by `echo $!`) is ignored.
 * @return int|string|null The $pid argument as passed in when the process
 *                         exists, null otherwise (including for non-numeric input).
 */
function processExists($pid) {
    $candidate = trim((string) $pid);
    // Accept plain positive integers only, so nothing else can reach the shell.
    if ($candidate === '' || !ctype_digit($candidate) || (int) $candidate <= 0) {
        return null;
    }
    $candidate = (string) (int) $candidate;
    $output = array();
    $status = 1;
    // "ps -p" looks up exactly this PID; grepping the full process list would
    // also match longer PIDs containing the same digits and the grep itself.
    exec('ps -p ' . escapeshellarg($candidate) . ' -o pid= 2>/dev/null', $output, $status);
    if ($status === 0 && trim(implode('', $output)) !== '') {
        return $pid;
    }
    return null;
}
/**
 * Returns a copy of $array without the entries equal to $element.
 *
 * Entries are compared by their string representation and the keys of the
 * remaining entries are kept as they were.
 *
 * @param array $array   List to filter.
 * @param mixed $element Value to drop.
 * @return array Entries of $array that differ from $element.
 */
function removeItem($array, $element) {
    $unwanted = (string) $element;
    $keepIfDifferent = function ($candidate) use ($unwanted) {
        return (string) $candidate !== $unwanted;
    };
    return array_filter($array, $keepIfDifferent);
}
# Main loop. The script runs in one of two modes:
#  - worker:      called with a topic path as first argument, scrapes that topic
#                 and appends every new "name:number" pair to $results_file;
#  - coordinator: called without arguments, lists the forum topics, spawns one
#                 worker per topic, waits for them and de-duplicates the results.

// Collect HTML parser complaints internally in both modes instead of printing them.
libxml_use_internal_errors(true);

if (isset($argv[1]) && $argv[1]) {
    $html2 = file_get_contents('http://us.battle.net' . $argv[1]);
    if (!empty($html2)) {
        $doc2 = new DOMDocument();
        $doc2->loadHTML($html2);
        libxml_clear_errors();
        $xpath2 = new DOMXPath($doc2);
        $row2 = $xpath2->query('//a[@class="context-link d3-class-1"]/@href');
        if ($row2 !== false && $row2->length > 0) {
            foreach ($row2 as $row) {
                $cut_string = explode('/', $row->nodeValue);
                if (!isset($cut_string[4])) {
                    continue; // href does not have the expected number of segments
                }
                $conv_info = iconv("UTF-8", "ISO-8859-1//IGNORE", $cut_string[4]);
                if ($conv_info === false) {
                    continue; // segment could not be converted
                }
                $main_infos = explode('-', $conv_info);
                if (count($main_infos) < 2) {
                    continue; // segment is not of the form "name-number"
                }
                $entry = $main_infos[0] . ":" . $main_infos[1];
                // Re-read the file on every row: sibling workers append to it concurrently.
                $saved = is_file($results_file)
                    ? file($results_file, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES)
                    : array();
                if ($saved === false) {
                    $saved = array();
                }
                // save only if it's not currently present (exact line match, not a
                // regex built from scraped text)
                if (!in_array($entry, $saved, true)) {
                    file_put_contents($results_file, $entry . "\n", FILE_APPEND | LOCK_EX);
                } else {
                    echo "Skipping already saved result: $entry";
                }
            }
        }
    }
} else {
    echo "Running Blizz ... ";
    $pid = array(); // PIDs of the workers that are still running
    $html1 = file_get_contents('http://us.battle.net/d3/en/forum/3354739/');
    if (!empty($html1)) {
        $doc1 = new DOMDocument();
        $doc1->loadHTML($html1);
        libxml_clear_errors();
        $xpath1 = new DOMXPath($doc1);
        $row1 = $xpath1->query('//a[@class="topic-title"]/@href');
        $requests = ($row1 === false) ? 0 : $row1->length;
        echo "found " . $requests . " links" . PHP_EOL;
        if ($requests > 0) {
            echo "Spawning a total of $requests processes, wait a few ..." . PHP_EOL;
            $php = (PHP_BINARY !== '') ? PHP_BINARY : 'php';
            foreach ($row1 as $row) {
                // run that in a "threaded way"; every part of the command is escaped
                // because the topic path comes from scraped, untrusted HTML
                $command = escapeshellarg($php) . ' ' . escapeshellarg(__FILE__) . ' '
                    . escapeshellarg($row->nodeValue) . ' > /dev/null 2>&1 & echo $!';
                $spawned = trim((string) shell_exec($command)); // "echo $!" ends with a newline
                if (ctype_digit($spawned)) {
                    $pid[] = $spawned;
                }
                usleep(100);
            }
            // wait for spawned processes, polling until all are gone; the deadline
            // keeps a stuck worker from blocking the coordinator forever
            $deadline = time() + 120;
            while (count($pid) > 0 && time() < $deadline) {
                foreach ($pid as $p) {
                    if (!processExists($p)) {
                        $pid = removeItem($pid, $p); // remove finished PIDs from current list
                    }
                }
                if (count($pid) > 0) {
                    usleep(100000);
                }
            }
            echo "Processes did finished, continue ..." . PHP_EOL;
        }
    }
    // De-duplicate the collected results, dropping empty lines.
    $content = is_file($results_file)
        ? file($results_file, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES)
        : array();
    if ($content === false) {
        $content = array();
    }
    $content = array_values(array_unique($content));
    echo "Found " . count($content) . " of unique results:" . PHP_EOL;
    // Keep a trailing newline so the next appended record starts on its own line.
    file_put_contents($results_file, count($content) > 0 ? implode("\n", $content) . "\n" : '', LOCK_EX);
    exit();
}
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment