Last active
February 8, 2016 11:20
-
-
Save yurukov/2c211c2151eaeb5a403e to your computer and use it in GitHub Desktop.
Scraper for the dog registry in Plovdiv, Bulgaria
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
1. Open http://registry.plovdiv.bg/eDogs/default.aspx | |
2. Select 50 entries per page | |
3. Copy JS in console and run | |
4. Back up the downloaded data file | |
5. Extract all addresses: | |
awk -F '\t' '{print $4}' plddogs.tsv | grep -v address | sed 's_,\? \?\(ет\|ап\)\.\? \?[0-9]\+ \?__g;s_ \?№ \?_ _' |sort -u > addr | |
6. Geotag the addresses: | |
php geotag.php addr > addrg | |
7. Merge geotagged addresses with scraped data and exclude columns: | |
for i in `sed 's_\t_|_g;s_ _\__g' plddogs.tsv | grep -v address`; | |
do c=`echo $i | sed 's_\__ _g'`; | |
echo -n `echo $c | awk -F '|' '{print $1","$2","$5","$6","}'`; | |
a=`echo $c | awk -F '|' '{print $4}'| sed 's_,\? \?\(ет\|ап\)\.\? \?[0-9]\+ \?__g;s_ \?№ \?_ _'`; | |
grep -m 1 "^$a" addrg | sed 's_.*\t__'; | |
done > data.csv | |
8. Cleanup extracted report: | |
sed -i 's_\([0-9]\{2\}\)\.\([0-9]\{2\}\)\.\([0-9]\{4\}\) г\._\3-\2-\1_;s_\([0-9]\{4\}\)-\([0-9]\{2\}\)-00_\1-\2-01_;s_\([0-9]\{4\}\)-00-\([0-9]\{2\}\)_\1-01-\2_' data.csv | |
php cleandata.php data.csv > data_clean.php | |
9. Manually replace gender and category values and fix dates. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/* | |
1 кастрирано | |
2 с чип | |
3 ловно | |
4 на инвалид | |
5 от приют | |
6 служебно | |
*/ | |
/* | |
This rounds down the precision of the coordinates and displaces them by a few meters. | |
This anonymises the data and separates points that are at the same address. | |
*/ | |
/*
Reads the CSV file named by the first command-line argument and prints it back
with the last two columns (latitude, longitude) perturbed.

Each row must have 6 or 10 comma-separated columns; for 10-column rows only the
last 6 are kept. Every coordinate pair is moved by a random distance of
0.00005 to 0.0005 degrees in a random direction and rounded to 5 decimals.

Problems are reported on STDERR with a non-zero exit status so that they do not
end up inside the redirected output file.
*/
if (count($argv) < 2) {
    fwrite(STDERR, "usage: php cleandata.php <data.csv>\n");
    exit(1);
}
$raw = file_get_contents($argv[1]);
if ($raw === false) {
    fwrite(STDERR, "error: cannot read ".$argv[1]."\n");
    exit(1);
}
$data = explode("\n", trim($raw));
$total = count($data);
for ($i = 0; $i < $total; $i++) {
    $row = explode(",", trim($data[$i]));
    $cols = count($row);
    if ($cols != 6 && $cols != 10) {
        // $i is the zero-based row index, as in the original "error<i>" message.
        fwrite(STDERR, "error".$i.": expected 6 or 10 columns, got ".$cols."\n");
        exit(1);
    }
    // 10-column rows carry four extra leading fields; keep only the last six.
    if ($cols == 10)
        $row = array_slice($row, 4);
    $lat = $row[4];
    $lng = $row[5];
    // Random displacement: radius in degrees and direction in radians.
    $r = rand(5,50)/100000;
    $angle = rand(0,100)/100*M_PI*2;
    $row[4] = round($lat+sin($angle)*$r,5);
    $row[5] = round($lng+cos($angle)*$r,5);
    $data[$i] = $row;
    echo implode(",",$row)."\n";
}
?> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/*
Geocodes every line of the file named by the first command-line argument and
prints "<address>\t<result>" per line. An optional second argument is the
zero-based line index to start from (useful for resuming an interrupted run).

Google is tried first; once it reports its query limit it is skipped for the
rest of the run and only OpenStreetMap's Nominatim is used. Multiple matches
are joined with '&'; when nothing is found the result is the literal 'ERROR'.
*/
if (count($argv) < 2) {
    fwrite(STDERR, "usage: php geotag.php <address file> [start index]\n");
    exit(1);
}
$stopG = false;
$start = count($argv)>2 ? intval($argv[2]) : 0;
$data = file_get_contents($argv[1]);
if ($data === false) {
    fwrite(STDERR, "error: cannot read ".$argv[1]."\n");
    exit(1);
}
$data = explode("\n", $data);
$total = count($data);
for ($i = $start; $i < $total; $i++) {
    // Skip blank lines (e.g. the one after the file's final newline) instead of
    // geocoding the bare city name. Indices are kept so [start index] still
    // refers to line positions in the input file.
    if (trim($data[$i]) === '')
        continue;
    $add = $data[$i].", Пловдив, България";
    $res = false;
    if (!$stopG) {
        $res = geocodeGoogle($add);
        if ($res === "limit") {
            $res = false;
            $stopG = true;
        }
    }
    // Fall back to Nominatim when Google is unavailable or found nothing.
    if (!$res)
        $res = geocodeOpen($add);
    // Throttle requests to stay polite towards both services.
    usleep(300000);
    // $res is false or an array here; an empty array is falsy as well.
    echo $data[$i]."\t".($res ? implode('&', $res) : 'ERROR')."\n";
}
/*
Looks up an address with the Google Geocoding API.

$add is the free-form address; it is trimmed and URL-encoded here.
Returns:
  - an array of "lat,lng|formatted address" strings, one per match;
  - the string "limit" when the API answers OVER_QUERY_LIMIT;
  - false on a transport failure, an undecodable response, a non-OK status or
    an empty result list.
Replace the "[google api key]" placeholder with a real key before running.
*/
function geocodeGoogle($add) {
    // The separator before "region" must be a literal '&'; it had been mangled
    // into the '®' character, which dropped the region bias from the request.
    $url = "https://maps.googleapis.com/maps/api/geocode/json?address=".urlencode(trim($add))
        ."&region=BG&sensor=false&key=[google api key]";
    $data = file_get_contents($url);
    if (!$data)
        return false;
    $data = json_decode($data);
    // json_decode() yields null on malformed input; do not dereference it.
    if (!is_object($data) || !isset($data->status))
        return false;
    if ($data->status === "OVER_QUERY_LIMIT")
        return "limit";
    if ($data->status !== "OK" || empty($data->results))
        return false;
    $res = array();
    foreach ($data->results as $row) {
        $res[] = $row->geometry->location->lat.','.$row->geometry->location->lng.'|'.$row->formatted_address;
    }
    return $res;
}
/*
Looks up an address with OpenStreetMap's Nominatim search API.

$add is the free-form address; it is trimmed and URL-encoded here.
Returns an array of "lat,lon|display name" strings (possibly empty), or false
when the request fails or the response is not a JSON list.
*/
function geocodeOpen($add) {
    // Nominatim's usage policy asks clients to identify themselves; PHP sends
    // no User-Agent by default, so one is supplied explicitly.
    $context = stream_context_create(array(
        'http' => array('header' => "User-Agent: plovdiv-dog-registry-geotagger/1.0\r\n"),
    ));
    $data = file_get_contents(
        "http://nominatim.openstreetmap.org/search?format=json&addressdetails=1&q=".urlencode(trim($add)),
        false,
        $context
    );
    if (!$data)
        return false;
    $data = json_decode($data);
    // A successful search decodes to a list; anything else (null, error object)
    // is treated as a failed lookup rather than iterated.
    if (!is_array($data))
        return false;
    $res = array();
    foreach ($data as $row) {
        if (!isset($row->lat, $row->lon, $row->display_name))
            continue;
        $res[] = $row->lat.','.$row->lon.'|'.$row->display_name;
    }
    return $res;
}
?> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Offers `text` to the user as a downloadable file called `filename`.
 *
 * Builds a hidden anchor whose href is a UTF-8 `data:` URI holding the text,
 * attaches it to the page, clicks it programmatically and removes it again.
 */
function download(filename, text) {
    var dataUri = 'data:text/plain;charset=utf-8,' + encodeURIComponent(text);
    var link = document.createElement('a');

    link.setAttribute('href', dataUri);
    link.setAttribute('download', filename);
    // Keep the temporary link invisible while it is in the document.
    link.style.display = 'none';

    var host = document.body;
    host.appendChild(link);
    link.click();
    host.removeChild(link);
}
// Console script: walks through every page of the registry grid, collects the
// rows as tab-separated text and finally downloads them as 'dogs.tsv'.
// The variables are deliberately left as globals so they can be inspected (or
// the timer cancelled) from the browser console while the script is running.

// Number of result pages, read from the pager label.
pages = parseInt($('#ctl00_ContentPlaceHolder1_lblTotalPages').text());
// Accumulated output, starting with the header row.
data = "regNum\tregDate\tname\taddress\tgender\tcategory";
page = 1;
$('#ctl00_ContentPlaceHolder1_ddlPage').val(page).change();

// Every two seconds: scrape the currently displayed page, then either move on
// to the next page or stop and hand over the collected data.
task = setInterval(function () {
    $('.gridview tr').each(function (rowIndex) {
        // The first row of the grid is skipped.
        if (rowIndex !== 0) {
            // One "<cell text>\t" chunk per cell, whitespace runs collapsed.
            var line = $(this).children().map(function () {
                return $(this).text().replace(/\s+/g, " ").trim() + "\t";
            }).get().join("");
            // NOTE(review): this drops the last TWO characters although each
            // cell appends a single tab - correct only if the final cell is
            // empty; otherwise it truncates the last value. Confirm against
            // the live table before changing.
            data += "\n" + line.slice(0, -2);
        }
    });

    page += 1;
    if (page > pages) {
        clearInterval(task);
        download('dogs.tsv', data);
    } else {
        $('#ctl00_ContentPlaceHolder1_ddlPage').val(page).change();
    }
}, 2000);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment