Skip to content

Instantly share code, notes, and snippets.

@Ozerich
Created November 3, 2010 19:48
Show Gist options
  • Save Ozerich/661594 to your computer and use it in GitHub Desktop.
Save Ozerich/661594 to your computer and use it in GitHub Desktop.
<?php
// The crawl issues one HTTP request per page, so raise the execution time
// limit (value in seconds) far above the PHP default.
set_time_limit(111111);

// Connection settings and the name of the table that receives the scraped rows.
$db_host = "localhost";
$db_user = "root";
$db_password = "";
$db_dbname = "spr";
$db_tablename = "data";

mysql_connect($db_host, $db_user, $db_password) or die("Can't connect to mysql");
mysql_select_db($db_dbname) or die("Can't connect to database $db_dbname");

// getPage() converts every page to UTF-8 and the table below is declared as
// utf8, so the connection must use utf8 too; otherwise the server re-encodes
// the text from its default connection charset on the way in.
mysql_set_charset("utf8") or die("Can't switch the connection to utf8");

// Every run starts from an empty table. IF EXISTS covers the "first run" case
// that the error suppression used to hide, while real failures are reported.
mysql_query("DROP TABLE IF EXISTS $db_tablename") or die(mysql_error());
mysql_query("CREATE TABLE $db_tablename(
id INT AUTO_INCREMENT PRIMARY KEY,
section VARCHAR(100),
subsection VARCHAR(100),
name VARCHAR(500),
address VARCHAR(500),
timetable VARCHAR(500),
phone VARCHAR(500),
site VARCHAR(100))
DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci") or die(mysql_error());
/**
 * Downloads a page and returns its body converted from windows-1251 to UTF-8.
 *
 * The requested URL is echoed first as a progress trace.
 *
 * @param string $url URL to fetch.
 * @return string The page body in UTF-8, or an empty string when the download
 *                or the charset conversion fails.
 */
function getPage($url)
{
    print_r($url."<br>\n");

    $ch = curl_init();
    if ($ch === false) {
        trigger_error("getPage: can't initialise cURL for $url", E_USER_WARNING);
        return "";
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 5.1; ru-RU; rv:1.7.12) Gecko/20050919 Firefox/1.0.7");
    // Return the body from curl_exec() instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_HTTPHEADER, array("Content-Type: text/xml; charset=windows-1251", "Expect: 100-continue"));

    $content = curl_exec($ch);
    // The error text has to be read before the handle is closed.
    $error = ($content === false) ? curl_error($ch) : "";
    curl_close($ch);

    if ($content === false) {
        // Report the transport failure instead of silently handing `false`
        // to iconv(); callers simply find nothing to match in "".
        trigger_error("getPage: request to $url failed: $error", E_USER_WARNING);
        return "";
    }

    // iconv() returns false (and raises its own notice) on bytes that are not
    // valid cp1251; normalise that to "" so callers always receive a string.
    $converted = iconv("cp1251", "UTF-8", $content);
    if ($converted === false) {
        return "";
    }

    return $converted;
}
/**
 * Inserts one scraped record into the table named by the global $db_tablename.
 *
 * The script is terminated with the MySQL error message if the INSERT fails.
 *
 * @param string $section_name    Value for the `section` column.
 * @param string $subsection_name Value for the `subsection` column.
 * @param string $name            Value for the `name` column.
 * @param string $address         Value for the `address` column.
 * @param string $timetable       Value for the `timetable` column.
 * @param string $phone           Value for the `phone` column.
 * @param string $site            Value for the `site` column.
 */
function addItemToBase($section_name, $subsection_name, $name, $address, $timetable, $phone, $site)
{
    global $db_tablename;

    $fields = array(
        "section" => $section_name,
        "subsection" => $subsection_name,
        "name" => $name,
        "address" => $address,
        "timetable" => $timetable,
        "phone" => $phone,
        "site" => $site,
    );

    $assignments = array();
    foreach ($fields as $column => $value) {
        // mysql_real_escape_string() escapes according to the charset of the
        // open connection; the deprecated mysql_escape_string() ignores it.
        $assignments[] = $column."='".mysql_real_escape_string((string) $value)."'";
    }

    $query = "INSERT INTO ".$db_tablename." SET ".implode(",\n", $assignments);
    mysql_query($query) or die(mysql_error());
}
/**
 * Downloads one item page, extracts its name, address, phone, opening hours
 * and site, and stores them through addItemToBase().
 *
 * A field whose markup is not found on the page is stored as an empty string.
 *
 * @param string $url             Path fragment placed between
 *                                "http://www.spr.ru/" and ".html".
 * @param string $section_name    Section name stored with the record.
 * @param string $subsection_name Subsection name stored with the record.
 */
function parseItem($url, $section_name, $subsection_name)
{
    $url = "http://www.spr.ru/".$url.".html";
    $text = getPage($url);

    $name = parseItemPlainText(parseItemCapture("#<H1 class='main' style='font-size:24px;'>(.+?)</H1>#sui", $text));
    $address = parseItemPlainText(parseItemCapture("#<font class='view-orange'>Адрес</font>&nbsp;<font class='view-font'>(.+?)&nbsp;&nbsp;<noindex>#sui", $text));
    // NOTE(review): unlike the other patterns this one carries no `sui` flags,
    // so `.` does not cross line breaks here - confirm that is intended.
    $phone = parseItemPlainText(parseItemCapture("#<font class='view-orange'>Телефон</font>(.+?)<br><br>#", $text));

    // Opening hours and site are stored exactly as captured, without cleanup.
    $timetable = parseItemCapture("#<font class='view-orange'>Часы работы</font><font class='font'>&nbsp;</font><font class='view-font'>(.+?)</font>#sui", $text);
    $site = parseItemCapture("#<font class='view-orange'>Сайт</font><font class='font'>&nbsp;</font><font class='view-font'>(.+?)</font>#sui", $text);

    addItemToBase($section_name, $subsection_name, $name, $address, $timetable, $phone, $site);
}

/**
 * Returns the first capture group of $pattern in $text, or "" when the
 * pattern does not match (or fails), so callers never read a missing offset.
 */
function parseItemCapture($pattern, $text)
{
    if (preg_match($pattern, $text, $match) && isset($match[1])) {
        return $match[1];
    }

    return "";
}

/**
 * Turns an HTML fragment into plain text: tags are stripped, the special
 * HTML characters are decoded and non-breaking-space entities are removed.
 */
function parseItemPlainText($html)
{
    $plain = htmlspecialchars_decode(strip_tags((string) $html));

    // htmlspecialchars_decode() leaves "&nbsp;" untouched. Remove the full
    // entity first so no stray ";" stays behind, then the bare "&nbsp" form
    // that the previous code handled.
    return str_replace(array("&nbsp;", "&nbsp"), "", $plain);
}
/**
 * Crawls the page at $url three levels deep: every section link found on it,
 * every subsection link found on each section page, and every entry pushed
 * into the `firms` list on each subsection page. Each entry is handed to
 * parseItem() together with the section and subsection link texts.
 *
 * The whole script is terminated with exit() once the page has been processed.
 *
 * @param string $url URL of the page to crawl.
 */
function parseCity($url)
{
    $city_html = getPage($url);
    preg_match_all("#<noindex><a rel='nofollow' href='(.+?)' title='(.+?)' style='font-size:14px;'>(.+?)</a></noindex><br>#sui", $city_html, $section_links, PREG_SET_ORDER);

    foreach ($section_links as $section_link) {
        // Group 1 is the href, group 3 the link text; the title is not used.
        list(, $section_url, , $section_name) = $section_link;

        $section_html = getPage($section_url);
        preg_match_all("#<font style='font-size:16px; color:222222;'>(.+?)<br><br>#sui", $section_html, $subsection_groups, PREG_SET_ORDER);

        foreach ($subsection_groups as $subsection_group) {
            // Each group is a chunk of markup holding several subsection links.
            preg_match_all("#<a href='(.+?)' title='(.+?)' style='line-height:17px; font-size:14px;'>(.+?)</a>#sui", $subsection_group[1], $subsection_links, PREG_SET_ORDER);

            foreach ($subsection_links as $subsection_link) {
                list(, $subsection_url, , $subsection_name) = $subsection_link;

                $subsection_html = getPage($subsection_url);
                preg_match_all('#firms.push\(\["(.+?)","(.+?)","(.+?)",(.+?)\]\);#sui', $subsection_html, $firms, PREG_SET_ORDER);

                foreach ($firms as $firm) {
                    // The third value of each pushed entry is what parseItem() expects as its URL part.
                    parseItem($firm[3], $section_name, $subsection_name);
                }
            }
        }
    }

    // NOTE(review): this ends the script after the first call, so the caller's
    // loop never reaches a second page - confirm that is intended.
    exit();
}
// Driver: read the region list from the start page, then crawl every link
// listed on a region page with parseCity().
$text = getPage("http://www.spr.ru/");

// Without the region list there is nothing to crawl; stop with a message
// instead of reading a missing match offset.
if (!preg_match('#<ul class="Control">(.+?)</ul>#sui', $text, $regions_content)) {
    die("Can't find the list of regions on the start page");
}

preg_match_all('#<li><a href="(.+?)" rel=\'nofollow\'>(.+?)</a></li>#sui', $regions_content[0], $regions, PREG_SET_ORDER);

foreach ($regions as $region) {
    $url = $region[1];
    $text = getPage($url);

    // Each <font class='list'> block on the region page holds a list of links.
    preg_match_all("#<font class='list'>(.+?)</font>#sui", $text, $sections, PREG_SET_ORDER);
    foreach ($sections as $section) {
        preg_match_all("#<a href='(.+?)' title='(.+?)'>(.+?)</a><br>#sui", $section[1], $items, PREG_SET_ORDER);
        foreach ($items as $item) {
            parseCity($item[1]);
        }
    }

    // NOTE(review): the script stops after the first region, so the remaining
    // regions are never visited - confirm that is intended.
    exit();
}
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment