本节内容:
从给定网站抓取省市级城市数据
完整代码:
<?php
/**
* 名称: 抓取省市级城市
*
* 功能: 从指定的网址抓取页面数据,并从中解析出省、市两级城市名称。
* 编辑:www.jb200.com
*/
/*
SQL:
CREATE TABLE province_city (
id int(11) unsigned NOT NULL auto_increment,
city varchar(200) character set latin1 collate latin1_bin NOT NULL default '',
parent_id smallint(4) unsigned NOT NULL default '0',
city_code varchar(10) character set latin1 collate latin1_bin NOT NULL default '',
PRIMARY KEY (id),
KEY parent_id (parent_id),
KEY city_code (city_code)
) ENGINE=myisam
*/
/**
 * Returns the current Unix timestamp with microsecond precision.
 *
 * Used to measure how long the scrape takes.
 *
 * @return float Seconds since the Unix epoch, including the fractional part.
 */
function getMicrotime()
{
    // microtime(true) yields the float directly; no need to split and re-add
    // the "usec sec" string form.
    return microtime(true);
}
/**
 * Extracts the link target from an opening anchor tag and scrapes that page.
 *
 * The target is the text between the first "=" and the first ">" of $data
 * (e.g. "<a href=city.asp>" yields "city.asp"), with surrounding quotes and
 * whitespace removed. It is appended to the global $pre base URL and the
 * resulting page is fetched through getDataFromUrl().
 *
 * @param string $data Opening anchor tag, as captured by getDataFromUrl().
 * @return array Match groups returned by getDataFromUrl(); five empty lists
 *               when no link target can be extracted from $data.
 */
function filterData($data){
    global $pre;

    $eqPos = strpos($data, "=");
    $gtPos = strpos($data, ">");

    // strpos() returns false when a delimiter is missing. Without this guard
    // false is coerced to 0 and a meaningless URL would be requested.
    if ($eqPos === false || $gtPos === false || $gtPos <= $eqPos + 1) {
        return array(array(), array(), array(), array(), array());
    }

    $valueStart = $eqPos + 1;
    $url = substr($data, $valueStart, $gtPos - $valueStart);

    // A quoted attribute value (href="x.asp") must not carry its quotes
    // into the request URL.
    $url = trim($url, " \t\r\n\"'");
    if ($url === '') {
        return array(array(), array(), array(), array(), array());
    }

    return getDataFromUrl($pre.$url);
}
/**
 * Reads a page and extracts every anchor element found in it.
 *
 * All markup except <a> tags is stripped first; the remaining anchors are
 * then matched with a regular expression.
 *
 * @param string $url URL or local path readable by file_get_contents().
 * @return array preg_match_all() result in pattern order:
 *               [0] whole anchor elements, [1] opening tags, [2] tag names,
 *               [3] inner text, [4] closing tags. All five lists are empty
 *               when the page cannot be read or contains no anchors.
 */
function getDataFromUrl($url){
    $noMatches = array(array(), array(), array(), array(), array());

    $html = file_get_contents($url);
    if ($html === false) {
        // Unreadable page: report "nothing found" instead of passing false on.
        return $noMatches;
    }

    $html = strip_tags($html, "<a>");

    // The previous pattern had its backslashes replaced by slashes
    // ("[/w]", "<////2>"), which terminated the regex early and made it
    // invalid. The inner group is lazy so that two anchors on the same line
    // are returned as two matches rather than one merged match.
    $found = preg_match_all('/(<(\w+)[^>]*>)(.*?)(<\/\2>)/', $html, $matches);
    if ($found === false) {
        return $noMatches;
    }

    return $matches;
}
// Entry script: scrape the province list, follow each province link to its
// city page, and store everything in province_city with one multi-row INSERT.
set_time_limit(0); // one HTTP request per province; lift the execution time limit
$startTime = getMicrotime();

// The legacy mysql_* extension was removed in PHP 7.0, so mysqli is used.
$conn = mysqli_connect("localhost", "root", "", "365tag");
if (!$conn) {
    die(mysqli_connect_error());
}
// NOTE(review): the schema comment at the top of this file declares `city` as
// latin1; confirm the table/connection charset can store the scraped names.

$sql = "INSERT INTO province_city (id,city, parent_id) VALUES ";
$pre = "http://bjrd.beijing.gov.cn/life/life_com/code/"; // read by filterData() via `global $pre`
$url = "http://bjrd.beijing.gov.cn/life/life_com/code/city.asp";
$matches = getDataFromUrl($url);

$rowCount = 0; // number of value tuples appended to $sql
$id_num = 0;   // number of city rows emitted for all previous provinces
$provinceTotal = isset($matches[0]) ? count($matches[0]) : 0;

for ($i = 0; $i < $provinceTotal; $i++) {
    // Ids are sequential across provinces and cities: a province takes the
    // next free id, and its cities take the ids immediately after it.
    $id = $id_num + $i + 1;
    echo "id_num: ".$id_num."<br>";
    echo "id:".$id."<br>";
    $j = $i + 1;
    echo "<li><p>".$j.".".$matches[3][$i]."</p></li>";

    // Scraped text is untrusted: escape it before embedding it in SQL.
    $sql .= "('".$id."','".mysqli_real_escape_string($conn, $matches[3][$i])."',0),";
    $rowCount++;

    // $matches[1][$i] is the opening <a ...> tag holding the province link.
    $city = filterData($matches[1][$i]);
    $cityTotal = isset($city[0]) ? count($city[0]) : 0;

    for ($k = 0; $k < $cityTotal; $k++) {
        $id_tmp = $id + $k + 1;
        $z = $k + 1;
        $sql .= "('".$id_tmp."','".mysqli_real_escape_string($conn, $city[3][$k])."','".$id."'),";
        $rowCount++;
        echo $z.".".$city[3][$k];
        echo "<br>";
    }

    echo "<hr>";
    $id_num = $id_num + $cityTotal;
}

echo "SQL:<BR>".$sql;
echo "<hr>";
$endTime = getMicrotime();
$execTime = $endTime - $startTime;
echo "<font size=2 color=blue>抓取及分析数据所用时间:".$execTime."</font>";

// Write the collected rows to the database.
if ($rowCount === 0) {
    // With no tuples, trimming at the last comma would cut into the column
    // list and produce invalid SQL; stop with a clear message instead.
    die("No province or city data was scraped; nothing to insert.");
}
// Replace the trailing comma after the last tuple with the terminator.
$sql = rtrim($sql, ",").";";
mysqli_query($conn, $sql) or die(mysqli_error($conn));
?>