有时很想知道搜索引擎蜘蛛每天爬行了网站的哪些页面、各爬了几次,但本人网站都是静态页面,而且空间商的访问日志需要先设置保留日志,第二天才会开始保留。
以下代码利用伪静态的逆向思路(把对静态页面的请求重写到 PHP 脚本),实现对静态页面的蜘蛛爬行记录。
供大家学习参考。
php文件:bot.php
<?php
/**
 * bot.php - logs search-engine spider visits to rewritten static pages and
 * then serves the requested .html file.
 *
 * The rewrite rules send requests for static pages to this script; the
 * originally requested URL is read from $_SERVER['HTTP_X_REWRITE_URL'].
 * When the User-Agent matches a known spider signature, one tab-separated
 * line (time, client IP, spider label, full URL) is appended to bot.txt.
 * The script then echoes the matching .html file from the current
 * directory tree, or answers 404 when there is no such file.
 *
 * Syntax is kept compatible with old PHP versions on purpose (array(),
 * isset()-ternaries, strpos()).
 */

// The User-Agent header is optional, so fall back to an empty string.
// Lower-casing it makes every signature below a case-insensitive match.
// (The value is only searched, never stored, so no escaping is needed.)
$useragent = isset($_SERVER['HTTP_USER_AGENT']) ? strtolower($_SERVER['HTTP_USER_AGENT']) : '';

// Signature => label, checked in order; the first hit wins.
// Every signature must be lower-case, and a more specific signature must
// come before any shorter one it contains ('msnbot' before 'msn', every
// named spider before the catch-all 'bot').
$signatures = array(
    'googlebot'            => 'Google',
    'mediapartners-google' => 'Google Adsense',
    'baiduspider'          => 'Baidu',
    'sogou spider'         => 'Sogou',
    'sogou web'            => 'Sogou web',
    'sosospider'           => 'SOSO',
    'yahoo'                => 'Yahoo',
    'msnbot'               => 'msnbot',
    'msn'                  => 'MSN',
    'sohu'                 => 'Sohu',
    'yodaobot'             => 'Yodao',
    'twiceler'             => 'Twiceler',
    'ia_archiver'          => 'Alexa_',
    'iaarchiver'           => 'Alexa',
    'slurp'                => '雅虎',
    'bot'                  => '其它蜘蛛',
);

$bot = null;
foreach ($signatures as $signature => $label) {
    if (strpos($useragent, $signature) !== false) {
        $bot = $label;
        break;
    }
}

// Original (pre-rewrite) request URL; empty when the script is called
// directly and the rewrite header is therefore absent.
$url = isset($_SERVER['HTTP_X_REWRITE_URL']) ? $_SERVER['HTTP_X_REWRITE_URL'] : '';

if ($bot !== null) {
    // Keep one visit per line: strip characters that would break the
    // tab-separated layout of the log.
    $loggedUrl = str_replace(array("\r", "\n", "\t"), ' ', $url);
    $line = date('Y-m-d H:i:s') . "\t"
        . $_SERVER['REMOTE_ADDR'] . "\t"
        . $bot . "\t"
        . 'http://' . $_SERVER['SERVER_NAME'] . $loggedUrl . "\r\n";

    // Logging is best-effort: '@' keeps a failed open from emitting a
    // warning before the headers below, and the page is served either way.
    $fp = @fopen('bot.txt', 'a');
    if ($fp !== false) {
        fwrite($fp, $line);
        fclose($fp);
    }
}

// Only the path part identifies the file; drop any query string.
$path = $url;
$queryStart = strpos($path, '?');
if ($queryStart !== false) {
    $path = substr($path, 0, $queryStart);
}

// Resolve the request to a real file and make sure it is an existing
// .html file located inside the current directory, so that '..' or
// backslash tricks cannot reach files outside the site.
$servable = false;
if (substr($path, -5) === '.html' && strpos($path, "\0") === false) {
    $root = realpath('.');
    $candidate = realpath('.' . $path);
    if ($root !== false && $candidate !== false && is_file($candidate)) {
        $prefix = rtrim($root, '/\\') . DIRECTORY_SEPARATOR;
        if (strncmp($candidate, $prefix, strlen($prefix)) === 0) {
            $servable = $candidate;
        }
    }
}

if ($servable !== false) {
    echo file_get_contents($servable);
} else {
    header('HTTP/1.1 404 Not Found');
    // Some CGI/FastCGI setups only honour the "status" header.
    header("status: 404 Not Found");
    echo "该页面无法找到";
}
?>
伪静态文件内容:
[ISAPI_Rewrite]
# 3600 = 1 hour
CacheClockRate 3600
RepeatLimit 32
# Protect httpd.ini and httpd.parse.errors files
# from accessing through HTTP
RewriteRule ^/httpd(?:\.ini|\.parse\.errors).*$ / [F,I,O]
# Serve the home page through index.php
RewriteRule /index.html /index.php
# Route static article and list pages through bot.php, which logs
# spider visits and then outputs the requested .html file
RewriteRule ^/article/(.*) /bot.php [L]
RewriteRule ^/list/(.*) /bot.php [L]