1. PHP 获取百度(Baidu)搜索结果中的 URL 列表。
代码:
<?php
/*
 * Collect result URLs from Baidu search pages.
 * edit: www.jb200.com
 */
echo "Website: www.jb200.com\r\n";
/*
 * Example search URL:
 *   http://www.baidu.com/s?wd=test&pn=30&ie=utf-8&usm=2&rsv_page=1&rn=100
 *   wd=test   search keyword
 *   ie=utf-8  character encoding
 *   rn=100    results per page
 *   pn=30     offset of the first result, i.e. ($page - 1) * rn
 */
/********************
 * $keywords  search string
 * $page      number of pages to fetch
 * $num       results per page
 * $result    accumulated result set
 ********************/
define("_RESULT", 'baidu_result.txt'); // default output file name
$result = array(); // global accumulator that geturl() appends to
/**
 * Fetch one Baidu result page and merge the hosts it lists into the global $result.
 *
 * Each result's <span class="g"> element is scanned and the text before its
 * first "/" is kept. Entries are trimmed and added to the global $result
 * array only if they are not already present.
 *
 * @param string $keywords Search keyword; it is inserted verbatim into the
 *                         query string, so the caller must URL-encode it.
 * @param int    $page     1-based page number.
 * @param int    $num      Results per page (Baidu's "rn" parameter).
 * @return void            Results are delivered through the global $result.
 */
function geturl($keywords, $page, $num)
{
    global $result;
    if (!is_array($result)) {
        $result = array();
    }

    // Baidu's "pn" parameter is the offset of the first result, not a page number.
    $offset = ($page - 1) * $num;
    $content = file_get_contents("http://www.baidu.com/s?wd=$keywords&pn=$offset&ie=utf-8&usm=2&rsv_page=1&rn=$num");
    if ($content === false || $content === '') {
        return; // request failed or returned nothing; leave $result untouched
    }

    // Drop the <b></b> highlighting tags so they do not end up inside captured hosts.
    $html = str_replace(array('<b>', '</b>'), '', $content);

    // "#" is used as the delimiter so the slashes in the pattern need no escaping.
    if (!preg_match_all('#<span\s*class="g">(.*?)/.*?</span>#', $html, $urlList)) {
        return;
    }

    foreach ($urlList[1] as $u) {
        $u = trim($u);
        if ($u !== '' && !in_array($u, $result, true)) {
            $result[] = $u;
        }
    }
}
// Interactive driver: ask for a keyword and an output file name, fetch the
// result pages through geturl(), then write the collected entries to disk.
echo "请输入搜索关键字: \r\n";
// fgets() keeps the trailing newline and returns false on EOF; trim it so the
// newline is not URL-encoded into the query as %0A.
$baiduDORK = urlencode(trim((string) fgets(STDIN)));
echo "保存结果文件: [不输入即默认]: \r\n";
$f_result = trim((string) fgets(STDIN));
if ($f_result === '') {
    $f_result = _RESULT;
}
// Full path of the output file; the existence check below uses the same path
// that is written later, rather than a path relative to the working directory.
$names = dirname(__FILE__) . DIRECTORY_SEPARATOR . $f_result;
if (file_exists($names)) {
    // Best-effort removal so stale results do not survive a failed run.
    @unlink($names);
    echo "结果文件$f_result 以存在,新结果文件将覆盖 ...\r\n";
}
$page = 10;  // number of pages to fetch
$num = 100;  // results per page
for ($i = 1; $i <= $page; $i++) {
    // geturl() appends to the global $result; it takes exactly three arguments.
    geturl($baiduDORK, $i, $num);
}
if (count($result) < 1) {
    exit("[-] 数据获取失败!请检查网络环境!\r\n");
}
echo "获取" . count($result) . "条数据!\r\n";
// file_put_contents() returns false on failure; compare strictly instead of
// relying on truthiness of the byte count.
if (file_put_contents($names, join("\r\n", $result)) !== false) {
    echo "正在保存数据...\r\n";
    usleep(100000);
    echo "保存位置:\r\n" . $names . "\r\n\r\n";
} else {
    // This branch is a write failure, not a network failure.
    exit("数据保存失败!请检查文件写入权限!\r\n");
}
?>
2. 获取 Google 搜索结果的 URL 列表。
代码:
<?php
echo "Website: www.jb200.com\r\n";
// Remove the execution time limit; the script issues many sequential requests.
set_time_limit(0);
// Report fatal run-time errors only.
error_reporting(E_ERROR);
define("_RESULT", 'result.txt'); // default output file name
/**
 * Fetch one Google result page and return the distinct "http://host" prefixes
 * found in the anchors of its result headings (<h3 class="r">).
 *
 * @param string $keywords Search query; it is inserted verbatim into the URL,
 *                         so the caller must URL-encode it.
 * @param int    $page     1-based page number.
 * @param int    $num      Results per page (Google's "num" parameter).
 * @return array           List of unique "http://host" strings; empty when the
 *                         request fails or no result heading is found.
 */
function geturl($keywords, $page, $num)
{
    // "start" is a result offset, so a page advances by $num results; a fixed
    // step of 10 would make consecutive pages overlap whenever $num > 10.
    $start = ($page - 1) * $num;
    $content = file_get_contents(
        "http://www.google.com/search?sclient=psy-ab&hl=en&start=$start&source=hp&q=$keywords&pbx=1&oq=$keywords&num=$num&aq=f&aqi=g4"
    );
    if ($content === false || $content === '') {
        return array();
    }

    // Each match runs from a result heading to the end of its line.
    if (!preg_match_all('/<h3\s*class="r"\s*>.*/im', $content, $m)) {
        return array();
    }

    // Scan every matching line, not just the first one.
    $headings = implode("\n", $m[0]);
    if (!preg_match_all('/<a(.*?)>/', $headings, $ms)) {
        return array();
    }

    $list = array();
    foreach ($ms[1] as $link) {
        // Keep the scheme and host only; the slashes must be escaped inside a "/"-delimited pattern.
        if (preg_match('/http:\/\/[a-zA-Z0-9._-]*/', $link, $matches) && $matches[0] !== 'http://') {
            $list[] = $matches[0];
        }
    }

    // array_unique() preserves keys; reindex so callers get a plain list.
    return array_values(array_unique($list));
}
// Interactive driver: ask for a query and an output file name, fetch the
// result pages through geturl(), then write the unique entries to disk.
echo "Please Enter GoogleDork: \r\n";
// fgets() keeps the trailing newline and returns false on EOF; trim it so the
// newline is not URL-encoded into the query as %0A.
$GOOGLEDORK = urlencode(trim((string) fgets(STDIN)));
echo "Result file [Enter for None]: \r\n";
$result = trim((string) fgets(STDIN));
if ($result === '') {
    $result = _RESULT;
}
// Write to the same path that is reported to the user below.
$names = dirname(__FILE__) . DIRECTORY_SEPARATOR . $result;
if (file_exists($names)) {
    // Best-effort removal of the previous output file.
    @unlink($names);
    echo "Clear Cache ...\r\n";
}
$page = 20;   // number of pages to fetch
$num = 100;   // results per page
// Results are accumulated in memory; no intermediate 'tmp' file is used, so a
// stale file from an earlier run cannot leak into this run's output.
$collected = array();
for ($i = 1; $i <= $page; $i++) {
    $url = geturl($GOOGLEDORK, $i, $num);
    print_r('[+] Page: ' . $i . ' Results Count: ' . count($url) . "\r\n");
    foreach ($url as $u) {
        $u = trim($u);
        if ($u !== '') {
            $collected[] = $u;
        }
    }
}
$new_array = array_values(array_unique($collected));
// An empty result set is treated as a failure, as is a failed write.
if (count($new_array) > 0 && file_put_contents($names, join("\r\n", $new_array)) !== false) {
    echo "Get Subdomain Success!\r\n\r\n";
    usleep(100000);
    echo "Save To:\r\n" . $names . "\r\n\r\n";
} else {
    echo "\r\n[!] Failed! Connect Google Error!\r\n ";
    echo "\r\n[-] Plase Proxy...\r\n";
}
exit;
?>