python查询百度关键词排名的实现代码

发布时间:2020-11-23编辑:脚本学堂
分享一例python代码,用于查询关键词在百度中的排名,用到了urllib和urllib2以及re正则表达式模块。有兴趣的朋友一起学习研究下。

本例使用 Python 2 的 urllib、urllib2 以及 re 正则表达式模块,查询某个关键词在指定站点的百度排名。

编辑推荐:
php检测页面是否被百度收录
php 获取百度收录和百度快照时间
php 查询百度与google收录情况
php 百度快照、百度收录、百度热词

来看具体的实现代码:
 

复制代码 代码示例:

#!/bin/python
# -*- coding: utf-8 -*-
#site: www.jb200.com
#encoding = utf-8
import urllib2
import urllib
import re
from urllib import quote_plus
from urlparse import urlparse

# Matches one Baidu result entry: the (redirect) link inside the <h3> title
# and the "display-url date" text of the following <span class="g">.
# Compiled once at import time instead of once per fetched page.
_BAIDU_RESULT_PATTERN = re.compile(
    r'<h3 class="t"><a\s+data-click="[^"]+"  href="(?P<url>[^"]+)"'
    r'.*?<span class="g">(?P<urldate>[^<]+)</span>')


def _result_host(urldate):
    """Return the lower-cased host taken from a result's "url date" text.

    The text is expected to start with the displayed URL (without scheme),
    optionally followed by whitespace and a date.  Any path and port are
    dropped.  Returns '' when the text holds no URL at all.
    """
    parts = urldate.split()
    if not parts:
        return ''
    # The displayed URL has no scheme; add one so urlparse fills in netloc.
    netloc = urlparse('http://' + parts[0]).netloc
    return netloc.split(':', 1)[0].lower()


def get_site_word_baidu_rank(siteHost,word,maxScanPageNumber = 10,printSearchLog=False):
    """Find the Baidu rank of the first result that belongs to ``siteHost``.

    Result pages for ``word`` are fetched one after another (10 results per
    page) and every entry recognised by the result pattern is checked.  An
    entry belongs to the site when its displayed host equals ``siteHost`` or
    is a sub-domain of it (comparison is case-insensitive).

    Args:
        siteHost: Host name to look for, e.g. 'example.com'.
        word: Search keyword; it is URL-encoded with ``quote_plus``.
        maxScanPageNumber: Number of result pages to scan, counted from
            page 1 and inclusive.  A value below 1 scans nothing.
        printSearchLog: When true, print a progress line for each page.

    Returns:
        ``(rank, page, url, searchUrl)`` for the first matching entry, or
        ``None`` when no scanned page contains one.  ``rank`` is
        ``(page - 1) * 10`` plus the 1-based index of the entry among the
        entries the pattern recognised on that page; ``url`` is the address
        reached after following the result link; ``searchUrl`` is the
        result-page URL on which the entry was found.

    Raises:
        Whatever ``urllib.urlopen`` / ``urllib2.urlopen`` raise on network
        or HTTP failures; such errors are not caught here.
    """
    def printLog(log):
        if printSearchLog:
            print(log)

    pageSize = 10
    siteHost = siteHost.lower()
    subdomainSuffix = '.' + siteHost

    # Pages 1..maxScanPageNumber inclusive (the limit itself is scanned too).
    for page in range(1, maxScanPageNumber + 1):
        searchUrl = 'http://www.baidu.com/s?wd='+quote_plus(word)+'&pn='+str((page-1)*pageSize)+'&tn=baiduhome_pg&ie=utf-8&usm=2'
        printLog('搜索第%d页' % (page,))

        response = urllib.urlopen(searchUrl)
        try:
            html = response.read()
        finally:
            response.close()

        for number, m in enumerate(_BAIDU_RESULT_PATTERN.finditer(html), 1):
            host = _result_host(m.group('urldate'))
            # Suffix test, not substring: 'www.site.cn.evil.com' must not
            # count as belonging to 'site.cn'.
            if host != siteHost and not host.endswith(subdomainSuffix):
                continue

            # The result link is a redirect; follow it to learn the real URL.
            realUrlFile = urllib2.urlopen(m.group('url'))
            try:
                gotUrl = realUrlFile.geturl()
            finally:
                realUrlFile.close()
            return ((page - 1) * pageSize + number, page, gotUrl, searchUrl)

    return None

if __name__ == '__main__':
    # Demo run: report where the target site ranks on Baidu for a handful of
    # sample keywords, one output line per keyword.
    targetHost = 'outofmemory.cn'
    foundTemplate = ':您的网站排在第%d位,在第%d页,排上的链接是%s,搜索页地址%s'

    for keyword in ('程序员','内存溢出','Outofmemory','python','java'):
        rankInfo = get_site_word_baidu_rank(targetHost, keyword, 10)

        if not rankInfo:
            # No matching entry within the scanned pages.
            print('未找到记录')
            continue

        # rankInfo is (rank, page, resolved url, search page url).
        print(keyword + foundTemplate % rankInfo)