Python抓取网页界面的简单代码

发布时间:2019-09-11编辑:脚本学堂
分享一例用Python抓取网页的代码:通过urllib2获取预先指定地址的页面,再通过BeautifulSoup解析页面元素。有需要的朋友可以参考。

初学Python,做了个简易的网页抓取程序,主要是为了增加对Python的熟悉度。

本代码主要是通过urllib2获取预先指定的地址的页面,通过BeautifulSoup来解析页面元素,找到a标签的href属性,并将相关的数据存入数据库,以方便后面取出继续抓取。
整个抓取和解析也是基于多线程与队列来控制的。做得比较简单、粗糙,后续深入时可以改进。

python代码:
 

复制代码 代码示例:
import DBHelper 
import CodeHelper 
import urllib2 
from bs4 import BeautifulSoup 
import threading as thread 
import Queue 
import time 
 
class Resource:
    """One row of the ``resource`` table: a URL plus its link text, content and status.

    Every method opens its own ``DBHelper.DBHelper()`` instance. DBHelper is a
    project module whose implementation is not visible here, so the notes
    below describe only how this class calls it.
    """

    def __init__(self, url, text, content, status):
        """Store the four column values; no database access happens here."""
        self._url = url
        self._text = text
        self._content = content
        self._status = status

    def _describe(self):
        """Return the one-line summary that insert() prints before writing."""
        # Four placeholders for four values; with only three placeholders the
        # % operator raises "not all arguments converted".
        return 'url: %s text: %s content: %s status: %s' % (
            self._url, self._text, self._content, self._status)

    def insert(self):
        """Insert this resource unless a row with the same url is already stored.

        A row counts as present when ``DBHelper.read`` returns anything other
        than None (presumably None means "no match" -- confirm in DBHelper).
        """
        dbHelper = DBHelper.DBHelper()
        try:
            sql = 'select * from resource where url=%s'
            data = dbHelper.read(sql, [self._url])
            if data is not None:
                return
            sql = 'insert into resource(url,text,content,status) values(%s,%s,%s,%s)'
            print(self._describe())
            dbHelper.execute(sql, [self._url, self._text, self._content, self._status])
            # NOTE(review): "commint" is the method name this file calls on
            # DBHelper; it looks like a misspelling of "commit" but must
            # match whatever DBHelper actually defines.
            dbHelper.commint()
        finally:
            # Release the helper on every path, including the early return.
            dbHelper.close()

    def updateStatus(self):
        """Set the stored status of the row whose url matches this resource."""
        dbHelper = DBHelper.DBHelper()
        try:
            sql = 'update resource set status=%s where url=%s'
            dbHelper.execute(sql, [self._status, self._url])
            dbHelper.commint()
        finally:
            dbHelper.close()

    def updateContentAndStatus(self):
        """Set the stored content and status of the row whose url matches."""
        dbHelper = DBHelper.DBHelper()
        try:
            sql = 'update resource set content=%s,status=%s where url=%s'
            dbHelper.execute(sql, [self._content, self._status, self._url])
            dbHelper.commint()
        finally:
            dbHelper.close()

    def readListByStatus(self):
        """Return whatever ``DBHelper.readList`` yields for rows with this status."""
        # NOTE(review): the helper is deliberately not closed here. It is not
        # visible from this file whether readList returns a fully
        # materialised result that would survive close().
        dbHelper = DBHelper.DBHelper()
        sql = 'select * from resource where status=%s'
        return dbHelper.readList(sql, [self._status])

    def readList(self):
        """Return whatever ``DBHelper.readList`` yields for the whole table."""
        # NOTE(review): same caveat as readListByStatus -- helper left open.
        dbHelper = DBHelper.DBHelper()
        return dbHelper.readList('select * from resource')
         
class ResourceThread(thread.Thread):
    """Daemon worker that drains ``(func, args)`` items from a shared queue.

    The thread starts itself on construction and exits once a non-blocking
    ``get`` finds the queue empty.
    """

    def __init__(self, task_queue):
        """Bind the worker to *task_queue* and start it immediately."""
        thread.Thread.__init__(self)
        self._task_queue = task_queue
        self.daemon = True
        self.start()

    def run(self):
        """Run queued tasks until the queue is empty.

        Each item is a ``(func, args)`` pair and is invoked as ``func(args)``,
        i.e. the callable receives the argument tuple as a single parameter.
        """
        print('current thread name %s' % thread.current_thread().name)
        while True:
            try:
                func, args = self._task_queue.get(block=False)
            except Queue.Empty:
                # Nothing left to do: this is the normal exit path.
                break
            try:
                func(args)
            except Exception as e:
                # A failing task must not take the worker down with it;
                # report it and move on to the next queued item.
                print(str(e))
            finally:
                # Always balance the get() so Queue.join() cannot hang.
                self._task_queue.task_done()
             
class ResourceManager:
    """Owns a task queue and the worker threads that consume it.

    Construction first fills the queue with ``taskNum`` calls to ``do_task``
    and then creates ``threadNum`` ``ResourceThread`` workers on that queue.
    """

    def __init__(self, taskNum = 10, threadNum = 2):
        """Create the queue, enqueue the tasks, then create the workers."""
        self._task_queue = Queue.Queue()
        self._threads = []
        # Tasks are queued before any worker exists, so no worker can find
        # the queue empty before it has been filled.
        self.__init__task_queue__(taskNum)
        self.__init__thread_pool(threadNum)

    def __init__task_queue__(self, taskNum):
        """Enqueue ``do_task`` once for every index in ``range(taskNum)``."""
        for task_index in range(taskNum):
            print('this is %s task' % task_index)
            self.add_task(do_task, task_index)

    def __init__thread_pool(self, threadNum):
        """Create ``threadNum`` workers and remember them for joining later."""
        for worker_index in range(threadNum):
            print('threadNum %s' % worker_index)
            self._threads.append(ResourceThread(self._task_queue))

    def add_task(self, func, *args):
        """Queue *func* together with its positional arguments as a tuple."""
        work_item = (func, args)
        self._task_queue.put(work_item)

    def check_queue(self):
        """Return the queue's current ``qsize()``."""
        return self._task_queue.qsize()

    def wait_for_complete(self):
        """Join every worker that is still alive."""
        for worker in self._threads:
            if not worker.is_alive():
                continue
            worker.join()
     
def do_task(args):
    """Fetch every stored resource with status 0 and record the links it contains.

    For each such row the page is downloaded, its ``<body>`` (truncated to
    9999 characters) is saved with status 1, and every ``<a href=...>`` found
    is inserted as a new status-0 resource.

    args: value supplied by the task queue; it is only printed.
    """
    print('this task args %s' % args)
    resource = Resource(None, None, None, 0)
    data = resource.readListByStatus()
    # Wrapped in a 1-tuple so the formatting also works if data is a tuple.
    print('read status 0 data is %s' % (data,))
    if data is None:
        return
    for item in data:
        # Assumes column 1 of a resource row is the url -- TODO confirm
        # against the table definition.
        url = item[1]
        if url is None or url.find('http://') == -1:
            continue
        try:
            response = urllib2.urlopen(url)
            try:
                content = response.read()
            finally:
                response.close()
        except IOError as e:
            # urllib2.URLError and socket errors derive from IOError. One
            # unreachable page should not abort the rest of the batch.
            print('fetch %s failed: %s' % (url, e))
            continue
        html = BeautifulSoup(content)
        fetch_resource = Resource(url, None, str(html.find('body'))[0:9999], 1)
        fetch_resource.updateContentAndStatus()
        aLinks = html.find_all('a')
        print('aLinks %s' % aLinks)
        for aLink in aLinks:
            href = aLink.get('href')
            if not href:
                # Anchors without an href carry no URL to crawl later.
                continue
            a_text = CodeHelper.encodeContent(aLink.get_text())
            print('href %s text %s' % (href, a_text))
            subResource = Resource(href, a_text, '', 0)
            subResource.insert()
             
def execute():
    """Seed the resource table with the start URLs, run the crawl, print the elapsed time."""
    urls = ['http://www.jb200.com', 'http://www.1ting.com/', 'http://www.jbxue.net/', 'http://y.qq.com/']
    for url in urls:
        # Resource takes (url, text, content, status). Seeds have no link
        # text or content yet and start with status 0.
        resource = Resource(url, None, None, 0)
        resource.insert()

    start = time.time()
    # 20 queued tasks shared between 4 worker threads.
    resource_manager = ResourceManager(20, 4)
    resource_manager.wait_for_complete()
    end = time.time()
    print("cost all time: %s" % (end - start))
 
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    execute()