python多进程分块读取文件的实例代码

发布时间:2020-09-13编辑:脚本学堂
本文介绍如何用 Python 实现多进程分块读取文件:各进程在共享锁的保护下依次领取一个按行对齐的数据块,并把该数据块写入各自的临时文件,有需要的朋友可以参考。

代码如下:
   

复制代码 代码示例:
# -*- coding: utf-8 -*-
    #多进程分块读取文件
     
    import urlparse 
    import datetime 
    import os 
    from multiprocessing import Process,Queue,Array,RLock 
     
    # Number of worker processes started by main(); also the divisor main()
    # uses to derive BLOCK_SIZE.
    WORKERS = 6
    # Nominal bytes per block; placeholder until main() sets it to
    # FILE_SIZE / WORKERS.
    BLOCK_SIZE = 0
    # Size of FILE_NAME in bytes; placeholder until getFilesize() fills it in.
    FILE_SIZE = 0
    # Path of the input file that is measured by main() and read by
    # process_found().
    FILE_NAME = 'try.log'
     
    def getFilesize(file): 
        global FILE_SIZE 
        fs = open(file,'r') 
        fs.seek(0,os.SEEK_END) 
        FILE_SIZE = fs.tell() 
        fs.close() 
     
    def process_found(pid,array,rlock): 
        global FILE_SIZE,BLOCK_SIZE 
        fs = open(FILE_NAME,'rb') 
        try: 
            rlock.acquire() 
            begin = array[0] 
            end = (begin + BLOCK_SIZE) 
            print begin,end 
            if begin >= FILE_SIZE: 
                print 'begin',begin 
                array[0] = begin 
                raise Exception('end of file') 
            if end < FILE_SIZE: 
                fs.seek(end) 
                fs.readline() 
                end = fs.tell() 
            if end >= FILE_SIZE: 
                end = FILE_SIZE 
            array[0] = end 
            print '-------------',begin,end 
        except Exception, e: 
            print e.__class__.__name__,str(e) 
            return 
        finally: 
            rlock.release() 
     
        fs.seek(begin) 
        pos = begin 
        fd = open('tmp_pid'+str(pid)+'_jobs','wb') 
        while pos < end: 
            fd.write(fs.readline()) 
            pos = fs.tell() 
     
        fs.close() 
        fd.close() 
     
    def main(): 
        global FILE_SIZE,BLOCK_SIZE,WORKERS,FILE_NAME 
        getFilesize(FILE_NAME) 
        BLOCK_SIZE = FILE_SIZE/WORKERS 
        print FILE_SIZE,BLOCK_SIZE 
        rlock = RLock() 
        array = Array('l',WORKERS) 
        array[0] = 0 
        process=[] 
        for i in range(WORKERS): 
            p=Process(target=process_found, args=[i,array,rlock]) 
            process.append(p) 
        for i in range(WORKERS): 
            process[i].start() 
        for i in range(WORKERS): 
            process[i].join() 
     
    if __name__ == '__main__': 
        main()