Implementing multi-threaded web page downloading in Python

This article shares an example of downloading web pages with multiple threads in Python, showing how to use the threading, httplib, urllib2 and related modules. Readers who need this are welcome to use it as a reference.

Main topic of this section:
learn how to download files with multiple threads in Python.

Code example:

#!/usr/bin/env python
#
#site: www.jb200.com
import httplib
import urllib2
from threading import Thread
from Queue import Queue
from time import sleep

# Route all urllib2 requests through an HTTP proxy (replace with your own).
proxy = 'your proxy'
opener = urllib2.build_opener(urllib2.ProxyHandler({'http': proxy}))
urllib2.install_opener(opener)

# Crawl the list pages and collect the software ids that appear in
# links of the form http://www.jb200.com/soft/<id>.xxx
ids = {}
for i in range(1, 110):
    try:
        listUrl = "http://www.jb200.com/sort/list_8_%d.shtml" % (i)
        print listUrl
        page = urllib2.urlopen(listUrl).read()
        speUrl = "http://www.jb200.com/soft/"
        speUrlLen = len(speUrl)
        idx = page.find(speUrl, 0)
        while idx != -1:
            dotIdx = page.find(".", idx + speUrlLen)
            if dotIdx != -1:
                softId = page[idx + speUrlLen:dotIdx]
                ids[softId] = 1
            idx = page.find(speUrl, idx + speUrlLen)
    except Exception:
        # Skip list pages that fail to load and move on to the next one.
        pass

q = Queue()      # work queue holding the ids to download
NUM = 5          # number of worker threads
failedId = []    # ids whose download failed

def do_something_using(softId):
    try:
        url = "http://www.jb200.com/download.php?softid=%s&type=dx" % (softId)
        # Ask the proxy for the redirect target of the download URL
        # (fill in your proxy host and port).
        h2 = httplib.HTTPConnection("your proxy", "your port")
        h2.request("HEAD", url)
        resp = h2.getresponse()
        location = resp.getheader("Location")
        # Fetch the real file from the redirect target and save it to disk.
        sContent = urllib2.urlopen(location).read()
        savePath = "C:\\someweb\\%s.rar" % (softId)
        with open(savePath, 'wb') as f:
            f.write(sContent)
        print savePath + " saved"
    except Exception:
        failedId.append(softId)

def working():
    # Worker loop: take an id from the queue, download it, pause briefly.
    while True:
        softId = q.get()
        do_something_using(softId)
        sleep(1)
        q.task_done()

# Start NUM daemon worker threads; they exit together with the main thread.
for i in range(NUM):
    t = Thread(target=working)
    t.setDaemon(True)
    t.start()

# Feed every collected id into the work queue.
for softId in ids:
    q.put(softId)

q.join()    # block until every queued id has been processed
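
The listing above targets Python 2 (httplib, urllib2, Queue). For reference only, here is a minimal sketch of the same queue-plus-worker-threads pattern on Python 3, where those modules are renamed to http.client, urllib.request and queue; the URLs and file names below are placeholders, not part of the original example.

Code example:

#!/usr/bin/env python3
# Minimal Python 3 sketch of the same worker-pool download pattern.
# Placeholder URLs/paths; adapt them to the ids collected above.
import threading
import urllib.request
from queue import Queue

NUM = 5
q = Queue()

def download(url, path):
    # Fetch the page body and write it to disk.
    data = urllib.request.urlopen(url).read()
    with open(path, "wb") as f:
        f.write(data)
    print(path, "saved")

def worker():
    while True:
        url, path = q.get()
        try:
            download(url, path)
        except Exception as e:
            print("failed:", url, e)
        finally:
            q.task_done()

# Daemon workers exit automatically once the main thread finishes.
for _ in range(NUM):
    threading.Thread(target=worker, daemon=True).start()

# Placeholder jobs: (url, local file name) pairs.
for i in range(1, 4):
    q.put(("http://www.jb200.com/sort/list_8_%d.shtml" % i, "list_%d.html" % i))

q.join()    # wait until every queued job has been processed

In both versions the daemon workers plus q.join() decide the program's lifetime: the main thread blocks until every item has been acknowledged with task_done(), then exits and takes the workers with it.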