Downloading Images with Multiple Threads and a Proxy Server


Overview

Below is a code snippet collected from the web and organized by 内存溢出 (jb51.cc), shared here for your reference. The script crawls the p.lexun.net photo listings, collects album and image URLs into three text files, and then downloads the images with several worker threads, switching to a proxied request whenever a direct download fails.
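The key mechanism the script relies on is that urllib2 lets a proxy be installed globally: once an opener built from a ProxyHandler has been installed, every subsequent urlopen() call is routed through that proxy. Here is a minimal sketch of that setup, using a placeholder proxy address and example URL rather than the ones hard-coded in the script:

import urllib2

# Route all subsequent urllib2 requests through an HTTP proxy.
# The proxy address below is only a placeholder for illustration.
proxy = urllib2.ProxyHandler({'http': 'http://127.0.0.1:8080'})
opener = urllib2.build_opener(proxy)
urllib2.install_opener(opener)

# From here on, urlopen() goes through the proxy.
html = urllib2.urlopen('http://example.com/', timeout=10).read()
print len(html)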

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Crawl the p.lexun.net photo listings, collect album and image URLs into
# three text files, then download the images with multiple worker threads,
# falling back to a proxied request when a direct download fails.
import urllib2, urllib
import re, time, socket
import os
import sys
import threading

# Working folder and the three intermediate URL lists.
path = os.getcwd()
new_path = os.path.join(path, r'mnsfz')
if not os.path.isdir(new_path):
    os.mkdir(new_path)
path1 = new_path + '/' + r'List1.txt'
k = open(path1, 'wt')
k.close()
path2 = new_path + '/' + r'List2.txt'
g = open(path2, 'wt')
g.close()
path3 = new_path + '/' + r'List3.txt'
g = open(path3, 'wt')
g.close()

match1 = r'<div ><a href="(info_topList.*?)"'   # "next page" link on a listing page
match2 = r'(http.*?)#'                           # URLs stored in List1.txt
match3 = r'<a href="(unit_info.*?ps=18)">'       # album page URLs in the listing HTML
match4 = r'(unit_info.*?ps=18)'                  # addresses stored in List2.txt
# Image download address inside an opened album, e.g.
# <input name="picurl" type="hidden" value="http://d4.lexun.net/d43/act/20150324/18/94798621.jpg" emptyok="true" />
match5 = r'value="(http://.*?\.jpg)" emptyok="true" />'
# "Next page" link inside an album (下一页 means "next page"); this pattern was
# garbled in the collected snippet and is reconstructed here.
match6 = r'<a href="(unit_info.*?ps=18)">' + u'下一页'
match7 = r'(http.*?\.jpg)'                       # addresses stored in List3.txt
match8 = r'<img src="(http.*?\.jpg)" alt='       # original-size image address, e.g. <a href="http...jpg">立即下载
match9 = r'(http.*?\.jpg)'                       # real image URL on the download page (not used below)

url1 = r'http://p.lexun.net/w/info_topList.aspx?flag=1&ps=18&total=17967&total=17967&cd=0&lxt=404dd8b222b4d64dsggshhtgrq&vs=1&_r=451103666'


def pageloop1(url1):
    # Follow the "next page" link through the first 40 listing pages and
    # record each page URL (with a trailing '#' separator) in List1.txt.
    for i in range(1, 41):
        putintotxt(url1 + r'#', path1)
        HTML = useragent(url1)
        bturl = geturl(match1, HTML)
        if bturl:
            src = bturl[0]
            url1 = r'http://p.lexun.net/w/' + src.replace(r'amp;', '')


def pageloop2(url2):
    # Collect the album URLs found on one listing page into List2.txt.
    print r'page', url2
    HTML2 = useragent(url2)
    Pagelist = geturl(match3, HTML2)
    putintotxt(Pagelist, path2)


def pageloop3(pageurl):
    # Collect the image URLs of one album into List3.txt, recursing through
    # the album's "next page" links.
    url2 = r'http://p.lexun.net/w/' + pageurl.replace(r'amp;', '')
    # print r'next page', url2
    HTML3 = useragent(url2)
    imgList = geturl(match5, HTML3)
    # print imgList
    putintotxt(imgList, path3)
    nextimgurl = geturl(match6, HTML3)
    if nextimgurl:
        src = nextimgurl[0]
        pageurl2 = r'http://p.lexun.net/w/' + src.replace(r'amp;', '')
        pageloop3(pageurl2)


def pageloop4(urlimg):
    # Download one image directly; if that fails, fall back to the proxied
    # request in useragent2().
    try:
        name = os.path.basename(urlimg)
        if os.path.isfile(new_path + '/' + name):
            print u'already exists:', name
        else:
            content = urllib2.urlopen(urlimg, None, timeout=20).read()
            with open(new_path + '/' + name, 'wb') as code:
                code.write(content)
            print urlimg
    except:
        print u'direct download failed, trying the proxy:', urlimg
        useragent2(urlimg)


def useragent2(urlimg):
    # Fallback path: post the image URL to the site's pic_zoomr.aspx resizing
    # service through an HTTP proxy, then download the image URL it returns.
    try:
        url = r'http://app.lexun.net/resizepic/pic_zoomr.aspx?cd=0&lxt=404dd8b222b4d64dsggshhtgrq&vs=1&_r=3925580' + str(int(time.time()))
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36"
        }
        data = urllib.urlencode({"picurl": urlimg})
        req = urllib2.Request(url, data, headers)
        proxy_support = urllib2.ProxyHandler({'http': 'http://190.79.62.76:8080'})
        opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
        urllib2.install_opener(opener)
        response = urllib2.urlopen(req)
        HTML = response.read()
        HTL = geturl(match8, HTML)
        print HTL[0]
        pageloop4(HTL[0])
    except:
        pass


class getallpag(threading.Thread):
    # Each thread scrapes its own slice of the listing pages in ALLPAG.
    def __init__(self, begin, end):
        threading.Thread.__init__(self)
        self.begin = begin
        self.end = end

    def run(self):
        for i in range(self.begin, self.end):
            pageloop2(ALLPAG[i])


class getimgpag(threading.Thread):
    # Each thread scrapes its own slice of the album pages in ALLPAG2.
    # (This class body was garbled in the collected snippet and is
    # reconstructed following the pattern of getallpag.)
    def __init__(self, begin, end):
        threading.Thread.__init__(self)
        self.begin = begin
        self.end = end

    def run(self):
        for i in range(self.begin, self.end):
            pageloop3(ALLPAG2[i])


class getmypic(threading.Thread):
    # Each thread downloads its own slice of the image URLs in ALLPIC.
    # (Reconstructed the same way as getimgpag.)
    def __init__(self, begin, end):
        threading.Thread.__init__(self)
        self.begin = begin
        self.end = end

    def run(self):
        for i in range(self.begin, self.end):
            pageloop4(ALLPIC[i])


def geturl(match, HTML):
    # Return every match of the given pattern in the page source.
    reg = re.compile(match)
    URLNEXT = re.findall(reg, HTML)
    return URLNEXT


def putintotxt(url, path):
    # Append a URL (or a list of URLs) to the given text file.
    with open(path, 'a+') as code:
        code.writelines(url)


def useragent(url):
    # Fetch a page; on any error return a dummy string so the regexes
    # simply find nothing.
    try:
        HTML = urllib2.urlopen(url, timeout=10).read()
        # time.sleep(1)
    except:
        HTML = r'123456'
    return HTML


def Listmk(path, match):
    # putintotxt() writes URLs without newlines, so the whole file is one
    # line; pull every match out of that line.
    f = open(path, 'r+')
    allurl = f.readlines()
    f.close()
    reg = re.compile(match)
    urlList = re.findall(reg, allurl[0])
    return urlList


# Stage 1: walk the first 40 listing pages and record their URLs.
pageloop1(url1)

# Stage 2: scrape every listing page for album URLs, 10 pages per thread.
ALLPAG = Listmk(path1, match2)
l = len(ALLPAG)
print l
if __name__ == '__main__':
    threads = []
    m = 0
    while m < l:
        threads.append(getallpag(m, min(m + 10, l)))
        m += 10
    for t in threads:
        t.start()
    for t in threads:
        t.join()

# Stage 3: scrape every album page for image URLs, 100 albums per thread.
ALLPAG2 = Listmk(path2, match4)
l2 = len(ALLPAG2)
print l2
if __name__ == '__main__':
    threads = []
    m = 0
    while m < l2:
        threads.append(getimgpag(m, min(m + 100, l2)))
        m += 100
    for t in threads:
        t.start()
    for t in threads:
        t.join()

# Stage 4: download the images, 100 URLs per thread.
ALLPIC = Listmk(path3, match7)
kl = len(ALLPIC)
print u'total:', kl
if __name__ == '__main__':
    threads = []
    i = 0
    while i < kl:
        threads.append(getmypic(i, min(i + 100, kl)))
        i += 100
    for t in threads:
        t.start()
    # wait for the worker threads to finish
    for t in threads:
        t.join()
    print "the end!!"
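For reference, the threading pattern used above (Thread subclasses that each work through their own slice of a shared URL list) can be reduced to a small self-contained sketch. This is only an illustrative outline in the same Python 2 style, not part of the collected code; the folder name, chunk size, and empty URL list are placeholders:

import os
import threading
import urllib2

def download(url, folder):
    # Save one image, skipping files that already exist.
    name = os.path.join(folder, os.path.basename(url))
    if os.path.isfile(name):
        return
    data = urllib2.urlopen(url, timeout=20).read()
    with open(name, 'wb') as f:
        f.write(data)

class Downloader(threading.Thread):
    # Each thread works through its own slice of the URL list.
    def __init__(self, urls, folder):
        threading.Thread.__init__(self)
        self.urls = urls
        self.folder = folder

    def run(self):
        for url in self.urls:
            try:
                download(url, self.folder)
            except Exception as e:
                print url, e

if __name__ == '__main__':
    urls = []        # fill with the image URLs collected earlier
    folder = 'mnsfz'
    chunk = 100      # URLs handled per thread
    threads = [Downloader(urls[i:i + chunk], folder)
               for i in range(0, len(urls), chunk)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

Slicing the list up front means the threads never touch the same items, so no locking is needed.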


Summary

The above is the complete content on downloading images with multiple threads and a proxy server, collected and organized by 内存溢出 (jb51.cc); we hope it helps you solve the development problem you ran into.

If you find the 内存溢出 site useful, feel free to recommend it to your programmer friends.

You are welcome to share this article; when republishing, please credit the source: 内存溢出.

Original article: https://outofmemory.cn/langs/1199094.html
