下面是内存溢出 jb51.cc 通过网络收集整理的代码片段。
内存溢出小编现在分享给大家,也给大家做个参考。
#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""Multi-threaded picture scraper for p.lexun.net, using an HTTP proxy.

The script works in four stages, each one persisting its results to a text
file inside the ``mnsfz`` directory (created in the current directory):

1. ``pageloop1`` walks up to 40 top-list pages and stores their URLs in
   ``List1.txt``.
2. ``getallpag`` threads scan every top-list page for album links and store
   them in ``List2.txt``.
3. ``getimgpag`` threads open every album and store the picture URLs in
   ``List3.txt``.
4. ``getmypic`` threads download every picture into ``mnsfz``.

NOTE(review): this file was reconstructed from a copy whose line breaks and
indentation had been lost.  The bodies of ``getimgpag`` and ``getmypic`` were
truncated in that copy and have been rebuilt on the model of ``getallpag``;
confirm against the original if it is available.

The code is written so that it parses and runs on both Python 2.7 and
Python 3 (``print`` is always called with a single argument).
"""
import os
import re
import socket
import sys
import threading
import time
import urllib

try:  # Python 2
    import urllib2
    urlencode = urllib.urlencode
except ImportError:  # Python 3: urllib2 was merged into urllib.request
    import urllib.parse
    import urllib.request as urllib2
    urlencode = urllib.parse.urlencode

path = os.getcwd()
new_path = os.path.join(path, r'mnsfz')
path1 = new_path + '/' + r'List1.txt'   # top-list page URLs
path2 = new_path + '/' + r'List2.txt'   # album page URLs (relative)
path3 = new_path + '/' + r'List3.txt'   # picture URLs

# "Next page" link on a top-list page.
match1 = r'<div ><a href="(info_topList.*?)"'
# URLs stored in List1.txt (each one is terminated by '#').
match2 = r'(http.*?)#'
# Album links found on a top-list page.
match3 = r'<a href="(unit_info.*?ps=18)">'
# Album links stored in List2.txt.
match4 = r'(unit_info.*?ps=18)'
# Picture download address inside an opened album, e.g.
# <input name="picurl" type="hidden" value="http://.../94798621.jpg" emptyok="true" />
match5 = r'value="(http://.*?\.jpg)" emptyok="true" />'
# "Next page" link inside an opened album.
# NOTE(review): the leading '"' and the 'u"' in this pattern look like damage
# from the copy this file was recovered from; as written it is unlikely to
# match real markup.  Kept byte-for-byte because the intended pattern cannot
# be determined from this file.
match6 = r'"<a href="(unit_info.*?ps=18)">u"下一页"'
# Picture URLs stored in List3.txt.
match7 = r'(http.*?\.jpg)'
# Address of the original-size picture on the resize page.
match8 = r'<img src="(http.*?\.jpg)" alt='
# Real image URL on the original-picture download page (currently unused).
match9 = r'(http.*?\.jpg)'

url1 = r'http://p.lexun.net/w/info_topList.aspx?flag=1&ps=18&total=17967&total=17967&cd=0&lxt=404dd8b222b4d64dsggshhtgrq&vs=1&_r=451103666'

BASE_URL = r'http://p.lexun.net/w/'
RESIZE_URL = r'http://app.lexun.com/resizepic/pic_zoomr.aspx?cd=0&lxt=404dd8b222b4d64dsggshhtgrq&vs=1&_r=3925580'
PROXY_URL = 'http://190.79.62.76:8080'

# Filled in by main() before the corresponding worker threads are started.
ALLPAG = []
ALLPAG2 = []
ALLPIC = []

# Worker threads append to the same list files; serialise those writes.
_WRITE_LOCK = threading.Lock()


def _as_native_str(data):
    """Return *data* as the interpreter's native ``str`` type.

    On Python 2 ``read()`` already yields ``str``; on Python 3 it yields
    ``bytes``, which cannot be searched with the ``str`` patterns above.
    """
    if isinstance(data, str):
        return data
    # assumes the pages are UTF-8 -- TODO confirm; undecodable bytes are
    # replaced rather than raising.
    return data.decode('utf-8', 'replace')


def geturl(match, HTML):
    """Return every capture of the regular expression *match* in *HTML*."""
    return re.findall(match, HTML)


def putintotxt(url, path):
    """Append *url* to the file at *path*.

    *url* is either one string or a list of strings; list entries are
    written back to back with no separator (the ``match*`` patterns are
    what split them apart again in ``Listmk``).
    """
    text = ''.join(url)
    with _WRITE_LOCK:
        with open(path, 'a+') as code:
            code.write(text)


def useragent(url):
    """Fetch *url* and return its body, or ``'123456'`` if the fetch fails.

    The placeholder keeps callers simple: none of the patterns match it, so
    a failed page just contributes no results.
    """
    try:
        return _as_native_str(urllib2.urlopen(url, timeout=10).read())
    except Exception:
        return r'123456'


def Listmk(path, match):
    """Return every capture of *match* in the contents of the file *path*.

    Returns an empty list for an empty file.
    """
    with open(path, 'r') as f:
        content = f.read()
    return re.findall(match, content)


def pageloop1(url1):
    """Record up to 40 consecutive top-list page URLs, starting at *url1*.

    Each URL is appended to List1.txt followed by '#'.  The walk stops early
    when a page has no "next page" link, so the same URL is never recorded
    twice in a row.
    """
    for _ in range(1, 41):
        putintotxt(url1 + r'#', path1)
        bturl = geturl(match1, useragent(url1))
        if not bturl:
            break
        url1 = BASE_URL + bturl[0].replace(r'amp;', '')


def pageloop2(url2):
    """Store every album link found on the top-list page *url2* in List2.txt."""
    print('page %s' % url2)
    HTML2 = useragent(url2)
    Pagelist = geturl(match3, HTML2)
    putintotxt(Pagelist, path2)


def pageloop3(pageurl):
    """Store the picture URLs of the album at relative URL *pageurl*.

    Follows the album's "next page" links.  The link taken from the page is
    relative, so it is fed back in as-is; prefixing it with the site base
    before re-entering would apply the prefix twice.
    """
    seen = set()
    while pageurl and pageurl not in seen:
        seen.add(pageurl)  # stop if the "next page" links ever loop
        url2 = BASE_URL + pageurl.replace(r'amp;', '')
        HTML3 = useragent(url2)
        putintotxt(geturl(match5, HTML3), path3)
        nextimgurl = geturl(match6, HTML3)
        pageurl = nextimgurl[0] if nextimgurl else None


def pageloop4(urlimg):
    """Download the picture *urlimg* into the work directory.

    A picture whose file name already exists is skipped.  Otherwise it is
    downloaded and ``useragent2`` is then asked for the original-size
    version; ``useragent2`` is also the fallback when the download fails.
    The resulting pageloop4/useragent2 round trip ends when a file already
    exists or the resize page yields no picture URL.
    """
    name = os.path.basename(urlimg)
    target = new_path + '/' + name
    if os.path.isfile(target):
        print(u'已经存在')
        return
    try:
        content = urllib2.urlopen(urlimg, None, timeout=20).read()
        with open(target, 'wb') as code:
            code.write(content)
    except Exception:
        # Best effort: any download or write problem falls back to the
        # resize service.
        useragent2(urlimg)
    else:
        print(u'需要host')
        useragent2(urlimg)


def useragent2(urlimg, nonce=0):
    """Ask the resize service, through the proxy, for the original picture.

    Posts *urlimg* to the resize page and, when the response contains a
    picture URL, prints it and downloads it with ``pageloop4``.

    *nonce* is appended to the ``_r`` query parameter (presumably a
    cache-busting value -- confirm).  It replaces a reference to an
    undefined local ``i`` that only resolved by accident to a module-level
    loop counter.
    """
    url = RESIZE_URL + str(nonce)
    # NOTE(review): "User-Agent" is sent as a form field here, not as an
    # HTTP header; left unchanged because the service's expectations are
    # not visible from this file.
    values = {
        "User-Agent": "Mozilla/5.0 (windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/38.0.2125.122 Safari/537.36",
        "picurl": urlimg,
    }
    try:
        # urlencode() output is percent-encoded ASCII; Python 3 needs bytes.
        data = urlencode(values).encode('ascii')
        req = urllib2.Request(url, data)
        proxy_support = urllib2.ProxyHandler({'http': PROXY_URL})
        opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
        # install_opener() is process-wide: from now on every
        # urllib2.urlopen() call, in every thread, goes through the proxy.
        urllib2.install_opener(opener)
        HTML = _as_native_str(urllib2.urlopen(req, timeout=20).read())
    except Exception as exc:
        # Best effort, but report the failure instead of hiding it.
        sys.stderr.write('useragent2 failed for %s: %s\n' % (urlimg, exc))
        return
    HTL = geturl(match8, HTML)
    if not HTL:
        return
    print(HTL[0])
    pageloop4(HTL[0])


class _RangeWorker(threading.Thread):
    """Thread that handles the items ``[begin, end)`` of a module-level list."""

    def __init__(self, begin, end):
        threading.Thread.__init__(self)
        self.begin = begin
        self.end = end


class getallpag(_RangeWorker):
    """Run ``pageloop2`` over ``ALLPAG[begin:end]``."""

    def run(self):
        # Slicing clamps an ``end`` past the list instead of raising.
        for page in ALLPAG[self.begin:self.end]:
            pageloop2(page)


class getimgpag(_RangeWorker):
    """Run ``pageloop3`` over ``ALLPAG2[begin:end]``."""

    def run(self):
        for page in ALLPAG2[self.begin:self.end]:
            pageloop3(page)


class getmypic(_RangeWorker):
    """Run ``pageloop4`` over ``ALLPIC[begin:end]``."""

    def run(self):
        for picture in ALLPIC[self.begin:self.end]:
            pageloop4(picture)


def chunk_ranges(total, size):
    """Split ``range(total)`` into consecutive ``(begin, end)`` pairs.

    Every pair spans at most *size* items; together the pairs cover each
    index exactly once.  Returns an empty list when *total* is 0.
    """
    return [(begin, min(begin + size, total))
            for begin in range(0, total, size)]


def run_threads(worker_cls, total, size):
    """Start one *worker_cls* thread per chunk of *size* items and join them."""
    threads = [worker_cls(begin, end)
               for begin, end in chunk_ranges(total, size)]
    for t in threads:
        t.start()
    # Wait for the worker threads to finish.
    for t in threads:
        t.join()


def prepare_workdir():
    """Create the work directory and truncate the three list files."""
    if not os.path.isdir(new_path):
        os.mkdir(new_path)
    for list_path in (path1, path2, path3):
        with open(list_path, 'wt'):
            pass


def main():
    """Run the four scraping stages in order."""
    global ALLPAG, ALLPAG2, ALLPIC
    prepare_workdir()

    pageloop1(url1)
    ALLPAG = Listmk(path1, match2)
    print(len(ALLPAG))
    run_threads(getallpag, len(ALLPAG), 10)

    ALLPAG2 = Listmk(path2, match4)
    print(len(ALLPAG2))
    run_threads(getimgpag, len(ALLPAG2), 100)

    ALLPIC = Listmk(path3, match7)
    print(u'一共: %d' % len(ALLPIC))
    run_threads(getmypic, len(ALLPIC), 100)
    print("the end!!")


if __name__ == '__main__':
    main()
以上是内存溢出(jb51.cc)为你收集整理的全部代码内容,希望文章能够帮你解决所遇到的程序开发问题。
如果觉得内存溢出网站内容还不错,欢迎将内存溢出网站推荐给程序员好友。
总结:以上是内存溢出为你收集整理的“多线程及代理服务器下载图片”全部内容,希望文章能够帮你解决“多线程及代理服务器下载图片”所遇到的程序开发问题。
如果觉得内存溢出网站内容还不错,欢迎将内存溢出网站推荐给程序员好友。
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)