Below is a code snippet, collected and tidied from the web, that uses Python to scrape a specified web page together with every link found on it. It is shared here for reference.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************************************************************
# Copyright (C) 2010 [email protected]
# Author: yangyingchao <[email protected]>
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along
# with GNU Emacs; see the file COPYING.  If not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# ****************************************************************************

from copy import deepcopy
from sgmllib import SGMLParser
from xml.dom.minidom import *
import os
import re
import sys
import urllib2

Title = "Untitled"


class MyParser(SGMLParser):
    """Collect the page title and every <a href="..."> link on a page."""

    def __init__(self):
        self.data = ""
        self.link = ""
        self.title = "Untitled"
        self.links = []
        self.TAG_BEG = False
        self.TAG_END = False
        SGMLParser.__init__(self, 0)

    def handle_data(self, data):
        # Accumulate character data only while inside a tag we care about.
        if (self.TAG_BEG is True) and (self.TAG_END is False):
            self.data += data

    def start_title(self, attrs):
        self.link = ""
        self.data = ""
        self.TAG_BEG = True
        self.TAG_END = False
        for (key, val) in attrs:
            if key == "href":
                self.link = val

    def end_title(self):
        self.TAG_BEG = False
        self.TAG_END = True
        self.title = self.data.strip()

    def flush(self):
        pass

    def handle_comment(self, data):
        pass

    def start_a(self, attrs):
        self.data = ""
        self.TAG_BEG = True
        self.TAG_END = False
        for (key, val) in attrs:
            if key == "href":
                self.link = val

    def end_a(self):
        self.TAG_BEG = False
        self.TAG_END = True
        tmp = {}
        tmp["name"] = self.data
        tmp["link"] = self.link
        self.links.append(deepcopy(tmp))

    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass

    def unknown_entityref(self, ref):
        pass

    def unknown_charref(self, ref):
        pass

    def unknown_decl(self, data):
        pass

    def close(self):
        SGMLParser.close(self)
        self.flush()


def lst2str(lst):
    string = ""
    for item in lst:
        string += item.strip() + "\n"
    return string


def downURL(url, filename):
    """Download url and save it as filename; return 1 on success, 0 on error."""
    print "Download %s, save as %s" % (url, filename)
    try:
        fp = urllib2.urlopen(url)
    except:
        print "download exception"
        print sys.exc_info()
        return 0
    op = open(filename, "wb")
    while 1:
        s = fp.read()
        if not s:
            break
        op.write(s)
    fp.close()
    op.close()
    return 1


def reptile(base_url):
    """
    Download all articles linked from base_url.

    Arguments:
    - `base_url`: URL of the website.
    """
    page_list = []
    if not len(base_url):
        print "No page to reptile!"
        sys.exit(1)

    parser = MyParser()
    if base_url.startswith("http"):
        myopen = urllib2.urlopen
    else:
        myopen = open
    try:
        content = myopen(base_url).read()
    except:
        print "Failed to read from %s." % base_url
        print sys.exc_info()
        return

    parser.feed(content)
    for tmp in parser.links:
        page_list.append(tmp.get("link"))
    global Title
    Title = parser.title
    parser.close()

    item_list = list(set(page_list))
    for item in item_list:
        # Strip the '#fragment' part from the url.
        pos = item.find('#')
        if pos != -1:
            item = item[:pos]
        # Prepend base_url to relative links.
        if not item.startswith("http"):
            item = base_url.rstrip("/") + "/" + item
        local_file = item.split("/")[-1]
        print item, local_file
        if not local_file:
            print "Empty local file! Continue from next one!"
            continue
        if os.access(local_file, os.F_OK):
            print "File: %s existed, skip ..." % local_file
        else:
            downURL(item, local_file)
    # Remember to download the index file!
    downURL(base_url, "index.html")
    print "Total: %d articles." % (len(item_list))


def walk_dir(lst, dirname, filenames):
    """Callback for os.path.walk: record file name and title of every html file."""
    for filename in filenames:
        fn = os.path.join(dirname, filename)
        if os.path.isdir(fn) or not filename.endswith("html"):
            continue
        print "Processing: %s" % fn
        tmp = {}
        parser = MyParser()
        content = open(fn).read()
        parser.feed(content)
        tmp["file"] = filename
        tmp["title"] = parser.title
        parser.close()
        lst.append(deepcopy(tmp))


def gen_index():
    """Generate a devhelp2 index of all html files in this directory."""
    file_lists = []
    os.path.walk(".", walk_dir, file_lists)
    fp = open("%s.devhelp2" % os.path.basename(os.getcwd()), "w")
    string = '<?xml version="1.0" encoding="utf-8"?>\n<book author=""' + \
             ' language="c" link="index.html" name="" title="%s"' % Title + \
             ' version="2" xmlns="http://www.devhelp.net/book">\n  <chapters>\n'
    for item in file_lists:
        link = item.get("file")
        try:
            name = item.get("title").decode('gbk').encode('utf-8')
        except:
            name = item.get("title")
        finally:
            string += '    <sub link="%s" name="%s"/>\n' % (link, name)
    string += '  </chapters>\n</book>\n'
    fp.write(string)
    fp.close()


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print "Usage: %s url of baidu space" % sys.argv[0]
        print "Such as: %s http://hi.baidu.com/Username" % sys.argv[0]
        gen_index()
        sys.exit(1)
    base_url = sys.argv[1]
    reptile(base_url)
    gen_index()
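To run the script, pass it a single argument, the base URL (for example http://hi.baidu.com/Username, as its own usage message shows). It saves every linked page into the current directory, downloads the front page as index.html, and finally writes a .devhelp2 index named after the current directory.

The script targets Python 2: sgmllib and urllib2 no longer exist in Python 3. For reference, below is a minimal sketch of the same idea for Python 3, using the standard html.parser and urllib modules. The LinkParser class and fetch helper are illustrative names of my own, not part of the original script, and the sketch skips the devhelp2 index generation.

#!/usr/bin/env python3
# Minimal Python 3 sketch: fetch a page, collect its title and links, then
# download each linked page that is not already saved locally.
import os
import sys
from html.parser import HTMLParser
from urllib.parse import urljoin
from urllib.request import urlopen


class LinkParser(HTMLParser):
    """Collect the page title and every href found in <a> tags."""

    def __init__(self):
        super().__init__()
        self.title = ""
        self.links = []
        self._in_title = False

    def handle_starttag(self, tag, attrs):
        if tag == "title":
            self._in_title = True
        elif tag == "a":
            for key, val in attrs:
                if key == "href" and val:
                    self.links.append(val)

    def handle_endtag(self, tag):
        if tag == "title":
            self._in_title = False

    def handle_data(self, data):
        if self._in_title:
            self.title += data


def fetch(url, filename):
    """Download url and save the raw bytes as filename."""
    with urlopen(url) as fp, open(filename, "wb") as op:
        op.write(fp.read())


if __name__ == "__main__":
    base_url = sys.argv[1]
    parser = LinkParser()
    parser.feed(urlopen(base_url).read().decode("utf-8", "ignore"))
    print("Title:", parser.title)
    for link in set(parser.links):
        # Drop the '#fragment' part and resolve relative links.
        absolute = urljoin(base_url, link.split("#")[0])
        local = absolute.split("/")[-1]
        if not local:
            continue  # URL ends with '/', nothing sensible to name the file
        if not os.path.exists(local):
            fetch(absolute, local)

The sketch keeps the same overall flow as the original: parse the start page for its title and links, deduplicate the links, resolve them against the base URL, and download each one unless a file with that name already exists.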
That is the complete code for scraping a specified web page and all of its links; hopefully it helps you with the development problem you ran into.