Crawling a specified web page and all of its links with Python


Overview

Below is a code snippet collected and organized from the web by 内存溢出 (jb51.cc), shared here for your reference.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************************************************************
# Copyright (C) 2010 [email protected]
# Author: yangyingchao <[email protected]>
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along with
# GNU Emacs; see the file COPYING. If not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# ****************************************************************************

from copy import deepcopy
from sgmllib import SGMLParser
from xml.dom.minidom import *
import os
import re
import sys
import urllib2

title = "Untitled"


# Collects the page <title> and every <a href="..."> link in the document.
class MyParser(SGMLParser):

    def __init__(self):
        self.data = ""
        self.links = []
        self.TAG_BEG = False
        self.TAG_END = False
        # Defaults so the attributes exist even if the page has no <title> or <a>.
        self.title = "Untitled"
        self.link = ""
        SGMLParser.__init__(self, 0)

    def handle_data(self, data):
        # Collect text only while inside a tag we care about.
        if (self.TAG_BEG is True) and (self.TAG_END is False):
            self.data += data

    def start_title(self, attrs):
        self.link = ""
        self.data = ""
        self.TAG_BEG = True
        self.TAG_END = False
        for (key, val) in attrs:
            if key == "href":
                self.link = val

    def end_title(self):
        self.TAG_BEG = False
        self.TAG_END = True
        self.title = self.data.strip()

    def flush(self):
        pass

    def handle_comment(self, data):
        pass

    def start_a(self, attrs):
        self.data = ""
        self.TAG_BEG = True
        self.TAG_END = False
        for (key, val) in attrs:
            if key == "href":
                self.link = val

    def end_a(self):
        self.TAG_BEG = False
        self.TAG_END = True
        tmp = {}
        tmp["name"] = self.data
        tmp["link"] = self.link
        self.links.append(deepcopy(tmp))

    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass

    def unknown_entityref(self, ref):
        pass

    def unknown_charref(self, ref):
        pass

    def unknown_decl(self, data):
        pass

    def close(self):
        SGMLParser.close(self)
        self.flush()


def lst2str(lst):
    string = ""
    for item in lst:
        string += item.strip() + "\n"
    return string


def downURL(url, filename):
    # Fetch url and save the raw bytes to filename; return 1 on success, 0 on failure.
    print "Download %s, save as %s" % (url, filename)
    try:
        fp = urllib2.urlopen(url)
    except:
        print "download exception"
        print sys.exc_info()
        return 0
    op = open(filename, "wb")
    while 1:
        s = fp.read()
        if not s:
            break
        op.write(s)
    fp.close()
    op.close()
    return 1


def reptile(base_url):
    """
    Download all articles from base_url.

    Arguments:
    - `base_url`: Url of website.
    """
    page_list = []
    if not len(base_url):
        print "No page to reptile!"
        sys.exit(1)

    parser = MyParser()

    if base_url.startswith("http"):
        myopen = urllib2.urlopen
    else:
        myopen = open

    try:
        content = myopen(base_url).read()
    except:
        print "Failed to read from %s." % base_url
        print sys.exc_info()
        return

    # Feed the whole document to the parser at once.
    parser.feed(content)

    for tmp in parser.links:
        page_list.append(tmp.get("link"))

    global title
    title = parser.title
    parser.close()

    item_list = list(set(page_list))

    for item in item_list:
        # Strip '#' fragment from url.
        pos = item.find('#')
        if pos != -1:
            item = item[:pos]

        # Prepend base_url to item if necessary.
        if not item.startswith("http"):
            item = base_url.rstrip("/") + "/" + item

        local_file = item.split("/")[-1]
        print item, local_file
        if not local_file:
            print "Empty local file! Continue from next one!"
            continue

        if os.access(local_file, os.F_OK):
            print "file: %s existed, skip ..." % local_file
        else:
            ret = downURL(item, local_file)

    # Remember to download the index file!
    downURL(base_url, "index.html")
    print "Total: %d articles." % (len(item_list))


def walk_dir(lst, dirname, filenames):
    # Visitor for os.path.walk: record the file name and <title> of every HTML file.
    for filename in filenames:
        fn = os.path.join(dirname, filename)
        if os.path.isdir(fn) or \
               not filename.endswith("html"):
            continue
        print "Processing: %s" % fn
        tmp = {}
        parser = MyParser()
        content = open(fn).read()
        parser.feed(content)
        tmp["file"] = filename
        tmp["title"] = parser.title
        parser.close()
        lst.append(deepcopy(tmp))


def gen_index():
    """
    Generate index of all HTMLs in this directory.
    """
    file_lists = []
    os.path.walk(".", walk_dir, file_lists)

    fp = open("%s.devhelp2" % os.path.basename(os.getcwd()), "w")
    string = '<?xml version="1.0" encoding="utf-8"?>\n<book author=""' + \
        ' language="c" link="index.html" name="" title="%s"' % title + \
        ' version="2" xmlns="http://www.devhelp.net/book">\n  <chapters>'
    for item in file_lists:
        link = item.get("file")
        try:
            name = item.get("title").decode('gbk').encode('utf-8')
        except:
            name = item.get("title")
        finally:
            string += '<sub link="%s" name="%s"/>\n' % (link, name)

    string += '\n</chapters>\n   </book>\n'
    fp.write(string)
    fp.close()


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print "Usage: %s url of baidu space" % sys.argv[0]
        print "Such as: %s http://hi.baidu.com/Username" % sys.argv[0]
        gen_index()
        sys.exit(1)
    base_url = sys.argv[1]
    reptile(base_url)
    gen_index()
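
Note that the script above targets Python 2: both sgmllib and urllib2 were removed in Python 3. For comparison, here is a minimal sketch of the same link-extraction idea in Python 3, using html.parser and urllib.request from the standard library. The names LinkCollector and fetch_links and the example URL are illustrative choices for this note, not part of the original script.

# Minimal Python 3 sketch of the same idea (illustrative only; LinkCollector
# and fetch_links are names invented for this example, not from the script above).
from html.parser import HTMLParser
from urllib.request import urlopen


class LinkCollector(HTMLParser):
    """Collect the page <title> and every <a href="..."> target."""

    def __init__(self):
        super().__init__()
        self.title = ""
        self.links = []
        self._in_title = False

    def handle_starttag(self, tag, attrs):
        if tag == "title":
            self._in_title = True
        elif tag == "a":
            for key, val in attrs:
                if key == "href" and val:
                    self.links.append(val)

    def handle_endtag(self, tag):
        if tag == "title":
            self._in_title = False

    def handle_data(self, data):
        if self._in_title:
            self.title += data


def fetch_links(url):
    """Return (title, links) for the page at url."""
    html = urlopen(url).read().decode("utf-8", errors="replace")
    parser = LinkCollector()
    parser.feed(html)
    return parser.title.strip(), parser.links


if __name__ == "__main__":
    title, links = fetch_links("http://example.com/")
    print(title)
    for link in links:
        print(link)

HTMLParser lowercases tag names before dispatching, so checking tag == "a" works regardless of how the markup is written; relative links would still need to be resolved against the base URL (for example with urllib.parse.urljoin), which the original script approximates by prefixing base_url.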

The above is the complete code collected and organized by 内存溢出 (jb51.cc); we hope it helps you solve the development problem you are facing.

If you find the site's content useful, feel free to recommend it to your programmer friends.


Sharing and reposting are welcome; please credit the source: 内存溢出

Original source: http://outofmemory.cn/langs/1199054.html
