使用SAX将ddt.xml解析成html。当然,如果得到了xml对应的xsl文件,也可以直接用libxml2将其转换成html。
复制代码 代码如下:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#---------------------------------------
# 程序:XML解析器
# 版本:01.0
# 作者:mupeng
# 日期:2013-12-18
# 语言:Python 2.7
# 功能:将xml解析成对应的html
# 注解:该程序用xml.sax模块的parse函数解析XML,并生成事件
# 继承ContentHandler并重写其事件处理函数
# Dispatcher主要用于相应标签的起始、结束事件的派发
#---------------------------------------
from xml.sax.handler import ContentHandler
from xml.sax import parse
class Dispatcher:
    """Mixin that routes SAX element events to per-tag handler methods.

    For an element named ``name`` the start event looks up a method called
    ``'start' + name.capitalize()`` and the end event looks up
    ``'end' + name.capitalize()``.  When no such callable exists, the
    ``defaultStart`` / ``defaultEnd`` method is used instead, if present.
    Handlers are always invoked without arguments.
    """

    def dispatch(self, prefix, name, attrs=None):
        """Call the handler matching ``prefix`` and ``name``.

        prefix -- event kind, ``'start'`` or ``'end'`` as used by the
                  callers in this class.
        name   -- element name; ``str.capitalize`` is applied, so only the
                  first character is upper-cased and the rest lower-cased.
        attrs  -- accepted for the start event but not forwarded to the
                  handler.

        Returns None.  Nothing happens when neither the specific nor the
        default handler is a callable attribute.
        """
        method = getattr(self, prefix + name.capitalize(), None)
        if not callable(method):
            # No tag-specific handler: fall back to default<Prefix>.
            method = getattr(self, 'default' + prefix.capitalize(), None)
        if callable(method):
            method()

    def startElement(self, name, attrs):
        """SAX callback: forward an opening tag to ``dispatch``."""
        self.dispatch('start', name, attrs)

    def endElement(self, name):
        """SAX callback: forward a closing tag to ``dispatch``."""
        self.dispatch('end', name)
class Website(Dispatcher, ContentHandler):
# SAX content handler that writes an HTML page to 'ddt_SAX.html' while the
# document is parsed. Element events reach the start*/end* methods below
# through Dispatcher.dispatch, which calls them without arguments.
# NOTE(review): this copy of the listing has lost its indentation, and several
# triple-quoted literals passed to self.fout.write() are truncated or never
# closed; the markup they originally held cannot be recovered from this text.
# NOTE(review): the file header states Python 2.7; the results of
# .encode('gb2312') are written to a file opened in text mode 'w', which
# presumably relies on Python 2 str semantics -- confirm before porting.
def __init__(self):
# Open the output file and reset all parser state.
# NOTE(review): ContentHandler.__init__ is not called here.
self.fout = open('ddt_SAX.html', 'w')
# Set by startImage and cleared by endImage.
self.imagein = False
# Not read or written anywhere else in the visible code.
self.desflag = False
# Set by startItem and cleared by endItem.
self.item = False
# Last values captured by the corresponding end* handlers.
self.title = ''
self.link = ''
self.guid = ''
self.url = ''
self.pubdate = ''
self.description = ''
# Character data accumulated by characters() since the last reset.
self.temp = ''
# Not read or written anywhere else in the visible code.
self.prx = ''
# Handler for the opening 'channel' tag.
# NOTE(review): the literal opened on the write below is never closed.
def startChannel(self):
self.fout.write('''n
# Handler for the closing 'channel' tag: writes a trailer and closes the
# output file.
def endChannel(self):
self.fout.write('''
''')
self.fout.close()
# SAX callback for text: append the chunk to self.temp unless it is empty
# after strip(). The chunk itself is stored unstripped.
def characters(self, chars):
if chars.strip():
#chars = chars.strip()
self.temp += chars
#print self.temp
# Handler for the opening 'title' tag: writes a literal when inside an item.
def startTitle(self):
if self.item:
self.fout.write('''
''')
# Handler for the closing 'title' tag: moves the buffered text into
# self.title, clears the buffer and writes the title encoded as gb2312.
# One branch covers a title outside both image and item, the other a title
# inside an item.
def endTitle(self):
if not self.imagein and not self.item:
self.title = self.temp
self.temp = ''
self.fout.write(self.title.encode('gb2312'))
#self.title = self.temp
self.fout.write('''
nnn
n
n
n
''')
if self.item:
self.title = self.temp
self.temp = ''
self.fout.write(self.title.encode('gb2312'))
self.fout.write('''
''')
# Track whether the parser is currently inside an 'image' element.
def startImage(self):
self.imagein = True
def endImage(self):
self.imagein = False
# NOTE(review): Dispatcher.dispatch builds handler names with
# name.capitalize(), which always upper-cases the first letter, so it looks up
# 'startLink' / 'endLink'. The lower-case 'startlink' / 'endlink' below are
# therefore never selected by dispatch; 'link' elements fall through to
# defaultStart / defaultEnd instead.
# NOTE(review): the literal opened in startlink is never closed.
def startlink(self):
if self.imagein:
self.fout.write('''
# Moves the buffered text into self.link and clears the buffer, then writes
# output depending on whether the parser is inside an image, inside an item,
# or neither.
def endlink(self):
self.link = self.temp
self.temp = ''
if self.imagein:
self.fout.write(self.link.encode('gb2312'))
self.fout.write('''" target="_blank">n ''')
elif self.item:
#self.link = self.temp
pass
else:
self.fout.write(self.link)
self.fout.write(''' " target="
_blank
"> ''')
self.fout.write(self.title.encode('gb2312'))
self.fout.write('''
''')
self.fout.write(self.description.encode('gb2312'))
self.fout.write('''
''')
# Handlers for the 'url' tag.
# NOTE(review): the literal opened in startUrl is never closed, and the
# 'def endUrl(self):' header has been fused onto the same line. The lines that
# follow appear to be the body of endUrl: store the buffered text in self.url,
# clear the buffer, and write the url (gb2312) when inside an image.
def startUrl(self):
if self.imagein:
self.fout.write(''' def endUrl(self):
self.url = self.temp
self.temp = ''
if self.imagein:
self.fout.write(self.url.encode('gb2312'))
self.fout.write('''" border="0">n
''')
if self.item:
#self.url = self.temp
pass
# Fallback handlers used by Dispatcher.dispatch when no tag-specific method
# exists. The end fallback discards any buffered character data.
def defaultStart(self):
pass
def defaultEnd(self):
self.temp = ''
# Handlers for the 'description' tag: on close, move the buffered text into
# self.description, clear the buffer, and write it (gb2312) when inside an
# item.
def startDescription(self):
pass
def endDescription(self):
self.description = self.temp
self.temp = ''
if self.item:
#self.fout.write('¡¡¡¡')
self.fout.write(self.description.encode('gb2312'))
# Handler for the closing 'guid' tag: keeps the buffered text in self.guid.
# The buffer is not cleared here.
def endGuid(self):
self.guid = self.temp
# Handler for the closing 'pubdate' tag: keeps the buffered text as the
# publication date and clears the buffer, unless the text starts with 'http',
# in which case the date is reset to '' and the buffer is left untouched.
def endPubdate(self):
if not self.temp.startswith('http'):
self.pubdate = self.temp
self.temp = ''
else:
self.pubdate = ''
# Track whether the parser is currently inside an 'item' element.
def startItem(self):
self.item = True
# On item close, clear the flag and write link, guid and pubdate. Unlike the
# other handlers, these values are written without .encode('gb2312').
# NOTE(review): the first and last literals in this method are never closed.
def endItem(self):
self.item = False
self.fout.write('''
self.fout.write(self.link)
self.fout.write(''' " target="_blank"> ''')
self.fout.write(self.guid)
self.fout.write('''
''')
self.fout.write(self.pubdate)
self.fout.write('''
# Program entry point: parse 'ddt.xml' with the Website handler, which
# writes the generated page as a side effect.
if __name__ == '__main__':
    parse('ddt.xml', Website())
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)