需求
有一个表,里面数据量比较大,每天一更新,其字段可以通过xml配置文件进行配置,即,可能每次建表的字段不一样。
上游跑时会根据配置从源文件中提取,到入库这一步需要根据配置进行建表。
解决
写了一个简单的xml,配置需要字段及类型
上游读取到对应的数据
入库这一步,先把原表删除,根据配置建新表
XML文件
<?xml version="1.0" enCoding="UTF-8"?><!-- 表名,数据库名 可灵活配置插入哪个库哪个表 --><table name="top_query" db_name="evaluting_sys"><!-- 非业务主键,自增长,可配名,其他 INTEGER UNSIGNED auto_INCREMENT --><primary_key><name>ID</name></primary_key><!-- 字段开始 --><fIEld><name>query</name><type>varchar(200)</type><is_index>false</is_index><description>query</description></fIEld><fIEld><name>pv</name><type>integer</type><is_index>false</is_index><description>pv</description></fIEld><fIEld><name>avg_money</name><type>integer</type><is_index>false</is_index><description></description></fIEld><!-- 字段配置结束 --></table>
处理脚本
#!/usr/bin/python# -*- Coding:utf-8 -*-#author: wklken#desc: use to read db xml config.#-----------------------#2012-02-18 created#----------------------import sys,osfrom xml.dom import minIDom,Nodedef read_dbconfig_xml(xml_file_path): content = {} root = minIDom.parse(xml_file_path) table = root.getElementsByTagname("table")[0] #read dbname and table name. table_name = table.getAttribute("name") db_name = table.getAttribute("db_name") if len(table_name) > 0 and len(db_name) > 0: db_sql = "create database if not exists `" + db_name +"`; use " + db_name + ";" table_drop_sql = "drop " + table_name + " if exists " + table_name + ";" content.update({"db_sql" : db_sql}) content.update({"table_sql" : table_drop_sql }) else: print "Error:attribute is not define well! db_name=" + db_name + " ;table_name=" + table_name sys.exit(1) #print table_name,db_name table_create_sql = "create table " + table_name +"(" #read primary cell primary_key = table.getElementsByTagname("primary_key")[0] primary_key_name = primary_key.getElementsByTagname("name")[0].childNodes[0].nodeValue table_create_sql += primary_key_name + " INTEGER NOT NulL auto_INCREMENT PRIMARY KEY," #print primary_key.toxml() #read ordernary fIEld fIElds = table.getElementsByTagname("fIEld") f_index = 0 for fIEld in fIElds: f_index += 1 name = fIEld.getElementsByTagname("name")[0].childNodes[0].nodeValue type = fIEld.getElementsByTagname("type")[0].childNodes[0].nodeValue table_create_sql += name + " " + type if f_index != len(fIElds): table_create_sql += "," is_index = fIEld.getElementsByTagname("is_index")[0].childNodes[0].nodeValue table_create_sql += ");" content.update({"table_create_sql" : table_create_sql}) #character set latin1 collate latin1_danish_ci; print contentif __name__ == "__main__":read_dbconfig_xml(sys.argv[1])
PYTHON解析XML大文件[SAX]
需求
读取xml数据文件,文件较大,需要实时处理插入到数据库
xml文档
<PERSONS><person> <ID>100000</ID> <sex>男</sex> <address>北京,海淀区</address> <fansNum>437</fansNum> <summary>1989</summary> <wbNum>333</wbNum> <gzNum>242</gzNum> <blog>null</blog> <edu>大学</edu> <work></work> <renZh>1</renZh> <brithday>2月14日</brithday></person></PERSONS>
处理
sax处理时并不会像dom一样可以以类似节点的维度进行读取,它只有 开始标签 内容 结束标签 之分
处理思想是:通过一个handler,对开始标签,内容,结束标签各有一个处理函数
代码及注解
person 处理类
from xml.sax import handler,parseStringclass PersonHandler(handler.ContentHandler): def __init__(self,db_ops): #db op obj self.db_ops = db_ops #存储一个person的map self.person = {} #当前的tag self.current_tag = "" #是否是tag之间的内容 ,目的拿到tag间内容,不受空白的干扰 self.in_quote = 0 #开始,清空map def startElement(self,name,attr): #以person,清空map if name == "person": self.person = {} #记录 状态 self.current_tag = name self.in_quote = 1 #结束,插入数据库 def endElement(self,name): #以person结尾 代表读取一个person的信息结束 if name == "person": #do something in_fIElds = tuple([ ('"' + self.person.get(i,"") + '"') for i in fIElds ]) print in_sql % in_fIElds db_ops.insert( in_sql%(in_fIElds)) #处理 self.in_quote = 0 def characters(self,content): #若是在tag之间的内容,更新到map中 if self.in_quote: self.person.update({self.current_tag: content})
加上入库的完整代码
#!/usr/bin/python# -*- Coding:utf-8 -*-#parse_person.py#version : 0.1#author : wukunliang@163.com#desc : parse person.xml and out sqlimport sys,osimport MysqLdbreload(sys)sys.setdefaultencoding('utf-8')in_sql = "insert into person(ID,sex,address,fansNum,summary,wbNum,gzNum,blog,edu,work,renZh,brithday) values(%s,%s,%s)"fIElds = ("ID","sex","address","fansNum","summary","wbNum","gzNum","blog","edu","work","renZh","brithday")#数据库方法class Db_Connect: def __init__(self,db_host,user,pwd,db_name,charset="utf8",use_unicode = True): print "init begin" print db_host,charset,use_unicode self.conn = MysqLdb.Connection(db_host,charset=charset,use_unicode=use_unicode) print "init end" def insert(self,sql): try: n = self.conn.cursor().execute(sql) return n except MysqLdb.Warning,e: print "Error: execute sql '",sql,"' Failed" def close(self): self.conn.close()#person 处理类from xml.sax import handler,parseStringclass PersonHandler(handler.ContentHandler): def __init__(self,db_ops): #db op obj self.db_ops = db_ops #存储一个person的map self.person = {} #当前的tag self.current_tag = "" #是否是tag之间的内容 self.in_quote = 0 #开始,清空map def startElement(self,attr): #以person,清空map if name == "person": self.person = {} #记录 状态 self.current_tag = name self.in_quote = 1 #结束,插入数据库 def endElement(self,name): #以person结尾 代表读取一个person的信息结束 if name == "person": #do something in_fIElds = tuple([ ('"' + self.person.get(i,"") + '"') for i in fIElds ]) print in_sql % in_fIElds db_ops.insert( in_sql%(in_fIElds)) #处理 self.in_quote = 0 def characters(self,content): #若是在tag之间的内容,更新到map中 if self.in_quote: self.person.update({self.current_tag: content})if __name__ == "__main__": f = open("./person.xml") #如果源文件gbk 转码 若是utf-8,去掉decode.encode db_ops = Db_Connect("127.0.0.1","root","test") parseString(f.read().decode("gbk").encode("utf-8"),PersonHandler(db_ops)) f.close() db_ops.close()总结
以上是内存溢出为你收集整理的实例Python处理XML文件的方法全部内容,希望文章能够帮你解决实例Python处理XML文件的方法所遇到的程序开发问题。
如果觉得内存溢出网站内容还不错,欢迎将内存溢出网站推荐给程序员好友。
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)