python – 搜索/替换xml的内容
我已成功使用xml.etree.ElementTree来解析xml,搜索内容,然后将其写入另一个xml.但是,之前我只处理了单个标签内部的文本.
import os,sys,glob,xml.etree.ElementTree as ET path = r"G:63D RRC GIS Datametadatageneral2010_contract" for fn in os.listdir(path): filepaths = glob.glob(path + os.sep + fn + os.sep + "*overall.xml") for filepath in filepaths: (pa,filename) = os.path.split(filepath) ####use this section to grab element text from old,archived metadata files; this text then gets put into current,working .xml### root = ET.parse(pa + os.sep + "archive" + os.sep + "base_metadata_overall.xml").getroot() iterator = root.getiterator() for item in iterator: if item.tag == "abstract": correct_abstract = item.text root2 = ET.parse(pa + os.sep + "base_metadata_overall.xml").getroot() iterator2 = root2.getiterator("descript") for item in iterator2: if item.tag == "abstract": old_abstract = item.find("abstract") old_abstract_text = old_abstract.text item.remove(old_abstract) new_symbol_abstract = ET.SubElement(item,"title") new_symbol_abstract.text = correct_abstract tree = ET.ElementTree(root2) tree.write(pa + os.sep + "base_metadata_overall.xml") print "created --- " + filename + " metadata" 但现在,我需要: 1)搜索xml并抓取“attr”标签之间的所有内容,下面是示例: <attr><attrlabl Sync="TRUE">OBJECTID</attrlabl><attalias Sync="TRUE">ObjectIdentifier</attalias><attrtype Sync="TRUE">OID</attrtype><attwidth Sync="TRUE">4</attwidth><atprecis Sync="TRUE">0</atprecis><attscale Sync="TRUE">0</attscale><attrdef Sync="TRUE">Internal feature number.</attrdef></attr> 2)然后,我需要打开另一个xml,搜索同样的“attr”标签之间的所有内容,并用上面抓取的内容替换它. 基本上就是我之前所做的事情,只是不再区分“attr”标签之间的子元素,属性等等,而是把它们整体当作文本来处理. 谢谢!! 请多包涵,这个论坛的发帖方式和我习惯的有点不同! 
这是我目前写出的代码: import os,re,xml.etree.ElementTree as ET from lxml import etree path = r"C:temppythonxml" for fn in os.listdir(path): filepaths = glob.glob(path + os.sep + fn + os.sep + "*overall.xml") for filepath in filepaths: (pa,filename) = os.path.split(filepath) xml = open(pa + os.sep + "attributes.xml") xmltext = xml.read() correct_attrs = re.findall("<attr> (.*?)</attr>",xmltext,re.DOTALL) for item in correct_attrs: correct_attribute = "<attr>" + item + "</attr>" xml2 = open(pa + os.sep + "base_metadata_overall.xml") xmltext2 = xml2.read() old_attrs = re.findall("<attr>(.*?)</attr>",re.DOTALL) for item2 in old_attrs: old_attribute = "<attr>" + item + "</attr>" old = etree.fromstring(old_attribute) replacement = new.xpath('//attr') for attr in old.xpath('//attr'): attr.getparent().replace(attr,copy.deepcopy(replacement)) print lxml.etree.tostring(old) 这个已经能运行了(见下文),我甚至弄清楚了如何导出到新的.xml.不过运行时遇到了如下错误: node = replacements.pop() IndexError: pop from empty list import os,copy,lxml,xml.etree.ElementTree as ET from lxml import etree path = r"C:temppythonxml" for fn in os.listdir(path): filepaths = glob.glob(path + os.sep + fn + os.sep + "*overall.xml") for filepath in filepaths: xmlatributes = open(pa + os.sep + "attributes.xml") xmlatributes_txt = xmlatributes.read() xmltarget = open(pa + os.sep + "base_metadata_overall.xml") xmltarget_txt = xmltarget.read() source = lxml.etree.fromstring(xmlatributes_txt) dest = lxml.etree.fromstring(xmltarget_txt) replacements = source.xpath('//attr') replacements.reverse() for attr in dest.xpath('//attr'): node = replacements.pop() attr.getparent().replace(attr,copy.deepcopy(node)) #print lxml.etree.tostring(dest) tree = ET.ElementTree(dest) tree.write (pa + os.sep + "edited_metadata.xml") print fn + "--- sucessfully edited" 更新5/16/2011 总之,下面的代码会先删除目标文件中所有的attr,然后插入源文件中的attr.它还会检查另一个标签“subtype”,如果存在,就把它们添加到attr之后,但仍位于“detailed”标签内. 再次感谢所有帮助过的人. 
import os,xml.etree.ElementTree as ET from lxml import etree path = r"G:63D RRC GIS Datametadatageneral2010_contract" #path = r"C:temppythonxml" for fn in os.listdir(path): correct_title = fn.replace ('_',' ') + " various facilities" correct_fc_name = fn.replace ('_',' ') filepaths = glob.glob(path + os.sep + fn + os.sep + "*overall.xml") for filepath in filepaths: print "-----" + fn + "-----" (pa,filename) = os.path.split(filepath) xmlatributes = open(pa + os.sep + "attributes.xml") xmlatributes_txt = xmlatributes.read() xmltarget = open(pa + os.sep + "base_metadata_overall.xml") xmltarget_txt = xmltarget.read() source = lxml.etree.fromstring(xmlatributes_txt) dest = lxml.etree.fromstring(xmltarget_txt) replacements = source.xpath('//attr') replacesubtypes = source.xpath('//subtype') subtype_true_f = len(replacesubtypes) attrtag = dest.xpath('//attr') #print len(attrtag) num_realatrs = len(replacements) for n in attrtag: n.getparent().remove(n) print n.tag + " removed" detailedtag = dest.xpath('//detailed') for n2 in detailedtag: pos = 0 for realatrs in replacements: n2.insert(pos + 1,realatrs) print "attr's replaced" if subtype_true_f >= 1: #print subtype_true_f for realsubtypes in replacesubtypes: n2.insert(num_realatrs + 1,realsubtypes) print "subtype's replaced" tree = ET.ElementTree(dest) tree.write (pa + os.sep + "base_metadata_overall_v2.xml") print fn + "--- sucessfully edited" 解决方法
以下是使用lxml执行此操作的示例.我不确定你具体希望如何替换<attr/>节点,但这个例子应该能提供一个可以复用的模式.
更新 – 我把它改成:按文档顺序,用tree1中对应的节点替换tree2中的每个<attr>: import copy import lxml.etree xml1 = '''<root><attr><chaos foo="0"/></attr><attr><arena foo="1"/></attr></root>''' xml2 = '''<tree><attr><one/></attr><attr><two/></attr></tree>''' tree1 = lxml.etree.fromstring(xml1) tree2 = lxml.etree.fromstring(xml2) # select <attr/> nodes from tree1,will be used to replace corresponding # nodes in tree2 replacements = tree1.xpath('//attr') replacements.reverse() for attr in tree2.xpath('//attr'): # replace the attr node in tree2 with 'replacement' from tree1 node = replacements.pop() attr.getparent().replace(attr,copy.deepcopy(node)) print lxml.etree.tostring(tree2) 结果: <tree> <attr><chaos foo="0"/></attr> <attr><arena foo="1"/></attr> </tree> (编辑:李大同) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时联系站长删除相关内容! |