Formatando XML com ElementTree
Uma busca rápida na web sobre como formatar um arquivo XML de forma que ele fique "bonito" para humanos resulta em poucas ocorrências. Com Python, uma das poucas coisas é um artigo de BruceEckel de 2003 - que também não dá a resposta.
Felizmente temos vários geradores de XML na biblioteca padrão. E, para alterar um gerador de XML bem escrito de forma que ele insira quebras de linha e indentação num arquivo XML, são necessárias bem poucas alterações.
Escolhi o ElementTree por ter um código Python bem fácil de entender. Poderia ter herdado o ElementTree e feito as alterações necessárias na subclasse (o jeito "certo"), mas, para o que eu precisava aqui, uma alteração do tipo "monkey patch" se prestava melhor.
Como é CookBook, também fica exemplo de como fazer o Monkey Patching - sempre lembrando que todos os outros usuários do módulo alterado na mesma aplicação serão afetados.
Formatador de XML
1 #!/usr/bin/env python
2 #coding: utf-8
3
4 # Author: Fredrik Lundh, João S. O. Bueno
5
6 # Copyright (c) 1999-2005 by Fredrik Lundh (geração de XML)
7 # Copyright (c) 2009 Fundação CPqD - Formatação do XML e partes auxiliares do script
8 #
9 # By obtaining, using, and/or copying this software and/or its
10 # associated documentation, you agree that you have read, understood,
11 # and will comply with the following terms and conditions:
12 #
13 # Permission to use, copy, modify, and distribute this software and
14 # its associated documentation for any purpose and without fee is
15 # hereby granted, provided that the above copyright notice appears in
16 # all copies, and that both that copyright notice and this permission
17 # notice appear in supporting documentation, and that the name of
18 # Secret Labs AB or the author not be used in advertising or publicity
19 # pertaining to distribution of the software without specific, written
20 # prior permission.
21 #
22 # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
23 # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
24 # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
25 # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
26 # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
27 # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
28 # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
29 # OF THIS SOFTWARE.
30 # --------------------------------------------------------------------
31
32 # Licensed to PSF under a Contributor Agreement.
33 # See http://www.python.org/2.4/license for licensing details.
34
35 """
36 Modifies the XML generator in ElementTree in order to create
37 a "human-readable", indented by level, XML output.
38 Can be used as a stand alone script, or to change ElementTree XML generation
39 on the fly through a call to "monkey_patch_etree" function.
40
41 """
42 import xml.etree.ElementTree
43 from xml.etree.ElementTree import Comment, _encode, ProcessingInstruction,\
44 QName, fixtag, _escape_attrib, _escape_cdata
45
def Etree_pretty__write(self, file, node, encoding, namespaces,
                        level=0, identator=" "):
    """Serialize *node* and its subtree to *file* as indented XML.

    Replacement for ``ElementTree._write``: every element start tag is
    written on a new line prefixed by ``level * identator``, and the end
    tag of an element that has text or children goes on its own line with
    the same prefix.  Comments get the prefix but no leading newline;
    processing instructions are written without either.

    Parameters:
        file: object whose ``write`` method receives the output.
        node: the element, comment or processing instruction to write.
        encoding: forwarded to ``_encode``, ``_escape_cdata`` and
            ``_escape_attrib``.
        namespaces: mapping handed to ``fixtag``; the entries that
            ``fixtag`` reports as new for this element are deleted from
            it again before returning.
        level: nesting depth of *node*; children are written with
            ``level + 1``.
        identator: string repeated ``level`` times to build the prefix.

    Raises:
        Whatever ``xml.etree.ElementTree._raise_serialization_error``
        raises, when handling a tag, attribute name or ``QName`` value
        fails with ``TypeError``.
    """
    indent = level * identator
    tag = node.tag
    if tag is Comment:
        file.write(indent + "<!-- %s -->" % _escape_cdata(node.text, encoding))
    elif tag is ProcessingInstruction:
        file.write("<?%s?>" % _escape_cdata(node.text, encoding))
    else:
        items = node.items()
        xmlns_items = []  # new namespaces in this scope
        try:
            if isinstance(tag, QName) or tag[:1] == "{":
                tag, xmlns = fixtag(tag, namespaces)
                if xmlns:
                    xmlns_items.append(xmlns)
        except TypeError:
            # The helper is reached through the module because it is not
            # among the names this file imports from ElementTree; a bare
            # reference would fail with NameError.
            xml.etree.ElementTree._raise_serialization_error(tag)
        file.write("\n" + indent + "<" + _encode(tag, encoding))
        if items or xmlns_items:
            for k, v in sorted(items):  # lexical order
                try:
                    if isinstance(k, QName) or k[:1] == "{":
                        k, xmlns = fixtag(k, namespaces)
                        if xmlns:
                            xmlns_items.append(xmlns)
                except TypeError:
                    xml.etree.ElementTree._raise_serialization_error(k)
                try:
                    if isinstance(v, QName):
                        v, xmlns = fixtag(v, namespaces)
                        if xmlns:
                            xmlns_items.append(xmlns)
                except TypeError:
                    xml.etree.ElementTree._raise_serialization_error(v)
                file.write(" %s=\"%s\"" % (_encode(k, encoding),
                                           _escape_attrib(v, encoding)))
            for k, v in xmlns_items:
                file.write(" %s=\"%s\"" % (_encode(k, encoding),
                                           _escape_attrib(v, encoding)))
        if node.text or len(node):
            file.write(">")
            if node.text:
                # NOTE(review): the prefix is inserted *before* each newline,
                # so it lands as trailing whitespace on the preceding line.
                # Confirm whether "\n" + prefix was intended.
                file.write(_escape_cdata(
                    node.text.replace("\n", (level + 1) * identator + "\n"),
                    encoding))
            for n in node:
                # Goes through self._write so the recursion uses whichever
                # writer is currently installed on the ElementTree object.
                self._write(file, n, encoding, namespaces, level + 1, identator)
            file.write("\n" + indent + "</" + _encode(tag, encoding) + ">")
        else:
            file.write(" />")
        # Namespaces declared on this element go out of scope with it.
        for k, v in xmlns_items:
            del namespaces[v]
    if node.tail:
        # NOTE(review): same newline/prefix ordering as for node.text above.
        file.write(_escape_cdata(node.tail.replace("\n", indent + "\n"), encoding))
97
98
99
# Reference to the serializer that ElementTree ships with, captured at import
# time so it can be reinstalled after patching.
original__write = xml.etree.ElementTree.ElementTree._write


def monkey_patch_etree():
    """Install ``Etree_pretty__write`` as ``ElementTree._write``.

    The replacement is made on the ``xml.etree.ElementTree.ElementTree``
    class itself, so it applies to every user of that class in the running
    program, not only to the caller.
    """
    setattr(xml.etree.ElementTree.ElementTree, "_write", Etree_pretty__write)
108
def un_monkey_patch_etree():
    """Reinstall the serializer saved in ``original__write`` on ``ElementTree``."""
    setattr(xml.etree.ElementTree.ElementTree, "_write", original__write)
112
def de_tokenize(text):
    """Replace decimal character references such as ``&#195;`` by the character.

    Every ``&#N;`` with one to three decimal digits is replaced by
    ``chr(N)`` in a single left-to-right pass, so text produced by one
    replacement is never scanned again and zero-padded references
    (``&#065;``) are handled like their unpadded form.  No code is
    special-cased: ``&#38;`` becomes a bare ``&``.

    Parameters:
        text: string that may contain decimal character references.

    Returns:
        *text* with the matching references substituted.
    """
    # Imported here because the module only imports ``re`` inside its
    # ``__main__`` guard, which does not run when the module is imported.
    import re

    def _decode(match):
        try:
            return chr(int(match.group(1)))
        except ValueError:
            # chr() rejected the value (Python 2 byte strings stop at 255);
            # leave the reference as it was instead of aborting.
            return match.group(0)

    return re.sub(r"\&\#([0-9]{1,3})\;", _decode, text)
123
124
125
if __name__ == "__main__":
    # Command-line entry point: prettyxml.py <inputfile> <outputfile>
    import re  # used by de_tokenize()
    import sys

    if len(sys.argv) != 3:
        sys.stderr.write("Usage: prettyxml.py <inputfile> <outputfile>\n")
        sys.exit(1)

    monkey_patch_etree()

    # try/finally rather than "with", which needs a __future__ import on
    # Python 2.5; either way the handles are released even if parsing or
    # writing fails.
    infile = open(sys.argv[1], "rt")
    try:
        xmldata = xml.etree.ElementTree.parse(infile)
    finally:
        infile.close()

    xmlstring = de_tokenize(xml.etree.ElementTree.tostring(xmldata.getroot()))

    # The output file is opened only once the document has been serialized,
    # so a failure above does not leave a truncated output file behind.
    outfile = open(sys.argv[2], "wt")
    try:
        outfile.write(xmlstring)
    finally:
        outfile.close()
Muita hora nessa calma
Embora o código que gera XML acima possa "assustar", é bom lembrar que foram feitas poucas alterações em cima do código original da ElementTree, que só foi copiado e colado.
Segue uma pequena listagem só com as diferenças entre o método _write do objeto ElementTree de xml.etree.ElementTree e o método que eu ponho em seu lugar - pode-se observar que não fiz mais que inserir as quebras de linha e o espaçamento nos pontos em que eu gostaria:
1 --- orig.py 2009-02-09 11:12:44.000000000 -0200
2 +++ changed.py 2009-02-09 11:09:45.000000000 -0200
3 @@ -1 +1,2 @@
4 -def _write(self, file, node, encoding, namespaces):
5 +def Etree_pretty__write(self, file, node, encoding, namespaces,
6 + level=0, identator=" "):
7 @@ -5 +6 @@
8 - file.write("<!-- %s -->" % _escape_cdata(node.text, encoding))
9 + file.write(level * identator + "<!-- %s -->" % _escape_cdata(node.text, encoding))
10 @@ -17 +18 @@
11 - file.write("<" + _encode(tag, encoding))
12 + file.write("\n" + level * identator + "<" + _encode(tag, encoding))
13 @@ -41 +42 @@
14 - file.write(_escape_cdata(node.text, encoding))
15 + file.write(_escape_cdata(node.text.replace("\n", (level + 1) * identator + "\n"), encoding))
16 @@ -43,2 +44,2 @@
17 - self._write(file, n, encoding, namespaces)
18 - file.write("</" + _encode(tag, encoding) + ">")
19 + self._write(file, n, encoding, namespaces, level + 1, identator)
20 + file.write("\n" + level * identator + "</" + _encode(tag, encoding) + ">")
21 @@ -50 +51 @@
22 - file.write(_escape_cdata(node.tail, encoding))
23 + file.write(_escape_cdata(node.tail.replace("\n", level * identator + "\n"), encoding))
24