import collections import collections.abc import re start_ranges = "|".join( "[{0}]".format(r) for r in [ "\xC0-\xD6", "\xD8-\xF6", "\xF8-\u02FF", "\u0370-\u037D", "\u037F-\u1FFF", "\u200C-\u200D", "\u2070-\u218F", "\u2C00-\u2FEF", "\u3001-\uD7FF", "\uF900-\uFDCF", "\uFDF0-\uFFFD", ] ) NameStartChar = re.compile(r"(:|[A-Z]|_|[a-z]|{0})".format(start_ranges)) NameChar = re.compile(r"(\-|\.|[0-9]|\xB7|[\u0300-\u036F]|[\u203F-\u2040])") ######################## ### NODE ######################## class DataSorter: """ Used to sort a map of data depending on it's type """ def keys_from(self, data): sorted_data = data if not isinstance(data, collections.OrderedDict): sorted_data = sorted(data) return sorted_data class always: def keys_from(self, data): return sorted(data) class never: def keys_from(self, data): return data class Node(object): """ Represents each tag in the tree Each node has _either_ a single value or one or more children If it has a value: The serialized result is <%(tag)s>%(value)s If it has children: The serialized result is <%(wrap)s> %(children)s Which one it is depends on the implementation of self.convert """ # A mapping of characters to treat as escapable entities and their replacements entities = [("&", "&"), ("<", "<"), (">", ">")] def __init__( self, wrap="", tag="", data=None, iterables_repeat_wrap=True, closed_tags_for=None, data_sorter=None, ): self.tag = self.sanitize_element(tag) self.wrap = self.sanitize_element(wrap) self.data = data self.type = self.determine_type() self.data_sorter = data_sorter if data_sorter is not None else DataSorter() self.closed_tags_for = closed_tags_for self.iterables_repeat_wrap = iterables_repeat_wrap if self.type == "flat" and isinstance(self.data, str): # Make sure we deal with entities for entity, replacement in self.entities: self.data = self.data.replace(entity, replacement) def serialize(self, indenter): """Returns the Node serialized as an xml string""" # Determine the start and end of this node wrap = self.wrap end, start = "", "" if wrap: end = "".format(wrap) start = "<{0}>".format(wrap) if self.closed_tags_for and self.data in self.closed_tags_for: return "<{0}/>".format(self.wrap) # Convert the data attached in this node into a value and children value, children = self.convert() # Determine the content of the node (essentially the children as a string value) content = "" if children: if self.type != "iterable": # Non-iterable wraps all it's children in the same tag content = indenter((c.serialize(indenter) for c in children), wrap) else: if self.iterables_repeat_wrap: # Iterables repeat the wrap for each child result = [] for c in children: content = c.serialize(indenter) if c.type == "flat": # Child with value, it already is surrounded by the tag result.append(content) else: # Child with children of it's own, they need to be wrapped by start and end content = indenter([content], True) result.append("".join((start, content, end))) # We already have what we want, return the indented result return indenter(result, False) else: result = [] for c in children: result.append(c.serialize(indenter)) return "".join([start, indenter(result, True), end]) # If here, either: # * Have a value # * Or this node is not an iterable return "".join((start, value, content, end)) def determine_type(self): """ Return the type of the data on this node as an identifying string * Iterable : Supports "for item in data" * Mapping : Supports "for key in data: value = data[key]" * flat : A string or something that isn't iterable or a mapping """ data = self.data if isinstance(data, str): return "flat" elif isinstance(data, collections.abc.Mapping): return "mapping" elif isinstance(data, collections.abc.Iterable): return "iterable" else: return "flat" def convert(self): """ Convert data on this node into a (value, children) tuple depending on the type of the data If the type is : * flat : Use self.tag to surround the value. value * mapping : Return a list of tags where the key for each child is the wrap for that node * iterable : Return a list of Nodes where self.wrap is the tag for that node """ val = "" typ = self.type data = self.data children = [] if typ == "mapping": sorted_keys = self.data_sorter.keys_from(data) for key in sorted_keys: item = data[key] children.append( Node( key, "", item, iterables_repeat_wrap=self.iterables_repeat_wrap, closed_tags_for=self.closed_tags_for, data_sorter=self.data_sorter, ) ) elif typ == "iterable": for item in data: children.append( Node( "", self.wrap, item, iterables_repeat_wrap=self.iterables_repeat_wrap, closed_tags_for=self.closed_tags_for, data_sorter=self.data_sorter, ) ) else: val = str(data) if self.tag: val = "<{0}>{1}".format(self.tag, val, self.tag) return val, children @staticmethod def sanitize_element(wrap): """ Convert `wrap` into a valid tag name applying the XML Naming Rules. * Names can contain letters, numbers, and other characters * Names cannot start with a number or punctuation character * Names cannot start with the letters xml (or XML, or Xml, etc) * Names cannot contain spaces * Any name can be used, no words are reserved. :ref: http://www.w3.org/TR/REC-xml/#NT-NameChar """ if wrap and isinstance(wrap, str): if wrap.lower().startswith("xml"): wrap = "_" + wrap return "".join( ["_" if not NameStartChar.match(wrap) else ""] + ["_" if not (NameStartChar.match(c) or NameChar.match(c)) else c for c in wrap] ) else: return wrap ######################## ### CONVERTER ######################## class Converter(object): """Logic for creating a Node tree and serialising that tree into a string""" def __init__(self, wrap=None, indent=" ", newlines=True): """ wrap: The tag that the everything else will be contained within indent: The string that is multiplied at the start of each new line, to represent each level of nesting newlines: A boolean specifying whether we want each tag on a new line. Note that indent only works if newlines is True """ self.wrap = wrap self.indent = indent self.newlines = newlines def _make_indenter(self): """Returns a function that given a list of strings, will return that list as a single, indented, string""" indent = self.indent newlines = self.newlines if not newlines: # No newlines, don't care about indentation ret = lambda nodes, wrapped: "".join(nodes) else: if not indent: indent = "" def eachline(nodes): """Yield each line in each node""" for node in nodes: for line in node.split("\n"): yield line def ret(nodes, wrapped): """ Indent nodes depending on value of wrapped and indent If not wrapped, then don't indent Otherwise, Seperate each child by a newline and indent each line in the child by one indent unit """ if wrapped: seperator = "\n{0}".format(indent) surrounding = "\n{0}{{0}}\n".format(indent) else: seperator = "\n" surrounding = "{0}" return surrounding.format(seperator.join(eachline(nodes))) return ret def build(self, data, iterables_repeat_wrap=True, closed_tags_for=None, data_sorter=None): """Create a Node tree from the data and return it as a serialized xml string""" indenter = self._make_indenter() return Node( wrap=self.wrap, data=data, iterables_repeat_wrap=iterables_repeat_wrap, closed_tags_for=closed_tags_for, data_sorter=data_sorter, ).serialize(indenter)