63 lines
2.3 KiB
Python
63 lines
2.3 KiB
Python
# pyright: reportImportCycles=false
|
|
|
|
"""XML parser for python-docx."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from typing import TYPE_CHECKING, Dict, Type, cast
|
|
|
|
from lxml import etree
|
|
|
|
from docx.oxml.ns import NamespacePrefixedTag, nsmap
|
|
|
|
if TYPE_CHECKING:
|
|
from docx.oxml.xmlchemy import BaseOxmlElement
|
|
|
|
|
|
# -- configure XML parser --
# The namespace-based lookup is the registry that `register_element_cls()` populates;
# attaching it to the parser makes parsed elements use the registered classes.
element_class_lookup = etree.ElementNamespaceClassLookup()
oxml_parser = etree.XMLParser(
    resolve_entities=False,
    remove_blank_text=True,
)
oxml_parser.set_element_class_lookup(element_class_lookup)
|
|
|
|
|
|
def parse_xml(xml: str | bytes) -> "BaseOxmlElement":
    """Parse the XML in `xml` and return its root lxml element.

    Parsing is performed by the module's custom `oxml_parser`, so elements in `xml`
    that have a custom element class are produced as instances of that class.
    """
    root = etree.fromstring(xml, oxml_parser)
    return cast("BaseOxmlElement", root)
|
|
|
|
|
|
def register_element_cls(tag: str, cls: Type["BaseOxmlElement"]):
    """Associate the lxml custom element class `cls` with the element named `tag`.

    After registration, an instance of `cls` is constructed whenever the oxml parser
    encounters an element with a matching tag. `tag` is a namespace-prefixed tag name
    of the form `nspfx:tagroot`, e.g. `'w:document'`; its prefix is looked up in
    `nsmap` to find the namespace the class is registered under.
    """
    prefix, local_name = tag.split(":")
    namespace_uri = nsmap[prefix]
    element_class_lookup.get_namespace(namespace_uri)[local_name] = cls
|
|
|
|
|
|
def OxmlElement(
    nsptag_str: str,
    attrs: Dict[str, str] | None = None,
    nsdecls: Dict[str, str] | None = None,
) -> BaseOxmlElement | etree._Element:  # pyright: ignore[reportPrivateUsage]
    """Create a 'loose' (unattached) lxml element for the tag named by `nsptag_str`.

    `nsptag_str` must carry the standard namespace prefix, e.g. `a:tbl`. When a custom
    element class is defined for that tag name, the new element is an instance of it.

    `attrs`, when provided, is a dict of attribute values to set on the element.

    `nsdecls` maps namespace prefixes to namespace names; each entry is declared on
    the element. When `nsdecls` is omitted, a single namespace declaration is added,
    derived from the prefix of `nsptag_str`.
    """
    nsptag = NamespacePrefixedTag(nsptag_str)
    # -- fall back to the tag's own prefix mapping when no declarations are given --
    declared_namespaces = nsptag.nsmap if nsdecls is None else nsdecls
    return oxml_parser.makeelement(
        nsptag.clark_name,
        attrib=attrs,
        nsmap=declared_namespaces,
    )
|