266 lines
11 KiB
Python
Raw Normal View History

2025-07-16 15:34:54 +08:00
# pyright: reportImportCycles=false
# pyright: reportPrivateUsage=false
"""|Document| and closely related objects."""
from __future__ import annotations
from typing import IO, TYPE_CHECKING, Iterator, List, Sequence
from docx.blkcntnr import BlockItemContainer
from docx.enum.section import WD_SECTION
from docx.enum.text import WD_BREAK
from docx.section import Section, Sections
from docx.shared import ElementProxy, Emu, Inches, Length
from docx.text.run import Run
if TYPE_CHECKING:
import docx.types as t
from docx.comments import Comment, Comments
from docx.oxml.document import CT_Body, CT_Document
from docx.parts.document import DocumentPart
from docx.settings import Settings
from docx.styles.style import ParagraphStyle, _TableStyle
from docx.table import Table
from docx.text.paragraph import Paragraph
class Document(ElementProxy):
    """Top-level proxy object for a WordprocessingML (WML) document.

    Instances are not meant to be created directly; call :func:`docx.Document` to open an
    existing document or to create a new one.
    """

    def __init__(self, element: CT_Document, part: DocumentPart):
        super().__init__(element)
        # -- keep direct references to the document element and the part that owns it --
        self._element = element
        self._part = part
        # -- the |_Body| proxy is built lazily, on first access to `._body` --
        self.__body = None

    def add_comment(
        self,
        runs: Run | Sequence[Run],
        text: str | None = "",
        author: str = "",
        initials: str | None = "",
    ) -> Comment:
        """Return a new |Comment| anchored to `runs` in this document.

        `runs` is either a single `Run` object or a non-empty sequence of `Run` objects. Only
        the first and last items of a sequence are consulted; accepting a whole sequence is a
        convenience for callers that already have one on hand, `paragraph.runs` for example.
        A lone `Run` acts as both the first and the last run.

        A comment can only be anchored on run boundaries, so the text it "references" is
        always a whole, non-zero number of consecutive runs. The first and last run may be in
        different paragraphs; every run lying between them is included in the reference.

        The referenced range is delimited by a `w:commentRangeStart` element placed before
        the first run and a `w:commentRangeEnd` element placed after the last run. Word uses
        these markers to work out which text to highlight in the UI.

        `text` lets the content of a simple comment be given right in the call, covering the
        common case of a single phrase or sentence with no special formatting such as bold or
        italics. Richer comments can be built up on the returned `Comment` object much as on
        a `Document` or (table) `Cell` object, using methods like `.add_paragraph()`,
        `.add_run()`, etc.

        `author` and `initials` set that metadata on the comment. `author` is a required
        attribute of a comment and defaults to the empty string. `initials` is optional and
        can be left off entirely by passing |None|; Word writes an `initials` attribute by
        default, so the empty string is used when no `initials` argument is given.
        """
        # -- a bare `Run` is wrapped so both call forms are handled as a sequence --
        run_seq = [runs] if isinstance(runs, Run) else runs
        range_start, range_end = run_seq[0], run_seq[-1]

        # -- comments can only appear in the document part --
        new_comment = self.comments.add_comment(text=text, author=author, initials=initials)

        # -- the first run takes care of placing both the range-start and range-end markers --
        range_start.mark_comment_range(range_end, new_comment.comment_id)
        return new_comment

    def add_heading(self, text: str = "", level: int = 1):
        """Append a heading paragraph containing `text` and return it.

        The paragraph style is chosen by `level`: 0 selects `Title`, and any other value
        selects `Heading {level}`, so the default of 1 produces `Heading 1`. Raises
        |ValueError| when `level` falls outside the range 0-9.
        """
        level_is_valid = 0 <= level <= 9
        if not level_is_valid:
            raise ValueError("level must be in range 0-9, got %d" % level)

        if level == 0:
            style = "Title"
        else:
            style = "Heading %d" % level
        return self.add_paragraph(text, style)

    def add_page_break(self):
        """Append a new |Paragraph| that contains only a page break and return it."""
        paragraph = self.add_paragraph()
        run = paragraph.add_run()
        run.add_break(WD_BREAK.PAGE)
        return paragraph

    def add_paragraph(self, text: str = "", style: str | ParagraphStyle | None = None) -> Paragraph:
        """Append a new paragraph to the end of the document and return it.

        The new paragraph holds `text` and is given paragraph style `style`. Tab (``\\t``)
        characters in `text` are converted to the appropriate XML form for a tab, and each
        newline (``\\n``) or carriage return (``\\r``) character is converted to a line
        break.
        """
        body = self._body
        return body.add_paragraph(text, style)

    def add_picture(
        self,
        image_path_or_stream: str | IO[bytes],
        width: int | Length | None = None,
        height: int | Length | None = None,
    ):
        """Append a picture in a paragraph of its own and return the new picture shape.

        The image is read from `image_path_or_stream` and scaled according to `width` and
        `height`. With neither given, the picture appears at its native size. With only one
        given, that value determines a scaling factor which is also applied to the other
        dimension, so the aspect ratio of the image is preserved. Native size is computed
        from the dots-per-inch (dpi) value recorded in the image file, falling back to
        72 dpi when the file does not specify one, as is often the case.
        """
        # -- the picture is hosted by a fresh run in a fresh paragraph at document end --
        host_paragraph = self.add_paragraph()
        host_run = host_paragraph.add_run()
        return host_run.add_picture(image_path_or_stream, width, height)

    def add_section(self, start_type: WD_SECTION = WD_SECTION.NEW_PAGE):
        """Append a new section to the document and return its |Section| object.

        `start_type` is optional; it must be a member of the :ref:`WdSectionStart`
        enumeration and defaults to ``WD_SECTION.NEW_PAGE``.
        """
        sectPr = self._element.body.add_section_break()
        sectPr.start_type = start_type
        return Section(sectPr, self._part)

    def add_table(self, rows: int, cols: int, style: str | _TableStyle | None = None):
        """Append a table with `rows` rows and `cols` columns and return it.

        `style` can be a table style object or the name of a table style. When `style` is
        |None|, the table inherits the default table style of the document.
        """
        body = self._body
        # -- the table is sized to the space between the margins of the last section --
        width = self._block_width
        new_table = body.add_table(rows, cols, width)
        new_table.style = style
        return new_table

    @property
    def comments(self) -> Comments:
        """The |Comments| object giving access to the comments added to this document."""
        part = self._part
        return part.comments

    @property
    def core_properties(self):
        """The |CoreProperties| object holding the Dublin Core properties of this document."""
        part = self._part
        return part.core_properties

    @property
    def inline_shapes(self):
        """The |InlineShapes| collection of this document.

        An inline shape is a graphical object, a picture for example, that lives in a run of
        text and behaves like a character glyph, flowing with the other text in its
        paragraph.
        """
        part = self._part
        return part.inline_shapes

    def iter_inner_content(self) -> Iterator[Paragraph | Table]:
        """Generate each `Paragraph` or `Table` of this document, in document order."""
        body = self._body
        return body.iter_inner_content()

    @property
    def paragraphs(self) -> List[Paragraph]:
        """List of the |Paragraph| objects in this document, in document order.

        Paragraphs enclosed in revision marks such as ``<w:ins>`` or ``<w:del>`` are not
        included.
        """
        body = self._body
        return body.paragraphs

    @property
    def part(self) -> DocumentPart:
        """The |DocumentPart| object this document belongs to."""
        return self._part

    def save(self, path_or_stream: str | IO[bytes]):
        """Write this document to `path_or_stream`.

        `path_or_stream` is either a filesystem path (a string) or a file-like object.
        """
        part = self._part
        part.save(path_or_stream)

    @property
    def sections(self) -> Sections:
        """The |Sections| object giving access to each section of this document."""
        # -- a new |Sections| proxy is built on every access --
        return Sections(self._element, self._part)

    @property
    def settings(self) -> Settings:
        """The |Settings| object giving access to the document-level settings."""
        part = self._part
        return part.settings

    @property
    def styles(self):
        """The |Styles| object giving access to the styles defined in this document."""
        part = self._part
        return part.styles

    @property
    def tables(self) -> List[Table]:
        """List of the |Table| objects in this document, in document order.

        Only top-level tables are included; a table nested inside a table cell is not. A
        table enclosed in revision marks such as ``<w:ins>`` or ``<w:del>`` is likewise left
        out.
        """
        body = self._body
        return body.tables

    @property
    def _block_width(self) -> Length:
        """|Length| of the space between the left and right margins of the last section.

        A page width of 8.5 inches and margins of 1 inch are substituted for any of those
        values that the section reports as missing (falsy).
        """
        last_section = self.sections[-1]
        page_width = last_section.page_width or Inches(8.5)
        left = last_section.left_margin or Inches(1)
        right = last_section.right_margin or Inches(1)
        usable_width = page_width - left - right
        return Emu(usable_width)

    @property
    def _body(self) -> _Body:
        """The |_Body| object that holds the content of this document.

        The proxy is created on first access and the same instance is returned afterwards.
        """
        body = self.__body
        if body is None:
            body = self.__body = _Body(self._element.body, self)
        return body
class _Body(BlockItemContainer):
    """Proxy for the `<w:body>` element of this document.

    Its primary role is to serve as the container for document content.
    """

    def __init__(self, body_elm: CT_Body, parent: t.ProvidesStoryPart):
        super().__init__(body_elm, parent)
        # -- direct reference to the `<w:body>` element, used by `.clear_content()` --
        self._body = body_elm

    def clear_content(self) -> _Body:
        """Remove all content from this |_Body| and return the same instance.

        Section properties for the main document story, if present, are preserved.
        """
        body_elm = self._body
        body_elm.clear_content()
        return self