279 lines
11 KiB
Python
279 lines
11 KiB
Python
"""Custom element class for rendered page-break (CT_LastRenderedPageBreak)."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import copy
|
|
from typing import TYPE_CHECKING
|
|
|
|
from docx.oxml.xmlchemy import BaseOxmlElement
|
|
from docx.shared import lazyproperty
|
|
|
|
if TYPE_CHECKING:
|
|
from docx.oxml.text.hyperlink import CT_Hyperlink
|
|
from docx.oxml.text.paragraph import CT_P
|
|
|
|
|
|
class CT_LastRenderedPageBreak(BaseOxmlElement):
    """`<w:lastRenderedPageBreak>` element, indicating page break inserted by renderer.

    A rendered page-break is one inserted by the renderer when it runs out of room on a
    page. It is an empty element (no attrs or children) and is a child of CT_R, peer to
    CT_Text.

    NOTE: this complex-type name does not exist in the schema, where
    `w:lastRenderedPageBreak` maps to `CT_Empty`. This name was added to give it
    distinguished behavior. CT_Empty is used for many elements.
    """

    @property
    def following_fragment_p(self) -> CT_P:
        """A "loose" `CT_P` containing only the paragraph content after this break.

        Raises `ValueError` if this `w:lastRenderedPageBreak` is not the first rendered
        page-break in its paragraph.

        The returned `CT_P` is a "clone" (deepcopy) of the `w:p` ancestor of this
        page-break with this `w:lastRenderedPageBreak` element and all content preceding
        it removed. When the page-break is inside a hyperlink, the entire hyperlink is
        removed as well, since the hyperlink is not split and belongs to the preceding
        fragment.

        NOTE: this `w:p` can itself contain one or more `w:lastRenderedPageBreak`
        elements (when the paragraph contained more than one). While this is rare, the
        caller should treat this paragraph the same as other paragraphs and split it if
        necessary in a following step or recursion.
        """
        if self != self._first_lrpb_in_p(self._enclosing_p):
            raise ValueError("only defined on first rendered page-break in paragraph")

        # -- splitting approach is different when break is inside a hyperlink --
        if self._is_in_hyperlink:
            return self._following_frag_in_hlink
        return self._following_frag_in_run

    @property
    def follows_all_content(self) -> bool:
        """True when this page-break element is the last "content" in the paragraph.

        This is very uncommon case and may only occur in contrived or cases where the
        XML is edited by hand, but it is not precluded by the spec.
        """
        # -- a page-break inside a hyperlink never meets these criteria (for our
        # -- purposes at least) because it is considered "atomic" and always associated
        # -- with the page it starts on.
        if self._is_in_hyperlink:
            return False

        trailing_lrpbs = self._enclosing_p.xpath(
            # -- in last run of paragraph --
            "(./w:r)[last()]"
            # -- all page-breaks --
            "/w:lastRenderedPageBreak"
            # -- that are not followed by any content-bearing elements --
            f"[not(following-sibling::*[{self._run_inner_content_xpath}])]"
        )
        # -- the XPath is evaluated from the paragraph, so it can match a *different*
        # -- page-break when the paragraph contains more than one; only report True
        # -- when this element is among those matched.
        return self in trailing_lrpbs

    @property
    def precedes_all_content(self) -> bool:
        """True when a `w:lastRenderedPageBreak` precedes all paragraph content.

        This is a common case; it occurs whenever the page breaks on an even paragraph
        boundary.
        """
        # -- a page-break inside a hyperlink never meets these criteria because there
        # -- is always part of the hyperlink text before the page-break.
        if self._is_in_hyperlink:
            return False

        leading_lrpbs = self._enclosing_p.xpath(
            # -- in first run of paragraph --
            "./w:r[1]"
            # -- all page-breaks --
            "/w:lastRenderedPageBreak"
            # -- that are not preceded by any content-bearing elements --
            f"[not(preceding-sibling::*[{self._run_inner_content_xpath}])]"
        )
        # -- the XPath is evaluated from the paragraph, so it can match a *different*
        # -- page-break when the paragraph contains more than one; only report True
        # -- when this element is among those matched.
        return self in leading_lrpbs

    @property
    def preceding_fragment_p(self) -> CT_P:
        """A "loose" `CT_P` containing only the paragraph content before this break.

        Raises `ValueError` if this `w:lastRenderedPageBreak` is not the first rendered
        page-break in its paragraph.

        The returned `CT_P` is a "clone" (deepcopy) of the `w:p` ancestor of this
        page-break with this `w:lastRenderedPageBreak` element and all content following
        it removed. When the page-break is inside a hyperlink, the entire hyperlink is
        retained (minus the page-break itself) so the hyperlink is not split.
        """
        if self != self._first_lrpb_in_p(self._enclosing_p):
            raise ValueError("only defined on first rendered page-break in paragraph")

        # -- splitting approach is different when break is inside a hyperlink --
        if self._is_in_hyperlink:
            return self._preceding_frag_in_hlink
        return self._preceding_frag_in_run

    def _enclosing_hyperlink(self, lrpb: CT_LastRenderedPageBreak) -> CT_Hyperlink:
        """The `w:hyperlink` grandparent of the `lrpb` page-break element.

        Note the lookup is performed on the `lrpb` argument, not on `self`; this allows
        it to be applied to the page-break in a cloned paragraph.

        Raises `IndexError` when `lrpb` has a `w:p` grandparent, so only call when the
        page-break is known to be inside a hyperlink (`._is_in_hyperlink` is True).
        """
        return lrpb.xpath("./parent::w:r/parent::w:hyperlink")[0]

    @property
    def _enclosing_p(self) -> CT_P:
        """The `w:p` element parent or grandparent of this `w:lastRenderedPageBreak`."""
        return self.xpath("./ancestor::w:p[1]")[0]

    def _first_lrpb_in_p(self, p: CT_P) -> CT_LastRenderedPageBreak:
        """The first `w:lastRenderedPageBreak` element in `p`.

        Both page-breaks in runs that are direct children of `p` and those in runs
        inside a `w:hyperlink` child of `p` are considered.

        Raises `ValueError` if there are no rendered page-breaks in `p`.
        """
        lrpbs = p.xpath(
            "./w:r/w:lastRenderedPageBreak | ./w:hyperlink/w:r/w:lastRenderedPageBreak"
        )
        if not lrpbs:
            raise ValueError("no rendered page-breaks in paragraph element")
        return lrpbs[0]

    @lazyproperty
    def _following_frag_in_hlink(self) -> CT_P:
        """Following CT_P fragment when break occurs within a hyperlink.

        Note this is a *partial-function* and raises when `lrpb` is not inside a
        hyperlink.
        """
        if not self._is_in_hyperlink:
            raise ValueError("only defined on a rendered page-break in a hyperlink")

        # -- work on a clone `w:p` so our mutations don't persist --
        p = copy.deepcopy(self._enclosing_p)

        # -- get this `w:lastRenderedPageBreak` in the cloned `w:p` (not self) --
        lrpb = self._first_lrpb_in_p(p)

        # -- locate `w:hyperlink` in which this `w:lastRenderedPageBreak` is found --
        hyperlink = lrpb._enclosing_hyperlink(lrpb)

        # -- delete all w:p inner-content preceding the hyperlink (but not w:pPr) --
        for e in hyperlink.xpath("./preceding-sibling::*[not(self::w:pPr)]"):
            p.remove(e)

        # -- remove the whole hyperlink, it belongs to the preceding-fragment-p --
        hyperlink.getparent().remove(hyperlink)

        # -- that's it, return the remaining fragment of `w:p` clone --
        return p

    @lazyproperty
    def _following_frag_in_run(self) -> CT_P:
        """Following CT_P fragment when break does not occur in a hyperlink.

        Note this is a *partial-function* and raises when `lrpb` is inside a hyperlink.
        """
        if self._is_in_hyperlink:
            raise ValueError("only defined on a rendered page-break not in a hyperlink")

        # -- work on a clone `w:p` so our mutations don't persist --
        p = copy.deepcopy(self._enclosing_p)

        # -- get this `w:lastRenderedPageBreak` in the cloned `w:p` (not self) --
        lrpb = self._first_lrpb_in_p(p)

        # -- locate `w:r` in which this `w:lastRenderedPageBreak` is found --
        enclosing_r = lrpb.xpath("./parent::w:r")[0]

        # -- delete all w:p inner-content preceding that run (but not w:pPr) --
        for e in enclosing_r.xpath("./preceding-sibling::*[not(self::w:pPr)]"):
            p.remove(e)

        # -- then remove all run inner-content preceding this lrpb in its run (but not
        # -- the `w:rPr`) and also remove the page-break itself
        for e in lrpb.xpath("./preceding-sibling::*[not(self::w:rPr)]"):
            enclosing_r.remove(e)
        enclosing_r.remove(lrpb)

        return p

    @lazyproperty
    def _is_in_hyperlink(self) -> bool:
        """True when this page-break is embedded in a hyperlink run."""
        return bool(self.xpath("./parent::w:r/parent::w:hyperlink"))

    @lazyproperty
    def _preceding_frag_in_hlink(self) -> CT_P:
        """Preceding CT_P fragment when break occurs within a hyperlink.

        Note this is a *partial-function* and raises when `lrpb` is not inside a
        hyperlink.
        """
        if not self._is_in_hyperlink:
            raise ValueError("only defined on a rendered page-break in a hyperlink")

        # -- work on a clone `w:p` so our mutations don't persist --
        p = copy.deepcopy(self._enclosing_p)

        # -- get this `w:lastRenderedPageBreak` in the cloned `w:p` (not self) --
        lrpb = self._first_lrpb_in_p(p)

        # -- locate `w:hyperlink` in which this `w:lastRenderedPageBreak` is found --
        hyperlink = lrpb._enclosing_hyperlink(lrpb)

        # -- delete all w:p inner-content following the hyperlink --
        for e in hyperlink.xpath("./following-sibling::*"):
            p.remove(e)

        # -- remove this page-break from inside the hyperlink --
        lrpb.getparent().remove(lrpb)

        # -- that's it, the entire hyperlink goes into the preceding fragment so
        # -- the hyperlink is not "split".
        return p

    @lazyproperty
    def _preceding_frag_in_run(self) -> CT_P:
        """Preceding CT_P fragment when break does not occur in a hyperlink.

        Note this is a *partial-function* and raises when `lrpb` is inside a hyperlink.
        """
        if self._is_in_hyperlink:
            raise ValueError("only defined on a rendered page-break not in a hyperlink")

        # -- work on a clone `w:p` so our mutations don't persist --
        p = copy.deepcopy(self._enclosing_p)

        # -- get this `w:lastRenderedPageBreak` in the cloned `w:p` (not self) --
        lrpb = self._first_lrpb_in_p(p)

        # -- locate `w:r` in which this `w:lastRenderedPageBreak` is found --
        enclosing_r = lrpb.xpath("./parent::w:r")[0]

        # -- delete all `w:p` inner-content following that run --
        for e in enclosing_r.xpath("./following-sibling::*"):
            p.remove(e)

        # -- then delete all `w:r` inner-content following this lrpb in its run and
        # -- also remove the page-break itself
        for e in lrpb.xpath("./following-sibling::*"):
            enclosing_r.remove(e)
        enclosing_r.remove(lrpb)

        return p

    @lazyproperty
    def _run_inner_content_xpath(self) -> str:
        """XPath fragment matching any run inner-content elements.

        Intended for use inside a predicate, e.g. `*[{fragment}]`, where it selects
        elements that are any of the content-bearing children of a run.
        """
        return (
            "self::w:br"
            " | self::w:cr"
            " | self::w:drawing"
            " | self::w:noBreakHyphen"
            " | self::w:ptab"
            " | self::w:t"
            " | self::w:tab"
        )
|