# idconvert/backend/word/builder.py
#
# 325 lines
# 13 KiB
# Python

from __future__ import annotations
import base64
import io
from typing import Optional
import structlog
from docx import Document
from docx.enum.section import WD_SECTION
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Pt
from lxml import etree
from export.models import (
IDExportDocument,
IDExportImageFrame,
IDExportPage,
IDExportShape,
IDExportTable,
IDExportTextFrame,
)
from word.factories import FooterFactory, StyleFactory, TableFactory
# Module-level structured logger; used below to report images that cannot be
# decoded or registered.
log = structlog.get_logger()
# DrawingML / OOXML namespaces
# wp: prefix of the wp:anchor element that positions each drawing on the page.
_WP = 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'
# a: prefix of the a:graphic payload (geometry, fills, outlines, blips).
_A = 'http://schemas.openxmlformats.org/drawingml/2006/main'
# pic: prefix of the pic:pic element used for image frames.
_PIC = 'http://schemas.openxmlformats.org/drawingml/2006/picture'
# r: prefix of the r:embed attribute that references an image relationship.
_R = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
# w: prefix of the WordprocessingML elements (w:p, w:r, w:drawing, w:txbxContent).
_W = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
# wps: prefix of wps:wsp shapes/text boxes; also used as the a:graphicData uri.
_WPS = 'http://schemas.microsoft.com/office/word/2010/wordprocessingShape'
def _pt_to_emu(pt: float) -> int:
    """Convert a length in points to English Metric Units (EMU).

    One inch is 914400 EMU and 72 pt, so one point is exactly 12700 EMU.
    The result is rounded to the nearest EMU instead of being truncated, so
    fractional sizes are not biased toward zero and binary floating-point
    error cannot drop a whole EMU.

    Args:
        pt: Length in points.

    Returns:
        The nearest whole number of EMU.
    """
    # 914400 EMU per inch / 72 pt per inch == 12700 EMU per point.
    return round(pt * 12700)
class DocxBuilder:
    """Builds a DOCX from a parsed IDExportDocument using absolute page anchoring.

    Every shape, text frame and image is placed as an absolutely-positioned
    ``wp:anchor`` at its InDesign coordinates (``relativeFrom="page"``).
    Shapes are placed behind text (``behindDoc=1``); text boxes and images are
    placed in front (``behindDoc=0``). Tables are added inline in Y-position
    order.
    """

    # Margin applied to all four sides of every section, in points
    # (56.69 pt is approximately 20 mm).
    _MARGIN_PT = 56.69

    def __init__(self) -> None:
        # Initialised here as well as in build() so the helper methods can be
        # called on a fresh instance without raising AttributeError.
        self._shape_counter = 0

    def build(self, document: IDExportDocument) -> bytes:
        """Render *document* to a DOCX file and return its bytes.

        One section is created per page. Within a page, shapes are emitted
        first, then text frames and images, then tables sorted by ``y_pt``.
        Items of any other type are ignored.

        Args:
            document: Parsed export document to convert.

        Returns:
            The serialized ``.docx`` file content.
        """
        doc = Document()
        self._shape_counter = 0

        # Register all paragraph styles once before any content is added
        StyleFactory.register_all(doc, document)

        margin = Pt(self._MARGIN_PT)
        for page_idx, page in enumerate(document.pages):
            # The first page reuses the section python-docx creates by default.
            if page_idx == 0:
                section = doc.sections[0]
            else:
                section = doc.add_section(WD_SECTION.NEW_PAGE)

            # A page without its own size falls back to the document size.
            section.page_width = Pt(page.width_pt or document.page_width_pt)
            section.page_height = Pt(page.height_pt or document.page_height_pt)
            section.left_margin = margin
            section.right_margin = margin
            section.top_margin = margin
            section.bottom_margin = margin

            # 1. Shapes first — placed behind (behindDoc=1)
            for item in page.items:
                if isinstance(item, IDExportShape):
                    self._add_shape(doc, item)

            # 2. Text frames and images — placed in front (behindDoc=0)
            for item in page.items:
                if isinstance(item, IDExportTextFrame):
                    self._add_text_box(doc, item)
                elif isinstance(item, IDExportImageFrame):
                    self._add_image_anchor(doc, item)

            # 3. Tables — inline, sorted by Y position
            tables = sorted(
                (i for i in page.items if isinstance(i, IDExportTable)),
                key=lambda t: t.y_pt,
            )
            for table in tables:
                TableFactory.add_to_doc(doc, table)

        if document.page_number_style:
            FooterFactory.apply(doc, document.page_number_style)

        buf = io.BytesIO()
        doc.save(buf)
        return buf.getvalue()

    # ── Anchor helpers ─────────────────────────────────────────────────────────

    def _next_id(self) -> int:
        """Return the next drawing id (1, 2, 3, ... since the last build())."""
        self._shape_counter += 1
        return self._shape_counter

    @staticmethod
    def _normalize_hex(value: Optional[str]) -> Optional[str]:
        """Return *value* as a six-digit upper-case hex colour, or None.

        A leading ``#`` and surrounding whitespace are tolerated. Anything
        that is not exactly six hex digits afterwards yields None, because
        the value is written into ``a:srgbClr val="..."`` and a malformed
        value would produce an invalid document.
        """
        if not value:
            return None
        candidate = str(value).strip().lstrip('#').upper()
        if len(candidate) == 6 and all(c in '0123456789ABCDEF' for c in candidate):
            return candidate
        return None

    def _anchor_para(self, doc: Document) -> etree._Element:
        """Add a paragraph to host an anchor and return its raw ``w:p`` element.

        The paragraph gets no spacing before or after and ``w:line="240"``.
        """
        # NOTE(review): no w:lineRule is set, so the host paragraph may still
        # occupy a line in the text flow rather than being zero-height —
        # confirm this is intended on pages with many anchored elements.
        para = doc.add_paragraph()
        pPr = OxmlElement('w:pPr')
        spacing = OxmlElement('w:spacing')
        spacing.set(qn('w:before'), '0')
        spacing.set(qn('w:after'), '0')
        spacing.set(qn('w:line'), '240')
        pPr.append(spacing)
        para._p.insert(0, pPr)
        return para._p

    def _make_anchor(self, sid: int, x_pt: float, y_pt: float,
                     w_pt: float, h_pt: float, behind: bool = False) -> etree._Element:
        """Build a wp:anchor element (without graphic content) at absolute page coords.

        Args:
            sid: Drawing id written to ``wp:docPr`` (``id`` and ``name="El<sid>"``).
            x_pt: Horizontal offset from the page edge, in points.
            y_pt: Vertical offset from the page edge, in points.
            w_pt: Extent width in points.
            h_pt: Extent height in points.
            behind: When true the anchor is emitted with ``behindDoc="1"`` and
                ``relativeHeight="0"``; otherwise ``behindDoc="0"`` and
                ``relativeHeight="251658240"``.

        Returns:
            The parsed ``wp:anchor`` element, ready to receive an ``a:graphic``.
        """
        x = _pt_to_emu(x_pt)
        y = _pt_to_emu(y_pt)
        w = _pt_to_emu(w_pt)
        h = _pt_to_emu(h_pt)
        z = '0' if behind else '251658240'
        bd = '1' if behind else '0'
        return etree.fromstring(
            f'<wp:anchor xmlns:wp="{_WP}" '
            f'distT="0" distB="0" distL="0" distR="0" '
            f'simplePos="0" relativeHeight="{z}" behindDoc="{bd}" '
            f'locked="0" layoutInCell="1" allowOverlap="1">'
            f'<wp:simplePos x="0" y="0"/>'
            f'<wp:positionH relativeFrom="page"><wp:posOffset>{x}</wp:posOffset></wp:positionH>'
            f'<wp:positionV relativeFrom="page"><wp:posOffset>{y}</wp:posOffset></wp:positionV>'
            f'<wp:extent cx="{w}" cy="{h}"/>'
            f'<wp:effectExtent l="0" t="0" r="0" b="0"/>'
            f'<wp:wrapNone/>'
            f'<wp:docPr id="{sid}" name="El{sid}"/>'
            f'<wp:cNvGraphicFramePr/>'
            f'</wp:anchor>'
        )

    def _attach_anchor(self, p_el: etree._Element, anchor: etree._Element) -> None:
        """Wrap an anchor in w:r > w:drawing and append it to a paragraph element."""
        r_el = etree.SubElement(p_el, f'{{{_W}}}r')
        drawing = etree.SubElement(r_el, f'{{{_W}}}drawing')
        drawing.append(anchor)

    def _place_graphic(self, doc: Document, sid: int, x_pt: float, y_pt: float,
                       w_pt: float, h_pt: float, graphic_xml: str,
                       *, behind: bool) -> None:
        """Anchor a serialized ``a:graphic`` at page coordinates in a new paragraph."""
        anchor = self._make_anchor(sid, x_pt, y_pt, w_pt, h_pt, behind=behind)
        anchor.append(etree.fromstring(graphic_xml))
        # The host paragraph is only added once the graphic parsed successfully.
        self._attach_anchor(self._anchor_para(doc), anchor)

    # ── Shape placement ────────────────────────────────────────────────────────

    def _add_shape(self, doc: Document, item: IDExportShape) -> None:
        """Place a DrawingML shape (rect/ellipse/line) behind page content.

        Unknown ``shape_type`` values are drawn as rectangles. A fill or
        stroke colour that is not a valid six-digit hex value is logged and
        treated as absent.
        """
        # Clamp to a minimum extent of 0.5 pt in each direction.
        w_pt = max(item.width_pt, 0.5)
        h_pt = max(item.height_pt, 0.5)
        sid = self._next_id()
        w_emu = _pt_to_emu(w_pt)
        h_emu = _pt_to_emu(h_pt)

        prst = {'rect': 'rect', 'ellipse': 'ellipse', 'line': 'line'}.get(
            item.shape_type, 'rect'
        )

        fill_hex = self._normalize_hex(item.fill_hex)
        if item.fill_hex and fill_hex is None:
            log.warning('shape_color_invalid', field='fill_hex', value=item.fill_hex)
        if fill_hex:
            fill_xml = f'<a:solidFill><a:srgbClr val="{fill_hex}"/></a:solidFill>'
        else:
            fill_xml = '<a:noFill/>'

        stroke_hex = self._normalize_hex(item.stroke_hex)
        if item.stroke_hex and stroke_hex is None:
            log.warning('shape_color_invalid', field='stroke_hex', value=item.stroke_hex)
        if stroke_hex and item.stroke_weight_pt:
            sw = max(_pt_to_emu(item.stroke_weight_pt), 12700)  # min 1pt EMU
            stroke_xml = (
                f'<a:ln w="{sw}">'
                f'<a:solidFill><a:srgbClr val="{stroke_hex}"/></a:solidFill>'
                f'</a:ln>'
            )
        else:
            stroke_xml = '<a:ln><a:noFill/></a:ln>'

        graphic_xml = (
            f'<a:graphic xmlns:a="{_A}">'
            f'<a:graphicData uri="{_WPS}">'
            f'<wps:wsp xmlns:wps="{_WPS}">'
            f'<wps:cNvSpPr><a:spLocks noChangeArrowheads="1"/></wps:cNvSpPr>'
            f'<wps:spPr>'
            f'<a:xfrm><a:off x="0" y="0"/><a:ext cx="{w_emu}" cy="{h_emu}"/></a:xfrm>'
            f'<a:prstGeom prst="{prst}"><a:avLst/></a:prstGeom>'
            f'{fill_xml}{stroke_xml}'
            f'</wps:spPr>'
            f'<wps:bodyPr/>'
            f'</wps:wsp>'
            f'</a:graphicData>'
            f'</a:graphic>'
        )
        self._place_graphic(doc, sid, item.x_pt, item.y_pt, w_pt, h_pt,
                            graphic_xml, behind=True)

    # ── Text box placement ─────────────────────────────────────────────────────

    def _add_text_box(self, doc: Document, frame: IDExportTextFrame) -> None:
        """Place a text frame as an anchored WPS text box at exact page coordinates.

        Frames without paragraphs are skipped. The box has no fill, no
        outline and zero internal insets.
        """
        if not frame.paragraphs:
            return

        # Clamp to a minimum extent of 1 pt in each direction.
        w_pt = max(frame.width_pt, 1.0)
        h_pt = max(frame.height_pt, 1.0)
        sid = self._next_id()
        w_emu = _pt_to_emu(w_pt)
        h_emu = _pt_to_emu(h_pt)

        # Build paragraph elements via python-docx API, then transplant into txbxContent
        para_elements = self._build_txbx_paragraphs(frame, doc)
        paras_xml = ''.join(
            etree.tostring(p, encoding='unicode') for p in para_elements
        )

        graphic_xml = (
            f'<a:graphic xmlns:a="{_A}">'
            f'<a:graphicData uri="{_WPS}">'
            f'<wps:wsp xmlns:wps="{_WPS}">'
            f'<wps:cNvSpPr txBx="1"><a:spLocks noChangeArrowheads="1"/></wps:cNvSpPr>'
            f'<wps:spPr>'
            f'<a:xfrm><a:off x="0" y="0"/><a:ext cx="{w_emu}" cy="{h_emu}"/></a:xfrm>'
            f'<a:prstGeom prst="rect"><a:avLst/></a:prstGeom>'
            f'<a:noFill/><a:ln><a:noFill/></a:ln>'
            f'</wps:spPr>'
            f'<wps:txbx>'
            f'<w:txbxContent xmlns:w="{_W}">{paras_xml}</w:txbxContent>'
            f'</wps:txbx>'
            f'<wps:bodyPr rot="0" vert="horz" wrap="square" '
            f'lIns="0" rIns="0" tIns="0" bIns="0" anchor="t" anchorCtr="0"/>'
            f'</wps:wsp>'
            f'</a:graphicData>'
            f'</a:graphic>'
        )
        self._place_graphic(doc, sid, frame.x_pt, frame.y_pt, w_pt, h_pt,
                            graphic_xml, behind=False)

    def _build_txbx_paragraphs(self, frame: IDExportTextFrame,
                               doc: Document) -> list[etree._Element]:
        """Build w:p XML elements for a text frame using python-docx's style API.

        Paragraphs are created normally via python-docx (so styles are applied
        correctly), then their XML elements are detached from the document
        body and returned for insertion into the ``w:txbxContent`` node.

        A paragraph with no runs and only whitespace (or no) text becomes a
        bare, unstyled ``w:p``. Runs with empty text are skipped.
        """
        result = []
        for para_data in frame.paragraphs:
            # Tolerate a missing text value instead of failing on .strip().
            text = para_data.text or ''
            if not text.strip() and not para_data.runs:
                result.append(etree.Element(f'{{{_W}}}p'))
                continue

            temp = doc.add_paragraph()
            StyleFactory.apply_paragraph_style(temp, para_data.style, doc)

            if para_data.runs:
                for run_data in para_data.runs:
                    if not run_data.text:
                        continue
                    run = temp.add_run(run_data.text)
                    StyleFactory.apply_run_formatting(run, run_data)
            else:
                temp.add_run(text)

            # Extract element from doc body and hand it to the caller
            p_el = temp._p
            p_el.getparent().remove(p_el)
            result.append(p_el)
        return result

    # ── Image placement ────────────────────────────────────────────────────────

    def _add_image_anchor(self, doc: Document, frame: IDExportImageFrame) -> None:
        """Place an image as an anchored DrawingML picture at exact page coordinates.

        Frames without image data are skipped. An image that cannot be
        base64-decoded or registered with the document is logged and skipped,
        so one bad image does not abort the whole export.
        """
        if not frame.image_data_b64:
            return

        try:
            img_bytes = base64.b64decode(frame.image_data_b64)
        except (ValueError, TypeError) as e:
            # binascii.Error (bad padding/characters) is a ValueError subclass;
            # TypeError covers payloads that are neither str nor bytes-like.
            log.warning('image_decode_failed', frame_id=frame.id, error=str(e))
            return

        try:
            r_id, _image = doc.part.get_or_add_image(io.BytesIO(img_bytes))
        except Exception as e:
            # Deliberately broad: this is a best-effort boundary around
            # python-docx's image handling for arbitrary decoded bytes.
            log.warning('image_register_failed', frame_id=frame.id, error=str(e))
            return

        # Clamp to a minimum extent of 1 pt in each direction.
        w_pt = max(frame.width_pt, 1.0)
        h_pt = max(frame.height_pt, 1.0)
        # Allocate the id only once the image is known to be usable, so
        # skipped images leave no gaps in the id sequence.
        sid = self._next_id()
        w_emu = _pt_to_emu(w_pt)
        h_emu = _pt_to_emu(h_pt)

        graphic_xml = (
            f'<a:graphic xmlns:a="{_A}">'
            f'<a:graphicData uri="{_PIC}">'
            f'<pic:pic xmlns:pic="{_PIC}">'
            f'<pic:nvPicPr>'
            f'<pic:cNvPr id="{sid}" name="Img{sid}"/>'
            f'<pic:cNvPicPr/>'
            f'</pic:nvPicPr>'
            f'<pic:blipFill>'
            f'<a:blip r:embed="{r_id}" xmlns:r="{_R}"/>'
            f'<a:stretch><a:fillRect/></a:stretch>'
            f'</pic:blipFill>'
            f'<pic:spPr>'
            f'<a:xfrm><a:off x="0" y="0"/><a:ext cx="{w_emu}" cy="{h_emu}"/></a:xfrm>'
            f'<a:prstGeom prst="rect"><a:avLst/></a:prstGeom>'
            f'</pic:spPr>'
            f'</pic:pic>'
            f'</a:graphicData>'
            f'</a:graphic>'
        )
        self._place_graphic(doc, sid, frame.x_pt, frame.y_pt, w_pt, h_pt,
                            graphic_xml, behind=False)