# idconvert/backend/export/scanner.py
#
# 125 lines
# 4.5 KiB
# Python

from __future__ import annotations
from typing import List
from export.fonts import classify_font, get_substitute
from models.scan import FontEntry, ScanReport, ScanWarning
class IDExportScanner:
    """Lightweight scan of idconvert_export.json — builds the pre-conversion report."""

    def scan(self, data: dict) -> ScanReport:
        """Scan a validated export dict and return a ScanReport.

        Reads the ``pages``, ``fonts_used`` and ``styles`` keys of *data*;
        each one is treated as empty when it is missing.
        """
        pages = data.get('pages', [])
        frame_count, image_count, table_count = self._count_items(pages)
        fonts = self._classify_fonts(
            data.get('fonts_used', []), data.get('styles', {})
        )
        warnings = self._collect_warnings(data)
        return ScanReport(
            pages=len(pages),
            # The report's ``stories`` field carries the text-frame count.
            stories=frame_count,
            images=image_count,
            tables=table_count,
            fonts=fonts,
            warnings=warnings,
        )

    # ── private helpers ──────────────────────────────────────────────────────

    def _count_items(self, pages: list) -> tuple[int, int, int]:
        """Count text frames, image frames and tables across all pages.

        Returns ``(frame_count, image_count, table_count)``. Items of any
        other type, or without a ``type`` key, are not counted.
        """
        frame_count = image_count = table_count = 0
        for page in pages:
            for item in page.get('items', []):
                item_type = item.get('type', '')
                if item_type == 'text_frame':
                    frame_count += 1
                elif item_type == 'image_frame':
                    image_count += 1
                elif item_type == 'table':
                    table_count += 1
        return frame_count, image_count, table_count

    def _font_contexts(self, fonts_used: list, styles: dict) -> dict:
        """Map each font name to its "used for" label, in first-seen order.

        Fonts named in *fonts_used* start out as ``'General'``. Each entry of
        ``styles['paragraph']`` that names a font then refines the label from
        keywords in the lower-cased style name. A style that only yields
        ``'General'`` never replaces a more specific label already recorded
        for the same font; between two specific labels the later style wins.
        """
        contexts = {}

        # Fonts declared in fonts_used
        for entry in fonts_used:
            name = entry.get('name', '')
            if name:
                contexts.setdefault(name, 'General')

        # Cross-reference paragraph styles for "used for" context
        for style in styles.get('paragraph', []):
            font = style.get('font')
            if not font:
                continue
            # ``or ''`` tolerates a style whose name is present but null.
            style_name = (style.get('name') or '').lower()
            if 'head' in style_name:
                ctx = 'Headings'
            elif 'caption' in style_name:
                ctx = 'Captions'
            elif 'body' in style_name or 'text' in style_name:
                ctx = 'Body text'
            else:
                ctx = 'General'
            # Do not let a generic style downgrade a specific label.
            if ctx != 'General' or font not in contexts:
                contexts[font] = ctx

        return contexts

    def _classify_fonts(self, fonts_used: list, styles: dict) -> List[FontEntry]:
        """Build one FontEntry per distinct font found in the export.

        Font names come from *fonts_used* and from the paragraph styles in
        *styles*. ``classify_font`` supplies the status and ``get_substitute``
        supplies the ``(substitute, quality)`` pair unpacked below.
        """
        result = []
        for name, ctx in self._font_contexts(fonts_used, styles).items():
            status = classify_font(name)
            substitute, quality = get_substitute(name)
            result.append(FontEntry(
                name=name,
                status=status,
                substitute=substitute,
                substitute_quality=quality,
                used_for=ctx,
            ))
        return result

    def _collect_warnings(self, data: dict) -> List[ScanWarning]:
        """Collect informational conversion warnings for the export.

        Emits, in this order:

        * one ``multi_column`` warning per text frame whose ``column_count``
          is greater than 1 (tagged with the page's ``page_number``);
        * a single ``shapes_excluded`` warning if any ``shape`` item exists;
        * a single ``page_number`` warning if any ``page_number`` item exists.
        """
        warnings = []
        has_shapes = False
        has_page_numbers = False

        # One pass gathers everything; the document-wide warnings are
        # appended afterwards so they follow the per-frame ones.
        for page in data.get('pages', []):
            for item in page.get('items', []):
                item_type = item.get('type')
                if item_type == 'text_frame':
                    # ``or 1`` treats a null column_count as single-column.
                    if (item.get('column_count') or 1) > 1:
                        warnings.append(ScanWarning(
                            type='multi_column',
                            severity='info',
                            page=page.get('page_number'),
                            message='Multi-column text frame will flow as single column in Word',
                        ))
                elif item_type == 'shape':
                    has_shapes = True
                elif item_type == 'page_number':
                    has_page_numbers = True

        # Shapes are not converted; report once for the whole document.
        if has_shapes:
            warnings.append(ScanWarning(
                type='shapes_excluded',
                severity='info',
                page=None,
                message='Decorative shapes and rules are not converted to Word',
            ))

        # Only one page-number warning is needed however many items exist.
        if has_page_numbers:
            warnings.append(ScanWarning(
                type='page_number',
                severity='info',
                page=None,
                message='Page numbers converted to native Word footer fields',
            ))

        return warnings