#!/usr/bin/env python3
"""
post-process-docx.py
====================

Applied to the DOCX produced by Pandoc, AFTER `build.ps1` has run.

Performs three XML modifications that neither a style nor the
`reference.docx` can express:

1. 3-3 rule for list bullets (B3.5):
   - A list is a sequence of consecutive body paragraphs carrying a
     <w:numPr> property (not inside table cells).
   - List with fewer than 6 bullets: every bullet gets <w:keepNext/>.
   - List with 6 or more bullets: the first two, the third-to-last and
     the second-to-last bullets get <w:keepNext/>.
   Bullets inside table cells are skipped.

2. H2 separator line (S08):
   - An empty separator paragraph is inserted after every H2 paragraph.
   - Separator paragraph: left-aligned bottom border, black (000000),
     1.25 pt (sz=10), 8.6 cm line length.

3. Bullet indents (S08):
   - Pandoc emits abstractNum entries with fixed defaults for all bullet
     lists (L1 left/hanging=480 dxa, L2 left=1200/hanging=480 dxa) and
     IGNORES the numbering.xml values of the reference.docx.
   - numbering.xml is therefore rewritten so that all abstractNum
     entries carry the more compact target values.
   - Word convention: "Indent left" (paragraph dialog) shows
     (left - hanging) = bullet position; "Special: Hanging" = hanging.
     Hence: left = (desired indent + desired hanging) in dxa.

Requirements: Python standard library only.

NOTE(review): the copy of this file that was reviewed had all XML markup
stripped from its string literals. Literals marked "reconstructed" below
were rebuilt from the specification above and must be confirmed against
the repository version before release.
"""
from __future__ import annotations

import re
import sys
import zipfile
from pathlib import Path

SCRIPT_DIR = Path(__file__).resolve().parent
BASE_DIR = SCRIPT_DIR.parent
DOCX_FILE = BASE_DIR / "output" / "Lebenslauf_Dr-Ing_Thomas_Langer.docx"

W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"

# Length of the H2 separator line: 8.6 cm at 567 dxa per cm.
H2_SEP_LINE_DXA = 4876


def _build_h2_separator(right_indent_dxa=0):
    """Return the XML of the empty H2 separator paragraph.

    A paragraph border runs from the left to the right paragraph indent, so
    the visible line length is controlled through ``right_indent_dxa``.
    Border size is given in eighths of a point: 1.25 pt -> sz=10.
    """
    # NOTE(review): reconstructed from the spec in the module docstring.
    return (
        '<w:p>'
        '<w:pPr>'
        '<w:pBdr>'
        '<w:bottom w:val="single" w:sz="10" w:space="1" w:color="000000"/>'
        '</w:pBdr>'
        f'<w:ind w:left="0" w:right="{int(right_indent_dxa)}"/>'
        '<w:jc w:val="left"/>'
        '</w:pPr>'
        '</w:p>'
    )


# Fallback separator (no right indent, i.e. full text width). Used when the
# page geometry cannot be read from the document.
H2_SEP_XML = _build_h2_separator(0)

# NOTE(review): reconstructed. "Heading2" is the style id Pandoc writes by
# default; "berschrift2" is the id a German Word installation assigns to
# "Ueberschrift 2". Confirm which ids the reference.docx really uses.
H2_STYLE_RE = re.compile(r'<w:pStyle\s+w:val="(?:Heading2|berschrift2)"\s*/>')

# Bullet indents (1 cm = 567 dxa).
# Word shows "Indent left" = (left - hanging), "Special: Hanging" = hanging.
# L1:  indent 0.25 cm + hanging 0.35 cm -> hanging=198, left=142+198=340
# L2:  indent 0.80 cm + hanging 0.40 cm -> hanging=227, left=454+227=681
# L3+: proportional to L2 (+0.55 cm indent per level), hanging as for L2
BULLET_INDENTS = {
    0: {"left": 340, "hanging": 198},
    1: {"left": 681, "hanging": 227},
    2: {"left": 993, "hanging": 227},
    3: {"left": 1305, "hanging": 227},
    4: {"left": 1617, "hanging": 227},
    5: {"left": 1929, "hanging": 227},
    6: {"left": 2241, "hanging": 227},
    7: {"left": 2553, "hanging": 227},
    8: {"left": 2865, "hanging": 227},
}

_KEEP_NEXT_XML = "<w:keepNext/>"

# Opening <w:p> tag with optional attributes. The explicit whitespace keeps
# <w:pPr>, <w:pict>, ... from matching; the look-behind rejects the
# self-closing form <w:p .../>.
_P_OPEN = r"<w:p(?:\s[^>]*)?(?<!/)>"
_P_EMPTY = r"<w:p(?:\s[^>]*)?/>"
_P_OPEN_RE = re.compile(_P_OPEN)
_PSTYLE_AT_START_RE = re.compile(r"\s*<w:pStyle\b[^>]*/>")

_PG_SZ_RE = re.compile(r'<w:pgSz\b[^>]*?\bw:w="(\d+)"')
_PG_MAR_RE = re.compile(r"<w:pgMar\b[^>]*>")
_XMLNS_RE = re.compile(r'\bxmlns:([A-Za-z_][\w.\-]*)="([^"]*)"')


def log(msg):
    """Print *msg* with the script prefix and flush immediately."""
    print(f"[post-process-docx] {msg}", flush=True)


def is_bullet_paragraph(p_xml):
    """Return True if the paragraph XML carries a numbering property."""
    return "<w:numPr" in p_xml


def is_h2_paragraph(p_xml):
    """Return True if the paragraph XML uses the level-2 heading style."""
    return H2_STYLE_RE.search(p_xml) is not None


def add_keep_next(p_xml):
    """Return *p_xml* with <w:keepNext/> set in its paragraph properties.

    The input is returned unchanged when it already contains a keepNext
    element or when no opening <w:p> tag is found at its start. keepNext is
    placed directly after <w:pStyle> when that is the first pPr child,
    because the schema requires pStyle to precede keepNext.
    """
    if "<w:keepNext" in p_xml:
        return p_xml

    if "<w:pPr>" in p_xml:
        head, _, tail = p_xml.partition("<w:pPr>")
        style = _PSTYLE_AT_START_RE.match(tail)
        cut = style.end() if style else 0
        return head + "<w:pPr>" + tail[:cut] + _KEEP_NEXT_XML + tail[cut:]

    if "<w:pPr/>" in p_xml:
        return p_xml.replace(
            "<w:pPr/>", "<w:pPr>" + _KEEP_NEXT_XML + "</w:pPr>", 1)

    # No paragraph properties at all: create them right after the opening tag.
    opening = _P_OPEN_RE.match(p_xml)
    if opening is None:
        return p_xml
    cut = opening.end()
    return (p_xml[:cut] + "<w:pPr>" + _KEEP_NEXT_XML + "</w:pPr>"
            + p_xml[cut:])


P_RE = re.compile(_P_OPEN + r".*?</w:p>", re.DOTALL)
TBL_OPEN = "<w:tbl>"
TBL_CLOSE = "</w:tbl>"

# One token per table boundary or paragraph (empty or with content).
_TOKEN_RE = re.compile(
    r"(?P<tblopen>" + re.escape(TBL_OPEN) + r")"
    r"|(?P<tblclose>" + re.escape(TBL_CLOSE) + r")"
    r"|(?P<para>" + _P_EMPTY + r"|" + _P_OPEN + r".*?</w:p>)",
    re.DOTALL,
)


def _margin(pg_mar_tag, side):
    """Return the integer w:<side> attribute of a <w:pgMar> tag, or None."""
    found = re.search(r'\bw:' + side + r'="(\d+)"', pg_mar_tag)
    return int(found.group(1)) if found else None


def _h2_separator_for(xml):
    """Return the separator paragraph sized for the page geometry in *xml*.

    The right indent is chosen so that the border is H2_SEP_LINE_DXA long,
    based on the first <w:pgSz>/<w:pgMar> found. Falls back to H2_SEP_XML
    when the geometry is missing or the text area is not wider than the line.
    """
    page = _PG_SZ_RE.search(xml)
    margins = _PG_MAR_RE.search(xml)
    if page is None or margins is None:
        return H2_SEP_XML
    left = _margin(margins.group(), "left")
    right = _margin(margins.group(), "right")
    if left is None or right is None:
        return H2_SEP_XML
    text_width = int(page.group(1)) - left - right
    return _build_h2_separator(max(0, text_width - H2_SEP_LINE_DXA))


def _keep_next_indices(count):
    """Return the bullet positions that get keepNext under the 3-3 rule."""
    if count < 6:
        return list(range(count))
    return [0, 1, count - 3, count - 2]


def process_document_xml(xml):
    """Apply the 3-3 keepNext rule and the H2 separators to document.xml.

    Args:
        xml: content of word/document.xml as text.

    Returns:
        Tuple ``(new_xml, stats)``; ``stats`` has the keys "lists",
        "bullets_in_lists", "bullets_keepnext", "skipped_in_tables",
        "h2_headings" and "separators_added".

    Paragraphs are located with a regular expression, so paragraphs nested
    inside another paragraph (e.g. text boxes) are not supported.
    """
    out = []
    bullet_run = []  # (index into ``out``, paragraph XML) of the open list
    table_depth = 0
    separator = _h2_separator_for(xml)
    stats = {"lists": 0, "bullets_in_lists": 0, "bullets_keepnext": 0,
             "skipped_in_tables": 0, "h2_headings": 0, "separators_added": 0}

    def flush_run():
        # Close the current list and patch the selected bullets in ``out``.
        if not bullet_run:
            return
        stats["lists"] += 1
        stats["bullets_in_lists"] += len(bullet_run)
        for k in _keep_next_indices(len(bullet_run)):
            idx, p_xml = bullet_run[k]
            new_xml = add_keep_next(p_xml)
            if new_xml != p_xml:
                out[idx] = new_xml
                stats["bullets_keepnext"] += 1
        bullet_run.clear()

    last_end = 0
    for m in _TOKEN_RE.finditer(xml):
        if m.start() > last_end:
            # Pass through everything between tokens untouched.
            out.append(xml[last_end:m.start()])
        last_end = m.end()

        if m.group("tblopen"):
            flush_run()
            table_depth += 1
            out.append(m.group())
            continue
        if m.group("tblclose"):
            flush_run()
            table_depth = max(0, table_depth - 1)
            out.append(m.group())
            continue

        p_xml = m.group("para")
        out.append(p_xml)
        if table_depth > 0:
            # Paragraphs in table cells are neither list members nor headings
            # for the purpose of this script.
            if is_bullet_paragraph(p_xml):
                stats["skipped_in_tables"] += 1
            continue
        if is_bullet_paragraph(p_xml):
            bullet_run.append((len(out) - 1, p_xml))
            continue

        flush_run()
        if is_h2_paragraph(p_xml):
            stats["h2_headings"] += 1
            # Keep the operation idempotent: do not add a second separator
            # when the script is run again on its own output.
            if not xml.startswith(separator, m.end()):
                out.append(separator)
                stats["separators_added"] += 1

    if last_end < len(xml):
        out.append(xml[last_end:])
    flush_run()
    return "".join(out), stats


def process_numbering_xml(xml):
    """Replace the indents of every level in every abstractNum entry.

    Args:
        xml: content of word/numbering.xml as text.

    Returns:
        Tuple ``(new_xml, stats)``; ``stats`` has the keys "abstractNums"
        and "lvls_modified". Levels without a usable w:ilvl or without an
        entry in BULLET_INDENTS are left untouched.
    """
    import xml.etree.ElementTree as ET

    W = "{%s}" % W_NS
    ET.register_namespace("w", W_NS)
    # Keep the document's own prefixes for the namespaces it uses; otherwise
    # ElementTree serializes them as ns0, ns1, ... (ElementTree still drops
    # declarations of namespaces that are not used by any element/attribute.)
    for prefix, uri in _XMLNS_RE.findall(xml):
        try:
            ET.register_namespace(prefix, uri)
        except ValueError:
            # Reserved prefix pattern (ns<digits>); nothing to preserve.
            pass

    root = ET.fromstring(xml)
    stats = {"abstractNums": 0, "lvls_modified": 0}

    for absnum in root.findall(W + "abstractNum"):
        stats["abstractNums"] += 1
        for lvl in absnum.findall(W + "lvl"):
            try:
                ilvl = int(lvl.get(W + "ilvl"))
            except (TypeError, ValueError):
                continue
            target = BULLET_INDENTS.get(ilvl)
            if target is None:
                continue

            pPr = lvl.find(W + "pPr")
            if pPr is None:
                pPr = ET.Element(W + "pPr")
                rPr = lvl.find(W + "rPr")
                if rPr is None:
                    lvl.append(pPr)
                else:
                    # The schema orders pPr before rPr inside <w:lvl>.
                    lvl.insert(list(lvl).index(rPr), pPr)

            ind = pPr.find(W + "ind")
            if ind is None:
                ind = ET.SubElement(pPr, W + "ind")
            ind.set(W + "left", str(target["left"]))
            ind.set(W + "hanging", str(target["hanging"]))
            # firstLine and hanging are mutually exclusive.
            ind.attrib.pop(W + "firstLine", None)
            stats["lvls_modified"] += 1

    # NOTE(review): declaration reconstructed (standard OOXML part header).
    XML_DECL = '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'
    new_xml = XML_DECL + ET.tostring(root, encoding="unicode")
    return new_xml, stats


def main(docx_path=None):
    """Post-process the DOCX in place and log a summary.

    Args:
        docx_path: optional path of the DOCX to process; defaults to
            DOCX_FILE.

    Returns:
        0 on success, 1 if the file is missing, is not a ZIP archive or
        lacks word/document.xml.
    """
    target = DOCX_FILE if docx_path is None else Path(docx_path)
    if not target.exists():
        sys.stderr.write(f"FEHLER: {target} existiert nicht. "
                         f"Erst build.ps1 laufen lassen.\n")
        return 1

    log(f"Verarbeite: {target}")

    try:
        with zipfile.ZipFile(target, "r") as z:
            members = {name: z.read(name) for name in z.namelist()}
    except zipfile.BadZipFile:
        sys.stderr.write(f"FEHLER: {target} ist kein gueltiges DOCX.\n")
        return 1

    if "word/document.xml" not in members:
        sys.stderr.write(f"FEHLER: word/document.xml fehlt in {target}.\n")
        return 1

    doc_xml = members["word/document.xml"].decode("utf-8")
    new_doc_xml, doc_stats = process_document_xml(doc_xml)
    members["word/document.xml"] = new_doc_xml.encode("utf-8")

    num_stats = {"abstractNums": 0, "lvls_modified": 0}
    if "word/numbering.xml" in members:
        num_xml = members["word/numbering.xml"].decode("utf-8")
        new_num_xml, num_stats = process_numbering_xml(num_xml)
        members["word/numbering.xml"] = new_num_xml.encode("utf-8")
    else:
        log(" Hinweis: word/numbering.xml nicht im DOCX (keine Listen?)")

    # [Content_Types].xml first, everything else alphabetically.
    order = sorted(members.keys(),
                   key=lambda n: (0 if n == "[Content_Types].xml" else 1, n))

    # Write next to the target and swap atomically, so a failed write never
    # leaves a truncated DOCX behind.
    tmp_path = target.with_name(target.name + ".tmp")
    try:
        with zipfile.ZipFile(tmp_path, "w", zipfile.ZIP_DEFLATED) as z:
            for name in order:
                z.writestr(name, members[name])
        tmp_path.replace(target)
    finally:
        if tmp_path.exists():
            tmp_path.unlink()

    log(f" Listen gefunden: {doc_stats['lists']}")
    log(f" Bullets in Listen: {doc_stats['bullets_in_lists']}")
    log(f" keepNext gesetzt: {doc_stats['bullets_keepnext']}")
    log(f" Bullets in Tabellen uebersprungen: {doc_stats['skipped_in_tables']}")
    log(f" H2-Headings gefunden: {doc_stats['h2_headings']}")
    log(f" H2-Trenn-Absaetze eingefuegt: {doc_stats['separators_added']}")
    log(f" numbering.xml abstractNum-Eintraege: {num_stats['abstractNums']}")
    log(f" numbering.xml lvls modifiziert: {num_stats['lvls_modified']}")
    log("Fertig.")
    return 0


if __name__ == "__main__":
    sys.exit(main())