253 lines
8.8 KiB
Python
253 lines
8.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
post-process-docx.py
|
|
====================
|
|
|
|
Wird auf das von Pandoc erzeugte DOCX angewendet, NACH `build.ps1`. Macht
|
|
drei XML-Modifikationen, die ein Stil oder die `reference.docx` nicht
|
|
abbilden koennen:
|
|
|
|
1. 3-3-Regel fuer Listen-Bullets (B3.5):
|
|
- Eine Liste ist eine Sequenz aufeinanderfolgender Absaetze mit
|
|
<w:numPr>-Eigenschaft im Body (nicht innerhalb von Tabellen-Zellen).
|
|
- Bei einer Liste mit weniger als 6 Bullets: alle Bullets bekommen
|
|
<w:keepNext/>.
|
|
- Bei einer Liste mit 6 oder mehr Bullets: die ersten 2 und die
|
|
drittletzten und vorletzten Bullets bekommen <w:keepNext/>.
|
|
Bullets in Tabellen-Zellen werden uebersprungen.
|
|
|
|
2. H2-Trennlinie (S08):
|
|
- Nach jedem H2-Absatz wird ein leerer Trenn-Absatz eingefuegt.
|
|
- Trenn-Absatz: linksbuendige Bottom-Border, schwarz (000000),
|
|
1,25 pt (sz=10), 8,6 cm Linienlaenge.
|
|
|
|
3. Bullet-Einzuege (S08):
|
|
- Pandoc erzeugt fuer alle Bullet-Listen abstractNum-Eintraege mit
|
|
festen Defaults (E1 left/hanging=480 dxa, E2 left=1200/hanging=480 dxa).
|
|
Pandoc IGNORIERT die numbering.xml-Werte der reference.docx.
|
|
- Im Post-Processing wird numbering.xml so modifiziert, dass alle
|
|
abstractNum-Eintraege die kompakteren Wunschwerte bekommen.
|
|
- Word-Konvention: "Einzug links" (im Absatz-Dialog) zeigt
|
|
(left - hanging) = Bullet-Position; "Sondereinzug Haengend" = hanging.
|
|
Daher rechnen wir: left = (gewuenschter Einzug + gewuenschter Hanging) in dxa.
|
|
|
|
Voraussetzungen: nur Python-Stdlib.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
import sys
|
|
import zipfile
|
|
from pathlib import Path
|
|
|
|
# Filesystem layout: BASE_DIR is the parent of the directory holding this
# script; the DOCX to post-process is looked up under BASE_DIR/output/.
SCRIPT_DIR = Path(__file__).resolve().parent
BASE_DIR = SCRIPT_DIR.parent
DOCX_FILE = BASE_DIR.joinpath("output", "Lebenslauf_Dr-Ing_Thomas_Langer.docx")
|
|
|
|
# Namespace URI of the WordprocessingML main schema (the "w:" prefix used in
# the DOCX XML parts this script edits).
W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
|
|
|
# Empty separator paragraph inserted after every H2 heading.
# It carries only paragraph properties: a single black bottom border
# (w:sz is in eighths of a point, so 10 = 1.25 pt), a right indent of
# 4196 dxa that shortens the rule from the right, and a tiny run font size
# (w:sz val=2 half-points) so the empty paragraph adds almost no height.
# The children of <w:pPr> are listed in the order the WordprocessingML schema
# prescribes (pBdr, spacing, ind, rPr); consumers that validate against the
# schema reject or ignore out-of-order properties.
H2_SEP_XML = (
    '<w:p>'
    '<w:pPr>'
    '<w:pBdr>'
    '<w:bottom w:val="single" w:sz="10" w:space="2" w:color="000000"/>'
    '</w:pBdr>'
    '<w:spacing w:before="0" w:after="80"/>'
    '<w:ind w:right="4196"/>'
    '<w:rPr><w:sz w:val="2"/><w:szCs w:val="2"/></w:rPr>'
    '</w:pPr>'
    '</w:p>'
)
|
|
|
|
# Matches the paragraph-style reference with style id "Heading2". Accepts
# both "<w:pStyle w:val="Heading2"/>" and the variant with whitespace before
# the closing "/>" (the slash itself is optional in the pattern).
H2_STYLE_RE = re.compile(r'<w:pStyle\s+w:val="Heading2"\s*/?>')
|
|
|
|
# Bullet indents per list level, in dxa (1 cm = 567 dxa).
# Word's paragraph dialog shows "Left indent" = (left - hanging) and
# "Special: Hanging" = hanging.
# Level 0: indent 0.25 cm + hanging 0.35 cm -> hanging=198, left=142+198=340
# Level 1: indent 0.80 cm + hanging 0.40 cm -> hanging=227, left=454+227=681
# Levels 2-8: same hanging as level 1; the left value grows by 312 dxa
# (about 0.55 cm) for every additional level.
BULLET_INDENTS = {
    0: {"left": 340, "hanging": 198},
    **{
        level: {"left": 681 + 312 * (level - 1), "hanging": 227}
        for level in range(1, 9)
    },
}
|
|
|
|
def log(msg):
    """Print *msg* to stdout behind the script's tag and flush right away."""
    line = "[post-process-docx] {}".format(msg)
    print(line, flush=True)
|
|
|
|
def is_bullet_paragraph(p_xml):
    """Return True if the paragraph XML contains the start of a ``<w:numPr`` element."""
    return p_xml.find("<w:numPr") >= 0
|
|
|
|
def is_h2_paragraph(p_xml):
    """Return True if the paragraph XML references the ``Heading2`` paragraph style."""
    style_ref = H2_STYLE_RE.search(p_xml)
    return style_ref is not None
|
|
|
|
# Serialized keep-with-next flag and the patterns used to place it.
_KEEP_NEXT = "<w:keepNext/>"
# Opening or self-closing <w:pPr> tag ("<w:pPr>", "<w:pPr/>", "<w:pPr />").
_PPR_TAG_RE = re.compile(r"<w:pPr\s*(?P<empty>/)?>")
# A self-closing <w:pStyle .../> (with optional leading whitespace).
_PSTYLE_RE = re.compile(r"\s*<w:pStyle\b[^>]*/>")
# Opening tag of a paragraph, with or without attributes; never <w:pPr> and
# never a self-closing <w:p/>.
_P_OPEN_RE = re.compile(r"<w:p(?=[\s>])[^>]*(?<!/)>")


def has_keep_next(p_xml):
    """Return True if the paragraph XML already contains a ``<w:keepNext`` element."""
    return "<w:keepNext" in p_xml


def add_keep_next(p_xml):
    """Return *p_xml* with ``<w:keepNext/>`` set in its paragraph properties.

    The input is the serialized XML of one ``<w:p>`` paragraph. The result is
    unchanged when a keepNext element is already present, or when no
    paragraph opening tag can be found to attach properties to.

    Placement follows the WordprocessingML schema order:
    * inside an existing ``<w:pPr>``, keepNext goes directly after a leading
      ``<w:pStyle .../>`` (pStyle must stay the first child), otherwise
      directly after the ``<w:pPr>`` tag;
    * an empty ``<w:pPr/>`` (with or without a space before the slash) is
      expanded to ``<w:pPr><w:keepNext/></w:pPr>``;
    * without any ``<w:pPr>``, a new one is inserted as the first child of
      the paragraph, right after its opening tag, also when that tag
      carries attributes.
    """
    if has_keep_next(p_xml):
        return p_xml

    ppr = _PPR_TAG_RE.search(p_xml)
    if ppr is not None:
        if ppr.group("empty"):
            expanded = "<w:pPr>" + _KEEP_NEXT + "</w:pPr>"
            return p_xml[:ppr.start()] + expanded + p_xml[ppr.end():]
        insert_at = ppr.end()
        # Keep <w:pStyle> first: the schema lists it before keepNext.
        style = _PSTYLE_RE.match(p_xml, insert_at)
        if style is not None:
            insert_at = style.end()
        return p_xml[:insert_at] + _KEEP_NEXT + p_xml[insert_at:]

    p_open = _P_OPEN_RE.search(p_xml)
    if p_open is None:
        return p_xml
    # <w:pPr> has to be the first child of the paragraph, before any run.
    new_ppr = "<w:pPr>" + _KEEP_NEXT + "</w:pPr>"
    return p_xml[:p_open.end()] + new_ppr + p_xml[p_open.end():]
|
|
|
|
# One serialized paragraph: either a self-closing "<w:p .../>" or an opening
# "<w:p ...>" up to the next "</w:p>". The self-closing form is matched on
# its own so that an empty paragraph is not glued to the paragraph after it.
# "\b" keeps "<w:pPr" and similar element names from matching.
P_RE = re.compile(r"<w:p\b[^>]*?(?:/>|>.*?</w:p>)", re.DOTALL)
# Literal table delimiters used to track whether a paragraph sits in a table.
TBL_OPEN = "<w:tbl>"
TBL_CLOSE = "</w:tbl>"
|
|
|
|
def process_document_xml(xml):
    """Apply the keep-with-next list rule and the H2 separators to document.xml.

    The XML text is scanned for three kinds of tokens: table openings, table
    closings and paragraphs. Everything between tokens is copied verbatim.

    * Consecutive body paragraphs with ``<w:numPr`` form one list. For lists
      with fewer than 6 bullets every bullet gets keepNext; for longer lists
      only the first two and the third-last and second-last bullets do.
    * Paragraphs inside tables are passed through untouched; bullets among
      them are only counted.
    * After every body paragraph with the Heading2 style the separator
      paragraph ``H2_SEP_XML`` is inserted, unless it is already there.

    Args:
        xml: Content of ``word/document.xml`` as a string.

    Returns:
        A tuple ``(new_xml, stats)`` where ``stats`` is a dict with the keys
        ``lists``, ``bullets_in_lists``, ``bullets_keepnext``,
        ``skipped_in_tables``, ``h2_headings`` and ``separators_added``.
    """
    out = []
    bullet_run = []  # (index into out, paragraph xml) of the current list
    table_depth = 0
    stats = {"lists": 0, "bullets_in_lists": 0, "bullets_keepnext": 0,
             "skipped_in_tables": 0, "h2_headings": 0, "separators_added": 0}

    def flush_run():
        """Close the current list and set keepNext on the selected bullets."""
        if not bullet_run:
            return
        n = len(bullet_run)
        stats["lists"] += 1
        stats["bullets_in_lists"] += n
        if n < 6:
            indices_keep = list(range(n))
        else:
            # keepNext on bullets 0,1 ties the first three together;
            # keepNext on n-3,n-2 ties the last three together.
            indices_keep = [0, 1, n - 3, n - 2]
        for k in indices_keep:
            idx, p_xml = bullet_run[k]
            new_xml = add_keep_next(p_xml)
            if new_xml != p_xml:
                out[idx] = new_xml
                stats["bullets_keepnext"] += 1
        bullet_run.clear()

    token_re = re.compile(
        r"(?P<tblopen>" + re.escape(TBL_OPEN) + r")"
        r"|(?P<tblclose>" + re.escape(TBL_CLOSE) + r")"
        # A self-closing <w:p/> is a complete paragraph of its own; without
        # the "/>" alternative it would be read as an opening tag and merged
        # with the paragraph that follows it.
        r"|(?P<para><w:p\b[^>]*?(?:/>|>.*?</w:p>))",
        re.DOTALL,
    )
    last_end = 0
    for m in token_re.finditer(xml):
        if m.start() > last_end:
            out.append(xml[last_end:m.start()])
        last_end = m.end()

        if m.group("tblopen"):
            flush_run()
            table_depth += 1
            out.append(m.group())
        elif m.group("tblclose"):
            flush_run()
            table_depth -= 1
            out.append(m.group())
        else:
            p_xml = m.group("para")
            out.append(p_xml)
            if table_depth > 0:
                if is_bullet_paragraph(p_xml):
                    stats["skipped_in_tables"] += 1
                continue
            if is_bullet_paragraph(p_xml):
                bullet_run.append((len(out) - 1, p_xml))
                continue
            flush_run()
            if is_h2_paragraph(p_xml):
                stats["h2_headings"] += 1
                # The DOCX is rewritten in place, so the script may see its
                # own output again; do not stack a second separator behind a
                # heading that already has one.
                if not xml.startswith(H2_SEP_XML, last_end):
                    out.append(H2_SEP_XML)
                    stats["separators_added"] += 1

    if last_end < len(xml):
        out.append(xml[last_end:])
    flush_run()
    return "".join(out), stats
|
|
|
|
def process_numbering_xml(xml):
    """Replace the indents of every level in all abstractNum entries.

    For each ``<w:lvl>`` whose ``w:ilvl`` has an entry in ``BULLET_INDENTS``,
    ``w:left`` and ``w:hanging`` of its ``<w:ind>`` are set to the configured
    values and a ``w:firstLine`` attribute is removed. Missing ``<w:pPr>`` /
    ``<w:ind>`` elements are created.

    Namespace prefixes declared in the input are kept: they are registered
    with ElementTree before serializing, and declarations that ElementTree
    would drop because no element or attribute name uses them are re-added
    to the root element. Attributes such as ``mc:Ignorable`` refer to
    prefixes by name, so those declarations must survive.

    Args:
        xml: Content of ``word/numbering.xml`` as a string.

    Returns:
        A tuple ``(new_xml, stats)``; ``stats`` has the keys ``abstractNums``
        and ``lvls_modified``.
    """
    import io
    import xml.etree.ElementTree as ET
    from xml.sax.saxutils import quoteattr

    W = "{%s}" % W_NS
    ET.register_namespace("w", W_NS)

    # Collect the prefix -> URI declarations of the input (first declaration
    # of a prefix wins) and make ElementTree serialize with the same prefixes.
    source_ns = {}
    for _event, (prefix, uri) in ET.iterparse(io.StringIO(xml),
                                              events=("start-ns",)):
        if not prefix or prefix in source_ns:
            continue
        source_ns[prefix] = uri
        try:
            ET.register_namespace(prefix, uri)
        except ValueError:
            # ElementTree reserves prefixes of the form "ns<digits>"; such a
            # prefix is left to ElementTree's own numbering.
            pass

    root = ET.fromstring(xml)

    stats = {"abstractNums": 0, "lvls_modified": 0}

    for absnum in root.findall(W + "abstractNum"):
        stats["abstractNums"] += 1
        for lvl in absnum.findall(W + "lvl"):
            ilvl_str = lvl.get(W + "ilvl")
            try:
                ilvl = int(ilvl_str)
            except (TypeError, ValueError):
                continue
            target = BULLET_INDENTS.get(ilvl)
            if target is None:
                continue
            pPr = lvl.find(W + "pPr")
            if pPr is None:
                pPr = ET.Element(W + "pPr")
                # The schema lists pPr before rPr inside a level.
                rPr = lvl.find(W + "rPr")
                if rPr is None:
                    lvl.append(pPr)
                else:
                    lvl.insert(list(lvl).index(rPr), pPr)
            ind = pPr.find(W + "ind")
            if ind is None:
                ind = ET.SubElement(pPr, W + "ind")
            ind.set(W + "left", str(target["left"]))
            ind.set(W + "hanging", str(target["hanging"]))
            # firstLine and hanging are mutually exclusive ways to indent
            # the first line; only hanging is wanted here.
            if W + "firstLine" in ind.attrib:
                del ind.attrib[W + "firstLine"]
            stats["lvls_modified"] += 1

    body = ET.tostring(root, encoding="unicode")

    # ElementTree escapes ">" in attribute values, so the first ">" ends the
    # root start tag. Re-declare every input prefix that is missing there.
    root_start = body[:body.index(">")]
    declared = set(re.findall(r"\sxmlns:([^\s=]+)\s*=", root_start))
    missing = "".join(
        " xmlns:%s=%s" % (prefix, quoteattr(uri))
        for prefix, uri in source_ns.items()
        if prefix not in declared
    )
    if missing:
        name_end = re.match(r"<[^\s/>]+", body).end()
        body = body[:name_end] + missing + body[name_end:]

    XML_DECL = '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'
    new_xml = XML_DECL + body
    return new_xml, stats
|
|
|
|
def main():
    """Post-process the DOCX at ``DOCX_FILE`` in place.

    Reads every archive member into memory, rewrites ``word/document.xml``
    and, if present, ``word/numbering.xml``, writes the archive back with
    ``[Content_Types].xml`` first and the remaining members in name order,
    then logs the collected statistics.

    Returns:
        1 if ``DOCX_FILE`` does not exist, otherwise 0.
    """
    if not DOCX_FILE.exists():
        message = (f"FEHLER: {DOCX_FILE} existiert nicht. "
                   f"Erst build.ps1 laufen lassen.\n")
        sys.stderr.write(message)
        return 1

    log(f"Verarbeite: {DOCX_FILE}")

    # Load the whole archive; the file is reopened for writing further down.
    members = {}
    with zipfile.ZipFile(DOCX_FILE, "r") as archive:
        for entry in archive.namelist():
            members[entry] = archive.read(entry)

    document_text = members["word/document.xml"].decode("utf-8")
    document_text, doc_stats = process_document_xml(document_text)
    members["word/document.xml"] = document_text.encode("utf-8")

    num_stats = {"abstractNums": 0, "lvls_modified": 0}
    if "word/numbering.xml" not in members:
        log(" Hinweis: word/numbering.xml nicht im DOCX (keine Listen?)")
    else:
        numbering_text = members["word/numbering.xml"].decode("utf-8")
        numbering_text, num_stats = process_numbering_xml(numbering_text)
        members["word/numbering.xml"] = numbering_text.encode("utf-8")

    def content_types_first(entry):
        # False sorts before True, so the content-types part leads.
        return (entry != "[Content_Types].xml", entry)

    with zipfile.ZipFile(DOCX_FILE, "w", zipfile.ZIP_DEFLATED) as archive:
        for entry in sorted(members, key=content_types_first):
            archive.writestr(entry, members[entry])

    report = (
        (" Listen gefunden: ", doc_stats["lists"]),
        (" Bullets in Listen: ", doc_stats["bullets_in_lists"]),
        (" keepNext gesetzt: ", doc_stats["bullets_keepnext"]),
        (" Bullets in Tabellen uebersprungen: ", doc_stats["skipped_in_tables"]),
        (" H2-Headings gefunden: ", doc_stats["h2_headings"]),
        (" H2-Trenn-Absaetze eingefuegt: ", doc_stats["separators_added"]),
        (" numbering.xml abstractNum-Eintraege: ", num_stats["abstractNums"]),
        (" numbering.xml lvls modifiziert: ", num_stats["lvls_modified"]),
    )
    for label, value in report:
        log(f"{label}{value}")
    log("Fertig.")
    return 0
|
|
|
|
# Script entry point: the return value of main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|