code icon Code

Unpack Office File

Unpack and format XML contents of Office files (.docx, .pptx, .xlsx)

Source Code

#!/usr/bin/env python3
"""Unpack and format XML contents of Office files (.docx, .pptx, .xlsx)"""

# Descriptor for this script: identifier, display name, description, language,
# required packages, and the two positional command-line arguments.
# NOTE(review): presumably read by whatever tool hosts/runs this script — confirm.
metadata = {
    "id": "code:document.ooxml.unpack",
    "name": "Unpack Office File",
    "description": "Unpack and format XML contents of Office files (.docx, .pptx, .xlsx)",
    "language": "python",
    "packages": ["defusedxml"],
    "args": [
        {
            "name": "office_file",
            "type": "string",
            "description": "Path to the Office file to unpack",
            "position": 0,
        },
        {
            "name": "output_dir",
            "type": "string",
            "description": "Directory to extract contents to",
            "position": 1,
        },
    ],
}

import random
import sys
import defusedxml.minidom
import zipfile
from pathlib import Path

# Get command line arguments.
# Validate explicitly instead of with `assert`: assertions are stripped under
# `python -O`, which would turn a usage error into an IndexError or silently
# ignore extra arguments.
if len(sys.argv) != 3:
    sys.exit("Usage: python unpack.py <office_file> <output_dir>")
input_file, output_dir = sys.argv[1], sys.argv[2]

# Extract the archive into the output directory (created if missing).
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
# The context manager closes the archive handle even if extraction fails.
with zipfile.ZipFile(input_file) as archive:
    archive.extractall(output_path)

# Pretty print all XML files (*.xml and *.rels) found under the output directory.
xml_files = list(output_path.rglob("*.xml")) + list(output_path.rglob("*.rels"))
for xml_file in xml_files:
    # rglob matches on name only; skip directories that happen to match.
    if not xml_file.is_file():
        continue
    # Parse the raw bytes so the XML parser detects the encoding from the
    # BOM / XML declaration itself, rather than forcing a UTF-8 decode that
    # fails on UTF-16 encoded parts.
    dom = defusedxml.minidom.parseString(xml_file.read_bytes())
    # Rewrite the file in place, indented by two spaces and ASCII-encoded.
    xml_file.write_bytes(dom.toprettyxml(indent="  ", encoding="ascii"))

# For .docx files, suggest an RSID for tracked changes.
# Compare case-insensitively so names such as "REPORT.DOCX" are recognised too.
if input_file.lower().endswith(".docx"):
    # Eight random uppercase hexadecimal digits.
    suggested_rsid = "".join(random.choices("0123456789ABCDEF", k=8))
    print(f"Suggested RSID for edit session: {suggested_rsid}")