code icon Code

Validate Office XML

Validate Office document XML files against XSD schemas and tracked changes

Source Code

#!/usr/bin/env python3
"""
Command line tool to validate Office document XML files against XSD schemas and tracked changes.

Usage:
    python validate.py <dir> --original <original_file> [-v]
"""

# Descriptor for this script: identifier, display name, summary, language,
# extra packages and the command-line arguments it advertises.
# NOTE(review): presumably read by the hosting tool registry -- confirm the
# expected schema there. The optional -v/--verbose flag accepted by the CLI
# parser is not listed under "args".
metadata = {
    "id": "code:document.ooxml.validate",
    "name": "Validate Office XML",
    "description": "Validate Office document XML files against XSD schemas and tracked changes",
    "language": "python",
    "packages": [],
    "args": [
        {
            "name": "unpacked_dir",
            "type": "string",
            "description": "Path to unpacked Office document directory",
            "position": 0,
        },
        {
            "name": "--original",
            "type": "string",
            "description": "Path to original file (.docx/.pptx/.xlsx)",
            "position": 1,
        },
    ],
}

import argparse
import sys
from pathlib import Path

# Validation module bullets (resolved to sibling files in scripts/ at runtime)
# code:document.validation.init, code:document.validation.base,
# code:document.validation.docx, code:document.validation.pptx, code:document.validation.redlining
from validation import DOCXSchemaValidator, PPTXSchemaValidator, RedliningValidator


def main():
    """Validate an unpacked Office document and exit with a status code.

    Parses the command line (``unpacked_dir``, ``--original`` and the optional
    ``-v/--verbose``), checks the supplied paths, selects the validator
    classes for the original file's extension and runs every one of them.

    Exit status:
        0 -- every validator's ``validate()`` returned a truthy value.
        1 -- a path check failed, the file type has no validators, or at
             least one validator reported failure.
        (argparse itself exits with status 2 on malformed arguments.)

    Raises:
        SystemExit: always; the process result is carried by the exit status.
    """
    parser = argparse.ArgumentParser(description="Validate Office document XML files")
    parser.add_argument(
        "unpacked_dir",
        help="Path to unpacked Office document directory",
    )
    parser.add_argument(
        "--original",
        required=True,
        help="Path to original file (.docx/.pptx/.xlsx)",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Enable verbose output",
    )
    args = parser.parse_args()

    unpacked_dir = Path(args.unpacked_dir)
    original_file = Path(args.original)
    file_extension = original_file.suffix.lower()

    # Explicit checks rather than ``assert``: assertions are removed under
    # ``python -O`` and would otherwise surface as tracebacks.
    # ``sys.exit(<str>)`` writes the message to stderr and exits with status 1.
    if not unpacked_dir.is_dir():
        sys.exit(f"Error: {unpacked_dir} is not a directory")
    if not original_file.is_file():
        sys.exit(f"Error: {original_file} is not a file")
    if file_extension not in (".docx", ".pptx", ".xlsx"):
        sys.exit(f"Error: {original_file} must be a .docx, .pptx, or .xlsx file")

    # Pick the validator classes for this document type.
    if file_extension == ".docx":
        validators = [DOCXSchemaValidator, RedliningValidator]
    elif file_extension == ".pptx":
        validators = [PPTXSchemaValidator]
    else:
        # .xlsx passes the extension check above but has no validators here.
        # The message stays on stdout, as it was before.
        print(f"Error: Validation not supported for file type {file_extension}")
        sys.exit(1)

    # Run every validator even after a failure so all problems are reported.
    success = True
    for validator_cls in validators:
        validator = validator_cls(unpacked_dir, original_file, verbose=args.verbose)
        if not validator.validate():
            success = False

    if success:
        print("All validations PASSED!")

    sys.exit(0 if success else 1)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()