code icon Code

PDF Fill with Annotations

Fills a PDF by adding text annotations defined in fields.json

Source Code

#!/usr/bin/env python3
"""Fills a PDF by adding text annotations defined in fields.json. See forms.md."""

metadata = {
    "id": "code:document.pdf.fill_annotations",
    "name": "PDF Fill with Annotations",
    "description": "Fills a PDF by adding text annotations defined in fields.json",
    "language": "python",
    "packages": ["pypdf"],
    "args": [
        {"name": "input_pdf", "type": "string", "description": "Path to input PDF file", "position": 0},
        {"name": "fields_json", "type": "string", "description": "JSON file with field definitions", "position": 1},
        {"name": "output_pdf", "type": "string", "description": "Output PDF file path", "position": 2}
    ]
}

import json
import sys

from pypdf import PdfReader, PdfWriter
from pypdf.annotations import FreeText


# Fills a PDF by adding text annotations defined in `fields.json`. See forms.md.


def transform_coordinates(bbox, image_width, image_height, pdf_width, pdf_height):
    """Transform bounding box from image coordinates to PDF coordinates"""
    # Image coordinates: origin at top-left, y increases downward
    # PDF coordinates: origin at bottom-left, y increases upward
    x_scale = pdf_width / image_width
    y_scale = pdf_height / image_height

    left = bbox[0] * x_scale
    right = bbox[2] * x_scale

    # Flip Y coordinates for PDF
    top = pdf_height - (bbox[1] * y_scale)
    bottom = pdf_height - (bbox[3] * y_scale)

    return left, bottom, right, top


def fill_pdf_form(input_pdf_path, fields_json_path, output_pdf_path):
    """Fill the PDF form with data from fields.json"""

    # `fields.json` format described in forms.md.
    with open(fields_json_path, "r") as f:
        fields_data = json.load(f)

    # Open the PDF
    reader = PdfReader(input_pdf_path)
    writer = PdfWriter()

    # Copy all pages to writer
    writer.append(reader)

    # Get PDF dimensions for each page
    pdf_dimensions = {}
    for i, page in enumerate(reader.pages):
        mediabox = page.mediabox
        pdf_dimensions[i + 1] = [mediabox.width, mediabox.height]

    # Process each form field
    annotations = []
    for field in fields_data["form_fields"]:
        page_num = field["page_number"]

        # Get page dimensions and transform coordinates.
        page_info = next(p for p in fields_data["pages"] if p["page_number"] == page_num)
        image_width = page_info["image_width"]
        image_height = page_info["image_height"]
        pdf_width, pdf_height = pdf_dimensions[page_num]

        transformed_entry_box = transform_coordinates(
            field["entry_bounding_box"],
            image_width, image_height,
            pdf_width, pdf_height
        )

        # Skip empty fields
        if "entry_text" not in field or "text" not in field["entry_text"]:
            continue
        entry_text = field["entry_text"]
        text = entry_text["text"]
        if not text:
            continue

        font_name = entry_text.get("font", "Arial")
        font_size = str(entry_text.get("font_size", 14)) + "pt"
        font_color = entry_text.get("font_color", "000000")

        # Font size/color seems to not work reliably across viewers:
        # https://github.com/py-pdf/pypdf/issues/2084
        annotation = FreeText(
            text=text,
            rect=transformed_entry_box,
            font=font_name,
            font_size=font_size,
            font_color=font_color,
            border_color=None,
            background_color=None,
        )
        annotations.append(annotation)
        # page_number is 0-based for pypdf
        writer.add_annotation(page_number=page_num - 1, annotation=annotation)

    # Save the filled PDF
    with open(output_pdf_path, "wb") as output:
        writer.write(output)

    print(f"Successfully filled PDF form and saved to {output_pdf_path}")
    print(f"Added {len(annotations)} text annotations")


if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: fill_pdf_form_with_annotations.py [input pdf] [fields.json] [output pdf]")
        sys.exit(1)
    input_pdf = sys.argv[1]
    fields_json = sys.argv[2]
    output_pdf = sys.argv[3]

    fill_pdf_form(input_pdf, fields_json, output_pdf)