code icon Code

Parse Word Document

Extract text and structure from Word documents

Source Code

import fs from "fs";
import path from "path";
import mammoth from "mammoth";

// Positional CLI arguments: <inputPath> <outputPath> [format].
// `format` falls back to "text" only when the third argument is absent.
const cliArgs = process.argv.slice(2);
const inputPath = cliArgs[0];
const outputPath = cliArgs[1];
const format = cliArgs[2] === undefined ? "text" : cliArgs[2];

// Both paths are mandatory; print a usage hint and stop when either is missing.
const hasRequiredPaths = Boolean(inputPath) && Boolean(outputPath);
if (!hasRequiredPaths) {
  console.error("Usage: inputPath outputPath [format]");
  process.exit(1);
}

/** Named HTML entities recognised by `decodeHtmlEntities`, keyed by entity name. */
const HTML_ENTITIES = new Map([
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
  ["nbsp", "\u00a0"],
]);

/**
 * Decode named and numeric HTML entities in a single pass.
 *
 * A single pass matters: decoding `&amp;` separately before the others would
 * turn the literal text `&amp;lt;` into `<` instead of `&lt;`.
 *
 * @param {string} text - Text that may contain entities such as `&amp;` or `&#8217;`.
 * @returns {string} The text with known entities replaced; unknown or
 *   out-of-range entities are left untouched.
 */
function decodeHtmlEntities(text) {
  return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (entity, body) => {
    if (body[0] !== "#") {
      return HTML_ENTITIES.get(body) ?? entity;
    }
    const isHex = body[1] === "x" || body[1] === "X";
    const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
    // String.fromCodePoint throws above U+10FFFF, so keep such entities verbatim.
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
  });
}

/**
 * Basic, regex-based HTML-to-Markdown conversion.
 *
 * Handles headings (h1-h6), paragraphs, bold, italics, list items and line
 * breaks; every other tag is dropped. List items are always rendered as `- `
 * bullets, including those inside `<ol>`.
 *
 * Tag names are matched with `\b` so that `<b>` does not also match `<br />`
 * and `<i>` does not also match `<img>`; element bodies use `[\s\S]` so that
 * content containing newlines is still converted.
 *
 * @param {string} html - HTML fragment to convert.
 * @returns {string} Markdown text, trimmed, with runs of blank lines collapsed.
 */
function htmlToMarkdown(html) {
  const withoutTags = html
    .replace(
      /<h([1-6])\b[^>]*>([\s\S]*?)<\/h\1>/gi,
      (_match, level, text) => `${"#".repeat(Number(level))} ${text}\n\n`
    )
    .replace(/<p\b[^>]*>([\s\S]*?)<\/p>/gi, "$1\n\n")
    .replace(/<strong\b[^>]*>([\s\S]*?)<\/strong>/gi, "**$1**")
    .replace(/<b\b[^>]*>([\s\S]*?)<\/b>/gi, "**$1**")
    .replace(/<em\b[^>]*>([\s\S]*?)<\/em>/gi, "*$1*")
    .replace(/<i\b[^>]*>([\s\S]*?)<\/i>/gi, "*$1*")
    .replace(/<li\b[^>]*>([\s\S]*?)<\/li>/gi, "- $1\n")
    .replace(/<\/?[ou]l\b[^>]*>/gi, "\n")
    .replace(/<br\s*\/?>/gi, "\n")
    .replace(/<[^>]+>/g, ""); // Remove remaining HTML tags

  // Decode entities only after tags are stripped, so a decoded "<" or ">"
  // can never be mistaken for markup.
  return decodeHtmlEntities(withoutTags)
    .replace(/\n{3,}/g, "\n\n") // Normalize multiple newlines
    .trim();
}

// Main flow: convert the document in the requested format, write the result to
// outputPath, then print a human-readable summary followed by a JSON summary.
// Any failure is reported on stderr and the process exits with status 1.
try {
  console.log(`Parsing Word document: ${inputPath}...`);

  let result;
  let content;

  switch (format) {
    case "text":
      result = await mammoth.extractRawText({ path: inputPath });
      content = result.value;
      break;
    case "html":
      result = await mammoth.convertToHtml({ path: inputPath });
      content = result.value;
      break;
    case "markdown":
      // Mammoth converts to HTML, so we do a basic HTML-to-markdown conversion
      result = await mammoth.convertToHtml({ path: inputPath });
      content = htmlToMarkdown(result.value);
      break;
    default:
      console.error(`Unknown format: ${format}. Use: text, html, markdown`);
      process.exit(1);
  }

  // Report any messages/warnings from mammoth (only the first three are listed)
  if (result.messages && result.messages.length > 0) {
    console.log(`  Warnings: ${result.messages.length}`);
    for (const msg of result.messages.slice(0, 3)) {
      console.log(`    - ${msg.type}: ${msg.message}`);
    }
  }

  // Ensure output directory exists; with `recursive` this is a no-op when it
  // already does (including the "." case).
  fs.mkdirSync(path.dirname(outputPath), { recursive: true });

  // Every supported format produces a string, so one UTF-8 write covers all.
  fs.writeFileSync(outputPath, content, "utf8");

  const charCount = content.length;
  const wordCount = content.split(/\s+/).filter((w) => w.length > 0).length;

  // \u2713 is the check mark, written as an escape so the output does not
  // depend on how this source file's bytes are decoded.
  console.log(`\n\u2713 Parsed Word document`);
  console.log(`  Format: ${format}`);
  console.log(`  Characters: ${charCount}`);
  console.log(`  Words: ${wordCount}`);
  console.log(`  Written to: ${outputPath}`);

  console.log(
    JSON.stringify({
      success: true,
      inputPath,
      outputPath,
      format,
      charCount,
      wordCount,
    })
  );
} catch (error) {
  // A thrown value is not guaranteed to be an Error instance.
  const message = error instanceof Error ? error.message : String(error);
  console.error("Error:", message);
  process.exit(1);
}