Parse Word Document
Extract text and structure from Word documents
Source Code
import fs from "fs";
import path from "path";
import mammoth from "mammoth";
// Positional CLI arguments: <inputPath> <outputPath> [format].
// `format` falls back to "text" when the third argument is not supplied.
const cliArgs = process.argv.slice(2);
const inputPath = cliArgs[0];
const outputPath = cliArgs[1];
const format = cliArgs[2] === undefined ? "text" : cliArgs[2];

// Both paths are mandatory; print usage and exit non-zero if either is missing.
if (!(inputPath && outputPath)) {
  console.error("Usage: inputPath outputPath [format]");
  process.exit(1);
}
/**
 * Convert an HTML fragment (as returned by mammoth's `convertToHtml`) into
 * basic Markdown using regex substitutions.
 *
 * This is a best-effort conversion: headings, paragraphs, bold, italics,
 * list items and line breaks are translated; every other tag is stripped and
 * its text content kept. Ordered-list items are emitted as "- " bullets, and
 * HTML entities are left as-is.
 *
 * @param {string} html - HTML fragment to convert.
 * @returns {string} Markdown text, trimmed, with runs of three or more
 *   newlines collapsed to two.
 */
function htmlToMarkdown(html) {
  return html
    // `\b` after every tag name keeps short names from matching longer tags
    // that merely start with them (`<b` vs `<br>`, `<i` vs `<img>`,
    // `<p` vs `<pre>`, `<li` vs `<link>`, `<em` vs `<embed>`).
    //
    // All six heading levels are handled so that h5/h6 text is not stripped
    // to bare text that runs straight into the following paragraph.
    .replace(
      /<h([1-6])\b[^>]*>(.*?)<\/h\1>/gi,
      (_match, level, text) => `${"#".repeat(Number(level))} ${text}\n\n`
    )
    .replace(/<p\b[^>]*>(.*?)<\/p>/gi, "$1\n\n")
    .replace(/<strong\b[^>]*>(.*?)<\/strong>/gi, "**$1**")
    .replace(/<b\b[^>]*>(.*?)<\/b>/gi, "**$1**")
    .replace(/<em\b[^>]*>(.*?)<\/em>/gi, "*$1*")
    .replace(/<i\b[^>]*>(.*?)<\/i>/gi, "*$1*")
    .replace(/<li\b[^>]*>(.*?)<\/li>/gi, "- $1\n")
    .replace(/<ul\b[^>]*>|<\/ul>/gi, "\n")
    .replace(/<ol\b[^>]*>|<\/ol>/gi, "\n")
    .replace(/<br\s*\/?>/gi, "\n")
    .replace(/<[^>]+>/g, "") // Remove remaining HTML tags
    .replace(/\n{3,}/g, "\n\n") // Normalize multiple newlines
    .trim();
}

// Main flow: convert the document in the requested format, report converter
// messages, write the output file, then print a human-readable summary
// followed by a JSON summary object. Any thrown error is reported on stderr
// and the process exits with status 1.
try {
  console.log(`Parsing Word document: ${inputPath}...`);

  let result;
  let content;
  const options = {};

  if (format === "text") {
    result = await mammoth.extractRawText({ path: inputPath });
    content = result.value;
  } else if (format === "html") {
    result = await mammoth.convertToHtml({ path: inputPath }, options);
    content = result.value;
  } else if (format === "markdown") {
    // Mammoth converts to HTML, so we do a basic HTML-to-markdown conversion
    result = await mammoth.convertToHtml({ path: inputPath }, options);
    content = htmlToMarkdown(result.value);
  } else {
    console.error(`Unknown format: ${format}. Use: text, html, markdown`);
    process.exit(1);
  }

  // Report any messages/warnings from mammoth (only the first three are listed)
  if (result.messages && result.messages.length > 0) {
    console.log(` Warnings: ${result.messages.length}`);
    for (const msg of result.messages.slice(0, 3)) {
      console.log(` - ${msg.type}: ${msg.message}`);
    }
  }

  // Ensure output directory exists
  const dir = path.dirname(outputPath);
  if (dir && dir !== ".") {
    fs.mkdirSync(dir, { recursive: true });
  }

  // Every supported format is written the same way: as a single string.
  fs.writeFileSync(outputPath, content);

  const charCount = content.length;
  const wordCount = content.split(/\s+/).filter((w) => w.length > 0).length;

  console.log(`\n✓ Parsed Word document`);
  console.log(` Format: ${format}`);
  console.log(` Characters: ${charCount}`);
  console.log(` Words: ${wordCount}`);
  console.log(` Written to: ${outputPath}`);

  // Machine-readable summary, printed as the last line of stdout.
  console.log(
    JSON.stringify({
      success: true,
      inputPath,
      outputPath,
      format,
      charCount,
      wordCount,
    })
  );
} catch (error) {
  console.error("Error:", error.message);
  process.exit(1);
}