Parse PDF
Extract text and metadata from PDF files
Source Code
import fs from "fs";
import path from "path";
import pdfParse from "pdf-parse";
// Positional CLI arguments: <inputPath> <outputPath> [mode].
// The first two argv entries (the node binary and the script path) are skipped;
// `mode` falls back to "all" when it is not supplied.
const [, , inputPath, outputPath, mode = "all"] = process.argv;

// Both paths are mandatory: print the usage line and abort if either is missing.
if (!(inputPath && outputPath)) {
  console.error("Usage: inputPath outputPath [mode]");
  process.exit(1);
}
// Main flow: read the PDF at `inputPath`, parse it with pdf-parse, write the
// fields selected by `mode` to `outputPath` as pretty-printed JSON, then print
// a human-readable summary followed by a one-line JSON status record.
// Any failure is reported on stderr and the process exits with status 1.
try {
  console.log(`Reading PDF: ${inputPath}...`);
  const dataBuffer = fs.readFileSync(inputPath);
  const data = await pdfParse(dataBuffer);
  const { text, numpages: pages, info, metadata } = data;

  // Select which parsed fields end up in the output file.
  let result;
  if (mode === "text") {
    result = { text };
  } else if (mode === "metadata") {
    result = { pages, info, metadata };
  } else {
    // "all" is also the fallback for unrecognized modes; warn so that a typo
    // (e.g. "txt") does not silently produce the full output.
    if (mode !== "all") {
      console.warn(`Warning: unknown mode "${mode}", falling back to "all"`);
    }
    result = { text, pages, info, metadata };
  }

  // Ensure output directory exists ("." means the current directory, which
  // needs no creation).
  const dir = path.dirname(outputPath);
  if (dir && dir !== ".") {
    fs.mkdirSync(dir, { recursive: true });
  }
  fs.writeFileSync(outputPath, JSON.stringify(result, null, 2));

  // Human-readable summary.
  console.log(`\n✓ Extracted PDF content`);
  console.log(` Pages: ${pages}`);
  console.log(` Text length: ${text.length} characters`);
  console.log(` Mode: ${mode}`);
  console.log(` Written to: ${outputPath}`);

  // One-line JSON status record on stdout.
  console.log(
    JSON.stringify({
      success: true,
      inputPath,
      outputPath,
      pages,
      textLength: text.length,
      mode,
    })
  );
} catch (error) {
  // Non-Error throwables have no `.message`; fall back to their string form
  // so the report never reads "Error: undefined".
  const message = error instanceof Error ? error.message : String(error);
  console.error("Error:", message);
  process.exit(1);
}