OCR Extract
Extract text from images via OCR
Source Code
import fs from "fs";
import path from "path";
import Tesseract from "tesseract.js";
// Positional CLI arguments: <inputPath> <outputPath> [language].
// The OCR language falls back to "eng" when the third argument is omitted.
const cliArgs = process.argv.slice(2);
const inputPath = cliArgs[0];
const outputPath = cliArgs[1];
const language = cliArgs[2] === undefined ? "eng" : cliArgs[2];

// Both paths are mandatory; print the usage line and exit non-zero otherwise.
if (!(inputPath && outputPath)) {
  console.error("Usage: inputPath outputPath [language]");
  process.exit(1);
}
// Main pipeline: run OCR on the input image, write the structured result to
// outputPath as pretty-printed JSON, then print a human-readable summary
// followed by a single-line JSON summary on stdout. Any failure is reported on
// stderr and the process exits with status 1.
try {
  console.log(`Extracting text from image: ${inputPath}...`);
  console.log(` Language: ${language}`);

  const { data } = await Tesseract.recognize(inputPath, language, {
    // Only the "recognizing text" status is rendered; other logger messages
    // are ignored.
    logger: (m) => {
      if (m.status === "recognizing text") {
        const progress = Math.round(m.progress * 100);
        // "\r" returns to the start of the line so the percentage is
        // overwritten in place instead of printing one line per update.
        process.stdout.write(`\r Progress: ${progress}%`);
      }
    },
  });
  console.log(""); // New line after progress

  // Keep only the fields that are written out for each recognized line/word.
  const toEntry = ({ text, confidence, bbox }) => ({ text, confidence, bbox });

  const result = {
    text: data.text,
    confidence: data.confidence,
    lines: data.lines.map((line) => toEntry(line)),
    words: data.words.map((word) => toEntry(word)),
  };

  // Ensure output directory exists
  const dir = path.dirname(outputPath);
  if (dir && dir !== ".") {
    fs.mkdirSync(dir, { recursive: true });
  }
  fs.writeFileSync(outputPath, JSON.stringify(result, null, 2));

  const wordCount = result.words.length;
  const lineCount = result.lines.length;

  // \u2713 is CHECK MARK; written as an escape so the banner cannot be
  // corrupted again by a file-encoding mismatch.
  console.log(`\n\u2713 Extracted text via OCR`);
  console.log(` Lines: ${lineCount}`);
  console.log(` Words: ${wordCount}`);
  console.log(` Confidence: ${data.confidence.toFixed(1)}%`);
  console.log(` Written to: ${outputPath}`);

  // Machine-readable summary for callers that parse stdout.
  console.log(
    JSON.stringify({
      success: true,
      inputPath,
      outputPath,
      language,
      lineCount,
      wordCount,
      confidence: data.confidence,
    })
  );
} catch (error) {
  // A rejection is not guaranteed to be an Error instance (it may be a plain
  // string); fall back to its string form so the cause is never reported as
  // "undefined".
  const message = error?.message ?? String(error);
  console.error("Error:", message);
  process.exit(1);
}