Code

OCR Extract

Extract text from images via OCR

Source Code

import fs from "fs";
import path from "path";
import Tesseract from "tesseract.js";

// Positional CLI arguments: <inputPath> <outputPath> [language].
// The language code is optional and falls back to "eng" when omitted.
const cliArgs = process.argv.slice(2);
const inputPath = cliArgs[0];
const outputPath = cliArgs[1];
const language = cliArgs[2] === undefined ? "eng" : cliArgs[2];

// Both paths are mandatory; print usage and abort with a failure code otherwise.
if (!(inputPath && outputPath)) {
  const usageLines = [
    "ERROR: Missing required arguments.",
    "  Usage: inputPath outputPath [language]",
  ];
  for (const usageLine of usageLines) {
    console.error(usageLine);
  }
  process.exit(1);
}

// Main pipeline: run OCR on `inputPath`, write the structured result to
// `outputPath` as pretty-printed JSON, print a human-readable summary, and
// finish with a single-line JSON status object on stdout. Any failure is
// reported on stderr and the process exits with code 1.

// True while a "\r  Progress: N%" line is drawn but not yet terminated with a
// newline, so the failure path can finish that line before printing the error.
let progressLineOpen = false;

try {
  console.log(`Extracting text from image: ${inputPath}...`);
  console.log(`  Language: ${language}`);

  const { data } = await Tesseract.recognize(inputPath, language, {
    logger: (m) => {
      // Only the recognition phase is surfaced; other status messages are ignored.
      if (m.status === "recognizing text") {
        const progress = Math.round(m.progress * 100);
        // "\r" returns to the start of the line so each update overwrites the last.
        process.stdout.write(`\r  Progress: ${progress}%`);
        progressLineOpen = true;
      }
    },
  });

  console.log(""); // New line after progress
  progressLineOpen = false;

  // NOTE(review): newer tesseract.js releases appear not to populate
  // `data.lines` / `data.words` on the default result — confirm against the
  // installed version. Fall back to empty lists so a completed OCR run still
  // writes its text and confidence instead of dying with a TypeError.
  const ocrLines = data.lines ?? [];
  const ocrWords = data.words ?? [];

  // Keep only text, confidence and bounding box for each line and word.
  const result = {
    text: data.text,
    confidence: data.confidence,
    lines: ocrLines.map((line) => ({
      text: line.text,
      confidence: line.confidence,
      bbox: line.bbox,
    })),
    words: ocrWords.map((word) => ({
      text: word.text,
      confidence: word.confidence,
      bbox: word.bbox,
    })),
  };

  // Ensure output directory exists
  const dir = path.dirname(outputPath);
  if (dir && dir !== ".") {
    fs.mkdirSync(dir, { recursive: true });
  }

  fs.writeFileSync(outputPath, JSON.stringify(result, null, 2));

  const wordCount = result.words.length;
  const lineCount = result.lines.length;

  console.log(`\n✓ Extracted text via OCR`);
  console.log(`  Lines: ${lineCount}`);
  console.log(`  Words: ${wordCount}`);
  console.log(`  Confidence: ${data.confidence.toFixed(1)}%`);
  console.log(`  Written to: ${outputPath}`);

  // Single-line JSON status, printed last on stdout.
  console.log(
    JSON.stringify({
      success: true,
      inputPath,
      outputPath,
      language,
      lineCount,
      wordCount,
      confidence: data.confidence,
    })
  );
} catch (error) {
  // Finish a half-drawn progress line so the error starts on its own line.
  if (progressLineOpen) {
    process.stdout.write("\n");
  }

  // A rejection is not guaranteed to be an Error instance (it may be a plain
  // string), in which case `.message` would print as "undefined".
  const reason = error?.message ?? String(error);

  console.error("ERROR: Failed to extract text via OCR.");
  console.error(`  ${reason}`);
  process.exit(1);
}

Arguments

inputPath

Path to image file

outputPath

Path to write extracted text JSON

language

Language code: eng, fra, deu, spa, etc.

Default: eng

Packages

tesseract.js

Tokens

488

Used By

Data Utilities skill

code:stdlib.ocr.extract