code icon Code

NLP Extract

Extract people, places, dates, and topics from text

Source Code

import fs from "fs";
import path from "path";
import nlp from "compromise";

// Positional CLI arguments: everything after the node binary and script path.
const cliArgs = process.argv.slice(2);
const [inputPath, textField, extractTypes, outputPath] = cliArgs;

// All four arguments are mandatory; a missing or empty one aborts with usage help.
const requiredArgs = [inputPath, textField, extractTypes, outputPath];
if (requiredArgs.some((arg) => !arg)) {
  console.error("Usage: inputPath textField extractTypes outputPath");
  process.exit(1);
}

/**
 * Resolve a dot-separated property path (e.g. "a.b.c") against a value.
 *
 * @param {*} obj - Root value to start the lookup from.
 * @param {string} fieldPath - Property names joined by ".".
 * @returns {*} The value found at the path, or undefined as soon as a
 *   null/undefined value is reached with path segments still to resolve.
 */
function getField(obj, fieldPath) {
  return fieldPath
    .split(".")
    .reduce(
      // Once the walk hits null/undefined, every remaining step yields undefined.
      (current, key) => (current == null ? undefined : current[key]),
      obj
    );
}

// Main pipeline: read the input JSON, run the requested extractors over each
// item's text field, write the enriched items to outputPath, then print a
// human-readable summary followed by a one-line JSON summary on stdout.
// Any thrown error is reported on stderr and the process exits with code 1.
try {
  console.log(`Extracting entities from text...`);
  console.log(`  Input: ${inputPath}`);
  console.log(`  Text field: ${textField}`);
  console.log(`  Extract: ${extractTypes}`);

  const raw = fs.readFileSync(inputPath, "utf-8");
  const data = JSON.parse(raw);

  // Accept a bare array, or an object wrapping the array under `items` or `results`.
  const items = Array.isArray(data)
    ? data
    : (data && (data.items || data.results)) || [];

  // JSON `null` is rejected here with the intended message rather than failing
  // with an unrelated property-access TypeError.
  if (data === null || !Array.isArray(items)) {
    console.error("Input must be a JSON array or object with array property");
    process.exit(1);
  }

  // Upper bound on the number of nouns reported as "topics" for one item.
  const maxTopics = 10;
  // Hoisted so the patterns are built once; String.prototype.match restarts a
  // global regex from index 0 on every call, so sharing them is safe.
  const emailPattern = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;
  const urlPattern = /https?:\/\/[^\s]+/g;

  // One extractor per supported type. Declaration order fixes the key order of
  // each item's `entities` object, independent of the order the user asked in.
  // NOTE(review): depending on the installed compromise version, some of these
  // selectors (e.g. dates()) may need a plugin — confirm against package.json.
  const extractors = {
    people: (doc) => doc.people().out("array"),
    places: (doc) => doc.places().out("array"),
    dates: (doc) => doc.dates().out("array"),
    organizations: (doc) => doc.organizations().out("array"),
    // Nouns stand in for topics, capped at maxTopics.
    topics: (doc) => doc.nouns().out("array").slice(0, maxTopics),
    verbs: (doc) => doc.verbs().out("array"),
    numbers: (doc) => doc.numbers().out("array"),
    money: (doc) => doc.money().out("array"),
    emails: (_doc, source) => source.match(emailPattern) || [],
    urls: (_doc, source) => source.match(urlPattern) || [],
  };
  const supportedTypes = Object.keys(extractors);

  // Normalise the comma-separated list: trim, lower-case, drop blanks (e.g. from
  // a trailing comma) and duplicates so they do not appear in stats.
  const types = [
    ...new Set(
      extractTypes
        .split(",")
        .map((t) => t.trim().toLowerCase())
        .filter(Boolean)
    ),
  ];

  // Unknown names are still counted (as 0) in stats, but the user is told why.
  const unknownTypes = types.filter((type) => !supportedTypes.includes(type));
  if (unknownTypes.length > 0) {
    console.warn(
      `  Warning: ignoring unsupported extract type(s): ${unknownTypes.join(", ")}`
    );
  }

  const requested = new Set(types);

  const results = items.map((item) => {
    const text = getField(item, textField);
    const entities = {};

    // Items with a missing or empty text field keep an empty `entities` object.
    if (text) {
      const source = String(text);
      const doc = nlp(source);

      for (const [type, extract] of Object.entries(extractors)) {
        if (requested.has(type)) {
          entities[type] = extract(doc, source);
        }
      }
    }

    return {
      ...item,
      entities,
    };
  });

  // Compute aggregate stats: every requested type starts at 0 so it is always
  // listed, then the per-item entity counts are summed in.
  const stats = {};
  for (const type of types) {
    stats[type] = 0;
  }

  for (const result of results) {
    for (const [type, values] of Object.entries(result.entities)) {
      if (Array.isArray(values)) {
        stats[type] = (stats[type] || 0) + values.length;
      }
    }
  }

  // Ensure output directory exists
  const dir = path.dirname(outputPath);
  if (dir && dir !== ".") {
    fs.mkdirSync(dir, { recursive: true });
  }

  fs.writeFileSync(outputPath, JSON.stringify(results, null, 2));

  console.log(`\n✓ Extracted entities`);
  console.log(`  Items processed: ${items.length}`);
  for (const [type, count] of Object.entries(stats)) {
    console.log(`  ${type}: ${count}`);
  }
  console.log(`  Written to: ${outputPath}`);

  // Final line: machine-readable summary of the run.
  console.log(
    JSON.stringify({
      success: true,
      inputPath,
      outputPath,
      itemCount: items.length,
      extractTypes: types,
      stats,
    })
  );
} catch (error) {
  console.error("Error:", error.message);
  process.exit(1);
}