NLP Extract
Extract people, places, dates, topics from text
Source Code
import fs from "fs";
import path from "path";
import nlp from "compromise";
// Positional CLI arguments, in order: input JSON path, dot-notation path of the
// text field, comma-separated entity types, and output JSON path.
const cliArgs = process.argv.slice(2);
const [inputPath, textField, extractTypes, outputPath] = cliArgs;

// All four arguments are mandatory; print the usage line and abort if any is
// missing or empty.
const requiredArgs = [inputPath, textField, extractTypes, outputPath];
if (requiredArgs.some((arg) => !arg)) {
  console.error("Usage: inputPath textField extractTypes outputPath");
  process.exit(1);
}
/**
 * Resolve a dot-separated path (e.g. "meta.author.name") against a value.
 *
 * @param {*} obj - Root value to read from.
 * @param {string} fieldPath - Property names joined by ".".
 * @returns {*} The value found at the path, or `undefined` as soon as a
 *   `null`/`undefined` value is hit before the path has been fully consumed.
 */
function getField(obj, fieldPath) {
  return fieldPath
    .split(".")
    .reduce(
      // Once the walk reaches null/undefined, every remaining step yields undefined.
      (current, key) => (current == null ? undefined : current[key]),
      obj
    );
}
// Patterns for entity types handled with plain regexes rather than compromise.
// They are only used with String.prototype.match, which starts from index 0
// for global regexes, so one shared instance is safe across items.
const EMAIL_PATTERN = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;
const URL_PATTERN = /https?:\/\/[^\s]+/g;

/**
 * Extractors keyed by entity type name. Each one receives the compromise
 * document and the raw text and returns an array of matches.
 * Insertion order defines the key order of every item's `entities` object.
 */
const ENTITY_EXTRACTORS = new Map([
  ["people", (doc) => doc.people().out("array")],
  ["places", (doc) => doc.places().out("array")],
  [
    "dates",
    (doc) => {
      // NOTE(review): `.dates()` appears to be supplied by the separate
      // compromise-dates plugin, which this file does not register — confirm
      // against the installed compromise version. When the method is absent,
      // fall back to the `#Date` tag so the run does not abort with
      // "doc.dates is not a function".
      const view =
        typeof doc.dates === "function" ? doc.dates() : doc.match("#Date+");
      return view.out("array");
    },
  ],
  ["organizations", (doc) => doc.organizations().out("array")],
  // Topics are approximated by the first ten nouns of the text.
  ["topics", (doc) => doc.nouns().out("array").slice(0, 10)],
  ["verbs", (doc) => doc.verbs().out("array")],
  ["numbers", (doc) => doc.numbers().out("array")],
  ["money", (doc) => doc.money().out("array")],
  ["emails", (_doc, text) => text.match(EMAIL_PATTERN) || []],
  ["urls", (_doc, text) => text.match(URL_PATTERN) || []],
]);

// Main pipeline: read the input JSON, attach an `entities` object to every
// item, write the enriched items to `outputPath`, then print a human-readable
// summary followed by a one-line JSON summary. Any thrown error is reported on
// stderr and the process exits with status 1.
try {
  console.log(`Extracting entities from text...`);
  console.log(` Input: ${inputPath}`);
  console.log(` Text field: ${textField}`);
  console.log(` Extract: ${extractTypes}`);

  const raw = fs.readFileSync(inputPath, "utf-8");
  const data = JSON.parse(raw);
  // Accept a bare array, or an object carrying the array under `items` or `results`.
  const items = Array.isArray(data) ? data : data.items || data.results || [];
  if (!Array.isArray(items)) {
    console.error("Input must be a JSON array or object with array property");
    process.exit(1);
  }

  // Normalise the requested types: trim, lower-case, drop empty tokens left by
  // stray commas ("people,") and remove duplicates.
  const types = [
    ...new Set(
      extractTypes
        .split(",")
        .map((t) => t.trim().toLowerCase())
        .filter(Boolean)
    ),
  ];

  // A misspelled type would otherwise be ignored without any hint. The warning
  // goes to stderr so the JSON summary on stdout is unaffected.
  const unknownTypes = types.filter((t) => !ENTITY_EXTRACTORS.has(t));
  if (unknownTypes.length > 0) {
    console.warn(
      `Warning: ignoring unknown extract types: ${unknownTypes.join(", ")}`
    );
  }

  // Resolve the requested extractors once instead of re-checking per item.
  const activeExtractors = [...ENTITY_EXTRACTORS].filter(([type]) =>
    types.includes(type)
  );

  const results = items.map((item) => {
    const text = getField(item, textField);
    const entities = {};
    // Items without usable text keep an empty `entities` object.
    if (text) {
      const source = String(text);
      const doc = nlp(source);
      for (const [type, extract] of activeExtractors) {
        entities[type] = extract(doc, source);
      }
    }
    return {
      ...item,
      entities,
    };
  });

  // Compute aggregate stats: every requested type starts at zero so it is
  // reported even when nothing was found.
  const stats = {};
  for (const type of types) {
    stats[type] = 0;
  }
  for (const result of results) {
    for (const [type, values] of Object.entries(result.entities)) {
      if (Array.isArray(values)) {
        stats[type] = (stats[type] || 0) + values.length;
      }
    }
  }

  // Ensure output directory exists
  const dir = path.dirname(outputPath);
  if (dir && dir !== ".") {
    fs.mkdirSync(dir, { recursive: true });
  }
  fs.writeFileSync(outputPath, JSON.stringify(results, null, 2));

  console.log(`\nā Extracted entities`);
  console.log(` Items processed: ${items.length}`);
  for (const [type, count] of Object.entries(stats)) {
    console.log(` ${type}: ${count}`);
  }
  console.log(` Written to: ${outputPath}`);

  // Single-line JSON summary of the run.
  console.log(
    JSON.stringify({
      success: true,
      inputPath,
      outputPath,
      itemCount: items.length,
      extractTypes: types,
      stats,
    })
  );
} catch (error) {
  console.error("Error:", error.message);
  process.exit(1);
}