Validate Product Data
Validate parsed CSV data before analysis—checks row count, required columns, and data types. Returns warnings (not errors) so analysis can proceed with caveats.
Source Code
import fs from "fs";
const [inputPath, requiredColumns = "", minRows = "20", analysisType = "general"] = process.argv.slice(2);
if (!inputPath) {
console.error("Usage: inputPath [requiredColumns] [minRows] [analysisType]");
process.exit(1);
}
// Column name matching - flexible to handle variations
function findMatchingColumn(columns, target) {
const targetLower = target.toLowerCase().replace(/[_\-\s]/g, "");
// Exact match first
const exact = columns.find((c) => c.toLowerCase() === target.toLowerCase());
if (exact) return exact;
// Normalized match
const normalized = columns.find((c) => c.toLowerCase().replace(/[_\-\s]/g, "") === targetLower);
if (normalized) return normalized;
// Partial match (column contains target or vice versa)
const partial = columns.find(
(c) => c.toLowerCase().includes(targetLower) || targetLower.includes(c.toLowerCase().replace(/[_\-\s]/g, ""))
);
if (partial) return partial;
return null;
}
// Data type checks
function inferColumnType(values) {
const nonNull = values.filter((v) => v != null && v !== "");
if (nonNull.length === 0) return "empty";
const types = {
numeric: 0,
date: 0,
boolean: 0,
text: 0,
};
for (const val of nonNull) {
const str = String(val);
// Check numeric
if (!isNaN(Number(val)) && str.trim() !== "") {
types.numeric++;
continue;
}
// Check boolean
if (["true", "false", "yes", "no", "1", "0"].includes(str.toLowerCase())) {
types.boolean++;
continue;
}
// Check date patterns
const datePatterns = [
/^\d{4}-\d{2}-\d{2}/, // ISO date
/^\d{1,2}\/\d{1,2}\/\d{2,4}/, // US date
/^\d{1,2}-\d{1,2}-\d{2,4}/, // Other date
/^\w{3}\s+\d{1,2},?\s+\d{4}/, // "Jan 1, 2024"
];
if (datePatterns.some((p) => p.test(str))) {
types.date++;
continue;
}
types.text++;
}
const total = nonNull.length;
if (types.numeric / total > 0.8) return "numeric";
if (types.date / total > 0.8) return "date";
if (types.boolean / total > 0.8) return "boolean";
return "text";
}
function validateColumn(columnName, values, expectedType) {
const warnings = [];
const nonNull = values.filter((v) => v != null && v !== "");
const nullCount = values.length - nonNull.length;
if (nullCount > 0) {
const nullPercent = Math.round((nullCount / values.length) * 100);
if (nullPercent > 50) {
warnings.push(`Column "${columnName}" is ${nullPercent}% empty—may affect analysis reliability.`);
} else if (nullPercent > 10) {
warnings.push(`Column "${columnName}" has ${nullPercent}% missing values.`);
}
}
const inferredType = inferColumnType(values);
if (expectedType && inferredType !== expectedType && inferredType !== "empty") {
warnings.push(`Column "${columnName}" appears to be ${inferredType} but expected ${expectedType}.`);
}
return { inferredType, nullCount, nullPercent: Math.round((nullCount / values.length) * 100), warnings };
}
// Analysis-specific validation
function getExpectedColumns(analysisType) {
switch (analysisType) {
case "feedback":
return [
{ name: "feedback", aliases: ["text", "comment", "response", "message", "content"], type: "text", required: true },
{ name: "date", aliases: ["created", "timestamp", "submitted", "time"], type: "date", required: false },
{ name: "rating", aliases: ["score", "nps", "csat", "stars"], type: "numeric", required: false },
{ name: "user", aliases: ["customer", "email", "id", "userid"], type: "text", required: false },
];
case "adoption":
return [
{ name: "user", aliases: ["customer", "userid", "id", "account"], type: "text", required: true },
{ name: "date", aliases: ["period", "week", "month", "timestamp"], type: "date", required: true },
{ name: "usage", aliases: ["sessions", "logins", "actions", "events", "count"], type: "numeric", required: true },
{ name: "segment", aliases: ["plan", "tier", "type", "group"], type: "text", required: false },
];
case "abtest":
return [
{ name: "variant", aliases: ["group", "treatment", "bucket", "test_group", "ab"], type: "text", required: true },
{ name: "metric", aliases: ["converted", "conversion", "revenue", "value", "outcome"], type: "numeric", required: true },
{ name: "user", aliases: ["id", "session", "userid"], type: "text", required: false },
{ name: "segment", aliases: ["plan", "tier", "type", "group"], type: "text", required: false },
];
default:
return [];
}
}
try {
console.log(`Validating data: ${inputPath}...`);
const rawData = fs.readFileSync(inputPath, "utf-8");
const data = JSON.parse(rawData);
const result = {
valid: true,
rowCount: 0,
columns: [],
columnTypes: {},
warnings: [],
errors: [],
suggestions: [],
};
// Check if data is array
if (!Array.isArray(data)) {
result.errors.push("Data is not an array. Expected JSON array of objects.");
result.valid = false;
console.log(JSON.stringify(result));
process.exit(0);
}
result.rowCount = data.length;
// Check minimum rows
const minRowCount = parseInt(minRows, 10);
if (data.length === 0) {
result.errors.push("Data is empty (0 rows).");
result.valid = false;
} else if (data.length < minRowCount) {
result.warnings.push(
`Only ${data.length} rows—below recommended minimum of ${minRowCount}. Findings may not be statistically significant.`
);
}
if (data.length > 0) {
// Get column names
result.columns = Object.keys(data[0]);
// Check required columns
if (requiredColumns) {
const required = requiredColumns.split(",").map((c) => c.trim());
for (const req of required) {
const found = findMatchingColumn(result.columns, req);
if (!found) {
result.warnings.push(`Required column "${req}" not found. Available columns: ${result.columns.join(", ")}`);
}
}
}
// Analysis-specific column detection
const expectedColumns = getExpectedColumns(analysisType);
const foundMappings = {};
for (const expected of expectedColumns) {
const allPossible = [expected.name, ...expected.aliases];
let found = null;
for (const name of allPossible) {
found = findMatchingColumn(result.columns, name);
if (found) break;
}
if (found) {
foundMappings[expected.name] = found;
const values = data.map((row) => row[found]);
const validation = validateColumn(found, values, expected.type);
result.columnTypes[found] = validation.inferredType;
result.warnings.push(...validation.warnings);
} else if (expected.required) {
result.warnings.push(
`Expected ${analysisType} column "${expected.name}" not found. Looked for: ${allPossible.join(", ")}`
);
}
}
// Validate all columns and infer types
for (const col of result.columns) {
if (!result.columnTypes[col]) {
const values = data.map((row) => row[col]);
const validation = validateColumn(col, values, null);
result.columnTypes[col] = validation.inferredType;
// Only add high-severity warnings from non-expected columns
const highSeverity = validation.warnings.filter((w) => w.includes("50%"));
result.warnings.push(...highSeverity);
}
}
// Check for potential issues
const allEmpty = result.columns.every((c) => result.columnTypes[c] === "empty");
if (allEmpty) {
result.errors.push("All columns appear to be empty.");
result.valid = false;
}
// Add suggestions based on what we found
if (analysisType !== "general" && Object.keys(foundMappings).length > 0) {
result.suggestions.push(`Detected columns for ${analysisType} analysis: ${JSON.stringify(foundMappings)}`);
}
// Check for duplicate rows if there's an ID-like column
const idCandidates = result.columns.filter((c) => /id|email|user/i.test(c));
if (idCandidates.length > 0) {
const idCol = idCandidates[0];
const ids = data.map((row) => row[idCol]);
const uniqueIds = new Set(ids);
if (uniqueIds.size < ids.length) {
const dupCount = ids.length - uniqueIds.size;
result.warnings.push(
`${dupCount} duplicate values in "${idCol}" column. This may be expected (multiple records per user) or indicate data issues.`
);
}
}
}
// Final validity check
result.valid = result.errors.length === 0;
// Output results
console.log(`\n${result.valid ? "✓" : "✗"} Data Validation ${result.valid ? "Passed" : "Failed"}`);
console.log(` Rows: ${result.rowCount}`);
console.log(` Columns: ${result.columns.length} (${result.columns.join(", ")})`);
if (result.warnings.length > 0) {
console.log(`\n⚠️ Warnings (${result.warnings.length}):`);
for (const w of result.warnings) {
console.log(` - ${w}`);
}
}
if (result.errors.length > 0) {
console.log(`\n❌ Errors (${result.errors.length}):`);
for (const e of result.errors) {
console.log(` - ${e}`);
}
}
if (result.suggestions.length > 0) {
console.log(`\n💡 Suggestions:`);
for (const s of result.suggestions) {
console.log(` - ${s}`);
}
}
console.log(JSON.stringify(result));
} catch (error) {
console.error("Error:", error.message);
console.log(JSON.stringify({ valid: false, errors: [error.message], warnings: [], rowCount: 0 }));
process.exit(1);
}