code icon Code

Validate Product Data

Validate parsed CSV data before analysis—checks row count, required columns, and data types. Returns warnings (not errors) so analysis can proceed with caveats.

Source Code

import fs from "fs";

const [inputPath, requiredColumns = "", minRows = "20", analysisType = "general"] = process.argv.slice(2);

if (!inputPath) {
  console.error("Usage: inputPath [requiredColumns] [minRows] [analysisType]");
  process.exit(1);
}

// Column name matching - flexible to handle variations
function findMatchingColumn(columns, target) {
  const targetLower = target.toLowerCase().replace(/[_\-\s]/g, "");

  // Exact match first
  const exact = columns.find((c) => c.toLowerCase() === target.toLowerCase());
  if (exact) return exact;

  // Normalized match
  const normalized = columns.find((c) => c.toLowerCase().replace(/[_\-\s]/g, "") === targetLower);
  if (normalized) return normalized;

  // Partial match (column contains target or vice versa)
  const partial = columns.find(
    (c) => c.toLowerCase().includes(targetLower) || targetLower.includes(c.toLowerCase().replace(/[_\-\s]/g, ""))
  );
  if (partial) return partial;

  return null;
}

// Data type checks
function inferColumnType(values) {
  const nonNull = values.filter((v) => v != null && v !== "");

  if (nonNull.length === 0) return "empty";

  const types = {
    numeric: 0,
    date: 0,
    boolean: 0,
    text: 0,
  };

  for (const val of nonNull) {
    const str = String(val);

    // Check numeric
    if (!isNaN(Number(val)) && str.trim() !== "") {
      types.numeric++;
      continue;
    }

    // Check boolean
    if (["true", "false", "yes", "no", "1", "0"].includes(str.toLowerCase())) {
      types.boolean++;
      continue;
    }

    // Check date patterns
    const datePatterns = [
      /^\d{4}-\d{2}-\d{2}/, // ISO date
      /^\d{1,2}\/\d{1,2}\/\d{2,4}/, // US date
      /^\d{1,2}-\d{1,2}-\d{2,4}/, // Other date
      /^\w{3}\s+\d{1,2},?\s+\d{4}/, // "Jan 1, 2024"
    ];
    if (datePatterns.some((p) => p.test(str))) {
      types.date++;
      continue;
    }

    types.text++;
  }

  const total = nonNull.length;
  if (types.numeric / total > 0.8) return "numeric";
  if (types.date / total > 0.8) return "date";
  if (types.boolean / total > 0.8) return "boolean";
  return "text";
}

function validateColumn(columnName, values, expectedType) {
  const warnings = [];
  const nonNull = values.filter((v) => v != null && v !== "");
  const nullCount = values.length - nonNull.length;

  if (nullCount > 0) {
    const nullPercent = Math.round((nullCount / values.length) * 100);
    if (nullPercent > 50) {
      warnings.push(`Column "${columnName}" is ${nullPercent}% empty—may affect analysis reliability.`);
    } else if (nullPercent > 10) {
      warnings.push(`Column "${columnName}" has ${nullPercent}% missing values.`);
    }
  }

  const inferredType = inferColumnType(values);

  if (expectedType && inferredType !== expectedType && inferredType !== "empty") {
    warnings.push(`Column "${columnName}" appears to be ${inferredType} but expected ${expectedType}.`);
  }

  return { inferredType, nullCount, nullPercent: Math.round((nullCount / values.length) * 100), warnings };
}

// Analysis-specific validation
function getExpectedColumns(analysisType) {
  switch (analysisType) {
    case "feedback":
      return [
        { name: "feedback", aliases: ["text", "comment", "response", "message", "content"], type: "text", required: true },
        { name: "date", aliases: ["created", "timestamp", "submitted", "time"], type: "date", required: false },
        { name: "rating", aliases: ["score", "nps", "csat", "stars"], type: "numeric", required: false },
        { name: "user", aliases: ["customer", "email", "id", "userid"], type: "text", required: false },
      ];
    case "adoption":
      return [
        { name: "user", aliases: ["customer", "userid", "id", "account"], type: "text", required: true },
        { name: "date", aliases: ["period", "week", "month", "timestamp"], type: "date", required: true },
        { name: "usage", aliases: ["sessions", "logins", "actions", "events", "count"], type: "numeric", required: true },
        { name: "segment", aliases: ["plan", "tier", "type", "group"], type: "text", required: false },
      ];
    case "abtest":
      return [
        { name: "variant", aliases: ["group", "treatment", "bucket", "test_group", "ab"], type: "text", required: true },
        { name: "metric", aliases: ["converted", "conversion", "revenue", "value", "outcome"], type: "numeric", required: true },
        { name: "user", aliases: ["id", "session", "userid"], type: "text", required: false },
        { name: "segment", aliases: ["plan", "tier", "type", "group"], type: "text", required: false },
      ];
    default:
      return [];
  }
}

try {
  console.log(`Validating data: ${inputPath}...`);
  const rawData = fs.readFileSync(inputPath, "utf-8");
  const data = JSON.parse(rawData);

  const result = {
    valid: true,
    rowCount: 0,
    columns: [],
    columnTypes: {},
    warnings: [],
    errors: [],
    suggestions: [],
  };

  // Check if data is array
  if (!Array.isArray(data)) {
    result.errors.push("Data is not an array. Expected JSON array of objects.");
    result.valid = false;
    console.log(JSON.stringify(result));
    process.exit(0);
  }

  result.rowCount = data.length;

  // Check minimum rows
  const minRowCount = parseInt(minRows, 10);
  if (data.length === 0) {
    result.errors.push("Data is empty (0 rows).");
    result.valid = false;
  } else if (data.length < minRowCount) {
    result.warnings.push(
      `Only ${data.length} rows—below recommended minimum of ${minRowCount}. Findings may not be statistically significant.`
    );
  }

  if (data.length > 0) {
    // Get column names
    result.columns = Object.keys(data[0]);

    // Check required columns
    if (requiredColumns) {
      const required = requiredColumns.split(",").map((c) => c.trim());
      for (const req of required) {
        const found = findMatchingColumn(result.columns, req);
        if (!found) {
          result.warnings.push(`Required column "${req}" not found. Available columns: ${result.columns.join(", ")}`);
        }
      }
    }

    // Analysis-specific column detection
    const expectedColumns = getExpectedColumns(analysisType);
    const foundMappings = {};

    for (const expected of expectedColumns) {
      const allPossible = [expected.name, ...expected.aliases];
      let found = null;

      for (const name of allPossible) {
        found = findMatchingColumn(result.columns, name);
        if (found) break;
      }

      if (found) {
        foundMappings[expected.name] = found;
        const values = data.map((row) => row[found]);
        const validation = validateColumn(found, values, expected.type);
        result.columnTypes[found] = validation.inferredType;
        result.warnings.push(...validation.warnings);
      } else if (expected.required) {
        result.warnings.push(
          `Expected ${analysisType} column "${expected.name}" not found. Looked for: ${allPossible.join(", ")}`
        );
      }
    }

    // Validate all columns and infer types
    for (const col of result.columns) {
      if (!result.columnTypes[col]) {
        const values = data.map((row) => row[col]);
        const validation = validateColumn(col, values, null);
        result.columnTypes[col] = validation.inferredType;
        // Only add high-severity warnings from non-expected columns
        const highSeverity = validation.warnings.filter((w) => w.includes("50%"));
        result.warnings.push(...highSeverity);
      }
    }

    // Check for potential issues
    const allEmpty = result.columns.every((c) => result.columnTypes[c] === "empty");
    if (allEmpty) {
      result.errors.push("All columns appear to be empty.");
      result.valid = false;
    }

    // Add suggestions based on what we found
    if (analysisType !== "general" && Object.keys(foundMappings).length > 0) {
      result.suggestions.push(`Detected columns for ${analysisType} analysis: ${JSON.stringify(foundMappings)}`);
    }

    // Check for duplicate rows if there's an ID-like column
    const idCandidates = result.columns.filter((c) => /id|email|user/i.test(c));
    if (idCandidates.length > 0) {
      const idCol = idCandidates[0];
      const ids = data.map((row) => row[idCol]);
      const uniqueIds = new Set(ids);
      if (uniqueIds.size < ids.length) {
        const dupCount = ids.length - uniqueIds.size;
        result.warnings.push(
          `${dupCount} duplicate values in "${idCol}" column. This may be expected (multiple records per user) or indicate data issues.`
        );
      }
    }
  }

  // Final validity check
  result.valid = result.errors.length === 0;

  // Output results
  console.log(`\n${result.valid ? "✓" : "✗"} Data Validation ${result.valid ? "Passed" : "Failed"}`);
  console.log(`  Rows: ${result.rowCount}`);
  console.log(`  Columns: ${result.columns.length} (${result.columns.join(", ")})`);

  if (result.warnings.length > 0) {
    console.log(`\n⚠️ Warnings (${result.warnings.length}):`);
    for (const w of result.warnings) {
      console.log(`  - ${w}`);
    }
  }

  if (result.errors.length > 0) {
    console.log(`\n❌ Errors (${result.errors.length}):`);
    for (const e of result.errors) {
      console.log(`  - ${e}`);
    }
  }

  if (result.suggestions.length > 0) {
    console.log(`\n💡 Suggestions:`);
    for (const s of result.suggestions) {
      console.log(`  - ${s}`);
    }
  }

  console.log(JSON.stringify(result));
} catch (error) {
  console.error("Error:", error.message);
  console.log(JSON.stringify({ valid: false, errors: [error.message], warnings: [], rowCount: 0 }));
  process.exit(1);
}