code icon Code

Parse HTML

Extract data from HTML using CSS selectors

Source Code

import fs from "fs";
import path from "path";
import * as cheerio from "cheerio";

// Positional CLI arguments (argv[0] is the node binary, argv[1] the script):
//   argv[2] - path of the HTML file to read
//   argv[3] - JSON object mapping field names to selector strings
//   argv[4] - path of the JSON file to write
const inputPath = process.argv[2];
const selectorsJson = process.argv[3];
const outputPath = process.argv[4];

// All three arguments are required; a missing or empty one aborts with usage help.
const requiredArgs = [inputPath, selectorsJson, outputPath];
if (requiredArgs.some((arg) => !arg)) {
  console.error("Usage: inputPath selectorsJson outputPath");
  process.exit(1);
}

/**
 * Parse a selector string with an optional "@attribute" suffix.
 *
 *   "h1"           -> { selector: "h1", attr: null }
 *   "a@href"       -> { selector: "a", attr: "href" }
 *   'a[href*="@"]' -> { selector: 'a[href*="@"]', attr: null }
 *
 * Only an "@" that sits at the top level of the selector counts as the
 * separator: an "@" inside a quoted string, inside [...] or (...), or escaped
 * with a backslash belongs to the CSS selector itself. When several top-level
 * "@" exist, the last one is the separator. An "@" at index 0 is never a
 * separator because it would leave an empty selector.
 *
 * @param {string} selectorStr - CSS selector, optionally followed by "@attr".
 * @returns {{ selector: string, attr: string | null }} The CSS selector and the
 *   attribute name to read, or `attr: null` when no suffix is present.
 * @throws {TypeError} If `selectorStr` is not a string.
 */
function parseSelector(selectorStr) {
  if (typeof selectorStr !== "string") {
    throw new TypeError(
      `Selector must be a string, got ${selectorStr === null ? "null" : typeof selectorStr}`
    );
  }

  let atIndex = -1;
  let openQuote = null; // the quote character we are currently inside, if any
  let depth = 0; // nesting level of [...] and (...)

  for (let i = 0; i < selectorStr.length; i += 1) {
    const ch = selectorStr[i];
    if (ch === "\\") {
      // CSS escape: the next character is literal, so skip it.
      i += 1;
    } else if (openQuote !== null) {
      if (ch === openQuote) {
        openQuote = null;
      }
    } else if (ch === '"' || ch === "'") {
      openQuote = ch;
    } else if (ch === "[" || ch === "(") {
      depth += 1;
    } else if (ch === "]" || ch === ")") {
      // Clamp so a stray closer in a malformed selector cannot go negative.
      depth = Math.max(0, depth - 1);
    } else if (ch === "@" && depth === 0) {
      atIndex = i;
    }
  }

  if (atIndex > 0) {
    return {
      selector: selectorStr.slice(0, atIndex),
      attr: selectorStr.slice(atIndex + 1),
    };
  }
  return { selector: selectorStr, attr: null };
}

// Main flow: load the HTML file, evaluate every configured selector against it,
// write the collected values to outputPath as pretty-printed JSON, then print a
// machine-readable JSON summary line. Any failure is reported on stderr and
// the process exits with status 1.
try {
  console.log(`Reading HTML: ${inputPath}...`);
  const html = fs.readFileSync(inputPath, "utf-8");

  const $ = cheerio.load(html);

  let selectors;
  try {
    selectors = JSON.parse(selectorsJson);
  } catch {
    console.error("Invalid selectors JSON:", selectorsJson);
    process.exit(1);
  }

  // JSON.parse also accepts null, numbers, strings and booleans; none of those
  // is a usable field -> selector mapping (a string would be iterated
  // character by character), so reject them up front.
  if (selectors === null || typeof selectors !== "object") {
    console.error("Selectors JSON must be an object mapping field names to selectors:", selectorsJson);
    process.exit(1);
  }

  console.log(`Extracting ${Object.keys(selectors).length} fields...`);

  const result = {};

  for (const [key, selectorStr] of Object.entries(selectors)) {
    const { selector, attr } = parseSelector(selectorStr);
    const elements = $(selector);

    if (elements.length === 0) {
      // No match - record the field as null
      result[key] = null;
    } else if (elements.length === 1) {
      // Single element - return scalar
      if (attr) {
        // A missing or empty attribute value is reported as null
        result[key] = elements.attr(attr) || null;
      } else {
        result[key] = elements.text().trim();
      }
    } else {
      // Multiple elements - return array
      // NOTE(review): per cheerio's documented .map() semantics, callbacks that
      // return null/undefined contribute nothing to the result, so elements
      // lacking the attribute are omitted rather than kept as null entries -
      // confirm this is the intended output shape.
      result[key] = elements
        .map((_, el) => {
          if (attr) {
            return $(el).attr(attr) || null;
          }
          return $(el).text().trim();
        })
        .get();
    }

    // Report how many elements the selector matched, independent of whether
    // the extracted value happens to be empty or null.
    const count = elements.length;
    console.log(`  ${key}: ${count} match(es)`);
  }

  // Ensure output directory exists
  const dir = path.dirname(outputPath);
  if (dir && dir !== ".") {
    fs.mkdirSync(dir, { recursive: true });
  }

  fs.writeFileSync(outputPath, JSON.stringify(result, null, 2));

  console.log(`\n✓ Extracted HTML data`);
  console.log(`  Fields: ${Object.keys(result).join(", ")}`);
  console.log(`  Written to: ${outputPath}`);

  // Final single-line JSON summary on stdout
  console.log(
    JSON.stringify({
      success: true,
      inputPath,
      outputPath,
      fields: Object.keys(result),
    })
  );
} catch (error) {
  console.error("Error:", error.message);
  process.exit(1);
}