Parse HTML
Extract data from HTML using CSS selectors
Source Code
import fs from "fs";
import path from "path";
import * as cheerio from "cheerio";
// Positional CLI arguments, in order: HTML file to read, selectors as a JSON
// string, and the path the extracted JSON is written to.
const [inputPath, selectorsJson, outputPath] = process.argv.slice(2);

// All three arguments are required; bail out with a usage hint otherwise.
if ([inputPath, selectorsJson, outputPath].some((arg) => !arg)) {
  console.error("Usage: inputPath selectorsJson outputPath");
  process.exit(1);
}
/**
 * Split a selector string into a CSS selector and an optional attribute name.
 *
 * The attribute is given as a trailing "@name" suffix:
 *   "h1"     -> { selector: "h1", attr: null }
 *   "a@href" -> { selector: "a", attr: "href" }
 *
 * An "@" that belongs to the selector itself, such as one inside an attribute
 * value (a[href^="mailto:me@example.com"]), is not treated as a suffix and the
 * string is returned unchanged as the selector.
 *
 * @param {string} selectorStr - CSS selector, optionally followed by "@attr".
 * @returns {{ selector: string, attr: (string|null) }} The parsed parts; attr
 *   is null when there is no attribute suffix.
 * @throws {TypeError} If selectorStr is not a string.
 */
function parseSelector(selectorStr) {
  if (typeof selectorStr !== "string") {
    throw new TypeError(`Selector must be a string, got ${typeof selectorStr}`);
  }

  const atIndex = selectorStr.lastIndexOf("@");
  // No "@", or a leading "@" that would leave an empty selector: no suffix.
  if (atIndex <= 0) {
    return { selector: selectorStr, attr: null };
  }

  const suffix = selectorStr.slice(atIndex + 1);
  // Quotes, brackets, parentheses, whitespace, "=", "/", "<" and ">" cannot
  // appear in an attribute name, so their presence means the "@" sits inside
  // the selector (e.g. within an attribute value) rather than before a suffix.
  if (/[\s"'<>\/=()[\]]/.test(suffix)) {
    return { selector: selectorStr, attr: null };
  }

  return { selector: selectorStr.slice(0, atIndex), attr: suffix };
}
/**
 * Extract the value of a single field from the loaded document.
 *
 * @param {Function} $ - Cheerio root function returned by cheerio.load().
 * @param {string} selectorStr - CSS selector with an optional "@attr" suffix.
 * @returns {{ value: (string|string[]|null), matched: number }} The extracted
 *   value (null for no match, a scalar for exactly one element, an array for
 *   several) together with the number of elements the selector matched.
 */
function extractField($, selectorStr) {
  const { selector, attr } = parseSelector(selectorStr);
  const elements = $(selector);
  const matched = elements.length;

  // With an attribute suffix read that attribute (missing or empty -> null);
  // otherwise read the trimmed text content.
  const readValue = (node) => (attr ? node.attr(attr) || null : node.text().trim());

  if (matched === 0) {
    return { value: null, matched };
  }
  if (matched === 1) {
    // Single element - return scalar
    return { value: readValue(elements), matched };
  }
  // Multiple elements - return array. Cheerio's .map() leaves out callbacks
  // results that are null/undefined, so elements lacking the requested
  // attribute do not appear in the array.
  const values = elements.map((_, el) => readValue($(el))).get();
  return { value: values, matched };
}

// Read the HTML file, extract every requested field, write the result as
// pretty-printed JSON and finish with a one-line JSON summary on stdout.
// Any failure is reported on stderr and the process exits with status 1.
try {
  console.log(`Reading HTML: ${inputPath}...`);
  const html = fs.readFileSync(inputPath, "utf-8");
  const $ = cheerio.load(html);

  let selectors;
  try {
    selectors = JSON.parse(selectorsJson);
  } catch {
    console.error("Invalid selectors JSON:", selectorsJson);
    process.exit(1);
  }
  // Valid JSON can still be null or a primitive; iterating those would either
  // throw an opaque error or produce meaningless per-character fields.
  if (selectors === null || typeof selectors !== "object") {
    console.error("Selectors JSON must be an object mapping field names to selectors:", selectorsJson);
    process.exit(1);
  }

  console.log(`Extracting ${Object.keys(selectors).length} fields...`);
  const result = {};
  for (const [key, selectorStr] of Object.entries(selectors)) {
    const { value, matched } = extractField($, selectorStr);
    result[key] = value;
    // Report how many elements matched, independent of whether the extracted
    // value happens to be empty or null.
    console.log(` ${key}: ${matched} match(es)`);
  }

  // Ensure output directory exists
  const dir = path.dirname(outputPath);
  if (dir && dir !== ".") {
    fs.mkdirSync(dir, { recursive: true });
  }
  fs.writeFileSync(outputPath, JSON.stringify(result, null, 2));

  console.log(`\nā Extracted HTML data`);
  console.log(` Fields: ${Object.keys(result).join(", ")}`);
  console.log(` Written to: ${outputPath}`);
  // Machine-readable summary as the final stdout line.
  console.log(
    JSON.stringify({
      success: true,
      inputPath,
      outputPath,
      fields: Object.keys(result),
    })
  );
} catch (error) {
  console.error("Error:", error.message);
  process.exit(1);
}