Initial Commit

This commit is contained in:
ganome 2025-12-04 18:34:35 -07:00
commit 2afe840ede
Signed by untrusted user who does not match committer: Ganome
GPG Key ID: 944DE53336D81B83
5 changed files with 783 additions and 0 deletions

13
Makefile Normal file
View File

@ -0,0 +1,13 @@
# Build rules for the gscrape command-line scraper.
# Compiler and linker flags for libxml2 and libcurl are taken from pkg-config.
CC=gcc
CFLAGS=-Iinclude -Wall $(shell pkg-config --cflags libxml-2.0 libcurl)
LDFLAGS=$(shell pkg-config --libs libxml-2.0 libcurl)
SRCS=src/main.c src/scraper.c
TARGET=gscrape

# 'all' and 'clean' name actions, not files; declaring them phony keeps them
# working even if a file with one of those names appears in the directory.
.PHONY: all clean

all: $(TARGET)

# Compile and link every source file in one step (no intermediate objects).
$(TARGET): $(SRCS)
	$(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS)

clean:
	rm -f $(TARGET) *.o src/*.o

BIN
gscrape Executable file

Binary file not shown.

33
include/scraper.h Normal file
View File

@ -0,0 +1,33 @@
#ifndef SCRAPER_H
#define SCRAPER_H
/* Public interface of the scraper implemented in src/scraper.c. */
#include <stdlib.h>
/* Download `url` with libcurl, following redirects.
 * Returns 0 on success and stores the response body in *out_buf (kept
 * NUL-terminated by the write callback) and its byte count in *out_len.
 * *out_buf can be NULL with *out_len == 0 when no body bytes were delivered.
 * Returns -1 if an argument is NULL or the transfer fails.
 * Caller must free(*out_buf).
 */
int fetch_url(const char *url, char **out_buf, size_t *out_len);
/* Extract the text content of the first <title> element.
 * Returns a newly allocated string (caller frees), or NULL if `html` is
 * NULL/empty, cannot be parsed, or contains no <title>.
 */
char *extract_title(const char *html, size_t len);
/* Extract all <meta name=... content=...> and <meta property=... content=...>
 * Returns 0 on success and allocates *out with a newline-separated list
 * of "key: value" lines. Caller must free(*out).
 * Returns -1 on invalid arguments, parse failure, or when no tag matched.
 */
int extract_meta(const char *html, size_t len, char **out);
/* Extract Open Graph tags (meta property="og:...") similarly. */
int extract_og(const char *html, size_t len, char **out);
/* Extract the first <script type="application/ld+json"> that looks like a
 * Product schema. Returns 0 and allocates *out_json (caller frees) or
 * returns -1 if not found.
 */
int extract_jsonld_product(const char *html, size_t len, char **out_json);
/* Extract the text content of the first <h1> element, or NULL if none.
 * The returned string is newly allocated; caller frees.
 */
char *extract_h1(const char *html, size_t len);
/* Extract product listings (one per line) as "name | price | url".
 * Returns 0 and allocates *out on success (caller frees), or -1.
 * Each line represents one product; fields are separated by '|'.
 */
int extract_products(const char *html, size_t len, char **out);
#endif /* SCRAPER_H */

217
src/main.c Normal file
View File

@ -0,0 +1,217 @@
#include <stdio.h>
#include <stdlib.h>
#include "scraper.h"
#include <string.h>
/* Return the character that follows the backslash in the two-character JSON
 * escape for `c` (e.g. 'n' for a newline), or 0 if `c` has no short escape.
 */
static char json_short_escape(unsigned char c) {
    switch (c) {
    case '"':  return '"';
    case '\\': return '\\';
    case '\b': return 'b';
    case '\f': return 'f';
    case '\n': return 'n';
    case '\r': return 'r';
    case '\t': return 't';
    default:   return 0;
    }
}

/* Build a JSON string literal (surrounding double quotes included) from the
 * NUL-terminated string `s`.
 *
 * '"' and '\' are backslash-escaped, \b \f \n \r \t use their short escapes,
 * and any other byte below 0x20 is written as \u00XX (lowercase hex). All
 * other bytes are copied unchanged. A NULL `s` produces the empty literal "".
 *
 * The output size is measured first and the buffer is allocated exactly once,
 * so there is no grow-on-demand realloc() whose failure could leak the buffer
 * or lead to a NULL dereference.
 *
 * Returns a newly allocated string the caller must free(), or NULL if the
 * allocation fails.
 */
static char *json_escape(const char *s) {
    static const char hex_digits[] = "0123456789abcdef";

    if (!s) s = "";

    /* Pass 1: measure. 3 = opening quote + closing quote + terminator. */
    size_t need = 3;
    for (const unsigned char *p = (const unsigned char *)s; *p; ++p) {
        size_t width = json_short_escape(*p) ? 2 : (*p < 0x20 ? 6 : 1);
        if (need > (size_t)-1 - width) return NULL; /* size_t would wrap */
        need += width;
    }

    char *out = malloc(need);
    if (!out) return NULL;

    /* Pass 2: fill. Every write below was accounted for in pass 1. */
    size_t oi = 0;
    out[oi++] = '"';
    for (const unsigned char *p = (const unsigned char *)s; *p; ++p) {
        unsigned char c = *p;
        char esc = json_short_escape(c);
        if (esc) {
            out[oi++] = '\\';
            out[oi++] = esc;
        } else if (c < 0x20) {
            out[oi++] = '\\';
            out[oi++] = 'u';
            out[oi++] = '0';
            out[oi++] = '0';
            out[oi++] = hex_digits[c >> 4];
            out[oi++] = hex_digits[c & 0x0f];
        } else {
            out[oi++] = (char)c;
        }
    }
    out[oi++] = '"';
    out[oi] = '\0';
    return out;
}
/* Copy the `len` bytes starting at `start` into a new NUL-terminated string.
 * Returns NULL if the allocation fails; caller frees.
 */
static char *kv_span_dup(const char *start, size_t len) {
    char *copy = malloc(len + 1);
    if (!copy) return NULL;
    memcpy(copy, start, len);
    copy[len] = '\0';
    return copy;
}

/* Print a newline-separated list of "key: value" lines as one JSON object on
 * stdout. A NULL `kv_lines` prints `null`.
 *
 * Each line is split at the first ": " (colon followed by a space), so keys
 * that themselves contain a colon, such as "og:title", stay intact. A line
 * without ": " falls back to splitting at its first ':'. Lines with no colon
 * at all and empty lines are skipped. Spaces and tabs after the separator are
 * not part of the value.
 *
 * A pair whose key or value cannot be allocated/escaped is skipped instead of
 * handing a NULL pointer to printf().
 */
static void print_kv_json(const char *kv_lines) {
    if (!kv_lines) { printf("null"); return; }
    printf("{");
    int first = 1;
    const char *p = kv_lines;
    while (*p) {
        const char *line_end = strchr(p, '\n');
        size_t linelen = line_end ? (size_t)(line_end - p) : strlen(p);
        const char *eol = p + linelen;

        /* Prefer the ": " separator; fall back to a bare ':'. */
        const char *sep = NULL;
        for (const char *q = p; q + 1 < eol; ++q) {
            if (q[0] == ':' && q[1] == ' ') { sep = q; break; }
        }
        if (!sep && linelen > 0) sep = memchr(p, ':', linelen);

        if (sep) {
            const char *vstart = sep + 1;
            while (vstart < eol && (*vstart == ' ' || *vstart == '\t')) vstart++;

            char *key = kv_span_dup(p, (size_t)(sep - p));
            char *val = kv_span_dup(vstart, (size_t)(eol - vstart));
            char *ek = key ? json_escape(key) : NULL;
            char *ev = val ? json_escape(val) : NULL;
            if (ek && ev) {
                if (!first) printf(",");
                first = 0;
                printf("%s:%s", ek, ev);
            }
            free(ek);
            free(ev);
            free(key);
            free(val);
        }

        if (!line_end) break;
        p = line_end + 1;
    }
    printf("}");
}
/* Copy the text in [start, end) into a new NUL-terminated string after
 * stripping spaces and tabs from both ends.
 * Returns NULL if the allocation fails; caller frees.
 */
static char *product_field_dup(const char *start, const char *end) {
    while (start < end && (*start == ' ' || *start == '\t')) start++;
    while (end > start && (end[-1] == ' ' || end[-1] == '\t')) end--;
    size_t n = (size_t)(end - start);
    char *copy = malloc(n + 1);
    if (!copy) return NULL;
    memcpy(copy, start, n);
    copy[n] = '\0';
    return copy;
}

/* Print newline-separated "name | price | url" lines as a JSON array of
 * {"name":..., "price":..., "url":...} objects on stdout. A NULL
 * `products_lines` prints `null`.
 *
 * Per line: the text before the first '|' is the name, the text between the
 * first and second '|' is the price, and the rest is the url. A missing field
 * becomes the empty string (a line without any '|' is all name). Every field
 * is trimmed on both sides, so the space written before a '|' no longer ends
 * up inside the price. Empty lines are skipped.
 *
 * A product whose fields cannot be allocated/escaped is skipped instead of
 * handing a NULL pointer to printf().
 */
static void print_products_json(const char *products_lines) {
    if (!products_lines) { printf("null"); return; }
    printf("[");
    int first = 1;
    const char *p = products_lines;
    while (*p) {
        const char *line_end = strchr(p, '\n');
        size_t linelen = line_end ? (size_t)(line_end - p) : strlen(p);
        if (linelen > 0) {
            const char *eol = p + linelen;
            const char *s1 = memchr(p, '|', linelen);
            const char *s2 = s1 ? memchr(s1 + 1, '|', (size_t)(eol - (s1 + 1))) : NULL;

            /* An empty [eol, eol) span yields "" for a missing field. */
            char *name = product_field_dup(p, s1 ? s1 : eol);
            char *price = product_field_dup(s1 ? s1 + 1 : eol, s2 ? s2 : eol);
            char *url = product_field_dup(s2 ? s2 + 1 : eol, eol);

            char *en = name ? json_escape(name) : NULL;
            char *ep = price ? json_escape(price) : NULL;
            char *eu = url ? json_escape(url) : NULL;
            if (en && ep && eu) {
                if (!first) printf(",");
                first = 0;
                printf("{\"name\":%s,\"price\":%s,\"url\":%s}", en, ep, eu);
            }
            free(en);
            free(ep);
            free(eu);
            free(name);
            free(price);
            free(url);
        }
        if (!line_end) break;
        p = line_end + 1;
    }
    printf("]");
}
/* Print `"key":<JSON string>,` for `value`, or `"key":null,` when `value` is
 * NULL or cannot be escaped (allocation failure). `key` must not need
 * escaping itself.
 */
static void print_string_member(const char *key, const char *value) {
    char *escaped = value ? json_escape(value) : NULL;
    if (escaped) {
        printf("\"%s\":%s,", key, escaped);
    } else {
        printf("\"%s\":null,", key);
    }
    free(escaped);
}

/* Entry point: fetch the URL given as the only argument and print one JSON
 * object on stdout with the members url, title, h1, meta, og, jsonld_product
 * and products. Members for which nothing was extracted are `null`.
 *
 * Exit status: 0 on success, 1 if the URL could not be fetched, 2 on a usage
 * error.
 */
int main(int argc, char **argv) {
    if (argc != 2) {
        /* argv[0] is not guaranteed to exist when argc is 0. */
        fprintf(stderr, "Usage: %s <url>\n", argc > 0 ? argv[0] : "gscrape");
        return 2;
    }
    const char *url = argv[1];
    char *html = NULL;
    size_t html_len = 0;
    if (fetch_url(url, &html, &html_len) != 0) {
        fprintf(stderr, "Failed to fetch URL: %s\n", url);
        return 1;
    }

    printf("{");
    print_string_member("url", url);

    char *title = extract_title(html, html_len);
    print_string_member("title", title);
    free(title);

    char *h1 = extract_h1(html, html_len);
    print_string_member("h1", h1);
    free(h1);

    char *meta = NULL;
    if (extract_meta(html, html_len, &meta) == 0 && meta) {
        printf("\"meta\":");
        print_kv_json(meta);
        printf(",");
        free(meta);
    } else {
        printf("\"meta\":null,");
    }

    char *og = NULL;
    if (extract_og(html, html_len, &og) == 0 && og) {
        printf("\"og\":");
        print_kv_json(og);
        printf(",");
        free(og);
    } else {
        printf("\"og\":null,");
    }

    /* The raw JSON-LD text is emitted as a JSON string, not as nested JSON. */
    char *jsonld = NULL;
    if (extract_jsonld_product(html, html_len, &jsonld) != 0) jsonld = NULL;
    print_string_member("jsonld_product", jsonld);
    free(jsonld);

    /* Last member: no trailing comma. */
    char *products = NULL;
    if (extract_products(html, html_len, &products) == 0 && products) {
        printf("\"products\":");
        print_products_json(products);
        free(products);
    } else {
        printf("\"products\":null");
    }

    printf("}\n");
    free(html);
    return 0;
}

520
src/scraper.c Normal file
View File

@ -0,0 +1,520 @@
#include "scraper.h"
#include <ctype.h>
#include <limits.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <curl/curl.h>
#include <libxml/HTMLparser.h>
#include <libxml/tree.h>
/* Accumulator for a response body received through write_cb(). */
struct mem {
/* Heap buffer with the bytes received so far; write_cb() keeps a '\0' after the last byte. */
char *buf;
/* Number of payload bytes stored in buf, not counting the terminating '\0'. */
size_t size;
};
/* Write callback handed to libcurl: appends the `size * nmemb` bytes at `ptr`
 * to the struct mem passed as `userdata`, growing its buffer and keeping it
 * NUL-terminated. Returns the number of bytes consumed, or 0 when the buffer
 * cannot be grown (the previously stored data is left untouched).
 */
static size_t write_cb(void *ptr, size_t size, size_t nmemb, void *userdata) {
    struct mem *acc = (struct mem *)userdata;
    const size_t chunk_len = size * nmemb;

    /* One extra byte for the terminator. */
    char *grown = realloc(acc->buf, acc->size + chunk_len + 1);
    if (grown == NULL) {
        return 0;
    }

    memcpy(grown + acc->size, ptr, chunk_len);
    acc->buf = grown;
    acc->size += chunk_len;
    grown[acc->size] = '\0';
    return chunk_len;
}
/* Download `url` with libcurl (redirects followed, user agent "gscrape/0.1").
 *
 * On success returns 0, stores the body in *out_buf (NUL-terminated by
 * write_cb; NULL if no body bytes arrived) and its length in *out_len. The
 * caller must free(*out_buf).
 *
 * Returns -1 if an argument is NULL, libcurl cannot be initialised, an option
 * cannot be set, or the transfer fails. On failure *out_buf is NULL and
 * *out_len is 0, so callers never see stale or indeterminate outputs.
 */
int fetch_url(const char *url, char **out_buf, size_t *out_len) {
    if (!url || !out_buf || !out_len) return -1;
    *out_buf = NULL;
    *out_len = 0;

    if (curl_global_init(CURL_GLOBAL_DEFAULT) != CURLE_OK) return -1;

    int rc = -1;
    struct mem chunk = {0};
    CURL *curl = curl_easy_init();
    if (!curl) goto out_global;

    if (curl_easy_setopt(curl, CURLOPT_URL, url) != CURLE_OK ||
        curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L) != CURLE_OK ||
        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb) != CURLE_OK ||
        curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&chunk) != CURLE_OK ||
        curl_easy_setopt(curl, CURLOPT_USERAGENT, "gscrape/0.1") != CURLE_OK) {
        goto out_easy;
    }

    if (curl_easy_perform(curl) != CURLE_OK) {
        /* Drop whatever part of the body had already been received. */
        free(chunk.buf);
        goto out_easy;
    }

    /* Ownership of the buffer moves to the caller. */
    *out_buf = chunk.buf;
    *out_len = chunk.size;
    rc = 0;

out_easy:
    curl_easy_cleanup(curl);
out_global:
    curl_global_cleanup();
    return rc;
}
/* Depth-first search through `node`, its following siblings and all their
 * descendants for the first element named "title" (case-insensitive).
 * Returns that node, or NULL when there is none.
 */
static xmlNode *find_title_node(xmlNode *node) {
    xmlNode *it = node;
    while (it != NULL) {
        if (it->type == XML_ELEMENT_NODE) {
            if (!xmlStrcasecmp(it->name, (const xmlChar *)"title")) {
                return it;
            }
            xmlNode *nested = find_title_node(it->children);
            if (nested != NULL) {
                return nested;
            }
        }
        it = it->next;
    }
    return NULL;
}
/* Parse `html` (`len` bytes) and return the text content of the first <title>
 * element as a newly allocated string (caller frees).
 *
 * Returns NULL if `html` is NULL or empty, `len` does not fit the int length
 * taken by htmlReadMemory(), the document cannot be parsed, no <title>
 * exists, or memory runs out.
 *
 * xmlCleanupParser() is deliberately not called here: it tears down
 * library-wide state and belongs at the very end of the program, not after
 * each document.
 */
char *extract_title(const char *html, size_t len) {
    if (!html || len == 0 || len > (size_t)INT_MAX) return NULL;

    htmlDocPtr doc = htmlReadMemory(html, (int)len, NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
    if (!doc) return NULL;

    char *title = NULL;
    xmlNode *title_node = find_title_node(xmlDocGetRootElement(doc));
    if (title_node) {
        xmlChar *content = xmlNodeGetContent(title_node);
        if (content) {
            title = strdup((const char *)content);
            xmlFree(content);
        }
    }

    xmlFreeDoc(doc);
    return title;
}
/* Helper: printf-style append. Formats `fmt` with the trailing arguments and
 * adds the text to the end of the heap string *out, whose current length
 * (terminator excluded) is *out_len; *out may be NULL when *out_len is 0.
 * On success *out and *out_len are updated and 0 is returned. On a formatting
 * or allocation error -1 is returned and both stay as they were.
 * Caller owns *out.
 */
static int append_fmt(char **out, size_t *out_len, const char *fmt, ...) {
    int status = -1;
    va_list args;
    va_list measure_args;

    va_start(args, fmt);

    /* First pass on a copy of the argument list: how long is the text? */
    va_copy(measure_args, args);
    const int text_len = vsnprintf(NULL, 0, fmt, measure_args);
    va_end(measure_args);

    if (text_len >= 0) {
        const size_t extra = (size_t)text_len;
        char *grown = realloc(*out, *out_len + extra + 1);
        if (grown != NULL) {
            *out = grown;
            /* Second pass: format directly behind the existing text. */
            vsnprintf(grown + *out_len, extra + 1, fmt, args);
            *out_len += extra;
            status = 0;
        }
    }

    va_end(args);
    return status;
}
/* Walk `node`, its following siblings and all descendants. For every <meta>
 * element that has a content attribute plus a name (preferred) or property
 * attribute, append one "key: value\n" line to *out / *out_len.
 * A failed append is ignored, so the result is best-effort.
 */
static void collect_meta_nodes(xmlNode *node, char **out, size_t *out_len) {
    for (xmlNode *el = node; el != NULL; el = el->next) {
        if (el->type != XML_ELEMENT_NODE) {
            continue;
        }

        if (xmlStrcasecmp(el->name, (const xmlChar *)"meta") == 0) {
            xmlChar *name_attr = xmlGetProp(el, (const xmlChar *)"name");
            xmlChar *prop_attr = xmlGetProp(el, (const xmlChar *)"property");
            xmlChar *value = xmlGetProp(el, (const xmlChar *)"content");

            /* name wins over property when both are present. */
            const xmlChar *label = (name_attr != NULL) ? name_attr : prop_attr;
            if (label != NULL && value != NULL) {
                append_fmt(out, out_len, "%s: %s\n", (const char *)label, (const char *)value);
            }

            if (name_attr != NULL) xmlFree(name_attr);
            if (prop_attr != NULL) xmlFree(prop_attr);
            if (value != NULL) xmlFree(value);
        }

        collect_meta_nodes(el->children, out, out_len);
    }
}
/* Parse `html` (`len` bytes) and collect every <meta> name/property + content
 * pair as newline-separated "key: value" lines in *out (caller frees).
 *
 * Returns 0 when at least one pair was collected. Returns -1, with *out set
 * to NULL, if `html` is NULL or empty, `len` does not fit the int length
 * taken by htmlReadMemory(), parsing fails, or nothing matched. Returns -1
 * without touching anything when `out` itself is NULL.
 *
 * xmlCleanupParser() is deliberately not called here: it tears down
 * library-wide state and belongs at the very end of the program.
 */
int extract_meta(const char *html, size_t len, char **out) {
    if (!out) return -1;
    *out = NULL;
    if (!html || len == 0 || len > (size_t)INT_MAX) return -1;

    htmlDocPtr doc = htmlReadMemory(html, (int)len, NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
    if (!doc) return -1;

    size_t out_len = 0;
    collect_meta_nodes(xmlDocGetRootElement(doc), out, &out_len);
    xmlFreeDoc(doc);

    if (out_len == 0) {
        free(*out);
        *out = NULL;
        return -1;
    }
    return 0;
}
/* Walk `node`, its following siblings and all descendants. For every <meta>
 * element whose property attribute starts with "og:" (case-insensitive) and
 * which has a content attribute, append one "property: content\n" line to
 * *out / *out_len. A failed append is ignored.
 */
static void collect_og_nodes(xmlNode *node, char **out, size_t *out_len) {
    xmlNode *el = node;
    while (el != NULL) {
        if (el->type == XML_ELEMENT_NODE) {
            const bool is_meta = xmlStrcasecmp(el->name, (const xmlChar *)"meta") == 0;
            if (is_meta) {
                xmlChar *property = xmlGetProp(el, (const xmlChar *)"property");
                xmlChar *value = xmlGetProp(el, (const xmlChar *)"content");

                if (property != NULL && value != NULL &&
                    strncasecmp((const char *)property, "og:", 3) == 0) {
                    append_fmt(out, out_len, "%s: %s\n", (const char *)property, (const char *)value);
                }

                if (property != NULL) xmlFree(property);
                if (value != NULL) xmlFree(value);
            }
            collect_og_nodes(el->children, out, out_len);
        }
        el = el->next;
    }
}
/* Parse `html` (`len` bytes) and collect every Open Graph tag
 * (<meta property="og:..." content="...">) as newline-separated
 * "property: content" lines in *out (caller frees).
 *
 * Returns 0 when at least one tag was collected. Returns -1, with *out set to
 * NULL, if `html` is NULL or empty, `len` does not fit the int length taken
 * by htmlReadMemory(), parsing fails, or nothing matched. Returns -1 without
 * touching anything when `out` itself is NULL.
 *
 * xmlCleanupParser() is deliberately not called here: it tears down
 * library-wide state and belongs at the very end of the program.
 */
int extract_og(const char *html, size_t len, char **out) {
    if (!out) return -1;
    *out = NULL;
    if (!html || len == 0 || len > (size_t)INT_MAX) return -1;

    htmlDocPtr doc = htmlReadMemory(html, (int)len, NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
    if (!doc) return -1;

    size_t out_len = 0;
    collect_og_nodes(xmlDocGetRootElement(doc), out, &out_len);
    xmlFreeDoc(doc);

    if (out_len == 0) {
        free(*out);
        *out = NULL;
        return -1;
    }
    return 0;
}
/* Return the text content of `node` as a string allocated with strdup(), so
 * the caller releases it with free() rather than xmlFree(). Returns NULL when
 * libxml2 reports no content or the copy cannot be allocated.
 */
static char *get_node_content(xmlNode *node) {
    char *copy = NULL;
    xmlChar *text = xmlNodeGetContent(node);
    if (text != NULL) {
        copy = strdup((const char *)text);
        xmlFree(text);
    }
    return copy;
}
/* Crude test for "this JSON-LD text describes a Product": it mentions both
 * "@type" (quoted) and Product, or contains the quoted word "Product".
 * Matching is case-insensitive.
 */
static bool jsonld_looks_like_product(const char *json) {
    return (strcasestr(json, "\"@type\"") && strcasestr(json, "Product")) ||
           strcasestr(json, "\"Product\"");
}

/* Depth-first, document-order search through `node`, its following siblings
 * and all descendants for the first <script type="application/ld+json"> whose
 * text passes jsonld_looks_like_product(). Returns that text as a malloc'd
 * string (caller frees), or NULL when none matches.
 */
static char *find_jsonld_product_text(xmlNode *node) {
    for (xmlNode *cur = node; cur; cur = cur->next) {
        if (cur->type != XML_ELEMENT_NODE) continue;

        if (xmlStrcasecmp(cur->name, (const xmlChar *)"script") == 0) {
            xmlChar *type = xmlGetProp(cur, (const xmlChar *)"type");
            char *content = NULL;
            if (type && xmlStrcasecmp(type, (const xmlChar *)"application/ld+json") == 0) {
                content = get_node_content(cur);
            }
            if (type) xmlFree(type);

            if (content) {
                if (jsonld_looks_like_product(content)) return content;
                free(content);
            }
        }

        char *nested = find_jsonld_product_text(cur->children);
        if (nested) return nested;
    }
    return NULL;
}

/* Parse `html` (`len` bytes) and return the raw text of the first JSON-LD
 * script that looks like a Product schema in *out_json (caller frees).
 *
 * The whole tree is searched at every depth, so scripts placed inside <head>
 * or <body> — which are not direct children of the root — are found too.
 *
 * Returns 0 on success. Returns -1, with *out_json set to NULL, if `html` is
 * NULL or empty, `len` does not fit the int length taken by htmlReadMemory(),
 * parsing fails, or no matching script exists. Returns -1 without touching
 * anything when `out_json` itself is NULL.
 *
 * xmlCleanupParser() is deliberately not called here: it tears down
 * library-wide state and belongs at the very end of the program.
 */
int extract_jsonld_product(const char *html, size_t len, char **out_json) {
    if (!out_json) return -1;
    *out_json = NULL;
    if (!html || len == 0 || len > (size_t)INT_MAX) return -1;

    htmlDocPtr doc = htmlReadMemory(html, (int)len, NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
    if (!doc) return -1;

    /* The returned text is an independent copy, so the document can go. */
    *out_json = find_jsonld_product_text(xmlDocGetRootElement(doc));
    xmlFreeDoc(doc);

    return *out_json ? 0 : -1;
}
/* Depth-first search through `node`, its following siblings and all their
 * descendants for the first element named "h1" (case-insensitive).
 * Returns that node, or NULL when there is none.
 */
static xmlNode *find_h1_node(xmlNode *node) {
    xmlNode *match = NULL;
    for (xmlNode *el = node; el != NULL && match == NULL; el = el->next) {
        if (el->type != XML_ELEMENT_NODE) {
            continue;
        }
        if (xmlStrcasecmp(el->name, (const xmlChar *)"h1") == 0) {
            match = el;
        } else {
            match = find_h1_node(el->children);
        }
    }
    return match;
}
/* Parse `html` (`len` bytes) and return the text content of the first <h1>
 * element as a newly allocated string (caller frees).
 *
 * Returns NULL if `html` is NULL or empty, `len` does not fit the int length
 * taken by htmlReadMemory(), the document cannot be parsed, no <h1> exists,
 * or memory runs out.
 *
 * xmlCleanupParser() is deliberately not called here: it tears down
 * library-wide state and belongs at the very end of the program.
 */
char *extract_h1(const char *html, size_t len) {
    if (!html || len == 0 || len > (size_t)INT_MAX) return NULL;

    htmlDocPtr doc = htmlReadMemory(html, (int)len, NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
    if (!doc) return NULL;

    char *res = NULL;
    xmlNode *h1 = find_h1_node(xmlDocGetRootElement(doc));
    if (h1) res = get_node_content(h1);

    xmlFreeDoc(doc);
    return res;
}
/* Heuristic product extraction */

/* Case-insensitive substring test on an attribute value: true when `needle`
 * occurs in `attr`. A NULL `attr` or `needle` yields false.
 */
static bool attr_contains(const xmlChar *attr, const char *needle) {
    if (attr == NULL || needle == NULL) {
        return false;
    }
    const char *hit = strcasestr((const char *)attr, needle);
    return hit != NULL;
}
/* Search `node`, its following siblings and their descendants (depth-first)
 * for text usable as a product name. Per element, in this order:
 *   1. an h1-h4 heading whose text is 2..199 bytes long;
 *   2. a non-anchor element whose class or id contains "title", "name" or
 *      "product" and whose text is 1..199 bytes long (anchors are skipped
 *      here because their text may include the price);
 *   3. whatever the element's children yield.
 * Returns a malloc'd string (caller frees) or NULL when nothing qualifies.
 */
static char *find_name_in_node(xmlNode *node) {
    static const char *const heading_tags[] = { "h1", "h2", "h3", "h4" };
    static const char *const name_hints[] = { "title", "name", "product" };

    for (xmlNode *el = node; el != NULL; el = el->next) {
        if (el->type != XML_ELEMENT_NODE) {
            continue;
        }

        /* 1. headings */
        bool is_heading = false;
        for (size_t i = 0; i < sizeof heading_tags / sizeof heading_tags[0] && !is_heading; ++i) {
            is_heading = xmlStrcasecmp(el->name, (const xmlChar *)heading_tags[i]) == 0;
        }
        if (is_heading) {
            char *heading_text = get_node_content(el);
            if (heading_text != NULL) {
                const size_t n = strlen(heading_text);
                if (n > 1 && n < 200) {
                    return heading_text;
                }
                free(heading_text);
            }
        }

        /* 2. elements tagged through class/id */
        xmlChar *cls = xmlGetProp(el, (const xmlChar *)"class");
        xmlChar *id = xmlGetProp(el, (const xmlChar *)"id");
        bool hinted = false;
        for (size_t i = 0; i < sizeof name_hints / sizeof name_hints[0] && !hinted; ++i) {
            /* attr_contains() is false for a missing (NULL) attribute. */
            hinted = attr_contains(cls, name_hints[i]) || attr_contains(id, name_hints[i]);
        }

        char *tagged_text = NULL;
        if (hinted && xmlStrcasecmp(el->name, (const xmlChar *)"a") != 0) {
            tagged_text = get_node_content(el);
            if (tagged_text != NULL) {
                const size_t n = strlen(tagged_text);
                if (n == 0 || n >= 200) {
                    free(tagged_text);
                    tagged_text = NULL;
                }
            }
        }
        if (cls != NULL) xmlFree(cls);
        if (id != NULL) xmlFree(id);
        if (tagged_text != NULL) {
            return tagged_text;
        }

        /* 3. descend */
        char *nested = find_name_in_node(el->children);
        if (nested != NULL) {
            return nested;
        }
    }
    return NULL;
}
/* Depth-first search through `node`, its following siblings and their
 * descendants for the first <a> element carrying an href attribute.
 * Returns a strdup'd copy of that href (caller frees). Returns NULL when no
 * such anchor exists, or when the copy of the first href found cannot be
 * allocated.
 */
static char *find_href_in_node(xmlNode *node) {
    xmlNode *el = node;
    while (el != NULL) {
        if (el->type == XML_ELEMENT_NODE) {
            xmlChar *href = NULL;
            if (xmlStrcasecmp(el->name, (const xmlChar *)"a") == 0) {
                href = xmlGetProp(el, (const xmlChar *)"href");
            }
            if (href != NULL) {
                char *copy = strdup((const char *)href);
                xmlFree(href);
                return copy;
            }

            /* An anchor without href is searched like any other element. */
            char *nested = find_href_in_node(el->children);
            if (nested != NULL) {
                return nested;
            }
        }
        el = el->next;
    }
    return NULL;
}
/* True when `text` contains a currency marker: '$', the pound sign, the euro
 * sign, "USD" or "EUR". The pound and euro signs are multi-byte in UTF-8, so
 * they are matched as byte sequences with strstr(); strchr() compares a
 * single byte only and cannot match them correctly.
 */
static bool text_has_currency_marker(const char *text) {
    return strchr(text, '$') != NULL ||
           strstr(text, "\xC2\xA3") != NULL ||      /* U+00A3 POUND SIGN */
           strstr(text, "\xE2\x82\xAC") != NULL ||  /* U+20AC EURO SIGN */
           strstr(text, "USD") != NULL ||
           strstr(text, "EUR") != NULL;
}

/* Search `node`, its following siblings and their descendants (depth-first)
 * for text usable as a price. Per element, in this order:
 *   1. its class or id contains "price" and its text is 1..199 bytes long;
 *   2. it is not an <a>, <h1> or <h2> (containers that tend to mix title and
 *      price), its text is shorter than 200 bytes and contains a currency
 *      marker;
 *   3. whatever the element's children yield.
 * Returns a malloc'd string (caller frees) or NULL when nothing qualifies.
 *
 * The class/id attributes are released exactly once per element, before any
 * return path, so a rejected price-tagged element cannot cause a double free.
 */
static char *find_price_in_node(xmlNode *node) {
    for (xmlNode *cur = node; cur; cur = cur->next) {
        if (cur->type != XML_ELEMENT_NODE) continue;

        xmlChar *cls = xmlGetProp(cur, (const xmlChar *)"class");
        xmlChar *id = xmlGetProp(cur, (const xmlChar *)"id");
        /* attr_contains() is false for a missing (NULL) attribute. */
        bool tagged = attr_contains(cls, "price") || attr_contains(id, "price");
        if (cls) xmlFree(cls);
        if (id) xmlFree(id);

        char *txt = get_node_content(cur);
        if (txt) {
            size_t n = strlen(txt);
            if (tagged && n > 0 && n < 200) return txt;

            bool container = xmlStrcasecmp(cur->name, (const xmlChar *)"a") == 0 ||
                             xmlStrcasecmp(cur->name, (const xmlChar *)"h1") == 0 ||
                             xmlStrcasecmp(cur->name, (const xmlChar *)"h2") == 0;
            if (!container && n < 200 && text_has_currency_marker(txt)) return txt;

            free(txt);
        }

        char *res = find_price_in_node(cur->children);
        if (res) return res;
    }
    return NULL;
}
/* Collapse every run of whitespace in `s` to a single space and strip leading
 * and trailing whitespace, in place. A NULL `s` is ignored.
 */
static void product_squeeze_spaces(char *s) {
    if (!s) return;
    char *w = s;
    bool gap = false;
    for (const char *r = s; *r; ++r) {
        if (isspace((unsigned char)*r)) {
            gap = (w != s); /* leading whitespace never opens a gap */
            continue;
        }
        if (gap) {
            *w++ = ' ';
            gap = false;
        }
        *w++ = *r;
    }
    *w = '\0'; /* a trailing gap is simply dropped */
}

/* Return the href for a product element: the element's own href when it is an
 * <a>, otherwise the first <a href> among its descendants. Siblings of `node`
 * are never consulted. Returns a malloc'd string (caller frees) or NULL.
 */
static char *product_own_link(xmlNode *node) {
    if (node->type == XML_ELEMENT_NODE &&
        xmlStrcasecmp(node->name, (const xmlChar *)"a") == 0) {
        xmlChar *href = xmlGetProp(node, (const xmlChar *)"href");
        if (href) {
            char *copy = strdup((const char *)href);
            xmlFree(href);
            if (copy) return copy;
        }
    }
    return find_href_in_node(node->children);
}

/* Collect product information for a node that appears to be a product item.
 * Looks for h2.product-name and span.price preferentially, falls back to
 * generic name/price finders.
 *
 * When a name or a price was found, one "name | price | url" line is appended
 * to *out / *out_len and *found is incremented; missing parts are written as
 * <no-name>, <no-price> and <no-link>. *found is only incremented when the
 * line was actually appended.
 *
 * Every field is whitespace-normalised first: HTML text content routinely
 * contains newlines and indentation, and a newline inside a field would split
 * one product across several lines of the line-based output.
 */
static void collect_product_from_node(xmlNode *node, char **out, size_t *out_len, int *found) {
    if (!node) return;
    char *name = NULL;
    char *price = NULL;

    /* Prefer a direct <h2> child with a well-known product-title class. */
    for (xmlNode *c = node->children; c && !name; c = c->next) {
        if (c->type != XML_ELEMENT_NODE) continue;
        xmlChar *cls = xmlGetProp(c, (const xmlChar *)"class");
        if (cls && xmlStrcasecmp(c->name, (const xmlChar *)"h2") == 0 &&
            (attr_contains(cls, "product-name") ||
             attr_contains(cls, "woocommerce-loop-product__title") ||
             attr_contains(cls, "product-title"))) {
            name = get_node_content(c);
        }
        if (cls) xmlFree(cls);
    }

    /* Price: a direct child whose class or data-products contains "price". */
    for (xmlNode *c = node->children; c && !price; c = c->next) {
        if (c->type != XML_ELEMENT_NODE) continue;
        xmlChar *cls = xmlGetProp(c, (const xmlChar *)"class");
        xmlChar *dp = xmlGetProp(c, (const xmlChar *)"data-products");
        if (attr_contains(cls, "price") || attr_contains(dp, "price")) {
            price = get_node_content(c);
        }
        if (cls) xmlFree(cls);
        if (dp) xmlFree(dp);
    }

    /* Fallbacks */
    if (!name) name = find_name_in_node(node->children);
    if (!price) price = find_price_in_node(node->children);

    /* Link: restricted to this product's own subtree. */
    char *plink = product_own_link(node);

    if (name || price) {
        product_squeeze_spaces(name);
        product_squeeze_spaces(price);
        product_squeeze_spaces(plink);
        /* Literal placeholders need no allocation and therefore cannot fail. */
        if (append_fmt(out, out_len, "%s | %s | %s\n",
                       name ? name : "<no-name>",
                       price ? price : "<no-price>",
                       plink ? plink : "<no-link>") == 0) {
            (*found)++;
        }
    }

    free(name);
    free(price);
    free(plink);
}
/* Walk `node`, its following siblings and all descendants. Every element that
 * looks like a product item is handed to collect_product_from_node(). An
 * element counts as a product when
 *   - its data-products attribute contains "item", or
 *   - its itemtype attribute contains "Product", or
 *   - it is an <li> whose class contains "product"
 * (all substring tests are case-insensitive). The children of a product
 * element are still traversed afterwards.
 */
static void traverse_and_collect_products(xmlNode *node, char **out, size_t *out_len, int *found) {
    xmlNode *el = node;
    for (; el != NULL; el = el->next) {
        if (el->type == XML_ELEMENT_NODE) {
            xmlChar *dp_attr = xmlGetProp(el, (const xmlChar *)"data-products");
            xmlChar *type_attr = xmlGetProp(el, (const xmlChar *)"itemtype");
            xmlChar *class_attr = xmlGetProp(el, (const xmlChar *)"class");

            const bool by_data = dp_attr != NULL && attr_contains(dp_attr, "item");
            const bool by_schema = type_attr != NULL && attr_contains(type_attr, "Product");
            const bool by_list_item = class_attr != NULL &&
                                      xmlStrcasecmp(el->name, (const xmlChar *)"li") == 0 &&
                                      attr_contains(class_attr, "product");

            if (by_data || by_schema || by_list_item) {
                collect_product_from_node(el, out, out_len, found);
            }

            if (dp_attr != NULL) xmlFree(dp_attr);
            if (type_attr != NULL) xmlFree(type_attr);
            if (class_attr != NULL) xmlFree(class_attr);

            if (el->children != NULL) {
                traverse_and_collect_products(el->children, out, out_len, found);
            }
        }
    }
}
/* Parse `html` (`len` bytes) and collect product listings, one per line, as
 * "name | price | url" in *out (caller frees).
 *
 * Returns 0 when at least one product was collected. Returns -1, with *out
 * set to NULL, if `html` is NULL or empty, `len` does not fit the int length
 * taken by htmlReadMemory(), parsing fails, or no product was found. Returns
 * -1 without touching anything when `out` itself is NULL.
 *
 * xmlCleanupParser() is deliberately not called here: it tears down
 * library-wide state and belongs at the very end of the program.
 */
int extract_products(const char *html, size_t len, char **out) {
    if (!out) return -1;
    *out = NULL;
    if (!html || len == 0 || len > (size_t)INT_MAX) return -1;

    htmlDocPtr doc = htmlReadMemory(html, (int)len, NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
    if (!doc) return -1;

    size_t out_len = 0;
    int found = 0;
    traverse_and_collect_products(xmlDocGetRootElement(doc), out, &out_len, &found);
    xmlFreeDoc(doc);

    if (found == 0) {
        free(*out);
        *out = NULL;
        return -1;
    }
    return 0;
}