Initial Commit

2025-12-04 18:34:35 -07:00 · 2025-12-04 18:34:35 -07:00 · 2afe840ede
commit 2afe840ede
5 changed files with 783 additions and 0 deletions
--- a/13
+++ b/13
@ -0,0 +1,13 @@
+# Build configuration for the gscrape command-line scraper.
+CC=gcc
+# := evaluates pkg-config once at parse time instead of on every expansion.
+CFLAGS:=-Iinclude -Wall $(shell pkg-config --cflags libxml-2.0 libcurl)
+LDFLAGS:=$(shell pkg-config --libs libxml-2.0 libcurl)
+SRCS=src/main.c src/scraper.c
+HDRS=include/scraper.h
+TARGET=gscrape
+
+# all and clean name actions, not files; without .PHONY a stray file called
+# "all" or "clean" would make the rule look up to date.
+.PHONY: all clean
+
+all: $(TARGET)
+
+# The header is a prerequisite so that editing it triggers a rebuild. The
+# recipe names $(SRCS) explicitly because $^ would also hand the header to
+# the compiler.
+$(TARGET): $(SRCS) $(HDRS)
+	$(CC) $(CFLAGS) -o $@ $(SRCS) $(LDFLAGS)
+
+clean:
+	rm -f $(TARGET) *.o src/*.o
--- a/BIN
+++ b/BIN
--- a/include/scraper.h
+++ b/include/scraper.h
@ -0,0 +1,33 @@
+#ifndef SCRAPER_H
+#define SCRAPER_H
+
+#include <stdlib.h>
+
+/* Download the resource at url. Returns 0 on success, storing the response
+ * body in *out_buf (caller frees) and its length in bytes in *out_len.
+ * Returns -1 if an argument is NULL or the transfer fails.
+ */
+int fetch_url(const char *url, char **out_buf, size_t *out_len);
+
+/* Extract the text content of the first <title> element as a newly
+ * allocated string (caller frees), or NULL if none.
+ */
+char *extract_title(const char *html, size_t len);
+
+/* Extract all <meta name=... content=...> and <meta property=... content=...>
+ * tags. Returns 0 on success and allocates *out with a newline-separated list
+ * of "key: value" lines; the caller must free(*out). Returns -1 on invalid
+ * input, on a parse failure, or when no such tag is found.
+ */
+int extract_meta(const char *html, size_t len, char **out);
+
+/* Extract Open Graph tags (meta property="og:...") similarly. */
+int extract_og(const char *html, size_t len, char **out);
+
+/* Extract the first <script type="application/ld+json"> that looks like a
+ * Product schema. Returns 0 and allocates *out_json (caller frees) or
+ * returns -1 if not found.
+ */
+int extract_jsonld_product(const char *html, size_t len, char **out_json);
+
+/* Extract the text content of the first <h1> element, or NULL if none. */
+char *extract_h1(const char *html, size_t len);
+
+/* Extract product listings (one per line) as "name | price | url".
+ * Returns 0 and allocates *out on success (caller frees), or -1.
+ * Each line represents one product; fields are separated by '|'.
+ */
+int extract_products(const char *html, size_t len, char **out);
+
+#endif /* SCRAPER_H */
--- a/src/main.c
+++ b/src/main.c
@ -0,0 +1,217 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "scraper.h"
+#include <string.h>
+
+/* Return s as a newly allocated, double-quoted JSON string literal.
+ *
+ * '"' and '\\' are backslash-escaped, \b \f \n \r \t use their short forms,
+ * and every other byte below 0x20 becomes \u00XX (lowercase hex). All other
+ * bytes are copied through unchanged. A NULL input yields the empty literal
+ * "". Returns NULL if the allocation fails; the caller frees the result.
+ *
+ * The output size is computed up front so that exactly one allocation is
+ * made and no realloc failure can leak or dereference a NULL buffer.
+ */
+static char *json_escape(const char *s) {
+    static const char hex[] = "0123456789abcdef";
+    if (!s) s = "";
+
+    /* Pass 1: measure. Start at 3 for the two quotes and the terminator. */
+    size_t need = 3;
+    for (const unsigned char *p = (const unsigned char *)s; *p; ++p) {
+        switch (*p) {
+        case '"': case '\\':
+        case '\b': case '\f': case '\n': case '\r': case '\t':
+            need += 2;
+            break;
+        default:
+            need += (*p < 0x20) ? 6 : 1;
+            break;
+        }
+    }
+
+    char *out = malloc(need);
+    if (!out) return NULL;
+
+    /* Pass 2: fill. */
+    char *w = out;
+    *w++ = '"';
+    for (const unsigned char *p = (const unsigned char *)s; *p; ++p) {
+        unsigned char c = *p;
+        char shorthand = 0;
+        switch (c) {
+        case '"':  shorthand = '"';  break;
+        case '\\': shorthand = '\\'; break;
+        case '\b': shorthand = 'b';  break;
+        case '\f': shorthand = 'f';  break;
+        case '\n': shorthand = 'n';  break;
+        case '\r': shorthand = 'r';  break;
+        case '\t': shorthand = 't';  break;
+        default: break;
+        }
+        if (shorthand) {
+            *w++ = '\\';
+            *w++ = shorthand;
+        } else if (c < 0x20) {
+            *w++ = '\\';
+            *w++ = 'u';
+            *w++ = '0';
+            *w++ = '0';
+            *w++ = hex[c >> 4];
+            *w++ = hex[c & 0x0f];
+        } else {
+            *w++ = (char)c;
+        }
+    }
+    *w++ = '"';
+    *w = '\0';
+    return out;
+}
+
+/* Copy the n bytes at start into a temporary C string and return its JSON
+ * string literal (caller frees), or NULL if an allocation fails.
+ */
+static char *kv_escape_span(const char *start, size_t n) {
+    char *tmp = malloc(n + 1);
+    if (!tmp) return NULL;
+    memcpy(tmp, start, n);
+    tmp[n] = '\0';
+    char *escaped = json_escape(tmp);
+    free(tmp);
+    return escaped;
+}
+
+/* Locate the colon that separates key from value in one "key: value" line.
+ *
+ * Keys such as "og:title" contain colons themselves, so the first colon that
+ * is followed by a space or tab is preferred. If there is none, the first
+ * colon in the line is used. Returns NULL when the line has no colon.
+ */
+static const char *kv_find_separator(const char *line, size_t linelen) {
+    const char *end = line + linelen;
+    const char *first = memchr(line, ':', linelen);
+    for (const char *c = first; c; c = memchr(c + 1, ':', (size_t)(end - (c + 1)))) {
+        if (c + 1 < end && (c[1] == ' ' || c[1] == '\t')) return c;
+    }
+    return first;
+}
+
+/* Print newline-separated "key: value" lines as one JSON object on stdout.
+ *
+ * A NULL argument prints the JSON literal null. Empty lines and lines
+ * without a colon are skipped, and so is any entry whose key or value cannot
+ * be escaped because an allocation failed. Spaces and tabs after the
+ * separator are not part of the value.
+ */
+static void print_kv_json(const char *kv_lines) {
+    if (!kv_lines) { printf("null"); return; }
+    printf("{");
+    int first = 1;
+    const char *p = kv_lines;
+    while (*p) {
+        const char *line_end = strchr(p, '\n');
+        size_t linelen = line_end ? (size_t)(line_end - p) : strlen(p);
+        const char *sep = linelen > 0 ? kv_find_separator(p, linelen) : NULL;
+        if (sep) {
+            const char *eol = p + linelen;
+            const char *vstart = sep + 1;
+            while (vstart < eol && (*vstart == ' ' || *vstart == '\t')) vstart++;
+            char *ek = kv_escape_span(p, (size_t)(sep - p));
+            char *ev = kv_escape_span(vstart, (size_t)(eol - vstart));
+            if (ek && ev) {
+                printf("%s%s:%s", first ? "" : ",", ek, ev);
+                first = 0;
+            }
+            free(ek);
+            free(ev);
+        }
+        if (!line_end) break;
+        p = line_end + 1;
+    }
+    printf("}");
+}
+
+/* Return the JSON string literal for the bytes in [start, end) with spaces
+ * and tabs trimmed from both sides (caller frees), or NULL if an allocation
+ * fails. An empty range yields the empty literal.
+ */
+static char *product_field_json(const char *start, const char *end) {
+    while (start < end && (*start == ' ' || *start == '\t')) start++;
+    while (end > start && (end[-1] == ' ' || end[-1] == '\t')) end--;
+    size_t n = (size_t)(end - start);
+    char *tmp = malloc(n + 1);
+    if (!tmp) return NULL;
+    memcpy(tmp, start, n);
+    tmp[n] = '\0';
+    char *escaped = json_escape(tmp);
+    free(tmp);
+    return escaped;
+}
+
+/* Print newline-separated "name | price | url" lines as a JSON array of
+ * {"name":...,"price":...,"url":...} objects on stdout.
+ *
+ * A NULL argument prints the JSON literal null. Each non-empty line is split
+ * at its first two '|' characters; everything after the second one is the
+ * url. Missing trailing fields become empty strings, so a line without any
+ * '|' is treated as a bare name. Every field is trimmed on both sides, which
+ * keeps the padding around " | " out of the values. A record is skipped if
+ * any of its fields cannot be allocated.
+ */
+static void print_products_json(const char *products_lines) {
+    if (!products_lines) { printf("null"); return; }
+    printf("[");
+    int first = 1;
+    const char *p = products_lines;
+    while (*p) {
+        const char *line_end = strchr(p, '\n');
+        size_t linelen = line_end ? (size_t)(line_end - p) : strlen(p);
+        if (linelen > 0) {
+            const char *eol = p + linelen;
+            const char *s1 = memchr(p, '|', linelen);
+            const char *s2 = s1 ? memchr(s1 + 1, '|', (size_t)(eol - (s1 + 1))) : NULL;
+
+            /* An absent field collapses to the empty range [eol, eol). */
+            char *en = product_field_json(p, s1 ? s1 : eol);
+            char *ep = product_field_json(s1 ? s1 + 1 : eol, s2 ? s2 : eol);
+            char *eu = product_field_json(s2 ? s2 + 1 : eol, eol);
+            if (en && ep && eu) {
+                if (!first) printf(",");
+                first = 0;
+                printf("{\"name\":%s,\"price\":%s,\"url\":%s}", en, ep, eu);
+            }
+            free(en);
+            free(ep);
+            free(eu);
+        }
+        if (!line_end) break;
+        p = line_end + 1;
+    }
+    printf("]");
+}
+
+/* Print one `"key":<string>,` member of the top-level object. A NULL value,
+ * or a value whose escaping fails, is written as JSON null so that a NULL
+ * pointer is never handed to printf's %s.
+ */
+static void emit_string_member(const char *key, const char *value) {
+    char *escaped = value ? json_escape(value) : NULL;
+    if (escaped) {
+        printf("\"%s\":%s,", key, escaped);
+    } else {
+        printf("\"%s\":null,", key);
+    }
+    free(escaped);
+}
+
+/* Fetch the URL given as the only argument and print one JSON object with
+ * the members url, title, h1, meta, og, jsonld_product and products, in
+ * that order, followed by a newline. Members that cannot be extracted are
+ * null.
+ *
+ * Returns 2 on a usage error, 1 if the fetch fails, 0 otherwise.
+ */
+int main(int argc, char **argv) {
+    if (argc != 2) {
+        /* argv[0] may be NULL when argc is 0. */
+        fprintf(stderr, "Usage: %s <url>\n",
+                (argc > 0 && argv[0]) ? argv[0] : "gscrape");
+        return 2;
+    }
+
+    const char *url = argv[1];
+    char *html = NULL;
+    size_t html_len = 0;
+
+    if (fetch_url(url, &html, &html_len) != 0) {
+        fprintf(stderr, "Failed to fetch URL: %s\n", url);
+        return 1;
+    }
+
+    printf("{");
+    emit_string_member("url", url);
+
+    char *title = extract_title(html, html_len);
+    emit_string_member("title", title);
+    free(title);
+
+    char *h1 = extract_h1(html, html_len);
+    emit_string_member("h1", h1);
+    free(h1);
+
+    char *meta = NULL;
+    if (extract_meta(html, html_len, &meta) == 0 && meta) {
+        printf("\"meta\":");
+        print_kv_json(meta);
+        printf(",");
+    } else {
+        printf("\"meta\":null,");
+    }
+    free(meta);
+
+    char *og = NULL;
+    if (extract_og(html, html_len, &og) == 0 && og) {
+        printf("\"og\":");
+        print_kv_json(og);
+        printf(",");
+    } else {
+        printf("\"og\":null,");
+    }
+    free(og);
+
+    /* The raw JSON-LD text is embedded as a JSON string, not as an object. */
+    char *jsonld = NULL;
+    if (extract_jsonld_product(html, html_len, &jsonld) != 0) {
+        free(jsonld);
+        jsonld = NULL;
+    }
+    emit_string_member("jsonld_product", jsonld);
+    free(jsonld);
+
+    /* Last member: no trailing comma. */
+    char *products = NULL;
+    if (extract_products(html, html_len, &products) == 0 && products) {
+        printf("\"products\":");
+        print_products_json(products);
+    } else {
+        printf("\"products\":null");
+    }
+    free(products);
+
+    printf("}\n");
+
+    free(html);
+    return 0;
+}
--- a/src/scraper.c
+++ b/src/scraper.c
@ -0,0 +1,520 @@
+/* strcasestr() is a non-standard extension; depending on the C library it
+ * may only be declared when _GNU_SOURCE is defined before the first system
+ * header is included.
+ */
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include "scraper.h"
+
+#include <ctype.h>
+#include <limits.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include <strings.h>
+
+#include <curl/curl.h>
+#include <libxml/HTMLparser.h>
+#include <libxml/tree.h>
+
+/* Growable byte buffer that write_cb appends received data to. */
+struct mem {
+    char *buf;   /* heap buffer; write_cb keeps it NUL-terminated */
+    size_t size; /* bytes stored in buf, not counting the terminator */
+};
+
+/* Write callback installed via CURLOPT_WRITEFUNCTION: appends the
+ * size * nmemb bytes at ptr to the struct mem passed as userdata and keeps
+ * the buffer NUL-terminated. Returns the number of bytes consumed, or 0 if
+ * the buffer cannot be grown.
+ */
+static size_t write_cb(void *ptr, size_t size, size_t nmemb, void *userdata) {
+    struct mem *sink = (struct mem *)userdata;
+    const size_t incoming = size * nmemb;
+    const size_t total = sink->size + incoming;
+
+    char *grown = realloc(sink->buf, total + 1);
+    if (grown == NULL) {
+        return 0;
+    }
+
+    memcpy(grown + sink->size, ptr, incoming);
+    grown[total] = '\0';
+    sink->buf = grown;
+    sink->size = total;
+    return incoming;
+}
+
+/* Download url with libcurl, following redirects, and hand the body to the
+ * caller.
+ *
+ * On success returns 0 and stores the heap buffer in *out_buf (caller frees)
+ * and its length in *out_len. The buffer is whatever write_cb accumulated,
+ * so it stays NULL if the callback was never invoked. On any failure returns
+ * -1 with *out_buf set to NULL and *out_len set to 0. Returns -1 without
+ * touching the outputs if any argument is NULL.
+ *
+ * NOTE(review): CURLOPT_FAILONERROR is not set, so HTTP error statuses are
+ * presumably still reported as success; confirm that this is intended.
+ */
+int fetch_url(const char *url, char **out_buf, size_t *out_len) {
+    if (!url || !out_buf || !out_len) return -1;
+
+    /* Give the outputs a defined value on every failure path. */
+    *out_buf = NULL;
+    *out_len = 0;
+
+    if (curl_global_init(CURL_GLOBAL_DEFAULT) != CURLE_OK) return -1;
+
+    int rc = -1;
+    struct mem chunk = {0};
+    CURL *curl = curl_easy_init();
+    if (!curl) goto out_global;
+
+    curl_easy_setopt(curl, CURLOPT_URL, url);
+    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb);
+    curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&chunk);
+    curl_easy_setopt(curl, CURLOPT_USERAGENT, "gscrape/0.1");
+
+    if (curl_easy_perform(curl) == CURLE_OK) {
+        *out_buf = chunk.buf; /* ownership moves to the caller */
+        *out_len = chunk.size;
+        rc = 0;
+    } else {
+        free(chunk.buf);
+    }
+    curl_easy_cleanup(curl);
+
+out_global:
+    curl_global_cleanup();
+    return rc;
+}
+
+/* Depth-first search through node, its following siblings and all of their
+ * descendants for the first element named "title" (case-insensitive).
+ * Returns the element, or NULL if there is none.
+ */
+static xmlNode *find_title_node(xmlNode *node) {
+    xmlNode *cur = node;
+    while (cur != NULL) {
+        if (cur->type == XML_ELEMENT_NODE) {
+            if (!xmlStrcasecmp(cur->name, (const xmlChar *)"title")) {
+                return cur;
+            }
+            xmlNode *nested = find_title_node(cur->children);
+            if (nested != NULL) {
+                return nested;
+            }
+        }
+        cur = cur->next;
+    }
+    return NULL;
+}
+
+/* Parse html (len bytes) and return a newly allocated copy of the text of
+ * the first <title> element; the caller frees it. Returns NULL if html is
+ * NULL or empty, if len does not fit the int length that htmlReadMemory
+ * takes, if parsing fails, or if there is no <title>.
+ *
+ * xmlCleanupParser() is deliberately not called here: libxml2 documents it
+ * as a process-exit-only call, and the other extract_* functions use the
+ * library again after this one returns.
+ */
+char *extract_title(const char *html, size_t len) {
+    if (!html || len == 0 || len > (size_t)INT_MAX) return NULL;
+    htmlDocPtr doc = htmlReadMemory(html, (int)len, NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
+    if (!doc) return NULL;
+
+    char *title = NULL;
+    xmlNode *title_node = find_title_node(xmlDocGetRootElement(doc));
+    if (title_node) {
+        xmlChar *content = xmlNodeGetContent(title_node);
+        if (content) {
+            title = strdup((const char *)content);
+            xmlFree(content);
+        }
+    }
+
+    xmlFreeDoc(doc);
+    return title;
+}
+
+/* Append printf-style formatted text to the heap string *out, whose current
+ * length (without terminator) is *out_len. The buffer is grown with realloc
+ * and both *out and *out_len are updated. Returns 0 on success; returns -1
+ * and leaves *out and *out_len untouched if formatting or allocation fails.
+ * The caller owns *out.
+ */
+static int append_fmt(char **out, size_t *out_len, const char *fmt, ...) {
+    int status = -1;
+    va_list measure;
+    va_list render;
+
+    va_start(measure, fmt);
+    va_copy(render, measure);
+
+    /* First pass only measures; the second pass writes into the new tail. */
+    const int needed = vsnprintf(NULL, 0, fmt, measure);
+    va_end(measure);
+
+    if (needed >= 0) {
+        const size_t extra = (size_t)needed;
+        char *grown = realloc(*out, *out_len + extra + 1);
+        if (grown != NULL) {
+            *out = grown;
+            vsnprintf(grown + *out_len, extra + 1, fmt, render);
+            *out_len += extra;
+            status = 0;
+        }
+    }
+
+    va_end(render);
+    return status;
+}
+
+/* Walk node, its following siblings and all descendants, appending one
+ * "key: content\n" line to *out for every <meta> element that has a content
+ * attribute and a name or property attribute. name wins when both are set.
+ */
+static void collect_meta_nodes(xmlNode *node, char **out, size_t *out_len) {
+    for (xmlNode *cur = node; cur != NULL; cur = cur->next) {
+        if (cur->type != XML_ELEMENT_NODE) {
+            continue;
+        }
+
+        if (xmlStrcasecmp(cur->name, (const xmlChar *)"meta") == 0) {
+            xmlChar *name = xmlGetProp(cur, (const xmlChar *)"name");
+            xmlChar *prop = xmlGetProp(cur, (const xmlChar *)"property");
+            xmlChar *content = xmlGetProp(cur, (const xmlChar *)"content");
+
+            const xmlChar *key = (name != NULL) ? name : prop;
+            if (key != NULL && content != NULL) {
+                append_fmt(out, out_len, "%s: %s\n", (const char *)key, (const char *)content);
+            }
+
+            if (name != NULL) xmlFree(name);
+            if (prop != NULL) xmlFree(prop);
+            if (content != NULL) xmlFree(content);
+        }
+
+        collect_meta_nodes(cur->children, out, out_len);
+    }
+}
+
+/* Parse html (len bytes) and collect every <meta> name/property + content
+ * pair as newline-separated "key: value" lines.
+ *
+ * Returns 0 and stores the heap string in *out (caller frees) when at least
+ * one pair was found. Returns -1 on invalid arguments, when len does not fit
+ * the int length that htmlReadMemory takes, on a parse failure, or when
+ * nothing was found; *out is NULL in every failure case where out itself is
+ * non-NULL and the input passed validation.
+ *
+ * xmlCleanupParser() is deliberately not called here: libxml2 documents it
+ * as a process-exit-only call, and the library is used again afterwards.
+ */
+int extract_meta(const char *html, size_t len, char **out) {
+    if (!html || len == 0 || len > (size_t)INT_MAX || !out) return -1;
+    *out = NULL;
+
+    htmlDocPtr doc = htmlReadMemory(html, (int)len, NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
+    if (!doc) return -1;
+
+    size_t out_len = 0;
+    collect_meta_nodes(xmlDocGetRootElement(doc), out, &out_len);
+    xmlFreeDoc(doc);
+
+    if (out_len == 0) {
+        free(*out);
+        *out = NULL;
+        return -1;
+    }
+    return 0;
+}
+
+/* Walk node, its following siblings and all descendants, appending one
+ * "property: content\n" line to *out for every <meta> element whose
+ * property attribute starts with "og:" (case-insensitive) and which has a
+ * content attribute.
+ */
+static void collect_og_nodes(xmlNode *node, char **out, size_t *out_len) {
+    for (xmlNode *cur = node; cur != NULL; cur = cur->next) {
+        if (cur->type != XML_ELEMENT_NODE) {
+            continue;
+        }
+
+        if (xmlStrcasecmp(cur->name, (const xmlChar *)"meta") == 0) {
+            xmlChar *prop = xmlGetProp(cur, (const xmlChar *)"property");
+            xmlChar *content = xmlGetProp(cur, (const xmlChar *)"content");
+
+            const bool have_both = (prop != NULL) && (content != NULL);
+            if (have_both && strncasecmp((const char *)prop, "og:", 3) == 0) {
+                append_fmt(out, out_len, "%s: %s\n", (const char *)prop, (const char *)content);
+            }
+
+            if (prop != NULL) xmlFree(prop);
+            if (content != NULL) xmlFree(content);
+        }
+
+        collect_og_nodes(cur->children, out, out_len);
+    }
+}
+
+/* Parse html (len bytes) and collect every <meta property="og:..."> tag
+ * that has a content attribute as newline-separated "property: content"
+ * lines.
+ *
+ * Returns 0 and stores the heap string in *out (caller frees) when at least
+ * one tag was found. Returns -1 on invalid arguments, when len does not fit
+ * the int length that htmlReadMemory takes, on a parse failure, or when
+ * nothing was found; *out is NULL in every failure case where out itself is
+ * non-NULL and the input passed validation.
+ *
+ * xmlCleanupParser() is deliberately not called here: libxml2 documents it
+ * as a process-exit-only call, and the library is used again afterwards.
+ */
+int extract_og(const char *html, size_t len, char **out) {
+    if (!html || len == 0 || len > (size_t)INT_MAX || !out) return -1;
+    *out = NULL;
+
+    htmlDocPtr doc = htmlReadMemory(html, (int)len, NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
+    if (!doc) return -1;
+
+    size_t out_len = 0;
+    collect_og_nodes(xmlDocGetRootElement(doc), out, &out_len);
+    xmlFreeDoc(doc);
+
+    if (out_len == 0) {
+        free(*out);
+        *out = NULL;
+        return -1;
+    }
+    return 0;
+}
+
+/* Return the text content of node as a string allocated with strdup, so
+ * the caller releases it with free() rather than xmlFree(). Returns NULL if
+ * the node has no content or the copy fails.
+ */
+static char *get_node_content(xmlNode *node) {
+    char *copy = NULL;
+    xmlChar *text = xmlNodeGetContent(node);
+    if (text != NULL) {
+        copy = strdup((const char *)text);
+        xmlFree(text);
+    }
+    return copy;
+}
+
+/* Crude textual test for a Product schema: the text either contains both
+ * "@type" (quoted) and Product, or contains "Product" in quotes. Matching is
+ * case-insensitive.
+ */
+static bool jsonld_mentions_product(const char *json) {
+    return (strcasestr(json, "\"@type\"") && strcasestr(json, "Product")) ||
+           strcasestr(json, "\"Product\"");
+}
+
+/* Depth-first search through node, its following siblings and all of their
+ * descendants for the first <script type="application/ld+json"> whose text
+ * passes jsonld_mentions_product(). Returns that text as a malloc'd string
+ * (caller frees), or NULL if no script qualifies.
+ */
+static char *find_jsonld_product(xmlNode *node) {
+    for (xmlNode *cur = node; cur; cur = cur->next) {
+        if (cur->type != XML_ELEMENT_NODE) continue;
+
+        if (xmlStrcasecmp(cur->name, (const xmlChar *)"script") == 0) {
+            xmlChar *type = xmlGetProp(cur, (const xmlChar *)"type");
+            bool is_jsonld = type && xmlStrcasecmp(type, (const xmlChar *)"application/ld+json") == 0;
+            if (type) xmlFree(type);
+            if (is_jsonld) {
+                char *content = get_node_content(cur);
+                if (content && jsonld_mentions_product(content)) return content;
+                free(content);
+            }
+        }
+
+        char *nested = find_jsonld_product(cur->children);
+        if (nested) return nested;
+    }
+    return NULL;
+}
+
+/* Parse html (len bytes) and return the text of the first JSON-LD script
+ * that looks like a Product schema.
+ *
+ * The whole document tree is searched. Scripts normally sit inside <head>
+ * or <body>, which are below the root element, so looking only at the root
+ * and its direct children would miss them.
+ *
+ * Returns 0 and stores the heap string in *out_json (caller frees) when a
+ * script is found. Returns -1 on invalid arguments, when len does not fit
+ * the int length that htmlReadMemory takes, on a parse failure, or when no
+ * script qualifies; *out_json is NULL in every failure case where out_json
+ * itself is non-NULL and the input passed validation.
+ *
+ * xmlCleanupParser() is deliberately not called here: libxml2 documents it
+ * as a process-exit-only call, and the library is used again afterwards.
+ */
+int extract_jsonld_product(const char *html, size_t len, char **out_json) {
+    if (!html || len == 0 || len > (size_t)INT_MAX || !out_json) return -1;
+    *out_json = NULL;
+
+    htmlDocPtr doc = htmlReadMemory(html, (int)len, NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
+    if (!doc) return -1;
+
+    *out_json = find_jsonld_product(xmlDocGetRootElement(doc));
+    xmlFreeDoc(doc);
+
+    return *out_json ? 0 : -1;
+}
+
+/* Depth-first search through node, its following siblings and all of their
+ * descendants for the first element named "h1" (case-insensitive). Returns
+ * the element, or NULL if there is none.
+ */
+static xmlNode *find_h1_node(xmlNode *node) {
+    xmlNode *cur = node;
+    while (cur != NULL) {
+        if (cur->type == XML_ELEMENT_NODE) {
+            if (!xmlStrcasecmp(cur->name, (const xmlChar *)"h1")) {
+                return cur;
+            }
+            xmlNode *nested = find_h1_node(cur->children);
+            if (nested != NULL) {
+                return nested;
+            }
+        }
+        cur = cur->next;
+    }
+    return NULL;
+}
+
+/* Parse html (len bytes) and return a newly allocated copy of the text of
+ * the first <h1> element; the caller frees it. Returns NULL if html is NULL
+ * or empty, if len does not fit the int length that htmlReadMemory takes,
+ * if parsing fails, or if there is no <h1>.
+ *
+ * xmlCleanupParser() is deliberately not called here: libxml2 documents it
+ * as a process-exit-only call, and the library is used again afterwards.
+ */
+char *extract_h1(const char *html, size_t len) {
+    if (!html || len == 0 || len > (size_t)INT_MAX) return NULL;
+    htmlDocPtr doc = htmlReadMemory(html, (int)len, NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
+    if (!doc) return NULL;
+
+    xmlNode *h1 = find_h1_node(xmlDocGetRootElement(doc));
+    char *res = h1 ? get_node_content(h1) : NULL;
+
+    xmlFreeDoc(doc);
+    return res;
+}
+
+/* Heuristic product extraction */
+
+/* True when attr contains needle, compared case-insensitively. A NULL attr
+ * or needle never matches.
+ */
+static bool attr_contains(const xmlChar *attr, const char *needle) {
+    if (attr == NULL || needle == NULL) {
+        return false;
+    }
+    const char *hit = strcasestr((const char *)attr, needle);
+    return hit != NULL;
+}
+
+/* True when attr contains "title", "name" or "product" (case-insensitive).
+ * A NULL attr never matches.
+ */
+static bool names_product_label(const xmlChar *attr) {
+    return attr_contains(attr, "title") ||
+           attr_contains(attr, "name") ||
+           attr_contains(attr, "product");
+}
+
+/* Search node, its following siblings and their descendants, depth-first,
+ * for text that looks like a product name. For each element, in order:
+ *   1. an h1-h4 heading whose text is 2..199 bytes long is returned;
+ *   2. a non-anchor element whose class or id matches
+ *      names_product_label() and whose text is 1..199 bytes long is
+ *      returned. Anchors are skipped because their text may also include
+ *      the price;
+ *   3. otherwise the element's children are searched.
+ * Returns a malloc'd string (caller frees) or NULL if nothing qualifies.
+ */
+static char *find_name_in_node(xmlNode *node) {
+    static const char *const headings[] = { "h1", "h2", "h3", "h4" };
+
+    for (xmlNode *cur = node; cur != NULL; cur = cur->next) {
+        if (cur->type != XML_ELEMENT_NODE) {
+            continue;
+        }
+
+        bool is_heading = false;
+        for (size_t i = 0; i < sizeof headings / sizeof headings[0]; ++i) {
+            if (xmlStrcasecmp(cur->name, (const xmlChar *)headings[i]) == 0) {
+                is_heading = true;
+                break;
+            }
+        }
+        if (is_heading) {
+            char *heading_text = get_node_content(cur);
+            if (heading_text != NULL) {
+                const size_t n = strlen(heading_text);
+                if (n > 1 && n < 200) {
+                    return heading_text;
+                }
+                free(heading_text);
+            }
+        }
+
+        xmlChar *cls = xmlGetProp(cur, (const xmlChar *)"class");
+        xmlChar *id = xmlGetProp(cur, (const xmlChar *)"id");
+        char *labelled = NULL;
+        if ((names_product_label(cls) || names_product_label(id)) &&
+            xmlStrcasecmp(cur->name, (const xmlChar *)"a") != 0) {
+            labelled = get_node_content(cur);
+            if (labelled != NULL) {
+                const size_t n = strlen(labelled);
+                if (n == 0 || n >= 200) {
+                    free(labelled);
+                    labelled = NULL;
+                }
+            }
+        }
+        if (cls != NULL) xmlFree(cls);
+        if (id != NULL) xmlFree(id);
+        if (labelled != NULL) {
+            return labelled;
+        }
+
+        char *nested = find_name_in_node(cur->children);
+        if (nested != NULL) {
+            return nested;
+        }
+    }
+    return NULL;
+}
+
+/* Search node, its following siblings and their descendants, depth-first,
+ * for the first <a> element that carries an href attribute. Returns a
+ * strdup'd copy of that href (caller frees), or NULL if there is none.
+ */
+static char *find_href_in_node(xmlNode *node) {
+    xmlNode *cur = node;
+    while (cur != NULL) {
+        if (cur->type != XML_ELEMENT_NODE) {
+            cur = cur->next;
+            continue;
+        }
+
+        if (xmlStrcasecmp(cur->name, (const xmlChar *)"a") == 0) {
+            xmlChar *href = xmlGetProp(cur, (const xmlChar *)"href");
+            if (href != NULL) {
+                char *copy = strdup((const char *)href);
+                xmlFree(href);
+                return copy;
+            }
+        }
+
+        char *nested = find_href_in_node(cur->children);
+        if (nested != NULL) {
+            return nested;
+        }
+        cur = cur->next;
+    }
+    return NULL;
+}
+
+/* Search node, its following siblings and their descendants, depth-first,
+ * for text that looks like a price. An element's text (at most 199 bytes)
+ * is returned when either
+ *   - its class or id contains "price" (case-insensitive) and the text is
+ *     non-empty, or
+ *   - it is not an <a>, <h1> or <h2> and the text contains '$', the pound
+ *     or euro sign, "USD" or "EUR". Anchors and headings are excluded
+ *     because their text tends to be a whole container (title plus price).
+ * Returns a malloc'd string (caller frees) or NULL if nothing qualifies.
+ *
+ * class and id are released exactly once per element; releasing them in
+ * the "price" branch and again afterwards would be a double free whenever
+ * that branch did not return.
+ */
+static char *find_price_in_node(xmlNode *node) {
+    for (xmlNode *cur = node; cur; cur = cur->next) {
+        if (cur->type != XML_ELEMENT_NODE) continue;
+
+        xmlChar *cls = xmlGetProp(cur, (const xmlChar *)"class");
+        xmlChar *id = xmlGetProp(cur, (const xmlChar *)"id");
+        bool labelled = attr_contains(cls, "price") || attr_contains(id, "price");
+        if (cls) xmlFree(cls);
+        if (id) xmlFree(id);
+
+        char *txt = get_node_content(cur);
+        if (txt) {
+            size_t n = strlen(txt);
+            if (n > 0 && n < 200) {
+                if (labelled) return txt;
+
+                bool is_container =
+                    xmlStrcasecmp(cur->name, (const xmlChar *)"a") == 0 ||
+                    xmlStrcasecmp(cur->name, (const xmlChar *)"h1") == 0 ||
+                    xmlStrcasecmp(cur->name, (const xmlChar *)"h2") == 0;
+                /* The pound and euro signs are multi-byte in UTF-8, so they
+                 * are matched as byte strings; a character constant would be
+                 * truncated to a single byte by strchr. */
+                bool has_currency =
+                    strchr(txt, '$') ||
+                    strstr(txt, "\xC2\xA3") ||     /* U+00A3 pound sign */
+                    strstr(txt, "\xE2\x82\xAC") || /* U+20AC euro sign */
+                    strstr(txt, "USD") ||
+                    strstr(txt, "EUR");
+                if (!is_container && has_currency) return txt;
+            }
+            free(txt);
+        }
+
+        char *nested = find_price_in_node(cur->children);
+        if (nested) return nested;
+    }
+    return NULL;
+}
+
+/* Replace every CR and LF in s with a space so the text cannot break the
+ * one-record-per-line output format. A NULL s is ignored.
+ */
+static void flatten_line_breaks(char *s) {
+    if (!s) return;
+    for (; *s; ++s) {
+        if (*s == '\n' || *s == '\r') *s = ' ';
+    }
+}
+
+/* Return a copy of the first href that belongs to the product element
+ * itself: its own href if it is an <a> with one, otherwise the first one
+ * among its descendants. The element's siblings are never consulted, so a
+ * product without a link cannot pick up the link of the next product.
+ * Returns a malloc'd string (caller frees) or NULL.
+ */
+static char *product_own_href(xmlNode *node) {
+    if (xmlStrcasecmp(node->name, (const xmlChar *)"a") == 0) {
+        xmlChar *href = xmlGetProp(node, (const xmlChar *)"href");
+        if (href) {
+            char *copy = strdup((const char *)href);
+            xmlFree(href);
+            return copy;
+        }
+    }
+    return find_href_in_node(node->children);
+}
+
+/* Collect product information for a node that appears to be a product item
+ * and append one "name | price | url\n" line to *out.
+ *
+ * Name: a direct <h2> child whose class contains "product-name",
+ * "woocommerce-loop-product__title" or "product-title" is preferred, then
+ * find_name_in_node() over the children. Price: a direct child whose class
+ * or data-products attribute contains "price" is preferred, then
+ * find_price_in_node() over the children. Nothing is emitted when neither
+ * a name nor a price is found. Missing fields are written as "<no-name>",
+ * "<no-price>" and "<no-link>". *found is incremented only when the line
+ * was actually appended.
+ */
+static void collect_product_from_node(xmlNode *node, char **out, size_t *out_len, int *found) {
+    if (!node) return;
+    char *name = NULL;
+    char *price = NULL;
+
+    for (xmlNode *c = node->children; c; c = c->next) {
+        if (c->type != XML_ELEMENT_NODE) continue;
+        if (xmlStrcasecmp(c->name, (const xmlChar *)"h2") != 0) continue;
+        xmlChar *cls = xmlGetProp(c, (const xmlChar *)"class");
+        bool match = attr_contains(cls, "product-name") ||
+                     attr_contains(cls, "woocommerce-loop-product__title") ||
+                     attr_contains(cls, "product-title");
+        if (cls) xmlFree(cls);
+        if (match) {
+            name = get_node_content(c);
+            break;
+        }
+    }
+
+    for (xmlNode *c = node->children; c; c = c->next) {
+        if (c->type != XML_ELEMENT_NODE) continue;
+        xmlChar *cls = xmlGetProp(c, (const xmlChar *)"class");
+        xmlChar *dp = xmlGetProp(c, (const xmlChar *)"data-products");
+        bool match = attr_contains(cls, "price") || attr_contains(dp, "price");
+        if (cls) xmlFree(cls);
+        if (dp) xmlFree(dp);
+        if (match) {
+            price = get_node_content(c);
+            break;
+        }
+    }
+
+    /* Fallbacks */
+    if (!name) name = find_name_in_node(node->children);
+    if (!price) price = find_price_in_node(node->children);
+    char *plink = product_own_href(node);
+
+    if (name || price) {
+        flatten_line_breaks(name);
+        flatten_line_breaks(price);
+        flatten_line_breaks(plink);
+        /* String literals stand in for missing fields, so no placeholder
+         * allocation can fail and leave a NULL for %s. */
+        if (append_fmt(out, out_len, "%s | %s | %s\n",
+                       name ? name : "<no-name>",
+                       price ? price : "<no-price>",
+                       plink ? plink : "<no-link>") == 0) {
+            (*found)++;
+        }
+    }
+
+    free(name);
+    free(price);
+    free(plink);
+}
+
+/* Walk node, its following siblings and all descendants. An element counts
+ * as a product item when its data-products attribute contains "item", its
+ * itemtype attribute contains "Product", or it is an <li> whose class
+ * contains "product" (all case-insensitive). Each such element is passed to
+ * collect_product_from_node(). Children are visited whether or not the
+ * element itself matched.
+ */
+static void traverse_and_collect_products(xmlNode *node, char **out, size_t *out_len, int *found) {
+    xmlNode *cur = node;
+    for (; cur != NULL; cur = cur->next) {
+        if (cur->type == XML_ELEMENT_NODE) {
+            xmlChar *data_products = xmlGetProp(cur, (const xmlChar *)"data-products");
+            xmlChar *itemtype = xmlGetProp(cur, (const xmlChar *)"itemtype");
+            xmlChar *cls = xmlGetProp(cur, (const xmlChar *)"class");
+
+            const bool is_product =
+                attr_contains(data_products, "item") ||
+                attr_contains(itemtype, "Product") ||
+                (cls != NULL &&
+                 xmlStrcasecmp(cur->name, (const xmlChar *)"li") == 0 &&
+                 attr_contains(cls, "product"));
+
+            if (is_product) {
+                collect_product_from_node(cur, out, out_len, found);
+            }
+
+            if (data_products != NULL) xmlFree(data_products);
+            if (itemtype != NULL) xmlFree(itemtype);
+            if (cls != NULL) xmlFree(cls);
+
+            if (cur->children != NULL) {
+                traverse_and_collect_products(cur->children, out, out_len, found);
+            }
+        }
+    }
+}
+
+/* Parse html (len bytes) and collect heuristic product listings as
+ * newline-separated "name | price | url" lines.
+ *
+ * Returns 0 and stores the heap string in *out (caller frees) when at least
+ * one product was recorded. Returns -1 on invalid arguments, when len does
+ * not fit the int length that htmlReadMemory takes, on a parse failure, or
+ * when nothing was found; *out is NULL in every failure case where out
+ * itself is non-NULL and the input passed validation.
+ *
+ * xmlCleanupParser() is deliberately not called here: libxml2 documents it
+ * as a process-exit-only call, and callers may use the library again.
+ */
+int extract_products(const char *html, size_t len, char **out) {
+    if (!html || len == 0 || len > (size_t)INT_MAX || !out) return -1;
+    *out = NULL;
+
+    htmlDocPtr doc = htmlReadMemory(html, (int)len, NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
+    if (!doc) return -1;
+
+    size_t out_len = 0;
+    int found = 0;
+    traverse_and_collect_products(xmlDocGetRootElement(doc), out, &out_len, &found);
+    xmlFreeDoc(doc);
+
+    if (found == 0) {
+        free(*out);
+        *out = NULL;
+        return -1;
+    }
+    return 0;
+}