Initial Commit
This commit is contained in:
commit
2afe840ede
13
Makefile
Normal file
13
Makefile
Normal file
@ -0,0 +1,13 @@
|
||||
# Build configuration for the gscrape command-line scraper.
# Compiler and linker flags for libxml2 and libcurl come from pkg-config.
CC=gcc
CFLAGS=-Iinclude -Wall $(shell pkg-config --cflags libxml-2.0 libcurl)
LDFLAGS=$(shell pkg-config --libs libxml-2.0 libcurl)
SRCS=src/main.c src/scraper.c
TARGET=gscrape

# 'all' and 'clean' name actions, not files. Declaring them phony keeps a
# stray file called "all" or "clean" from silently disabling the rule.
.PHONY: all clean

all: $(TARGET)

# Compile and link every source file in a single compiler invocation.
$(TARGET): $(SRCS)
	$(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS)

# Remove the binary and any stray object files.
clean:
	rm -f $(TARGET) *.o src/*.o
|
||||
33
include/scraper.h
Normal file
33
include/scraper.h
Normal file
@ -0,0 +1,33 @@
|
||||
#ifndef SCRAPER_H
#define SCRAPER_H

#include <stdlib.h>

/* Allow the header to be included from C++ translation units as well. */
#ifdef __cplusplus
extern "C" {
#endif

/* Download the resource at `url` into a freshly allocated buffer.
 * Returns 0 on success and stores the NUL-terminated body in *out_buf and its
 * length (excluding the terminator) in *out_len; the caller must free(*out_buf).
 * *out_buf may be NULL when the response body is empty.
 * Returns -1 if any argument is NULL or the transfer fails.
 */
int fetch_url(const char *url, char **out_buf, size_t *out_len);

/* Extract the text content of the first <title> element.
 * Returns a newly allocated string (caller frees), or NULL if `html` is
 * NULL/empty, cannot be parsed, or contains no <title>.
 */
char *extract_title(const char *html, size_t len);

/* Extract all <meta name=... content=...> and <meta property=... content=...>
 * Returns 0 on success and allocates *out with a newline-separated list
 * of "key: value" lines. Caller must free(*out).
 * Returns -1 (with *out left NULL once parsing has started) when the input is
 * invalid or no matching tag is found.
 */
int extract_meta(const char *html, size_t len, char **out);

/* Extract Open Graph tags (meta property="og:...") similarly. */
int extract_og(const char *html, size_t len, char **out);

/* Extract the first <script type="application/ld+json"> that looks like a
 * Product schema. Returns 0 and allocates *out_json (caller frees) or
 * returns -1 if not found.
 */
int extract_jsonld_product(const char *html, size_t len, char **out_json);

/* Extract the text content of the first <h1> element, or NULL if none.
 * The returned string is newly allocated; the caller frees it.
 */
char *extract_h1(const char *html, size_t len);

/* Extract product listings (one per line) as "name | price | url".
 * Returns 0 and allocates *out on success (caller frees), or -1.
 * Each line represents one product; fields are separated by '|'.
 */
int extract_products(const char *html, size_t len, char **out);

#ifdef __cplusplus
}
#endif

#endif /* SCRAPER_H */
|
||||
217
src/main.c
Normal file
217
src/main.c
Normal file
@ -0,0 +1,217 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "scraper.h"
|
||||
#include <string.h>
|
||||
|
||||
/* Encode `s` as a double-quoted JSON string literal.
 *
 * '"' and '\\' are backslash-escaped, \b \f \n \r \t use their short escapes,
 * every other byte below 0x20 becomes \u00xx (lowercase hex), and all remaining
 * bytes are copied through unchanged. A NULL input yields the empty literal "".
 *
 * Returns a newly allocated NUL-terminated string the caller must free(), or
 * NULL if memory cannot be allocated.
 */
static char *json_escape(const char *s) {
    static const char hex_digits[] = "0123456789abcdef";
    const unsigned char *in = (const unsigned char *)(s ? s : "");

    /* Pass 1: compute the exact output size so the buffer never has to grow
     * (growing via an unchecked realloc would risk a leak or NULL write). */
    size_t need = 3; /* opening quote + closing quote + NUL */
    for (const unsigned char *p = in; *p; ++p) {
        size_t width;
        switch (*p) {
        case '"': case '\\':
        case '\b': case '\f': case '\n': case '\r': case '\t':
            width = 2;
            break;
        default:
            width = (*p < 0x20) ? 6 : 1;
            break;
        }
        if (need > (size_t)-1 - width) return NULL; /* size would wrap */
        need += width;
    }

    char *out = malloc(need);
    if (!out) return NULL;

    /* Pass 2: emit the escaped text. */
    char *dst = out;
    *dst++ = '"';
    for (const unsigned char *p = in; *p; ++p) {
        unsigned char c = *p;
        switch (c) {
        case '"':  *dst++ = '\\'; *dst++ = '"';  break;
        case '\\': *dst++ = '\\'; *dst++ = '\\'; break;
        case '\b': *dst++ = '\\'; *dst++ = 'b';  break;
        case '\f': *dst++ = '\\'; *dst++ = 'f';  break;
        case '\n': *dst++ = '\\'; *dst++ = 'n';  break;
        case '\r': *dst++ = '\\'; *dst++ = 'r';  break;
        case '\t': *dst++ = '\\'; *dst++ = 't';  break;
        default:
            if (c < 0x20) {
                *dst++ = '\\';
                *dst++ = 'u';
                *dst++ = '0';
                *dst++ = '0';
                *dst++ = hex_digits[c >> 4];
                *dst++ = hex_digits[c & 0x0f];
            } else {
                *dst++ = (char)c;
            }
            break;
        }
    }
    *dst++ = '"';
    *dst = '\0';
    return out;
}
|
||||
|
||||
/* Locate the key/value separator inside one "key: value" line of `linelen`
 * bytes. The first ':' that is followed by a space or tab is preferred so that
 * namespaced keys such as "og:title" or "twitter:card" stay intact; if no such
 * colon exists the first ':' on the line is used. Returns NULL if the line has
 * no ':' at all.
 */
static const char *kv_find_separator(const char *line, size_t linelen) {
    for (size_t i = 0; i + 1 < linelen; ++i) {
        if (line[i] == ':' && (line[i + 1] == ' ' || line[i + 1] == '\t')) {
            return line + i;
        }
    }
    return memchr(line, ':', linelen);
}

/* Copy `len` bytes starting at `start` into a new NUL-terminated string.
 * Returns NULL on allocation failure; the caller frees the result.
 */
static char *kv_dup_range(const char *start, size_t len) {
    char *copy = malloc(len + 1);
    if (!copy) return NULL;
    memcpy(copy, start, len);
    copy[len] = '\0';
    return copy;
}

/* Print newline-separated "key: value" lines as a JSON object on stdout.
 *
 * A NULL input prints `null`. Empty lines and lines without a ':' are skipped.
 * Leading spaces/tabs of the value are dropped. A pair whose strings cannot be
 * allocated is skipped rather than passing NULL to printf.
 */
static void print_kv_json(const char *kv_lines) {
    if (!kv_lines) { printf("null"); return; }

    printf("{");
    int first = 1;
    const char *p = kv_lines;
    while (*p) {
        const char *line_end = strchr(p, '\n');
        size_t linelen = line_end ? (size_t)(line_end - p) : strlen(p);
        const char *sep = linelen > 0 ? kv_find_separator(p, linelen) : NULL;

        if (sep) {
            const char *eol = p + linelen;
            const char *vstart = sep + 1;
            while (vstart < eol && (*vstart == ' ' || *vstart == '\t')) vstart++;

            char *key = kv_dup_range(p, (size_t)(sep - p));
            char *val = kv_dup_range(vstart, (size_t)(eol - vstart));
            /* json_escape(NULL) would yield "", so guard failed copies here. */
            char *ek = key ? json_escape(key) : NULL;
            char *ev = val ? json_escape(val) : NULL;

            if (ek && ev) {
                if (!first) printf(",");
                first = 0;
                printf("%s:%s", ek, ev);
            }

            free(ek);
            free(ev);
            free(key);
            free(val);
        }

        if (!line_end) break;
        p = line_end + 1;
    }
    printf("}");
}
|
||||
|
||||
/* Copy the text in [begin, end) with surrounding spaces and tabs removed.
 * Returns a new NUL-terminated string (caller frees) or NULL on allocation
 * failure.
 */
static char *products_dup_trimmed(const char *begin, const char *end) {
    while (begin < end && (*begin == ' ' || *begin == '\t')) begin++;
    while (end > begin && (end[-1] == ' ' || end[-1] == '\t')) end--;

    size_t len = (size_t)(end - begin);
    char *copy = malloc(len + 1);
    if (!copy) return NULL;
    memcpy(copy, begin, len);
    copy[len] = '\0';
    return copy;
}

/* Print newline-separated "name | price | url" lines as a JSON array of
 * {"name":..,"price":..,"url":..} objects on stdout.
 *
 * A NULL input prints `null`. Empty lines are skipped. A line with a single
 * '|' is treated as "name | price" with an empty url, and a line without any
 * '|' becomes the name with empty price and url. Every field is trimmed of
 * spaces/tabs on both sides, so the padding around the '|' separators never
 * leaks into a value. A record whose strings cannot be allocated is skipped.
 */
static void print_products_json(const char *products_lines) {
    if (!products_lines) { printf("null"); return; }

    printf("[");
    int first = 1;
    const char *p = products_lines;
    while (*p) {
        const char *line_end = strchr(p, '\n');
        size_t linelen = line_end ? (size_t)(line_end - p) : strlen(p);

        if (linelen > 0) {
            const char *eol = p + linelen;
            /* expect: name | price | url */
            const char *s1 = memchr(p, '|', linelen);
            const char *s2 = s1 ? memchr(s1 + 1, '|', (size_t)(eol - (s1 + 1))) : NULL;

            char *name;
            char *price;
            char *url;
            if (s1 && s2) {
                name = products_dup_trimmed(p, s1);
                price = products_dup_trimmed(s1 + 1, s2);
                url = products_dup_trimmed(s2 + 1, eol);
            } else if (s1) {
                /* only name|price */
                name = products_dup_trimmed(p, s1);
                price = products_dup_trimmed(s1 + 1, eol);
                url = products_dup_trimmed(eol, eol);
            } else {
                /* fallback whole line as name */
                name = products_dup_trimmed(p, eol);
                price = products_dup_trimmed(eol, eol);
                url = products_dup_trimmed(eol, eol);
            }

            /* json_escape(NULL) would yield "", so guard failed copies here. */
            char *en = name ? json_escape(name) : NULL;
            char *ep = price ? json_escape(price) : NULL;
            char *eu = url ? json_escape(url) : NULL;

            if (en && ep && eu) {
                if (!first) printf(",");
                first = 0;
                printf("{\"name\":%s,\"price\":%s,\"url\":%s}", en, ep, eu);
            }

            free(en);
            free(ep);
            free(eu);
            free(name);
            free(price);
            free(url);
        }

        if (!line_end) break;
        p = line_end + 1;
    }
    printf("]");
}
|
||||
|
||||
/* Print `"name":<JSON string>,` for `value`, or `"name":null,` when `value`
 * is NULL or cannot be escaped. `name` must not need JSON escaping itself.
 */
static void print_string_member(const char *name, const char *value) {
    char *escaped = value ? json_escape(value) : NULL;
    if (escaped) {
        printf("\"%s\":%s,", name, escaped);
    } else {
        printf("\"%s\":null,", name);
    }
    free(escaped);
}

/* Entry point: fetch the URL given as the only argument and print one JSON
 * object describing the page (url, title, h1, meta, og, jsonld_product,
 * products) on stdout.
 *
 * Exit status: 0 on success, 1 if the page cannot be fetched, 2 on bad usage.
 */
int main(int argc, char **argv) {
    if (argc != 2) {
        fprintf(stderr, "Usage: %s <url>\n", argv[0]);
        return 2;
    }

    const char *url = argv[1];
    char *html = NULL;
    size_t html_len = 0;

    if (fetch_url(url, &html, &html_len) != 0) {
        fprintf(stderr, "Failed to fetch URL: %s\n", url);
        return 1;
    }

    /* Output JSON */
    printf("{");
    print_string_member("url", url);

    char *title = extract_title(html, html_len);
    print_string_member("title", title);
    free(title);

    char *h1 = extract_h1(html, html_len);
    print_string_member("h1", h1);
    free(h1);

    char *meta = NULL;
    if (extract_meta(html, html_len, &meta) == 0 && meta) {
        printf("\"meta\":");
        print_kv_json(meta);
        printf(",");
    } else {
        printf("\"meta\":null,");
    }
    free(meta);

    char *og = NULL;
    if (extract_og(html, html_len, &og) == 0 && og) {
        printf("\"og\":");
        print_kv_json(og);
        printf(",");
    } else {
        printf("\"og\":null,");
    }
    free(og);

    /* The raw JSON-LD text is embedded as a JSON string, not as an object. */
    char *jsonld = NULL;
    if (extract_jsonld_product(html, html_len, &jsonld) != 0) {
        free(jsonld);
        jsonld = NULL;
    }
    print_string_member("jsonld_product", jsonld);
    free(jsonld);

    /* Last member: no trailing comma. */
    char *products = NULL;
    if (extract_products(html, html_len, &products) == 0 && products) {
        printf("\"products\":");
        print_products_json(products);
    } else {
        printf("\"products\":null");
    }
    free(products);

    printf("}\n");

    free(html);
    return 0;
}
|
||||
520
src/scraper.c
Normal file
520
src/scraper.c
Normal file
@ -0,0 +1,520 @@
|
||||
/* strcasestr() is a GNU extension: glibc only declares it when _GNU_SOURCE is
 * defined before the first system header is pulled in. */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include "scraper.h"

#include <ctype.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

#include <curl/curl.h>
#include <libxml/HTMLparser.h>
#include <libxml/tree.h>
|
||||
|
||||
/* Growable download buffer filled by write_cb(). */
struct mem {
    char *buf;   /* heap storage, kept NUL-terminated once non-NULL */
    size_t size; /* number of payload bytes in buf, excluding the NUL */
};

/* libcurl write callback: append `size * nmemb` bytes from `ptr` to the
 * struct mem passed as `userdata`, keeping the buffer NUL-terminated.
 *
 * Returns the number of bytes consumed. Returns 0 when the requested size
 * would overflow size_t or memory cannot be allocated; for a non-empty chunk
 * that differs from the chunk size, which makes libcurl abort the transfer.
 * On failure the existing buffer is left untouched.
 */
static size_t write_cb(void *ptr, size_t size, size_t nmemb, void *userdata) {
    struct mem *m = (struct mem *)userdata;

    /* Reject size * nmemb wrapping around. */
    if (size != 0 && nmemb > SIZE_MAX / size) return 0;
    size_t realsize = size * nmemb;

    /* Reject m->size + realsize + 1 wrapping around. */
    if (realsize >= SIZE_MAX - m->size) return 0;

    char *newbuf = realloc(m->buf, m->size + realsize + 1);
    if (!newbuf) return 0;

    m->buf = newbuf;
    if (realsize > 0) {
        memcpy(m->buf + m->size, ptr, realsize);
    }
    m->size += realsize;
    m->buf[m->size] = '\0';
    return realsize;
}
|
||||
|
||||
int fetch_url(const char *url, char **out_buf, size_t *out_len) {
|
||||
if (!url || !out_buf || !out_len) return -1;
|
||||
CURL *curl = NULL;
|
||||
CURLcode res;
|
||||
struct mem chunk = {0};
|
||||
|
||||
curl_global_init(CURL_GLOBAL_DEFAULT);
|
||||
curl = curl_easy_init();
|
||||
if (!curl) {
|
||||
curl_global_cleanup();
|
||||
return -1;
|
||||
}
|
||||
|
||||
curl_easy_setopt(curl, CURLOPT_URL, url);
|
||||
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb);
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&chunk);
|
||||
curl_easy_setopt(curl, CURLOPT_USERAGENT, "gscrape/0.1");
|
||||
|
||||
res = curl_easy_perform(curl);
|
||||
curl_easy_cleanup(curl);
|
||||
curl_global_cleanup();
|
||||
|
||||
if (res != CURLE_OK) {
|
||||
free(chunk.buf);
|
||||
return -1;
|
||||
}
|
||||
|
||||
*out_buf = chunk.buf;
|
||||
*out_len = chunk.size;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Depth-first search of `node`, its following siblings and all their
 * descendants for the first element named "title" (case-insensitive).
 * Returns the matching node, or NULL when there is none.
 */
static xmlNode *find_title_node(xmlNode *node) {
    while (node != NULL) {
        if (node->type == XML_ELEMENT_NODE) {
            bool is_title = xmlStrcasecmp(node->name, (const xmlChar *)"title") == 0;
            xmlNode *hit = is_title ? node : find_title_node(node->children);
            if (hit != NULL) {
                return hit;
            }
        }
        node = node->next;
    }
    return NULL;
}
|
||||
|
||||
char *extract_title(const char *html, size_t len) {
|
||||
if (!html || len == 0) return NULL;
|
||||
htmlDocPtr doc = htmlReadMemory(html, (int)len, NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
|
||||
if (!doc) return NULL;
|
||||
|
||||
xmlNode *root = xmlDocGetRootElement(doc);
|
||||
xmlNode *title_node = find_title_node(root);
|
||||
if (!title_node) {
|
||||
xmlFreeDoc(doc);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
xmlChar *content = xmlNodeGetContent(title_node);
|
||||
if (!content) {
|
||||
xmlFreeDoc(doc);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
char *title = strdup((const char *)content);
|
||||
xmlFree(content);
|
||||
xmlFreeDoc(doc);
|
||||
xmlCleanupParser();
|
||||
return title;
|
||||
}
|
||||
|
||||
/* printf-style append: grow *out so the formatted text fits and write it at
 * offset *out_len, advancing *out_len by the number of characters added (the
 * trailing NUL is written but not counted). *out may start as NULL.
 * Returns 0 on success, -1 on a formatting or allocation error, in which case
 * *out and *out_len are left unchanged. The caller owns *out.
 */
static int append_fmt(char **out, size_t *out_len, const char *fmt, ...) {
    va_list measure_args;
    va_list write_args;
    int status = -1;

    va_start(measure_args, fmt);
    va_copy(write_args, measure_args);

    /* First pass only measures how many characters the text needs. */
    int text_len = vsnprintf(NULL, 0, fmt, measure_args);
    va_end(measure_args);

    if (text_len >= 0) {
        size_t extra = (size_t)text_len;
        char *grown = realloc(*out, *out_len + extra + 1);
        if (grown != NULL) {
            *out = grown;
            vsnprintf(grown + *out_len, extra + 1, fmt, write_args);
            *out_len += extra;
            status = 0;
        }
    }

    va_end(write_args);
    return status;
}
|
||||
|
||||
static void collect_meta_nodes(xmlNode *node, char **out, size_t *out_len) {
|
||||
for (xmlNode *cur = node; cur; cur = cur->next) {
|
||||
if (cur->type == XML_ELEMENT_NODE) {
|
||||
if (xmlStrcasecmp(cur->name, (const xmlChar *)"meta") == 0) {
|
||||
xmlChar *name = xmlGetProp(cur, (const xmlChar *)"name");
|
||||
xmlChar *prop = xmlGetProp(cur, (const xmlChar *)"property");
|
||||
xmlChar *content = xmlGetProp(cur, (const xmlChar *)"content");
|
||||
const xmlChar *key = name ? name : prop;
|
||||
if (key && content) {
|
||||
append_fmt(out, out_len, "%s: %s\n", (const char *)key, (const char *)content);
|
||||
}
|
||||
if (name) xmlFree(name);
|
||||
if (prop) xmlFree(prop);
|
||||
if (content) xmlFree(content);
|
||||
}
|
||||
collect_meta_nodes(cur->children, out, out_len);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int extract_meta(const char *html, size_t len, char **out) {
|
||||
if (!html || len == 0 || !out) return -1;
|
||||
htmlDocPtr doc = htmlReadMemory(html, (int)len, NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
|
||||
if (!doc) return -1;
|
||||
xmlNode *root = xmlDocGetRootElement(doc);
|
||||
*out = NULL;
|
||||
size_t out_len = 0;
|
||||
collect_meta_nodes(root, out, &out_len);
|
||||
xmlFreeDoc(doc);
|
||||
xmlCleanupParser();
|
||||
if (out_len == 0) {
|
||||
free(*out);
|
||||
*out = NULL;
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void collect_og_nodes(xmlNode *node, char **out, size_t *out_len) {
|
||||
for (xmlNode *cur = node; cur; cur = cur->next) {
|
||||
if (cur->type == XML_ELEMENT_NODE) {
|
||||
if (xmlStrcasecmp(cur->name, (const xmlChar *)"meta") == 0) {
|
||||
xmlChar *prop = xmlGetProp(cur, (const xmlChar *)"property");
|
||||
xmlChar *content = xmlGetProp(cur, (const xmlChar *)"content");
|
||||
if (prop && content) {
|
||||
if (strncasecmp((const char *)prop, "og:", 3) == 0) {
|
||||
append_fmt(out, out_len, "%s: %s\n", (const char *)prop, (const char *)content);
|
||||
}
|
||||
}
|
||||
if (prop) xmlFree(prop);
|
||||
if (content) xmlFree(content);
|
||||
}
|
||||
collect_og_nodes(cur->children, out, out_len);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int extract_og(const char *html, size_t len, char **out) {
|
||||
if (!html || len == 0 || !out) return -1;
|
||||
htmlDocPtr doc = htmlReadMemory(html, (int)len, NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
|
||||
if (!doc) return -1;
|
||||
xmlNode *root = xmlDocGetRootElement(doc);
|
||||
*out = NULL;
|
||||
size_t out_len = 0;
|
||||
collect_og_nodes(root, out, &out_len);
|
||||
xmlFreeDoc(doc);
|
||||
xmlCleanupParser();
|
||||
if (out_len == 0) {
|
||||
free(*out);
|
||||
*out = NULL;
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Return the text content of `node` as a string allocated with strdup(), so
 * callers can release it with plain free(). Returns NULL if libxml2 yields no
 * content or the copy cannot be allocated.
 */
static char *get_node_content(xmlNode *node) {
    char *copy = NULL;
    xmlChar *text = xmlNodeGetContent(node);

    if (text != NULL) {
        copy = strdup((const char *)text);
        xmlFree(text);
    }
    return copy;
}
|
||||
|
||||
int extract_jsonld_product(const char *html, size_t len, char **out_json) {
|
||||
if (!html || len == 0 || !out_json) return -1;
|
||||
htmlDocPtr doc = htmlReadMemory(html, (int)len, NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
|
||||
if (!doc) return -1;
|
||||
xmlNode *root = xmlDocGetRootElement(doc);
|
||||
*out_json = NULL;
|
||||
|
||||
for (xmlNode *cur = root; cur; cur = cur->next) {
|
||||
if (cur->type == XML_ELEMENT_NODE) {
|
||||
if (xmlStrcasecmp(cur->name, (const xmlChar *)"script") == 0) {
|
||||
xmlChar *type = xmlGetProp(cur, (const xmlChar *)"type");
|
||||
if (type && xmlStrcasecmp(type, (const xmlChar *)"application/ld+json") == 0) {
|
||||
char *content = get_node_content(cur);
|
||||
if (content) {
|
||||
/* crude check for Product */
|
||||
if ((strcasestr(content, "\"@type\"") && strcasestr(content, "Product")) || strcasestr(content, "\"Product\"")) {
|
||||
*out_json = content; /* transfer ownership */
|
||||
xmlFree(type);
|
||||
xmlFreeDoc(doc);
|
||||
xmlCleanupParser();
|
||||
return 0;
|
||||
}
|
||||
free(content);
|
||||
}
|
||||
}
|
||||
if (type) xmlFree(type);
|
||||
}
|
||||
/* search children */
|
||||
if (cur->children) {
|
||||
/* recurse using a stack-like traversal by resetting cur to children
|
||||
* then letting the outer loop walk siblings. Simpler: call function
|
||||
* recursively — but here we inline a nested loop to find scripts.
|
||||
*/
|
||||
xmlNode *child = cur->children;
|
||||
while (child) {
|
||||
if (child->type == XML_ELEMENT_NODE && xmlStrcasecmp(child->name, (const xmlChar *)"script") == 0) {
|
||||
xmlChar *type = xmlGetProp(child, (const xmlChar *)"type");
|
||||
if (type && xmlStrcasecmp(type, (const xmlChar *)"application/ld+json") == 0) {
|
||||
char *content = get_node_content(child);
|
||||
if (content) {
|
||||
if ((strcasestr(content, "\"@type\"") && strcasestr(content, "Product")) || strcasestr(content, "\"Product\"")) {
|
||||
*out_json = content;
|
||||
if (type) xmlFree(type);
|
||||
xmlFreeDoc(doc);
|
||||
xmlCleanupParser();
|
||||
return 0;
|
||||
}
|
||||
free(content);
|
||||
}
|
||||
}
|
||||
if (type) xmlFree(type);
|
||||
}
|
||||
child = child->next;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
xmlFreeDoc(doc);
|
||||
xmlCleanupParser();
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Depth-first search of `node`, its following siblings and all their
 * descendants for the first element named "h1" (case-insensitive).
 * Returns the matching node, or NULL when there is none.
 */
static xmlNode *find_h1_node(xmlNode *node) {
    xmlNode *cur = node;

    while (cur != NULL) {
        if (cur->type != XML_ELEMENT_NODE) {
            cur = cur->next;
            continue;
        }
        if (xmlStrcasecmp(cur->name, (const xmlChar *)"h1") == 0) {
            return cur;
        }

        xmlNode *nested = find_h1_node(cur->children);
        if (nested != NULL) {
            return nested;
        }
        cur = cur->next;
    }
    return NULL;
}
|
||||
|
||||
char *extract_h1(const char *html, size_t len) {
|
||||
if (!html || len == 0) return NULL;
|
||||
htmlDocPtr doc = htmlReadMemory(html, (int)len, NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
|
||||
if (!doc) return NULL;
|
||||
xmlNode *root = xmlDocGetRootElement(doc);
|
||||
xmlNode *h1 = find_h1_node(root);
|
||||
if (!h1) {
|
||||
xmlFreeDoc(doc);
|
||||
return NULL;
|
||||
}
|
||||
char *res = get_node_content(h1);
|
||||
xmlFreeDoc(doc);
|
||||
xmlCleanupParser();
|
||||
return res;
|
||||
}
|
||||
|
||||
/* Heuristic product extraction */
|
||||
static bool attr_contains(const xmlChar *attr, const char *needle) {
|
||||
if (!attr || !needle) return false;
|
||||
return (strcasestr((const char *)attr, needle) != NULL);
|
||||
}
|
||||
|
||||
/* Search `node`, its following siblings and their descendants (depth first)
 * for text that looks like a product name.
 *
 * A candidate is accepted when it is:
 *   - an <h1>..<h4> element whose text is 2..199 bytes long, or
 *   - a non-anchor element whose class or id contains "title", "name" or
 *     "product" and whose text is 1..199 bytes long. Anchors are skipped
 *     because their text may also include the price; their children are
 *     searched instead.
 * Returns the text as a newly allocated string (caller frees) or NULL.
 */
static char *find_name_in_node(xmlNode *node) {
    static const char *const heading_tags[] = { "h1", "h2", "h3", "h4" };
    static const char *const name_hints[] = { "title", "name", "product" };
    const size_t n_headings = sizeof heading_tags / sizeof heading_tags[0];
    const size_t n_hints = sizeof name_hints / sizeof name_hints[0];

    for (; node != NULL; node = node->next) {
        if (node->type != XML_ELEMENT_NODE) {
            continue;
        }

        /* 1. Headings win outright when their text has a sensible length. */
        bool is_heading = false;
        for (size_t i = 0; i < n_headings && !is_heading; ++i) {
            is_heading = xmlStrcasecmp(node->name, (const xmlChar *)heading_tags[i]) == 0;
        }
        if (is_heading) {
            char *heading_text = get_node_content(node);
            if (heading_text != NULL) {
                size_t n = strlen(heading_text);
                if (n > 1 && n < 200) {
                    return heading_text;
                }
                free(heading_text);
            }
        }

        /* 2. Elements whose class/id hints at a name. */
        xmlChar *cls = xmlGetProp(node, (const xmlChar *)"class");
        xmlChar *id = xmlGetProp(node, (const xmlChar *)"id");

        bool hinted = false;
        for (size_t i = 0; i < n_hints && !hinted; ++i) {
            hinted = attr_contains(cls, name_hints[i]);
        }
        for (size_t i = 0; i < n_hints && !hinted; ++i) {
            hinted = attr_contains(id, name_hints[i]);
        }

        char *hinted_text = NULL;
        if (hinted && xmlStrcasecmp(node->name, (const xmlChar *)"a") != 0) {
            hinted_text = get_node_content(node);
            if (hinted_text != NULL) {
                size_t n = strlen(hinted_text);
                if (n == 0 || n >= 200) {
                    free(hinted_text);
                    hinted_text = NULL;
                }
            }
        }

        if (cls) xmlFree(cls);
        if (id) xmlFree(id);

        if (hinted_text != NULL) {
            return hinted_text;
        }

        /* 3. Otherwise look deeper. */
        char *nested = find_name_in_node(node->children);
        if (nested != NULL) {
            return nested;
        }
    }
    return NULL;
}
|
||||
|
||||
/* Depth-first search of `node`, its following siblings and their descendants
 * for the first <a> element carrying an href attribute.
 * Returns a strdup()'d copy of that href (caller frees), or NULL if no such
 * anchor exists.
 */
static char *find_href_in_node(xmlNode *node) {
    xmlNode *cur = node;

    while (cur != NULL) {
        if (cur->type == XML_ELEMENT_NODE) {
            xmlChar *href = NULL;
            if (xmlStrcasecmp(cur->name, (const xmlChar *)"a") == 0) {
                href = xmlGetProp(cur, (const xmlChar *)"href");
            }
            if (href != NULL) {
                char *copy = strdup((const char *)href);
                xmlFree(href);
                return copy;
            }

            char *nested = find_href_in_node(cur->children);
            if (nested != NULL) {
                return nested;
            }
        }
        cur = cur->next;
    }
    return NULL;
}
|
||||
|
||||
/* Search `node`, its following siblings and their descendants (depth first)
 * for text that looks like a price.
 *
 * An element is accepted when its text is shorter than 200 bytes and either
 *   - its class or id contains "price" and the text is non-empty, or
 *   - it is not an <a>, <h1> or <h2> (containers that usually mix title and
 *     price) and the text contains '$', the pound or euro sign, "USD" or
 *     "EUR".
 * Returns the text as a newly allocated string (caller frees) or NULL.
 */
static char *find_price_in_node(xmlNode *node) {
    for (xmlNode *cur = node; cur; cur = cur->next) {
        if (cur->type != XML_ELEMENT_NODE) continue;

        /* Read the attributes once and release them exactly once; releasing
         * them both inside and after the "tagged" branch is a double free. */
        xmlChar *cls = xmlGetProp(cur, (const xmlChar *)"class");
        xmlChar *id = xmlGetProp(cur, (const xmlChar *)"id");
        bool tagged = attr_contains(cls, "price") || attr_contains(id, "price");
        if (cls) xmlFree(cls);
        if (id) xmlFree(id);

        char *txt = get_node_content(cur);
        if (txt) {
            size_t n = strlen(txt);

            if (tagged && n > 0 && n < 200) return txt;

            bool container =
                xmlStrcasecmp(cur->name, (const xmlChar *)"a") == 0 ||
                xmlStrcasecmp(cur->name, (const xmlChar *)"h1") == 0 ||
                xmlStrcasecmp(cur->name, (const xmlChar *)"h2") == 0;

            /* The pound and euro signs are multi-byte in UTF-8, so they must
             * be matched as strings; strchr() with a multi-byte character
             * constant only compares a single byte. */
            bool has_currency =
                strchr(txt, '$') != NULL ||
                strstr(txt, "\xC2\xA3") != NULL ||     /* U+00A3 pound sign */
                strstr(txt, "\xE2\x82\xAC") != NULL || /* U+20AC euro sign */
                strstr(txt, "USD") != NULL ||
                strstr(txt, "EUR") != NULL;

            if (!container && has_currency && n < 200) return txt;

            free(txt);
        }

        char *res = find_price_in_node(cur->children);
        if (res) return res;
    }
    return NULL;
}
|
||||
|
||||
/* Collect product information for a node that appears to be a product item.
|
||||
* Looks for h2.product-name and span.price preferentially, falls back to
|
||||
* generic name/price finders.
|
||||
*/
|
||||
static void collect_product_from_node(xmlNode *node, char **out, size_t *out_len, int *found) {
|
||||
if (!node) return;
|
||||
char *name = NULL;
|
||||
char *price = NULL;
|
||||
char *plink = NULL;
|
||||
|
||||
/* Prefer h2 with class 'product-name' or titles */
|
||||
for (xmlNode *c = node->children; c; c = c->next) {
|
||||
if (c->type != XML_ELEMENT_NODE) continue;
|
||||
xmlChar *cls = xmlGetProp(c, (const xmlChar *)"class");
|
||||
if (xmlStrcasecmp(c->name, (const xmlChar *)"h2") == 0 && cls) {
|
||||
if (attr_contains(cls, "product-name") || attr_contains(cls, "woocommerce-loop-product__title") || attr_contains(cls, "product-title")) {
|
||||
name = get_node_content(c);
|
||||
xmlFree(cls);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (cls) xmlFree(cls);
|
||||
}
|
||||
|
||||
/* Price: look for span with class 'price' or data-products='price' */
|
||||
if (!price) {
|
||||
for (xmlNode *c = node->children; c; c = c->next) {
|
||||
if (c->type != XML_ELEMENT_NODE) continue;
|
||||
xmlChar *cls = xmlGetProp(c, (const xmlChar *)"class");
|
||||
xmlChar *dp = xmlGetProp(c, (const xmlChar *)"data-products");
|
||||
if (cls && attr_contains(cls, "price")) {
|
||||
price = get_node_content(c);
|
||||
if (cls) xmlFree(cls);
|
||||
if (dp) xmlFree(dp);
|
||||
break;
|
||||
}
|
||||
if (dp && attr_contains(dp, "price")) {
|
||||
price = get_node_content(c);
|
||||
if (cls) xmlFree(cls);
|
||||
if (dp) xmlFree(dp);
|
||||
break;
|
||||
}
|
||||
if (cls) xmlFree(cls);
|
||||
if (dp) xmlFree(dp);
|
||||
}
|
||||
}
|
||||
|
||||
/* Fallbacks */
|
||||
if (!name) name = find_name_in_node(node->children);
|
||||
if (!price) price = find_price_in_node(node->children);
|
||||
/* find product link if available */
|
||||
plink = find_href_in_node(node);
|
||||
if (!plink) plink = strdup("<no-link>");
|
||||
|
||||
if (name || price) {
|
||||
if (!name) name = strdup("<no-name>");
|
||||
if (!price) price = strdup("<no-price>");
|
||||
append_fmt(out, out_len, "%s | %s | %s\n", name, price, plink);
|
||||
(*found)++;
|
||||
}
|
||||
if (name) free(name);
|
||||
if (price) free(price);
|
||||
free(plink);
|
||||
}
|
||||
|
||||
/* Walk `node`, its following siblings and all descendants; for every element
 * that looks like a product container call collect_product_from_node().
 *
 * An element counts as a product container when
 *   - its data-products attribute contains "item", or
 *   - its itemtype attribute contains "Product", or
 *   - it is an <li> whose class contains "product"
 * (all substring matches are case-insensitive). Children are visited whether
 * or not the element itself matched.
 */
static void traverse_and_collect_products(xmlNode *node, char **out, size_t *out_len, int *found) {
    while (node != NULL) {
        if (node->type == XML_ELEMENT_NODE) {
            xmlChar *data_attr = xmlGetProp(node, (const xmlChar *)"data-products");
            xmlChar *type_attr = xmlGetProp(node, (const xmlChar *)"itemtype");
            xmlChar *class_attr = xmlGetProp(node, (const xmlChar *)"class");

            bool is_list_item = xmlStrcasecmp(node->name, (const xmlChar *)"li") == 0;
            bool looks_like_product =
                attr_contains(data_attr, "item") ||
                attr_contains(type_attr, "Product") ||
                (is_list_item && attr_contains(class_attr, "product"));

            if (looks_like_product) {
                collect_product_from_node(node, out, out_len, found);
            }

            if (data_attr) xmlFree(data_attr);
            if (type_attr) xmlFree(type_attr);
            if (class_attr) xmlFree(class_attr);

            if (node->children != NULL) {
                traverse_and_collect_products(node->children, out, out_len, found);
            }
        }
        node = node->next;
    }
}
|
||||
|
||||
int extract_products(const char *html, size_t len, char **out) {
|
||||
if (!html || len == 0 || !out) return -1;
|
||||
htmlDocPtr doc = htmlReadMemory(html, (int)len, NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
|
||||
if (!doc) return -1;
|
||||
xmlNode *root = xmlDocGetRootElement(doc);
|
||||
|
||||
*out = NULL;
|
||||
size_t out_len = 0;
|
||||
int found = 0;
|
||||
|
||||
traverse_and_collect_products(root, out, &out_len, &found);
|
||||
|
||||
xmlFreeDoc(doc);
|
||||
xmlCleanupParser();
|
||||
if (found == 0) {
|
||||
free(*out);
|
||||
*out = NULL;
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user