Compare commits

..

3 Commits

6 changed files with 89 additions and 23 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
gscrape
test-site

1
README Normal file
View File

@ -0,0 +1 @@
This project was written ENTIRELY with AI

BIN
gscrape

Binary file not shown.

View File

@ -28,6 +28,6 @@ char *extract_h1(const char *html, size_t len);
* Returns 0 and allocates *out on success (caller frees), or -1.
* Each line represents one product; fields are separated by '|'.
*/
int extract_products(const char *html, size_t len, char **out);
int extract_products(const char *html, size_t len, const char *base_url, char **out);
#endif /* SCRAPER_H */

View File

@ -12,24 +12,24 @@ static char *json_escape(const char *s) {
size_t oi = 0;
out[oi++] = '"';
for (size_t i = 0; i < len; ++i) {
unsigned char c = s[i];
if (c == '"' || c == '\\') {
unsigned char uc = (unsigned char)s[i];
if (uc == '"' || uc == '\\') {
if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); }
out[oi++] = '\\';
out[oi++] = c;
} else if (c == '\b') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'b'; }
else if (c == '\f') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'f'; }
else if (c == '\n') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'n'; }
else if (c == '\r') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'r'; }
else if (c == '\t') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 't'; }
else if (c < 0x20) {
char buf[7]; snprintf(buf, sizeof(buf), "\\u%04x", c);
out[oi++] = (char)uc;
} else if (uc == '\b') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'b'; }
else if (uc == '\f') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'f'; }
else if (uc == '\n') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'n'; }
else if (uc == '\r') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'r'; }
else if (uc == '\t') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 't'; }
else if (uc < 0x20) {
char buf[7]; snprintf(buf, sizeof(buf), "\\u%04x", (unsigned)uc);
size_t bl = strlen(buf);
if (oi + bl + 1 >= cap) { cap = cap + bl + 16; out = realloc(out, cap); }
memcpy(out + oi, buf, bl); oi += bl;
} else {
if (oi + 1 >= cap) { cap *= 2; out = realloc(out, cap); }
out[oi++] = c;
out[oi++] = (char)uc;
}
}
if (oi + 2 >= cap) { cap += 2; out = realloc(out, cap); }
@ -52,7 +52,7 @@ static void print_kv_json(const char *kv_lines) {
size_t keylen = (size_t)(sep - p);
const char *vstart = sep + 1;
while (vstart < p + linelen && (*vstart == ' ' || *vstart == '\t')) vstart++;
size_t vallen = (p + linelen) - vstart;
size_t vallen = (size_t)((p + linelen) - vstart);
char *key = malloc(keylen + 1);
char *val = malloc(vallen + 1);
memcpy(key, p, keylen); key[keylen] = '\0';
@ -93,7 +93,7 @@ static void print_products_json(const char *products_lines) {
size_t plen = (size_t)(s2 - p2);
const char *u2 = s2 + 1;
while (u2 < p + linelen && (*u2 == ' ' || *u2 == '\t')) u2++;
size_t ulen = (p + linelen) - u2;
size_t ulen = (size_t)((p + linelen) - u2);
name = malloc(nlen + 1); memcpy(name, p, nlen); name[nlen] = '\0';
price = malloc(plen + 1); memcpy(price, p2, plen); price[plen] = '\0';
url = malloc(ulen + 1); memcpy(url, u2, ulen); url[ulen] = '\0';
@ -103,7 +103,7 @@ static void print_products_json(const char *products_lines) {
const char *p2 = s1 + 1;
while (nlen > 0 && (p[nlen-1] == ' ' || p[nlen-1] == '\t')) nlen--;
while (p2 < p + linelen && (*p2 == ' ' || *p2 == '\t')) p2++;
size_t plen = (p + linelen) - p2;
size_t plen = (size_t)((p + linelen) - p2);
name = malloc(nlen + 1); memcpy(name, p, nlen); name[nlen] = '\0';
price = malloc(plen + 1); memcpy(price, p2, plen); price[plen] = '\0';
url = strdup("");
@ -202,7 +202,7 @@ int main(int argc, char **argv) {
}
char *products = NULL;
if (extract_products(html, html_len, &products) == 0 && products) {
if (extract_products(html, html_len, url, &products) == 0 && products) {
printf("\"products\":");
print_products_json(products);
free(products);

View File

@ -390,7 +390,7 @@ static char *find_price_in_node(xmlNode *node) {
if (xmlStrcasecmp(cur->name, (const xmlChar *)"a") != 0 &&
xmlStrcasecmp(cur->name, (const xmlChar *)"h1") != 0 &&
xmlStrcasecmp(cur->name, (const xmlChar *)"h2") != 0 &&
(strchr(txt, '$') || strchr(txt, '£') || strchr(txt, '') || strstr(txt, "USD") || strstr(txt, "EUR"))) {
(strchr(txt, '$') || strstr(txt, "£") || strstr(txt, "") || strstr(txt, "USD") || strstr(txt, "EUR"))) {
if (strlen(txt) < 200) {
char *trim = strdup(txt);
free(txt);
@ -411,7 +411,61 @@ static char *find_price_in_node(xmlNode *node) {
* Looks for h2.product-name and span.price preferentially, falls back to
* generic name/price finders.
*/
static void collect_product_from_node(xmlNode *node, char **out, size_t *out_len, int *found) {
/* Return the length of the URI scheme at the start of s (not counting the
 * ':' that follows it), or 0 when s does not begin with a scheme.
 * A scheme is an ASCII letter followed by letters, digits, '+', '-' or '.',
 * terminated by ':'. ASCII ranges are tested explicitly so the result does
 * not depend on the current locale.
 */
static size_t resolve_url_scheme_len(const char *s) {
    size_t i;
    if (!((s[0] >= 'a' && s[0] <= 'z') || (s[0] >= 'A' && s[0] <= 'Z'))) return 0;
    for (i = 1; ; ++i) {
        char c = s[i];
        int ok = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
                 (c >= '0' && c <= '9') || c == '+' || c == '-' || c == '.';
        if (!ok) break;
    }
    return s[i] == ':' ? i : 0;
}

/* Build a newly allocated string from the first prefix_len bytes of prefix,
 * followed by sep, followed by tail. Returns NULL if allocation fails.
 */
static char *resolve_url_concat(const char *prefix, size_t prefix_len,
                                const char *sep, const char *tail) {
    size_t sep_len = strlen(sep);
    size_t tail_len = strlen(tail);
    char *joined = malloc(prefix_len + sep_len + tail_len + 1);
    if (!joined) return NULL;
    memcpy(joined, prefix, prefix_len);
    memcpy(joined + prefix_len, sep, sep_len);
    memcpy(joined + prefix_len + sep_len, tail, tail_len + 1); /* copies the NUL too */
    return joined;
}

/* Resolve a possibly-relative href against a base URL.
 *
 * Returns a newly allocated string (caller must free), or NULL when href is
 * NULL or allocation fails.
 *
 * Handled forms of href:
 *   - has its own scheme ("http://...", "mailto:...")  -> copied unchanged
 *   - scheme-relative ("//host/path")                  -> base scheme + href
 *   - absolute path ("/path")                          -> base scheme+host + href
 *   - query only ("?q=1")                              -> base without query/fragment + href
 *   - fragment only ("#top") or empty                  -> base without fragment + href
 *   - relative path ("dir/page.html")                  -> base directory + href
 *
 * When base is NULL or empty there is nothing to resolve against and a copy
 * of href is returned. Dot segments ("./", "../") are NOT normalized; they
 * are appended verbatim.
 */
static char *resolve_url(const char *base, const char *href) {
    if (!href) return NULL;
    if (!base || base[0] == '\0' || resolve_url_scheme_len(href) > 0) {
        return resolve_url_concat("", 0, "", href);
    }

    /* Locate the pieces of base: [scheme "://"] authority path ["?"...]["#"...] */
    size_t scheme_len = resolve_url_scheme_len(base);
    int has_authority = scheme_len > 0 &&
                        base[scheme_len + 1] == '/' && base[scheme_len + 2] == '/';
    size_t host_start = has_authority ? scheme_len + 3 : 0;
    /* The host ends at the first path, query or fragment delimiter. */
    size_t authority_end = host_start + strcspn(base + host_start, "/?#");
    /* The path ends where the query or fragment begins. */
    size_t path_end = authority_end + strcspn(base + authority_end, "?#");

    if (href[0] == '/' && href[1] == '/') {
        /* scheme-relative: without a base scheme, just drop the leading slashes */
        if (!has_authority) return resolve_url_concat("", 0, "", href + 2);
        return resolve_url_concat(base, scheme_len, "://", href + 2);
    }
    if (href[0] == '/') {
        /* absolute path on same host */
        return resolve_url_concat(base, authority_end, "", href);
    }
    if (href[0] == '?') {
        /* replaces the base query, keeps the base path */
        return resolve_url_concat(base, path_end, "", href);
    }
    if (href[0] == '#' || href[0] == '\0') {
        /* same document: keep everything except the base fragment */
        return resolve_url_concat(base, strcspn(base, "#"), "", href);
    }

    /* relative path: keep the base path up to and including its last '/'.
     * Only the path portion is scanned, so slashes belonging to "://" or to
     * the query string are never mistaken for a directory separator. */
    size_t dir_len = path_end;
    while (dir_len > authority_end && base[dir_len - 1] != '/') dir_len--;
    if (dir_len == authority_end) {
        /* base has no path at all ("http://host"): insert the root slash */
        return resolve_url_concat(base, authority_end, "/", href);
    }
    return resolve_url_concat(base, dir_len, "", href);
}
static void collect_product_from_node(xmlNode *node, char **out, size_t *out_len, int *found, const char *base_url) {
if (!node) return;
char *name = NULL;
char *price = NULL;
@ -460,6 +514,15 @@ static void collect_product_from_node(xmlNode *node, char **out, size_t *out_len
/* find product link if available */
plink = find_href_in_node(node);
if (!plink) plink = strdup("<no-link>");
else {
if (base_url) {
char *abs = resolve_url(base_url, plink);
if (abs) {
free(plink);
plink = abs;
}
}
}
if (name || price) {
if (!name) name = strdup("<no-name>");
@ -472,7 +535,7 @@ static void collect_product_from_node(xmlNode *node, char **out, size_t *out_len
free(plink);
}
static void traverse_and_collect_products(xmlNode *node, char **out, size_t *out_len, int *found) {
static void traverse_and_collect_products(xmlNode *node, char **out, size_t *out_len, int *found, const char *base_url) {
for (xmlNode *cur = node; cur; cur = cur->next) {
if (cur->type != XML_ELEMENT_NODE) continue;
@ -486,18 +549,18 @@ static void traverse_and_collect_products(xmlNode *node, char **out, size_t *out
else if (cls && xmlStrcasecmp(cur->name, (const xmlChar *)"li") == 0 && attr_contains(cls, "product")) is_product = true;
if (is_product) {
collect_product_from_node(cur, out, out_len, found);
collect_product_from_node(cur, out, out_len, found, base_url);
}
if (data_products) xmlFree(data_products);
if (itemtype) xmlFree(itemtype);
if (cls) xmlFree(cls);
if (cur->children) traverse_and_collect_products(cur->children, out, out_len, found);
if (cur->children) traverse_and_collect_products(cur->children, out, out_len, found, base_url);
}
}
int extract_products(const char *html, size_t len, char **out) {
int extract_products(const char *html, size_t len, const char *base_url, char **out) {
if (!html || len == 0 || !out) return -1;
htmlDocPtr doc = htmlReadMemory(html, (int)len, NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
if (!doc) return -1;
@ -507,7 +570,7 @@ int extract_products(const char *html, size_t len, char **out) {
size_t out_len = 0;
int found = 0;
traverse_and_collect_products(root, out, &out_len, &found);
traverse_and_collect_products(root, out, &out_len, &found, base_url);
xmlFreeDoc(doc);
xmlCleanupParser();