Compare commits
3 Commits
2afe840ede
...
485333dcb1
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
485333dcb1 | ||
|
|
aab9d411dd | ||
|
|
46c174be8d |
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
gscrape
|
||||
test-site
|
||||
@ -28,6 +28,6 @@ char *extract_h1(const char *html, size_t len);
|
||||
* Returns 0 and allocates *out on success (caller frees), or -1.
|
||||
* Each line represents one product; fields are separated by '|'.
|
||||
*/
|
||||
int extract_products(const char *html, size_t len, char **out);
|
||||
int extract_products(const char *html, size_t len, const char *base_url, char **out);
|
||||
|
||||
#endif /* SCRAPER_H */
|
||||
|
||||
30
src/main.c
30
src/main.c
@ -12,24 +12,24 @@ static char *json_escape(const char *s) {
|
||||
size_t oi = 0;
|
||||
out[oi++] = '"';
|
||||
for (size_t i = 0; i < len; ++i) {
|
||||
unsigned char c = s[i];
|
||||
if (c == '"' || c == '\\') {
|
||||
unsigned char uc = (unsigned char)s[i];
|
||||
if (uc == '"' || uc == '\\') {
|
||||
if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); }
|
||||
out[oi++] = '\\';
|
||||
out[oi++] = c;
|
||||
} else if (c == '\b') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'b'; }
|
||||
else if (c == '\f') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'f'; }
|
||||
else if (c == '\n') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'n'; }
|
||||
else if (c == '\r') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'r'; }
|
||||
else if (c == '\t') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 't'; }
|
||||
else if (c < 0x20) {
|
||||
char buf[7]; snprintf(buf, sizeof(buf), "\\u%04x", c);
|
||||
out[oi++] = (char)uc;
|
||||
} else if (uc == '\b') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'b'; }
|
||||
else if (uc == '\f') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'f'; }
|
||||
else if (uc == '\n') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'n'; }
|
||||
else if (uc == '\r') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'r'; }
|
||||
else if (uc == '\t') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 't'; }
|
||||
else if (uc < 0x20) {
|
||||
char buf[7]; snprintf(buf, sizeof(buf), "\\u%04x", (unsigned)uc);
|
||||
size_t bl = strlen(buf);
|
||||
if (oi + bl + 1 >= cap) { cap = cap + bl + 16; out = realloc(out, cap); }
|
||||
memcpy(out + oi, buf, bl); oi += bl;
|
||||
} else {
|
||||
if (oi + 1 >= cap) { cap *= 2; out = realloc(out, cap); }
|
||||
out[oi++] = c;
|
||||
out[oi++] = (char)uc;
|
||||
}
|
||||
}
|
||||
if (oi + 2 >= cap) { cap += 2; out = realloc(out, cap); }
|
||||
@ -52,7 +52,7 @@ static void print_kv_json(const char *kv_lines) {
|
||||
size_t keylen = (size_t)(sep - p);
|
||||
const char *vstart = sep + 1;
|
||||
while (vstart < p + linelen && (*vstart == ' ' || *vstart == '\t')) vstart++;
|
||||
size_t vallen = (p + linelen) - vstart;
|
||||
size_t vallen = (size_t)((p + linelen) - vstart);
|
||||
char *key = malloc(keylen + 1);
|
||||
char *val = malloc(vallen + 1);
|
||||
memcpy(key, p, keylen); key[keylen] = '\0';
|
||||
@ -93,7 +93,7 @@ static void print_products_json(const char *products_lines) {
|
||||
size_t plen = (size_t)(s2 - p2);
|
||||
const char *u2 = s2 + 1;
|
||||
while (u2 < p + linelen && (*u2 == ' ' || *u2 == '\t')) u2++;
|
||||
size_t ulen = (p + linelen) - u2;
|
||||
size_t ulen = (size_t)((p + linelen) - u2);
|
||||
name = malloc(nlen + 1); memcpy(name, p, nlen); name[nlen] = '\0';
|
||||
price = malloc(plen + 1); memcpy(price, p2, plen); price[plen] = '\0';
|
||||
url = malloc(ulen + 1); memcpy(url, u2, ulen); url[ulen] = '\0';
|
||||
@ -103,7 +103,7 @@ static void print_products_json(const char *products_lines) {
|
||||
const char *p2 = s1 + 1;
|
||||
while (nlen > 0 && (p[nlen-1] == ' ' || p[nlen-1] == '\t')) nlen--;
|
||||
while (p2 < p + linelen && (*p2 == ' ' || *p2 == '\t')) p2++;
|
||||
size_t plen = (p + linelen) - p2;
|
||||
size_t plen = (size_t)((p + linelen) - p2);
|
||||
name = malloc(nlen + 1); memcpy(name, p, nlen); name[nlen] = '\0';
|
||||
price = malloc(plen + 1); memcpy(price, p2, plen); price[plen] = '\0';
|
||||
url = strdup("");
|
||||
@ -202,7 +202,7 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
|
||||
char *products = NULL;
|
||||
if (extract_products(html, html_len, &products) == 0 && products) {
|
||||
if (extract_products(html, html_len, url, &products) == 0 && products) {
|
||||
printf("\"products\":");
|
||||
print_products_json(products);
|
||||
free(products);
|
||||
|
||||
@ -390,7 +390,7 @@ static char *find_price_in_node(xmlNode *node) {
|
||||
if (xmlStrcasecmp(cur->name, (const xmlChar *)"a") != 0 &&
|
||||
xmlStrcasecmp(cur->name, (const xmlChar *)"h1") != 0 &&
|
||||
xmlStrcasecmp(cur->name, (const xmlChar *)"h2") != 0 &&
|
||||
(strchr(txt, '$') || strchr(txt, '£') || strchr(txt, '€') || strstr(txt, "USD") || strstr(txt, "EUR"))) {
|
||||
(strchr(txt, '$') || strstr(txt, "£") || strstr(txt, "€") || strstr(txt, "USD") || strstr(txt, "EUR"))) {
|
||||
if (strlen(txt) < 200) {
|
||||
char *trim = strdup(txt);
|
||||
free(txt);
|
||||
@ -411,7 +411,61 @@ static char *find_price_in_node(xmlNode *node) {
|
||||
* Looks for h2.product-name and span.price preferentially, falls back to
|
||||
* generic name/price finders.
|
||||
*/
|
||||
static void collect_product_from_node(xmlNode *node, char **out, size_t *out_len, int *found) {
|
||||
/* Length of the URI scheme at the start of s (the text before ':'), or 0 when
 * s does not begin with one. Follows the RFC 3986 scheme syntax: one ASCII
 * letter followed by ASCII letters, digits, '+', '-' or '.'. ASCII ranges are
 * tested by hand so the result does not depend on the current locale.
 */
static size_t url_scheme_len(const char *s) {
    size_t n = 0;
    if (!((s[0] >= 'a' && s[0] <= 'z') || (s[0] >= 'A' && s[0] <= 'Z'))) return 0;
    for (n = 1; s[n]; ++n) {
        char c = s[n];
        int ok = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
                 (c >= '0' && c <= '9') || c == '+' || c == '-' || c == '.';
        if (!ok) break;
    }
    return s[n] == ':' ? n : 0;
}

/* Return a newly allocated string made of the first head_len bytes of head,
 * followed by mid, followed by tail. Returns NULL if allocation fails.
 */
static char *url_concat(const char *head, size_t head_len, const char *mid, const char *tail) {
    size_t mid_len = strlen(mid);
    size_t tail_len = strlen(tail);
    char *out = malloc(head_len + mid_len + tail_len + 1);
    if (!out) return NULL;
    memcpy(out, head, head_len);
    memcpy(out + head_len, mid, mid_len);
    memcpy(out + head_len + mid_len, tail, tail_len + 1); /* +1 copies the NUL */
    return out;
}

/* Resolve a possibly-relative href against a base URL. Returns a newly
 * allocated string (caller must free) or NULL on error (href is NULL or
 * allocation failed).
 *
 * Handled forms of href:
 *   - starts with a scheme ("https://...", "mailto:...") -> copied unchanged
 *   - "//host/path" (scheme-relative)  -> base scheme + href
 *   - "/path" (absolute path)          -> base scheme and host + href
 *   - "?query"                         -> base without query/fragment + href
 *   - "#fragment" or empty             -> base without fragment + href
 *   - anything else (relative path)    -> directory of the base path + href
 *
 * If base is NULL or empty, a copy of href is returned. Dot segments ("./",
 * "../") are appended as-is and not normalized.
 */
static char *resolve_url(const char *base, const char *href) {
    if (!href) return NULL;

    /* Only a scheme at the very start makes href absolute; a "://" later on
     * (for example inside a query string) must not count. */
    if (url_scheme_len(href) > 0 || !base || !*base)
        return url_concat("", 0, "", href);

    size_t scheme_len = url_scheme_len(base);
    /* base[scheme_len] is ':' whenever scheme_len > 0; && short-circuits
     * before reading past the terminator. */
    int has_authority = scheme_len > 0 &&
                        base[scheme_len + 1] == '/' && base[scheme_len + 2] == '/';
    const char *host = has_authority ? base + scheme_len + 3 : base;

    if (strncmp(href, "//", 2) == 0) {
        /* scheme-relative: without a usable base scheme, drop the slashes */
        if (!has_authority) return url_concat("", 0, "", href + 2);
        return url_concat(base, scheme_len + 1, "", href); /* "scheme:" + "//..." */
    }

    /* The host ends at the first '/', '?' or '#', so a slash that appears
     * inside the base's query or fragment is never taken for the path. */
    size_t host_off = (size_t)(host - base);
    size_t origin_len = host_off + strcspn(host, "/?#");

    if (href[0] == '/') {
        /* absolute path on same host */
        return url_concat(base, origin_len, "", href);
    }

    /* Length of base with any query and fragment removed. */
    size_t path_end = host_off + strcspn(host, "?#");

    if (href[0] == '?') return url_concat(base, path_end, "", href);
    if (href[0] == '#' || href[0] == '\0')
        return url_concat(base, strcspn(base, "#"), "", href);

    /* relative path: keep the base up to and including the last '/' of its
     * path component */
    size_t dir_len = path_end;
    while (dir_len > origin_len && base[dir_len - 1] != '/') dir_len--;

    if (dir_len == origin_len) {
        /* The base has no path at all ("http://example.com"); the slashes of
         * "://" must not be used as the directory separator. */
        return url_concat(base, origin_len, "/", href);
    }
    return url_concat(base, dir_len, "", href);
}
|
||||
|
||||
static void collect_product_from_node(xmlNode *node, char **out, size_t *out_len, int *found, const char *base_url) {
|
||||
if (!node) return;
|
||||
char *name = NULL;
|
||||
char *price = NULL;
|
||||
@ -460,6 +514,15 @@ static void collect_product_from_node(xmlNode *node, char **out, size_t *out_len
|
||||
/* find product link if available */
|
||||
plink = find_href_in_node(node);
|
||||
if (!plink) plink = strdup("<no-link>");
|
||||
else {
|
||||
if (base_url) {
|
||||
char *abs = resolve_url(base_url, plink);
|
||||
if (abs) {
|
||||
free(plink);
|
||||
plink = abs;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (name || price) {
|
||||
if (!name) name = strdup("<no-name>");
|
||||
@ -472,7 +535,7 @@ static void collect_product_from_node(xmlNode *node, char **out, size_t *out_len
|
||||
free(plink);
|
||||
}
|
||||
|
||||
static void traverse_and_collect_products(xmlNode *node, char **out, size_t *out_len, int *found) {
|
||||
static void traverse_and_collect_products(xmlNode *node, char **out, size_t *out_len, int *found, const char *base_url) {
|
||||
for (xmlNode *cur = node; cur; cur = cur->next) {
|
||||
if (cur->type != XML_ELEMENT_NODE) continue;
|
||||
|
||||
@ -486,18 +549,18 @@ static void traverse_and_collect_products(xmlNode *node, char **out, size_t *out
|
||||
else if (cls && xmlStrcasecmp(cur->name, (const xmlChar *)"li") == 0 && attr_contains(cls, "product")) is_product = true;
|
||||
|
||||
if (is_product) {
|
||||
collect_product_from_node(cur, out, out_len, found);
|
||||
collect_product_from_node(cur, out, out_len, found, base_url);
|
||||
}
|
||||
|
||||
if (data_products) xmlFree(data_products);
|
||||
if (itemtype) xmlFree(itemtype);
|
||||
if (cls) xmlFree(cls);
|
||||
|
||||
if (cur->children) traverse_and_collect_products(cur->children, out, out_len, found);
|
||||
if (cur->children) traverse_and_collect_products(cur->children, out, out_len, found, base_url);
|
||||
}
|
||||
}
|
||||
|
||||
int extract_products(const char *html, size_t len, char **out) {
|
||||
int extract_products(const char *html, size_t len, const char *base_url, char **out) {
|
||||
if (!html || len == 0 || !out) return -1;
|
||||
htmlDocPtr doc = htmlReadMemory(html, (int)len, NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
|
||||
if (!doc) return -1;
|
||||
@ -507,7 +570,7 @@ int extract_products(const char *html, size_t len, char **out) {
|
||||
size_t out_len = 0;
|
||||
int found = 0;
|
||||
|
||||
traverse_and_collect_products(root, out, &out_len, &found);
|
||||
traverse_and_collect_products(root, out, &out_len, &found, base_url);
|
||||
|
||||
xmlFreeDoc(doc);
|
||||
xmlCleanupParser();
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user