diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c197b62 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +gscrape diff --git a/gscrape b/gscrape deleted file mode 100755 index f6ecf0c..0000000 Binary files a/gscrape and /dev/null differ diff --git a/src/main.c b/src/main.c index f0132bc..bd4a08b 100644 --- a/src/main.c +++ b/src/main.c @@ -12,24 +12,24 @@ static char *json_escape(const char *s) { size_t oi = 0; out[oi++] = '"'; for (size_t i = 0; i < len; ++i) { - unsigned char c = s[i]; - if (c == '"' || c == '\\') { + unsigned char uc = (unsigned char)s[i]; + if (uc == '"' || uc == '\\') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; - out[oi++] = c; - } else if (c == '\b') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'b'; } - else if (c == '\f') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'f'; } - else if (c == '\n') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'n'; } - else if (c == '\r') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'r'; } - else if (c == '\t') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 't'; } - else if (c < 0x20) { - char buf[7]; snprintf(buf, sizeof(buf), "\\u%04x", c); + out[oi++] = (char)uc; + } else if (uc == '\b') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'b'; } + else if (uc == '\f') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'f'; } + else if (uc == '\n') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'n'; } + else if (uc == '\r') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'r'; } + else if (uc == '\t') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; 
out[oi++] = 't'; } + else if (uc < 0x20) { + char buf[7]; snprintf(buf, sizeof(buf), "\\u%04x", (unsigned)uc); size_t bl = strlen(buf); if (oi + bl + 1 >= cap) { cap = cap + bl + 16; out = realloc(out, cap); } memcpy(out + oi, buf, bl); oi += bl; } else { if (oi + 1 >= cap) { cap *= 2; out = realloc(out, cap); } - out[oi++] = c; + out[oi++] = (char)uc; } } if (oi + 2 >= cap) { cap += 2; out = realloc(out, cap); } @@ -52,7 +52,7 @@ static void print_kv_json(const char *kv_lines) { size_t keylen = (size_t)(sep - p); const char *vstart = sep + 1; while (vstart < p + linelen && (*vstart == ' ' || *vstart == '\t')) vstart++; - size_t vallen = (p + linelen) - vstart; + size_t vallen = (size_t)((p + linelen) - vstart); char *key = malloc(keylen + 1); char *val = malloc(vallen + 1); memcpy(key, p, keylen); key[keylen] = '\0'; @@ -93,7 +93,7 @@ static void print_products_json(const char *products_lines) { size_t plen = (size_t)(s2 - p2); const char *u2 = s2 + 1; while (u2 < p + linelen && (*u2 == ' ' || *u2 == '\t')) u2++; - size_t ulen = (p + linelen) - u2; + size_t ulen = (size_t)((p + linelen) - u2); name = malloc(nlen + 1); memcpy(name, p, nlen); name[nlen] = '\0'; price = malloc(plen + 1); memcpy(price, p2, plen); price[plen] = '\0'; url = malloc(ulen + 1); memcpy(url, u2, ulen); url[ulen] = '\0'; @@ -103,7 +103,7 @@ static void print_products_json(const char *products_lines) { const char *p2 = s1 + 1; while (nlen > 0 && (p[nlen-1] == ' ' || p[nlen-1] == '\t')) nlen--; while (p2 < p + linelen && (*p2 == ' ' || *p2 == '\t')) p2++; - size_t plen = (p + linelen) - p2; + size_t plen = (size_t)((p + linelen) - p2); name = malloc(nlen + 1); memcpy(name, p, nlen); name[nlen] = '\0'; price = malloc(plen + 1); memcpy(price, p2, plen); price[plen] = '\0'; url = strdup(""); @@ -202,7 +202,7 @@ int main(int argc, char **argv) { } char *products = NULL; - if (extract_products(html, html_len, &products) == 0 && products) { + if (extract_products(html, html_len, url, &products) == 0 
/* Return the length of the URI scheme that s starts with (the characters
 * before the ':'), or 0 when s does not start with "scheme:".
 * Scheme syntax follows RFC 3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ).
 * ASCII ranges are tested by hand so the result does not depend on locale.
 */
static size_t url_scheme_len(const char *s) {
    if (!((s[0] >= 'a' && s[0] <= 'z') || (s[0] >= 'A' && s[0] <= 'Z'))) return 0;
    size_t i = 1;
    for (;; i++) {
        char c = s[i];
        int ok = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
                 (c >= '0' && c <= '9') || c == '+' || c == '-' || c == '.';
        if (!ok) break;
    }
    return s[i] == ':' ? i : 0;
}

/* Allocate and return head[0..head_len) + mid + tail as one NUL-terminated
 * string. Returns NULL if the allocation fails. Caller frees the result.
 */
static char *url_concat(const char *head, size_t head_len, const char *mid, const char *tail) {
    size_t mid_len = strlen(mid);
    size_t tail_len = strlen(tail);
    char *out = malloc(head_len + mid_len + tail_len + 1);
    if (!out) return NULL;
    memcpy(out, head, head_len);
    memcpy(out + head_len, mid, mid_len);
    memcpy(out + head_len + mid_len, tail, tail_len + 1); /* +1 copies the NUL */
    return out;
}

/* Resolve a possibly-relative href against a base URL.
 *
 * base: URL of the document the href was found in; may be NULL or empty,
 *       in which case href is returned unchanged (copied).
 * href: reference to resolve; NULL yields NULL.
 *
 * Handled forms of href:
 *   "scheme:..."   already absolute, copied as-is
 *   "//host/..."   scheme-relative, takes the scheme of base
 *   "/path"        absolute path on the host of base
 *   "?query"       replaces the query of base
 *   "#frag" or ""  replaces (or drops) the fragment of base
 *   anything else  relative path, replaces the last path segment of base
 *
 * "." and ".." segments are NOT normalised; they are kept verbatim.
 *
 * Returns a newly allocated string (caller must free) or NULL when href is
 * NULL or memory allocation fails.
 */
static char *resolve_url(const char *base, const char *href) {
    if (!href) return NULL;

    /* Only a leading "scheme:" makes a reference absolute. Searching for
     * "://" anywhere would misclassify e.g. "/go?u=http://x/". */
    if (!base || base[0] == '\0' || url_scheme_len(href) > 0)
        return url_concat(href, 0, "", href); /* plain copy */

    /* Locate the components of base: [scheme://]authority path ?query #frag */
    size_t scheme_len = url_scheme_len(base);
    int has_authority = scheme_len > 0 && strncmp(base + scheme_len, "://", 3) == 0;
    size_t host_off = has_authority ? scheme_len + 3 : 0;
    size_t auth_end = host_off + strcspn(base + host_off, "/?#");
    size_t path_end = auth_end + strcspn(base + auth_end, "?#");
    size_t query_end = path_end + strcspn(base + path_end, "#");

    if (href[0] == '/' && href[1] == '/') {
        /* scheme-relative; without a "scheme://" base there is nothing to
         * borrow, so hand back the host-and-path part on its own */
        if (!has_authority) return url_concat(href, 0, "", href + 2);
        return url_concat(base, scheme_len + 1, "", href); /* "scheme:" + "//..." */
    }

    if (href[0] == '/') {
        /* absolute path on same host */
        return url_concat(base, auth_end, "", href);
    }

    if (href[0] == '?') {
        /* same document path, new query */
        return url_concat(base, path_end, "", href);
    }

    if (href[0] == '#' || href[0] == '\0') {
        /* same document, new (or no) fragment */
        return url_concat(base, query_end, "", href);
    }

    /* relative path: keep base up to and including the last '/' of its
     * path component (slashes in the query or fragment do not count) */
    size_t dir_end = path_end;
    while (dir_end > auth_end && base[dir_end - 1] != '/') dir_end--;
    if (dir_end == auth_end) {
        /* base has no path at all ("http://host"): insert the separator
         * instead of gluing href onto the host name */
        return url_concat(base, auth_end, "/", href);
    }
    return url_concat(base, dir_end, "", href);
}
|| len == 0 || !out) return -1; htmlDocPtr doc = htmlReadMemory(html, (int)len, NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING); if (!doc) return -1; @@ -507,7 +570,7 @@ int extract_products(const char *html, size_t len, char **out) { size_t out_len = 0; int found = 0; - traverse_and_collect_products(root, out, &out_len, &found); + traverse_and_collect_products(root, out, &out_len, &found, base_url); xmlFreeDoc(doc); xmlCleanupParser();