Added binary to gitignore and a link to the item in the store
This commit is contained in:
parent
46c174be8d
commit
aab9d411dd
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
gscrape
|
||||||
30
src/main.c
30
src/main.c
@ -12,24 +12,24 @@ static char *json_escape(const char *s) {
|
|||||||
size_t oi = 0;
|
size_t oi = 0;
|
||||||
out[oi++] = '"';
|
out[oi++] = '"';
|
||||||
for (size_t i = 0; i < len; ++i) {
|
for (size_t i = 0; i < len; ++i) {
|
||||||
unsigned char c = s[i];
|
unsigned char uc = (unsigned char)s[i];
|
||||||
if (c == '"' || c == '\\') {
|
if (uc == '"' || uc == '\\') {
|
||||||
if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); }
|
if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); }
|
||||||
out[oi++] = '\\';
|
out[oi++] = '\\';
|
||||||
out[oi++] = c;
|
out[oi++] = (char)uc;
|
||||||
} else if (c == '\b') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'b'; }
|
} else if (uc == '\b') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'b'; }
|
||||||
else if (c == '\f') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'f'; }
|
else if (uc == '\f') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'f'; }
|
||||||
else if (c == '\n') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'n'; }
|
else if (uc == '\n') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'n'; }
|
||||||
else if (c == '\r') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'r'; }
|
else if (uc == '\r') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'r'; }
|
||||||
else if (c == '\t') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 't'; }
|
else if (uc == '\t') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 't'; }
|
||||||
else if (c < 0x20) {
|
else if (uc < 0x20) {
|
||||||
char buf[7]; snprintf(buf, sizeof(buf), "\\u%04x", c);
|
char buf[7]; snprintf(buf, sizeof(buf), "\\u%04x", (unsigned)uc);
|
||||||
size_t bl = strlen(buf);
|
size_t bl = strlen(buf);
|
||||||
if (oi + bl + 1 >= cap) { cap = cap + bl + 16; out = realloc(out, cap); }
|
if (oi + bl + 1 >= cap) { cap = cap + bl + 16; out = realloc(out, cap); }
|
||||||
memcpy(out + oi, buf, bl); oi += bl;
|
memcpy(out + oi, buf, bl); oi += bl;
|
||||||
} else {
|
} else {
|
||||||
if (oi + 1 >= cap) { cap *= 2; out = realloc(out, cap); }
|
if (oi + 1 >= cap) { cap *= 2; out = realloc(out, cap); }
|
||||||
out[oi++] = c;
|
out[oi++] = (char)uc;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (oi + 2 >= cap) { cap += 2; out = realloc(out, cap); }
|
if (oi + 2 >= cap) { cap += 2; out = realloc(out, cap); }
|
||||||
@ -52,7 +52,7 @@ static void print_kv_json(const char *kv_lines) {
|
|||||||
size_t keylen = (size_t)(sep - p);
|
size_t keylen = (size_t)(sep - p);
|
||||||
const char *vstart = sep + 1;
|
const char *vstart = sep + 1;
|
||||||
while (vstart < p + linelen && (*vstart == ' ' || *vstart == '\t')) vstart++;
|
while (vstart < p + linelen && (*vstart == ' ' || *vstart == '\t')) vstart++;
|
||||||
size_t vallen = (p + linelen) - vstart;
|
size_t vallen = (size_t)((p + linelen) - vstart);
|
||||||
char *key = malloc(keylen + 1);
|
char *key = malloc(keylen + 1);
|
||||||
char *val = malloc(vallen + 1);
|
char *val = malloc(vallen + 1);
|
||||||
memcpy(key, p, keylen); key[keylen] = '\0';
|
memcpy(key, p, keylen); key[keylen] = '\0';
|
||||||
@ -93,7 +93,7 @@ static void print_products_json(const char *products_lines) {
|
|||||||
size_t plen = (size_t)(s2 - p2);
|
size_t plen = (size_t)(s2 - p2);
|
||||||
const char *u2 = s2 + 1;
|
const char *u2 = s2 + 1;
|
||||||
while (u2 < p + linelen && (*u2 == ' ' || *u2 == '\t')) u2++;
|
while (u2 < p + linelen && (*u2 == ' ' || *u2 == '\t')) u2++;
|
||||||
size_t ulen = (p + linelen) - u2;
|
size_t ulen = (size_t)((p + linelen) - u2);
|
||||||
name = malloc(nlen + 1); memcpy(name, p, nlen); name[nlen] = '\0';
|
name = malloc(nlen + 1); memcpy(name, p, nlen); name[nlen] = '\0';
|
||||||
price = malloc(plen + 1); memcpy(price, p2, plen); price[plen] = '\0';
|
price = malloc(plen + 1); memcpy(price, p2, plen); price[plen] = '\0';
|
||||||
url = malloc(ulen + 1); memcpy(url, u2, ulen); url[ulen] = '\0';
|
url = malloc(ulen + 1); memcpy(url, u2, ulen); url[ulen] = '\0';
|
||||||
@ -103,7 +103,7 @@ static void print_products_json(const char *products_lines) {
|
|||||||
const char *p2 = s1 + 1;
|
const char *p2 = s1 + 1;
|
||||||
while (nlen > 0 && (p[nlen-1] == ' ' || p[nlen-1] == '\t')) nlen--;
|
while (nlen > 0 && (p[nlen-1] == ' ' || p[nlen-1] == '\t')) nlen--;
|
||||||
while (p2 < p + linelen && (*p2 == ' ' || *p2 == '\t')) p2++;
|
while (p2 < p + linelen && (*p2 == ' ' || *p2 == '\t')) p2++;
|
||||||
size_t plen = (p + linelen) - p2;
|
size_t plen = (size_t)((p + linelen) - p2);
|
||||||
name = malloc(nlen + 1); memcpy(name, p, nlen); name[nlen] = '\0';
|
name = malloc(nlen + 1); memcpy(name, p, nlen); name[nlen] = '\0';
|
||||||
price = malloc(plen + 1); memcpy(price, p2, plen); price[plen] = '\0';
|
price = malloc(plen + 1); memcpy(price, p2, plen); price[plen] = '\0';
|
||||||
url = strdup("");
|
url = strdup("");
|
||||||
@ -202,7 +202,7 @@ int main(int argc, char **argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
char *products = NULL;
|
char *products = NULL;
|
||||||
if (extract_products(html, html_len, &products) == 0 && products) {
|
if (extract_products(html, html_len, url, &products) == 0 && products) {
|
||||||
printf("\"products\":");
|
printf("\"products\":");
|
||||||
print_products_json(products);
|
print_products_json(products);
|
||||||
free(products);
|
free(products);
|
||||||
|
|||||||
@ -411,7 +411,61 @@ static char *find_price_in_node(xmlNode *node) {
|
|||||||
* Looks for h2.product-name and span.price preferentially, falls back to
|
* Looks for h2.product-name and span.price preferentially, falls back to
|
||||||
* generic name/price finders.
|
* generic name/price finders.
|
||||||
*/
|
*/
|
||||||
static void collect_product_from_node(xmlNode *node, char **out, size_t *out_len, int *found) {
|
/* Length of the URI scheme at the start of `s`, not counting the ':'.
 * A scheme is an ASCII letter followed by letters, digits, '+', '-' or '.',
 * and must be terminated by ':'. Returns 0 when `s` has no such prefix.
 */
static size_t resolve_url_scheme_len(const char *s) {
    size_t i = 0;
    if (!((s[0] >= 'a' && s[0] <= 'z') || (s[0] >= 'A' && s[0] <= 'Z'))) return 0;
    for (i = 1; ; i++) {
        char c = s[i];
        if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
            (c >= '0' && c <= '9') || c == '+' || c == '-' || c == '.') continue;
        break;
    }
    return s[i] == ':' ? i : 0;
}

/* Build a new string from the first `head_len` bytes of `head`, then `sep`,
 * then `tail`. Returns a malloc'd string (caller frees) or NULL on
 * allocation failure.
 */
static char *resolve_url_join(const char *head, size_t head_len,
                              const char *sep, const char *tail) {
    size_t sep_len = strlen(sep);
    size_t tail_len = strlen(tail);
    char *out = malloc(head_len + sep_len + tail_len + 1);
    if (!out) return NULL;
    memcpy(out, head, head_len);
    memcpy(out + head_len, sep, sep_len);
    memcpy(out + head_len + sep_len, tail, tail_len + 1); /* copies the NUL too */
    return out;
}

/* Resolve a possibly-relative href against a base URL. Returns a newly
 * allocated string (caller must free) or NULL on error (href is NULL or
 * allocation failed).
 *
 * Handled forms of href:
 *   "scheme:..."   already absolute (http://, mailto:, ...) -> copied as is
 *   "//host/path"  scheme-relative -> takes the scheme of base
 *   "/path"        absolute path   -> appended to scheme://host of base
 *   "?query"       replaces the query and fragment of base
 *   "#fragment"    replaces the fragment of base
 *   ""             base without its fragment
 *   "path"         relative path   -> replaces the last path segment of base
 *
 * A NULL base yields a plain copy of href. Dot segments ("./", "../") are
 * not normalised; they are appended verbatim.
 */
static char *resolve_url(const char *base, const char *href) {
    if (!href) return NULL;
    if (!base) return resolve_url_join(href, strlen(href), "", "");

    /* Only a leading "scheme:" makes href absolute; a "://" buried in a
     * query string (e.g. "/go?u=http://x") must not. */
    if (resolve_url_scheme_len(href) > 0) {
        return resolve_url_join(href, strlen(href), "", "");
    }

    /* Locate the pieces of base: [scheme://][host][/path][?query][#fragment] */
    size_t scheme_len = resolve_url_scheme_len(base);
    size_t host_start = 0;
    if (scheme_len > 0 && strncmp(base + scheme_len, "://", 3) == 0) {
        host_start = scheme_len + 3;
    }
    size_t host_end = host_start + strcspn(base + host_start, "/?#");
    size_t path_end = strcspn(base, "?#"); /* where query/fragment begins */
    size_t frag_pos = strcspn(base, "#");

    if (strncmp(href, "//", 2) == 0) {
        /* scheme-relative: without a scheme in base, drop the slashes */
        if (host_start == 0) return resolve_url_join(href + 2, strlen(href + 2), "", "");
        return resolve_url_join(base, scheme_len + 1, "", href); /* "scheme:" + "//..." */
    }

    if (href[0] == '/') {
        /* absolute path on same host */
        return resolve_url_join(base, host_end, "", href);
    }

    if (href[0] == '\0' || href[0] == '#') {
        /* same document, optionally with a new fragment */
        return resolve_url_join(base, frag_pos, "", href);
    }

    if (href[0] == '?') {
        /* same path, new query */
        return resolve_url_join(base, path_end, "", href);
    }

    /* relative path: keep base up to and including the last '/' of its path.
     * The search is confined to the path so that slashes in "://" or in the
     * query string are never mistaken for a directory separator. */
    size_t dir_end = path_end;
    while (dir_end > host_end && base[dir_end - 1] != '/') dir_end--;
    if (dir_end > host_end) {
        return resolve_url_join(base, dir_end, "", href);
    }

    /* base has no path at all ("http://example.com"): add the root slash */
    return resolve_url_join(base, host_end, "/", href);
}
|
||||||
|
|
||||||
|
static void collect_product_from_node(xmlNode *node, char **out, size_t *out_len, int *found, const char *base_url) {
|
||||||
if (!node) return;
|
if (!node) return;
|
||||||
char *name = NULL;
|
char *name = NULL;
|
||||||
char *price = NULL;
|
char *price = NULL;
|
||||||
@ -460,6 +514,15 @@ static void collect_product_from_node(xmlNode *node, char **out, size_t *out_len
|
|||||||
/* find product link if available */
|
/* find product link if available */
|
||||||
plink = find_href_in_node(node);
|
plink = find_href_in_node(node);
|
||||||
if (!plink) plink = strdup("<no-link>");
|
if (!plink) plink = strdup("<no-link>");
|
||||||
|
else {
|
||||||
|
if (base_url) {
|
||||||
|
char *abs = resolve_url(base_url, plink);
|
||||||
|
if (abs) {
|
||||||
|
free(plink);
|
||||||
|
plink = abs;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (name || price) {
|
if (name || price) {
|
||||||
if (!name) name = strdup("<no-name>");
|
if (!name) name = strdup("<no-name>");
|
||||||
@ -472,7 +535,7 @@ static void collect_product_from_node(xmlNode *node, char **out, size_t *out_len
|
|||||||
free(plink);
|
free(plink);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void traverse_and_collect_products(xmlNode *node, char **out, size_t *out_len, int *found) {
|
static void traverse_and_collect_products(xmlNode *node, char **out, size_t *out_len, int *found, const char *base_url) {
|
||||||
for (xmlNode *cur = node; cur; cur = cur->next) {
|
for (xmlNode *cur = node; cur; cur = cur->next) {
|
||||||
if (cur->type != XML_ELEMENT_NODE) continue;
|
if (cur->type != XML_ELEMENT_NODE) continue;
|
||||||
|
|
||||||
@ -486,18 +549,18 @@ static void traverse_and_collect_products(xmlNode *node, char **out, size_t *out
|
|||||||
else if (cls && xmlStrcasecmp(cur->name, (const xmlChar *)"li") == 0 && attr_contains(cls, "product")) is_product = true;
|
else if (cls && xmlStrcasecmp(cur->name, (const xmlChar *)"li") == 0 && attr_contains(cls, "product")) is_product = true;
|
||||||
|
|
||||||
if (is_product) {
|
if (is_product) {
|
||||||
collect_product_from_node(cur, out, out_len, found);
|
collect_product_from_node(cur, out, out_len, found, base_url);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (data_products) xmlFree(data_products);
|
if (data_products) xmlFree(data_products);
|
||||||
if (itemtype) xmlFree(itemtype);
|
if (itemtype) xmlFree(itemtype);
|
||||||
if (cls) xmlFree(cls);
|
if (cls) xmlFree(cls);
|
||||||
|
|
||||||
if (cur->children) traverse_and_collect_products(cur->children, out, out_len, found);
|
if (cur->children) traverse_and_collect_products(cur->children, out, out_len, found, base_url);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int extract_products(const char *html, size_t len, char **out) {
|
int extract_products(const char *html, size_t len, const char *base_url, char **out) {
|
||||||
if (!html || len == 0 || !out) return -1;
|
if (!html || len == 0 || !out) return -1;
|
||||||
htmlDocPtr doc = htmlReadMemory(html, (int)len, NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
|
htmlDocPtr doc = htmlReadMemory(html, (int)len, NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
|
||||||
if (!doc) return -1;
|
if (!doc) return -1;
|
||||||
@ -507,7 +570,7 @@ int extract_products(const char *html, size_t len, char **out) {
|
|||||||
size_t out_len = 0;
|
size_t out_len = 0;
|
||||||
int found = 0;
|
int found = 0;
|
||||||
|
|
||||||
traverse_and_collect_products(root, out, &out_len, &found);
|
traverse_and_collect_products(root, out, &out_len, &found, base_url);
|
||||||
|
|
||||||
xmlFreeDoc(doc);
|
xmlFreeDoc(doc);
|
||||||
xmlCleanupParser();
|
xmlCleanupParser();
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user