2025-12-04 18:34:35 -07:00
|
|
|
#ifndef SCRAPER_H
|
|
|
|
|
#define SCRAPER_H
|
|
|
|
|
|
|
|
|
|
#include <stdlib.h>
|
|
|
|
|
|
|
|
|
|
/* Fetch the resource identified by `url`.
 * NOTE(review): the contract is not documented in this header. Presumably
 * returns 0 on success and stores a heap-allocated buffer in *out_buf with
 * its length in *out_len (caller frees), mirroring the extract_* functions
 * declared in this file -- confirm against the implementation.
 */
int fetch_url(const char *url, char **out_buf, size_t *out_len);
|
|
|
|
|
/* Extract the page title from the `len`-sized buffer `html`.
 * NOTE(review): presumably returns NULL when no title is found, and the
 * returned string is presumably heap-allocated and owned by the caller --
 * neither is stated here; confirm against the implementation.
 */
char *extract_title(const char *html, size_t len);
|
|
|
|
|
|
|
|
|
|
/* Extract all <meta name=... content=...> and <meta property=... content=...> tags.
|
|
|
|
|
* Returns 0 on success and allocates *out with a newline-separated list
|
|
|
|
|
* of "key: value" lines. Caller must free(*out).
|
|
|
|
|
*/
|
|
|
|
|
/* NOTE(review): `len` is presumably the byte length of `html`; whether `html`
 * must also be NUL-terminated, and which value is returned on failure, are
 * not stated in this header -- confirm against the implementation.
 */
int extract_meta(const char *html, size_t len, char **out);
|
|
|
|
|
|
|
|
|
|
/* Extract Open Graph tags (meta property="og:...") with the same output format and ownership rules as extract_meta(). */
|
|
|
|
|
/* NOTE(review): `len` is presumably the byte length of `html`; the return
 * value on failure is not stated in this header -- confirm against the
 * implementation.
 */
int extract_og(const char *html, size_t len, char **out);
|
|
|
|
|
|
|
|
|
|
/* Extract the first <script type="application/ld+json"> that looks like a
|
|
|
|
|
* Product schema. Returns 0 and allocates *out_json (caller frees) or
|
|
|
|
|
* returns -1 if not found.
|
|
|
|
|
*/
|
|
|
|
|
/* NOTE(review): `len` is presumably the byte length of `html`; whether
 * *out_json is left untouched or set to NULL when -1 is returned is not
 * stated in this header -- confirm against the implementation.
 */
int extract_jsonld_product(const char *html, size_t len, char **out_json);
|
|
|
|
|
|
|
|
|
|
/* Extract the text content of the first <h1> element. Returns NULL if there is none. */
|
|
|
|
|
/* NOTE(review): ownership of the returned string is not stated in this
 * header; presumably it is heap-allocated and must be freed by the caller --
 * confirm against the implementation.
 */
char *extract_h1(const char *html, size_t len);
|
|
|
|
|
|
|
|
|
|
/* Extract product listings (one per line) as "name | price | url".
|
|
|
|
|
* Returns 0 and allocates *out on success (caller frees), or returns -1 on failure.
|
|
|
|
|
* Each line represents one product; fields are separated by '|'.
|
|
|
|
|
*/
|
2025-12-04 18:37:33 -07:00
|
|
|
/* NOTE(review): `base_url` is not described in this header; presumably it is
 * used to resolve relative product links into the absolute "url" field --
 * confirm against the implementation. `len` is presumably the byte length
 * of `html`.
 */
int extract_products(const char *html, size_t len, const char *base_url, char **out);
|
2025-12-04 18:34:35 -07:00
|
|
|
|
|
|
|
|
#endif /* SCRAPER_H */
|