From 2afe840edee2af591d55be73797dd23132a26027 Mon Sep 17 00:00:00 2001 From: ganome Date: Thu, 4 Dec 2025 18:34:35 -0700 Subject: [PATCH] Initial Commit --- Makefile | 13 ++ gscrape | Bin 0 -> 36984 bytes include/scraper.h | 33 +++ src/main.c | 217 +++++++++++++++++++ src/scraper.c | 520 ++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 783 insertions(+) create mode 100644 Makefile create mode 100755 gscrape create mode 100644 include/scraper.h create mode 100644 src/main.c create mode 100644 src/scraper.c diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..7349ef0 --- /dev/null +++ b/Makefile @@ -0,0 +1,13 @@ +CC=gcc +CFLAGS=-Iinclude -Wall $(shell pkg-config --cflags libxml-2.0 libcurl) +LDFLAGS=$(shell pkg-config --libs libxml-2.0 libcurl) +SRCS=src/main.c src/scraper.c +TARGET=gscrape + +all: $(TARGET) + +$(TARGET): $(SRCS) + $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS) + +clean: + rm -f $(TARGET) *.o src/*.o diff --git a/gscrape b/gscrape new file mode 100755 index 0000000000000000000000000000000000000000..f6ecf0cc823d10d69b4b6b478906338bb98f8fc7 GIT binary patch literal 36984 zcmeHwePC48weLyz5HK)NiAAg$Q=lLr1_Zm2ac9q=j#JmG-fXcbs;TmRm2FD(>9`dBTfqxaRd%)duUlVci$fyN3)KSi`@%cD_5M(0FjSi&RZ~}^TSRQhqjiM+daz#C!lIfWT75(h!(gyg z`F3x4^>%NiuO^^XQgy(th}4rrd9^P@$$-C(2;~d;eSrWPR4a(ri(p$WOACl!dMh*GZ<=TfW2Rt*oi@1!{J2!6Zvr>UU~T3zeZO!W>qsb<6pL!UyI2Zi>zSReB&O@}V+f-XxORparLEojME)jtZH_ zhy_2*L^W;Hf?LO}8mBZ@n`@C*W;LP3uE3$ErI zoI%!;^ph>rF~x!tPaWA7T-lmDB@CKt!H>1z^DTIm1uwMVa;nSR6&8GkMZU;_ zyDYfJf-B!6cButF&LV%A1-HiEtrq+Qi+r^Om#)U#dJ8UHlkpuE{6veM1`GZr3%z#D%)nWlgnKa@;Upp8F}OjCf3-<3>LP>+8b_ z(-c_a+mdOz;KVm2(-dIitCML8tnnquGzHXnUNTLAG=6+CO#w8XnM}JV{r(ZNy%aFx zZzj_eDC5s3(-a`%hmvUujPVDOX$pw(yOL=Ngz@hu(-Z*XHzv~*_~M~tngU*YTQW_7 zF1{(5rT`aTolH|;i!Vv0DWJvkl4%NL@#B+e3SjX}Go6_G@Ku^t={jw##{Hj|d+(L0 z={s%dTW#rEZ0Vb9>2KT8*V@u`wsfT}eYGw9HCuY4Expc`zQC4VZc7)~((`QTQ*G%J zZRr`d^kiH5$Tw2$`LQkio-O^hE&ZA;{X1LwIa~U$Eq%b2emEt)_}vXHzq#7e@}{Tp zk0Yg9+>4`&pZ2t`JRQR(k@E|Pepp%HI;{y#g8Ua0m`w`P>7)46 zu<|O1Ymu`&jVp()z*(86 zo?AUFyE`j9trO?YCkwQ=M;Avujk`Ow$V}Mbu5$^C8HfI!NW^C%^m$s{odleYav?m3 z^q?mLIieGgj3OB&HjPK5d!eaUi%jc*YAyZ(La8U_?xzF<`#tTY8J?E1eot#zuP3Ab zu$Bkz0E=!Q(blp~hz)?{dRW`zKG?APV4gl1f3J*O#rYefnV$39(a_1B-}q5<85Q5_ zY55)1pa?bQ8jMxYl}8tU$n~)M;GWF*uMq!H#FWStpIOAKfBRxZpoQDw*9iStpF{ud zFRs5bK27MK_Br&Q^2PPn#ea(HCG0c(bLijy=@+(7D1MR9KlO9yzv_$Y*W>TNzfk|v zT;IJDXRe3K4tm;`9XK2A*6#jG%Wpg_?;oHmHe86E*4ROXfJHu)FA6l^YH(V&0ndcv zXRr^RA^lH+*blsa@KIcnuGN~4u@(1@jRj8xfZg=gVR zjv)FZ0Vo=4y(tlCo}A zgs8@fwhI{%rd&;zih_EjGU{Wn(7`S4o}o!FRS#^*L8+U%{v3Bt_(aI`AUvI)tG}*J zr|K_Q#TL4IpkM%8;Vr1!2cq0TU<0CnnDM&6q;r*Yf;Z@Cd5)S=@mDfR8Ht-_FN4C8 z9w3Gkx=;uqKZi`wH_#9>W(c<(5yCO!i0I!@q?tdLbCRN%aT3lOci)4UeUMf|jUS>` z=m(sl3(6?Vh^Kw}zhT44(?T9A2FWiz=3W~i4KIrGQ96Hy&bgCtAw?$%TT;_HVY8;F zt9cGC6hj#u2X|9p`6$fdI5!M;Q?V`kFT*9v)5t`4P!5+bN9b}C62An~@Pl-rH2%b` zY34jF&DF>nYp%dwcBQEh^RzN*degmBNxF7%MQ3tF7qW<_rMVtChtSC~9W&;$G~=ie zvEf{|^ObQgrIhyTx#}@viqI}xOh4Dtx|ga1DHhH+gyMvcEig|HsJwk2QO%lnP#uo- z#P|ei=_1-7=^jZtn9|kO)pS3(apQiv1Kbx4Zx_}_QVgxBA5v$O zfr}g}qM76LVv%d-@US#Y*MpT9ebBE0MuOcH@YMjPt zzXlyZeMehW;2;F4-s2^VY`*@e&4h*MNSo)WZt*PA}z2ihd2&u%F5S>jAUg@GdW}Xq`X4d z2xTrJW{bOHXuBFtQsEDTLUgJelJ@$XVAe+tL#esH^SM5UMch44DB{TN73Um-H;Z$Q!RzUqL)%xNLdIa>Tq3H;Nc}+t`}zX`S@bjWA&Q^t%hlj&$u3_5AHfBGH3u zycXZf6MOQLqABRRC_h3Urg$0veG{NQd<%A zR0IYO)(XVE0|q1|0%2k{W0Tx;Y0lp#{EdnOxB7E+l!*&zyCA$ zR6mIxp>rCgx*O8H1U)EnV~Arcx91ElqGFP{Zx^`-NHi+WMK+ud(e2e^oIx6?jwL8a zne{Hj;tT3Mh++ptv4f;*r6`ts4=$0@Xe)NED2lyegE$x2RI%T%7W>!t(5#qyA6vh* zL=@j%woh6=1M5%C(a0*4wU6$CSD@PStoeUQ`H6{&(Zx>88+_;vh5jcbxVax=N&zup z+V6;RJ8k85%5q=9I+rZBQ5tZ+za;dMuiy6Jtq83blcnXx`VV%jo?tYgWJVK%`h;xw*=iK;wYb9TeMp-L3M=o75@SBnxLA2a%BhCzZjze1Jv#zKp!XOeXL zq*n!IJR~gFK^Yc+C2Ns7i7x~r83C>`r}N`yEpPq*W&oJpqba3m3ThpRqA5ML>zRn} zGeB@-c??JotzO~coQR)FGc&n91v<#_Bjz@U9>gU_GoD4wj84%)+}7}8l6ltO0 zRmkAt)r;B=960`~*6}&SO&WNVu3ramZ>-W5yX^8E*V(-2H;4p8$A};0ci8Mhu-S zCTVnWhMlo?!GR}x%n&ou( zGRj39oJ`yyx;Y`4SyZEY=}tm9l1>7nL$jnnA*qM`N3_W8T(OkyFEYEIujSOqQFDp1 zpZmxig{$Ebzp7^|()2Wv_9*?Kf_u;?=$HK~S2ll0OliXFP-h4XB0WeM{w|h64U}VO z5i%)Tx(IrN`+EVwA~9pPP%W!&N}gy+E`uZ-J!YKsC)xVeoLO!zn&8CqO8FnaH#U+w z*u~QN8-{bqi(B@Oa-|*`VY1g?!jt4;am!oW(oXkkO;J79wZPrrT2RvAT2Qvnwcx65 z*8+dkwV<}&wIDL+TCjV>GJ={t$g?;aYn}x<3pR;0e;H?(uG8vTwx^{z7fDZhb19LQ z=J}jzFJ$r%W%9J5E5A0y(}J6dOA$uaA=4^2#a_W|7B$5x-Jt_esvV?1Mh;rr(wv3t zjr-}Zwo?vSvM+88xRtHhtt{f9+dN)*Zf#~On6YWb+gzz^sz&2}A{3!O5Nv7{PW2oQN&)L`06*0ipx-SL)fJe8WR zj7WyAMpSbT#EYnl;bmfIVTnbi2veU?4k+y_BdjPw z&4_MjK}*odpTenmj2P$Ab_45#jdSL#=88f!`8*ADo#5#LMmYjT2?T$4?&rWVFjipk{*g^5yzg<6XwNAp4%eBzubXN{alAceQ!3|H!66?*$8QEfM;GIk z@LVA`^5_K8C@gUw7ZOpP26ZcsWw8j^tN&!-z1)1@B0kSk?(sk;Six)imJTnai% z`en(_3zs4mxs>QvSM%WiLzl{WlWpkWQqKwQ=X9wFsyXnQrZ?bHf5sn}@F1T61zPuy#`F6;Et%_F%f#?PC_}XGW)w`e%@VdffjE5KS#se-*ZUPGm%s;EbnI^Q3JbOJz+EEVgZDD$96P z?IfGFFl*GwbWLwJ!z}PaMfwg=l>* zM6eRw{IesFCchIf77j5_b~Ue{rqaD1`8yT#C;Z_-?lh9qO!W_+j5-MGj6lsld#aD- z$eUKWob!0+fs5|Z;yeJ*5a%oMv#+43YR;l=R=sFEd9?9AP+=ZV(t8tRFXcQu&u6jA zphWb;Txo6NSh4Sp&?Pi2gNU(@3bV(qY%~B}7ol4MT;wQV;ig80!js?)6ugyx9e5cU z;M{woDLqldlSJxM^&G})BJ^O6!r~LC_T1TcLPp4}Cz0Xp{>rQC>~v*d917HU6K7tA*2Fu z;vX!8Ro0x6RL)z1b1FFZq;j5Nj;rZMq%mp!n@CDj`CniIblq-YfSA!T%B>f7nwUZ% z4Dc?1?x9n%^{SUl%hF^WmY|sYGupWqUdt12%<&2Sfq>Bl5rsyEzKuMXoKYvRxRo?95K`M4MjB!x9BDD z(`_TN!-d>~@w-G~Pz`{aWLi3L5uRU^RxE){;HK-*1NnrqA=o7*khRpcRi6hA*}`ah zk>L@T=$m-^Lc$zvM}}5%e%Zp3#MmtyL@lU=@^at28FELruoNxqK?`}Y3f0%tsO~}5 z-uOLYNcXB1?xA#hPT7fEC4>QP!-dpEWXtxPRjHi0fv4kSv~X3-;p4 z-(5}j(fm8+j>0-bdnF~qQgeGm>9{n!B>;0-ge1}SOHN=p<)!CPQ)whqco5w-ha`j# zUkolxe;vw3=&MKet35n;27K>uBFqo5sUOM3yAr!vc;g9Fq<)tUT{Rk*5KK1 zXd0A_S){tSJ*Oj;bAsTEqC$xBK4i@m0wXf(3Os~^#l!VNTg;dx+C*K93^by*VpR}a zWDp~o=FL&=q8ztijWR3#GXi<~a|O!j!Q<-41|HkH9{~jA6sdB$X=_)wk}PMG_RCYs z(J_U~c|*XWjG>Zbz4#G;RCzM^{VDQn@ImRf6uk!Jgn4LU2lmhbU`d&tB$6hN8QW1& zR4fV>CSob9vp$t|CUNxtDlz*J7d1>I+b88xqro}9ezwr?X}>6|m@l5RW<;CmD_*wWd@HzC*~ zp|BdIx($+Mt`I@EsKktC2@~ z>9eqg1WVBJB(|vJMe76d5w0$cojJ7sF5+&=`lo;`LyAWl@4AayK%RO~^=8qH=xMU9 zsU@Q0F2hWf+*mZF5PqwCyb2uKf@n%F9=#w64;^zm?Uy{()9tiu{59IFCRJ*=r9ot) zh-gd~&OCy8_Ne6+xh5KOzXCzTw31gmygbSa%VZhjTXY$c7p=#h!=2YWl+)^d1>Hj{ zqcA&(Nj9Xyb0^=4K zx4^gs#w{>zfq#Yt#IMlcx0CX;D(_mG=W2O^N-$VO5dD?FHQ``gpu$@p3RXnQb*aS;N?9os zCKbtZVrsjkQLDn`Az!`!%))}j8dn#;D5U!|GKPjM_&p_kr&cb1+zHm=CI+j{)sU|S zzmlZytoLia`uadkxlgYN*5Q}r7E*1rydq-dX{Dl~TAn<~)2bJ1)l0PMv$X1^T6w@1 z4r?_PR2H>I^J&!~e}uLKQ%R)V&H)`NC|?gH%t zZ2}zzy#+LDlBV4Tnh$yxXfbFPXaMwK&^FL0Xcy?OK>I*n038N>4KxdHRT!Z8pznhg zgZ>>f06H1Vdu^cEpk1J|LHj`GfDVHefMy+|X)8eUL05wogBF7ZK)(js2D%lr3-lV$ zK2RNW81&nqSyMD^4`@E<_d$z6ZwC#4-V53WN^8trppSy~fj$K~3`#4@SyMG_05l); zP0(V{VbB2R$DnPX8kTFjK&OHBfzAXS27Mhg3onyij#c=4(APkVL0=g_e$e}{UDgJA z3U)2KK)?Db@`KjDhWwzrUPu08Q4i33(6xBTs2KEV&;aPoc-pFs_@G^&zs3WKeW3T_ ziOgZpy?EX?D@)U=|BU>g1%E+)P%Ojqk+Ex&ma!u{4Y@DY@w4l1h+|6W3Q{6m@U8QC{wu9-f0!Z#+ON?eyN@K1RvkvNCSHTi{^ir)u# z`Qb$3X82o@f5Rlj9|pYdnM5KJ{+#50GC}d_35^>uo?z}Ie-&|sE%Lz+q~RBX|FtxH zS`#ly!*2usj5Pc%@M*j|)vpiyBUl)6;tzvwq~T}5uFt38=Y#)v8h$bO_om?o!0$-I zZv%f<8h#h}6>0c=;BQRB9|nI#8h#eyWL_G6KKNN__{HFV{H(M61K#g0q_gc@Y}$jord29{=_u=KJee` zbGH95_ycM9S%@2l((v=a?@Gfj2LIMH`~dh3Y4~m6Uz3L41^$*a{66q6NW&ine|{Q% z7Ou0$r{U*=|98Ysr}$C~{@ZEz0r30N@Y}$DG!4HC{Cm>y`@p{?4SyK?9clPkm?vyY z!_No5I1RrT{N-u*0q}Fdf5v)!$vngt7{302{|xxM5T}#Zr*CGc>r)r_bK(EZDfzEA z^Y?*&Gx$eR_#c`4VeqRFzv#y;lKP)A^=IKimxB!s`YqWcf4j-g2fqfpO4CyK8799N z{LRlN5*t$ZMJ7K0{`HveIpuEyzY_DPEh+WC*sOmS`2Pm}wTNTM`qi4(?>_KH=qZPk z{7L%^gI|XE))OiG9#el7=4l^;zt+nC7PEQ&kq`cCJa&_jQvRqp|0o9kZ!aejZ(8$b zzGUhTfWIH}yFcPOl&t^DX8tztZ@~QSw<-F6Wwx&i{H2)h9Zs?De$&2v;Qt=;zY|jU z)0j_Hq5~mNE5bj70&V@qGMd6g38NzE$ZQa4>?87l-(Y1vRP%w~VdXRxK?mif`5mQ6 zA04ENnmB%pTVUJ*;}#gVz_FPcPs{J!5xRXU;DK@ zQR!E0>33v0OJp-Yw1J-};-iCajx~Gt9l{c2y(%=Ix9PvRv32r^q{cLZYK-K-IAy1@P zG+CU6muvWl%7PT$FK>KRy+@>+x*t~MlV##7gXhSBzpueITKNW8&@KPK^;ZTL$P|A7tvMB;ba@L4br)u+>jpC$1JZ1@(5KW@Y6 z@qf~D*oNOK@qQcrl*E5;!{3znA8mLB;t=V7&xYqoeAI@olX&JlG#rPrf3?K3Z1~L* zKhcKYC-GbxPQO7;^*O_aQ@kX6sST&!ARzn#8$L(k9vi-1;+t&vH4?wlhIdH3(uUtJ z@p>ChYo=76T{iqWV>R({PmrHzw4fjcWwGF>X;>9-nK8b(LhQBNQVXMS>F9uC~M#}$WfoN~D zAll0kpTAH5ymx}rwJ_;JpZ(dL(}GxdMMv%3mSze@LA79>A-W_!}#QAn!Tg^t%#& zaFxJ$uR)xSd0qGkq3{vE={kpeMsYJ_q=5nX|O)dQ}raf6VgJ&F}iolkC%h=cbBsY9+~M&d}8S zSc$A*Jt~!?TY%>!ljd2_LB5gk=~_*J5am5IoOVe3@xoO6N5Hc+%ua3X>SR3meMeRB zgRDoTlJwV7|Li3KQ^_SYp{Q!7wQmuKGiTUnW{g}fz;`tgO%6nQk zo#!Bbjssrofd7XBUhaTj>wwXJk#YVB!PYd~;ErYgQy9lQ z?F=Evd;K_F=pc`8hSt;-lxy@Xt=Ate$1}S0P;8y<-F~f?AFvfE`J8K5VB%rk5b}BX zJE^sifL`MjA2jviOQT+V)V0cq$2o$Pm0sOZVw;zqq;+KD3#j$=PTcxPxVn6s6E_?z z-;U>~9r=0o$D*RaV|E6!M3I zfouIC`DNG7l=fA49a~ZpEZ6a2Tc?JSnecI1X8}C1T_3T5<4pWFiK_6 zS)#(fEmGyo4u{bQeTTCE9x1=pnH%=|@u6KOBb2AOp}jZCt0Q&Wz1ynl(kNgNe_fh> zC=Ud~e(z^lva+t$hll^2OR5g*VSEKRO+h3fz09e-;c&HgyML#1DI`!G4C`ry5Ig+% zII-_q__I?<^&xx#`kT&tJw$Ikgq#HsV}c>?wn!x$$YHe5fQ+Q!Ik47kD0Vxr>e2Md z8h@YyA8F>{ew0g~NN=^SlVCcLS_GNTk_iX!foumQYw!}tW{;Iw<=1%}QS3cB&{(;a z$jsHZ@+yK6{tkDtaaJy)M^)=62|W-FW6Jb+ce6qSi zyp`dt!^gtuZ4nxJm9_XPcQ6p}m+M}7=Y*MI6AD&I01e?YuX+oG^;IvH%5)v>i1$-) zxeRN}q8B>6HFbh7gz-vAD#EXNn2@T5dWWPk6s#55;qVCThy*?g7P7YLnyY!&7hke^ zRk8Pyb?Y{}w|KX#T3zhMr_|S8zHwE_hBf$NIG(KXx;@@?n^u*$y`Ht3{*YNkrZP~3~5Km#rFOa8T{vBw& z>>N!cf|x$=BtTs3kl2O`34cW%=c!*vH?=J_v!}kOWPa4$3x`1VlrPd_xZqRjrVeVq zs)X4A;)CfaGRTJi?GSxVeJr>-RZ@Or9$%l=yndeQh%dCKs7sDQ^|g8PBkd`2)p~7U z6Jmdfzcv`cOH>qucxdw%=uy;0jE41oeT{s(iUs7C?Ul$H@rC-5swe{qW>QnP6Qpw7SS(fJAu%*Ns@>e1F`9ttRge(6R zA4er`ML6iK#sCRmiW}sS#T#N?FMY&b zIF^@&h^YZzuUCE`UhYDR&|xscreIKa2mG~I)W?f5;hnX*ZyTr{5)@ZjamH7|daackK3s-9e6+7!7E~p+792)lsmMIA@cp-nlM;YWZDCFBI zRH}2-#aCN{(u97DN3DS0jUzMs-yXEhkb!?#PUd44*5T+-RP7H6G`U}?(qF*|e$ObV z@~eGAMGK{%RBP@ts{Ht!8oH%2%UAn~igwD3biYRR#O)&=YJZQG=SW1Aul6Yw%~cGM zjdjRVCbY~(dwxnx?ei(RQ3|X2D>+3k0-yE}Ra)&MDk^U`u|}6#f2mfx94Ew6`PIIr zq9amJm2Yo%Bg))Mb*A4<)^(=d;VHrWLGSYiX%Ifs{LcdR`<;+U5yiK{#=<~ zQL|yGYR%;idO0^Gzr4B5l~h$nm8a5*UT@1^DDx?5eV2`NsCzkPT_JIdErTto{U3V^cS!&M literal 0 HcmV?d00001 diff --git a/include/scraper.h b/include/scraper.h new file mode 100644 index 0000000..88d23be --- /dev/null +++ b/include/scraper.h @@ -0,0 +1,33 @@ +#ifndef SCRAPER_H +#define SCRAPER_H + +#include + +int fetch_url(const char *url, char **out_buf, size_t *out_len); +char *extract_title(const char *html, size_t len); + +/* Extract all and + * Returns 0 on success and allocates *out with a newline-separated list + * of "key: value" lines. Caller must free(*out). + */ +int extract_meta(const char *html, size_t len, char **out); + +/* Extract Open Graph tags (meta property="og:...") similarly. */ +int extract_og(const char *html, size_t len, char **out); + +/* Extract the first