From aab9d411dda9923f9786577ed85e26fbe5146e0b Mon Sep 17 00:00:00 2001 From: ganome Date: Thu, 4 Dec 2025 18:42:14 -0700 Subject: [PATCH] Added binary to gitignore and a link to the item in the store --- .gitignore | 1 + gscrape | Bin 36984 -> 0 bytes src/main.c | 30 ++++++++++---------- src/scraper.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++---- 4 files changed, 85 insertions(+), 21 deletions(-) create mode 100644 .gitignore delete mode 100755 gscrape diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c197b62 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +gscrape diff --git a/gscrape b/gscrape deleted file mode 100755 index f6ecf0cc823d10d69b4b6b478906338bb98f8fc7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 36984 zcmeHwePC48weLyz5HK)NiAAg$Q=lLr1_Zm2ac9q=j#JmG-fXcbs;TmRm2FD(>9`dBTfqxaRd%)duUlVci$fyN3)KSi`@%cD_5M(0FjSi&RZ~}^TSRQhqjiM+daz#C!lIfWT75(h!(gyg z`F3x4^>%NiuO^^XQgy(th}4rrd9^P@$$-C(2;~d;eSrWPR4a(ri(p$WOACl!dMh*GZ<=TfW2Rt*oi@1!{J2!6Zvr>UU~T3zeZO!W>qsb<6pL!UyI2Zi>zSReB&O@}V+f-XxORparLEojME)jtZH_ zhy_2*L^W;Hf?LO}8mBZ@n`@C*W;LP3uE3$ErI zoI%!;^ph>rF~x!tPaWA7T-lmDB@CKt!H>1z^DTIm1uwMVa;nSR6&8GkMZU;_ zyDYfJf-B!6cButF&LV%A1-HiEtrq+Qi+r^Om#)U#dJ8UHlkpuE{6veM1`GZr3%z#D%)nWlgnKa@;Upp8F}OjCf3-<3>LP>+8b_ z(-c_a+mdOz;KVm2(-dIitCML8tnnquGzHXnUNTLAG=6+CO#w8XnM}JV{r(ZNy%aFx zZzj_eDC5s3(-a`%hmvUujPVDOX$pw(yOL=Ngz@hu(-Z*XHzv~*_~M~tngU*YTQW_7 zF1{(5rT`aTolH|;i!Vv0DWJvkl4%NL@#B+e3SjX}Go6_G@Ku^t={jw##{Hj|d+(L0 z={s%dTW#rEZ0Vb9>2KT8*V@u`wsfT}eYGw9HCuY4Expc`zQC4VZc7)~((`QTQ*G%J zZRr`d^kiH5$Tw2$`LQkio-O^hE&ZA;{X1LwIa~U$Eq%b2emEt)_}vXHzq#7e@}{Tp zk0Yg9+>4`&pZ2t`JRQR(k@E|Pepp%HI;{y#g8Ua0m`w`P>7)46 zu<|O1Ymu`&jVp()z*(86 zo?AUFyE`j9trO?YCkwQ=M;Avujk`Ow$V}Mbu5$^C8HfI!NW^C%^m$s{odleYav?m3 z^q?mLIieGgj3OB&HjPK5d!eaUi%jc*YAyZ(La8U_?xzF<`#tTY8J?E1eot#zuP3Ab zu$Bkz0E=!Q(blp~hz)?{dRW`zKG?APV4gl1f3J*O#rYefnV$39(a_1B-}q5<85Q5_ zY55)1pa?bQ8jMxYl}8tU$n~)M;GWF*uMq!H#FWStpIOAKfBRxZpoQDw*9iStpF{ud zFRs5bK27MK_Br&Q^2PPn#ea(HCG0c(bLijy=@+(7D1MR9KlO9yzv_$Y*W>TNzfk|v zT;IJDXRe3K4tm;`9XK2A*6#jG%Wpg_?;oHmHe86E*4ROXfJHu)FA6l^YH(V&0ndcv zXRr^RA^lH+*blsa@KIcnuGN~4u@(1@jRj8xfZg=gVR zjv)FZ0Vo=4y(tlCo}A zgs8@fwhI{%rd&;zih_EjGU{Wn(7`S4o}o!FRS#^*L8+U%{v3Bt_(aI`AUvI)tG}*J zr|K_Q#TL4IpkM%8;Vr1!2cq0TU<0CnnDM&6q;r*Yf;Z@Cd5)S=@mDfR8Ht-_FN4C8 z9w3Gkx=;uqKZi`wH_#9>W(c<(5yCO!i0I!@q?tdLbCRN%aT3lOci)4UeUMf|jUS>` z=m(sl3(6?Vh^Kw}zhT44(?T9A2FWiz=3W~i4KIrGQ96Hy&bgCtAw?$%TT;_HVY8;F zt9cGC6hj#u2X|9p`6$fdI5!M;Q?V`kFT*9v)5t`4P!5+bN9b}C62An~@Pl-rH2%b` zY34jF&DF>nYp%dwcBQEh^RzN*degmBNxF7%MQ3tF7qW<_rMVtChtSC~9W&;$G~=ie zvEf{|^ObQgrIhyTx#}@viqI}xOh4Dtx|ga1DHhH+gyMvcEig|HsJwk2QO%lnP#uo- z#P|ei=_1-7=^jZtn9|kO)pS3(apQiv1Kbx4Zx_}_QVgxBA5v$O zfr}g}qM76LVv%d-@US#Y*MpT9ebBE0MuOcH@YMjPt zzXlyZeMehW;2;F4-s2^VY`*@e&4h*MNSo)WZt*PA}z2ihd2&u%F5S>jAUg@GdW}Xq`X4d z2xTrJW{bOHXuBFtQsEDTLUgJelJ@$XVAe+tL#esH^SM5UMch44DB{TN73Um-H;Z$Q!RzUqL)%xNLdIa>Tq3H;Nc}+t`}zX`S@bjWA&Q^t%hlj&$u3_5AHfBGH3u zycXZf6MOQLqABRRC_h3Urg$0veG{NQd<%A zR0IYO)(XVE0|q1|0%2k{W0Tx;Y0lp#{EdnOxB7E+l!*&zyCA$ zR6mIxp>rCgx*O8H1U)EnV~Arcx91ElqGFP{Zx^`-NHi+WMK+ud(e2e^oIx6?jwL8a zne{Hj;tT3Mh++ptv4f;*r6`ts4=$0@Xe)NED2lyegE$x2RI%T%7W>!t(5#qyA6vh* zL=@j%woh6=1M5%C(a0*4wU6$CSD@PStoeUQ`H6{&(Zx>88+_;vh5jcbxVax=N&zup z+V6;RJ8k85%5q=9I+rZBQ5tZ+za;dMuiy6Jtq83blcnXx`VV%jo?tYgWJVK%`h;xw*=iK;wYb9TeMp-L3M=o75@SBnxLA2a%BhCzZjze1Jv#zKp!XOeXL zq*n!IJR~gFK^Yc+C2Ns7i7x~r83C>`r}N`yEpPq*W&oJpqba3m3ThpRqA5ML>zRn} zGeB@-c??JotzO~coQR)FGc&n91v<#_Bjz@U9>gU_GoD4wj84%)+}7}8l6ltO0 zRmkAt)r;B=960`~*6}&SO&WNVu3ramZ>-W5yX^8E*V(-2H;4p8$A};0ci8Mhu-S zCTVnWhMlo?!GR}x%n&ou( zGRj39oJ`yyx;Y`4SyZEY=}tm9l1>7nL$jnnA*qM`N3_W8T(OkyFEYEIujSOqQFDp1 zpZmxig{$Ebzp7^|()2Wv_9*?Kf_u;?=$HK~S2ll0OliXFP-h4XB0WeM{w|h64U}VO z5i%)Tx(IrN`+EVwA~9pPP%W!&N}gy+E`uZ-J!YKsC)xVeoLO!zn&8CqO8FnaH#U+w z*u~QN8-{bqi(B@Oa-|*`VY1g?!jt4;am!oW(oXkkO;J79wZPrrT2RvAT2Qvnwcx65 z*8+dkwV<}&wIDL+TCjV>GJ={t$g?;aYn}x<3pR;0e;H?(uG8vTwx^{z7fDZhb19LQ z=J}jzFJ$r%W%9J5E5A0y(}J6dOA$uaA=4^2#a_W|7B$5x-Jt_esvV?1Mh;rr(wv3t zjr-}Zwo?vSvM+88xRtHhtt{f9+dN)*Zf#~On6YWb+gzz^sz&2}A{3!O5Nv7{PW2oQN&)L`06*0ipx-SL)fJe8WR zj7WyAMpSbT#EYnl;bmfIVTnbi2veU?4k+y_BdjPw z&4_MjK}*odpTenmj2P$Ab_45#jdSL#=88f!`8*ADo#5#LMmYjT2?T$4?&rWVFjipk{*g^5yzg<6XwNAp4%eBzubXN{alAceQ!3|H!66?*$8QEfM;GIk z@LVA`^5_K8C@gUw7ZOpP26ZcsWw8j^tN&!-z1)1@B0kSk?(sk;Six)imJTnai% z`en(_3zs4mxs>QvSM%WiLzl{WlWpkWQqKwQ=X9wFsyXnQrZ?bHf5sn}@F1T61zPuy#`F6;Et%_F%f#?PC_}XGW)w`e%@VdffjE5KS#se-*ZUPGm%s;EbnI^Q3JbOJz+EEVgZDD$96P z?IfGFFl*GwbWLwJ!z}PaMfwg=l>* zM6eRw{IesFCchIf77j5_b~Ue{rqaD1`8yT#C;Z_-?lh9qO!W_+j5-MGj6lsld#aD- z$eUKWob!0+fs5|Z;yeJ*5a%oMv#+43YR;l=R=sFEd9?9AP+=ZV(t8tRFXcQu&u6jA zphWb;Txo6NSh4Sp&?Pi2gNU(@3bV(qY%~B}7ol4MT;wQV;ig80!js?)6ugyx9e5cU z;M{woDLqldlSJxM^&G})BJ^O6!r~LC_T1TcLPp4}Cz0Xp{>rQC>~v*d917HU6K7tA*2Fu z;vX!8Ro0x6RL)z1b1FFZq;j5Nj;rZMq%mp!n@CDj`CniIblq-YfSA!T%B>f7nwUZ% z4Dc?1?x9n%^{SUl%hF^WmY|sYGupWqUdt12%<&2Sfq>Bl5rsyEzKuMXoKYvRxRo?95K`M4MjB!x9BDD z(`_TN!-d>~@w-G~Pz`{aWLi3L5uRU^RxE){;HK-*1NnrqA=o7*khRpcRi6hA*}`ah zk>L@T=$m-^Lc$zvM}}5%e%Zp3#MmtyL@lU=@^at28FELruoNxqK?`}Y3f0%tsO~}5 z-uOLYNcXB1?xA#hPT7fEC4>QP!-dpEWXtxPRjHi0fv4kSv~X3-;p4 z-(5}j(fm8+j>0-bdnF~qQgeGm>9{n!B>;0-ge1}SOHN=p<)!CPQ)whqco5w-ha`j# zUkolxe;vw3=&MKet35n;27K>uBFqo5sUOM3yAr!vc;g9Fq<)tUT{Rk*5KK1 zXd0A_S){tSJ*Oj;bAsTEqC$xBK4i@m0wXf(3Os~^#l!VNTg;dx+C*K93^by*VpR}a zWDp~o=FL&=q8ztijWR3#GXi<~a|O!j!Q<-41|HkH9{~jA6sdB$X=_)wk}PMG_RCYs z(J_U~c|*XWjG>Zbz4#G;RCzM^{VDQn@ImRf6uk!Jgn4LU2lmhbU`d&tB$6hN8QW1& zR4fV>CSob9vp$t|CUNxtDlz*J7d1>I+b88xqro}9ezwr?X}>6|m@l5RW<;CmD_*wWd@HzC*~ zp|BdIx($+Mt`I@EsKktC2@~ z>9eqg1WVBJB(|vJMe76d5w0$cojJ7sF5+&=`lo;`LyAWl@4AayK%RO~^=8qH=xMU9 zsU@Q0F2hWf+*mZF5PqwCyb2uKf@n%F9=#w64;^zm?Uy{()9tiu{59IFCRJ*=r9ot) zh-gd~&OCy8_Ne6+xh5KOzXCzTw31gmygbSa%VZhjTXY$c7p=#h!=2YWl+)^d1>Hj{ zqcA&(Nj9Xyb0^=4K zx4^gs#w{>zfq#Yt#IMlcx0CX;D(_mG=W2O^N-$VO5dD?FHQ``gpu$@p3RXnQb*aS;N?9os zCKbtZVrsjkQLDn`Az!`!%))}j8dn#;D5U!|GKPjM_&p_kr&cb1+zHm=CI+j{)sU|S zzmlZytoLia`uadkxlgYN*5Q}r7E*1rydq-dX{Dl~TAn<~)2bJ1)l0PMv$X1^T6w@1 z4r?_PR2H>I^J&!~e}uLKQ%R)V&H)`NC|?gH%t zZ2}zzy#+LDlBV4Tnh$yxXfbFPXaMwK&^FL0Xcy?OK>I*n038N>4KxdHRT!Z8pznhg zgZ>>f06H1Vdu^cEpk1J|LHj`GfDVHefMy+|X)8eUL05wogBF7ZK)(js2D%lr3-lV$ zK2RNW81&nqSyMD^4`@E<_d$z6ZwC#4-V53WN^8trppSy~fj$K~3`#4@SyMG_05l); zP0(V{VbB2R$DnPX8kTFjK&OHBfzAXS27Mhg3onyij#c=4(APkVL0=g_e$e}{UDgJA z3U)2KK)?Db@`KjDhWwzrUPu08Q4i33(6xBTs2KEV&;aPoc-pFs_@G^&zs3WKeW3T_ ziOgZpy?EX?D@)U=|BU>g1%E+)P%Ojqk+Ex&ma!u{4Y@DY@w4l1h+|6W3Q{6m@U8QC{wu9-f0!Z#+ON?eyN@K1RvkvNCSHTi{^ir)u# z`Qb$3X82o@f5Rlj9|pYdnM5KJ{+#50GC}d_35^>uo?z}Ie-&|sE%Lz+q~RBX|FtxH zS`#ly!*2usj5Pc%@M*j|)vpiyBUl)6;tzvwq~T}5uFt38=Y#)v8h$bO_om?o!0$-I zZv%f<8h#h}6>0c=;BQRB9|nI#8h#eyWL_G6KKNN__{HFV{H(M61K#g0q_gc@Y}$jord29{=_u=KJee` zbGH95_ycM9S%@2l((v=a?@Gfj2LIMH`~dh3Y4~m6Uz3L41^$*a{66q6NW&ine|{Q% z7Ou0$r{U*=|98Ysr}$C~{@ZEz0r30N@Y}$DG!4HC{Cm>y`@p{?4SyK?9clPkm?vyY z!_No5I1RrT{N-u*0q}Fdf5v)!$vngt7{302{|xxM5T}#Zr*CGc>r)r_bK(EZDfzEA z^Y?*&Gx$eR_#c`4VeqRFzv#y;lKP)A^=IKimxB!s`YqWcf4j-g2fqfpO4CyK8799N z{LRlN5*t$ZMJ7K0{`HveIpuEyzY_DPEh+WC*sOmS`2Pm}wTNTM`qi4(?>_KH=qZPk z{7L%^gI|XE))OiG9#el7=4l^;zt+nC7PEQ&kq`cCJa&_jQvRqp|0o9kZ!aejZ(8$b zzGUhTfWIH}yFcPOl&t^DX8tztZ@~QSw<-F6Wwx&i{H2)h9Zs?De$&2v;Qt=;zY|jU z)0j_Hq5~mNE5bj70&V@qGMd6g38NzE$ZQa4>?87l-(Y1vRP%w~VdXRxK?mif`5mQ6 zA04ENnmB%pTVUJ*;}#gVz_FPcPs{J!5xRXU;DK@ zQR!E0>33v0OJp-Yw1J-};-iCajx~Gt9l{c2y(%=Ix9PvRv32r^q{cLZYK-K-IAy1@P zG+CU6muvWl%7PT$FK>KRy+@>+x*t~MlV##7gXhSBzpueITKNW8&@KPK^;ZTL$P|A7tvMB;ba@L4br)u+>jpC$1JZ1@(5KW@Y6 z@qf~D*oNOK@qQcrl*E5;!{3znA8mLB;t=V7&xYqoeAI@olX&JlG#rPrf3?K3Z1~L* zKhcKYC-GbxPQO7;^*O_aQ@kX6sST&!ARzn#8$L(k9vi-1;+t&vH4?wlhIdH3(uUtJ z@p>ChYo=76T{iqWV>R({PmrHzw4fjcWwGF>X;>9-nK8b(LhQBNQVXMS>F9uC~M#}$WfoN~D zAll0kpTAH5ymx}rwJ_;JpZ(dL(}GxdMMv%3mSze@LA79>A-W_!}#QAn!Tg^t%#& zaFxJ$uR)xSd0qGkq3{vE={kpeMsYJ_q=5nX|O)dQ}raf6VgJ&F}iolkC%h=cbBsY9+~M&d}8S zSc$A*Jt~!?TY%>!ljd2_LB5gk=~_*J5am5IoOVe3@xoO6N5Hc+%ua3X>SR3meMeRB zgRDoTlJwV7|Li3KQ^_SYp{Q!7wQmuKGiTUnW{g}fz;`tgO%6nQk zo#!Bbjssrofd7XBUhaTj>wwXJk#YVB!PYd~;ErYgQy9lQ z?F=Evd;K_F=pc`8hSt;-lxy@Xt=Ate$1}S0P;8y<-F~f?AFvfE`J8K5VB%rk5b}BX zJE^sifL`MjA2jviOQT+V)V0cq$2o$Pm0sOZVw;zqq;+KD3#j$=PTcxPxVn6s6E_?z z-;U>~9r=0o$D*RaV|E6!M3I zfouIC`DNG7l=fA49a~ZpEZ6a2Tc?JSnecI1X8}C1T_3T5<4pWFiK_6 zS)#(fEmGyo4u{bQeTTCE9x1=pnH%=|@u6KOBb2AOp}jZCt0Q&Wz1ynl(kNgNe_fh> zC=Ud~e(z^lva+t$hll^2OR5g*VSEKRO+h3fz09e-;c&HgyML#1DI`!G4C`ry5Ig+% zII-_q__I?<^&xx#`kT&tJw$Ikgq#HsV}c>?wn!x$$YHe5fQ+Q!Ik47kD0Vxr>e2Md z8h@YyA8F>{ew0g~NN=^SlVCcLS_GNTk_iX!foumQYw!}tW{;Iw<=1%}QS3cB&{(;a z$jsHZ@+yK6{tkDtaaJy)M^)=62|W-FW6Jb+ce6qSi zyp`dt!^gtuZ4nxJm9_XPcQ6p}m+M}7=Y*MI6AD&I01e?YuX+oG^;IvH%5)v>i1$-) zxeRN}q8B>6HFbh7gz-vAD#EXNn2@T5dWWPk6s#55;qVCThy*?g7P7YLnyY!&7hke^ zRk8Pyb?Y{}w|KX#T3zhMr_|S8zHwE_hBf$NIG(KXx;@@?n^u*$y`Ht3{*YNkrZP~3~5Km#rFOa8T{vBw& z>>N!cf|x$=BtTs3kl2O`34cW%=c!*vH?=J_v!}kOWPa4$3x`1VlrPd_xZqRjrVeVq zs)X4A;)CfaGRTJi?GSxVeJr>-RZ@Or9$%l=yndeQh%dCKs7sDQ^|g8PBkd`2)p~7U z6Jmdfzcv`cOH>qucxdw%=uy;0jE41oeT{s(iUs7C?Ul$H@rC-5swe{qW>QnP6Qpw7SS(fJAu%*Ns@>e1F`9ttRge(6R zA4er`ML6iK#sCRmiW}sS#T#N?FMY&b zIF^@&h^YZzuUCE`UhYDR&|xscreIKa2mG~I)W?f5;hnX*ZyTr{5)@ZjamH7|daackK3s-9e6+7!7E~p+792)lsmMIA@cp-nlM;YWZDCFBI zRH}2-#aCN{(u97DN3DS0jUzMs-yXEhkb!?#PUd44*5T+-RP7H6G`U}?(qF*|e$ObV z@~eGAMGK{%RBP@ts{Ht!8oH%2%UAn~igwD3biYRR#O)&=YJZQG=SW1Aul6Yw%~cGM zjdjRVCbY~(dwxnx?ei(RQ3|X2D>+3k0-yE}Ra)&MDk^U`u|}6#f2mfx94Ew6`PIIr zq9amJm2Yo%Bg))Mb*A4<)^(=d;VHrWLGSYiX%Ifs{LcdR`<;+U5yiK{#=<~ zQL|yGYR%;idO0^Gzr4B5l~h$nm8a5*UT@1^DDx?5eV2`NsCzkPT_JIdErTto{U3V^cS!&M diff --git a/src/main.c b/src/main.c index f0132bc..bd4a08b 100644 --- a/src/main.c +++ b/src/main.c @@ -12,24 +12,24 @@ static char *json_escape(const char *s) { size_t oi = 0; out[oi++] = '"'; for (size_t i = 0; i < len; ++i) { - unsigned char c = s[i]; - if (c == '"' || c == '\\') { + unsigned char uc = (unsigned char)s[i]; + if (uc == '"' || uc == '\\') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; - out[oi++] = c; - } else if (c == '\b') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'b'; } - else if (c == '\f') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'f'; } - else if (c == '\n') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'n'; } - else if (c == '\r') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'r'; } - else if (c == '\t') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 't'; } - else if (c < 0x20) { - char buf[7]; snprintf(buf, sizeof(buf), "\\u%04x", c); + out[oi++] = (char)uc; + } else if (uc == '\b') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'b'; } + else if (uc == '\f') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'f'; } + else if (uc == '\n') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'n'; } + else if (uc == '\r') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 'r'; } + else if (uc == '\t') { if (oi + 2 >= cap) { cap *= 2; out = realloc(out, cap); } out[oi++] = '\\'; out[oi++] = 't'; } + else if (uc < 0x20) { + char buf[7]; snprintf(buf, sizeof(buf), "\\u%04x", (unsigned)uc); size_t bl = strlen(buf); if (oi + bl + 1 >= cap) { cap = cap + bl + 16; out = realloc(out, cap); } memcpy(out + oi, buf, bl); oi += bl; } else { if (oi + 1 >= cap) { cap *= 2; out = realloc(out, cap); } - out[oi++] = c; + out[oi++] = (char)uc; } } if (oi + 2 >= cap) { cap += 2; out = realloc(out, cap); } @@ -52,7 +52,7 @@ static void print_kv_json(const char *kv_lines) { size_t keylen = (size_t)(sep - p); const char *vstart = sep + 1; while (vstart < p + linelen && (*vstart == ' ' || *vstart == '\t')) vstart++; - size_t vallen = (p + linelen) - vstart; + size_t vallen = (size_t)((p + linelen) - vstart); char *key = malloc(keylen + 1); char *val = malloc(vallen + 1); memcpy(key, p, keylen); key[keylen] = '\0'; @@ -93,7 +93,7 @@ static void print_products_json(const char *products_lines) { size_t plen = (size_t)(s2 - p2); const char *u2 = s2 + 1; while (u2 < p + linelen && (*u2 == ' ' || *u2 == '\t')) u2++; - size_t ulen = (p + linelen) - u2; + size_t ulen = (size_t)((p + linelen) - u2); name = malloc(nlen + 1); memcpy(name, p, nlen); name[nlen] = '\0'; price = malloc(plen + 1); memcpy(price, p2, plen); price[plen] = '\0'; url = malloc(ulen + 1); memcpy(url, u2, ulen); url[ulen] = '\0'; @@ -103,7 +103,7 @@ static void print_products_json(const char *products_lines) { const char *p2 = s1 + 1; while (nlen > 0 && (p[nlen-1] == ' ' || p[nlen-1] == '\t')) nlen--; while (p2 < p + linelen && (*p2 == ' ' || *p2 == '\t')) p2++; - size_t plen = (p + linelen) - p2; + size_t plen = (size_t)((p + linelen) - p2); name = malloc(nlen + 1); memcpy(name, p, nlen); name[nlen] = '\0'; price = malloc(plen + 1); memcpy(price, p2, plen); price[plen] = '\0'; url = strdup(""); @@ -202,7 +202,7 @@ int main(int argc, char **argv) { } char *products = NULL; - if (extract_products(html, html_len, &products) == 0 && products) { + if (extract_products(html, html_len, url, &products) == 0 && products) { printf("\"products\":"); print_products_json(products); free(products); diff --git a/src/scraper.c b/src/scraper.c index 15590ed..0c82bd6 100644 --- a/src/scraper.c +++ b/src/scraper.c @@ -411,7 +411,61 @@ static char *find_price_in_node(xmlNode *node) { * Looks for h2.product-name and span.price preferentially, falls back to * generic name/price finders. */ -static void collect_product_from_node(xmlNode *node, char **out, size_t *out_len, int *found) { +/* Resolve a possibly-relative href against a base URL. Returns a newly + * allocated string (caller must free) or NULL on error. + */ +static char *resolve_url(const char *base, const char *href) { + if (!href) return NULL; + if (strstr(href, "://")) return strdup(href); + if (strncmp(href, "//", 2) == 0) { + /* scheme-relative */ + const char *p = strstr(base, "://"); + if (!p) return strdup(href + 2); + size_t scheme_len = (size_t)(p - base); + size_t outlen = scheme_len + 3 + strlen(href + 2) + 1; + char *out = malloc(outlen); + if (!out) return NULL; + snprintf(out, outlen, "%.*s://%s", (int)scheme_len, base, href + 2); + return out; + } + + if (href[0] == '/') { + /* absolute path on same host */ + const char *p = strstr(base, "://"); + const char *start = base; + if (p) start = p + 3; /* host start */ + const char *host_end = strchr(start, '/'); + size_t prefix_len = 0; + if (host_end) prefix_len = (size_t)(host_end - base); + else prefix_len = strlen(base); + size_t outlen = prefix_len + strlen(href) + 1; + char *out = malloc(outlen); + if (!out) return NULL; + memcpy(out, base, prefix_len); + out[prefix_len] = '\0'; + strncat(out, href, strlen(href)); + return out; + } + + /* relative path: append after last '/' in base */ + const char *last_slash = strrchr(base, '/'); + size_t base_prefix = strlen(base); + if (last_slash) { + /* keep up to and including last slash */ + base_prefix = (size_t)(last_slash - base + 1); + } else { + base_prefix = strlen(base); + } + size_t outlen = base_prefix + strlen(href) + 1; + char *out = malloc(outlen); + if (!out) return NULL; + memcpy(out, base, base_prefix); + out[base_prefix] = '\0'; + strncat(out, href, strlen(href)); + return out; +} + +static void collect_product_from_node(xmlNode *node, char **out, size_t *out_len, int *found, const char *base_url) { if (!node) return; char *name = NULL; char *price = NULL; @@ -460,6 +514,15 @@ static void collect_product_from_node(xmlNode *node, char **out, size_t *out_len /* find product link if available */ plink = find_href_in_node(node); if (!plink) plink = strdup(""); + else { + if (base_url) { + char *abs = resolve_url(base_url, plink); + if (abs) { + free(plink); + plink = abs; + } + } + } if (name || price) { if (!name) name = strdup(""); @@ -472,7 +535,7 @@ static void collect_product_from_node(xmlNode *node, char **out, size_t *out_len free(plink); } -static void traverse_and_collect_products(xmlNode *node, char **out, size_t *out_len, int *found) { +static void traverse_and_collect_products(xmlNode *node, char **out, size_t *out_len, int *found, const char *base_url) { for (xmlNode *cur = node; cur; cur = cur->next) { if (cur->type != XML_ELEMENT_NODE) continue; @@ -486,18 +549,18 @@ static void traverse_and_collect_products(xmlNode *node, char **out, size_t *out else if (cls && xmlStrcasecmp(cur->name, (const xmlChar *)"li") == 0 && attr_contains(cls, "product")) is_product = true; if (is_product) { - collect_product_from_node(cur, out, out_len, found); + collect_product_from_node(cur, out, out_len, found, base_url); } if (data_products) xmlFree(data_products); if (itemtype) xmlFree(itemtype); if (cls) xmlFree(cls); - if (cur->children) traverse_and_collect_products(cur->children, out, out_len, found); + if (cur->children) traverse_and_collect_products(cur->children, out, out_len, found, base_url); } } -int extract_products(const char *html, size_t len, char **out) { +int extract_products(const char *html, size_t len, const char *base_url, char **out) { if (!html || len == 0 || !out) return -1; htmlDocPtr doc = htmlReadMemory(html, (int)len, NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING); if (!doc) return -1; @@ -507,7 +570,7 @@ int extract_products(const char *html, size_t len, char **out) { size_t out_len = 0; int found = 0; - traverse_and_collect_products(root, out, &out_len, &found); + traverse_and_collect_products(root, out, &out_len, &found, base_url); xmlFreeDoc(doc); xmlCleanupParser();