From 15c35944a2e984d29606f8fc24089fc6e60304ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ignacio=20L=C3=B3pez=20Luna?= Date: Thu, 9 Oct 2025 17:05:02 +0200 Subject: [PATCH 1/3] feat(demo): add PDF data extractor demo with upload and extraction features --- demos/extractor/.gitignore | 20 ++ demos/extractor/README.md | 90 +++++++ demos/extractor/demo.html | 447 +++++++++++++++++++++++++++++++++++ demos/extractor/invoice.pdf | Bin 0 -> 24762 bytes demos/extractor/package.json | 27 +++ demos/extractor/server.js | 155 ++++++++++++ 6 files changed, 739 insertions(+) create mode 100644 demos/extractor/.gitignore create mode 100644 demos/extractor/README.md create mode 100644 demos/extractor/demo.html create mode 100644 demos/extractor/invoice.pdf create mode 100644 demos/extractor/package.json create mode 100644 demos/extractor/server.js diff --git a/demos/extractor/.gitignore b/demos/extractor/.gitignore new file mode 100644 index 000000000..d55627f3e --- /dev/null +++ b/demos/extractor/.gitignore @@ -0,0 +1,20 @@ +# Dependencies +node_modules/ +package-lock.json + +# Upload directory +uploads/ + +# Logs +*.log +npm-debug.log* + +# OS files +.DS_Store +Thumbs.db + +# IDE +.vscode/ +.idea/ +*.swp +*.swo diff --git a/demos/extractor/README.md b/demos/extractor/README.md new file mode 100644 index 000000000..d8e78d0e7 --- /dev/null +++ b/demos/extractor/README.md @@ -0,0 +1,90 @@ +# PDF Data Extractor Demo + +This demo application allows you to extract structured data from PDF documents using JSON schemas and AI models. + +## Features + +- 📄 Upload and process PDF files +- 📋 Define custom JSON schemas for data extraction +- 🎯 Pre-built schema examples (Invoice, Receipt, Form) +- 📊 View extracted data with token usage statistics +- ⚙️ Configurable temperature and model selection + +## Prerequisites + +Before running this demo, you need: + +1. **Node.js** (version 18 or higher) +2. **Docker Model Runner** +3. 
**A suitable AI model** for text extraction + +## Setup Instructions + +### 1. Enable Docker Model Runner + +**Using Docker Desktop:** +- Open Docker Desktop settings +- Go to the **AI** tab +- Select **Enable Docker Model Runner** +- Enable **host-side TCP support** on port `12434` (default) + +For detailed instructions, see the [Docker Model Runner documentation](https://docs.docker.com/ai/model-runner/get-started/#enable-docker-model-runner). + +**Using Standalone Docker Engine:** +TCP support is enabled by default on port `12434`. + +### 2. Pull a Suitable Model + +You'll need a model capable of understanding and extracting text. Recommended model: + +```bash +# Pull a general-purpose model +docker model pull ai/gemma3 +``` + +To see available models, visit [Docker Hub - AI Models](https://hub.docker.com/r/ai). + +## Installation + +1. **Navigate to the demo directory:** + ```bash + cd demos/extractor + ``` + +2. **Install dependencies:** + ```bash + npm install + ``` + +3. **Start the server:** + ```bash + npm start + ``` + + The server will start on `http://localhost:3000` + +4. **Open the demo:** + Open `demo.html` in your web browser (you can simply double-click the file or serve it with a local server) + +## Usage Guide + +### Basic Workflow + +1. **Configure API Settings** + - **Base API URL**: Set to `http://127.0.0.1:12434/engines/v1` for Docker Model Runner + - **Model**: Select from available models + +2. **Define Your Schema** + - Use the provided examples (Invoice, Receipt, Form) or create your own + - The schema defines what data to extract from the PDF + - Use standard JSON Schema format with `type`, `properties`, etc. + +3. **Upload a PDF** + - Click "Choose File" and select your PDF document + - Supported: Any text-based PDF (not scanned images without OCR) + - You can use the bundled sample PDF, [invoice.pdf](invoice.pdf) + +4. 
**Extract Data** + - Click "Extract Data" button + - Wait for processing (may take 10-30 seconds depending on PDF size and model) + - View extracted data in the result section diff --git a/demos/extractor/demo.html b/demos/extractor/demo.html new file mode 100644 index 000000000..e4d113415 --- /dev/null +++ b/demos/extractor/demo.html @@ -0,0 +1,447 @@ + + + + + + PDF Data Extractor Demo + + + + +

PDF Data Extractor Demo

+

Extract structured data from PDF documents using JSON schemas and AI models

+ + +
+
🔧 API Configuration
+ +
+ + +
+ â„šī¸ To pull a model, run: docker model pull <model-name>
+ Find more models at: https://hub.docker.com/r/ai +
+
+ +
+ + +
+
+ +
+
+ + +
+
+
+ + +
+
📋 JSON Schema
+ +
+ + +
+ Quick examples: + + + +
+
+
+ + +
+
📄 PDF Upload
+ +
+ + +
+ +
+ +
+
+ + +
+
📊 Extraction Result
+
Upload a PDF and click "Extract Data" to see results...
+
+ + + + + diff --git a/demos/extractor/invoice.pdf b/demos/extractor/invoice.pdf new file mode 100644 index 0000000000000000000000000000000000000000..376327a73b16e87672dc6286279189e0f9f298e3 GIT binary patch literal 24762 zcmeFYWmH^Cx2TN-5ANO}32xoEySqCCcXuZV7J_Tg;10pvg1ZC>!3pjT!EXb}yZ3wc zIrp6VQpuD@M zLSztfFxGdnb$AFtxI$P$m_ax}SVLHWzilDxAdDbPAOH}I5cCj02oU(230y+n!Pdyx z(DR+js1SL1 z5sht(9(u&|_nx`i83PzZ+?>ReoWSJ~8I%M7P7clwH!=W5dPWc;gP}eENY4oFxWLm@ zKtc&%qHpE+m`C5y62Ks9Yh(OZ^~}Gk*J6;@cXBdzumLa#8@rer8Y_wk>ip>g`#<_1 zU}IzJ1a8*Z!0EA1(&jdnhzyFxhE4!2CJ-w<6UV~~006PkGcvP-*jO0>045GLdgjLj z@I5;{3+q$74kCk~t%DI*Vp@#gb{N570l?!tJk~5==mZuQxG4aGilPMK!xwIz@ z000Oc|8HUZe|6Bnf7=L;6QC|HfKWqd?C-5e;Ms!QrOLAAn5+mvHG+8B(kA#Gb!^VbD zP_PeZ3WE#Yi3uYV;$FFm6`ynC4y!Geb8Q}lBbRIYXWYJ4W=gWGU*JI=K1E~gz6#owqepW%Y-WK zB05GSU3~3?MTWR!>)T-6mrxYvp2~s_7LD+0J?f43+y}Yg-<8c#x6D@qYAd$KZ@1ad zMorj`d34uu3R{a^?+D@XR7HQ$!%(()GC^9)qP1WJq~0gPPv_vJpS}zTkb#*94~JJ{ zqb%+oe|`)#LWL}dV|Szco}4SVQaeqs)Y(wjJkSNBZBi0x0BWK1)*s9LLuC?V`WZUL z_g6VN#*kkU>ZcIo>(xF9r~o8hjOi?CBtZ$| z<>QgzU+gMF4bD$sH2k4I+94mfk!QP#h&lRZPM)x6<02ca83f1N zXlVO#n^O#Lr3J-d>JjbogN8lL@v%-T@uGyCp-JMYba3AkeN4Ycq{-NZX!Bua&V!sH zd(#K^YAprQ1Jy(YS-Xsa>P#&=GRH3)1eFprer|>diMz9G0>?aK%KyW20=qN_>)BvO zP4Oj4ThBSDjVuXQ8!`uHPg|=F`QsILxWx%H!cvf#G!WU&4!!Alw(4gxx9;@|+MuTR z8&g42~KorJ=V#&ZX~X0{cF!BV#}?m1Ep z8>9Aj_viYoMmoH5UW1 zuO)h_hY8W#`{p{~SBdhNv|2PL>m0)_9)CsZGx-E~JLgcndmRYZM$KS3;*VYz-Go<+ zCw!O)S2LCs__-I*Q^gYd-Qp1V2cLgbTxs3Xb@u#Hi@yGh9{NGVPt4yB&WJt%nwv~L zmQ&yxv1;~O!g}6tce&)wYO#{K{-nKzMk*uEhMwnV6w ze(}VeHHi5+%e7FHxR1VX*lNZVFmZKu$Lbp^nW#4Q-`)*}^}kZ=;ewc~^xu&p85Ny;29kjN%*-zoLd*~1&d;ZDY zSek*MPtUcW7;)6t4G2kM#82!$TZUy?Khw~FvLp3jL1krsRUUn4!dGi?cn0srp_=df z1B$;$&cI-;%8&F)o#;@jNh4F{{TDqm<|wow@?7PCjU87z7+MrOVbO{%edN{_k~Y%S zN=U>yC>%H=ZPURs;L5QF#JvT#iqYcIcWuZ4=p?FV(jPD^j*ubKC$AMMlG-Aqu%S8Y zwqDI*9a}8#E?iB*q|RxFeb2Mfo=-Ju)I_?-jGBI_0kr0u{~VndUxz^NO?voytFWXQ_ z5TrWGhdMcQr^oXySs|B%M)`*7R0N^fl61y4{f{Q*LE&O0h z1^GO;HeZ^@^A3SU^(S2w;gL<(q8P)XCu)gdG+pL*NcaFIZyg??n8SM-&*+fAqobaz 
z8jNfFj2e!R*GS~imo4OPe6d?r-7V15y;0)paCWo5onJMje3Cm#?-VaSbPYmRja$_B zq9<+jY1`$UsOj0qH^JC}AlL=M?Fbt9NQ7zx>&+xIzo&-fldC5d_ET`1M79~l5vbFZ zHo==0v1*4_&tsyu)$zFX8(_cpGMW|)y#U^%o~Cp_(4zG-mh3}_Nv?Si$3zM8)9v4= z1s=b4!7e_fVBmd5_~!Yd`*2q`&4Nq`+^C5`O7Z&PfUqRQ>bb$S!M<1EXx`H^H>J`orqE;;23c}hd9zDJ}M z<@4#PP4|QXI;j`LN8*;j^WayT%H9BVig6%h`9}~%Gh;SymjQVP0obDGOUn%qCo&ML zc|~tw+ukzM8ic;AKIhPFf#%_x^vO%zY^_6tz|C60qA*;nWBQb)H(=N)jd?a zu{uN^xkx%DWGW)@j&ig0Mi!#(IWcCA-;eH$J#_5+FEI#uDYCgxKTDC$ zU+C8Mjxak}iHbSw0VasYJ>VdhxKF*PVWdS4stN>=SrN*3&#sXHQWeex5M>=wSM9uH z-fs|yJx5=k232Fa&NQxrpxW$Cg$WV!i#JDHH@zp@>+1cMvwP&?Y29fcvT4vKiOZGP zMF@5Eh;kG#fyR*#o-Ifnd>9#w&04@d!YdP#^s&<0W-BUXAB5)be)!pTji%R z)aHk00okZSF8WMjE2gC$#Do@&NNzfrs_^1+0`0^#9Qj!xz0jWq!ww5`9Z^jzRGhcD zftiVZc#es_&=4%1b-WRJ1oGB-TmgAKzua6t3y7tnZbB5}$R~eT$cy47NfWyGsZL3} zv(xA{9*B?+YyPd20Xvi<1#46>zuaB)cw`dukud5G{x9pKcoDcY~l<4o5Zr>pLN@m+-)jEx39ylJq zM#S<6jH!_~%JCnGM|mbjaOsKq!hkL8I=zpNFAweX1l`SG3hfBvz4eB@6r1hGRDw+d zJ9qzR#X!`O+`Tdw`&rh(dnGvi#!)Atvs#(isS(0u^CoTM_t+o-KkJ}L{;?<&1}~ng zRLKz59P+3>$2I1Tld|@-P2=98(k13yv6&tGl4EAJ>P{+q0cJM5qnS+Hni$R;A5+{t z^ynN*?jiXX(;-Dl6wVc{n>-uj3YNbKqXkBl=jI_7Y05az@nj zua0#~jIpN|obip01w2|zzxCDM_S>U!`(Vg{*ua|dpU8h&b^j}BHrR-JuniyVzDLa- ztYrVuxgQL_|B23>I$|@+O4M_U*dgz<5HB8UL`zww#L#$2^ERU=)Yg#Pj>fWHbqK>g3wcLG>uMf+#}ma@ zFI&%Ak7+n=G4m#zkL=?+?&sSS*RD3Q-U@6gKVT^lkI|@N#pfF+RbaCSa=p_jdspDj z>7xYAd2ILYDy4ZwigNJfu~zjN3w%tgOLoEWeMJNHKwa)3hUUxK(s*#h(4l1O zomhWnQ)wVLgp37)EG9E%HV{K-Qp#hU4I(O)31KJ}Qiu#o%3Q_g$U!5(wSmWi`9Vo7 z8CS!HUi-@*0vyUWQK%;NsQM+36Y|dLTrF1KrhY(%U%?K(BSUPuJ+#G%kA&u`fH6f& zXTMXWf>P#mef^bxW;#U2vLxxHLShFNrx_(;;o4gAg=NBdBxcCF!e1V~J$F8W3i7;s z_m_)P!e|kuEJ7I8*%wLzMCi4is;4R9QPV}JB41D37N&?A@jhOCX}Tfw0d75`_77FO z-l%6?U`Z&=-E@@t+($X*M(F|N=FyIGZ&iFqS434v(@S;ds*V-%PJ4LEJ5xBb<=arQ zMTo^hNZBS1cYDQ4#R-J#bD`yPp|N4ZvYgn_U#|O#TgKC&aE0$e7y7c~Cldq|93?wQld2_V= z@7@v0*WSG9&CHXm&QwjBllQ`&c60Ve4iUJ*{7r$5x0&h7 z{0gIXf9nehcxLmIMd)13zu8@j=X<{&UF!MKGZ^6qZb4ZjqN1v5jN#;Ey4ePm#@GrE zB=?4>3FVBtq>3_MTBdpypGH4@Lg$xYPrK3d1LInm&*HPH^M_uVvqJs+5>?Ha6joHW 
zmz7u}kPv$rdm*A?DpnyNm(j1}>4{ak3hqtYb~)od{6%KfnVDL0gO(iqdvJRhlOdvw zM{LkzW*bcXmne$r3JyGrP4anNq@W$XYb|s?s;yslRYK+F-i6R$N!iCJH9cy^8Qd(F;9^;DNL5HiC3lj76Oo zwtXdQCjP$d`=sodnOf1r-bf~o3M+)N#A$KIL)`ASC>gUp!8C3=Lq@f4<)z!g zG0*1n9hQEE77-Zu+PT)_yK?bQ$M5(2>1H5Nm8Nl(o_|uZykMTbJ%k^1Xt~{se}!2v z3UEv$CxFa7|E|DGM2)P6+I~Xos{ak(d=qY#m&=O;ITOl(v?|6B<;-b(}8`K zW%pC;hmZC9{E4a}mHF?m%t!+|J_lYcWsCQ#UWL5$xvt%$7wv-~-(mQhZ&3XLN%bFo z^Z&ym;SY2JGBLCN>yg0n58foC@1$>KYYJwHVBX>gc%meK0jhr%c*ysV+u?z`{Dvw* zwl+@Spd$dt`t*yhX#51oz%dV85Bv;xgn8iL1Mva}Wx%jZU(nX=iN~>UumISZK{|g! zl8@#7(9*xLod;+qFQjA#`2B2oxc!SJf-#Vio#~&5M*svK%U?9}k@<-!=>r~~jfxM= z{R>HgD}D%yf`{^$OOzGxdoZGG0Jf)Iivk$|;P;=RzaOWM*(I!jOaSovciLYPd>rw^ zK(qjlq5wQf0PrvF`6#kK1O4;h{L|il2dlKDB35hATQ0tga`0O!dm+I;A2ehWM}dT8 z@H2#w|6}`*0DJD$3$Rj$Da}rAbxyImCDgeYtwEEB23u%h~bPLf?Dc`8mGZ zFIQNL2cJ=7=96-LRhAFf?*wbAC#>C6YI2fnvsUg+=H-`s6| z?~vhJ%(!3Qy8C=LP8yegxBSJYwB>rC^%hORpOBZd%E6hf=p>;vl%d4hVvP^|@*w#J zG^;=gf(@dPm}-&1QXfCPr6!`x~{|&@Q$9tYp7r$aXC$DxZ6bnt}2})~iNh z*#b|_y?7_d8KOL!iF5XnJv+rw@4(oGR;Y3*-{r2-Nm0D!X|7n^-Q?xx2B_jsSxwQd z*V&t=p@#u3;Z@T!Ib#C5RjK8!c+C^FfjVsmv{7Q+wG&@n7vLz?xmB#tK4)-Y^&Uk-_`7SA<7IaRL-z$Bg9j@k!uLT0H7@alfcVnxXV-$G--*}lN6p~S+ z?9N(-d}5K?_+rF$NCv}a0`=bg%D}6TDmGN80+1Z~-eHdncVp`{7jA`!aL-VTkPRkl z@)lY(#g7&g=&%$}BpDUGsgm;0M}Si@Uav!x@{|Sy=&g?=GidjMmu~v+URb%Zpq62 z^>K}{P(TWn{_zDC@3f)Sn+D@!%uESOv*ULZuXO|C1nb1x4WsQv$w4yn3||IA{4pus zF?x4tWAU(BDC)*iNHb|;6~ugn&J90d9;8c)=?i69a>t=;qamIeHzN z%=9$en@nZS4T)q)pxHBt;#&8mcEip+EoMBy(?oN|-bhqqnc1ahj+1+FGSjOpvXQ(1 zq%VQZim9>bBZC1&8fZNyrvlo&n&%n!&)k18_>S+qEa-aT%b{4`BH;qs| zTEI@?Psy^8_&%5?wqBCq44GY6pf*0~q^ST_jexFq2i8%cuezuyGyQAH+m^Q_*hp9= z6yfl>edLM8gEDi{XArJ@Kie7JEs>ZRm8BLa9t{t2oy)`#o-mYJCW}IZ63WVWfBODr z>e5VKGZ<6!()6;U&f&<4lQIPPTUQq~*X)K5)z9m+rH1lkKU|H}L&oIRWq_xKVr z76o#TGlc5Yu?4;}1bku?H%5}yEy#!%N7aRUiw&<~un1z#z#<5SPmpOCl-4^X%E_&~ zVp3hJJ)gJ*mSJP~qQ1UAKIf75omCMl`DTAySI_*NwsU^%9aD=z^j_;rDfsfv=VFG6 
z?$c#$4!AFNo3x$>ynDeVjLOh>-3R+$YwGVAg1}G?+^)`LGHF6&2_#{@f$q!h9$u(8hO%Sm=`yuJSTtqZ`PtNW#V%;}7rib>>-15lVtANovRV@Qp&Pk3%DYZ;q?b#FpC#tGity z;QHwEI#hYtjEY|CoLtw02HL&1Z#W)kxWb3W-#)y-Hk?7<&L*8ja(|gxIVJ4CDqoLY zF*4pW`g~CiD?Bv5`V3;;T-}8OzglsTRdbkSoAvbzbHb=b-TB;nIBUaCmmUL*dWTH% zZEl|>xjvI4e7mbWwe7~&7ZQ^Zr)w9LT4><#yif8v$dqXGiXf0z3ASsnVYeL5?)b2| zK|xKtMlA4ZFcpg^M$y8@qKn^3$g`y{%bc2KTHwPn+{Vq|HeEKvxQiM;q%P|EthfK6 zp*|jA?h>S8*v8knONerB?v7OD9x`r_0W${NoTUc6Cq)|=! z6Hw&qRlAsKHZoCsFF-;b+4xn+;5nWUr%T@(CEKN%Z?OC81Q*hWd;#*t3Y*2)N+Zl!PP2v((^atb~wi%11B`0HRRRA*wJv$@wL)*W*2R^zr1_3}1r^k<~`)|(m81e54 zG!T3w&dv@l_FJC@{SQHuwJMUl@&-2V#-ir|6#37JM!%4{0YsYRW|~HW3`)&0Y%~pF z>M}&AOft$Z$WYi=LPR;AEWCi^;^dOaRb&1FW{IM&sw22(%*f0QW5S2B6tcFe^fBJ} zdhGB2G@4o9;!AP7JNf+mKD81T4Zn~1&r2AYv&?JXEi>B%kR;HU*V5@I%)V3 z-l%S=EPKTH2$xS)N3t|3OK|E2uJI_+=_}%s#;}J>|L+EK^5c!4O1}r`v5$V z$1l1@x@y0YuE&s9YMu(pr(1+8e%lE}&qqM4+MaMcjaGE{*=G6aJ)y2=b?axnp9xpj zM{pt{LhA3EEPKlXy!}KQE2wg5h&uV|TI^8tIadl}!}S*h>aqmm)CdrL8eB^0e||Y$(oRhUK?cCVp!)n;^&`tzG4gXoQ9PBm0>?!Pl9DAa#s-Rd zJ%#Ipl{2$Nq<&)jJh41)a{e{`qNg5LbdRIkuDHZ=$Gm!>qs9=wTwQVXHh zypHEI0ZWGAWHJ{88b4eUu1~b^r4n*W2KhFoG3<5%%Sf=tJevI_PAWqXQG5LT19?ZS zobcb(h|;O^t08H`O4a-#$D@+{sNypZX;CnBN?jeROV+@)n*Ut?)nK8tUNf-v=T<8A zqE?{(vL%xb%b;d1o@(nGNqEFiHsg(wk%d>%Sa==KlfekNLon5AmwdDiO{HgFWIjEbWA-73fn06baM`>Fb<6EH3fXsbD?( z49_IGpsJlV103#Qbht^zGHvcollU)BBvTrCxRQc~L)4Q#?58^hVrxFR)oTw7uyL9g5I>OHN6UtC$2 zR@i5Mj4ox1BOJ9iDqmdcdZ`bR|kLC5@~<1@fI@EDD`pVhvGd#m?m; z`@0GG#iMybvPr3j;r{A*PI|INe1o~LQbdk+ff$ZJxD%mQPQ(&JEQonf;=1OkFw(nD zTPuT)r^$AX&HCNjt5-$(C5qi0=SI*Pz1_|V^hKzhE5cbYH|}aGNyP-kgkV{dAqvQ| zB9Ap4j|XO|(4!?-IgxGSdw6^t*}b8%lY*sU^b&i`@&^i+-G+ToMb_{Eh3(*V*p8mx z{?-H)`)aSUhjY4>yi8bF0V(ZqoPRG+dd>Z&_Fadw-C`-7TzPz-?qxU9R=1oImIeOX zNv&?VIyy!y_*njtN#`$jA-DB-TGk zaCcj{HEMszQb|#Xgp7Akuhu5jTE`^JGKH>tG$s$UACy1YtGBH74tmIoS{Iv9DFUN{ z$2OPbIMuBBir*G{PN089{UE?xen%@iC~=xUXUZuYr<#2E#IU&E60OQP} zvYnQpL|1}c32y>cbJ?Wm1#GpwQvS#dZ+>54Zz8Q&IX@o5#(mKM@B-^fV%KL3ea>f%cCTt7JGOm5nr 
z=q|xTjn%D;JZbC35Huy97Kc-GL-S4|A0-8s_OgVZj@;h&4$1z_wv~x{K>3oRNIl#I z)pXDCw4;NVeEbX+EADl7&swVJ)sJ3>5q|lsIXYDu*xxJhY zoJ@5>C{R2*HWwwOB`JBW+kx3T=Bn_jcn*j{Pgtv|#+O z+bAuJSPVekx_qHRFX@)u{dHu`OYe8J~U`WY!3<>}C*I z&S_%5n*1tfo(LepFza$Fcos}<&S|a|qx<^D*jLF+%gw5r+G*?7B<}8ZkLWn`9B8h- zqI8^ZW_i&~oD#Z%90MNC_hNDv;!F`P7UDn43trxr@4J}{52ggWJCP-1E-u#q4fVwC zLgu6Q_L+`hUD&Yf={ijO!H@`X|3E>AN@j+b#fgS1e9gQD}C&PbG48hry+#SJsn1qdq zEf{bDz0T|>S94gIiOaYVDXJu^tpa3!g z{s7g#R8qjB(h3H^;0Ba!#UzAf^uhf0PkFGv-vbcVdf>qQ4n_bgJ2QX`6EbIVw z#)o?r023I>KGdg!_;=(C`s=_H@JDlGVd8ig#c#I$&}#`Bb0@%q2d<%#u@k_=+{Vb^ zH|z!&fJJ132tJH8G6(ypK7}5?4XyR<9*6w@AX60~rGJVOU}S6pt_Uo&C$0W*D(L=* zj}BbsuN;pS0k|*V62PaCjXxO zcd5tAV;w+n9gK`X4qkv3_{zfk7-SZGx;!N^3q6KFPiah`#}Ly~2qf|pdguxxBQw*} zMfmB$@^k^$4NhZy%E`w16nbib^>^A+Io78Fpr;V%cb!k=Sf0v(o~mX7_v{hE{}~_H zoOqn4e@WNS+2P@k<8d+{W&R*ba~rTT@6+K1z|Pk0VdFi1{<%T_m~0Q5=P&E*Z(H#p z^1+$)zuJQ$HsA?)*kAwJA9Uh>EGZ*>8>fej>j=;SJs!eM|U}j+gFM2xgj-v;G zK&;@+2n4aQ(=&k>nScNecChdJQ;z@IR%GV@do}~XTNns7+WxW?fnX~5@#zHTZ-RJ;_>Z zzEwxUueA&f@C;8CKGHh!8HS)AALE-AqIWuxA65=TG++FPR*lCo+MSlzt=re$Z9Gkx zARRJFiO3yK8F$2>U}|2NNJy0ObWD^!AU|@O>D-X|CT&9?#~?viyYhZ*;l35AJ3M(< znQNeB-uwRI$n)U#4hd3FoZj94(BF#a-0kw<=7yV>mn_}B4? 
zJgZ2I6nX8&@T$vyVve;Q??6j_!@Zrb{qPwO!+_iL6dj6XCXP?8{SXR|W4MzLSu~vO zhE0R9^26?D0U8%>3Bw*cTn#b*M5+~pYB(iH|@Evwm6s@>o@$dOFxP_W>Vpdm?Zf68pj>H0#`-@UZDj9`#X#crN; z*;O?quYd?bSrC_6o@61;wG23%p(#Tl@!G@we03MGBN?)Mihrgzc`@Qc`bMK*V{xK( zX6yh_u#V|QD;uwp+JU{uL7KzFGHRZbk?YpQ(lPx*9HG#%Z zYYsUs16A67RlPQ=ZdrXdA&*Ua)I#FOFYlY=?C6OMo-fsP!32&;MC4L*-Qi=`K++&8 zik@X#*i&O_>eI2}CB6}xO+qylsjD~!NQ)viYi>`BLW`@MjKcU0KO&xen_rFedRYm+ zmCd)kr@&-OQ-Zb{$2|hioUe8kwLQrDN3MkvlJEQGBkSPTZ!iW+h#fuqei3Pi26ru^ z{z_&f{DNJXjs7Vl#7hK)?mOydPaC(*g14oj2_2QgsvbL2vxUp{K2lkgCkfYz@vbO1 z0_kgW<6jGsT@sLxo@LvuGYuQN#ZH%Lstjwk;t#=j4Zg{!#$%!ugkz*^AVeXFp829% zLTPJ7=;GvpT~i}muSCBQNAqvFCNyX|E7z&X6ov8 z`WY4(W4YO2wJh|-e#`i_zMs+0<~l16*eROr&l8fH>KE!3v+vEF3r%U-#C*L@_5BQ% zXMHK{=dV5t!5nBWv>>I-*3F|;8;O_g8n;ArbWVRmj9NuJj_ zBQuJ|9-Z};t)`ZSoq~k}A7%p9Or~l-N6CUNn>dqru2HK-caxapsE*)Z z_MSY&Ti(G_QW8@1om{B*V&%8bcT@cyc2ic|B964`i1id#SI4+oygUsgKI~h1R)Z z)KYxaIJZE|HN&@vie0v84Dt4QVm*n|)Df9|AVJ^-Zh3hqPw;NgC!ol=jW)1@D^iEA z%-Cp>wroJnY9VlR|3@t<&QaqIb^iurqjCNia^-Xd#?Sq7k-c?aW6GCd*&m;eEGNH5 z@pkE#4~Pv+sU&_cehppCse+{iJ(XgR_S4o>JHXP8{FeWIVuAh{DY}>*>AuZIQBiWn zuT$?^>msGClY5N#!XnpN&eRcZCh9w-vqIJ7q?MymYtm~}y?v1&J9&7JaAjWs8-=giMNOAWgqSkFn)03;ZeTUP z0U7b0I{)^dYvri|IVqkN0$=wrlFXtL!g5g5H!Na+|7#T-?KZAmletpjqMpen`t5zd732hJCY$!| z3A8m%!@=yww*5(7BdC;RO{thg#jWLqPiat6mY$vkBZiJgX-xr_J+>^eIvN#qhjeG3 z$EJ=FBlM%S1;Qy<0$f4d(yAOuH!eY0g?$_OGyBfUYFtf}kjfl(ulG^N2{Y$2fyg>v z56_b}+)_kEBlzS5crIXqxEj=|=?H_vO76WB=e>Sh#rbF*sM7x6CHIbj~@p)at~As2J_L#cqaPvnt%L zcKtd|O|v;k%DY|dh7$hixYjG0EBl+X&okB)%ebg$<8v>yV+_?A`=F7?O+-yaWmrwj z@4n1UgxoR;zMGRWg7J+$U9L{Gd>ue(3qq@SEA)DW0X}GC`>-WaPxPG|UB~$Q4ID4i zt{uQOYUVIvthL+B(JMFI_6^7?d!pNkWzW6ZB>!HiRcar!;s6Bhw70eTfrP8)jtnQx zH{LfwHxa)+AF(0xc0}q&EF*Lb1gB~AWTZT!614$j# zav`{47}L=;qGPyHi3Gc6Cz;IIVq-n3-?Jtnc7)8{7Dt42#ZE&~(fW&s zUp|394bBGY;5E^PYOhz5`!rY3RK66w6gjeJb+iq3Q zukCgx0!N$}8T(wy{OfD4`R{t@3-F$aiAv{WUdV2B=tdy2946)OJ2qx^Yzwn@JSDz{ z+(0LavQmXhIEojoyIyr&wZC|2NHi^nkV z_yQ2y-=%m#7uE#|@xSjyD$g$a!Ly6Q`XMA%`56!&=ba-GnMX_)> 
z!g*+(VWJ(aaI^8;g+-w!>W@9vBsPd=QCYz-2oiOA%2-rFaa;;7 zvfh8eX6;RnKBA&i$63x?5>BM8$P7y4;GB@~UZV`JKMV_p@h-_{lZ(FmG+LQqxqX%! zT-J)e&h~-#8P4aa6xS;pb)d);fhHIEVM)7_&@b-ro@`l51Yx-omO@Om_^~(2BkW&y z5slEkXiPPLEK9SSLs))VD_5rERa3ZU=qpqA50&%#tmqp#L_Mlz>s3ZFmKNani;^X8}J<3eb#U|H#HP;u0-F zm^Ppwlx)%EET`FRUjwnao?JgNs??~>chrQZ8VVC|zw@Z9l2a_s z+Bi{3I)klFY)iWo{oZdR`Hky!s=Zj8u=Xr6OKoKp2X92ashByQYXf8rO^y@P=9rdV z-2hZhb7Sp+sPzg-+a8$&ms?5{cgdRFy{b(b4KJ8x~8*OiDU|*X;a$>dn)%6}V=0pnXpL7HCU?x9U(oF0i zribK56#3sWkN*lk{)0CB?*YgM7V_T#NH#VQfRz;lU}9nfFf%hgUcrAbf4pbrURd>+ys@4;pNL!0kMw{|6BH*MH&v0+B3C|38TQ zpMglhClL9YB(gohM%E_~$@;_q*`CnjZ(7Lu8$>?QLN>N1y!e|dvOk8{o`53T6JGoc zB3Yku{^p9Tzj@?u5cxzr*`8`)`(5=@>ksG=j5FB;9xuQ7C+icI1c7V$U&WG-{>%T@ zBZ7Z$$$xCl|0$V#bYuQ^GWqBO{TGk}@2@{V?(dZJzaWz$f`9j6`7e>=AIA}}`wAN) zJqHIXBbXop!N(-@Kt|vLon&NYqi1J%xc#q@znuJ~=V zpC8%}k!M)#nK@M=SQhjAl!8TBWF3?uAEz=bN<_3~KvN?bszu`?`xdw+RW52LH^ZI` z>Mg@JEevkn7YnWjJR50izQgX%;9p=I)1sqATb&|L`(1;Tvw&{UcufLA zl<7)c4Y%zc0fxjkyPFr`*x2rCBKu#XaD|LpNZ<|Xb$B>7O8LZL zAQs2_l>&AgG~f%(0>$mpSn3h@pT(sUoR%N$Hf_And1U=U$()gDQPj+HvTk>2Qb;>* zKW|xa4zTCdZCex1tM>1Q+@GSJQF~ORB%) z!jNoc-Ru)nxFHR^KiKqI6_kwi@7;|If~b#~kfr@rXFfqty&$lkjXVmsr5-PA5S8P; zQnHh*si}E4scVej>(yqoyC=bx_|4N(DJ+(NWNX)N-)5cnM936ZSd5L86R3Ilh!FsbrS=FRDaYBXPN*@tBo9f+$fsq03Xh0V@X7gdY%wOg^Pd#4Z6r~YaxV2amqOJbhb!0w# z#c1!UuTctjgXj>3{Vf|asu%MXR+-vuKBg4YKe()&lkO-Pw>DJc0;+?mYGr!6)5arZ zST$EC#}qy|o$f9hM1>e@eyA8Lt1!1{a`Bnh+{vnP|MB%@5GY2G+GnoGITA5K<6GU= zNI{N{Se-^qRvj((nsoS|ToW#eZW+*1d#pK7*>oZvW>oFZt6~c}qbez3z98g?7LCG5 zP=#q05CrHmruwYxp_^R0;eb^m^^9zCPK}|HknDu7T2zF=plvl{c8*%Nhp#W(#T3*Y196^ zc`_@O8>mZzKXM}6bt3Sq8i1fZoQUI?nyJ0%sDBCQRBH6|Vtb-3AuC&b1pUjlhrA4<} zpOMex8_ze6uCAaF5SqOa>>=zCyqgga#J?igq!w~zLS&+b|!mc)of!wcg)^~!PATcvk>uHB}%uxz*ePPgtKy~ZdseA zVkM<}*WQ)$crJ{ov07S5;dWS|y7m9gy3X>}zTr&FTutA%`DwB3yc9Fx00xxUh&O$|fs zHpe|x2vznVMtvs0$>sWrOz0yHcXqbJES^$D=C8PS+$Q74JX;>yFLx~W;|mFg2sK^R zeRaxg?*=vANY5kt1Uq_Iq<{S2TwF+SSy+y4zAM)34~RB2dV_A?&Qv~dO=F4H47YRy 
zBLhv<=bdaKJ!WF8wIC#4kTKY~?=AwX>eDcs*j=TKGCoR`gTHzSQJ+U6cOeBActS(y7Rb+ql!J z8sfo`$@;HY~*MRO-WMJpu|dpDPd2y)ivVj_$U@JvXOoZQY|F}MXG`C zRLq@@lp|t?rRm0opVN%ju$Zm4qS%L=TEuN(4;qiEZm%pTC@vxPDMgUw-7ps=2u3U0 zx^L|PZGn)gCl2VABXgudj5&!#l2R!2GVeJ;KGoG+ptmjkFzqfS`Hp;6`-;50+}M61 zcL(1bBDr?XWPfP!i%WDzySP+l>*s_Z`kMT*4St0euUuaR24YiOUV6G+5jw`ZrY*mm z1xVJ3uJWe@CqikR4jT!5IV0SGVeg{to-Y`r zlKj9eWo?!xg59UU>wsMdoQ#}a7$+b^J+jw2cX>swaL2f|KGb1qJ9l$*enyyw;&TSG z?(!P7TZGpPVZ6^ObQJPcf1rUZCVg)Zi??6s&tgdA=a)zmat?}Nnt9kVlJ<5Q5`x;E z$zLNsO_Z~Y9deV!*z>jdWHl0O!z}DwFjOlU*BNL@f=Z{;BC8P8K82={kfe#mG$!S? z;I>{GY^My z?c?}S=onF^EFoJY5oRB!kTFJNA4@3N9U-A)t1Q`0N+}&JbR^59NcMe6jF~JIvLqCt ziIgqty!V*tl;(Y}>%4!w|Gm$3xt{U6e)sRbpT|7+b6xl6`@PNdYQCumvmCwnn}T2p z;dOzV7snBOS{BZ)X2|@ZuF70S6nCy&pVRU+v$`EnO_c+0q)Yt!^Lbz19a1jI8_z8j z2x-|Jx8KP5_QOyi)Lq`8Uw$=DLdWs*9+tg!sAS~ha;uzwU21S@{%LcaF3p{X!c&4K zD0z!+ZIsE43&_)A!sz_>Q`;nLepgToIi23@XQ59rS4-tE@bQnZeYw!}bwt`-drI>L z$E0RLA_w*HQMa7hI4fcT0^VPtC~i|W-5x~ji6f;xYuqs`==JBAsMkY!hW?cpASkaO%iJA$me+`&!wA~&c3%lcc{Oi>=Bo5<}dVd zZylBJ-PHN`SKA)v8(ci-I4vDqzC*ZdH>ZObUsTt1oq?Q7(>KzR#>ER{(K!{09VI2b zFZCvdZwRUN%gY3$Seb0TGx%KCWM}*rYfVLg(&Wt-p>lsE%Z8zoidMGSuW;Dv-q3^z zSE;$e)6=Vc46y}wzpk6o%PS10q3FdujwhPh z@G`miI^NWtvUrV$!R7{Yg;+z<$_M@Np0IoMEfG<8WM0I0EO$ZtSaPy@lirDi;?J|y z1)oA2m9OVeto^lSPMLB%AmQdIJOZRPdUuG9hswx7{z^~Z&(bFO0tf#}kW}XDJSaEKdSmsdM2*#SGi zV@j&4nY=l(xV|oRG3w1k%9(JE9NZbmjvlUhD>vKp1_Ht^+-$G1eC@&?ompa1eI?$l ziL_`(`j?Ttt={h40mUBoXhbw$hQ@pa-M$5`I|M)%z}7xt5*r&!L|F$~(nymRMRDSEu%Eha?JB6RZ54jz8x zS#@3zO$cZR%NhKjWo(qTsp+63A%Ihtw5P!@7Jln2e-T+uV9Z_6>Q$!^=e)bfyJGb# z8!zAP@t2Yo5ZPdAuNxJ;;bcY468ZJm(Ay9@(Je;3=bEDyj!{de1cxtQozlqGgUoME z?QgQKUhM0}qm{IGyd>&+i;2s|2L-;~M+s1FvU`)THS_3eNy$qy^jWH_*BSMhcJ-%C z_=tkQo=N4-s{5v|s&40;3FI{8CT~|;;j)=1=j2TkwZ#{vSB4ew{&9?dTI==Atb5^~ z7ov1C68nRCA9JS!Exi3x?AB3kMPCd#AuzP$0o!**uUM`WwzKsFYbgWs!2b?peP|49WLa!Ttw}mxRm_f|E9#Oeq{eB z$+b4ngYH9#(`$<=r$_2-enY&FeYU}(Mf}*w^b-tmT(=Cak?zN|2Q`-YF zj~j8FBj)qSKFc&6cy=4(1&2A2TGOXPy|*H`qs405f7RW#zE|`{7-6R%e?H$xw@t#> 
z@G-q@riSTfwS`lQt`>0D3-jju{$blW-IR*~+KU6hKYgqc0&yryZ}7t%CG$26~O}lD?@lxL-LfdxhhBJRjQiucVkqJKPK)zh3!a8HE;M3N zzUQMEuC9Iu+ROTj9}8z9MJ!r-7G$24tvJxCb$64Conh8`miE~Vv$6^@k-IM75~APj z`zUz1!+{RIUpKGZX-uvb zRcMiK!djhI=aR9N(AXm@*r<4bZ-V-y@D!HPcq>0UoBrh3Q_0TEcUh2;`6)p%HRMup zZ1R>x12SFzIK93baxby5`N4ptP&als{c`-AHj&)Cp#>1y*l+c83rf`hve4 zpRJnWnU_pcA1q*J9;@qcjHNoH8uD++HIN|xyA{kIbe{i-d3=*x|F_EEZ*pIDc`#G^ z_|m&^;TVh*SSAI~0waY2o9coMgQbuFPM}bjRpm4WNJ>#aU5x+=OE^$i zV=xQ_Hpm4(aOC&Tj2hqKC?rr~18F4+$*jj{hXVRP%|31{bY9 zq(2JT|DJ1s+#Sg0ff0kTZ4pG}pu4l56vP(7gu9@Iw)ZK92N~?|!*XhX&ByTsFm{&! zVBc8XT^h$aYyW5h3GfAe2>^Ui5Qf2lc>rGt3b;&MrM|}i{bn3e$5}Br0tFEX~VfR_|_eAf1hfw5f& z3C1_2mDfh$4>e>0JFUB>c1Kwx(B&1zRxPFRxT!fR5A1R g22Tg`X#q6W@1%q8$ { + res.json({ status: 'ok', message: 'PDF Data Extractor Demo Server' }); +}); + +// Fetch available models from the API +app.post('/api/models', async (req, res) => { + try { + const { baseUrl } = req.body; + + if (!baseUrl) { + return res.status(400).json({ error: 'Base URL is required' }); + } + + const response = await fetch(`${baseUrl}/models`); + + if (!response.ok) { + return res.status(response.status).json({ + error: `Failed to fetch models: ${response.statusText}` + }); + } + + const data = await response.json(); + res.json(data); + } catch (error) { + console.error('Error fetching models:', error); + res.status(500).json({ + error: 'Failed to fetch models', + message: error.message + }); + } +}); + +// Extract data from PDF +app.post('/api/extract', upload.single('pdf'), async (req, res) => { + let pdfPath = null; + + try { + // Validate request + if (!req.file) { + return res.status(400).json({ error: 'No PDF file provided' }); + } + + const { schema, baseUrl, model, apiKey, temperature, maxTokens } = req.body; + + if (!schema) { + return res.status(400).json({ error: 'No schema provided' }); + } + + if (!baseUrl) { + return res.status(400).json({ error: 'No base URL provided' }); + } + + if (!model) { + return res.status(400).json({ error: 'No model 
provided' }); + } + + // Parse schema + let parsedSchema; + try { + parsedSchema = JSON.parse(schema); + } catch (error) { + return res.status(400).json({ + error: 'Invalid JSON schema', + message: error.message + }); + } + + // Initialize extractor with provided configuration + const extractor = new PdfDataExtractor({ + openaiApiKey: apiKey || 'not-required-for-local-models', + model: model, + baseUrl: baseUrl + }); + + // Extract data from uploaded PDF + pdfPath = req.file.path; + + const extractOptions = { + pdfPath: pdfPath, + schema: parsedSchema + }; + + // Add optional parameters if provided + if (temperature !== undefined && temperature !== '') { + extractOptions.temperature = parseFloat(temperature); + } + if (maxTokens !== undefined && maxTokens !== '') { + extractOptions.maxTokens = parseInt(maxTokens); + } + + console.log(`Extracting data from PDF using model: ${model}`); + const result = await extractor.extract(extractOptions); + + // Clean up uploaded file + await fs.unlink(pdfPath); + pdfPath = null; + + // Return results + res.json({ + success: true, + data: result.data, + tokensUsed: result.tokensUsed, + model: result.model + }); + + } catch (error) { + console.error('Error extracting data:', error); + + // Clean up file if it exists + if (pdfPath) { + try { + await fs.unlink(pdfPath); + } catch (cleanupError) { + console.error('Error cleaning up file:', cleanupError); + } + } + + res.status(500).json({ + success: false, + error: 'Failed to extract data from PDF', + message: error.message + }); + } +}); + +// Start server +app.listen(PORT, () => { + console.log(`PDF Data Extractor Demo Server running on http://localhost:${PORT}`); + console.log(`Upload endpoint: http://localhost:${PORT}/api/extract`); +}); + +// Create uploads directory if it doesn't exist +const uploadsDir = path.join(__dirname, 'uploads'); +fs.mkdir(uploadsDir, { recursive: true }).catch(console.error); From 782e7ec8ef0f6bec3b42d283a71ec5413a3774c3 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Ignacio=20L=C3=B3pez=20Luna?= Date: Fri, 10 Oct 2025 13:04:13 +0200 Subject: [PATCH 2/3] fix(pdf-extraction): improve file cleanup and error handling for PDF uploads --- demos/extractor/server.js | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/demos/extractor/server.js b/demos/extractor/server.js index 7e8fe5f31..8cfee7c05 100644 --- a/demos/extractor/server.js +++ b/demos/extractor/server.js @@ -61,6 +61,9 @@ app.post('/api/extract', upload.single('pdf'), async (req, res) => { return res.status(400).json({ error: 'No PDF file provided' }); } + // Store file path for cleanup in finally block + pdfPath = req.file.path; + const { schema, baseUrl, model, apiKey, temperature, maxTokens } = req.body; if (!schema) { @@ -92,9 +95,6 @@ app.post('/api/extract', upload.single('pdf'), async (req, res) => { model: model, baseUrl: baseUrl }); - - // Extract data from uploaded PDF - pdfPath = req.file.path; const extractOptions = { pdfPath: pdfPath, @@ -112,10 +112,6 @@ app.post('/api/extract', upload.single('pdf'), async (req, res) => { console.log(`Extracting data from PDF using model: ${model}`); const result = await extractor.extract(extractOptions); - // Clean up uploaded file - await fs.unlink(pdfPath); - pdfPath = null; - // Return results res.json({ success: true, @@ -126,21 +122,22 @@ app.post('/api/extract', upload.single('pdf'), async (req, res) => { } catch (error) { console.error('Error extracting data:', error); - - // Clean up file if it exists + + res.status(500).json({ + success: false, + error: 'Failed to extract data from PDF', + message: error.message + }); + } finally { + // Always clean up uploaded file if (pdfPath) { try { await fs.unlink(pdfPath); + console.log(`Cleaned up uploaded file: ${pdfPath}`); } catch (cleanupError) { console.error('Error cleaning up file:', cleanupError); } } - - res.status(500).json({ - success: false, - error: 'Failed to extract data from PDF', - message: error.message - 
}); } }); From 2014ca4ee4cf8faf5614024836fd160a0f17209a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ignacio=20L=C3=B3pez=20Luna?= Date: Fri, 10 Oct 2025 13:17:42 +0200 Subject: [PATCH 3/3] fix(pdf-extraction): improve file cleanup and error handling for PDF uploads --- demos/extractor/server.js | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/demos/extractor/server.js b/demos/extractor/server.js index 8cfee7c05..8d5038bbb 100644 --- a/demos/extractor/server.js +++ b/demos/extractor/server.js @@ -7,6 +7,7 @@ const path = require('path'); const app = express(); const PORT = process.env.PORT || 3000; +const UPLOADS_DIR = 'uploads/'; // Middleware app.use(cors()); @@ -14,7 +15,7 @@ app.use(express.json()); // Configure multer for file upload const upload = multer({ - dest: 'uploads/', + dest: UPLOADS_DIR, limits: { fileSize: 10 * 1024 * 1024 } // 10MB limit }); @@ -141,12 +142,21 @@ app.post('/api/extract', upload.single('pdf'), async (req, res) => { } }); -// Start server -app.listen(PORT, () => { - console.log(`PDF Data Extractor Demo Server running on http://localhost:${PORT}`); - console.log(`Upload endpoint: http://localhost:${PORT}/api/extract`); -}); - -// Create uploads directory if it doesn't exist -const uploadsDir = path.join(__dirname, 'uploads'); -fs.mkdir(uploadsDir, { recursive: true }).catch(console.error); +// Initialize server +(async () => { + try { + // Create uploads directory before starting server + const uploadsDir = path.join(__dirname, UPLOADS_DIR); + await fs.mkdir(uploadsDir, { recursive: true }); + console.log(`Uploads directory ready: ${uploadsDir}`); + + // Start server only after uploads directory is ready + app.listen(PORT, () => { + console.log(`PDF Data Extractor Demo Server running on http://localhost:${PORT}`); + console.log(`Upload endpoint: http://localhost:${PORT}/api/extract`); + }); + } catch (error) { + console.error('Failed to initialize server:', error); + process.exit(1); + } 
+})();