From fb257b8ae33f2b0535484a59f9c19d9de92f3099 Mon Sep 17 00:00:00 2001 From: assitan-h Date: Thu, 23 Apr 2026 23:34:11 +0200 Subject: [PATCH] Add PGP ingestion pipeline (Excel -> PostgreSQL) --- .../PGP x D4G- Exported Vaccine Data.xlsx | Bin 0 -> 23128 bytes pipeline_pgp/pgp_ingest.py | 150 ++++++++++++++++++ 2 files changed, 150 insertions(+) create mode 100644 pipeline_pgp/data/PGP x D4G- Exported Vaccine Data.xlsx create mode 100644 pipeline_pgp/pgp_ingest.py diff --git a/pipeline_pgp/data/PGP x D4G- Exported Vaccine Data.xlsx b/pipeline_pgp/data/PGP x D4G- Exported Vaccine Data.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..c9cbf9c5ec507717822ddb9472c5105d07d633e9 GIT binary patch literal 23128 zcmeIa1yCN{mM)CDySuwX2=4Cg?ry<7SdidOkl^m_?(P~05(oi8aQlJG+>c~3XU=?e z>ehd&&Q2xo+i$v`etLDUwU_OdlLQ7q0f2yj03c_J)&h7jFdo0VTG1Oh=)Wi2g+BRJbFyP0?U*VM3%r!8O1yOV+WEkK!NNupAqlVyeR=g)A*vvfu@vMh= zuZ~7-oa!!x43IsE_p6XV-BD{!w29%4Y^&obIQ-}J0{TU?-uK$v)L^ofj<@6!9eU{G zS0Q?ZcWX$faC2@j!R$@fz}%H;?dLafoyZX6F3EU>-1iF zKCq4z>ViKV=bD2s5Y6Bk^ulDLq!7GJo?Gq~u+PVzISeS6ErF`xfO1hx;EoHD;~wj5 zbwHtQXjXS%6{Q~#SGJp`QP6!LxROk#E!J4%%{RY#Ghkf5;eiH@%dOQ)szJ6|3TmrL zowd@DYNC2wIZ1E`a0BxIP(T0xa9{ucx&N&wnf@)xf0Zbi|1HXYl_**MEy{nDC|Un4 z%72w8+5Rocf0ZcN|1HXYl_)v>Ey{nDC|~_sl>ci{V*WQSI>WzZ`M;Lse}p@4Y#l5e z&5Vtm9O<7weug_Ut2U_&DB#EB3$F>*!3|*``T1(giszb(milWg)AT?MjZ)vY+>MBq zl9VXayk^+Fy`8TVKDR@vePblyxcC|13h-D>(0DM_3(ot_Q-`!;I*5g|(74;d&IF4K zMYl?)D4-03IRg|cJzyw_?XBE&7-Y8BVpX5_qBvGKbd?t1H_yh!kVwR z*erL@${~lbY>bOPtBC+gacJdAz6Av{5!@UA6*`PQ0zx-B8Pn&ZpGL^vnikYAb0D7 z1Vtdgt|XLD-N1{MC4+Rq_)TbsTEU3;swhm#C2t4TLpkGHAnyHD^la6*Uu(l@7xucU ziJGr4nD+YRacZ@WE|*6)t9RmehL)B%Rq4dq@h!h)Zu9&I6yAjzP#~Q_mygAx?CKAn zatps|?3Gt)`oy+UN8U+u9p_vM!B8@hddR}cL{EPDRE&DIw*+Rb8YTR`Gbci5H>}?n z)B$zE9S#;3i%z2p+o=em-FQ>tJm4@;Rb=+Kw2Pgz92I0NX%z4QB9&oqHR? zawrmLlZ&S(mIrTzM{+W`oEnk6(X~A??%A!&3eQ}P{g&eub=GH4hN_`2gK$V=tM;wH zXvi87)zu{WSrF2#MKUagL+tixo<^9J@?A+W>(Gd^VqnB?GOT3W(P8qqg7{sP&~e19 z!VIXp@&V5h2Q_F3YnnjjXfy6Bwby0C;AFI67k_NMod7&)R3l@Ai6gPQUux}+F+~&7 z4lNHmLw=G$X~BVX_2U&%J<8#KjC5bf;8}})M!NHhkxL9HzMshVUJEtiZ#sUZ5#gH^ zR`yU;)-IdY3o?trAk|N*S<_h};|%pKFs_f(=id}_ieH&q z-Qv7IxtkV56zx`XS+0ssSL+`)2wAsTD;cZu!8#WRkA`yU=3I-~(oZH4tVOqeMe~{8 zX`lD?V*PeO#(C7B-JWn`x{O6C2;^W)T}kt0ZpGIl^MTc}vK~=&T+PV{`e+@N?Sees zuLs@78kY1rWu?VBa1QKrREuj87GTOmbn!`KN@~a!#U3Uz33t;c$C)wUtnb#H6{E(d zTXsgwaq(|#+$_09rW-PYDW(?pet^64r$3`8(43kKR&1e#Q<)FI}RaGm))bE`_*Wf<7&k{qyAD7#@O4&g{vB-va z?S}Zfn^3m9Bg!s?O#a7dy#z!Znh<0!OuuP}%s4{EAT?!9)k4K>o%obl7=m88_)%pO zWj}=Sp%E$*P)L>O#uQHFVVw! z#wh6>^#g6j!pNLHd+R4&A|{U4VwM>S=-kxC$Pm4a2|ElL0fS?_K@A+M-Z{ku@50J# zm*#f_BV`5}OvI%N3yWjEAsH~Dq1#8WFgw#x3D(A1QGuEibP+HI2FHHf=yw<~=|fWi z1|G?A-0LS@MqrKwl|vQ`_v&~sOT0|(H7$q!%;p3lcpC>x?{*q$Cv|Y2Y6>yj{rB{s z1S6S%*RZu5PCb6~aMJc^4_g69Pg_~^aJ2R$qumFYy;;O?UiL{#`0+Rani#CFoCdOF z;iMh74E3=J2D1d=yd2D8hEHMq@#Ft`Cju#|#sblY-;?cwFStx&sOP zk$%yrd_T_h4uzWjSXWY^h$ybpUQP}gXWk5Gb6@($0JsMSYV&N#*$+{4jNgjuz1&pV z^>Vz`i&L-fhtcwGuCGK7cHp0M-EM_ADDN?&bqV*6bp0RNyJwC6lfA2hMl@fFfLN8_ z?szYSzd%M98nw1&g#t~dHFi6~ertS}M(#q6FJejZnG0A%M4dZDrz@A+`l3TXD964k zh=c|uvy5#!nJoMZ8B7i|4I!BrhJh0}p+A;cUq>l%Y|Pj-rXiLWJx&qzIY}7JGUS6b zUoDe*4(X-HZpTvZByZhellxh@C>!)?;5pmv1poj zc)xuYbfjVLEU$v*v3^i%XFGJPryuragyC>W)my*FSH^JZZ7VsoSxg(+I{wg~S9`g|}F6_Myok!cd2; zvyr>ZhWVRF_%Khg{@tvk*CnnuvO6BTXR>;d^xk^ll~1e?3KYM{7o7HYS88sIGDW)) zm_E_VgjL$QqTZJE2RkeHMTS?U+D^Ofm*0I>6N7i0kzv5US$ALTo3HD#1ui>te0y_u zEUVn+GPc{h&2YcSGM>oW_Koay<0Ab$zNcC>#~Uxj2jw<;LjT?}&yn%qdq>C1`lYiQ z{Zy?!i0H|?6lSIf@!SI4^y_8f^$6&NmiL8bJ|PPlC_ zV!gEXQK42Tm*J3z(!}smLgCSNnOKvbL&)s$#TSx7i0qCK-R|-y#gT+1p)T-3%cIQ- znee0{^q^E{LBd1j7`A-}0i`ej6pw$ZepY4DfjTT_IS~*ggSBm2+ExOO?&RxT1iB_=pimD|tA6 zj|8~P3N10js#Qdc3<5J2utsxd4$YqCc%fZbgyV~!V-von`c#lH69oARGxH=6e2u12 z;Hgqj#UXvga&ERA^R;dR5Q2!PYYpTp_%SR#y>`hW0I~o(iz#zRMcfsIK43Fxf{IH~5;;sslsr7oj_A z>%bfh^_ABoEOGo=(4d%lAD%40Gd4ABfua4#(?{$8KD4x~fH&#!0`C!G8kJ1)T(3%a zN$Q%i@Aa4l5~VH`etoJAole=CnL}G6C zR`19y?53nVo;TwP!nTbg4mG%ob6yEY3hYG7zGox{-y{Tm++9#V)p8$}EWovl-C(CG z>gl*8IklZ2i^t>EJ|o5(w6$K%$<1Lw4X?;qGGO+kauvpy!J34?%W7Jh(Krg0b{ZW6 ze*dhm9F`Bp-1E+~G^=t1wifJ>+X>>Dr=^)CY=p5*0qfI6&AlHfeB!VEbRaN2xQUve z>^BY&jey6KI_hZaC1YDW5sAf)#3)aE7RdiOHaY6pM2*-gFCSxUyb`l%d5cW#oC$69B@}4Oq$`85<)VA77Qud=OG$4 zLEdcuNrpfoU1O~Bhh(~^FxVXoeEeAJ-IXM=4jH^3h8rrKa2WW&mZ_CgGKb_@>ZmvR zbq{~J?l5!DY}T3x*{20zX^R+DlH@uv8)R&~S+-&OabiO=kgxQ7p4hUDCgT3~#P+i~ zkh~t5%7GI6L2n(NdL~>NoyZ7wDw<~PhZ1>9NC;6-R zsTXw~hqA`6gbewnZMUWwwC2R8(DSRtZj?PJQ%?pe5BEJW+Fj_yeZCCcGNdbgKo${8 zm~or!xd?QwC&}T)E6%=ML7Q)~3i1$B%{bLh^Y$sCyF0462JW3zo~-rqz)=y_2tImYh^??Rl7TM!YpBwdFGA{<(a8 z|MX}Ro+@D!n>J78ht)Yg%bkyS97&u%Mf6s!8c%5pgZNn$Mm2Giq)~k`QM+@qvp5Za?ZZo%Z2H>kkMcOQ2Y;aPl6sE? zd)9whx+rMg{>s*S*!{t9b3u|QvJd&%{)cs9k@R=Au|n54t}UcjMN#6`C)4~N*QW`;NG|NwW7OaR7#c8WLqqH2w)y*CP znud{d4h1A9nkrXc;zDHphXVS zfx5PAH8mutcTIf;S{@5eKP+W0Xd=143&Yfu80n+rud7Axy^fL7b=oOE;Q^|)mw0cD zD^l4*5{HqlE=9d!lo3a;oDyyjFJ8bzWs3XlX7vd}AZNi032Ss_CDLN&i!Bb4kY=fT z2?=DYiN&{~Gp+66*1nQ?478X)9cI>xZ*!MyMvb#cwOFSs8vMn{DUj)#x`&XE+3Cyl zR>W_=wI>CjC3oy?OBkw{!wie@=RWQsZU4qiz~9W0B}n^Y=A_i96LlXqbeoph2vmeV zDkTMrqGLPs4=s~nufCFT47O;09%fd>ydH^@K#)llQYsf;v4 zpw@j_sS#wod)sV=b0$waq%zK%WbiI#gd-$tCfk1{1U2r{PNp319(72xKVXVv{g^r2 zaKIFrx)h5@q8V>*MF^|hryVaIIXEocmmVxmLl}`krXA~cJ@oYd(e_^nV1@g%qjS~B zGQtTK-NOw*&G#6;xsROQej0>`DedE6dmn_gRn`aMopxpfGB{^|11%=_&VJqiD7EJa zA=LgWJ}haUcBmU(9yca`l`}^F{p5uHiZjMV$+<%?9c zc?i<`bPn6;gDMZoi|wlwQDG13pc{vj$pY%AX}YqDEg7;5Vt1}=FHwc6wfr8Bz(~=( z8bY-8Wt^tlRtueM8*SV=`-?L>dd3IVq~(VFw4DN!r^RwpDa6$sSp(DoVz$kyBHZD;18ia1d3*; zoKiHz4)KH~cWLjaoS5=NYd|$+=pQ&?X`4yu$LZ70tHU@O|{1 zhq|j_YUww0K4qY0TaL@_J^5GtIuBUw(3JUi1HqVT8fY@>5+=UT3J5Bukk-cnAI9XUvEu*?CgyP3U$Zi{gtorcn zSc^0aauX)U-Vnmu)8uwCIGJ!9tSB5OQ#$^7TClPH?1W1*@8JxZq$`ENL@YRb>KpR1 z`)BtG`0Q%W2nYaR6#Q?i>+|#aM|G9^e?(osvmKKBZFT)w>z{vC*T-z}le%JiS~!hmesIF*-rPL8knAmp5LBiTsnPcV>m_#ocq$`eg5uA7KBISf?o5(Zr{;0BU z!+5rh{%*ZJ;~UcY!F3DmX-d7-7oV!J_VG5_b1hFN%RYBmAnW=c^Iq-A;QXp3>alN_ z+w@C4O?!o7FTUrx!Z$_exxLbwnE@1`fJ$kxxQk^#ugCDLErL29cxT(P0|t$T}u(w|BdJDIMiAi=_`K33c=1GznDQjY5QRzX*!;|)YsxRpHXa;wa-3u zYlSV+-j$l|nr_KN*teL_@%pq`?w9w1XV#B;RVJIim&xPhyYxZI1n9^#^KL}1kBUc^ zu&^PSj$vJTUs;smHob?{&42A={#Iba3xVG=S#UWW#Z( zSm~9I>(SqT8F3o5^={X}A-e6{yy4S#=6BZfr#s}Yy+AH?234-=zWE$q+bW-1y1|VT zWskLgZ`HsV+dLkC-!xp?T@*NOC6AP;qlg4w&!y1JZX>7P`@Er&JyCX?>CuG#5UoNMG3b?yVV9yL>)T$pTmqie`yPMD&FnW0Z_ljIDi-h1w z5U}0RZbP>P1w7W_Zg)2jQX91!m+ClC`nI)W5Et@@|J;#$JL{2FLNM*Nd^Hyr<P_GIMLDXb}W9h?DP}fcX34P!w;AlNvhxT6J6=_FVAxC zHH;n}3o+mjPMDbrq>stk44A)dZ=MzIPxfX``2Sk6)O$nTSuAQ1tFn5m{^EiZHW*gZ}@;aqqihq#=_1+1Am5Dte2T30MN{W>~9 z=? zvv$6z9NfV?`v*IRAZep-97O^r{KYZ;twY>znJ8-xN!wguE@bT|_NX`T&_*&q(Qr>%3 z$zU#Z%PBn_J&Y(Gt8qm&EfmLT3hEUfN!3Vsze_vXYyT}Jz$4k!5MNVU+Lr}zw8{NA zcrwQ#B7Gm@(LpGtn@g@O7^OQ1O63+Wtl6M;Uju5A5n*o_IZGdSsFJcB#lZ@WHMy6@ z!REk_4Uo{qMpxTniQe2B7>XtPh;_>%a!PkqW{5_?>>-wgPrjii5Do44x&tI*bOn}i z!URnarYiY{(8Pdwn6R%tIhc^FB6$`8d1=HwBzb9M3juj?WEx?*@bnY_*=7Hhspqzs znc>blB|L>J=borsv0|AyfbLDJE7P~S1tK|?vpoAm^%~4|_?806m3l}Nk=RU0nq{F7 zxfw^U1$DykTe>O>;%Nlo)_!$OB;FvNeWEBdB$>O6&x1sp1s~29-Pa&MTT@!qA|bLu z=VcshD9u{ClY2x@>zzJ_C&PzmnO58fvG5N3fPT``xKu40?MJ`B=(@;+zsZDe_NKX>%TWgxQlE8iGhB zP;&9bn^MGmPRQ4tpa}@bBtICN0ucDmSoAmMqQ%86?*bVk+K}O9QC}ma(kw$g>S-f` zdMW9p#%@Q-;$u<8dYS*Js3P>DsDk^msAAWT44qBWlWlMX^dyA4>$$tXt*1Y`t>VXH zvKbKi9&=BAn{`yHX)BF$mO<`z?21P82{mC9S0Cx;}T4=_rHK z@u+jj5aMo)6|Dn@t9Pu;%MjS(dUZS#Eo|Lajyu6rhsy_ z78(`uF#tjxyWTrphQ?hRje`nG*!BVr#1c?g#;5^u&8GtfnVGMn7@UNFtsha6Sw>Qe zNH`-Se&mx_PJG2ZXv^w?JHQwQn&?@MGqR{H{<_0gi_sir6jl(1cQb_0zC;F@)@_Q8 z$DDswb*ni375!IP_r6fd|I*022`crWIZNV;_kLP=xc3o-AK` zx9SjV;uj2=4u!s{l1O6>zKJGlQ2?bKY_2fsbvkWN__&eywv_%G_c1?wV-YZcG+*jG z&ApsMbb76_?w0IW)R{>lrnt7tG>q1KPrOo^^7kk35W->weiHN<4J0|anrPZOi*)k+=iCRLA5#_yVPN5ngC+g_7XwsPTvzU8LmLID zbqr@LBsm-~PU)%A^6fI+KHlZJMi@2PDz;TqGwxVz2W+>TH%zaRcS={yS3wgpZSp^r z9h`4bSe&MpRk;|QH`PU#nirK+^QO)htg>@1YmqA($MsZ(6qOWoV|O>W1$A513zsF5 z@hUZavmuF~s~#J&ebd`^r&l}{QbzT?uf}9K!a;F0xx}AqYFA4aRGgq-=!3kH@#wPt zg8}7*O7Dkknx>25vWy)LSa@mfHpiA9OmvQYE_aVbQ{E2Ku8ipjHbr5m0$$|%ha@o^ zM5asw%(ZvB#>Q^rO5CE;9|pLEO=79H$4%H6gUe_v?y4b|1dZ7ryvigUQ-av+kAU05 z=C$8ohM2?_e0XzrSwE3KcLhghe%qf^KOBXR%`n(oRt|16KLT%UT6A<>2Lc%ov1Srn zrJ3}--C+9gs~Qth4gQ0%40lqLf_GLs6J0f3sx+1(R0{OCRg5lAo4SamBn`~hz_~~~ zQxa*i$;q&xqohV~Vz8vQ@PdRUU%DAW)UOzD4CZ*k_ZXwk7O znXm%UZIVu1F?9XEK#l4)+Gj5?w1J(d<-q|YWw%o9ziRC)8!s#llM8a9*8-8v6&gxJ zq1s9#c0}kBql)Iu3?z?xFAArA^xiGfI_blRc8YiGKNorXF%~ZLoS(+cwoN-{UNS)5;ODn!j^W#9~Kvql&9k?C> zea|8ySppc4Ex_$S=3zRuB?Xfvo;a64S4Gwtv`MAa1VCnwT1HY6R4w1dxdT5ebv75* zufbW7x1uF3Pe+TwfgHt*o{1qNQUTCg53a^1cg}z*etTupk7w#|Y1f}QH4#Y%z=x=1 z{@zd0%Ep%fq7p1b2ncf`V>-cg&9yA(JS0_tgH&JZlLat9*D+Wr01@FOo*zI47fOiZ}F{t5s19hoIY365@xgsz0cyW$JMJO zx+DZZ5HB1j+Ix+mywC+@5VN4zZCN0KfN7W$JuPON^3~{}3T(8YLfiAyP_Qh*^MT&^ z{JEoz^kJW|;xTCW^7lJDo^}7vrmx^{I3utBZ#X04zc?evaXTOWXVdo|O}4+cv@!l{ zvQ=G<*<^dReZxlqw1Y7;Pzz8PR5x|Cwc;7AA~0o@v_ou(S2qwrX9cT7S>H`&L`A)3 zL=~8C914p!J}#HBjB=60J{mv8BE?W!*!T2ERZ>NaFhrbnTMWd9 znLfvat6q+OJ0@jc;yms1zNTr&Bh|Z}9)6X+?%IEEzsgM7)aC9Y!&&;EiJZ{2!k(r8 zrC0%7#$dtNa{bn)UIjcWG~VH&uj{sMuq^p*u2!jE(zlGowm3~K=09ZY@1tJ|#O!_- zNi8G-t!hX35A&BT;n0FUo*$16tayhrDi_@>j8=`kUGKuk+}}opeOC(GWN7_p{tBrb zGlJ(DF8q$R58B${gx;yMj5K@QD(^e)dV;HBKd|f^UT+!v-LZTN$=U#n_;tY z1tc&R0aH;~XreDPoQogBUYmUBgn(qE#wUB5G!iV2P<_6QEI85^ObAR?g$17FgdlN3 z!b7Cvt+yu{*JH&5RH6kCG0TzR=gI_E4(LPFcM^fU8|t*lhk`A{7)8N^)Vm^VET!mg z1PEamPOJ~WFOEfQO-H0(DX`BDzz-6E|3wnaD^zrSAw4uzqe#w#8odLO%3`N4nNL|g zfFoBd5gT2JOQ9EBc_k+-LdpKkWFSFtsg|!)1?p-;zLu}Af?%|`0Gcn+y`ohgyNg1E`b|kYTi<6Nj%oRJ z%xX#62^fb30s?St@u_h_NOE`eh^ax*2@xHClK9l?8xIK3Eh6O?Rf)LDu?6WxA_dzO zC%UlLs6k~%sX!OafSv`yqid7ayATS+5x)`E2rw2a+9Y;+rP7aQoFnSu2EN+bXIPU% z?JIX#25z66S*i{UkcEWm3>@ov8-tjSc+#@3714`mSNy365J2z*riXH>?LO)X8=)Sy zNAPvb_*j6e20Wt;Hw376=V&Js0fb=WFn}*|6vy>Z#u|XHQWW>qq94NJ9qglpu>asa zWF}<~55you86+bGM#y!>WeLzls&AGB8)!FV2TPo9xK?+yc+U>IWNyBEc7Bks!g>Db zXm1n$DFSI}o?0n;w0u+l(DIS|!(QHJ)&H~U`~Pb*Z`H;ksJ!ZCvEd!alN2t^z?Gg% zAN4=%=0*M~1t&9OYh(K7f1lgoM^%??RtO%A-aYVUYv+6`WwU%$sxrQY@_yYubLB~H z=K>I$`F$IPhqb3H8OQeP-t8)6d@u7Ga9-Zf6_JDHdnWTv|G49 zBtELONVK6kbKIx6&ZQ6SeY{cGECkJ5943K-VS&hDxiNGxH3$d{X_OJcfe4fvJ0#x| zVx|P+f_%RVX2!!;d}`IcN91Z-R)E7}&=!<@AXw7M2IJdCsGo1gQU&hv*d^u3-^AW@ z_r&rI;1g!lT6RRcUn{NzE5L$lv@PeJKYQ&Ou#Rid*tn>n?OosN0o-z+P48JBaG}w{ zi*#URx`^AdzG>R+YRcYxSQ0Rw8JPFg+qceoeci^=2{_c@dfV9boj~7M;car-k|^eV z>}e~|-e%F^psy#DSRnA*<{ipBQf?hYE*tosB;2!*>bTZvDq2P+;8}gs)q?f|;=-Ir z1qt}?p)utGl1Q(lyIYy|LtQLlv`8$}=;u3oaSj+a8gJ8wKEK}(pfx-|HIP^A2SRC- z3xYyOFN)W0YJCJC_IpYtrj?e zL=*HiFTo@CD?z^FWtZ%HyP_Yh#Bf&_#w$ts$u}=~>A(b&!m@U8Y~{LPGhwcO7-t_n zSqKsGb6{?IkuB=r-aNbiu(-EUHoW*H#J9cZhYl+F`xQE5j64ChMHodJt8Z-YS`UhN z!ied)?tHU^{SP7ybaqg+qOf0y;AoihQn4g-HM{ymmfch-9!;g05u6Tce!WX+$r{2{ z-Huvw9)5e~nds^v$ncT2FSl`PD$l(i6Cb3{p*pnXRqU*%A^YBuhEWUkS;>X$@?h1~ zbMe+V$4{T)(cF~&Lvw@jQ_&pF^c{?i6rCKN7TiCr;fZ+EG;H=~O(V?m3m}XBS|fh2 zK!-xv_Aw?$qv{=y7pA>vb6~^g1oXwJi^nKyg8f0w^i4cEEa;EHIlBg&v9wo?uIC4e zg}qqms&i>?BG-Ss28yUjX1a3Q_4dBH-immQO9G=b!T4%yw`Qq@ss4^Q4%gFeL}J- zB9rs&v*H7FO(y@3KnAo4Xj~o~pYiM{5X_y|wS`-G zb8nLZ*6O1s;x<&i7R_-34S(v{A1PZ5RfUUI{h^SoM9@t;B(S1C3*hQ=#+ib{0mE~p zhwR*>`-+O{^_Ahw55#Xgk+N!ZR!*;|gX%1$aO}36-x=%07u9!ARSJ_0a|7Zg9l@;n z`{-w)Q_4jWmNYu=(Xi4p%a#C|FOn-!80;^9#+%Ulwva$wYv;UYkT&S+26p)V7b~JIHtqY75cM@I` zxsNUJv2UNu26j;7_WFJ~M6h(Hy^i1@GNL|Z8ntOTGE>y$(emm*p(L1HK>Jkn}h*3;F|>5q(i0%+yVewNlZ!@6w4%TQ70bY*GGG~4_}edWSU z4g)4G6sE>A`SL4!-{-}9m7Gs~l+uU&Lo+>{4ypk4uLhJ#A^7l%R&dM&)ceL>OKgj$ z2^KC0N#=egRnC5^^4r{2t*8H%riZdw8sQ`ehA9s~RV?N2 z*uvLA5l5p#3!;iXzJPay*=@Exq{EuqJtsG+n;(6?%cP~n?0YVwPY*s!WHP6;F-7CV z$f7WzYekKm!7=+!ww=jh@7j0917w9tnx={mQCyG)X;y~1m}}E4vIo_bfbyW%vm~R< zF(qKeAViX)Zo|z}*%yCfy(9vFuWPnB?m3JRF>|>c27LIKd4x9Gq-_$qKUJLvK_x0m z41l|?AwYr-g9D=-Y9lR+NNcHmv=J>r`{{$|SA5EYw}ct$#qp~8{DPQjn0q8&kSfbN zhYQRdH;%s~V7qrW&@_TofZ!KnjbtFbpFFvRV^K=@Vc~h7GVxja3(9WMw=AhTrd&P+ z=@Q*$XG0q|ZL3DH0)-+*V;}-L)$6O`96(0`^~F}+ZN_u=3;9fUlR3XCn1|o~F zl3Hc(48eg{&Cr_&ip=a+R#Nnq zDdT>XN2v0Ut-L)j2^ee+VkhuL@&_xvT^vIy2jl+kfYqf1gLE%Wi}@Tq$tRBwC@LpKdp2qS(&GNl-s zV;$M8U3KslWW!@E_}$O)-a*Kh>xakgAD%~N?7v@k1^XWph?AR@@$<~FsxoS`%mCx< z-x`qFEn^w~k-yTY?wy`*uy&Oovz}^if??M62}xt^&5s-!a{KqN((hPrP6&}G_qRmD zkdRkd6f!+bXy&QMRtEs(d$i+^l#i_FQ#D`ug-vmd5u6JHaWtMy+^2!90v zG1S-KRF;BlP#q&~LGRp)Qi8H1w$I0cZ9v^^W^+eIIbg)nL>A-l7PzYgQ`X!ikl4+n zX7+9KNDWC7g==4Aw26_3S)Fi9KLc+TabaU|F|==K!ImXmgXL=#j!-39#V<5NPNg`X zjq?TcFVh0PyNAgG+!iv?%H% z3)&I6fuJXt_m*wIYK~E6TK>YVHaiBi`|T$sS7mg=@;EK?=_}}hPZv%l0oJc2wvZ`X z?{q9L^yOb!PsU*7;QQ~O2d3>FR^Qw_XW7S{pkLKSgz zIJwBGLAOz^k_#?(CfERsTUt9buy-RtQrRxi3uH=#X!@wOp@Qd0`}s$dlQcNtIlX>6 zaQ&rifiF{;bO}^WAqF%lQ^NUu%X(JxhUdncbEbA_dm=xQ>}BE4N8~Mpx-G>v)wHmx zZ>ZB9<_C{=I5Mtsv@LgQdiFXeEP7LH7W=1A$R}|hG@Z!X18fRmeLz}w4Lu_^nMLea z!#DOZ5kbt=hgq4;FKl?6VRpw3@TmgIVp|Nb#!>pd`aoIDv6eDjbo_`FgGB(6@x@G*tn0riibPSl@I zr=78bqpgi2{m;)Q{{}al5$^FlLIAoQ_(VrDafL-GE`|ny;6kC2M652#ngzERUG7cI z`b!6!Yc=MG=M!w`c0KY6pAv(2w_Lt;6oSg=Ba*@6DRa zf**NiJ^uag^$h;!l{_Vb3~X&JpVwK>=tre6V1OUXRvbaS5nM$ac{ZzjWod0o*^t}ft#Q3<+UVF(0EahlqHS&nH2_qy$AdMuceFH(Cqc$N) ziL_nLt@z1`EK|E5<~RCx{R`b|Nx+?Mhio-J+GH>tvq5@SduB3EWNb<{@#w8PwXFS2 z%Rr8c45m4B7Kfsj|Y3t8`jtZY#e&k9Mk zedCeNPFShU6#L+9$ly z+0i=H%Mad5x$I(v3=Udu@C4Z~t(~cn+6C52cL~15uc>ovZH8I>;`(O35%J2EjGh%D zbhTx$8EQ3n@|%x39vchXCw1s*%U$My%7{f-65M+_SJEc#&a&4=H_tg!E(o@n?ZyVHCvkw1s#k z@4u2UFKkmzHm5NneZ7IZM=A@;MmsL((S)A5WeV`7@all-Sdc4|WT0%Ud5?!LZl(x{ zJ1LaGDw%Jxqh9S|VKAN&ugFZ=w@_>M7H3#7wh<=Q1|p@ca(xZdzDRqEoEulo9`@?p zI(gMGk)HQ z|H#(x!}&8HfKEIvy*4>lu+tCEs$e9ug2l_M6C~uzH!towJXDB)PK>=e_0B6vbTfee zp-(`aIpCmYhWe!w3vm9tg&P1`UW1mribAg!z`2%^o?|e{Mu4}W1IZx3mq+ByXRZP~ zQ%oiYv*|oy-r*z>+lE++IIt_a@r#vqY&ox_l1U)X9B946{bT&pCL1|dBIgg2efX%l zp(K*F3{Yxh_awY)A$>Gi9mO#|TxzY)4L%=jvk0mlFNETM%++WL+S)i7+c@bex!D;z zYCl=0(Zp4o00NZ$4|+#>4cc*aH#!y`l~KdigH7Oi4kdXIfr9Ke=YrOWPA3LdEN3D; zylLzBGr&^2&ZOD!3~E%#G4@PEhyzk53UrF!>s-Uj#RC(>#{y&C$G{RKkIUq}ivSDP zG7Y7qESas3Gf?5{Wst}EsOVr!WuiSxQ)sdnM(Zadk|UWyY0kg2z{WUWKf zc|3K3QeWd8li}P|HXTMNo&~uJTPo+Ba69~eLR&z zpt-e~>qJv^4Ptf3?nd~#z$c7&u)PC(o>E|&TUG=-bl{s?CImcXk01MXJQaFim&L3$ z`aL}K9`=uJO!pJ3dCftwD7uHTk^nw3>_VK$OIR-OMp_Dc@ zEWQo>IM5&ACQpSYI^Bd);jreU!AMf<1N0&}c5)cRl;IG&7y?oo;)B7rZ9sk|S z`u=$o$;b2l0p)ou?>~={@aPZx0p)q&)IX2%=`m^Y2bAYd_J1CQ=FwLF0p+>D_Mb<2 z^VlHp2bAZwxqlv|>#_0Y4=B$~J^wrkEaV?io*NPVc@(urf9(%ko@;vl2<535_SeXH zw6A|a`B@tK9p>drG=6<}j~+W0A7TDWh3(0H{f_eTH59~uF?syP_~@Jcit@+TZ~PAU za;^KXv{(^-2mI?@7QdssTrB=8EyBm#-hWH$uT}ED1HN3T_$w{4M{oL9z-L;2RIisy z{eGo&{phFsit<8Q&sfhh=I^v#u0{Ek*2ClKlwSd#Y5h@OUoPhOl~y_7?yHNDWz+hvv~);+M|nZ(uWtR{rS-Bk_g7kmWWNJG)B2-DdD+|g zE3Glg-%(!B`m3q;cUmty5r3sMP4zqAGp#?GsF$sLztVD{{T<~6t-pF0f2Z}b9qU(G z9(2D0KGXW6&3oC;^eZh0#@|t1(E6*9>vvi&TP1#_1%#NV&%Wyb5NF!I;1;(Z4AFJ+S75ng5`o-$*9jZ)#C2>&Nf_WPkP(*RFti@%1X z*z=+PHHq;%#LGDGDPQo{Kox(6co`M`e(=k<<|&l?Yp_c`AN=zsyoC7UU3eKZJVil& tjUK6AVxr#(y!1bRy?y1<004i8FytgbAKfDW0J6tFxsPr|y6n@|{|71}nu!1a literal 0 HcmV?d00001 diff --git a/pipeline_pgp/pgp_ingest.py b/pipeline_pgp/pgp_ingest.py new file mode 100644 index 0000000..d12b233 --- /dev/null +++ b/pipeline_pgp/pgp_ingest.py @@ -0,0 +1,150 @@ +""" +PGP Excel ingestion script + +Reads all sheets from the PGP export file and loads them into PostgreSQL. +Tables are created automatically in the raw schema. + +Each ingestion adds a snapshot_date so historical data is preserved. +If the script runs multiple times the same day, duplicates are avoided. +""" + +import pandas as pd +import psycopg2 +from psycopg2 import sql +from datetime import date +import re + +# ========================= +# Configuration +# ========================= + +FILE_PATH = "data/PGP x D4G- Exported Vaccine Data.xlsx" + +DB_CONFIG = { + "host": "localhost", + "port": "5432", # Change docker postgres port + "dbname": "eu_fact_force", + "user": "eu_fact_force", + "password": "eu_fact_force" +} + +RAW_SCHEMA = "raw" + + +# ========================= +# Helper functions +# ========================= + +def clean_name(name): + """ + Clean sheet and column names so they are valid SQL identifiers. + """ + name = name.lower() + name = name.replace("%", "percent") + name = re.sub(r"[^\w]+", "_", name) + return name.strip("_") + + +def create_schema(cursor): + """ + Ensure schemas exist. + """ + cursor.execute("CREATE SCHEMA IF NOT EXISTS raw;") + cursor.execute("CREATE SCHEMA IF NOT EXISTS analytics;") + + +def create_table_if_not_exists(cursor, table_name, columns): + """ + Create table dynamically from dataframe columns. + """ + + column_defs = [] + + for col in columns: + column_defs.append(sql.SQL("{} TEXT").format(sql.Identifier(col))) + + column_defs.append(sql.SQL("snapshot_date DATE")) + + query = sql.SQL(""" + CREATE TABLE IF NOT EXISTS {}.{} ( + id SERIAL PRIMARY KEY, + {} + ) + """).format( + sql.Identifier(RAW_SCHEMA), + sql.Identifier(table_name), + sql.SQL(", ").join(column_defs) + ) + + cursor.execute(query) + + +def insert_dataframe(cursor, table_name, df): + """ + Insert dataframe rows into PostgreSQL. + """ + + cols = list(df.columns) + cols.append("snapshot_date") + + insert_query = sql.SQL(""" + INSERT INTO {}.{} ({}) + VALUES ({}) + """).format( + sql.Identifier(RAW_SCHEMA), + sql.Identifier(table_name), + sql.SQL(", ").join(map(sql.Identifier, cols)), + sql.SQL(", ").join(sql.Placeholder() * len(cols)) + ) + + snapshot = date.today() + + for _, row in df.iterrows(): + values = [str(v) if pd.notna(v) else None for v in row.tolist()] + values.append(snapshot) + cursor.execute(insert_query, values) + + +# ========================= +# Main ingestion process +# ========================= + +def ingest_excel(): + + print("Reading Excel file...") + + xls = pd.ExcelFile(FILE_PATH) + sheets = xls.sheet_names + + conn = psycopg2.connect(**DB_CONFIG) + cursor = conn.cursor() + + create_schema(cursor) + + for sheet in sheets: + + print(f"Ingesting sheet: {sheet}") + + df = pd.read_excel(FILE_PATH, sheet_name=sheet) + + # clean column names + df.columns = [clean_name(c) for c in df.columns] + + table_name = clean_name(sheet) + + create_table_if_not_exists(cursor, table_name, df.columns) + + insert_dataframe(cursor, table_name, df) + + conn.commit() + + cursor.close() + conn.close() + + print("Ingestion complete.") + + +# ========================= + +if __name__ == "__main__": + ingest_excel() \ No newline at end of file