From e46b7100e3a467b7d8e8b5afadd85a7a31d4071d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gr=C3=A9goire=20Co?= <gregoire.corbiere@gmail.com>
Date: Wed, 22 Apr 2026 16:44:59 +0200
Subject: [PATCH 1/8] feat(deps): add ML dependencies for embedding benchmark

---
 backend/pyproject.toml |   6 +
 backend/uv.lock        | 621 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 627 insertions(+)

diff --git a/backend/pyproject.toml b/backend/pyproject.toml
index cbd63d7c..8bc9dffc 100644
--- a/backend/pyproject.toml
+++ b/backend/pyproject.toml
@@ -26,4 +26,10 @@ dependencies = [
     "torch>=2.11.0",
     "accelerate>=1.13.0",
     "bitsandbytes>=0.49.2",
+    "anthropic>=0.50.0",
+    "peft>=0.14.0",
+    "lightgbm>=4.0.0",
+    "sentence-transformers>=3.0.0",
+    "datasets>=3.0.0",
+    "python-dotenv>=1.0.0",
 ]
diff --git a/backend/uv.lock b/backend/uv.lock
index 6bfdcf26..e3561056 100644
--- a/backend/uv.lock
+++ b/backend/uv.lock
@@ -26,6 +26,79 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/7e/46/02ac5e262d4af18054b3e922b2baedbb2a03289ee792162de60a865defc5/accelerate-1.13.0-py3-none-any.whl", hash = "sha256:cf1a3efb96c18f7b152eb0fa7490f3710b19c3f395699358f08decca2b8b62e0", size = 383744, upload-time = "2026-03-04T19:34:10.313Z" },
 ]
 
+[[package]]
+name = "aiohappyeyeballs"
+version = "2.6.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/26/30/f84a107a9c4331c14b2b586036f40965c128aa4fee4dda5d3d51cb14ad54/aiohappyeyeballs-2.6.1.tar.gz", hash = "sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558", size = 22760, upload-time = "2025-03-12T01:42:48.764Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/0f/15/5bf3b99495fb160b63f95972b81750f18f7f4e02ad051373b669d17d44f2/aiohappyeyeballs-2.6.1-py3-none-any.whl", hash = "sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8", size = 15265, upload-time = "2025-03-12T01:42:47.083Z" },
+]
+
+[[package]]
+name = "aiohttp"
+version = "3.13.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "aiohappyeyeballs" },
+    { name = "aiosignal" },
+    { name = "attrs" },
+    { name = "frozenlist" },
+    { name = "multidict" },
+    { name = "propcache" },
+    { name = "yarl" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/77/9a/152096d4808df8e4268befa55fba462f440f14beab85e8ad9bf990516918/aiohttp-3.13.5.tar.gz", hash = "sha256:9d98cc980ecc96be6eb4c1994ce35d28d8b1f5e5208a23b421187d1209dbb7d1", size = 7858271, upload-time = "2026-03-31T22:01:03.343Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/be/6f/353954c29e7dcce7cf00280a02c75f30e133c00793c7a2ed3776d7b2f426/aiohttp-3.13.5-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:023ecba036ddd840b0b19bf195bfae970083fd7024ce1ac22e9bba90464620e9", size = 748876, upload-time = "2026-03-31T21:57:36.319Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/1b/428a7c64687b3b2e9cd293186695affc0e1e54a445d0361743b231f11066/aiohttp-3.13.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:15c933ad7920b7d9a20de151efcd05a6e38302cbf0e10c9b2acb9a42210a2416", size = 499557, upload-time = "2026-03-31T21:57:38.236Z" },
+    { url = "https://files.pythonhosted.org/packages/29/47/7be41556bfbb6917069d6a6634bb7dd5e163ba445b783a90d40f5ac7e3a7/aiohttp-3.13.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ab2899f9fa2f9f741896ebb6fa07c4c883bfa5c7f2ddd8cf2aafa86fa981b2d2", size = 500258, upload-time = "2026-03-31T21:57:39.923Z" },
+    { url = "https://files.pythonhosted.org/packages/67/84/c9ecc5828cb0b3695856c07c0a6817a99d51e2473400f705275a2b3d9239/aiohttp-3.13.5-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a60eaa2d440cd4707696b52e40ed3e2b0f73f65be07fd0ef23b6b539c9c0b0b4", size = 1749199, upload-time = "2026-03-31T21:57:41.938Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/d3/3c6d610e66b495657622edb6ae7c7fd31b2e9086b4ec50b47897ad6042a9/aiohttp-3.13.5-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:55b3bdd3292283295774ab585160c4004f4f2f203946997f49aac032c84649e9", size = 1721013, upload-time = "2026-03-31T21:57:43.904Z" },
+    { url = "https://files.pythonhosted.org/packages/49/a0/24409c12217456df0bae7babe3b014e460b0b38a8e60753d6cb339f6556d/aiohttp-3.13.5-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c2b2355dc094e5f7d45a7bb262fe7207aa0460b37a0d87027dcf21b5d890e7d5", size = 1781501, upload-time = "2026-03-31T21:57:46.285Z" },
+    { url = "https://files.pythonhosted.org/packages/98/9d/b65ec649adc5bccc008b0957a9a9c691070aeac4e41cea18559fef49958b/aiohttp-3.13.5-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b38765950832f7d728297689ad78f5f2cf79ff82487131c4d26fe6ceecdc5f8e", size = 1878981, upload-time = "2026-03-31T21:57:48.734Z" },
+    { url = "https://files.pythonhosted.org/packages/57/d8/8d44036d7eb7b6a8ec4c5494ea0c8c8b94fbc0ed3991c1a7adf230df03bf/aiohttp-3.13.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b18f31b80d5a33661e08c89e202edabf1986e9b49c42b4504371daeaa11b47c1", size = 1767934, upload-time = "2026-03-31T21:57:51.171Z" },
+    { url = "https://files.pythonhosted.org/packages/31/04/d3f8211f273356f158e3464e9e45484d3fb8c4ce5eb2f6fe9405c3273983/aiohttp-3.13.5-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:33add2463dde55c4f2d9635c6ab33ce154e5ecf322bd26d09af95c5f81cfa286", size = 1566671, upload-time = "2026-03-31T21:57:53.326Z" },
+    { url = "https://files.pythonhosted.org/packages/41/db/073e4ebe00b78e2dfcacff734291651729a62953b48933d765dc513bf798/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:327cc432fdf1356fb4fbc6fe833ad4e9f6aacb71a8acaa5f1855e4b25910e4a9", size = 1705219, upload-time = "2026-03-31T21:57:55.385Z" },
+    { url = "https://files.pythonhosted.org/packages/48/45/7dfba71a2f9fd97b15c95c06819de7eb38113d2cdb6319669195a7d64270/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:7c35b0bf0b48a70b4cb4fc5d7bed9b932532728e124874355de1a0af8ec4bc88", size = 1743049, upload-time = "2026-03-31T21:57:57.341Z" },
+    { url = "https://files.pythonhosted.org/packages/18/71/901db0061e0f717d226386a7f471bb59b19566f2cae5f0d93874b017271f/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:df23d57718f24badef8656c49743e11a89fd6f5358fa8a7b96e728fda2abf7d3", size = 1749557, upload-time = "2026-03-31T21:57:59.626Z" },
+    { url = "https://files.pythonhosted.org/packages/08/d5/41eebd16066e59cd43728fe74bce953d7402f2b4ddfdfef2c0e9f17ca274/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:02e048037a6501a5ec1f6fc9736135aec6eb8a004ce48838cb951c515f32c80b", size = 1558931, upload-time = "2026-03-31T21:58:01.972Z" },
+    { url = "https://files.pythonhosted.org/packages/30/e6/4a799798bf05740e66c3a1161079bda7a3dd8e22ca392481d7a7f9af82a6/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:31cebae8b26f8a615d2b546fee45d5ffb76852ae6450e2a03f42c9102260d6fe", size = 1774125, upload-time = "2026-03-31T21:58:04.007Z" },
+    { url = "https://files.pythonhosted.org/packages/84/63/7749337c90f92bc2cb18f9560d67aa6258c7060d1397d21529b8004fcf6f/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:888e78eb5ca55a615d285c3c09a7a91b42e9dd6fc699b166ebd5dee87c9ccf14", size = 1732427, upload-time = "2026-03-31T21:58:06.337Z" },
+    { url = "https://files.pythonhosted.org/packages/98/de/cf2f44ff98d307e72fb97d5f5bbae3bfcb442f0ea9790c0bf5c5c2331404/aiohttp-3.13.5-cp312-cp312-win32.whl", hash = "sha256:8bd3ec6376e68a41f9f95f5ed170e2fcf22d4eb27a1f8cb361d0508f6e0557f3", size = 433534, upload-time = "2026-03-31T21:58:08.712Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/ca/eadf6f9c8fa5e31d40993e3db153fb5ed0b11008ad5d9de98a95045bed84/aiohttp-3.13.5-cp312-cp312-win_amd64.whl", hash = "sha256:110e448e02c729bcebb18c60b9214a87ba33bac4a9fa5e9a5f139938b56c6cb1", size = 460446, upload-time = "2026-03-31T21:58:10.945Z" },
+    { url = "https://files.pythonhosted.org/packages/78/e9/d76bf503005709e390122d34e15256b88f7008e246c4bdbe915cd4f1adce/aiohttp-3.13.5-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a5029cc80718bbd545123cd8fe5d15025eccaaaace5d0eeec6bd556ad6163d61", size = 742930, upload-time = "2026-03-31T21:58:13.155Z" },
+    { url = "https://files.pythonhosted.org/packages/57/00/4b7b70223deaebd9bb85984d01a764b0d7bd6526fcdc73cca83bcbe7243e/aiohttp-3.13.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4bb6bf5811620003614076bdc807ef3b5e38244f9d25ca5fe888eaccea2a9832", size = 496927, upload-time = "2026-03-31T21:58:15.073Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/f5/0fb20fb49f8efdcdce6cd8127604ad2c503e754a8f139f5e02b01626523f/aiohttp-3.13.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a84792f8631bf5a94e52d9cc881c0b824ab42717165a5579c760b830d9392ac9", size = 497141, upload-time = "2026-03-31T21:58:17.009Z" },
+    { url = "https://files.pythonhosted.org/packages/3b/86/b7c870053e36a94e8951b803cb5b909bfbc9b90ca941527f5fcafbf6b0fa/aiohttp-3.13.5-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:57653eac22c6a4c13eb22ecf4d673d64a12f266e72785ab1c8b8e5940d0e8090", size = 1732476, upload-time = "2026-03-31T21:58:18.925Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/e5/4e161f84f98d80c03a238671b4136e6530453d65262867d989bbe78244d0/aiohttp-3.13.5-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:e5e5f7debc7a57af53fdf5c5009f9391d9f4c12867049d509bf7bb164a6e295b", size = 1706507, upload-time = "2026-03-31T21:58:21.094Z" },
+    { url = "https://files.pythonhosted.org/packages/d4/56/ea11a9f01518bd5a2a2fcee869d248c4b8a0cfa0bb13401574fa31adf4d4/aiohttp-3.13.5-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c719f65bebcdf6716f10e9eff80d27567f7892d8988c06de12bbbd39307c6e3a", size = 1773465, upload-time = "2026-03-31T21:58:23.159Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/40/333ca27fb74b0383f17c90570c748f7582501507307350a79d9f9f3c6eb1/aiohttp-3.13.5-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d97f93fdae594d886c5a866636397e2bcab146fd7a132fd6bb9ce182224452f8", size = 1873523, upload-time = "2026-03-31T21:58:25.59Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/d2/e2f77eef1acb7111405433c707dc735e63f67a56e176e72e9e7a2cd3f493/aiohttp-3.13.5-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3df334e39d4c2f899a914f1dba283c1aadc311790733f705182998c6f7cae665", size = 1754113, upload-time = "2026-03-31T21:58:27.624Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/56/3f653d7f53c89669301ec9e42c95233e2a0c0a6dd051269e6e678db4fdb0/aiohttp-3.13.5-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:fe6970addfea9e5e081401bcbadf865d2b6da045472f58af08427e108d618540", size = 1562351, upload-time = "2026-03-31T21:58:29.918Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/a6/9b3e91eb8ae791cce4ee736da02211c85c6f835f1bdfac0594a8a3b7018c/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:7becdf835feff2f4f335d7477f121af787e3504b48b449ff737afb35869ba7bb", size = 1693205, upload-time = "2026-03-31T21:58:32.214Z" },
+    { url = "https://files.pythonhosted.org/packages/98/fc/bfb437a99a2fcebd6b6eaec609571954de2ed424f01c352f4b5504371dd3/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:676e5651705ad5d8a70aeb8eb6936c436d8ebbd56e63436cb7dd9bb36d2a9a46", size = 1730618, upload-time = "2026-03-31T21:58:34.728Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/b6/c8534862126191a034f68153194c389addc285a0f1347d85096d349bbc15/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:9b16c653d38eb1a611cc898c41e76859ca27f119d25b53c12875fd0474ae31a8", size = 1745185, upload-time = "2026-03-31T21:58:36.909Z" },
+    { url = "https://files.pythonhosted.org/packages/0b/93/4ca8ee2ef5236e2707e0fd5fecb10ce214aee1ff4ab307af9c558bda3b37/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:999802d5fa0389f58decd24b537c54aa63c01c3219ce17d1214cbda3c2b22d2d", size = 1557311, upload-time = "2026-03-31T21:58:39.38Z" },
+    { url = "https://files.pythonhosted.org/packages/57/ae/76177b15f18c5f5d094f19901d284025db28eccc5ae374d1d254181d33f4/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:ec707059ee75732b1ba130ed5f9580fe10ff75180c812bc267ded039db5128c6", size = 1773147, upload-time = "2026-03-31T21:58:41.476Z" },
+    { url = "https://files.pythonhosted.org/packages/01/a4/62f05a0a98d88af59d93b7fcac564e5f18f513cb7471696ac286db970d6a/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:2d6d44a5b48132053c2f6cd5c8cb14bc67e99a63594e336b0f2af81e94d5530c", size = 1730356, upload-time = "2026-03-31T21:58:44.049Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/85/fc8601f59dfa8c9523808281f2da571f8b4699685f9809a228adcc90838d/aiohttp-3.13.5-cp313-cp313-win32.whl", hash = "sha256:329f292ed14d38a6c4c435e465f48bebb47479fd676a0411936cc371643225cc", size = 432637, upload-time = "2026-03-31T21:58:46.167Z" },
+    { url = "https://files.pythonhosted.org/packages/c0/1b/ac685a8882896acf0f6b31d689e3792199cfe7aba37969fa91da63a7fa27/aiohttp-3.13.5-cp313-cp313-win_amd64.whl", hash = "sha256:69f571de7500e0557801c0b51f4780482c0ec5fe2ac851af5a92cfce1af1cb83", size = 458896, upload-time = "2026-03-31T21:58:48.119Z" },
+]
+
+[[package]]
+name = "aiosignal"
+version = "1.4.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "frozenlist" },
+    { name = "typing-extensions", marker = "python_full_version < '3.13'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/61/62/06741b579156360248d1ec624842ad0edf697050bbaf7c3e46394e106ad1/aiosignal-1.4.0.tar.gz", hash = "sha256:f47eecd9468083c2029cc99945502cb7708b082c232f9aca65da147157b251c7", size = 25007, upload-time = "2025-07-03T22:54:43.528Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e", size = 7490, upload-time = "2025-07-03T22:54:42.156Z" },
+]
+
 [[package]]
 name = "alembic"
 version = "1.18.4"
@@ -58,6 +131,25 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" },
 ]
 
+[[package]]
+name = "anthropic"
+version = "0.96.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "anyio" },
+    { name = "distro" },
+    { name = "docstring-parser" },
+    { name = "httpx" },
+    { name = "jiter" },
+    { name = "pydantic" },
+    { name = "sniffio" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/b9/7e/672f533dee813028d2c699bfd2a7f52c9118d7353680d9aa44b9e23f717f/anthropic-0.96.0.tar.gz", hash = "sha256:9de947b737f39452f68aa520f1c2239d44119c9b73b0fb6d4e6ca80f00279ee6", size = 658210, upload-time = "2026-04-16T14:28:02.846Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/48/5a/72f33204064b6e87601a71a6baf8d855769f8a0c1eaae8d06a1094872371/anthropic-0.96.0-py3-none-any.whl", hash = "sha256:9a6e335a354602a521cd9e777e92bfd46ba6e115bf9bbfe6135311e8fb2015b2", size = 635930, upload-time = "2026-04-16T14:28:01.436Z" },
+]
+
 [[package]]
 name = "anyio"
 version = "4.12.1"
@@ -128,6 +220,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/91/66/b25ccb84a246b470eb943b0107c07edcae51804912b824054b3413995a10/asyncpg-0.31.0-cp313-cp313-win_amd64.whl", hash = "sha256:dc5f2fa9916f292e5c5c8b2ac2813763bcd7f58e130055b4ad8a0531314201ab", size = 596569, upload-time = "2025-11-24T23:26:16.189Z" },
 ]
 
+[[package]]
+name = "attrs"
+version = "26.1.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/9a/8e/82a0fe20a541c03148528be8cac2408564a6c9a0cc7e9171802bc1d26985/attrs-26.1.0.tar.gz", hash = "sha256:d03ceb89cb322a8fd706d4fb91940737b6642aa36998fe130a9bc96c985eff32", size = 952055, upload-time = "2026-03-19T14:22:25.026Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/64/b4/17d4b0b2a2dc85a6df63d1157e028ed19f90d4cd97c36717afef2bc2f395/attrs-26.1.0-py3-none-any.whl", hash = "sha256:c647aa4a12dfbad9333ca4e71fe62ddc36f4e63b2d260a37a8b83d2f043ac309", size = 67548, upload-time = "2026-03-19T14:22:23.645Z" },
+]
+
 [[package]]
 name = "balanceteshaters"
 version = "0.1.0"
@@ -135,20 +236,26 @@ source = { virtual = "." }
 dependencies = [
     { name = "accelerate" },
     { name = "alembic" },
+    { name = "anthropic" },
     { name = "argon2-cffi" },
     { name = "asyncpg" },
     { name = "bitsandbytes" },
     { name = "colorlog" },
+    { name = "datasets" },
     { name = "dependency-injector", extra = ["pydantic2"] },
     { name = "fastapi", extra = ["standard"] },
+    { name = "lightgbm" },
     { name = "lingua-language-detector" },
     { name = "llama-cpp-python" },
     { name = "ollama" },
     { name = "pandas" },
+    { name = "peft" },
     { name = "pyjwt" },
+    { name = "python-dotenv" },
     { name = "python-multipart" },
     { name = "requests" },
     { name = "scikit-learn" },
+    { name = "sentence-transformers" },
     { name = "sqlalchemy", extra = ["asyncio"] },
     { name = "torch" },
     { name = "tqdm" },
@@ -160,20 +267,26 @@ dependencies = [
 requires-dist = [
     { name = "accelerate", specifier = ">=1.13.0" },
     { name = "alembic", specifier = ">=1.17.2" },
+    { name = "anthropic", specifier = ">=0.50.0" },
     { name = "argon2-cffi", specifier = ">=25.1.0" },
     { name = "asyncpg", specifier = ">=0.31.0" },
     { name = "bitsandbytes", specifier = ">=0.49.2" },
     { name = "colorlog", specifier = ">=6.10.1" },
+    { name = "datasets", specifier = ">=3.0.0" },
     { name = "dependency-injector", extras = ["pydantic2"], specifier = ">=4.48.2" },
     { name = "fastapi", extras = ["standard"], specifier = ">=0.122.0" },
+    { name = "lightgbm", specifier = ">=4.0.0" },
     { name = "lingua-language-detector", specifier = ">=2.1.1" },
     { name = "llama-cpp-python", specifier = ">=0.3.0" },
     { name = "ollama", specifier = ">=0.6.1" },
     { name = "pandas", specifier = ">=3.0.1" },
+    { name = "peft", specifier = ">=0.14.0" },
     { name = "pyjwt", specifier = ">=2.10.1" },
+    { name = "python-dotenv", specifier = ">=1.0.0" },
     { name = "python-multipart", specifier = ">=0.0.20" },
     { name = "requests", specifier = ">=2.32.5" },
     { name = "scikit-learn", specifier = ">=1.8.0" },
+    { name = "sentence-transformers", specifier = ">=3.0.0" },
     { name = "sqlalchemy", extras = ["asyncio"], specifier = ">=2.0.44" },
     { name = "torch", specifier = ">=2.11.0" },
     { name = "tqdm", specifier = ">=4.67.3" },
@@ -380,6 +493,31 @@ nvtx = [
     { name = "nvidia-nvtx", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
 ]
 
+[[package]]
+name = "datasets"
+version = "4.8.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "dill" },
+    { name = "filelock" },
+    { name = "fsspec", extra = ["http"] },
+    { name = "httpx" },
+    { name = "huggingface-hub" },
+    { name = "multiprocess" },
+    { name = "numpy" },
+    { name = "packaging" },
+    { name = "pandas" },
+    { name = "pyarrow" },
+    { name = "pyyaml" },
+    { name = "requests" },
+    { name = "tqdm" },
+    { name = "xxhash" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/22/22/73e46ac7a8c25e7ef0b3bd6f10da3465021d90219a32eb0b4d2afea4c56e/datasets-4.8.4.tar.gz", hash = "sha256:a1429ed853275ce7943a01c6d2e25475b4501eb758934362106a280470df3a52", size = 604382, upload-time = "2026-03-23T14:21:17.987Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b0/e5/247d094108e42ac26363ab8dc57f168840cf7c05774b40ffeb0d78868fcc/datasets-4.8.4-py3-none-any.whl", hash = "sha256:cdc8bee4698e549d78bf1fed6aea2eebc760b22b084f07e6fc020c6577a6ce6d", size = 526991, upload-time = "2026-03-23T14:21:15.89Z" },
+]
+
 [[package]]
 name = "dependency-injector"
 version = "4.48.3"
@@ -403,6 +541,15 @@ pydantic2 = [
     { name = "pydantic-settings" },
 ]
 
+[[package]]
+name = "dill"
+version = "0.4.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/81/e1/56027a71e31b02ddc53c7d65b01e68edf64dea2932122fe7746a516f75d5/dill-0.4.1.tar.gz", hash = "sha256:423092df4182177d4d8ba8290c8a5b640c66ab35ec7da59ccfa00f6fa3eea5fa", size = 187315, upload-time = "2026-01-19T02:36:56.85Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/1e/77/dc8c558f7593132cf8fefec57c4f60c83b16941c574ac5f619abb3ae7933/dill-0.4.1-py3-none-any.whl", hash = "sha256:1e1ce33e978ae97fcfcff5638477032b801c46c7c65cf717f95fbc2248f79a9d", size = 120019, upload-time = "2026-01-19T02:36:55.663Z" },
+]
+
 [[package]]
 name = "diskcache"
 version = "5.6.3"
@@ -412,6 +559,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/3f/27/4570e78fc0bf5ea0ca45eb1de3818a23787af9b390c0b0a0033a1b8236f9/diskcache-5.6.3-py3-none-any.whl", hash = "sha256:5e31b2d5fbad117cc363ebaf6b689474db18a1f6438bc82358b024abd4c2ca19", size = 45550, upload-time = "2023-08-31T06:11:58.822Z" },
 ]
 
+[[package]]
+name = "distro"
+version = "1.9.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/fc/f8/98eea607f65de6527f8a2e8885fc8015d3e6f5775df186e443e0964a11c3/distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed", size = 60722, upload-time = "2023-12-24T09:54:32.31Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" },
+]
+
 [[package]]
 name = "dnspython"
 version = "2.8.0"
@@ -421,6 +577,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ba/5a/18ad964b0086c6e62e2e7500f7edc89e3faa45033c71c1893d34eed2b2de/dnspython-2.8.0-py3-none-any.whl", hash = "sha256:01d9bbc4a2d76bf0db7c1f729812ded6d912bd318d3b1cf81d30c0f845dbf3af", size = 331094, upload-time = "2025-09-07T18:57:58.071Z" },
 ]
 
+[[package]]
+name = "docstring-parser"
+version = "0.18.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/e0/4d/f332313098c1de1b2d2ff91cf2674415cc7cddab2ca1b01ae29774bd5fdf/docstring_parser-0.18.0.tar.gz", hash = "sha256:292510982205c12b1248696f44959db3cdd1740237a968ea1e2e7a900eeb2015", size = 29341, upload-time = "2026-04-14T04:09:19.867Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a7/5f/ed01f9a3cdffbd5a008556fc7b2a08ddb1cc6ace7effa7340604b1d16699/docstring_parser-0.18.0-py3-none-any.whl", hash = "sha256:b3fcbed555c47d8479be0796ef7e19c2670d428d72e96da63f3a40122860374b", size = 22484, upload-time = "2026-04-14T04:09:18.638Z" },
+]
+
 [[package]]
 name = "email-validator"
 version = "2.3.0"
@@ -548,6 +713,63 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/a4/a5/842ae8f0c08b61d6484b52f99a03510a3a72d23141942d216ebe81fefbce/filelock-3.25.2-py3-none-any.whl", hash = "sha256:ca8afb0da15f229774c9ad1b455ed96e85a81373065fb10446672f64444ddf70", size = 26759, upload-time = "2026-03-11T20:45:37.437Z" },
 ]
 
+[[package]]
+name = "frozenlist"
+version = "1.8.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/2d/f5/c831fac6cc817d26fd54c7eaccd04ef7e0288806943f7cc5bbf69f3ac1f0/frozenlist-1.8.0.tar.gz", hash = "sha256:3ede829ed8d842f6cd48fc7081d7a41001a56f1f38603f9d49bf3020d59a31ad", size = 45875, upload-time = "2025-10-06T05:38:17.865Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/69/29/948b9aa87e75820a38650af445d2ef2b6b8a6fab1a23b6bb9e4ef0be2d59/frozenlist-1.8.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:78f7b9e5d6f2fdb88cdde9440dc147259b62b9d3b019924def9f6478be254ac1", size = 87782, upload-time = "2025-10-06T05:36:06.649Z" },
+    { url = "https://files.pythonhosted.org/packages/64/80/4f6e318ee2a7c0750ed724fa33a4bdf1eacdc5a39a7a24e818a773cd91af/frozenlist-1.8.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:229bf37d2e4acdaf808fd3f06e854a4a7a3661e871b10dc1f8f1896a3b05f18b", size = 50594, upload-time = "2025-10-06T05:36:07.69Z" },
+    { url = "https://files.pythonhosted.org/packages/2b/94/5c8a2b50a496b11dd519f4a24cb5496cf125681dd99e94c604ccdea9419a/frozenlist-1.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f833670942247a14eafbb675458b4e61c82e002a148f49e68257b79296e865c4", size = 50448, upload-time = "2025-10-06T05:36:08.78Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/bd/d91c5e39f490a49df14320f4e8c80161cfcce09f1e2cde1edd16a551abb3/frozenlist-1.8.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:494a5952b1c597ba44e0e78113a7266e656b9794eec897b19ead706bd7074383", size = 242411, upload-time = "2025-10-06T05:36:09.801Z" },
+    { url = "https://files.pythonhosted.org/packages/8f/83/f61505a05109ef3293dfb1ff594d13d64a2324ac3482be2cedc2be818256/frozenlist-1.8.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:96f423a119f4777a4a056b66ce11527366a8bb92f54e541ade21f2374433f6d4", size = 243014, upload-time = "2025-10-06T05:36:11.394Z" },
+    { url = "https://files.pythonhosted.org/packages/d8/cb/cb6c7b0f7d4023ddda30cf56b8b17494eb3a79e3fda666bf735f63118b35/frozenlist-1.8.0-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3462dd9475af2025c31cc61be6652dfa25cbfb56cbbf52f4ccfe029f38decaf8", size = 234909, upload-time = "2025-10-06T05:36:12.598Z" },
+    { url = "https://files.pythonhosted.org/packages/31/c5/cd7a1f3b8b34af009fb17d4123c5a778b44ae2804e3ad6b86204255f9ec5/frozenlist-1.8.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c4c800524c9cd9bac5166cd6f55285957fcfc907db323e193f2afcd4d9abd69b", size = 250049, upload-time = "2025-10-06T05:36:14.065Z" },
+    { url = "https://files.pythonhosted.org/packages/c0/01/2f95d3b416c584a1e7f0e1d6d31998c4a795f7544069ee2e0962a4b60740/frozenlist-1.8.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d6a5df73acd3399d893dafc71663ad22534b5aa4f94e8a2fabfe856c3c1b6a52", size = 256485, upload-time = "2025-10-06T05:36:15.39Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/03/024bf7720b3abaebcff6d0793d73c154237b85bdf67b7ed55e5e9596dc9a/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:405e8fe955c2280ce66428b3ca55e12b3c4e9c336fb2103a4937e891c69a4a29", size = 237619, upload-time = "2025-10-06T05:36:16.558Z" },
+    { url = "https://files.pythonhosted.org/packages/69/fa/f8abdfe7d76b731f5d8bd217827cf6764d4f1d9763407e42717b4bed50a0/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:908bd3f6439f2fef9e85031b59fd4f1297af54415fb60e4254a95f75b3cab3f3", size = 250320, upload-time = "2025-10-06T05:36:17.821Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/3c/b051329f718b463b22613e269ad72138cc256c540f78a6de89452803a47d/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:294e487f9ec720bd8ffcebc99d575f7eff3568a08a253d1ee1a0378754b74143", size = 246820, upload-time = "2025-10-06T05:36:19.046Z" },
+    { url = "https://files.pythonhosted.org/packages/0f/ae/58282e8f98e444b3f4dd42448ff36fa38bef29e40d40f330b22e7108f565/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:74c51543498289c0c43656701be6b077f4b265868fa7f8a8859c197006efb608", size = 250518, upload-time = "2025-10-06T05:36:20.763Z" },
+    { url = "https://files.pythonhosted.org/packages/8f/96/007e5944694d66123183845a106547a15944fbbb7154788cbf7272789536/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:776f352e8329135506a1d6bf16ac3f87bc25b28e765949282dcc627af36123aa", size = 239096, upload-time = "2025-10-06T05:36:22.129Z" },
+    { url = "https://files.pythonhosted.org/packages/66/bb/852b9d6db2fa40be96f29c0d1205c306288f0684df8fd26ca1951d461a56/frozenlist-1.8.0-cp312-cp312-win32.whl", hash = "sha256:433403ae80709741ce34038da08511d4a77062aa924baf411ef73d1146e74faf", size = 39985, upload-time = "2025-10-06T05:36:23.661Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/af/38e51a553dd66eb064cdf193841f16f077585d4d28394c2fa6235cb41765/frozenlist-1.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:34187385b08f866104f0c0617404c8eb08165ab1272e884abc89c112e9c00746", size = 44591, upload-time = "2025-10-06T05:36:24.958Z" },
+    { url = "https://files.pythonhosted.org/packages/a7/06/1dc65480ab147339fecc70797e9c2f69d9cea9cf38934ce08df070fdb9cb/frozenlist-1.8.0-cp312-cp312-win_arm64.whl", hash = "sha256:fe3c58d2f5db5fbd18c2987cba06d51b0529f52bc3a6cdc33d3f4eab725104bd", size = 40102, upload-time = "2025-10-06T05:36:26.333Z" },
+    { url = "https://files.pythonhosted.org/packages/2d/40/0832c31a37d60f60ed79e9dfb5a92e1e2af4f40a16a29abcc7992af9edff/frozenlist-1.8.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8d92f1a84bb12d9e56f818b3a746f3efba93c1b63c8387a73dde655e1e42282a", size = 85717, upload-time = "2025-10-06T05:36:27.341Z" },
+    { url = "https://files.pythonhosted.org/packages/30/ba/b0b3de23f40bc55a7057bd38434e25c34fa48e17f20ee273bbde5e0650f3/frozenlist-1.8.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:96153e77a591c8adc2ee805756c61f59fef4cf4073a9275ee86fe8cba41241f7", size = 49651, upload-time = "2025-10-06T05:36:28.855Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/ab/6e5080ee374f875296c4243c381bbdef97a9ac39c6e3ce1d5f7d42cb78d6/frozenlist-1.8.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f21f00a91358803399890ab167098c131ec2ddd5f8f5fd5fe9c9f2c6fcd91e40", size = 49417, upload-time = "2025-10-06T05:36:29.877Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/4e/e4691508f9477ce67da2015d8c00acd751e6287739123113a9fca6f1604e/frozenlist-1.8.0-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:fb30f9626572a76dfe4293c7194a09fb1fe93ba94c7d4f720dfae3b646b45027", size = 234391, upload-time = "2025-10-06T05:36:31.301Z" },
+    { url = "https://files.pythonhosted.org/packages/40/76/c202df58e3acdf12969a7895fd6f3bc016c642e6726aa63bd3025e0fc71c/frozenlist-1.8.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eaa352d7047a31d87dafcacbabe89df0aa506abb5b1b85a2fb91bc3faa02d822", size = 233048, upload-time = "2025-10-06T05:36:32.531Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/c0/8746afb90f17b73ca5979c7a3958116e105ff796e718575175319b5bb4ce/frozenlist-1.8.0-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:03ae967b4e297f58f8c774c7eabcce57fe3c2434817d4385c50661845a058121", size = 226549, upload-time = "2025-10-06T05:36:33.706Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/eb/4c7eefc718ff72f9b6c4893291abaae5fbc0c82226a32dcd8ef4f7a5dbef/frozenlist-1.8.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f6292f1de555ffcc675941d65fffffb0a5bcd992905015f85d0592201793e0e5", size = 239833, upload-time = "2025-10-06T05:36:34.947Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/4e/e5c02187cf704224f8b21bee886f3d713ca379535f16893233b9d672ea71/frozenlist-1.8.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:29548f9b5b5e3460ce7378144c3010363d8035cea44bc0bf02d57f5a685e084e", size = 245363, upload-time = "2025-10-06T05:36:36.534Z" },
+    { url = "https://files.pythonhosted.org/packages/1f/96/cb85ec608464472e82ad37a17f844889c36100eed57bea094518bf270692/frozenlist-1.8.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ec3cc8c5d4084591b4237c0a272cc4f50a5b03396a47d9caaf76f5d7b38a4f11", size = 229314, upload-time = "2025-10-06T05:36:38.582Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/6f/4ae69c550e4cee66b57887daeebe006fe985917c01d0fff9caab9883f6d0/frozenlist-1.8.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:517279f58009d0b1f2e7c1b130b377a349405da3f7621ed6bfae50b10adf20c1", size = 243365, upload-time = "2025-10-06T05:36:40.152Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/58/afd56de246cf11780a40a2c28dc7cbabbf06337cc8ddb1c780a2d97e88d8/frozenlist-1.8.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:db1e72ede2d0d7ccb213f218df6a078a9c09a7de257c2fe8fcef16d5925230b1", size = 237763, upload-time = "2025-10-06T05:36:41.355Z" },
+    { url = "https://files.pythonhosted.org/packages/cb/36/cdfaf6ed42e2644740d4a10452d8e97fa1c062e2a8006e4b09f1b5fd7d63/frozenlist-1.8.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:b4dec9482a65c54a5044486847b8a66bf10c9cb4926d42927ec4e8fd5db7fed8", size = 240110, upload-time = "2025-10-06T05:36:42.716Z" },
+    { url = "https://files.pythonhosted.org/packages/03/a8/9ea226fbefad669f11b52e864c55f0bd57d3c8d7eb07e9f2e9a0b39502e1/frozenlist-1.8.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:21900c48ae04d13d416f0e1e0c4d81f7931f73a9dfa0b7a8746fb2fe7dd970ed", size = 233717, upload-time = "2025-10-06T05:36:44.251Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/0b/1b5531611e83ba7d13ccc9988967ea1b51186af64c42b7a7af465dcc9568/frozenlist-1.8.0-cp313-cp313-win32.whl", hash = "sha256:8b7b94a067d1c504ee0b16def57ad5738701e4ba10cec90529f13fa03c833496", size = 39628, upload-time = "2025-10-06T05:36:45.423Z" },
+    { url = "https://files.pythonhosted.org/packages/d8/cf/174c91dbc9cc49bc7b7aab74d8b734e974d1faa8f191c74af9b7e80848e6/frozenlist-1.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:878be833caa6a3821caf85eb39c5ba92d28e85df26d57afb06b35b2efd937231", size = 43882, upload-time = "2025-10-06T05:36:46.796Z" },
+    { url = "https://files.pythonhosted.org/packages/c1/17/502cd212cbfa96eb1388614fe39a3fc9ab87dbbe042b66f97acb57474834/frozenlist-1.8.0-cp313-cp313-win_arm64.whl", hash = "sha256:44389d135b3ff43ba8cc89ff7f51f5a0bb6b63d829c8300f79a2fe4fe61bcc62", size = 39676, upload-time = "2025-10-06T05:36:47.8Z" },
+    { url = "https://files.pythonhosted.org/packages/d2/5c/3bbfaa920dfab09e76946a5d2833a7cbdf7b9b4a91c714666ac4855b88b4/frozenlist-1.8.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:e25ac20a2ef37e91c1b39938b591457666a0fa835c7783c3a8f33ea42870db94", size = 89235, upload-time = "2025-10-06T05:36:48.78Z" },
+    { url = "https://files.pythonhosted.org/packages/d2/d6/f03961ef72166cec1687e84e8925838442b615bd0b8854b54923ce5b7b8a/frozenlist-1.8.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:07cdca25a91a4386d2e76ad992916a85038a9b97561bf7a3fd12d5d9ce31870c", size = 50742, upload-time = "2025-10-06T05:36:49.837Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/bb/a6d12b7ba4c3337667d0e421f7181c82dda448ce4e7ad7ecd249a16fa806/frozenlist-1.8.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:4e0c11f2cc6717e0a741f84a527c52616140741cd812a50422f83dc31749fb52", size = 51725, upload-time = "2025-10-06T05:36:50.851Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/71/d1fed0ffe2c2ccd70b43714c6cab0f4188f09f8a67a7914a6b46ee30f274/frozenlist-1.8.0-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b3210649ee28062ea6099cfda39e147fa1bc039583c8ee4481cb7811e2448c51", size = 284533, upload-time = "2025-10-06T05:36:51.898Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/1f/fb1685a7b009d89f9bf78a42d94461bc06581f6e718c39344754a5d9bada/frozenlist-1.8.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:581ef5194c48035a7de2aefc72ac6539823bb71508189e5de01d60c9dcd5fa65", size = 292506, upload-time = "2025-10-06T05:36:53.101Z" },
+    { url = "https://files.pythonhosted.org/packages/e6/3b/b991fe1612703f7e0d05c0cf734c1b77aaf7c7d321df4572e8d36e7048c8/frozenlist-1.8.0-cp313-cp313t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3ef2d026f16a2b1866e1d86fc4e1291e1ed8a387b2c333809419a2f8b3a77b82", size = 274161, upload-time = "2025-10-06T05:36:54.309Z" },
+    { url = "https://files.pythonhosted.org/packages/ca/ec/c5c618767bcdf66e88945ec0157d7f6c4a1322f1473392319b7a2501ded7/frozenlist-1.8.0-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:5500ef82073f599ac84d888e3a8c1f77ac831183244bfd7f11eaa0289fb30714", size = 294676, upload-time = "2025-10-06T05:36:55.566Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/ce/3934758637d8f8a88d11f0585d6495ef54b2044ed6ec84492a91fa3b27aa/frozenlist-1.8.0-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:50066c3997d0091c411a66e710f4e11752251e6d2d73d70d8d5d4c76442a199d", size = 300638, upload-time = "2025-10-06T05:36:56.758Z" },
+    { url = "https://files.pythonhosted.org/packages/fc/4f/a7e4d0d467298f42de4b41cbc7ddaf19d3cfeabaf9ff97c20c6c7ee409f9/frozenlist-1.8.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:5c1c8e78426e59b3f8005e9b19f6ff46e5845895adbde20ece9218319eca6506", size = 283067, upload-time = "2025-10-06T05:36:57.965Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/48/c7b163063d55a83772b268e6d1affb960771b0e203b632cfe09522d67ea5/frozenlist-1.8.0-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:eefdba20de0d938cec6a89bd4d70f346a03108a19b9df4248d3cf0d88f1b0f51", size = 292101, upload-time = "2025-10-06T05:36:59.237Z" },
+    { url = "https://files.pythonhosted.org/packages/9f/d0/2366d3c4ecdc2fd391e0afa6e11500bfba0ea772764d631bbf82f0136c9d/frozenlist-1.8.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:cf253e0e1c3ceb4aaff6df637ce033ff6535fb8c70a764a8f46aafd3d6ab798e", size = 289901, upload-time = "2025-10-06T05:37:00.811Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/94/daff920e82c1b70e3618a2ac39fbc01ae3e2ff6124e80739ce5d71c9b920/frozenlist-1.8.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:032efa2674356903cd0261c4317a561a6850f3ac864a63fc1583147fb05a79b0", size = 289395, upload-time = "2025-10-06T05:37:02.115Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/20/bba307ab4235a09fdcd3cc5508dbabd17c4634a1af4b96e0f69bfe551ebd/frozenlist-1.8.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6da155091429aeba16851ecb10a9104a108bcd32f6c1642867eadaee401c1c41", size = 283659, upload-time = "2025-10-06T05:37:03.711Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/00/04ca1c3a7a124b6de4f8a9a17cc2fcad138b4608e7a3fc5877804b8715d7/frozenlist-1.8.0-cp313-cp313t-win32.whl", hash = "sha256:0f96534f8bfebc1a394209427d0f8a63d343c9779cda6fc25e8e121b5fd8555b", size = 43492, upload-time = "2025-10-06T05:37:04.915Z" },
+    { url = "https://files.pythonhosted.org/packages/59/5e/c69f733a86a94ab10f68e496dc6b7e8bc078ebb415281d5698313e3af3a1/frozenlist-1.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:5d63a068f978fc69421fb0e6eb91a9603187527c86b7cd3f534a5b77a592b888", size = 48034, upload-time = "2025-10-06T05:37:06.343Z" },
+    { url = "https://files.pythonhosted.org/packages/16/6c/be9d79775d8abe79b05fa6d23da99ad6e7763a1d080fbae7290b286093fd/frozenlist-1.8.0-cp313-cp313t-win_arm64.whl", hash = "sha256:bf0a7e10b077bf5fb9380ad3ae8ce20ef919a6ad93b4552896419ac7e1d8e042", size = 41749, upload-time = "2025-10-06T05:37:07.431Z" },
+    { url = "https://files.pythonhosted.org/packages/9a/9a/e35b4a917281c0b8419d4207f4334c8e8c5dbf4f3f5f9ada73958d937dcc/frozenlist-1.8.0-py3-none-any.whl", hash = "sha256:0c18a16eab41e82c295618a77502e17b195883241c563b00f0aa5106fc4eaa0d", size = 13409, upload-time = "2025-10-06T05:38:16.721Z" },
+]
+
 [[package]]
 name = "fsspec"
 version = "2026.2.0"
@@ -557,6 +779,11 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e6/ab/fb21f4c939bb440104cc2b396d3be1d9b7a9fd3c6c2a53d98c45b3d7c954/fsspec-2026.2.0-py3-none-any.whl", hash = "sha256:98de475b5cb3bd66bedd5c4679e87b4fdfe1a3bf4d707b151b3c07e58c9a2437", size = 202505, upload-time = "2026-02-05T21:50:51.819Z" },
 ]
 
+[package.optional-dependencies]
+http = [
+    { name = "aiohttp" },
+]
+
 [[package]]
 name = "greenlet"
 version = "3.3.0"
@@ -705,6 +932,51 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" },
 ]
 
+[[package]]
+name = "jiter"
+version = "0.14.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/6e/c1/0cddc6eb17d4c53a99840953f95dd3accdc5cfc7a337b0e9b26476276be9/jiter-0.14.0.tar.gz", hash = "sha256:e8a39e66dac7153cf3f964a12aad515afa8d74938ec5cc0018adcdae5367c79e", size = 165725, upload-time = "2026-04-10T14:28:42.01Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/5a/68/7390a418f10897da93b158f2d5a8bd0bcd73a0f9ec3bb36917085bb759ef/jiter-0.14.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:2fb2ce3a7bc331256dfb14cefc34832366bb28a9aca81deaf43bbf2a5659e607", size = 316295, upload-time = "2026-04-10T14:26:24.887Z" },
+    { url = "https://files.pythonhosted.org/packages/60/a0/5854ac00ff63551c52c6c89534ec6aba4b93474e7924d64e860b1c94165b/jiter-0.14.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5252a7ca23785cef5d02d4ece6077a1b556a410c591b379f82091c3001e14844", size = 315898, upload-time = "2026-04-10T14:26:26.601Z" },
+    { url = "https://files.pythonhosted.org/packages/41/a1/4f44832650a16b18e8391f1bf1d6ca4909bc738351826bcc198bba4357f4/jiter-0.14.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c409578cbd77c338975670ada777add4efd53379667edf0aceea730cabede6fb", size = 343730, upload-time = "2026-04-10T14:26:28.326Z" },
+    { url = "https://files.pythonhosted.org/packages/48/64/a329e9d469f86307203594b1707e11ae51c3348d03bfd514a5f997870012/jiter-0.14.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7ede4331a1899d604463369c730dbb961ffdc5312bc7f16c41c2896415b1304a", size = 370102, upload-time = "2026-04-10T14:26:30.089Z" },
+    { url = "https://files.pythonhosted.org/packages/94/c1/5e3dfc59635aa4d4c7bd20a820ac1d09b8ed851568356802cf1c08edb3cf/jiter-0.14.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:92cd8b6025981a041f5310430310b55b25ca593972c16407af8837d3d7d2ca01", size = 461335, upload-time = "2026-04-10T14:26:31.911Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/1b/dd157009dbc058f7b00108f545ccb72a2d56461395c4fc7b9cfdccb00af4/jiter-0.14.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:351bf6eda4e3a7ceb876377840c702e9a3e4ecc4624dbfb2d6463c67ae52637d", size = 378536, upload-time = "2026-04-10T14:26:33.595Z" },
+    { url = "https://files.pythonhosted.org/packages/91/78/256013667b7c10b8834f8e6e54cd3e562d4c6e34227a1596addccc05e38c/jiter-0.14.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1dcfbeb93d9ecd9ca128bbf8910120367777973fa193fb9a39c31237d8df165", size = 353859, upload-time = "2026-04-10T14:26:35.098Z" },
+    { url = "https://files.pythonhosted.org/packages/de/d9/137d65ade9093a409fe80955ce60b12bb753722c986467aeda47faf450ad/jiter-0.14.0-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:ae039aaef8de3f8157ecc1fdd4d85043ac4f57538c245a0afaecb8321ec951c3", size = 357626, upload-time = "2026-04-10T14:26:36.685Z" },
+    { url = "https://files.pythonhosted.org/packages/2e/48/76750835b87029342727c1a268bea8878ab988caf81ee4e7b880900eeb5a/jiter-0.14.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7d9d51eb96c82a9652933bd769fe6de66877d6eb2b2440e281f2938c51b5643e", size = 393172, upload-time = "2026-04-10T14:26:38.097Z" },
+    { url = "https://files.pythonhosted.org/packages/a6/60/456c4e81d5c8045279aefe60e9e483be08793828800a4e64add8fdde7f2a/jiter-0.14.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d824ca4148b705970bf4e120924a212fdfca9859a73e42bd7889a63a4ea6bb98", size = 520300, upload-time = "2026-04-10T14:26:39.532Z" },
+    { url = "https://files.pythonhosted.org/packages/a8/9f/2020e0984c235f678dced38fe4eec3058cf528e6af36ebf969b410305941/jiter-0.14.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ff3a6465b3a0f54b1a430f45c3c0ba7d61ceb45cbc3e33f9e1a7f638d690baf3", size = 553059, upload-time = "2026-04-10T14:26:40.991Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/32/e2d298e1a22a4bbe6062136d1c7192db7dba003a6975e51d9a9eecabc4c2/jiter-0.14.0-cp312-cp312-win32.whl", hash = "sha256:5dec7c0a3e98d2a3f8a2e67382d0d7c3ac60c69103a4b271da889b4e8bb1e129", size = 206030, upload-time = "2026-04-10T14:26:42.517Z" },
+    { url = "https://files.pythonhosted.org/packages/36/ac/96369141b3d8a4a8e4590e983085efe1c436f35c0cda940dd76d942e3e40/jiter-0.14.0-cp312-cp312-win_amd64.whl", hash = "sha256:fc7e37b4b8bc7e80a63ad6cfa5fc11fab27dbfea4cc4ae644b1ab3f273dc348f", size = 201603, upload-time = "2026-04-10T14:26:44.328Z" },
+    { url = "https://files.pythonhosted.org/packages/01/c3/75d847f264647017d7e3052bbcc8b1e24b95fa139c320c5f5066fa7a0bdd/jiter-0.14.0-cp312-cp312-win_arm64.whl", hash = "sha256:ee4a72f12847ef29b072aee9ad5474041ab2924106bdca9fcf5d7d965853e057", size = 191525, upload-time = "2026-04-10T14:26:46Z" },
+    { url = "https://files.pythonhosted.org/packages/97/2a/09f70020898507a89279659a1afe3364d57fc1b2c89949081975d135f6f5/jiter-0.14.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:af72f204cf4d44258e5b4c1745130ac45ddab0e71a06333b01de660ab4187a94", size = 315502, upload-time = "2026-04-10T14:26:47.697Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/be/080c96a45cd74f9fce5db4fd68510b88087fb37ffe2541ff73c12db92535/jiter-0.14.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4b77da71f6e819be5fbcec11a453fde5b1d0267ef6ed487e2a392fd8e14e4e3a", size = 314870, upload-time = "2026-04-10T14:26:49.149Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/5e/2d0fee155826a968a832cc32438de5e2a193292c8721ca70d0b53e58245b/jiter-0.14.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77f4ea612fe8b84b8b04e51d0e78029ecf3466348e25973f953de6e6a59aa4c1", size = 343406, upload-time = "2026-04-10T14:26:50.762Z" },
+    { url = "https://files.pythonhosted.org/packages/70/af/bf9ee0d3a4f8dc0d679fc1337f874fe60cdbf841ebbb304b374e1c9aaceb/jiter-0.14.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:62fe2451f8fcc0240261e6a4df18ecbcd58327857e61e625b2393ea3b468aac9", size = 369415, upload-time = "2026-04-10T14:26:52.188Z" },
+    { url = "https://files.pythonhosted.org/packages/0f/83/8e8561eadba31f4d3948a5b712fb0447ec71c3560b57a855449e7b8ddc98/jiter-0.14.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6112f26f5afc75bcb475787d29da3aa92f9d09c7858f632f4be6ffe607be82e9", size = 461456, upload-time = "2026-04-10T14:26:53.611Z" },
+    { url = "https://files.pythonhosted.org/packages/f6/c9/c5299e826a5fe6108d172b344033f61c69b1bb979dd8d9ddd4278a160971/jiter-0.14.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:215a6cb8fb7dc702aa35d475cc00ddc7f970e5c0b1417fb4b4ac5d82fa2a29db", size = 378488, upload-time = "2026-04-10T14:26:55.211Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/37/c16d9d15c0a471b8644b1abe3c82668092a707d9bedcf076f24ff2e380cd/jiter-0.14.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc4ab96a30fb3cb2c7e0cd33f7616c8860da5f5674438988a54ac717caccdbaa", size = 353242, upload-time = "2026-04-10T14:26:56.705Z" },
+    { url = "https://files.pythonhosted.org/packages/58/ea/8050cb0dc654e728e1bfacbc0c640772f2181af5dedd13ae70145743a439/jiter-0.14.0-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:3a99c1387b1f2928f799a9de899193484d66206a50e98233b6b088a7f0c1edb2", size = 356823, upload-time = "2026-04-10T14:26:58.281Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/3b/cf71506d270e5f84d97326bf220e47aed9b95e9a4a060758fb07772170ab/jiter-0.14.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ab18d11074485438695f8d34a1b6da61db9754248f96d51341956607a8f39985", size = 392564, upload-time = "2026-04-10T14:27:00.018Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/cc/8c6c74a3efb5bd671bfd14f51e8a73375464ca914b1551bc3b40e26ac2c9/jiter-0.14.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:801028dcfc26ac0895e4964cbc0fd62c73be9fd4a7d7b1aaf6e5790033a719b7", size = 520322, upload-time = "2026-04-10T14:27:01.664Z" },
+    { url = "https://files.pythonhosted.org/packages/41/24/68d7b883ec959884ddf00d019b2e0e82ba81b167e1253684fa90519ce33c/jiter-0.14.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ad425b087aafb4a1c7e1e98a279200743b9aaf30c3e0ba723aec93f061bd9bc8", size = 552619, upload-time = "2026-04-10T14:27:03.316Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/89/b1a0985223bbf3150ff9e8f46f98fc9360c1de94f48abe271bbe1b465682/jiter-0.14.0-cp313-cp313-win32.whl", hash = "sha256:882bcb9b334318e233950b8be366fe5f92c86b66a7e449e76975dfd6d776a01f", size = 205699, upload-time = "2026-04-10T14:27:04.662Z" },
+    { url = "https://files.pythonhosted.org/packages/4c/19/3f339a5a7f14a11730e67f6be34f9d5105751d547b615ef593fa122a5ded/jiter-0.14.0-cp313-cp313-win_amd64.whl", hash = "sha256:9b8c571a5dba09b98bd3462b5a53f27209a5cbbe85670391692ede71974e979f", size = 201323, upload-time = "2026-04-10T14:27:06.139Z" },
+    { url = "https://files.pythonhosted.org/packages/50/56/752dd89c84be0e022a8ea3720bcfa0a8431db79a962578544812ce061739/jiter-0.14.0-cp313-cp313-win_arm64.whl", hash = "sha256:34f19dcc35cb1abe7c369b3756babf8c7f04595c0807a848df8f26ef8298ef92", size = 191099, upload-time = "2026-04-10T14:27:07.564Z" },
+    { url = "https://files.pythonhosted.org/packages/91/28/292916f354f25a1fe8cf2c918d1415c699a4a659ae00be0430e1c5d9ffea/jiter-0.14.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e89bcd7d426a75bb4952c696b267075790d854a07aad4c9894551a82c5b574ab", size = 320880, upload-time = "2026-04-10T14:27:09.326Z" },
+    { url = "https://files.pythonhosted.org/packages/ad/c7/b002a7d8b8957ac3d469bd59c18ef4b1595a5216ae0de639a287b9816023/jiter-0.14.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b25beaa0d4447ea8c7ae0c18c688905d34840d7d0b937f2f7bdd52162c98a40", size = 346563, upload-time = "2026-04-10T14:27:11.287Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/3b/f8d07580d8706021d255a6356b8fab13ee4c869412995550ce6ed4ddf97d/jiter-0.14.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:651a8758dd413c51e3b7f6557cdc6921faf70b14106f45f969f091f5cda990ea", size = 357928, upload-time = "2026-04-10T14:27:12.729Z" },
+    { url = "https://files.pythonhosted.org/packages/47/5b/ac1a974da29e35507230383110ffec59998b290a8732585d04e19a9eb5ba/jiter-0.14.0-cp313-cp313t-win_amd64.whl", hash = "sha256:e1a7eead856a5038a8d291f1447176ab0b525c77a279a058121b5fccee257f6f", size = 203519, upload-time = "2026-04-10T14:27:14.125Z" },
+    { url = "https://files.pythonhosted.org/packages/96/6d/9fc8433d667d2454271378a79747d8c76c10b51b482b454e6190e511f244/jiter-0.14.0-cp313-cp313t-win_arm64.whl", hash = "sha256:2e692633a12cda97e352fdcd1c4acc971b1c28707e1e33aeef782b0cbf051975", size = 190113, upload-time = "2026-04-10T14:27:16.638Z" },
+    { url = "https://files.pythonhosted.org/packages/21/42/9042c3f3019de4adcb8c16591c325ec7255beea9fcd33a42a43f3b0b1000/jiter-0.14.0-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:fbd9e482663ca9d005d051330e4d2d8150bb208a209409c10f7e7dfdf7c49da9", size = 308810, upload-time = "2026-04-10T14:28:34.673Z" },
+    { url = "https://files.pythonhosted.org/packages/60/cf/a7e19b308bd86bb04776803b1f01a5f9a287a4c55205f4708827ee487fbf/jiter-0.14.0-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:33a20d838b91ef376b3a56896d5b04e725c7df5bc4864cc6569cf046a8d73b6d", size = 308443, upload-time = "2026-04-10T14:28:36.658Z" },
+    { url = "https://files.pythonhosted.org/packages/ca/44/e26ede3f0caeff93f222559cb0cc4ca68579f07d009d7b6010c5b586f9b1/jiter-0.14.0-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:432c4db5255d86a259efde91e55cb4c8d18c0521d844c9e2e7efcce3899fb016", size = 343039, upload-time = "2026-04-10T14:28:38.356Z" },
+    { url = "https://files.pythonhosted.org/packages/da/e9/1f9ada30cef7b05e74bb06f52127e7a724976c225f46adb65c37b1dadfb6/jiter-0.14.0-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67f00d94b281174144d6532a04b66a12cb866cbdc47c3af3bfe2973677f9861a", size = 349613, upload-time = "2026-04-10T14:28:40.066Z" },
+]
+
 [[package]]
 name = "joblib"
 version = "1.5.3"
@@ -714,6 +986,23 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl", hash = "sha256:5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713", size = 309071, upload-time = "2025-12-15T08:41:44.973Z" },
 ]
 
+[[package]]
+name = "lightgbm"
+version = "4.6.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "numpy" },
+    { name = "scipy" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/68/0b/a2e9f5c5da7ef047cc60cef37f86185088845e8433e54d2e7ed439cce8a3/lightgbm-4.6.0.tar.gz", hash = "sha256:cb1c59720eb569389c0ba74d14f52351b573af489f230032a1c9f314f8bab7fe", size = 1703705, upload-time = "2025-02-15T04:03:03.111Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f2/75/cffc9962cca296bc5536896b7e65b4a7cdeb8db208e71b9c0133c08f8f7e/lightgbm-4.6.0-py3-none-macosx_10_15_x86_64.whl", hash = "sha256:b7a393de8a334d5c8e490df91270f0763f83f959574d504c7ccb9eee4aef70ed", size = 2010151, upload-time = "2025-02-15T04:02:50.961Z" },
+    { url = "https://files.pythonhosted.org/packages/21/1b/550ee378512b78847930f5d74228ca1fdba2a7fbdeaac9aeccc085b0e257/lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl", hash = "sha256:2dafd98d4e02b844ceb0b61450a660681076b1ea6c7adb8c566dfd66832aafad", size = 1592172, upload-time = "2025-02-15T04:02:53.937Z" },
+    { url = "https://files.pythonhosted.org/packages/64/41/4fbde2c3d29e25ee7c41d87df2f2e5eda65b431ee154d4d462c31041846c/lightgbm-4.6.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:4d68712bbd2b57a0b14390cbf9376c1d5ed773fa2e71e099cac588703b590336", size = 3454567, upload-time = "2025-02-15T04:02:56.443Z" },
+    { url = "https://files.pythonhosted.org/packages/42/86/dabda8fbcb1b00bcfb0003c3776e8ade1aa7b413dff0a2c08f457dace22f/lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:cb19b5afea55b5b61cbb2131095f50538bd608a00655f23ad5d25ae3e3bf1c8d", size = 3569831, upload-time = "2025-02-15T04:02:58.925Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/23/f8b28ca248bb629b9e08f877dd2965d1994e1674a03d67cd10c5246da248/lightgbm-4.6.0-py3-none-win_amd64.whl", hash = "sha256:37089ee95664b6550a7189d887dbf098e3eadab03537e411f52c63c121e3ba4b", size = 1451509, upload-time = "2025-02-15T04:03:01.515Z" },
+]
+
 [[package]]
 name = "lingua-language-detector"
 version = "2.1.1"
@@ -830,6 +1119,85 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198, upload-time = "2023-03-07T16:47:09.197Z" },
 ]
 
+[[package]]
+name = "multidict"
+version = "6.7.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/1a/c2/c2d94cbe6ac1753f3fc980da97b3d930efe1da3af3c9f5125354436c073d/multidict-6.7.1.tar.gz", hash = "sha256:ec6652a1bee61c53a3e5776b6049172c53b6aaba34f18c9ad04f82712bac623d", size = 102010, upload-time = "2026-01-26T02:46:45.979Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/8d/9c/f20e0e2cf80e4b2e4b1c365bf5fe104ee633c751a724246262db8f1a0b13/multidict-6.7.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a90f75c956e32891a4eda3639ce6dd86e87105271f43d43442a3aedf3cddf172", size = 76893, upload-time = "2026-01-26T02:43:52.754Z" },
+    { url = "https://files.pythonhosted.org/packages/fe/cf/18ef143a81610136d3da8193da9d80bfe1cb548a1e2d1c775f26b23d024a/multidict-6.7.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3fccb473e87eaa1382689053e4a4618e7ba7b9b9b8d6adf2027ee474597128cd", size = 45456, upload-time = "2026-01-26T02:43:53.893Z" },
+    { url = "https://files.pythonhosted.org/packages/a9/65/1caac9d4cd32e8433908683446eebc953e82d22b03d10d41a5f0fefe991b/multidict-6.7.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0fa96985700739c4c7853a43c0b3e169360d6855780021bfc6d0f1ce7c123e7", size = 43872, upload-time = "2026-01-26T02:43:55.041Z" },
+    { url = "https://files.pythonhosted.org/packages/cf/3b/d6bd75dc4f3ff7c73766e04e705b00ed6dbbaccf670d9e05a12b006f5a21/multidict-6.7.1-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:cb2a55f408c3043e42b40cc8eecd575afa27b7e0b956dfb190de0f8499a57a53", size = 251018, upload-time = "2026-01-26T02:43:56.198Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/80/c959c5933adedb9ac15152e4067c702a808ea183a8b64cf8f31af8ad3155/multidict-6.7.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eb0ce7b2a32d09892b3dd6cc44877a0d02a33241fafca5f25c8b6b62374f8b75", size = 258883, upload-time = "2026-01-26T02:43:57.499Z" },
+    { url = "https://files.pythonhosted.org/packages/86/85/7ed40adafea3d4f1c8b916e3b5cc3a8e07dfcdcb9cd72800f4ed3ca1b387/multidict-6.7.1-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:c3a32d23520ee37bf327d1e1a656fec76a2edd5c038bf43eddfa0572ec49c60b", size = 242413, upload-time = "2026-01-26T02:43:58.755Z" },
+    { url = "https://files.pythonhosted.org/packages/d2/57/b8565ff533e48595503c785f8361ff9a4fde4d67de25c207cd0ba3befd03/multidict-6.7.1-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:9c90fed18bffc0189ba814749fdcc102b536e83a9f738a9003e569acd540a733", size = 268404, upload-time = "2026-01-26T02:44:00.216Z" },
+    { url = "https://files.pythonhosted.org/packages/e0/50/9810c5c29350f7258180dfdcb2e52783a0632862eb334c4896ac717cebcb/multidict-6.7.1-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:da62917e6076f512daccfbbde27f46fed1c98fee202f0559adec8ee0de67f71a", size = 269456, upload-time = "2026-01-26T02:44:02.202Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/8d/5e5be3ced1d12966fefb5c4ea3b2a5b480afcea36406559442c6e31d4a48/multidict-6.7.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bfde23ef6ed9db7eaee6c37dcec08524cb43903c60b285b172b6c094711b3961", size = 256322, upload-time = "2026-01-26T02:44:03.56Z" },
+    { url = "https://files.pythonhosted.org/packages/31/6e/d8a26d81ac166a5592782d208dd90dfdc0a7a218adaa52b45a672b46c122/multidict-6.7.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3758692429e4e32f1ba0df23219cd0b4fc0a52f476726fff9337d1a57676a582", size = 253955, upload-time = "2026-01-26T02:44:04.845Z" },
+    { url = "https://files.pythonhosted.org/packages/59/4c/7c672c8aad41534ba619bcd4ade7a0dc87ed6b8b5c06149b85d3dd03f0cd/multidict-6.7.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:398c1478926eca669f2fd6a5856b6de9c0acf23a2cb59a14c0ba5844fa38077e", size = 251254, upload-time = "2026-01-26T02:44:06.133Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/bd/84c24de512cbafbdbc39439f74e967f19570ce7924e3007174a29c348916/multidict-6.7.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c102791b1c4f3ab36ce4101154549105a53dc828f016356b3e3bcae2e3a039d3", size = 252059, upload-time = "2026-01-26T02:44:07.518Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/ba/f5449385510825b73d01c2d4087bf6d2fccc20a2d42ac34df93191d3dd03/multidict-6.7.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:a088b62bd733e2ad12c50dad01b7d0166c30287c166e137433d3b410add807a6", size = 263588, upload-time = "2026-01-26T02:44:09.382Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/11/afc7c677f68f75c84a69fe37184f0f82fce13ce4b92f49f3db280b7e92b3/multidict-6.7.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:3d51ff4785d58d3f6c91bdbffcb5e1f7ddfda557727043aa20d20ec4f65e324a", size = 259642, upload-time = "2026-01-26T02:44:10.73Z" },
+    { url = "https://files.pythonhosted.org/packages/2b/17/ebb9644da78c4ab36403739e0e6e0e30ebb135b9caf3440825001a0bddcb/multidict-6.7.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fc5907494fccf3e7d3f94f95c91d6336b092b5fc83811720fae5e2765890dfba", size = 251377, upload-time = "2026-01-26T02:44:12.042Z" },
+    { url = "https://files.pythonhosted.org/packages/ca/a4/840f5b97339e27846c46307f2530a2805d9d537d8b8bd416af031cad7fa0/multidict-6.7.1-cp312-cp312-win32.whl", hash = "sha256:28ca5ce2fd9716631133d0e9a9b9a745ad7f60bac2bccafb56aa380fc0b6c511", size = 41887, upload-time = "2026-01-26T02:44:14.245Z" },
+    { url = "https://files.pythonhosted.org/packages/80/31/0b2517913687895f5904325c2069d6a3b78f66cc641a86a2baf75a05dcbb/multidict-6.7.1-cp312-cp312-win_amd64.whl", hash = "sha256:fcee94dfbd638784645b066074b338bc9cc155d4b4bffa4adce1615c5a426c19", size = 46053, upload-time = "2026-01-26T02:44:15.371Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/5b/aba28e4ee4006ae4c7df8d327d31025d760ffa992ea23812a601d226e682/multidict-6.7.1-cp312-cp312-win_arm64.whl", hash = "sha256:ba0a9fb644d0c1a2194cf7ffb043bd852cea63a57f66fbd33959f7dae18517bf", size = 43307, upload-time = "2026-01-26T02:44:16.852Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/22/929c141d6c0dba87d3e1d38fbdf1ba8baba86b7776469f2bc2d3227a1e67/multidict-6.7.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:2b41f5fed0ed563624f1c17630cb9941cf2309d4df00e494b551b5f3e3d67a23", size = 76174, upload-time = "2026-01-26T02:44:18.509Z" },
+    { url = "https://files.pythonhosted.org/packages/c7/75/bc704ae15fee974f8fccd871305e254754167dce5f9e42d88a2def741a1d/multidict-6.7.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:84e61e3af5463c19b67ced91f6c634effb89ef8bfc5ca0267f954451ed4bb6a2", size = 45116, upload-time = "2026-01-26T02:44:19.745Z" },
+    { url = "https://files.pythonhosted.org/packages/79/76/55cd7186f498ed080a18440c9013011eb548f77ae1b297206d030eb1180a/multidict-6.7.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:935434b9853c7c112eee7ac891bc4cb86455aa631269ae35442cb316790c1445", size = 43524, upload-time = "2026-01-26T02:44:21.571Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/3c/414842ef8d5a1628d68edee29ba0e5bcf235dbfb3ccd3ea303a7fe8c72ff/multidict-6.7.1-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:432feb25a1cb67fe82a9680b4d65fb542e4635cb3166cd9c01560651ad60f177", size = 249368, upload-time = "2026-01-26T02:44:22.803Z" },
+    { url = "https://files.pythonhosted.org/packages/f6/32/befed7f74c458b4a525e60519fe8d87eef72bb1e99924fa2b0f9d97a221e/multidict-6.7.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e82d14e3c948952a1a85503817e038cba5905a3352de76b9a465075d072fba23", size = 256952, upload-time = "2026-01-26T02:44:24.306Z" },
+    { url = "https://files.pythonhosted.org/packages/03/d6/c878a44ba877f366630c860fdf74bfb203c33778f12b6ac274936853c451/multidict-6.7.1-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:4cfb48c6ea66c83bcaaf7e4dfa7ec1b6bbcf751b7db85a328902796dfde4c060", size = 240317, upload-time = "2026-01-26T02:44:25.772Z" },
+    { url = "https://files.pythonhosted.org/packages/68/49/57421b4d7ad2e9e60e25922b08ceb37e077b90444bde6ead629095327a6f/multidict-6.7.1-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:1d540e51b7e8e170174555edecddbd5538105443754539193e3e1061864d444d", size = 267132, upload-time = "2026-01-26T02:44:27.648Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/fe/ec0edd52ddbcea2a2e89e174f0206444a61440b40f39704e64dc807a70bd/multidict-6.7.1-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:273d23f4b40f3dce4d6c8a821c741a86dec62cded82e1175ba3d99be128147ed", size = 268140, upload-time = "2026-01-26T02:44:29.588Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/73/6e1b01cbeb458807aa0831742232dbdd1fa92bfa33f52a3f176b4ff3dc11/multidict-6.7.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9d624335fd4fa1c08a53f8b4be7676ebde19cd092b3895c421045ca87895b429", size = 254277, upload-time = "2026-01-26T02:44:30.902Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/b2/5fb8c124d7561a4974c342bc8c778b471ebbeb3cc17df696f034a7e9afe7/multidict-6.7.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:12fad252f8b267cc75b66e8fc51b3079604e8d43a75428ffe193cd9e2195dfd6", size = 252291, upload-time = "2026-01-26T02:44:32.31Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/96/51d4e4e06bcce92577fcd488e22600bd38e4fd59c20cb49434d054903bd2/multidict-6.7.1-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:03ede2a6ffbe8ef936b92cb4529f27f42be7f56afcdab5ab739cd5f27fb1cbf9", size = 250156, upload-time = "2026-01-26T02:44:33.734Z" },
+    { url = "https://files.pythonhosted.org/packages/db/6b/420e173eec5fba721a50e2a9f89eda89d9c98fded1124f8d5c675f7a0c0f/multidict-6.7.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:90efbcf47dbe33dcf643a1e400d67d59abeac5db07dc3f27d6bdeae497a2198c", size = 249742, upload-time = "2026-01-26T02:44:35.222Z" },
+    { url = "https://files.pythonhosted.org/packages/44/a3/ec5b5bd98f306bc2aa297b8c6f11a46714a56b1e6ef5ebda50a4f5d7c5fb/multidict-6.7.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:5c4b9bfc148f5a91be9244d6264c53035c8a0dcd2f51f1c3c6e30e30ebaa1c84", size = 262221, upload-time = "2026-01-26T02:44:36.604Z" },
+    { url = "https://files.pythonhosted.org/packages/cd/f7/e8c0d0da0cd1e28d10e624604e1a36bcc3353aaebdfdc3a43c72bc683a12/multidict-6.7.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:401c5a650f3add2472d1d288c26deebc540f99e2fb83e9525007a74cd2116f1d", size = 258664, upload-time = "2026-01-26T02:44:38.008Z" },
+    { url = "https://files.pythonhosted.org/packages/52/da/151a44e8016dd33feed44f730bd856a66257c1ee7aed4f44b649fb7edeb3/multidict-6.7.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:97891f3b1b3ffbded884e2916cacf3c6fc87b66bb0dde46f7357404750559f33", size = 249490, upload-time = "2026-01-26T02:44:39.386Z" },
+    { url = "https://files.pythonhosted.org/packages/87/af/a3b86bf9630b732897f6fc3f4c4714b90aa4361983ccbdcd6c0339b21b0c/multidict-6.7.1-cp313-cp313-win32.whl", hash = "sha256:e1c5988359516095535c4301af38d8a8838534158f649c05dd1050222321bcb3", size = 41695, upload-time = "2026-01-26T02:44:41.318Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/35/e994121b0e90e46134673422dd564623f93304614f5d11886b1b3e06f503/multidict-6.7.1-cp313-cp313-win_amd64.whl", hash = "sha256:960c83bf01a95b12b08fd54324a4eb1d5b52c88932b5cba5d6e712bb3ed12eb5", size = 45884, upload-time = "2026-01-26T02:44:42.488Z" },
+    { url = "https://files.pythonhosted.org/packages/ca/61/42d3e5dbf661242a69c97ea363f2d7b46c567da8eadef8890022be6e2ab0/multidict-6.7.1-cp313-cp313-win_arm64.whl", hash = "sha256:563fe25c678aaba333d5399408f5ec3c383ca5b663e7f774dd179a520b8144df", size = 43122, upload-time = "2026-01-26T02:44:43.664Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/b3/e6b21c6c4f314bb956016b0b3ef2162590a529b84cb831c257519e7fde44/multidict-6.7.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:c76c4bec1538375dad9d452d246ca5368ad6e1c9039dadcf007ae59c70619ea1", size = 83175, upload-time = "2026-01-26T02:44:44.894Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/76/23ecd2abfe0957b234f6c960f4ade497f55f2c16aeb684d4ecdbf1c95791/multidict-6.7.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:57b46b24b5d5ebcc978da4ec23a819a9402b4228b8a90d9c656422b4bdd8a963", size = 48460, upload-time = "2026-01-26T02:44:46.106Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/57/a0ed92b23f3a042c36bc4227b72b97eca803f5f1801c1ab77c8a212d455e/multidict-6.7.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e954b24433c768ce78ab7929e84ccf3422e46deb45a4dc9f93438f8217fa2d34", size = 46930, upload-time = "2026-01-26T02:44:47.278Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/66/02ec7ace29162e447f6382c495dc95826bf931d3818799bbef11e8f7df1a/multidict-6.7.1-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:3bd231490fa7217cc832528e1cd8752a96f0125ddd2b5749390f7c3ec8721b65", size = 242582, upload-time = "2026-01-26T02:44:48.604Z" },
+    { url = "https://files.pythonhosted.org/packages/58/18/64f5a795e7677670e872673aca234162514696274597b3708b2c0d276cce/multidict-6.7.1-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:253282d70d67885a15c8a7716f3a73edf2d635793ceda8173b9ecc21f2fb8292", size = 250031, upload-time = "2026-01-26T02:44:50.544Z" },
+    { url = "https://files.pythonhosted.org/packages/c8/ed/e192291dbbe51a8290c5686f482084d31bcd9d09af24f63358c3d42fd284/multidict-6.7.1-cp313-cp313t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:0b4c48648d7649c9335cf1927a8b87fa692de3dcb15faa676c6a6f1f1aabda43", size = 228596, upload-time = "2026-01-26T02:44:51.951Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/7e/3562a15a60cf747397e7f2180b0a11dc0c38d9175a650e75fa1b4d325e15/multidict-6.7.1-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:98bc624954ec4d2c7cb074b8eefc2b5d0ce7d482e410df446414355d158fe4ca", size = 257492, upload-time = "2026-01-26T02:44:53.902Z" },
+    { url = "https://files.pythonhosted.org/packages/24/02/7d0f9eae92b5249bb50ac1595b295f10e263dd0078ebb55115c31e0eaccd/multidict-6.7.1-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:1b99af4d9eec0b49927b4402bcbb58dea89d3e0db8806a4086117019939ad3dd", size = 255899, upload-time = "2026-01-26T02:44:55.316Z" },
+    { url = "https://files.pythonhosted.org/packages/00/e3/9b60ed9e23e64c73a5cde95269ef1330678e9c6e34dd4eb6b431b85b5a10/multidict-6.7.1-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6aac4f16b472d5b7dc6f66a0d49dd57b0e0902090be16594dc9ebfd3d17c47e7", size = 247970, upload-time = "2026-01-26T02:44:56.783Z" },
+    { url = "https://files.pythonhosted.org/packages/3e/06/538e58a63ed5cfb0bd4517e346b91da32fde409d839720f664e9a4ae4f9d/multidict-6.7.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:21f830fe223215dffd51f538e78c172ed7c7f60c9b96a2bf05c4848ad49921c3", size = 245060, upload-time = "2026-01-26T02:44:58.195Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/2f/d743a3045a97c895d401e9bd29aaa09b94f5cbdf1bd561609e5a6c431c70/multidict-6.7.1-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:f5dd81c45b05518b9aa4da4aa74e1c93d715efa234fd3e8a179df611cc85e5f4", size = 235888, upload-time = "2026-01-26T02:44:59.57Z" },
+    { url = "https://files.pythonhosted.org/packages/38/83/5a325cac191ab28b63c52f14f1131f3b0a55ba3b9aa65a6d0bf2a9b921a0/multidict-6.7.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:eb304767bca2bb92fb9c5bd33cedc95baee5bb5f6c88e63706533a1c06ad08c8", size = 243554, upload-time = "2026-01-26T02:45:01.054Z" },
+    { url = "https://files.pythonhosted.org/packages/20/1f/9d2327086bd15da2725ef6aae624208e2ef828ed99892b17f60c344e57ed/multidict-6.7.1-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:c9035dde0f916702850ef66460bc4239d89d08df4d02023a5926e7446724212c", size = 252341, upload-time = "2026-01-26T02:45:02.484Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/2c/2a1aa0280cf579d0f6eed8ee5211c4f1730bd7e06c636ba2ee6aafda302e/multidict-6.7.1-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:af959b9beeb66c822380f222f0e0a1889331597e81f1ded7f374f3ecb0fd6c52", size = 246391, upload-time = "2026-01-26T02:45:03.862Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/03/7ca022ffc36c5a3f6e03b179a5ceb829be9da5783e6fe395f347c0794680/multidict-6.7.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:41f2952231456154ee479651491e94118229844dd7226541788be783be2b5108", size = 243422, upload-time = "2026-01-26T02:45:05.296Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/1d/b31650eab6c5778aceed46ba735bd97f7c7d2f54b319fa916c0f96e7805b/multidict-6.7.1-cp313-cp313t-win32.whl", hash = "sha256:df9f19c28adcb40b6aae30bbaa1478c389efd50c28d541d76760199fc1037c32", size = 47770, upload-time = "2026-01-26T02:45:06.754Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/5b/2d2d1d522e51285bd61b1e20df8f47ae1a9d80839db0b24ea783b3832832/multidict-6.7.1-cp313-cp313t-win_amd64.whl", hash = "sha256:d54ecf9f301853f2c5e802da559604b3e95bb7a3b01a9c295c6ee591b9882de8", size = 53109, upload-time = "2026-01-26T02:45:08.044Z" },
+    { url = "https://files.pythonhosted.org/packages/3d/a3/cc409ba012c83ca024a308516703cf339bdc4b696195644a7215a5164a24/multidict-6.7.1-cp313-cp313t-win_arm64.whl", hash = "sha256:5a37ca18e360377cfda1d62f5f382ff41f2b8c4ccb329ed974cc2e1643440118", size = 45573, upload-time = "2026-01-26T02:45:09.349Z" },
+    { url = "https://files.pythonhosted.org/packages/81/08/7036c080d7117f28a4af526d794aab6a84463126db031b007717c1a6676e/multidict-6.7.1-py3-none-any.whl", hash = "sha256:55d97cc6dae627efa6a6e548885712d4864b81110ac76fa4e534c03819fa4a56", size = 12319, upload-time = "2026-01-26T02:46:44.004Z" },
+]
+
+[[package]]
+name = "multiprocess"
+version = "0.70.19"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "dill" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/a2/f2/e783ac7f2aeeed14e9e12801f22529cc7e6b7ab80928d6dcce4e9f00922d/multiprocess-0.70.19.tar.gz", hash = "sha256:952021e0e6c55a4a9fe4cd787895b86e239a40e76802a789d6305398d3975897", size = 2079989, upload-time = "2026-01-19T06:47:39.744Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e3/45/8004d1e6b9185c1a444d6b55ac5682acf9d98035e54386d967366035a03a/multiprocess-0.70.19-py310-none-any.whl", hash = "sha256:97404393419dcb2a8385910864eedf47a3cadf82c66345b44f036420eb0b5d87", size = 134948, upload-time = "2026-01-19T06:47:32.325Z" },
+    { url = "https://files.pythonhosted.org/packages/86/c2/dec9722dc3474c164a0b6bcd9a7ed7da542c98af8cabce05374abab35edd/multiprocess-0.70.19-py311-none-any.whl", hash = "sha256:928851ae7973aea4ce0eaf330bbdafb2e01398a91518d5c8818802845564f45c", size = 144457, upload-time = "2026-01-19T06:47:33.711Z" },
+    { url = "https://files.pythonhosted.org/packages/71/70/38998b950a97ea279e6bd657575d22d1a2047256caf707d9a10fbce4f065/multiprocess-0.70.19-py312-none-any.whl", hash = "sha256:3a56c0e85dd5025161bac5ce138dcac1e49174c7d8e74596537e729fd5c53c28", size = 150281, upload-time = "2026-01-19T06:47:35.037Z" },
+    { url = "https://files.pythonhosted.org/packages/7f/74/d2c27e03cb84251dfe7249b8e82923643c6d48fa4883b9476b025e7dc7eb/multiprocess-0.70.19-py313-none-any.whl", hash = "sha256:8d5eb4ec5017ba2fab4e34a747c6d2c2b6fecfe9e7236e77988db91580ada952", size = 156414, upload-time = "2026-01-19T06:47:35.915Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/82/69e539c4c2027f1e1697e09aaa2449243085a0edf81ae2c6341e84d769b6/multiprocess-0.70.19-py39-none-any.whl", hash = "sha256:0d4b4397ed669d371c81dcd1ef33fd384a44d6c3de1bd0ca7ac06d837720d3c5", size = 133477, upload-time = "2026-01-19T06:47:38.619Z" },
+]
+
 [[package]]
 name = "networkx"
 version = "3.6.1"
@@ -1086,6 +1454,81 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/55/80/178af0594890dee17e239fca96d3d8670ba0f5ff59b7d0439850924a9c09/pandas-3.0.1-cp313-cp313t-win_amd64.whl", hash = "sha256:b03f91ae8c10a85c1613102c7bef5229b5379f343030a3ccefeca8a33414cf35", size = 10485047, upload-time = "2026-02-17T22:19:34.605Z" },
 ]
 
+[[package]]
+name = "peft"
+version = "0.19.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "accelerate" },
+    { name = "huggingface-hub" },
+    { name = "numpy" },
+    { name = "packaging" },
+    { name = "psutil" },
+    { name = "pyyaml" },
+    { name = "safetensors" },
+    { name = "torch" },
+    { name = "tqdm" },
+    { name = "transformers" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/86/cf/037f1e3d5186496c05513a6754639e2dab3038a05f384284d49a9bd06a2d/peft-0.19.1.tar.gz", hash = "sha256:0d97542fe96dcdaa20d3b81c06f26f988618f416a73544ab23c3618ccb674a40", size = 763738, upload-time = "2026-04-16T15:46:45.105Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e8/b6/f54d676ed93cc2dd2234c3b172ea9c8c3d7d29361e66b1b23dec57a67465/peft-0.19.1-py3-none-any.whl", hash = "sha256:2113f72a81621b5913ef28f9022204c742df111890c5f49d812716a4a301e356", size = 680692, upload-time = "2026-04-16T15:46:42.886Z" },
+]
+
+[[package]]
+name = "propcache"
+version = "0.4.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/9e/da/e9fc233cf63743258bff22b3dfa7ea5baef7b5bc324af47a0ad89b8ffc6f/propcache-0.4.1.tar.gz", hash = "sha256:f48107a8c637e80362555f37ecf49abe20370e557cc4ab374f04ec4423c97c3d", size = 46442, upload-time = "2025-10-08T19:49:02.291Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a2/0f/f17b1b2b221d5ca28b4b876e8bb046ac40466513960646bda8e1853cdfa2/propcache-0.4.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:e153e9cd40cc8945138822807139367f256f89c6810c2634a4f6902b52d3b4e2", size = 80061, upload-time = "2025-10-08T19:46:46.075Z" },
+    { url = "https://files.pythonhosted.org/packages/76/47/8ccf75935f51448ba9a16a71b783eb7ef6b9ee60f5d14c7f8a8a79fbeed7/propcache-0.4.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:cd547953428f7abb73c5ad82cbb32109566204260d98e41e5dfdc682eb7f8403", size = 46037, upload-time = "2025-10-08T19:46:47.23Z" },
+    { url = "https://files.pythonhosted.org/packages/0a/b6/5c9a0e42df4d00bfb4a3cbbe5cf9f54260300c88a0e9af1f47ca5ce17ac0/propcache-0.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f048da1b4f243fc44f205dfd320933a951b8d89e0afd4c7cacc762a8b9165207", size = 47324, upload-time = "2025-10-08T19:46:48.384Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/d3/6c7ee328b39a81ee877c962469f1e795f9db87f925251efeb0545e0020d0/propcache-0.4.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ec17c65562a827bba85e3872ead335f95405ea1674860d96483a02f5c698fa72", size = 225505, upload-time = "2025-10-08T19:46:50.055Z" },
+    { url = "https://files.pythonhosted.org/packages/01/5d/1c53f4563490b1d06a684742cc6076ef944bc6457df6051b7d1a877c057b/propcache-0.4.1-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:405aac25c6394ef275dee4c709be43745d36674b223ba4eb7144bf4d691b7367", size = 230242, upload-time = "2025-10-08T19:46:51.815Z" },
+    { url = "https://files.pythonhosted.org/packages/20/e1/ce4620633b0e2422207c3cb774a0ee61cac13abc6217763a7b9e2e3f4a12/propcache-0.4.1-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0013cb6f8dde4b2a2f66903b8ba740bdfe378c943c4377a200551ceb27f379e4", size = 238474, upload-time = "2025-10-08T19:46:53.208Z" },
+    { url = "https://files.pythonhosted.org/packages/46/4b/3aae6835b8e5f44ea6a68348ad90f78134047b503765087be2f9912140ea/propcache-0.4.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:15932ab57837c3368b024473a525e25d316d8353016e7cc0e5ba9eb343fbb1cf", size = 221575, upload-time = "2025-10-08T19:46:54.511Z" },
+    { url = "https://files.pythonhosted.org/packages/6e/a5/8a5e8678bcc9d3a1a15b9a29165640d64762d424a16af543f00629c87338/propcache-0.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:031dce78b9dc099f4c29785d9cf5577a3faf9ebf74ecbd3c856a7b92768c3df3", size = 216736, upload-time = "2025-10-08T19:46:56.212Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/63/b7b215eddeac83ca1c6b934f89d09a625aa9ee4ba158338854c87210cc36/propcache-0.4.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:ab08df6c9a035bee56e31af99be621526bd237bea9f32def431c656b29e41778", size = 213019, upload-time = "2025-10-08T19:46:57.595Z" },
+    { url = "https://files.pythonhosted.org/packages/57/74/f580099a58c8af587cac7ba19ee7cb418506342fbbe2d4a4401661cca886/propcache-0.4.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:4d7af63f9f93fe593afbf104c21b3b15868efb2c21d07d8732c0c4287e66b6a6", size = 220376, upload-time = "2025-10-08T19:46:59.067Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/ee/542f1313aff7eaf19c2bb758c5d0560d2683dac001a1c96d0774af799843/propcache-0.4.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:cfc27c945f422e8b5071b6e93169679e4eb5bf73bbcbf1ba3ae3a83d2f78ebd9", size = 226988, upload-time = "2025-10-08T19:47:00.544Z" },
+    { url = "https://files.pythonhosted.org/packages/8f/18/9c6b015dd9c6930f6ce2229e1f02fb35298b847f2087ea2b436a5bfa7287/propcache-0.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:35c3277624a080cc6ec6f847cbbbb5b49affa3598c4535a0a4682a697aaa5c75", size = 215615, upload-time = "2025-10-08T19:47:01.968Z" },
+    { url = "https://files.pythonhosted.org/packages/80/9e/e7b85720b98c45a45e1fca6a177024934dc9bc5f4d5dd04207f216fc33ed/propcache-0.4.1-cp312-cp312-win32.whl", hash = "sha256:671538c2262dadb5ba6395e26c1731e1d52534bfe9ae56d0b5573ce539266aa8", size = 38066, upload-time = "2025-10-08T19:47:03.503Z" },
+    { url = "https://files.pythonhosted.org/packages/54/09/d19cff2a5aaac632ec8fc03737b223597b1e347416934c1b3a7df079784c/propcache-0.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:cb2d222e72399fcf5890d1d5cc1060857b9b236adff2792ff48ca2dfd46c81db", size = 41655, upload-time = "2025-10-08T19:47:04.973Z" },
+    { url = "https://files.pythonhosted.org/packages/68/ab/6b5c191bb5de08036a8c697b265d4ca76148efb10fa162f14af14fb5f076/propcache-0.4.1-cp312-cp312-win_arm64.whl", hash = "sha256:204483131fb222bdaaeeea9f9e6c6ed0cac32731f75dfc1d4a567fc1926477c1", size = 37789, upload-time = "2025-10-08T19:47:06.077Z" },
+    { url = "https://files.pythonhosted.org/packages/bf/df/6d9c1b6ac12b003837dde8a10231a7344512186e87b36e855bef32241942/propcache-0.4.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:43eedf29202c08550aac1d14e0ee619b0430aaef78f85864c1a892294fbc28cf", size = 77750, upload-time = "2025-10-08T19:47:07.648Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/e8/677a0025e8a2acf07d3418a2e7ba529c9c33caf09d3c1f25513023c1db56/propcache-0.4.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:d62cdfcfd89ccb8de04e0eda998535c406bf5e060ffd56be6c586cbcc05b3311", size = 44780, upload-time = "2025-10-08T19:47:08.851Z" },
+    { url = "https://files.pythonhosted.org/packages/89/a4/92380f7ca60f99ebae761936bc48a72a639e8a47b29050615eef757cb2a7/propcache-0.4.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:cae65ad55793da34db5f54e4029b89d3b9b9490d8abe1b4c7ab5d4b8ec7ebf74", size = 46308, upload-time = "2025-10-08T19:47:09.982Z" },
+    { url = "https://files.pythonhosted.org/packages/2d/48/c5ac64dee5262044348d1d78a5f85dd1a57464a60d30daee946699963eb3/propcache-0.4.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:333ddb9031d2704a301ee3e506dc46b1fe5f294ec198ed6435ad5b6a085facfe", size = 208182, upload-time = "2025-10-08T19:47:11.319Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/0c/cd762dd011a9287389a6a3eb43aa30207bde253610cca06824aeabfe9653/propcache-0.4.1-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:fd0858c20f078a32cf55f7e81473d96dcf3b93fd2ccdb3d40fdf54b8573df3af", size = 211215, upload-time = "2025-10-08T19:47:13.146Z" },
+    { url = "https://files.pythonhosted.org/packages/30/3e/49861e90233ba36890ae0ca4c660e95df565b2cd15d4a68556ab5865974e/propcache-0.4.1-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:678ae89ebc632c5c204c794f8dab2837c5f159aeb59e6ed0539500400577298c", size = 218112, upload-time = "2025-10-08T19:47:14.913Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/8b/544bc867e24e1bd48f3118cecd3b05c694e160a168478fa28770f22fd094/propcache-0.4.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d472aeb4fbf9865e0c6d622d7f4d54a4e101a89715d8904282bb5f9a2f476c3f", size = 204442, upload-time = "2025-10-08T19:47:16.277Z" },
+    { url = "https://files.pythonhosted.org/packages/50/a6/4282772fd016a76d3e5c0df58380a5ea64900afd836cec2c2f662d1b9bb3/propcache-0.4.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4d3df5fa7e36b3225954fba85589da77a0fe6a53e3976de39caf04a0db4c36f1", size = 199398, upload-time = "2025-10-08T19:47:17.962Z" },
+    { url = "https://files.pythonhosted.org/packages/3e/ec/d8a7cd406ee1ddb705db2139f8a10a8a427100347bd698e7014351c7af09/propcache-0.4.1-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:ee17f18d2498f2673e432faaa71698032b0127ebf23ae5974eeaf806c279df24", size = 196920, upload-time = "2025-10-08T19:47:19.355Z" },
+    { url = "https://files.pythonhosted.org/packages/f6/6c/f38ab64af3764f431e359f8baf9e0a21013e24329e8b85d2da32e8ed07ca/propcache-0.4.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:580e97762b950f993ae618e167e7be9256b8353c2dcd8b99ec100eb50f5286aa", size = 203748, upload-time = "2025-10-08T19:47:21.338Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/e3/fa846bd70f6534d647886621388f0a265254d30e3ce47e5c8e6e27dbf153/propcache-0.4.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:501d20b891688eb8e7aa903021f0b72d5a55db40ffaab27edefd1027caaafa61", size = 205877, upload-time = "2025-10-08T19:47:23.059Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/39/8163fc6f3133fea7b5f2827e8eba2029a0277ab2c5beee6c1db7b10fc23d/propcache-0.4.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9a0bd56e5b100aef69bd8562b74b46254e7c8812918d3baa700c8a8009b0af66", size = 199437, upload-time = "2025-10-08T19:47:24.445Z" },
+    { url = "https://files.pythonhosted.org/packages/93/89/caa9089970ca49c7c01662bd0eeedfe85494e863e8043565aeb6472ce8fe/propcache-0.4.1-cp313-cp313-win32.whl", hash = "sha256:bcc9aaa5d80322bc2fb24bb7accb4a30f81e90ab8d6ba187aec0744bc302ad81", size = 37586, upload-time = "2025-10-08T19:47:25.736Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/ab/f76ec3c3627c883215b5c8080debb4394ef5a7a29be811f786415fc1e6fd/propcache-0.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:381914df18634f5494334d201e98245c0596067504b9372d8cf93f4bb23e025e", size = 40790, upload-time = "2025-10-08T19:47:26.847Z" },
+    { url = "https://files.pythonhosted.org/packages/59/1b/e71ae98235f8e2ba5004d8cb19765a74877abf189bc53fc0c80d799e56c3/propcache-0.4.1-cp313-cp313-win_arm64.whl", hash = "sha256:8873eb4460fd55333ea49b7d189749ecf6e55bf85080f11b1c4530ed3034cba1", size = 37158, upload-time = "2025-10-08T19:47:27.961Z" },
+    { url = "https://files.pythonhosted.org/packages/83/ce/a31bbdfc24ee0dcbba458c8175ed26089cf109a55bbe7b7640ed2470cfe9/propcache-0.4.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:92d1935ee1f8d7442da9c0c4fa7ac20d07e94064184811b685f5c4fada64553b", size = 81451, upload-time = "2025-10-08T19:47:29.445Z" },
+    { url = "https://files.pythonhosted.org/packages/25/9c/442a45a470a68456e710d96cacd3573ef26a1d0a60067e6a7d5e655621ed/propcache-0.4.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:473c61b39e1460d386479b9b2f337da492042447c9b685f28be4f74d3529e566", size = 46374, upload-time = "2025-10-08T19:47:30.579Z" },
+    { url = "https://files.pythonhosted.org/packages/f4/bf/b1d5e21dbc3b2e889ea4327044fb16312a736d97640fb8b6aa3f9c7b3b65/propcache-0.4.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:c0ef0aaafc66fbd87842a3fe3902fd889825646bc21149eafe47be6072725835", size = 48396, upload-time = "2025-10-08T19:47:31.79Z" },
+    { url = "https://files.pythonhosted.org/packages/f4/04/5b4c54a103d480e978d3c8a76073502b18db0c4bc17ab91b3cb5092ad949/propcache-0.4.1-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f95393b4d66bfae908c3ca8d169d5f79cd65636ae15b5e7a4f6e67af675adb0e", size = 275950, upload-time = "2025-10-08T19:47:33.481Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/c1/86f846827fb969c4b78b0af79bba1d1ea2156492e1b83dea8b8a6ae27395/propcache-0.4.1-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c07fda85708bc48578467e85099645167a955ba093be0a2dcba962195676e859", size = 273856, upload-time = "2025-10-08T19:47:34.906Z" },
+    { url = "https://files.pythonhosted.org/packages/36/1d/fc272a63c8d3bbad6878c336c7a7dea15e8f2d23a544bda43205dfa83ada/propcache-0.4.1-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:af223b406d6d000830c6f65f1e6431783fc3f713ba3e6cc8c024d5ee96170a4b", size = 280420, upload-time = "2025-10-08T19:47:36.338Z" },
+    { url = "https://files.pythonhosted.org/packages/07/0c/01f2219d39f7e53d52e5173bcb09c976609ba30209912a0680adfb8c593a/propcache-0.4.1-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a78372c932c90ee474559c5ddfffd718238e8673c340dc21fe45c5b8b54559a0", size = 263254, upload-time = "2025-10-08T19:47:37.692Z" },
+    { url = "https://files.pythonhosted.org/packages/2d/18/cd28081658ce597898f0c4d174d4d0f3c5b6d4dc27ffafeef835c95eb359/propcache-0.4.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:564d9f0d4d9509e1a870c920a89b2fec951b44bf5ba7d537a9e7c1ccec2c18af", size = 261205, upload-time = "2025-10-08T19:47:39.659Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/71/1f9e22eb8b8316701c2a19fa1f388c8a3185082607da8e406a803c9b954e/propcache-0.4.1-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:17612831fda0138059cc5546f4d12a2aacfb9e47068c06af35c400ba58ba7393", size = 247873, upload-time = "2025-10-08T19:47:41.084Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/65/3d4b61f36af2b4eddba9def857959f1016a51066b4f1ce348e0cf7881f58/propcache-0.4.1-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:41a89040cb10bd345b3c1a873b2bf36413d48da1def52f268a055f7398514874", size = 262739, upload-time = "2025-10-08T19:47:42.51Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/42/26746ab087faa77c1c68079b228810436ccd9a5ce9ac85e2b7307195fd06/propcache-0.4.1-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:e35b88984e7fa64aacecea39236cee32dd9bd8c55f57ba8a75cf2399553f9bd7", size = 263514, upload-time = "2025-10-08T19:47:43.927Z" },
+    { url = "https://files.pythonhosted.org/packages/94/13/630690fe201f5502d2403dd3cfd451ed8858fe3c738ee88d095ad2ff407b/propcache-0.4.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6f8b465489f927b0df505cbe26ffbeed4d6d8a2bbc61ce90eb074ff129ef0ab1", size = 257781, upload-time = "2025-10-08T19:47:45.448Z" },
+    { url = "https://files.pythonhosted.org/packages/92/f7/1d4ec5841505f423469efbfc381d64b7b467438cd5a4bbcbb063f3b73d27/propcache-0.4.1-cp313-cp313t-win32.whl", hash = "sha256:2ad890caa1d928c7c2965b48f3a3815c853180831d0e5503d35cf00c472f4717", size = 41396, upload-time = "2025-10-08T19:47:47.202Z" },
+    { url = "https://files.pythonhosted.org/packages/48/f0/615c30622316496d2cbbc29f5985f7777d3ada70f23370608c1d3e081c1f/propcache-0.4.1-cp313-cp313t-win_amd64.whl", hash = "sha256:f7ee0e597f495cf415bcbd3da3caa3bd7e816b74d0d52b8145954c5e6fd3ff37", size = 44897, upload-time = "2025-10-08T19:47:48.336Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/ca/6002e46eccbe0e33dcd4069ef32f7f1c9e243736e07adca37ae8c4830ec3/propcache-0.4.1-cp313-cp313t-win_arm64.whl", hash = "sha256:929d7cbe1f01bb7baffb33dc14eb5691c95831450a26354cd210a8155170c93a", size = 39789, upload-time = "2025-10-08T19:47:49.876Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/5a/bc7b4a4ef808fa59a816c17b20c4bef6884daebbdf627ff2a161da67da19/propcache-0.4.1-py3-none-any.whl", hash = "sha256:af2a6052aeb6cf17d3e46ee169099044fd8224cbaf75c76a2ef596e8163e2237", size = 13305, upload-time = "2025-10-08T19:49:00.792Z" },
+]
+
 [[package]]
 name = "psutil"
 version = "7.2.2"
@@ -1108,6 +1551,35 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/8c/c7/7bb2e321574b10df20cbde462a94e2b71d05f9bbda251ef27d104668306a/psutil-7.2.2-cp37-abi3-win_arm64.whl", hash = "sha256:8c233660f575a5a89e6d4cb65d9f938126312bca76d8fe087b947b3a1aaac9ee", size = 134617, upload-time = "2026-01-28T18:15:36.514Z" },
 ]
 
+[[package]]
+name = "pyarrow"
+version = "24.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/91/13/13e1069b351bdc3881266e11147ffccf687505dbb0ea74036237f5d454a5/pyarrow-24.0.0.tar.gz", hash = "sha256:85fe721a14dd823aca09127acbb06c3ca723efbd436c004f16bca601b04dcc83", size = 1180261, upload-time = "2026-04-21T10:51:25.837Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b4/a9/9686d9f07837f91f775e8932659192e02c74f9d8920524b480b85212cc68/pyarrow-24.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:6233c9ed9ab9d1db47de57d9753256d9dcffbf42db341576099f0fd9f6bf4810", size = 34981559, upload-time = "2026-04-21T10:47:22.17Z" },
+    { url = "https://files.pythonhosted.org/packages/80/b6/0ddf0e9b6ead3474ab087ae598c76b031fc45532bf6a63f3a553440fb258/pyarrow-24.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:f7616236ec1bc2b15bfdec22a71ab38851c86f8f05ff64f379e1278cf20c634a", size = 36663654, upload-time = "2026-04-21T10:47:28.315Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/3b/926382efe8ce27ba729071d3566ade6dfb86bdf112f366000196b2f5780a/pyarrow-24.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:1617043b99bd33e5318ae18eb2919af09c71322ef1ca46566cdafc6e6712fb66", size = 45679394, upload-time = "2026-04-21T10:47:34.821Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/7a/829f7d9dfd37c207206081d6dad474d81dde29952401f07f2ba507814818/pyarrow-24.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:6165461f55ef6314f026de6638d661188e3455d3ec49834556a0ebbdbace18bb", size = 48863122, upload-time = "2026-04-21T10:47:42.056Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/e8/f88ce625fe8babaae64e8db2d417c7653adb3019b08aae85c5ed787dc816/pyarrow-24.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3b13dedfe76a0ad2d1d859b0811b53827a4e9d93a0bcb05cf59333ab4980cc7e", size = 49376032, upload-time = "2026-04-21T10:47:48.967Z" },
+    { url = "https://files.pythonhosted.org/packages/36/7a/82c363caa145fff88fb475da50d3bf52bb024f61917be5424c3392eaf878/pyarrow-24.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:25ea65d868eb04015cd18e6df2fbe98f07e5bda2abefabcb88fce39a947716f6", size = 51929490, upload-time = "2026-04-21T10:47:55.981Z" },
+    { url = "https://files.pythonhosted.org/packages/66/1c/e3e72c8014ad2743ca64a701652c733cc5cbcee15c0463a32a8c55518d9e/pyarrow-24.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:295f0a7f2e242dabd513737cf076007dc5b2d59237e3eca37b05c0c6446f3826", size = 27355660, upload-time = "2026-04-21T10:48:01.718Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/d3/a1abf004482026ddc17f4503db227787fa3cfe41ec5091ff20e4fea55e57/pyarrow-24.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:02b001b3ed4723caa44f6cd1af2d5c86aa2cf9971dacc2ffa55b21237713dfba", size = 34976759, upload-time = "2026-04-21T10:48:07.258Z" },
+    { url = "https://files.pythonhosted.org/packages/4f/4a/34f0a36d28a2dd32225301b79daad44e243dc1a2bb77d43b60749be255c4/pyarrow-24.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:04920d6a71aabd08a0417709efce97d45ea8e6fb733d9ca9ecffb13c67839f68", size = 36658471, upload-time = "2026-04-21T10:48:13.347Z" },
+    { url = "https://files.pythonhosted.org/packages/1f/78/543b94712ae8bb1a6023bcc1acf1a740fbff8286747c289cd9468fced2a5/pyarrow-24.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:a964266397740257f16f7bb2e4f08a0c81454004beab8ff59dd531b73610e9f2", size = 45675981, upload-time = "2026-04-21T10:48:20.201Z" },
+    { url = "https://files.pythonhosted.org/packages/84/9f/8fb7c222b100d314137fa40ec050de56cd8c6d957d1cfff685ce72f15b17/pyarrow-24.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:6f066b179d68c413374294bc1735f68475457c933258df594443bb9d88ddc2a0", size = 48859172, upload-time = "2026-04-21T10:48:27.541Z" },
+    { url = "https://files.pythonhosted.org/packages/a7/d3/1ea72538e6c8b3b475ed78d1049a2c518e655761ea50fe1171fc855fcab7/pyarrow-24.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1183baeb14c5f587b1ec52831e665718ce632caab84b7cd6b85fd44f96114495", size = 49385733, upload-time = "2026-04-21T10:48:34.7Z" },
+    { url = "https://files.pythonhosted.org/packages/c3/be/c3d8b06a1ba35f2260f8e1f771abbee7d5e345c0937aab90675706b1690a/pyarrow-24.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:806f24b4085453c197a5078218d1ee08783ebbba271badd153d1ae22a3ee804f", size = 51934335, upload-time = "2026-04-21T10:48:42.099Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/62/89e07a1e7329d2cde3e3c6994ba0839a24977a2beda8be6005ea3d860b99/pyarrow-24.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:e4505fc6583f7b05ab854934896bcac8253b04ac1171a77dfb73efef92076d91", size = 27271748, upload-time = "2026-04-21T10:49:42.532Z" },
+    { url = "https://files.pythonhosted.org/packages/17/1a/cff3a59f80b5b1658549d46611b67163f65e0664431c076ad728bf9d5af4/pyarrow-24.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:1a4e45017efbf115032e4475ee876d525e0e36c742214fbe405332480ecd6275", size = 35238554, upload-time = "2026-04-21T10:48:48.526Z" },
+    { url = "https://files.pythonhosted.org/packages/a8/99/cce0f42a327bfef2c420fb6078a3eb834826e5d6697bf3009fe11d2ad051/pyarrow-24.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:7986f1fa71cee060ad00758bcc79d3a93bab8559bf978fab9e53472a2e25a17b", size = 36782301, upload-time = "2026-04-21T10:48:55.181Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/66/8e560d5ff6793ca29aca213c53eec0dd482dd46cb93b2819e5aab52e4252/pyarrow-24.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:d3e0b61e8efb24ed38898e5cdc5fffa9124be480008d401a1f8071500494ae42", size = 45721929, upload-time = "2026-04-21T10:49:03.676Z" },
+    { url = "https://files.pythonhosted.org/packages/27/0c/a26e25505d030716e078d9f16eb74973cbf0b33b672884e9f9da1c83b871/pyarrow-24.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:55a3bc1e3df3b5567b7d27ef551b2283f0c68a5e86f1cd56abc569da4f31335b", size = 48825365, upload-time = "2026-04-21T10:49:11.714Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/eb/771f9ecb0c65e73fe9dccdd1717901b9594f08c4515d000c7c62df573811/pyarrow-24.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:641f795b361874ac9da5294f8f443dfdbee355cf2bd9e3b8d97aaac2306b9b37", size = 49451819, upload-time = "2026-04-21T10:49:21.474Z" },
+    { url = "https://files.pythonhosted.org/packages/48/da/61ae89a88732f5a785646f3ec6125dbb640fa98a540eb2b9889caa561403/pyarrow-24.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8adc8e6ce5fccf5dc707046ae4914fd537def529709cc0d285d37a7f9cd442ca", size = 51909252, upload-time = "2026-04-21T10:49:31.164Z" },
+    { url = "https://files.pythonhosted.org/packages/cb/1a/8dd5cafab7b66573fa91c03d06d213356ad4edd71813aa75e08ce2b3a844/pyarrow-24.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:9b18371ad2f44044b81a8d23bc2d8a9b6a6226dca775e8e16cfee640473d6c5d", size = 27388127, upload-time = "2026-04-21T10:49:37.334Z" },
+]
+
 [[package]]
 name = "pycparser"
 version = "3.0"
@@ -1514,6 +1986,25 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/70/b0/eb757336e5a76dfa7911f63252e3b7d1de00935d7705cf772db5b45ec238/scipy-1.17.1-cp313-cp313t-win_arm64.whl", hash = "sha256:a720477885a9d2411f94a93d16f9d89bad0f28ca23c3f8daa521e2dcc3f44d49", size = 24856543, upload-time = "2026-02-23T00:20:45.313Z" },
 ]
 
+[[package]]
+name = "sentence-transformers"
+version = "5.4.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "huggingface-hub" },
+    { name = "numpy" },
+    { name = "scikit-learn" },
+    { name = "scipy" },
+    { name = "torch" },
+    { name = "tqdm" },
+    { name = "transformers" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/4d/68/7f98c221940ce783b492ad6140384daf2e2918cd7175009d6a362c22b9ee/sentence_transformers-5.4.1.tar.gz", hash = "sha256:436bcb1182a0ff42a8fb2b1c43498a70d0a75b688d182f2cd0d1dd115af61ddc", size = 428910, upload-time = "2026-04-14T13:34:59.006Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c5/d9/3a9b6f2ccdedc9dc00fe37b2fc58f58f8efbff44565cf4bf39d8568bb13a/sentence_transformers-5.4.1-py3-none-any.whl", hash = "sha256:a6d640fc363849b63affb8e140e9d328feabab86f83d58ac3e16b1c28140b790", size = 571311, upload-time = "2026-04-14T13:34:57.731Z" },
+]
+
 [[package]]
 name = "sentry-sdk"
 version = "2.53.0"
@@ -1554,6 +2045,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" },
 ]
 
+[[package]]
+name = "sniffio"
+version = "1.3.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372, upload-time = "2024-02-25T23:20:04.057Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
+]
+
 [[package]]
 name = "sqlalchemy"
 version = "2.0.45"
@@ -1906,3 +2406,124 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/6b/ae/90366304d7c2ce80f9b826096a9e9048b4bb760e44d3b873bb272cba696b/websockets-16.0-cp313-cp313-win_amd64.whl", hash = "sha256:3425ac5cf448801335d6fdc7ae1eb22072055417a96cc6b31b3861f455fbc156", size = 178689, upload-time = "2026-01-10T09:23:10.483Z" },
     { url = "https://files.pythonhosted.org/packages/6f/28/258ebab549c2bf3e64d2b0217b973467394a9cea8c42f70418ca2c5d0d2e/websockets-16.0-py3-none-any.whl", hash = "sha256:1637db62fad1dc833276dded54215f2c7fa46912301a24bd94d45d46a011ceec", size = 171598, upload-time = "2026-01-10T09:23:45.395Z" },
 ]
+
+[[package]]
+name = "xxhash"
+version = "3.6.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/02/84/30869e01909fb37a6cc7e18688ee8bf1e42d57e7e0777636bd47524c43c7/xxhash-3.6.0.tar.gz", hash = "sha256:f0162a78b13a0d7617b2845b90c763339d1f1d82bb04a4b07f4ab535cc5e05d6", size = 85160, upload-time = "2025-10-02T14:37:08.097Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/9a/07/d9412f3d7d462347e4511181dea65e47e0d0e16e26fbee2ea86a2aefb657/xxhash-3.6.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:01362c4331775398e7bb34e3ab403bc9ee9f7c497bc7dee6272114055277dd3c", size = 32744, upload-time = "2025-10-02T14:34:34.622Z" },
+    { url = "https://files.pythonhosted.org/packages/79/35/0429ee11d035fc33abe32dca1b2b69e8c18d236547b9a9b72c1929189b9a/xxhash-3.6.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b7b2df81a23f8cb99656378e72501b2cb41b1827c0f5a86f87d6b06b69f9f204", size = 30816, upload-time = "2025-10-02T14:34:36.043Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/f2/57eb99aa0f7d98624c0932c5b9a170e1806406cdbcdb510546634a1359e0/xxhash-3.6.0-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:dc94790144e66b14f67b10ac8ed75b39ca47536bf8800eb7c24b50271ea0c490", size = 194035, upload-time = "2025-10-02T14:34:37.354Z" },
+    { url = "https://files.pythonhosted.org/packages/4c/ed/6224ba353690d73af7a3f1c7cdb1fc1b002e38f783cb991ae338e1eb3d79/xxhash-3.6.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:93f107c673bccf0d592cdba077dedaf52fe7f42dcd7676eba1f6d6f0c3efffd2", size = 212914, upload-time = "2025-10-02T14:34:38.6Z" },
+    { url = "https://files.pythonhosted.org/packages/38/86/fb6b6130d8dd6b8942cc17ab4d90e223653a89aa32ad2776f8af7064ed13/xxhash-3.6.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2aa5ee3444c25b69813663c9f8067dcfaa2e126dc55e8dddf40f4d1c25d7effa", size = 212163, upload-time = "2025-10-02T14:34:39.872Z" },
+    { url = "https://files.pythonhosted.org/packages/ee/dc/e84875682b0593e884ad73b2d40767b5790d417bde603cceb6878901d647/xxhash-3.6.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f7f99123f0e1194fa59cc69ad46dbae2e07becec5df50a0509a808f90a0f03f0", size = 445411, upload-time = "2025-10-02T14:34:41.569Z" },
+    { url = "https://files.pythonhosted.org/packages/11/4f/426f91b96701ec2f37bb2b8cec664eff4f658a11f3fa9d94f0a887ea6d2b/xxhash-3.6.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:49e03e6fe2cac4a1bc64952dd250cf0dbc5ef4ebb7b8d96bce82e2de163c82a2", size = 193883, upload-time = "2025-10-02T14:34:43.249Z" },
+    { url = "https://files.pythonhosted.org/packages/53/5a/ddbb83eee8e28b778eacfc5a85c969673e4023cdeedcfcef61f36731610b/xxhash-3.6.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bd17fede52a17a4f9a7bc4472a5867cb0b160deeb431795c0e4abe158bc784e9", size = 210392, upload-time = "2025-10-02T14:34:45.042Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/c2/ff69efd07c8c074ccdf0a4f36fcdd3d27363665bcdf4ba399abebe643465/xxhash-3.6.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:6fb5f5476bef678f69db04f2bd1efbed3030d2aba305b0fc1773645f187d6a4e", size = 197898, upload-time = "2025-10-02T14:34:46.302Z" },
+    { url = "https://files.pythonhosted.org/packages/58/ca/faa05ac19b3b622c7c9317ac3e23954187516298a091eb02c976d0d3dd45/xxhash-3.6.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:843b52f6d88071f87eba1631b684fcb4b2068cd2180a0224122fe4ef011a9374", size = 210655, upload-time = "2025-10-02T14:34:47.571Z" },
+    { url = "https://files.pythonhosted.org/packages/d4/7a/06aa7482345480cc0cb597f5c875b11a82c3953f534394f620b0be2f700c/xxhash-3.6.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:7d14a6cfaf03b1b6f5f9790f76880601ccc7896aff7ab9cd8978a939c1eb7e0d", size = 414001, upload-time = "2025-10-02T14:34:49.273Z" },
+    { url = "https://files.pythonhosted.org/packages/23/07/63ffb386cd47029aa2916b3d2f454e6cc5b9f5c5ada3790377d5430084e7/xxhash-3.6.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:418daf3db71e1413cfe211c2f9a528456936645c17f46b5204705581a45390ae", size = 191431, upload-time = "2025-10-02T14:34:50.798Z" },
+    { url = "https://files.pythonhosted.org/packages/0f/93/14fde614cadb4ddf5e7cebf8918b7e8fac5ae7861c1875964f17e678205c/xxhash-3.6.0-cp312-cp312-win32.whl", hash = "sha256:50fc255f39428a27299c20e280d6193d8b63b8ef8028995323bf834a026b4fbb", size = 30617, upload-time = "2025-10-02T14:34:51.954Z" },
+    { url = "https://files.pythonhosted.org/packages/13/5d/0d125536cbe7565a83d06e43783389ecae0c0f2ed037b48ede185de477c0/xxhash-3.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:c0f2ab8c715630565ab8991b536ecded9416d615538be8ecddce43ccf26cbc7c", size = 31534, upload-time = "2025-10-02T14:34:53.276Z" },
+    { url = "https://files.pythonhosted.org/packages/54/85/6ec269b0952ec7e36ba019125982cf11d91256a778c7c3f98a4c5043d283/xxhash-3.6.0-cp312-cp312-win_arm64.whl", hash = "sha256:eae5c13f3bc455a3bbb68bdc513912dc7356de7e2280363ea235f71f54064829", size = 27876, upload-time = "2025-10-02T14:34:54.371Z" },
+    { url = "https://files.pythonhosted.org/packages/33/76/35d05267ac82f53ae9b0e554da7c5e281ee61f3cad44c743f0fcd354f211/xxhash-3.6.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:599e64ba7f67472481ceb6ee80fa3bd828fd61ba59fb11475572cc5ee52b89ec", size = 32738, upload-time = "2025-10-02T14:34:55.839Z" },
+    { url = "https://files.pythonhosted.org/packages/31/a8/3fbce1cd96534a95e35d5120637bf29b0d7f5d8fa2f6374e31b4156dd419/xxhash-3.6.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7d8b8aaa30fca4f16f0c84a5c8d7ddee0e25250ec2796c973775373257dde8f1", size = 30821, upload-time = "2025-10-02T14:34:57.219Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/ea/d387530ca7ecfa183cb358027f1833297c6ac6098223fd14f9782cd0015c/xxhash-3.6.0-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:d597acf8506d6e7101a4a44a5e428977a51c0fadbbfd3c39650cca9253f6e5a6", size = 194127, upload-time = "2025-10-02T14:34:59.21Z" },
+    { url = "https://files.pythonhosted.org/packages/ba/0c/71435dcb99874b09a43b8d7c54071e600a7481e42b3e3ce1eb5226a5711a/xxhash-3.6.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:858dc935963a33bc33490128edc1c12b0c14d9c7ebaa4e387a7869ecc4f3e263", size = 212975, upload-time = "2025-10-02T14:35:00.816Z" },
+    { url = "https://files.pythonhosted.org/packages/84/7a/c2b3d071e4bb4a90b7057228a99b10d51744878f4a8a6dd643c8bd897620/xxhash-3.6.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ba284920194615cb8edf73bf52236ce2e1664ccd4a38fdb543506413529cc546", size = 212241, upload-time = "2025-10-02T14:35:02.207Z" },
+    { url = "https://files.pythonhosted.org/packages/81/5f/640b6eac0128e215f177df99eadcd0f1b7c42c274ab6a394a05059694c5a/xxhash-3.6.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4b54219177f6c6674d5378bd862c6aedf64725f70dd29c472eaae154df1a2e89", size = 445471, upload-time = "2025-10-02T14:35:03.61Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/1e/3c3d3ef071b051cc3abbe3721ffb8365033a172613c04af2da89d5548a87/xxhash-3.6.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:42c36dd7dbad2f5238950c377fcbf6811b1cdb1c444fab447960030cea60504d", size = 193936, upload-time = "2025-10-02T14:35:05.013Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/bd/4a5f68381939219abfe1c22a9e3a5854a4f6f6f3c4983a87d255f21f2e5d/xxhash-3.6.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f22927652cba98c44639ffdc7aaf35828dccf679b10b31c4ad72a5b530a18eb7", size = 210440, upload-time = "2025-10-02T14:35:06.239Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/37/b80fe3d5cfb9faff01a02121a0f4d565eb7237e9e5fc66e73017e74dcd36/xxhash-3.6.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b45fad44d9c5c119e9c6fbf2e1c656a46dc68e280275007bbfd3d572b21426db", size = 197990, upload-time = "2025-10-02T14:35:07.735Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/fd/2c0a00c97b9e18f72e1f240ad4e8f8a90fd9d408289ba9c7c495ed7dc05c/xxhash-3.6.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:6f2580ffab1a8b68ef2b901cde7e55fa8da5e4be0977c68f78fc80f3c143de42", size = 210689, upload-time = "2025-10-02T14:35:09.438Z" },
+    { url = "https://files.pythonhosted.org/packages/93/86/5dd8076a926b9a95db3206aba20d89a7fc14dd5aac16e5c4de4b56033140/xxhash-3.6.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:40c391dd3cd041ebc3ffe6f2c862f402e306eb571422e0aa918d8070ba31da11", size = 414068, upload-time = "2025-10-02T14:35:11.162Z" },
+    { url = "https://files.pythonhosted.org/packages/af/3c/0bb129170ee8f3650f08e993baee550a09593462a5cddd8e44d0011102b1/xxhash-3.6.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f205badabde7aafd1a31e8ca2a3e5a763107a71c397c4481d6a804eb5063d8bd", size = 191495, upload-time = "2025-10-02T14:35:12.971Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/3a/6797e0114c21d1725e2577508e24006fd7ff1d8c0c502d3b52e45c1771d8/xxhash-3.6.0-cp313-cp313-win32.whl", hash = "sha256:2577b276e060b73b73a53042ea5bd5203d3e6347ce0d09f98500f418a9fcf799", size = 30620, upload-time = "2025-10-02T14:35:14.129Z" },
+    { url = "https://files.pythonhosted.org/packages/86/15/9bc32671e9a38b413a76d24722a2bf8784a132c043063a8f5152d390b0f9/xxhash-3.6.0-cp313-cp313-win_amd64.whl", hash = "sha256:757320d45d2fbcce8f30c42a6b2f47862967aea7bf458b9625b4bbe7ee390392", size = 31542, upload-time = "2025-10-02T14:35:15.21Z" },
+    { url = "https://files.pythonhosted.org/packages/39/c5/cc01e4f6188656e56112d6a8e0dfe298a16934b8c47a247236549a3f7695/xxhash-3.6.0-cp313-cp313-win_arm64.whl", hash = "sha256:457b8f85dec5825eed7b69c11ae86834a018b8e3df5e77783c999663da2f96d6", size = 27880, upload-time = "2025-10-02T14:35:16.315Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/30/25e5321c8732759e930c555176d37e24ab84365482d257c3b16362235212/xxhash-3.6.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:a42e633d75cdad6d625434e3468126c73f13f7584545a9cf34e883aa1710e702", size = 32956, upload-time = "2025-10-02T14:35:17.413Z" },
+    { url = "https://files.pythonhosted.org/packages/9f/3c/0573299560d7d9f8ab1838f1efc021a280b5ae5ae2e849034ef3dee18810/xxhash-3.6.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:568a6d743219e717b07b4e03b0a828ce593833e498c3b64752e0f5df6bfe84db", size = 31072, upload-time = "2025-10-02T14:35:18.844Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/1c/52d83a06e417cd9d4137722693424885cc9878249beb3a7c829e74bf7ce9/xxhash-3.6.0-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:bec91b562d8012dae276af8025a55811b875baace6af510412a5e58e3121bc54", size = 196409, upload-time = "2025-10-02T14:35:20.31Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/8e/c6d158d12a79bbd0b878f8355432075fc82759e356ab5a111463422a239b/xxhash-3.6.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:78e7f2f4c521c30ad5e786fdd6bae89d47a32672a80195467b5de0480aa97b1f", size = 215736, upload-time = "2025-10-02T14:35:21.616Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/68/c4c80614716345d55071a396cf03d06e34b5f4917a467faf43083c995155/xxhash-3.6.0-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3ed0df1b11a79856df5ffcab572cbd6b9627034c1c748c5566fa79df9048a7c5", size = 214833, upload-time = "2025-10-02T14:35:23.32Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/e9/ae27c8ffec8b953efa84c7c4a6c6802c263d587b9fc0d6e7cea64e08c3af/xxhash-3.6.0-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0e4edbfc7d420925b0dd5e792478ed393d6e75ff8fc219a6546fb446b6a417b1", size = 448348, upload-time = "2025-10-02T14:35:25.111Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/6b/33e21afb1b5b3f46b74b6bd1913639066af218d704cc0941404ca717fc57/xxhash-3.6.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fba27a198363a7ef87f8c0f6b171ec36b674fe9053742c58dd7e3201c1ab30ee", size = 196070, upload-time = "2025-10-02T14:35:26.586Z" },
+    { url = "https://files.pythonhosted.org/packages/96/b6/fcabd337bc5fa624e7203aa0fa7d0c49eed22f72e93229431752bddc83d9/xxhash-3.6.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:794fe9145fe60191c6532fa95063765529770edcdd67b3d537793e8004cabbfd", size = 212907, upload-time = "2025-10-02T14:35:28.087Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/d3/9ee6160e644d660fcf176c5825e61411c7f62648728f69c79ba237250143/xxhash-3.6.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:6105ef7e62b5ac73a837778efc331a591d8442f8ef5c7e102376506cb4ae2729", size = 200839, upload-time = "2025-10-02T14:35:29.857Z" },
+    { url = "https://files.pythonhosted.org/packages/0d/98/e8de5baa5109394baf5118f5e72ab21a86387c4f89b0e77ef3e2f6b0327b/xxhash-3.6.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:f01375c0e55395b814a679b3eea205db7919ac2af213f4a6682e01220e5fe292", size = 213304, upload-time = "2025-10-02T14:35:31.222Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/1d/71056535dec5c3177eeb53e38e3d367dd1d16e024e63b1cee208d572a033/xxhash-3.6.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:d706dca2d24d834a4661619dcacf51a75c16d65985718d6a7d73c1eeeb903ddf", size = 416930, upload-time = "2025-10-02T14:35:32.517Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/6c/5cbde9de2cd967c322e651c65c543700b19e7ae3e0aae8ece3469bf9683d/xxhash-3.6.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5f059d9faeacd49c0215d66f4056e1326c80503f51a1532ca336a385edadd033", size = 193787, upload-time = "2025-10-02T14:35:33.827Z" },
+    { url = "https://files.pythonhosted.org/packages/19/fa/0172e350361d61febcea941b0cc541d6e6c8d65d153e85f850a7b256ff8a/xxhash-3.6.0-cp313-cp313t-win32.whl", hash = "sha256:1244460adc3a9be84731d72b8e80625788e5815b68da3da8b83f78115a40a7ec", size = 30916, upload-time = "2025-10-02T14:35:35.107Z" },
+    { url = "https://files.pythonhosted.org/packages/ad/e6/e8cf858a2b19d6d45820f072eff1bea413910592ff17157cabc5f1227a16/xxhash-3.6.0-cp313-cp313t-win_amd64.whl", hash = "sha256:b1e420ef35c503869c4064f4a2f2b08ad6431ab7b229a05cce39d74268bca6b8", size = 31799, upload-time = "2025-10-02T14:35:36.165Z" },
+    { url = "https://files.pythonhosted.org/packages/56/15/064b197e855bfb7b343210e82490ae672f8bc7cdf3ddb02e92f64304ee8a/xxhash-3.6.0-cp313-cp313t-win_arm64.whl", hash = "sha256:ec44b73a4220623235f67a996c862049f375df3b1052d9899f40a6382c32d746", size = 28044, upload-time = "2025-10-02T14:35:37.195Z" },
+]
+
+[[package]]
+name = "yarl"
+version = "1.23.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "idna" },
+    { name = "multidict" },
+    { name = "propcache" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/23/6e/beb1beec874a72f23815c1434518bfc4ed2175065173fb138c3705f658d4/yarl-1.23.0.tar.gz", hash = "sha256:53b1ea6ca88ebd4420379c330aea57e258408dd0df9af0992e5de2078dc9f5d5", size = 194676, upload-time = "2026-03-01T22:07:53.373Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/88/8a/94615bc31022f711add374097ad4144d569e95ff3c38d39215d07ac153a0/yarl-1.23.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:1932b6b8bba8d0160a9d1078aae5838a66039e8832d41d2992daa9a3a08f7860", size = 124737, upload-time = "2026-03-01T22:05:12.897Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/6f/c6554045d59d64052698add01226bc867b52fe4a12373415d7991fdca95d/yarl-1.23.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:411225bae281f114067578891bc75534cfb3d92a3b4dfef7a6ca78ba354e6069", size = 87029, upload-time = "2026-03-01T22:05:14.376Z" },
+    { url = "https://files.pythonhosted.org/packages/19/2a/725ecc166d53438bc88f76822ed4b1e3b10756e790bafd7b523fe97c322d/yarl-1.23.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:13a563739ae600a631c36ce096615fe307f131344588b0bc0daec108cdb47b25", size = 86310, upload-time = "2026-03-01T22:05:15.71Z" },
+    { url = "https://files.pythonhosted.org/packages/99/30/58260ed98e6ff7f90ba84442c1ddd758c9170d70327394a6227b310cd60f/yarl-1.23.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9cbf44c5cb4a7633d078788e1b56387e3d3cf2b8139a3be38040b22d6c3221c8", size = 97587, upload-time = "2026-03-01T22:05:17.384Z" },
+    { url = "https://files.pythonhosted.org/packages/76/0a/8b08aac08b50682e65759f7f8dde98ae8168f72487e7357a5d684c581ef9/yarl-1.23.0-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:53ad387048f6f09a8969631e4de3f1bf70c50e93545d64af4f751b2498755072", size = 92528, upload-time = "2026-03-01T22:05:18.804Z" },
+    { url = "https://files.pythonhosted.org/packages/52/07/0b7179101fe5f8385ec6c6bb5d0cb9f76bd9fb4a769591ab6fb5cdbfc69a/yarl-1.23.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4a59ba56f340334766f3a4442e0efd0af895fae9e2b204741ef885c446b3a1a8", size = 105339, upload-time = "2026-03-01T22:05:20.235Z" },
+    { url = "https://files.pythonhosted.org/packages/d3/8a/36d82869ab5ec829ca8574dfcb92b51286fcfb1e9c7a73659616362dc880/yarl-1.23.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:803a3c3ce4acc62eaf01eaca1208dcf0783025ef27572c3336502b9c232005e7", size = 105061, upload-time = "2026-03-01T22:05:22.268Z" },
+    { url = "https://files.pythonhosted.org/packages/66/3e/868e5c3364b6cee19ff3e1a122194fa4ce51def02c61023970442162859e/yarl-1.23.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a3d2bff8f37f8d0f96c7ec554d16945050d54462d6e95414babaa18bfafc7f51", size = 100132, upload-time = "2026-03-01T22:05:23.638Z" },
+    { url = "https://files.pythonhosted.org/packages/cf/26/9c89acf82f08a52cb52d6d39454f8d18af15f9d386a23795389d1d423823/yarl-1.23.0-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c75eb09e8d55bceb4367e83496ff8ef2bc7ea6960efb38e978e8073ea59ecb67", size = 99289, upload-time = "2026-03-01T22:05:25.749Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/54/5b0db00d2cb056922356104468019c0a132e89c8d3ab67d8ede9f4483d2a/yarl-1.23.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:877b0738624280e34c55680d6054a307aa94f7d52fa0e3034a9cc6e790871da7", size = 96950, upload-time = "2026-03-01T22:05:27.318Z" },
+    { url = "https://files.pythonhosted.org/packages/f6/40/10fa93811fd439341fad7e0718a86aca0de9548023bbb403668d6555acab/yarl-1.23.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:b5405bb8f0e783a988172993cfc627e4d9d00432d6bbac65a923041edacf997d", size = 93960, upload-time = "2026-03-01T22:05:28.738Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/d2/8ae2e6cd77d0805f4526e30ec43b6f9a3dfc542d401ac4990d178e4bf0cf/yarl-1.23.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:1c3a3598a832590c5a3ce56ab5576361b5688c12cb1d39429cf5dba30b510760", size = 104703, upload-time = "2026-03-01T22:05:30.438Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/0c/b3ceacf82c3fe21183ce35fa2acf5320af003d52bc1fcf5915077681142e/yarl-1.23.0-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:8419ebd326430d1cbb7efb5292330a2cf39114e82df5cc3d83c9a0d5ebeaf2f2", size = 98325, upload-time = "2026-03-01T22:05:31.835Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/e0/12900edd28bdab91a69bd2554b85ad7b151f64e8b521fe16f9ad2f56477a/yarl-1.23.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:be61f6fff406ca40e3b1d84716fde398fc08bc63dd96d15f3a14230a0973ed86", size = 105067, upload-time = "2026-03-01T22:05:33.358Z" },
+    { url = "https://files.pythonhosted.org/packages/15/61/74bb1182cf79c9bbe4eb6b1f14a57a22d7a0be5e9cedf8e2d5c2086474c3/yarl-1.23.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3ceb13c5c858d01321b5d9bb65e4cf37a92169ea470b70fec6f236b2c9dd7e34", size = 100285, upload-time = "2026-03-01T22:05:35.4Z" },
+    { url = "https://files.pythonhosted.org/packages/69/7f/cd5ef733f2550de6241bd8bd8c3febc78158b9d75f197d9c7baa113436af/yarl-1.23.0-cp312-cp312-win32.whl", hash = "sha256:fffc45637bcd6538de8b85f51e3df3223e4ad89bccbfca0481c08c7fc8b7ed7d", size = 82359, upload-time = "2026-03-01T22:05:36.811Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/be/25216a49daeeb7af2bec0db22d5e7df08ed1d7c9f65d78b14f3b74fd72fc/yarl-1.23.0-cp312-cp312-win_amd64.whl", hash = "sha256:f69f57305656a4852f2a7203efc661d8c042e6cc67f7acd97d8667fb448a426e", size = 87674, upload-time = "2026-03-01T22:05:38.171Z" },
+    { url = "https://files.pythonhosted.org/packages/d2/35/aeab955d6c425b227d5b7247eafb24f2653fedc32f95373a001af5dfeb9e/yarl-1.23.0-cp312-cp312-win_arm64.whl", hash = "sha256:6e87a6e8735b44816e7db0b2fbc9686932df473c826b0d9743148432e10bb9b9", size = 81879, upload-time = "2026-03-01T22:05:40.006Z" },
+    { url = "https://files.pythonhosted.org/packages/9a/4b/a0a6e5d0ee8a2f3a373ddef8a4097d74ac901ac363eea1440464ccbe0898/yarl-1.23.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:16c6994ac35c3e74fb0ae93323bf8b9c2a9088d55946109489667c510a7d010e", size = 123796, upload-time = "2026-03-01T22:05:41.412Z" },
+    { url = "https://files.pythonhosted.org/packages/67/b6/8925d68af039b835ae876db5838e82e76ec87b9782ecc97e192b809c4831/yarl-1.23.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4a42e651629dafb64fd5b0286a3580613702b5809ad3f24934ea87595804f2c5", size = 86547, upload-time = "2026-03-01T22:05:42.841Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/50/06d511cc4b8e0360d3c94af051a768e84b755c5eb031b12adaaab6dec6e5/yarl-1.23.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7c6b9461a2a8b47c65eef63bb1c76a4f1c119618ffa99ea79bc5bb1e46c5821b", size = 85854, upload-time = "2026-03-01T22:05:44.85Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/f4/4e30b250927ffdab4db70da08b9b8d2194d7c7b400167b8fbeca1e4701ca/yarl-1.23.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2569b67d616eab450d262ca7cb9f9e19d2f718c70a8b88712859359d0ab17035", size = 98351, upload-time = "2026-03-01T22:05:46.836Z" },
+    { url = "https://files.pythonhosted.org/packages/86/fc/4118c5671ea948208bdb1492d8b76bdf1453d3e73df051f939f563e7dcc5/yarl-1.23.0-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:e9d9a4d06d3481eab79803beb4d9bd6f6a8e781ec078ac70d7ef2dcc29d1bea5", size = 92711, upload-time = "2026-03-01T22:05:48.316Z" },
+    { url = "https://files.pythonhosted.org/packages/56/11/1ed91d42bd9e73c13dc9e7eb0dd92298d75e7ac4dd7f046ad0c472e231cd/yarl-1.23.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f514f6474e04179d3d33175ed3f3e31434d3130d42ec153540d5b157deefd735", size = 106014, upload-time = "2026-03-01T22:05:50.028Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/c9/74e44e056a23fbc33aca71779ef450ca648a5bc472bdad7a82339918f818/yarl-1.23.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:fda207c815b253e34f7e1909840fd14299567b1c0eb4908f8c2ce01a41265401", size = 105557, upload-time = "2026-03-01T22:05:51.416Z" },
+    { url = "https://files.pythonhosted.org/packages/66/fe/b1e10b08d287f518994f1e2ff9b6d26f0adeecd8dd7d533b01bab29a3eda/yarl-1.23.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:34b6cf500e61c90f305094911f9acc9c86da1a05a7a3f5be9f68817043f486e4", size = 101559, upload-time = "2026-03-01T22:05:52.872Z" },
+    { url = "https://files.pythonhosted.org/packages/72/59/c5b8d94b14e3d3c2a9c20cb100119fd534ab5a14b93673ab4cc4a4141ea5/yarl-1.23.0-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:d7504f2b476d21653e4d143f44a175f7f751cd41233525312696c76aa3dbb23f", size = 100502, upload-time = "2026-03-01T22:05:54.954Z" },
+    { url = "https://files.pythonhosted.org/packages/77/4f/96976cb54cbfc5c9fd73ed4c51804f92f209481d1fb190981c0f8a07a1d7/yarl-1.23.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:578110dd426f0d209d1509244e6d4a3f1a3e9077655d98c5f22583d63252a08a", size = 98027, upload-time = "2026-03-01T22:05:56.409Z" },
+    { url = "https://files.pythonhosted.org/packages/63/6e/904c4f476471afdbad6b7e5b70362fb5810e35cd7466529a97322b6f5556/yarl-1.23.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:609d3614d78d74ebe35f54953c5bbd2ac647a7ddb9c30a5d877580f5e86b22f2", size = 95369, upload-time = "2026-03-01T22:05:58.141Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/40/acfcdb3b5f9d68ef499e39e04d25e141fe90661f9d54114556cf83be8353/yarl-1.23.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:4966242ec68afc74c122f8459abd597afd7d8a60dc93d695c1334c5fd25f762f", size = 105565, upload-time = "2026-03-01T22:06:00.286Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/c6/31e28f3a6ba2869c43d124f37ea5260cac9c9281df803c354b31f4dd1f3c/yarl-1.23.0-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:e0fd068364a6759bc794459f0a735ab151d11304346332489c7972bacbe9e72b", size = 99813, upload-time = "2026-03-01T22:06:01.712Z" },
+    { url = "https://files.pythonhosted.org/packages/08/1f/6f65f59e72d54aa467119b63fc0b0b1762eff0232db1f4720cd89e2f4a17/yarl-1.23.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:39004f0ad156da43e86aa71f44e033de68a44e5a31fc53507b36dd253970054a", size = 105632, upload-time = "2026-03-01T22:06:03.188Z" },
+    { url = "https://files.pythonhosted.org/packages/a3/c4/18b178a69935f9e7a338127d5b77d868fdc0f0e49becd286d51b3a18c61d/yarl-1.23.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e5723c01a56c5028c807c701aa66722916d2747ad737a046853f6c46f4875543", size = 101895, upload-time = "2026-03-01T22:06:04.651Z" },
+    { url = "https://files.pythonhosted.org/packages/8f/54/f5b870b5505663911dba950a8e4776a0dbd51c9c54c0ae88e823e4b874a0/yarl-1.23.0-cp313-cp313-win32.whl", hash = "sha256:1b6b572edd95b4fa8df75de10b04bc81acc87c1c7d16bcdd2035b09d30acc957", size = 82356, upload-time = "2026-03-01T22:06:06.04Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/84/266e8da36879c6edcd37b02b547e2d9ecdfea776be49598e75696e3316e1/yarl-1.23.0-cp313-cp313-win_amd64.whl", hash = "sha256:baaf55442359053c7d62f6f8413a62adba3205119bcb6f49594894d8be47e5e3", size = 87515, upload-time = "2026-03-01T22:06:08.107Z" },
+    { url = "https://files.pythonhosted.org/packages/00/fd/7e1c66efad35e1649114fa13f17485f62881ad58edeeb7f49f8c5e748bf9/yarl-1.23.0-cp313-cp313-win_arm64.whl", hash = "sha256:fb4948814a2a98e3912505f09c9e7493b1506226afb1f881825368d6fb776ee3", size = 81785, upload-time = "2026-03-01T22:06:10.181Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/fc/119dd07004f17ea43bb91e3ece6587759edd7519d6b086d16bfbd3319982/yarl-1.23.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:aecfed0b41aa72b7881712c65cf764e39ce2ec352324f5e0837c7048d9e6daaa", size = 130719, upload-time = "2026-03-01T22:06:11.708Z" },
+    { url = "https://files.pythonhosted.org/packages/e6/0d/9f2348502fbb3af409e8f47730282cd6bc80dec6630c1e06374d882d6eb2/yarl-1.23.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:a41bcf68efd19073376eb8cf948b8d9be0af26256403e512bb18f3966f1f9120", size = 89690, upload-time = "2026-03-01T22:06:13.429Z" },
+    { url = "https://files.pythonhosted.org/packages/50/93/e88f3c80971b42cfc83f50a51b9d165a1dbf154b97005f2994a79f212a07/yarl-1.23.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:cde9a2ecd91668bcb7f077c4966d8ceddb60af01b52e6e3e2680e4cf00ad1a59", size = 89851, upload-time = "2026-03-01T22:06:15.53Z" },
+    { url = "https://files.pythonhosted.org/packages/1c/07/61c9dd8ba8f86473263b4036f70fb594c09e99c0d9737a799dfd8bc85651/yarl-1.23.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5023346c4ee7992febc0068e7593de5fa2bf611848c08404b35ebbb76b1b0512", size = 95874, upload-time = "2026-03-01T22:06:17.553Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/e9/f9ff8ceefba599eac6abddcfb0b3bee9b9e636e96dbf54342a8577252379/yarl-1.23.0-cp313-cp313t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:d1009abedb49ae95b136a8904a3f71b342f849ffeced2d3747bf29caeda218c4", size = 88710, upload-time = "2026-03-01T22:06:19.004Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/78/0231bfcc5d4c8eec220bc2f9ef82cb4566192ea867a7c5b4148f44f6cbcd/yarl-1.23.0-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a8d00f29b42f534cc8aa3931cfe773b13b23e561e10d2b26f27a8d309b0e82a1", size = 101033, upload-time = "2026-03-01T22:06:21.203Z" },
+    { url = "https://files.pythonhosted.org/packages/cd/9b/30ea5239a61786f18fd25797151a17fbb3be176977187a48d541b5447dd4/yarl-1.23.0-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:95451e6ce06c3e104556d73b559f5da6c34a069b6b62946d3ad66afcd51642ea", size = 100817, upload-time = "2026-03-01T22:06:22.738Z" },
+    { url = "https://files.pythonhosted.org/packages/62/e2/a4980481071791bc83bce2b7a1a1f7adcabfa366007518b4b845e92eeee3/yarl-1.23.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:531ef597132086b6cf96faa7c6c1dcd0361dd5f1694e5cc30375907b9b7d3ea9", size = 97482, upload-time = "2026-03-01T22:06:24.21Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/1e/304a00cf5f6100414c4b5a01fc7ff9ee724b62158a08df2f8170dfc72a2d/yarl-1.23.0-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:88f9fb0116fbfcefcab70f85cf4b74a2b6ce5d199c41345296f49d974ddb4123", size = 95949, upload-time = "2026-03-01T22:06:25.697Z" },
+    { url = "https://files.pythonhosted.org/packages/68/03/093f4055ed4cae649ac53bca3d180bd37102e9e11d048588e9ab0c0108d0/yarl-1.23.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:e7b0460976dc75cb87ad9cc1f9899a4b97751e7d4e77ab840fc9b6d377b8fd24", size = 95839, upload-time = "2026-03-01T22:06:27.309Z" },
+    { url = "https://files.pythonhosted.org/packages/b9/28/4c75ebb108f322aa8f917ae10a8ffa4f07cae10a8a627b64e578617df6a0/yarl-1.23.0-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:115136c4a426f9da976187d238e84139ff6b51a20839aa6e3720cd1026d768de", size = 90696, upload-time = "2026-03-01T22:06:29.048Z" },
+    { url = "https://files.pythonhosted.org/packages/23/9c/42c2e2dd91c1a570402f51bdf066bfdb1241c2240ba001967bad778e77b7/yarl-1.23.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:ead11956716a940c1abc816b7df3fa2b84d06eaed8832ca32f5c5e058c65506b", size = 100865, upload-time = "2026-03-01T22:06:30.525Z" },
+    { url = "https://files.pythonhosted.org/packages/74/05/1bcd60a8a0a914d462c305137246b6f9d167628d73568505fce3f1cb2e65/yarl-1.23.0-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:fe8f8f5e70e6dbdfca9882cd9deaac058729bcf323cf7a58660901e55c9c94f6", size = 96234, upload-time = "2026-03-01T22:06:32.692Z" },
+    { url = "https://files.pythonhosted.org/packages/90/b2/f52381aac396d6778ce516b7bc149c79e65bfc068b5de2857ab69eeea3b7/yarl-1.23.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:a0e317df055958a0c1e79e5d2aa5a5eaa4a6d05a20d4b0c9c3f48918139c9fc6", size = 100295, upload-time = "2026-03-01T22:06:34.268Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/e8/638bae5bbf1113a659b2435d8895474598afe38b4a837103764f603aba56/yarl-1.23.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6f0fd84de0c957b2d280143522c4f91a73aada1923caee763e24a2b3fda9f8a5", size = 97784, upload-time = "2026-03-01T22:06:35.864Z" },
+    { url = "https://files.pythonhosted.org/packages/80/25/a3892b46182c586c202629fc2159aa13975d3741d52ebd7347fd501d48d5/yarl-1.23.0-cp313-cp313t-win32.whl", hash = "sha256:93a784271881035ab4406a172edb0faecb6e7d00f4b53dc2f55919d6c9688595", size = 88313, upload-time = "2026-03-01T22:06:37.39Z" },
+    { url = "https://files.pythonhosted.org/packages/43/68/8c5b36aa5178900b37387937bc2c2fe0e9505537f713495472dcf6f6fccc/yarl-1.23.0-cp313-cp313t-win_amd64.whl", hash = "sha256:dd00607bffbf30250fe108065f07453ec124dbf223420f57f5e749b04295e090", size = 94932, upload-time = "2026-03-01T22:06:39.579Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/cc/d79ba8292f51f81f4dc533a8ccfb9fc6992cabf0998ed3245de7589dc07c/yarl-1.23.0-cp313-cp313t-win_arm64.whl", hash = "sha256:ac09d42f48f80c9ee1635b2fcaa819496a44502737660d3c0f2ade7526d29144", size = 84786, upload-time = "2026-03-01T22:06:41.988Z" },
+    { url = "https://files.pythonhosted.org/packages/69/68/c8739671f5699c7dc470580a4f821ef37c32c4cb0b047ce223a7f115757f/yarl-1.23.0-py3-none-any.whl", hash = "sha256:a2df6afe50dea8ae15fa34c9f824a3ee958d785fd5d089063d960bae1daa0a3f", size = 48288, upload-time = "2026-03-01T22:07:51.388Z" },
+]

From 0da6d29fcfa9f23ecebed0a40bfbaace71871a15 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gr=C3=A9goire=20Co?= <gregoire.corbiere@gmail.com>
Date: Wed, 22 Apr 2026 17:14:18 +0200
Subject: [PATCH 2/8] Add record deletion for deduplication of model
 performance in Nocodb

---
 backend/balanceteshaters/services/nocodb.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/backend/balanceteshaters/services/nocodb.py b/backend/balanceteshaters/services/nocodb.py
index ca9b6a80..1b610eda 100644
--- a/backend/balanceteshaters/services/nocodb.py
+++ b/backend/balanceteshaters/services/nocodb.py
@@ -49,6 +49,16 @@ def get_table_info(self, table_id: str) -> dict[str, Any]:
         response.raise_for_status()
         return response.json()
 
+    def delete_records(self, table_id: str, record_ids: list[int]) -> None:
+        """Delete records by ID in fixed-size batches; IDs are sent in the JSON body and HTTP errors are raised."""
+        url = f"{self.nocodb_url}/api/v3/data/{self.base_id}/{table_id}/records"
+        headers = {"accept": "application/json", "xc-token": self.token, "Content-Type": "application/json"}
+        batch_size = 10  # NOTE(review): presumably chosen to stay under a NocoDB bulk-request limit — confirm
+        for i in range(0, len(record_ids), batch_size):
+            batch = record_ids[i:i + batch_size]
+            response = requests.delete(url, headers=headers, json=[{"id": rid} for rid in batch])
+            response.raise_for_status()
+
     def count_records(self, table_id: str, where_str: str | None = None) -> int:
         """Count the number of records in a NocoDB table."""
         url = f"{self.nocodb_url}/api/v3/data/{self.base_id}/{table_id}/count"

From 7183a1e852abae9aed135ebdad4ba492737733d2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gr=C3=A9goire=20Co?= <gregoire.corbiere@gmail.com>
Date: Wed, 22 Apr 2026 17:14:56 +0200
Subject: [PATCH 3/8] edit dependencies

---
 backend/pyproject.toml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/backend/pyproject.toml b/backend/pyproject.toml
index 8bc9dffc..4ead3a35 100644
--- a/backend/pyproject.toml
+++ b/backend/pyproject.toml
@@ -33,3 +33,10 @@ dependencies = [
     "datasets>=3.0.0",
     "python-dotenv>=1.0.0",
 ]
+
+[tool.ruff]
+line-length = 120
+
+[tool.ruff.lint]
+# E402: scripts that patch sys.path before project imports (standalone runner pattern)
+per-file-ignores = { "balanceteshaters/scripts/**/*.py" = ["E402"] }

From df84d809b9d17e975a4bdcc8c2bdd136b7b7981e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gr=C3=A9goire=20Co?= <gregoire.corbiere@gmail.com>
Date: Wed, 22 Apr 2026 17:15:58 +0200
Subject: [PATCH 4/8] Dataset preparation, synthetic data generation and fine
 tuning loops

---
 .../scripts/ml/00_prepare_dataset.py          |  75 +++++
 .../scripts/ml/01_generate_synthetic.py       | 266 ++++++++++++++++++
 .../ml/02_embed_and_train_classical.py        | 233 +++++++++++++++
 3 files changed, 574 insertions(+)
 create mode 100644 backend/balanceteshaters/scripts/ml/00_prepare_dataset.py
 create mode 100644 backend/balanceteshaters/scripts/ml/01_generate_synthetic.py
 create mode 100644 backend/balanceteshaters/scripts/ml/02_embed_and_train_classical.py

diff --git a/backend/balanceteshaters/scripts/ml/00_prepare_dataset.py b/backend/balanceteshaters/scripts/ml/00_prepare_dataset.py
new file mode 100644
index 00000000..58e18d76
--- /dev/null
+++ b/backend/balanceteshaters/scripts/ml/00_prepare_dataset.py
@@ -0,0 +1,75 @@
+# ruff: noqa: E402
+import argparse
+import os
+import sys
+from pathlib import Path
+
+import pandas as pd
+from dotenv import load_dotenv
+from sklearn.model_selection import train_test_split
+
+SCRIPTS_DIR = Path(__file__).resolve().parent.parent.parent.parent  # four levels up: backend/, which holds the package
+if str(SCRIPTS_DIR) not in sys.path:
+    sys.path.insert(0, str(SCRIPTS_DIR))  # lets the `balanceteshaters.*` imports below resolve when run as a script
+
+from balanceteshaters.services.annotation import AnnotationService, BinaryConfidence
+from balanceteshaters.services.nocodb import NocoDBService
+from balanceteshaters.scripts.ml.config import ANNOTATION_TABLE_ID, DATA_DIR, compute_binary_label
+
+
+def main():
+    """Fetch annotations from NocoDB, derive binary labels, and write stratified 70/15/15 train/val/test parquet files."""
+    parser = argparse.ArgumentParser(description="Prepare train/val/test splits from NocoDB annotations")
+    parser.add_argument("--high-confidence-only", action="store_true", help="Keep only HIGH_CONFIDENCE annotations")
+    args = parser.parse_args()
+
+    load_dotenv()
+    nocodb = NocoDBService(
+        nocodb_url=os.environ["NOCODB_BASE_URL"],
+        token=os.environ["NOCODB_TOKEN"],
+        base_id=os.environ["NOCODB_BASE_ID"],
+    )
+    service = AnnotationService(nocodb=nocodb, annotation_table_id=ANNOTATION_TABLE_ID)
+    print("Fetching annotations from NocoDB...")
+    annotations = service.fetch_records_paginated()
+    print(f"  Total records fetched: {len(annotations)}")
+    rows = []
+    for ann in annotations:
+        if not ann.annotated_category:
+            continue
+        if args.high_confidence_only and ann.binary_confidence != BinaryConfidence.HIGH_CONFIDENCE:
+            continue
+        cats = [c.value for c in ann.annotated_category]
+        label = compute_binary_label(cats)
+        if label is None:
+            continue
+        rows.append({
+            "id": ann.id,
+            "comment": ann.comment,
+            "label": label,
+            "annotated_category": ",".join(cats),
+            "binary_confidence": ann.binary_confidence.value if ann.binary_confidence else None,
+            "source": "real",
+        })
+
+    df = pd.DataFrame(rows)
+    print(f"  Usable annotated records: {len(df)}")
+    # A frame built from zero rows has no "label" column, so the lookups below would raise KeyError.
+    if df.empty:
+        sys.exit("ERROR: no usable annotated records found; nothing to split.")
+    print(f"  Label distribution: {df['label'].value_counts().to_dict()}")
+
+    train_val, test = train_test_split(df, test_size=0.15, stratify=df["label"], random_state=42)
+    train, val = train_test_split(train_val, test_size=0.15 / 0.85, stratify=train_val["label"], random_state=42)
+    DATA_DIR.mkdir(parents=True, exist_ok=True)
+    train.to_parquet(DATA_DIR / "train_real.parquet", index=False)
+    val.to_parquet(DATA_DIR / "val.parquet", index=False)
+    test.to_parquet(DATA_DIR / "test.parquet", index=False)
+    print(f"\nSplits saved to {DATA_DIR}")
+    for name, split in [("train_real", train), ("val", val), ("test", test)]:
+        dist = split["label"].value_counts().to_dict()
+        print(f"  {name}: {len(split)} rows  label dist={dist}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/backend/balanceteshaters/scripts/ml/01_generate_synthetic.py b/backend/balanceteshaters/scripts/ml/01_generate_synthetic.py
new file mode 100644
index 00000000..1b47e9fe
--- /dev/null
+++ b/backend/balanceteshaters/scripts/ml/01_generate_synthetic.py
@@ -0,0 +1,266 @@
+# ruff: noqa: E402
+"""
+Generate synthetic French social media comments for minority harassment categories.
+Produces synthetic.parquet and train_augmented.parquet (train_real + synthetic).
+
+Improvements over v1:
+  - Bigger model (Sonnet by default) for higher-quality, more nuanced output
+  - Harder examples: subtle language, indirect threats, edge cases near category boundaries
+  - Few-shot grounding: real examples from train set included in each prompt
+"""
+import argparse
+import os
+import random
+import re
+import sys
+from pathlib import Path
+
+import anthropic
+import pandas as pd
+from dotenv import load_dotenv
+
+SCRIPTS_DIR = Path(__file__).resolve().parent.parent.parent.parent  # four levels up: backend/, which holds the package
+if str(SCRIPTS_DIR) not in sys.path:
+    sys.path.insert(0, str(SCRIPTS_DIR))  # lets the `balanceteshaters.*` import below resolve when run as a script
+
+from balanceteshaters.scripts.ml.config import DATA_DIR
+
+# Pricing in USD per million tokens (as of 2025) — re-check against Anthropic's pricing page when adding models
+MODEL_PRICING = {
+    "claude-haiku-4-5-20251001":  {"input": 1.00,  "output": 5.00,  "cache_write": 1.25,  "cache_read": 0.10},
+    "claude-sonnet-4-6":          {"input": 3.00,  "output": 15.00, "cache_write": 3.75,  "cache_read": 0.30},
+    "claude-opus-4-7":            {"input": 15.00, "output": 75.00, "cache_write": 18.75, "cache_read": 1.50},
+}
+
+TOTAL_TARGET = 1000
+BATCH_SIZE = 10
+FEW_SHOT_PER_CATEGORY = 5  # real examples to include in each generation prompt
+
+CATEGORY_DESCRIPTIONS = {
+    "Doxxing": "publication ou menace de publication d'informations privées (adresse, lieu de travail, numéro de téléphone, photo personnelle, identité réelle)",
+    "Incitation au suicide": "encouragement à se suicider ou à se blesser soi-même, messages du type 'tue-toi', 'va mourir', 'le monde irait mieux sans toi'",
+    "Cyberharcèlement à caractère sexuel": "harcèlement de nature sexuelle, propositions non sollicitées, slut-shaming, menaces sexuelles, commentaires dégradants sur le corps ou la sexualité",
+    "Menaces": "menaces directes ou voilées de violence physique, intimidation, messages impliquant des représailles",
+    "Incitation à la haine": "appels à la haine envers un groupe (racisme, homophobie, islamophobie, antisémitisme, transphobie, etc.), déshumanisation",
+    "Absence de cyberharcèlement": "commentaire anodin, critique constructive, expression d'une opinion sans agressivité, humour bienveillant, désaccord poli",
+}
+
+SYSTEM_PROMPT = """Tu es un expert en modération de contenu et en sécurité en ligne. Tu génères des exemples de commentaires de réseaux sociaux en français pour entraîner des modèles de détection de cyberharcèlement.
+
+Les commentaires doivent être réalistes et variés :
+- Style : argot, verlan, abréviations SMS, emojis, fautes d'orthographe intentionnelles, mélange français/anglais
+- Longueur : courts à moyens (5-80 mots), comme on en trouve sur Instagram, TikTok ou Twitter
+- Difficulté : inclure un mélange d'exemples évidents ET d'exemples subtils/ambigus qui nécessitent une lecture attentive pour être classifiés
+- Pour les catégories de harcèlement : certains doivent utiliser un langage indirect, des métaphores, du sous-entendu, ou du codé plutôt que des insultes directes
+- Pour l'absence de harcèlement : inclure des cas qui ressemblent superficiellement à du harcèlement mais n'en sont pas (critique légitime, humour, sarcasme bienveillant)
+
+IMPORTANT : génère UNIQUEMENT des commentaires bruts, sans explication ni méta-commentaire. Chaque commentaire sur une ligne séparée. Numérote-les de 1 à N."""
+
+
+def estimate_cost(model: str, allocation: dict[str, int], n_shots: int) -> float:
+    """Return a rough USD cost estimate for one generation run; every token count below is a fixed heuristic."""
+    pricing = MODEL_PRICING[model]
+    n_categories = len(allocation)
+    total_calls = sum(-(-v // BATCH_SIZE) for v in allocation.values())  # ceiling division: batches per category
+    avg_system_tokens = 350
+    avg_shots_tokens = n_shots * 20  # ~20 tokens per real example
+    avg_user_tokens = 80 + avg_shots_tokens
+    avg_output_tokens = BATCH_SIZE * 30
+    # First call per category writes the system prompt to cache; subsequent calls hit cache
+    cache_write_calls = n_categories
+    cache_read_calls = max(0, total_calls - n_categories)
+    # NOTE(review): a ~350-token system prompt may be under the API's minimum cacheable length — confirm caching applies
+    cost = (
+        (cache_write_calls * avg_system_tokens * pricing["cache_write"]
+         + cache_read_calls * avg_system_tokens * pricing["cache_read"]
+         + total_calls * avg_user_tokens * pricing["input"]
+         + total_calls * avg_output_tokens * pricing["output"])
+        / 1_000_000
+    )
+    return cost
+
+
+def allocate_examples(train_df: pd.DataFrame) -> dict[str, int]:
+    """Split TOTAL_TARGET into per-category counts: one fifth benign, the rest spread evenly over harassment categories."""
+    # train_df is not read here; the parameter is kept so existing callers keep working.
+    benign_count = TOTAL_TARGET // 5  # 200 when TOTAL_TARGET is 1000
+    harassment_count = TOTAL_TARGET - benign_count  # 800 when TOTAL_TARGET is 1000
+
+    harassment_cats = [c for c in CATEGORY_DESCRIPTIONS if c != "Absence de cyberharcèlement"]
+    per_cat, remainder = divmod(harassment_count, len(harassment_cats))
+
+    # The first `remainder` categories each take one extra example so the counts sum to harassment_count.
+    allocation = {cat: per_cat + (1 if idx < remainder else 0) for idx, cat in enumerate(harassment_cats)}
+    allocation["Absence de cyberharcèlement"] = benign_count
+    return allocation
+
+
+def get_real_examples(train_df: pd.DataFrame, category: str, n: int) -> list[str]:
+    """Randomly sample up to n non-null comments whose annotated_category equals category, with a label-based fallback."""
+    col = "annotated_category"  # NOTE(review): multi-label rows are stored comma-joined upstream, so they never match below
+    if col not in train_df.columns:
+        return []
+    subset = train_df[train_df[col] == category]["comment"].dropna().tolist()
+    if not subset:
+        # No exact match: fall back to any comment with the same binary label (0 = benign, 1 = harassment)
+        label = 0 if category == "Absence de cyberharcèlement" else 1
+        subset = train_df[train_df["label"] == label]["comment"].dropna().tolist()
+    return random.sample(subset, min(n, len(subset)))
+
+
+def generate_batch(
+    client: anthropic.Anthropic,
+    category: str,
+    n: int,
+    real_examples: list[str],
+    tokens_used: dict,
+    model: str,
+) -> list[str]:
+    """Request n synthetic comments for category from model; updates tokens_used in place; returns at most n lines."""
+    description = CATEGORY_DESCRIPTIONS[category]
+    shots_block = ""
+    if real_examples:
+        formatted = "\n".join(f"  • {ex[:150]}" for ex in real_examples)
+        shots_block = f"\nExemples RÉELS de cette catégorie (pour calibrer le style et la difficulté) :\n{formatted}\n\nGénère des commentaires DIFFÉRENTS de ces exemples mais de style et difficulté similaires.\n"
+
+    user_msg = (
+        f"Catégorie : **{category}**\n"
+        f"Description : {description}\n"
+        f"{shots_block}\n"
+        f"Génère exactement {n} commentaires, numérotés de 1 à {n}."
+    )
+
+    response = client.messages.create(
+        model=model,
+        max_tokens=n * 80 + 150,
+        system=[
+            {
+                "type": "text",
+                "text": SYSTEM_PROMPT,
+                "cache_control": {"type": "ephemeral"},
+            }
+        ],
+        messages=[{"role": "user", "content": user_msg}],
+    )
+
+    usage = response.usage
+    tokens_used["input"] += usage.input_tokens
+    tokens_used["output"] += usage.output_tokens
+    # The cache counters are Optional in the SDK: present-but-None would make `+=` raise, so count None as 0.
+    tokens_used["cache_read"] += getattr(usage, "cache_read_input_tokens", 0) or 0
+    tokens_used["cache_write"] += getattr(usage, "cache_creation_input_tokens", 0) or 0
+
+    lines = response.content[0].text.strip().split("\n")
+    comments = []
+    for line in lines:
+        line = line.strip()
+        if not line:
+            continue
+        cleaned = re.sub(r"^\d+[.)]\s*", "", line).strip()  # drop the leading "1." / "1)" numbering
+        if cleaned:
+            comments.append(cleaned)
+    return comments[:n]
+
+
+def main():
+    """Allocate per-category quotas, generate synthetic comments with Claude, and write synthetic_v2 / train_augmented_v2 parquet files."""
+    parser = argparse.ArgumentParser(description="Generate synthetic French harassment comments")
+    parser.add_argument("--dry-run", action="store_true", help="Print allocation and cost estimate only")
+    parser.add_argument(
+        "--model",
+        choices=list(MODEL_PRICING.keys()),
+        default="claude-sonnet-4-6",
+        help="Anthropic model to use for generation",
+    )
+    parser.add_argument("--total", type=int, default=TOTAL_TARGET, help="Total examples to generate")
+    args = parser.parse_args()
+    load_dotenv()
+
+    train_path = DATA_DIR / "train_real.parquet"
+    if not train_path.exists():
+        print(f"ERROR: {train_path} not found. Run 00_prepare_dataset.py first.")
+        sys.exit(1)
+
+    train_df = pd.read_parquet(train_path)
+    allocation = allocate_examples(train_df)
+    # Rescale if --total was overridden
+    if args.total != TOTAL_TARGET:
+        scale = args.total / TOTAL_TARGET
+        allocation = {k: max(1, round(v * scale)) for k, v in allocation.items()}
+
+    cost_estimate = estimate_cost(args.model, allocation, FEW_SHOT_PER_CATEGORY)
+    print(f"=== Synthetic data allocation ({sum(allocation.values())} total) ===")
+    for cat, n in allocation.items():
+        real_count = len(train_df[train_df["annotated_category"] == cat]) if "annotated_category" in train_df.columns else "?"
+        print(f"  {cat}: {n} synthetic  (real in train: {real_count})")
+    print(f"\nModel: {args.model}")
+    print(f"Few-shot examples per prompt: {FEW_SHOT_PER_CATEGORY}")
+    print(f"Estimated API cost: ~${cost_estimate:.3f}")
+
+    if args.dry_run:
+        print("\n[dry-run] No API calls made.")
+        return
+
+    api_key = os.environ.get("ANTHROPIC_API_KEY")
+    if not api_key:
+        print("ERROR: ANTHROPIC_API_KEY not set in environment.")
+        sys.exit(1)
+
+    client = anthropic.Anthropic(api_key=api_key)
+    tokens_used = {"input": 0, "output": 0, "cache_read": 0, "cache_write": 0}
+
+    all_rows = []
+    for category, total_needed in allocation.items():
+        print(f"\nGenerating {total_needed} examples for: {category}")
+        label = 0 if category == "Absence de cyberharcèlement" else 1
+        real_examples = get_real_examples(train_df, category, FEW_SHOT_PER_CATEGORY)
+        print(f"  Using {len(real_examples)} real few-shot examples")
+
+        generated, empty_streak = [], 0
+        while len(generated) < total_needed and empty_streak < 3:  # cap retries: empty replies would otherwise loop (and bill) forever
+            batch_n = min(BATCH_SIZE, total_needed - len(generated))
+            # Resample real examples each batch to add variety
+            shots = get_real_examples(train_df, category, FEW_SHOT_PER_CATEGORY)
+            batch = generate_batch(client, category, batch_n, shots, tokens_used, args.model)
+            empty_streak = 0 if batch else empty_streak + 1  # consecutive batches that yielded nothing
+            generated.extend(batch)
+            print(f"  {len(generated)}/{total_needed}", end="\r")
+        if len(generated) < total_needed:
+            print(f"\n  [warn] only {len(generated)}/{total_needed} generated for '{category}' (model kept returning empty batches)")
+
+        for comment in generated[:total_needed]:
+            all_rows.append({
+                "id": None,
+                "comment": comment,
+                "label": label,
+                "annotated_category": category,
+                "binary_confidence": None,
+                "source": "synthetic_v2",
+            })
+
+        print(f"\n  Samples from '{category}':")
+        for s in random.sample(generated[:total_needed], min(5, total_needed, len(generated))):
+            print(f"    • {s[:120]}")
+
+    synthetic_df = pd.DataFrame(all_rows)
+    synthetic_df.to_parquet(DATA_DIR / "synthetic_v2.parquet", index=False)
+    augmented_df = pd.concat([train_df, synthetic_df], ignore_index=True)
+    augmented_df.to_parquet(DATA_DIR / "train_augmented_v2.parquet", index=False)
+
+    pricing = MODEL_PRICING[args.model]
+    actual_cost = (
+        tokens_used["input"] * pricing["input"]
+        + tokens_used["output"] * pricing["output"]
+        + tokens_used.get("cache_write", 0) * pricing["cache_write"]
+        + tokens_used.get("cache_read", 0) * pricing["cache_read"]
+    ) / 1_000_000
+    print("\n=== Done ===")
+    print(f"  Synthetic examples: {len(synthetic_df)}")
+    print(f"  train_augmented_v2 size: {len(augmented_df)}")
+    print(f"  Tokens — input: {tokens_used['input']}, output: {tokens_used['output']}, cache_read: {tokens_used['cache_read']}, cache_write: {tokens_used['cache_write']}")
+    print(f"  Actual API cost: ~${actual_cost:.4f}")
+    print(f"  Files: {DATA_DIR}/synthetic_v2.parquet, train_augmented_v2.parquet")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/backend/balanceteshaters/scripts/ml/02_embed_and_train_classical.py b/backend/balanceteshaters/scripts/ml/02_embed_and_train_classical.py
new file mode 100644
index 00000000..ad7591d0
--- /dev/null
+++ b/backend/balanceteshaters/scripts/ml/02_embed_and_train_classical.py
@@ -0,0 +1,233 @@
+# ruff: noqa: E402
+"""
+Train three classifiers on frozen embeddings:
+  - Logistic Regression  (linear baseline)
+  - LightGBM             (tree ensemble)
+  - MLP (512→128)        (neural head, best at dense vectors)
+
+Loops over up to 4 embedding models × 3 classifiers × 3 datasets = 36 runs.
+Embeddings for val/test are shared per encoder to avoid recomputing.
+"""
+# Must be set before torch is imported: jina-v5 EuroBERT allocates MPS memory
+# even when device="cpu", which segfaults on Apple Silicon.
+import os
+os.environ.setdefault("PYTORCH_MPS_HIGH_WATERMARK_RATIO", "0.0")
+
+import argparse
+import sys
+import time
+from pathlib import Path
+
+import joblib
+import numpy as np
+import pandas as pd
+import sklearn.metrics
+from dotenv import load_dotenv
+from sentence_transformers import SentenceTransformer
+
+SCRIPTS_DIR = Path(__file__).resolve().parent.parent.parent.parent
+if str(SCRIPTS_DIR) not in sys.path:
+    sys.path.insert(0, str(SCRIPTS_DIR))
+
+from balanceteshaters.scripts.ml.config import (
+    ANNOTATION_TABLE_ID,
+    ARCTIC_EMBED_MODEL_ID,
+    BIDIR_MODEL_ID,
+    CHECKPOINTS_DIR,
+    DATA_DIR,
+    EVAL_TABLE_ID,
+    JINA_MODEL_ID,
+    JINA_SMALL_MODEL_ID,
+    get_device_for_model,
+    model_slug,
+)
+from balanceteshaters.services.nocodb import NocoDBService
+
+
+def embed(model: SentenceTransformer, texts: list[str], task: str | None = None, batch_size: int = 32) -> np.ndarray:
+    """Encode `texts` with `model` and return the result cast to float32; `task` is forwarded only when set."""
+    task_kwargs = {} if task is None else {"task": task}
+    vectors = model.encode(texts, batch_size=batch_size, show_progress_bar=True, convert_to_numpy=True, **task_kwargs)
+    return vectors.astype(np.float32)
+
+
+def evaluate(y_true, y_pred) -> dict:
+    """Return F1, precision, recall (0 when undefined) and accuracy, keyed by metric name."""
+    sk = sklearn.metrics
+    scorers = {"f1": sk.f1_score, "precision": sk.precision_score, "recall": sk.recall_score}
+    scores = {name: scorer(y_true, y_pred, zero_division=0) for name, scorer in scorers.items()}
+    scores["accuracy"] = sk.accuracy_score(y_true, y_pred)
+    return scores
+
+
+def log_to_nocodb(nocodb, run_name: str, dataset: str, metrics: dict, n_total: int, n_pos: int, model_type: str = "encoder embedding"):
+    """Best-effort: push one evaluation row to NocoDB; no-op when `nocodb` is None, failures only warn."""
+    if nocodb is None:
+        return
+    # run_name is "<slug>+<clf>+<dataset>"; partition() yields "" instead of raising when a "+" is missing
+    clf_name = run_name.partition("+")[2].partition("+")[0]
+    data = {
+        "model_name": run_name,
+        "table_id": ANNOTATION_TABLE_ID,
+        "table_name": f"finetune/{dataset}",
+        **{key: metrics[key] for key in ("f1", "precision", "recall", "accuracy")},
+        "total_samples": n_total,
+        "positive_samples": n_pos,
+        "negative_samples": n_total - n_pos,
+        "prompt": f"frozen+{clf_name} dataset={dataset}",
+        "model_type": model_type,
+    }
+    try:
+        nocodb.create_record(EVAL_TABLE_ID, data)
+    except Exception as e:
+        print(f"  [warn] NocoDB logging failed: {e}")
+
+
+def make_classifiers():  # returns fresh (name, estimator, needs_val) triples on every call
+    # Lazy imports so LightGBM's OpenMP doesn't initialize before jina loads
+    from lightgbm import LGBMClassifier
+    from sklearn.linear_model import LogisticRegression
+    from sklearn.neural_network import MLPClassifier
+    from sklearn.pipeline import Pipeline
+    from sklearn.preprocessing import StandardScaler
+
+    return [
+        (
+            "logreg",
+            Pipeline([
+                ("scaler", StandardScaler()),
+                ("clf", LogisticRegression(max_iter=1000, C=1.0)),
+            ]),
+            False,  # needs_val=False: fitted on the training data only
+        ),
+        (
+            "lightgbm",
+            LGBMClassifier(n_estimators=500, learning_rate=0.05, max_depth=6, verbose=-1, n_jobs=1),
+            True,  # uses early stopping on val set
+        ),
+        (
+            "mlp",
+            Pipeline([
+                ("scaler", StandardScaler()),
+                ("clf", MLPClassifier(
+                    hidden_layer_sizes=(512, 128),
+                    activation="relu",
+                    max_iter=200,
+                    early_stopping=True,  # stops on an internal hold-out taken from the training data (validation_fraction)
+                    validation_fraction=0.1,
+                    n_iter_no_change=10,
+                    random_state=42,
+                )),
+            ]),
+            False,  # needs_val=False: fitted on the training data only
+        ),
+    ]
+
+
+def run_for_encoder(model_id: str, is_jina: bool, datasets: list[str], nocodb):  # embed once per encoder, then fit/score/save/log every classifier
+    import gc
+    device = get_device_for_model(model_id)
+    print(f"\n{'='*60}")
+    print(f"Encoder: {model_id}  device={device}")
+
+    load_kwargs = {"device": device, "trust_remote_code": True}
+    st_model = SentenceTransformer(model_id, **load_kwargs)
+    task = "classification" if is_jina else None  # the task kwarg is only sent to jina encoders
+
+    val_df = pd.read_parquet(DATA_DIR / "val.parquet")
+    test_df = pd.read_parquet(DATA_DIR / "test.parquet")
+
+    print("Embedding val set...")
+    X_val = embed(st_model, val_df["comment"].tolist(), task=task)
+    y_val = val_df["label"].values
+
+    print("Embedding test set...")
+    X_test = embed(st_model, test_df["comment"].tolist(), task=task)
+    y_test = test_df["label"].values
+
+    # Embed all training splits before freeing the encoder
+    train_embeddings: dict[str, tuple[np.ndarray, np.ndarray]] = {}
+    for dataset in datasets:
+        train_file = DATA_DIR / f"train_{dataset}.parquet"
+        if not train_file.exists():
+            print(f"  [skip] {train_file.name} not found")
+            continue
+        train_df = pd.read_parquet(train_file)
+        print(f"\nEmbedding train set ({dataset}, {len(train_df)} rows)...")
+        train_embeddings[dataset] = (
+            embed(st_model, train_df["comment"].tolist(), task=task),
+            train_df["label"].values,
+        )
+
+    # Free encoder before initialising classifier threads (avoids OpenMP conflict)
+    del st_model
+    gc.collect()
+
+    slug = model_slug(model_id)
+    CHECKPOINTS_DIR.mkdir(parents=True, exist_ok=True)
+
+    for dataset, (X_train, y_train) in train_embeddings.items():
+        for clf_name, clf, needs_val in make_classifiers():  # fresh, unfitted estimators for every dataset
+            run_name = f"{slug}+{clf_name}+{dataset}"
+            print(f"\n  Training {run_name}...")
+            t0 = time.time()
+
+            if needs_val:
+                from lightgbm import early_stopping, log_evaluation
+                clf.fit(
+                    X_train, y_train,
+                    eval_set=[(X_val, y_val)],  # the validation split drives early stopping
+                    callbacks=[early_stopping(50, verbose=False), log_evaluation(-1)],
+                )
+            else:
+                clf.fit(X_train, y_train)
+
+            elapsed = time.time() - t0  # covers fitting only, not prediction
+            y_pred = clf.predict(X_test)  # every reported metric is computed on the test split
+            m = evaluate(y_test, y_pred)
+            n_pos = int(y_test.sum())
+
+            print(f"  F1={m['f1']:.4f}  P={m['precision']:.4f}  R={m['recall']:.4f}  Acc={m['accuracy']:.4f}  ({elapsed:.1f}s)")
+
+            ckpt_path = CHECKPOINTS_DIR / f"{run_name}.joblib"
+            joblib.dump(clf, ckpt_path)
+            print(f"  Saved to {ckpt_path.name}")
+
+            log_to_nocodb(nocodb, run_name, dataset, m, len(y_test), n_pos)
+
+
+def main():
+    """Parse --models/--datasets, build the optional NocoDB client, then run every selected encoder."""
+    parser = argparse.ArgumentParser(description="Embed + train classical ML classifiers")
+    parser.add_argument("--models", nargs="+", choices=["jina", "jina-small", "bidir", "arctic", "all"], default=["all"])
+    parser.add_argument("--datasets", nargs="+", choices=["real", "augmented", "augmented_v2", "all"], default=["all"])
+    args = parser.parse_args()
+
+    load_dotenv()
+    # NocoDB logging is optional: enabled only when all three variables are set
+    nocodb = None
+    nocodb_keys = ("NOCODB_BASE_URL", "NOCODB_TOKEN", "NOCODB_BASE_ID")
+    if all(os.environ.get(key) for key in nocodb_keys):
+        base_url, token, base_id = (os.environ[key] for key in nocodb_keys)
+        nocodb = NocoDBService(nocodb_url=base_url, token=token, base_id=base_id)
+
+    # (CLI key, model id, is_jina) in the order the encoders are run
+    encoder_table = [
+        ("jina", JINA_MODEL_ID, True),
+        ("jina-small", JINA_SMALL_MODEL_ID, True),
+        ("bidir", BIDIR_MODEL_ID, False),
+        ("arctic", ARCTIC_EMBED_MODEL_ID, False),
+    ]
+    run_all = "all" in args.models
+    encoders = [(model_id, is_jina) for key, model_id, is_jina in encoder_table if run_all or key in args.models]
+
+    datasets = args.datasets if "all" not in args.datasets else ["real", "augmented", "augmented_v2"]
+
+    for model_id, is_jina in encoders:
+        run_for_encoder(model_id, is_jina, datasets, nocodb)
+
+    print("\nAll runs complete.")
+
+
+if __name__ == "__main__":
+    main()

From 68b9671003faa79834363ec96da796a8959c3271 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gr=C3=A9goire=20Co?= <gregoire.corbiere@gmail.com>
Date: Wed, 22 Apr 2026 17:16:40 +0200
Subject: [PATCH 5/8] Compare all trainings, and add script for prompting
 Claude for annotations

---
 .../scripts/ml/04_compare_evaluate.py         | 253 ++++++++++++++++
 .../scripts/ml/05_claude_annotate.py          | 286 ++++++++++++++++++
 2 files changed, 539 insertions(+)
 create mode 100644 backend/balanceteshaters/scripts/ml/04_compare_evaluate.py
 create mode 100644 backend/balanceteshaters/scripts/ml/05_claude_annotate.py

diff --git a/backend/balanceteshaters/scripts/ml/04_compare_evaluate.py b/backend/balanceteshaters/scripts/ml/04_compare_evaluate.py
new file mode 100644
index 00000000..efba8cae
--- /dev/null
+++ b/backend/balanceteshaters/scripts/ml/04_compare_evaluate.py
@@ -0,0 +1,253 @@
+# ruff: noqa: E402
+"""
+Load all checkpoints and print a comparison table.
+Also logs all results to NocoDB and highlights A/B delta (real vs augmented).
+
+SentenceTransformer embedding and fine-tuned inference run in isolated subprocesses
+to avoid the OpenMP conflict between jina-nano's EuroBERT (libomp) and LightGBM (libgomp).
+"""
+import os
+os.environ.setdefault("PYTORCH_MPS_HIGH_WATERMARK_RATIO", "0.0")
+
+import sys
+import subprocess
+import tempfile
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import sklearn.metrics
+from dotenv import load_dotenv
+
+SCRIPTS_DIR = Path(__file__).resolve().parent.parent.parent.parent
+if str(SCRIPTS_DIR) not in sys.path:
+    sys.path.insert(0, str(SCRIPTS_DIR))
+
+from balanceteshaters.scripts.ml.config import (
+    ANNOTATION_TABLE_ID,
+    ARCTIC_EMBED_MODEL_ID,
+    BIDIR_MODEL_ID,
+    CHECKPOINTS_DIR,
+    DATA_DIR,
+    EVAL_TABLE_ID,
+    JINA_MODEL_ID,
+    JINA_SMALL_MODEL_ID,
+    MODEL_TYPE,
+    XLMR_TOXICITY_MODEL_ID,
+    get_device_for_model,
+    model_slug,
+)
+from balanceteshaters.services.nocodb import NocoDBService
+
+_HELPER = Path(__file__).parent / "_eval_subprocess.py"
+
+
+def metrics(y_true, y_pred) -> dict:
+    """Compute F1, precision, recall (0 when undefined) and accuracy for one set of predictions."""
+    pair = (y_true, y_pred)
+    f1 = sklearn.metrics.f1_score(*pair, zero_division=0)
+    precision = sklearn.metrics.precision_score(*pair, zero_division=0)
+    recall = sklearn.metrics.recall_score(*pair, zero_division=0)
+    return {"f1": f1, "precision": precision, "recall": recall, "accuracy": sklearn.metrics.accuracy_score(*pair)}
+
+
+def load_llm_baseline(data_dir: Path) -> dict | None:
+    """Return the best-F1 result among the LLM prediction CSVs next to `data_dir`, or None if none is usable."""
+    csv_dir = data_dir.parent
+    csvs = list(csv_dir.glob("predictions_m5t7qqaer2oa441_*.csv"))
+    if not csvs:
+        return None
+
+    best, best_f1 = None, -1.0
+    for p in csvs:
+        df = pd.read_csv(p)
+        if "annotated_category" not in df.columns or "predicted_category" not in df.columns:
+            continue
+        df = df[df["annotated_category"].notna()].copy()  # copy: a column is reassigned on this filtered frame below
+        # Parse numerically: a column holding NaN is read as float, so its text form is "1.0", not "1"
+        df["predicted_category"] = pd.to_numeric(df["predicted_category"].astype(str).str.strip(), errors="coerce")
+        df = df[df["predicted_category"].isin([0, 1])]
+        if df.empty:
+            continue
+        y_true = (~df["annotated_category"].astype(str).str.contains("Absence de cyberharcèlement", regex=False)).astype(int)
+        y_pred = df["predicted_category"].astype(int)
+        f1 = sklearn.metrics.f1_score(y_true, y_pred, zero_division=0)
+        if f1 > best_f1:
+            best_f1 = f1
+            best = {"run_name": f"LLM baseline ({p.stem})", "approach": "LLM prompt", "model": "best LLM", "dataset": "real", **metrics(y_true, y_pred)}
+    return best
+
+
+def _subprocess_run(args: list[str]):  # run `args` under the current interpreter; raise on a non-zero exit
+    proc = subprocess.run([sys.executable, *args], capture_output=True, text=True)
+    if proc.returncode:
+        raise RuntimeError(f"Subprocess failed (exit {proc.returncode}):\n{proc.stderr[-2000:]}")
+
+
+def embed_in_subprocess(model_id: str, output_npy: str):  # runs the helper script's "embed" command in a child process
+    print(f"  Embedding {model_slug(model_id)}...")
+    _subprocess_run([str(_HELPER), "embed", model_id, output_npy])  # raises RuntimeError on a non-zero exit
+
+
+def predict_classical_in_subprocess(embedding_npy: str, ckpt_path: str, output_npy: str):  # helper "predict_classical" command, child process
+    _subprocess_run([str(_HELPER), "predict_classical", embedding_npy, ckpt_path, output_npy])  # raises RuntimeError on a non-zero exit
+
+
+def predict_ft_in_subprocess(model_id: str, ckpt_dir: str, output_npy: str):  # helper "predict" command, child process
+    _subprocess_run([str(_HELPER), "predict", model_id, ckpt_dir, output_npy])  # raises RuntimeError on a non-zero exit
+
+
+def predict_xlmr_zero_shot(test_df: pd.DataFrame) -> np.ndarray:  # argmax class id per row of test_df["comment"]
+    import torch
+    from transformers import AutoTokenizer, AutoModelForSequenceClassification
+    device = get_device_for_model(XLMR_TOXICITY_MODEL_ID)
+    tokenizer = AutoTokenizer.from_pretrained(XLMR_TOXICITY_MODEL_ID)
+    model = AutoModelForSequenceClassification.from_pretrained(XLMR_TOXICITY_MODEL_ID)
+    model.eval().to(device)
+    texts = test_df["comment"].tolist()
+    all_preds = []
+    for i in range(0, len(texts), 32):  # fixed batches of 32 texts
+        batch = texts[i:i + 32]
+        enc = tokenizer(batch, truncation=True, padding=True, max_length=512, return_tensors="pt").to(device)
+        with torch.no_grad():
+            logits = model(**enc).logits
+        all_preds.extend(torch.argmax(logits, dim=-1).cpu().numpy())  # NOTE(review): assumes class index 1 means harassment — confirm against the model's id2label
+    return np.array(all_preds)
+
+
+def main():
+    """Evaluate every available checkpoint on the test set, print the comparison and A/B tables, and log to NocoDB when configured."""
+    load_dotenv()
+    test_df = pd.read_parquet(DATA_DIR / "test.parquet")
+    y_test = test_df["label"].values
+
+    nocodb = None
+    if all(os.environ.get(k) for k in ["NOCODB_BASE_URL", "NOCODB_TOKEN", "NOCODB_BASE_ID"]):
+        nocodb = NocoDBService(
+            nocodb_url=os.environ["NOCODB_BASE_URL"],
+            token=os.environ["NOCODB_TOKEN"],
+            base_id=os.environ["NOCODB_BASE_ID"],
+        )
+
+    results = []
+    tmp_dir = tempfile.TemporaryDirectory()  # removed explicitly after Phase 3, or by its finalizer if a phase raises
+    tmp = Path(tmp_dir.name)
+
+    encoder_configs = [
+        (JINA_MODEL_ID, True),
+        (JINA_SMALL_MODEL_ID, True),
+        (BIDIR_MODEL_ID, False),
+        (ARCTIC_EMBED_MODEL_ID, False),
+    ]
+
+    # ── Phase 1: embed test set — each model in isolated subprocess ────────
+    # Prevents OpenMP conflict: jina-nano (EuroBERT/libomp) vs LightGBM (libgomp)
+    print("── Phase 1: computing test embeddings ──")
+    for model_id, _ in encoder_configs:
+        npy_path = str(tmp / f"X_{model_slug(model_id)}.npy")
+        embed_in_subprocess(model_id, npy_path)
+
+    # ── Phase 2: classical ML predictions — each in isolated subprocess ──────
+    # LightGBM (libgomp) conflicts with leaked OpenMP state from Phase 1 subprocesses
+    print("\n── Phase 2: classical ML predictions ──")
+    for model_id, _ in encoder_configs:
+        slug = model_slug(model_id)
+        embedding_npy = str(tmp / f"X_{slug}.npy")
+
+        for clf_name in ["logreg", "lightgbm", "mlp"]:
+            for dataset in ["real", "augmented", "augmented_v2"]:
+                run_name = f"{slug}+{clf_name}+{dataset}"
+                ckpt = CHECKPOINTS_DIR / f"{run_name}.joblib"
+                if not ckpt.exists():
+                    print(f"  [skip] {ckpt.name}")
+                    continue
+                pred_npy = str(tmp / f"pred_{run_name}.npy")
+                predict_classical_in_subprocess(embedding_npy, str(ckpt), pred_npy)
+                y_pred = np.load(pred_npy)
+                m = metrics(y_test, y_pred)
+                results.append({"run": run_name, "approach": f"frozen+{clf_name}", "model": slug, "dataset": dataset, "model_type": MODEL_TYPE.get(model_id, "encoder embedding"), **m})
+                print(f"  {run_name:<55} F1={m['f1']:.4f}")
+
+    # ── Phase 3: fine-tuned model predictions (each in isolated subprocess) ──
+    print("\n── Phase 3: fine-tuned predictions ──")
+    for model_id, _ in encoder_configs:
+        slug = model_slug(model_id)
+        for strategy in ["head_only", "full"]:
+            for dataset in ["real", "augmented", "augmented_v2"]:
+                run_name = f"{slug}-finetuned-{strategy}-{dataset}"
+                ckpt_dir = CHECKPOINTS_DIR / run_name
+                if not ckpt_dir.exists():
+                    print(f"  [skip] {run_name}")
+                    continue
+                npy_path = str(tmp / f"pred_{run_name}.npy")
+                print(f"  Predicting {run_name}...")
+                predict_ft_in_subprocess(model_id, str(ckpt_dir), npy_path)
+                y_pred = np.load(npy_path)
+                if len(y_pred) == 1 and y_pred[0] == -1:
+                    print(f"  [skip] no best_model.pt in {run_name}")
+                    continue
+                m = metrics(y_test, y_pred)
+                results.append({"run": run_name, "approach": f"finetune-{strategy}", "model": slug, "dataset": dataset, "model_type": MODEL_TYPE.get(model_id, "encoder embedding"), **m})
+                print(f"  {run_name:<55} F1={m['f1']:.4f}")
+    tmp_dir.cleanup()  # the embedding/prediction files are not read past this point
+
+    # ── XLM-R zero-shot (no LightGBM conflict — transformers only) ────────
+    print("\nRunning XLM-R toxicity zero-shot...")
+    xlmr_slug = model_slug(XLMR_TOXICITY_MODEL_ID)
+    y_pred = predict_xlmr_zero_shot(test_df)
+    m = metrics(y_test, y_pred)
+    results.append({"run": f"{xlmr_slug}-zero-shot", "approach": "zero-shot", "model": xlmr_slug, "dataset": "real", "model_type": "encoder classifier", **m})
+    print(f"  XLM-R zero-shot  F1={m['f1']:.4f}")
+
+    # ── LLM baseline ──────────────────────────────────────────────────────
+    baseline = load_llm_baseline(DATA_DIR)
+    if baseline:
+        run_name = baseline.pop("run_name")
+        results.append({"run": run_name, "model_type": "generative", **baseline})
+
+    if not results:
+        print("No results found. Run the training scripts first.")
+        return
+
+    df = pd.DataFrame(results)
+    df = df.sort_values(["approach", "model", "dataset"])
+
+    print("\n" + "="*90)
+    print(f"{'Run':<55} {'Dataset':<12} {'F1':>6} {'P':>6} {'R':>6} {'Acc':>6}")
+    print("="*90)
+    for _, row in df.iterrows():
+        print(f"{row['run']:<55} {row['dataset']:<12} {row['f1']:>6.4f} {row['precision']:>6.4f} {row['recall']:>6.4f} {row['accuracy']:>6.4f}")
+
+    print("\n── A/B delta (augmented − real F1) ─────────────")
+    for (approach, model), group in df.groupby(["approach", "model"]):
+        real_row = group[group["dataset"] == "real"]
+        aug_row = group[group["dataset"] == "augmented"]
+        if real_row.empty or aug_row.empty:
+            continue
+        delta = aug_row["f1"].values[0] - real_row["f1"].values[0]
+        print(f"  {approach:<25} {model:<35} {delta:+.4f}")
+
+    if nocodb:
+        for _, row in df.iterrows():
+            data = {
+                "model_name": row["run"],
+                "table_id": ANNOTATION_TABLE_ID,
+                "table_name": f"finetune/{row.get('dataset','?')}",
+                "f1": row["f1"],
+                "precision": row["precision"],
+                "recall": row["recall"],
+                "accuracy": row["accuracy"],
+                "total_samples": len(y_test),
+                "positive_samples": int(y_test.sum()),
+                "negative_samples": len(y_test) - int(y_test.sum()),
+                "prompt": f"approach={row.get('approach','?')} dataset={row.get('dataset','?')}",
+                "model_type": row.get("model_type", "encoder embedding"),
+            }
+            try:
+                nocodb.create_record(EVAL_TABLE_ID, data)
+            except Exception as e:
+                print(f"  [warn] NocoDB: {e}")
+        print("\nAll results logged to NocoDB.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/backend/balanceteshaters/scripts/ml/05_claude_annotate.py b/backend/balanceteshaters/scripts/ml/05_claude_annotate.py
new file mode 100644
index 00000000..c77ce713
--- /dev/null
+++ b/backend/balanceteshaters/scripts/ml/05_claude_annotate.py
@@ -0,0 +1,286 @@
+# ruff: noqa: E402
+"""
+Auto-annotate unannotated comments using Claude API, with agreement evaluation.
+
+Modes:
+  evaluate  — Run Claude on a subsample of manually-annotated rows, measure agreement
+              against human labels to build confidence before using for training.
+  annotate  — Fetch unannotated rows, classify with Claude, save to parquet.
+
+Usage:
+  python 05_claude_annotate.py --mode evaluate --n 100
+  python 05_claude_annotate.py --mode annotate --limit 500 --dry-run
+  python 05_claude_annotate.py --mode annotate --limit 500
+"""
+import argparse
+import json
+import os
+import random
+import re
+import sys
+from pathlib import Path
+
+import anthropic
+import pandas as pd
+from dotenv import load_dotenv
+
+SCRIPTS_DIR = Path(__file__).resolve().parent.parent.parent.parent
+if str(SCRIPTS_DIR) not in sys.path:
+    sys.path.insert(0, str(SCRIPTS_DIR))
+
+from balanceteshaters.scripts.ml.config import ANNOTATION_TABLE_ID, DATA_DIR
+from balanceteshaters.services.annotation import AnnotatedCategory, AnnotationService
+from balanceteshaters.services.nocodb import NocoDBService
+
+# Sonnet pricing per million tokens
+SONNET_INPUT_PRICE = 3.00
+SONNET_OUTPUT_PRICE = 15.00
+SONNET_CACHE_WRITE_PRICE = 3.75
+SONNET_CACHE_READ_PRICE = 0.30
+
+CATEGORIES = [c.value for c in AnnotatedCategory]
+HARASSMENT_CATEGORIES = [c for c in CATEGORIES if c != "Absence de cyberharcèlement"]
+
+SYSTEM_PROMPT = f"""Tu es un expert en modération de contenu et en sécurité en ligne. Tu analyses des commentaires de réseaux sociaux français (Instagram, TikTok, YouTube, Twitter) pour détecter le cyberharcèlement.
+
+Pour chaque commentaire, tu dois choisir la catégorie la plus appropriée parmi :
+{chr(10).join(f'- {c}' for c in CATEGORIES)}
+
+Définitions :
+- Absence de cyberharcèlement : commentaire anodin, critique constructive, humour bienveillant, opinion sans agressivité
+- Cyberharcèlement (définition générale) : harcèlement en ligne ne rentrant pas dans une catégorie spécifique
+- Cyberharcèlement (autre) : forme de harcèlement non couverte par les autres catégories
+- Cyberharcèlement à caractère sexuel : harcèlement sexuel, slut-shaming, commentaires dégradants sur le corps/sexualité
+- Menaces : menaces directes ou voilées de violence physique, intimidation
+- Incitation au suicide : encouragement à se suicider ou se blesser
+- Injure : insulte directe, terme offensant, dénigrement
+- Diffamation : fausses accusations destinées à nuire à la réputation
+- Injure et diffamation publique : combinaison d'injure et diffamation
+- Doxxing : publication ou menace de publication d'informations privées
+- Incitation à la haine : appel à la haine envers un groupe (racisme, homophobie, etc.)
+- Suspect : commentaire ambigu nécessitant une vérification humaine
+
+Points importants :
+- L'ironie, le sarcasme et les emojis péjoratifs peuvent constituer du cyberharcèlement même sans insulte directe
+- Un commentaire qui semble superficiellement bénin peut être du harcèlement selon le contexte
+- Si tu n'es pas certain, utilise "Suspect"
+
+Réponds UNIQUEMENT avec un objet JSON valide, sans texte avant ou après :
+{{"category": "<catégorie exacte>", "binary_label": <0 ou 1>, "confidence": "<high ou low>", "reasoning": "<explication courte en français>"}}
+
+binary_label : 0 = Absence de cyberharcèlement, 1 = toute forme de cyberharcèlement"""
+
+
+def classify_comment(client: anthropic.Anthropic, comment: str, tokens_used: dict) -> dict | None:
+    """Classify one comment with Claude; return the parsed JSON dict, or None if the call or parsing fails."""
+    try:
+        response = client.messages.create(
+            model="claude-sonnet-4-6",
+            max_tokens=200,
+            system=[{"type": "text", "text": SYSTEM_PROMPT, "cache_control": {"type": "ephemeral"}}],
+            messages=[{"role": "user", "content": f"Commentaire : {comment}"}],
+        )
+        tokens_used["input"] += response.usage.input_tokens
+        tokens_used["output"] += response.usage.output_tokens
+        # The cache counters are optional and may be None; `+= None` would raise and discard a paid-for result
+        tokens_used["cache_read"] += getattr(response.usage, "cache_read_input_tokens", None) or 0
+        tokens_used["cache_write"] += getattr(response.usage, "cache_creation_input_tokens", None) or 0
+
+        text = response.content[0].text.strip()
+        # Strip markdown code blocks if present
+        text = re.sub(r"^```(?:json)?\s*|\s*```$", "", text, flags=re.MULTILINE).strip()
+        result = json.loads(text)
+
+        # Validate category
+        if result.get("category") not in CATEGORIES:
+            result["category"] = "Suspect"
+            result["binary_label"] = 1
+        return result
+
+    except Exception as e:
+        print(f"  [warn] Classification failed: {e}")
+        return None
+
+
+def compute_cost(tokens_used: dict) -> float:
+    """Return the dollar cost of the recorded token counts at the Sonnet per-million-token prices."""
+    micro_dollars = tokens_used["input"] * SONNET_INPUT_PRICE
+    micro_dollars += tokens_used["output"] * SONNET_OUTPUT_PRICE
+    micro_dollars += tokens_used.get("cache_write", 0) * SONNET_CACHE_WRITE_PRICE
+    micro_dollars += tokens_used.get("cache_read", 0) * SONNET_CACHE_READ_PRICE
+    return micro_dollars / 1_000_000
+
+
+def mode_evaluate(service: AnnotationService, client: anthropic.Anthropic, n: int):
+    """Sample n manually-annotated rows, run Claude blind, measure agreement."""
+    import sklearn.metrics
+    print("Fetching annotated records...")
+    all_annotated = service.fetch_records_paginated()
+    all_annotated = [a for a in all_annotated if a.annotated_category]
+    print(f"  Found {len(all_annotated)} annotated records")
+
+    sample = random.sample(all_annotated, min(n, len(all_annotated)))
+    print(f"  Evaluating on {len(sample)} randomly sampled records\n")
+
+    tokens_used = {"input": 0, "output": 0, "cache_read": 0, "cache_write": 0}
+    rows = []
+
+    for i, ann in enumerate(sample):
+        result = classify_comment(client, ann.comment, tokens_used)
+        if result is None:
+            continue
+
+        # Human binary label
+        cats = [c.value for c in ann.annotated_category]
+        human_binary = 0 if any("Absence de cyberharcèlement" in c for c in cats) else 1
+        claude_binary = result.get("binary_label", 1)
+
+        rows.append({
+            "id": ann.id,
+            "comment": ann.comment[:120],
+            "human_category": cats[0] if cats else "?",
+            "human_binary": human_binary,
+            "claude_category": result.get("category"),
+            "claude_binary": claude_binary,
+            "claude_confidence": result.get("confidence"),
+            "claude_reasoning": result.get("reasoning", ""),
+            "agree": human_binary == claude_binary,
+        })
+
+        if (i + 1) % 10 == 0:
+            print(f"  {i+1}/{len(sample)}  cost so far: ${compute_cost(tokens_used):.3f}")
+
+    if not rows:  # nothing sampled, or every classification failed: the frame below would have no columns
+        print("ERROR: no comment could be classified; nothing to evaluate.")
+        return
+    df = pd.DataFrame(rows)
+    y_true = df["human_binary"].values
+    y_pred = df["claude_binary"].values
+
+    acc = sklearn.metrics.accuracy_score(y_true, y_pred)
+    f1 = sklearn.metrics.f1_score(y_true, y_pred, zero_division=0)
+    prec = sklearn.metrics.precision_score(y_true, y_pred, zero_division=0)
+    rec = sklearn.metrics.recall_score(y_true, y_pred, zero_division=0)
+    kappa = sklearn.metrics.cohen_kappa_score(y_true, y_pred)
+
+    print(f"\n{'='*60}")
+    print(f"Agreement metrics (Claude vs human, n={len(df)})")
+    print(f"{'='*60}")
+    print(f"  Accuracy : {acc:.4f}")
+    print(f"  F1       : {f1:.4f}  (P={prec:.4f}  R={rec:.4f})")
+    print(f"  Kappa    : {kappa:.4f}  {'(substantial)' if kappa > 0.6 else '(moderate)' if kappa > 0.4 else '(fair)'}")
+
+    print("\nConfusion matrix (rows=human, cols=claude):")
+    cm = sklearn.metrics.confusion_matrix(y_true, y_pred, labels=[0, 1])  # force 2x2 even if one class is absent
+    print("              Claude=0  Claude=1")
+    print(f"  Human=0       {cm[0,0]:5}     {cm[0,1]:5}")
+    print(f"  Human=1       {cm[1,0]:5}     {cm[1,1]:5}")
+
+    disagree_df = df[~df["agree"]].head(20)
+    if not disagree_df.empty:
+        print(f"\nDisagreements (first {len(disagree_df)}):")
+        print(f"{'Comment':<60} {'Human':>6} {'Claude':>6} {'Conf':<6} Reasoning")
+        print("-" * 120)
+        for _, row in disagree_df.iterrows():
+            # str(): confidence / reasoning may be missing (None), which cannot take a width spec or a slice
+            print(f"{row['comment'][:58]:<60} {row['human_binary']:>6} {row['claude_binary']:>6} {str(row['claude_confidence']):<6} {str(row['claude_reasoning'])[:60]}")
+
+    print(f"\nAPI cost: ${compute_cost(tokens_used):.4f}  (tokens in={tokens_used['input']}, out={tokens_used['output']}, cache_read={tokens_used['cache_read']})")
+
+    out_path = DATA_DIR / "claude_evaluate_agreement.parquet"
+    df.to_parquet(out_path, index=False)
+    print(f"Full results saved to {out_path}")
+
+
+def mode_annotate(service: AnnotationService, client: anthropic.Anthropic, limit: int, dry_run: bool):
+    """Fetch unannotated rows, classify with Claude, save to parquet."""
+    print("Fetching all records...")
+    all_records = service.fetch_records_paginated()
+    unannotated = [a for a in all_records if not a.annotated_category]
+    print(f"  Total records: {len(all_records)}")
+    print(f"  Unannotated: {len(unannotated)}")
+
+    to_annotate = unannotated[:limit]
+    print(f"  Will annotate: {len(to_annotate)}")
+
+    # Cost estimate: ~400 tokens system (cached after first) + ~20 tokens per comment
+    n_calls = len(to_annotate)
+    est_input = 400 + 20 * n_calls  # first call full, rest cache hits
+    est_output = 60 * n_calls
+    est_cost = (est_input * SONNET_INPUT_PRICE + est_output * SONNET_OUTPUT_PRICE + 400 * SONNET_CACHE_WRITE_PRICE) / 1_000_000
+    print(f"  Estimated cost: ~${est_cost:.3f}")
+
+    if dry_run:
+        print("\n[dry-run] No API calls made.")
+        return
+
+    tokens_used = {"input": 0, "output": 0, "cache_read": 0, "cache_write": 0}
+    rows = []
+
+    for i, ann in enumerate(to_annotate):
+        result = classify_comment(client, ann.comment, tokens_used)
+        if result is None:
+            continue
+
+        rows.append({
+            "id": ann.id,
+            "comment": ann.comment,
+            "claude_category": result.get("category"),
+            "claude_binary_label": result.get("binary_label", 1),
+            "claude_confidence": result.get("confidence"),
+            "claude_reasoning": result.get("reasoning", ""),
+            "label": result.get("binary_label", 1),
+            "annotated_category": result.get("category"),
+            "binary_confidence": None,
+            "source": "claude_annotated",
+        })
+
+        if (i + 1) % 25 == 0:
+            print(f"  {i+1}/{len(to_annotate)}  cost: ${compute_cost(tokens_used):.3f}")
+
+    df = pd.DataFrame(rows)
+    out_path = DATA_DIR / "claude_annotated.parquet"
+    df.to_parquet(out_path, index=False)
+
+    total_cost = compute_cost(tokens_used)
+    label_dist = df["label"].value_counts().to_dict()
+    print("\n=== Done ===")
+    print(f"  Annotated: {len(df)} comments")
+    print(f"  Label distribution: {label_dist}")
+    print(f"  API cost: ${total_cost:.4f}")
+    print(f"  Saved to {out_path}")
+    print(f"\nNext: review {out_path.name}, then run 00_prepare_dataset.py to rebuild train splits.")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Claude-based annotation and agreement evaluation")
+    parser.add_argument("--mode", choices=["evaluate", "annotate"], required=True)
+    parser.add_argument("--n", type=int, default=100, help="[evaluate] Number of annotated rows to sample")
+    parser.add_argument("--limit", type=int, default=500, help="[annotate] Max unannotated rows to process")
+    parser.add_argument("--dry-run", action="store_true", help="[annotate] Show cost estimate only")
+    parser.add_argument("--seed", type=int, default=42)
+    args = parser.parse_args()
+
+    random.seed(args.seed)
+    load_dotenv()
+
+    api_key = os.environ.get("ANTHROPIC_API_KEY")
+    if not api_key:
+        print("ERROR: ANTHROPIC_API_KEY not set.")
+        sys.exit(1)
+
+    nocodb = NocoDBService(
+        nocodb_url=os.environ["NOCODB_BASE_URL"],
+        token=os.environ["NOCODB_TOKEN"],
+        base_id=os.environ["NOCODB_BASE_ID"],
+    )
+    service = AnnotationService(nocodb=nocodb, annotation_table_id=ANNOTATION_TABLE_ID)
+    client = anthropic.Anthropic(api_key=api_key)
+
+    if args.mode == "evaluate":
+        mode_evaluate(service, client, args.n)
+    else:
+        mode_annotate(service, client, args.limit, args.dry_run)
+
+
+if __name__ == "__main__":
+    main()

From eadb5e33127801bb34c33240a718737a628a6411 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gr=C3=A9goire=20Co?= <gregoire.corbiere@gmail.com>
Date: Wed, 22 Apr 2026 17:17:59 +0200
Subject: [PATCH 6/8] add utils for model handling

---
 .../scripts/ml/_threshold_sweep.py            | 110 ++++++++++++++++++
 backend/balanceteshaters/scripts/ml/config.py |  69 +++++++++++
 backend/balanceteshaters/scripts/ml/models.py |  34 ++++++
 3 files changed, 213 insertions(+)
 create mode 100644 backend/balanceteshaters/scripts/ml/_threshold_sweep.py
 create mode 100644 backend/balanceteshaters/scripts/ml/config.py
 create mode 100644 backend/balanceteshaters/scripts/ml/models.py

diff --git a/backend/balanceteshaters/scripts/ml/_threshold_sweep.py b/backend/balanceteshaters/scripts/ml/_threshold_sweep.py
new file mode 100644
index 00000000..941dd9c8
--- /dev/null
+++ b/backend/balanceteshaters/scripts/ml/_threshold_sweep.py
@@ -0,0 +1,110 @@
+# ruff: noqa: E402
+import os
+import subprocess
+import sys
+import tempfile
+os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'
+import numpy as np
+import pandas as pd
+import sklearn.metrics
+from dotenv import load_dotenv
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent.parent))
+load_dotenv()
+
+from balanceteshaters.services.nocodb import NocoDBService
+from balanceteshaters.scripts.ml.config import ANNOTATION_TABLE_ID, EVAL_TABLE_ID, CHECKPOINTS_DIR, DATA_DIR
+
+tmp = Path(tempfile.mkdtemp())  # scratch directory for the .npy files exchanged with the helper script
+emb_path = str(tmp / 'X_arctic.npy')
+helper = str(Path(__file__).parent / '_eval_subprocess.py')
+MODEL_ID = 'Snowflake/snowflake-arctic-embed-l-v2.0'
+# Run the helper's 'embed' command in a separate Python process; abort the sweep if it fails.
+print('Embedding test set...')
+r = subprocess.run([sys.executable, helper, 'embed', MODEL_ID, emb_path], capture_output=True, text=True)
+if r.returncode != 0:
+    print(r.stderr[-500:])  # only the tail of the helper's stderr
+    sys.exit(1)
+# Load the embeddings file (presumably written by the helper) and the test labels scored below.
+X_test = np.load(emb_path)
+test_df = pd.read_parquet(DATA_DIR / 'test.parquet')
+y_true = test_df['label'].values
+# Thresholds tried by best_threshold(); a run whose F1 exceeds BEST_SO_FAR is flagged in the report.
+THRESHOLDS = [0.50, 0.45, 0.40, 0.35, 0.30, 0.25, 0.20]
+BEST_SO_FAR = 0.741
+# NocoDB client used by log().
+nocodb = NocoDBService(os.environ['NOCODB_BASE_URL'], os.environ['NOCODB_TOKEN'], os.environ['NOCODB_BASE_ID'])
+
+
+def best_threshold(proba):
+    best_t, best_f1, best_m = 0.5, 0.0, None
+    for t in THRESHOLDS:
+        y_pred = (proba >= t).astype(int)
+        f1 = sklearn.metrics.f1_score(y_true, y_pred, zero_division=0)
+        if f1 > best_f1:
+            best_f1 = f1
+            best_t = t
+            best_m = {
+                'f1':        f1,
+                'precision': sklearn.metrics.precision_score(y_true, y_pred, zero_division=0),
+                'recall':    sklearn.metrics.recall_score(y_true, y_pred, zero_division=0),
+                'accuracy':  sklearn.metrics.accuracy_score(y_true, y_pred),
+            }
+    return best_t, best_m
+
+
+def log(run_name, dataset, t, m):
+    nocodb.create_record(EVAL_TABLE_ID, {
+        'model_name': f'{run_name}+threshold={t}',
+        'table_id': ANNOTATION_TABLE_ID,
+        'table_name': f'finetune/{dataset}',
+        'f1': m['f1'], 'precision': m['precision'],
+        'recall': m['recall'], 'accuracy': m['accuracy'],
+        'total_samples': len(y_true),
+        'positive_samples': int(y_true.sum()),
+        'negative_samples': len(y_true) - int(y_true.sum()),
+        'prompt': f'approach=threshold-tuned dataset={dataset} threshold={t}',
+        'model_type': 'encoder embedding',
+    })
+
+
+print(f'\n{"Run":<60} {"BestT":>6} {"F1":>7} {"P":>7} {"R":>7} {"Acc":>7}')
+print('-' * 100)
+# Classical classifiers: score each saved Arctic checkpoint through the helper, then sweep thresholds.
+for clf_name in ['logreg', 'lightgbm', 'mlp']:
+    ckpt = CHECKPOINTS_DIR / f'snowflake-arctic-embed-l-v2.0+{clf_name}+real.joblib'
+    if not ckpt.exists():
+        continue  # no checkpoint on disk for this classifier
+    proba_path = str(tmp / f'proba_{clf_name}.npy')
+    r = subprocess.run(
+        [sys.executable, helper, 'predict_proba_classical', emb_path, str(ckpt), proba_path],
+        capture_output=True, text=True,
+    )
+    if r.returncode != 0:
+        print(f'  [error] {clf_name}: {r.stderr[-200:]}')
+        continue
+    proba = np.load(proba_path)
+    t, m = best_threshold(proba)
+    run = f'snowflake-arctic-embed-l-v2.0+{clf_name}+real'
+    flag = '  *** BEATS BEST' if m['f1'] > BEST_SO_FAR else ''
+    print(f'  {run:<58} {t:>6.2f} {m["f1"]:>7.4f} {m["precision"]:>7.4f} {m["recall"]:>7.4f} {m["accuracy"]:>7.4f}{flag}')
+    if t != 0.50:  # only non-default thresholds are logged
+        log(run, 'real', t, m)
+
+# Fine-tuned Arctic model (head_only strategy, 'real' dataset): same sweep via the helper's 'predict_proba' command.
+ft_dir = str(CHECKPOINTS_DIR / 'snowflake-arctic-embed-l-v2.0-finetuned-head_only-real')
+proba_path = str(tmp / 'proba_ft_real.npy')
+r = subprocess.run([sys.executable, helper, 'predict_proba', MODEL_ID, ft_dir, proba_path], capture_output=True, text=True)
+if r.returncode == 0:
+    proba = np.load(proba_path)
+    t, m = best_threshold(proba)
+    run = 'snowflake-arctic-embed-l-v2.0-finetuned-head_only-real'
+    flag = '  *** BEATS BEST' if m['f1'] > BEST_SO_FAR else ''
+    print(f'  {run:<58} {t:>6.2f} {m["f1"]:>7.4f} {m["precision"]:>7.4f} {m["recall"]:>7.4f} {m["accuracy"]:>7.4f}{flag}')
+    if t != 0.50:  # only non-default thresholds are logged
+        log(run, 'real', t, m)
+else:
+    print(f'  [error] finetuned: {r.stderr[-300:]}')
+
+print('\nAll non-default thresholds logged to NocoDB.')
diff --git a/backend/balanceteshaters/scripts/ml/config.py b/backend/balanceteshaters/scripts/ml/config.py
new file mode 100644
index 00000000..0c56536e
--- /dev/null
+++ b/backend/balanceteshaters/scripts/ml/config.py
@@ -0,0 +1,69 @@
+from pathlib import Path
+import torch
+
+JINA_MODEL_ID = "jinaai/jina-embeddings-v5-text-nano"    # 239M, EuroBERT, CPU-only (MPS segfault)
+JINA_SMALL_MODEL_ID = "jinaai/jina-embeddings-v5-text-small"  # 677M, Qwen3-based, MPS-safe
+# Decoder-only, 270M, 640-dim, 94-language, no trust_remote_code needed
+BIDIR_MODEL_ID = "microsoft/harrier-oss-v1-270m"
+# Encoder-only, 568M, 1024-dim, bge-m3-retromae base, MRL, no trust_remote_code
+ARCTIC_EMBED_MODEL_ID = "Snowflake/snowflake-arctic-embed-l-v2.0"
+# 600M XLM-R fine-tuned on multilingual toxicity (15 langs incl. French); labels: 0=neutral, 1=toxic
+XLMR_TOXICITY_MODEL_ID = "textdetox/xlmr-large-toxicity-classifier-v2"
+MODELS = [JINA_MODEL_ID, JINA_SMALL_MODEL_ID, BIDIR_MODEL_ID, ARCTIC_EMBED_MODEL_ID, XLMR_TOXICITY_MODEL_ID]
+
+MODEL_TYPE = {
+    JINA_MODEL_ID:          "encoder embedding",
+    JINA_SMALL_MODEL_ID:    "encoder embedding",
+    BIDIR_MODEL_ID:         "encoder embedding",
+    ARCTIC_EMBED_MODEL_ID:  "encoder embedding",
+    XLMR_TOXICITY_MODEL_ID: "encoder classifier",
+}
+
+SCRIPTS_DIR = Path(__file__).resolve().parent
+BACKEND_DIR = SCRIPTS_DIR.parent.parent.parent
+DATA_DIR = BACKEND_DIR / "balanceteshaters" / "data" / "finetune"
+CHECKPOINTS_DIR = DATA_DIR / "checkpoints"
+
+ANNOTATION_TABLE_ID = "m5t7qqaer2oa441"
+EVAL_TABLE_ID = "m0ww7qnx69u9r1a"
+
+LABEL_MAP = {
+    "Absence de cyberharcèlement": 0,
+}
+
+MINORITY_CATEGORIES = [
+    "Doxxing",
+    "Incitation au suicide",
+    "Cyberharcèlement à caractère sexuel",
+    "Menaces",
+    "Incitation à la haine",
+]
+
+
+def get_device() -> str:
+    if torch.backends.mps.is_available():
+        return "mps"
+    if torch.cuda.is_available():
+        return "cuda"
+    return "cpu"
+
+
+def get_device_for_model(model_id: str) -> str:
+    # jina-v5-text-nano uses EuroBERT which segfaults on MPS — force CPU
+    # jina-v5-text-small uses Qwen3 and is MPS-safe
+    if model_id == JINA_MODEL_ID:
+        return "cpu"
+    return get_device()
+
+
+def compute_binary_label(annotated_categories: list[str] | None) -> int | None:
+    if not annotated_categories:
+        return None
+    for cat in annotated_categories:
+        if "Absence de cyberharcèlement" in cat:
+            return 0
+    return 1
+
+
+def model_slug(model_id: str) -> str:
+    return model_id.split("/")[-1]
diff --git a/backend/balanceteshaters/scripts/ml/models.py b/backend/balanceteshaters/scripts/ml/models.py
new file mode 100644
index 00000000..2e884fb9
--- /dev/null
+++ b/backend/balanceteshaters/scripts/ml/models.py
@@ -0,0 +1,34 @@
+"""Shared model definitions for the ML fine-tuning scripts."""
+import torch
+import torch.nn as nn
+
+
+class EmbeddingClassifier(nn.Module):
+    """Wraps a SentenceTransformer encoder with a linear classification head."""
+
+    def __init__(self, encoder, embed_dim: int, num_labels: int = 2, task: str | None = None, trainable_encoder: bool = False):
+        super().__init__()
+        self.encoder = encoder
+        self.classifier = nn.Linear(embed_dim, num_labels)
+        self.task = task
+        self.trainable_encoder = trainable_encoder
+
+    def forward(self, texts: list[str], labels: torch.Tensor | None = None):
+        if self.trainable_encoder:
+            # SentenceTransformer.forward() preserves the computation graph; encode() does not
+            features = self.encoder.tokenize(texts)
+            features = {k: v.to(self.classifier.weight.device) for k, v in features.items()}
+            embeddings = self.encoder(features)["sentence_embedding"]
+        else:
+            encode_kwargs: dict = {"convert_to_numpy": True, "show_progress_bar": False}
+            if self.task:
+                encode_kwargs["task"] = self.task
+            embeddings = torch.tensor(
+                self.encoder.encode(texts, **encode_kwargs),
+                dtype=torch.float32,
+            ).to(self.classifier.weight.device)
+        logits = self.classifier(embeddings)
+        if labels is not None:
+            loss = nn.CrossEntropyLoss()(logits, labels)
+            return loss, logits
+        return logits

From 1f1eee3a59deef0df65ca2afd57e910ca31a05f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gr=C3=A9goire=20Co?= <gregoire.corbiere@gmail.com>
Date: Wed, 22 Apr 2026 17:20:45 +0200
Subject: [PATCH 7/8] Add finetuning and init

---
 .../scripts/ml/03_finetune_embedding.py       | 231 ++++++++++++++++++
 .../balanceteshaters/scripts/ml/__init__.py   |   0
 2 files changed, 231 insertions(+)
 create mode 100644 backend/balanceteshaters/scripts/ml/03_finetune_embedding.py
 create mode 100644 backend/balanceteshaters/scripts/ml/__init__.py

diff --git a/backend/balanceteshaters/scripts/ml/03_finetune_embedding.py b/backend/balanceteshaters/scripts/ml/03_finetune_embedding.py
new file mode 100644
index 00000000..0c0ac249
--- /dev/null
+++ b/backend/balanceteshaters/scripts/ml/03_finetune_embedding.py
@@ -0,0 +1,231 @@
+# ruff: noqa: E402
+"""
+Fine-tune embedding models for binary harassment classification.
+
+Usage:
+  python 03_finetune_embedding.py --model bidir --strategy full --dataset real
+  python 03_finetune_embedding.py --model jina  --strategy head_only --dataset augmented
+"""
+import os
+os.environ.setdefault("PYTORCH_MPS_HIGH_WATERMARK_RATIO", "0.0")
+
+import argparse
+import sys
+import time
+from pathlib import Path
+
+import pandas as pd
+import sklearn.metrics
+import torch
+from dotenv import load_dotenv
+from sentence_transformers import SentenceTransformer
+from torch.utils.data import DataLoader, Dataset
+
+SCRIPTS_DIR = Path(__file__).resolve().parent.parent.parent.parent
+if str(SCRIPTS_DIR) not in sys.path:
+    sys.path.insert(0, str(SCRIPTS_DIR))
+
+from balanceteshaters.scripts.ml.config import (
+    ANNOTATION_TABLE_ID,
+    ARCTIC_EMBED_MODEL_ID,
+    BIDIR_MODEL_ID,
+    CHECKPOINTS_DIR,
+    DATA_DIR,
+    EVAL_TABLE_ID,
+    JINA_MODEL_ID,
+    JINA_SMALL_MODEL_ID,
+    get_device_for_model,
+    model_slug,
+)
+from balanceteshaters.scripts.ml.models import EmbeddingClassifier
+from balanceteshaters.services.nocodb import NocoDBService
+
+
+class TextDataset(Dataset):
+    def __init__(self, df: pd.DataFrame):
+        self.texts = df["comment"].tolist()
+        self.labels = torch.tensor(df["label"].values, dtype=torch.long)
+
+    def __len__(self):
+        return len(self.labels)
+
+    def __getitem__(self, idx):
+        return self.texts[idx], self.labels[idx]
+
+
+def log_to_nocodb(nocodb, run_name: str, dataset: str, strategy: str, metrics: dict, n_total: int, n_pos: int):
+    if nocodb is None:
+        return
+    data = {
+        "model_name": run_name,
+        "table_id": ANNOTATION_TABLE_ID,
+        "table_name": f"finetune/{dataset}",
+        "f1": metrics["f1"],
+        "precision": metrics["precision"],
+        "recall": metrics["recall"],
+        "accuracy": metrics["accuracy"],
+        "total_samples": n_total,
+        "positive_samples": n_pos,
+        "negative_samples": n_total - n_pos,
+        "prompt": f"strategy={strategy} dataset={dataset}",
+        "model_type": "encoder embedding",
+    }
+    try:
+        nocodb.create_record(EVAL_TABLE_ID, data)
+    except Exception as e:
+        print(f"  [warn] NocoDB logging failed: {e}")
+
+
+def run_finetune(model_id: str, is_jina: bool, strategy: str, dataset: str, nocodb, device: str):
+    slug = model_slug(model_id)
+    run_name = f"{slug}-finetuned-{strategy}-{dataset}"
+    ckpt_dir = CHECKPOINTS_DIR / run_name
+    ckpt_dir.mkdir(parents=True, exist_ok=True)
+
+    print(f"\n{'='*60}")
+    print(f"Fine-tuning {model_id}  strategy={strategy}  dataset={dataset}")
+
+    load_kwargs = {"device": device}
+    if is_jina:
+        load_kwargs["trust_remote_code"] = True
+    encoder = SentenceTransformer(model_id, **load_kwargs)
+
+    probe_kwargs = {"convert_to_numpy": True, "show_progress_bar": False}
+    if is_jina:
+        probe_kwargs["task"] = "classification"
+    embed_dim = encoder.encode(["probe"], **probe_kwargs).shape[1]
+
+    task = "classification" if is_jina else None
+    clf_model = EmbeddingClassifier(encoder, embed_dim, task=task, trainable_encoder=(strategy == "full")).to(device)
+
+    if strategy == "head_only":
+        for param in clf_model.encoder.parameters():
+            param.requires_grad = False
+        lr = 1e-3
+        batch_size = 32
+        max_epochs = 50
+        patience = 15
+    else:
+        lr = 2e-5
+        batch_size = 16
+        max_epochs = 15
+        patience = 5
+
+    train_df = pd.read_parquet(DATA_DIR / f"train_{dataset}.parquet")
+    val_df = pd.read_parquet(DATA_DIR / "val.parquet")
+    test_df = pd.read_parquet(DATA_DIR / "test.parquet")
+
+    train_loader = DataLoader(TextDataset(train_df), batch_size=batch_size, shuffle=True)
+    val_loader = DataLoader(TextDataset(val_df), batch_size=32, shuffle=False)
+
+    optimizer = torch.optim.AdamW(
+        filter(lambda p: p.requires_grad, clf_model.parameters()), lr=lr, weight_decay=0.01
+    )
+
+    best_f1 = 0.0
+    no_improve = 0
+    best_state = None
+    t0 = time.time()
+
+    for epoch in range(1, max_epochs + 1):
+        clf_model.train()
+        total_loss = 0.0
+        for texts, labels in train_loader:
+            labels = labels.to(device)
+            optimizer.zero_grad()
+            loss, _ = clf_model(texts, labels)
+            loss.backward()
+            optimizer.step()
+            total_loss += loss.item()
+
+        clf_model.eval()
+        all_preds, all_labels = [], []
+        with torch.no_grad():
+            for texts, labels in val_loader:
+                logits = clf_model(texts)
+                preds = torch.argmax(logits, dim=-1).cpu().numpy()
+                all_preds.extend(preds)
+                all_labels.extend(labels.numpy())
+
+        val_f1 = sklearn.metrics.f1_score(all_labels, all_preds, zero_division=0)
+        avg_loss = total_loss / len(train_loader)
+
+        if val_f1 > best_f1:
+            best_f1 = val_f1
+            best_state = {k: v.cpu().clone() for k, v in clf_model.state_dict().items()}
+            torch.save(best_state, ckpt_dir / "best_model.pt")
+            no_improve = 0
+            print(f"  Epoch {epoch}  loss={avg_loss:.4f}  val_f1={val_f1:.4f}  ← best")
+        else:
+            no_improve += 1
+            print(f"  Epoch {epoch}  loss={avg_loss:.4f}  val_f1={val_f1:.4f}  (no improve {no_improve}/{patience})")
+            if no_improve >= patience:
+                print(f"  Early stopping at epoch {epoch}")
+                break
+
+    elapsed = time.time() - t0
+
+    if best_state:
+        clf_model.load_state_dict(best_state)
+
+    clf_model.eval()
+    test_loader = DataLoader(TextDataset(test_df), batch_size=32, shuffle=False)
+    all_preds, all_labels = [], []
+    with torch.no_grad():
+        for texts, labels in test_loader:
+            logits = clf_model(texts)
+            preds = torch.argmax(logits, dim=-1).cpu().numpy()
+            all_preds.extend(preds)
+            all_labels.extend(labels.numpy())
+
+    metrics = {
+        "f1": sklearn.metrics.f1_score(all_labels, all_preds, zero_division=0),
+        "precision": sklearn.metrics.precision_score(all_labels, all_preds, zero_division=0),
+        "recall": sklearn.metrics.recall_score(all_labels, all_preds, zero_division=0),
+        "accuracy": sklearn.metrics.accuracy_score(all_labels, all_preds),
+    }
+    y_test = test_df["label"].values
+
+    print(f"Test  F1={metrics['f1']:.4f}  P={metrics['precision']:.4f}  R={metrics['recall']:.4f}  Acc={metrics['accuracy']:.4f}  ({elapsed:.0f}s)")
+    print(f"  Saved to {ckpt_dir}")
+    log_to_nocodb(nocodb, run_name, dataset, strategy, metrics, len(y_test), int(y_test.sum()))
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Fine-tune embedding model for binary classification")
+    parser.add_argument("--model", choices=["jina", "jina-small", "bidir", "arctic"], required=True)
+    parser.add_argument("--strategy", choices=["head_only", "full"], default="full")
+    parser.add_argument("--dataset", choices=["real", "augmented", "augmented_v2"], default="real")
+    args = parser.parse_args()
+
+    load_dotenv()
+    nocodb = None
+    if all(os.environ.get(k) for k in ["NOCODB_BASE_URL", "NOCODB_TOKEN", "NOCODB_BASE_ID"]):
+        nocodb = NocoDBService(
+            nocodb_url=os.environ["NOCODB_BASE_URL"],
+            token=os.environ["NOCODB_TOKEN"],
+            base_id=os.environ["NOCODB_BASE_ID"],
+        )
+
+    model_map = {
+        "jina": JINA_MODEL_ID,
+        "jina-small": JINA_SMALL_MODEL_ID,
+        "bidir": BIDIR_MODEL_ID,
+        "arctic": ARCTIC_EMBED_MODEL_ID,
+    }
+    model_id = model_map[args.model]
+    is_jina = args.model in ("jina", "jina-small")
+
+    if is_jina and args.strategy == "full":
+        print("Note: jina models use encode() which blocks gradient flow — 'full' is equivalent to 'head_only'. Running head_only.")
+        args.strategy = "head_only"
+
+    device = get_device_for_model(model_id)
+    print(f"Device: {device}")
+
+    CHECKPOINTS_DIR.mkdir(parents=True, exist_ok=True)
+    run_finetune(model_id, is_jina, args.strategy, args.dataset, nocodb, device)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/backend/balanceteshaters/scripts/ml/__init__.py b/backend/balanceteshaters/scripts/ml/__init__.py
new file mode 100644
index 00000000..e69de29b

From 0662baa75c43adb8c954ed7370b8f17e9e41d509 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gr=C3=A9goire=20Co?= <gregoire.corbiere@gmail.com>
Date: Fri, 24 Apr 2026 11:59:01 +0200
Subject: [PATCH 8/8] Add README and other utils

---
 .../scripts/ml/03_finetune_embedding.py       |   4 +-
 backend/balanceteshaters/scripts/ml/README.md | 369 ++++++++++++++++++
 .../scripts/ml/_eval_subprocess.py            | 144 +++++++
 .../scripts/ml/_mlp_arch_search.py            | 151 +++++++
 .../scripts/ml/_threshold_sweep.py            |  73 ++--
 backend/balanceteshaters/scripts/ml/config.py |   2 +-
 .../scripts/ml/dedup_eval_table.py            |  77 ++++
 7 files changed, 789 insertions(+), 31 deletions(-)
 create mode 100644 backend/balanceteshaters/scripts/ml/README.md
 create mode 100644 backend/balanceteshaters/scripts/ml/_eval_subprocess.py
 create mode 100644 backend/balanceteshaters/scripts/ml/_mlp_arch_search.py
 create mode 100644 backend/balanceteshaters/scripts/ml/dedup_eval_table.py

diff --git a/backend/balanceteshaters/scripts/ml/03_finetune_embedding.py b/backend/balanceteshaters/scripts/ml/03_finetune_embedding.py
index 0c0ac249..2bba2856 100644
--- a/backend/balanceteshaters/scripts/ml/03_finetune_embedding.py
+++ b/backend/balanceteshaters/scripts/ml/03_finetune_embedding.py
@@ -3,8 +3,8 @@
 Fine-tune embedding models for binary harassment classification.
 
 Usage:
-  python 03_finetune_embedding.py --model bidir --strategy full --dataset real
-  python 03_finetune_embedding.py --model jina  --strategy head_only --dataset augmented
+  python 03_finetune_embedding.py --model bidir  --strategy full      --dataset real
+  python 03_finetune_embedding.py --model arctic --strategy head_only --dataset augmented_v2
 """
 import os
 os.environ.setdefault("PYTORCH_MPS_HIGH_WATERMARK_RATIO", "0.0")
diff --git a/backend/balanceteshaters/scripts/ml/README.md b/backend/balanceteshaters/scripts/ml/README.md
new file mode 100644
index 00000000..55c0210c
--- /dev/null
+++ b/backend/balanceteshaters/scripts/ml/README.md
@@ -0,0 +1,369 @@
+# Pipeline ML : détection de cyberharcèlement par embeddings
+
+Pipeline complet pour entraîner et évaluer des modèles d'embeddings sur la classification binaire de commentaires français issus des réseaux sociaux (Instagram, TikTok, YouTube, Twitter).
+
+## Vue d'ensemble
+
+Le pipeline est organisé en scripts numérotés à exécuter dans l'ordre :
+
+```
+00_prepare_dataset.py            → construire les splits train/val/test depuis NocoDB
+01_generate_synthetic.py         → augmenter les catégories minoritaires avec Claude
+02_embed_and_train_classical.py  → classifieurs classiques sur embeddings gelés
+03_finetune_embedding.py         → fine-tuning bout-en-bout encodeur + tête
+04_compare_evaluate.py           → comparer tous les runs, logger dans NocoDB
+05_claude_annotate.py            → annoter les données non étiquetées avec Claude
+```
+
+Tous les résultats sont enregistrés dans NocoDB pour le suivi et la comparaison.
+
+---
+
+## Installation (MacBook M4 Pro)
+
+### Prérequis
+
+- **macOS Sequoia** ou version ultérieure (recommandé)
+- **Python 3.12+** — installez via [pyenv](https://github.com/pyenv/pyenv) ou [mise](https://mise.jdx.dev/)
+- **uv** — gestionnaire de packages rapide
+
+```bash
+# Installer uv si pas déjà fait
+curl -LsSf https://astral.sh/uv/install.sh | sh
+```
+
+### Cloner et installer les dépendances
+
+```bash
+git clone <repo-url>
+cd 14_BalanceTesHaters/backend
+uv sync
+```
+
+`uv sync` crée un environnement virtuel dans `.venv/` et installe toutes les dépendances définies dans `pyproject.toml`, y compris PyTorch, sentence-transformers et LightGBM.
+
+### Variables d'environnement
+
+Créez un fichier `.env` à la racine du dossier `backend/` :
+
+```bash
+NOCODB_BASE_URL=https://votre-nocodb.example.com
+NOCODB_TOKEN=votre_token
+NOCODB_BASE_ID=votre_base_id
+ANTHROPIC_API_KEY=votre_clé   # facultatif — requis seulement pour 01_generate_synthetic.py et 05_claude_annotate.py
+```
+
+Les scripts `00`, `02`, `03` et `04` n'utilisent pas l'API Anthropic et fonctionnent sans cette clé. Les scripts appellent `load_dotenv()` automatiquement.
+
+### Notes spécifiques Apple Silicon (M4 Pro)
+
+Le chip M4 Pro dispose d'un GPU unifié (MPS) utilisé par PyTorch. Quelques points importants :
+
+- **jina-v5-text-nano** (`jina`) provoque un **segfault sur MPS** à cause d'EuroBERT. Il est automatiquement forcé sur CPU — pas d'action requise.
+- **jina-v5-text-small**, BidirLM et Arctic sont tous **compatibles MPS** et utiliseront le GPU automatiquement.
+- La variable d'environnement `PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0` est positionnée en tête de chaque script pour éviter les erreurs de mémoire unifiée.
+- **LightGBM** utilise `libgomp` (OpenMP GNU) tandis que certains modèles HuggingFace utilisent `libomp` (OpenMP LLVM). Charger les deux dans le même processus provoque un crash. Les scripts gèrent cela via des sous-processus isolés — aucune configuration nécessaire de votre côté.
+
+### Vérifier l'installation
+
+```bash
+cd backend
+uv run python -c "import torch; print('MPS disponible:', torch.backends.mps.is_available())"
+# → MPS disponible: True
+```
+
+---
+
+## Démarrage rapide
+
+```bash
+# 1. Construire les splits
+uv run python balanceteshaters/scripts/ml/00_prepare_dataset.py
+
+# 2. Générer des données synthétiques (vérifier le coût d'abord)
+uv run python balanceteshaters/scripts/ml/01_generate_synthetic.py --dry-run
+uv run python balanceteshaters/scripts/ml/01_generate_synthetic.py
+
+# 3. Entraîner les baselines sur embeddings gelés (tous modèles × classifieurs × datasets)
+uv run python balanceteshaters/scripts/ml/02_embed_and_train_classical.py
+
+# 4. Fine-tuner (un run à la fois)
+uv run python balanceteshaters/scripts/ml/03_finetune_embedding.py --model arctic --strategy head_only --dataset real
+uv run python balanceteshaters/scripts/ml/03_finetune_embedding.py --model arctic --strategy head_only --dataset augmented_v2
+
+# 5. Comparer tout
+uv run python balanceteshaters/scripts/ml/04_compare_evaluate.py
+```
+
+---
+
+## Scripts
+
+### 00_prepare_dataset.py — Construire les splits
+
+Récupère les enregistrements annotés depuis NocoDB et crée des splits stratifiés 70/15/15.
+
+```bash
+uv run python balanceteshaters/scripts/ml/00_prepare_dataset.py [--high-confidence-only]
+```
+
+**Options :**
+- `--high-confidence-only` — ne garder que les annotations marquées `HIGH_CONFIDENCE` (réduit la taille du dataset, améliore la qualité des labels)
+
+**Sorties** (dans `data/finetune/`) :
+- `train_real.parquet`
+- `val.parquet`
+- `test.parquet`
+
+À ré-exécuter à chaque fois que les annotations changent.
+
+---
+
+### 01_generate_synthetic.py — Augmenter les catégories minoritaires
+
+Utilise Claude pour générer des commentaires de cyberharcèlement synthétiques réalistes pour les catégories sous-représentées (doxxing, incitation au suicide, harcèlement sexuel, menaces, incitation à la haine). Chaque prompt inclut des exemples réels du training set comme ancrage few-shot.
+
+```bash
+# Voir l'allocation et estimer le coût
+uv run python balanceteshaters/scripts/ml/01_generate_synthetic.py --dry-run
+
+# Générer avec les paramètres par défaut (1000 exemples, Sonnet)
+uv run python balanceteshaters/scripts/ml/01_generate_synthetic.py
+
+# Modèle moins cher ou nombre d'exemples réduit
+uv run python balanceteshaters/scripts/ml/01_generate_synthetic.py --model claude-haiku-4-5-20251001 --total 500
+```
+
+**Sorties :**
+- `data/finetune/synthetic_v2.parquet` — exemples générés
+- `data/finetune/train_augmented_v2.parquet` — `train_real` + synthétique (à utiliser avec `--dataset augmented_v2` dans les scripts suivants)
+
+**Coût typique :** ~0,10 $ pour 1000 exemples avec Sonnet.
+
+---
+
+### 02_embed_and_train_classical.py — Baselines sur embeddings gelés
+
+Encode le texte avec chaque modèle puis entraîne trois classifieurs sur les représentations gelées. Couvre toutes les combinaisons encodeur × classifieur × dataset.
+
+```bash
+# Tout (par défaut)
+uv run python balanceteshaters/scripts/ml/02_embed_and_train_classical.py
+
+# Sous-ensemble
+uv run python balanceteshaters/scripts/ml/02_embed_and_train_classical.py \
+    --models arctic jina-small \
+    --datasets real augmented_v2
+```
+
+**Options :**
+- `--models` — un ou plusieurs parmi `jina`, `jina-small`, `bidir`, `arctic`, `all` (défaut : `all`)
+- `--datasets` — un ou plusieurs parmi `real`, `augmented`, `augmented_v2`, `all` (défaut : `all`)
+
+**Classifieurs entraînés :**
+
+| Nom | Architecture |
+|---|---|
+| `logreg` | Régression logistique (baseline linéaire) |
+| `lightgbm` | LightGBM avec early stopping |
+| `mlp` | MLP 512→128, ReLU, early stopping |
+
+**Sorties :** `data/finetune/checkpoints/{slug}+{clf}+{dataset}.joblib`
+
+---
+
+### 03_finetune_embedding.py — Fine-tuning bout-en-bout
+
+Attache une tête de classification linéaire à un encodeur et entraîne avec AdamW. Deux stratégies disponibles :
+
+- `head_only` — encodeur gelé, seule la tête apprend (rapide, ~50 époques)
+- `full` — encodeur + tête entraînés conjointement avec un faible LR (lent, ~15 époques)
+
+```bash
+# Arctic head-only sur données réelles
+uv run python balanceteshaters/scripts/ml/03_finetune_embedding.py \
+    --model arctic --strategy head_only --dataset real
+
+# Head-only sur données augmentées
+uv run python balanceteshaters/scripts/ml/03_finetune_embedding.py \
+    --model arctic --strategy head_only --dataset augmented_v2
+```
+
+**Options :**
+- `--model` — `jina`, `jina-small`, `bidir`, `arctic` (requis)
+- `--strategy` — `head_only` ou `full` (défaut : `full`)
+- `--dataset` — `real`, `augmented`, `augmented_v2` (défaut : `real`)
+
+> Les modèles jina utilisent `encode()` qui bloque le flux de gradient, donc `full` bascule automatiquement en `head_only` pour eux.
+
+**Sortie :** `data/finetune/checkpoints/{slug}-finetuned-{strategy}-{dataset}/best_model.pt`
+
+L'entraînement sauvegarde le checkpoint avec le meilleur F1 de validation et applique l'early stopping.
+
+---
+
+### 04_compare_evaluate.py — Table de comparaison complète
+
+Charge tous les checkpoints des scripts 02 et 03, évalue sur le test set, affiche une table de comparaison et logue tous les résultats dans NocoDB. Inclut également :
+- Baseline zero-shot XLM-R toxicité
+- Meilleure baseline LLM depuis les CSV de prédictions (si présents)
+- Delta A/B montrant le gain F1 des données augmentées vs réelles
+
+```bash
+uv run python balanceteshaters/scripts/ml/04_compare_evaluate.py
+```
+
+Les embeddings et prédictions tournent dans des sous-processus isolés pour éviter le conflit OpenMP entre EuroBERT de jina-nano (libomp) et LightGBM (libgomp).
+
+**Meilleurs résultats observés (frozen embeddings, test set) :**
+```
+Run                                                     F1      P      R    Acc
+================================================================================
+snowflake-arctic-embed-l-v2.0+mlp+real              0.6916 0.6852 0.6981 0.7130
+snowflake-arctic-embed-l-v2.0+logreg+real           0.6903 0.6500 0.7358 0.6957
+harrier-oss-v1-270m+lightgbm+real                   0.6729 0.6667 0.6792 0.6957
+jina-embeddings-v5-text-nano+lightgbm+augmented     0.6573 0.5222 0.8868 0.5739
+jina-embeddings-v5-text-small+mlp+real              0.6195 0.5833 0.6604 0.6261
+```
+
+---
+
+### 05_claude_annotate.py — Annotation avec Claude
+
+Deux modes :
+- **evaluate** — fait tourner Claude sur un échantillon de lignes déjà annotées et mesure l'accord avec les labels humains (accuracy, F1, kappa de Cohen). À utiliser en premier pour valider la fiabilité.
+- **annotate** — classe les lignes non annotées avec Claude et sauvegarde en parquet pour relecture.
+
+```bash
+# Mesurer l'accord Claude vs humain sur 100 exemples
+uv run python balanceteshaters/scripts/ml/05_claude_annotate.py --mode evaluate --n 100
+
+# Estimer le coût
+uv run python balanceteshaters/scripts/ml/05_claude_annotate.py --mode annotate --limit 500 --dry-run
+
+# Annoter
+uv run python balanceteshaters/scripts/ml/05_claude_annotate.py --mode annotate --limit 500
+```
+
+**Options :**
+- `--mode` — `evaluate` ou `annotate` (requis)
+- `--n` — nombre de lignes annotées à échantillonner pour l'évaluation (défaut : 100)
+- `--limit` — nombre max de lignes non annotées à traiter (défaut : 500)
+- `--dry-run` — afficher uniquement l'estimation de coût, sans appels API
+- `--seed` — graine aléatoire pour l'échantillonnage (défaut : 42)
+
+**Sortie :** `data/finetune/claude_annotated.parquet`
+
+Après relecture, ré-exécuter `00_prepare_dataset.py` pour inclure les lignes annotées par Claude dans les splits.
+
+---
+
+## Modèles
+
+| Alias | ID HuggingFace | Params | Notes |
+|---|---|---|---|
+| `jina` | `jinaai/jina-embeddings-v5-text-nano` | 239M | Basé sur EuroBERT ; CPU uniquement sur Apple Silicon (segfault MPS) |
+| `jina-small` | `jinaai/jina-embeddings-v5-text-small` | 677M | Basé sur Qwen3 ; compatible MPS |
+| `bidir` | `microsoft/harrier-oss-v1-270m` | 270M | 94 langues, 640 dimensions |
+| `arctic` | `Snowflake/snowflake-arctic-embed-l-v2.0` | 568M | 1024 dimensions, MRL, forte baseline retrieval |
+| *(zero-shot)* | `textdetox/xlmr-large-toxicity-classifier-v2` | 600M | XLM-R fine-tuné sur la toxicité multilingue ; aucun entraînement requis |
+
+---
+
+## Scripts utilitaires
+
+### _threshold_sweep.py
+
+Balaye les seuils de décision [0.20 … 0.50] pour les modèles classiques et fine-tunés Arctic. Logue dans NocoDB tout run dont le meilleur seuil diffère de 0.50.
+
+```bash
+uv run python balanceteshaters/scripts/ml/_threshold_sweep.py
+```
+
+Nécessite que les checkpoints de `02_embed_and_train_classical.py` existent.
+
+---
+
+### _mlp_arch_search.py
+
+Recherche de grille sur les configurations de couches cachées du MLP sur les embeddings Arctic gelés. Teste 8 architectures avec optimisation du seuil et logue la meilleure dans NocoDB.
+
+```bash
+uv run python balanceteshaters/scripts/ml/_mlp_arch_search.py
+```
+
+---
+
+### dedup_eval_table.py
+
+Supprime les entrées en doublon dans la table d'évaluation NocoDB, en gardant l'enregistrement le plus récent par `model_name`. Peut être exécuté plusieurs fois sans risque.
+
+```bash
+uv run python balanceteshaters/scripts/ml/dedup_eval_table.py
+```
+
+---
+
+### _eval_subprocess.py
+
+Helper interne utilisé par `04_compare_evaluate.py`, `_threshold_sweep.py` et `_mlp_arch_search.py`. Chaque commande tourne dans son propre sous-processus pour éviter les conflits OpenMP entre torch/sentence-transformers (libomp) et LightGBM (libgomp).
+
+Commandes disponibles :
+```
+embed                    <model_id> <output_npy> [split]
+predict_classical        <embedding_npy> <ckpt_path> <output_npy>
+predict_proba_classical  <embedding_npy> <ckpt_path> <output_npy>
+predict                  <model_id> <ckpt_dir> <output_npy>
+predict_proba            <model_id> <ckpt_dir> <output_npy>
+```
+
+`split` vaut `"test"` par défaut. Passer `"train_real"` ou `"val"` pour les autres splits.
+
+---
+
+## Structure des données
+
+```
+backend/balanceteshaters/data/finetune/
+├── train_real.parquet           # ~70 % des données annotées
+├── train_augmented.parquet      # train_real + synthétique v1
+├── train_augmented_v2.parquet   # train_real + synthétique v2 (recommandé)
+├── val.parquet                  # ~15 %, utilisé pour l'early stopping
+├── test.parquet                 # ~15 %, réservé à l'évaluation finale
+├── synthetic_v2.parquet         # exemples générés par Claude
+├── claude_annotated.parquet     # lignes annotées par Claude (après 05)
+└── checkpoints/
+    ├── {slug}+{clf}+{dataset}.joblib         # classifieurs sur embeddings gelés
+    └── {slug}-finetuned-{strategy}-{dataset}/
+        └── best_model.pt                     # meilleur checkpoint (val F1)
+```
+
+Tous les fichiers parquet partagent le même schéma :
+
+| Colonne | Type | Description |
+|---|---|---|
+| `id` | str/None | Identifiant NocoDB |
+| `comment` | str | Texte brut du commentaire |
+| `label` | int | 0 = bénin, 1 = cyberharcèlement |
+| `annotated_category` | str | Noms de catégories séparés par des virgules |
+| `binary_confidence` | str/None | `HIGH_CONFIDENCE` ou None |
+| `source` | str | `real`, `synthetic_v2` ou `claude_annotated` |
+
+---
+
+## Configuration (config.py)
+
+Module central avec les identifiants de modèles, chemins et utilitaires.
+
+| Symbole | Description |
+|---|---|
+| `JINA_MODEL_ID` / `JINA_SMALL_MODEL_ID` | IDs des modèles jina |
+| `ARCTIC_EMBED_MODEL_ID` | ID du modèle Snowflake Arctic |
+| `XLMR_TOXICITY_MODEL_ID` | ID du classifieur XLM-R de toxicité (zero-shot) |
+| `DATA_DIR` | Chemin vers `data/finetune/` |
+| `CHECKPOINTS_DIR` | Chemin vers `data/finetune/checkpoints/` |
+| `ANNOTATION_TABLE_ID` | Table NocoDB des annotations brutes |
+| `EVAL_TABLE_ID` | Table NocoDB des résultats d'évaluation |
+| `get_device_for_model(model_id)` | Retourne `"cpu"` pour jina-nano (MPS non sûr), sinon le meilleur device disponible |
+| `model_slug(model_id)` | Extrait le nom court d'un ID HuggingFace (ex. `"snowflake-arctic-embed-l-v2.0"`) |
+| `compute_binary_label(categories)` | Convertit une liste de catégories en label binaire 0/1 |
diff --git a/backend/balanceteshaters/scripts/ml/_eval_subprocess.py b/backend/balanceteshaters/scripts/ml/_eval_subprocess.py
new file mode 100644
index 00000000..bf39edb1
--- /dev/null
+++ b/backend/balanceteshaters/scripts/ml/_eval_subprocess.py
@@ -0,0 +1,144 @@
+"""
+Subprocess helper for 04_compare_evaluate.py and threshold/arch sweep scripts.
+Each command uses lazy imports so unrelated native libraries are never loaded together.
+
+Usage:
+  python _eval_subprocess.py embed             <model_id> <output_npy> [split]
+  python _eval_subprocess.py predict_classical <embedding_npy> <ckpt_path> <output_npy>
+  python _eval_subprocess.py predict_proba_classical <embedding_npy> <ckpt_path> <output_npy>
+  python _eval_subprocess.py predict           <model_id> <ckpt_dir> <output_npy>
+  python _eval_subprocess.py predict_proba     <model_id> <ckpt_dir> <output_npy>
+
+[split] defaults to "test". Pass "train_real" or "val" to embed other splits.
+"""
+import os
+os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
+
+import sys
+import numpy as np
+from pathlib import Path
+
+SCRIPTS_DIR = Path(__file__).resolve().parent.parent.parent.parent
+if str(SCRIPTS_DIR) not in sys.path:
+    sys.path.insert(0, str(SCRIPTS_DIR))
+
+
+def cmd_embed(model_id: str, output_npy: str, split: str = "test"):
+    import pandas as pd
+    from sentence_transformers import SentenceTransformer
+    from balanceteshaters.scripts.ml.config import DATA_DIR, get_device_for_model
+
+    df = pd.read_parquet(DATA_DIR / f"{split}.parquet")
+    is_jina = "jinaai/" in model_id
+    load_kwargs = {"device": get_device_for_model(model_id)}
+    if is_jina:
+        load_kwargs["trust_remote_code"] = True
+
+    encoder = SentenceTransformer(model_id, **load_kwargs)
+    task = "classification" if is_jina else None
+    encode_kwargs = {"batch_size": 32, "show_progress_bar": False, "convert_to_numpy": True}
+    if task:
+        encode_kwargs["task"] = task
+
+    X = encoder.encode(df["comment"].tolist(), **encode_kwargs).astype(np.float32)
+    np.save(output_npy, X)
+
+
+def cmd_predict_classical(embedding_npy: str, ckpt_path: str, output_npy: str):
+    import joblib  # imports LightGBM on first call — no torch/ST in this process
+    X = np.load(embedding_npy)
+    clf = joblib.load(ckpt_path)
+    np.save(output_npy, clf.predict(X))
+
+
+def cmd_predict_proba_classical(embedding_npy: str, ckpt_path: str, output_npy: str):
+    import joblib
+    X = np.load(embedding_npy)
+    clf = joblib.load(ckpt_path)
+    np.save(output_npy, clf.predict_proba(X)[:, 1])
+
+
+def _load_finetuned_clf(model_id: str, ckpt_dir: str):
+    """Load a fine-tuned EmbeddingClassifier. Returns (clf, device) or (None, None) if checkpoint missing."""
+    import torch
+    from sentence_transformers import SentenceTransformer
+    from balanceteshaters.scripts.ml.config import get_device_for_model
+    from balanceteshaters.scripts.ml.models import EmbeddingClassifier
+
+    state_path = Path(ckpt_dir) / "best_model.pt"
+    if not state_path.exists():
+        return None, None
+
+    device = get_device_for_model(model_id)
+    is_jina = "jinaai/" in model_id
+    load_kwargs = {"device": device}
+    if is_jina:
+        load_kwargs["trust_remote_code"] = True
+    encoder = SentenceTransformer(model_id, **load_kwargs)
+
+    task = "classification" if is_jina else None
+    probe_kwargs = {"convert_to_numpy": True, "show_progress_bar": False}
+    if task:
+        probe_kwargs["task"] = task
+    embed_dim = encoder.encode(["probe"], **probe_kwargs).shape[1]
+
+    clf = EmbeddingClassifier(encoder, embed_dim, task=task)
+    clf.load_state_dict(torch.load(state_path, map_location="cpu"))
+    clf.eval().to(device)
+    return clf, device
+
+
+def cmd_predict(model_id: str, ckpt_dir: str, output_npy: str):
+    import torch
+    import pandas as pd
+    from balanceteshaters.scripts.ml.config import DATA_DIR
+
+    clf, _ = _load_finetuned_clf(model_id, ckpt_dir)
+    if clf is None:
+        np.save(output_npy, np.array([-1]))
+        return
+
+    texts = pd.read_parquet(DATA_DIR / "test.parquet")["comment"].tolist()
+    all_preds = []
+    with torch.no_grad():
+        for i in range(0, len(texts), 32):
+            all_preds.extend(torch.argmax(clf(texts[i:i + 32]), dim=-1).cpu().numpy())
+    np.save(output_npy, np.array(all_preds))
+
+
+def cmd_predict_proba(model_id: str, ckpt_dir: str, output_npy: str):
+    import torch
+    import torch.nn.functional as F
+    import pandas as pd
+    from balanceteshaters.scripts.ml.config import DATA_DIR
+
+    clf, _ = _load_finetuned_clf(model_id, ckpt_dir)
+    if clf is None:
+        np.save(output_npy, np.array([-1.0]))
+        return
+
+    texts = pd.read_parquet(DATA_DIR / "test.parquet")["comment"].tolist()
+    all_proba = []
+    with torch.no_grad():
+        for i in range(0, len(texts), 32):
+            proba = F.softmax(clf(texts[i:i + 32]), dim=-1)[:, 1].cpu().numpy()
+            all_proba.extend(proba)
+    np.save(output_npy, np.array(all_proba))
+
+
+if __name__ == "__main__":
+    cmd = sys.argv[1]
+    if cmd == "embed":
+        split = sys.argv[4] if len(sys.argv) > 4 else "test"
+        cmd_embed(sys.argv[2], sys.argv[3], split)
+    elif cmd == "predict_classical":
+        cmd_predict_classical(sys.argv[2], sys.argv[3], sys.argv[4])
+    elif cmd == "predict_proba_classical":
+        cmd_predict_proba_classical(sys.argv[2], sys.argv[3], sys.argv[4])
+    elif cmd == "predict_proba":
+        cmd_predict_proba(sys.argv[2], sys.argv[3], sys.argv[4])
+    elif cmd == "predict":
+        cmd_predict(sys.argv[2], sys.argv[3], sys.argv[4])
+    else:
+        print(f"Unknown command: {cmd}", file=sys.stderr)
+        sys.exit(1)
diff --git a/backend/balanceteshaters/scripts/ml/_mlp_arch_search.py b/backend/balanceteshaters/scripts/ml/_mlp_arch_search.py
new file mode 100644
index 00000000..c6cc9aad
--- /dev/null
+++ b/backend/balanceteshaters/scripts/ml/_mlp_arch_search.py
@@ -0,0 +1,151 @@
+# ruff: noqa: E402
+"""
+Architecture search for MLP on frozen Arctic embeddings.
+Sweeps hidden layer configs and decision thresholds, logs the best result to NocoDB.
+"""
+import os
+import subprocess
+import sys
+import tempfile
+os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'
+
+import numpy as np
+import pandas as pd
+import sklearn.metrics
+from dotenv import load_dotenv
+from pathlib import Path
+from sklearn.neural_network import MLPClassifier
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
+
+SCRIPTS_DIR = Path(__file__).resolve().parent.parent.parent.parent
+if str(SCRIPTS_DIR) not in sys.path:
+    sys.path.insert(0, str(SCRIPTS_DIR))
+
+from balanceteshaters.services.nocodb import NocoDBService
+from balanceteshaters.scripts.ml.config import (
+    ANNOTATION_TABLE_ID, ARCTIC_EMBED_MODEL_ID, DATA_DIR, EVAL_TABLE_ID, model_slug,
+)
+
+_HELPER = Path(__file__).parent / '_eval_subprocess.py'
+
+# Best F1 from the standard pipeline run (from 04_compare_evaluate.py); hard-coded, runs are flagged/logged only above it
+BASELINE_F1 = 0.7414
+
+THRESHOLDS = [0.50, 0.45, 0.40, 0.35, 0.30, 0.25, 0.20]  # swept in this order; ties keep the earlier (higher) threshold
+
+ARCHITECTURES = [  # hidden_layer_sizes tuples passed to MLPClassifier
+    (256,),
+    (512,),
+    (256, 64),
+    (512, 128),     # default in 02_embed_and_train_classical.py
+    (256, 128),
+    (128, 64),
+    (512, 256, 64),
+    (256, 128, 32),
+]
+
+
+def embed_split(split: str, tmp: Path) -> np.ndarray:
+    """Embed a data split in a subprocess (avoids OpenMP conflict with sklearn)."""
+    npy = str(tmp / f'X_{split}.npy')
+    r = subprocess.run(
+        [sys.executable, str(_HELPER), 'embed', ARCTIC_EMBED_MODEL_ID, npy, split],
+        capture_output=True, text=True,
+    )
+    if r.returncode != 0:
+        print(r.stderr[-500:])
+        sys.exit(1)
+    X = np.load(npy)
+    print(f'  {split}: {X.shape}')
+    return X
+
+
+def main():
+    load_dotenv()
+    nocodb = NocoDBService(
+        os.environ['NOCODB_BASE_URL'],
+        os.environ['NOCODB_TOKEN'],
+        os.environ['NOCODB_BASE_ID'],
+    )
+    tmp = Path(tempfile.mkdtemp())
+
+    print('Embedding splits...')
+    X_test  = embed_split('test',       tmp)
+    X_train = embed_split('train_real', tmp)
+    X_val   = embed_split('val',        tmp)
+
+    y_test  = pd.read_parquet(DATA_DIR / 'test.parquet')['label'].values
+    y_train = pd.read_parquet(DATA_DIR / 'train_real.parquet')['label'].values
+    y_val   = pd.read_parquet(DATA_DIR / 'val.parquet')['label'].values
+
+    print(f'\n{"Architecture":<22} {"ValF1":>7} {"TestF1":>7} {"BestT":>6} {"TunedF1":>8} {"P":>7} {"R":>7}')
+    print('-' * 90)
+
+    best_result = None
+
+    for layers in ARCHITECTURES:
+        clf = Pipeline([
+            ('scaler', StandardScaler()),
+            ('clf', MLPClassifier(
+                hidden_layer_sizes=layers,
+                activation='relu',
+                max_iter=300,
+                early_stopping=True,
+                validation_fraction=0.1,
+                n_iter_no_change=15,
+                random_state=42,
+            )),
+        ])
+        clf.fit(X_train, y_train)
+
+        val_f1 = sklearn.metrics.f1_score(y_val, clf.predict(X_val), zero_division=0)
+        proba = clf.predict_proba(X_test)[:, 1]
+        default_f1 = sklearn.metrics.f1_score(y_test, (proba >= 0.5).astype(int), zero_division=0)
+
+        best_t, best_f1, best_p, best_r = 0.5, 0.0, 0.0, 0.0
+        for t in THRESHOLDS:
+            y_pred = (proba >= t).astype(int)
+            f1 = sklearn.metrics.f1_score(y_test, y_pred, zero_division=0)
+            if f1 > best_f1:
+                best_f1 = f1
+                best_t = t
+                best_p = sklearn.metrics.precision_score(y_test, y_pred, zero_division=0)
+                best_r = sklearn.metrics.recall_score(y_test, y_pred, zero_division=0)
+
+        flag = '  ***' if best_f1 > BASELINE_F1 else ''
+        print(f'  {str(layers):<20} {val_f1:>7.4f} {default_f1:>7.4f} {best_t:>6.2f} {best_f1:>8.4f} {best_p:>7.4f} {best_r:>7.4f}{flag}')
+
+        if best_result is None or best_f1 > best_result['tuned_f1']:
+            best_result = {
+                'layers': layers, 'tuned_f1': best_f1, 'best_t': best_t,
+                'precision': best_p, 'recall': best_r,
+                'accuracy': sklearn.metrics.accuracy_score(y_test, (proba >= best_t).astype(int)),
+            }
+
+    print(f'\nBest: {best_result["layers"]} at t={best_result["best_t"]}  F1={best_result["tuned_f1"]:.4f}')
+
+    if best_result['tuned_f1'] > BASELINE_F1:
+        slug = model_slug(ARCTIC_EMBED_MODEL_ID)
+        run_name = f'{slug}+mlp{best_result["layers"]}+real+threshold={best_result["best_t"]}'
+        nocodb.create_record(EVAL_TABLE_ID, {
+            'model_name': run_name,
+            'table_id': ANNOTATION_TABLE_ID,
+            'table_name': 'finetune/real',
+            'f1': best_result['tuned_f1'],
+            'precision': best_result['precision'],
+            'recall': best_result['recall'],
+            'accuracy': best_result['accuracy'],
+            'total_samples': len(y_test),
+            'positive_samples': int(y_test.sum()),
+            'negative_samples': len(y_test) - int(y_test.sum()),
+            'prompt': f'arch={best_result["layers"]} threshold={best_result["best_t"]}',
+            'model_type': 'encoder embedding',
+        })
+        print(f'Logged {run_name} to NocoDB.')
+    else:
+        print('No architecture beats baseline — nothing logged.')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/backend/balanceteshaters/scripts/ml/_threshold_sweep.py b/backend/balanceteshaters/scripts/ml/_threshold_sweep.py
index 941dd9c8..e30a7a90 100644
--- a/backend/balanceteshaters/scripts/ml/_threshold_sweep.py
+++ b/backend/balanceteshaters/scripts/ml/_threshold_sweep.py
@@ -1,38 +1,50 @@
 # ruff: noqa: E402
+"""
+Sweep decision thresholds for the Arctic + classical pipeline.
+Logs any run that beats the default t=0.50 threshold to NocoDB.
+"""
 import os
 import subprocess
 import sys
 import tempfile
 os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'
+
 import numpy as np
 import pandas as pd
 import sklearn.metrics
 from dotenv import load_dotenv
 from pathlib import Path
 
-sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent.parent))
+SCRIPTS_DIR = Path(__file__).resolve().parent.parent.parent.parent
+if str(SCRIPTS_DIR) not in sys.path:
+    sys.path.insert(0, str(SCRIPTS_DIR))
 load_dotenv()
 
 from balanceteshaters.services.nocodb import NocoDBService
-from balanceteshaters.scripts.ml.config import ANNOTATION_TABLE_ID, EVAL_TABLE_ID, CHECKPOINTS_DIR, DATA_DIR
+from balanceteshaters.scripts.ml.config import (
+    ANNOTATION_TABLE_ID, ARCTIC_EMBED_MODEL_ID, CHECKPOINTS_DIR, DATA_DIR, EVAL_TABLE_ID, model_slug,
+)
 
+_HELPER = Path(__file__).parent / '_eval_subprocess.py'
 tmp = Path(tempfile.mkdtemp())
 emb_path = str(tmp / 'X_arctic.npy')
-helper = str(Path(__file__).parent / '_eval_subprocess.py')
-MODEL_ID = 'Snowflake/snowflake-arctic-embed-l-v2.0'
+
+# Best F1 from the frozen-embedding classical runs (from 04_compare_evaluate.py)
+BASELINE_F1 = 0.741
+
+THRESHOLDS = [0.50, 0.45, 0.40, 0.35, 0.30, 0.25, 0.20]
 
 print('Embedding test set...')
-r = subprocess.run([sys.executable, helper, 'embed', MODEL_ID, emb_path], capture_output=True, text=True)
+r = subprocess.run(
+    [sys.executable, str(_HELPER), 'embed', ARCTIC_EMBED_MODEL_ID, emb_path],
+    capture_output=True, text=True,
+)
 if r.returncode != 0:
     print(r.stderr[-500:])
     sys.exit(1)
 
 X_test = np.load(emb_path)
-test_df = pd.read_parquet(DATA_DIR / 'test.parquet')
-y_true = test_df['label'].values
-
-THRESHOLDS = [0.50, 0.45, 0.40, 0.35, 0.30, 0.25, 0.20]
-BEST_SO_FAR = 0.741
+y_true = pd.read_parquet(DATA_DIR / 'test.parquet')['label'].values
 
 nocodb = NocoDBService(os.environ['NOCODB_BASE_URL'], os.environ['NOCODB_TOKEN'], os.environ['NOCODB_BASE_ID'])
 
@@ -54,7 +66,7 @@ def best_threshold(proba):
     return best_t, best_m
 
 
-def log(run_name, dataset, t, m):
+def log_result(run_name, dataset, t, m):
     nocodb.create_record(EVAL_TABLE_ID, {
         'model_name': f'{run_name}+threshold={t}',
         'table_id': ANNOTATION_TABLE_ID,
@@ -69,41 +81,46 @@ def log(run_name, dataset, t, m):
     })
 
 
+def print_row(run, t, m):  # print one result row; mark it when m['f1'] exceeds BASELINE_F1
+    flag = '  *** BEATS BASELINE' if m['f1'] > BASELINE_F1 else ''
+    print(f'  {run:<58} {t:>6.2f} {m["f1"]:>7.4f} {m["precision"]:>7.4f} {m["recall"]:>7.4f} {m["accuracy"]:>7.4f}{flag}')
+
+
 print(f'\n{"Run":<60} {"BestT":>6} {"F1":>7} {"P":>7} {"R":>7} {"Acc":>7}')
 print('-' * 100)
 
+slug = model_slug(ARCTIC_EMBED_MODEL_ID)
+
 for clf_name in ['logreg', 'lightgbm', 'mlp']:
-    ckpt = CHECKPOINTS_DIR / f'snowflake-arctic-embed-l-v2.0+{clf_name}+real.joblib'
+    ckpt = CHECKPOINTS_DIR / f'{slug}+{clf_name}+real.joblib'
     if not ckpt.exists():
         continue
     proba_path = str(tmp / f'proba_{clf_name}.npy')
     r = subprocess.run(
-        [sys.executable, helper, 'predict_proba_classical', emb_path, str(ckpt), proba_path],
+        [sys.executable, str(_HELPER), 'predict_proba_classical', emb_path, str(ckpt), proba_path],
         capture_output=True, text=True,
     )
     if r.returncode != 0:
         print(f'  [error] {clf_name}: {r.stderr[-200:]}')
         continue
-    proba = np.load(proba_path)
-    t, m = best_threshold(proba)
-    run = f'snowflake-arctic-embed-l-v2.0+{clf_name}+real'
-    flag = '  *** BEATS BEST' if m['f1'] > BEST_SO_FAR else ''
-    print(f'  {run:<58} {t:>6.2f} {m["f1"]:>7.4f} {m["precision"]:>7.4f} {m["recall"]:>7.4f} {m["accuracy"]:>7.4f}{flag}')
+    run = f'{slug}+{clf_name}+real'
+    t, m = best_threshold(np.load(proba_path))
+    print_row(run, t, m)
     if t != 0.50:
-        log(run, 'real', t, m)
+        log_result(run, 'real', t, m)
 
-# fine-tuned head_only real
-ft_dir = str(CHECKPOINTS_DIR / 'snowflake-arctic-embed-l-v2.0-finetuned-head_only-real')
+ft_dir = str(CHECKPOINTS_DIR / f'{slug}-finetuned-head_only-real')
 proba_path = str(tmp / 'proba_ft_real.npy')
-r = subprocess.run([sys.executable, helper, 'predict_proba', MODEL_ID, ft_dir, proba_path], capture_output=True, text=True)
+r = subprocess.run(
+    [sys.executable, str(_HELPER), 'predict_proba', ARCTIC_EMBED_MODEL_ID, ft_dir, proba_path],
+    capture_output=True, text=True,
+)
 if r.returncode == 0:
-    proba = np.load(proba_path)
-    t, m = best_threshold(proba)
-    run = 'snowflake-arctic-embed-l-v2.0-finetuned-head_only-real'
-    flag = '  *** BEATS BEST' if m['f1'] > BEST_SO_FAR else ''
-    print(f'  {run:<58} {t:>6.2f} {m["f1"]:>7.4f} {m["precision"]:>7.4f} {m["recall"]:>7.4f} {m["accuracy"]:>7.4f}{flag}')
+    run = f'{slug}-finetuned-head_only-real'
+    t, m = best_threshold(np.load(proba_path))
+    print_row(run, t, m)
     if t != 0.50:
-        log(run, 'real', t, m)
+        log_result(run, 'real', t, m)
 else:
     print(f'  [error] finetuned: {r.stderr[-300:]}')
 
diff --git a/backend/balanceteshaters/scripts/ml/config.py b/backend/balanceteshaters/scripts/ml/config.py
index 0c56536e..d389d04f 100644
--- a/backend/balanceteshaters/scripts/ml/config.py
+++ b/backend/balanceteshaters/scripts/ml/config.py
@@ -3,7 +3,7 @@
 
 JINA_MODEL_ID = "jinaai/jina-embeddings-v5-text-nano"    # 239M, EuroBERT, CPU-only (MPS segfault)
 JINA_SMALL_MODEL_ID = "jinaai/jina-embeddings-v5-text-small"  # 677M, Qwen3-based, MPS-safe
-# Decoder-only, 270M, 640-dim, 94-language, no trust_remote_code needed
+# Encoder-only, 270M, 640-dim, 94-language, no trust_remote_code needed
 BIDIR_MODEL_ID = "microsoft/harrier-oss-v1-270m"
 # Encoder-only, 568M, 1024-dim, bge-m3-retromae base, MRL, no trust_remote_code
 ARCTIC_EMBED_MODEL_ID = "Snowflake/snowflake-arctic-embed-l-v2.0"
diff --git a/backend/balanceteshaters/scripts/ml/dedup_eval_table.py b/backend/balanceteshaters/scripts/ml/dedup_eval_table.py
new file mode 100644
index 00000000..752b9101
--- /dev/null
+++ b/backend/balanceteshaters/scripts/ml/dedup_eval_table.py
@@ -0,0 +1,77 @@
+# ruff: noqa: E402
+"""
+Remove duplicate rows from the NocoDB eval table (m0ww7qnx69u9r1a).
+Keeps the most recently inserted record per model_name, deletes the rest.
+"""
+import os
+import sys
+from pathlib import Path
+
+from dotenv import load_dotenv
+
+SCRIPTS_DIR = Path(__file__).resolve().parent.parent.parent.parent
+if str(SCRIPTS_DIR) not in sys.path:
+    sys.path.insert(0, str(SCRIPTS_DIR))
+
+from balanceteshaters.scripts.ml.config import EVAL_TABLE_ID
+from balanceteshaters.services.nocodb import NocoDBService
+
+
+def main():
+    load_dotenv()
+    nocodb = NocoDBService(
+        nocodb_url=os.environ["NOCODB_BASE_URL"],
+        token=os.environ["NOCODB_TOKEN"],
+        base_id=os.environ["NOCODB_BASE_ID"],
+    )
+
+    # Fetch all records with pagination
+    print(f"Fetching all records from {EVAL_TABLE_ID}...")
+    all_records = []
+    offset = 0
+    limit = 1000
+    while True:
+        resp = nocodb.get_records(EVAL_TABLE_ID, limit=limit, offset=offset)
+        records = resp.get("records", [])
+        if not records:
+            break
+        all_records.extend(records)
+        if resp.get("next") is None:
+            break
+        offset += limit
+
+    print(f"  Total records: {len(all_records)}")
+
+    # Group by model_name, keep highest id (most recent insert)
+    seen: dict[str, int] = {}   # model_name -> record id to keep
+    to_delete: list[int] = []
+
+    for rec in all_records:
+        fields = rec.get("fields", {})
+        model_name = fields.get("model_name", "")
+        rec_id = rec["id"]
+
+        if model_name not in seen:
+            seen[model_name] = rec_id
+        else:
+            # Keep the higher id (more recent), mark the other for deletion
+            if rec_id > seen[model_name]:
+                to_delete.append(seen[model_name])
+                seen[model_name] = rec_id
+            else:
+                to_delete.append(rec_id)
+
+    print(f"  Unique model_names: {len(seen)}")
+    print(f"  Duplicates to delete: {len(to_delete)}")
+
+    if not to_delete:
+        print("Nothing to delete.")
+        return
+
+    print(f"Deleting {len(to_delete)} duplicate records...")
+    nocodb.delete_records(EVAL_TABLE_ID, to_delete)
+    print("Done.")
+
+
+if __name__ == "__main__":
+    main()