From e46b7100e3a467b7d8e8b5afadd85a7a31d4071d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gr=C3=A9goire=20Co?= Date: Wed, 22 Apr 2026 16:44:59 +0200 Subject: [PATCH 1/8] feat(deps): add ML dependencies for embedding benchmark --- backend/pyproject.toml | 6 + backend/uv.lock | 621 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 627 insertions(+) diff --git a/backend/pyproject.toml b/backend/pyproject.toml index cbd63d7c..8bc9dffc 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -26,4 +26,10 @@ dependencies = [ "torch>=2.11.0", "accelerate>=1.13.0", "bitsandbytes>=0.49.2", + "anthropic>=0.50.0", + "peft>=0.14.0", + "lightgbm>=4.0.0", + "sentence-transformers>=3.0.0", + "datasets>=3.0.0", + "python-dotenv>=1.0.0", ] diff --git a/backend/uv.lock b/backend/uv.lock index 6bfdcf26..e3561056 100644 --- a/backend/uv.lock +++ b/backend/uv.lock @@ -26,6 +26,79 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7e/46/02ac5e262d4af18054b3e922b2baedbb2a03289ee792162de60a865defc5/accelerate-1.13.0-py3-none-any.whl", hash = "sha256:cf1a3efb96c18f7b152eb0fa7490f3710b19c3f395699358f08decca2b8b62e0", size = 383744, upload-time = "2026-03-04T19:34:10.313Z" }, ] +[[package]] +name = "aiohappyeyeballs" +version = "2.6.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/26/30/f84a107a9c4331c14b2b586036f40965c128aa4fee4dda5d3d51cb14ad54/aiohappyeyeballs-2.6.1.tar.gz", hash = "sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558", size = 22760, upload-time = "2025-03-12T01:42:48.764Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0f/15/5bf3b99495fb160b63f95972b81750f18f7f4e02ad051373b669d17d44f2/aiohappyeyeballs-2.6.1-py3-none-any.whl", hash = "sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8", size = 15265, upload-time = "2025-03-12T01:42:47.083Z" }, +] + +[[package]] +name = "aiohttp" +version = "3.13.5" +source = { registry = 
"https://pypi.org/simple" } +dependencies = [ + { name = "aiohappyeyeballs" }, + { name = "aiosignal" }, + { name = "attrs" }, + { name = "frozenlist" }, + { name = "multidict" }, + { name = "propcache" }, + { name = "yarl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/77/9a/152096d4808df8e4268befa55fba462f440f14beab85e8ad9bf990516918/aiohttp-3.13.5.tar.gz", hash = "sha256:9d98cc980ecc96be6eb4c1994ce35d28d8b1f5e5208a23b421187d1209dbb7d1", size = 7858271, upload-time = "2026-03-31T22:01:03.343Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/be/6f/353954c29e7dcce7cf00280a02c75f30e133c00793c7a2ed3776d7b2f426/aiohttp-3.13.5-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:023ecba036ddd840b0b19bf195bfae970083fd7024ce1ac22e9bba90464620e9", size = 748876, upload-time = "2026-03-31T21:57:36.319Z" }, + { url = "https://files.pythonhosted.org/packages/f5/1b/428a7c64687b3b2e9cd293186695affc0e1e54a445d0361743b231f11066/aiohttp-3.13.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:15c933ad7920b7d9a20de151efcd05a6e38302cbf0e10c9b2acb9a42210a2416", size = 499557, upload-time = "2026-03-31T21:57:38.236Z" }, + { url = "https://files.pythonhosted.org/packages/29/47/7be41556bfbb6917069d6a6634bb7dd5e163ba445b783a90d40f5ac7e3a7/aiohttp-3.13.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ab2899f9fa2f9f741896ebb6fa07c4c883bfa5c7f2ddd8cf2aafa86fa981b2d2", size = 500258, upload-time = "2026-03-31T21:57:39.923Z" }, + { url = "https://files.pythonhosted.org/packages/67/84/c9ecc5828cb0b3695856c07c0a6817a99d51e2473400f705275a2b3d9239/aiohttp-3.13.5-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a60eaa2d440cd4707696b52e40ed3e2b0f73f65be07fd0ef23b6b539c9c0b0b4", size = 1749199, upload-time = "2026-03-31T21:57:41.938Z" }, + { url = 
"https://files.pythonhosted.org/packages/f0/d3/3c6d610e66b495657622edb6ae7c7fd31b2e9086b4ec50b47897ad6042a9/aiohttp-3.13.5-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:55b3bdd3292283295774ab585160c4004f4f2f203946997f49aac032c84649e9", size = 1721013, upload-time = "2026-03-31T21:57:43.904Z" }, + { url = "https://files.pythonhosted.org/packages/49/a0/24409c12217456df0bae7babe3b014e460b0b38a8e60753d6cb339f6556d/aiohttp-3.13.5-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c2b2355dc094e5f7d45a7bb262fe7207aa0460b37a0d87027dcf21b5d890e7d5", size = 1781501, upload-time = "2026-03-31T21:57:46.285Z" }, + { url = "https://files.pythonhosted.org/packages/98/9d/b65ec649adc5bccc008b0957a9a9c691070aeac4e41cea18559fef49958b/aiohttp-3.13.5-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b38765950832f7d728297689ad78f5f2cf79ff82487131c4d26fe6ceecdc5f8e", size = 1878981, upload-time = "2026-03-31T21:57:48.734Z" }, + { url = "https://files.pythonhosted.org/packages/57/d8/8d44036d7eb7b6a8ec4c5494ea0c8c8b94fbc0ed3991c1a7adf230df03bf/aiohttp-3.13.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b18f31b80d5a33661e08c89e202edabf1986e9b49c42b4504371daeaa11b47c1", size = 1767934, upload-time = "2026-03-31T21:57:51.171Z" }, + { url = "https://files.pythonhosted.org/packages/31/04/d3f8211f273356f158e3464e9e45484d3fb8c4ce5eb2f6fe9405c3273983/aiohttp-3.13.5-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:33add2463dde55c4f2d9635c6ab33ce154e5ecf322bd26d09af95c5f81cfa286", size = 1566671, upload-time = "2026-03-31T21:57:53.326Z" }, + { url = "https://files.pythonhosted.org/packages/41/db/073e4ebe00b78e2dfcacff734291651729a62953b48933d765dc513bf798/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = 
"sha256:327cc432fdf1356fb4fbc6fe833ad4e9f6aacb71a8acaa5f1855e4b25910e4a9", size = 1705219, upload-time = "2026-03-31T21:57:55.385Z" }, + { url = "https://files.pythonhosted.org/packages/48/45/7dfba71a2f9fd97b15c95c06819de7eb38113d2cdb6319669195a7d64270/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:7c35b0bf0b48a70b4cb4fc5d7bed9b932532728e124874355de1a0af8ec4bc88", size = 1743049, upload-time = "2026-03-31T21:57:57.341Z" }, + { url = "https://files.pythonhosted.org/packages/18/71/901db0061e0f717d226386a7f471bb59b19566f2cae5f0d93874b017271f/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:df23d57718f24badef8656c49743e11a89fd6f5358fa8a7b96e728fda2abf7d3", size = 1749557, upload-time = "2026-03-31T21:57:59.626Z" }, + { url = "https://files.pythonhosted.org/packages/08/d5/41eebd16066e59cd43728fe74bce953d7402f2b4ddfdfef2c0e9f17ca274/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:02e048037a6501a5ec1f6fc9736135aec6eb8a004ce48838cb951c515f32c80b", size = 1558931, upload-time = "2026-03-31T21:58:01.972Z" }, + { url = "https://files.pythonhosted.org/packages/30/e6/4a799798bf05740e66c3a1161079bda7a3dd8e22ca392481d7a7f9af82a6/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:31cebae8b26f8a615d2b546fee45d5ffb76852ae6450e2a03f42c9102260d6fe", size = 1774125, upload-time = "2026-03-31T21:58:04.007Z" }, + { url = "https://files.pythonhosted.org/packages/84/63/7749337c90f92bc2cb18f9560d67aa6258c7060d1397d21529b8004fcf6f/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:888e78eb5ca55a615d285c3c09a7a91b42e9dd6fc699b166ebd5dee87c9ccf14", size = 1732427, upload-time = "2026-03-31T21:58:06.337Z" }, + { url = "https://files.pythonhosted.org/packages/98/de/cf2f44ff98d307e72fb97d5f5bbae3bfcb442f0ea9790c0bf5c5c2331404/aiohttp-3.13.5-cp312-cp312-win32.whl", hash = "sha256:8bd3ec6376e68a41f9f95f5ed170e2fcf22d4eb27a1f8cb361d0508f6e0557f3", size = 433534, upload-time = 
"2026-03-31T21:58:08.712Z" }, + { url = "https://files.pythonhosted.org/packages/aa/ca/eadf6f9c8fa5e31d40993e3db153fb5ed0b11008ad5d9de98a95045bed84/aiohttp-3.13.5-cp312-cp312-win_amd64.whl", hash = "sha256:110e448e02c729bcebb18c60b9214a87ba33bac4a9fa5e9a5f139938b56c6cb1", size = 460446, upload-time = "2026-03-31T21:58:10.945Z" }, + { url = "https://files.pythonhosted.org/packages/78/e9/d76bf503005709e390122d34e15256b88f7008e246c4bdbe915cd4f1adce/aiohttp-3.13.5-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a5029cc80718bbd545123cd8fe5d15025eccaaaace5d0eeec6bd556ad6163d61", size = 742930, upload-time = "2026-03-31T21:58:13.155Z" }, + { url = "https://files.pythonhosted.org/packages/57/00/4b7b70223deaebd9bb85984d01a764b0d7bd6526fcdc73cca83bcbe7243e/aiohttp-3.13.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4bb6bf5811620003614076bdc807ef3b5e38244f9d25ca5fe888eaccea2a9832", size = 496927, upload-time = "2026-03-31T21:58:15.073Z" }, + { url = "https://files.pythonhosted.org/packages/9c/f5/0fb20fb49f8efdcdce6cd8127604ad2c503e754a8f139f5e02b01626523f/aiohttp-3.13.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a84792f8631bf5a94e52d9cc881c0b824ab42717165a5579c760b830d9392ac9", size = 497141, upload-time = "2026-03-31T21:58:17.009Z" }, + { url = "https://files.pythonhosted.org/packages/3b/86/b7c870053e36a94e8951b803cb5b909bfbc9b90ca941527f5fcafbf6b0fa/aiohttp-3.13.5-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:57653eac22c6a4c13eb22ecf4d673d64a12f266e72785ab1c8b8e5940d0e8090", size = 1732476, upload-time = "2026-03-31T21:58:18.925Z" }, + { url = "https://files.pythonhosted.org/packages/b5/e5/4e161f84f98d80c03a238671b4136e6530453d65262867d989bbe78244d0/aiohttp-3.13.5-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:e5e5f7debc7a57af53fdf5c5009f9391d9f4c12867049d509bf7bb164a6e295b", size = 1706507, upload-time = "2026-03-31T21:58:21.094Z" }, + { url = 
"https://files.pythonhosted.org/packages/d4/56/ea11a9f01518bd5a2a2fcee869d248c4b8a0cfa0bb13401574fa31adf4d4/aiohttp-3.13.5-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c719f65bebcdf6716f10e9eff80d27567f7892d8988c06de12bbbd39307c6e3a", size = 1773465, upload-time = "2026-03-31T21:58:23.159Z" }, + { url = "https://files.pythonhosted.org/packages/eb/40/333ca27fb74b0383f17c90570c748f7582501507307350a79d9f9f3c6eb1/aiohttp-3.13.5-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d97f93fdae594d886c5a866636397e2bcab146fd7a132fd6bb9ce182224452f8", size = 1873523, upload-time = "2026-03-31T21:58:25.59Z" }, + { url = "https://files.pythonhosted.org/packages/f0/d2/e2f77eef1acb7111405433c707dc735e63f67a56e176e72e9e7a2cd3f493/aiohttp-3.13.5-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3df334e39d4c2f899a914f1dba283c1aadc311790733f705182998c6f7cae665", size = 1754113, upload-time = "2026-03-31T21:58:27.624Z" }, + { url = "https://files.pythonhosted.org/packages/fb/56/3f653d7f53c89669301ec9e42c95233e2a0c0a6dd051269e6e678db4fdb0/aiohttp-3.13.5-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:fe6970addfea9e5e081401bcbadf865d2b6da045472f58af08427e108d618540", size = 1562351, upload-time = "2026-03-31T21:58:29.918Z" }, + { url = "https://files.pythonhosted.org/packages/ec/a6/9b3e91eb8ae791cce4ee736da02211c85c6f835f1bdfac0594a8a3b7018c/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:7becdf835feff2f4f335d7477f121af787e3504b48b449ff737afb35869ba7bb", size = 1693205, upload-time = "2026-03-31T21:58:32.214Z" }, + { url = "https://files.pythonhosted.org/packages/98/fc/bfb437a99a2fcebd6b6eaec609571954de2ed424f01c352f4b5504371dd3/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:676e5651705ad5d8a70aeb8eb6936c436d8ebbd56e63436cb7dd9bb36d2a9a46", size = 1730618, upload-time 
= "2026-03-31T21:58:34.728Z" }, + { url = "https://files.pythonhosted.org/packages/e4/b6/c8534862126191a034f68153194c389addc285a0f1347d85096d349bbc15/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:9b16c653d38eb1a611cc898c41e76859ca27f119d25b53c12875fd0474ae31a8", size = 1745185, upload-time = "2026-03-31T21:58:36.909Z" }, + { url = "https://files.pythonhosted.org/packages/0b/93/4ca8ee2ef5236e2707e0fd5fecb10ce214aee1ff4ab307af9c558bda3b37/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:999802d5fa0389f58decd24b537c54aa63c01c3219ce17d1214cbda3c2b22d2d", size = 1557311, upload-time = "2026-03-31T21:58:39.38Z" }, + { url = "https://files.pythonhosted.org/packages/57/ae/76177b15f18c5f5d094f19901d284025db28eccc5ae374d1d254181d33f4/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:ec707059ee75732b1ba130ed5f9580fe10ff75180c812bc267ded039db5128c6", size = 1773147, upload-time = "2026-03-31T21:58:41.476Z" }, + { url = "https://files.pythonhosted.org/packages/01/a4/62f05a0a98d88af59d93b7fcac564e5f18f513cb7471696ac286db970d6a/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:2d6d44a5b48132053c2f6cd5c8cb14bc67e99a63594e336b0f2af81e94d5530c", size = 1730356, upload-time = "2026-03-31T21:58:44.049Z" }, + { url = "https://files.pythonhosted.org/packages/e4/85/fc8601f59dfa8c9523808281f2da571f8b4699685f9809a228adcc90838d/aiohttp-3.13.5-cp313-cp313-win32.whl", hash = "sha256:329f292ed14d38a6c4c435e465f48bebb47479fd676a0411936cc371643225cc", size = 432637, upload-time = "2026-03-31T21:58:46.167Z" }, + { url = "https://files.pythonhosted.org/packages/c0/1b/ac685a8882896acf0f6b31d689e3792199cfe7aba37969fa91da63a7fa27/aiohttp-3.13.5-cp313-cp313-win_amd64.whl", hash = "sha256:69f571de7500e0557801c0b51f4780482c0ec5fe2ac851af5a92cfce1af1cb83", size = 458896, upload-time = "2026-03-31T21:58:48.119Z" }, +] + +[[package]] +name = "aiosignal" +version = "1.4.0" +source = { registry = "https://pypi.org/simple" } 
+dependencies = [ + { name = "frozenlist" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/61/62/06741b579156360248d1ec624842ad0edf697050bbaf7c3e46394e106ad1/aiosignal-1.4.0.tar.gz", hash = "sha256:f47eecd9468083c2029cc99945502cb7708b082c232f9aca65da147157b251c7", size = 25007, upload-time = "2025-07-03T22:54:43.528Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e", size = 7490, upload-time = "2025-07-03T22:54:42.156Z" }, +] + [[package]] name = "alembic" version = "1.18.4" @@ -58,6 +131,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, ] +[[package]] +name = "anthropic" +version = "0.96.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "distro" }, + { name = "docstring-parser" }, + { name = "httpx" }, + { name = "jiter" }, + { name = "pydantic" }, + { name = "sniffio" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b9/7e/672f533dee813028d2c699bfd2a7f52c9118d7353680d9aa44b9e23f717f/anthropic-0.96.0.tar.gz", hash = "sha256:9de947b737f39452f68aa520f1c2239d44119c9b73b0fb6d4e6ca80f00279ee6", size = 658210, upload-time = "2026-04-16T14:28:02.846Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/48/5a/72f33204064b6e87601a71a6baf8d855769f8a0c1eaae8d06a1094872371/anthropic-0.96.0-py3-none-any.whl", hash = "sha256:9a6e335a354602a521cd9e777e92bfd46ba6e115bf9bbfe6135311e8fb2015b2", size = 635930, upload-time = 
"2026-04-16T14:28:01.436Z" }, +] + [[package]] name = "anyio" version = "4.12.1" @@ -128,6 +220,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/91/66/b25ccb84a246b470eb943b0107c07edcae51804912b824054b3413995a10/asyncpg-0.31.0-cp313-cp313-win_amd64.whl", hash = "sha256:dc5f2fa9916f292e5c5c8b2ac2813763bcd7f58e130055b4ad8a0531314201ab", size = 596569, upload-time = "2025-11-24T23:26:16.189Z" }, ] +[[package]] +name = "attrs" +version = "26.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/9a/8e/82a0fe20a541c03148528be8cac2408564a6c9a0cc7e9171802bc1d26985/attrs-26.1.0.tar.gz", hash = "sha256:d03ceb89cb322a8fd706d4fb91940737b6642aa36998fe130a9bc96c985eff32", size = 952055, upload-time = "2026-03-19T14:22:25.026Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/64/b4/17d4b0b2a2dc85a6df63d1157e028ed19f90d4cd97c36717afef2bc2f395/attrs-26.1.0-py3-none-any.whl", hash = "sha256:c647aa4a12dfbad9333ca4e71fe62ddc36f4e63b2d260a37a8b83d2f043ac309", size = 67548, upload-time = "2026-03-19T14:22:23.645Z" }, +] + [[package]] name = "balanceteshaters" version = "0.1.0" @@ -135,20 +236,26 @@ source = { virtual = "." 
} dependencies = [ { name = "accelerate" }, { name = "alembic" }, + { name = "anthropic" }, { name = "argon2-cffi" }, { name = "asyncpg" }, { name = "bitsandbytes" }, { name = "colorlog" }, + { name = "datasets" }, { name = "dependency-injector", extra = ["pydantic2"] }, { name = "fastapi", extra = ["standard"] }, + { name = "lightgbm" }, { name = "lingua-language-detector" }, { name = "llama-cpp-python" }, { name = "ollama" }, { name = "pandas" }, + { name = "peft" }, { name = "pyjwt" }, + { name = "python-dotenv" }, { name = "python-multipart" }, { name = "requests" }, { name = "scikit-learn" }, + { name = "sentence-transformers" }, { name = "sqlalchemy", extra = ["asyncio"] }, { name = "torch" }, { name = "tqdm" }, @@ -160,20 +267,26 @@ dependencies = [ requires-dist = [ { name = "accelerate", specifier = ">=1.13.0" }, { name = "alembic", specifier = ">=1.17.2" }, + { name = "anthropic", specifier = ">=0.50.0" }, { name = "argon2-cffi", specifier = ">=25.1.0" }, { name = "asyncpg", specifier = ">=0.31.0" }, { name = "bitsandbytes", specifier = ">=0.49.2" }, { name = "colorlog", specifier = ">=6.10.1" }, + { name = "datasets", specifier = ">=3.0.0" }, { name = "dependency-injector", extras = ["pydantic2"], specifier = ">=4.48.2" }, { name = "fastapi", extras = ["standard"], specifier = ">=0.122.0" }, + { name = "lightgbm", specifier = ">=4.0.0" }, { name = "lingua-language-detector", specifier = ">=2.1.1" }, { name = "llama-cpp-python", specifier = ">=0.3.0" }, { name = "ollama", specifier = ">=0.6.1" }, { name = "pandas", specifier = ">=3.0.1" }, + { name = "peft", specifier = ">=0.14.0" }, { name = "pyjwt", specifier = ">=2.10.1" }, + { name = "python-dotenv", specifier = ">=1.0.0" }, { name = "python-multipart", specifier = ">=0.0.20" }, { name = "requests", specifier = ">=2.32.5" }, { name = "scikit-learn", specifier = ">=1.8.0" }, + { name = "sentence-transformers", specifier = ">=3.0.0" }, { name = "sqlalchemy", extras = ["asyncio"], specifier = ">=2.0.44" 
}, { name = "torch", specifier = ">=2.11.0" }, { name = "tqdm", specifier = ">=4.67.3" }, @@ -380,6 +493,31 @@ nvtx = [ { name = "nvidia-nvtx", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, ] +[[package]] +name = "datasets" +version = "4.8.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "dill" }, + { name = "filelock" }, + { name = "fsspec", extra = ["http"] }, + { name = "httpx" }, + { name = "huggingface-hub" }, + { name = "multiprocess" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pandas" }, + { name = "pyarrow" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "tqdm" }, + { name = "xxhash" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/22/22/73e46ac7a8c25e7ef0b3bd6f10da3465021d90219a32eb0b4d2afea4c56e/datasets-4.8.4.tar.gz", hash = "sha256:a1429ed853275ce7943a01c6d2e25475b4501eb758934362106a280470df3a52", size = 604382, upload-time = "2026-03-23T14:21:17.987Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b0/e5/247d094108e42ac26363ab8dc57f168840cf7c05774b40ffeb0d78868fcc/datasets-4.8.4-py3-none-any.whl", hash = "sha256:cdc8bee4698e549d78bf1fed6aea2eebc760b22b084f07e6fc020c6577a6ce6d", size = 526991, upload-time = "2026-03-23T14:21:15.89Z" }, +] + [[package]] name = "dependency-injector" version = "4.48.3" @@ -403,6 +541,15 @@ pydantic2 = [ { name = "pydantic-settings" }, ] +[[package]] +name = "dill" +version = "0.4.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/81/e1/56027a71e31b02ddc53c7d65b01e68edf64dea2932122fe7746a516f75d5/dill-0.4.1.tar.gz", hash = "sha256:423092df4182177d4d8ba8290c8a5b640c66ab35ec7da59ccfa00f6fa3eea5fa", size = 187315, upload-time = "2026-01-19T02:36:56.85Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1e/77/dc8c558f7593132cf8fefec57c4f60c83b16941c574ac5f619abb3ae7933/dill-0.4.1-py3-none-any.whl", hash = 
"sha256:1e1ce33e978ae97fcfcff5638477032b801c46c7c65cf717f95fbc2248f79a9d", size = 120019, upload-time = "2026-01-19T02:36:55.663Z" }, +] + [[package]] name = "diskcache" version = "5.6.3" @@ -412,6 +559,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3f/27/4570e78fc0bf5ea0ca45eb1de3818a23787af9b390c0b0a0033a1b8236f9/diskcache-5.6.3-py3-none-any.whl", hash = "sha256:5e31b2d5fbad117cc363ebaf6b689474db18a1f6438bc82358b024abd4c2ca19", size = 45550, upload-time = "2023-08-31T06:11:58.822Z" }, ] +[[package]] +name = "distro" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fc/f8/98eea607f65de6527f8a2e8885fc8015d3e6f5775df186e443e0964a11c3/distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed", size = 60722, upload-time = "2023-12-24T09:54:32.31Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" }, +] + [[package]] name = "dnspython" version = "2.8.0" @@ -421,6 +577,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ba/5a/18ad964b0086c6e62e2e7500f7edc89e3faa45033c71c1893d34eed2b2de/dnspython-2.8.0-py3-none-any.whl", hash = "sha256:01d9bbc4a2d76bf0db7c1f729812ded6d912bd318d3b1cf81d30c0f845dbf3af", size = 331094, upload-time = "2025-09-07T18:57:58.071Z" }, ] +[[package]] +name = "docstring-parser" +version = "0.18.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e0/4d/f332313098c1de1b2d2ff91cf2674415cc7cddab2ca1b01ae29774bd5fdf/docstring_parser-0.18.0.tar.gz", hash = "sha256:292510982205c12b1248696f44959db3cdd1740237a968ea1e2e7a900eeb2015", size = 29341, upload-time = "2026-04-14T04:09:19.867Z" } +wheels = [ 
+ { url = "https://files.pythonhosted.org/packages/a7/5f/ed01f9a3cdffbd5a008556fc7b2a08ddb1cc6ace7effa7340604b1d16699/docstring_parser-0.18.0-py3-none-any.whl", hash = "sha256:b3fcbed555c47d8479be0796ef7e19c2670d428d72e96da63f3a40122860374b", size = 22484, upload-time = "2026-04-14T04:09:18.638Z" }, +] + [[package]] name = "email-validator" version = "2.3.0" @@ -548,6 +713,63 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a4/a5/842ae8f0c08b61d6484b52f99a03510a3a72d23141942d216ebe81fefbce/filelock-3.25.2-py3-none-any.whl", hash = "sha256:ca8afb0da15f229774c9ad1b455ed96e85a81373065fb10446672f64444ddf70", size = 26759, upload-time = "2026-03-11T20:45:37.437Z" }, ] +[[package]] +name = "frozenlist" +version = "1.8.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2d/f5/c831fac6cc817d26fd54c7eaccd04ef7e0288806943f7cc5bbf69f3ac1f0/frozenlist-1.8.0.tar.gz", hash = "sha256:3ede829ed8d842f6cd48fc7081d7a41001a56f1f38603f9d49bf3020d59a31ad", size = 45875, upload-time = "2025-10-06T05:38:17.865Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/69/29/948b9aa87e75820a38650af445d2ef2b6b8a6fab1a23b6bb9e4ef0be2d59/frozenlist-1.8.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:78f7b9e5d6f2fdb88cdde9440dc147259b62b9d3b019924def9f6478be254ac1", size = 87782, upload-time = "2025-10-06T05:36:06.649Z" }, + { url = "https://files.pythonhosted.org/packages/64/80/4f6e318ee2a7c0750ed724fa33a4bdf1eacdc5a39a7a24e818a773cd91af/frozenlist-1.8.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:229bf37d2e4acdaf808fd3f06e854a4a7a3661e871b10dc1f8f1896a3b05f18b", size = 50594, upload-time = "2025-10-06T05:36:07.69Z" }, + { url = "https://files.pythonhosted.org/packages/2b/94/5c8a2b50a496b11dd519f4a24cb5496cf125681dd99e94c604ccdea9419a/frozenlist-1.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f833670942247a14eafbb675458b4e61c82e002a148f49e68257b79296e865c4", size = 50448, upload-time 
= "2025-10-06T05:36:08.78Z" }, + { url = "https://files.pythonhosted.org/packages/6a/bd/d91c5e39f490a49df14320f4e8c80161cfcce09f1e2cde1edd16a551abb3/frozenlist-1.8.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:494a5952b1c597ba44e0e78113a7266e656b9794eec897b19ead706bd7074383", size = 242411, upload-time = "2025-10-06T05:36:09.801Z" }, + { url = "https://files.pythonhosted.org/packages/8f/83/f61505a05109ef3293dfb1ff594d13d64a2324ac3482be2cedc2be818256/frozenlist-1.8.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:96f423a119f4777a4a056b66ce11527366a8bb92f54e541ade21f2374433f6d4", size = 243014, upload-time = "2025-10-06T05:36:11.394Z" }, + { url = "https://files.pythonhosted.org/packages/d8/cb/cb6c7b0f7d4023ddda30cf56b8b17494eb3a79e3fda666bf735f63118b35/frozenlist-1.8.0-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3462dd9475af2025c31cc61be6652dfa25cbfb56cbbf52f4ccfe029f38decaf8", size = 234909, upload-time = "2025-10-06T05:36:12.598Z" }, + { url = "https://files.pythonhosted.org/packages/31/c5/cd7a1f3b8b34af009fb17d4123c5a778b44ae2804e3ad6b86204255f9ec5/frozenlist-1.8.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c4c800524c9cd9bac5166cd6f55285957fcfc907db323e193f2afcd4d9abd69b", size = 250049, upload-time = "2025-10-06T05:36:14.065Z" }, + { url = "https://files.pythonhosted.org/packages/c0/01/2f95d3b416c584a1e7f0e1d6d31998c4a795f7544069ee2e0962a4b60740/frozenlist-1.8.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d6a5df73acd3399d893dafc71663ad22534b5aa4f94e8a2fabfe856c3c1b6a52", size = 256485, upload-time = "2025-10-06T05:36:15.39Z" }, + { url = "https://files.pythonhosted.org/packages/ce/03/024bf7720b3abaebcff6d0793d73c154237b85bdf67b7ed55e5e9596dc9a/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_aarch64.whl", 
hash = "sha256:405e8fe955c2280ce66428b3ca55e12b3c4e9c336fb2103a4937e891c69a4a29", size = 237619, upload-time = "2025-10-06T05:36:16.558Z" }, + { url = "https://files.pythonhosted.org/packages/69/fa/f8abdfe7d76b731f5d8bd217827cf6764d4f1d9763407e42717b4bed50a0/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:908bd3f6439f2fef9e85031b59fd4f1297af54415fb60e4254a95f75b3cab3f3", size = 250320, upload-time = "2025-10-06T05:36:17.821Z" }, + { url = "https://files.pythonhosted.org/packages/f5/3c/b051329f718b463b22613e269ad72138cc256c540f78a6de89452803a47d/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:294e487f9ec720bd8ffcebc99d575f7eff3568a08a253d1ee1a0378754b74143", size = 246820, upload-time = "2025-10-06T05:36:19.046Z" }, + { url = "https://files.pythonhosted.org/packages/0f/ae/58282e8f98e444b3f4dd42448ff36fa38bef29e40d40f330b22e7108f565/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:74c51543498289c0c43656701be6b077f4b265868fa7f8a8859c197006efb608", size = 250518, upload-time = "2025-10-06T05:36:20.763Z" }, + { url = "https://files.pythonhosted.org/packages/8f/96/007e5944694d66123183845a106547a15944fbbb7154788cbf7272789536/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:776f352e8329135506a1d6bf16ac3f87bc25b28e765949282dcc627af36123aa", size = 239096, upload-time = "2025-10-06T05:36:22.129Z" }, + { url = "https://files.pythonhosted.org/packages/66/bb/852b9d6db2fa40be96f29c0d1205c306288f0684df8fd26ca1951d461a56/frozenlist-1.8.0-cp312-cp312-win32.whl", hash = "sha256:433403ae80709741ce34038da08511d4a77062aa924baf411ef73d1146e74faf", size = 39985, upload-time = "2025-10-06T05:36:23.661Z" }, + { url = "https://files.pythonhosted.org/packages/b8/af/38e51a553dd66eb064cdf193841f16f077585d4d28394c2fa6235cb41765/frozenlist-1.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:34187385b08f866104f0c0617404c8eb08165ab1272e884abc89c112e9c00746", size = 44591, upload-time = "2025-10-06T05:36:24.958Z" 
}, + { url = "https://files.pythonhosted.org/packages/a7/06/1dc65480ab147339fecc70797e9c2f69d9cea9cf38934ce08df070fdb9cb/frozenlist-1.8.0-cp312-cp312-win_arm64.whl", hash = "sha256:fe3c58d2f5db5fbd18c2987cba06d51b0529f52bc3a6cdc33d3f4eab725104bd", size = 40102, upload-time = "2025-10-06T05:36:26.333Z" }, + { url = "https://files.pythonhosted.org/packages/2d/40/0832c31a37d60f60ed79e9dfb5a92e1e2af4f40a16a29abcc7992af9edff/frozenlist-1.8.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8d92f1a84bb12d9e56f818b3a746f3efba93c1b63c8387a73dde655e1e42282a", size = 85717, upload-time = "2025-10-06T05:36:27.341Z" }, + { url = "https://files.pythonhosted.org/packages/30/ba/b0b3de23f40bc55a7057bd38434e25c34fa48e17f20ee273bbde5e0650f3/frozenlist-1.8.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:96153e77a591c8adc2ee805756c61f59fef4cf4073a9275ee86fe8cba41241f7", size = 49651, upload-time = "2025-10-06T05:36:28.855Z" }, + { url = "https://files.pythonhosted.org/packages/0c/ab/6e5080ee374f875296c4243c381bbdef97a9ac39c6e3ce1d5f7d42cb78d6/frozenlist-1.8.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f21f00a91358803399890ab167098c131ec2ddd5f8f5fd5fe9c9f2c6fcd91e40", size = 49417, upload-time = "2025-10-06T05:36:29.877Z" }, + { url = "https://files.pythonhosted.org/packages/d5/4e/e4691508f9477ce67da2015d8c00acd751e6287739123113a9fca6f1604e/frozenlist-1.8.0-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:fb30f9626572a76dfe4293c7194a09fb1fe93ba94c7d4f720dfae3b646b45027", size = 234391, upload-time = "2025-10-06T05:36:31.301Z" }, + { url = "https://files.pythonhosted.org/packages/40/76/c202df58e3acdf12969a7895fd6f3bc016c642e6726aa63bd3025e0fc71c/frozenlist-1.8.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eaa352d7047a31d87dafcacbabe89df0aa506abb5b1b85a2fb91bc3faa02d822", size = 233048, upload-time = "2025-10-06T05:36:32.531Z" }, + { url = 
"https://files.pythonhosted.org/packages/f9/c0/8746afb90f17b73ca5979c7a3958116e105ff796e718575175319b5bb4ce/frozenlist-1.8.0-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:03ae967b4e297f58f8c774c7eabcce57fe3c2434817d4385c50661845a058121", size = 226549, upload-time = "2025-10-06T05:36:33.706Z" }, + { url = "https://files.pythonhosted.org/packages/7e/eb/4c7eefc718ff72f9b6c4893291abaae5fbc0c82226a32dcd8ef4f7a5dbef/frozenlist-1.8.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f6292f1de555ffcc675941d65fffffb0a5bcd992905015f85d0592201793e0e5", size = 239833, upload-time = "2025-10-06T05:36:34.947Z" }, + { url = "https://files.pythonhosted.org/packages/c2/4e/e5c02187cf704224f8b21bee886f3d713ca379535f16893233b9d672ea71/frozenlist-1.8.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:29548f9b5b5e3460ce7378144c3010363d8035cea44bc0bf02d57f5a685e084e", size = 245363, upload-time = "2025-10-06T05:36:36.534Z" }, + { url = "https://files.pythonhosted.org/packages/1f/96/cb85ec608464472e82ad37a17f844889c36100eed57bea094518bf270692/frozenlist-1.8.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ec3cc8c5d4084591b4237c0a272cc4f50a5b03396a47d9caaf76f5d7b38a4f11", size = 229314, upload-time = "2025-10-06T05:36:38.582Z" }, + { url = "https://files.pythonhosted.org/packages/5d/6f/4ae69c550e4cee66b57887daeebe006fe985917c01d0fff9caab9883f6d0/frozenlist-1.8.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:517279f58009d0b1f2e7c1b130b377a349405da3f7621ed6bfae50b10adf20c1", size = 243365, upload-time = "2025-10-06T05:36:40.152Z" }, + { url = "https://files.pythonhosted.org/packages/7a/58/afd56de246cf11780a40a2c28dc7cbabbf06337cc8ddb1c780a2d97e88d8/frozenlist-1.8.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:db1e72ede2d0d7ccb213f218df6a078a9c09a7de257c2fe8fcef16d5925230b1", size = 237763, upload-time = 
"2025-10-06T05:36:41.355Z" }, + { url = "https://files.pythonhosted.org/packages/cb/36/cdfaf6ed42e2644740d4a10452d8e97fa1c062e2a8006e4b09f1b5fd7d63/frozenlist-1.8.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:b4dec9482a65c54a5044486847b8a66bf10c9cb4926d42927ec4e8fd5db7fed8", size = 240110, upload-time = "2025-10-06T05:36:42.716Z" }, + { url = "https://files.pythonhosted.org/packages/03/a8/9ea226fbefad669f11b52e864c55f0bd57d3c8d7eb07e9f2e9a0b39502e1/frozenlist-1.8.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:21900c48ae04d13d416f0e1e0c4d81f7931f73a9dfa0b7a8746fb2fe7dd970ed", size = 233717, upload-time = "2025-10-06T05:36:44.251Z" }, + { url = "https://files.pythonhosted.org/packages/1e/0b/1b5531611e83ba7d13ccc9988967ea1b51186af64c42b7a7af465dcc9568/frozenlist-1.8.0-cp313-cp313-win32.whl", hash = "sha256:8b7b94a067d1c504ee0b16def57ad5738701e4ba10cec90529f13fa03c833496", size = 39628, upload-time = "2025-10-06T05:36:45.423Z" }, + { url = "https://files.pythonhosted.org/packages/d8/cf/174c91dbc9cc49bc7b7aab74d8b734e974d1faa8f191c74af9b7e80848e6/frozenlist-1.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:878be833caa6a3821caf85eb39c5ba92d28e85df26d57afb06b35b2efd937231", size = 43882, upload-time = "2025-10-06T05:36:46.796Z" }, + { url = "https://files.pythonhosted.org/packages/c1/17/502cd212cbfa96eb1388614fe39a3fc9ab87dbbe042b66f97acb57474834/frozenlist-1.8.0-cp313-cp313-win_arm64.whl", hash = "sha256:44389d135b3ff43ba8cc89ff7f51f5a0bb6b63d829c8300f79a2fe4fe61bcc62", size = 39676, upload-time = "2025-10-06T05:36:47.8Z" }, + { url = "https://files.pythonhosted.org/packages/d2/5c/3bbfaa920dfab09e76946a5d2833a7cbdf7b9b4a91c714666ac4855b88b4/frozenlist-1.8.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:e25ac20a2ef37e91c1b39938b591457666a0fa835c7783c3a8f33ea42870db94", size = 89235, upload-time = "2025-10-06T05:36:48.78Z" }, + { url = 
"https://files.pythonhosted.org/packages/d2/d6/f03961ef72166cec1687e84e8925838442b615bd0b8854b54923ce5b7b8a/frozenlist-1.8.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:07cdca25a91a4386d2e76ad992916a85038a9b97561bf7a3fd12d5d9ce31870c", size = 50742, upload-time = "2025-10-06T05:36:49.837Z" }, + { url = "https://files.pythonhosted.org/packages/1e/bb/a6d12b7ba4c3337667d0e421f7181c82dda448ce4e7ad7ecd249a16fa806/frozenlist-1.8.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:4e0c11f2cc6717e0a741f84a527c52616140741cd812a50422f83dc31749fb52", size = 51725, upload-time = "2025-10-06T05:36:50.851Z" }, + { url = "https://files.pythonhosted.org/packages/bc/71/d1fed0ffe2c2ccd70b43714c6cab0f4188f09f8a67a7914a6b46ee30f274/frozenlist-1.8.0-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b3210649ee28062ea6099cfda39e147fa1bc039583c8ee4481cb7811e2448c51", size = 284533, upload-time = "2025-10-06T05:36:51.898Z" }, + { url = "https://files.pythonhosted.org/packages/c9/1f/fb1685a7b009d89f9bf78a42d94461bc06581f6e718c39344754a5d9bada/frozenlist-1.8.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:581ef5194c48035a7de2aefc72ac6539823bb71508189e5de01d60c9dcd5fa65", size = 292506, upload-time = "2025-10-06T05:36:53.101Z" }, + { url = "https://files.pythonhosted.org/packages/e6/3b/b991fe1612703f7e0d05c0cf734c1b77aaf7c7d321df4572e8d36e7048c8/frozenlist-1.8.0-cp313-cp313t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3ef2d026f16a2b1866e1d86fc4e1291e1ed8a387b2c333809419a2f8b3a77b82", size = 274161, upload-time = "2025-10-06T05:36:54.309Z" }, + { url = "https://files.pythonhosted.org/packages/ca/ec/c5c618767bcdf66e88945ec0157d7f6c4a1322f1473392319b7a2501ded7/frozenlist-1.8.0-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:5500ef82073f599ac84d888e3a8c1f77ac831183244bfd7f11eaa0289fb30714", 
size = 294676, upload-time = "2025-10-06T05:36:55.566Z" }, + { url = "https://files.pythonhosted.org/packages/7c/ce/3934758637d8f8a88d11f0585d6495ef54b2044ed6ec84492a91fa3b27aa/frozenlist-1.8.0-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:50066c3997d0091c411a66e710f4e11752251e6d2d73d70d8d5d4c76442a199d", size = 300638, upload-time = "2025-10-06T05:36:56.758Z" }, + { url = "https://files.pythonhosted.org/packages/fc/4f/a7e4d0d467298f42de4b41cbc7ddaf19d3cfeabaf9ff97c20c6c7ee409f9/frozenlist-1.8.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:5c1c8e78426e59b3f8005e9b19f6ff46e5845895adbde20ece9218319eca6506", size = 283067, upload-time = "2025-10-06T05:36:57.965Z" }, + { url = "https://files.pythonhosted.org/packages/dc/48/c7b163063d55a83772b268e6d1affb960771b0e203b632cfe09522d67ea5/frozenlist-1.8.0-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:eefdba20de0d938cec6a89bd4d70f346a03108a19b9df4248d3cf0d88f1b0f51", size = 292101, upload-time = "2025-10-06T05:36:59.237Z" }, + { url = "https://files.pythonhosted.org/packages/9f/d0/2366d3c4ecdc2fd391e0afa6e11500bfba0ea772764d631bbf82f0136c9d/frozenlist-1.8.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:cf253e0e1c3ceb4aaff6df637ce033ff6535fb8c70a764a8f46aafd3d6ab798e", size = 289901, upload-time = "2025-10-06T05:37:00.811Z" }, + { url = "https://files.pythonhosted.org/packages/b8/94/daff920e82c1b70e3618a2ac39fbc01ae3e2ff6124e80739ce5d71c9b920/frozenlist-1.8.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:032efa2674356903cd0261c4317a561a6850f3ac864a63fc1583147fb05a79b0", size = 289395, upload-time = "2025-10-06T05:37:02.115Z" }, + { url = "https://files.pythonhosted.org/packages/e3/20/bba307ab4235a09fdcd3cc5508dbabd17c4634a1af4b96e0f69bfe551ebd/frozenlist-1.8.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6da155091429aeba16851ecb10a9104a108bcd32f6c1642867eadaee401c1c41", size = 283659, upload-time = "2025-10-06T05:37:03.711Z" }, + 
{ url = "https://files.pythonhosted.org/packages/fd/00/04ca1c3a7a124b6de4f8a9a17cc2fcad138b4608e7a3fc5877804b8715d7/frozenlist-1.8.0-cp313-cp313t-win32.whl", hash = "sha256:0f96534f8bfebc1a394209427d0f8a63d343c9779cda6fc25e8e121b5fd8555b", size = 43492, upload-time = "2025-10-06T05:37:04.915Z" }, + { url = "https://files.pythonhosted.org/packages/59/5e/c69f733a86a94ab10f68e496dc6b7e8bc078ebb415281d5698313e3af3a1/frozenlist-1.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:5d63a068f978fc69421fb0e6eb91a9603187527c86b7cd3f534a5b77a592b888", size = 48034, upload-time = "2025-10-06T05:37:06.343Z" }, + { url = "https://files.pythonhosted.org/packages/16/6c/be9d79775d8abe79b05fa6d23da99ad6e7763a1d080fbae7290b286093fd/frozenlist-1.8.0-cp313-cp313t-win_arm64.whl", hash = "sha256:bf0a7e10b077bf5fb9380ad3ae8ce20ef919a6ad93b4552896419ac7e1d8e042", size = 41749, upload-time = "2025-10-06T05:37:07.431Z" }, + { url = "https://files.pythonhosted.org/packages/9a/9a/e35b4a917281c0b8419d4207f4334c8e8c5dbf4f3f5f9ada73958d937dcc/frozenlist-1.8.0-py3-none-any.whl", hash = "sha256:0c18a16eab41e82c295618a77502e17b195883241c563b00f0aa5106fc4eaa0d", size = 13409, upload-time = "2025-10-06T05:38:16.721Z" }, +] + [[package]] name = "fsspec" version = "2026.2.0" @@ -557,6 +779,11 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e6/ab/fb21f4c939bb440104cc2b396d3be1d9b7a9fd3c6c2a53d98c45b3d7c954/fsspec-2026.2.0-py3-none-any.whl", hash = "sha256:98de475b5cb3bd66bedd5c4679e87b4fdfe1a3bf4d707b151b3c07e58c9a2437", size = 202505, upload-time = "2026-02-05T21:50:51.819Z" }, ] +[package.optional-dependencies] +http = [ + { name = "aiohttp" }, +] + [[package]] name = "greenlet" version = "3.3.0" @@ -705,6 +932,51 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = 
"2025-03-05T20:05:00.369Z" }, ] +[[package]] +name = "jiter" +version = "0.14.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6e/c1/0cddc6eb17d4c53a99840953f95dd3accdc5cfc7a337b0e9b26476276be9/jiter-0.14.0.tar.gz", hash = "sha256:e8a39e66dac7153cf3f964a12aad515afa8d74938ec5cc0018adcdae5367c79e", size = 165725, upload-time = "2026-04-10T14:28:42.01Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5a/68/7390a418f10897da93b158f2d5a8bd0bcd73a0f9ec3bb36917085bb759ef/jiter-0.14.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:2fb2ce3a7bc331256dfb14cefc34832366bb28a9aca81deaf43bbf2a5659e607", size = 316295, upload-time = "2026-04-10T14:26:24.887Z" }, + { url = "https://files.pythonhosted.org/packages/60/a0/5854ac00ff63551c52c6c89534ec6aba4b93474e7924d64e860b1c94165b/jiter-0.14.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5252a7ca23785cef5d02d4ece6077a1b556a410c591b379f82091c3001e14844", size = 315898, upload-time = "2026-04-10T14:26:26.601Z" }, + { url = "https://files.pythonhosted.org/packages/41/a1/4f44832650a16b18e8391f1bf1d6ca4909bc738351826bcc198bba4357f4/jiter-0.14.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c409578cbd77c338975670ada777add4efd53379667edf0aceea730cabede6fb", size = 343730, upload-time = "2026-04-10T14:26:28.326Z" }, + { url = "https://files.pythonhosted.org/packages/48/64/a329e9d469f86307203594b1707e11ae51c3348d03bfd514a5f997870012/jiter-0.14.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7ede4331a1899d604463369c730dbb961ffdc5312bc7f16c41c2896415b1304a", size = 370102, upload-time = "2026-04-10T14:26:30.089Z" }, + { url = "https://files.pythonhosted.org/packages/94/c1/5e3dfc59635aa4d4c7bd20a820ac1d09b8ed851568356802cf1c08edb3cf/jiter-0.14.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:92cd8b6025981a041f5310430310b55b25ca593972c16407af8837d3d7d2ca01", size = 
461335, upload-time = "2026-04-10T14:26:31.911Z" }, + { url = "https://files.pythonhosted.org/packages/e3/1b/dd157009dbc058f7b00108f545ccb72a2d56461395c4fc7b9cfdccb00af4/jiter-0.14.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:351bf6eda4e3a7ceb876377840c702e9a3e4ecc4624dbfb2d6463c67ae52637d", size = 378536, upload-time = "2026-04-10T14:26:33.595Z" }, + { url = "https://files.pythonhosted.org/packages/91/78/256013667b7c10b8834f8e6e54cd3e562d4c6e34227a1596addccc05e38c/jiter-0.14.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1dcfbeb93d9ecd9ca128bbf8910120367777973fa193fb9a39c31237d8df165", size = 353859, upload-time = "2026-04-10T14:26:35.098Z" }, + { url = "https://files.pythonhosted.org/packages/de/d9/137d65ade9093a409fe80955ce60b12bb753722c986467aeda47faf450ad/jiter-0.14.0-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:ae039aaef8de3f8157ecc1fdd4d85043ac4f57538c245a0afaecb8321ec951c3", size = 357626, upload-time = "2026-04-10T14:26:36.685Z" }, + { url = "https://files.pythonhosted.org/packages/2e/48/76750835b87029342727c1a268bea8878ab988caf81ee4e7b880900eeb5a/jiter-0.14.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7d9d51eb96c82a9652933bd769fe6de66877d6eb2b2440e281f2938c51b5643e", size = 393172, upload-time = "2026-04-10T14:26:38.097Z" }, + { url = "https://files.pythonhosted.org/packages/a6/60/456c4e81d5c8045279aefe60e9e483be08793828800a4e64add8fdde7f2a/jiter-0.14.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d824ca4148b705970bf4e120924a212fdfca9859a73e42bd7889a63a4ea6bb98", size = 520300, upload-time = "2026-04-10T14:26:39.532Z" }, + { url = "https://files.pythonhosted.org/packages/a8/9f/2020e0984c235f678dced38fe4eec3058cf528e6af36ebf969b410305941/jiter-0.14.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ff3a6465b3a0f54b1a430f45c3c0ba7d61ceb45cbc3e33f9e1a7f638d690baf3", size = 553059, upload-time = "2026-04-10T14:26:40.991Z" }, + { url = 
"https://files.pythonhosted.org/packages/ef/32/e2d298e1a22a4bbe6062136d1c7192db7dba003a6975e51d9a9eecabc4c2/jiter-0.14.0-cp312-cp312-win32.whl", hash = "sha256:5dec7c0a3e98d2a3f8a2e67382d0d7c3ac60c69103a4b271da889b4e8bb1e129", size = 206030, upload-time = "2026-04-10T14:26:42.517Z" }, + { url = "https://files.pythonhosted.org/packages/36/ac/96369141b3d8a4a8e4590e983085efe1c436f35c0cda940dd76d942e3e40/jiter-0.14.0-cp312-cp312-win_amd64.whl", hash = "sha256:fc7e37b4b8bc7e80a63ad6cfa5fc11fab27dbfea4cc4ae644b1ab3f273dc348f", size = 201603, upload-time = "2026-04-10T14:26:44.328Z" }, + { url = "https://files.pythonhosted.org/packages/01/c3/75d847f264647017d7e3052bbcc8b1e24b95fa139c320c5f5066fa7a0bdd/jiter-0.14.0-cp312-cp312-win_arm64.whl", hash = "sha256:ee4a72f12847ef29b072aee9ad5474041ab2924106bdca9fcf5d7d965853e057", size = 191525, upload-time = "2026-04-10T14:26:46Z" }, + { url = "https://files.pythonhosted.org/packages/97/2a/09f70020898507a89279659a1afe3364d57fc1b2c89949081975d135f6f5/jiter-0.14.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:af72f204cf4d44258e5b4c1745130ac45ddab0e71a06333b01de660ab4187a94", size = 315502, upload-time = "2026-04-10T14:26:47.697Z" }, + { url = "https://files.pythonhosted.org/packages/d6/be/080c96a45cd74f9fce5db4fd68510b88087fb37ffe2541ff73c12db92535/jiter-0.14.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4b77da71f6e819be5fbcec11a453fde5b1d0267ef6ed487e2a392fd8e14e4e3a", size = 314870, upload-time = "2026-04-10T14:26:49.149Z" }, + { url = "https://files.pythonhosted.org/packages/7d/5e/2d0fee155826a968a832cc32438de5e2a193292c8721ca70d0b53e58245b/jiter-0.14.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77f4ea612fe8b84b8b04e51d0e78029ecf3466348e25973f953de6e6a59aa4c1", size = 343406, upload-time = "2026-04-10T14:26:50.762Z" }, + { url = 
"https://files.pythonhosted.org/packages/70/af/bf9ee0d3a4f8dc0d679fc1337f874fe60cdbf841ebbb304b374e1c9aaceb/jiter-0.14.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:62fe2451f8fcc0240261e6a4df18ecbcd58327857e61e625b2393ea3b468aac9", size = 369415, upload-time = "2026-04-10T14:26:52.188Z" }, + { url = "https://files.pythonhosted.org/packages/0f/83/8e8561eadba31f4d3948a5b712fb0447ec71c3560b57a855449e7b8ddc98/jiter-0.14.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6112f26f5afc75bcb475787d29da3aa92f9d09c7858f632f4be6ffe607be82e9", size = 461456, upload-time = "2026-04-10T14:26:53.611Z" }, + { url = "https://files.pythonhosted.org/packages/f6/c9/c5299e826a5fe6108d172b344033f61c69b1bb979dd8d9ddd4278a160971/jiter-0.14.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:215a6cb8fb7dc702aa35d475cc00ddc7f970e5c0b1417fb4b4ac5d82fa2a29db", size = 378488, upload-time = "2026-04-10T14:26:55.211Z" }, + { url = "https://files.pythonhosted.org/packages/5d/37/c16d9d15c0a471b8644b1abe3c82668092a707d9bedcf076f24ff2e380cd/jiter-0.14.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc4ab96a30fb3cb2c7e0cd33f7616c8860da5f5674438988a54ac717caccdbaa", size = 353242, upload-time = "2026-04-10T14:26:56.705Z" }, + { url = "https://files.pythonhosted.org/packages/58/ea/8050cb0dc654e728e1bfacbc0c640772f2181af5dedd13ae70145743a439/jiter-0.14.0-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:3a99c1387b1f2928f799a9de899193484d66206a50e98233b6b088a7f0c1edb2", size = 356823, upload-time = "2026-04-10T14:26:58.281Z" }, + { url = "https://files.pythonhosted.org/packages/b0/3b/cf71506d270e5f84d97326bf220e47aed9b95e9a4a060758fb07772170ab/jiter-0.14.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ab18d11074485438695f8d34a1b6da61db9754248f96d51341956607a8f39985", size = 392564, upload-time = "2026-04-10T14:27:00.018Z" }, + { url = 
"https://files.pythonhosted.org/packages/b0/cc/8c6c74a3efb5bd671bfd14f51e8a73375464ca914b1551bc3b40e26ac2c9/jiter-0.14.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:801028dcfc26ac0895e4964cbc0fd62c73be9fd4a7d7b1aaf6e5790033a719b7", size = 520322, upload-time = "2026-04-10T14:27:01.664Z" }, + { url = "https://files.pythonhosted.org/packages/41/24/68d7b883ec959884ddf00d019b2e0e82ba81b167e1253684fa90519ce33c/jiter-0.14.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ad425b087aafb4a1c7e1e98a279200743b9aaf30c3e0ba723aec93f061bd9bc8", size = 552619, upload-time = "2026-04-10T14:27:03.316Z" }, + { url = "https://files.pythonhosted.org/packages/b6/89/b1a0985223bbf3150ff9e8f46f98fc9360c1de94f48abe271bbe1b465682/jiter-0.14.0-cp313-cp313-win32.whl", hash = "sha256:882bcb9b334318e233950b8be366fe5f92c86b66a7e449e76975dfd6d776a01f", size = 205699, upload-time = "2026-04-10T14:27:04.662Z" }, + { url = "https://files.pythonhosted.org/packages/4c/19/3f339a5a7f14a11730e67f6be34f9d5105751d547b615ef593fa122a5ded/jiter-0.14.0-cp313-cp313-win_amd64.whl", hash = "sha256:9b8c571a5dba09b98bd3462b5a53f27209a5cbbe85670391692ede71974e979f", size = 201323, upload-time = "2026-04-10T14:27:06.139Z" }, + { url = "https://files.pythonhosted.org/packages/50/56/752dd89c84be0e022a8ea3720bcfa0a8431db79a962578544812ce061739/jiter-0.14.0-cp313-cp313-win_arm64.whl", hash = "sha256:34f19dcc35cb1abe7c369b3756babf8c7f04595c0807a848df8f26ef8298ef92", size = 191099, upload-time = "2026-04-10T14:27:07.564Z" }, + { url = "https://files.pythonhosted.org/packages/91/28/292916f354f25a1fe8cf2c918d1415c699a4a659ae00be0430e1c5d9ffea/jiter-0.14.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e89bcd7d426a75bb4952c696b267075790d854a07aad4c9894551a82c5b574ab", size = 320880, upload-time = "2026-04-10T14:27:09.326Z" }, + { url = 
"https://files.pythonhosted.org/packages/ad/c7/b002a7d8b8957ac3d469bd59c18ef4b1595a5216ae0de639a287b9816023/jiter-0.14.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b25beaa0d4447ea8c7ae0c18c688905d34840d7d0b937f2f7bdd52162c98a40", size = 346563, upload-time = "2026-04-10T14:27:11.287Z" }, + { url = "https://files.pythonhosted.org/packages/f9/3b/f8d07580d8706021d255a6356b8fab13ee4c869412995550ce6ed4ddf97d/jiter-0.14.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:651a8758dd413c51e3b7f6557cdc6921faf70b14106f45f969f091f5cda990ea", size = 357928, upload-time = "2026-04-10T14:27:12.729Z" }, + { url = "https://files.pythonhosted.org/packages/47/5b/ac1a974da29e35507230383110ffec59998b290a8732585d04e19a9eb5ba/jiter-0.14.0-cp313-cp313t-win_amd64.whl", hash = "sha256:e1a7eead856a5038a8d291f1447176ab0b525c77a279a058121b5fccee257f6f", size = 203519, upload-time = "2026-04-10T14:27:14.125Z" }, + { url = "https://files.pythonhosted.org/packages/96/6d/9fc8433d667d2454271378a79747d8c76c10b51b482b454e6190e511f244/jiter-0.14.0-cp313-cp313t-win_arm64.whl", hash = "sha256:2e692633a12cda97e352fdcd1c4acc971b1c28707e1e33aeef782b0cbf051975", size = 190113, upload-time = "2026-04-10T14:27:16.638Z" }, + { url = "https://files.pythonhosted.org/packages/21/42/9042c3f3019de4adcb8c16591c325ec7255beea9fcd33a42a43f3b0b1000/jiter-0.14.0-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:fbd9e482663ca9d005d051330e4d2d8150bb208a209409c10f7e7dfdf7c49da9", size = 308810, upload-time = "2026-04-10T14:28:34.673Z" }, + { url = "https://files.pythonhosted.org/packages/60/cf/a7e19b308bd86bb04776803b1f01a5f9a287a4c55205f4708827ee487fbf/jiter-0.14.0-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:33a20d838b91ef376b3a56896d5b04e725c7df5bc4864cc6569cf046a8d73b6d", size = 308443, upload-time = "2026-04-10T14:28:36.658Z" }, + { url = 
"https://files.pythonhosted.org/packages/ca/44/e26ede3f0caeff93f222559cb0cc4ca68579f07d009d7b6010c5b586f9b1/jiter-0.14.0-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:432c4db5255d86a259efde91e55cb4c8d18c0521d844c9e2e7efcce3899fb016", size = 343039, upload-time = "2026-04-10T14:28:38.356Z" }, + { url = "https://files.pythonhosted.org/packages/da/e9/1f9ada30cef7b05e74bb06f52127e7a724976c225f46adb65c37b1dadfb6/jiter-0.14.0-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67f00d94b281174144d6532a04b66a12cb866cbdc47c3af3bfe2973677f9861a", size = 349613, upload-time = "2026-04-10T14:28:40.066Z" }, +] + [[package]] name = "joblib" version = "1.5.3" @@ -714,6 +986,23 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl", hash = "sha256:5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713", size = 309071, upload-time = "2025-12-15T08:41:44.973Z" }, ] +[[package]] +name = "lightgbm" +version = "4.6.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "scipy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/68/0b/a2e9f5c5da7ef047cc60cef37f86185088845e8433e54d2e7ed439cce8a3/lightgbm-4.6.0.tar.gz", hash = "sha256:cb1c59720eb569389c0ba74d14f52351b573af489f230032a1c9f314f8bab7fe", size = 1703705, upload-time = "2025-02-15T04:03:03.111Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f2/75/cffc9962cca296bc5536896b7e65b4a7cdeb8db208e71b9c0133c08f8f7e/lightgbm-4.6.0-py3-none-macosx_10_15_x86_64.whl", hash = "sha256:b7a393de8a334d5c8e490df91270f0763f83f959574d504c7ccb9eee4aef70ed", size = 2010151, upload-time = "2025-02-15T04:02:50.961Z" }, + { url = 
"https://files.pythonhosted.org/packages/21/1b/550ee378512b78847930f5d74228ca1fdba2a7fbdeaac9aeccc085b0e257/lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl", hash = "sha256:2dafd98d4e02b844ceb0b61450a660681076b1ea6c7adb8c566dfd66832aafad", size = 1592172, upload-time = "2025-02-15T04:02:53.937Z" }, + { url = "https://files.pythonhosted.org/packages/64/41/4fbde2c3d29e25ee7c41d87df2f2e5eda65b431ee154d4d462c31041846c/lightgbm-4.6.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:4d68712bbd2b57a0b14390cbf9376c1d5ed773fa2e71e099cac588703b590336", size = 3454567, upload-time = "2025-02-15T04:02:56.443Z" }, + { url = "https://files.pythonhosted.org/packages/42/86/dabda8fbcb1b00bcfb0003c3776e8ade1aa7b413dff0a2c08f457dace22f/lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:cb19b5afea55b5b61cbb2131095f50538bd608a00655f23ad5d25ae3e3bf1c8d", size = 3569831, upload-time = "2025-02-15T04:02:58.925Z" }, + { url = "https://files.pythonhosted.org/packages/5e/23/f8b28ca248bb629b9e08f877dd2965d1994e1674a03d67cd10c5246da248/lightgbm-4.6.0-py3-none-win_amd64.whl", hash = "sha256:37089ee95664b6550a7189d887dbf098e3eadab03537e411f52c63c121e3ba4b", size = 1451509, upload-time = "2025-02-15T04:03:01.515Z" }, +] + [[package]] name = "lingua-language-detector" version = "2.1.1" @@ -830,6 +1119,85 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198, upload-time = "2023-03-07T16:47:09.197Z" }, ] +[[package]] +name = "multidict" +version = "6.7.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1a/c2/c2d94cbe6ac1753f3fc980da97b3d930efe1da3af3c9f5125354436c073d/multidict-6.7.1.tar.gz", hash = "sha256:ec6652a1bee61c53a3e5776b6049172c53b6aaba34f18c9ad04f82712bac623d", size = 102010, upload-time = 
"2026-01-26T02:46:45.979Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8d/9c/f20e0e2cf80e4b2e4b1c365bf5fe104ee633c751a724246262db8f1a0b13/multidict-6.7.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a90f75c956e32891a4eda3639ce6dd86e87105271f43d43442a3aedf3cddf172", size = 76893, upload-time = "2026-01-26T02:43:52.754Z" }, + { url = "https://files.pythonhosted.org/packages/fe/cf/18ef143a81610136d3da8193da9d80bfe1cb548a1e2d1c775f26b23d024a/multidict-6.7.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3fccb473e87eaa1382689053e4a4618e7ba7b9b9b8d6adf2027ee474597128cd", size = 45456, upload-time = "2026-01-26T02:43:53.893Z" }, + { url = "https://files.pythonhosted.org/packages/a9/65/1caac9d4cd32e8433908683446eebc953e82d22b03d10d41a5f0fefe991b/multidict-6.7.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0fa96985700739c4c7853a43c0b3e169360d6855780021bfc6d0f1ce7c123e7", size = 43872, upload-time = "2026-01-26T02:43:55.041Z" }, + { url = "https://files.pythonhosted.org/packages/cf/3b/d6bd75dc4f3ff7c73766e04e705b00ed6dbbaccf670d9e05a12b006f5a21/multidict-6.7.1-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:cb2a55f408c3043e42b40cc8eecd575afa27b7e0b956dfb190de0f8499a57a53", size = 251018, upload-time = "2026-01-26T02:43:56.198Z" }, + { url = "https://files.pythonhosted.org/packages/fd/80/c959c5933adedb9ac15152e4067c702a808ea183a8b64cf8f31af8ad3155/multidict-6.7.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eb0ce7b2a32d09892b3dd6cc44877a0d02a33241fafca5f25c8b6b62374f8b75", size = 258883, upload-time = "2026-01-26T02:43:57.499Z" }, + { url = "https://files.pythonhosted.org/packages/86/85/7ed40adafea3d4f1c8b916e3b5cc3a8e07dfcdcb9cd72800f4ed3ca1b387/multidict-6.7.1-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:c3a32d23520ee37bf327d1e1a656fec76a2edd5c038bf43eddfa0572ec49c60b", size = 
242413, upload-time = "2026-01-26T02:43:58.755Z" }, + { url = "https://files.pythonhosted.org/packages/d2/57/b8565ff533e48595503c785f8361ff9a4fde4d67de25c207cd0ba3befd03/multidict-6.7.1-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:9c90fed18bffc0189ba814749fdcc102b536e83a9f738a9003e569acd540a733", size = 268404, upload-time = "2026-01-26T02:44:00.216Z" }, + { url = "https://files.pythonhosted.org/packages/e0/50/9810c5c29350f7258180dfdcb2e52783a0632862eb334c4896ac717cebcb/multidict-6.7.1-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:da62917e6076f512daccfbbde27f46fed1c98fee202f0559adec8ee0de67f71a", size = 269456, upload-time = "2026-01-26T02:44:02.202Z" }, + { url = "https://files.pythonhosted.org/packages/f3/8d/5e5be3ced1d12966fefb5c4ea3b2a5b480afcea36406559442c6e31d4a48/multidict-6.7.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bfde23ef6ed9db7eaee6c37dcec08524cb43903c60b285b172b6c094711b3961", size = 256322, upload-time = "2026-01-26T02:44:03.56Z" }, + { url = "https://files.pythonhosted.org/packages/31/6e/d8a26d81ac166a5592782d208dd90dfdc0a7a218adaa52b45a672b46c122/multidict-6.7.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3758692429e4e32f1ba0df23219cd0b4fc0a52f476726fff9337d1a57676a582", size = 253955, upload-time = "2026-01-26T02:44:04.845Z" }, + { url = "https://files.pythonhosted.org/packages/59/4c/7c672c8aad41534ba619bcd4ade7a0dc87ed6b8b5c06149b85d3dd03f0cd/multidict-6.7.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:398c1478926eca669f2fd6a5856b6de9c0acf23a2cb59a14c0ba5844fa38077e", size = 251254, upload-time = "2026-01-26T02:44:06.133Z" }, + { url = "https://files.pythonhosted.org/packages/7b/bd/84c24de512cbafbdbc39439f74e967f19570ce7924e3007174a29c348916/multidict-6.7.1-cp312-cp312-musllinux_1_2_i686.whl", hash = 
"sha256:c102791b1c4f3ab36ce4101154549105a53dc828f016356b3e3bcae2e3a039d3", size = 252059, upload-time = "2026-01-26T02:44:07.518Z" }, + { url = "https://files.pythonhosted.org/packages/fa/ba/f5449385510825b73d01c2d4087bf6d2fccc20a2d42ac34df93191d3dd03/multidict-6.7.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:a088b62bd733e2ad12c50dad01b7d0166c30287c166e137433d3b410add807a6", size = 263588, upload-time = "2026-01-26T02:44:09.382Z" }, + { url = "https://files.pythonhosted.org/packages/d7/11/afc7c677f68f75c84a69fe37184f0f82fce13ce4b92f49f3db280b7e92b3/multidict-6.7.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:3d51ff4785d58d3f6c91bdbffcb5e1f7ddfda557727043aa20d20ec4f65e324a", size = 259642, upload-time = "2026-01-26T02:44:10.73Z" }, + { url = "https://files.pythonhosted.org/packages/2b/17/ebb9644da78c4ab36403739e0e6e0e30ebb135b9caf3440825001a0bddcb/multidict-6.7.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fc5907494fccf3e7d3f94f95c91d6336b092b5fc83811720fae5e2765890dfba", size = 251377, upload-time = "2026-01-26T02:44:12.042Z" }, + { url = "https://files.pythonhosted.org/packages/ca/a4/840f5b97339e27846c46307f2530a2805d9d537d8b8bd416af031cad7fa0/multidict-6.7.1-cp312-cp312-win32.whl", hash = "sha256:28ca5ce2fd9716631133d0e9a9b9a745ad7f60bac2bccafb56aa380fc0b6c511", size = 41887, upload-time = "2026-01-26T02:44:14.245Z" }, + { url = "https://files.pythonhosted.org/packages/80/31/0b2517913687895f5904325c2069d6a3b78f66cc641a86a2baf75a05dcbb/multidict-6.7.1-cp312-cp312-win_amd64.whl", hash = "sha256:fcee94dfbd638784645b066074b338bc9cc155d4b4bffa4adce1615c5a426c19", size = 46053, upload-time = "2026-01-26T02:44:15.371Z" }, + { url = "https://files.pythonhosted.org/packages/0c/5b/aba28e4ee4006ae4c7df8d327d31025d760ffa992ea23812a601d226e682/multidict-6.7.1-cp312-cp312-win_arm64.whl", hash = "sha256:ba0a9fb644d0c1a2194cf7ffb043bd852cea63a57f66fbd33959f7dae18517bf", size = 43307, upload-time = "2026-01-26T02:44:16.852Z" }, + { url = 
"https://files.pythonhosted.org/packages/f2/22/929c141d6c0dba87d3e1d38fbdf1ba8baba86b7776469f2bc2d3227a1e67/multidict-6.7.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:2b41f5fed0ed563624f1c17630cb9941cf2309d4df00e494b551b5f3e3d67a23", size = 76174, upload-time = "2026-01-26T02:44:18.509Z" }, + { url = "https://files.pythonhosted.org/packages/c7/75/bc704ae15fee974f8fccd871305e254754167dce5f9e42d88a2def741a1d/multidict-6.7.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:84e61e3af5463c19b67ced91f6c634effb89ef8bfc5ca0267f954451ed4bb6a2", size = 45116, upload-time = "2026-01-26T02:44:19.745Z" }, + { url = "https://files.pythonhosted.org/packages/79/76/55cd7186f498ed080a18440c9013011eb548f77ae1b297206d030eb1180a/multidict-6.7.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:935434b9853c7c112eee7ac891bc4cb86455aa631269ae35442cb316790c1445", size = 43524, upload-time = "2026-01-26T02:44:21.571Z" }, + { url = "https://files.pythonhosted.org/packages/e9/3c/414842ef8d5a1628d68edee29ba0e5bcf235dbfb3ccd3ea303a7fe8c72ff/multidict-6.7.1-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:432feb25a1cb67fe82a9680b4d65fb542e4635cb3166cd9c01560651ad60f177", size = 249368, upload-time = "2026-01-26T02:44:22.803Z" }, + { url = "https://files.pythonhosted.org/packages/f6/32/befed7f74c458b4a525e60519fe8d87eef72bb1e99924fa2b0f9d97a221e/multidict-6.7.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e82d14e3c948952a1a85503817e038cba5905a3352de76b9a465075d072fba23", size = 256952, upload-time = "2026-01-26T02:44:24.306Z" }, + { url = "https://files.pythonhosted.org/packages/03/d6/c878a44ba877f366630c860fdf74bfb203c33778f12b6ac274936853c451/multidict-6.7.1-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:4cfb48c6ea66c83bcaaf7e4dfa7ec1b6bbcf751b7db85a328902796dfde4c060", size = 240317, upload-time = "2026-01-26T02:44:25.772Z" }, + { 
url = "https://files.pythonhosted.org/packages/68/49/57421b4d7ad2e9e60e25922b08ceb37e077b90444bde6ead629095327a6f/multidict-6.7.1-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:1d540e51b7e8e170174555edecddbd5538105443754539193e3e1061864d444d", size = 267132, upload-time = "2026-01-26T02:44:27.648Z" }, + { url = "https://files.pythonhosted.org/packages/b7/fe/ec0edd52ddbcea2a2e89e174f0206444a61440b40f39704e64dc807a70bd/multidict-6.7.1-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:273d23f4b40f3dce4d6c8a821c741a86dec62cded82e1175ba3d99be128147ed", size = 268140, upload-time = "2026-01-26T02:44:29.588Z" }, + { url = "https://files.pythonhosted.org/packages/b0/73/6e1b01cbeb458807aa0831742232dbdd1fa92bfa33f52a3f176b4ff3dc11/multidict-6.7.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9d624335fd4fa1c08a53f8b4be7676ebde19cd092b3895c421045ca87895b429", size = 254277, upload-time = "2026-01-26T02:44:30.902Z" }, + { url = "https://files.pythonhosted.org/packages/6a/b2/5fb8c124d7561a4974c342bc8c778b471ebbeb3cc17df696f034a7e9afe7/multidict-6.7.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:12fad252f8b267cc75b66e8fc51b3079604e8d43a75428ffe193cd9e2195dfd6", size = 252291, upload-time = "2026-01-26T02:44:32.31Z" }, + { url = "https://files.pythonhosted.org/packages/5a/96/51d4e4e06bcce92577fcd488e22600bd38e4fd59c20cb49434d054903bd2/multidict-6.7.1-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:03ede2a6ffbe8ef936b92cb4529f27f42be7f56afcdab5ab739cd5f27fb1cbf9", size = 250156, upload-time = "2026-01-26T02:44:33.734Z" }, + { url = "https://files.pythonhosted.org/packages/db/6b/420e173eec5fba721a50e2a9f89eda89d9c98fded1124f8d5c675f7a0c0f/multidict-6.7.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:90efbcf47dbe33dcf643a1e400d67d59abeac5db07dc3f27d6bdeae497a2198c", size = 249742, upload-time = 
"2026-01-26T02:44:35.222Z" }, + { url = "https://files.pythonhosted.org/packages/44/a3/ec5b5bd98f306bc2aa297b8c6f11a46714a56b1e6ef5ebda50a4f5d7c5fb/multidict-6.7.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:5c4b9bfc148f5a91be9244d6264c53035c8a0dcd2f51f1c3c6e30e30ebaa1c84", size = 262221, upload-time = "2026-01-26T02:44:36.604Z" }, + { url = "https://files.pythonhosted.org/packages/cd/f7/e8c0d0da0cd1e28d10e624604e1a36bcc3353aaebdfdc3a43c72bc683a12/multidict-6.7.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:401c5a650f3add2472d1d288c26deebc540f99e2fb83e9525007a74cd2116f1d", size = 258664, upload-time = "2026-01-26T02:44:38.008Z" }, + { url = "https://files.pythonhosted.org/packages/52/da/151a44e8016dd33feed44f730bd856a66257c1ee7aed4f44b649fb7edeb3/multidict-6.7.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:97891f3b1b3ffbded884e2916cacf3c6fc87b66bb0dde46f7357404750559f33", size = 249490, upload-time = "2026-01-26T02:44:39.386Z" }, + { url = "https://files.pythonhosted.org/packages/87/af/a3b86bf9630b732897f6fc3f4c4714b90aa4361983ccbdcd6c0339b21b0c/multidict-6.7.1-cp313-cp313-win32.whl", hash = "sha256:e1c5988359516095535c4301af38d8a8838534158f649c05dd1050222321bcb3", size = 41695, upload-time = "2026-01-26T02:44:41.318Z" }, + { url = "https://files.pythonhosted.org/packages/b2/35/e994121b0e90e46134673422dd564623f93304614f5d11886b1b3e06f503/multidict-6.7.1-cp313-cp313-win_amd64.whl", hash = "sha256:960c83bf01a95b12b08fd54324a4eb1d5b52c88932b5cba5d6e712bb3ed12eb5", size = 45884, upload-time = "2026-01-26T02:44:42.488Z" }, + { url = "https://files.pythonhosted.org/packages/ca/61/42d3e5dbf661242a69c97ea363f2d7b46c567da8eadef8890022be6e2ab0/multidict-6.7.1-cp313-cp313-win_arm64.whl", hash = "sha256:563fe25c678aaba333d5399408f5ec3c383ca5b663e7f774dd179a520b8144df", size = 43122, upload-time = "2026-01-26T02:44:43.664Z" }, + { url = 
"https://files.pythonhosted.org/packages/6d/b3/e6b21c6c4f314bb956016b0b3ef2162590a529b84cb831c257519e7fde44/multidict-6.7.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:c76c4bec1538375dad9d452d246ca5368ad6e1c9039dadcf007ae59c70619ea1", size = 83175, upload-time = "2026-01-26T02:44:44.894Z" }, + { url = "https://files.pythonhosted.org/packages/fb/76/23ecd2abfe0957b234f6c960f4ade497f55f2c16aeb684d4ecdbf1c95791/multidict-6.7.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:57b46b24b5d5ebcc978da4ec23a819a9402b4228b8a90d9c656422b4bdd8a963", size = 48460, upload-time = "2026-01-26T02:44:46.106Z" }, + { url = "https://files.pythonhosted.org/packages/c4/57/a0ed92b23f3a042c36bc4227b72b97eca803f5f1801c1ab77c8a212d455e/multidict-6.7.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e954b24433c768ce78ab7929e84ccf3422e46deb45a4dc9f93438f8217fa2d34", size = 46930, upload-time = "2026-01-26T02:44:47.278Z" }, + { url = "https://files.pythonhosted.org/packages/b5/66/02ec7ace29162e447f6382c495dc95826bf931d3818799bbef11e8f7df1a/multidict-6.7.1-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:3bd231490fa7217cc832528e1cd8752a96f0125ddd2b5749390f7c3ec8721b65", size = 242582, upload-time = "2026-01-26T02:44:48.604Z" }, + { url = "https://files.pythonhosted.org/packages/58/18/64f5a795e7677670e872673aca234162514696274597b3708b2c0d276cce/multidict-6.7.1-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:253282d70d67885a15c8a7716f3a73edf2d635793ceda8173b9ecc21f2fb8292", size = 250031, upload-time = "2026-01-26T02:44:50.544Z" }, + { url = "https://files.pythonhosted.org/packages/c8/ed/e192291dbbe51a8290c5686f482084d31bcd9d09af24f63358c3d42fd284/multidict-6.7.1-cp313-cp313t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:0b4c48648d7649c9335cf1927a8b87fa692de3dcb15faa676c6a6f1f1aabda43", size = 228596, upload-time = "2026-01-26T02:44:51.951Z" 
}, + { url = "https://files.pythonhosted.org/packages/1e/7e/3562a15a60cf747397e7f2180b0a11dc0c38d9175a650e75fa1b4d325e15/multidict-6.7.1-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:98bc624954ec4d2c7cb074b8eefc2b5d0ce7d482e410df446414355d158fe4ca", size = 257492, upload-time = "2026-01-26T02:44:53.902Z" }, + { url = "https://files.pythonhosted.org/packages/24/02/7d0f9eae92b5249bb50ac1595b295f10e263dd0078ebb55115c31e0eaccd/multidict-6.7.1-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:1b99af4d9eec0b49927b4402bcbb58dea89d3e0db8806a4086117019939ad3dd", size = 255899, upload-time = "2026-01-26T02:44:55.316Z" }, + { url = "https://files.pythonhosted.org/packages/00/e3/9b60ed9e23e64c73a5cde95269ef1330678e9c6e34dd4eb6b431b85b5a10/multidict-6.7.1-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6aac4f16b472d5b7dc6f66a0d49dd57b0e0902090be16594dc9ebfd3d17c47e7", size = 247970, upload-time = "2026-01-26T02:44:56.783Z" }, + { url = "https://files.pythonhosted.org/packages/3e/06/538e58a63ed5cfb0bd4517e346b91da32fde409d839720f664e9a4ae4f9d/multidict-6.7.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:21f830fe223215dffd51f538e78c172ed7c7f60c9b96a2bf05c4848ad49921c3", size = 245060, upload-time = "2026-01-26T02:44:58.195Z" }, + { url = "https://files.pythonhosted.org/packages/b2/2f/d743a3045a97c895d401e9bd29aaa09b94f5cbdf1bd561609e5a6c431c70/multidict-6.7.1-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:f5dd81c45b05518b9aa4da4aa74e1c93d715efa234fd3e8a179df611cc85e5f4", size = 235888, upload-time = "2026-01-26T02:44:59.57Z" }, + { url = "https://files.pythonhosted.org/packages/38/83/5a325cac191ab28b63c52f14f1131f3b0a55ba3b9aa65a6d0bf2a9b921a0/multidict-6.7.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:eb304767bca2bb92fb9c5bd33cedc95baee5bb5f6c88e63706533a1c06ad08c8", size = 243554, upload-time = 
"2026-01-26T02:45:01.054Z" }, + { url = "https://files.pythonhosted.org/packages/20/1f/9d2327086bd15da2725ef6aae624208e2ef828ed99892b17f60c344e57ed/multidict-6.7.1-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:c9035dde0f916702850ef66460bc4239d89d08df4d02023a5926e7446724212c", size = 252341, upload-time = "2026-01-26T02:45:02.484Z" }, + { url = "https://files.pythonhosted.org/packages/e8/2c/2a1aa0280cf579d0f6eed8ee5211c4f1730bd7e06c636ba2ee6aafda302e/multidict-6.7.1-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:af959b9beeb66c822380f222f0e0a1889331597e81f1ded7f374f3ecb0fd6c52", size = 246391, upload-time = "2026-01-26T02:45:03.862Z" }, + { url = "https://files.pythonhosted.org/packages/e5/03/7ca022ffc36c5a3f6e03b179a5ceb829be9da5783e6fe395f347c0794680/multidict-6.7.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:41f2952231456154ee479651491e94118229844dd7226541788be783be2b5108", size = 243422, upload-time = "2026-01-26T02:45:05.296Z" }, + { url = "https://files.pythonhosted.org/packages/dc/1d/b31650eab6c5778aceed46ba735bd97f7c7d2f54b319fa916c0f96e7805b/multidict-6.7.1-cp313-cp313t-win32.whl", hash = "sha256:df9f19c28adcb40b6aae30bbaa1478c389efd50c28d541d76760199fc1037c32", size = 47770, upload-time = "2026-01-26T02:45:06.754Z" }, + { url = "https://files.pythonhosted.org/packages/ac/5b/2d2d1d522e51285bd61b1e20df8f47ae1a9d80839db0b24ea783b3832832/multidict-6.7.1-cp313-cp313t-win_amd64.whl", hash = "sha256:d54ecf9f301853f2c5e802da559604b3e95bb7a3b01a9c295c6ee591b9882de8", size = 53109, upload-time = "2026-01-26T02:45:08.044Z" }, + { url = "https://files.pythonhosted.org/packages/3d/a3/cc409ba012c83ca024a308516703cf339bdc4b696195644a7215a5164a24/multidict-6.7.1-cp313-cp313t-win_arm64.whl", hash = "sha256:5a37ca18e360377cfda1d62f5f382ff41f2b8c4ccb329ed974cc2e1643440118", size = 45573, upload-time = "2026-01-26T02:45:09.349Z" }, + { url = 
"https://files.pythonhosted.org/packages/81/08/7036c080d7117f28a4af526d794aab6a84463126db031b007717c1a6676e/multidict-6.7.1-py3-none-any.whl", hash = "sha256:55d97cc6dae627efa6a6e548885712d4864b81110ac76fa4e534c03819fa4a56", size = 12319, upload-time = "2026-01-26T02:46:44.004Z" }, +] + +[[package]] +name = "multiprocess" +version = "0.70.19" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "dill" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a2/f2/e783ac7f2aeeed14e9e12801f22529cc7e6b7ab80928d6dcce4e9f00922d/multiprocess-0.70.19.tar.gz", hash = "sha256:952021e0e6c55a4a9fe4cd787895b86e239a40e76802a789d6305398d3975897", size = 2079989, upload-time = "2026-01-19T06:47:39.744Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e3/45/8004d1e6b9185c1a444d6b55ac5682acf9d98035e54386d967366035a03a/multiprocess-0.70.19-py310-none-any.whl", hash = "sha256:97404393419dcb2a8385910864eedf47a3cadf82c66345b44f036420eb0b5d87", size = 134948, upload-time = "2026-01-19T06:47:32.325Z" }, + { url = "https://files.pythonhosted.org/packages/86/c2/dec9722dc3474c164a0b6bcd9a7ed7da542c98af8cabce05374abab35edd/multiprocess-0.70.19-py311-none-any.whl", hash = "sha256:928851ae7973aea4ce0eaf330bbdafb2e01398a91518d5c8818802845564f45c", size = 144457, upload-time = "2026-01-19T06:47:33.711Z" }, + { url = "https://files.pythonhosted.org/packages/71/70/38998b950a97ea279e6bd657575d22d1a2047256caf707d9a10fbce4f065/multiprocess-0.70.19-py312-none-any.whl", hash = "sha256:3a56c0e85dd5025161bac5ce138dcac1e49174c7d8e74596537e729fd5c53c28", size = 150281, upload-time = "2026-01-19T06:47:35.037Z" }, + { url = "https://files.pythonhosted.org/packages/7f/74/d2c27e03cb84251dfe7249b8e82923643c6d48fa4883b9476b025e7dc7eb/multiprocess-0.70.19-py313-none-any.whl", hash = "sha256:8d5eb4ec5017ba2fab4e34a747c6d2c2b6fecfe9e7236e77988db91580ada952", size = 156414, upload-time = "2026-01-19T06:47:35.915Z" }, + { url = 
"https://files.pythonhosted.org/packages/7e/82/69e539c4c2027f1e1697e09aaa2449243085a0edf81ae2c6341e84d769b6/multiprocess-0.70.19-py39-none-any.whl", hash = "sha256:0d4b4397ed669d371c81dcd1ef33fd384a44d6c3de1bd0ca7ac06d837720d3c5", size = 133477, upload-time = "2026-01-19T06:47:38.619Z" }, +] + [[package]] name = "networkx" version = "3.6.1" @@ -1086,6 +1454,81 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/55/80/178af0594890dee17e239fca96d3d8670ba0f5ff59b7d0439850924a9c09/pandas-3.0.1-cp313-cp313t-win_amd64.whl", hash = "sha256:b03f91ae8c10a85c1613102c7bef5229b5379f343030a3ccefeca8a33414cf35", size = 10485047, upload-time = "2026-02-17T22:19:34.605Z" }, ] +[[package]] +name = "peft" +version = "0.19.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "accelerate" }, + { name = "huggingface-hub" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "psutil" }, + { name = "pyyaml" }, + { name = "safetensors" }, + { name = "torch" }, + { name = "tqdm" }, + { name = "transformers" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/86/cf/037f1e3d5186496c05513a6754639e2dab3038a05f384284d49a9bd06a2d/peft-0.19.1.tar.gz", hash = "sha256:0d97542fe96dcdaa20d3b81c06f26f988618f416a73544ab23c3618ccb674a40", size = 763738, upload-time = "2026-04-16T15:46:45.105Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e8/b6/f54d676ed93cc2dd2234c3b172ea9c8c3d7d29361e66b1b23dec57a67465/peft-0.19.1-py3-none-any.whl", hash = "sha256:2113f72a81621b5913ef28f9022204c742df111890c5f49d812716a4a301e356", size = 680692, upload-time = "2026-04-16T15:46:42.886Z" }, +] + +[[package]] +name = "propcache" +version = "0.4.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/9e/da/e9fc233cf63743258bff22b3dfa7ea5baef7b5bc324af47a0ad89b8ffc6f/propcache-0.4.1.tar.gz", hash = "sha256:f48107a8c637e80362555f37ecf49abe20370e557cc4ab374f04ec4423c97c3d", size = 46442, 
upload-time = "2025-10-08T19:49:02.291Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a2/0f/f17b1b2b221d5ca28b4b876e8bb046ac40466513960646bda8e1853cdfa2/propcache-0.4.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:e153e9cd40cc8945138822807139367f256f89c6810c2634a4f6902b52d3b4e2", size = 80061, upload-time = "2025-10-08T19:46:46.075Z" }, + { url = "https://files.pythonhosted.org/packages/76/47/8ccf75935f51448ba9a16a71b783eb7ef6b9ee60f5d14c7f8a8a79fbeed7/propcache-0.4.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:cd547953428f7abb73c5ad82cbb32109566204260d98e41e5dfdc682eb7f8403", size = 46037, upload-time = "2025-10-08T19:46:47.23Z" }, + { url = "https://files.pythonhosted.org/packages/0a/b6/5c9a0e42df4d00bfb4a3cbbe5cf9f54260300c88a0e9af1f47ca5ce17ac0/propcache-0.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f048da1b4f243fc44f205dfd320933a951b8d89e0afd4c7cacc762a8b9165207", size = 47324, upload-time = "2025-10-08T19:46:48.384Z" }, + { url = "https://files.pythonhosted.org/packages/9e/d3/6c7ee328b39a81ee877c962469f1e795f9db87f925251efeb0545e0020d0/propcache-0.4.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ec17c65562a827bba85e3872ead335f95405ea1674860d96483a02f5c698fa72", size = 225505, upload-time = "2025-10-08T19:46:50.055Z" }, + { url = "https://files.pythonhosted.org/packages/01/5d/1c53f4563490b1d06a684742cc6076ef944bc6457df6051b7d1a877c057b/propcache-0.4.1-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:405aac25c6394ef275dee4c709be43745d36674b223ba4eb7144bf4d691b7367", size = 230242, upload-time = "2025-10-08T19:46:51.815Z" }, + { url = "https://files.pythonhosted.org/packages/20/e1/ce4620633b0e2422207c3cb774a0ee61cac13abc6217763a7b9e2e3f4a12/propcache-0.4.1-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = 
"sha256:0013cb6f8dde4b2a2f66903b8ba740bdfe378c943c4377a200551ceb27f379e4", size = 238474, upload-time = "2025-10-08T19:46:53.208Z" }, + { url = "https://files.pythonhosted.org/packages/46/4b/3aae6835b8e5f44ea6a68348ad90f78134047b503765087be2f9912140ea/propcache-0.4.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:15932ab57837c3368b024473a525e25d316d8353016e7cc0e5ba9eb343fbb1cf", size = 221575, upload-time = "2025-10-08T19:46:54.511Z" }, + { url = "https://files.pythonhosted.org/packages/6e/a5/8a5e8678bcc9d3a1a15b9a29165640d64762d424a16af543f00629c87338/propcache-0.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:031dce78b9dc099f4c29785d9cf5577a3faf9ebf74ecbd3c856a7b92768c3df3", size = 216736, upload-time = "2025-10-08T19:46:56.212Z" }, + { url = "https://files.pythonhosted.org/packages/f1/63/b7b215eddeac83ca1c6b934f89d09a625aa9ee4ba158338854c87210cc36/propcache-0.4.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:ab08df6c9a035bee56e31af99be621526bd237bea9f32def431c656b29e41778", size = 213019, upload-time = "2025-10-08T19:46:57.595Z" }, + { url = "https://files.pythonhosted.org/packages/57/74/f580099a58c8af587cac7ba19ee7cb418506342fbbe2d4a4401661cca886/propcache-0.4.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:4d7af63f9f93fe593afbf104c21b3b15868efb2c21d07d8732c0c4287e66b6a6", size = 220376, upload-time = "2025-10-08T19:46:59.067Z" }, + { url = "https://files.pythonhosted.org/packages/c4/ee/542f1313aff7eaf19c2bb758c5d0560d2683dac001a1c96d0774af799843/propcache-0.4.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:cfc27c945f422e8b5071b6e93169679e4eb5bf73bbcbf1ba3ae3a83d2f78ebd9", size = 226988, upload-time = "2025-10-08T19:47:00.544Z" }, + { url = "https://files.pythonhosted.org/packages/8f/18/9c6b015dd9c6930f6ce2229e1f02fb35298b847f2087ea2b436a5bfa7287/propcache-0.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = 
"sha256:35c3277624a080cc6ec6f847cbbbb5b49affa3598c4535a0a4682a697aaa5c75", size = 215615, upload-time = "2025-10-08T19:47:01.968Z" }, + { url = "https://files.pythonhosted.org/packages/80/9e/e7b85720b98c45a45e1fca6a177024934dc9bc5f4d5dd04207f216fc33ed/propcache-0.4.1-cp312-cp312-win32.whl", hash = "sha256:671538c2262dadb5ba6395e26c1731e1d52534bfe9ae56d0b5573ce539266aa8", size = 38066, upload-time = "2025-10-08T19:47:03.503Z" }, + { url = "https://files.pythonhosted.org/packages/54/09/d19cff2a5aaac632ec8fc03737b223597b1e347416934c1b3a7df079784c/propcache-0.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:cb2d222e72399fcf5890d1d5cc1060857b9b236adff2792ff48ca2dfd46c81db", size = 41655, upload-time = "2025-10-08T19:47:04.973Z" }, + { url = "https://files.pythonhosted.org/packages/68/ab/6b5c191bb5de08036a8c697b265d4ca76148efb10fa162f14af14fb5f076/propcache-0.4.1-cp312-cp312-win_arm64.whl", hash = "sha256:204483131fb222bdaaeeea9f9e6c6ed0cac32731f75dfc1d4a567fc1926477c1", size = 37789, upload-time = "2025-10-08T19:47:06.077Z" }, + { url = "https://files.pythonhosted.org/packages/bf/df/6d9c1b6ac12b003837dde8a10231a7344512186e87b36e855bef32241942/propcache-0.4.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:43eedf29202c08550aac1d14e0ee619b0430aaef78f85864c1a892294fbc28cf", size = 77750, upload-time = "2025-10-08T19:47:07.648Z" }, + { url = "https://files.pythonhosted.org/packages/8b/e8/677a0025e8a2acf07d3418a2e7ba529c9c33caf09d3c1f25513023c1db56/propcache-0.4.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:d62cdfcfd89ccb8de04e0eda998535c406bf5e060ffd56be6c586cbcc05b3311", size = 44780, upload-time = "2025-10-08T19:47:08.851Z" }, + { url = "https://files.pythonhosted.org/packages/89/a4/92380f7ca60f99ebae761936bc48a72a639e8a47b29050615eef757cb2a7/propcache-0.4.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:cae65ad55793da34db5f54e4029b89d3b9b9490d8abe1b4c7ab5d4b8ec7ebf74", size = 46308, upload-time = "2025-10-08T19:47:09.982Z" }, + { url = 
"https://files.pythonhosted.org/packages/2d/48/c5ac64dee5262044348d1d78a5f85dd1a57464a60d30daee946699963eb3/propcache-0.4.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:333ddb9031d2704a301ee3e506dc46b1fe5f294ec198ed6435ad5b6a085facfe", size = 208182, upload-time = "2025-10-08T19:47:11.319Z" }, + { url = "https://files.pythonhosted.org/packages/c6/0c/cd762dd011a9287389a6a3eb43aa30207bde253610cca06824aeabfe9653/propcache-0.4.1-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:fd0858c20f078a32cf55f7e81473d96dcf3b93fd2ccdb3d40fdf54b8573df3af", size = 211215, upload-time = "2025-10-08T19:47:13.146Z" }, + { url = "https://files.pythonhosted.org/packages/30/3e/49861e90233ba36890ae0ca4c660e95df565b2cd15d4a68556ab5865974e/propcache-0.4.1-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:678ae89ebc632c5c204c794f8dab2837c5f159aeb59e6ed0539500400577298c", size = 218112, upload-time = "2025-10-08T19:47:14.913Z" }, + { url = "https://files.pythonhosted.org/packages/f1/8b/544bc867e24e1bd48f3118cecd3b05c694e160a168478fa28770f22fd094/propcache-0.4.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d472aeb4fbf9865e0c6d622d7f4d54a4e101a89715d8904282bb5f9a2f476c3f", size = 204442, upload-time = "2025-10-08T19:47:16.277Z" }, + { url = "https://files.pythonhosted.org/packages/50/a6/4282772fd016a76d3e5c0df58380a5ea64900afd836cec2c2f662d1b9bb3/propcache-0.4.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4d3df5fa7e36b3225954fba85589da77a0fe6a53e3976de39caf04a0db4c36f1", size = 199398, upload-time = "2025-10-08T19:47:17.962Z" }, + { url = "https://files.pythonhosted.org/packages/3e/ec/d8a7cd406ee1ddb705db2139f8a10a8a427100347bd698e7014351c7af09/propcache-0.4.1-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:ee17f18d2498f2673e432faaa71698032b0127ebf23ae5974eeaf806c279df24", size 
= 196920, upload-time = "2025-10-08T19:47:19.355Z" }, + { url = "https://files.pythonhosted.org/packages/f6/6c/f38ab64af3764f431e359f8baf9e0a21013e24329e8b85d2da32e8ed07ca/propcache-0.4.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:580e97762b950f993ae618e167e7be9256b8353c2dcd8b99ec100eb50f5286aa", size = 203748, upload-time = "2025-10-08T19:47:21.338Z" }, + { url = "https://files.pythonhosted.org/packages/d6/e3/fa846bd70f6534d647886621388f0a265254d30e3ce47e5c8e6e27dbf153/propcache-0.4.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:501d20b891688eb8e7aa903021f0b72d5a55db40ffaab27edefd1027caaafa61", size = 205877, upload-time = "2025-10-08T19:47:23.059Z" }, + { url = "https://files.pythonhosted.org/packages/e2/39/8163fc6f3133fea7b5f2827e8eba2029a0277ab2c5beee6c1db7b10fc23d/propcache-0.4.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9a0bd56e5b100aef69bd8562b74b46254e7c8812918d3baa700c8a8009b0af66", size = 199437, upload-time = "2025-10-08T19:47:24.445Z" }, + { url = "https://files.pythonhosted.org/packages/93/89/caa9089970ca49c7c01662bd0eeedfe85494e863e8043565aeb6472ce8fe/propcache-0.4.1-cp313-cp313-win32.whl", hash = "sha256:bcc9aaa5d80322bc2fb24bb7accb4a30f81e90ab8d6ba187aec0744bc302ad81", size = 37586, upload-time = "2025-10-08T19:47:25.736Z" }, + { url = "https://files.pythonhosted.org/packages/f5/ab/f76ec3c3627c883215b5c8080debb4394ef5a7a29be811f786415fc1e6fd/propcache-0.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:381914df18634f5494334d201e98245c0596067504b9372d8cf93f4bb23e025e", size = 40790, upload-time = "2025-10-08T19:47:26.847Z" }, + { url = "https://files.pythonhosted.org/packages/59/1b/e71ae98235f8e2ba5004d8cb19765a74877abf189bc53fc0c80d799e56c3/propcache-0.4.1-cp313-cp313-win_arm64.whl", hash = "sha256:8873eb4460fd55333ea49b7d189749ecf6e55bf85080f11b1c4530ed3034cba1", size = 37158, upload-time = "2025-10-08T19:47:27.961Z" }, + { url = 
"https://files.pythonhosted.org/packages/83/ce/a31bbdfc24ee0dcbba458c8175ed26089cf109a55bbe7b7640ed2470cfe9/propcache-0.4.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:92d1935ee1f8d7442da9c0c4fa7ac20d07e94064184811b685f5c4fada64553b", size = 81451, upload-time = "2025-10-08T19:47:29.445Z" }, + { url = "https://files.pythonhosted.org/packages/25/9c/442a45a470a68456e710d96cacd3573ef26a1d0a60067e6a7d5e655621ed/propcache-0.4.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:473c61b39e1460d386479b9b2f337da492042447c9b685f28be4f74d3529e566", size = 46374, upload-time = "2025-10-08T19:47:30.579Z" }, + { url = "https://files.pythonhosted.org/packages/f4/bf/b1d5e21dbc3b2e889ea4327044fb16312a736d97640fb8b6aa3f9c7b3b65/propcache-0.4.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:c0ef0aaafc66fbd87842a3fe3902fd889825646bc21149eafe47be6072725835", size = 48396, upload-time = "2025-10-08T19:47:31.79Z" }, + { url = "https://files.pythonhosted.org/packages/f4/04/5b4c54a103d480e978d3c8a76073502b18db0c4bc17ab91b3cb5092ad949/propcache-0.4.1-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f95393b4d66bfae908c3ca8d169d5f79cd65636ae15b5e7a4f6e67af675adb0e", size = 275950, upload-time = "2025-10-08T19:47:33.481Z" }, + { url = "https://files.pythonhosted.org/packages/b4/c1/86f846827fb969c4b78b0af79bba1d1ea2156492e1b83dea8b8a6ae27395/propcache-0.4.1-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c07fda85708bc48578467e85099645167a955ba093be0a2dcba962195676e859", size = 273856, upload-time = "2025-10-08T19:47:34.906Z" }, + { url = "https://files.pythonhosted.org/packages/36/1d/fc272a63c8d3bbad6878c336c7a7dea15e8f2d23a544bda43205dfa83ada/propcache-0.4.1-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:af223b406d6d000830c6f65f1e6431783fc3f713ba3e6cc8c024d5ee96170a4b", size = 280420, upload-time = 
"2025-10-08T19:47:36.338Z" }, + { url = "https://files.pythonhosted.org/packages/07/0c/01f2219d39f7e53d52e5173bcb09c976609ba30209912a0680adfb8c593a/propcache-0.4.1-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a78372c932c90ee474559c5ddfffd718238e8673c340dc21fe45c5b8b54559a0", size = 263254, upload-time = "2025-10-08T19:47:37.692Z" }, + { url = "https://files.pythonhosted.org/packages/2d/18/cd28081658ce597898f0c4d174d4d0f3c5b6d4dc27ffafeef835c95eb359/propcache-0.4.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:564d9f0d4d9509e1a870c920a89b2fec951b44bf5ba7d537a9e7c1ccec2c18af", size = 261205, upload-time = "2025-10-08T19:47:39.659Z" }, + { url = "https://files.pythonhosted.org/packages/7a/71/1f9e22eb8b8316701c2a19fa1f388c8a3185082607da8e406a803c9b954e/propcache-0.4.1-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:17612831fda0138059cc5546f4d12a2aacfb9e47068c06af35c400ba58ba7393", size = 247873, upload-time = "2025-10-08T19:47:41.084Z" }, + { url = "https://files.pythonhosted.org/packages/4a/65/3d4b61f36af2b4eddba9def857959f1016a51066b4f1ce348e0cf7881f58/propcache-0.4.1-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:41a89040cb10bd345b3c1a873b2bf36413d48da1def52f268a055f7398514874", size = 262739, upload-time = "2025-10-08T19:47:42.51Z" }, + { url = "https://files.pythonhosted.org/packages/2a/42/26746ab087faa77c1c68079b228810436ccd9a5ce9ac85e2b7307195fd06/propcache-0.4.1-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:e35b88984e7fa64aacecea39236cee32dd9bd8c55f57ba8a75cf2399553f9bd7", size = 263514, upload-time = "2025-10-08T19:47:43.927Z" }, + { url = "https://files.pythonhosted.org/packages/94/13/630690fe201f5502d2403dd3cfd451ed8858fe3c738ee88d095ad2ff407b/propcache-0.4.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6f8b465489f927b0df505cbe26ffbeed4d6d8a2bbc61ce90eb074ff129ef0ab1", size = 257781, upload-time = "2025-10-08T19:47:45.448Z" }, + { url = 
"https://files.pythonhosted.org/packages/92/f7/1d4ec5841505f423469efbfc381d64b7b467438cd5a4bbcbb063f3b73d27/propcache-0.4.1-cp313-cp313t-win32.whl", hash = "sha256:2ad890caa1d928c7c2965b48f3a3815c853180831d0e5503d35cf00c472f4717", size = 41396, upload-time = "2025-10-08T19:47:47.202Z" }, + { url = "https://files.pythonhosted.org/packages/48/f0/615c30622316496d2cbbc29f5985f7777d3ada70f23370608c1d3e081c1f/propcache-0.4.1-cp313-cp313t-win_amd64.whl", hash = "sha256:f7ee0e597f495cf415bcbd3da3caa3bd7e816b74d0d52b8145954c5e6fd3ff37", size = 44897, upload-time = "2025-10-08T19:47:48.336Z" }, + { url = "https://files.pythonhosted.org/packages/fd/ca/6002e46eccbe0e33dcd4069ef32f7f1c9e243736e07adca37ae8c4830ec3/propcache-0.4.1-cp313-cp313t-win_arm64.whl", hash = "sha256:929d7cbe1f01bb7baffb33dc14eb5691c95831450a26354cd210a8155170c93a", size = 39789, upload-time = "2025-10-08T19:47:49.876Z" }, + { url = "https://files.pythonhosted.org/packages/5b/5a/bc7b4a4ef808fa59a816c17b20c4bef6884daebbdf627ff2a161da67da19/propcache-0.4.1-py3-none-any.whl", hash = "sha256:af2a6052aeb6cf17d3e46ee169099044fd8224cbaf75c76a2ef596e8163e2237", size = 13305, upload-time = "2025-10-08T19:49:00.792Z" }, +] + [[package]] name = "psutil" version = "7.2.2" @@ -1108,6 +1551,35 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8c/c7/7bb2e321574b10df20cbde462a94e2b71d05f9bbda251ef27d104668306a/psutil-7.2.2-cp37-abi3-win_arm64.whl", hash = "sha256:8c233660f575a5a89e6d4cb65d9f938126312bca76d8fe087b947b3a1aaac9ee", size = 134617, upload-time = "2026-01-28T18:15:36.514Z" }, ] +[[package]] +name = "pyarrow" +version = "24.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/91/13/13e1069b351bdc3881266e11147ffccf687505dbb0ea74036237f5d454a5/pyarrow-24.0.0.tar.gz", hash = "sha256:85fe721a14dd823aca09127acbb06c3ca723efbd436c004f16bca601b04dcc83", size = 1180261, upload-time = "2026-04-21T10:51:25.837Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/b4/a9/9686d9f07837f91f775e8932659192e02c74f9d8920524b480b85212cc68/pyarrow-24.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:6233c9ed9ab9d1db47de57d9753256d9dcffbf42db341576099f0fd9f6bf4810", size = 34981559, upload-time = "2026-04-21T10:47:22.17Z" }, + { url = "https://files.pythonhosted.org/packages/80/b6/0ddf0e9b6ead3474ab087ae598c76b031fc45532bf6a63f3a553440fb258/pyarrow-24.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:f7616236ec1bc2b15bfdec22a71ab38851c86f8f05ff64f379e1278cf20c634a", size = 36663654, upload-time = "2026-04-21T10:47:28.315Z" }, + { url = "https://files.pythonhosted.org/packages/7c/3b/926382efe8ce27ba729071d3566ade6dfb86bdf112f366000196b2f5780a/pyarrow-24.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:1617043b99bd33e5318ae18eb2919af09c71322ef1ca46566cdafc6e6712fb66", size = 45679394, upload-time = "2026-04-21T10:47:34.821Z" }, + { url = "https://files.pythonhosted.org/packages/b3/7a/829f7d9dfd37c207206081d6dad474d81dde29952401f07f2ba507814818/pyarrow-24.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:6165461f55ef6314f026de6638d661188e3455d3ec49834556a0ebbdbace18bb", size = 48863122, upload-time = "2026-04-21T10:47:42.056Z" }, + { url = "https://files.pythonhosted.org/packages/5f/e8/f88ce625fe8babaae64e8db2d417c7653adb3019b08aae85c5ed787dc816/pyarrow-24.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3b13dedfe76a0ad2d1d859b0811b53827a4e9d93a0bcb05cf59333ab4980cc7e", size = 49376032, upload-time = "2026-04-21T10:47:48.967Z" }, + { url = "https://files.pythonhosted.org/packages/36/7a/82c363caa145fff88fb475da50d3bf52bb024f61917be5424c3392eaf878/pyarrow-24.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:25ea65d868eb04015cd18e6df2fbe98f07e5bda2abefabcb88fce39a947716f6", size = 51929490, upload-time = "2026-04-21T10:47:55.981Z" }, + { url = 
"https://files.pythonhosted.org/packages/66/1c/e3e72c8014ad2743ca64a701652c733cc5cbcee15c0463a32a8c55518d9e/pyarrow-24.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:295f0a7f2e242dabd513737cf076007dc5b2d59237e3eca37b05c0c6446f3826", size = 27355660, upload-time = "2026-04-21T10:48:01.718Z" }, + { url = "https://files.pythonhosted.org/packages/6f/d3/a1abf004482026ddc17f4503db227787fa3cfe41ec5091ff20e4fea55e57/pyarrow-24.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:02b001b3ed4723caa44f6cd1af2d5c86aa2cf9971dacc2ffa55b21237713dfba", size = 34976759, upload-time = "2026-04-21T10:48:07.258Z" }, + { url = "https://files.pythonhosted.org/packages/4f/4a/34f0a36d28a2dd32225301b79daad44e243dc1a2bb77d43b60749be255c4/pyarrow-24.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:04920d6a71aabd08a0417709efce97d45ea8e6fb733d9ca9ecffb13c67839f68", size = 36658471, upload-time = "2026-04-21T10:48:13.347Z" }, + { url = "https://files.pythonhosted.org/packages/1f/78/543b94712ae8bb1a6023bcc1acf1a740fbff8286747c289cd9468fced2a5/pyarrow-24.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:a964266397740257f16f7bb2e4f08a0c81454004beab8ff59dd531b73610e9f2", size = 45675981, upload-time = "2026-04-21T10:48:20.201Z" }, + { url = "https://files.pythonhosted.org/packages/84/9f/8fb7c222b100d314137fa40ec050de56cd8c6d957d1cfff685ce72f15b17/pyarrow-24.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:6f066b179d68c413374294bc1735f68475457c933258df594443bb9d88ddc2a0", size = 48859172, upload-time = "2026-04-21T10:48:27.541Z" }, + { url = "https://files.pythonhosted.org/packages/a7/d3/1ea72538e6c8b3b475ed78d1049a2c518e655761ea50fe1171fc855fcab7/pyarrow-24.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1183baeb14c5f587b1ec52831e665718ce632caab84b7cd6b85fd44f96114495", size = 49385733, upload-time = "2026-04-21T10:48:34.7Z" }, + { url = 
"https://files.pythonhosted.org/packages/c3/be/c3d8b06a1ba35f2260f8e1f771abbee7d5e345c0937aab90675706b1690a/pyarrow-24.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:806f24b4085453c197a5078218d1ee08783ebbba271badd153d1ae22a3ee804f", size = 51934335, upload-time = "2026-04-21T10:48:42.099Z" }, + { url = "https://files.pythonhosted.org/packages/9c/62/89e07a1e7329d2cde3e3c6994ba0839a24977a2beda8be6005ea3d860b99/pyarrow-24.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:e4505fc6583f7b05ab854934896bcac8253b04ac1171a77dfb73efef92076d91", size = 27271748, upload-time = "2026-04-21T10:49:42.532Z" }, + { url = "https://files.pythonhosted.org/packages/17/1a/cff3a59f80b5b1658549d46611b67163f65e0664431c076ad728bf9d5af4/pyarrow-24.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:1a4e45017efbf115032e4475ee876d525e0e36c742214fbe405332480ecd6275", size = 35238554, upload-time = "2026-04-21T10:48:48.526Z" }, + { url = "https://files.pythonhosted.org/packages/a8/99/cce0f42a327bfef2c420fb6078a3eb834826e5d6697bf3009fe11d2ad051/pyarrow-24.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:7986f1fa71cee060ad00758bcc79d3a93bab8559bf978fab9e53472a2e25a17b", size = 36782301, upload-time = "2026-04-21T10:48:55.181Z" }, + { url = "https://files.pythonhosted.org/packages/2a/66/8e560d5ff6793ca29aca213c53eec0dd482dd46cb93b2819e5aab52e4252/pyarrow-24.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:d3e0b61e8efb24ed38898e5cdc5fffa9124be480008d401a1f8071500494ae42", size = 45721929, upload-time = "2026-04-21T10:49:03.676Z" }, + { url = "https://files.pythonhosted.org/packages/27/0c/a26e25505d030716e078d9f16eb74973cbf0b33b672884e9f9da1c83b871/pyarrow-24.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:55a3bc1e3df3b5567b7d27ef551b2283f0c68a5e86f1cd56abc569da4f31335b", size = 48825365, upload-time = "2026-04-21T10:49:11.714Z" }, + { url = 
"https://files.pythonhosted.org/packages/5f/eb/771f9ecb0c65e73fe9dccdd1717901b9594f08c4515d000c7c62df573811/pyarrow-24.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:641f795b361874ac9da5294f8f443dfdbee355cf2bd9e3b8d97aaac2306b9b37", size = 49451819, upload-time = "2026-04-21T10:49:21.474Z" }, + { url = "https://files.pythonhosted.org/packages/48/da/61ae89a88732f5a785646f3ec6125dbb640fa98a540eb2b9889caa561403/pyarrow-24.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8adc8e6ce5fccf5dc707046ae4914fd537def529709cc0d285d37a7f9cd442ca", size = 51909252, upload-time = "2026-04-21T10:49:31.164Z" }, + { url = "https://files.pythonhosted.org/packages/cb/1a/8dd5cafab7b66573fa91c03d06d213356ad4edd71813aa75e08ce2b3a844/pyarrow-24.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:9b18371ad2f44044b81a8d23bc2d8a9b6a6226dca775e8e16cfee640473d6c5d", size = 27388127, upload-time = "2026-04-21T10:49:37.334Z" }, +] + [[package]] name = "pycparser" version = "3.0" @@ -1514,6 +1986,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/70/b0/eb757336e5a76dfa7911f63252e3b7d1de00935d7705cf772db5b45ec238/scipy-1.17.1-cp313-cp313t-win_arm64.whl", hash = "sha256:a720477885a9d2411f94a93d16f9d89bad0f28ca23c3f8daa521e2dcc3f44d49", size = 24856543, upload-time = "2026-02-23T00:20:45.313Z" }, ] +[[package]] +name = "sentence-transformers" +version = "5.4.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, + { name = "numpy" }, + { name = "scikit-learn" }, + { name = "scipy" }, + { name = "torch" }, + { name = "tqdm" }, + { name = "transformers" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/4d/68/7f98c221940ce783b492ad6140384daf2e2918cd7175009d6a362c22b9ee/sentence_transformers-5.4.1.tar.gz", hash = "sha256:436bcb1182a0ff42a8fb2b1c43498a70d0a75b688d182f2cd0d1dd115af61ddc", size = 428910, upload-time = "2026-04-14T13:34:59.006Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/c5/d9/3a9b6f2ccdedc9dc00fe37b2fc58f58f8efbff44565cf4bf39d8568bb13a/sentence_transformers-5.4.1-py3-none-any.whl", hash = "sha256:a6d640fc363849b63affb8e140e9d328feabab86f83d58ac3e16b1c28140b790", size = 571311, upload-time = "2026-04-14T13:34:57.731Z" }, +] + [[package]] name = "sentry-sdk" version = "2.53.0" @@ -1554,6 +2045,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, ] +[[package]] +name = "sniffio" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372, upload-time = "2024-02-25T23:20:04.057Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, +] + [[package]] name = "sqlalchemy" version = "2.0.45" @@ -1906,3 +2406,124 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6b/ae/90366304d7c2ce80f9b826096a9e9048b4bb760e44d3b873bb272cba696b/websockets-16.0-cp313-cp313-win_amd64.whl", hash = "sha256:3425ac5cf448801335d6fdc7ae1eb22072055417a96cc6b31b3861f455fbc156", size = 178689, upload-time = "2026-01-10T09:23:10.483Z" }, { url = "https://files.pythonhosted.org/packages/6f/28/258ebab549c2bf3e64d2b0217b973467394a9cea8c42f70418ca2c5d0d2e/websockets-16.0-py3-none-any.whl", hash = "sha256:1637db62fad1dc833276dded54215f2c7fa46912301a24bd94d45d46a011ceec", size 
= 171598, upload-time = "2026-01-10T09:23:45.395Z" }, ] + +[[package]] +name = "xxhash" +version = "3.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/02/84/30869e01909fb37a6cc7e18688ee8bf1e42d57e7e0777636bd47524c43c7/xxhash-3.6.0.tar.gz", hash = "sha256:f0162a78b13a0d7617b2845b90c763339d1f1d82bb04a4b07f4ab535cc5e05d6", size = 85160, upload-time = "2025-10-02T14:37:08.097Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9a/07/d9412f3d7d462347e4511181dea65e47e0d0e16e26fbee2ea86a2aefb657/xxhash-3.6.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:01362c4331775398e7bb34e3ab403bc9ee9f7c497bc7dee6272114055277dd3c", size = 32744, upload-time = "2025-10-02T14:34:34.622Z" }, + { url = "https://files.pythonhosted.org/packages/79/35/0429ee11d035fc33abe32dca1b2b69e8c18d236547b9a9b72c1929189b9a/xxhash-3.6.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b7b2df81a23f8cb99656378e72501b2cb41b1827c0f5a86f87d6b06b69f9f204", size = 30816, upload-time = "2025-10-02T14:34:36.043Z" }, + { url = "https://files.pythonhosted.org/packages/b7/f2/57eb99aa0f7d98624c0932c5b9a170e1806406cdbcdb510546634a1359e0/xxhash-3.6.0-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:dc94790144e66b14f67b10ac8ed75b39ca47536bf8800eb7c24b50271ea0c490", size = 194035, upload-time = "2025-10-02T14:34:37.354Z" }, + { url = "https://files.pythonhosted.org/packages/4c/ed/6224ba353690d73af7a3f1c7cdb1fc1b002e38f783cb991ae338e1eb3d79/xxhash-3.6.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:93f107c673bccf0d592cdba077dedaf52fe7f42dcd7676eba1f6d6f0c3efffd2", size = 212914, upload-time = "2025-10-02T14:34:38.6Z" }, + { url = "https://files.pythonhosted.org/packages/38/86/fb6b6130d8dd6b8942cc17ab4d90e223653a89aa32ad2776f8af7064ed13/xxhash-3.6.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = 
"sha256:2aa5ee3444c25b69813663c9f8067dcfaa2e126dc55e8dddf40f4d1c25d7effa", size = 212163, upload-time = "2025-10-02T14:34:39.872Z" }, + { url = "https://files.pythonhosted.org/packages/ee/dc/e84875682b0593e884ad73b2d40767b5790d417bde603cceb6878901d647/xxhash-3.6.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f7f99123f0e1194fa59cc69ad46dbae2e07becec5df50a0509a808f90a0f03f0", size = 445411, upload-time = "2025-10-02T14:34:41.569Z" }, + { url = "https://files.pythonhosted.org/packages/11/4f/426f91b96701ec2f37bb2b8cec664eff4f658a11f3fa9d94f0a887ea6d2b/xxhash-3.6.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:49e03e6fe2cac4a1bc64952dd250cf0dbc5ef4ebb7b8d96bce82e2de163c82a2", size = 193883, upload-time = "2025-10-02T14:34:43.249Z" }, + { url = "https://files.pythonhosted.org/packages/53/5a/ddbb83eee8e28b778eacfc5a85c969673e4023cdeedcfcef61f36731610b/xxhash-3.6.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bd17fede52a17a4f9a7bc4472a5867cb0b160deeb431795c0e4abe158bc784e9", size = 210392, upload-time = "2025-10-02T14:34:45.042Z" }, + { url = "https://files.pythonhosted.org/packages/1e/c2/ff69efd07c8c074ccdf0a4f36fcdd3d27363665bcdf4ba399abebe643465/xxhash-3.6.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:6fb5f5476bef678f69db04f2bd1efbed3030d2aba305b0fc1773645f187d6a4e", size = 197898, upload-time = "2025-10-02T14:34:46.302Z" }, + { url = "https://files.pythonhosted.org/packages/58/ca/faa05ac19b3b622c7c9317ac3e23954187516298a091eb02c976d0d3dd45/xxhash-3.6.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:843b52f6d88071f87eba1631b684fcb4b2068cd2180a0224122fe4ef011a9374", size = 210655, upload-time = "2025-10-02T14:34:47.571Z" }, + { url = "https://files.pythonhosted.org/packages/d4/7a/06aa7482345480cc0cb597f5c875b11a82c3953f534394f620b0be2f700c/xxhash-3.6.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = 
"sha256:7d14a6cfaf03b1b6f5f9790f76880601ccc7896aff7ab9cd8978a939c1eb7e0d", size = 414001, upload-time = "2025-10-02T14:34:49.273Z" }, + { url = "https://files.pythonhosted.org/packages/23/07/63ffb386cd47029aa2916b3d2f454e6cc5b9f5c5ada3790377d5430084e7/xxhash-3.6.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:418daf3db71e1413cfe211c2f9a528456936645c17f46b5204705581a45390ae", size = 191431, upload-time = "2025-10-02T14:34:50.798Z" }, + { url = "https://files.pythonhosted.org/packages/0f/93/14fde614cadb4ddf5e7cebf8918b7e8fac5ae7861c1875964f17e678205c/xxhash-3.6.0-cp312-cp312-win32.whl", hash = "sha256:50fc255f39428a27299c20e280d6193d8b63b8ef8028995323bf834a026b4fbb", size = 30617, upload-time = "2025-10-02T14:34:51.954Z" }, + { url = "https://files.pythonhosted.org/packages/13/5d/0d125536cbe7565a83d06e43783389ecae0c0f2ed037b48ede185de477c0/xxhash-3.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:c0f2ab8c715630565ab8991b536ecded9416d615538be8ecddce43ccf26cbc7c", size = 31534, upload-time = "2025-10-02T14:34:53.276Z" }, + { url = "https://files.pythonhosted.org/packages/54/85/6ec269b0952ec7e36ba019125982cf11d91256a778c7c3f98a4c5043d283/xxhash-3.6.0-cp312-cp312-win_arm64.whl", hash = "sha256:eae5c13f3bc455a3bbb68bdc513912dc7356de7e2280363ea235f71f54064829", size = 27876, upload-time = "2025-10-02T14:34:54.371Z" }, + { url = "https://files.pythonhosted.org/packages/33/76/35d05267ac82f53ae9b0e554da7c5e281ee61f3cad44c743f0fcd354f211/xxhash-3.6.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:599e64ba7f67472481ceb6ee80fa3bd828fd61ba59fb11475572cc5ee52b89ec", size = 32738, upload-time = "2025-10-02T14:34:55.839Z" }, + { url = "https://files.pythonhosted.org/packages/31/a8/3fbce1cd96534a95e35d5120637bf29b0d7f5d8fa2f6374e31b4156dd419/xxhash-3.6.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7d8b8aaa30fca4f16f0c84a5c8d7ddee0e25250ec2796c973775373257dde8f1", size = 30821, upload-time = "2025-10-02T14:34:57.219Z" }, + { url = 
"https://files.pythonhosted.org/packages/0c/ea/d387530ca7ecfa183cb358027f1833297c6ac6098223fd14f9782cd0015c/xxhash-3.6.0-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:d597acf8506d6e7101a4a44a5e428977a51c0fadbbfd3c39650cca9253f6e5a6", size = 194127, upload-time = "2025-10-02T14:34:59.21Z" }, + { url = "https://files.pythonhosted.org/packages/ba/0c/71435dcb99874b09a43b8d7c54071e600a7481e42b3e3ce1eb5226a5711a/xxhash-3.6.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:858dc935963a33bc33490128edc1c12b0c14d9c7ebaa4e387a7869ecc4f3e263", size = 212975, upload-time = "2025-10-02T14:35:00.816Z" }, + { url = "https://files.pythonhosted.org/packages/84/7a/c2b3d071e4bb4a90b7057228a99b10d51744878f4a8a6dd643c8bd897620/xxhash-3.6.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ba284920194615cb8edf73bf52236ce2e1664ccd4a38fdb543506413529cc546", size = 212241, upload-time = "2025-10-02T14:35:02.207Z" }, + { url = "https://files.pythonhosted.org/packages/81/5f/640b6eac0128e215f177df99eadcd0f1b7c42c274ab6a394a05059694c5a/xxhash-3.6.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4b54219177f6c6674d5378bd862c6aedf64725f70dd29c472eaae154df1a2e89", size = 445471, upload-time = "2025-10-02T14:35:03.61Z" }, + { url = "https://files.pythonhosted.org/packages/5e/1e/3c3d3ef071b051cc3abbe3721ffb8365033a172613c04af2da89d5548a87/xxhash-3.6.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:42c36dd7dbad2f5238950c377fcbf6811b1cdb1c444fab447960030cea60504d", size = 193936, upload-time = "2025-10-02T14:35:05.013Z" }, + { url = "https://files.pythonhosted.org/packages/2c/bd/4a5f68381939219abfe1c22a9e3a5854a4f6f6f3c4983a87d255f21f2e5d/xxhash-3.6.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = 
"sha256:f22927652cba98c44639ffdc7aaf35828dccf679b10b31c4ad72a5b530a18eb7", size = 210440, upload-time = "2025-10-02T14:35:06.239Z" }, + { url = "https://files.pythonhosted.org/packages/eb/37/b80fe3d5cfb9faff01a02121a0f4d565eb7237e9e5fc66e73017e74dcd36/xxhash-3.6.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b45fad44d9c5c119e9c6fbf2e1c656a46dc68e280275007bbfd3d572b21426db", size = 197990, upload-time = "2025-10-02T14:35:07.735Z" }, + { url = "https://files.pythonhosted.org/packages/d7/fd/2c0a00c97b9e18f72e1f240ad4e8f8a90fd9d408289ba9c7c495ed7dc05c/xxhash-3.6.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:6f2580ffab1a8b68ef2b901cde7e55fa8da5e4be0977c68f78fc80f3c143de42", size = 210689, upload-time = "2025-10-02T14:35:09.438Z" }, + { url = "https://files.pythonhosted.org/packages/93/86/5dd8076a926b9a95db3206aba20d89a7fc14dd5aac16e5c4de4b56033140/xxhash-3.6.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:40c391dd3cd041ebc3ffe6f2c862f402e306eb571422e0aa918d8070ba31da11", size = 414068, upload-time = "2025-10-02T14:35:11.162Z" }, + { url = "https://files.pythonhosted.org/packages/af/3c/0bb129170ee8f3650f08e993baee550a09593462a5cddd8e44d0011102b1/xxhash-3.6.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f205badabde7aafd1a31e8ca2a3e5a763107a71c397c4481d6a804eb5063d8bd", size = 191495, upload-time = "2025-10-02T14:35:12.971Z" }, + { url = "https://files.pythonhosted.org/packages/e9/3a/6797e0114c21d1725e2577508e24006fd7ff1d8c0c502d3b52e45c1771d8/xxhash-3.6.0-cp313-cp313-win32.whl", hash = "sha256:2577b276e060b73b73a53042ea5bd5203d3e6347ce0d09f98500f418a9fcf799", size = 30620, upload-time = "2025-10-02T14:35:14.129Z" }, + { url = "https://files.pythonhosted.org/packages/86/15/9bc32671e9a38b413a76d24722a2bf8784a132c043063a8f5152d390b0f9/xxhash-3.6.0-cp313-cp313-win_amd64.whl", hash = "sha256:757320d45d2fbcce8f30c42a6b2f47862967aea7bf458b9625b4bbe7ee390392", size = 31542, upload-time = "2025-10-02T14:35:15.21Z" }, + { url = 
"https://files.pythonhosted.org/packages/39/c5/cc01e4f6188656e56112d6a8e0dfe298a16934b8c47a247236549a3f7695/xxhash-3.6.0-cp313-cp313-win_arm64.whl", hash = "sha256:457b8f85dec5825eed7b69c11ae86834a018b8e3df5e77783c999663da2f96d6", size = 27880, upload-time = "2025-10-02T14:35:16.315Z" }, + { url = "https://files.pythonhosted.org/packages/f3/30/25e5321c8732759e930c555176d37e24ab84365482d257c3b16362235212/xxhash-3.6.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:a42e633d75cdad6d625434e3468126c73f13f7584545a9cf34e883aa1710e702", size = 32956, upload-time = "2025-10-02T14:35:17.413Z" }, + { url = "https://files.pythonhosted.org/packages/9f/3c/0573299560d7d9f8ab1838f1efc021a280b5ae5ae2e849034ef3dee18810/xxhash-3.6.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:568a6d743219e717b07b4e03b0a828ce593833e498c3b64752e0f5df6bfe84db", size = 31072, upload-time = "2025-10-02T14:35:18.844Z" }, + { url = "https://files.pythonhosted.org/packages/7a/1c/52d83a06e417cd9d4137722693424885cc9878249beb3a7c829e74bf7ce9/xxhash-3.6.0-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:bec91b562d8012dae276af8025a55811b875baace6af510412a5e58e3121bc54", size = 196409, upload-time = "2025-10-02T14:35:20.31Z" }, + { url = "https://files.pythonhosted.org/packages/e3/8e/c6d158d12a79bbd0b878f8355432075fc82759e356ab5a111463422a239b/xxhash-3.6.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:78e7f2f4c521c30ad5e786fdd6bae89d47a32672a80195467b5de0480aa97b1f", size = 215736, upload-time = "2025-10-02T14:35:21.616Z" }, + { url = "https://files.pythonhosted.org/packages/bc/68/c4c80614716345d55071a396cf03d06e34b5f4917a467faf43083c995155/xxhash-3.6.0-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3ed0df1b11a79856df5ffcab572cbd6b9627034c1c748c5566fa79df9048a7c5", size = 214833, upload-time = "2025-10-02T14:35:23.32Z" }, + { url = 
"https://files.pythonhosted.org/packages/7e/e9/ae27c8ffec8b953efa84c7c4a6c6802c263d587b9fc0d6e7cea64e08c3af/xxhash-3.6.0-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0e4edbfc7d420925b0dd5e792478ed393d6e75ff8fc219a6546fb446b6a417b1", size = 448348, upload-time = "2025-10-02T14:35:25.111Z" }, + { url = "https://files.pythonhosted.org/packages/d7/6b/33e21afb1b5b3f46b74b6bd1913639066af218d704cc0941404ca717fc57/xxhash-3.6.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fba27a198363a7ef87f8c0f6b171ec36b674fe9053742c58dd7e3201c1ab30ee", size = 196070, upload-time = "2025-10-02T14:35:26.586Z" }, + { url = "https://files.pythonhosted.org/packages/96/b6/fcabd337bc5fa624e7203aa0fa7d0c49eed22f72e93229431752bddc83d9/xxhash-3.6.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:794fe9145fe60191c6532fa95063765529770edcdd67b3d537793e8004cabbfd", size = 212907, upload-time = "2025-10-02T14:35:28.087Z" }, + { url = "https://files.pythonhosted.org/packages/4b/d3/9ee6160e644d660fcf176c5825e61411c7f62648728f69c79ba237250143/xxhash-3.6.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:6105ef7e62b5ac73a837778efc331a591d8442f8ef5c7e102376506cb4ae2729", size = 200839, upload-time = "2025-10-02T14:35:29.857Z" }, + { url = "https://files.pythonhosted.org/packages/0d/98/e8de5baa5109394baf5118f5e72ab21a86387c4f89b0e77ef3e2f6b0327b/xxhash-3.6.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:f01375c0e55395b814a679b3eea205db7919ac2af213f4a6682e01220e5fe292", size = 213304, upload-time = "2025-10-02T14:35:31.222Z" }, + { url = "https://files.pythonhosted.org/packages/7b/1d/71056535dec5c3177eeb53e38e3d367dd1d16e024e63b1cee208d572a033/xxhash-3.6.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:d706dca2d24d834a4661619dcacf51a75c16d65985718d6a7d73c1eeeb903ddf", size = 416930, upload-time = "2025-10-02T14:35:32.517Z" }, + { url = 
"https://files.pythonhosted.org/packages/dc/6c/5cbde9de2cd967c322e651c65c543700b19e7ae3e0aae8ece3469bf9683d/xxhash-3.6.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5f059d9faeacd49c0215d66f4056e1326c80503f51a1532ca336a385edadd033", size = 193787, upload-time = "2025-10-02T14:35:33.827Z" }, + { url = "https://files.pythonhosted.org/packages/19/fa/0172e350361d61febcea941b0cc541d6e6c8d65d153e85f850a7b256ff8a/xxhash-3.6.0-cp313-cp313t-win32.whl", hash = "sha256:1244460adc3a9be84731d72b8e80625788e5815b68da3da8b83f78115a40a7ec", size = 30916, upload-time = "2025-10-02T14:35:35.107Z" }, + { url = "https://files.pythonhosted.org/packages/ad/e6/e8cf858a2b19d6d45820f072eff1bea413910592ff17157cabc5f1227a16/xxhash-3.6.0-cp313-cp313t-win_amd64.whl", hash = "sha256:b1e420ef35c503869c4064f4a2f2b08ad6431ab7b229a05cce39d74268bca6b8", size = 31799, upload-time = "2025-10-02T14:35:36.165Z" }, + { url = "https://files.pythonhosted.org/packages/56/15/064b197e855bfb7b343210e82490ae672f8bc7cdf3ddb02e92f64304ee8a/xxhash-3.6.0-cp313-cp313t-win_arm64.whl", hash = "sha256:ec44b73a4220623235f67a996c862049f375df3b1052d9899f40a6382c32d746", size = 28044, upload-time = "2025-10-02T14:35:37.195Z" }, +] + +[[package]] +name = "yarl" +version = "1.23.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "idna" }, + { name = "multidict" }, + { name = "propcache" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/23/6e/beb1beec874a72f23815c1434518bfc4ed2175065173fb138c3705f658d4/yarl-1.23.0.tar.gz", hash = "sha256:53b1ea6ca88ebd4420379c330aea57e258408dd0df9af0992e5de2078dc9f5d5", size = 194676, upload-time = "2026-03-01T22:07:53.373Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/8a/94615bc31022f711add374097ad4144d569e95ff3c38d39215d07ac153a0/yarl-1.23.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:1932b6b8bba8d0160a9d1078aae5838a66039e8832d41d2992daa9a3a08f7860", size = 124737, upload-time = 
"2026-03-01T22:05:12.897Z" }, + { url = "https://files.pythonhosted.org/packages/e3/6f/c6554045d59d64052698add01226bc867b52fe4a12373415d7991fdca95d/yarl-1.23.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:411225bae281f114067578891bc75534cfb3d92a3b4dfef7a6ca78ba354e6069", size = 87029, upload-time = "2026-03-01T22:05:14.376Z" }, + { url = "https://files.pythonhosted.org/packages/19/2a/725ecc166d53438bc88f76822ed4b1e3b10756e790bafd7b523fe97c322d/yarl-1.23.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:13a563739ae600a631c36ce096615fe307f131344588b0bc0daec108cdb47b25", size = 86310, upload-time = "2026-03-01T22:05:15.71Z" }, + { url = "https://files.pythonhosted.org/packages/99/30/58260ed98e6ff7f90ba84442c1ddd758c9170d70327394a6227b310cd60f/yarl-1.23.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9cbf44c5cb4a7633d078788e1b56387e3d3cf2b8139a3be38040b22d6c3221c8", size = 97587, upload-time = "2026-03-01T22:05:17.384Z" }, + { url = "https://files.pythonhosted.org/packages/76/0a/8b08aac08b50682e65759f7f8dde98ae8168f72487e7357a5d684c581ef9/yarl-1.23.0-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:53ad387048f6f09a8969631e4de3f1bf70c50e93545d64af4f751b2498755072", size = 92528, upload-time = "2026-03-01T22:05:18.804Z" }, + { url = "https://files.pythonhosted.org/packages/52/07/0b7179101fe5f8385ec6c6bb5d0cb9f76bd9fb4a769591ab6fb5cdbfc69a/yarl-1.23.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4a59ba56f340334766f3a4442e0efd0af895fae9e2b204741ef885c446b3a1a8", size = 105339, upload-time = "2026-03-01T22:05:20.235Z" }, + { url = "https://files.pythonhosted.org/packages/d3/8a/36d82869ab5ec829ca8574dfcb92b51286fcfb1e9c7a73659616362dc880/yarl-1.23.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = 
"sha256:803a3c3ce4acc62eaf01eaca1208dcf0783025ef27572c3336502b9c232005e7", size = 105061, upload-time = "2026-03-01T22:05:22.268Z" }, + { url = "https://files.pythonhosted.org/packages/66/3e/868e5c3364b6cee19ff3e1a122194fa4ce51def02c61023970442162859e/yarl-1.23.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a3d2bff8f37f8d0f96c7ec554d16945050d54462d6e95414babaa18bfafc7f51", size = 100132, upload-time = "2026-03-01T22:05:23.638Z" }, + { url = "https://files.pythonhosted.org/packages/cf/26/9c89acf82f08a52cb52d6d39454f8d18af15f9d386a23795389d1d423823/yarl-1.23.0-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c75eb09e8d55bceb4367e83496ff8ef2bc7ea6960efb38e978e8073ea59ecb67", size = 99289, upload-time = "2026-03-01T22:05:25.749Z" }, + { url = "https://files.pythonhosted.org/packages/6f/54/5b0db00d2cb056922356104468019c0a132e89c8d3ab67d8ede9f4483d2a/yarl-1.23.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:877b0738624280e34c55680d6054a307aa94f7d52fa0e3034a9cc6e790871da7", size = 96950, upload-time = "2026-03-01T22:05:27.318Z" }, + { url = "https://files.pythonhosted.org/packages/f6/40/10fa93811fd439341fad7e0718a86aca0de9548023bbb403668d6555acab/yarl-1.23.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:b5405bb8f0e783a988172993cfc627e4d9d00432d6bbac65a923041edacf997d", size = 93960, upload-time = "2026-03-01T22:05:28.738Z" }, + { url = "https://files.pythonhosted.org/packages/bc/d2/8ae2e6cd77d0805f4526e30ec43b6f9a3dfc542d401ac4990d178e4bf0cf/yarl-1.23.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:1c3a3598a832590c5a3ce56ab5576361b5688c12cb1d39429cf5dba30b510760", size = 104703, upload-time = "2026-03-01T22:05:30.438Z" }, + { url = "https://files.pythonhosted.org/packages/2f/0c/b3ceacf82c3fe21183ce35fa2acf5320af003d52bc1fcf5915077681142e/yarl-1.23.0-cp312-cp312-musllinux_1_2_riscv64.whl", hash = 
"sha256:8419ebd326430d1cbb7efb5292330a2cf39114e82df5cc3d83c9a0d5ebeaf2f2", size = 98325, upload-time = "2026-03-01T22:05:31.835Z" }, + { url = "https://files.pythonhosted.org/packages/9d/e0/12900edd28bdab91a69bd2554b85ad7b151f64e8b521fe16f9ad2f56477a/yarl-1.23.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:be61f6fff406ca40e3b1d84716fde398fc08bc63dd96d15f3a14230a0973ed86", size = 105067, upload-time = "2026-03-01T22:05:33.358Z" }, + { url = "https://files.pythonhosted.org/packages/15/61/74bb1182cf79c9bbe4eb6b1f14a57a22d7a0be5e9cedf8e2d5c2086474c3/yarl-1.23.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3ceb13c5c858d01321b5d9bb65e4cf37a92169ea470b70fec6f236b2c9dd7e34", size = 100285, upload-time = "2026-03-01T22:05:35.4Z" }, + { url = "https://files.pythonhosted.org/packages/69/7f/cd5ef733f2550de6241bd8bd8c3febc78158b9d75f197d9c7baa113436af/yarl-1.23.0-cp312-cp312-win32.whl", hash = "sha256:fffc45637bcd6538de8b85f51e3df3223e4ad89bccbfca0481c08c7fc8b7ed7d", size = 82359, upload-time = "2026-03-01T22:05:36.811Z" }, + { url = "https://files.pythonhosted.org/packages/f5/be/25216a49daeeb7af2bec0db22d5e7df08ed1d7c9f65d78b14f3b74fd72fc/yarl-1.23.0-cp312-cp312-win_amd64.whl", hash = "sha256:f69f57305656a4852f2a7203efc661d8c042e6cc67f7acd97d8667fb448a426e", size = 87674, upload-time = "2026-03-01T22:05:38.171Z" }, + { url = "https://files.pythonhosted.org/packages/d2/35/aeab955d6c425b227d5b7247eafb24f2653fedc32f95373a001af5dfeb9e/yarl-1.23.0-cp312-cp312-win_arm64.whl", hash = "sha256:6e87a6e8735b44816e7db0b2fbc9686932df473c826b0d9743148432e10bb9b9", size = 81879, upload-time = "2026-03-01T22:05:40.006Z" }, + { url = "https://files.pythonhosted.org/packages/9a/4b/a0a6e5d0ee8a2f3a373ddef8a4097d74ac901ac363eea1440464ccbe0898/yarl-1.23.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:16c6994ac35c3e74fb0ae93323bf8b9c2a9088d55946109489667c510a7d010e", size = 123796, upload-time = "2026-03-01T22:05:41.412Z" }, + { url = 
"https://files.pythonhosted.org/packages/67/b6/8925d68af039b835ae876db5838e82e76ec87b9782ecc97e192b809c4831/yarl-1.23.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4a42e651629dafb64fd5b0286a3580613702b5809ad3f24934ea87595804f2c5", size = 86547, upload-time = "2026-03-01T22:05:42.841Z" }, + { url = "https://files.pythonhosted.org/packages/ae/50/06d511cc4b8e0360d3c94af051a768e84b755c5eb031b12adaaab6dec6e5/yarl-1.23.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7c6b9461a2a8b47c65eef63bb1c76a4f1c119618ffa99ea79bc5bb1e46c5821b", size = 85854, upload-time = "2026-03-01T22:05:44.85Z" }, + { url = "https://files.pythonhosted.org/packages/c4/f4/4e30b250927ffdab4db70da08b9b8d2194d7c7b400167b8fbeca1e4701ca/yarl-1.23.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2569b67d616eab450d262ca7cb9f9e19d2f718c70a8b88712859359d0ab17035", size = 98351, upload-time = "2026-03-01T22:05:46.836Z" }, + { url = "https://files.pythonhosted.org/packages/86/fc/4118c5671ea948208bdb1492d8b76bdf1453d3e73df051f939f563e7dcc5/yarl-1.23.0-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:e9d9a4d06d3481eab79803beb4d9bd6f6a8e781ec078ac70d7ef2dcc29d1bea5", size = 92711, upload-time = "2026-03-01T22:05:48.316Z" }, + { url = "https://files.pythonhosted.org/packages/56/11/1ed91d42bd9e73c13dc9e7eb0dd92298d75e7ac4dd7f046ad0c472e231cd/yarl-1.23.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f514f6474e04179d3d33175ed3f3e31434d3130d42ec153540d5b157deefd735", size = 106014, upload-time = "2026-03-01T22:05:50.028Z" }, + { url = "https://files.pythonhosted.org/packages/ce/c9/74e44e056a23fbc33aca71779ef450ca648a5bc472bdad7a82339918f818/yarl-1.23.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:fda207c815b253e34f7e1909840fd14299567b1c0eb4908f8c2ce01a41265401", size = 105557, upload-time = 
"2026-03-01T22:05:51.416Z" }, + { url = "https://files.pythonhosted.org/packages/66/fe/b1e10b08d287f518994f1e2ff9b6d26f0adeecd8dd7d533b01bab29a3eda/yarl-1.23.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:34b6cf500e61c90f305094911f9acc9c86da1a05a7a3f5be9f68817043f486e4", size = 101559, upload-time = "2026-03-01T22:05:52.872Z" }, + { url = "https://files.pythonhosted.org/packages/72/59/c5b8d94b14e3d3c2a9c20cb100119fd534ab5a14b93673ab4cc4a4141ea5/yarl-1.23.0-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:d7504f2b476d21653e4d143f44a175f7f751cd41233525312696c76aa3dbb23f", size = 100502, upload-time = "2026-03-01T22:05:54.954Z" }, + { url = "https://files.pythonhosted.org/packages/77/4f/96976cb54cbfc5c9fd73ed4c51804f92f209481d1fb190981c0f8a07a1d7/yarl-1.23.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:578110dd426f0d209d1509244e6d4a3f1a3e9077655d98c5f22583d63252a08a", size = 98027, upload-time = "2026-03-01T22:05:56.409Z" }, + { url = "https://files.pythonhosted.org/packages/63/6e/904c4f476471afdbad6b7e5b70362fb5810e35cd7466529a97322b6f5556/yarl-1.23.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:609d3614d78d74ebe35f54953c5bbd2ac647a7ddb9c30a5d877580f5e86b22f2", size = 95369, upload-time = "2026-03-01T22:05:58.141Z" }, + { url = "https://files.pythonhosted.org/packages/9d/40/acfcdb3b5f9d68ef499e39e04d25e141fe90661f9d54114556cf83be8353/yarl-1.23.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:4966242ec68afc74c122f8459abd597afd7d8a60dc93d695c1334c5fd25f762f", size = 105565, upload-time = "2026-03-01T22:06:00.286Z" }, + { url = "https://files.pythonhosted.org/packages/5e/c6/31e28f3a6ba2869c43d124f37ea5260cac9c9281df803c354b31f4dd1f3c/yarl-1.23.0-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:e0fd068364a6759bc794459f0a735ab151d11304346332489c7972bacbe9e72b", size = 99813, upload-time = "2026-03-01T22:06:01.712Z" }, + { url = 
"https://files.pythonhosted.org/packages/08/1f/6f65f59e72d54aa467119b63fc0b0b1762eff0232db1f4720cd89e2f4a17/yarl-1.23.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:39004f0ad156da43e86aa71f44e033de68a44e5a31fc53507b36dd253970054a", size = 105632, upload-time = "2026-03-01T22:06:03.188Z" }, + { url = "https://files.pythonhosted.org/packages/a3/c4/18b178a69935f9e7a338127d5b77d868fdc0f0e49becd286d51b3a18c61d/yarl-1.23.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e5723c01a56c5028c807c701aa66722916d2747ad737a046853f6c46f4875543", size = 101895, upload-time = "2026-03-01T22:06:04.651Z" }, + { url = "https://files.pythonhosted.org/packages/8f/54/f5b870b5505663911dba950a8e4776a0dbd51c9c54c0ae88e823e4b874a0/yarl-1.23.0-cp313-cp313-win32.whl", hash = "sha256:1b6b572edd95b4fa8df75de10b04bc81acc87c1c7d16bcdd2035b09d30acc957", size = 82356, upload-time = "2026-03-01T22:06:06.04Z" }, + { url = "https://files.pythonhosted.org/packages/7a/84/266e8da36879c6edcd37b02b547e2d9ecdfea776be49598e75696e3316e1/yarl-1.23.0-cp313-cp313-win_amd64.whl", hash = "sha256:baaf55442359053c7d62f6f8413a62adba3205119bcb6f49594894d8be47e5e3", size = 87515, upload-time = "2026-03-01T22:06:08.107Z" }, + { url = "https://files.pythonhosted.org/packages/00/fd/7e1c66efad35e1649114fa13f17485f62881ad58edeeb7f49f8c5e748bf9/yarl-1.23.0-cp313-cp313-win_arm64.whl", hash = "sha256:fb4948814a2a98e3912505f09c9e7493b1506226afb1f881825368d6fb776ee3", size = 81785, upload-time = "2026-03-01T22:06:10.181Z" }, + { url = "https://files.pythonhosted.org/packages/9c/fc/119dd07004f17ea43bb91e3ece6587759edd7519d6b086d16bfbd3319982/yarl-1.23.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:aecfed0b41aa72b7881712c65cf764e39ce2ec352324f5e0837c7048d9e6daaa", size = 130719, upload-time = "2026-03-01T22:06:11.708Z" }, + { url = "https://files.pythonhosted.org/packages/e6/0d/9f2348502fbb3af409e8f47730282cd6bc80dec6630c1e06374d882d6eb2/yarl-1.23.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = 
"sha256:a41bcf68efd19073376eb8cf948b8d9be0af26256403e512bb18f3966f1f9120", size = 89690, upload-time = "2026-03-01T22:06:13.429Z" }, + { url = "https://files.pythonhosted.org/packages/50/93/e88f3c80971b42cfc83f50a51b9d165a1dbf154b97005f2994a79f212a07/yarl-1.23.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:cde9a2ecd91668bcb7f077c4966d8ceddb60af01b52e6e3e2680e4cf00ad1a59", size = 89851, upload-time = "2026-03-01T22:06:15.53Z" }, + { url = "https://files.pythonhosted.org/packages/1c/07/61c9dd8ba8f86473263b4036f70fb594c09e99c0d9737a799dfd8bc85651/yarl-1.23.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5023346c4ee7992febc0068e7593de5fa2bf611848c08404b35ebbb76b1b0512", size = 95874, upload-time = "2026-03-01T22:06:17.553Z" }, + { url = "https://files.pythonhosted.org/packages/9e/e9/f9ff8ceefba599eac6abddcfb0b3bee9b9e636e96dbf54342a8577252379/yarl-1.23.0-cp313-cp313t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:d1009abedb49ae95b136a8904a3f71b342f849ffeced2d3747bf29caeda218c4", size = 88710, upload-time = "2026-03-01T22:06:19.004Z" }, + { url = "https://files.pythonhosted.org/packages/eb/78/0231bfcc5d4c8eec220bc2f9ef82cb4566192ea867a7c5b4148f44f6cbcd/yarl-1.23.0-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a8d00f29b42f534cc8aa3931cfe773b13b23e561e10d2b26f27a8d309b0e82a1", size = 101033, upload-time = "2026-03-01T22:06:21.203Z" }, + { url = "https://files.pythonhosted.org/packages/cd/9b/30ea5239a61786f18fd25797151a17fbb3be176977187a48d541b5447dd4/yarl-1.23.0-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:95451e6ce06c3e104556d73b559f5da6c34a069b6b62946d3ad66afcd51642ea", size = 100817, upload-time = "2026-03-01T22:06:22.738Z" }, + { url = 
"https://files.pythonhosted.org/packages/62/e2/a4980481071791bc83bce2b7a1a1f7adcabfa366007518b4b845e92eeee3/yarl-1.23.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:531ef597132086b6cf96faa7c6c1dcd0361dd5f1694e5cc30375907b9b7d3ea9", size = 97482, upload-time = "2026-03-01T22:06:24.21Z" }, + { url = "https://files.pythonhosted.org/packages/e5/1e/304a00cf5f6100414c4b5a01fc7ff9ee724b62158a08df2f8170dfc72a2d/yarl-1.23.0-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:88f9fb0116fbfcefcab70f85cf4b74a2b6ce5d199c41345296f49d974ddb4123", size = 95949, upload-time = "2026-03-01T22:06:25.697Z" }, + { url = "https://files.pythonhosted.org/packages/68/03/093f4055ed4cae649ac53bca3d180bd37102e9e11d048588e9ab0c0108d0/yarl-1.23.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:e7b0460976dc75cb87ad9cc1f9899a4b97751e7d4e77ab840fc9b6d377b8fd24", size = 95839, upload-time = "2026-03-01T22:06:27.309Z" }, + { url = "https://files.pythonhosted.org/packages/b9/28/4c75ebb108f322aa8f917ae10a8ffa4f07cae10a8a627b64e578617df6a0/yarl-1.23.0-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:115136c4a426f9da976187d238e84139ff6b51a20839aa6e3720cd1026d768de", size = 90696, upload-time = "2026-03-01T22:06:29.048Z" }, + { url = "https://files.pythonhosted.org/packages/23/9c/42c2e2dd91c1a570402f51bdf066bfdb1241c2240ba001967bad778e77b7/yarl-1.23.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:ead11956716a940c1abc816b7df3fa2b84d06eaed8832ca32f5c5e058c65506b", size = 100865, upload-time = "2026-03-01T22:06:30.525Z" }, + { url = "https://files.pythonhosted.org/packages/74/05/1bcd60a8a0a914d462c305137246b6f9d167628d73568505fce3f1cb2e65/yarl-1.23.0-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:fe8f8f5e70e6dbdfca9882cd9deaac058729bcf323cf7a58660901e55c9c94f6", size = 96234, upload-time = "2026-03-01T22:06:32.692Z" }, + { url = 
"https://files.pythonhosted.org/packages/90/b2/f52381aac396d6778ce516b7bc149c79e65bfc068b5de2857ab69eeea3b7/yarl-1.23.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:a0e317df055958a0c1e79e5d2aa5a5eaa4a6d05a20d4b0c9c3f48918139c9fc6", size = 100295, upload-time = "2026-03-01T22:06:34.268Z" }, + { url = "https://files.pythonhosted.org/packages/e5/e8/638bae5bbf1113a659b2435d8895474598afe38b4a837103764f603aba56/yarl-1.23.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6f0fd84de0c957b2d280143522c4f91a73aada1923caee763e24a2b3fda9f8a5", size = 97784, upload-time = "2026-03-01T22:06:35.864Z" }, + { url = "https://files.pythonhosted.org/packages/80/25/a3892b46182c586c202629fc2159aa13975d3741d52ebd7347fd501d48d5/yarl-1.23.0-cp313-cp313t-win32.whl", hash = "sha256:93a784271881035ab4406a172edb0faecb6e7d00f4b53dc2f55919d6c9688595", size = 88313, upload-time = "2026-03-01T22:06:37.39Z" }, + { url = "https://files.pythonhosted.org/packages/43/68/8c5b36aa5178900b37387937bc2c2fe0e9505537f713495472dcf6f6fccc/yarl-1.23.0-cp313-cp313t-win_amd64.whl", hash = "sha256:dd00607bffbf30250fe108065f07453ec124dbf223420f57f5e749b04295e090", size = 94932, upload-time = "2026-03-01T22:06:39.579Z" }, + { url = "https://files.pythonhosted.org/packages/c6/cc/d79ba8292f51f81f4dc533a8ccfb9fc6992cabf0998ed3245de7589dc07c/yarl-1.23.0-cp313-cp313t-win_arm64.whl", hash = "sha256:ac09d42f48f80c9ee1635b2fcaa819496a44502737660d3c0f2ade7526d29144", size = 84786, upload-time = "2026-03-01T22:06:41.988Z" }, + { url = "https://files.pythonhosted.org/packages/69/68/c8739671f5699c7dc470580a4f821ef37c32c4cb0b047ce223a7f115757f/yarl-1.23.0-py3-none-any.whl", hash = "sha256:a2df6afe50dea8ae15fa34c9f824a3ee958d785fd5d089063d960bae1daa0a3f", size = 48288, upload-time = "2026-03-01T22:07:51.388Z" }, +] From 0da6d29fcfa9f23ecebed0a40bfbaace71871a15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gr=C3=A9goire=20Co?= Date: Wed, 22 Apr 2026 17:14:18 +0200 Subject: [PATCH 2/8] Add record deletion for deduplication 
of model performance in Nocodb --- backend/balanceteshaters/services/nocodb.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/backend/balanceteshaters/services/nocodb.py b/backend/balanceteshaters/services/nocodb.py index ca9b6a80..1b610eda 100644 --- a/backend/balanceteshaters/services/nocodb.py +++ b/backend/balanceteshaters/services/nocodb.py @@ -49,6 +49,16 @@ def get_table_info(self, table_id: str) -> dict[str, Any]: response.raise_for_status() return response.json() + def delete_records(self, table_id: str, record_ids: list[int]) -> None: + """Delete records by ID list (batched to avoid URL length limits).""" + url = f"{self.nocodb_url}/api/v3/data/{self.base_id}/{table_id}/records" + headers = {"accept": "application/json", "xc-token": self.token, "Content-Type": "application/json"} + batch_size = 10 + for i in range(0, len(record_ids), batch_size): + batch = record_ids[i:i + batch_size] + response = requests.delete(url, headers=headers, json=[{"id": rid} for rid in batch]) + response.raise_for_status() + def count_records(self, table_id: str, where_str: str | None = None) -> int: """Count the number of records in a NocoDB table.""" url = f"{self.nocodb_url}/api/v3/data/{self.base_id}/{table_id}/count" From 7183a1e852abae9aed135ebdad4ba492737733d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gr=C3=A9goire=20Co?= Date: Wed, 22 Apr 2026 17:14:56 +0200 Subject: [PATCH 3/8] edit dependencies --- backend/pyproject.toml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/backend/pyproject.toml b/backend/pyproject.toml index 8bc9dffc..4ead3a35 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -33,3 +33,10 @@ dependencies = [ "datasets>=3.0.0", "python-dotenv>=1.0.0", ] + +[tool.ruff] +line-length = 120 + +[tool.ruff.lint] +# E402: scripts that patch sys.path before project imports (standalone runner pattern) +per-file-ignores = { "balanceteshaters/scripts/**/*.py" = ["E402"] } From df84d809b9d17e975a4bdcc8c2bdd136b7b7981e Mon Sep
17 00:00:00 2001 From: =?UTF-8?q?Gr=C3=A9goire=20Co?= Date: Wed, 22 Apr 2026 17:15:58 +0200 Subject: [PATCH 4/8] Dataset preparation, synthetic data generation and fine-tuning loops --- .../scripts/ml/00_prepare_dataset.py | 75 +++++ .../scripts/ml/01_generate_synthetic.py | 266 ++++++++++++++++++ .../ml/02_embed_and_train_classical.py | 233 +++++++++++++++ 3 files changed, 574 insertions(+) create mode 100644 backend/balanceteshaters/scripts/ml/00_prepare_dataset.py create mode 100644 backend/balanceteshaters/scripts/ml/01_generate_synthetic.py create mode 100644 backend/balanceteshaters/scripts/ml/02_embed_and_train_classical.py diff --git a/backend/balanceteshaters/scripts/ml/00_prepare_dataset.py b/backend/balanceteshaters/scripts/ml/00_prepare_dataset.py new file mode 100644 index 00000000..58e18d76 --- /dev/null +++ b/backend/balanceteshaters/scripts/ml/00_prepare_dataset.py @@ -0,0 +1,75 @@ +# ruff: noqa: E402 +import argparse +import os +import sys +from pathlib import Path + +import pandas as pd +from dotenv import load_dotenv +from sklearn.model_selection import train_test_split + +SCRIPTS_DIR = Path(__file__).resolve().parent.parent.parent.parent +if str(SCRIPTS_DIR) not in sys.path: + sys.path.insert(0, str(SCRIPTS_DIR)) + +from balanceteshaters.services.annotation import AnnotationService, BinaryConfidence +from balanceteshaters.services.nocodb import NocoDBService +from balanceteshaters.scripts.ml.config import ANNOTATION_TABLE_ID, DATA_DIR, compute_binary_label + + +def main(): + parser = argparse.ArgumentParser(description="Prepare train/val/test splits from NocoDB annotations") + parser.add_argument("--high-confidence-only", action="store_true", help="Keep only HIGH_CONFIDENCE annotations") + args = parser.parse_args() + + load_dotenv() + nocodb = NocoDBService( + nocodb_url=os.environ["NOCODB_BASE_URL"], + token=os.environ["NOCODB_TOKEN"], + base_id=os.environ["NOCODB_BASE_ID"], + ) + service = AnnotationService(nocodb=nocodb,
annotation_table_id=ANNOTATION_TABLE_ID) + + print("Fetching annotations from NocoDB...") + annotations = service.fetch_records_paginated() + print(f" Total records fetched: {len(annotations)}") + + rows = [] + for ann in annotations: + if not ann.annotated_category: + continue + if args.high_confidence_only and ann.binary_confidence != BinaryConfidence.HIGH_CONFIDENCE: + continue + cats = [c.value for c in ann.annotated_category] + label = compute_binary_label(cats) + if label is None: + continue + rows.append({ + "id": ann.id, + "comment": ann.comment, + "label": label, + "annotated_category": ",".join(cats), + "binary_confidence": ann.binary_confidence.value if ann.binary_confidence else None, + "source": "real", + }) + + df = pd.DataFrame(rows) + print(f" Usable annotated records: {len(df)}") + print(f" Label distribution: {df['label'].value_counts().to_dict()}") + + train_val, test = train_test_split(df, test_size=0.15, stratify=df["label"], random_state=42) + train, val = train_test_split(train_val, test_size=0.15 / 0.85, stratify=train_val["label"], random_state=42) + + DATA_DIR.mkdir(parents=True, exist_ok=True) + train.to_parquet(DATA_DIR / "train_real.parquet", index=False) + val.to_parquet(DATA_DIR / "val.parquet", index=False) + test.to_parquet(DATA_DIR / "test.parquet", index=False) + + print(f"\nSplits saved to {DATA_DIR}") + for name, split in [("train_real", train), ("val", val), ("test", test)]: + dist = split["label"].value_counts().to_dict() + print(f" {name}: {len(split)} rows label dist={dist}") + + +if __name__ == "__main__": + main() diff --git a/backend/balanceteshaters/scripts/ml/01_generate_synthetic.py b/backend/balanceteshaters/scripts/ml/01_generate_synthetic.py new file mode 100644 index 00000000..1b47e9fe --- /dev/null +++ b/backend/balanceteshaters/scripts/ml/01_generate_synthetic.py @@ -0,0 +1,266 @@ +# ruff: noqa: E402 +""" +Generate synthetic French social media comments for minority harassment categories. 
+Produces synthetic_v2.parquet and train_augmented_v2.parquet (train_real + synthetic). + +Improvements over v1: + - Bigger model (Sonnet by default) for higher-quality, more nuanced output + - Harder examples: subtle language, indirect threats, edge cases near category boundaries + - Few-shot grounding: real examples from train set included in each prompt +""" +import argparse +import os +import random +import re +import sys +from pathlib import Path + +import anthropic +import pandas as pd +from dotenv import load_dotenv + +SCRIPTS_DIR = Path(__file__).resolve().parent.parent.parent.parent +if str(SCRIPTS_DIR) not in sys.path: + sys.path.insert(0, str(SCRIPTS_DIR)) + +from balanceteshaters.scripts.ml.config import DATA_DIR + +# Pricing per million tokens (as of 2025) +MODEL_PRICING = { + "claude-haiku-4-5-20251001": {"input": 0.80, "output": 4.00, "cache_write": 1.00, "cache_read": 0.08}, + "claude-sonnet-4-6": {"input": 3.00, "output": 15.00, "cache_write": 3.75, "cache_read": 0.30}, + "claude-opus-4-7": {"input": 15.00, "output": 75.00, "cache_write": 18.75, "cache_read": 1.50}, +} + +TOTAL_TARGET = 1000 +BATCH_SIZE = 10 +FEW_SHOT_PER_CATEGORY = 5 # real examples to include in each generation prompt + +CATEGORY_DESCRIPTIONS = { + "Doxxing": "publication ou menace de publication d'informations privées (adresse, lieu de travail, numéro de téléphone, photo personnelle, identité réelle)", + "Incitation au suicide": "encouragement à se suicider ou à se blesser soi-même, messages du type 'tue-toi', 'va mourir', 'le monde irait mieux sans toi'", + "Cyberharcèlement à caractère sexuel": "harcèlement de nature sexuelle, propositions non sollicitées, slut-shaming, menaces sexuelles, commentaires dégradants sur le corps ou la sexualité", + "Menaces": "menaces directes ou voilées de violence physique, intimidation, messages impliquant des représailles", + "Incitation à la haine": "appels à la haine envers un groupe (racisme, homophobie, islamophobie, antisémitisme, transphobie,
etc.), déshumanisation", + "Absence de cyberharcèlement": "commentaire anodin, critique constructive, expression d'une opinion sans agressivité, humour bienveillant, désaccord poli", +} + +SYSTEM_PROMPT = """Tu es un expert en modération de contenu et en sécurité en ligne. Tu génères des exemples de commentaires de réseaux sociaux en français pour entraîner des modèles de détection de cyberharcèlement. + +Les commentaires doivent être réalistes et variés : +- Style : argot, verlan, abréviations SMS, emojis, fautes d'orthographe intentionnelles, mélange français/anglais +- Longueur : courts à moyens (5-80 mots), comme on en trouve sur Instagram, TikTok ou Twitter +- Difficulté : inclure un mélange d'exemples évidents ET d'exemples subtils/ambigus qui nécessitent une lecture attentive pour être classifiés +- Pour les catégories de harcèlement : certains doivent utiliser un langage indirect, des métaphores, du sous-entendu, ou du codé plutôt que des insultes directes +- Pour l'absence de harcèlement : inclure des cas qui ressemblent superficiellement à du harcèlement mais n'en sont pas (critique légitime, humour, sarcasme bienveillant) + +IMPORTANT : génère UNIQUEMENT des commentaires bruts, sans explication ni méta-commentaire. Chaque commentaire sur une ligne séparée. 
Numérote-les de 1 à N.""" + + +def estimate_cost(model: str, allocation: dict[str, int], n_shots: int) -> float: + pricing = MODEL_PRICING[model] + n_categories = len(allocation) + total_calls = sum(-(-v // BATCH_SIZE) for v in allocation.values()) + avg_system_tokens = 350 + avg_shots_tokens = n_shots * 20 # ~20 tokens per real example + avg_user_tokens = 80 + avg_shots_tokens + avg_output_tokens = BATCH_SIZE * 30 + + # First call per category writes the system prompt to cache; subsequent calls hit cache + cache_write_calls = n_categories + cache_read_calls = max(0, total_calls - n_categories) + + cost = ( + (cache_write_calls * avg_system_tokens * pricing["cache_write"] + + cache_read_calls * avg_system_tokens * pricing["cache_read"] + + total_calls * avg_user_tokens * pricing["input"] + + total_calls * avg_output_tokens * pricing["output"]) + / 1_000_000 + ) + return cost + + +def allocate_examples(train_df: pd.DataFrame) -> dict[str, int]: + benign_count = TOTAL_TARGET // 5 # 200 benign + harassment_count = TOTAL_TARGET - benign_count # 800 harassment + + harassment_cats = [c for c in CATEGORY_DESCRIPTIONS if c != "Absence de cyberharcèlement"] + per_cat = harassment_count // len(harassment_cats) + remainder = harassment_count % len(harassment_cats) + + allocation = {cat: per_cat for cat in harassment_cats} + for i, cat in enumerate(harassment_cats[:remainder]): + allocation[cat] += 1 + allocation["Absence de cyberharcèlement"] = benign_count + return allocation + + +def get_real_examples(train_df: pd.DataFrame, category: str, n: int) -> list[str]: + """Sample up to n real training examples for a given category.""" + col = "annotated_category" + if col not in train_df.columns: + return [] + subset = train_df[train_df[col] == category]["comment"].dropna().tolist() + if not subset: + # fall back: for benign, use label=0; for harassment, label=1 + label = 0 if category == "Absence de cyberharcèlement" else 1 + subset = train_df[train_df["label"] == 
label]["comment"].dropna().tolist() + return random.sample(subset, min(n, len(subset))) + + +def generate_batch( + client: anthropic.Anthropic, + category: str, + n: int, + real_examples: list[str], + tokens_used: dict, + model: str, +) -> list[str]: + description = CATEGORY_DESCRIPTIONS[category] + + shots_block = "" + if real_examples: + formatted = "\n".join(f" • {ex[:150]}" for ex in real_examples) + shots_block = f"\nExemples RÉELS de cette catégorie (pour calibrer le style et la difficulté) :\n{formatted}\n\nGénère des commentaires DIFFÉRENTS de ces exemples mais de style et difficulté similaires.\n" + + user_msg = ( + f"Catégorie : **{category}**\n" + f"Description : {description}\n" + f"{shots_block}\n" + f"Génère exactement {n} commentaires, numérotés de 1 à {n}." + ) + + response = client.messages.create( + model=model, + max_tokens=n * 80 + 150, + system=[ + { + "type": "text", + "text": SYSTEM_PROMPT, + "cache_control": {"type": "ephemeral"}, + } + ], + messages=[{"role": "user", "content": user_msg}], + ) + + tokens_used["input"] += response.usage.input_tokens + tokens_used["output"] += response.usage.output_tokens + if hasattr(response.usage, "cache_read_input_tokens"): + tokens_used["cache_read"] += response.usage.cache_read_input_tokens + if hasattr(response.usage, "cache_creation_input_tokens"): + tokens_used["cache_write"] += response.usage.cache_creation_input_tokens + + lines = response.content[0].text.strip().split("\n") + comments = [] + for line in lines: + line = line.strip() + if not line: + continue + cleaned = re.sub(r"^\d+[.)]\s*", "", line).strip() + if cleaned: + comments.append(cleaned) + return comments[:n] + + +def main(): + parser = argparse.ArgumentParser(description="Generate synthetic French harassment comments") + parser.add_argument("--dry-run", action="store_true", help="Print allocation and cost estimate only") + parser.add_argument( + "--model", + choices=list(MODEL_PRICING.keys()), + default="claude-sonnet-4-6", + 
help="Anthropic model to use for generation", + ) + parser.add_argument("--total", type=int, default=TOTAL_TARGET, help="Total examples to generate") + args = parser.parse_args() + + load_dotenv() + + train_path = DATA_DIR / "train_real.parquet" + if not train_path.exists(): + print(f"ERROR: {train_path} not found. Run 00_prepare_dataset.py first.") + sys.exit(1) + + train_df = pd.read_parquet(train_path) + allocation = allocate_examples(train_df) + # Rescale if --total was overridden + if args.total != TOTAL_TARGET: + scale = args.total / TOTAL_TARGET + allocation = {k: max(1, round(v * scale)) for k, v in allocation.items()} + + cost_estimate = estimate_cost(args.model, allocation, FEW_SHOT_PER_CATEGORY) + + print(f"=== Synthetic data allocation ({sum(allocation.values())} total) ===") + for cat, n in allocation.items(): + real_count = len(train_df[train_df["annotated_category"] == cat]) if "annotated_category" in train_df.columns else "?" + print(f" {cat}: {n} synthetic (real in train: {real_count})") + print(f"\nModel: {args.model}") + print(f"Few-shot examples per prompt: {FEW_SHOT_PER_CATEGORY}") + print(f"Estimated API cost: ~${cost_estimate:.3f}") + + if args.dry_run: + print("\n[dry-run] No API calls made.") + return + + api_key = os.environ.get("ANTHROPIC_API_KEY") + if not api_key: + print("ERROR: ANTHROPIC_API_KEY not set in environment.") + sys.exit(1) + + client = anthropic.Anthropic(api_key=api_key) + tokens_used = {"input": 0, "output": 0, "cache_read": 0, "cache_write": 0} + + all_rows = [] + for category, total_needed in allocation.items(): + print(f"\nGenerating {total_needed} examples for: {category}") + label = 0 if category == "Absence de cyberharcèlement" else 1 + real_examples = get_real_examples(train_df, category, FEW_SHOT_PER_CATEGORY) + print(f" Using {len(real_examples)} real few-shot examples") + + generated = [] + while len(generated) < total_needed: + batch_n = min(BATCH_SIZE, total_needed - len(generated)) + # Resample real examples 
each batch to add variety + shots = get_real_examples(train_df, category, FEW_SHOT_PER_CATEGORY) + batch = generate_batch(client, category, batch_n, shots, tokens_used, args.model) + generated.extend(batch) + print(f" {len(generated)}/{total_needed}", end="\r") + + for comment in generated[:total_needed]: + all_rows.append({ + "id": None, + "comment": comment, + "label": label, + "annotated_category": category, + "binary_confidence": None, + "source": "synthetic_v2", + }) + + samples = random.sample(generated[:total_needed], min(5, len(generated))) + print(f"\n Samples from '{category}':") + for s in samples: + print(f" • {s[:120]}") + + synthetic_df = pd.DataFrame(all_rows) + synthetic_df.to_parquet(DATA_DIR / "synthetic_v2.parquet", index=False) + + augmented_df = pd.concat([train_df, synthetic_df], ignore_index=True) + augmented_df.to_parquet(DATA_DIR / "train_augmented_v2.parquet", index=False) + + pricing = MODEL_PRICING[args.model] + actual_cost = ( + tokens_used["input"] * pricing["input"] + + tokens_used["output"] * pricing["output"] + + tokens_used.get("cache_write", 0) * pricing["cache_write"] + + tokens_used.get("cache_read", 0) * pricing["cache_read"] + ) / 1_000_000 + print("\n=== Done ===") + print(f" Synthetic examples: {len(synthetic_df)}") + print(f" train_augmented_v2 size: {len(augmented_df)}") + print(f" Tokens — input: {tokens_used['input']}, output: {tokens_used['output']}, cache_read: {tokens_used['cache_read']}, cache_write: {tokens_used['cache_write']}") + print(f" Actual API cost: ~${actual_cost:.4f}") + print(f" Files: {DATA_DIR}/synthetic_v2.parquet, train_augmented_v2.parquet") + + +if __name__ == "__main__": + main() diff --git a/backend/balanceteshaters/scripts/ml/02_embed_and_train_classical.py b/backend/balanceteshaters/scripts/ml/02_embed_and_train_classical.py new file mode 100644 index 00000000..ad7591d0 --- /dev/null +++ b/backend/balanceteshaters/scripts/ml/02_embed_and_train_classical.py @@ -0,0 +1,233 @@ +# ruff: noqa: E402 
+""" +Train three classifiers on frozen embeddings: + - Logistic Regression (linear baseline) + - LightGBM (tree ensemble) + - MLP (512→128) (neural head, best at dense vectors) + +Loops over 2 embedding models × 3 classifiers × 2 datasets = 12 runs. +Embeddings for val/test are shared per encoder to avoid recomputing. +""" +# Must be set before torch is imported: jina-v5 EuroBERT allocates MPS memory +# even when device="cpu", which segfaults on Apple Silicon. +import os +os.environ.setdefault("PYTORCH_MPS_HIGH_WATERMARK_RATIO", "0.0") + +import argparse +import sys +import time +from pathlib import Path + +import joblib +import numpy as np +import pandas as pd +import sklearn.metrics +from dotenv import load_dotenv +from sentence_transformers import SentenceTransformer + +SCRIPTS_DIR = Path(__file__).resolve().parent.parent.parent.parent +if str(SCRIPTS_DIR) not in sys.path: + sys.path.insert(0, str(SCRIPTS_DIR)) + +from balanceteshaters.scripts.ml.config import ( + ANNOTATION_TABLE_ID, + ARCTIC_EMBED_MODEL_ID, + BIDIR_MODEL_ID, + CHECKPOINTS_DIR, + DATA_DIR, + EVAL_TABLE_ID, + JINA_MODEL_ID, + JINA_SMALL_MODEL_ID, + get_device_for_model, + model_slug, +) +from balanceteshaters.services.nocodb import NocoDBService + + +def embed(model: SentenceTransformer, texts: list[str], task: str | None = None, batch_size: int = 32) -> np.ndarray: + kwargs = {"batch_size": batch_size, "show_progress_bar": True, "convert_to_numpy": True} + if task is not None: + kwargs["task"] = task + return model.encode(texts, **kwargs).astype(np.float32) + + +def evaluate(y_true, y_pred) -> dict: + return { + "f1": sklearn.metrics.f1_score(y_true, y_pred, zero_division=0), + "precision": sklearn.metrics.precision_score(y_true, y_pred, zero_division=0), + "recall": sklearn.metrics.recall_score(y_true, y_pred, zero_division=0), + "accuracy": sklearn.metrics.accuracy_score(y_true, y_pred), + } + + +def log_to_nocodb(nocodb, run_name: str, dataset: str, metrics: dict, n_total: int, n_pos: int, 
model_type: str = "encoder embedding"): + if nocodb is None: + return + data = { + "model_name": run_name, + "table_id": ANNOTATION_TABLE_ID, + "table_name": f"finetune/{dataset}", + "f1": metrics["f1"], + "precision": metrics["precision"], + "recall": metrics["recall"], + "accuracy": metrics["accuracy"], + "total_samples": n_total, + "positive_samples": n_pos, + "negative_samples": n_total - n_pos, + "prompt": f"frozen+{run_name.split('+')[1]} dataset={dataset}", + "model_type": model_type, + } + try: + nocodb.create_record(EVAL_TABLE_ID, data) + except Exception as e: + print(f" [warn] NocoDB logging failed: {e}") + + +def make_classifiers(): + # Lazy imports so LightGBM's OpenMP doesn't initialize before jina loads + from lightgbm import LGBMClassifier + from sklearn.linear_model import LogisticRegression + from sklearn.neural_network import MLPClassifier + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import StandardScaler + + return [ + ( + "logreg", + Pipeline([ + ("scaler", StandardScaler()), + ("clf", LogisticRegression(max_iter=1000, C=1.0)), + ]), + False, + ), + ( + "lightgbm", + LGBMClassifier(n_estimators=500, learning_rate=0.05, max_depth=6, verbose=-1, n_jobs=1), + True, # uses early stopping on val set + ), + ( + "mlp", + Pipeline([ + ("scaler", StandardScaler()), + ("clf", MLPClassifier( + hidden_layer_sizes=(512, 128), + activation="relu", + max_iter=200, + early_stopping=True, + validation_fraction=0.1, + n_iter_no_change=10, + random_state=42, + )), + ]), + False, + ), + ] + + +def run_for_encoder(model_id: str, is_jina: bool, datasets: list[str], nocodb): + import gc + device = get_device_for_model(model_id) + print(f"\n{'='*60}") + print(f"Encoder: {model_id} device={device}") + + load_kwargs = {"device": device, "trust_remote_code": True} + st_model = SentenceTransformer(model_id, **load_kwargs) + task = "classification" if is_jina else None + + val_df = pd.read_parquet(DATA_DIR / "val.parquet") + test_df = 
pd.read_parquet(DATA_DIR / "test.parquet") + + print("Embedding val set...") + X_val = embed(st_model, val_df["comment"].tolist(), task=task) + y_val = val_df["label"].values + + print("Embedding test set...") + X_test = embed(st_model, test_df["comment"].tolist(), task=task) + y_test = test_df["label"].values + + # Embed all training splits before freeing the encoder + train_embeddings: dict[str, tuple[np.ndarray, np.ndarray]] = {} + for dataset in datasets: + train_file = DATA_DIR / f"train_{dataset}.parquet" + if not train_file.exists(): + print(f" [skip] {train_file.name} not found") + continue + train_df = pd.read_parquet(train_file) + print(f"\nEmbedding train set ({dataset}, {len(train_df)} rows)...") + train_embeddings[dataset] = ( + embed(st_model, train_df["comment"].tolist(), task=task), + train_df["label"].values, + ) + + # Free encoder before initialising classifier threads (avoids OpenMP conflict) + del st_model + gc.collect() + + slug = model_slug(model_id) + CHECKPOINTS_DIR.mkdir(parents=True, exist_ok=True) + + for dataset, (X_train, y_train) in train_embeddings.items(): + for clf_name, clf, needs_val in make_classifiers(): + run_name = f"{slug}+{clf_name}+{dataset}" + print(f"\n Training {run_name}...") + t0 = time.time() + + if needs_val: + from lightgbm import early_stopping, log_evaluation + clf.fit( + X_train, y_train, + eval_set=[(X_val, y_val)], + callbacks=[early_stopping(50, verbose=False), log_evaluation(-1)], + ) + else: + clf.fit(X_train, y_train) + + elapsed = time.time() - t0 + y_pred = clf.predict(X_test) + m = evaluate(y_test, y_pred) + n_pos = int(y_test.sum()) + + print(f" F1={m['f1']:.4f} P={m['precision']:.4f} R={m['recall']:.4f} Acc={m['accuracy']:.4f} ({elapsed:.1f}s)") + + ckpt_path = CHECKPOINTS_DIR / f"{run_name}.joblib" + joblib.dump(clf, ckpt_path) + print(f" Saved to {ckpt_path.name}") + + log_to_nocodb(nocodb, run_name, dataset, m, len(y_test), n_pos) + + +def main(): + parser = 
argparse.ArgumentParser(description="Embed + train classical ML classifiers") + parser.add_argument("--models", nargs="+", choices=["jina", "jina-small", "bidir", "arctic", "all"], default=["all"]) + parser.add_argument("--datasets", nargs="+", choices=["real", "augmented", "augmented_v2", "all"], default=["all"]) + args = parser.parse_args() + + load_dotenv() + nocodb = None + if all(os.environ.get(k) for k in ["NOCODB_BASE_URL", "NOCODB_TOKEN", "NOCODB_BASE_ID"]): + nocodb = NocoDBService( + nocodb_url=os.environ["NOCODB_BASE_URL"], + token=os.environ["NOCODB_TOKEN"], + base_id=os.environ["NOCODB_BASE_ID"], + ) + + encoders = [] + if "all" in args.models or "jina" in args.models: + encoders.append((JINA_MODEL_ID, True)) + if "all" in args.models or "jina-small" in args.models: + encoders.append((JINA_SMALL_MODEL_ID, True)) + if "all" in args.models or "bidir" in args.models: + encoders.append((BIDIR_MODEL_ID, False)) + if "all" in args.models or "arctic" in args.models: + encoders.append((ARCTIC_EMBED_MODEL_ID, False)) + + datasets = ["real", "augmented", "augmented_v2"] if "all" in args.datasets else args.datasets + + for model_id, is_jina in encoders: + run_for_encoder(model_id, is_jina, datasets, nocodb) + + print("\nAll runs complete.") + + +if __name__ == "__main__": + main() From 68b9671003faa79834363ec96da796a8959c3271 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gr=C3=A9goire=20Co?= Date: Wed, 22 Apr 2026 17:16:40 +0200 Subject: [PATCH 5/8] Compare all trainings, and add script for prompting Claude for annotations --- .../scripts/ml/04_compare_evaluate.py | 253 ++++++++++++++++ .../scripts/ml/05_claude_annotate.py | 286 ++++++++++++++++++ 2 files changed, 539 insertions(+) create mode 100644 backend/balanceteshaters/scripts/ml/04_compare_evaluate.py create mode 100644 backend/balanceteshaters/scripts/ml/05_claude_annotate.py diff --git a/backend/balanceteshaters/scripts/ml/04_compare_evaluate.py b/backend/balanceteshaters/scripts/ml/04_compare_evaluate.py new 
file mode 100644 index 00000000..efba8cae --- /dev/null +++ b/backend/balanceteshaters/scripts/ml/04_compare_evaluate.py @@ -0,0 +1,253 @@ +# ruff: noqa: E402 +""" +Load all checkpoints and print a comparison table. +Also logs all results to NocoDB and highlights A/B delta (real vs augmented). + +SentenceTransformer embedding and fine-tuned inference run in isolated subprocesses +to avoid the OpenMP conflict between jina-nano's EuroBERT (libomp) and LightGBM (libgomp). +""" +import os +os.environ.setdefault("PYTORCH_MPS_HIGH_WATERMARK_RATIO", "0.0") + +import sys +import subprocess +import tempfile +from pathlib import Path + +import numpy as np +import pandas as pd +import sklearn.metrics +from dotenv import load_dotenv + +SCRIPTS_DIR = Path(__file__).resolve().parent.parent.parent.parent +if str(SCRIPTS_DIR) not in sys.path: + sys.path.insert(0, str(SCRIPTS_DIR)) + +from balanceteshaters.scripts.ml.config import ( + ANNOTATION_TABLE_ID, + ARCTIC_EMBED_MODEL_ID, + BIDIR_MODEL_ID, + CHECKPOINTS_DIR, + DATA_DIR, + EVAL_TABLE_ID, + JINA_MODEL_ID, + JINA_SMALL_MODEL_ID, + MODEL_TYPE, + XLMR_TOXICITY_MODEL_ID, + get_device_for_model, + model_slug, +) +from balanceteshaters.services.nocodb import NocoDBService + +_HELPER = Path(__file__).parent / "_eval_subprocess.py" + + +def metrics(y_true, y_pred) -> dict: + return { + "f1": sklearn.metrics.f1_score(y_true, y_pred, zero_division=0), + "precision": sklearn.metrics.precision_score(y_true, y_pred, zero_division=0), + "recall": sklearn.metrics.recall_score(y_true, y_pred, zero_division=0), + "accuracy": sklearn.metrics.accuracy_score(y_true, y_pred), + } + + +def load_llm_baseline(data_dir: Path) -> dict | None: + csv_dir = data_dir.parent + csvs = list(csv_dir.glob("predictions_m5t7qqaer2oa441_*.csv")) + if not csvs: + return None + + best = None + best_f1 = -1.0 + for p in csvs: + df = pd.read_csv(p) + if "annotated_category" not in df.columns or "predicted_category" not in df.columns: + continue + df = 
df[df["annotated_category"].notna()] + df["predicted_category"] = df["predicted_category"].astype(str).str.strip() + df = df[df["predicted_category"].isin(["0", "1"])] + if df.empty: + continue + y_true = (~df["annotated_category"].str.contains("Absence de cyberharcèlement")).astype(int) + y_pred = df["predicted_category"].astype(int) + f1 = sklearn.metrics.f1_score(y_true, y_pred, zero_division=0) + if f1 > best_f1: + best_f1 = f1 + best = {"run_name": f"LLM baseline ({p.stem})", "approach": "LLM prompt", "model": "best LLM", "dataset": "real", **metrics(y_true, y_pred)} + return best + + +def _subprocess_run(args: list[str]): + result = subprocess.run([sys.executable] + args, capture_output=True, text=True) + if result.returncode != 0: + raise RuntimeError(f"Subprocess failed (exit {result.returncode}):\n{result.stderr[-2000:]}") + + +def embed_in_subprocess(model_id: str, output_npy: str): + print(f" Embedding {model_slug(model_id)}...") + _subprocess_run([str(_HELPER), "embed", model_id, output_npy]) + + +def predict_classical_in_subprocess(embedding_npy: str, ckpt_path: str, output_npy: str): + _subprocess_run([str(_HELPER), "predict_classical", embedding_npy, ckpt_path, output_npy]) + + +def predict_ft_in_subprocess(model_id: str, ckpt_dir: str, output_npy: str): + _subprocess_run([str(_HELPER), "predict", model_id, ckpt_dir, output_npy]) + + +def predict_xlmr_zero_shot(test_df: pd.DataFrame) -> np.ndarray: + import torch + from transformers import AutoTokenizer, AutoModelForSequenceClassification + device = get_device_for_model(XLMR_TOXICITY_MODEL_ID) + tokenizer = AutoTokenizer.from_pretrained(XLMR_TOXICITY_MODEL_ID) + model = AutoModelForSequenceClassification.from_pretrained(XLMR_TOXICITY_MODEL_ID) + model.eval().to(device) + texts = test_df["comment"].tolist() + all_preds = [] + for i in range(0, len(texts), 32): + batch = texts[i:i + 32] + enc = tokenizer(batch, truncation=True, padding=True, max_length=512, return_tensors="pt").to(device) + with 
torch.no_grad(): + logits = model(**enc).logits + all_preds.extend(torch.argmax(logits, dim=-1).cpu().numpy()) + return np.array(all_preds) + + +def main(): + load_dotenv() + + test_df = pd.read_parquet(DATA_DIR / "test.parquet") + y_test = test_df["label"].values + + nocodb = None + if all(os.environ.get(k) for k in ["NOCODB_BASE_URL", "NOCODB_TOKEN", "NOCODB_BASE_ID"]): + nocodb = NocoDBService( + nocodb_url=os.environ["NOCODB_BASE_URL"], + token=os.environ["NOCODB_TOKEN"], + base_id=os.environ["NOCODB_BASE_ID"], + ) + + results = [] + tmp = Path(tempfile.mkdtemp()) + + encoder_configs = [ + (JINA_MODEL_ID, True), + (JINA_SMALL_MODEL_ID, True), + (BIDIR_MODEL_ID, False), + (ARCTIC_EMBED_MODEL_ID, False), + ] + + # ── Phase 1: embed test set — each model in isolated subprocess ──────── + # Prevents OpenMP conflict: jina-nano (EuroBERT/libomp) vs LightGBM (libgomp) + print("── Phase 1: computing test embeddings ──") + embedding_cache: dict[str, np.ndarray] = {} + for model_id, _ in encoder_configs: + npy_path = str(tmp / f"X_{model_slug(model_id)}.npy") + embed_in_subprocess(model_id, npy_path) + embedding_cache[model_id] = np.load(npy_path) + + # ── Phase 2: classical ML predictions — each in isolated subprocess ────── + # LightGBM (libgomp) conflicts with leaked OpenMP state from Phase 1 subprocesses + print("\n── Phase 2: classical ML predictions ──") + for model_id, _ in encoder_configs: + slug = model_slug(model_id) + embedding_npy = str(tmp / f"X_{slug}.npy") + + for clf_name in ["logreg", "lightgbm", "mlp"]: + for dataset in ["real", "augmented", "augmented_v2"]: + run_name = f"{slug}+{clf_name}+{dataset}" + ckpt = CHECKPOINTS_DIR / f"{run_name}.joblib" + if not ckpt.exists(): + print(f" [skip] {ckpt.name}") + continue + pred_npy = str(tmp / f"pred_{run_name}.npy") + predict_classical_in_subprocess(embedding_npy, str(ckpt), pred_npy) + y_pred = np.load(pred_npy) + m = metrics(y_test, y_pred) + results.append({"run": run_name, "approach": 
f"frozen+{clf_name}", "model": slug, "dataset": dataset, "model_type": MODEL_TYPE.get(model_id, "encoder embedding"), **m}) + print(f" {run_name:<55} F1={m['f1']:.4f}") + + # ── Phase 3: fine-tuned model predictions (each in isolated subprocess) ── + print("\n── Phase 3: fine-tuned predictions ──") + for model_id, _ in encoder_configs: + slug = model_slug(model_id) + for strategy in ["head_only", "full"]: + for dataset in ["real", "augmented", "augmented_v2"]: + run_name = f"{slug}-finetuned-{strategy}-{dataset}" + ckpt_dir = CHECKPOINTS_DIR / run_name + if not ckpt_dir.exists(): + print(f" [skip] {run_name}") + continue + npy_path = str(tmp / f"pred_{run_name}.npy") + print(f" Predicting {run_name}...") + predict_ft_in_subprocess(model_id, str(ckpt_dir), npy_path) + y_pred = np.load(npy_path) + if len(y_pred) == 1 and y_pred[0] == -1: + print(f" [skip] no best_model.pt in {run_name}") + continue + m = metrics(y_test, y_pred) + results.append({"run": run_name, "approach": f"finetune-{strategy}", "model": slug, "dataset": dataset, "model_type": MODEL_TYPE.get(model_id, "encoder embedding"), **m}) + print(f" {run_name:<55} F1={m['f1']:.4f}") + + # ── XLM-R zero-shot (no LightGBM conflict — transformers only) ──────── + print("\nRunning XLM-R toxicity zero-shot...") + xlmr_slug = model_slug(XLMR_TOXICITY_MODEL_ID) + y_pred = predict_xlmr_zero_shot(test_df) + m = metrics(y_test, y_pred) + results.append({"run": f"{xlmr_slug}-zero-shot", "approach": "zero-shot", "model": xlmr_slug, "dataset": "real", "model_type": "encoder classifier", **m}) + print(f" XLM-R zero-shot F1={m['f1']:.4f}") + + # ── LLM baseline ────────────────────────────────────────────────────── + baseline = load_llm_baseline(DATA_DIR) + if baseline: + run_name = baseline.pop("run_name") + results.append({"run": run_name, "model_type": "generative", **baseline}) + + if not results: + print("No results found. 
Run the training scripts first.") + return + + df = pd.DataFrame(results) + df = df.sort_values(["approach", "model", "dataset"]) + + print("\n" + "="*90) + print(f"{'Run':<55} {'Dataset':<12} {'F1':>6} {'P':>6} {'R':>6} {'Acc':>6}") + print("="*90) + for _, row in df.iterrows(): + print(f"{row['run']:<55} {row['dataset']:<12} {row['f1']:>6.4f} {row['precision']:>6.4f} {row['recall']:>6.4f} {row['accuracy']:>6.4f}") + + print("\n── A/B delta (augmented − real F1) ─────────────") + for (approach, model), group in df.groupby(["approach", "model"]): + real_row = group[group["dataset"] == "real"] + aug_row = group[group["dataset"] == "augmented"] + if real_row.empty or aug_row.empty: + continue + delta = aug_row["f1"].values[0] - real_row["f1"].values[0] + print(f" {approach:<25} {model:<35} {delta:+.4f}") + + if nocodb: + for _, row in df.iterrows(): + data = { + "model_name": row["run"], + "table_id": ANNOTATION_TABLE_ID, + "table_name": f"finetune/{row.get('dataset','?')}", + "f1": row["f1"], + "precision": row["precision"], + "recall": row["recall"], + "accuracy": row["accuracy"], + "total_samples": len(y_test), + "positive_samples": int(y_test.sum()), + "negative_samples": len(y_test) - int(y_test.sum()), + "prompt": f"approach={row.get('approach','?')} dataset={row.get('dataset','?')}", + "model_type": row.get("model_type", "encoder embedding"), + } + try: + nocodb.create_record(EVAL_TABLE_ID, data) + except Exception as e: + print(f" [warn] NocoDB: {e}") + print("\nAll results logged to NocoDB.") + + +if __name__ == "__main__": + main() diff --git a/backend/balanceteshaters/scripts/ml/05_claude_annotate.py b/backend/balanceteshaters/scripts/ml/05_claude_annotate.py new file mode 100644 index 00000000..c77ce713 --- /dev/null +++ b/backend/balanceteshaters/scripts/ml/05_claude_annotate.py @@ -0,0 +1,286 @@ +# ruff: noqa: E402 +""" +Auto-annotate unannotated comments using Claude API, with agreement evaluation. 
+ +Modes: + evaluate — Run Claude on a subsample of manually-annotated rows, measure agreement + against human labels to build confidence before using for training. + annotate — Fetch unannotated rows, classify with Claude, save to parquet. + +Usage: + python 05_claude_annotate.py --mode evaluate --n 100 + python 05_claude_annotate.py --mode annotate --limit 500 --dry-run + python 05_claude_annotate.py --mode annotate --limit 500 +""" +import argparse +import json +import os +import random +import re +import sys +from pathlib import Path + +import anthropic +import pandas as pd +from dotenv import load_dotenv + +SCRIPTS_DIR = Path(__file__).resolve().parent.parent.parent.parent +if str(SCRIPTS_DIR) not in sys.path: + sys.path.insert(0, str(SCRIPTS_DIR)) + +from balanceteshaters.scripts.ml.config import ANNOTATION_TABLE_ID, DATA_DIR +from balanceteshaters.services.annotation import AnnotatedCategory, AnnotationService +from balanceteshaters.services.nocodb import NocoDBService + +# Sonnet pricing per million tokens +SONNET_INPUT_PRICE = 3.00 +SONNET_OUTPUT_PRICE = 15.00 +SONNET_CACHE_WRITE_PRICE = 3.75 +SONNET_CACHE_READ_PRICE = 0.30 + +CATEGORIES = [c.value for c in AnnotatedCategory] +HARASSMENT_CATEGORIES = [c for c in CATEGORIES if c != "Absence de cyberharcèlement"] + +SYSTEM_PROMPT = f"""Tu es un expert en modération de contenu et en sécurité en ligne. Tu analyses des commentaires de réseaux sociaux français (Instagram, TikTok, YouTube, Twitter) pour détecter le cyberharcèlement. 
+ +Pour chaque commentaire, tu dois choisir la catégorie la plus appropriée parmi : +{chr(10).join(f'- {c}' for c in CATEGORIES)} + +Définitions : +- Absence de cyberharcèlement : commentaire anodin, critique constructive, humour bienveillant, opinion sans agressivité +- Cyberharcèlement (définition générale) : harcèlement en ligne ne rentrant pas dans une catégorie spécifique +- Cyberharcèlement (autre) : forme de harcèlement non couverte par les autres catégories +- Cyberharcèlement à caractère sexuel : harcèlement sexuel, slut-shaming, commentaires dégradants sur le corps/sexualité +- Menaces : menaces directes ou voilées de violence physique, intimidation +- Incitation au suicide : encouragement à se suicider ou se blesser +- Injure : insulte directe, terme offensant, dénigrement +- Diffamation : fausses accusations destinées à nuire à la réputation +- Injure et diffamation publique : combinaison d'injure et diffamation +- Doxxing : publication ou menace de publication d'informations privées +- Incitation à la haine : appel à la haine envers un groupe (racisme, homophobie, etc.) 
def classify_comment(client: anthropic.Anthropic, comment: str, tokens_used: dict) -> dict | None:
    """Classify one comment with Claude and return the parsed JSON verdict.

    Args:
        client: Anthropic client used to send the request.
        comment: Raw comment text to classify.
        tokens_used: Running counters (``input``, ``output``, ``cache_read``,
            ``cache_write``) updated in place with this call's token usage.

    Returns:
        The decoded JSON object with ``category`` guaranteed to be one of
        ``CATEGORIES`` and ``binary_label`` guaranteed to be 0 or 1, or
        ``None`` when the request or the parsing failed (best effort: the
        failure is printed, never raised).
    """
    try:
        response = client.messages.create(
            model="claude-sonnet-4-6",
            max_tokens=200,
            system=[{"type": "text", "text": SYSTEM_PROMPT, "cache_control": {"type": "ephemeral"}}],
            messages=[{"role": "user", "content": f"Commentaire : {comment}"}],
        )
        usage = response.usage
        tokens_used["input"] += usage.input_tokens
        tokens_used["output"] += usage.output_tokens
        # The cache counters can be missing or None; adding None would raise
        # here and throw away an otherwise valid (and already paid) answer.
        tokens_used["cache_read"] += getattr(usage, "cache_read_input_tokens", None) or 0
        tokens_used["cache_write"] += getattr(usage, "cache_creation_input_tokens", None) or 0

        text = response.content[0].text.strip()
        # Strip markdown code blocks if present
        text = re.sub(r"^```(?:json)?\s*|\s*```$", "", text, flags=re.MULTILINE).strip()
        result = json.loads(text)
        if not isinstance(result, dict):
            print(f" [warn] Classification failed: unexpected JSON payload {text[:80]!r}")
            return None

        # Validate category
        if result.get("category") not in CATEGORIES:
            result["category"] = "Suspect"
            result["binary_label"] = 1
        # Validate binary label: callers feed it straight into sklearn metrics,
        # so anything other than 0/1 is re-derived from the category.
        if result.get("binary_label") not in (0, 1):
            result["binary_label"] = 0 if result["category"] == "Absence de cyberharcèlement" else 1
        return result

    except Exception as e:
        print(f" [warn] Classification failed: {e}")
        return None
def mode_evaluate(service: AnnotationService, client: anthropic.Anthropic, n: int):
    """Sample n manually-annotated rows, run Claude blind, measure agreement.

    Prints accuracy, F1, Cohen's kappa, a 2x2 confusion matrix and the first
    disagreements, then writes every evaluated row to
    ``DATA_DIR / "claude_evaluate_agreement.parquet"``.

    Args:
        service: Annotation service used to fetch the annotated records.
        client: Anthropic client passed through to ``classify_comment``.
        n: Maximum number of annotated records to sample.
    """
    import sklearn.metrics

    print("Fetching annotated records...")
    all_annotated = service.fetch_records_paginated()
    all_annotated = [a for a in all_annotated if a.annotated_category]
    print(f" Found {len(all_annotated)} annotated records")

    sample = random.sample(all_annotated, min(n, len(all_annotated)))
    print(f" Evaluating on {len(sample)} randomly sampled records\n")

    tokens_used = {"input": 0, "output": 0, "cache_read": 0, "cache_write": 0}
    rows = []

    for i, ann in enumerate(sample):
        result = classify_comment(client, ann.comment, tokens_used)
        if result is None:
            continue

        # Human binary label
        cats = [c.value for c in ann.annotated_category]
        human_binary = 0 if any("Absence de cyberharcèlement" in c for c in cats) else 1
        claude_binary = result.get("binary_label", 1)
        if claude_binary not in (0, 1):
            # Same fallback as a missing label: anything unusable counts as harassment.
            claude_binary = 1

        rows.append({
            "id": ann.id,
            "comment": ann.comment[:120],
            "human_category": cats[0] if cats else "?",
            "human_binary": human_binary,
            "claude_category": result.get("category"),
            "claude_binary": claude_binary,
            "claude_confidence": result.get("confidence"),
            "claude_reasoning": result.get("reasoning", ""),
            "agree": human_binary == claude_binary,
        })

        if (i + 1) % 10 == 0:
            cost_so_far = compute_cost(tokens_used)
            print(f" {i+1}/{len(sample)} cost so far: ${cost_so_far:.3f}")

    if not rows:
        # An empty frame has no columns, so every lookup below would raise KeyError.
        print("No successful classifications; nothing to evaluate.")
        return

    df = pd.DataFrame(rows)
    y_true = df["human_binary"].values
    y_pred = df["claude_binary"].values

    acc = sklearn.metrics.accuracy_score(y_true, y_pred)
    f1 = sklearn.metrics.f1_score(y_true, y_pred, zero_division=0)
    prec = sklearn.metrics.precision_score(y_true, y_pred, zero_division=0)
    rec = sklearn.metrics.recall_score(y_true, y_pred, zero_division=0)
    kappa = sklearn.metrics.cohen_kappa_score(y_true, y_pred)

    print(f"\n{'='*60}")
    print(f"Agreement metrics (Claude vs human, n={len(df)})")
    print(f"{'='*60}")
    print(f" Accuracy : {acc:.4f}")
    print(f" F1 : {f1:.4f} (P={prec:.4f} R={rec:.4f})")
    print(f" Kappa : {kappa:.4f} {'(substantial)' if kappa > 0.6 else '(moderate)' if kappa > 0.4 else '(fair)'}")

    print("\nConfusion matrix (rows=human, cols=claude):")
    # labels=[0, 1] forces a 2x2 matrix even when the sample contains a single
    # class; without it cm[1, 1] raises IndexError on such samples.
    cm = sklearn.metrics.confusion_matrix(y_true, y_pred, labels=[0, 1])
    print(" Claude=0 Claude=1")
    print(f" Human=0 {cm[0,0]:5} {cm[0,1]:5}")
    print(f" Human=1 {cm[1,0]:5} {cm[1,1]:5}")

    disagree_df = df[~df["agree"]].head(20)
    if not disagree_df.empty:
        print(f"\nDisagreements (first {len(disagree_df)}):")
        print(f"{'Comment':<60} {'Human':>6} {'Claude':>6} {'Conf':<6} Reasoning")
        print("-" * 120)
        for _, row in disagree_df.iterrows():
            # Claude may omit these fields or send null; None supports neither
            # the alignment format spec nor slicing.
            confidence = str(row["claude_confidence"] or "?")
            reasoning = str(row["claude_reasoning"] or "")
            print(f"{row['comment'][:58]:<60} {row['human_binary']:>6} {row['claude_binary']:>6} {confidence:<6} {reasoning[:60]}")

    total_cost = compute_cost(tokens_used)
    print(f"\nAPI cost: ${total_cost:.4f} (tokens in={tokens_used['input']}, out={tokens_used['output']}, cache_read={tokens_used['cache_read']})")

    out_path = DATA_DIR / "claude_evaluate_agreement.parquet"
    df.to_parquet(out_path, index=False)
    print(f"Full results saved to {out_path}")
def main():
    """Parse CLI arguments, build the NocoDB and Anthropic clients, run the chosen mode.

    Exits with status 1 and a one-line message when a required environment
    variable (``ANTHROPIC_API_KEY`` or any ``NOCODB_*`` setting) is missing.
    """
    parser = argparse.ArgumentParser(description="Claude-based annotation and agreement evaluation")
    parser.add_argument("--mode", choices=["evaluate", "annotate"], required=True)
    parser.add_argument("--n", type=int, default=100, help="[evaluate] Number of annotated rows to sample")
    parser.add_argument("--limit", type=int, default=500, help="[annotate] Max unannotated rows to process")
    parser.add_argument("--dry-run", action="store_true", help="[annotate] Show cost estimate only")
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args()

    random.seed(args.seed)
    load_dotenv()

    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        print("ERROR: ANTHROPIC_API_KEY not set.")
        sys.exit(1)

    # Both modes read records from NocoDB, so fail with the same kind of clear
    # message as above instead of a KeyError traceback.
    nocodb_vars = ["NOCODB_BASE_URL", "NOCODB_TOKEN", "NOCODB_BASE_ID"]
    missing = [name for name in nocodb_vars if not os.environ.get(name)]
    if missing:
        print(f"ERROR: {', '.join(missing)} not set.")
        sys.exit(1)

    nocodb = NocoDBService(
        nocodb_url=os.environ["NOCODB_BASE_URL"],
        token=os.environ["NOCODB_TOKEN"],
        base_id=os.environ["NOCODB_BASE_ID"],
    )
    service = AnnotationService(nocodb=nocodb, annotation_table_id=ANNOTATION_TABLE_ID)
    client = anthropic.Anthropic(api_key=api_key)

    if args.mode == "evaluate":
        mode_evaluate(service, client, args.n)
    else:
        mode_annotate(service, client, args.limit, args.dry_run)
def best_threshold(proba):
    """Return the decision threshold in ``THRESHOLDS`` that maximises F1.

    Predictions are ``proba >= t`` and are scored against the module-level
    ``y_true``. Ties keep the earliest threshold in ``THRESHOLDS``.

    Args:
        proba: Array of positive-class scores, one per test example.

    Returns:
        ``(threshold, metrics)`` where ``metrics`` holds ``f1``, ``precision``,
        ``recall`` and ``accuracy`` for the selected threshold.
    """
    # Start below any reachable F1 so the first threshold is always recorded;
    # starting at 0.0 left the metrics as None whenever every threshold scored
    # F1 == 0, and the callers then crashed on m['f1'].
    best_t, best_f1, best_m = 0.5, -1.0, None
    for t in THRESHOLDS:
        y_pred = (proba >= t).astype(int)
        f1 = sklearn.metrics.f1_score(y_true, y_pred, zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            best_t = t
            best_m = {
                'f1': f1,
                'precision': sklearn.metrics.precision_score(y_true, y_pred, zero_division=0),
                'recall': sklearn.metrics.recall_score(y_true, y_pred, zero_division=0),
                'accuracy': sklearn.metrics.accuracy_score(y_true, y_pred),
            }
    return best_t, best_m
{"Acc":>7}') +print('-' * 100) + +for clf_name in ['logreg', 'lightgbm', 'mlp']: + ckpt = CHECKPOINTS_DIR / f'snowflake-arctic-embed-l-v2.0+{clf_name}+real.joblib' + if not ckpt.exists(): + continue + proba_path = str(tmp / f'proba_{clf_name}.npy') + r = subprocess.run( + [sys.executable, helper, 'predict_proba_classical', emb_path, str(ckpt), proba_path], + capture_output=True, text=True, + ) + if r.returncode != 0: + print(f' [error] {clf_name}: {r.stderr[-200:]}') + continue + proba = np.load(proba_path) + t, m = best_threshold(proba) + run = f'snowflake-arctic-embed-l-v2.0+{clf_name}+real' + flag = ' *** BEATS BEST' if m['f1'] > BEST_SO_FAR else '' + print(f' {run:<58} {t:>6.2f} {m["f1"]:>7.4f} {m["precision"]:>7.4f} {m["recall"]:>7.4f} {m["accuracy"]:>7.4f}{flag}') + if t != 0.50: + log(run, 'real', t, m) + +# fine-tuned head_only real +ft_dir = str(CHECKPOINTS_DIR / 'snowflake-arctic-embed-l-v2.0-finetuned-head_only-real') +proba_path = str(tmp / 'proba_ft_real.npy') +r = subprocess.run([sys.executable, helper, 'predict_proba', MODEL_ID, ft_dir, proba_path], capture_output=True, text=True) +if r.returncode == 0: + proba = np.load(proba_path) + t, m = best_threshold(proba) + run = 'snowflake-arctic-embed-l-v2.0-finetuned-head_only-real' + flag = ' *** BEATS BEST' if m['f1'] > BEST_SO_FAR else '' + print(f' {run:<58} {t:>6.2f} {m["f1"]:>7.4f} {m["precision"]:>7.4f} {m["recall"]:>7.4f} {m["accuracy"]:>7.4f}{flag}') + if t != 0.50: + log(run, 'real', t, m) +else: + print(f' [error] finetuned: {r.stderr[-300:]}') + +print('\nAll non-default thresholds logged to NocoDB.') diff --git a/backend/balanceteshaters/scripts/ml/config.py b/backend/balanceteshaters/scripts/ml/config.py new file mode 100644 index 00000000..0c56536e --- /dev/null +++ b/backend/balanceteshaters/scripts/ml/config.py @@ -0,0 +1,69 @@ +from pathlib import Path +import torch + +JINA_MODEL_ID = "jinaai/jina-embeddings-v5-text-nano" # 239M, EuroBERT, CPU-only (MPS segfault) +JINA_SMALL_MODEL_ID = 
"jinaai/jina-embeddings-v5-text-small" # 677M, Qwen3-based, MPS-safe +# Decoder-only, 270M, 640-dim, 94-language, no trust_remote_code needed +BIDIR_MODEL_ID = "microsoft/harrier-oss-v1-270m" +# Encoder-only, 568M, 1024-dim, bge-m3-retromae base, MRL, no trust_remote_code +ARCTIC_EMBED_MODEL_ID = "Snowflake/snowflake-arctic-embed-l-v2.0" +# 600M XLM-R fine-tuned on multilingual toxicity (15 langs incl. French); labels: 0=neutral, 1=toxic +XLMR_TOXICITY_MODEL_ID = "textdetox/xlmr-large-toxicity-classifier-v2" +MODELS = [JINA_MODEL_ID, JINA_SMALL_MODEL_ID, BIDIR_MODEL_ID, ARCTIC_EMBED_MODEL_ID, XLMR_TOXICITY_MODEL_ID] + +MODEL_TYPE = { + JINA_MODEL_ID: "encoder embedding", + JINA_SMALL_MODEL_ID: "encoder embedding", + BIDIR_MODEL_ID: "encoder embedding", + ARCTIC_EMBED_MODEL_ID: "encoder embedding", + XLMR_TOXICITY_MODEL_ID: "encoder classifier", +} + +SCRIPTS_DIR = Path(__file__).resolve().parent +BACKEND_DIR = SCRIPTS_DIR.parent.parent.parent +DATA_DIR = BACKEND_DIR / "balanceteshaters" / "data" / "finetune" +CHECKPOINTS_DIR = DATA_DIR / "checkpoints" + +ANNOTATION_TABLE_ID = "m5t7qqaer2oa441" +EVAL_TABLE_ID = "m0ww7qnx69u9r1a" + +LABEL_MAP = { + "Absence de cyberharcèlement": 0, +} + +MINORITY_CATEGORIES = [ + "Doxxing", + "Incitation au suicide", + "Cyberharcèlement à caractère sexuel", + "Menaces", + "Incitation à la haine", +] + + +def get_device() -> str: + if torch.backends.mps.is_available(): + return "mps" + if torch.cuda.is_available(): + return "cuda" + return "cpu" + + +def get_device_for_model(model_id: str) -> str: + # jina-v5-text-nano uses EuroBERT which segfaults on MPS — force CPU + # jina-v5-text-small uses Qwen3 and is MPS-safe + if model_id == JINA_MODEL_ID: + return "cpu" + return get_device() + + +def compute_binary_label(annotated_categories: list[str] | None) -> int | None: + if not annotated_categories: + return None + for cat in annotated_categories: + if "Absence de cyberharcèlement" in cat: + return 0 + return 1 + + +def 
model_slug(model_id: str) -> str: + return model_id.split("/")[-1] diff --git a/backend/balanceteshaters/scripts/ml/models.py b/backend/balanceteshaters/scripts/ml/models.py new file mode 100644 index 00000000..2e884fb9 --- /dev/null +++ b/backend/balanceteshaters/scripts/ml/models.py @@ -0,0 +1,34 @@ +"""Shared model definitions for the ML fine-tuning scripts.""" +import torch +import torch.nn as nn + + +class EmbeddingClassifier(nn.Module): + """Wraps a SentenceTransformer encoder with a linear classification head.""" + + def __init__(self, encoder, embed_dim: int, num_labels: int = 2, task: str | None = None, trainable_encoder: bool = False): + super().__init__() + self.encoder = encoder + self.classifier = nn.Linear(embed_dim, num_labels) + self.task = task + self.trainable_encoder = trainable_encoder + + def forward(self, texts: list[str], labels: torch.Tensor | None = None): + if self.trainable_encoder: + # SentenceTransformer.forward() preserves the computation graph; encode() does not + features = self.encoder.tokenize(texts) + features = {k: v.to(self.classifier.weight.device) for k, v in features.items()} + embeddings = self.encoder(features)["sentence_embedding"] + else: + encode_kwargs: dict = {"convert_to_numpy": True, "show_progress_bar": False} + if self.task: + encode_kwargs["task"] = self.task + embeddings = torch.tensor( + self.encoder.encode(texts, **encode_kwargs), + dtype=torch.float32, + ).to(self.classifier.weight.device) + logits = self.classifier(embeddings) + if labels is not None: + loss = nn.CrossEntropyLoss()(logits, labels) + return loss, logits + return logits From 1f1eee3a59deef0df65ca2afd57e910ca31a05f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gr=C3=A9goire=20Co?= Date: Wed, 22 Apr 2026 17:20:45 +0200 Subject: [PATCH 7/8] Add finetuning and init --- .../scripts/ml/03_finetune_embedding.py | 231 ++++++++++++++++++ .../balanceteshaters/scripts/ml/__init__.py | 0 2 files changed, 231 insertions(+) create mode 100644 
class TextDataset(Dataset):
    """Map-style dataset yielding ``(comment, label)`` pairs from a DataFrame."""

    def __init__(self, df: pd.DataFrame):
        """Read the ``comment`` column as a list and ``label`` as a long tensor."""
        label_array = df["label"].to_numpy()
        self.texts = df["comment"].tolist()
        self.labels = torch.tensor(label_array, dtype=torch.long)

    def __len__(self):
        """Number of examples."""
        return self.labels.shape[0]

    def __getitem__(self, idx):
        """Return the text and its label tensor at position ``idx``."""
        text = self.texts[idx]
        label = self.labels[idx]
        return text, label
+ data = { + "model_name": run_name, + "table_id": ANNOTATION_TABLE_ID, + "table_name": f"finetune/{dataset}", + "f1": metrics["f1"], + "precision": metrics["precision"], + "recall": metrics["recall"], + "accuracy": metrics["accuracy"], + "total_samples": n_total, + "positive_samples": n_pos, + "negative_samples": n_total - n_pos, + "prompt": f"strategy={strategy} dataset={dataset}", + "model_type": "encoder embedding", + } + try: + nocodb.create_record(EVAL_TABLE_ID, data) + except Exception as e: + print(f" [warn] NocoDB logging failed: {e}") + + +def run_finetune(model_id: str, is_jina: bool, strategy: str, dataset: str, nocodb, device: str): + slug = model_slug(model_id) + run_name = f"{slug}-finetuned-{strategy}-{dataset}" + ckpt_dir = CHECKPOINTS_DIR / run_name + ckpt_dir.mkdir(parents=True, exist_ok=True) + + print(f"\n{'='*60}") + print(f"Fine-tuning {model_id} strategy={strategy} dataset={dataset}") + + load_kwargs = {"device": device} + if is_jina: + load_kwargs["trust_remote_code"] = True + encoder = SentenceTransformer(model_id, **load_kwargs) + + probe_kwargs = {"convert_to_numpy": True, "show_progress_bar": False} + if is_jina: + probe_kwargs["task"] = "classification" + embed_dim = encoder.encode(["probe"], **probe_kwargs).shape[1] + + task = "classification" if is_jina else None + clf_model = EmbeddingClassifier(encoder, embed_dim, task=task, trainable_encoder=(strategy == "full")).to(device) + + if strategy == "head_only": + for param in clf_model.encoder.parameters(): + param.requires_grad = False + lr = 1e-3 + batch_size = 32 + max_epochs = 50 + patience = 15 + else: + lr = 2e-5 + batch_size = 16 + max_epochs = 15 + patience = 5 + + train_df = pd.read_parquet(DATA_DIR / f"train_{dataset}.parquet") + val_df = pd.read_parquet(DATA_DIR / "val.parquet") + test_df = pd.read_parquet(DATA_DIR / "test.parquet") + + train_loader = DataLoader(TextDataset(train_df), batch_size=batch_size, shuffle=True) + val_loader = DataLoader(TextDataset(val_df), 
batch_size=32, shuffle=False) + + optimizer = torch.optim.AdamW( + filter(lambda p: p.requires_grad, clf_model.parameters()), lr=lr, weight_decay=0.01 + ) + + best_f1 = 0.0 + no_improve = 0 + best_state = None + t0 = time.time() + + for epoch in range(1, max_epochs + 1): + clf_model.train() + total_loss = 0.0 + for texts, labels in train_loader: + labels = labels.to(device) + optimizer.zero_grad() + loss, _ = clf_model(texts, labels) + loss.backward() + optimizer.step() + total_loss += loss.item() + + clf_model.eval() + all_preds, all_labels = [], [] + with torch.no_grad(): + for texts, labels in val_loader: + logits = clf_model(texts) + preds = torch.argmax(logits, dim=-1).cpu().numpy() + all_preds.extend(preds) + all_labels.extend(labels.numpy()) + + val_f1 = sklearn.metrics.f1_score(all_labels, all_preds, zero_division=0) + avg_loss = total_loss / len(train_loader) + + if val_f1 > best_f1: + best_f1 = val_f1 + best_state = {k: v.cpu().clone() for k, v in clf_model.state_dict().items()} + torch.save(best_state, ckpt_dir / "best_model.pt") + no_improve = 0 + print(f" Epoch {epoch} loss={avg_loss:.4f} val_f1={val_f1:.4f} ← best") + else: + no_improve += 1 + print(f" Epoch {epoch} loss={avg_loss:.4f} val_f1={val_f1:.4f} (no improve {no_improve}/{patience})") + if no_improve >= patience: + print(f" Early stopping at epoch {epoch}") + break + + elapsed = time.time() - t0 + + if best_state: + clf_model.load_state_dict(best_state) + + clf_model.eval() + test_loader = DataLoader(TextDataset(test_df), batch_size=32, shuffle=False) + all_preds, all_labels = [], [] + with torch.no_grad(): + for texts, labels in test_loader: + logits = clf_model(texts) + preds = torch.argmax(logits, dim=-1).cpu().numpy() + all_preds.extend(preds) + all_labels.extend(labels.numpy()) + + metrics = { + "f1": sklearn.metrics.f1_score(all_labels, all_preds, zero_division=0), + "precision": sklearn.metrics.precision_score(all_labels, all_preds, zero_division=0), + "recall": 
sklearn.metrics.recall_score(all_labels, all_preds, zero_division=0), + "accuracy": sklearn.metrics.accuracy_score(all_labels, all_preds), + } + y_test = test_df["label"].values + + print(f"Test F1={metrics['f1']:.4f} P={metrics['precision']:.4f} R={metrics['recall']:.4f} Acc={metrics['accuracy']:.4f} ({elapsed:.0f}s)") + print(f" Saved to {ckpt_dir}") + log_to_nocodb(nocodb, run_name, dataset, strategy, metrics, len(y_test), int(y_test.sum())) + + +def main(): + parser = argparse.ArgumentParser(description="Fine-tune embedding model for binary classification") + parser.add_argument("--model", choices=["jina", "jina-small", "bidir", "arctic"], required=True) + parser.add_argument("--strategy", choices=["head_only", "full"], default="full") + parser.add_argument("--dataset", choices=["real", "augmented", "augmented_v2"], default="real") + args = parser.parse_args() + + load_dotenv() + nocodb = None + if all(os.environ.get(k) for k in ["NOCODB_BASE_URL", "NOCODB_TOKEN", "NOCODB_BASE_ID"]): + nocodb = NocoDBService( + nocodb_url=os.environ["NOCODB_BASE_URL"], + token=os.environ["NOCODB_TOKEN"], + base_id=os.environ["NOCODB_BASE_ID"], + ) + + model_map = { + "jina": JINA_MODEL_ID, + "jina-small": JINA_SMALL_MODEL_ID, + "bidir": BIDIR_MODEL_ID, + "arctic": ARCTIC_EMBED_MODEL_ID, + } + model_id = model_map[args.model] + is_jina = args.model in ("jina", "jina-small") + + if is_jina and args.strategy == "full": + print("Note: jina models use encode() which blocks gradient flow — 'full' is equivalent to 'head_only'. 
Running head_only.") + args.strategy = "head_only" + + device = get_device_for_model(model_id) + print(f"Device: {device}") + + CHECKPOINTS_DIR.mkdir(parents=True, exist_ok=True) + run_finetune(model_id, is_jina, args.strategy, args.dataset, nocodb, device) + + +if __name__ == "__main__": + main() diff --git a/backend/balanceteshaters/scripts/ml/__init__.py b/backend/balanceteshaters/scripts/ml/__init__.py new file mode 100644 index 00000000..e69de29b From 0662baa75c43adb8c954ed7370b8f17e9e41d509 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gr=C3=A9goire=20Co?= Date: Fri, 24 Apr 2026 11:59:01 +0200 Subject: [PATCH 8/8] Add README and other utils --- .../scripts/ml/03_finetune_embedding.py | 4 +- backend/balanceteshaters/scripts/ml/README.md | 369 ++++++++++++++++++ .../scripts/ml/_eval_subprocess.py | 144 +++++++ .../scripts/ml/_mlp_arch_search.py | 151 +++++++ .../scripts/ml/_threshold_sweep.py | 73 ++-- backend/balanceteshaters/scripts/ml/config.py | 2 +- .../scripts/ml/dedup_eval_table.py | 77 ++++ 7 files changed, 789 insertions(+), 31 deletions(-) create mode 100644 backend/balanceteshaters/scripts/ml/README.md create mode 100644 backend/balanceteshaters/scripts/ml/_eval_subprocess.py create mode 100644 backend/balanceteshaters/scripts/ml/_mlp_arch_search.py create mode 100644 backend/balanceteshaters/scripts/ml/dedup_eval_table.py diff --git a/backend/balanceteshaters/scripts/ml/03_finetune_embedding.py b/backend/balanceteshaters/scripts/ml/03_finetune_embedding.py index 0c0ac249..2bba2856 100644 --- a/backend/balanceteshaters/scripts/ml/03_finetune_embedding.py +++ b/backend/balanceteshaters/scripts/ml/03_finetune_embedding.py @@ -3,8 +3,8 @@ Fine-tune embedding models for binary harassment classification. 
Usage: - python 03_finetune_embedding.py --model bidir --strategy full --dataset real - python 03_finetune_embedding.py --model jina --strategy head_only --dataset augmented + python 03_finetune_embedding.py --model bidir --strategy full --dataset real + python 03_finetune_embedding.py --model arctic --strategy head_only --dataset augmented_v2 """ import os os.environ.setdefault("PYTORCH_MPS_HIGH_WATERMARK_RATIO", "0.0") diff --git a/backend/balanceteshaters/scripts/ml/README.md b/backend/balanceteshaters/scripts/ml/README.md new file mode 100644 index 00000000..55c0210c --- /dev/null +++ b/backend/balanceteshaters/scripts/ml/README.md @@ -0,0 +1,369 @@ +# Pipeline ML : détection de cyberharcèlement par embeddings + +Pipeline complet pour entraîner et évaluer des modèles d'embeddings sur la classification binaire de commentaires français issus des réseaux sociaux (Instagram, TikTok, YouTube, Twitter). + +## Vue d'ensemble + +Le pipeline est organisé en scripts numérotés à exécuter dans l'ordre : + +``` +00_prepare_dataset.py → construire les splits train/val/test depuis NocoDB +01_generate_synthetic.py → augmenter les catégories minoritaires avec Claude +02_embed_and_train_classical.py → classifieurs classiques sur embeddings gelés +03_finetune_embedding.py → fine-tuning bout-en-bout encodeur + tête +04_compare_evaluate.py → comparer tous les runs, logger dans NocoDB +05_claude_annotate.py → annoter les données non étiquetées avec Claude +``` + +Tous les résultats sont enregistrés dans NocoDB pour le suivi et la comparaison. 
+ +--- + +## Installation (MacBook M4 Pro) + +### Prérequis + +- **macOS Sequoia** (ou supérieur recommandé) +- **Python 3.12+** — installez via [pyenv](https://github.com/pyenv/pyenv) ou [mise](https://mise.jdx.dev/) +- **uv** — gestionnaire de packages rapide + +```bash +# Installer uv si pas déjà fait +curl -LsSf https://astral.sh/uv/install.sh | sh +``` + +### Cloner et installer les dépendances + +```bash +git clone URL_DU_DEPOT +cd 14_BalanceTesHaters/backend +uv sync +``` + +`uv sync` crée un environnement virtuel dans `.venv/` et installe toutes les dépendances définies dans `pyproject.toml`, y compris PyTorch, sentence-transformers et LightGBM. + +### Variables d'environnement + +Créez un fichier `.env` à la racine du dossier `backend/` : + +```bash +NOCODB_BASE_URL=https://votre-nocodb.example.com +NOCODB_TOKEN=votre_token +NOCODB_BASE_ID=votre_base_id +ANTHROPIC_API_KEY=votre_clé # facultatif — requis seulement pour 01_generate_synthetic.py et 05_claude_annotate.py +``` + +Les scripts `00`, `02`, `03` et `04` n'utilisent pas l'API Anthropic et fonctionnent sans cette clé. Les scripts appellent `load_dotenv()` automatiquement. + +### Notes spécifiques Apple Silicon (M4 Pro) + +Le chip M4 Pro dispose d'un GPU unifié (MPS) utilisé par PyTorch. Quelques points importants : + +- **jina-v5-text-nano** (`jina`) provoque un **segfault sur MPS** à cause d'EuroBERT. Il est automatiquement forcé sur CPU — pas d'action requise. +- **jina-v5-text-small**, BidirLM et Arctic sont tous **compatibles MPS** et utiliseront le GPU automatiquement. +- La variable d'environnement `PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0` est positionnée en tête de chaque script pour éviter les erreurs de mémoire unifiée. +- **LightGBM** utilise `libgomp` (OpenMP GNU) tandis que certains modèles HuggingFace utilisent `libomp` (OpenMP LLVM). Charger les deux dans le même processus provoque un crash. Les scripts gèrent cela via des sous-processus isolés — aucune configuration nécessaire de votre côté. 
+ +### Vérifier l'installation + +```bash +cd backend +uv run python -c "import torch; print('MPS disponible:', torch.backends.mps.is_available())" +# → MPS disponible: True +``` + +--- + +## Démarrage rapide + +```bash +# 1. Construire les splits +uv run python balanceteshaters/scripts/ml/00_prepare_dataset.py + +# 2. Générer des données synthétiques (vérifier le coût d'abord) +uv run python balanceteshaters/scripts/ml/01_generate_synthetic.py --dry-run +uv run python balanceteshaters/scripts/ml/01_generate_synthetic.py + +# 3. Entraîner les baselines sur embeddings gelés (tous modèles × classifieurs × datasets) +uv run python balanceteshaters/scripts/ml/02_embed_and_train_classical.py + +# 4. Fine-tuner (un run à la fois) +uv run python balanceteshaters/scripts/ml/03_finetune_embedding.py --model arctic --strategy head_only --dataset real +uv run python balanceteshaters/scripts/ml/03_finetune_embedding.py --model arctic --strategy head_only --dataset augmented_v2 + +# 5. Comparer tout +uv run python balanceteshaters/scripts/ml/04_compare_evaluate.py +``` + +--- + +## Scripts + +### 00_prepare_dataset.py — Construire les splits + +Récupère les enregistrements annotés depuis NocoDB et crée des splits stratifiés 70/15/15. + +```bash +uv run python balanceteshaters/scripts/ml/00_prepare_dataset.py [--high-confidence-only] +``` + +**Options :** +- `--high-confidence-only` — ne garder que les annotations marquées `HIGH_CONFIDENCE` (réduit la taille du dataset, améliore la qualité des labels) + +**Sorties** (dans `data/finetune/`) : +- `train_real.parquet` +- `val.parquet` +- `test.parquet` + +À ré-exécuter à chaque fois que les annotations changent. + +--- + +### 01_generate_synthetic.py — Augmenter les catégories minoritaires + +Utilise Claude pour générer des commentaires de cyberharcèlement synthétiques réalistes pour les catégories sous-représentées (doxxing, incitation au suicide, harcèlement sexuel, menaces, incitation à la haine). 
Chaque prompt inclut des exemples réels du training set comme ancrage few-shot. + +```bash +# Voir l'allocation et estimer le coût +uv run python balanceteshaters/scripts/ml/01_generate_synthetic.py --dry-run + +# Générer avec les paramètres par défaut (1000 exemples, Sonnet) +uv run python balanceteshaters/scripts/ml/01_generate_synthetic.py + +# Modèle moins cher ou nombre d'exemples réduit +uv run python balanceteshaters/scripts/ml/01_generate_synthetic.py --model claude-haiku-4-5-20251001 --total 500 +``` + +**Sorties :** +- `data/finetune/synthetic_v2.parquet` — exemples générés +- `data/finetune/train_augmented_v2.parquet` — `train_real` + synthétique (à utiliser avec `--dataset augmented_v2` dans les scripts suivants) + +**Coût typique :** ~0,10 $ pour 1000 exemples avec Sonnet. + +--- + +### 02_embed_and_train_classical.py — Baselines sur embeddings gelés + +Encode le texte avec chaque modèle puis entraîne trois classifieurs sur les représentations gelées. Couvre toutes les combinaisons encodeur × classifieur × dataset. + +```bash +# Tout (par défaut) +uv run python balanceteshaters/scripts/ml/02_embed_and_train_classical.py + +# Sous-ensemble +uv run python balanceteshaters/scripts/ml/02_embed_and_train_classical.py \ + --models arctic jina-small \ + --datasets real augmented_v2 +``` + +**Options :** +- `--models` — un ou plusieurs parmi `jina`, `jina-small`, `bidir`, `arctic`, `all` (défaut : `all`) +- `--datasets` — un ou plusieurs parmi `real`, `augmented`, `augmented_v2`, `all` (défaut : `all`) + +**Classifieurs entraînés :** + +| Nom | Architecture | +|---|---| +| `logreg` | Régression logistique (baseline linéaire) | +| `lightgbm` | LightGBM avec early stopping | +| `mlp` | MLP 512→128, ReLU, early stopping | + +**Sorties :** `data/finetune/checkpoints/{slug}+{clf}+{dataset}.joblib` + +--- + +### 03_finetune_embedding.py — Fine-tuning bout-en-bout + +Attache une tête de classification linéaire à un encodeur et entraîne avec AdamW. 
Deux stratégies disponibles : + +- `head_only` — encodeur gelé, seule la tête apprend (rapide, ~50 époques) +- `full` — encodeur + tête entraînés conjointement avec un faible LR (lent, ~15 époques) + +```bash +# Arctic head-only sur données réelles +uv run python balanceteshaters/scripts/ml/03_finetune_embedding.py \ + --model arctic --strategy head_only --dataset real + +# Head-only sur données augmentées +uv run python balanceteshaters/scripts/ml/03_finetune_embedding.py \ + --model arctic --strategy head_only --dataset augmented_v2 +``` + +**Options :** +- `--model` — `jina`, `jina-small`, `bidir`, `arctic` (requis) +- `--strategy` — `head_only` ou `full` (défaut : `full`) +- `--dataset` — `real`, `augmented`, `augmented_v2` (défaut : `real`) + +> Les modèles jina utilisent `encode()` qui bloque le flux de gradient, donc `full` bascule automatiquement en `head_only` pour eux. + +**Sortie :** `data/finetune/checkpoints/{slug}-finetuned-{strategy}-{dataset}/best_model.pt` + +L'entraînement sauvegarde le checkpoint avec le meilleur F1 de validation et applique l'early stopping. + +--- + +### 04_compare_evaluate.py — Table de comparaison complète + +Charge tous les checkpoints des scripts 02 et 03, évalue sur le test set, affiche une table de comparaison et logue tous les résultats dans NocoDB. Inclut également : +- Baseline zero-shot XLM-R toxicité +- Meilleure baseline LLM depuis les CSV de prédictions (si présents) +- Delta A/B montrant le gain F1 des données augmentées vs réelles + +```bash +uv run python balanceteshaters/scripts/ml/04_compare_evaluate.py +``` + +Les embeddings et prédictions tournent dans des sous-processus isolés pour éviter le conflit OpenMP entre EuroBERT de jina-nano (libomp) et LightGBM (libgomp). 
+ +**Meilleurs résultats observés (frozen embeddings, test set) :** +``` +Run F1 P R Acc +================================================================================ +snowflake-arctic-embed-l-v2.0+mlp+real 0.6916 0.6852 0.6981 0.7130 +snowflake-arctic-embed-l-v2.0+logreg+real 0.6903 0.6500 0.7358 0.6957 +harrier-oss-v1-270m+lightgbm+real 0.6729 0.6667 0.6792 0.6957 +jina-embeddings-v5-text-nano+lightgbm+augmented 0.6573 0.5222 0.8868 0.5739 +jina-embeddings-v5-text-small+mlp+real 0.6195 0.5833 0.6604 0.6261 +``` + +--- + +### 05_claude_annotate.py — Annotation avec Claude + +Deux modes : +- **evaluate** — fait tourner Claude sur un échantillon de lignes déjà annotées et mesure l'accord avec les labels humains (accuracy, F1, kappa de Cohen). À utiliser en premier pour valider la fiabilité. +- **annotate** — classe les lignes non annotées avec Claude et sauvegarde en parquet pour relecture. + +```bash +# Mesurer l'accord Claude vs humain sur 100 exemples +uv run python balanceteshaters/scripts/ml/05_claude_annotate.py --mode evaluate --n 100 + +# Estimer le coût +uv run python balanceteshaters/scripts/ml/05_claude_annotate.py --mode annotate --limit 500 --dry-run + +# Annoter +uv run python balanceteshaters/scripts/ml/05_claude_annotate.py --mode annotate --limit 500 +``` + +**Options :** +- `--mode` — `evaluate` ou `annotate` (requis) +- `--n` — nombre de lignes annotées à échantillonner pour l'évaluation (défaut : 100) +- `--limit` — nombre max de lignes non annotées à traiter (défaut : 500) +- `--dry-run` — afficher uniquement l'estimation de coût, sans appels API +- `--seed` — graine aléatoire pour l'échantillonnage (défaut : 42) + +**Sortie :** `data/finetune/claude_annotated.parquet` + +Après relecture, ré-exécuter `00_prepare_dataset.py` pour inclure les lignes annotées par Claude dans les splits. 
+ +--- + +## Modèles + +| Alias | ID HuggingFace | Params | Notes | +|---|---|---|---| +| `jina` | `jinaai/jina-embeddings-v5-text-nano` | 239M | Basé sur EuroBERT ; CPU uniquement sur Apple Silicon (segfault MPS) | +| `jina-small` | `jinaai/jina-embeddings-v5-text-small` | 677M | Basé sur Qwen3 ; compatible MPS | +| `bidir` | `microsoft/harrier-oss-v1-270m` | 270M | 94 langues, 640 dimensions | +| `arctic` | `Snowflake/snowflake-arctic-embed-l-v2.0` | 568M | 1024 dimensions, MRL, forte baseline retrieval | +| *(zero-shot)* | `textdetox/xlmr-large-toxicity-classifier-v2` | 600M | XLM-R fine-tuné sur la toxicité multilingue ; aucun entraînement requis | + +--- + +## Scripts utilitaires + +### _threshold_sweep.py + +Balaye les seuils de décision [0.20 … 0.50] pour les modèles classiques et fine-tunés Arctic. Logue dans NocoDB tout run dont le meilleur seuil diffère de 0.50. + +```bash +uv run python balanceteshaters/scripts/ml/_threshold_sweep.py +``` + +Nécessite que les checkpoints de `02_embed_and_train_classical.py` existent. + +--- + +### _mlp_arch_search.py + +Recherche de grille sur les configurations de couches cachées du MLP sur les embeddings Arctic gelés. Teste 8 architectures avec optimisation du seuil et logue la meilleure dans NocoDB. + +```bash +uv run python balanceteshaters/scripts/ml/_mlp_arch_search.py +``` + +--- + +### dedup_eval_table.py + +Supprime les entrées en doublon dans la table d'évaluation NocoDB, en gardant l'enregistrement le plus récent par `model_name`. Peut être exécuté plusieurs fois sans risque. + +```bash +uv run python balanceteshaters/scripts/ml/dedup_eval_table.py +``` + +--- + +### _eval_subprocess.py + +Helper interne utilisé par `04_compare_evaluate.py`, `_threshold_sweep.py` et `_mlp_arch_search.py`. Chaque commande tourne dans son propre sous-processus pour éviter les conflits OpenMP entre torch/sentence-transformers (libomp) et LightGBM (libgomp). 
+ +Commandes disponibles : +``` +embed MODEL_ID OUTPUT_NPY [split] +predict_classical EMBEDDING_NPY CKPT_PATH OUTPUT_NPY +predict_proba_classical EMBEDDING_NPY CKPT_PATH OUTPUT_NPY +predict MODEL_ID CKPT_DIR OUTPUT_NPY +predict_proba MODEL_ID CKPT_DIR OUTPUT_NPY +``` + +`split` vaut `"test"` par défaut. Passer `"train_real"` ou `"val"` pour les autres splits. + +--- + +## Structure des données + +``` +backend/balanceteshaters/data/finetune/ +├── train_real.parquet # ~70 % des données annotées +├── train_augmented.parquet # train_real + synthétique v1 +├── train_augmented_v2.parquet # train_real + synthétique v2 (recommandé) +├── val.parquet # ~15 %, utilisé pour l'early stopping +├── test.parquet # ~15 %, réservé à l'évaluation finale +├── synthetic_v2.parquet # exemples générés par Claude +├── claude_annotated.parquet # lignes annotées par Claude (après 05) +└── checkpoints/ + ├── {slug}+{clf}+{dataset}.joblib # classifieurs sur embeddings gelés + └── {slug}-finetuned-{strategy}-{dataset}/ + └── best_model.pt # meilleur checkpoint (val F1) +``` + +Tous les fichiers parquet partagent le même schéma : + +| Colonne | Type | Description | +|---|---|---| +| `id` | str/None | Identifiant NocoDB | +| `comment` | str | Texte brut du commentaire | +| `label` | int | 0 = bénin, 1 = cyberharcèlement | +| `annotated_category` | str | Noms de catégories séparés par des virgules | +| `binary_confidence` | str/None | `HIGH_CONFIDENCE` ou None | +| `source` | str | `real`, `synthetic_v2` ou `claude_annotated` | + +--- + +## Configuration (config.py) + +Module central avec les identifiants de modèles, chemins et utilitaires. 
+ +| Symbole | Description | +|---|---| +| `JINA_MODEL_ID` / `JINA_SMALL_MODEL_ID` | IDs des modèles jina | +| `ARCTIC_EMBED_MODEL_ID` | ID du modèle Snowflake Arctic | +| `XLMR_TOXICITY_MODEL_ID` | Classifieur XLM-R de toxicité (zero-shot) | +| `DATA_DIR` | Chemin vers `data/finetune/` | +| `CHECKPOINTS_DIR` | Chemin vers `data/finetune/checkpoints/` | +| `ANNOTATION_TABLE_ID` | Table NocoDB des annotations brutes | +| `EVAL_TABLE_ID` | Table NocoDB des résultats d'évaluation | +| `get_device_for_model(model_id)` | Retourne `"cpu"` pour jina-nano (MPS non sûr), sinon le meilleur device disponible | +| `model_slug(model_id)` | Extrait le nom court d'un ID HuggingFace (ex. `"snowflake-arctic-embed-l-v2.0"`) | +| `compute_binary_label(categories)` | Convertit une liste de catégories en label binaire 0/1 | diff --git a/backend/balanceteshaters/scripts/ml/_eval_subprocess.py b/backend/balanceteshaters/scripts/ml/_eval_subprocess.py new file mode 100644 index 00000000..bf39edb1 --- /dev/null +++ b/backend/balanceteshaters/scripts/ml/_eval_subprocess.py @@ -0,0 +1,144 @@ +""" +Subprocess helper for 04_compare_evaluate.py and threshold/arch sweep scripts. +Each command uses lazy imports so unrelated native libraries are never loaded together. + +Usage: + python _eval_subprocess.py embed MODEL_ID OUTPUT_NPY [split] + python _eval_subprocess.py predict_classical EMBEDDING_NPY CKPT_PATH OUTPUT_NPY + python _eval_subprocess.py predict_proba_classical EMBEDDING_NPY CKPT_PATH OUTPUT_NPY + python _eval_subprocess.py predict MODEL_ID CKPT_DIR OUTPUT_NPY + python _eval_subprocess.py predict_proba MODEL_ID CKPT_DIR OUTPUT_NPY + +[split] defaults to "test". Pass "train_real" or "val" to embed other splits. 
+""" +import os +os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0" + +import sys +import numpy as np +from pathlib import Path + +SCRIPTS_DIR = Path(__file__).resolve().parent.parent.parent.parent +if str(SCRIPTS_DIR) not in sys.path: + sys.path.insert(0, str(SCRIPTS_DIR)) + + +def cmd_embed(model_id: str, output_npy: str, split: str = "test"): + import pandas as pd + from sentence_transformers import SentenceTransformer + from balanceteshaters.scripts.ml.config import DATA_DIR, get_device_for_model + + df = pd.read_parquet(DATA_DIR / f"{split}.parquet") + is_jina = "jinaai/" in model_id + load_kwargs = {"device": get_device_for_model(model_id)} + if is_jina: + load_kwargs["trust_remote_code"] = True + + encoder = SentenceTransformer(model_id, **load_kwargs) + task = "classification" if is_jina else None + encode_kwargs = {"batch_size": 32, "show_progress_bar": False, "convert_to_numpy": True} + if task: + encode_kwargs["task"] = task + + X = encoder.encode(df["comment"].tolist(), **encode_kwargs).astype(np.float32) + np.save(output_npy, X) + + +def cmd_predict_classical(embedding_npy: str, ckpt_path: str, output_npy: str): + import joblib # imports LightGBM on first call — no torch/ST in this process + X = np.load(embedding_npy) + clf = joblib.load(ckpt_path) + np.save(output_npy, clf.predict(X)) + + +def cmd_predict_proba_classical(embedding_npy: str, ckpt_path: str, output_npy: str): + import joblib + X = np.load(embedding_npy) + clf = joblib.load(ckpt_path) + np.save(output_npy, clf.predict_proba(X)[:, 1]) + + +def _load_finetuned_clf(model_id: str, ckpt_dir: str): + """Load a fine-tuned EmbeddingClassifier. 
Returns (clf, device) or (None, None) if checkpoint missing.""" + import torch + from sentence_transformers import SentenceTransformer + from balanceteshaters.scripts.ml.config import get_device_for_model + from balanceteshaters.scripts.ml.models import EmbeddingClassifier + + state_path = Path(ckpt_dir) / "best_model.pt" + if not state_path.exists(): + return None, None + + device = get_device_for_model(model_id) + is_jina = "jinaai/" in model_id + load_kwargs = {"device": device} + if is_jina: + load_kwargs["trust_remote_code"] = True + encoder = SentenceTransformer(model_id, **load_kwargs) + + task = "classification" if is_jina else None + probe_kwargs = {"convert_to_numpy": True, "show_progress_bar": False} + if task: + probe_kwargs["task"] = task + embed_dim = encoder.encode(["probe"], **probe_kwargs).shape[1] + + clf = EmbeddingClassifier(encoder, embed_dim, task=task) + clf.load_state_dict(torch.load(state_path, map_location="cpu")) + clf.eval().to(device) + return clf, device + + +def cmd_predict(model_id: str, ckpt_dir: str, output_npy: str): + import torch + import pandas as pd + from balanceteshaters.scripts.ml.config import DATA_DIR + + clf, _ = _load_finetuned_clf(model_id, ckpt_dir) + if clf is None: + np.save(output_npy, np.array([-1])) + return + + texts = pd.read_parquet(DATA_DIR / "test.parquet")["comment"].tolist() + all_preds = [] + with torch.no_grad(): + for i in range(0, len(texts), 32): + all_preds.extend(torch.argmax(clf(texts[i:i + 32]), dim=-1).cpu().numpy()) + np.save(output_npy, np.array(all_preds)) + + +def cmd_predict_proba(model_id: str, ckpt_dir: str, output_npy: str): + import torch + import torch.nn.functional as F + import pandas as pd + from balanceteshaters.scripts.ml.config import DATA_DIR + + clf, _ = _load_finetuned_clf(model_id, ckpt_dir) + if clf is None: + np.save(output_npy, np.array([-1.0])) + return + + texts = pd.read_parquet(DATA_DIR / "test.parquet")["comment"].tolist() + all_proba = [] + with torch.no_grad(): + for 
i in range(0, len(texts), 32): + proba = F.softmax(clf(texts[i:i + 32]), dim=-1)[:, 1].cpu().numpy() + all_proba.extend(proba) + np.save(output_npy, np.array(all_proba)) + + +if __name__ == "__main__": + cmd = sys.argv[1] + if cmd == "embed": + split = sys.argv[4] if len(sys.argv) > 4 else "test" + cmd_embed(sys.argv[2], sys.argv[3], split) + elif cmd == "predict_classical": + cmd_predict_classical(sys.argv[2], sys.argv[3], sys.argv[4]) + elif cmd == "predict_proba_classical": + cmd_predict_proba_classical(sys.argv[2], sys.argv[3], sys.argv[4]) + elif cmd == "predict_proba": + cmd_predict_proba(sys.argv[2], sys.argv[3], sys.argv[4]) + elif cmd == "predict": + cmd_predict(sys.argv[2], sys.argv[3], sys.argv[4]) + else: + print(f"Unknown command: {cmd}", file=sys.stderr) + sys.exit(1) diff --git a/backend/balanceteshaters/scripts/ml/_mlp_arch_search.py b/backend/balanceteshaters/scripts/ml/_mlp_arch_search.py new file mode 100644 index 00000000..c6cc9aad --- /dev/null +++ b/backend/balanceteshaters/scripts/ml/_mlp_arch_search.py @@ -0,0 +1,151 @@ +# ruff: noqa: E402 +""" +Architecture search for MLP on frozen Arctic embeddings. +Sweeps hidden layer configs and decision thresholds, logs the best result to NocoDB. 
+""" +import os +import subprocess +import sys +import tempfile +os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0' + +import numpy as np +import pandas as pd +import sklearn.metrics +from dotenv import load_dotenv +from pathlib import Path +from sklearn.neural_network import MLPClassifier +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler + +SCRIPTS_DIR = Path(__file__).resolve().parent.parent.parent.parent +if str(SCRIPTS_DIR) not in sys.path: + sys.path.insert(0, str(SCRIPTS_DIR)) + +from balanceteshaters.services.nocodb import NocoDBService +from balanceteshaters.scripts.ml.config import ( + ANNOTATION_TABLE_ID, ARCTIC_EMBED_MODEL_ID, DATA_DIR, EVAL_TABLE_ID, model_slug, +) + +_HELPER = Path(__file__).parent / '_eval_subprocess.py' + +# Best F1 from the standard pipeline run (from 04_compare_evaluate.py) +BASELINE_F1 = 0.7414 + +THRESHOLDS = [0.50, 0.45, 0.40, 0.35, 0.30, 0.25, 0.20] + +ARCHITECTURES = [ + (256,), + (512,), + (256, 64), + (512, 128), # default in 02_embed_and_train_classical.py + (256, 128), + (128, 64), + (512, 256, 64), + (256, 128, 32), +] + + +def embed_split(split: str, tmp: Path) -> np.ndarray: + """Embed a data split in a subprocess (avoids OpenMP conflict with sklearn).""" + npy = str(tmp / f'X_{split}.npy') + r = subprocess.run( + [sys.executable, str(_HELPER), 'embed', ARCTIC_EMBED_MODEL_ID, npy, split], + capture_output=True, text=True, + ) + if r.returncode != 0: + print(r.stderr[-500:]) + sys.exit(1) + X = np.load(npy) + print(f' {split}: {X.shape}') + return X + + +def main(): + load_dotenv() + nocodb = NocoDBService( + os.environ['NOCODB_BASE_URL'], + os.environ['NOCODB_TOKEN'], + os.environ['NOCODB_BASE_ID'], + ) + tmp = Path(tempfile.mkdtemp()) + + print('Embedding splits...') + X_test = embed_split('test', tmp) + X_train = embed_split('train_real', tmp) + X_val = embed_split('val', tmp) + + y_test = pd.read_parquet(DATA_DIR / 'test.parquet')['label'].values + y_train = 
pd.read_parquet(DATA_DIR / 'train_real.parquet')['label'].values + y_val = pd.read_parquet(DATA_DIR / 'val.parquet')['label'].values + + print(f'\n{"Architecture":<22} {"ValF1":>7} {"TestF1":>7} {"BestT":>6} {"TunedF1":>8} {"P":>7} {"R":>7}') + print('-' * 90) + + best_result = None + + for layers in ARCHITECTURES: + clf = Pipeline([ + ('scaler', StandardScaler()), + ('clf', MLPClassifier( + hidden_layer_sizes=layers, + activation='relu', + max_iter=300, + early_stopping=True, + validation_fraction=0.1, + n_iter_no_change=15, + random_state=42, + )), + ]) + clf.fit(X_train, y_train) + + val_f1 = sklearn.metrics.f1_score(y_val, clf.predict(X_val), zero_division=0) + proba = clf.predict_proba(X_test)[:, 1] + default_f1 = sklearn.metrics.f1_score(y_test, (proba >= 0.5).astype(int), zero_division=0) + + best_t, best_f1, best_p, best_r = 0.5, 0.0, 0.0, 0.0 + for t in THRESHOLDS: + y_pred = (proba >= t).astype(int) + f1 = sklearn.metrics.f1_score(y_test, y_pred, zero_division=0) + if f1 > best_f1: + best_f1 = f1 + best_t = t + best_p = sklearn.metrics.precision_score(y_test, y_pred, zero_division=0) + best_r = sklearn.metrics.recall_score(y_test, y_pred, zero_division=0) + + flag = ' ***' if best_f1 > BASELINE_F1 else '' + print(f' {str(layers):<20} {val_f1:>7.4f} {default_f1:>7.4f} {best_t:>6.2f} {best_f1:>8.4f} {best_p:>7.4f} {best_r:>7.4f}{flag}') + + if best_result is None or best_f1 > best_result['tuned_f1']: + best_result = { + 'layers': layers, 'tuned_f1': best_f1, 'best_t': best_t, + 'precision': best_p, 'recall': best_r, + 'accuracy': sklearn.metrics.accuracy_score(y_test, (proba >= best_t).astype(int)), + } + + print(f'\nBest: {best_result["layers"]} at t={best_result["best_t"]} F1={best_result["tuned_f1"]:.4f}') + + if best_result['tuned_f1'] > BASELINE_F1: + slug = model_slug(ARCTIC_EMBED_MODEL_ID) + run_name = f'{slug}+mlp{best_result["layers"]}+real+threshold={best_result["best_t"]}' + nocodb.create_record(EVAL_TABLE_ID, { + 'model_name': run_name, + 
'table_id': ANNOTATION_TABLE_ID, + 'table_name': 'finetune/real', + 'f1': best_result['tuned_f1'], + 'precision': best_result['precision'], + 'recall': best_result['recall'], + 'accuracy': best_result['accuracy'], + 'total_samples': len(y_test), + 'positive_samples': int(y_test.sum()), + 'negative_samples': len(y_test) - int(y_test.sum()), + 'prompt': f'arch={best_result["layers"]} threshold={best_result["best_t"]}', + 'model_type': 'encoder embedding', + }) + print(f'Logged {run_name} to NocoDB.') + else: + print('No architecture beats baseline — nothing logged.') + + +if __name__ == '__main__': + main() diff --git a/backend/balanceteshaters/scripts/ml/_threshold_sweep.py b/backend/balanceteshaters/scripts/ml/_threshold_sweep.py index 941dd9c8..e30a7a90 100644 --- a/backend/balanceteshaters/scripts/ml/_threshold_sweep.py +++ b/backend/balanceteshaters/scripts/ml/_threshold_sweep.py @@ -1,38 +1,50 @@ # ruff: noqa: E402 +""" +Sweep decision thresholds for the Arctic + classical pipeline. +Logs any run that beats the default t=0.50 threshold to NocoDB. 
+""" import os import subprocess import sys import tempfile os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0' + import numpy as np import pandas as pd import sklearn.metrics from dotenv import load_dotenv from pathlib import Path -sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent.parent)) +SCRIPTS_DIR = Path(__file__).resolve().parent.parent.parent.parent +if str(SCRIPTS_DIR) not in sys.path: + sys.path.insert(0, str(SCRIPTS_DIR)) load_dotenv() from balanceteshaters.services.nocodb import NocoDBService -from balanceteshaters.scripts.ml.config import ANNOTATION_TABLE_ID, EVAL_TABLE_ID, CHECKPOINTS_DIR, DATA_DIR +from balanceteshaters.scripts.ml.config import ( + ANNOTATION_TABLE_ID, ARCTIC_EMBED_MODEL_ID, CHECKPOINTS_DIR, DATA_DIR, EVAL_TABLE_ID, model_slug, +) +_HELPER = Path(__file__).parent / '_eval_subprocess.py' tmp = Path(tempfile.mkdtemp()) emb_path = str(tmp / 'X_arctic.npy') -helper = str(Path(__file__).parent / '_eval_subprocess.py') -MODEL_ID = 'Snowflake/snowflake-arctic-embed-l-v2.0' + +# Best F1 from the frozen-embedding classical runs (from 04_compare_evaluate.py) +BASELINE_F1 = 0.741 + +THRESHOLDS = [0.50, 0.45, 0.40, 0.35, 0.30, 0.25, 0.20] print('Embedding test set...') -r = subprocess.run([sys.executable, helper, 'embed', MODEL_ID, emb_path], capture_output=True, text=True) +r = subprocess.run( + [sys.executable, str(_HELPER), 'embed', ARCTIC_EMBED_MODEL_ID, emb_path], + capture_output=True, text=True, +) if r.returncode != 0: print(r.stderr[-500:]) sys.exit(1) X_test = np.load(emb_path) -test_df = pd.read_parquet(DATA_DIR / 'test.parquet') -y_true = test_df['label'].values - -THRESHOLDS = [0.50, 0.45, 0.40, 0.35, 0.30, 0.25, 0.20] -BEST_SO_FAR = 0.741 +y_true = pd.read_parquet(DATA_DIR / 'test.parquet')['label'].values nocodb = NocoDBService(os.environ['NOCODB_BASE_URL'], os.environ['NOCODB_TOKEN'], os.environ['NOCODB_BASE_ID']) @@ -54,7 +66,7 @@ def best_threshold(proba): return best_t, best_m -def log(run_name, dataset, 
t, m): +def log_result(run_name, dataset, t, m): nocodb.create_record(EVAL_TABLE_ID, { 'model_name': f'{run_name}+threshold={t}', 'table_id': ANNOTATION_TABLE_ID, @@ -69,41 +81,46 @@ def log(run_name, dataset, t, m): }) +def print_row(run, t, m): + flag = ' *** BEATS BASELINE' if m['f1'] > BASELINE_F1 else '' + print(f' {run:<58} {t:>6.2f} {m["f1"]:>7.4f} {m["precision"]:>7.4f} {m["recall"]:>7.4f} {m["accuracy"]:>7.4f}{flag}') + + print(f'\n{"Run":<60} {"BestT":>6} {"F1":>7} {"P":>7} {"R":>7} {"Acc":>7}') print('-' * 100) +slug = model_slug(ARCTIC_EMBED_MODEL_ID) + for clf_name in ['logreg', 'lightgbm', 'mlp']: - ckpt = CHECKPOINTS_DIR / f'snowflake-arctic-embed-l-v2.0+{clf_name}+real.joblib' + ckpt = CHECKPOINTS_DIR / f'{slug}+{clf_name}+real.joblib' if not ckpt.exists(): continue proba_path = str(tmp / f'proba_{clf_name}.npy') r = subprocess.run( - [sys.executable, helper, 'predict_proba_classical', emb_path, str(ckpt), proba_path], + [sys.executable, str(_HELPER), 'predict_proba_classical', emb_path, str(ckpt), proba_path], capture_output=True, text=True, ) if r.returncode != 0: print(f' [error] {clf_name}: {r.stderr[-200:]}') continue - proba = np.load(proba_path) - t, m = best_threshold(proba) - run = f'snowflake-arctic-embed-l-v2.0+{clf_name}+real' - flag = ' *** BEATS BEST' if m['f1'] > BEST_SO_FAR else '' - print(f' {run:<58} {t:>6.2f} {m["f1"]:>7.4f} {m["precision"]:>7.4f} {m["recall"]:>7.4f} {m["accuracy"]:>7.4f}{flag}') + run = f'{slug}+{clf_name}+real' + t, m = best_threshold(np.load(proba_path)) + print_row(run, t, m) if t != 0.50: - log(run, 'real', t, m) + log_result(run, 'real', t, m) -# fine-tuned head_only real -ft_dir = str(CHECKPOINTS_DIR / 'snowflake-arctic-embed-l-v2.0-finetuned-head_only-real') +ft_dir = str(CHECKPOINTS_DIR / f'{slug}-finetuned-head_only-real') proba_path = str(tmp / 'proba_ft_real.npy') -r = subprocess.run([sys.executable, helper, 'predict_proba', MODEL_ID, ft_dir, proba_path], capture_output=True, text=True) +r = 
subprocess.run( + [sys.executable, str(_HELPER), 'predict_proba', ARCTIC_EMBED_MODEL_ID, ft_dir, proba_path], + capture_output=True, text=True, +) if r.returncode == 0: - proba = np.load(proba_path) - t, m = best_threshold(proba) - run = 'snowflake-arctic-embed-l-v2.0-finetuned-head_only-real' - flag = ' *** BEATS BEST' if m['f1'] > BEST_SO_FAR else '' - print(f' {run:<58} {t:>6.2f} {m["f1"]:>7.4f} {m["precision"]:>7.4f} {m["recall"]:>7.4f} {m["accuracy"]:>7.4f}{flag}') + run = f'{slug}-finetuned-head_only-real' + t, m = best_threshold(np.load(proba_path)) + print_row(run, t, m) if t != 0.50: - log(run, 'real', t, m) + log_result(run, 'real', t, m) else: print(f' [error] finetuned: {r.stderr[-300:]}') diff --git a/backend/balanceteshaters/scripts/ml/config.py b/backend/balanceteshaters/scripts/ml/config.py index 0c56536e..d389d04f 100644 --- a/backend/balanceteshaters/scripts/ml/config.py +++ b/backend/balanceteshaters/scripts/ml/config.py @@ -3,7 +3,7 @@ JINA_MODEL_ID = "jinaai/jina-embeddings-v5-text-nano" # 239M, EuroBERT, CPU-only (MPS segfault) JINA_SMALL_MODEL_ID = "jinaai/jina-embeddings-v5-text-small" # 677M, Qwen3-based, MPS-safe -# Decoder-only, 270M, 640-dim, 94-language, no trust_remote_code needed +# Encoder-only, 270M, 640-dim, 94-language, no trust_remote_code needed BIDIR_MODEL_ID = "microsoft/harrier-oss-v1-270m" # Encoder-only, 568M, 1024-dim, bge-m3-retromae base, MRL, no trust_remote_code ARCTIC_EMBED_MODEL_ID = "Snowflake/snowflake-arctic-embed-l-v2.0" diff --git a/backend/balanceteshaters/scripts/ml/dedup_eval_table.py b/backend/balanceteshaters/scripts/ml/dedup_eval_table.py new file mode 100644 index 00000000..752b9101 --- /dev/null +++ b/backend/balanceteshaters/scripts/ml/dedup_eval_table.py @@ -0,0 +1,77 @@ +# ruff: noqa: E402 +""" +Remove duplicate rows from the NocoDB eval table (m0ww7qnx69u9r1a). +Keeps the most recently inserted record per model_name, deletes the rest. 
+"""
+import os
+import sys
+from pathlib import Path
+
+from dotenv import load_dotenv
+
+SCRIPTS_DIR = Path(__file__).resolve().parent.parent.parent.parent
+if str(SCRIPTS_DIR) not in sys.path:
+    sys.path.insert(0, str(SCRIPTS_DIR))
+
+from balanceteshaters.scripts.ml.config import EVAL_TABLE_ID
+from balanceteshaters.services.nocodb import NocoDBService
+
+
+def main():
+    """Delete duplicate eval rows, keeping the highest id per model_name.
+
+    Rows with a missing or empty model_name are skipped, never deleted.
+    Needs NOCODB_BASE_URL, NOCODB_TOKEN and NOCODB_BASE_ID in the environment.
+    """
+    load_dotenv()
+    nocodb = NocoDBService(
+        nocodb_url=os.environ["NOCODB_BASE_URL"],
+        token=os.environ["NOCODB_TOKEN"],
+        base_id=os.environ["NOCODB_BASE_ID"],
+    )
+
+    # Page through the table; stop on an empty page or when "next" is None.
+    print(f"Fetching all records from {EVAL_TABLE_ID}...")
+    all_records = []
+    offset = 0
+    limit = 1000
+    while True:
+        resp = nocodb.get_records(EVAL_TABLE_ID, limit=limit, offset=offset)
+        records = resp.get("records", [])
+        if not records:
+            break
+        all_records.extend(records)
+        if resp.get("next") is None:
+            break
+        offset += limit
+
+    print(f" Total records: {len(all_records)}")
+
+    # Per model_name keep the highest id (assumed most recent), drop the rest.
+    seen: dict[str, int] = {}  # model_name -> record id to keep
+    to_delete: list[int] = []
+    for rec in all_records:
+        model_name = rec.get("fields", {}).get("model_name")
+        if not model_name:
+            # Unnamed rows are unrelated to each other, not duplicates.
+            continue
+        rec_id = rec["id"]
+        kept_id = seen.setdefault(model_name, rec_id)
+        if rec_id > kept_id:
+            to_delete.append(kept_id)
+            seen[model_name] = rec_id
+        elif rec_id < kept_id:  # equal id = same row fetched twice; keep it
+            to_delete.append(rec_id)
+
+    print(f" Unique model_names: {len(seen)}")
+    print(f" Duplicates to delete: {len(to_delete)}")
+    if not to_delete:
+        print("Nothing to delete.")
+        return
+    print(f"Deleting {len(to_delete)} duplicate records...")
+    nocodb.delete_records(EVAL_TABLE_ID, to_delete)
+    print("Done.")
+
+
+if __name__ == "__main__":
+    main()