From c0b4971b6476ae03c7337f916a5285727bae36b8 Mon Sep 17 00:00:00 2001 From: TheAIWizard Date: Mon, 1 Sep 2025 13:17:01 +0000 Subject: [PATCH 01/12] prepare wf for fasttext meta --- .github/workflows/build-image.yaml | 4 ++-- argo-workflows/train-workflow.yaml | 2 +- src/utils/data.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build-image.yaml b/.github/workflows/build-image.yaml index e49fa16..1bfba92 100644 --- a/.github/workflows/build-image.yaml +++ b/.github/workflows/build-image.yaml @@ -1,9 +1,9 @@ -name: Image Build +name: Image Build to run FastText Meta on: push: branches: - - naf2008 + - run-fasttext-meta tags: - "*" paths: diff --git a/argo-workflows/train-workflow.yaml b/argo-workflows/train-workflow.yaml index cf767f9..e3954d2 100644 --- a/argo-workflows/train-workflow.yaml +++ b/argo-workflows/train-workflow.yaml @@ -75,7 +75,7 @@ spec: image: inseefrlab/codif-ape-train:naf2008 imagePullPolicy: Always command: ["/bin/bash", -c] - args: ["git clone -b naf2008 https://github.com/InseeFrLab/codif-ape-train.git &&\ + args: ["git clone -b run-fasttext-meta https://github.com/InseeFrLab/codif-ape-train.git &&\ cd codif-ape-train/ &&\ export MLFLOW_EXPERIMENT_NAME={{inputs.parameters.EXPERIMENT_NAME}} &&\ mlflow run ~/work/codif-ape-train/ \ diff --git a/src/utils/data.py b/src/utils/data.py index 0455511..0cfa0fe 100644 --- a/src/utils/data.py +++ b/src/utils/data.py @@ -147,9 +147,9 @@ def get_test_data(revision: str, y: str) -> pd.DataFrame: # Get test DataFrame fs = get_file_system() if revision == "NAF2008": - test_data_path = "projet-ape/label-studio/annotation-campaign-2024/NAF2008/preprocessed/test_data_NAF2008.parquet" + test_data_path = "projet-ape/data/25032024_26082025/nafrev2/raw_cleansed.parquet" elif revision == "NAF2025": - test_data_path = "projet-ape/label-studio/annotation-campaign-2024/rev-NAF2025/preprocessed/training_data_NAF2025.parquet" + test_data_path = "projet-ape/data/25032024_26082025/naf2025/raw.parquet" else: raise ValueError("Revision must be either 'NAF2008' or 'NAF2025'.") From faf4455f24d1824a1a172be42d4119397925d238 Mon Sep 17 00:00:00 2001 From: TheAIWizard Date: Mon, 1 Sep 2025 13:25:08 +0000 Subject: [PATCH 02/12] make CI work --- .github/workflows/{build-image.yaml => main.yaml} | 7 ------- 1 file changed, 7 deletions(-) rename .github/workflows/{build-image.yaml => main.yaml} (81%) diff --git a/.github/workflows/build-image.yaml b/.github/workflows/main.yaml similarity index 81% rename from .github/workflows/build-image.yaml rename to .github/workflows/main.yaml index 1bfba92..669d7f0 100644 --- a/.github/workflows/build-image.yaml +++ b/.github/workflows/main.yaml @@ -27,13 +27,6 @@ jobs: with: images: inseefrlab/codif-ape-train - - name: Make free space - # https://github.com/actions/virtual-environments/issues/2840 - run: | - sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/share/boost "$AGENT_TOOLSDIRECTORY" - docker rmi -f $(docker images -aq) - shell: bash - - name: Set up QEMU uses: docker/setup-qemu-action@v3 From df7a8b4297658deb1c8838f07ba8a047de63341d Mon Sep 17 00:00:00 2001 From: TheAIWizard Date: Mon, 1 Sep 2025 13:27:02 +0000 Subject: [PATCH 03/12] make CI work PR --- .github/workflows/main.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 669d7f0..0c4cbdc 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -10,6 +10,7 @@ on: - requirements.txt - Dockerfile # Allows you to run this workflow manually from the Actions tab + pull_request: workflow_dispatch: jobs: From 67f661c29ca131979af872b044f511d499c5f514 Mon Sep 17 00:00:00 2001 From: TheAIWizard Date: Mon, 1 Sep 2025 13:30:53 +0000 Subject: [PATCH 04/12] make CI work PR --- .github/workflows/{main.yaml => build-image.yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{main.yaml => build-image.yaml} (100%) diff --git a/.github/workflows/main.yaml b/.github/workflows/build-image.yaml similarity index 100% rename from .github/workflows/main.yaml rename to .github/workflows/build-image.yaml From 7e4ceba564ab831d4dbb4c1ebb084b081ccc792a Mon Sep 17 00:00:00 2001 From: TheAIWizard Date: Mon, 1 Sep 2025 14:19:53 +0000 Subject: [PATCH 05/12] adapt car var types: CRT, SRF --- src/utils/data.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/utils/data.py b/src/utils/data.py index 0cfa0fe..a7d7a40 100644 --- a/src/utils/data.py +++ b/src/utils/data.py @@ -154,7 +154,8 @@ def get_test_data(revision: str, y: str) -> pd.DataFrame: raise ValueError("Revision must be either 'NAF2008' or 'NAF2025'.") df = pq.read_table(test_data_path, filesystem=fs).to_pandas() - + # Option 1: Remplacer les valeurs nulles par "" +df['colonne_str_vide'] = df['colonne_float'].fillna('').astype(str) # Reformat dataframe to have column names consistent # with Sirene 4 data df = df.rename( @@ -181,8 +182,9 @@ def get_test_data(revision: str, y: str) -> pd.DataFrame: # activ_nat_et, cj, activ_nat_lib_et, activ_perm_et: "" to "NaN" df["NAT"] = df["NAT"].replace("", "NaN") df["CJ"] = df["CJ"].replace("", "NaN") - # df["CRT"] = df["CRT"].replace("", "NaN") - df["SRF"] = df["SRF"].str.replace("", "NaN") # TODO: What if we use srf as float? + df["CRT"] = df["CRT"].replace("", "NaN") + # df["SRF"] = df["SRF"].str.replace("", "NaN") # TODO: What if we use srf as float? + df['SRF'] = df['SRF'].fillna('').astype(str) # TODO: need to add activ_sec_agri_et in data next time if "activ_sec_agri_et" not in df: From 819beb6904275f2c927b6fb3226ad4f889c55ed4 Mon Sep 17 00:00:00 2001 From: TheAIWizard Date: Mon, 1 Sep 2025 15:51:25 +0000 Subject: [PATCH 06/12] adapt car var types: CRT, SRF --- src/utils/data.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/utils/data.py b/src/utils/data.py index a7d7a40..757ad89 100644 --- a/src/utils/data.py +++ b/src/utils/data.py @@ -154,8 +154,6 @@ def get_test_data(revision: str, y: str) -> pd.DataFrame: raise ValueError("Revision must be either 'NAF2008' or 'NAF2025'.") df = pq.read_table(test_data_path, filesystem=fs).to_pandas() - # Option 1: Remplacer les valeurs nulles par "" -df['colonne_str_vide'] = df['colonne_float'].fillna('').astype(str) # Reformat dataframe to have column names consistent # with Sirene 4 data df = df.rename( From b54c4182dea5fef40fe3117504e0e8b89b2ccbe2 Mon Sep 17 00:00:00 2001 From: Nathan Randriamanana <37664429+TheAIWizard@users.noreply.github.com> Date: Fri, 5 Sep 2025 16:16:10 +0200 Subject: [PATCH 07/12] change training data path --- src/utils/data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/utils/data.py b/src/utils/data.py index 757ad89..d4c6171 100644 --- a/src/utils/data.py +++ b/src/utils/data.py @@ -44,9 +44,9 @@ def get_sirene_4_data( fs = get_file_system() if revision == "NAF2008": - path = "projet-ape/extractions/20241027_sirene4.parquet" + path = "projet-ape/data/08112022_27102024/naf2008/raw_cleansed.parquet" elif revision == "NAF2025": - path = "projet-ape/NAF-revision/relabeled-data/20241027_sirene4_nace2025.parquet" + path = "projet-ape/data/08112022_27102024/naf2025/raw_cleansed.parquet" else: raise ValueError("Revision must be either 'NAF2008' or 'NAF2025'.") From e2f3cdd75e93cbe040c2554dbb08dcb91e4ffb11 Mon Sep 17 00:00:00 2001 From: Nathan Randriamanana <37664429+TheAIWizard@users.noreply.github.com> Date: Tue, 9 Sep 2025 19:20:00 +0200 Subject: [PATCH 08/12] Increase specs -> speed up cpu inference --- argo-workflows/train-workflow.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/argo-workflows/train-workflow.yaml b/argo-workflows/train-workflow.yaml index e3954d2..204975f 100644 --- a/argo-workflows/train-workflow.yaml +++ b/argo-workflows/train-workflow.yaml @@ -74,6 +74,9 @@ spec: container: image: inseefrlab/codif-ape-train:naf2008 imagePullPolicy: Always + resources: + requests: + cpu: "30" command: ["/bin/bash", -c] args: ["git clone -b run-fasttext-meta https://github.com/InseeFrLab/codif-ape-train.git &&\ cd codif-ape-train/ &&\ From 276ab01678819011b9f7c23abb327748a979d9f9 Mon Sep 17 00:00:00 2001 From: TheAIWizard Date: Wed, 17 Sep 2025 09:51:36 +0000 Subject: [PATCH 09/12] remove liasse_type from var --> noisy --- argo-workflows/train-workflow.yaml | 2 -- src/train.py | 14 +++++++------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/argo-workflows/train-workflow.yaml b/argo-workflows/train-workflow.yaml index 204975f..c2e80dd 100644 --- a/argo-workflows/train-workflow.yaml +++ b/argo-workflows/train-workflow.yaml @@ -139,8 +139,6 @@ spec: value: activ_nat_lib_et - name: TEXTUAL_FEATURE2 value: activ_sec_agri_et - - name: FEATURE1 - value: TYP - name: FEATURE2 value: NAT - name: FEATURE3 diff --git a/src/train.py b/src/train.py index be1e36f..61fe7be 100644 --- a/src/train.py +++ b/src/train.py @@ -114,13 +114,13 @@ help="Additional description of company's agricultural activities", required=True, ) -parser.add_argument( - "--categorical_features_1", - type=str, - default="AUTO", - help="Type of observation", - required=True, -) +# parser.add_argument( +# "--categorical_features_1", +# type=str, +# default="AUTO", +# help="Type of observation", +# required=True, +# ) parser.add_argument( "--categorical_features_2", type=str, From 39d68f9eac46b01c77bc67498ad73e4d13e84ad4 Mon Sep 17 00:00:00 2001 From: TheAIWizard Date: Wed, 17 Sep 2025 10:05:59 +0000 Subject: [PATCH 10/12] remove AUTO from MLproject --- MLproject | 1 - argo-workflows/train-workflow.yaml | 1 - 2 files changed, 2 deletions(-) diff --git a/MLproject b/MLproject index ba96e50..13e9790 100644 --- a/MLproject +++ b/MLproject @@ -58,7 +58,6 @@ entry_points: --text_feature {text_feature} \ --textual_features_1 {textual_features_1} \ --textual_features_2 {textual_features_2} \ - --categorical_features_1 {categorical_features_1} \ --categorical_features_2 {categorical_features_2} \ --categorical_features_3 {categorical_features_3} \ --categorical_features_5 {categorical_features_5} \ diff --git a/argo-workflows/train-workflow.yaml b/argo-workflows/train-workflow.yaml index c2e80dd..0f6e0fd 100644 --- a/argo-workflows/train-workflow.yaml +++ b/argo-workflows/train-workflow.yaml @@ -100,7 +100,6 @@ spec: -P text_feature=$TEXT_FEATURE \ -P textual_features_1=$TEXTUAL_FEATURE1 \ -P textual_features_2=$TEXTUAL_FEATURE2 \ - -P categorical_features_1=$FEATURE1 \ -P categorical_features_2=$FEATURE2 \ -P categorical_features_3=$FEATURE3 \ -P categorical_features_5=$FEATURE5 \ From 553538f8580719d9c1c73ada6393c6c1899cccc9 Mon Sep 17 00:00:00 2001 From: TheAIWizard Date: Wed, 17 Sep 2025 10:18:04 +0000 Subject: [PATCH 11/12] remove cat feature 1 --- src/train.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/train.py b/src/train.py index 61fe7be..b58e9fb 100644 --- a/src/train.py +++ b/src/train.py @@ -256,7 +256,6 @@ def main( text_feature: str, textual_features_1: str, textual_features_2: str, - categorical_features_1: str, categorical_features_2: str, categorical_features_3: str, categorical_features_5: str, From 44be8a3d6e26014702f2e8e05d95173812361ba5 Mon Sep 17 00:00:00 2001 From: TheAIWizard Date: Fri, 19 Sep 2025 15:23:09 +0000 Subject: [PATCH 12/12] save argo wf training contract --- argo-workflows/train-workflow.yaml | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/argo-workflows/train-workflow.yaml b/argo-workflows/train-workflow.yaml index 0f6e0fd..580cd5f 100644 --- a/argo-workflows/train-workflow.yaml +++ b/argo-workflows/train-workflow.yaml @@ -9,8 +9,18 @@ spec: parameters: - name: training-conf-list value: '[ - { "EXPERIMENT_NAME": "NACE2008", "MODEL_CLASS": "fasttext", "REVISION": "NAF2008", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 50, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" }, + { "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 150, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" }, + { "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 140, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" }, + { "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 130, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" }, + { "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 120, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" }, + { "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 110, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" }, + { "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 100, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" }, + { "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 90, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" }, + { "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 80, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" }, + { "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 70, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" }, + { "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 60, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" }, { "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 50, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" } + ]' templates: - name: main @@ -76,7 +86,7 @@ spec: imagePullPolicy: Always resources: requests: - cpu: "30" + cpu: "20" command: ["/bin/bash", -c] args: ["git clone -b run-fasttext-meta https://github.com/InseeFrLab/codif-ape-train.git &&\ cd codif-ape-train/ &&\