diff --git a/.github/workflows/build-image.yaml b/.github/workflows/build-image.yaml index e49fa16..0c4cbdc 100644 --- a/.github/workflows/build-image.yaml +++ b/.github/workflows/build-image.yaml @@ -1,15 +1,16 @@ -name: Image Build +name: Image Build to run FastText Meta on: push: branches: - - naf2008 + - run-fasttext-meta tags: - "*" paths: - requirements.txt - Dockerfile # Allows you to run this workflow manually from the Actions tab + pull_request: workflow_dispatch: jobs: @@ -27,13 +28,6 @@ jobs: with: images: inseefrlab/codif-ape-train - - name: Make free space - # https://github.com/actions/virtual-environments/issues/2840 - run: | - sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/share/boost "$AGENT_TOOLSDIRECTORY" - docker rmi -f $(docker images -aq) - shell: bash - - name: Set up QEMU uses: docker/setup-qemu-action@v3 diff --git a/MLproject b/MLproject index ba96e50..13e9790 100644 --- a/MLproject +++ b/MLproject @@ -58,7 +58,6 @@ entry_points: --text_feature {text_feature} \ --textual_features_1 {textual_features_1} \ --textual_features_2 {textual_features_2} \ - --categorical_features_1 {categorical_features_1} \ --categorical_features_2 {categorical_features_2} \ --categorical_features_3 {categorical_features_3} \ --categorical_features_5 {categorical_features_5} \ diff --git a/argo-workflows/train-workflow.yaml b/argo-workflows/train-workflow.yaml index cf767f9..580cd5f 100644 --- a/argo-workflows/train-workflow.yaml +++ b/argo-workflows/train-workflow.yaml @@ -9,8 +9,18 @@ spec: parameters: - name: training-conf-list value: '[ - { "EXPERIMENT_NAME": "NACE2008", "MODEL_CLASS": "fasttext", "REVISION": "NAF2008", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 50, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" }, + { "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 150, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" }, + { "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 140, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" }, + { "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 130, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" }, + { "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 120, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" }, + { "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 110, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" }, + { "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 100, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" }, + { "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 90, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" }, + { "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 80, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" }, + { "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 70, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" }, + { "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 60, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" }, { "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 50, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" } + ]' templates: - name: main @@ -74,8 +84,11 @@ spec: container: image: inseefrlab/codif-ape-train:naf2008 imagePullPolicy: Always + resources: + requests: + cpu: "20" command: ["/bin/bash", -c] - args: ["git clone -b naf2008 https://github.com/InseeFrLab/codif-ape-train.git &&\ + args: ["git clone -b run-fasttext-meta https://github.com/InseeFrLab/codif-ape-train.git &&\ cd codif-ape-train/ &&\ export MLFLOW_EXPERIMENT_NAME={{inputs.parameters.EXPERIMENT_NAME}} &&\ mlflow run ~/work/codif-ape-train/ \ @@ -97,7 +110,6 @@ spec: -P text_feature=$TEXT_FEATURE \ -P textual_features_1=$TEXTUAL_FEATURE1 \ -P textual_features_2=$TEXTUAL_FEATURE2 \ - -P categorical_features_1=$FEATURE1 \ -P categorical_features_2=$FEATURE2 \ -P categorical_features_3=$FEATURE3 \ -P categorical_features_5=$FEATURE5 \ @@ -136,8 +148,6 @@ spec: value: activ_nat_lib_et - name: TEXTUAL_FEATURE2 value: activ_sec_agri_et - - name: FEATURE1 - value: TYP - name: FEATURE2 value: NAT - name: FEATURE3 diff --git a/src/train.py b/src/train.py index be1e36f..b58e9fb 100644 --- a/src/train.py +++ b/src/train.py @@ -114,13 +114,13 @@ help="Additional description of company's agricultural activities", required=True, ) -parser.add_argument( - "--categorical_features_1", - type=str, - default="AUTO", - help="Type of observation", - required=True, -) +# parser.add_argument( +# "--categorical_features_1", +# type=str, +# default="AUTO", +# help="Type of observation", +# required=True, +# ) parser.add_argument( "--categorical_features_2", type=str, @@ -256,7 +256,6 @@ def main( text_feature: str, textual_features_1: str, textual_features_2: str, - categorical_features_1: str, categorical_features_2: str, categorical_features_3: str, categorical_features_5: str, diff --git a/src/utils/data.py b/src/utils/data.py index 0455511..d4c6171 100644 --- a/src/utils/data.py +++ b/src/utils/data.py @@ -44,9 +44,9 @@ def get_sirene_4_data( fs = get_file_system() if revision == "NAF2008": - path = "projet-ape/extractions/20241027_sirene4.parquet" + path = "projet-ape/data/08112022_27102024/naf2008/raw_cleansed.parquet" elif revision == "NAF2025": - path = "projet-ape/NAF-revision/relabeled-data/20241027_sirene4_nace2025.parquet" + path = "projet-ape/data/08112022_27102024/naf2025/raw_cleansed.parquet" else: raise ValueError("Revision must be either 'NAF2008' or 'NAF2025'.") @@ -147,14 +147,13 @@ def get_test_data(revision: str, y: str) -> pd.DataFrame: # Get test DataFrame fs = get_file_system() if revision == "NAF2008": - test_data_path = "projet-ape/label-studio/annotation-campaign-2024/NAF2008/preprocessed/test_data_NAF2008.parquet" + test_data_path = "projet-ape/data/25032024_26082025/nafrev2/raw_cleansed.parquet" elif revision == "NAF2025": - test_data_path = "projet-ape/label-studio/annotation-campaign-2024/rev-NAF2025/preprocessed/training_data_NAF2025.parquet" + test_data_path = "projet-ape/data/25032024_26082025/naf2025/raw.parquet" else: raise ValueError("Revision must be either 'NAF2008' or 'NAF2025'.") df = pq.read_table(test_data_path, filesystem=fs).to_pandas() - # Reformat dataframe to have column names consistent # with Sirene 4 data df = df.rename( @@ -181,8 +180,9 @@ def get_test_data(revision: str, y: str) -> pd.DataFrame: # activ_nat_et, cj, activ_nat_lib_et, activ_perm_et: "" to "NaN" df["NAT"] = df["NAT"].replace("", "NaN") df["CJ"] = df["CJ"].replace("", "NaN") - # df["CRT"] = df["CRT"].replace("", "NaN") - df["SRF"] = df["SRF"].str.replace("", "NaN") # TODO: What if we use srf as float? + df["CRT"] = df["CRT"].replace("", "NaN") + # df["SRF"] = df["SRF"].str.replace("", "NaN") # TODO: What if we use srf as float? + df['SRF'] = df['SRF'].fillna('').astype(str) # TODO: need to add activ_sec_agri_et in data next time if "activ_sec_agri_et" not in df: