Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 3 additions & 9 deletions .github/workflows/build-image.yaml
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
name: Image Build
name: Image Build to run FastText Meta

on:
push:
branches:
- naf2008
- run-fasttext-meta
tags:
- "*"
paths:
- requirements.txt
- Dockerfile
# Allows you to run this workflow manually from the Actions tab
pull_request:
workflow_dispatch:

jobs:
Expand All @@ -27,13 +28,6 @@ jobs:
with:
images: inseefrlab/codif-ape-train

- name: Make free space
# https://github.com/actions/virtual-environments/issues/2840
run: |
sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/share/boost "$AGENT_TOOLSDIRECTORY"
docker rmi -f $(docker images -aq)
shell: bash

- name: Set up QEMU
uses: docker/setup-qemu-action@v3

Expand Down
1 change: 0 additions & 1 deletion MLproject
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ entry_points:
--text_feature {text_feature} \
--textual_features_1 {textual_features_1} \
--textual_features_2 {textual_features_2} \
--categorical_features_1 {categorical_features_1} \
--categorical_features_2 {categorical_features_2} \
--categorical_features_3 {categorical_features_3} \
--categorical_features_5 {categorical_features_5} \
Expand Down
20 changes: 15 additions & 5 deletions argo-workflows/train-workflow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,18 @@ spec:
parameters:
- name: training-conf-list
value: '[
{ "EXPERIMENT_NAME": "NACE2008", "MODEL_CLASS": "fasttext", "REVISION": "NAF2008", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 50, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" },
{ "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 150, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" },
{ "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 140, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" },
{ "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 130, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" },
{ "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 120, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" },
{ "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 110, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" },
{ "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 100, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" },
{ "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 90, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" },
{ "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 80, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" },
{ "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 70, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" },
{ "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 60, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" },
{ "EXPERIMENT_NAME": "NACE2025", "MODEL_CLASS": "fasttext", "REVISION": "NAF2025", "START_YEAR": 2025, "DIM": 180, "WS": 5, "LR": 0.2, "EPOCHS": 50, "WORDNGRAMS": 3, "MINN": 3, "MAXN": 4, "MINCOUNT": 3, "BUCKET": 2000000, "LOSS": "ova" }

]'
templates:
- name: main
Expand Down Expand Up @@ -74,8 +84,11 @@ spec:
container:
image: inseefrlab/codif-ape-train:naf2008
imagePullPolicy: Always
resources:
requests:
cpu: "20"
command: ["/bin/bash", -c]
args: ["git clone -b naf2008 https://github.com/InseeFrLab/codif-ape-train.git &&\
args: ["git clone -b run-fasttext-meta https://github.com/InseeFrLab/codif-ape-train.git &&\
cd codif-ape-train/ &&\
export MLFLOW_EXPERIMENT_NAME={{inputs.parameters.EXPERIMENT_NAME}} &&\
mlflow run ~/work/codif-ape-train/ \
Expand All @@ -97,7 +110,6 @@ spec:
-P text_feature=$TEXT_FEATURE \
-P textual_features_1=$TEXTUAL_FEATURE1 \
-P textual_features_2=$TEXTUAL_FEATURE2 \
-P categorical_features_1=$FEATURE1 \
-P categorical_features_2=$FEATURE2 \
-P categorical_features_3=$FEATURE3 \
-P categorical_features_5=$FEATURE5 \
Expand Down Expand Up @@ -136,8 +148,6 @@ spec:
value: activ_nat_lib_et
- name: TEXTUAL_FEATURE2
value: activ_sec_agri_et
- name: FEATURE1
value: TYP
- name: FEATURE2
value: NAT
- name: FEATURE3
Expand Down
15 changes: 7 additions & 8 deletions src/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,13 +114,13 @@
help="Additional description of company's agricultural activities",
required=True,
)
parser.add_argument(
"--categorical_features_1",
type=str,
default="AUTO",
help="Type of observation",
required=True,
)
# parser.add_argument(
# "--categorical_features_1",
# type=str,
# default="AUTO",
# help="Type of observation",
# required=True,
# )
parser.add_argument(
"--categorical_features_2",
type=str,
Expand Down Expand Up @@ -256,7 +256,6 @@ def main(
text_feature: str,
textual_features_1: str,
textual_features_2: str,
categorical_features_1: str,
categorical_features_2: str,
categorical_features_3: str,
categorical_features_5: str,
Expand Down
14 changes: 7 additions & 7 deletions src/utils/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,9 @@ def get_sirene_4_data(
fs = get_file_system()

if revision == "NAF2008":
path = "projet-ape/extractions/20241027_sirene4.parquet"
path = "projet-ape/data/08112022_27102024/naf2008/raw_cleansed.parquet"
elif revision == "NAF2025":
path = "projet-ape/NAF-revision/relabeled-data/20241027_sirene4_nace2025.parquet"
path = "projet-ape/data/08112022_27102024/naf2025/raw_cleansed.parquet"
else:
raise ValueError("Revision must be either 'NAF2008' or 'NAF2025'.")

Expand Down Expand Up @@ -147,14 +147,13 @@ def get_test_data(revision: str, y: str) -> pd.DataFrame:
# Get test DataFrame
fs = get_file_system()
if revision == "NAF2008":
test_data_path = "projet-ape/label-studio/annotation-campaign-2024/NAF2008/preprocessed/test_data_NAF2008.parquet"
test_data_path = "projet-ape/data/25032024_26082025/nafrev2/raw_cleansed.parquet"
elif revision == "NAF2025":
test_data_path = "projet-ape/label-studio/annotation-campaign-2024/rev-NAF2025/preprocessed/training_data_NAF2025.parquet"
test_data_path = "projet-ape/data/25032024_26082025/naf2025/raw.parquet"
else:
raise ValueError("Revision must be either 'NAF2008' or 'NAF2025'.")

df = pq.read_table(test_data_path, filesystem=fs).to_pandas()

# Reformat dataframe to have column names consistent
# with Sirene 4 data
df = df.rename(
Expand All @@ -181,8 +180,9 @@ def get_test_data(revision: str, y: str) -> pd.DataFrame:
# activ_nat_et, cj, activ_nat_lib_et, activ_perm_et: "" to "NaN"
df["NAT"] = df["NAT"].replace("", "NaN")
df["CJ"] = df["CJ"].replace("", "NaN")
# df["CRT"] = df["CRT"].replace("", "NaN")
df["SRF"] = df["SRF"].str.replace("", "NaN") # TODO: What if we use srf as float?
df["CRT"] = df["CRT"].replace("", "NaN")
# df["SRF"] = df["SRF"].str.replace("", "NaN") # TODO: What if we use srf as float?
df['SRF'] = df['SRF'].fillna('').astype(str)

# TODO: need to add activ_sec_agri_et in data next time
if "activ_sec_agri_et" not in df:
Expand Down