diff --git a/.github/workflows/build-image.yaml b/.github/workflows/build-image.yaml index e49fa16..fb38364 100644 --- a/.github/workflows/build-image.yaml +++ b/.github/workflows/build-image.yaml @@ -3,14 +3,16 @@ name: Image Build on: push: branches: - - naf2008 + - fasttext-meta tags: - "*" paths: - requirements.txt - Dockerfile # Allows you to run this workflow manually from the Actions tab + pull_request: workflow_dispatch: + jobs: image-build: @@ -27,13 +29,6 @@ jobs: with: images: inseefrlab/codif-ape-train - - name: Make free space - # https://github.com/actions/virtual-environments/issues/2840 - run: | - sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/share/boost "$AGENT_TOOLSDIRECTORY" - docker rmi -f $(docker images -aq) - shell: bash - - name: Set up QEMU uses: docker/setup-qemu-action@v3 diff --git a/argo-workflows/train-workflow.yaml b/argo-workflows/train-workflow.yaml index ec5fc5f..6b51a8e 100644 --- a/argo-workflows/train-workflow.yaml +++ b/argo-workflows/train-workflow.yaml @@ -72,10 +72,10 @@ spec: - name: MODEL_CLASS - name: EXPERIMENT_NAME container: - image: inseefrlab/codif-ape-train:naf2008 + image: inseefrlab/codif-ape-train:fasttext-meta imagePullPolicy: Always command: ["/bin/bash", -c] - args: ["git clone -b naf2008 https://github.com/InseeFrLab/codif-ape-train.git &&\ + args: ["git clone -b fasttext-meta https://github.com/InseeFrLab/codif-ape-train.git &&\ cd codif-ape-train/ &&\ export MLFLOW_EXPERIMENT_NAME={{inputs.parameters.EXPERIMENT_NAME}} &&\ mlflow run ~/work/codif-ape-train/ \ diff --git a/requirements.txt b/requirements.txt index dfb4f67..50b426e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,4 +16,3 @@ sentencepiece accelerate datasets evaluate -torchFastText @ git+https://github.com/inseefrlab/torch-fastText@package \ No newline at end of file diff --git a/src/utils/data.py b/src/utils/data.py index 0455511..2a294d9 100644 --- a/src/utils/data.py +++ b/src/utils/data.py @@ -44,9 +44,9 @@ def get_sirene_4_data( fs = get_file_system() if revision == "NAF2008": - path = "projet-ape/extractions/20241027_sirene4.parquet" + path = "projet-ape/extractions/domain_specific_cleaned/full_dataset_20241027_sirene4_nacerev2_fuzzy_regex_similarity.parquet" elif revision == "NAF2025": - path = "projet-ape/NAF-revision/relabeled-data/20241027_sirene4_nace2025.parquet" + path = "projet-ape/extractions/domain_specific_cleaned/full_dataset_20241027_sirene4_nace2025_fuzzy_regex_similarity.parquet" else: raise ValueError("Revision must be either 'NAF2008' or 'NAF2025'.")