added script to tweak the persona descriptions for evals #148
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow header: display name, GITHUB_TOKEN scopes, run de-duplication and triggers.
name: test

# Scopes granted to GITHUB_TOKEN for every job in this workflow.
# NOTE(review): the steps visible in this file only read the repo and write PR/issue
# comments; confirm nothing else relies on the remaining write scopes before narrowing them.
permissions:
  actions: read
  contents: write
  discussions: write
  issues: write # Allow writing comments on issues
  pull-requests: write # Allow writing comments on PRs
  statuses: write # Allow writing statuses on PRs

# One active run per workflow + PR (or per ref when the event is not a PR);
# a newer commit cancels the run that is still in progress.
concurrency:
  cancel-in-progress: true
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}

# Triggers: every push, every pull request event, and manual dispatch.
on: [push, pull_request, workflow_dispatch]
jobs:
  # Warms the weekly Playwright Chromium cache that the `tests` and `evaluate-tasks`
  # jobs restore with the same key.
  setup-chromium:
    runs-on: ubuntu-latest
    timeout-minutes: 5
    steps:
      - uses: actions/checkout@v4
      - uses: astral-sh/setup-uv@v6

      - name: Get week number for cache key
        id: week
        run: echo "number=$(date +%Y-W%U)" >> "$GITHUB_OUTPUT"

      - name: Cache chromium binaries
        id: cache-chromium
        uses: actions/cache@v4
        with:
          path: |
            ~/.cache/ms-playwright
          key: ${{ runner.os }}-${{ runner.arch }}-chromium-${{ steps.week.outputs.number }}
          restore-keys: |
            ${{ runner.os }}-${{ runner.arch }}-chromium-

      - name: Install Chromium if not cached
        if: steps.cache-chromium.outputs.cache-hit != 'true'
        run: uvx playwright install chromium --with-deps --no-shell

  # Discovers tests/browser_use/test_*.py and publishes the module names as a JSON
  # array that the `tests` job expands into its matrix.
  find_tests:
    runs-on: ubuntu-latest
    timeout-minutes: 5 # Prevent hanging
    outputs:
      # ["test_browser", "test_tools", "test_browser_session", "test_tab_management", ...]
      TEST_FILENAMES: ${{ steps.lsgrep.outputs.TEST_FILENAMES }}
    steps:
      - uses: actions/checkout@v4
        with:
          # Force fresh checkout to avoid any caching issues
          fetch-depth: 1

      # https://code.dblock.org/2021/09/03/generating-task-matrix-by-looping-over-repo-files-with-github-actions.html
      - id: lsgrep
        run: |
          echo "🔍 Discovering test files at $(date)"
          echo "Git commit: $(git rev-parse HEAD)"
          echo "Git branch: $(git branch --show-current)"
          echo ""
          TEST_FILENAMES="$(find tests/browser_use -name 'test_*.py' -type f | sed 's|^tests/browser_use/||' | sed 's|\.py$||' | jq -R -s -c 'split("\n")[:-1]')"
          echo "TEST_FILENAMES=${TEST_FILENAMES}" >> "$GITHUB_OUTPUT"
          echo "📋 Test matrix: $TEST_FILENAMES"

      - name: Check that at least one test file is found
        env:
          # Passed through env so the JSON's own double quotes cannot break shell quoting.
          TEST_FILENAMES: ${{ steps.lsgrep.outputs.TEST_FILENAMES }}
        run: |
          # With no matches the jq filter above prints the literal "[]", not an empty
          # string, so both forms have to be treated as "nothing found".
          if [ -z "$TEST_FILENAMES" ] || [ "$TEST_FILENAMES" = "[]" ]; then
            echo "Failed to find any test_*.py files in tests/browser_use/ folder!" >&2
            exit 1
          fi

  # Runs every discovered test module as its own matrix job.
  tests:
    needs: [setup-chromium, find_tests]
    runs-on: ubuntu-latest
    # NOTE(review): this job-level limit equals the retry step's own timeout_minutes below,
    # and also has to cover checkout and dependency setup — confirm that is intended.
    timeout-minutes: 4 # Reduced timeout - tests should complete quickly or retry
    env:
      # NOTE(review): evaluate-tasks sets IN_DOCKER to 'true' (lowercase); confirm the
      # consumer treats both spellings the same.
      IN_DOCKER: 'True'
      ANONYMIZED_TELEMETRY: 'false'
      BROWSER_USE_LOGGING_LEVEL: 'DEBUG'
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    strategy:
      matrix:
        # Autodiscovered from tests/browser_use/test_*.py (test_browser, test_tools,
        # test_browser_session, test_tab_management, ... and more). The sentinel is used
        # when the find_tests output is empty.
        test_filename: ${{ fromJson(needs.find_tests.outputs.TEST_FILENAMES || '["FAILED_TO_DISCOVER_TESTS"]') }}
    name: ${{ matrix.test_filename }}
    steps:
      - name: Check that the previous step managed to find some test files for us to run
        env:
          # Matrix values come from file names in the repository; hand them to the shell
          # as data (env) instead of pasting them into the script text.
          TEST_FILENAME: ${{ matrix.test_filename }}
        run: |
          if [[ "$TEST_FILENAME" == "FAILED_TO_DISCOVER_TESTS" ]]; then
            echo "Failed get list of test files in tests/browser_use/test_*.py from find_tests job" >&2
            exit 1
          fi

      - uses: actions/checkout@v4

      - uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true
          activate-environment: true

      - name: Cache uv packages and venv
        uses: actions/cache@v4
        with:
          path: |
            ~/.cache/uv
            .venv
          key: ${{ runner.os }}-uv-venv-${{ hashFiles('pyproject.toml') }}
          restore-keys: |
            ${{ runner.os }}-uv-venv-

      - run: uv sync --dev --all-extras

      - name: Get week number for cache key
        id: week
        run: echo "number=$(date +%Y-W%U)" >> "$GITHUB_OUTPUT"

      - name: Cache chromium binaries
        id: cache-chromium
        uses: actions/cache@v4
        with:
          path: |
            ~/.cache/ms-playwright
          key: ${{ runner.os }}-${{ runner.arch }}-chromium-${{ steps.week.outputs.number }}
          restore-keys: |
            ${{ runner.os }}-${{ runner.arch }}-chromium-

      - name: Install Chromium browser if not cached
        if: steps.cache-chromium.outputs.cache-hit != 'true'
        run: uvx playwright install chromium --with-deps --no-shell

      - name: Cache browser-use extensions
        uses: actions/cache@v4
        with:
          path: |
            ~/.config/browseruse/extensions
          key: ${{ runner.os }}-browseruse-extensions-${{ hashFiles('browser_use/browser/profile.py') }}
          restore-keys: |
            ${{ runner.os }}-browseruse-extensions-

      # A missing file only skips the pytest step below; it does not fail the job.
      - name: Check if test file exists
        id: check-file
        env:
          TEST_FILENAME: ${{ matrix.test_filename }}
        run: |
          TEST_FILE="tests/browser_use/${TEST_FILENAME}.py"
          if [ -f "$TEST_FILE" ]; then
            echo "exists=true" >> "$GITHUB_OUTPUT"
            echo "✅ Test file found: $TEST_FILE"
          else
            echo "exists=false" >> "$GITHUB_OUTPUT"
            echo "❌ Test file not found: $TEST_FILE"
            echo "This file may have been renamed or removed. Current test files:"
            find tests/browser_use -name 'test_*.py' -type f | sed 's|tests/browser_use/||' | sed 's|\.py$||' | sort
          fi

      # NOTE(review): max_attempts is 1, so despite the step name nothing is retried.
      - name: Run test with retry
        if: steps.check-file.outputs.exists == 'true'
        uses: nick-fields/retry@v3
        env:
          TEST_FILENAME: ${{ matrix.test_filename }}
        with:
          timeout_minutes: 4
          max_attempts: 1
          retry_on: error
          command: pytest "tests/browser_use/${TEST_FILENAME}.py"

  # Runs the agent task evaluation, publishes the score to the run summary and the PR,
  # and fails the job when no task passed.
  evaluate-tasks:
    needs: setup-chromium
    runs-on: ubuntu-latest
    timeout-minutes: 8 # Allow more time for agent eval
    env:
      IN_DOCKER: 'true'
      BROWSER_USE_CLOUD_SYNC: 'false'
      ANONYMIZED_TELEMETRY: 'false'
      BROWSER_USE_LOGGING_LEVEL: 'DEBUG'
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    steps:
      - uses: actions/checkout@v4

      - uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true
          activate-environment: true

      - name: Cache uv packages and venv
        uses: actions/cache@v4
        with:
          path: |
            ~/.cache/uv
            .venv
          key: ${{ runner.os }}-uv-venv-${{ hashFiles('pyproject.toml') }}
          restore-keys: |
            ${{ runner.os }}-uv-venv-

      - run: uv sync --dev --all-extras

      - name: Get week number for cache key
        id: week
        run: echo "number=$(date +%Y-W%U)" >> "$GITHUB_OUTPUT"

      - name: Cache chromium binaries
        id: cache-chromium
        uses: actions/cache@v4
        with:
          path: |
            ~/.cache/ms-playwright
          key: ${{ runner.os }}-${{ runner.arch }}-chromium-${{ steps.week.outputs.number }}
          restore-keys: |
            ${{ runner.os }}-${{ runner.arch }}-chromium-

      - name: Install Chromium browser if not cached
        if: steps.cache-chromium.outputs.cache-hit != 'true'
        run: uvx playwright install chromium --with-deps --no-shell

      - name: Cache browser-use extensions
        uses: actions/cache@v4
        with:
          path: |
            ~/.config/browseruse/extensions
          key: ${{ runner.os }}-browseruse-extensions-${{ hashFiles('browser_use/browser/profile.py') }}
          restore-keys: |
            ${{ runner.os }}-browseruse-extensions-

      # Parses the PASSED= / TOTAL= / DETAILED_RESULTS= lines printed by the evaluation
      # script and exports them to the following steps through GITHUB_ENV.
      - name: Run agent tasks evaluation and capture score
        id: eval
        uses: nick-fields/retry@v3
        with:
          timeout_minutes: 4
          max_attempts: 1
          retry_on: error
          command: |
            python tests/browser_use/evaluate_tasks.py > result.txt
            cat result.txt
            echo "PASSED=$(grep '^PASSED=' result.txt | cut -d= -f2)" >> "$GITHUB_ENV"
            echo "TOTAL=$(grep '^TOTAL=' result.txt | cut -d= -f2)" >> "$GITHUB_ENV"
            echo "DETAILED_RESULTS=$(grep '^DETAILED_RESULTS=' result.txt | cut -d= -f2-)" >> "$GITHUB_ENV"

      - name: Print agent evaluation summary
        run: |
          echo "Agent tasks passed: $PASSED / $TOTAL"

      - name: Write agent evaluation summary to workflow overview
        run: |
          if [ "$PASSED" = "$TOTAL" ]; then
            COLOR="green"
          else
            COLOR="yellow"
          fi
          echo "<h2>Agent Tasks Score: <span style='color:$COLOR;'>$PASSED/$TOTAL</span></h2>" >> "$GITHUB_STEP_SUMMARY"

      # Best-effort: a failure to post the comment must not fail the job, hence
      # continue-on-error. The pass-rate gate therefore lives in its own step below.
      - name: Comment PR with agent evaluation results
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        continue-on-error: true
        with:
          script: |
            const MARKER = 'Agent Task Evaluation Results';

            const passed = Number.parseInt(process.env.PASSED ?? '', 10);
            const total = Number.parseInt(process.env.TOTAL ?? '', 10);
            if (!Number.isInteger(passed) || !Number.isInteger(total)) {
              core.warning(
                `No usable evaluation score (PASSED='${process.env.PASSED}', TOTAL='${process.env.TOTAL}'); skipping PR comment.`,
              );
              return;
            }

            // A missing or malformed DETAILED_RESULTS should cost us the table, not the whole comment.
            let detailedResults = [];
            try {
              const parsed = JSON.parse(process.env.DETAILED_RESULTS ?? '[]');
              if (Array.isArray(parsed)) {
                detailedResults = parsed;
              }
            } catch (err) {
              core.warning(`Could not parse DETAILED_RESULTS: ${err.message}`);
            }

            const score = `${passed}/${total}`;
            // Guard the division so an empty task list renders as 0% instead of NaN%.
            const percentage = total > 0 ? Math.round((passed / total) * 100) : 0;

            // Pipes and line breaks inside a cell would otherwise split the Markdown table row.
            const toCell = (value) =>
              String(value ?? '')
                .replaceAll('|', '\\|')
                .replace(/\r?\n/g, '<br>');

            const tableRows = detailedResults.map((result) => {
              const emoji = result.success ? '✅' : '❌';
              const status = result.success ? 'Pass' : 'Fail';
              return `| ${toCell(result.task)} | ${emoji} ${status} | ${toCell(result.reason)} |`;
            });

            const runUrl = '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}';

            // Blank lines around the table are required for Markdown to render inside <details>.
            const body = [
              `## ${MARKER}: ${score} (${percentage}%)`,
              '',
              '<details>',
              '<summary>View detailed results</summary>',
              '',
              '| Task | Result | Reason |',
              '|------|--------|--------|',
              ...tableRows,
              '',
              `Check the [evaluate-tasks job](${runUrl}) for detailed task execution logs.`,
              '</details>',
            ].join('\n');

            const { owner, repo } = context.repo;
            const issue_number = context.issue.number;

            // Paginate: the default page would miss an older bot comment on busy PRs
            // and lead to duplicate comments.
            const comments = await github.paginate(github.rest.issues.listComments, {
              owner,
              repo,
              issue_number,
              per_page: 100,
            });
            const existing = comments.find(
              (c) => c.user?.type === 'Bot' && c.body?.includes(MARKER),
            );

            if (existing) {
              // Update existing comment
              await github.rest.issues.updateComment({
                owner,
                repo,
                comment_id: existing.id,
                body,
              });
            } else {
              // Create new comment
              await github.rest.issues.createComment({
                owner,
                repo,
                issue_number,
                body,
              });
            }

      # Fail the workflow if 0% pass rate. This is a separate step because the comment
      # step above is pull_request-only and has continue-on-error, so a failure raised
      # there could never fail the job.
      - name: Fail the job on a 0% pass rate
        run: |
          if ! [[ "$PASSED" =~ ^[0-9]+$ && "$TOTAL" =~ ^[0-9]+$ ]] || [ "$TOTAL" -eq 0 ]; then
            echo "::warning::No usable evaluation score (PASSED='$PASSED', TOTAL='$TOTAL'); skipping pass-rate gate"
            exit 0
          fi
          if [ "$PASSED" -eq 0 ]; then
            echo "::error::Evaluation failed: 0% pass rate ($PASSED/$TOTAL)"
            exit 1
          fi