Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
.git
.github
.venv
.pytest_cache
.pycache_local
__pycache__
*.pyc
.DS_Store
.env
node_modules
frontend/react-chat/node_modules
frontend/react-chat/dist
docs
6 changes: 5 additions & 1 deletion .env.example
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
APP_ENV=development
APP_HOST=0.0.0.0
APP_PORT=8000
APP_RELOAD=true
APP_ORIGIN=http://localhost:5173
GROQ_API_KEY=
PINECONE_API_KEY=
PINECONE_INDEX_NAME=ai-observability-agent
Expand All @@ -7,4 +12,3 @@ LANGCHAIN_TRACING_V2=true
LANGCHAIN_PROJECT=ai-observability-agent
EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
GROQ_MODEL=llama-3.1-8b-instant
APP_ORIGIN=http://localhost:5173
59 changes: 59 additions & 0 deletions .github/workflows/cd.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Continuous delivery: on every push to main, build the container image,
# publish it to GitHub Container Registry (GHCR), then ask Render to redeploy.
name: CD

on:
  push:
    branches:
      - main

# Least privilege for GITHUB_TOKEN: read the repository, write packages (GHCR).
permissions:
  contents: read
  packages: write

env:
  IMAGE_NAME: ghcr.io/${{ github.repository_owner }}/ai-observability-agent

jobs:
  build-and-push:
    runs-on: ubuntu-latest

    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # Produces two tags per build: a moving `latest` and an immutable
      # commit-SHA tag.
      - name: Extract Docker metadata
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.IMAGE_NAME }}
          tags: |
            type=raw,value=latest
            type=sha

      - name: Build and push image
        uses: docker/build-push-action@v6
        with:
          context: .
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}

  deploy-render:
    runs-on: ubuntu-latest
    needs: build-and-push
    # The `secrets` context cannot be referenced in `if:` conditionals, so the
    # hook URL is exposed as a job-level env var and the step is gated on it.
    env:
      RENDER_DEPLOY_HOOK_URL: ${{ secrets.RENDER_DEPLOY_HOOK_URL }}

    steps:
      - name: Trigger Render deploy
        # Skipped (not failed) when the secret has not been configured.
        if: ${{ env.RENDER_DEPLOY_HOOK_URL != '' }}
        # --fail makes a non-2xx response from the hook fail the job instead
        # of reporting a successful deploy.
        run: curl --fail --silent --show-error -X POST "$RENDER_DEPLOY_HOOK_URL"
45 changes: 45 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Continuous integration: run the backend test suite and verify that the
# Docker image still builds, on every pull request and every push to main.
name: CI

on:
  pull_request:
  push:
    branches:
      - main

# Neither job publishes anything, so GITHUB_TOKEN only needs read access.
permissions:
  contents: read

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML does not parse the version as a float.
          python-version: "3.11"
          # Reuse pip's download cache between runs, keyed on the backend
          # requirements file.
          cache: pip
          cache-dependency-path: backend/requirements.txt

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r backend/requirements.txt

      - name: Run tests
        run: pytest

  docker-build:
    runs-on: ubuntu-latest

    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      # Build only (push: false): validates the Dockerfile without publishing.
      - name: Build Docker image
        uses: docker/build-push-action@v6
        with:
          context: .
          push: false
          tags: ai-observability-agent:ci
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ __pycache__/
*.pyc
node_modules/
dist/
docs
19 changes: 19 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Backend image: installs the Python dependencies, copies the repository, and
# serves the FastAPI app with uvicorn.
FROM python:3.11-slim

# No .pyc files, unbuffered stdout/stderr, no pip download cache in the image,
# and /app on the import path so `backend.main` resolves.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    PYTHONPATH=/app

WORKDIR /app

# Copy only the requirements first so the dependency layer stays cached until
# requirements.txt itself changes.
COPY backend/requirements.txt ./backend/requirements.txt

RUN pip install --upgrade pip && \
    pip install -r backend/requirements.txt

COPY . .

EXPOSE 8000

# A shell is needed to expand ${PORT:-8000}; `exec` then replaces that shell so
# uvicorn runs as PID 1 and receives SIGTERM directly on container stop.
CMD ["sh", "-c", "exec uvicorn backend.main:app --host 0.0.0.0 --port ${PORT:-8000}"]
66 changes: 66 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,15 @@ Small agentic AI project scaffold for learning:
### Backend

1. Create a virtual environment
```bash
python3.11 -m venv .venv
source .venv/bin/activate
```
2. Install dependencies from `backend/requirements.txt`
```bash
pip install --upgrade pip
pip install -r backend/requirements.txt
```
3. Copy `.env.example` to `.env`
4. Start the API:

Expand All @@ -52,3 +60,61 @@ cd frontend/react-chat
npm install
npm run dev
```

### Pinecone

Manage the vector index and API keys in the Pinecone console: <https://app.pinecone.io/organizations/>

## Deployment path

This repo now includes a Docker-first deployment baseline:

- `Dockerfile` for packaging the backend
- `.dockerignore` to keep the image lean
- `.github/workflows/ci.yml` for tests plus Docker build validation
- `.github/workflows/cd.yml` for publishing a container to GHCR and triggering Render
- `render.yaml` as a starter Render blueprint

### Run locally with Docker

```bash
docker build -t ai-observability-agent .
docker run --rm -p 8000:8000 --env-file .env ai-observability-agent
```

Then verify:

```bash
curl http://localhost:8000/api/health
```

### CI

CI runs on pull requests and pushes to `main`:

- installs backend dependencies
- runs `pytest`
- builds the Docker image

### CD

CD runs on pushes to `main`:

- builds and pushes `ghcr.io/<owner>/ai-observability-agent`
- tags the image with `latest` and the Git SHA
- triggers a Render deploy by sending a POST request to the deploy hook stored in the `RENDER_DEPLOY_HOOK_URL` secret

### Render setup

1. Create a Render web service from an existing image.
2. Point it at `ghcr.io/<your-user-or-org>/ai-observability-agent:latest`.
3. Add the environment variables from `.env.example`.
4. Set the health check path to `/api/health`.
5. Add `RENDER_DEPLOY_HOOK_URL` as a GitHub Actions secret.

### GitHub Actions secrets

You only need one repository secret for the current CD flow:

- `RENDER_DEPLOY_HOOK_URL`

The workflow uses the built-in `GITHUB_TOKEN` to push to GHCR.
10 changes: 10 additions & 0 deletions backend/README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
# Backend

This folder contains the FastAPI service, LangGraph agent, RAG pipeline, tools, and MCP server.

## MCP quick check

After installing backend dependencies, you can test the local MCP server with:

```bash
python scripts/test_mcp_client.py --list-tools
python scripts/test_mcp_client.py --tool search_logs --args '{"query":"error","limit":3}'
python scripts/test_mcp_client.py --tool get_metrics --args '{"service_name":"checkout-service"}'
```
15 changes: 14 additions & 1 deletion backend/agent/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,20 @@
def classify_query(state: dict[str, Any]) -> dict[str, Any]:
"""Route queries to the right tooling using lightweight heuristics."""
query = state["query"].lower()
if any(keyword in query for keyword in ("log", "stack trace", "error", "exception")):
if any(
keyword in query
for keyword in (
"log",
"stack trace",
"error",
"exception",
"trace_id",
"trace id",
"trace",
"request id",
"correlation id",
)
):
intent = "logs"
elif any(keyword in query for keyword in ("cpu", "memory", "latency", "metrics", "slow")):
intent = "metrics"
Expand Down
31 changes: 28 additions & 3 deletions backend/tools/log_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
"database": re.compile(r"SQLException|connection refused|deadlock", re.IGNORECASE),
}
TIMESTAMP_PATTERN = re.compile(r"^(?P<timestamp>\S+)")
TRACE_ID_PATTERN = re.compile(r"\btrace_id=(?P<trace_id>[A-Za-z0-9_-]+)")


def _read_log_file() -> str:
Expand Down Expand Up @@ -64,28 +65,41 @@ def analyze_logs_tool(log_text: str) -> str:

evidence_lines = [line for line in log_text.splitlines() if line.strip()]
timestamps = _extract_timestamps(evidence_lines)
trace_ids = _extract_trace_ids(evidence_lines)

if not findings:
if evidence_lines:
return (
summary = [
"No obvious incident signature detected.\n"
f"Evidence lines:\n{_format_evidence(evidence_lines)}\n"
]
if timestamps:
summary.append(f"Relevant timestamps: {', '.join(timestamps)}.")
if trace_ids:
summary.append(f"Relevant trace IDs: {', '.join(trace_ids)}.")
summary.append(f"Evidence lines:\n{_format_evidence(evidence_lines)}")
summary.append(
"Recommended next step: inspect latency spikes, correlation IDs, and nearby log lines."
)
return "\n".join(summary)
return "No obvious incident signature detected. Inspect latency spikes and correlation IDs."

summary = [f"Likely issue types: {', '.join(findings)}."]
if timestamps:
summary.append(f"Relevant timestamps: {', '.join(timestamps)}.")
if trace_ids:
summary.append(f"Relevant trace IDs: {', '.join(trace_ids)}.")
if evidence_lines:
summary.append(f"Evidence lines:\n{_format_evidence(evidence_lines)}")
summary.append("Recommended next step: inspect surrounding logs, trace IDs, and recent deploys.")
return "\n".join(summary)


def _expand_query_terms(query: str) -> list[str]:
normalized = re.sub(r"[^a-zA-Z0-9]+", " ", query.lower())
lowered = query.lower()
normalized = re.sub(r"[^a-zA-Z0-9]+", " ", lowered)
terms = {term for term in normalized.split() if len(term) > 2}
raw_terms = {term for term in re.split(r"\s+", lowered) if len(term) > 2}
terms.update(raw_terms)

if {"out", "memory"} <= terms or "oom" in terms:
terms.update({"outofmemoryerror", "java heap space", "heap", "memory"})
Expand All @@ -95,6 +109,8 @@ def _expand_query_terms(query: str) -> list[str]:
terms.update({"error", "exception", "failed"})
if "time" in terms or "when" in terms:
terms.update({"timestamp"})
if "trace" in terms or "trace_id" in terms or "trace id" in lowered:
terms.update({"trace_id", "trace_id=", "trace", "request", "traceid"})

return sorted(terms)

Expand All @@ -110,3 +126,12 @@ def _extract_timestamps(lines: list[str]) -> list[str]:

def _format_evidence(lines: list[str]) -> str:
return "\n".join(f"- {line}" for line in lines)


def _extract_trace_ids(lines: list[str]) -> list[str]:
    """Collect the distinct ``trace_id=<value>`` tokens found in ``lines``.

    Every match of ``TRACE_ID_PATTERN`` on a line is considered, not only the
    first, and each trace ID is reported once, in order of first appearance.
    Several log lines may carry the same trace ID, and repeating it would
    duplicate entries in the "Relevant trace IDs" summary.

    Args:
        lines: Log lines to scan.

    Returns:
        Unique trace IDs in first-seen order; empty when nothing matches.
    """
    # A dict is used as an insertion-ordered set.
    seen: dict[str, None] = {}
    for line in lines:
        for match in TRACE_ID_PATTERN.finditer(line):
            seen.setdefault(match.group("trace_id"), None)
    return list(seen)
32 changes: 32 additions & 0 deletions render.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Render blueprint: one web service that runs the prebuilt GHCR image.
services:
  - type: web
    name: ai-observability-agent
    runtime: image
    image:
      # Replace OWNER_OR_ORG with the GitHub user or organisation that owns
      # the published image.
      url: ghcr.io/OWNER_OR_ORG/ai-observability-agent:latest
    plan: free
    healthCheckPath: /api/health
    autoDeploy: false
    # NOTE(review): entries with `sync: false` carry no value in this file;
    # presumably they are filled in on the Render dashboard -- confirm against
    # the Blueprint spec.
    envVars:
      # Application
      - key: APP_ENV
        value: production
      - key: APP_ORIGIN
        sync: false

      # LLM provider
      - key: GROQ_API_KEY
        sync: false
      - key: GROQ_MODEL
        value: llama-3.1-8b-instant

      # Vector store and embeddings
      - key: PINECONE_API_KEY
        sync: false
      - key: PINECONE_INDEX_NAME
        value: ai-observability-agent
      - key: PINECONE_NAMESPACE
        value: observability-docs
      - key: EMBEDDING_MODEL
        value: sentence-transformers/all-MiniLM-L6-v2

      # LangChain tracing
      - key: LANGCHAIN_API_KEY
        sync: false
      - key: LANGCHAIN_TRACING_V2
        # Quoted so the variable is the string "true", not a YAML boolean.
        value: "true"
      - key: LANGCHAIN_PROJECT
        value: ai-observability-agent
Loading
Loading