diff --git a/.vscode/extensions.json b/.vscode/extensions.json
new file mode 100644
index 0000000..3f63eb9
--- /dev/null
+++ b/.vscode/extensions.json
@@ -0,0 +1,6 @@
+{
+ "recommendations": [
+ "ms-azuretools.vscode-azurefunctions",
+ "ms-python.python"
+ ]
+}
\ No newline at end of file
diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 0000000..9a24428
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,15 @@
+{
+ "version": "0.2.0",
+ "configurations": [
+ {
+ "name": "Attach to Python Functions",
+ "type": "debugpy",
+ "request": "attach",
+ "connect": {
+ "host": "localhost",
+ "port": 9091
+ },
+ "preLaunchTask": "func: host start"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..2245422
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,9 @@
+{
+ "azureFunctions.deploySubpath": "src",
+ "azureFunctions.scmDoBuildDuringDeployment": true,
+ "azureFunctions.pythonVenv": "./src/.venv",
+ "azureFunctions.projectLanguage": "Python",
+ "azureFunctions.projectRuntime": "~4",
+ "debug.internalConsoleOptions": "neverOpen",
+ "azureFunctions.projectLanguageModel": 2
+}
\ No newline at end of file
diff --git a/.vscode/tasks.json b/.vscode/tasks.json
new file mode 100644
index 0000000..478f3c1
--- /dev/null
+++ b/.vscode/tasks.json
@@ -0,0 +1,33 @@
+{
+ "version": "2.0.0",
+ "tasks": [
+ {
+ "type": "func",
+ "label": "func: host start",
+ "command": "host start",
+ "problemMatcher": "$func-python-watch",
+ "isBackground": true,
+ "dependsOn": "pip install (functions)",
+ "options": {
+ "cwd": "${workspaceFolder}/src"
+ }
+ },
+ {
+ "label": "pip install (functions)",
+ "type": "shell",
+ "osx": {
+ "command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt"
+ },
+ "windows": {
+ "command": "${config:azureFunctions.pythonVenv}\\Scripts\\python -m pip install -r requirements.txt"
+ },
+ "linux": {
+ "command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt"
+ },
+ "problemMatcher": [],
+ "options": {
+ "cwd": "${workspaceFolder}/src"
+ }
+ }
+ ]
+}
\ No newline at end of file
diff --git a/README.md b/README.md
index 0b9c510..1d57ae7 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ Costa Rica
[](https://github.com/)
[brown9804](https://github.com/brown9804)
-Last updated: 2025-06-03
+Last updated: 2025-06-06
----------
@@ -153,7 +153,6 @@ Last updated: 2025-06-03
> [!NOTE]
> This example is using system-assigned managed identity to assign RBACs (Role-based Access Control).
-
- Under `Settings`, go to `Environment variables`. And `+ Add` the following variables:
- `COSMOS_DB_ENDPOINT`: Your Cosmos DB account endpoint 🡢 `Review the existence of this, if not create it`
@@ -179,7 +178,6 @@ Last updated: 2025-06-03
-
## Function App: Develop the logic
- You need to install [VSCode](https://code.visualstudio.com/download)
@@ -336,7 +334,6 @@ Last updated: 2025-06-03
-
Total Visitors

diff --git a/src/.funcignore b/src/.funcignore
new file mode 100644
index 0000000..9966315
--- /dev/null
+++ b/src/.funcignore
@@ -0,0 +1,8 @@
+.git*
+.vscode
+__azurite_db*__.json
+__blobstorage__
+__queuestorage__
+local.settings.json
+test
+.venv
\ No newline at end of file
diff --git a/src/.gitignore b/src/.gitignore
new file mode 100644
index 0000000..7685fc4
--- /dev/null
+++ b/src/.gitignore
@@ -0,0 +1,135 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don’t work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# Azure Functions artifacts
+bin
+obj
+appsettings.json
+local.settings.json
+
+# Azurite artifacts
+__blobstorage__
+__queuestorage__
+__azurite_db*__.json
+.python_packages
\ No newline at end of file
diff --git a/src/function_app.py b/src/function_app.py
new file mode 100644
index 0000000..370c713
--- /dev/null
+++ b/src/function_app.py
@@ -0,0 +1,190 @@
+import logging
+import azure.functions as func
+from azure.ai.formrecognizer import DocumentAnalysisClient
+from azure.core.credentials import AzureKeyCredential
+from azure.cosmos import CosmosClient, PartitionKey, exceptions
+from azure.identity import DefaultAzureCredential
+import os
+import uuid
+import json
+
+# Function app instance; the blob trigger in this module is registered on it
+# through the @app.blob_trigger decorator.
+# NOTE(review): http_auth_level presumably only matters for HTTP-triggered
+# functions, none of which are visible in this module -- confirm it is needed.
+app = func.FunctionApp(http_auth_level=func.AuthLevel.FUNCTION)
+
+## DEFINITIONS
+def initialize_form_recognizer_client():
+    """Build a key-authenticated ``DocumentAnalysisClient`` from env vars.
+
+    Reads ``FORM_RECOGNIZER_ENDPOINT`` and ``FORM_RECOGNIZER_KEY`` from the
+    environment, logs the endpoint (never the key), and returns the client.
+
+    Returns:
+        A ``DocumentAnalysisClient`` bound to the configured endpoint.
+
+    Raises:
+        ValueError: if either environment variable is unset or empty.
+    """
+    endpoint = os.getenv("FORM_RECOGNIZER_ENDPOINT")
+    key = os.getenv("FORM_RECOGNIZER_KEY")
+    # os.getenv returns None for an unset variable; an empty value is just
+    # as unusable, so both are rejected here.
+    if not key:
+        raise ValueError("FORM_RECOGNIZER_KEY must be a non-empty string")
+    # Fail fast with a clear message instead of handing None to the client.
+    if not endpoint:
+        raise ValueError("FORM_RECOGNIZER_ENDPOINT must be a non-empty string")
+    logging.info(f"Form Recognizer endpoint: {endpoint}")
+    return DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(key))
+
+def read_pdf_content(myblob):
+    """Return the full content of the triggering blob.
+
+    Args:
+        myblob: blob stream object exposing ``name`` and ``read()``.
+
+    Returns:
+        Whatever ``myblob.read()`` returns; the caller treats it as the
+        PDF's bytes.
+    """
+    blob_name = myblob.name
+    logging.info(f"Reading PDF content from blob: {blob_name}")
+    pdf_content = myblob.read()
+    return pdf_content
+
+def analyze_pdf(form_recognizer_client, pdf_bytes):
+    """Analyze a PDF with the ``prebuilt-layout`` model and return the result.
+
+    Args:
+        form_recognizer_client: client exposing ``begin_analyze_document``.
+        pdf_bytes: document content passed to the service as ``document``.
+
+    Returns:
+        The object returned by the poller's ``result()``; its ``pages``,
+        ``tables`` and ``styles`` collections are counted for the log.
+    """
+    logging.info("Starting PDF layout analysis.")
+    analysis_poller = form_recognizer_client.begin_analyze_document(
+        model_id="prebuilt-layout",
+        document=pdf_bytes,
+    )
+    logging.info("PDF layout analysis in progress.")
+    analysis = analysis_poller.result()
+    logging.info("PDF layout analysis completed.")
+
+    page_count = len(analysis.pages)
+    table_count = len(analysis.tables)
+    style_count = len(analysis.styles)
+    logging.info(f"Document has {page_count} page(s), {table_count} table(s), and {style_count} style(s).")
+    return analysis
+
+def _table_to_dict(table, table_index):
+    """Convert one analyzed table to a plain dict, logging its size and cells."""
+    logging.info(f"Table {table_index}: {table.row_count} rows, {table.column_count} columns")
+
+    cells = []
+    for cell in table.cells:
+        content = cell.content.strip()
+        cells.append({
+            "row_index": cell.row_index,
+            "column_index": cell.column_index,
+            "content": content,
+        })
+        logging.info(f"Cell[{cell.row_index}][{cell.column_index}]: '{content}'")
+
+    return {
+        "row_count": table.row_count,
+        "column_count": table.column_count,
+        "cells": cells,
+    }
+
+
+def _page_to_dict(page, tables):
+    """Convert one analyzed page, plus the tables located on it, to a plain dict."""
+    logging.info(f"--- Page {page.page_number} ---")
+
+    for line_idx, line in enumerate(page.lines):
+        logging.info(f"Line {line_idx}: '{line.content}'")
+
+    for selection_mark in page.selection_marks:
+        logging.info(
+            f"Selection mark is '{selection_mark.state}' with confidence {selection_mark.confidence}"
+        )
+
+    # A table belongs to this page when any of its bounding regions is on it,
+    # so a table with regions on several pages is listed under each of them.
+    page_tables = [
+        table for table in tables
+        if any(region.page_number == page.page_number for region in table.bounding_regions)
+    ]
+
+    return {
+        "page_number": page.page_number,
+        "lines": [line.content for line in page.lines],
+        "tables": [
+            _table_to_dict(table, table_index)
+            for table_index, table in enumerate(page_tables)
+        ],
+        "selection_marks": [
+            {"state": mark.state, "confidence": mark.confidence}
+            for mark in page.selection_marks
+        ],
+    }
+
+
+def extract_layout_data(result):
+    """Flatten a layout analysis result into a JSON-serializable dict.
+
+    Args:
+        result: analysis result exposing ``styles``, ``pages`` and ``tables``.
+
+    Returns:
+        A dict with a freshly generated UUID string under ``"id"`` and a
+        ``"pages"`` list. Each page entry holds ``page_number``, ``lines``
+        (line contents), ``tables`` (row/column counts and stripped cell
+        contents) and ``selection_marks`` (state and confidence).
+
+    Every extracted line, selection mark and cell is logged at INFO level,
+    followed by a JSON preview of the whole structure. If the preview cannot
+    be serialized, a warning is logged and the data is still returned.
+    """
+    logging.info("Extracting layout data from analysis result.")
+
+    # Styles are only reported in the log; they are not part of the output.
+    for style in result.styles:
+        content_type = "handwritten" if style.is_handwritten else "no handwritten"
+        logging.info(f"Document contains {content_type} content")
+
+    layout_data = {
+        "id": str(uuid.uuid4()),
+        "pages": [_page_to_dict(page, result.tables) for page in result.pages],
+    }
+
+    try:
+        preview = json.dumps(layout_data, indent=2)
+        logging.info("Structured layout data preview:\n" + preview)
+    except Exception as e:
+        logging.warning(f"Could not serialize layout data for preview: {e}")
+
+    return layout_data
+
+def save_layout_data_to_cosmos(layout_data):
+    """Upsert extracted layout data into the Cosmos DB ``Layouts`` container.
+
+    Connects to the account at ``COSMOS_DB_ENDPOINT`` using
+    ``DefaultAzureCredential``, makes sure the ``ContosoDBDocIntellig``
+    database and the ``Layouts`` container (partition key path ``/id``)
+    are available, and upserts ``layout_data``.
+
+    Args:
+        layout_data: JSON-serializable dict to store; it needs an ``"id"``
+            value because the container is partitioned on ``/id``.
+
+    Raises:
+        ValueError: if ``COSMOS_DB_ENDPOINT`` is unset or empty.
+        Exception: any credential or Cosmos DB error is logged and then
+            re-raised, so the caller never reports a failed save as a
+            success.
+    """
+    endpoint = os.getenv("COSMOS_DB_ENDPOINT")
+    if not endpoint:
+        raise ValueError("COSMOS_DB_ENDPOINT must be a non-empty string")
+
+    database_name = "ContosoDBDocIntellig"
+    container_name = "Layouts"
+
+    try:
+        aad_credentials = DefaultAzureCredential()
+        client = CosmosClient(endpoint, credential=aad_credentials, consistency_level='Session')
+        logging.info("Successfully connected to Cosmos DB using AAD default credential")
+    except Exception as e:
+        logging.error(f"Error connecting to Cosmos DB: {e}")
+        raise
+
+    # The *_if_not_exists helpers return the existing resource when there is
+    # one and only create it otherwise, so no "already exists" handling or
+    # follow-up read() call is needed.
+    database = client.create_database_if_not_exists(database_name)
+    logging.info(f"Database '{database_name}' is available.")
+
+    container = database.create_container_if_not_exists(
+        id=container_name,
+        partition_key=PartitionKey(path="/id"),
+        offer_throughput=400,
+    )
+    logging.info(f"Container '{container_name}' is available.")
+
+    try:
+        response = container.upsert_item(layout_data)
+        logging.info(f"Saved processed layout data to Cosmos DB. Response: {response}")
+    except Exception as e:
+        logging.error(f"Error inserting item into Cosmos DB: {e}")
+        raise
+
+## MAIN
+@app.blob_trigger(arg_name="myblob", path="pdfinvoices/{name}",
+                  connection="invoicecontosostorage_STORAGE")
+def BlobTriggerContosoPDFLayoutsDocIntelligence(myblob: func.InputStream):
+    """Run a blob from ``pdfinvoices/{name}`` through the layout pipeline.
+
+    Stages, in order: build the Form Recognizer client, read the blob,
+    analyze it, extract the layout data, and save it to Cosmos DB. A failure
+    in any stage is logged with its traceback and ends the invocation; the
+    exception is not re-raised.
+
+    NOTE(review): because errors are swallowed here, the Functions host
+    presumably records the invocation as successful and will not retry the
+    blob -- confirm that this is the intended behavior.
+
+    Args:
+        myblob: input stream for the blob that fired the trigger.
+    """
+    logging.info(f"Python blob trigger function processed blob\n"
+                 f"Name: {myblob.name}\n"
+                 f"Blob Size: {myblob.length} bytes")
+
+    # Client setup is reported on its own so that a missing setting is not
+    # mislabeled as a PDF read failure.
+    try:
+        form_recognizer_client = initialize_form_recognizer_client()
+    except Exception as e:
+        logging.exception(f"Error initializing Form Recognizer client: {e}")
+        return
+
+    try:
+        pdf_bytes = read_pdf_content(myblob)
+        logging.info("Successfully read PDF content from blob.")
+    except Exception as e:
+        logging.exception(f"Error reading PDF: {e}")
+        return
+
+    try:
+        result = analyze_pdf(form_recognizer_client, pdf_bytes)
+        logging.info("Successfully analyzed PDF using Document Intelligence.")
+    except Exception as e:
+        logging.exception(f"Error analyzing PDF: {e}")
+        return
+
+    try:
+        layout_data = extract_layout_data(result)
+        logging.info("Successfully extracted layout data.")
+    except Exception as e:
+        logging.exception(f"Error extracting layout data: {e}")
+        return
+
+    try:
+        save_layout_data_to_cosmos(layout_data)
+        logging.info("Successfully saved layout data to Cosmos DB.")
+    except Exception as e:
+        logging.exception(f"Error saving layout data to Cosmos DB: {e}")
diff --git a/src/host.json b/src/host.json
new file mode 100644
index 0000000..9df9136
--- /dev/null
+++ b/src/host.json
@@ -0,0 +1,15 @@
+{
+ "version": "2.0",
+ "logging": {
+ "applicationInsights": {
+ "samplingSettings": {
+ "isEnabled": true,
+ "excludedTypes": "Request"
+ }
+ }
+ },
+ "extensionBundle": {
+ "id": "Microsoft.Azure.Functions.ExtensionBundle",
+ "version": "[4.*, 5.0.0)"
+ }
+}
\ No newline at end of file
diff --git a/src/requirements.txt b/src/requirements.txt
new file mode 100644
index 0000000..d3cd3d3
--- /dev/null
+++ b/src/requirements.txt
@@ -0,0 +1,9 @@
+# DO NOT include azure-functions-worker in this file
+# The Python Worker is managed by Azure Functions platform
+# Manually managing azure-functions-worker may cause unexpected issues
+
+azure-functions
+azure-ai-formrecognizer
+azure-core
+azure-cosmos==4.3.0
+azure-identity==1.7.0