From 023829b28e3be9408118c87b34ef3be2cb48e509 Mon Sep 17 00:00:00 2001 From: brown9804 Date: Fri, 6 Jun 2025 08:49:11 -0600 Subject: [PATCH 1/5] base blob trigger function --- .vscode/extensions.json | 6 ++ .vscode/launch.json | 15 +++++ .vscode/settings.json | 9 +++ .vscode/tasks.json | 33 ++++++++++ src/.funcignore | 8 +++ src/.gitignore | 135 ++++++++++++++++++++++++++++++++++++++++ src/function_app.py | 11 ++++ src/host.json | 15 +++++ src/requirements.txt | 5 ++ 9 files changed, 237 insertions(+) create mode 100644 .vscode/extensions.json create mode 100644 .vscode/launch.json create mode 100644 .vscode/settings.json create mode 100644 .vscode/tasks.json create mode 100644 src/.funcignore create mode 100644 src/.gitignore create mode 100644 src/function_app.py create mode 100644 src/host.json create mode 100644 src/requirements.txt diff --git a/.vscode/extensions.json b/.vscode/extensions.json new file mode 100644 index 0000000..3f63eb9 --- /dev/null +++ b/.vscode/extensions.json @@ -0,0 +1,6 @@ +{ + "recommendations": [ + "ms-azuretools.vscode-azurefunctions", + "ms-python.python" + ] +} \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..9a24428 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,15 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "name": "Attach to Python Functions", + "type": "debugpy", + "request": "attach", + "connect": { + "host": "localhost", + "port": 9091 + }, + "preLaunchTask": "func: host start" + } + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..1b400fa --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,9 @@ +{ + "azureFunctions.deploySubpath": "src", + "azureFunctions.scmDoBuildDuringDeployment": true, + "azureFunctions.pythonVenv": ".venv", + "azureFunctions.projectLanguage": "Python", + "azureFunctions.projectRuntime": "~4", + "debug.internalConsoleOptions": "neverOpen", + 
"azureFunctions.projectLanguageModel": 2 +} \ No newline at end of file diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 0000000..478f3c1 --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,33 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "type": "func", + "label": "func: host start", + "command": "host start", + "problemMatcher": "$func-python-watch", + "isBackground": true, + "dependsOn": "pip install (functions)", + "options": { + "cwd": "${workspaceFolder}/src" + } + }, + { + "label": "pip install (functions)", + "type": "shell", + "osx": { + "command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt" + }, + "windows": { + "command": "${config:azureFunctions.pythonVenv}\\Scripts\\python -m pip install -r requirements.txt" + }, + "linux": { + "command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt" + }, + "problemMatcher": [], + "options": { + "cwd": "${workspaceFolder}/src" + } + } + ] +} \ No newline at end of file diff --git a/src/.funcignore b/src/.funcignore new file mode 100644 index 0000000..9966315 --- /dev/null +++ b/src/.funcignore @@ -0,0 +1,8 @@ +.git* +.vscode +__azurite_db*__.json +__blobstorage__ +__queuestorage__ +local.settings.json +test +.venv \ No newline at end of file diff --git a/src/.gitignore b/src/.gitignore new file mode 100644 index 0000000..7685fc4 --- /dev/null +++ b/src/.gitignore @@ -0,0 +1,135 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don’t work, or not +# install all needed dependencies. +#Pipfile.lock + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Azure Functions artifacts +bin +obj +appsettings.json +local.settings.json + +# Azurite artifacts +__blobstorage__ +__queuestorage__ +__azurite_db*__.json +.python_packages \ No newline at end of file diff --git a/src/function_app.py b/src/function_app.py new file mode 100644 index 0000000..77a95ec --- /dev/null +++ b/src/function_app.py @@ -0,0 +1,11 @@ +import azure.functions as func +import logging + +app = func.FunctionApp() + +@app.blob_trigger(arg_name="myblob", path="pdfinvoices/{name}", + connection="runtimestorebrownix3_STORAGE") +def BlobTriggerContosoPDFLayoutsDocIntelligence(myblob: func.InputStream): + logging.info(f"Python blob trigger function processed blob" + 
f"Name: {myblob.name}" + f"Blob Size: {myblob.length} bytes") diff --git a/src/host.json b/src/host.json new file mode 100644 index 0000000..9df9136 --- /dev/null +++ b/src/host.json @@ -0,0 +1,15 @@ +{ + "version": "2.0", + "logging": { + "applicationInsights": { + "samplingSettings": { + "isEnabled": true, + "excludedTypes": "Request" + } + } + }, + "extensionBundle": { + "id": "Microsoft.Azure.Functions.ExtensionBundle", + "version": "[4.*, 5.0.0)" + } +} \ No newline at end of file diff --git a/src/requirements.txt b/src/requirements.txt new file mode 100644 index 0000000..bdb8fc5 --- /dev/null +++ b/src/requirements.txt @@ -0,0 +1,5 @@ +# DO NOT include azure-functions-worker in this file +# The Python Worker is managed by Azure Functions platform +# Manually managing azure-functions-worker may cause unexpected issues + +azure-functions From a006f104aebc169254104937154848749f9de3c9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 6 Jun 2025 14:50:30 +0000 Subject: [PATCH 2/5] Fix Markdown syntax issues --- README.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/README.md b/README.md index 0b9c510..8ed6e73 100644 --- a/README.md +++ b/README.md @@ -153,7 +153,6 @@ Last updated: 2025-06-03 > [!NOTE] > This example is using system-assigned managed identity to assign RBACs (Role-based Access Control). - - Under `Settings`, go to `Environment variables`. And `+ Add` the following variables: - `COSMOS_DB_ENDPOINT`: Your Cosmos DB account endpoint 🡢 `Review the existence of this, if not create it` @@ -179,7 +178,6 @@ Last updated: 2025-06-03 image - ## Function App: Develop the logic - You need to install [VSCode](https://code.visualstudio.com/download) @@ -336,7 +334,6 @@ Last updated: 2025-06-03 image -

Total Visitors

Visitor Count
From 34252160fb09be2b47a96f337f16d212359a073a Mon Sep 17 00:00:00 2001
From: brown9804
Date: Fri, 6 Jun 2025 09:37:12 -0600
Subject: [PATCH 3/5] vs config -> function updated

---
 .vscode/settings.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 1b400fa..2245422 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,7 +1,7 @@
 {
     "azureFunctions.deploySubpath": "src",
     "azureFunctions.scmDoBuildDuringDeployment": true,
-    "azureFunctions.pythonVenv": ".venv",
+    "azureFunctions.pythonVenv": "./src/.venv",
     "azureFunctions.projectLanguage": "Python",
     "azureFunctions.projectRuntime": "~4",
     "debug.internalConsoleOptions": "neverOpen",
From 54b01a0651f9060ab0c061c2ad5587c2242f9d12 Mon Sep 17 00:00:00 2001
From: brown9804
Date: Fri, 6 Jun 2025 09:38:21 -0600
Subject: [PATCH 4/5] layout template updated + some struct in logs

---
 src/function_app.py  | 212 +++++++++++++++++++++++++++++++++++++++++--
 src/requirements.txt |   4 +
 2 files changed, 210 insertions(+), 6 deletions(-)

diff --git a/src/function_app.py b/src/function_app.py
index 77a95ec..370c713 100644
--- a/src/function_app.py
+++ b/src/function_app.py
@@ -1,11 +1,211 @@
-import azure.functions as func
 import logging
+import azure.functions as func
+from azure.ai.formrecognizer import DocumentAnalysisClient
+from azure.core.credentials import AzureKeyCredential
+from azure.cosmos import CosmosClient, PartitionKey, exceptions
+from azure.identity import DefaultAzureCredential
+import os
+import uuid
+import json
+
+app = func.FunctionApp(http_auth_level=func.AuthLevel.FUNCTION)
+
+## DEFINITIONS
+def initialize_form_recognizer_client():
+    """Create the Document Intelligence (Form Recognizer) client.
+
+    Reads FORM_RECOGNIZER_ENDPOINT and FORM_RECOGNIZER_KEY from the environment.
+
+    Raises:
+        ValueError: if either setting is missing.
+    """
+    endpoint = os.getenv("FORM_RECOGNIZER_ENDPOINT")
+    key = os.getenv("FORM_RECOGNIZER_KEY")
+    if not endpoint:
+        raise ValueError("FORM_RECOGNIZER_ENDPOINT must be set")
+    if not isinstance(key, str):
+        raise ValueError("FORM_RECOGNIZER_KEY must be a string")
+    logging.info(f"Form Recognizer endpoint: {endpoint}")
+    return DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(key))
+
+def read_pdf_content(myblob):
+    """Return everything ``myblob.read()`` yields for the triggering blob."""
+    logging.info(f"Reading PDF content from blob: {myblob.name}")
+    return myblob.read()
+
+def analyze_pdf(form_recognizer_client, pdf_bytes):
+    """Run the ``prebuilt-layout`` model on ``pdf_bytes`` and return the analysis result."""
+    logging.info("Starting PDF layout analysis.")
+    poller = form_recognizer_client.begin_analyze_document(
+        model_id="prebuilt-layout",
+        document=pdf_bytes
+    )
+    logging.info("PDF layout analysis in progress.")
+    result = poller.result()
+    logging.info("PDF layout analysis completed.")
+    logging.info(f"Document has {len(result.pages)} page(s), {len(result.tables)} table(s), and {len(result.styles)} style(s).")
+    return result
+
+def extract_layout_data(result):
+    """Convert an analysis result into a plain dict ready to be stored.
+
+    Returns a dict with a fresh UUID ``id`` and a ``pages`` list; each page
+    holds ``page_number``, ``lines``, ``selection_marks`` and the ``tables``
+    that have a bounding region on that page.
+    """
+    logging.info("Extracting layout data from analysis result.")
+
+    layout_data = {
+        "id": str(uuid.uuid4()),
+        "pages": []
+    }
+
+    # Log styles
+    for style in result.styles:
+        content_type = "handwritten" if style.is_handwritten else "no handwritten"
+        logging.info(f"Document contains {content_type} content")
+
+    # Process each page
+    for page in result.pages:
+        logging.info(f"--- Page {page.page_number} ---")
+        page_data = {
+            "page_number": page.page_number,
+            "lines": [line.content for line in page.lines],
+            "tables": [],
+            "selection_marks": [
+                {"state": mark.state, "confidence": mark.confidence}
+                for mark in page.selection_marks
+            ]
+        }
+
+        # Log extracted lines
+        for line_idx, line in enumerate(page.lines):
+            logging.info(f"Line {line_idx}: '{line.content}'")
+
+        # Log selection marks
+        for selection_mark in page.selection_marks:
+            logging.info(
+                f"Selection mark is '{selection_mark.state}' with confidence {selection_mark.confidence}"
+            )
+
+        # Extract tables
+        page_tables = [
+            table for table in result.tables
+            if any(region.page_number == page.page_number for region in table.bounding_regions)
+        ]
+
+        for table_index, table in enumerate(page_tables):
+            logging.info(f"Table {table_index}: {table.row_count} rows, {table.column_count} columns")
+
+            table_data = {
+                "row_count": table.row_count,
+                "column_count": table.column_count,
+                "cells": []
+            }
 
-app = func.FunctionApp()
+            for cell in table.cells:
+                content = cell.content.strip()
+                table_data["cells"].append({
+                    "row_index": cell.row_index,
+                    "column_index": cell.column_index,
+                    "content": content
+                })
+                logging.info(f"Cell[{cell.row_index}][{cell.column_index}]: '{content}'")
 
+            page_data["tables"].append(table_data)
+
+        layout_data["pages"].append(page_data)
+
+    try:
+        preview = json.dumps(layout_data, indent=2)
+        logging.info("Structured layout data preview:\n" + preview)
+    except Exception as e:
+        logging.warning(f"Could not serialize layout data for preview: {e}")
+
+    return layout_data
+
+def save_layout_data_to_cosmos(layout_data):
+    """Upsert ``layout_data`` into the ``Layouts`` container of ``ContosoDBDocIntellig``.
+
+    Authenticates against COSMOS_DB_ENDPOINT with DefaultAzureCredential and
+    makes sure the database and container (partition key ``/id``) exist first.
+
+    Raises:
+        ValueError: if COSMOS_DB_ENDPOINT is not set.
+        Exception: any SDK failure propagates to the caller, so a save that
+            did not happen is never reported as successful.
+    """
+    endpoint = os.getenv("COSMOS_DB_ENDPOINT")
+    if not endpoint:
+        raise ValueError("COSMOS_DB_ENDPOINT must be set")
+
+    try:
+        aad_credentials = DefaultAzureCredential()
+        client = CosmosClient(endpoint, credential=aad_credentials, consistency_level='Session')
+        logging.info("Successfully connected to Cosmos DB using AAD default credential")
+    except Exception as e:
+        logging.error(f"Error connecting to Cosmos DB: {e}")
+        raise
+
+    database_name = "ContosoDBDocIntellig"
+    container_name = "Layouts"
+
+    # create_*_if_not_exists returns the existing resource when there is one,
+    # so these calls are safe to repeat on every invocation.
+    database = client.create_database_if_not_exists(database_name)
+    logging.info(f"Database '{database_name}' is ready.")
+
+    container = database.create_container_if_not_exists(
+        id=container_name,
+        partition_key=PartitionKey(path="/id"),
+        offer_throughput=400
+    )
+    logging.info(f"Container '{container_name}' is ready.")
+
+    try:
+        response = container.upsert_item(layout_data)
+        logging.info(f"Saved processed layout data to Cosmos DB. Response: {response}")
+    except Exception as e:
+        logging.error(f"Error inserting item into Cosmos DB: {e}")
+        raise
+
+## MAIN
 @app.blob_trigger(arg_name="myblob", path="pdfinvoices/{name}",
-                  connection="runtimestorebrownix3_STORAGE")
+                  connection="invoicecontosostorage_STORAGE")
 def BlobTriggerContosoPDFLayoutsDocIntelligence(myblob: func.InputStream):
-    logging.info(f"Python blob trigger function processed blob"
-                f"Name: {myblob.name}"
-                f"Blob Size: {myblob.length} bytes")
+    """Process a PDF uploaded to ``pdfinvoices``: analyze its layout and store the result.
+
+    Each stage logs its own failure and stops the run; nothing is re-raised to
+    the Functions host.
+    """
+    logging.info(f"Python blob trigger function processed blob\n"
+                f"Name: {myblob.name}\n"
+                f"Blob Size: {myblob.length} bytes")
+
+    try:
+        form_recognizer_client = initialize_form_recognizer_client()
+        pdf_bytes = read_pdf_content(myblob)
+        logging.info("Successfully read PDF content from blob.")
+    except Exception as e:
+        logging.error(f"Error reading PDF: {e}")
+        return
+
+    try:
+        result = analyze_pdf(form_recognizer_client, pdf_bytes)
+        logging.info("Successfully analyzed PDF using Document Intelligence.")
+    except Exception as e:
+        logging.error(f"Error analyzing PDF: {e}")
+        return
+
+    try:
+        layout_data = extract_layout_data(result)
+        logging.info("Successfully extracted layout data.")
+    except Exception as e:
+        logging.error(f"Error extracting layout data: {e}")
+        return
+
+    try:
+        save_layout_data_to_cosmos(layout_data)
+        logging.info("Successfully saved layout data to Cosmos DB.")
+    except Exception as e:
+        logging.error(f"Error saving layout data to Cosmos DB: {e}")
diff --git a/src/requirements.txt b/src/requirements.txt
index bdb8fc5..d3cd3d3 100644
---
a/src/requirements.txt +++ b/src/requirements.txt @@ -3,3 +3,7 @@ # Manually managing azure-functions-worker may cause unexpected issues azure-functions +azure-ai-formrecognizer +azure-core +azure-cosmos==4.3.0 +azure-identity==1.7.0 From 62e3b0fca0cfd808cadbeb1f282977a001c302ba Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 6 Jun 2025 15:38:43 +0000 Subject: [PATCH 5/5] Update last modified date in Markdown files --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8ed6e73..1d57ae7 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Costa Rica [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) [brown9804](https://github.com/brown9804) -Last updated: 2025-06-03 +Last updated: 2025-06-06 ----------