Skip to content

Commit dc0788d

Browse files
committed
Add tests for BigFrame and Snowpark dataframes
1 parent d09a25f commit dc0788d

6 files changed

Lines changed: 108 additions & 11 deletions

File tree

sqlmesh/core/engine_adapter/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,7 @@ def _df_to_source_queries(
249249

250250
# we need to ensure that the order of the columns in columns_to_types matches the order of the values
251251
# they can differ if a user specifies columns() on a python model in a different order than what's in the DataFrames emitted by that model
252-
df = df[list(columns_to_types.keys())]
252+
df = df[list(columns_to_types)]
253253
values = list(df.itertuples(index=False, name=None))
254254

255255
return [

sqlmesh/core/engine_adapter/mssql.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ def query_factory() -> Query:
219219
if not self.table_exists(temp_table):
220220
columns_to_types_create = columns_to_types.copy()
221221
ordered_df = df[
222-
list(columns_to_types_create.keys())
222+
list(columns_to_types_create)
223223
] # reorder DataFrame so it matches columns_to_types
224224
self._convert_df_datetime(ordered_df, columns_to_types_create)
225225
self.create_table(temp_table, columns_to_types_create)

sqlmesh/core/engine_adapter/spark.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -280,14 +280,14 @@ def _ensure_pyspark_df(
280280
if pyspark_df:
281281
if columns_to_types:
282282
# ensure Spark dataframe column order matches columns_to_types
283-
pyspark_df = pyspark_df.select(*list(columns_to_types.keys()))
283+
pyspark_df = pyspark_df.select(*list(columns_to_types))
284284
return pyspark_df
285285
df = self.try_get_pandas_df(generic_df)
286286
if df is None:
287287
raise SQLMeshError("Ensure PySpark DF can only be run on a PySpark or Pandas DataFrame")
288288
if columns_to_types:
289289
# ensure Pandas dataframe column order matches columns_to_types
290-
df = df[list(columns_to_types.keys())]
290+
df = df[list(columns_to_types)]
291291
kwargs = (
292292
dict(schema=self.sqlglot_to_spark_types(columns_to_types)) if columns_to_types else {}
293293
)

tests/core/engine_adapter/integration/test_integration.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2742,8 +2742,7 @@ def test_python_model_column_order(ctx: TestContext, tmp_path_factory: pytest.Te
27422742
pytest.skip("python model column order test only needs to be run once per db")
27432743

27442744
tmp_path = tmp_path_factory.mktemp(f"column_order_{ctx.test_id}")
2745-
2746-
test_schema = ctx.add_test_suffix("column_order")
2745+
schema = ctx.add_test_suffix(TEST_SCHEMA)
27472746

27482747
(tmp_path / "models").mkdir()
27492748

@@ -2772,7 +2771,7 @@ def execute(
27722771
return context.spark.createDataFrame([
27732772
Row(name="foo", id=1)
27742773
])
2775-
""".replace("TEST_SCHEMA", test_schema)
2774+
""".replace("TEST_SCHEMA", schema)
27762775
)
27772776
else:
27782777
# python model that emits a Pandas DataFrame
@@ -2796,7 +2795,7 @@ def execute(
27962795
return pd.DataFrame([
27972796
{"name": "foo", "id": 1}
27982797
])
2799-
""".replace("TEST_SCHEMA", test_schema)
2798+
""".replace("TEST_SCHEMA", schema)
28002799
)
28012800

28022801
sqlmesh_ctx = ctx.create_context(path=tmp_path)
@@ -2808,6 +2807,9 @@ def execute(
28082807

28092808
engine_adapter = sqlmesh_ctx.engine_adapter
28102809

2811-
df = engine_adapter.fetchdf(f"select * from {test_schema}.model")
2810+
query = exp.select("*").from_(
2811+
exp.to_table(f"{schema}.model", dialect=ctx.dialect), dialect=ctx.dialect
2812+
)
2813+
df = engine_adapter.fetchdf(query, quote_identifiers=True)
28122814
assert len(df) == 1
28132815
assert df.iloc[0].to_dict() == {"id": 1, "name": "foo"}

tests/core/engine_adapter/integration/test_integration_bigquery.py

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from sqlmesh.core.model import SqlModel, load_sql_based_model
1414
from sqlmesh.core.plan import Plan
1515
from sqlmesh.core.table_diff import TableDiff
16-
from tests.core.engine_adapter.integration import TestContext
16+
from tests.core.engine_adapter.integration import TestContext, TEST_SCHEMA
1717

1818
pytestmark = [pytest.mark.engine, pytest.mark.remote, pytest.mark.bigquery]
1919

@@ -433,3 +433,51 @@ def test_table_diff_table_name_matches_column_name(ctx: TestContext):
433433

434434
assert row_diff.stats["join_count"] == 1
435435
assert row_diff.full_match_count == 1
436+
437+
438+
def test_bigframe_python_model_column_order(ctx: TestContext, tmp_path: Path):
439+
schema = ctx.add_test_suffix(TEST_SCHEMA)
440+
441+
(tmp_path / "models").mkdir()
442+
443+
# note: this model deliberately defines the columns in the @model definition to be in a different order than what
444+
# is returned by the DataFrame within the model
445+
model_path = tmp_path / "models" / "python_model.py"
446+
447+
# python model that emits a BigFrame dataframe
448+
model_path.write_text(
449+
"""
450+
from bigframes.pandas import DataFrame
451+
import typing as t
452+
from sqlmesh import ExecutionContext, model
453+
454+
@model(
455+
"TEST_SCHEMA.model",
456+
columns={
457+
"id": "int",
458+
"name": "varchar"
459+
}
460+
)
461+
def execute(
462+
context: ExecutionContext,
463+
**kwargs: t.Any,
464+
) -> DataFrame:
465+
return DataFrame({'name': ['foo'], 'id': [1]})
466+
""".replace("TEST_SCHEMA", schema)
467+
)
468+
469+
sqlmesh_ctx = ctx.create_context(path=tmp_path)
470+
471+
assert len(sqlmesh_ctx.models) == 1
472+
473+
plan = sqlmesh_ctx.plan(auto_apply=True)
474+
assert len(plan.new_snapshots) == 1
475+
476+
engine_adapter = sqlmesh_ctx.engine_adapter
477+
478+
query = exp.select("*").from_(
479+
exp.to_table(f"{schema}.model", dialect=ctx.dialect), dialect=ctx.dialect
480+
)
481+
df = engine_adapter.fetchdf(query, quote_identifiers=True)
482+
assert len(df) == 1
483+
assert df.iloc[0].to_dict() == {"id": 1, "name": "foo"}

tests/core/engine_adapter/integration/test_integration_snowflake.py

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
import typing as t
22
import pytest
33
from sqlglot import exp
4+
from pathlib import Path
45
from sqlglot.optimizer.qualify_columns import quote_identifiers
56
from sqlglot.helper import seq_get
67
from sqlmesh.core.engine_adapter import SnowflakeEngineAdapter
78
from sqlmesh.core.engine_adapter.shared import DataObject
89
import sqlmesh.core.dialect as d
910
from sqlmesh.core.model import SqlModel, load_sql_based_model
1011
from sqlmesh.core.plan import Plan
11-
from tests.core.engine_adapter.integration import TestContext
12+
from tests.core.engine_adapter.integration import TestContext, TEST_SCHEMA
1213

1314
pytestmark = [pytest.mark.engine, pytest.mark.remote, pytest.mark.snowflake]
1415

@@ -210,3 +211,49 @@ def test_create_iceberg_table(ctx: TestContext, engine_adapter: SnowflakeEngineA
210211
result = sqlmesh.plan(auto_apply=True)
211212

212213
assert len(result.new_snapshots) == 2
214+
215+
216+
def test_snowpark_python_model_column_order(ctx: TestContext, tmp_path: Path):
217+
schema = ctx.add_test_suffix(TEST_SCHEMA)
218+
219+
(tmp_path / "models").mkdir()
220+
221+
# note: this model deliberately defines the columns in the @model definition to be in a different order than what
222+
# is returned by the DataFrame within the model
223+
model_path = tmp_path / "models" / "python_model.py"
224+
225+
# python model that emits a Snowpark DataFrame
226+
model_path.write_text(
227+
"""
228+
from snowflake.snowpark.dataframe import DataFrame
229+
import typing as t
230+
from sqlmesh import ExecutionContext, model
231+
232+
@model(
233+
"TEST_SCHEMA.model",
234+
columns={
235+
"id": "int",
236+
"name": "varchar"
237+
}
238+
)
239+
def execute(
240+
context: ExecutionContext,
241+
**kwargs: t.Any,
242+
) -> DataFrame:
243+
return context.snowpark.create_dataframe([["foo", 1]], schema=["name", "id"])
244+
""".replace("TEST_SCHEMA", schema)
245+
)
246+
247+
sqlmesh_ctx = ctx.create_context(path=tmp_path)
248+
249+
assert len(sqlmesh_ctx.models) == 1
250+
251+
plan = sqlmesh_ctx.plan(auto_apply=True)
252+
assert len(plan.new_snapshots) == 1
253+
254+
engine_adapter = sqlmesh_ctx.engine_adapter
255+
256+
query = exp.select("*").from_(exp.to_table(f"{schema}.model", dialect=ctx.dialect))
257+
df = engine_adapter.fetchdf(query, quote_identifiers=True)
258+
assert len(df) == 1
259+
assert df.iloc[0].to_dict() == {"id": 1, "name": "foo"}

0 commit comments

Comments
 (0)