
Commit c372912

Merge pull request #634 from splitgraph/aws-athena-data-source-cu-1wged02
Amazon Athena data source
2 parents: db1cff7 + 778684d

14 files changed

Lines changed: 273 additions & 23 deletions


.ci/install.sh

Lines changed: 1 addition & 0 deletions
```diff
@@ -13,6 +13,7 @@ poetry install -E pandas
 python -m venv "$DBT_VENV"
 . "$DBT_VENV"/bin/activate
 pip install dbt-core==1.0.0 dbt-postgres==1.0.0
+pip install --force-reinstall --upgrade markupsafe==2.0.1
 
 # Singer tap integration test
 python -m venv "$TAP_MYSQL_VENV"
```

engine/Dockerfile

Lines changed: 5 additions & 0 deletions
```diff
@@ -190,6 +190,11 @@ COPY ./engine/src/postgres-elasticsearch-fdw/pg_es_fdw /pg_es_fdw/pg_es_fdw
 RUN --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
     pip install "git+https://github.com/splitgraph/snowflake-sqlalchemy.git@14e64cc0ef7374df0cecc91923ff6901b0d721b7"
 
+# Install PyAthena for Amazon Athena SQLAlchemy-based FDW, as well as pandas
+RUN --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
+    pip install "PyAthena>=2.4.1" && \
+    pip install "pandas>=1.0.0"
+
 ENV PATH "${PATH}:/splitgraph/bin"
 ENV PYTHONPATH "${PYTHONPATH}:/splitgraph:/pg_es_fdw"
 
```
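PyAthena registers the `awsathena+rest` dialect with SQLAlchemy, which is what the SQLAlchemy-based FDW added later in this commit relies on. As a rough sketch of what these two installed packages make possible inside the engine image (the credentials, region, bucket and table name below are placeholders, not values from this commit):

```python
# Rough sketch only: a direct PyAthena/SQLAlchemy connection, outside the FDW.
# All identifiers (credentials, region, schema, bucket, table) are placeholders.
from sqlalchemy import create_engine, text

engine = create_engine(
    "awsathena+rest://ABCD:abcd@athena.eu-west-3.amazonaws.com:443/"
    "mydatabase?s3_staging_dir=s3://my-bucket/output/"
)

with engine.connect() as conn:
    # Athena writes query results to the staging dir; PyAthena fetches them back.
    for row in conn.execute(text("SELECT * FROM some_table LIMIT 10")):
        print(row)
```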

engine/Dockerfile.debug

Lines changed: 6 additions & 1 deletion
```diff
@@ -105,14 +105,19 @@ COPY ./bin /splitgraph/bin
 # "Install" elasticsearch_fdw
 RUN --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
     mkdir /pg_es_fdw && \
-    pip install "elasticsearch>=7.7.0"
+    pip install "elasticsearch>=7.7.0,<8.0"
 COPY ./engine/src/postgres-elasticsearch-fdw/pg_es_fdw /pg_es_fdw/pg_es_fdw
 
 # Install the Snowflake SQLAlchemy connector
 # Use our fork that supports server-side cursors
 RUN --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
     pip install "git+https://github.com/splitgraph/snowflake-sqlalchemy.git@14e64cc0ef7374df0cecc91923ff6901b0d721b7"
 
+# Install PyAthena for Amazon Athena SQLAlchemy-based FDW, as well as pandas
+RUN --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
+    pip install "PyAthena>=2.4.1" && \
+    pip install "pandas>=1.0.0"
+
 ENV PATH "${PATH}:/splitgraph/bin"
 ENV PYTHONPATH "${PYTHONPATH}:/splitgraph:/pg_es_fdw"
 
```

splitgraph/config/keys.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -66,6 +66,7 @@
         "csv": "splitgraph.ingestion.csv.CSVDataSource",
         "snowflake": "splitgraph.ingestion.snowflake.SnowflakeDataSource",
         "dbt": "splitgraph.ingestion.dbt.data_source.DBTDataSource",
+        "athena": "splitgraph.ingestion.athena.AmazonAthenaDataSource",
     },
 }
 
@@ -125,8 +126,8 @@
     "SG_ENGINE": """Current engine name in use by Splitgraph. By default, this is the local engine.
 
 This can be overridden to make `sgr` use a different engine in cases where the `--remote` flag is not supported.""",
-    "SG_LOGLEVEL": """Logging threshold (log messages not emitted below this).
-Accepted values are CRITICAL, ERROR, WARNING, INFO and DEBUG.
+    "SG_LOGLEVEL": """Logging threshold (log messages not emitted below this).
+Accepted values are CRITICAL, ERROR, WARNING, INFO and DEBUG.
 This can also be changed by passing `--verbosity` to `sgr`, e.g. `sgr --verbosity DEBUG init`.""",
     "SG_ENGINE_PREFIX": "Prefix for Docker containers that are treated as Splitgraph engines by `sgr engine`.",
     "SG_NAMESPACE": "Namespace used by default when pushing to this engine, if not explicitly specified. Normally this is set to the user's username on the registry.",
```

splitgraph/ingestion/athena/BUILD

Lines changed: 6 additions & 0 deletions
```python
python_sources(
    skip_black=True,
    dependencies=[
        "src/py/splitgraph/splitgraph/resources/icons",
    ],
)
```
Lines changed: 134 additions & 0 deletions
````python
from typing import TYPE_CHECKING, Any, Dict, Optional

from splitgraph.core.types import Credentials, Params, TableInfo
from splitgraph.hooks.data_source.fdw import ForeignDataWrapperDataSource
from splitgraph.ingestion.common import build_commandline_help

if TYPE_CHECKING:
    from splitgraph.engine.postgres.engine import PostgresEngine


class AmazonAthenaDataSource(ForeignDataWrapperDataSource):
    credentials_schema: Dict[str, Any] = {
        "type": "object",
        "properties": {
            "aws_access_key_id": {"type": "string", "title": "AWS Access Key Id"},
            "aws_secret_access_key": {"type": "string", "title": "AWS Secret Access Key"},
        },
        "required": ["aws_access_key_id", "aws_secret_access_key"],
    }

    params_schema = {
        "type": "object",
        "properties": {
            "region_name": {
                "type": "string",
                "title": "S3 region",
                "description": "Region of the S3 bucket",
            },
            "schema_name": {
                "type": "string",
                "title": "Schema",
                "description": "Athena database name",
            },
            "s3_staging_dir": {
                "title": "S3 results folder",
                "type": "string",
                "description": "Folder for storing query output",
            },
        },
        "required": ["region_name", "schema_name", "s3_staging_dir"],
    }

    supports_mount = True
    supports_load = True
    supports_sync = False

    commandline_help = """Mount an Amazon Athena database.

This will mount an Athena schema or a table:

\b
```
$ sgr mount athena s3 -o@- <<EOF
{
    "aws_access_key_id": "ABCD",
    "aws_secret_access_key": "abcd",
    "region_name": "eu-west-3",
    "schema_name": "mydatabase",
    "s3_staging_dir": "s3://my-bucket/output/"
}
EOF
```
"""

    commandline_kwargs_help: str = (
        build_commandline_help(credentials_schema) + "\n" + build_commandline_help(params_schema)
    )

    _icon_file = "athena.svg"

    def __init__(
        self,
        engine: "PostgresEngine",
        credentials: Credentials,
        params: Params,
        tables: Optional[TableInfo] = None,
    ):
        super().__init__(engine, credentials, params, tables)

    def get_fdw_name(self):
        return "multicorn"

    @classmethod
    def get_name(cls) -> str:
        return "Amazon Athena"

    @classmethod
    def get_description(cls) -> str:
        return "Query data in Amazon S3 files and folders"

    def get_table_options(
        self, table_name: str, tables: Optional[TableInfo] = None
    ) -> Dict[str, str]:
        result = super().get_table_options(table_name, tables)
        result["tablename"] = result.get("tablename", table_name)
        return result

    def get_server_options(self):
        options: Dict[str, Optional[str]] = {
            "wrapper": "multicorn.sqlalchemyfdw.SqlAlchemyFdw",
            "db_url": self._build_db_url(),
            "cast_quals": "true",
        }

        # For some reason, in SQLAlchemy, if this is not passed
        # to the FDW params (even if it is in the DB URL), it doesn't
        # schema-qualify tables and server-side cursors don't work for scanning
        # (loads the whole table instead of scrolling through it).
        if "schema" in self.params:
            options["schema"] = self.params["schema"]

        return options

    def _build_db_url(self) -> str:
        """Construct the SQLAlchemy Amazon Athena db_url"""

        aws_access_key_id = self.credentials["aws_access_key_id"]
        aws_secret_access_key = self.credentials["aws_secret_access_key"]
        region_name = self.params["region_name"]
        schema_name = self.params["schema_name"]
        s3_staging_dir = self.params["s3_staging_dir"]

        db_url = (
            f"awsathena+rest://{aws_access_key_id}:{aws_secret_access_key}@"
            f"athena.{region_name}.amazonaws.com:443/"
            f"{schema_name}?s3_staging_dir={s3_staging_dir}"
        )

        return db_url

    def get_remote_schema_name(self) -> str:
        if "schema_name" not in self.params:
            raise ValueError("Cannot IMPORT FOREIGN SCHEMA without a schema_name!")
        return str(self.params["schema_name"])
````
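With the illustrative values from the help text above, `_build_db_url` produces a URL in PyAthena's `awsathena+rest` format. A small standalone sketch of the same string construction (example values only, not real credentials):

```python
# Standalone reconstruction of _build_db_url() using the example values
# from the commandline help above.
aws_access_key_id = "ABCD"
aws_secret_access_key = "abcd"
region_name = "eu-west-3"
schema_name = "mydatabase"
s3_staging_dir = "s3://my-bucket/output/"

db_url = (
    f"awsathena+rest://{aws_access_key_id}:{aws_secret_access_key}@"
    f"athena.{region_name}.amazonaws.com:443/"
    f"{schema_name}?s3_staging_dir={s3_staging_dir}"
)
print(db_url)
# awsathena+rest://ABCD:abcd@athena.eu-west-3.amazonaws.com:443/mydatabase?s3_staging_dir=s3://my-bucket/output/
```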

splitgraph/ingestion/csv/__init__.py

Lines changed: 15 additions & 9 deletions
````diff
@@ -241,23 +241,29 @@ class CSVDataSource(ForeignDataWrapperDataSource):
 
     If passed an URL, this will live query a CSV file on an HTTP server. If passed
     S3 access credentials, this will scan a bucket for CSV files, infer their schema
-    and make them available to query over SQL.
+    and make them available to query over SQL.
 
-    For example:
+    For example:
 
     \b
     ```
     sgr mount csv target_schema -o@- <<EOF
-    {
-        "s3_endpoint": "cdn.mycompany.com:9000",
-        "s3_access_key": "ABCDEF",
-        "s3_secret_key": "GHIJKL",
-        "s3_bucket": "data",
-        "s3_object_prefix": "csv_files/current/",
+    {
+        "s3_access_key": "ABCD",
+        "s3_secret_key": "abcd",
+        "connection":
+        {
+            "connection_type": "s3",
+            "s3_bucket": "my-bucket-name",
+            "s3_endpoint": "s3.amazonaws.com",
+            "s3_region": "eu-west-3",
+            "s3_object_prefix": "",
+            "s3_object": "iris/iris.csv"
+        },
         "autodetect_header": true,
         "autodetect_dialect": true,
         "autodetect_encoding": true
-    }
+    }
     EOF
     ```
     """
````

splitgraph/ingestion/snowflake/__init__.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -228,6 +228,8 @@ def get_server_options(self):
 
         if "batch_size" in self.params:
             options["batch_size"] = str(self.params["batch_size"])
+        else:
+            options["batch_size"] = "10000"
 
         if self.credentials["secret"]["secret_type"] == "private_key":
             options["connect_args"] = json.dumps(
```
