Support json type in athena2pandas

aws · LeonLuttenberger · May 7, 2024 · May 6, 2024 · May 6, 2024 · May 7, 2024
commit e87b4def21f9a0bc00ccd67eec57ed33556f09bf
diff --git a/awswrangler/_data_types.py b/awswrangler/_data_types.py
@@ -376,7 +376,7 @@ def athena2pandas(dtype: str, dtype_backend: str | None = None) -> str:  # noqa:
         return "decimal" if dtype_backend != "pyarrow" else "double[pyarrow]"
     if dtype in ("binary", "varbinary"):
         return "bytes" if dtype_backend != "pyarrow" else "binary[pyarrow]"
-    if any(dtype.startswith(t) for t in ["array", "row", "map", "struct"]):
+    if any(dtype.startswith(t) for t in ["array", "row", "map", "struct", "json"]):
         return "object"
     if dtype == "geometry":
         return "string"

diff --git a/tests/unit/test_athena.py b/tests/unit/test_athena.py
@@ -560,6 +560,47 @@ def test_athena_read_list(glue_database):
     assert df["col0"].iloc[0] == "[1, 2, 3]"
 
 
+def test_athena_read_json(glue_database):  # JSON-typed columns, read with ctas_approach=False
+    sql = """
+        WITH dataset AS (
+        SELECT
+            CAST('HELLO ATHENA' AS JSON) AS some_str,
+            CAST(12345 AS JSON) AS some_int,
+            CAST(MAP(ARRAY['a', 'b'], ARRAY[1,2]) AS JSON) AS some_map
+        )
+        SELECT * FROM dataset
+    """
+    df = wr.athena.read_sql_query(sql=sql, database=glue_database, ctas_approach=False)
+    assert len(df) == 1
+    assert len(df.index) == 1
+    assert len(df.columns) == 3
+    assert df["some_str"].iloc[0] == '"HELLO ATHENA"'  # string scalar keeps its JSON quotes
+    assert df["some_int"].iloc[0] == "12345"  # number is expected as text, not as an int
+    assert df["some_map"].iloc[0] == '{"a":1,"b":2}'  # map is expected as compact JSON text
+
+
+def test_athena_read_json_extract(glue_database):  # json_extract() results, ctas_approach=False
+    sql = """
+        WITH dataset AS (
+          SELECT '{"name": "Susan Smith",
+                   "org": "engineering",
+                   "projects": [{"name":"project1", "completed":false},
+                   {"name":"project2", "completed":true}]}'
+            AS myblob
+        )
+        SELECT
+          json_extract(myblob, '$.name') AS name,
+          json_extract(myblob, '$.projects') AS projects
+        FROM dataset
+    """
+    df = wr.athena.read_sql_query(sql=sql, database=glue_database, ctas_approach=False)
+    assert len(df) == 1  # the CTE yields a single row
+    assert len(df.index) == 1
+    assert len(df.columns) == 2  # name, projects
+    assert df["name"].iloc[0] == '"Susan Smith"'  # extracted string keeps its JSON quotes
+    assert df["projects"].iloc[0] == '[{"name":"project1","completed":false},{"name":"project2","completed":true}]'
+
+
 def test_sanitize_dataframe_column_names():
     with pytest.warns(UserWarning, match=r"Duplicate*"):
         test_df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})