Skip to content

Commit 923d975

Browse files
amroidcpcloud
authored and committed
FEAT: Spark tests
Author: amroid <amr@dt3.org> Closes ibis-project#1830 from amroid/spark-tests and squashes the following commits: 7430d44 [amroid] Small changes from PR comments 54f948c [amroid] Tests now pass 6bae62c [amroid] Add changes from PR review 8cb80ae [amroid] Correct nullable behavior for spark to ibis type translation, fix tests for Spark backend ebd8f08 [amroid] Break up spark to ibis type conversion into separate dt.dtype registered functions 1083348 [amroid] Merge branch 'master' of https://github.com/ibis-project/ibis into spark-tests 8c398ca [amroid] incorporated suggestions for spark-client 87a2ece [amroid] changed imports to not fail CI c1b122f [amroid] added string concat 2aa6d7a [amroid] test_temporal now mostly works with Spark backend with some known issues, added floor option in convert_unit in util.py 45e80c6 [amroid] test_param now works with Spark backend 2ea6ac8 [amroid] test_numeric now works with Spark backend, changed impala compiler implementation of _number_literal_format a8ecb0e [amroid] test_column, test_generic now work with Spark backend (test_column required no changes) f934e40 [amroid] test_client now works with Spark backend, fixed mistake in test_sql in test_client.py c96cb58 [amroid] test_client now works with Spark backend, fixed mistake in test_sql in test_client.py 3b30726 [amroid] added pyspark>=2.4.3 to requirements-3.x-dev.yml files 0560ada [amroid] test_array now works with Spark backend, fixed Spark table creation and tests bc9fff7 [amroid] test_aggregation now works with Spark backend, changed base compiler rewrite for any, notany, all, notall to use max and min instead of sum 54422ef [amroid] test_string now works with Spark backend, added xfails and xpasses to regex tests in test_string 0d7bb67 [amroid] fixed SparkUnion bug d55c64a [amroid] Merge branch 'spark-client' into spark-tests dc3608b [amroid] fixed PR changes 78f8fc1 [amroid] added Spark subclass of Backend 66067d9 [amroid] added kwargs for SparkClient and 
SparkContext bbb7244 [amroid] added spark client, compiler, some unit tests
1 parent 35d3f4d commit 923d975

18 files changed

Lines changed: 584 additions & 119 deletions

conftest.py

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
import pytest
55

6+
import ibis
7+
68
collect_ignore = ['setup.py']
79

810

@@ -18,3 +20,116 @@ def data_directory():
1820
pytest.skip('test data directory not found')
1921

2022
return datadir
23+
24+
25+
@pytest.fixture(scope='session')
def spark_client_testing(data_directory):
    """Session-scoped Spark client with the ibis test tables registered.

    Reads the CSV test datasets with explicit schemas, builds a handful of
    in-memory DataFrames exercising struct/array/map types, and exposes each
    dataset as a Spark temporary view on the returned client's session.
    """
    pytest.importorskip('pyspark')

    import pyspark.sql.types as pt

    client = ibis.spark.connect()

    df_functional_alltypes = client._session.read.csv(
        path=str(data_directory / 'functional_alltypes.csv'),
        schema=pt.StructType([
            pt.StructField('index', pt.IntegerType(), True),
            pt.StructField('Unnamed: 0', pt.IntegerType(), True),
            pt.StructField('id', pt.IntegerType(), True),
            # cast below, Spark can't read 0/1 as bool
            pt.StructField('bool_col', pt.ByteType(), True),
            pt.StructField('tinyint_col', pt.ByteType(), True),
            pt.StructField('smallint_col', pt.ShortType(), True),
            pt.StructField('int_col', pt.IntegerType(), True),
            pt.StructField('bigint_col', pt.LongType(), True),
            pt.StructField('float_col', pt.FloatType(), True),
            pt.StructField('double_col', pt.DoubleType(), True),
            pt.StructField('date_string_col', pt.StringType(), True),
            pt.StructField('string_col', pt.StringType(), True),
            pt.StructField('timestamp_col', pt.TimestampType(), True),
            pt.StructField('year', pt.IntegerType(), True),
            pt.StructField('month', pt.IntegerType(), True),
        ]),
        mode='FAILFAST',
        header=True,
    )
    df_functional_alltypes = df_functional_alltypes.withColumn(
        "bool_col", df_functional_alltypes["bool_col"].cast("boolean"))
    df_functional_alltypes.createOrReplaceTempView('functional_alltypes')

    df_batting = client._session.read.csv(
        path=str(data_directory / 'batting.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('stint', pt.IntegerType(), True),
            pt.StructField('teamID', pt.StringType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('G', pt.IntegerType(), True),
            pt.StructField('AB', pt.DoubleType(), True),
            pt.StructField('R', pt.DoubleType(), True),
            pt.StructField('H', pt.DoubleType(), True),
            pt.StructField('X2B', pt.DoubleType(), True),
            pt.StructField('X3B', pt.DoubleType(), True),
            pt.StructField('HR', pt.DoubleType(), True),
            pt.StructField('RBI', pt.DoubleType(), True),
            pt.StructField('SB', pt.DoubleType(), True),
            pt.StructField('CS', pt.DoubleType(), True),
            pt.StructField('BB', pt.DoubleType(), True),
            pt.StructField('SO', pt.DoubleType(), True),
            pt.StructField('IBB', pt.DoubleType(), True),
            pt.StructField('HBP', pt.DoubleType(), True),
            pt.StructField('SH', pt.DoubleType(), True),
            pt.StructField('SF', pt.DoubleType(), True),
            pt.StructField('GIDP', pt.DoubleType(), True),
        ]),
        header=True,
    )
    df_batting.createOrReplaceTempView('batting')

    df_awards_players = client._session.read.csv(
        path=str(data_directory / 'awards_players.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('awardID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('tie', pt.StringType(), True),
            pt.StructField('notes', pt.StringType(), True),
        ]),
        header=True,
    )
    df_awards_players.createOrReplaceTempView('awards_players')

    # Small in-memory tables for simple and nested-type tests.
    df_simple = client._session.createDataFrame([(1, 'a')], ['foo', 'bar'])
    df_simple.createOrReplaceTempView('simple')

    df_struct = client._session.createDataFrame(
        [((1, 2, 'a'),)],
        ['struct_col']
    )
    df_struct.createOrReplaceTempView('struct')

    df_nested_types = client._session.createDataFrame(
        [
            (
                [1, 2],
                [[3, 4], [5, 6]],
                {'a': [[2, 4], [3, 5]]},
            )
        ],
        [
            'list_of_ints',
            'list_of_list_of_ints',
            'map_string_list_of_list_of_ints'
        ]
    )
    df_nested_types.createOrReplaceTempView('nested_types')

    df_complicated = client._session.createDataFrame(
        [({(1, 3): [[2, 4], [3, 5]]},)],
        ['map_tuple_list_of_list_of_ints']
    )
    df_complicated.createOrReplaceTempView('complicated')

    return client

ibis/impala/compiler.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import datetime
22
import itertools
3+
import math
34
from io import StringIO
45
from operator import add, mul, sub
56
from typing import Optional
@@ -631,10 +632,18 @@ def _string_literal_format(translator, expr):
631632

632633
def _number_literal_format(translator, expr):
633634
value = expr.op().value
634-
formatted = repr(value)
635635

636-
if formatted in {'nan', 'inf', '-inf'}:
637-
return "CAST({!r} AS DOUBLE)".format(formatted)
636+
if math.isfinite(value):
637+
formatted = repr(value)
638+
else:
639+
if math.isnan(value):
640+
formatted_val = 'NaN'
641+
elif math.isinf(value):
642+
if value > 0:
643+
formatted_val = 'Infinity'
644+
else:
645+
formatted_val = '-Infinity'
646+
formatted = "CAST({!r} AS DOUBLE)".format(formatted_val)
638647

639648
return formatted
640649

ibis/impala/tests/test_exprs.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -357,10 +357,10 @@ def test_any_all(self):
357357
bool_expr = t.f == 0
358358

359359
cases = [
360-
(bool_expr.any(), 'sum(`f` = 0) > 0'),
361-
(-bool_expr.any(), 'sum(`f` = 0) = 0'),
362-
(bool_expr.all(), 'sum(`f` = 0) = count(*)'),
363-
(-bool_expr.all(), 'sum(`f` = 0) < count(*)'),
360+
(bool_expr.any(), 'max(`f` = 0)'),
361+
(-bool_expr.any(), 'max(`f` = 0) = FALSE'),
362+
(bool_expr.all(), 'min(`f` = 0)'),
363+
(-bool_expr.all(), 'min(`f` = 0) = FALSE'),
364364
]
365365
self._check_expr_cases(cases)
366366

ibis/spark/api.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from ibis.spark.client import SparkClient
2+
from ibis.spark.compiler import dialect # noqa: F401
23

34

45
def connect(**kwargs):
@@ -9,4 +10,10 @@ def connect(**kwargs):
910
"""
1011
client = SparkClient(**kwargs)
1112

13+
# Spark internally stores timestamps as UTC values, and timestamp data that
14+
# is brought in without a specified time zone is converted as local time to
15+
# UTC with microsecond resolution.
16+
# https://spark.apache.org/docs/latest/sql-pyspark-pandas-with-arrow.html#timestamp-with-time-zone-semantics
17+
client._session.conf.set('spark.sql.session.timeZone', 'UTC')
18+
1219
return client

ibis/spark/client.py

Lines changed: 68 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
1-
from collections import OrderedDict
2-
31
import pyspark as ps
42
import pyspark.sql.types as pt
53
import regex as re
4+
from pkg_resources import parse_version
65

76
import ibis.common as com
87
import ibis.expr.datatypes as dt
@@ -11,56 +10,75 @@
1110
from ibis.client import Database, Query, SQLClient
1211
from ibis.spark import compiler as comp
1312

14-
_DTYPE_TO_IBIS_TYPE = {
15-
pt.NullType : dt.null,
16-
pt.StringType : dt.string,
17-
pt.BinaryType : dt.binary,
18-
pt.BooleanType : dt.boolean,
19-
pt.DateType : dt.date,
20-
pt.TimestampType : dt.timestamp,
21-
pt.DoubleType : dt.double,
22-
pt.FloatType : dt.float,
23-
pt.ByteType : dt.int8,
24-
pt.IntegerType : dt.int32,
25-
pt.LongType : dt.int64,
26-
pt.ShortType : dt.int16,
13+
# Maps each simple (non-parametric) pyspark type class to the corresponding
# ibis dtype class; parametric types are converted by dedicated registrations.
_SPARK_DTYPE_TO_IBIS_DTYPE = {
    pt.NullType: dt.Null,
    pt.StringType: dt.String,
    pt.BinaryType: dt.Binary,
    pt.BooleanType: dt.Boolean,
    pt.DateType: dt.Date,
    pt.DoubleType: dt.Double,
    pt.FloatType: dt.Float,
    pt.ByteType: dt.Int8,
    pt.IntegerType: dt.Int32,
    pt.LongType: dt.Int64,
    pt.ShortType: dt.Int16,
}
2827

2928

3029
@dt.dtype.register(pt.DataType)
def spark_dtype_to_ibis_dtype(spark_type_obj, nullable=True):
    """Convert a simple Spark SQL type object to an ibis type object.

    Parametric Spark types (timestamp, decimal, array, map, struct) have
    their own, more specific ``dt.dtype`` registrations in this module.
    """
    klass = _SPARK_DTYPE_TO_IBIS_DTYPE.get(type(spark_type_obj))
    return klass(nullable=nullable)
34+
35+
36+
@dt.dtype.register(pt.TimestampType)
def spark_timestamp_dtype_to_ibis_dtype(spark_type_obj, nullable=True):
    """Convert a Spark timestamp type to an ibis Timestamp dtype."""
    return dt.Timestamp(nullable=nullable)
39+
40+
41+
@dt.dtype.register(pt.DecimalType)
def spark_decimal_dtype_to_ibis_dtype(spark_type_obj, nullable=True):
    """Convert a Spark decimal type, preserving its precision and scale."""
    return dt.Decimal(
        spark_type_obj.precision, spark_type_obj.scale, nullable=nullable
    )
46+
47+
48+
@dt.dtype.register(pt.ArrayType)
def spark_array_dtype_to_ibis_dtype(spark_type_obj, nullable=True):
    """Convert a Spark array type to an ibis Array dtype.

    Element nullability follows the Spark type's ``containsNull`` flag.
    """
    element_type = dt.dtype(
        spark_type_obj.elementType, nullable=spark_type_obj.containsNull
    )
    return dt.Array(element_type, nullable=nullable)
55+
56+
57+
@dt.dtype.register(pt.MapType)
def spark_map_dtype_to_ibis_dtype(spark_type_obj, nullable=True):
    """Convert a Spark map type to an ibis Map dtype.

    Value nullability follows the Spark type's ``valueContainsNull`` flag.
    """
    keys = dt.dtype(spark_type_obj.keyType)
    values = dt.dtype(
        spark_type_obj.valueType, nullable=spark_type_obj.valueContainsNull
    )
    return dt.Map(keys, values, nullable=nullable)
65+
66+
67+
@dt.dtype.register(pt.StructType)
def spark_struct_dtype_to_ibis_dtype(spark_type_obj, nullable=True):
    """Convert a Spark struct type, preserving each field's nullability."""
    field_types = [
        dt.dtype(field.dataType, nullable=field.nullable)
        for field in spark_type_obj.fields
    ]
    return dt.Struct(spark_type_obj.names, field_types, nullable=nullable)
5673

5774

5875
@sch.infer.register(ps.sql.dataframe.DataFrame)
def spark_dataframe_schema(df):
    """Infer the ibis schema of a Spark SQL `DataFrame` object."""
    # df.schema is a pt.StructType, so dt.dtype yields a dt.Struct whose
    # names/types pair up to form the table schema.
    struct = dt.dtype(df.schema)
    return sch.schema(struct.names, struct.types)
6482

6583

6684
class SparkCursor:
@@ -180,6 +198,10 @@ def current_database(self):
180198
def _get_table_schema(self, table_name):
181199
return self.get_schema(table_name)
182200

201+
def _get_schema_using_query(self, query):
202+
cur = self._execute(query, results=True)
203+
return spark_dataframe_schema(cur.query)
204+
183205
def list_tables(self, like=None, database=None):
184206
"""
185207
List tables in the current (or indicated) database. Like the SHOW
@@ -274,3 +296,7 @@ def get_schema(self, table_name, database=None):
274296
df = self._session.table(table_name)
275297

276298
return sch.infer(df)
299+
300+
@property
301+
def version(self):
302+
return parse_version(ps.__version__)

0 commit comments

Comments
 (0)