Skip to content

Commit 91ea332

Browse files
authored
refactor(formats): remove unnecessary schema argument from schema inference (ibis-project#8814)
1 parent 7d593c4 commit 91ea332

7 files changed

Lines changed: 27 additions & 67 deletions

File tree

ibis/backends/dask/__init__.py

Lines changed: 8 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -10,8 +10,6 @@
1010

1111
# import the pandas execution module to register dispatched implementations of
1212
# execute_node that the dask backend will later override
13-
import ibis.expr.operations as ops
14-
import ibis.expr.schema as sch
1513
import ibis.expr.types as ir
1614
from ibis import util
1715
from ibis.backends import NoUrl
@@ -167,11 +165,14 @@ def read_parquet(
167165
self.dictionary[table_name] = df
168166
return self.table(table_name)
169167

170-
def table(self, name: str, schema: sch.Schema | None = None):
171-
df = self.dictionary[name]
172-
schema = schema or self.schemas.get(name, None)
173-
schema = PandasData.infer_table(df.head(1), schema=schema)
174-
return ops.DatabaseTable(name, schema, self).to_expr()
168+
def get_schema(self, table_name, *, database=None):
169+
try:
170+
schema = self.schemas[table_name]
171+
except KeyError:
172+
df = self.dictionary[table_name]
173+
self.schemas[table_name] = schema = PandasData.infer_table(df.head(1))
174+
175+
return schema
175176

176177
def _convert_object(self, obj) -> dd.DataFrame:
177178
if isinstance(obj, dd.DataFrame):

ibis/backends/pandas/__init__.py

Lines changed: 5 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -165,18 +165,16 @@ def list_tables(self, like=None, database=None):
165165
return self._filter_with_like(list(self.dictionary.keys()), like)
166166

167167
def table(self, name: str, schema: sch.Schema | None = None):
168-
df = self.dictionary[name]
169-
schema = schema or self.schemas.get(name, None)
170-
schema = PandasData.infer_table(df, schema=schema)
171-
return ops.DatabaseTable(name, schema, self).to_expr()
168+
inferred_schema = self.get_schema(name)
169+
overridden_schema = {**inferred_schema, **(schema or {})}
170+
return ops.DatabaseTable(name, overridden_schema, self).to_expr()
172171

173172
def get_schema(self, table_name, *, database=None):
174-
schemas = self.schemas
175173
try:
176-
schema = schemas[table_name]
174+
schema = self.schemas[table_name]
177175
except KeyError:
178176
df = self.dictionary[table_name]
179-
schemas[table_name] = schema = PandasData.infer_table(df)
177+
self.schemas[table_name] = schema = PandasData.infer_table(df)
180178

181179
return schema
182180

ibis/expr/datatypes/core.py

Lines changed: 0 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -247,11 +247,6 @@ def from_polars(cls, polars_type, nullable=True) -> Self:
247247

248248
return PolarsType.to_ibis(polars_type, nullable=nullable)
249249

250-
@classmethod
251-
def from_dask(cls, dask_type, nullable=True) -> Self:
252-
"""Return the equivalent ibis datatype."""
253-
return cls.from_pandas(dask_type, nullable=nullable)
254-
255250
def to_numpy(self):
256251
"""Return the equivalent numpy datatype."""
257252
from ibis.formats.numpy import NumpyType
@@ -276,10 +271,6 @@ def to_polars(self):
276271

277272
return PolarsType.from_ibis(self)
278273

279-
def to_dask(self):
280-
"""Return the equivalent dask datatype."""
281-
return self.to_pandas()
282-
283274
def is_array(self) -> bool:
284275
"""Return True if an instance of an Array type."""
285276
return isinstance(self, Array)

ibis/expr/schema.py

Lines changed: 7 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -162,11 +162,6 @@ def from_polars(cls, polars_schema):
162162

163163
return PolarsSchema.to_ibis(polars_schema)
164164

165-
@classmethod
166-
def from_dask(cls, dask_schema):
167-
"""Return the equivalent ibis schema."""
168-
return cls.from_pandas(dask_schema)
169-
170165
def to_numpy(self):
171166
"""Return the equivalent numpy dtypes."""
172167
from ibis.formats.numpy import NumpySchema
@@ -191,10 +186,6 @@ def to_polars(self):
191186

192187
return PolarsSchema.from_ibis(self)
193188

194-
def to_dask(self):
195-
"""Return the equivalent dask dtypes."""
196-
return self.to_pandas()
197-
198189
def as_struct(self) -> dt.Struct:
199190
return dt.Struct(self)
200191

@@ -238,7 +229,7 @@ def schema(value: Any) -> Schema:
238229

239230

240231
@lazy_singledispatch
241-
def infer(value: Any, schema=None) -> Schema:
232+
def infer(value: Any) -> Schema:
242233
"""Infer the corresponding ibis schema for a python object."""
243234
raise InputTypeError(value)
244235

@@ -278,28 +269,25 @@ def from_pyarrow_schema(schema):
278269

279270

280271
@infer.register("pandas.DataFrame")
281-
def infer_pandas_dataframe(df, schema=None):
272+
def infer_pandas_dataframe(df):
282273
from ibis.formats.pandas import PandasData
283274

284-
return PandasData.infer_table(df, schema)
275+
return PandasData.infer_table(df)
285276

286277

287-
# TODO(kszucs): do we really need the schema kwarg?
288278
@infer.register("pyarrow.Table")
289-
def infer_pyarrow_table(table, schema=None):
279+
def infer_pyarrow_table(table):
290280
from ibis.formats.pyarrow import PyArrowSchema
291281

292-
schema = schema if schema is not None else table.schema
293-
return PyArrowSchema.to_ibis(schema)
282+
return PyArrowSchema.to_ibis(table.schema)
294283

295284

296285
@infer.register("polars.DataFrame")
297286
@infer.register("polars.LazyFrame")
298-
def infer_polars_dataframe(df, schema=None):
287+
def infer_polars_dataframe(df):
299288
from ibis.formats.polars import PolarsSchema
300289

301-
schema = schema if schema is not None else df.schema
302-
return PolarsSchema.to_ibis(schema)
290+
return PolarsSchema.to_ibis(df.schema)
303291

304292

305293
# lock the dispatchers to avoid adding new implementations

ibis/expr/tests/test_schema.py

Lines changed: 0 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -20,12 +20,6 @@
2020

2121
has_pandas = True
2222

23-
has_dask = False
24-
with contextlib.suppress(ImportError):
25-
import dask.dataframe as dd # noqa: F401
26-
27-
has_dask = True
28-
2923

3024
def test_whole_schema():
3125
schema = {
@@ -437,11 +431,6 @@ def test_schema_from_to_numpy_dtypes():
437431
@pytest.mark.parametrize(
438432
("from_method", "to_method"),
439433
[
440-
pytest.param(
441-
"from_dask",
442-
"to_dask",
443-
marks=pytest.mark.skipif(not has_dask, reason="dask not installed"),
444-
),
445434
pytest.param(
446435
"from_pandas",
447436
"to_pandas",

ibis/formats/__init__.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -168,15 +168,13 @@ def convert_column(cls, obj: C, dtype: DataType) -> C:
168168
raise NotImplementedError
169169

170170
@classmethod
171-
def convert_table(cls, obj: T, schema: Schema) -> T:
171+
def convert_table(cls, obj: T) -> T:
172172
"""Convert a format-specific table to the given ibis schema.
173173
174174
Parameters
175175
----------
176176
obj
177177
The format-specific table-like object to convert.
178-
schema
179-
The Ibis schema to convert to.
180178
181179
Returns
182180
-------

ibis/formats/pandas.py

Lines changed: 6 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -94,25 +94,20 @@ def infer_column(cls, s):
9494
return PyArrowData.infer_column(s)
9595

9696
@classmethod
97-
def infer_table(cls, df, schema=None):
98-
schema = schema if schema is not None else {}
99-
97+
def infer_table(cls, df):
10098
pairs = []
10199
for column_name in df.dtypes.keys():
102100
if not isinstance(column_name, str):
103101
raise TypeError(
104102
"Column names must be strings to use the pandas backend"
105103
)
106104

107-
if column_name in schema:
108-
ibis_dtype = schema[column_name]
105+
pandas_column = df[column_name]
106+
pandas_dtype = pandas_column.dtype
107+
if pandas_dtype == np.object_:
108+
ibis_dtype = cls.infer_column(pandas_column)
109109
else:
110-
pandas_column = df[column_name]
111-
pandas_dtype = pandas_column.dtype
112-
if pandas_dtype == np.object_:
113-
ibis_dtype = cls.infer_column(pandas_column)
114-
else:
115-
ibis_dtype = PandasType.to_ibis(pandas_dtype)
110+
ibis_dtype = PandasType.to_ibis(pandas_dtype)
116111

117112
pairs.append((column_name, ibis_dtype))
118113

0 commit comments

Comments
 (0)