Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Use limit value to minimize batches returned in to_arrow read path
  • Loading branch information
soumya-ghosh committed Aug 14, 2024
commit ad3660ba42da18a08d33c3e970dc2806a90a7d12
31 changes: 19 additions & 12 deletions pyiceberg/io/pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -1265,21 +1265,27 @@ def _task_to_table(
case_sensitive: bool,
name_mapping: Optional[NameMapping] = None,
use_large_types: bool = True,
limit: Optional[int] = None,
) -> Optional[pa.Table]:
batches = list(
_task_to_record_batches(
fs,
task,
bound_row_filter,
projected_schema,
projected_field_ids,
positional_deletes,
case_sensitive,
name_mapping,
use_large_types,
)
batches_iterator = _task_to_record_batches(
fs,
task,
bound_row_filter,
projected_schema,
projected_field_ids,
positional_deletes,
case_sensitive,
name_mapping,
use_large_types,
)

total_row_count = 0
batches = []
for batch in batches_iterator:
total_row_count += len(batch)
batches.append(batch)
if limit is not None and total_row_count >= limit:
break
if len(batches) > 0:
return pa.Table.from_batches(batches)
else:
Expand Down Expand Up @@ -1366,6 +1372,7 @@ def project_table(
case_sensitive,
table_metadata.name_mapping(),
use_large_types,
limit,
)
for task in tasks
]
Expand Down