Skip to content

Commit 34b54f2

Browse files
authored
Merge pull request #1115 from suketa/table_function_csv_test
add table adapter functionality
2 parents c145a4d + b1cf89b commit 34b54f2

File tree

11 files changed

+287
-85
lines changed

11 files changed

+287
-85
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
All notable changes to this project will be documented in this file.
44

55
# Unreleased
6+
- add `DuckDB::Connection#expose_as_table` to expose a Ruby object as a queryable DuckDB table function via a registered adapter.
7+
- add `DuckDB::TableFunction.add_table_adapter` to register a table adapter for a Ruby class.
8+
- add `DuckDB::TableFunction.table_adapter_for` to look up a registered table adapter by class.
69
- add DuckDB::TableFunction.create class method for declarative table function creation
710
- Automatically sets output.size from return value
811
- Supports positional and named parameters

Gemfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ source 'https://rubygems.org'
66
gemspec
77

88
gem 'bundler', '~> 4.0'
9+
gem 'csv'
910
gem 'minitest', '~> 6.0'
1011
gem 'rake', '~> 13.0'
1112
gem 'rake-compiler'

Gemfile.lock

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,43 +7,55 @@ PATH
77
GEM
88
remote: https://rubygems.org/
99
specs:
10+
addressable (2.8.9)
11+
public_suffix (>= 2.0.2, < 8.0)
1012
ast (2.4.3)
1113
benchmark-ips (2.14.0)
1214
bigdecimal (4.0.1)
13-
json (2.18.0)
15+
csv (3.3.5)
16+
drb (2.2.3)
17+
json (2.18.1)
18+
json-schema (6.1.0)
19+
addressable (~> 2.8)
20+
bigdecimal (>= 3.1, < 5)
1421
language_server-protocol (3.17.0.5)
1522
lint_roller (1.1.0)
23+
mcp (0.7.1)
24+
json-schema (>= 4.1)
1625
mini_portile2 (2.8.9)
17-
minitest (6.0.1)
26+
minitest (6.0.2)
27+
drb (~> 2.0)
1828
prism (~> 1.5)
19-
nokogiri (1.19.0)
29+
nokogiri (1.19.1)
2030
mini_portile2 (~> 2.8.2)
2131
racc (~> 1.4)
22-
nokogiri (1.19.0-aarch64-linux-gnu)
32+
nokogiri (1.19.1-aarch64-linux-gnu)
2333
racc (~> 1.4)
24-
nokogiri (1.19.0-arm-linux-gnu)
34+
nokogiri (1.19.1-arm-linux-gnu)
2535
racc (~> 1.4)
26-
nokogiri (1.19.0-arm64-darwin)
36+
nokogiri (1.19.1-arm64-darwin)
2737
racc (~> 1.4)
28-
nokogiri (1.19.0-x86_64-darwin)
38+
nokogiri (1.19.1-x86_64-darwin)
2939
racc (~> 1.4)
30-
nokogiri (1.19.0-x86_64-linux-gnu)
40+
nokogiri (1.19.1-x86_64-linux-gnu)
3141
racc (~> 1.4)
3242
parallel (1.27.0)
33-
parser (3.3.10.1)
43+
parser (3.3.10.2)
3444
ast (~> 2.4.1)
3545
racc
3646
prism (1.9.0)
47+
public_suffix (7.0.2)
3748
racc (1.8.1)
3849
rainbow (3.1.1)
3950
rake (13.3.1)
4051
rake-compiler (1.3.1)
4152
rake
4253
regexp_parser (2.11.3)
43-
rubocop (1.84.0)
54+
rubocop (1.85.0)
4455
json (~> 2.3)
4556
language_server-protocol (~> 3.17.0.2)
4657
lint_roller (~> 1.1.0)
58+
mcp (~> 0.6)
4759
parallel (~> 1.10)
4860
parser (>= 3.3.0.2)
4961
rainbow (>= 2.2.2, < 4.0)
@@ -54,7 +66,7 @@ GEM
5466
rubocop-ast (1.49.0)
5567
parser (>= 3.3.7.2)
5668
prism (~> 1.7)
57-
rubocop-minitest (0.38.2)
69+
rubocop-minitest (0.39.1)
5870
lint_roller (~> 1.1)
5971
rubocop (>= 1.75.0, < 2.0)
6072
rubocop-ast (>= 1.38.0, < 2.0)
@@ -64,7 +76,7 @@ GEM
6476
ruby-progressbar (1.13.0)
6577
ruby_memcheck (3.0.1)
6678
nokogiri
67-
stackprof (0.2.27)
79+
stackprof (0.2.28)
6880
unicode-display_width (3.2.0)
6981
unicode-emoji (~> 4.1)
7082
unicode-emoji (4.2.0)
@@ -80,6 +92,7 @@ PLATFORMS
8092
DEPENDENCIES
8193
benchmark-ips
8294
bundler (~> 4.0)
95+
csv
8396
duckdb!
8497
minitest (~> 6.0)
8598
rake (~> 13.0)

lib/duckdb/connection.rb

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,42 @@ def register_table_function(table_function)
229229
_register_table_function(table_function)
230230
end
231231

232+
# Exposes a Ruby object as a queryable DuckDB table function via a registered adapter.
233+
#
234+
# Looks up a table adapter registered for the object's class via
235+
# +DuckDB::TableFunction.add_table_adapter+, then uses it to create and register
236+
# a table function under the given name.
237+
#
238+
# @param object [Object] the Ruby object to expose as a table (e.g. a CSV instance)
239+
# @param name [String] the SQL name of the table function
240+
# @param columns [Hash{String => DuckDB::LogicalType}, nil] optional column schema override;
241+
# if omitted, the adapter determines the columns (e.g. from headers or inference)
242+
# @raise [ArgumentError] if no adapter is registered for the object's class
243+
# @raise [DuckDB::Error] if threads setting is not 1
244+
# @return [void]
245+
#
246+
# @example Expose a CSV as a table
247+
# require 'csv'
248+
# con.execute('SET threads=1')
249+
# DuckDB::TableFunction.add_table_adapter(CSV, CSVTableAdapter.new)
250+
# csv = CSV.new(File.read('data.csv'), headers: true)
251+
# con.expose_as_table(csv, 'csv_table')
252+
# con.query('SELECT * FROM csv_table()').to_a
253+
#
254+
# @example With explicit column types
255+
# con.expose_as_table(csv, 'csv_table', columns: {
256+
# 'id' => DuckDB::LogicalType::BIGINT,
257+
# 'name' => DuckDB::LogicalType::VARCHAR
258+
# })
259+
#
260+
def expose_as_table(object, name, columns: nil)
261+
adapter = TableFunction.table_adapter_for(object.class)
262+
raise ArgumentError, "No table adapter registered for #{object.class}" if adapter.nil?
263+
264+
tf = adapter.call(object, name, columns:)
265+
register_table_function(tf)
266+
end
267+
232268
private
233269

234270
def check_threads

lib/duckdb/table_function.rb

Lines changed: 136 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -40,79 +40,152 @@ module DuckDB
4040
class TableFunction
4141
# TableFunction#initialize is defined in C extension
4242

43-
#
44-
# Creates a new table function with a declarative API.
45-
#
46-
# @param name [String] The name of the table function
47-
# @param parameters [Array<LogicalType>, Hash<String, LogicalType>] Function parameters (optional)
48-
# @param columns [Hash<String, LogicalType>] Output columns (required)
49-
# @yield [func_info, output] The execute block that generates data
50-
# @yieldparam func_info [FunctionInfo] Function execution context
51-
# @yieldparam output [DataChunk] Output data chunk to fill
52-
# @yieldreturn [Integer] Number of rows generated (0 when done)
53-
# @return [TableFunction] The configured table function
54-
#
55-
# @example Simple range function
56-
# tf = TableFunction.create(
57-
# name: 'my_range',
58-
# parameters: [LogicalType::BIGINT],
59-
# columns: { 'value' => LogicalType::BIGINT }
60-
# ) do |func_info, output|
61-
# # Generate data...
62-
# 0 # Signal done
63-
# end
64-
#
65-
# @example Function that returns data
66-
# tf = TableFunction.create(
67-
# name: 'my_function',
68-
# columns: { 'value' => LogicalType::BIGINT }
69-
# ) do |func_info, output|
70-
# vec = output.get_vector(0)
71-
# # Fill vector...
72-
# 3 # Return row count
73-
# end
74-
#
75-
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
76-
def self.create(name:, columns:, parameters: nil, &)
77-
raise ArgumentError, 'name is required' unless name
78-
raise ArgumentError, 'columns are required' unless columns
79-
raise ArgumentError, 'block is required' unless block_given?
43+
@table_adapters = {}
8044

81-
tf = new
82-
tf.name = name
45+
class << self
46+
#
47+
# Creates a new table function with a declarative API.
48+
#
49+
# @param name [String] The name of the table function
50+
# @param parameters [Array<LogicalType>, Hash<String, LogicalType>] Function parameters (optional)
51+
# @param columns [Hash<String, LogicalType>] Output columns (required)
52+
# @yield [func_info, output] The execute block that generates data
53+
# @yieldparam func_info [FunctionInfo] Function execution context
54+
# @yieldparam output [DataChunk] Output data chunk to fill
55+
# @yieldreturn [Integer] Number of rows generated (0 when done)
56+
# @return [TableFunction] The configured table function
57+
#
58+
# @example Simple range function
59+
# tf = TableFunction.create(
60+
# name: 'my_range',
61+
# parameters: [LogicalType::BIGINT],
62+
# columns: { 'value' => LogicalType::BIGINT }
63+
# ) do |func_info, output|
64+
# # Generate data...
65+
# 0 # Signal done
66+
# end
67+
#
68+
# @example Function that returns data
69+
# tf = TableFunction.create(
70+
# name: 'my_function',
71+
# columns: { 'value' => LogicalType::BIGINT }
72+
# ) do |func_info, output|
73+
# vec = output.get_vector(0)
74+
# # Fill vector...
75+
# 3 # Return row count
76+
# end
77+
#
78+
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
79+
def create(name:, columns:, parameters: nil, &)
80+
raise ArgumentError, 'name is required' unless name
81+
raise ArgumentError, 'columns are required' unless columns
82+
raise ArgumentError, 'block is required' unless block_given?
8383

84-
# Add parameters (positional or named)
85-
if parameters
86-
case parameters
87-
when Array
88-
parameters.each { |type| tf.add_parameter(type) }
89-
when Hash
90-
parameters.each { |param_name, type| tf.add_named_parameter(param_name, type) }
91-
else
92-
raise ArgumentError, 'parameters must be Array or Hash'
84+
tf = new
85+
tf.name = name
86+
87+
# Add parameters (positional or named)
88+
if parameters
89+
case parameters
90+
when Array
91+
parameters.each { |type| tf.add_parameter(type) }
92+
when Hash
93+
parameters.each { |param_name, type| tf.add_named_parameter(param_name, type) }
94+
else
95+
raise ArgumentError, 'parameters must be Array or Hash'
96+
end
9397
end
94-
end
9598

96-
# Set bind callback to add result columns
97-
tf.bind do |bind_info|
98-
columns.each do |col_name, col_type|
99-
bind_info.add_result_column(col_name, col_type)
99+
# Set bind callback to add result columns
100+
tf.bind do |bind_info|
101+
columns.each do |col_name, col_type|
102+
bind_info.add_result_column(col_name, col_type)
103+
end
104+
end
105+
106+
# Set init callback (required by DuckDB)
107+
tf.init do |_init_info|
108+
# No-op
109+
end
110+
111+
# Set execute callback - user's block returns row count
112+
tf.execute do |func_info, output|
113+
size = yield(func_info, output)
114+
output.size = Integer(size)
100115
end
101-
end
102116

103-
# Set init callback (required by DuckDB)
104-
tf.init do |_init_info|
105-
# No-op
117+
tf
106118
end
119+
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
107120

108-
# Set execute callback - user's block returns row count
109-
tf.execute do |func_info, output|
110-
size = yield(func_info, output)
111-
output.size = Integer(size)
121+
# Registers a table adapter for a Ruby class.
122+
#
123+
# The adapter is used by +DuckDB::Connection#expose_as_table+ to convert
124+
# instances of +klass+ into a DuckDB table function. The adapter must respond
125+
# to +call(object, name, columns: nil)+ and return a +DuckDB::TableFunction+.
126+
#
127+
# == Implementing a Table Adapter
128+
#
129+
# An adapter is any object that responds to +call(object, name, columns: nil)+.
130+
# The +columns:+ keyword argument allows callers to override the column schema;
131+
# the adapter should fall back to its own schema detection when it is +nil+.
132+
#
133+
# The execute block passed to +DuckDB::TableFunction.create+ must:
134+
# - Write one batch of rows into +output+ per call
135+
# - Return the number of rows written as an +Integer+
136+
# - Return +0+ to signal that all data has been exhausted
137+
#
138+
# @example Minimal adapter for CSV objects
139+
# class CSVTableAdapter
140+
# def call(csv, name, columns: nil)
141+
# columns ||= infer_columns(csv)
142+
#
143+
# DuckDB::TableFunction.create(name:, columns:) do |_func_info, output|
144+
# row = csv.readline
145+
# if row
146+
# row.each_with_index { |cell, i| output.set_value(i, 0, cell[1]) }
147+
# 1 # wrote one row
148+
# else
149+
# csv.rewind
150+
# 0 # signal end of data
151+
# end
152+
# end
153+
# end
154+
#
155+
# private
156+
#
157+
# def infer_columns(csv)
158+
# headers = csv.first.headers
159+
# csv.rewind
160+
# headers.each_with_object({}) { |h, hsh| hsh[h] = DuckDB::LogicalType::VARCHAR }
161+
# end
162+
# end
163+
#
164+
# # Register and use:
165+
# DuckDB::TableFunction.add_table_adapter(CSV, CSVTableAdapter.new)
166+
# con.execute('SET threads=1')
167+
# con.expose_as_table(csv, 'csv_table')
168+
# con.query('SELECT * FROM csv_table()').to_a
169+
#
170+
# @param klass [Class] the Ruby class to register an adapter for (e.g. +CSV+)
171+
# @param adapter [#call] the adapter object
172+
# @return [void]
173+
#
174+
def add_table_adapter(klass, adapter)
175+
@table_adapters[klass] = adapter
112176
end
113177

114-
tf
178+
# Returns the table adapter registered for the given class, or +nil+ if none.
179+
#
180+
# @param klass [Class] the Ruby class to look up
181+
# @return [#call, nil] the registered adapter, or +nil+ if not found
182+
#
183+
# @example
184+
# adapter = DuckDB::TableFunction.table_adapter_for(CSV)
185+
#
186+
def table_adapter_for(klass)
187+
@table_adapters[klass]
188+
end
115189
end
116-
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
117190
end
118191
end

sample/issue922.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ module Polars
3838
module DataFrame
3939
class TableAdapter
4040
def call(df, name) # rubocop:disable Metrics/MethodLength, Naming/MethodParameterName
41-
columns = df.columns.each_with_object({}) { |header, hash| hash[header] = LogicalType::VARCHAR }
41+
columns = df.columns.to_h { |header| [header, LogicalType::VARCHAR] }
4242
counter = 0
4343
height = df.height
4444
width = df.columns.length

sample/issue930.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,11 @@
77

88
module DuckDB
99
class Connection
10-
def register_as_table(name, io, csv_options: {}) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
10+
def register_as_table(name, io, csv_options: {}) # rubocop:disable Metrics/MethodLength
1111
csv = CSV.new(io, **csv_options)
1212
headers = csv.first.headers
1313
csv.rewind
14-
columns = headers.each_with_object({}) { |header, hash| hash[header] = LogicalType::VARCHAR }
14+
columns = headers.to_h { |header| [header, LogicalType::VARCHAR] }
1515
tf = DuckDB::TableFunction.create(
1616
name:,
1717
columns:

0 commit comments

Comments
 (0)