From f6bc3cc6a83eefbee0d958778d72c81f13d8e779 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Fri, 26 Aug 2022 16:55:56 -0800 Subject: [PATCH 01/23] Add and fix numpydoc check PR03 in pyarrow --- python/pyarrow/_csv.pyx | 16 ++++++++-------- python/pyarrow/_dataset.pyx | 26 +++++++++++++++++++++++--- 2 files changed, 31 insertions(+), 11 deletions(-) diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx index 0ac32f1bbf26..a3da98f25286 100644 --- a/python/pyarrow/_csv.pyx +++ b/python/pyarrow/_csv.pyx @@ -104,14 +104,6 @@ cdef class ReadOptions(_Weakrefable): skip_rows : int, optional (default 0) The number of rows to skip before the column names (if any) and the CSV data. - skip_rows_after_names : int, optional (default 0) - The number of rows to skip after the column names. - This number can be larger than the number of rows in one - block, and empty rows are counted. - The order of application is as follows: - - `skip_rows` is applied (if non-zero); - - column names aread (unless `column_names` is set); - - `skip_rows_after_names` is applied (if non-zero). column_names : list, optional The column names of the target table. If empty, fall back on `autogenerate_column_names`. @@ -123,6 +115,14 @@ cdef class ReadOptions(_Weakrefable): encoding : str, optional (default 'utf8') The character encoding of the CSV data. Columns that cannot decode using this encoding can still be read as Binary. + skip_rows_after_names : int, optional (default 0) + The number of rows to skip after the column names. + This number can be larger than the number of rows in one + block, and empty rows are counted. + The order of application is as follows: + - `skip_rows` is applied (if non-zero); + - column names aread (unless `column_names` is set); + - `skip_rows_after_names` is applied (if non-zero). 
Examples -------- diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 42781ff2aae1..33b1d39edf22 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -2513,6 +2513,16 @@ cdef class Scanner(_Weakrefable): ---------- dataset : Dataset Dataset to scan. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + use_async : bool, default True + This flag is deprecated and is being kept for this release for + backwards compatibility. It will be removed in the next + release. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. columns : list of str, default None The columns to project. This can be a list of column names to include (order and duplicates will be preserved), or a dictionary @@ -2601,6 +2611,16 @@ cdef class Scanner(_Weakrefable): fragment to scan. schema : Schema, optional The schema of the fragment. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + use_async : bool, default True + This flag is deprecated and is being kept for this release for + backwards compatibility. It will be removed in the next + release. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. columns : list of str, default None The columns to project. This can be a list of column names to include (order and duplicates will be preserved), or a dictionary @@ -2634,9 +2654,6 @@ cdef class Scanner(_Weakrefable): The number of batches to read ahead in a file. This might not work for all file formats. Increasing this number will increase RAM usage but could also improve IO utilization. 
- fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. use_threads : bool, default True If enabled, then maximum parallelism will be used determined by the number of available CPU cores. @@ -2647,6 +2664,9 @@ cdef class Scanner(_Weakrefable): memory_pool : MemoryPool, default None For memory allocations, if required. If not specified, uses the default pool. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. """ cdef: shared_ptr[CScanOptions] options = make_shared[CScanOptions]() From 8f55098adf5d712b3fee22517318b3950e918084 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Mon, 29 Aug 2022 14:07:30 -0800 Subject: [PATCH 02/23] Re-arrange order of skip_rows and skip_rows_after_names --- python/pyarrow/_csv.pyx | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx index a3da98f25286..0ac32f1bbf26 100644 --- a/python/pyarrow/_csv.pyx +++ b/python/pyarrow/_csv.pyx @@ -104,6 +104,14 @@ cdef class ReadOptions(_Weakrefable): skip_rows : int, optional (default 0) The number of rows to skip before the column names (if any) and the CSV data. + skip_rows_after_names : int, optional (default 0) + The number of rows to skip after the column names. + This number can be larger than the number of rows in one + block, and empty rows are counted. + The order of application is as follows: + - `skip_rows` is applied (if non-zero); + - column names aread (unless `column_names` is set); + - `skip_rows_after_names` is applied (if non-zero). column_names : list, optional The column names of the target table. If empty, fall back on `autogenerate_column_names`. 
@@ -115,14 +123,6 @@ cdef class ReadOptions(_Weakrefable): encoding : str, optional (default 'utf8') The character encoding of the CSV data. Columns that cannot decode using this encoding can still be read as Binary. - skip_rows_after_names : int, optional (default 0) - The number of rows to skip after the column names. - This number can be larger than the number of rows in one - block, and empty rows are counted. - The order of application is as follows: - - `skip_rows` is applied (if non-zero); - - column names aread (unless `column_names` is set); - - `skip_rows_after_names` is applied (if non-zero). Examples -------- From 4a73ce8bcce8aeaf96bf577f0f685b93e9875bc6 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Wed, 14 Sep 2022 10:07:00 -0800 Subject: [PATCH 03/23] Clean up arg order in pyarrow Dataset methods --- python/pyarrow/_dataset.pyx | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 33b1d39edf22..406833b125cd 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -2513,16 +2513,6 @@ cdef class Scanner(_Weakrefable): ---------- dataset : Dataset Dataset to scan. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - use_async : bool, default True - This flag is deprecated and is being kept for this release for - backwards compatibility. It will be removed in the next - release. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. columns : list of str, default None The columns to project. This can be a list of column names to include (order and duplicates will be preserved), or a dictionary @@ -2611,16 +2601,6 @@ cdef class Scanner(_Weakrefable): fragment to scan. schema : Schema, optional The schema of the fragment. 
- use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - use_async : bool, default True - This flag is deprecated and is being kept for this release for - backwards compatibility. It will be removed in the next - release. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. columns : list of str, default None The columns to project. This can be a list of column names to include (order and duplicates will be preserved), or a dictionary @@ -2667,6 +2647,16 @@ cdef class Scanner(_Weakrefable): fragment_scan_options : FragmentScanOptions, default None Options specific to a particular scan and fragment type, which can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + use_async : bool, default True + This flag is deprecated and is being kept for this release for + backwards compatibility. It will be removed in the next + release. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. 
""" cdef: shared_ptr[CScanOptions] options = make_shared[CScanOptions]() From 61fcbf1def81f8c5e0de6222deb26757ea4305ab Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Wed, 14 Sep 2022 13:29:57 -0800 Subject: [PATCH 04/23] Fixes for YD01 --- python/pyarrow/parquet/core.py | 4 ++-- python/pyarrow/plasma.py | 9 +++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index e6148be0f896..e738236bf53a 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -541,9 +541,9 @@ def iter_batches(self, batch_size=65536, row_groups=None, columns=None, If True and file has custom pandas schema metadata, ensure that index columns are also loaded. - Returns + Yields ------- - iterator of pyarrow.RecordBatch + pyarrow.RecordBatch Contents of each batch as a record batch Examples diff --git a/python/pyarrow/plasma.py b/python/pyarrow/plasma.py index 5c2c6543418d..00342765557d 100644 --- a/python/pyarrow/plasma.py +++ b/python/pyarrow/plasma.py @@ -108,11 +108,12 @@ def start_plasma_store(plasma_store_memory, external_store : str External store to use for evicted objects. - Returns + Yields ------- - result : (str, subprocess.Popen) - A tuple of the name of the plasma store socket and the process ID of - the plasma store process. + plasma_store_name : str + Name of the plasma store socket + proc : subprocess.Popen + Process ID of the plasma store process """ warnings.warn( "Plasma is deprecated since Arrow 10.0.0. 
It will be removed in " From 5007824cdf829cc3235679f0150b0c2499922e9a Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Wed, 14 Sep 2022 13:30:15 -0800 Subject: [PATCH 05/23] Fixes for RT03 --- python/pyarrow/compute.py | 9 ++++++++- python/pyarrow/dataset.py | 2 ++ python/pyarrow/feather.py | 2 ++ python/pyarrow/ipc.py | 3 +++ python/pyarrow/parquet/core.py | 7 ++++++- 5 files changed, 21 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 265d75f6f6b0..1ee6c40f4232 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -374,6 +374,7 @@ def cast(arr, target_type=None, safe=None, options=None): Returns ------- casted : Array + The cast result as a new Array """ safe_vars_passed = (safe is not None) or (target_type is not None) @@ -452,6 +453,7 @@ def take(data, indices, *, boundscheck=True, memory_pool=None): Returns ------- result : depends on inputs + Selected values for the given indices Examples -------- @@ -490,6 +492,7 @@ def fill_null(values, fill_value): Returns ------- result : depends on inputs + Values with all null elements replaced Examples -------- @@ -534,7 +537,8 @@ def top_k_unstable(values, k, sort_keys=None, *, memory_pool=None): Returns ------- - result : Array of indices + result : Array + Indices of the top-k ordered elements Examples -------- @@ -581,6 +585,7 @@ def bottom_k_unstable(values, k, sort_keys=None, *, memory_pool=None): Returns ------- result : Array of indices + Indices of the bottom-k ordered elements Examples -------- @@ -650,6 +655,7 @@ def field(*name_or_index): Returns ------- field_expr : Expression + Reference to the given field Examples -------- @@ -691,5 +697,6 @@ def scalar(value): Returns ------- scalar_expr : Expression + An Expression representing the scalar value """ return Expression._scalar(value) diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py index adbf064a736c..de9469de445b 100644 --- a/python/pyarrow/dataset.py +++ 
b/python/pyarrow/dataset.py @@ -151,6 +151,7 @@ def partitioning(schema=None, field_names=None, flavor=None, Returns ------- Partitioning or PartitioningFactory + The partitioning scheme Examples -------- @@ -524,6 +525,7 @@ def parquet_dataset(metadata_path, schema=None, filesystem=None, format=None, Returns ------- FileSystemDataset + The dataset corresponding to the given metadata """ from pyarrow.fs import LocalFileSystem, _ensure_filesystem diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py index 54a16a2f89d3..fbd060259700 100644 --- a/python/pyarrow/feather.py +++ b/python/pyarrow/feather.py @@ -221,6 +221,7 @@ def read_feather(source, columns=None, use_threads=True, Returns ------- df : pandas.DataFrame + The contents of the Feather file as a pandas.DataFrame """ return (read_table( source, columns=columns, memory_map=memory_map, @@ -246,6 +247,7 @@ def read_table(source, columns=None, memory_map=False, use_threads=True): Returns ------- table : pyarrow.Table + The contents of the Feather file as a pyarrow.Table """ reader = _feather.FeatherReader( source, use_memory_map=memory_map, use_threads=use_threads) diff --git a/python/pyarrow/ipc.py b/python/pyarrow/ipc.py index fc724109d94c..9006637dc477 100644 --- a/python/pyarrow/ipc.py +++ b/python/pyarrow/ipc.py @@ -164,6 +164,7 @@ def new_stream(sink, schema, *, use_legacy_format=None, options=None): Returns ------- writer : RecordBatchStreamWriter + A writer for the given sink """.format(_ipc_writer_class_doc) @@ -202,6 +203,7 @@ def new_file(sink, schema, *, use_legacy_format=None, options=None): Returns ------- writer : RecordBatchFileWriter + A writer for the given sink """.format(_ipc_writer_class_doc) @@ -271,6 +273,7 @@ def deserialize_pandas(buf, *, use_threads=True): Returns ------- df : pandas.DataFrame + The buffer deserialized as pandas DataFrame """ buffer_reader = pa.BufferReader(buf) with pa.RecordBatchStreamReader(buffer_reader) as reader: diff --git 
a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index e738236bf53a..837e35fae8da 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -645,7 +645,8 @@ def scan_contents(self, columns=None, batch_size=65536): Returns ------- - num_rows : number of rows in file + num_rows : int + Number of rows in file Examples -------- @@ -1186,6 +1187,7 @@ def get_metadata(self): Returns ------- metadata : FileMetaData + The file's metadata """ with self.open() as parquet: return parquet.metadata @@ -1222,6 +1224,7 @@ def read(self, columns=None, use_threads=True, partitions=None, Returns ------- table : pyarrow.Table + The piece as a pyarrow.Table """ if self.open_file_func is not None: reader = self.open() @@ -3560,6 +3563,7 @@ def read_metadata(where, memory_map=False, decryption_properties=None, Returns ------- metadata : FileMetaData + The metadata of the Parquet file Examples -------- @@ -3609,6 +3613,7 @@ def read_schema(where, memory_map=False, decryption_properties=None, Returns ------- schema : pyarrow.Schema + The schema of the Parquet file Examples -------- From 88a078dbb48aeb29b3796360b2d00d95f31baa6b Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Thu, 15 Sep 2022 13:16:17 -0800 Subject: [PATCH 06/23] Fix PR05 --- python/pyarrow/_dataset.pyx | 6 +++--- python/pyarrow/_dataset_parquet.pyx | 18 +++++++++--------- python/pyarrow/fs.py | 2 +- python/pyarrow/io.pxi | 4 ++-- python/pyarrow/ipc.pxi | 26 +++++++++++++------------- python/pyarrow/parquet/core.py | 8 ++++---- 6 files changed, 32 insertions(+), 32 deletions(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 406833b125cd..2d39f9e37b1d 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -532,12 +532,12 @@ cdef class InMemoryDataset(Dataset): Parameters ---------- - source : The data for this dataset. 
+ source : The data for this dataset Can be a RecordBatch, Table, list of - RecordBatch/Table, iterable of RecordBatch, or a RecordBatchReader. + RecordBatch/Table, iterable of RecordBatch, or a RecordBatchReader If an iterable is provided, the schema must also be provided. schema : Schema, optional - Only required if passing an iterable as the source. + Only required if passing an iterable as the source """ cdef: diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index 744bfac6bfb5..eefeb49afcf7 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -67,11 +67,11 @@ cdef class ParquetFileFormat(FileFormat): Parameters ---------- read_options : ParquetReadOptions - Read options for the file. + Read options for the file default_fragment_scan_options : ParquetFragmentScanOptions - Scan Options for the file. + Scan Options for the file **kwargs : dict - Additional options for read option or scan option. + Additional options for read option or scan option """ cdef: @@ -236,9 +236,9 @@ class RowGroupInfo: Parameters ---------- - id : the group id. - metadata : the rowgroup metadata. - schema : schema of the rows. + id : the group id + metadata : the rowgroup metadata + schema : schema of the rows """ def __init__(self, id, metadata, schema): @@ -449,12 +449,12 @@ cdef class ParquetReadOptions(_Weakrefable): ---------- dictionary_columns : list of string, default None Names of columns which should be dictionary encoded as - they are read. - coerce_int96_timestamp_unit : str, default None. + they are read + coerce_int96_timestamp_unit : str, default None Cast timestamps that are stored in INT96 format to a particular resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' and therefore INT96 timestamps will be inferred as timestamps - in nanoseconds. 
+ in nanoseconds """ cdef public: diff --git a/python/pyarrow/fs.py b/python/pyarrow/fs.py index ab151bc5d8f6..21db243528c7 100644 --- a/python/pyarrow/fs.py +++ b/python/pyarrow/fs.py @@ -281,7 +281,7 @@ class FSSpecHandler(FileSystemHandler): Parameters ---------- - fs : FSSpec-compliant filesystem instance. + fs : FSSpec-compliant filesystem instance Examples -------- diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index f1a1b315f681..21c17b4d36dc 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -2207,7 +2207,7 @@ def input_stream(source, compression='detect', buffer_size=None): Parameters ---------- - source : str, Path, buffer, file-like object, ... + source : str, Path, buffer, or file-like object The source to open for reading. compression : str optional, default 'detect' The compression algorithm to use for on-the-fly decompression. @@ -2259,7 +2259,7 @@ def output_stream(source, compression='detect', buffer_size=None): Parameters ---------- - source : str, Path, buffer, file-like object, ... + source : str, Path, buffer, file-like object The source to open for writing. compression : str optional, default 'detect' The compression algorithm to use for on-the-fly compression. diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi index 35a07d8737b5..2a40a51e9441 100644 --- a/python/pyarrow/ipc.pxi +++ b/python/pyarrow/ipc.pxi @@ -57,11 +57,11 @@ class WriteStats(_WriteStats): Parameters ---------- - num_messages : number of messages. - num_record_batches : number of record batches. - num_dictionary_batches : number of dictionary batches. - num_dictionary_deltas : delta of dictionaries. - num_replaced_dictionaries : number of replaced dictionaries. 
+ num_messages : number of messages + num_record_batches : number of record batches + num_dictionary_batches : number of dictionary batches + num_dictionary_deltas : delta of dictionaries + num_replaced_dictionaries : number of replaced dictionaries """ __slots__ = () @@ -84,11 +84,11 @@ class ReadStats(_ReadStats): Parameters ---------- - num_messages : number of messages. - num_record_batches : number of record batches. - num_dictionary_batches : number of dictionary batches. - num_dictionary_deltas : delta of dictionaries. - num_replaced_dictionaries : number of replaced dictionaries. + num_messages : number of messages + num_record_batches : number of record batches + num_dictionary_batches : number of dictionary batches + num_dictionary_deltas : delta of dictionaries + num_replaced_dictionaries : number of replaced dictionaries """ __slots__ = () @@ -108,14 +108,14 @@ cdef class IpcReadOptions(_Weakrefable): ---------- ensure_native_endian : bool Whether to convert incoming data to platform-native endianness. - Default is true. + Default is true use_threads : bool Whether to use the global CPU thread pool to parallelize any - computational tasks like decompression. + computational tasks like decompression included_fields : list If empty (the default), return all deserialized fields. If non-empty, the values are the indices of fields to read on - the top-level schema. + the top-level schema """ __slots__ = () diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 837e35fae8da..dd9a426d2cb1 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -242,7 +242,7 @@ class ParquetFile: Coalesce and issue file reads in parallel to improve performance on high-latency filesystems (e.g. S3). If True, Arrow will use a background I/O thread pool. - coerce_int96_timestamp_unit : str, default None. 
+ coerce_int96_timestamp_unit : str, default None Cast timestamps that are stored in INT96 format to a particular resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' and therefore INT96 timestamps will be inferred as timestamps @@ -1312,7 +1312,7 @@ def get_index(self, key): Parameters ---------- - key : The value for which we want to known the index. + key : The value for which we want to know the index """ if key in self.key_indices: return self.key_indices[key] @@ -1716,7 +1716,7 @@ class ParquetDataset: use_legacy_dataset=False. If using a filesystem layer that itself performs readahead (e.g. fsspec's S3FS), disable readahead for best results. -coerce_int96_timestamp_unit : str, default None. +coerce_int96_timestamp_unit : str, default None Cast timestamps that are stored in INT96 format to a particular resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' and therefore INT96 timestamps will be inferred as timestamps in nanoseconds. @@ -2786,7 +2786,7 @@ def partitioning(self): use_legacy_dataset=False. If using a filesystem layer that itself performs readahead (e.g. fsspec's S3FS), disable readahead for best results. -coerce_int96_timestamp_unit : str, default None. +coerce_int96_timestamp_unit : str, default None Cast timestamps that are stored in INT96 format to a particular resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' and therefore INT96 timestamps will be inferred as timestamps From 70387316e386b510773ef1248ed89c89af5e4992 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Thu, 15 Sep 2022 13:16:28 -0800 Subject: [PATCH 07/23] Fix docstrings in pyarrow ipc.py --- python/pyarrow/ipc.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyarrow/ipc.py b/python/pyarrow/ipc.py index 9006637dc477..7885636dbc7c 100644 --- a/python/pyarrow/ipc.py +++ b/python/pyarrow/ipc.py @@ -181,6 +181,7 @@ def open_stream(source, *, options=None, memory_pool=None): If None, default values will be used. 
memory_pool : MemoryPool, default None If None, default memory pool is used. + Returns ------- reader : RecordBatchStreamReader @@ -223,6 +224,7 @@ def open_file(source, footer_offset=None, *, options=None, memory_pool=None): If None, default values will be used. memory_pool : MemoryPool, default None If None, default memory pool is used. + Returns ------- reader : RecordBatchFileReader From 1d1d2d4ad053f61ff76d7abc098c7faaa0ac1dce Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Thu, 15 Sep 2022 13:41:44 -0800 Subject: [PATCH 08/23] Fix PR04 --- python/pyarrow/array.pxi | 6 +++--- python/pyarrow/ipc.pxi | 2 +- python/pyarrow/table.pxi | 10 +++++----- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 5772592ead1f..b2dff6567737 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -984,7 +984,7 @@ cdef class Array(_PandasConvertible): Parameters ---------- - null_encoding + null_encoding : str, default "mask" How to handle null entries. Returns @@ -1265,7 +1265,7 @@ cdef class Array(_PandasConvertible): Parameters ---------- - fill_value + fill_value : any The replacement value for null entries. Returns @@ -1363,7 +1363,7 @@ cdef class Array(_PandasConvertible): ---------- mask : Array or array-like The boolean mask to filter the array with. - null_selection_behavior + null_selection_behavior : str, default "drop" How nulls in the mask should be handled. 
Returns diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi index 2a40a51e9441..8ad1f2425d88 100644 --- a/python/pyarrow/ipc.pxi +++ b/python/pyarrow/ipc.pxi @@ -411,7 +411,7 @@ cdef class MessageReader(_Weakrefable): Parameters ---------- - source + source : bytes/buffer-like, pyarrow.NativeFile, or file-like Python object A readable source, like an InputStream """ cdef: diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 53e841228240..bcc428a4cb29 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -366,7 +366,7 @@ cdef class ChunkedArray(_PandasConvertible): Parameters ---------- - fill_value + fill_value : any The replacement value for null entries. Returns @@ -530,7 +530,7 @@ cdef class ChunkedArray(_PandasConvertible): Parameters ---------- - null_encoding + null_encoding : str, default "mask" How to handle null entries. Returns @@ -853,7 +853,7 @@ cdef class ChunkedArray(_PandasConvertible): ---------- mask : Array or array-like The boolean mask to filter the chunked array with. - null_selection_behavior + null_selection_behavior : str, default "drop" How nulls in the mask should be handled. Returns @@ -2103,7 +2103,7 @@ cdef class RecordBatch(_PandasConvertible): ---------- mask : Array or array-like The boolean mask to filter the record batch with. - null_selection_behavior + null_selection_behavior : str, default "drop" How nulls in the mask should be handled. Returns @@ -2938,7 +2938,7 @@ cdef class Table(_PandasConvertible): ---------- mask : Array or array-like or .Expression The boolean mask or the :class:`.Expression` to filter the table with. - null_selection_behavior + null_selection_behavior : str, default "drop" How nulls in the mask should be handled, does nothing if an :class:`.Expression` is used. 
From 7c9fc851633631c51c3893d8da13f9661590c546 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Mon, 19 Sep 2022 13:02:23 -0800 Subject: [PATCH 09/23] Fixes for RT03 --- python/pyarrow/ipc.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyarrow/ipc.py b/python/pyarrow/ipc.py index 7885636dbc7c..523196e1e338 100644 --- a/python/pyarrow/ipc.py +++ b/python/pyarrow/ipc.py @@ -185,6 +185,7 @@ def open_stream(source, *, options=None, memory_pool=None): Returns ------- reader : RecordBatchStreamReader + A reader for the given source """ return RecordBatchStreamReader(source, options=options, memory_pool=memory_pool) @@ -228,6 +229,7 @@ def open_file(source, footer_offset=None, *, options=None, memory_pool=None): Returns ------- reader : RecordBatchFileReader + A reader for the given source """ return RecordBatchFileReader( source, footer_offset=footer_offset, From efaaa3f9e73ee2e049c40e461a6ca659f3309c91 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Mon, 19 Sep 2022 13:04:53 -0800 Subject: [PATCH 10/23] enable numpydoc checks GL10, PR04, PR05, RT03, and YD01 --- dev/archery/archery/cli.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 105a64c0603d..d489372fbd9c 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -331,7 +331,17 @@ def numpydoc(src, symbols, allow_rule, disallow_rule): archery numpydoc pyarrow.csv pyarrow.json pyarrow.parquet archery numpydoc pyarrow.array """ - disallow_rule = disallow_rule or {'GL01', 'SA01', 'EX01', 'ES01'} + allow_rule = allow_rule or { + 'GL10', + 'PR01', + 'PR03', + 'PR04', + 'PR05', + 'PR10', + 'RT03', + 'YD01' + } + try: results = python_numpydoc( symbols, allow_rules=_flatten_numpydoc_rules(allow_rule), From 1166e01626ebece9f2c6b6545bf7f98dbafdedc5 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Mon, 19 Sep 2022 13:08:53 -0800 Subject: [PATCH 11/23] Make archery numpydoc use default behavior --- 
docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index 23583d6b6542..93cf9563eac7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1107,7 +1107,7 @@ services: ["/arrow/ci/scripts/cpp_build.sh /arrow /build && /arrow/ci/scripts/python_build.sh /arrow /build && pip install -e /arrow/dev/archery[numpydoc] && - archery numpydoc --allow-rule PR01,PR03,PR10 && + archery numpydoc && /arrow/ci/scripts/python_test.sh /arrow"] conda-python-dask: From 6fafe445b294ab0c562c21cf82fd03d7f468fbc3 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Wed, 4 Jan 2023 12:51:00 -0900 Subject: [PATCH 12/23] Fix regression in PR03 --- python/pyarrow/_dataset.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 2d39f9e37b1d..aad7f7d6dcc1 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -2634,6 +2634,9 @@ cdef class Scanner(_Weakrefable): The number of batches to read ahead in a file. This might not work for all file formats. Increasing this number will increase RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. use_threads : bool, default True If enabled, then maximum parallelism will be used determined by the number of available CPU cores. @@ -2644,9 +2647,6 @@ cdef class Scanner(_Weakrefable): memory_pool : MemoryPool, default None For memory allocations, if required. If not specified, uses the default pool. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. 
use_threads : bool, default True If enabled, then maximum parallelism will be used determined by the number of available CPU cores. From 31b5c67b3bd9aeed9772884330da7aeba218b0f5 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Wed, 4 Jan 2023 12:58:06 -0900 Subject: [PATCH 13/23] Fix regression in RT03 --- python/pyarrow/parquet/core.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index dd9a426d2cb1..305f6b7648dd 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -162,6 +162,7 @@ def filters_to_expression(filters): Returns ------- pyarrow.compute.Expression + An Expression representing the filters """ import pyarrow.dataset as ds From b38c4f973c40fb1e6d618d1a984ebd7cb48db182 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Wed, 4 Jan 2023 13:07:57 -0900 Subject: [PATCH 14/23] Revert unrelated changes to archery/docker-compose --- dev/archery/archery/cli.py | 11 +---------- docker-compose.yml | 2 +- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index d489372fbd9c..77c4362cdb49 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -331,16 +331,7 @@ def numpydoc(src, symbols, allow_rule, disallow_rule): archery numpydoc pyarrow.csv pyarrow.json pyarrow.parquet archery numpydoc pyarrow.array """ - allow_rule = allow_rule or { - 'GL10', - 'PR01', - 'PR03', - 'PR04', - 'PR05', - 'PR10', - 'RT03', - 'YD01' - } + disallow_rule = disallow_rule or {'GL01', 'SA01', 'EX01', 'ES01'} try: results = python_numpydoc( diff --git a/docker-compose.yml b/docker-compose.yml index 93cf9563eac7..9ef8f6dbcebc 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1107,7 +1107,7 @@ services: ["/arrow/ci/scripts/cpp_build.sh /arrow /build && /arrow/ci/scripts/python_build.sh /arrow /build && pip install -e /arrow/dev/archery[numpydoc] && - archery numpydoc && + archery numpydoc --allow-rule 
GL10,PR01,PR03,PR05,PR10,RT03,YD01 && /arrow/ci/scripts/python_test.sh /arrow"] conda-python-dask: From 733b29975896dbfe5a3a5cf0d68af4df6fe88c07 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Wed, 4 Jan 2023 13:08:39 -0900 Subject: [PATCH 15/23] Whitespace --- dev/archery/archery/cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 77c4362cdb49..105a64c0603d 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -332,7 +332,6 @@ def numpydoc(src, symbols, allow_rule, disallow_rule): archery numpydoc pyarrow.array """ disallow_rule = disallow_rule or {'GL01', 'SA01', 'EX01', 'ES01'} - try: results = python_numpydoc( symbols, allow_rules=_flatten_numpydoc_rules(allow_rule), From 8f42248444fbf1aece52dc1b0e4d841eee766210 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Thu, 5 Jan 2023 15:27:42 -0900 Subject: [PATCH 16/23] Add PR04 to archery job --- docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index 9ef8f6dbcebc..ca42291dcddb 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1107,7 +1107,7 @@ services: ["/arrow/ci/scripts/cpp_build.sh /arrow /build && /arrow/ci/scripts/python_build.sh /arrow /build && pip install -e /arrow/dev/archery[numpydoc] && - archery numpydoc --allow-rule GL10,PR01,PR03,PR05,PR10,RT03,YD01 && + archery numpydoc --allow-rule GL10,PR01,PR03,PR04,PR05,PR10,RT03,YD01 && /arrow/ci/scripts/python_test.sh /arrow"] conda-python-dask: From ff0c382c926b4517838103f64e1455457fd901dc Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Fri, 6 Jan 2023 12:32:13 -0900 Subject: [PATCH 17/23] Address review comments From https://github.com/apache/arrow/pull/15214 --- python/pyarrow/_dataset.pyx | 4 ++-- python/pyarrow/_dataset_parquet.pyx | 13 ++++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 
aad7f7d6dcc1..5f1610c384fb 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -532,8 +532,8 @@ cdef class InMemoryDataset(Dataset): Parameters ---------- - source : The data for this dataset - Can be a RecordBatch, Table, list of + source : RecordBatch, Table, list, tuple + The data for this dataset. Can be a RecordBatch, Table, list of RecordBatch/Table, iterable of RecordBatch, or a RecordBatchReader If an iterable is provided, the schema must also be provided. schema : Schema, optional diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index eefeb49afcf7..01a3b30da5ca 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -67,9 +67,9 @@ cdef class ParquetFileFormat(FileFormat): Parameters ---------- read_options : ParquetReadOptions - Read options for the file + Read options for the file. default_fragment_scan_options : ParquetFragmentScanOptions - Scan Options for the file + Scan Options for the file. **kwargs : dict Additional options for read option or scan option """ @@ -236,9 +236,12 @@ class RowGroupInfo: Parameters ---------- - id : the group id - metadata : the rowgroup metadata - schema : schema of the rows + id : integer + The group ID. + metadata : FileMetaData + The rowgroup metadata. + schema : Schema + Schema of the rows. 
""" def __init__(self, id, metadata, schema): From f1f8a8e04d607e56c3ab1a734b13570d7af4208c Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Fri, 6 Jan 2023 12:58:05 -0900 Subject: [PATCH 18/23] Update python/pyarrow/ipc.pxi Co-authored-by: Will Jones --- python/pyarrow/ipc.pxi | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi index 8ad1f2425d88..318ebc05b1ff 100644 --- a/python/pyarrow/ipc.pxi +++ b/python/pyarrow/ipc.pxi @@ -57,11 +57,16 @@ class WriteStats(_WriteStats): Parameters ---------- - num_messages : number of messages - num_record_batches : number of record batches - num_dictionary_batches : number of dictionary batches - num_dictionary_deltas : delta of dictionaries - num_replaced_dictionaries : number of replaced dictionaries + num_messages : int + Number of messages. + num_record_batches : int + Number of record batches. + num_dictionary_batches : int + Number of dictionary batches. + num_dictionary_deltas : int + Delta of dictionaries. + num_replaced_dictionaries : int + Number of replaced dictionaries. """ __slots__ = () From 3fd1c364beca8bc9f9c4387c6c65d2e6b13eeebc Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Fri, 6 Jan 2023 12:58:13 -0900 Subject: [PATCH 19/23] Update python/pyarrow/ipc.pxi Co-authored-by: Will Jones --- python/pyarrow/ipc.pxi | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi index 318ebc05b1ff..9aa2573d80b9 100644 --- a/python/pyarrow/ipc.pxi +++ b/python/pyarrow/ipc.pxi @@ -89,11 +89,16 @@ class ReadStats(_ReadStats): Parameters ---------- - num_messages : number of messages - num_record_batches : number of record batches - num_dictionary_batches : number of dictionary batches - num_dictionary_deltas : delta of dictionaries - num_replaced_dictionaries : number of replaced dictionaries + num_messages : int + Number of messages. 
+ num_record_batches : int + Number of record batches. + num_dictionary_batches : int + Number of dictionary batches. + num_dictionary_deltas : int + Delta of dictionaries. + num_replaced_dictionaries : int + Number of replaced dictionaries. """ __slots__ = () From 3325f2424df280175478e7cfab6150f2d580986f Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Fri, 6 Jan 2023 12:58:18 -0900 Subject: [PATCH 20/23] Update python/pyarrow/ipc.pxi Co-authored-by: Will Jones --- python/pyarrow/ipc.pxi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi index 9aa2573d80b9..d5bb839781c1 100644 --- a/python/pyarrow/ipc.pxi +++ b/python/pyarrow/ipc.pxi @@ -116,7 +116,7 @@ cdef class IpcReadOptions(_Weakrefable): Parameters ---------- - ensure_native_endian : bool + ensure_native_endian : bool, default True Whether to convert incoming data to platform-native endianness. Default is true use_threads : bool From 058c19c2f2406b5bcbf85fedfbbf59f7046b9302 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Fri, 6 Jan 2023 12:58:35 -0900 Subject: [PATCH 21/23] Update python/pyarrow/ipc.pxi Co-authored-by: Will Jones --- python/pyarrow/ipc.pxi | 1 - 1 file changed, 1 deletion(-) diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi index d5bb839781c1..6e60b8b9a058 100644 --- a/python/pyarrow/ipc.pxi +++ b/python/pyarrow/ipc.pxi @@ -118,7 +118,6 @@ cdef class IpcReadOptions(_Weakrefable): ---------- ensure_native_endian : bool, default True Whether to convert incoming data to platform-native endianness. 
- Default is true use_threads : bool Whether to use the global CPU thread pool to parallelize any computational tasks like decompression From 91fda30c7a7e87fb8771e1a4fde796f8f7d91f75 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Fri, 6 Jan 2023 12:58:43 -0900 Subject: [PATCH 22/23] Update python/pyarrow/parquet/core.py Co-authored-by: Will Jones --- python/pyarrow/parquet/core.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 305f6b7648dd..2ddc6eb67b5d 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -1313,7 +1313,8 @@ def get_index(self, key): Parameters ---------- - key : The value for which we want to known the index + key : str or int + The value for which we want to know the index. """ if key in self.key_indices: return self.key_indices[key] From 4e3afcb1b2d32f8e955dd10bd89e45ab6977a6db Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Fri, 6 Jan 2023 13:00:18 -0900 Subject: [PATCH 23/23] Update python/pyarrow/parquet/core.py Co-authored-by: Will Jones --- python/pyarrow/parquet/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 2ddc6eb67b5d..88e3cf2a677c 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -1225,7 +1225,7 @@ def read(self, columns=None, use_threads=True, partitions=None, Returns ------- table : pyarrow.Table - The pierce as a pyarrow.Table + The piece as a pyarrow.Table. """ if self.open_file_func is not None: reader = self.open()