diff --git a/docker-compose.yml b/docker-compose.yml index 23583d6b6542..ca42291dcddb 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1107,7 +1107,7 @@ services: ["/arrow/ci/scripts/cpp_build.sh /arrow /build && /arrow/ci/scripts/python_build.sh /arrow /build && pip install -e /arrow/dev/archery[numpydoc] && - archery numpydoc --allow-rule PR01,PR03,PR10 && + archery numpydoc --allow-rule GL10,PR01,PR03,PR04,PR05,PR10,RT03,YD01 && /arrow/ci/scripts/python_test.sh /arrow"] conda-python-dask: diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 42781ff2aae1..5f1610c384fb 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -532,12 +532,12 @@ cdef class InMemoryDataset(Dataset): Parameters ---------- - source : The data for this dataset. - Can be a RecordBatch, Table, list of - RecordBatch/Table, iterable of RecordBatch, or a RecordBatchReader. + source : RecordBatch, Table, list, tuple + The data for this dataset. Can be a RecordBatch, Table, list of + RecordBatch/Table, iterable of RecordBatch, or a RecordBatchReader. If an iterable is provided, the schema must also be provided. schema : Schema, optional - Only required if passing an iterable as the source. + Only required if passing an iterable as the source """ cdef: @@ -2647,6 +2647,16 @@ cdef class Scanner(_Weakrefable): memory_pool : MemoryPool, default None For memory allocations, if required. If not specified, uses the default pool. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + use_async : bool, default True + This flag is deprecated and is being kept for this release for + backwards compatibility. It will be removed in the next + release. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. 
""" cdef: shared_ptr[CScanOptions] options = make_shared[CScanOptions]() diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index 744bfac6bfb5..01a3b30da5ca 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -71,7 +71,7 @@ cdef class ParquetFileFormat(FileFormat): default_fragment_scan_options : ParquetFragmentScanOptions Scan Options for the file. **kwargs : dict - Additional options for read option or scan option. + Additional options for read option or scan option """ cdef: @@ -236,9 +236,12 @@ class RowGroupInfo: Parameters ---------- - id : the group id. - metadata : the rowgroup metadata. - schema : schema of the rows. + id : integer + The group ID. + metadata : FileMetaData + The rowgroup metadata. + schema : Schema + Schema of the rows. """ def __init__(self, id, metadata, schema): @@ -449,12 +452,12 @@ cdef class ParquetReadOptions(_Weakrefable): ---------- dictionary_columns : list of string, default None Names of columns which should be dictionary encoded as - they are read. - coerce_int96_timestamp_unit : str, default None. + they are read + coerce_int96_timestamp_unit : str, default None Cast timestamps that are stored in INT96 format to a particular resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' and therefore INT96 timestamps will be inferred as timestamps - in nanoseconds. + in nanoseconds """ cdef public: diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 5772592ead1f..b2dff6567737 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -984,7 +984,7 @@ cdef class Array(_PandasConvertible): Parameters ---------- - null_encoding + null_encoding : str, default "mask" How to handle null entries. Returns @@ -1265,7 +1265,7 @@ cdef class Array(_PandasConvertible): Parameters ---------- - fill_value + fill_value : any The replacement value for null entries. 
Returns @@ -1363,7 +1363,7 @@ cdef class Array(_PandasConvertible): ---------- mask : Array or array-like The boolean mask to filter the array with. - null_selection_behavior + null_selection_behavior : str, default "drop" How nulls in the mask should be handled. Returns diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 265d75f6f6b0..1ee6c40f4232 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -374,6 +374,7 @@ def cast(arr, target_type=None, safe=None, options=None): Returns ------- casted : Array + The cast result as a new Array """ safe_vars_passed = (safe is not None) or (target_type is not None) @@ -452,6 +453,7 @@ def take(data, indices, *, boundscheck=True, memory_pool=None): Returns ------- result : depends on inputs + Selected values for the given indices Examples -------- @@ -490,6 +492,7 @@ def fill_null(values, fill_value): Returns ------- result : depends on inputs + Values with all null elements replaced Examples -------- @@ -534,7 +537,8 @@ def top_k_unstable(values, k, sort_keys=None, *, memory_pool=None): Returns ------- - result : Array of indices + result : Array + Indices of the top-k ordered elements Examples -------- @@ -581,6 +585,7 @@ def bottom_k_unstable(values, k, sort_keys=None, *, memory_pool=None): Returns ------- result : Array of indices + Indices of the bottom-k ordered elements Examples -------- @@ -650,6 +655,7 @@ def field(*name_or_index): Returns ------- field_expr : Expression + Reference to the given field Examples -------- @@ -691,5 +697,6 @@ def scalar(value): Returns ------- scalar_expr : Expression + An Expression representing the scalar value """ return Expression._scalar(value) diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py index adbf064a736c..de9469de445b 100644 --- a/python/pyarrow/dataset.py +++ b/python/pyarrow/dataset.py @@ -151,6 +151,7 @@ def partitioning(schema=None, field_names=None, flavor=None, Returns ------- Partitioning or 
PartitioningFactory + The partitioning scheme Examples -------- @@ -524,6 +525,7 @@ def parquet_dataset(metadata_path, schema=None, filesystem=None, format=None, Returns ------- FileSystemDataset + The dataset corresponding to the given metadata """ from pyarrow.fs import LocalFileSystem, _ensure_filesystem diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py index 54a16a2f89d3..fbd060259700 100644 --- a/python/pyarrow/feather.py +++ b/python/pyarrow/feather.py @@ -221,6 +221,7 @@ def read_feather(source, columns=None, use_threads=True, Returns ------- df : pandas.DataFrame + The contents of the Feather file as a pandas.DataFrame """ return (read_table( source, columns=columns, memory_map=memory_map, @@ -246,6 +247,7 @@ def read_table(source, columns=None, memory_map=False, use_threads=True): Returns ------- table : pyarrow.Table + The contents of the Feather file as a pyarrow.Table """ reader = _feather.FeatherReader( source, use_memory_map=memory_map, use_threads=use_threads) diff --git a/python/pyarrow/fs.py b/python/pyarrow/fs.py index ab151bc5d8f6..21db243528c7 100644 --- a/python/pyarrow/fs.py +++ b/python/pyarrow/fs.py @@ -281,7 +281,7 @@ class FSSpecHandler(FileSystemHandler): Parameters ---------- - fs : FSSpec-compliant filesystem instance. + fs : FSSpec-compliant filesystem instance Examples -------- diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index f1a1b315f681..21c17b4d36dc 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -2207,7 +2207,7 @@ def input_stream(source, compression='detect', buffer_size=None): Parameters ---------- - source : str, Path, buffer, file-like object, ... + source : str, Path, buffer, or file-like object The source to open for reading. compression : str optional, default 'detect' The compression algorithm to use for on-the-fly decompression. 
@@ -2259,7 +2259,7 @@ def output_stream(source, compression='detect', buffer_size=None): Parameters ---------- - source : str, Path, buffer, file-like object, ... + source : str, Path, buffer, file-like object The source to open for writing. compression : str optional, default 'detect' The compression algorithm to use for on-the-fly compression. diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi index 35a07d8737b5..6e60b8b9a058 100644 --- a/python/pyarrow/ipc.pxi +++ b/python/pyarrow/ipc.pxi @@ -57,11 +57,16 @@ class WriteStats(_WriteStats): Parameters ---------- - num_messages : number of messages. - num_record_batches : number of record batches. - num_dictionary_batches : number of dictionary batches. - num_dictionary_deltas : delta of dictionaries. - num_replaced_dictionaries : number of replaced dictionaries. + num_messages : int + Number of messages. + num_record_batches : int + Number of record batches. + num_dictionary_batches : int + Number of dictionary batches. + num_dictionary_deltas : int + Delta of dictionaries. + num_replaced_dictionaries : int + Number of replaced dictionaries. """ __slots__ = () @@ -84,11 +89,16 @@ class ReadStats(_ReadStats): Parameters ---------- - num_messages : number of messages. - num_record_batches : number of record batches. - num_dictionary_batches : number of dictionary batches. - num_dictionary_deltas : delta of dictionaries. - num_replaced_dictionaries : number of replaced dictionaries. + num_messages : int + Number of messages. + num_record_batches : int + Number of record batches. + num_dictionary_batches : int + Number of dictionary batches. + num_dictionary_deltas : int + Delta of dictionaries. + num_replaced_dictionaries : int + Number of replaced dictionaries. """ __slots__ = () @@ -106,16 +116,15 @@ cdef class IpcReadOptions(_Weakrefable): Parameters ---------- - ensure_native_endian : bool + ensure_native_endian : bool, default True Whether to convert incoming data to platform-native endianness. 
- Default is true. use_threads : bool Whether to use the global CPU thread pool to parallelize any - computational tasks like decompression. + computational tasks like decompression included_fields : list If empty (the default), return all deserialized fields. If non-empty, the values are the indices of fields to read on - the top-level schema. + the top-level schema """ __slots__ = () @@ -411,7 +420,7 @@ cdef class MessageReader(_Weakrefable): Parameters ---------- - source + source : bytes/buffer-like, pyarrow.NativeFile, or file-like Python object A readable source, like an InputStream """ cdef: diff --git a/python/pyarrow/ipc.py b/python/pyarrow/ipc.py index fc724109d94c..523196e1e338 100644 --- a/python/pyarrow/ipc.py +++ b/python/pyarrow/ipc.py @@ -164,6 +164,7 @@ def new_stream(sink, schema, *, use_legacy_format=None, options=None): Returns ------- writer : RecordBatchStreamWriter + A writer for the given sink """.format(_ipc_writer_class_doc) @@ -180,9 +181,11 @@ def open_stream(source, *, options=None, memory_pool=None): If None, default values will be used. memory_pool : MemoryPool, default None If None, default memory pool is used. + Returns ------- reader : RecordBatchStreamReader + A reader for the given source """ return RecordBatchStreamReader(source, options=options, memory_pool=memory_pool) @@ -202,6 +205,7 @@ def new_file(sink, schema, *, use_legacy_format=None, options=None): Returns ------- writer : RecordBatchFileWriter + A writer for the given sink """.format(_ipc_writer_class_doc) @@ -221,9 +225,11 @@ def open_file(source, footer_offset=None, *, options=None, memory_pool=None): If None, default values will be used. memory_pool : MemoryPool, default None If None, default memory pool is used. 
+ Returns ------- reader : RecordBatchFileReader + A reader for the given source """ return RecordBatchFileReader( source, footer_offset=footer_offset, @@ -271,6 +277,7 @@ def deserialize_pandas(buf, *, use_threads=True): Returns ------- df : pandas.DataFrame + The buffer deserialized as pandas DataFrame """ buffer_reader = pa.BufferReader(buf) with pa.RecordBatchStreamReader(buffer_reader) as reader: diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index e6148be0f896..88e3cf2a677c 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -162,6 +162,7 @@ def filters_to_expression(filters): Returns ------- pyarrow.compute.Expression + An Expression representing the filters """ import pyarrow.dataset as ds @@ -242,7 +243,7 @@ class ParquetFile: Coalesce and issue file reads in parallel to improve performance on high-latency filesystems (e.g. S3). If True, Arrow will use a background I/O thread pool. - coerce_int96_timestamp_unit : str, default None. + coerce_int96_timestamp_unit : str, default None Cast timestamps that are stored in INT96 format to a particular resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' and therefore INT96 timestamps will be inferred as timestamps @@ -541,9 +542,9 @@ def iter_batches(self, batch_size=65536, row_groups=None, columns=None, If True and file has custom pandas schema metadata, ensure that index columns are also loaded. 
- Returns + Yields ------- - iterator of pyarrow.RecordBatch + pyarrow.RecordBatch Contents of each batch as a record batch Examples @@ -645,7 +646,8 @@ def scan_contents(self, columns=None, batch_size=65536): Returns ------- - num_rows : number of rows in file + num_rows : int + Number of rows in file Examples -------- @@ -1186,6 +1188,7 @@ def get_metadata(self): Returns ------- metadata : FileMetaData + The file's metadata """ with self.open() as parquet: return parquet.metadata @@ -1222,6 +1225,7 @@ def read(self, columns=None, use_threads=True, partitions=None, Returns ------- table : pyarrow.Table + The piece as a pyarrow.Table. """ if self.open_file_func is not None: reader = self.open() @@ -1309,7 +1313,8 @@ def get_index(self, key): Parameters ---------- - key : The value for which we want to known the index. + key : str or int + The value for which we want to know the index. """ if key in self.key_indices: return self.key_indices[key] @@ -1713,7 +1718,7 @@ class ParquetDataset: use_legacy_dataset=False. If using a filesystem layer that itself performs readahead (e.g. fsspec's S3FS), disable readahead for best results. -coerce_int96_timestamp_unit : str, default None. +coerce_int96_timestamp_unit : str, default None Cast timestamps that are stored in INT96 format to a particular resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' and therefore INT96 timestamps will be inferred as timestamps in nanoseconds. @@ -2783,7 +2788,7 @@ def partitioning(self): use_legacy_dataset=False. If using a filesystem layer that itself performs readahead (e.g. fsspec's S3FS), disable readahead for best results. -coerce_int96_timestamp_unit : str, default None. +coerce_int96_timestamp_unit : str, default None Cast timestamps that are stored in INT96 format to a particular resolution (e.g. 'ms'). 
Setting to None is equivalent to 'ns' and therefore INT96 timestamps will be inferred as timestamps @@ -3560,6 +3565,7 @@ def read_metadata(where, memory_map=False, decryption_properties=None, Returns ------- metadata : FileMetaData + The metadata of the Parquet file Examples -------- @@ -3609,6 +3615,7 @@ def read_schema(where, memory_map=False, decryption_properties=None, Returns ------- schema : pyarrow.Schema + The schema of the Parquet file Examples -------- diff --git a/python/pyarrow/plasma.py b/python/pyarrow/plasma.py index 5c2c6543418d..00342765557d 100644 --- a/python/pyarrow/plasma.py +++ b/python/pyarrow/plasma.py @@ -108,11 +108,12 @@ def start_plasma_store(plasma_store_memory, external_store : str External store to use for evicted objects. - Returns + Yields ------- - result : (str, subprocess.Popen) - A tuple of the name of the plasma store socket and the process ID of - the plasma store process. + plasma_store_name : str + Name of the plasma store socket + proc : subprocess.Popen + Popen handle of the plasma store process """ warnings.warn( "Plasma is deprecated since Arrow 10.0.0. It will be removed in " diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 53e841228240..bcc428a4cb29 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -366,7 +366,7 @@ cdef class ChunkedArray(_PandasConvertible): Parameters ---------- - fill_value + fill_value : any The replacement value for null entries. Returns @@ -530,7 +530,7 @@ cdef class ChunkedArray(_PandasConvertible): Parameters ---------- - null_encoding + null_encoding : str, default "mask" How to handle null entries. Returns @@ -853,7 +853,7 @@ cdef class ChunkedArray(_PandasConvertible): ---------- mask : Array or array-like The boolean mask to filter the chunked array with. - null_selection_behavior + null_selection_behavior : str, default "drop" How nulls in the mask should be handled. 
Returns @@ -2103,7 +2103,7 @@ cdef class RecordBatch(_PandasConvertible): ---------- mask : Array or array-like The boolean mask to filter the record batch with. - null_selection_behavior + null_selection_behavior : str, default "drop" How nulls in the mask should be handled. Returns @@ -2938,7 +2938,7 @@ cdef class Table(_PandasConvertible): ---------- mask : Array or array-like or .Expression The boolean mask or the :class:`.Expression` to filter the table with. - null_selection_behavior + null_selection_behavior : str, default "drop" How nulls in the mask should be handled, does nothing if an :class:`.Expression` is used.