apache · wjones127 · Jan 6, 2023 · Aug 27, 2022 · Aug 29, 2022 · Sep 14, 2022
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -1107,7 +1107,7 @@ services:
       ["/arrow/ci/scripts/cpp_build.sh /arrow /build &&
         /arrow/ci/scripts/python_build.sh /arrow /build &&
         pip install -e /arrow/dev/archery[numpydoc] &&
-        archery numpydoc --allow-rule PR01,PR03,PR10 &&
+        archery numpydoc --allow-rule GL10,PR01,PR03,PR04,PR05,PR10,RT03,YD01 &&
         /arrow/ci/scripts/python_test.sh /arrow"]
 
   conda-python-dask:

diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
@@ -532,12 +532,12 @@ cdef class InMemoryDataset(Dataset):
 
     Parameters
     ----------
-    source : The data for this dataset.
-        Can be a RecordBatch, Table, list of
-        RecordBatch/Table, iterable of RecordBatch, or a RecordBatchReader.
+    source : RecordBatch, Table, list, tuple
+        The data for this dataset. Can be a RecordBatch, Table, list of
+        RecordBatch/Table, iterable of RecordBatch, or a RecordBatchReader.
         If an iterable is provided, the schema must also be provided.
     schema : Schema, optional
-        Only required if passing an iterable as the source.
+        Only required if passing an iterable as the source
     """
 
     cdef:
@@ -2647,6 +2647,13 @@ cdef class Scanner(_Weakrefable):
         memory_pool : MemoryPool, default None
             For memory allocations, if required. If not specified, uses the
             default pool.
+        use_threads : bool, default True
+            If enabled, then maximum parallelism will be used determined by
+            the number of available CPU cores.
+        use_async : bool, default True
+            This flag is deprecated and is being kept for this release for
+            backwards compatibility.  It will be removed in the next
+            release.
         """
         cdef:
             shared_ptr[CScanOptions] options = make_shared[CScanOptions]()

diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx
@@ -71,7 +71,7 @@ cdef class ParquetFileFormat(FileFormat):
     default_fragment_scan_options : ParquetFragmentScanOptions
         Scan Options for the file.
     **kwargs : dict
-        Additional options for read option or scan option.
+        Additional options for read option or scan option
     """
 
     cdef:
@@ -236,9 +236,12 @@ class RowGroupInfo:
 
     Parameters
     ----------
-    id : the group id.
-    metadata : the rowgroup metadata.
-    schema : schema of the rows.
+    id : int
+        The group ID.
+    metadata : FileMetaData
+        The rowgroup metadata.
+    schema : Schema
+        Schema of the rows.
     """
 
     def __init__(self, id, metadata, schema):
@@ -449,12 +452,12 @@ cdef class ParquetReadOptions(_Weakrefable):
     ----------
     dictionary_columns : list of string, default None
         Names of columns which should be dictionary encoded as
-        they are read.
-    coerce_int96_timestamp_unit : str, default None.
+        they are read
+    coerce_int96_timestamp_unit : str, default None
         Cast timestamps that are stored in INT96 format to a particular
         resolution (e.g. 'ms'). Setting to None is equivalent to 'ns'
         and therefore INT96 timestamps will be inferred as timestamps
-        in nanoseconds.
+        in nanoseconds
     """
 
     cdef public:

diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
@@ -984,7 +984,7 @@ cdef class Array(_PandasConvertible):
 
         Parameters
         ----------
-        null_encoding
+        null_encoding : str, default "mask"
             How to handle null entries.
 
         Returns
@@ -1265,7 +1265,7 @@ cdef class Array(_PandasConvertible):
 
         Parameters
         ----------
-        fill_value
+        fill_value : any
             The replacement value for null entries.
 
         Returns
@@ -1363,7 +1363,7 @@ cdef class Array(_PandasConvertible):
         ----------
         mask : Array or array-like
             The boolean mask to filter the array with.
-        null_selection_behavior
+        null_selection_behavior : str, default "drop"
             How nulls in the mask should be handled.
 
         Returns

diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py
@@ -374,6 +374,7 @@ def cast(arr, target_type=None, safe=None, options=None):
     Returns
     -------
     casted : Array
+        The cast result as a new Array
     """
     safe_vars_passed = (safe is not None) or (target_type is not None)
 
@@ -452,6 +453,7 @@ def take(data, indices, *, boundscheck=True, memory_pool=None):
     Returns
     -------
     result : depends on inputs
+        Selected values for the given indices
 
     Examples
     --------
@@ -490,6 +492,7 @@ def fill_null(values, fill_value):
     Returns
     -------
     result : depends on inputs
+        Values with all null elements replaced
 
     Examples
     --------
@@ -534,7 +537,8 @@ def top_k_unstable(values, k, sort_keys=None, *, memory_pool=None):
 
     Returns
     -------
-    result : Array of indices
+    result : Array
+        Indices of the top-k ordered elements
 
     Examples
     --------
@@ -581,6 +585,7 @@ def bottom_k_unstable(values, k, sort_keys=None, *, memory_pool=None):
     Returns
     -------
     result : Array of indices
+        Indices of the bottom-k ordered elements
 
     Examples
     --------
@@ -650,6 +655,7 @@ def field(*name_or_index):
     Returns
     -------
     field_expr : Expression
+        Reference to the given field
 
     Examples
     --------
@@ -691,5 +697,6 @@ def scalar(value):
     Returns
     -------
     scalar_expr : Expression
+        An Expression representing the scalar value
     """
     return Expression._scalar(value)
diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py
@@ -151,6 +151,7 @@ def partitioning(schema=None, field_names=None, flavor=None,
     Returns
     -------
     Partitioning or PartitioningFactory
+        The partitioning scheme
 
     Examples
     --------
@@ -524,6 +525,7 @@ def parquet_dataset(metadata_path, schema=None, filesystem=None, format=None,
     Returns
     -------
     FileSystemDataset
+        The dataset corresponding to the given metadata
     """
     from pyarrow.fs import LocalFileSystem, _ensure_filesystem
 

diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py
@@ -221,6 +221,7 @@ def read_feather(source, columns=None, use_threads=True,
     Returns
     -------
     df : pandas.DataFrame
+        The contents of the Feather file as a pandas.DataFrame
     """
     return (read_table(
         source, columns=columns, memory_map=memory_map,
@@ -246,6 +247,7 @@ def read_table(source, columns=None, memory_map=False, use_threads=True):
     Returns
     -------
     table : pyarrow.Table
+        The contents of the Feather file as a pyarrow.Table
     """
     reader = _feather.FeatherReader(
         source, use_memory_map=memory_map, use_threads=use_threads)

diff --git a/python/pyarrow/fs.py b/python/pyarrow/fs.py
@@ -281,7 +281,7 @@ class FSSpecHandler(FileSystemHandler):
 
     Parameters
     ----------
-    fs : FSSpec-compliant filesystem instance.
+    fs : FSSpec-compliant filesystem instance
 
     Examples
     --------

diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi
@@ -2207,7 +2207,7 @@ def input_stream(source, compression='detect', buffer_size=None):
 
     Parameters
     ----------
-    source : str, Path, buffer, file-like object, ...
+    source : str, Path, buffer, or file-like object
         The source to open for reading.
     compression : str optional, default 'detect'
         The compression algorithm to use for on-the-fly decompression.
@@ -2259,7 +2259,7 @@ def output_stream(source, compression='detect', buffer_size=None):
 
     Parameters
     ----------
-    source : str, Path, buffer, file-like object, ...
+    source : str, Path, buffer, or file-like object
         The source to open for writing.
     compression : str optional, default 'detect'
         The compression algorithm to use for on-the-fly compression.

diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi
@@ -57,11 +57,16 @@ class WriteStats(_WriteStats):
 
     Parameters
     ----------
-    num_messages : number of messages.
-    num_record_batches : number of record batches.
-    num_dictionary_batches : number of dictionary batches.
-    num_dictionary_deltas : delta of dictionaries.
-    num_replaced_dictionaries : number of replaced dictionaries.
+    num_messages : int
+        Number of messages.
+    num_record_batches : int
+        Number of record batches.
+    num_dictionary_batches : int
+        Number of dictionary batches.
+    num_dictionary_deltas : int
+        Delta of dictionaries.
+    num_replaced_dictionaries : int
+        Number of replaced dictionaries.
     """
     __slots__ = ()
 
@@ -84,11 +89,16 @@ class ReadStats(_ReadStats):
 
     Parameters
     ----------
-    num_messages : number of messages.
-    num_record_batches : number of record batches.
-    num_dictionary_batches : number of dictionary batches.
-    num_dictionary_deltas : delta of dictionaries.
-    num_replaced_dictionaries : number of replaced dictionaries.
+    num_messages : int
+        Number of messages.
+    num_record_batches : int
+        Number of record batches.
+    num_dictionary_batches : int
+        Number of dictionary batches.
+    num_dictionary_deltas : int
+        Delta of dictionaries.
+    num_replaced_dictionaries : int
+        Number of replaced dictionaries.
     """
     __slots__ = ()
 
@@ -106,16 +116,15 @@ cdef class IpcReadOptions(_Weakrefable):
 
     Parameters
     ----------
-    ensure_native_endian : bool
+    ensure_native_endian : bool, default True
         Whether to convert incoming data to platform-native endianness.
-        Default is true.
     use_threads : bool
         Whether to use the global CPU thread pool to parallelize any
-        computational tasks like decompression.
+        computational tasks like decompression
     included_fields : list
         If empty (the default), return all deserialized fields.
         If non-empty, the values are the indices of fields to read on
-        the top-level schema.
+        the top-level schema
     """
     __slots__ = ()
 
@@ -411,7 +420,7 @@ cdef class MessageReader(_Weakrefable):
 
         Parameters
         ----------
-        source
+        source : bytes/buffer-like, pyarrow.NativeFile, or file-like Python object
             A readable source, like an InputStream
         """
         cdef:

diff --git a/python/pyarrow/ipc.py b/python/pyarrow/ipc.py
@@ -164,6 +164,7 @@ def new_stream(sink, schema, *, use_legacy_format=None, options=None):
 Returns
 -------
 writer : RecordBatchStreamWriter
+    A writer for the given sink
 """.format(_ipc_writer_class_doc)
 
 
@@ -180,9 +181,11 @@ def open_stream(source, *, options=None, memory_pool=None):
         If None, default values will be used.
     memory_pool : MemoryPool, default None
         If None, default memory pool is used.
+
     Returns
     -------
     reader : RecordBatchStreamReader
+        A reader for the given source
     """
     return RecordBatchStreamReader(source, options=options,
                                    memory_pool=memory_pool)
@@ -202,6 +205,7 @@ def new_file(sink, schema, *, use_legacy_format=None, options=None):
 Returns
 -------
 writer : RecordBatchFileWriter
+    A writer for the given sink
 """.format(_ipc_writer_class_doc)
 
 
@@ -221,9 +225,11 @@ def open_file(source, footer_offset=None, *, options=None, memory_pool=None):
         If None, default values will be used.
     memory_pool : MemoryPool, default None
         If None, default memory pool is used.
+
     Returns
     -------
     reader : RecordBatchFileReader
+        A reader for the given source
     """
     return RecordBatchFileReader(
         source, footer_offset=footer_offset,
@@ -271,6 +277,7 @@ def deserialize_pandas(buf, *, use_threads=True):
     Returns
     -------
     df : pandas.DataFrame
+        The buffer deserialized as a pandas DataFrame
     """
     buffer_reader = pa.BufferReader(buf)
     with pa.RecordBatchStreamReader(buffer_reader) as reader: