From e60d2c5e72f88f77b4f954df882999c892bf1a49 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Sat, 24 Aug 2024 13:36:07 -0300 Subject: [PATCH 01/12] http/python: Rewrite README section about chunking --- http/get_simple/python/server/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/http/get_simple/python/server/README.md b/http/get_simple/python/server/README.md index ee45903..c0b48b0 100644 --- a/http/get_simple/python/server/README.md +++ b/http/get_simple/python/server/README.md @@ -32,4 +32,4 @@ python server.py ``` > [!NOTE] -> This example uses Python's built-in [`http.server`](https://docs.python.org/3/library/http.server.html) module. This server does not implement chunked transfer encoding automatically like more sophisticated HTTP servers do, so this example implements it manually, with each chunk consisting of one Arrow record batch. Note that in servers that implement chunked transfer encoding automatically, each chunk will generally not correspond to one Arrow record batch. +> This example uses Python's built-in [`http.server`](https://docs.python.org/3/library/http.server.html) module. This allows us to implement [chunked transfer encoding](https://en.wikipedia.org/wiki/Chunked_transfer_encoding) manually. Other servers may implement chunked transfer encoding automatically at the cost of an undesirable new layer of buffering. Arrow IPC streams already offer a natural way of chunking large amounts of tabular data. It's not a general requirement, but in this example each chunk corresponds to one Arrow record batch. 
From 36211610f44e19977342d37aa9c43af2b51aa104 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 23 Aug 2024 19:05:51 -0300 Subject: [PATCH 02/12] get_multipart/python: Add server.py and simple_client.py --- .../python/client/simple_client.py | 156 ++++++++ http/get_multipart/python/server/server.py | 338 ++++++++++++++++++ 2 files changed, 494 insertions(+) create mode 100644 http/get_multipart/python/client/simple_client.py create mode 100644 http/get_multipart/python/server/server.py diff --git a/http/get_multipart/python/client/simple_client.py b/http/get_multipart/python/client/simple_client.py new file mode 100644 index 0000000..c5bc4b2 --- /dev/null +++ b/http/get_multipart/python/client/simple_client.py @@ -0,0 +1,156 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Simple HTTP client parsing a multipart/mixed response. + +This client parses the multipart response produced by server/server.py +by using the multipart message parser from the Python email module. + +This module puts the entire message in memory and seems to spend a lot +of time looking for part delimiter and encoding/decoding the parts. 
+ +The overhead of multipart/mixed parsing is 85% on my machine and after +the ~1GB Arrow Stream message is fully in memory, it takes only 0.06% +of the total execution time to parse it. +""" + +import email +import json +import pyarrow as pa +import sys +import time +import urllib.request + +JSON_FORMAT = "application/json" +TEXT_FORMAT = "text/plain" +ARROW_STREAM_FORMAT = "application/vnd.apache.arrow.stream" + +start_time = time.time() +response_parsing_time = 0 # time to parse the multipart message +arrow_stream_parsing_time = 0 # time to parse the Arrow stream + + +def parse_multipart_message(response, boundary, buffer_size=8192): + """ + Parse a multipart/mixed HTTP response into a list of Message objects. + + Returns + ------- + list of email.message.Message containing the parts of the multipart message. + """ + global response_parsing_time + buffer_size = max(buffer_size, 8192) + buffer = bytearray(buffer_size) + + header = f'MIME-Version: 1.0\r\nContent-Type: multipart/mixed; boundary="{boundary}"\r\n\r\n' + feedparser = email.parser.BytesFeedParser() + feedparser.feed(header.encode("utf-8")) + while bytes_read := response.readinto(buffer): + start_time = time.time() + feedparser.feed(buffer[0:bytes_read]) + response_parsing_time += time.time() - start_time + start_time = time.time() + message = feedparser.close() + response_parsing_time += time.time() - start_time + assert message.is_multipart() + return message.get_payload() + + +def process_json_part(message): + assert message.get_content_type() == JSON_FORMAT + payload = part.get_payload() + print(f"-- {len(payload)} bytes of JSON data:") + try: + PREVIW_SIZE = 5 + data = json.loads(payload) + print("[") + for i in range(min(PREVIW_SIZE, len(data))): + print(f" {data[i]}") + if len(data) > PREVIW_SIZE: + print(f" ...+{len(data) - PREVIW_SIZE} entries...") + print("]") + except json.JSONDecodeError as e: + print(f"Error parsing JSON data: {e}\n", file=sys.stderr) + return data + + +def 
process_arrow_stream_message(message): + global arrow_stream_parsing_time + assert message.get_content_type() == ARROW_STREAM_FORMAT + payload = part.get_payload(decode=True) + print(f"-- {len(payload)} bytes of Arrow data:") + num_batches = 0 + num_records = 0 + start_time = time.time() + with pa.ipc.open_stream(payload) as reader: + schema = reader.schema + print(f"Schema: \n{schema}\n") + try: + while True: + batch = reader.read_next_batch() + num_batches += 1 + num_records += batch.num_rows + except StopIteration: + pass + arrow_stream_parsing_time = time.time() - start_time + print(f"Parsed {num_records} records in {num_batches} batch(es)") + + +def process_text_part(message): + assert message.get_content_type() == TEXT_FORMAT + payload = part.get_payload() + print("-- Text Message:") + print(payload, end="") + print("-- End of Text Message --") + + +response = urllib.request.urlopen("http://localhost:8008?include_footnotes") + +content_type = response.headers.get_content_type() +if content_type != "multipart/mixed": + raise ValueError(f"Expected multipart/mixed Content-Type, got {content_type}") +boundary = response.headers.get_boundary() +if boundary is None or len(boundary) == 0: + raise ValueError("No multipart boundary found in Content-Type header") + +parts = parse_multipart_message(response, boundary, buffer_size=64 * 1024) +batches = None +for part in parts: + content_type = part.get_content_type() + if content_type == JSON_FORMAT: + process_json_part(part) + elif content_type == ARROW_STREAM_FORMAT: + batches = process_arrow_stream_message(part) + elif content_type == TEXT_FORMAT: + process_text_part(part) + +end_time = time.time() +execution_time = end_time - start_time + +rel_response_parsing_time = response_parsing_time / execution_time +rel_arrow_stream_parsing_time = arrow_stream_parsing_time / execution_time +print(f"{execution_time:.3f} seconds elapsed") +print( + f"""{response_parsing_time:.3f} seconds \ +({rel_response_parsing_time * 
100:.2f}%) \ +seconds parsing multipart/mixed response""" +) +print( + f"""{arrow_stream_parsing_time:.3f} seconds \ +({rel_arrow_stream_parsing_time * 100:.2f}%) \ +seconds parsing Arrow stream""" +) diff --git a/http/get_multipart/python/server/server.py b/http/get_multipart/python/server/server.py new file mode 100644 index 0000000..3113a3f --- /dev/null +++ b/http/get_multipart/python/server/server.py @@ -0,0 +1,338 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from random import choice, randint +from http.server import BaseHTTPRequestHandler, HTTPServer +import io +import json +import secrets +import string +import time + +import pyarrow as pa + +# configuration: use chunked transfer encoding for HTTP/1.1 responses? 
+CHUNKED_ENCODING = True + + +def random_string(alphabet, length): + return "".join(choice(alphabet) for _ in range(length)) + + +def random_name(initial): + length = randint(3, 7) + return initial + random_string(string.ascii_lowercase, length) + + +def example_tickers(num_tickers): + tickers = [] + while len(tickers) < num_tickers: + length = randint(3, 4) + random_ticker = random_string(string.ascii_uppercase, length) + if random_ticker not in tickers: + tickers.append(random_ticker) + return tickers + + +def example_json_data(tickers): + json_data = [] + for ticker in tickers: + description = "" + for c in ticker: + description = " ".join(random_name(c) for c in ticker) + json_data.append( + { + "ticker": ticker, + "description": description, + } + ) + return json_data + + +the_schema = pa.schema( + [ + ("ticker", pa.utf8()), + ("price", pa.int64()), + ("volume", pa.int64()), + ] +) + + +def example_batch(tickers, length): + data = {"ticker": [], "price": [], "volume": []} + for _ in range(length): + data["ticker"].append(choice(tickers)) + data["price"].append(randint(1, 1000) * 100) + data["volume"].append(randint(1, 10000)) + + return pa.RecordBatch.from_pydict(data, the_schema) + + +def example_batches(tickers): + # these parameters are chosen to generate a response + # of ~1 GB and chunks of ~140 KB. + total_records = 42_000_000 + batch_len = 6 * 1024 + # all the batches sent are random slices of the larger base batch + base_batch = example_batch(tickers, length=8 * batch_len) + batches = [] + records = 0 + while records < total_records: + length = min(batch_len, total_records - records) + offset = randint(0, base_batch.num_rows - length - 1) + batch = base_batch.slice(offset, length) + batches.append(batch) + records += length + return batches + + +# end of example data generation + + +def random_multipart_boundary(): + """ + Generate a random boundary string for a multipart response. 
+ + Uses a cryptographically secure random number generator to generate a + random boundary string for a multipart response. The boundary string has + enough entropy to make it impossible that it will be repeated in the + response body. + + Use a new boundary string for each multipart response so that once the + secret is revealed to the client, it won't be possible to exploit it to + create a malicious response. + """ + # 28 bytes (224 bits) of entropy is enough to make a collision impossible. + # See [1] for a mathematical discussion. + # + # The 28 bytes are encoded into URL-safe characters so the string ends + # up longer than 28 characters. RFC1341 [2] recommends a maximum boundary + # length of 70 characters, so we're well within that limit. + # + # [1] https://preshing.com/20110504/hash-collision-probabilities/ + # [2] https://www.w3.org/Protocols/rfc1341/7_2_Multipart.html + return secrets.token_urlsafe(28) + + +def gen_arrow_multipart_buffers(boundary, schema, source, is_last_part=False): + """ + Generate buffers for the Arrow Stream part of a multipart response. 
+ + That is, an HTTP response started with the header: + + Content-type: multipart/mixed; boundary=the_boundary_string + + The buffers, when taken together, will form the following structure: + + --the_boundary_string + Content-Type: application/vnd.apache.arrow.stream + + + + + If is_last_part is True, the boundary string will be appended with two + hyphens at the end of the last buffer to indicate the end of the multipart + response: + + --the_boundary_string-- + """ + with io.BytesIO() as sink, pa.ipc.new_stream(sink, schema) as writer: + sink.write( + f"--{boundary}\r\n" + "Content-Type: application/vnd.apache.arrow.stream\r\n" + "\r\n".encode("utf-8") + ) + for batch in source: + writer.write_batch(batch) + sink.truncate() + with sink.getbuffer() as buffer: + yield buffer + sink.seek(0) + + writer.close() + sink.write("\r\n".encode("utf-8")) + if is_last_part: + sink.write(f"--{boundary}--\r\n".encode("utf-8")) + sink.truncate() + with sink.getbuffer() as buffer: + yield buffer + + +def gen_json_multipart_buffers(boundary, json_data, is_last_part=False): + """ + Generate buffers for the JSON part of a multipart response. + + That is, an HTTP response started with the header: + + Content-type: multipart/mixed; boundary=the_boundary_string + + The buffer will have the following structure: + + --the_boundary_string + Content-Type: application/json + + + + + If is_last_part is True, the boundary string will be appended with two + hyphens at the end of the buffer to indicate the end of the multipart + response: + + --the_boundary_string-- + + Allocation of a big string for the JSON data is avoided by appending the + JSON data directly to the same output buffer. 
+ """ + with io.BytesIO() as sink: + with io.TextIOWrapper(sink, encoding="utf-8", write_through=True) as wrapper: + wrapper.write(f"--{boundary}\r\n" "Content-Type: application/json\r\n\r\n") + json.dump(json_data, wrapper) + wrapper.write("\r\n") + if is_last_part: + wrapper.write(f"--{boundary}--\r\n") + with sink.getbuffer() as buffer: + yield buffer + + +def multipart_buffer_from_string(boundary, content_type, text, is_last_part=False): + close_delimiter = f"--{boundary}--\r\n" if is_last_part else "" + return ( + f"--{boundary}\r\n" + f"Content-Type: {content_type}\r\n\r\n" + f"{text}\r\n{close_delimiter}".encode("utf-8") + ) + + +class MyRequestHandler(BaseHTTPRequestHandler): + """ + Multipart response handler for a simple HTTP server. + + This HTTP request handler serves a multipart/mixed response containing + a JSON data part, followed by an Arrow Stream part and an optional text + footer as the last part. + + The Arrow data is randomly generated "trading data" with a schema consisting + of a ticker, price (in cents), and volume. The JSON header contains all the + tickers and their descriptions. This could be returned as an Arrow table as + well, but to illustrate the use of multiple parts in a response, it is sent + as JSON. + + To make things more... mixed, a third part is added to the response: a + plaintext footer containing footnotes about the request. This part is + optional and only included if the client requests it by sending a query + parameter `include_footnotes`. 
+ """ + + _include_footnotes = False + _start_arrow_stream_time = None + _end_arrow_stream_time = None + _number_of_arrow_data_chunks = 0 + _bytes_sent_on_arrow_stream = 0 + + def _resolve_json_data_header(self): + return the_json_data + + def _resolve_batches(self): + return pa.RecordBatchReader.from_batches(the_schema, all_batches) + + def _build_footnotes(self): + num_batches = len(all_batches) + elapsed_time = self._end_arrow_stream_time - self._start_arrow_stream_time + num_chunks = self._number_of_arrow_data_chunks + avg_chunk_size = self._bytes_sent_on_arrow_stream / num_chunks + text = ( + f"Hello Client,\n\n{num_batches} Arrow batch(es) were sent in " + f"{elapsed_time:.3f} seconds through {num_chunks} HTTP\nresponse chunks. " + f"Average size of each chunk was {avg_chunk_size:.2f} bytes.\n" + "\n--\nSincerely,\nThe Server\n" + ) + return text + + def _gen_buffers(self, boundary, json_header, schema, source): + # JSON header + yield from gen_json_multipart_buffers(boundary, json_header) + # Arrow data + is_last_part = not self._include_footnotes + self._start_arrow_stream_time = time.time() + for buffer in gen_arrow_multipart_buffers( + boundary, schema, source, is_last_part=is_last_part + ): + self._number_of_arrow_data_chunks += 1 + self._bytes_sent_on_arrow_stream += len(buffer) + yield buffer + self._end_arrow_stream_time = time.time() + # Footnotes (optional) + if self._include_footnotes: + footnotes = self._build_footnotes() + yield multipart_buffer_from_string( + boundary, "text/plain", footnotes, is_last_part=True + ) + + def do_GET(self): + ### note: always use urlparse in your applications. + self._include_footnotes = self.path.endswith("?include_footnotes") + ### in a real application the data would be resolved from a database or + ### another source like a file and error handling would be done here + ### before the 200 OK response starts being sent to the client. 
+ json_data_header = self._resolve_json_data_header() + source = self._resolve_batches() + + if self.request_version == "HTTP/1.0": + self.protocol_version = "HTTP/1.0" + chunked = False + else: + self.protocol_version = "HTTP/1.1" + chunked = CHUNKED_ENCODING + + self.send_response(200) + boundary = random_multipart_boundary() + self.send_header("Content-Type", f"multipart/mixed; boundary={boundary}") + ### set these headers if testing with a local browser-based client: + # self.send_header('Access-Control-Allow-Origin', 'http://localhost:8008') + # self.send_header('Access-Control-Allow-Methods', 'GET') + # self.send_header('Access-Control-Allow-Headers', 'Content-Type') + if chunked: + self.send_header("Transfer-Encoding", "chunked") + + self.end_headers() + + for buffer in self._gen_buffers(boundary, json_data_header, the_schema, source): + if chunked: + self.wfile.write(f"{len(buffer):X}\r\n".encode("utf-8")) + self.wfile.write(buffer) + if chunked: + self.wfile.write("\r\n".encode("utf-8")) + self.wfile.flush() + + if chunked: + self.wfile.write("0\r\n\r\n".encode("utf-8")) + self.wfile.flush() + + +print("Generating example data...") +all_tickers = example_tickers(60) +all_batches = example_batches(all_tickers) +the_json_data = example_json_data(all_tickers) + +server_address = ("localhost", 8008) +try: + httpd = HTTPServer(server_address, MyRequestHandler) + print(f"Serving on {server_address[0]}:{server_address[1]}...") + httpd.serve_forever() +except KeyboardInterrupt: + print("Shutting down server") + httpd.socket.close() From 418ec25cebf451d3e1196de70ec4a1652a24b967 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 29 Aug 2024 11:28:18 -0300 Subject: [PATCH 03/12] get_multipart/python: Explain what urlsafe characters are --- http/get_multipart/python/server/server.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/http/get_multipart/python/server/server.py b/http/get_multipart/python/server/server.py index 
3113a3f..14449e6 100644 --- a/http/get_multipart/python/server/server.py +++ b/http/get_multipart/python/server/server.py @@ -119,9 +119,9 @@ def random_multipart_boundary(): # 28 bytes (224 bits) of entropy is enough to make a collision impossible. # See [1] for a mathematical discussion. # - # The 28 bytes are encoded into URL-safe characters so the string ends - # up longer than 28 characters. RFC1341 [2] recommends a maximum boundary - # length of 70 characters, so we're well within that limit. + # The 28 bytes are encoded into URL-safe characters (alphanumeric, -, and _) + # so the string ends up longer than 28 characters. RFC1341 [2] recommends a + # maximum boundary length of 70 characters, so we're well within that limit. # # [1] https://preshing.com/20110504/hash-collision-probabilities/ # [2] https://www.w3.org/Protocols/rfc1341/7_2_Multipart.html From af2acd9c05549db74a1ffc6c9cb04c78de7bdd90 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 29 Aug 2024 11:29:21 -0300 Subject: [PATCH 04/12] get_multipart/python: Add two new READMEs --- http/get_multipart/README.md | 2 +- http/get_multipart/python/client/README.md | 42 +++++++++++++++++++ http/get_multipart/python/server/README.md | 49 ++++++++++++++++++++++ 3 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 http/get_multipart/python/client/README.md create mode 100644 http/get_multipart/python/server/README.md diff --git a/http/get_multipart/README.md b/http/get_multipart/README.md index 662f38c..35cd96e 100644 --- a/http/get_multipart/README.md +++ b/http/get_multipart/README.md @@ -19,4 +19,4 @@ # HTTP GET Arrow Data: Multipart Examples -This directory contains examples of HTTP servers/clients that send/receive a multipart response (`Content-Type: multipart/mixed`) containing JSON data (`Content-Type: application/json`) and Arrow IPC stream data (`Content-Type: application/vnd.apache.arrow.stream`). 
+This directory contains examples of HTTP servers/clients that send/receive a multipart response (`Content-Type: multipart/mixed`) containing JSON data (`Content-Type: application/json`), an Arrow IPC stream data (`Content-Type: application/vnd.apache.arrow.stream`), and (optionally) plain text data (`Content-Type: text/plain`). diff --git a/http/get_multipart/python/client/README.md b/http/get_multipart/python/client/README.md new file mode 100644 index 0000000..2dd60a7 --- /dev/null +++ b/http/get_multipart/python/client/README.md @@ -0,0 +1,42 @@ + + +# HTTP GET Arrow Data in multipart/mixed: Python Client Example + +This directory contains an example of a Python HTTP client that receives a +`multipart/mixed` response from the server. The client: +1. Sends an HTTP GET request to a server. +2. Receives an HTTP 200 response from the server, with the response body + containing a `multipart/mixed` response. +3. Parses the `multipart/mixed` response using the `email` module. [1] +4. Extracts the JSON part, parses it and prints a preview of the JSON data. +5. Extracts the Arrow stream part, reads the Arrow stream, and sums the + total number of records in the entire Arrow stream. +6. Extracts the plain text part and prints it as it is. + +To run this example, first start one of the server examples in the parent +directory, then: + +```sh +pip install pyarrow +python simple_client.py +``` + +[1] The `multipart/mixed` standard, used by HTTP, is derived from the MIME +standard used in email. diff --git a/http/get_multipart/python/server/README.md b/http/get_multipart/python/server/README.md new file mode 100644 index 0000000..fc7effa --- /dev/null +++ b/http/get_multipart/python/server/README.md @@ -0,0 +1,49 @@ + + +# HTTP GET Arrow Data in multipart/mixed: Python Client Example + +This directory contains an example of a Python HTTP server that sends a +`multipart/mixed` response to clients. The server: +1. 
Creates a list of record batches and populates it with synthesized data. +2. Listens for HTTP GET requests from clients. +3. Upon receiving a request, builds and sends an HTTP 200 `multipart/mixed` + response containing: + - A JSON part with metadata about the Arrow stream. + - An Arrow stream part with the Arrow IPC stream of record batches. + - A plain text part with a message containing timing information. This part + is optional (included if `?include_footnotes` is present in the URL). + +To run this example: + +```sh +pip install pyarrow +python server.py +``` + +> [!NOTE] +> This example uses Python's built-in +> [`http.server`](https://docs.python.org/3/library/http.server.html) module. +> This allows us to implement [chunked transfer +> encoding](https://en.wikipedia.org/wiki/Chunked_transfer_encoding) manually. +> Other servers may implement chunked transfer encoding automatically at the +> cost of an undesirable new layer of buffering. Arrow IPC streams already offer +> a natural way of chunking large amounts of tabular data. It's not a general +> requirement, but in this example each chunk corresponds to one Arrow record +> batch. From 64e4b5239bb1a5404f4b3776bdda31c8c4424be9 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 29 Aug 2024 11:33:08 -0300 Subject: [PATCH 05/12] get_multipart/python: Move module-level docs to README --- http/get_multipart/python/client/README.md | 10 ++++++++++ http/get_multipart/python/client/simple_client.py | 13 ------------- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/http/get_multipart/python/client/README.md b/http/get_multipart/python/client/README.md index 2dd60a7..7f31551 100644 --- a/http/get_multipart/python/client/README.md +++ b/http/get_multipart/python/client/README.md @@ -38,5 +38,15 @@ pip install pyarrow python simple_client.py ``` +> [!NOTE] +> This `simple_client.py` parses the multipart response using the multipart +> message parser from the Python `email` module. 
This module puts the entire +> message in memory and seems to spend a lot of time looking for part delimiter +> and encoding/decoding the parts. +> +> The overhead of `multipart/mixed` parsing is 85% on my machine and after the +> ~1GB Arrow Stream message is fully in memory, it takes only 0.06% of the total +> execution time to parse it. + [1] The `multipart/mixed` standard, used by HTTP, is derived from the MIME standard used in email. diff --git a/http/get_multipart/python/client/simple_client.py b/http/get_multipart/python/client/simple_client.py index c5bc4b2..96b1927 100644 --- a/http/get_multipart/python/client/simple_client.py +++ b/http/get_multipart/python/client/simple_client.py @@ -14,19 +14,6 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -""" -Simple HTTP client parsing a multipart/mixed response. - -This client parses the multipart response produced by server/server.py -by using the multipart message parser from the Python email module. - -This module puts the entire message in memory and seems to spend a lot -of time looking for part delimiter and encoding/decoding the parts. - -The overhead of multipart/mixed parsing is 85% on my machine and after -the ~1GB Arrow Stream message is fully in memory, it takes only 0.06% -of the total execution time to parse it. -""" import email import json From 2efad066324fb8a60bfa6623bd495acf6905c401 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 29 Aug 2024 11:35:07 -0300 Subject: [PATCH 06/12] fixup! get_multipart/python: Add two new READMEs --- http/get_multipart/python/server/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/http/get_multipart/python/server/README.md b/http/get_multipart/python/server/README.md index fc7effa..b75c0a0 100644 --- a/http/get_multipart/python/server/README.md +++ b/http/get_multipart/python/server/README.md @@ -17,7 +17,7 @@ under the License. 
--> -# HTTP GET Arrow Data in multipart/mixed: Python Client Example +# HTTP GET Arrow Data in multipart/mixed: Python Server Example This directory contains an example of a Python HTTP server that sends a `multipart/mixed` response to clients. The server: From 71c322dd196527ddc302284885269a1708672aeb Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 29 Aug 2024 12:11:01 -0300 Subject: [PATCH 07/12] Add a general boundary generation algorithm recommendation --- http/get_multipart/README.md | 40 ++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/http/get_multipart/README.md b/http/get_multipart/README.md index 35cd96e..6ec4a32 100644 --- a/http/get_multipart/README.md +++ b/http/get_multipart/README.md @@ -20,3 +20,43 @@ # HTTP GET Arrow Data: Multipart Examples This directory contains examples of HTTP servers/clients that send/receive a multipart response (`Content-Type: multipart/mixed`) containing JSON data (`Content-Type: application/json`), an Arrow IPC stream data (`Content-Type: application/vnd.apache.arrow.stream`), and (optionally) plain text data (`Content-Type: text/plain`). + +## Picking a Boundary + +The `multipart/mixed` response format uses a boundary string to separate the +parts. This string **must not appear in the content of any part** +(RFC 1341 [1]). + +We **do not recommend** checking for the boundary string in the content of the +parts as that would prevent streaming them. Which would add up to the memory +usage of the server and waste CPU time. + +### Recommended Algorithm + +For every `multipart/mixed` response produced by the server: +1. Using a CSPRNG [2], generate a byte string of enough entropy to make the + probability of collision [3] negligible (at least 160 bits = 20 bytes) [4]. +2. Encode the byte string in a way that is safe to use in HTTP headers. We + recommend using `base64url` encoding described in RFC 4648 [5]. 
+ +`base64url` encoding is a variant of `base64` encoding that uses `-` and `_` +instead of `+` and `/` respectively. It also omits padding characters (`=`). + +This algorithm can be implemented in Python using the `secrets.token_urlsafe()` +function. + +If you generate a boundary string with a generous 224 bits of entropy +(i.e. 28 bytes), the base64url encoding will produce a 38-character +string which is well below the limit defined by RFC 1341 (70 characters). + + >>> import secrets + >>> boundary = secrets.token_urlsafe(28) + >>> len(boundary) + 38 + + +[1] [RFC 1341 - Section 7.2 The Multipart Content-Type](https://www.w3.org/Protocols/rfc1341/7_2_Multipart.html) +[2] [Cryptographically Secure Pseudo-Random Number Generator](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) +[3] [Birthday Problem](https://en.wikipedia.org/wiki/Birthday_problem) +[4] [Hash Collision Probabilities](https://preshing.com/20110504/hash-collision-probabilities/) +[5] [RFC 4648 - Section 5 Base 64 Encoding with URL and Filename Safe Alphabet](https://tools.ietf.org/html/rfc4648#section-5) From f8d960dc0739c5d5774ab4ae85d82be8af0bb0ec Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 29 Aug 2024 13:23:31 -0300 Subject: [PATCH 08/12] Always specify policy --- http/get_multipart/python/client/simple_client.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/http/get_multipart/python/client/simple_client.py b/http/get_multipart/python/client/simple_client.py index 96b1927..07b18e4 100644 --- a/http/get_multipart/python/client/simple_client.py +++ b/http/get_multipart/python/client/simple_client.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
+from email import policy import email import json import pyarrow as pa @@ -44,7 +45,7 @@ def parse_multipart_message(response, boundary, buffer_size=8192): buffer = bytearray(buffer_size) header = f'MIME-Version: 1.0\r\nContent-Type: multipart/mixed; boundary="{boundary}"\r\n\r\n' - feedparser = email.parser.BytesFeedParser() + feedparser = email.parser.BytesFeedParser(policy=policy.default) feedparser.feed(header.encode("utf-8")) while bytes_read := response.readinto(buffer): start_time = time.time() From 6dae2312cd0b78e645bc08592b69d613dbda5132 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 29 Aug 2024 13:26:27 -0300 Subject: [PATCH 09/12] Use the right md syntax for footnotes --- http/get_multipart/README.md | 18 +++++++++--------- http/get_multipart/python/client/README.md | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/http/get_multipart/README.md b/http/get_multipart/README.md index 6ec4a32..8705618 100644 --- a/http/get_multipart/README.md +++ b/http/get_multipart/README.md @@ -25,7 +25,7 @@ This directory contains examples of HTTP servers/clients that send/receive a mul The `multipart/mixed` response format uses a boundary string to separate the parts. This string **must not appear in the content of any part** -(RFC 1341 [1]). +(RFC 1341[^1]). We **do not recommend** checking for the boundary string in the content of the parts as that would prevent streaming them. Which would add up to the memory @@ -34,10 +34,10 @@ usage of the server and waste CPU time. ### Recommended Algorithm For every `multipart/mixed` response produced by the server: -1. Using a CSPRNG [2], generate a byte string of enough entropy to make the - probability of collision [3] negligible (at least 160 bits = 20 bytes) [4]. +1. Using a CSPRNG[^2], generate a byte string of enough entropy to make the + probability of collision[^3] negligible (at least 160 bits = 20 bytes)[^4]. 2. 
Encode the byte string in a way that is safe to use in HTTP headers. We - recommend using `base64url` encoding described in RFC 4648 [5]. + recommend using `base64url` encoding described in RFC 4648[^5]. `base64url` encoding is a variant of `base64` encoding that uses `-` and `_` instead of `+` and `/` respectively. It also omits padding characters (`=`). @@ -55,8 +55,8 @@ string which is well below the limit defined by RFC 1341 (70 characters). 38 -[1] [RFC 1341 - Section 7.2 The Multipart Content-Type](https://www.w3.org/Protocols/rfc1341/7_2_Multipart.html) -[2] [Cryptographically Secure Pseudo-Random Number Generator](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) -[3] [Birthday Problem](https://en.wikipedia.org/wiki/Birthday_problem) -[4] [Hash Collision Probabilities](https://preshing.com/20110504/hash-collision-probabilities/) -[5] [RFC 4648 - Section 5 Base 64 Encoding with URL and Filename Safe Alphabet](https://tools.ietf.org/html/rfc4648#section-5) +[^1]: [RFC 1341 - Section 7.2 The Multipart Content-Type](https://www.w3.org/Protocols/rfc1341/7_2_Multipart.html) +[^2]: [Cryptographically Secure Pseudo-Random Number Generator](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) +[^3]: [Birthday Problem](https://en.wikipedia.org/wiki/Birthday_problem) +[^4]: [Hash Collision Probabilities](https://preshing.com/20110504/hash-collision-probabilities/) +[^5]: [RFC 4648 - Section 5 Base 64 Encoding with URL and Filename Safe Alphabet](https://tools.ietf.org/html/rfc4648#section-5) diff --git a/http/get_multipart/python/client/README.md b/http/get_multipart/python/client/README.md index 7f31551..4a926de 100644 --- a/http/get_multipart/python/client/README.md +++ b/http/get_multipart/python/client/README.md @@ -24,7 +24,7 @@ This directory contains an example of a Python HTTP client that receives a 1. Sends an HTTP GET request to a server. 2. 
Receives an HTTP 200 response from the server, with the response body containing a `multipart/mixed` response. -3. Parses the `multipart/mixed` response using the `email` module. [1] +3. Parses the `multipart/mixed` response using the `email` module.[^1] 4. Extracts the JSON part, parses it and prints a preview of the JSON data. 5. Extracts the Arrow stream part, reads the Arrow stream, and sums the total number of records in the entire Arrow stream. @@ -48,5 +48,5 @@ python simple_client.py > ~1GB Arrow Stream message is fully in memory, it takes only 0.06% of the total > execution time to parse it. -[1] The `multipart/mixed` standard, used by HTTP, is derived from the MIME +[^1]: The `multipart/mixed` standard, used by HTTP, is derived from the MIME standard used in email. From 985684d508437a9fe986042951c44bc567aa16ea Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 29 Aug 2024 13:27:03 -0300 Subject: [PATCH 10/12] Change note to warning --- http/get_multipart/python/client/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/http/get_multipart/python/client/README.md b/http/get_multipart/python/client/README.md index 4a926de..532c4b7 100644 --- a/http/get_multipart/python/client/README.md +++ b/http/get_multipart/python/client/README.md @@ -38,7 +38,7 @@ pip install pyarrow python simple_client.py ``` -> [!NOTE] +> [!WARNING] > This `simple_client.py` parses the multipart response using the multipart > message parser from the Python `email` module. 
This module puts the entire > message in memory and seems to spend a lot of time looking for part delimiter From cda31f7232edf58f7d6a2f0ae84f5d88f92b349b Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 29 Aug 2024 13:31:16 -0300 Subject: [PATCH 11/12] Fix positioning of footnote links --- http/get_multipart/README.md | 10 +++++----- http/get_multipart/python/client/README.md | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/http/get_multipart/README.md b/http/get_multipart/README.md index 8705618..9d81e4c 100644 --- a/http/get_multipart/README.md +++ b/http/get_multipart/README.md @@ -24,8 +24,8 @@ This directory contains examples of HTTP servers/clients that send/receive a mul ## Picking a Boundary The `multipart/mixed` response format uses a boundary string to separate the -parts. This string **must not appear in the content of any part** -(RFC 1341[^1]). +parts. This string **must not appear in the content of any part** according +to RFC 1341. [^1] We **do not recommend** checking for the boundary string in the content of the parts as that would prevent streaming them. Which would add up to the memory @@ -34,10 +34,10 @@ usage of the server and waste CPU time. ### Recommended Algorithm For every `multipart/mixed` response produced by the server: -1. Using a CSPRNG[^2], generate a byte string of enough entropy to make the - probability of collision[^3] negligible (at least 160 bits = 20 bytes)[^4]. +1. Using a CSPRNG,[^2] generate a byte string of enough entropy to make the + probability of collision[^3] negligible (at least 160 bits = 20 bytes).[^4] 2. Encode the byte string in a way that is safe to use in HTTP headers. We - recommend using `base64url` encoding described in RFC 4648[^5]. + recommend using `base64url` encoding described in RFC 4648.[^5] `base64url` encoding is a variant of `base64` encoding that uses `-` and `_` instead of `+` and `/` respectively. It also omits padding characters (`=`). 
diff --git a/http/get_multipart/python/client/README.md b/http/get_multipart/python/client/README.md index 532c4b7..e3959dc 100644 --- a/http/get_multipart/python/client/README.md +++ b/http/get_multipart/python/client/README.md @@ -24,7 +24,7 @@ This directory contains an example of a Python HTTP client that receives a 1. Sends an HTTP GET request to a server. 2. Receives an HTTP 200 response from the server, with the response body containing a `multipart/mixed` response. -3. Parses the `multipart/mixed` response using the `email` module.[^1] +3. Parses the `multipart/mixed` response using the `email` module. [^1] 4. Extracts the JSON part, parses it and prints a preview of the JSON data. 5. Extracts the Arrow stream part, reads the Arrow stream, and sums the total number of records in the entire Arrow stream. From 1d82be8ef0250a7f794da4446ff4cdb15f68873f Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 29 Aug 2024 13:36:08 -0300 Subject: [PATCH 12/12] fixup! Fix positioning of footnote links --- http/get_multipart/README.md | 2 +- http/get_multipart/python/client/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/http/get_multipart/README.md b/http/get_multipart/README.md index 9d81e4c..e2f40ae 100644 --- a/http/get_multipart/README.md +++ b/http/get_multipart/README.md @@ -25,7 +25,7 @@ This directory contains examples of HTTP servers/clients that send/receive a mul The `multipart/mixed` response format uses a boundary string to separate the parts. This string **must not appear in the content of any part** according -to RFC 1341. [^1] +to RFC 1341.[^1] We **do not recommend** checking for the boundary string in the content of the parts as that would prevent streaming them. 
Which would add up to the memory diff --git a/http/get_multipart/python/client/README.md b/http/get_multipart/python/client/README.md index e3959dc..532c4b7 100644 --- a/http/get_multipart/python/client/README.md +++ b/http/get_multipart/python/client/README.md @@ -24,7 +24,7 @@ This directory contains an example of a Python HTTP client that receives a 1. Sends an HTTP GET request to a server. 2. Receives an HTTP 200 response from the server, with the response body containing a `multipart/mixed` response. -3. Parses the `multipart/mixed` response using the `email` module. [^1] +3. Parses the `multipart/mixed` response using the `email` module.[^1] 4. Extracts the JSON part, parses it and prints a preview of the JSON data. 5. Extracts the Arrow stream part, reads the Arrow stream, and sums the total number of records in the entire Arrow stream.