From bd2878036328e199e89e7e32adc364c7c84be19d Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Thu, 4 Jun 2026 02:43:26 +0200 Subject: [PATCH] Validate downloaded paths stay within the destination directory in SFTPHook.retrieve_directory SFTPHook.retrieve_directory and retrieve_directory_concurrently build each local destination path by joining the local directory with a path derived from directory-entry names returned by the remote SFTP server. Because those names can contain ".." components, the recursive download could write outside the configured local destination directory. Add a containment check (_validate_within_directory) that resolves each computed local path and refuses to write when it falls outside the destination directory, applied to both the serial and concurrent retrieval paths. Generated-by: Claude Opus 4.8 (1M context) following the guidelines at https://github.com/apache/airflow/blob/main/contributing-docs/05_pull_requests.rst#gen-ai-assisted-contributions --- .../src/airflow/providers/sftp/hooks/sftp.py | 37 +++++++++++++++++-- .../sftp/tests/unit/sftp/hooks/test_sftp.py | 23 ++++++++++++ 2 files changed, 56 insertions(+), 4 deletions(-) diff --git a/providers/sftp/src/airflow/providers/sftp/hooks/sftp.py b/providers/sftp/src/airflow/providers/sftp/hooks/sftp.py index 2f31c66d3b24e..bb2723489742b 100644 --- a/providers/sftp/src/airflow/providers/sftp/hooks/sftp.py +++ b/providers/sftp/src/airflow/providers/sftp/hooks/sftp.py @@ -384,6 +384,25 @@ def delete_file(self, path: str) -> None: """ self.conn.remove(path) # type: ignore[arg-type, union-attr] + @staticmethod + def _validate_within_directory(base_dir: str, candidate: str) -> str: + """ + Ensure ``candidate`` resolves to a path inside ``base_dir``. + + Directory-entry names are returned by the remote SFTP server and may + contain ``..`` components; joining them into the local destination path + could otherwise write outside it. Containment is verified before any + local write or ``mkdir``. + """ + base_real = os.path.realpath(base_dir) + candidate_real = os.path.realpath(candidate) + if candidate_real != base_real and os.path.commonpath([base_real, candidate_real]) != base_real: + raise ValueError( + f"Refusing to write outside the destination directory: " + f"{candidate!r} resolves outside {base_dir!r}" + ) + return candidate + def retrieve_directory(self, remote_full_path: str, local_full_path: str, prefetch: bool = True) -> None: """ Transfer the remote directory to a local location. @@ -400,10 +419,14 @@ def retrieve_directory(self, remote_full_path: str, local_full_path: str, prefet Path(local_full_path).mkdir(parents=True) files, dirs, _ = self.get_tree_map(remote_full_path) for dir_path in dirs: - new_local_path = os.path.join(local_full_path, os.path.relpath(dir_path, remote_full_path)) + new_local_path = self._validate_within_directory( + local_full_path, os.path.join(local_full_path, os.path.relpath(dir_path, remote_full_path)) + ) Path(new_local_path).mkdir(parents=True, exist_ok=True) for file_path in files: - new_local_path = os.path.join(local_full_path, os.path.relpath(file_path, remote_full_path)) + new_local_path = self._validate_within_directory( + local_full_path, os.path.join(local_full_path, os.path.relpath(file_path, remote_full_path)) + ) self.retrieve_file(file_path, new_local_path, prefetch) def retrieve_directory_concurrently( @@ -438,12 +461,18 @@ def retrieve_file_chunk( new_local_file_paths, remote_file_paths = [], [] files, dirs, _ = self.get_tree_map(remote_full_path) for dir_path in dirs: - new_local_path = os.path.join(local_full_path, os.path.relpath(dir_path, remote_full_path)) + new_local_path = self._validate_within_directory( + local_full_path, + os.path.join(local_full_path, os.path.relpath(dir_path, remote_full_path)), + ) Path(new_local_path).mkdir(parents=True, exist_ok=True) for file in files: remote_file_paths.append(file) new_local_file_paths.append( - os.path.join(local_full_path, os.path.relpath(file, remote_full_path)) + self._validate_within_directory( + local_full_path, + os.path.join(local_full_path, os.path.relpath(file, remote_full_path)), + ) ) remote_file_chunks = [remote_file_paths[i::workers] for i in range(workers)] local_file_chunks = [new_local_file_paths[i::workers] for i in range(workers)] diff --git a/providers/sftp/tests/unit/sftp/hooks/test_sftp.py b/providers/sftp/tests/unit/sftp/hooks/test_sftp.py index 902a191041bf6..41ab0878ec410 100644 --- a/providers/sftp/tests/unit/sftp/hooks/test_sftp.py +++ b/providers/sftp/tests/unit/sftp/hooks/test_sftp.py @@ -633,6 +633,29 @@ def test_store_and_retrieve_directory_concurrently(self): ) assert retrieved_dir_name in os.listdir(os.path.join(self.temp_dir, TMP_DIR_FOR_TESTS)) + def test_validate_within_directory_rejects_escape(self): + base = os.path.join(self.temp_dir, "download") + with pytest.raises(ValueError, match="outside the destination directory"): + SFTPHook._validate_within_directory(base, os.path.join(base, "..", "victim")) + # An in-bounds candidate is returned unchanged. + inside = os.path.join(base, "sub", "file") + assert SFTPHook._validate_within_directory(base, inside) == inside + + def test_retrieve_directory_rejects_server_path_traversal(self): + # A remote SFTP server can return a directory-entry name containing ".." + # so the recursive download would escape the local destination directory. + remote = "/srv/export" + local = os.path.join(self.temp_dir, "download_traversal") + escaping_file = "/srv/export/../victim/payload" + with ( + patch.object(SFTPHook, "get_tree_map", return_value=([escaping_file], [], [])), + patch.object(SFTPHook, "retrieve_file") as mock_retrieve, + ): + with pytest.raises(ValueError, match="outside the destination directory"): + self.hook.retrieve_directory(remote_full_path=remote, local_full_path=local) + mock_retrieve.assert_not_called() + assert not os.path.exists(os.path.join(self.temp_dir, "victim")) + @patch("paramiko.SSHClient") @patch("paramiko.ProxyCommand") @patch("airflow.providers.sftp.hooks.sftp.SFTPHook.get_connection")