Skip to content

Commit 2f62fee

Browse files
committed
refactored ignore logic to follow moby (docker project) pattern matching logic and tests
1 parent f2a3b03 commit 2f62fee

8 files changed

Lines changed: 652 additions & 369 deletions

File tree

Lines changed: 283 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,283 @@
1+
from __future__ import annotations
2+
3+
import os
4+
from typing import Iterable, Optional, Sequence
5+
from pathlib import Path, PurePosixPath
6+
from dataclasses import dataclass
7+
8+
# Public API of this module. Every non-underscore function defined here is
# exported, including the directory walker ``iter_included_files``.
__all__ = [
    "IgnorePattern",
    "read_ignorefile",
    "compile_ignore",
    "path_match",
    "is_ignored",
    "iter_included_files",
]
15+
16+
17+
@dataclass(frozen=True)
class IgnorePattern:
    """One compiled entry of an ignore file.

    Modeled on Docker-style ``.dockerignore`` rules; the same representation
    can back other ignore-file use cases that follow that approach.

    Fields:
        pattern: Pattern text using POSIX ``'/'`` separators. It is expected
            to carry no leading or trailing ``/``.
        negated: True when the source pattern started with ``!`` (a
            re-include rule).
        directory_only: True when the source pattern ended with ``/``; such a
            pattern applies to directories (and, through pruning, to what
            they contain).
        anchored: True when the pattern contains a path separator, meaning it
            is matched from the root instead of at any depth.
    """

    pattern: str
    negated: bool
    directory_only: bool
    anchored: bool
37+
38+
39+
def _normalize_pattern_line(raw: bytes, *, is_first_line: bool) -> Optional[str]:
40+
"""Normalize a single ignorefile line, mirroring moby's ignorefile.ReadAll.
41+
42+
Behavior is based on:
43+
https://github.com/moby/patternmatcher/blob/main/ignorefile/ignorefile.go
44+
"""
45+
46+
# Strip UTF-8 BOM from the first line if present
47+
if is_first_line and raw.startswith(b"\xef\xbb\xbf"):
48+
raw = raw[len(b"\xef\xbb\xbf") :]
49+
50+
# Decode as UTF-8; we are strict here to surface bad encodings
51+
text = raw.decode("utf-8", errors="strict")
52+
text = text.rstrip("\r\n")
53+
54+
# Lines starting with '#' are comments and are ignored before processing,
55+
# i.e. we do *not* treat leading spaces as part of the comment detection.
56+
if text.startswith("#"):
57+
return None
58+
59+
# Trim leading and trailing whitespace
60+
pattern = text.strip()
61+
if not pattern:
62+
return None
63+
64+
# Normalize absolute paths to paths relative to the context (taking care of '!' prefix)
65+
invert = pattern[0] == "!"
66+
if invert:
67+
pattern = pattern[1:].strip()
68+
69+
if pattern:
70+
# filepath.Clean equivalent
71+
pattern = os.path.normpath(pattern)
72+
# filepath.ToSlash equivalent
73+
pattern = pattern.replace(os.sep, "/")
74+
# Leading forward-slashes are removed so "/some/path" and "some/path"
75+
# are considered equivalent.
76+
if len(pattern) > 1 and pattern[0] == "/":
77+
pattern = pattern[1:]
78+
79+
if invert:
80+
pattern = "!" + pattern
81+
82+
return pattern
83+
84+
85+
def read_ignorefile(path: Optional[Path]) -> list[str]:
    """Load an ignore file as a list of normalized pattern strings.

    Follows moby's ``ignorefile.ReadAll``; each line is passed through
    ``_normalize_pattern_line``:

    - a UTF-8 BOM at the start of the first line is dropped;
    - lines beginning with ``#`` are comments and produce nothing;
    - other lines are trimmed, may be negated with ``!``, are path-cleaned,
      use ``/`` as separator and lose a leading ``/``.

    Args:
        path: Location of the ignore file, or ``None``.

    Returns:
        The patterns in file order. Empty when ``path`` is ``None`` or does
        not exist.
    """

    if path is None or not path.exists():
        return []

    # Binary mode: decoding and BOM handling are done per line by the helper.
    with path.open("rb") as handle:
        candidates = [
            _normalize_pattern_line(line, is_first_line=(index == 0))
            for index, line in enumerate(handle)
        ]

    return [entry for entry in candidates if entry is not None]
113+
114+
115+
def compile_ignore(patterns: Sequence[str]) -> list[IgnorePattern]:
    """Compile raw pattern strings into :class:`IgnorePattern` objects.

    Args:
        patterns: Pattern strings, e.g. as returned by ``read_ignorefile``.
            A leading ``!`` marks a negation and a trailing ``/`` marks a
            directory-only pattern.

    Returns:
        One :class:`IgnorePattern` per usable input, in input order. Empty
        strings and patterns that are empty once ``!`` and slashes are
        removed (``"!"``, ``"/"``, ``"!/"``) are skipped.
    """

    compiled: list[IgnorePattern] = []

    for raw in patterns:
        if not raw:
            continue

        negated = raw[0] == "!"
        pattern_text = raw[1:] if negated else raw

        # A trailing '/' restricts the pattern to directories.
        directory_only = pattern_text.endswith("/")
        pattern_text = pattern_text.rstrip("/")

        if not pattern_text:
            # Bare "!" (or only slashes) is ignored, matching Docker / moby
            # behavior.
            continue

        # Treat patterns containing a path separator as anchored to the
        # root. Decide this *before* dropping a leading '/', so that "/foo"
        # stays root-anchored.
        anchored = "/" in pattern_text

        # Matching is done against root-relative paths, whose parts never
        # start with "/"; keeping a leading slash would make the pattern
        # unmatchable.
        pattern_text = pattern_text.lstrip("/")

        compiled.append(
            IgnorePattern(
                pattern=PurePosixPath(pattern_text).as_posix(),
                negated=negated,
                directory_only=directory_only,
                anchored=anchored,
            )
        )

    return compiled
151+
152+
153+
def _segment_match(pattern_segment: str, path_segment: str) -> bool:
154+
"""Match a single path segment against a glob pattern segment.
155+
156+
Supports:
157+
- ``*``: any sequence of characters except ``/``.
158+
- ``?``: any single character except ``/``.
159+
- ``[]``: character classes, excluding ``/``.
160+
"""
161+
162+
import re
163+
164+
escaped = ""
165+
i = 0
166+
while i < len(pattern_segment):
167+
ch = pattern_segment[i]
168+
if ch == "*":
169+
escaped += "[^/]*"
170+
elif ch == "?":
171+
escaped += "[^/]"
172+
elif ch == "[":
173+
# Copy character class as-is until closing ']'.
174+
j = i + 1
175+
while j < len(pattern_segment) and pattern_segment[j] != "]":
176+
j += 1
177+
if j < len(pattern_segment):
178+
escaped += pattern_segment[i : j + 1]
179+
i = j
180+
else:
181+
# Unterminated '['; treat it literally.
182+
escaped += re.escape(ch)
183+
else:
184+
escaped += re.escape(ch)
185+
i += 1
186+
187+
regex = re.compile(rf"^{escaped}$")
188+
return regex.match(path_segment) is not None
189+
190+
191+
def _match_parts_recursive(pattern_parts: list[str], path_parts: list[str]) -> bool:
192+
"""Recursive helper implementing ``**`` segment semantics."""
193+
194+
if not pattern_parts:
195+
return not path_parts
196+
197+
if pattern_parts[0] == "**":
198+
# '**' matches zero or more segments.
199+
for i in range(len(path_parts) + 1):
200+
if _match_parts_recursive(pattern_parts[1:], path_parts[i:]):
201+
return True
202+
return False
203+
204+
if not path_parts:
205+
return False
206+
207+
if not _segment_match(pattern_parts[0], path_parts[0]):
208+
return False
209+
210+
return _match_parts_recursive(pattern_parts[1:], path_parts[1:])
211+
212+
213+
def path_match(pattern: IgnorePattern, relpath: str, *, is_dir: bool) -> bool:
    """Tell whether ``relpath`` matches one compiled ignore pattern.

    Args:
        pattern: The compiled pattern to test.
        relpath: Path relative to the root, with ``/`` separators.
        is_dir: Whether ``relpath`` refers to a directory.

    Returns:
        True on a match. Anchored patterns must match the whole path from
        its first component; other patterns may match starting at any
        component.
    """

    candidate = list(PurePosixPath(relpath).parts)
    wanted = list(PurePosixPath(pattern.pattern).parts)

    # Directory-only patterns never match a file directly; files below an
    # ignored directory are excluded by pruning during traversal.
    if pattern.directory_only and not is_dir:
        return False

    if pattern.anchored:
        return _match_parts_recursive(wanted, candidate)

    # Unanchored: slide the pattern along the path, one component at a time.
    return any(
        _match_parts_recursive(wanted, candidate[offset:])
        for offset in range(len(candidate))
    )
232+
233+
234+
def is_ignored(relpath: str, *, is_dir: bool, patterns: Sequence[IgnorePattern]) -> bool:
    """Decide whether a path is ignored, with 'last match wins' semantics.

    Example::

        *.log
        !important.log

    ignores every ``.log`` file except ``important.log``.

    Args:
        relpath: Path relative to the root.
        is_dir: Whether ``relpath`` refers to a directory.
        patterns: Compiled patterns, in file order.

    Returns:
        True when the last matching pattern is a plain (non-negated) one;
        False when it is a negation or when nothing matches.
    """

    ignored = False  # nothing is ignored until a pattern says so
    for candidate in patterns:
        if not path_match(candidate, relpath, is_dir=is_dir):
            continue
        # A later match overrides any earlier verdict.
        ignored = not candidate.negated
    return ignored
251+
252+
253+
def iter_included_files(
    root: Path,
    *,
    patterns: Sequence[IgnorePattern],
) -> Iterable[Path]:
    """Walk ``root`` and yield every file that is not ignored.

    Ignored directories are pruned from the walk, so nothing beneath them is
    visited, mirroring Docker's behavior for .dockerignore.

    Args:
        root: Directory to walk.
        patterns: Compiled ignore patterns applied to root-relative paths.

    Yields:
        Paths of non-ignored files (``root`` joined with the relative path).

    Raises:
        ValueError: If ``root`` is not a directory (raised when iteration
            starts, since this is a generator).
    """

    if not root.is_dir():
        raise ValueError(f"root must be a directory, got: {root}")

    for current, subdirs, filenames in os.walk(root):
        base = Path(current)

        # Prune in place: os.walk only descends into what is left in the list.
        subdirs[:] = [
            entry
            for entry in subdirs
            if not is_ignored(
                (base / entry).relative_to(root).as_posix(),
                is_dir=True,
                patterns=patterns,
            )
        ]

        for filename in filenames:
            candidate = base / filename
            relative = candidate.relative_to(root).as_posix()
            if not is_ignored(relative, is_dir=False, patterns=patterns):
                yield candidate

0 commit comments

Comments
 (0)