From 2564785ae332041cb8e1083e6a0356e8d006dec7 Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Tue, 26 May 2026 14:19:47 +0800 Subject: [PATCH 1/3] feat(web2cli,browser): improve cookie scoping and payload handling Align web2cli CLI generation with domain/path-aware Cookie headers, support json/form/raw payload modes, and harden fetch hook capture. Filter browser auth-state export by request URL instead of site-root heuristics. Co-authored-by: Cursor --- .flocks/plugins/skills/web2cli/SKILL.md | 2 +- .../skills/web2cli/scripts/generate-cli.py | 232 +++++++++--- .../skills/web2cli/scripts/generate-spec.py | 52 ++- .../web2cli/scripts/inject-hook-base.js | 142 ++++++- flocks/browser/admin.py | 2 +- flocks/browser/helpers.py | 66 ++-- tests/browser/test_helpers.py | 55 ++- tests/tool/test_skyeye_sensor_skill_cli.py | 93 ----- tests/tool/test_web2cli_generate_cli.py | 348 +++++++++++++++++- tests/tool/test_web2cli_generate_spec.py | 63 ++++ tests/tool/test_web2cli_hook_base.py | 64 ++++ 11 files changed, 909 insertions(+), 210 deletions(-) delete mode 100644 tests/tool/test_skyeye_sensor_skill_cli.py diff --git a/.flocks/plugins/skills/web2cli/SKILL.md b/.flocks/plugins/skills/web2cli/SKILL.md index d505b9162..0f0311d83 100644 --- a/.flocks/plugins/skills/web2cli/SKILL.md +++ b/.flocks/plugins/skills/web2cli/SKILL.md @@ -134,7 +134,7 @@ print(js("window.__apiCapture.config.captureMode")) ### 4. 明确需要捕获的功能/操作 - 要求用户手动操作要捕获的页面动作,例如查询、翻页、筛选、提交表单、点击按钮、导出数据。 -- 或请求用户描述需要 hook 的操作,便于你直接去页面代替用户执行 +- 或者请求用户描述需要 hook 的操作或功能,便于你直接去页面代替用户执行 需要确认捕获是否开始时: diff --git a/.flocks/plugins/skills/web2cli/scripts/generate-cli.py b/.flocks/plugins/skills/web2cli/scripts/generate-cli.py index 6603c74bf..933a5a94f 100644 --- a/.flocks/plugins/skills/web2cli/scripts/generate-cli.py +++ b/.flocks/plugins/skills/web2cli/scripts/generate-cli.py @@ -145,7 +145,7 @@ def generate_python_client(requests: List[Dict], base_url: str) -> str: import json import requests from typing import Dict, Any, Optional, List -from urllib.parse import urljoin +from urllib.parse import urljoin, urlparse class APIClient: @@ -174,21 +174,54 @@ def _load_cookie_items(cookie_file: str) -> List[Dict[str, Any]]: return [cookie for cookie in cookies if isinstance(cookie, dict)] + @staticmethod + def _domain_match(host: str, cookie_domain: str) -> bool: + pure_domain = str(cookie_domain or "").lstrip(".") + return bool(pure_domain) and (host == pure_domain or host.endswith(f".{pure_domain}")) + + @staticmethod + def _path_match(request_path: str, cookie_path: str) -> bool: + normalized_cookie_path = str(cookie_path or "/") + normalized_request_path = request_path or "/" + if normalized_cookie_path == "/": + return True + prefix = normalized_cookie_path.rstrip("/") or "/" + return normalized_request_path == prefix or normalized_request_path.startswith(prefix + "/") + + def _build_cookie_header(self, url: str) -> str: + parsed = urlparse(url) + host = parsed.hostname or "" + request_path = parsed.path or "/" + is_https = parsed.scheme == "https" + selected = {} + + for index, cookie in enumerate(self.cookie_items): + name = cookie.get("name") + value = cookie.get("value") + if not name or value in (None, ""): + continue + domain = str(cookie.get("domain", "")) + if domain and not self._domain_match(host, domain): + continue + cookie_path = str(cookie.get("path", "/") or "/") + if not self._path_match(request_path, cookie_path): + continue + if cookie.get("secure") and not is_https: + continue + + score = (len(cookie_path), len(domain.lstrip(".")), index) + current = selected.get(name) + if current is None or score > current[0]: + selected[name] = (score, f"{name}={value}") + + return "; ".join( + header for _, header in sorted(selected.values(), key=lambda item: item[0][2]) + ) + def __init__(self, base_url: str = __BASE_URL__, cookie_file: str = "auth-state.json"): self.base_url = base_url.rstrip("/") self.session = requests.Session() - - # Load cookies - for c in self._load_cookie_items(cookie_file): - name = c.get("name") - if not name: - continue - cookie_kwargs = {} - if c.get("domain"): - cookie_kwargs["domain"] = c["domain"] - if c.get("path"): - cookie_kwargs["path"] = c["path"] - self.session.cookies.set(name, c.get("value", ""), **cookie_kwargs) + self.cookie_items = self._load_cookie_items(cookie_file) # Common headers self.session.headers.update({ @@ -197,8 +230,12 @@ def __init__(self, base_url: str = __BASE_URL__, cookie_file: str = "auth-state. }) def _request(self, method: str, endpoint: str, data: Optional[Dict] = None) -> Dict: - url = f"{{self.base_url}}{{endpoint}}" - resp = self.session.request(method, url, json=data) + url = f"{self.base_url}{endpoint}" + headers = dict(self.session.headers) + cookie_header = self._build_cookie_header(url) + if cookie_header: + headers["Cookie"] = cookie_header + resp = self.session.request(method, url, json=data, headers=headers) resp.raise_for_status() return resp.json() @@ -572,6 +609,7 @@ def generate_markdown_docs_from_spec(spec: Dict[str, Any], title: str = "API Doc - **Base URL**: `{spec.get("baseUrl", "")}` - **Method**: `{operation.get("method", "GET")}` - **Endpoint**: `{operation.get("endpoint", "/")}` +- **Payload Mode**: `{operation.get("payloadMode", "none")}` """ @@ -622,6 +660,8 @@ def generate_postman_collection_from_spec(spec: Dict[str, Any]) -> Dict[str, Any operation = entry["operation"] headers = operation.get("headers", {}) if isinstance(operation.get("headers"), dict) else {} body_template = operation.get("bodyTemplate", {}) if isinstance(operation.get("bodyTemplate"), dict) else {} + payload_mode = operation.get("payloadMode") or ("json" if body_template else "none") + raw_body_template = operation.get("rawBodyTemplate", "") endpoint = operation.get("endpoint", "/") path_parts = endpoint.lstrip("/").split("/") if endpoint.lstrip("/") else [] @@ -634,12 +674,23 @@ def generate_postman_collection_from_spec(spec: Dict[str, Any]) -> Dict[str, Any }, "header": [{"key": key, "value": value} for key, value in headers.items()], } - if body_template: + if payload_mode == "json" and body_template: request["body"] = { "mode": "raw", "raw": json.dumps(body_template, ensure_ascii=False), "options": {"raw": {"language": "json"}}, } + elif payload_mode == "form" and body_template: + request["body"] = { + "mode": "urlencoded", + "urlencoded": [{"key": key, "value": str(value)} for key, value in body_template.items()], + } + elif payload_mode == "raw" and raw_body_template: + request["body"] = { + "mode": "raw", + "raw": str(raw_body_template), + "options": {"raw": {"language": "text"}}, + } items.append( { "name": entry["command"], @@ -672,6 +723,7 @@ def generate_python_cli_from_spec(spec: Dict[str, Any]) -> str: import re import sys from typing import Any, Dict, List +from urllib.parse import urlparse import requests @@ -679,7 +731,7 @@ def generate_python_cli_from_spec(spec: Dict[str, Any]) -> str: SPEC = ''' + spec_json + ''' -def _load_json(path: str) -> Dict[str, Any]: +def _load_json(path: str) -> Any: if not path: return {} try: @@ -689,7 +741,18 @@ def _load_json(path: str) -> Dict[str, Any]: return {} except json.JSONDecodeError: return {} - return payload if isinstance(payload, dict) else {} + return payload + + +def _resolve_string_template(value: str, args: Dict[str, Any]) -> Any: + exact_match = re.fullmatch(r"\\$\\{([A-Za-z0-9_]+)\\}", value) + if exact_match: + return args.get(exact_match.group(1), value) + return re.sub( + r"\\$\\{([A-Za-z0-9_]+)\\}", + lambda match: str(args.get(match.group(1), match.group(0))), + value, + ) def _coerce_bool(value: str) -> bool: @@ -783,13 +846,20 @@ class APIClient: @staticmethod def _load_cookie_items(auth_state_path: str) -> List[Dict[str, Any]]: payload = _load_json(auth_state_path) - cookies = payload.get("cookies", []) + if isinstance(payload, list): + cookies = payload + elif isinstance(payload, dict): + cookies = payload.get("cookies", []) + else: + cookies = [] if isinstance(cookies, list): return [cookie for cookie in cookies if isinstance(cookie, dict)] return [] @staticmethod def _load_storage_map(payload: Dict[str, Any]) -> Dict[str, str]: + if not isinstance(payload, dict): + return {} values = {} for origin_entry in payload.get("origins", []): if not isinstance(origin_entry, dict): @@ -799,22 +869,87 @@ def _load_storage_map(payload: Dict[str, Any]) -> Dict[str, str]: values[item["name"]] = item.get("value", "") return values + @staticmethod + def _domain_match(host: str, cookie_domain: str) -> bool: + pure_domain = str(cookie_domain or "").lstrip(".") + return bool(pure_domain) and (host == pure_domain or host.endswith(f".{pure_domain}")) + + @staticmethod + def _path_match(request_path: str, cookie_path: str) -> bool: + normalized_cookie_path = str(cookie_path or "/") + normalized_request_path = request_path or "/" + if normalized_cookie_path == "/": + return True + prefix = normalized_cookie_path.rstrip("/") or "/" + return normalized_request_path == prefix or normalized_request_path.startswith(prefix + "/") + + def _select_cookie_header(self, url: str) -> str: + parsed = urlparse(url) + host = parsed.hostname or "" + request_path = parsed.path or "/" + is_https = parsed.scheme == "https" + selected = {} + + for index, cookie in enumerate(self._load_cookie_items(self.auth_state_path)): + name = cookie.get("name") + value = cookie.get("value") + if not name or value in (None, ""): + continue + domain = str(cookie.get("domain", "")) + if domain and not self._domain_match(host, domain): + continue + cookie_path = str(cookie.get("path", "/") or "/") + if not self._path_match(request_path, cookie_path): + continue + if cookie.get("secure") and not is_https: + continue + + score = (len(cookie_path), len(domain.lstrip(".")), index) + current = selected.get(name) + if current is None or score > current[0]: + selected[name] = (score, f"{name}={value}") + + return "; ".join( + header for _, header in sorted(selected.values(), key=lambda item: item[0][2]) + ) + + def _resolve_cookie_value(self, key: Any) -> str | None: + target_name = str(key or "") + if not target_name: + return None + parsed = urlparse(self.base_url or "") + host = parsed.hostname or "" + request_path = parsed.path or "/" + is_https = parsed.scheme == "https" + selected = None + for index, cookie in enumerate(self._load_cookie_items(self.auth_state_path)): + if not isinstance(cookie, dict) or cookie.get("name") != target_name: + continue + domain = str(cookie.get("domain", "")) + if domain and not self._domain_match(host, domain): + continue + cookie_path = str(cookie.get("path", "/") or "/") + if not self._path_match(request_path, cookie_path): + continue + if cookie.get("secure") and not is_https: + continue + score = (len(cookie_path), len(domain.lstrip(".")), index) + if selected is None or score > selected[0]: + selected = (score, str(cookie.get("value", ""))) + return selected[1] if selected else None + @staticmethod def _resolve_header_value(payload: Dict[str, Any], rule: Dict[str, Any]) -> str | None: source = rule.get("source") key = rule.get("key") - if source == "cookie": - for cookie in payload.get("cookies", []): - if isinstance(cookie, dict) and cookie.get("name") == key: - return str(cookie.get("value", "")) if source == "localStorage": return APIClient._load_storage_map(payload).get(str(key)) return None @staticmethod def _resolve_template(value: Any, args: Dict[str, Any]) -> Any: - if isinstance(value, str) and value.startswith("${") and value.endswith("}"): - return args.get(value[2:-1], value) + if isinstance(value, str): + return _resolve_string_template(value, args) if isinstance(value, dict): return {key: APIClient._resolve_template(item, args) for key, item in value.items()} if isinstance(value, list): @@ -873,23 +1008,15 @@ def _apply_auth_state(self) -> None: if isinstance(headers, dict) and headers: self.session.headers.update(headers) - if strategy in {"COOKIE", "HEADER"}: - for cookie in self._load_cookie_items(self.auth_state_path): - name = cookie.get("name") - if not name: - continue - kwargs = {} - if cookie.get("domain"): - kwargs["domain"] = cookie["domain"] - if cookie.get("path"): - kwargs["path"] = cookie["path"] - self.session.cookies.set(name, cookie.get("value", ""), **kwargs) - if strategy == "HEADER": for rule in auth.get("requiredHeaders", []): if not isinstance(rule, dict) or not rule.get("name"): continue - value = self._resolve_header_value(self.auth_state, rule) + source = rule.get("source") + if source == "cookie": + value = self._resolve_cookie_value(rule.get("key")) + else: + value = self._resolve_header_value(self.auth_state, rule) if value: self.session.headers[str(rule["name"])] = value @@ -898,14 +1025,36 @@ def build_request(self, args: Dict[str, Any], entry: Dict[str, Any]) -> Dict[str endpoint = operation.get("endpoint", "/") query = self._resolve_template(operation.get("queryTemplate", {}), args) body = self._resolve_template(operation.get("bodyTemplate", {}), args) - headers = operation.get("headers", {}) - return { + payload_mode = str(operation.get("payloadMode") or ("json" if body else "none")).lower() + raw_body = self._resolve_template(operation.get("rawBodyTemplate", ""), args) + headers = dict(operation.get("headers", {}) or {}) + cookie_strategy = str(SPEC.get("strategy", "PUBLIC") or "PUBLIC").upper() + request_options = { "method": operation.get("method", "GET"), "url": f"{self.base_url}{endpoint}", "params": query or None, - "json": body or None, + "json": None, + "data": None, "headers": headers or None, } + if payload_mode == "json": + request_options["json"] = body or None + elif payload_mode == "form": + request_options["data"] = body or None + elif payload_mode == "raw": + request_options["data"] = raw_body or None + if cookie_strategy in {"COOKIE", "HEADER"}: + cookie_header = self._select_cookie_header(request_options["url"]) + if cookie_header: + headers["Cookie"] = cookie_header + return { + "method": request_options["method"], + "url": request_options["url"], + "params": request_options["params"], + "json": request_options["json"], + "data": request_options["data"], + "headers": request_options["headers"], + } def _project_rows(self, payload: Any, entry: Dict[str, Any]) -> List[Dict[str, Any]]: row_source = entry.get("rowSource", {}) @@ -940,6 +1089,7 @@ def run(self, args: Dict[str, Any], entry: Dict[str, Any] | None = None) -> List request_options["url"], params=request_options["params"], json=request_options["json"], + data=request_options["data"], headers=request_options["headers"], ) response.raise_for_status() diff --git a/.flocks/plugins/skills/web2cli/scripts/generate-spec.py b/.flocks/plugins/skills/web2cli/scripts/generate-spec.py index e30848c4f..0f7e122d4 100644 --- a/.flocks/plugins/skills/web2cli/scripts/generate-spec.py +++ b/.flocks/plugins/skills/web2cli/scripts/generate-spec.py @@ -55,6 +55,19 @@ def parse_json_text(text: str) -> Any: return {"raw": text} +def get_request_content_type(request: dict[str, Any]) -> str: + """Return the normalized request content type.""" + direct = request.get("requestContentType") + if direct: + return str(direct).lower() + + headers = request.get("requestHeaders", {}) or request.get("request_headers", {}) + for key, value in headers.items(): + if str(key).lower() == "content-type" and value: + return str(value).lower() + return "" + + def infer_type(value: Any) -> str: """Return a compact type name for spec/verify output.""" if value is None: @@ -237,8 +250,10 @@ def collect_columns(item: Any) -> list[dict[str, Any]]: return columns -def build_templates(request: dict[str, Any], url_info: dict[str, Any]) -> tuple[dict[str, Any], dict[str, Any], list[dict[str, Any]]]: - """Build query/body templates and CLI arg definitions.""" +def build_templates( + request: dict[str, Any], url_info: dict[str, Any] +) -> tuple[dict[str, Any], dict[str, Any], list[dict[str, Any]], str, str]: + """Build query/body templates, payload mode, and CLI arg definitions.""" args: list[dict[str, Any]] = [] seen_args: set[str] = set() @@ -264,15 +279,36 @@ def transform_mapping(data: dict[str, Any]) -> dict[str, Any]: result[key] = value return result - body = parse_json_text(str(request.get("requestBody", ""))) - if not isinstance(body, dict) or "raw" in body: - body = {} + body_text = str(request.get("requestBody", "")) + body_kind = str(request.get("requestBodyKind", "")).lower() + content_type = get_request_content_type(request) + parsed_body = parse_json_text(body_text) + body: dict[str, Any] = {} + payload_mode = "none" + raw_body_template = "" + + if isinstance(parsed_body, dict) and "raw" not in parsed_body: + body = parsed_body + if body: + if body_kind in {"urlencoded", "formdata"}: + payload_mode = "form" + elif "application/x-www-form-urlencoded" in content_type or "multipart/form-data" in content_type: + payload_mode = "form" + else: + payload_mode = "json" + elif body_text: + if "application/x-www-form-urlencoded" in content_type: + body = dict(parse_qsl(body_text, keep_blank_values=True)) + payload_mode = "form" if body else "raw" + else: + payload_mode = "raw" + raw_body_template = body_text query_template = transform_mapping(url_info["query"]) body_template = transform_mapping(body) args.sort(key=lambda item: (0 if item["name"] == "page" else 1 if item["name"] == "limit" else 2, item["name"])) - return query_template, body_template, args + return query_template, body_template, args, payload_mode, raw_body_template def build_strategy(request: dict[str, Any]) -> tuple[str, dict[str, Any]]: @@ -364,7 +400,7 @@ def build_operation_entry(request: dict[str, Any]) -> dict[str, Any]: response = parse_json_text(str(request.get("response", ""))) collection = find_best_collection(response) row_item = collection["item"] if collection is not None else response - query_template, body_template, args = build_templates(request, url_info) + query_template, body_template, args, payload_mode, raw_body_template = build_templates(request, url_info) columns = collect_columns(row_item) defaults = {item["name"]: item["default"] for item in args} @@ -386,6 +422,8 @@ def build_operation_entry(request: dict[str, Any]) -> dict[str, Any]: "endpoint": pathname, "queryTemplate": query_template, "bodyTemplate": body_template, + "payloadMode": payload_mode, + "rawBodyTemplate": raw_body_template, "headers": safe_headers(request), "captureSource": request.get("captureSource", "pageHook"), "captureReason": request.get("captureReason", ""), diff --git a/.flocks/plugins/skills/web2cli/scripts/inject-hook-base.js b/.flocks/plugins/skills/web2cli/scripts/inject-hook-base.js index 5e11d5747..ec49a2dde 100644 --- a/.flocks/plugins/skills/web2cli/scripts/inject-hook-base.js +++ b/.flocks/plugins/skills/web2cli/scripts/inject-hook-base.js @@ -78,11 +78,36 @@ return result; } + function mergeHeaders(baseHeaders, overrideHeaders) { + var result = {}; + var key; + var base = normalizeHeaders(baseHeaders); + var override = normalizeHeaders(overrideHeaders); + for (key in base) { + if (Object.prototype.hasOwnProperty.call(base, key)) { + result[key] = base[key]; + } + } + for (key in override) { + if (Object.prototype.hasOwnProperty.call(override, key)) { + result[key] = override[key]; + } + } + return result; + } + function getHeader(headers, name) { + var key; + var expected = String(name || '').toLowerCase(); if (!headers) { return ''; } - return headers[name] || headers[name.toLowerCase()] || ''; + for (key in headers) { + if (Object.prototype.hasOwnProperty.call(headers, key) && String(key).toLowerCase() === expected) { + return headers[key]; + } + } + return ''; } function hasStaticExtension(pathname) { @@ -194,7 +219,22 @@ }; } - function summarizeBody(body) { + function parseUrlEncodedBody(text) { + var result = {}; + try { + if (typeof URLSearchParams !== 'undefined') { + var params = new URLSearchParams(String(text || '')); + params.forEach(function(value, key) { + result[key] = value; + }); + } + } catch (error) { + return null; + } + return result; + } + + function summarizeBody(body, contentType) { var result = { kind: 'empty', display: '', @@ -233,7 +273,17 @@ } if (typeof body === 'string') { + var normalizedContentType = String(contentType || '').toLowerCase(); result.display = truncateText(body, CONFIG.maxRequestBodyLength); + if (/application\/x-www-form-urlencoded/i.test(normalizedContentType)) { + result.parsed = parseUrlEncodedBody(body); + if (result.parsed && Object.keys(result.parsed).length) { + result.kind = 'urlencoded'; + result.display = truncateText(JSON.stringify(result.parsed, null, 2), CONFIG.maxRequestBodyLength); + inferShape(result.parsed, '$', result.shape, 0); + return result; + } + } try { result.parsed = JSON.parse(body); result.kind = 'json'; @@ -451,9 +501,9 @@ } function buildCaptureRecord(base) { - var requestBody = summarizeBody(base.requestBody); - var responseBody = summarizeResponse(base.responseText); var requestContentType = getHeader(base.requestHeaders, 'content-type'); + var requestBody = summarizeBody(base.requestBody, requestContentType); + var responseBody = summarizeResponse(base.responseText); var responseContentType = base.responseContentType || ''; var actionContext = snapshotActionContext(); return { @@ -568,14 +618,63 @@ return originalXHRSend.apply(this, arguments); }; + function isRequestLike(value) { + return !!(value && typeof value === 'object' && typeof value.url === 'string'); + } + + function resolveFetchRequestInfo(input, options) { + var init = options || {}; + var requestLike = isRequestLike(input) ? input : null; + var hasOwnBody = Object.prototype.hasOwnProperty.call(init, 'body'); + var requestHeaders = mergeHeaders(requestLike ? requestLike.headers : {}, init.headers || {}); + var requestBody = hasOwnBody ? init.body : undefined; + var bodyPromise; + + if (typeof requestBody !== 'undefined') { + bodyPromise = Promise.resolve(requestBody); + } else if (requestLike && typeof requestLike.clone === 'function') { + try { + bodyPromise = requestLike.clone().text().then( + function(text) { + return text || ''; + }, + function() { + return ''; + } + ); + } catch (error) { + bodyPromise = Promise.resolve(''); + } + } else if (requestLike && typeof requestLike.text === 'function') { + try { + bodyPromise = requestLike.text().then( + function(text) { + return text || ''; + }, + function() { + return ''; + } + ); + } catch (error) { + bodyPromise = Promise.resolve(''); + } + } else { + bodyPromise = Promise.resolve(''); + } + + return { + method: ((init.method || (requestLike && requestLike.method) || 'GET') + '').toUpperCase(), + requestHeaders: requestHeaders, + requestBodyPromise: bodyPromise, + url: typeof input === 'string' ? input : (input && input.url ? input.url : String(input)) + }; + } + var originalFetch = window.fetch; window.fetch = function(url, options) { - options = options || {}; + var requestInfo = resolveFetchRequestInfo(url, options); var startTime = Date.now(); - var method = (options.method || 'GET').toUpperCase(); - var requestHeaders = normalizeHeaders(options.headers || {}); - var urlStr = typeof url === 'string' ? url : (url && url.url ? url.url : String(url)); - var decision = getCaptureDecision(urlStr, method, requestHeaders); + var decision = getCaptureDecision(requestInfo.url, requestInfo.method, requestInfo.requestHeaders); if (!decision.capture) { return originalFetch.apply(this, arguments); @@ -583,15 +682,20 @@ return originalFetch.apply(this, arguments).then(function(response) { var cloned = response.clone(); - return cloned.text().then(function(text) { + return Promise.all([ + requestInfo.requestBodyPromise, + cloned.text() + ]).then(function(values) { + var requestBody = values[0]; + var text = values[1]; var record = buildCaptureRecord({ type: 'Fetch', - method: method, - url: urlStr, + method: requestInfo.method, + url: requestInfo.url, urlInfo: decision.urlInfo, status: response.status, - requestHeaders: requestHeaders, - requestBody: options.body, + requestHeaders: requestInfo.requestHeaders, + requestBody: requestBody, responseText: text || '', responseContentType: response.headers && typeof response.headers.get === 'function' ? (response.headers.get('content-type') || '') @@ -604,7 +708,7 @@ window.__capturedRequests.push(record); console.log( '[API Capture] Fetch:', - method, + requestInfo.method, record.normalizedUrl, '->', response.status, @@ -615,12 +719,12 @@ }).catch(function(error) { var record = buildCaptureRecord({ type: 'Fetch', - method: method, - url: urlStr, + method: requestInfo.method, + url: requestInfo.url, urlInfo: decision.urlInfo, status: 'error', - requestHeaders: requestHeaders, - requestBody: options.body, + requestHeaders: requestInfo.requestHeaders, + requestBody: '', responseText: '', responseContentType: '', pageContext: getPageContext(), diff --git a/flocks/browser/admin.py b/flocks/browser/admin.py index 04e167007..224dfeeeb 100644 --- a/flocks/browser/admin.py +++ b/flocks/browser/admin.py @@ -487,7 +487,7 @@ def row(label: str, ok: bool, detail: str = "") -> None: browser_running, "" if browser_running else "start Chrome, Chromium, or Edge and rerun `flocks browser --setup`", ) - row("daemon alive", daemon, "" if daemon else "not running; run `flocks browser --setup` to attach") + row("daemon alive", daemon, "" if daemon else "not running; wait user open browser inspect page then run `flocks browser --setup` to attach") row("active browser connections", bool(connections), str(len(connections))) for conn in connections: page = conn.get("page") diff --git a/flocks/browser/helpers.py b/flocks/browser/helpers.py index 18e96a5ba..ea2cf5be7 100644 --- a/flocks/browser/helpers.py +++ b/flocks/browser/helpers.py @@ -22,7 +22,6 @@ AGENT_WORKSPACE = Path(os.environ.get("BH_AGENT_WORKSPACE", DEFAULT_AGENT_WORKSPACE)).expanduser() NAME = os.environ.get("BU_NAME", "default") INTERNAL = INTERNAL_URL_PREFIXES -_COMMON_SECOND_LEVEL_SUFFIXES = {"ac", "co", "com", "edu", "gov", "mil", "net", "org"} _COOKIE_IMPORT_FIELDS = { "name", "value", @@ -243,18 +242,39 @@ def _stringify_json(data: Any) -> str: return json.dumps(data, ensure_ascii=False, separators=(",", ":")) -def _site_root_from_hostname(hostname: str) -> str: - labels = [label for label in hostname.lower().strip(".").split(".") if label] - if len(labels) <= 2: - return ".".join(labels) - if len(labels[-1]) == 2 and labels[-2] in _COMMON_SECOND_LEVEL_SUFFIXES: - return ".".join(labels[-3:]) - return ".".join(labels[-2:]) +def _domain_matches_host(domain: str | None, hostname: str) -> bool: + normalized_domain = str(domain or "").lower().lstrip(".") + normalized_host = str(hostname or "").lower().strip(".") + return bool(normalized_domain and normalized_host) and ( + normalized_host == normalized_domain or normalized_host.endswith(f".{normalized_domain}") + ) -def _domain_matches_site(domain: str | None, site_root: str) -> bool: - normalized = str(domain or "").lower().lstrip(".") - return bool(normalized) and (normalized == site_root or normalized.endswith(f".{site_root}")) +def _path_matches_url(cookie_path: str | None, request_path: str) -> bool: + normalized_cookie_path = str(cookie_path or "/") or "/" + normalized_request_path = request_path or "/" + if normalized_cookie_path == "/": + return True + prefix = normalized_cookie_path.rstrip("/") or "/" + return normalized_request_path == prefix or normalized_request_path.startswith(prefix + "/") + + +def _cookie_matches_url(cookie: dict[str, Any], target_url: str) -> bool: + parsed = urlparse(target_url) + hostname = parsed.hostname or "" + request_path = parsed.path or "/" + is_https = parsed.scheme == "https" + if not isinstance(cookie, dict): + return False + if not cookie.get("name") or cookie.get("value") in (None, ""): + return False + if not _domain_matches_host(cookie.get("domain"), hostname): + return False + if not _path_matches_url(cookie.get("path"), request_path): + return False + if cookie.get("secure") and not is_https: + return False + return True def _current_page_state() -> tuple[dict[str, Any], str]: @@ -290,24 +310,24 @@ def _collect_site_cookies(target_url: str) -> list[dict[str, Any]]: hostname = urlparse(target_url).hostname if not hostname: raise RuntimeError(f"could not determine hostname for {target_url!r}") - site_root = _site_root_from_hostname(hostname) + + try: + cookies = cdp("Network.getCookies", urls=[target_url]).get("cookies", []) + except Exception: + cookies = [] + if isinstance(cookies, list): + filtered = [cookie for cookie in cookies if _cookie_matches_url(cookie, target_url)] + if filtered: + return filtered + try: cookies = cdp("Storage.getCookies").get("cookies", []) except Exception: cookies = [] if not isinstance(cookies, list): cookies = [] - filtered = [ - cookie - for cookie in cookies - if isinstance(cookie, dict) and _domain_matches_site(cookie.get("domain"), site_root) - ] - if filtered: - return filtered - fallback = cdp("Network.getCookies", urls=[target_url]).get("cookies", []) - if not isinstance(fallback, list): - raise RuntimeError("cookie export did not return a list") - return fallback + filtered = [cookie for cookie in cookies if _cookie_matches_url(cookie, target_url)] + return filtered def _set_storage_entries(storage_name: str, entries: list[dict[str, Any]]) -> int: diff --git a/tests/browser/test_helpers.py b/tests/browser/test_helpers.py index 558715f55..6bf28271b 100644 --- a/tests/browser/test_helpers.py +++ b/tests/browser/test_helpers.py @@ -415,12 +415,12 @@ def test_save_state_writes_portable_schema(tmp_path) -> None: out = tmp_path / "auth-state.json" cookies = [ {"name": "sid", "value": "secret", "domain": ".zhihu.com", "path": "/"}, - {"name": "api", "value": "token", "domain": "api.zhihu.com", "path": "/"}, - {"name": "other", "value": "skip", "domain": ".example.com", "path": "/"}, + {"name": "api", "value": "token", "domain": "api.zhihu.com", "path": "/app"}, ] def fake_cdp(method, **kwargs): - if method == "Storage.getCookies": + if method == "Network.getCookies": + assert kwargs == {"urls": ["https://www.zhihu.com/app"]} return {"cookies": cookies} raise AssertionError((method, kwargs)) @@ -439,14 +439,59 @@ def fake_js(expression): result = helpers.save_state(out) saved = json.loads(out.read_text(encoding="utf-8")) - assert result["cookies"] == 2 + assert result["cookies"] == 1 assert set(saved) == {"cookies", "origins"} - assert {item["domain"] for item in saved["cookies"]} == {".zhihu.com", "api.zhihu.com"} + assert {item["domain"] for item in saved["cookies"]} == {".zhihu.com"} assert saved["origins"] == [ {"origin": "https://www.zhihu.com", "localStorage": [{"name": "token", "value": "abc"}]} ] +def test_save_state_falls_back_to_storage_cookies_and_filters_by_exact_url(tmp_path) -> None: + out = tmp_path / "auth-state.json" + storage_cookies = [ + {"name": "shared", "value": "ok", "domain": ".threatbook-inc.cn", "path": "/", "secure": True}, + {"name": "sensor", "value": "ok", "domain": "eagle-sensor.threatbook-inc.cn", "path": "/", "secure": True}, + {"name": "path-match", "value": "ok", "domain": "eagle-sensor.threatbook-inc.cn", "path": "/skyeye", "secure": True}, + {"name": "wrong-path", "value": "skip", "domain": "eagle-sensor.threatbook-inc.cn", "path": "/admin", "secure": True}, + {"name": "wrong-host", "value": "skip", "domain": "wiki.threatbook-inc.cn", "path": "/", "secure": True}, + {"name": "empty-value", "value": "", "domain": ".threatbook-inc.cn", "path": "/", "secure": True}, + ] + + def fake_cdp(method, **kwargs): + if method == "Network.getCookies": + assert kwargs == {"urls": ["https://eagle-sensor.threatbook-inc.cn/skyeye/alarm/list"]} + raise RuntimeError("Network.getCookies unavailable") + if method == "Storage.getCookies": + return {"cookies": storage_cookies} + raise AssertionError((method, kwargs)) + + def fake_js(expression): + if 'window["localStorage"]' in expression: + return '[{"name":"token","value":"abc"}]' + raise AssertionError(expression) + + with ( + patch( + "flocks.browser.helpers.page_info", + return_value={"url": "https://eagle-sensor.threatbook-inc.cn/skyeye/alarm/list", "title": "Sensor"}, + ), + patch("flocks.browser.helpers.cdp", side_effect=fake_cdp), + patch("flocks.browser.helpers.js", side_effect=fake_js), + ): + result = helpers.save_state(out) + + saved = json.loads(out.read_text(encoding="utf-8")) + assert result["cookies"] == 3 + assert [item["name"] for item in saved["cookies"]] == ["shared", "sensor", "path-match"] + assert saved["origins"] == [ + { + "origin": "https://eagle-sensor.threatbook-inc.cn", + "localStorage": [{"name": "token", "value": "abc"}], + } + ] + + def test_load_state_restores_cookies_and_storage(tmp_path) -> None: state_file = tmp_path / "auth-state.json" state_file.write_text( diff --git a/tests/tool/test_skyeye_sensor_skill_cli.py b/tests/tool/test_skyeye_sensor_skill_cli.py deleted file mode 100644 index 1b509208c..000000000 --- a/tests/tool/test_skyeye_sensor_skill_cli.py +++ /dev/null @@ -1,93 +0,0 @@ -import importlib -import io -import sys -from pathlib import Path - -import pytest -from click.testing import CliRunner -from rich.console import Console - - -SCRIPTS_DIR = Path(__file__).resolve().parents[2] / ".flocks" / "plugins" / "skills" / "skyeye-sensor-data-fetch" / "scripts" - - -@pytest.fixture -def cli_module(monkeypatch: pytest.MonkeyPatch): - monkeypatch.setenv("SKYEYE_SENSOR_BASE_URL", "https://sensor.example.com") - monkeypatch.setenv("SKYEYE_SENSOR_AUTH_STATE", str(SCRIPTS_DIR / "auth-state.json")) - monkeypatch.setenv("SKYEYE_SENSOR_CSRF_TOKEN", "test-token") - - sys.path.insert(0, str(SCRIPTS_DIR)) - for module_name in ("skyeye_sensor_cli", "api_client", "config"): - sys.modules.pop(module_name, None) - - module = importlib.import_module("skyeye_sensor_cli") - yield module - - for module_name in ("skyeye_sensor_cli", "api_client", "config"): - sys.modules.pop(module_name, None) - try: - sys.path.remove(str(SCRIPTS_DIR)) - except ValueError: - pass - - -class FakeSkyeyeSensorClient: - def __init__(self, *args, **kwargs) -> None: - pass - - def get_alarm_count_filtered(self, **kwargs): - return { - "status": 200, - "items": [ - {"time": 1773386502971, "value": 2}, - {"time": 1773386503971, "value": 5}, - ], - } - - def get_alarm_list(self, **kwargs): - return { - "status": 200, - "items": [ - { - "access_time": 1773386502971, - "hazard_level": "high", - "threat_name": "测试告警", - "sip": "1.1.1.1", - "dip": "2.2.2.2", - "status": "unhandled", - } - ], - "total": 1, - } - - -def _run_cli(cli_module, monkeypatch: pytest.MonkeyPatch, args: list[str]) -> str: - output = io.StringIO() - monkeypatch.setattr(cli_module, "SkyeyeSensorClient", FakeSkyeyeSensorClient) - monkeypatch.setattr( - cli_module, - "console", - Console(file=output, force_terminal=False, color_system=None, width=120), - ) - - result = CliRunner().invoke(cli_module.cli, args) - assert result.exit_code == 0, result.output - return output.getvalue() + result.output - - -@pytest.mark.parametrize( - ("args", "expected_fragment"), - [ - (["alarm", "count", "--table", "--days", "1"], "告警趋势"), - (["alarm", "list", "--table", "--days", "1"], "测试告警"), - ], -) -def test_skyeye_sensor_skill_cli_commands( - cli_module, - monkeypatch: pytest.MonkeyPatch, - args: list[str], - expected_fragment: str, -): - output = _run_cli(cli_module, monkeypatch, args) - assert expected_fragment in output diff --git a/tests/tool/test_web2cli_generate_cli.py b/tests/tool/test_web2cli_generate_cli.py index 6b69d65d7..8dbadf849 100644 --- a/tests/tool/test_web2cli_generate_cli.py +++ b/tests/tool/test_web2cli_generate_cli.py @@ -14,6 +14,15 @@ / "scripts" / "generate-cli.py" ) +SPEC_SCRIPT_PATH = ( + Path(__file__).resolve().parents[2] + / ".flocks" + / "plugins" + / "skills" + / "web2cli" + / "scripts" + / "generate-spec.py" +) def _load_module(): @@ -26,6 +35,16 @@ def _load_module(): return module +def _load_spec_module(): + spec = importlib.util.spec_from_file_location("web2cli_generate_spec", SPEC_SCRIPT_PATH) + assert spec is not None + assert spec.loader is not None + + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + def _sample_requests(): return [ { @@ -127,9 +146,13 @@ class _FakeSession: def __init__(self) -> None: self.cookies = _FakeCookieJar() self.headers = {} + self.request_calls = [] - def request(self, method, url, json=None, params=None, headers=None): - raise AssertionError("request should not be called in cookie-loading tests") + def request(self, method, url, json=None, params=None, data=None, headers=None): + self.request_calls.append( + {"method": method, "url": url, "json": json, "params": params, "data": data, "headers": headers} + ) + return _FakeResponse({}) def test_generated_client_loads_storage_state_cookie_object(tmp_path, monkeypatch): @@ -139,10 +162,11 @@ def test_generated_client_loads_storage_state_cookie_object(tmp_path, monkeypatc json.dumps( { "cookies": [ - {"name": "sid", "value": "cookie-123", "domain": ".zhihu.com", "path": "/"}, - {"name": "api", "value": "cookie-456", "domain": "api.zhihu.com", "path": "/api"}, + {"name": "sid", "value": "cookie-123", "domain": ".example.com", "path": "/"}, + {"name": "api", "value": "cookie-456", "domain": "api.example.com", "path": "/api"}, + {"name": "ignore", "value": "cookie-789", "domain": ".zhihu.com", "path": "/"}, ], - "origins": [{"origin": "https://www.zhihu.com", "localStorage": [{"name": "token", "value": "abc"}]}], + "origins": [{"origin": "https://api.example.com", "localStorage": [{"name": "token", "value": "abc"}]}], } ), encoding="utf-8", @@ -153,14 +177,26 @@ def test_generated_client_loads_storage_state_cookie_object(tmp_path, monkeypatc monkeypatch.setitem(sys.modules, "requests", fake_requests) namespace = {} - exec(module.generate_python_client(_sample_requests(), "https://example.com"), namespace) + exec(module.generate_python_client(_sample_requests(), "https://api.example.com"), namespace) client = namespace["APIClient"](cookie_file=str(auth_state)) + client._request("POST", "/api/items/list", {"page": 1}) assert client.session is fake_session - assert fake_session.cookies.set_calls == [ - {"name": "sid", "value": "cookie-123", "domain": ".zhihu.com", "path": "/"}, - {"name": "api", "value": "cookie-456", "domain": "api.zhihu.com", "path": "/api"}, + assert fake_session.cookies.set_calls == [] + assert fake_session.request_calls == [ + { + "method": "POST", + "url": "https://api.example.com/api/items/list", + "json": {"page": 1}, + "params": None, + "data": None, + "headers": { + "Accept": "application/json, text/plain, */*", + "Content-Type": "application/json; charset=UTF-8", + "Cookie": "sid=cookie-123; api=cookie-456", + }, + } ] @@ -184,11 +220,23 @@ def test_generated_client_still_supports_plain_cookie_list(tmp_path, monkeypatch namespace = {} exec(module.generate_python_client(_sample_requests(), "https://example.com"), namespace) - namespace["APIClient"](cookie_file=str(cookie_file)) + client = namespace["APIClient"](cookie_file=str(cookie_file)) + client._request("POST", "/api/items/list", {"page": 1}) - assert fake_session.cookies.set_calls == [ - {"name": "sid", "value": "cookie-123"}, - {"name": "api", "value": "cookie-456", "path": "/"}, + assert fake_session.cookies.set_calls == [] + assert fake_session.request_calls == [ + { + "method": "POST", + "url": "https://example.com/api/items/list", + "json": {"page": 1}, + "params": None, + "data": None, + "headers": { + "Accept": "application/json, text/plain, */*", + "Content-Type": "application/json; charset=UTF-8", + "Cookie": "sid=cookie-123; api=cookie-456", + }, + } ] @@ -206,6 +254,8 @@ def _sample_spec(): "endpoint": "/api/items/list", "queryTemplate": {}, "bodyTemplate": {"page": "${page}", "size": "${limit}"}, + "payloadMode": "json", + "rawBodyTemplate": "", "headers": {"Content-Type": "application/json"}, }, "rowSource": {"path": "$.data.items[]", "collectionPath": "$.data.items[]"}, @@ -239,6 +289,8 @@ def _multi_operation_spec(): "endpoint": "/api/alerts/list", "queryTemplate": {}, "bodyTemplate": {"page": "${page}", "size": "${limit}"}, + "payloadMode": "json", + "rawBodyTemplate": "", "headers": {"Content-Type": "application/json"}, }, "rowSource": {"path": "$.data.items[]", "collectionPath": "$.data.items[]"}, @@ -254,6 +306,8 @@ def _multi_operation_spec(): "endpoint": "/api/alarms/count", "queryTemplate": {"page": "${page}"}, "bodyTemplate": {}, + "payloadMode": "none", + "rawBodyTemplate": "", "headers": {"Accept": "application/json"}, }, "rowSource": {"path": "$", "collectionPath": "$"}, @@ -272,6 +326,52 @@ def _multi_operation_spec(): return spec +def _form_spec(): + spec = _sample_spec() + spec["command"] = "search_items" + spec["description"] = "Search items with form payload" + spec["operation"] = { + "method": "POST", + "endpoint": "/api/search", + "queryTemplate": {}, + "bodyTemplate": {"page": "${page}", "size": "${limit}", "keyword": "alpha"}, + "payloadMode": "form", + "rawBodyTemplate": "", + "headers": {"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8"}, + } + spec["verify"]["args"] = {"page": 1, "limit": 20} + return spec + + +def _raw_spec(): + spec = _sample_spec() + spec["command"] = "raw_search" + spec["description"] = "Search items with raw body" + spec["operation"] = { + "method": "POST", + "endpoint": "/api/raw-search", + "queryTemplate": {"page": "${page}"}, + "bodyTemplate": {}, + "payloadMode": "raw", + "rawBodyTemplate": "keyword=alpha", + "headers": {"Content-Type": "text/plain"}, + } + spec["args"] = [{"name": "page", "type": "int", "default": 1, "help": "Page number"}] + spec["verify"]["args"] = {"page": 1} + return spec + + +def _header_auth_spec(): + spec = _sample_spec() + spec["strategy"] = "HEADER" + spec["auth"] = { + "stateFile": "auth-state.json", + "requiredCookies": [], + "requiredHeaders": [{"name": "X-CSRF-Token", "source": "localStorage", "key": "csrfToken"}], + } + return spec + + class _FakeResponse: def __init__(self, payload): self._payload = payload @@ -289,8 +389,10 @@ def __init__(self, payload) -> None: self._payload = payload self.request_calls = [] - def request(self, method, url, json=None, params=None, headers=None): - self.request_calls.append({"method": method, "url": url, "json": json, "params": params, "headers": headers}) + def request(self, method, url, json=None, params=None, data=None, headers=None): + self.request_calls.append( + {"method": method, "url": url, "json": json, "params": params, "data": data, "headers": headers} + ) return _FakeResponse(self._payload) @@ -337,7 +439,15 @@ def test_generated_spec_cli_executes_request_and_projects_rows(tmp_path, monkeyp module = _load_module() auth_state = tmp_path / "auth-state.json" auth_state.write_text( - json.dumps({"cookies": [{"name": "sid", "value": "cookie-123", "domain": ".example.com", "path": "/"}]}), + json.dumps( + { + "cookies": [ + {"name": "sid", "value": "cookie-123", "domain": ".example.com", "path": "/"}, + {"name": "scoped", "value": "cookie-456", "domain": "example.com", "path": "/api/items"}, + {"name": "ignore", "value": "cookie-789", "domain": ".zhihu.com", "path": "/"}, + ] + } + ), encoding="utf-8", ) @@ -362,12 +472,14 @@ def test_generated_spec_cli_executes_request_and_projects_rows(tmp_path, monkeyp "url": "https://example.com/api/items/list", "json": {"page": 3, "size": 5}, "params": None, - "headers": {"Content-Type": "application/json"}, + "data": None, + "headers": { + "Content-Type": "application/json", + "Cookie": "sid=cookie-123; scoped=cookie-456", + }, } ] - assert fake_session.cookies.set_calls == [ - {"name": "sid", "value": "cookie-123", "domain": ".example.com", "path": "/"} - ] + assert fake_session.cookies.set_calls == [] def test_generated_multi_operation_cli_runs_selected_subcommand(monkeypatch): @@ -390,11 +502,207 @@ def test_generated_multi_operation_cli_runs_selected_subcommand(monkeypatch): "url": "https://example.com/api/alerts/list", "json": {"page": 2, "size": 10}, "params": None, + "data": None, "headers": {"Content-Type": "application/json"}, } ] +def test_generated_form_spec_cli_sends_form_data(monkeypatch): + module = _load_module() + fake_session = _FakeRequestSession({"data": {"items": [{"id": "1", "title": "Alpha"}]}}) + fake_requests = types.SimpleNamespace(Session=lambda: fake_session) + monkeypatch.setitem(sys.modules, "requests", fake_requests) + + namespace = {"__name__": "generated_form_cli"} + exec(module.generate_python_cli_from_spec(_form_spec()), namespace) + + client = namespace["APIClient"]() + rows = client.run({"page": 4, "limit": 30}) + + assert rows == [{"id": "1", "title": "Alpha"}] + assert fake_session.request_calls == [ + { + "method": "POST", + "url": "https://example.com/api/search", + "json": None, + "params": None, + "data": {"page": 4, "size": 30, "keyword": "alpha"}, + "headers": {"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8"}, + } + ] + + +def test_generated_raw_spec_cli_sends_raw_body(monkeypatch): + module = _load_module() + fake_session = _FakeRequestSession({"data": {"items": [{"id": "1", "title": "Alpha"}]}}) + fake_requests = types.SimpleNamespace(Session=lambda: fake_session) + monkeypatch.setitem(sys.modules, "requests", fake_requests) + + namespace = {"__name__": "generated_raw_cli"} + exec(module.generate_python_cli_from_spec(_raw_spec()), namespace) + + client = namespace["APIClient"]() + rows = client.run({"page": 7}) + + assert rows == [{"id": "1", "title": "Alpha"}] + assert fake_session.request_calls == [ + { + "method": "POST", + "url": "https://example.com/api/raw-search", + "json": None, + "params": {"page": 7}, + "data": "keyword=alpha", + "headers": {"Content-Type": "text/plain"}, + } + ] + + +def test_generated_multi_operation_cli_sends_get_query_without_body(monkeypatch): + module = _load_module() + fake_session = _FakeRequestSession({"count": 5}) + fake_requests = types.SimpleNamespace(Session=lambda: fake_session) + monkeypatch.setitem(sys.modules, "requests", fake_requests) + + namespace = {"__name__": "generated_get_cli"} + exec(module.generate_python_cli_from_spec(_multi_operation_spec()), namespace) + + client = namespace["APIClient"]() + entry = namespace["_operation_by_command"]("alarm-count") + rows = client.run({"page": 9}, entry) + + assert rows == [{"count": 5}] + assert fake_session.request_calls == [ + { + "method": "GET", + "url": "https://example.com/api/alarms/count", + "json": None, + "params": {"page": 9}, + "data": None, + "headers": {"Accept": "application/json"}, + } + ] + + +def test_generated_header_strategy_cli_sends_cookie_header_and_required_headers(tmp_path, monkeypatch): + module = _load_module() + auth_state = tmp_path / "auth-state.json" + auth_state.write_text( + json.dumps( + { + "cookies": [{"name": "sid", "value": "cookie-123", "domain": ".example.com", "path": "/"}], + "origins": [ + { + "origin": "https://example.com", + "localStorage": [{"name": "csrfToken", "value": "csrf-abc"}], + } + ], + } + ), + encoding="utf-8", + ) + fake_session = _FakeRequestSession({"data": {"items": [{"id": "1", "title": "Alpha"}]}}) + fake_requests = types.SimpleNamespace(Session=lambda: fake_session) + monkeypatch.setitem(sys.modules, "requests", fake_requests) + + namespace = {"__name__": "generated_header_cli"} + exec(module.generate_python_cli_from_spec(_header_auth_spec()), namespace) + + client = namespace["APIClient"](auth_state=str(auth_state)) + rows = client.run({"page": 1, "limit": 20}) + + assert rows == [{"id": "1", "title": "Alpha"}] + assert fake_session.request_calls == [ + { + "method": "POST", + "url": "https://example.com/api/items/list", + "json": {"page": 1, "size": 20}, + "params": None, + "data": None, + "headers": { + "Content-Type": "application/json", + "Cookie": "sid=cookie-123", + }, + } + ] + assert client.session.headers["X-CSRF-Token"] == "csrf-abc" + assert fake_session.cookies.set_calls == [] + + +def test_capture_to_spec_to_cli_preserves_form_payload_mode(monkeypatch): + cli_module = _load_module() + spec_module = _load_spec_module() + fake_session = _FakeRequestSession({"data": {"items": [{"id": "1", "title": "Alpha"}]}}) + fake_requests = types.SimpleNamespace(Session=lambda: fake_session) + monkeypatch.setitem(sys.modules, "requests", fake_requests) + + requests = [ + { + "type": "Fetch", + "method": "POST", + "url": "https://example.com/api/search", + "origin": "https://example.com", + "pathname": "/api/search", + "status": 200, + "captureReason": "nonGet", + "requestContentType": "application/x-www-form-urlencoded; charset=UTF-8", + "requestHeaders": {"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8"}, + "requestBodyKind": "urlencoded", + "requestBody": '{\n "page": "1",\n "size": "20",\n "keyword": "alpha"\n}', + "response": '{"data":{"items":[{"id":"1","title":"Alpha"}]}}', + } + ] + generated_spec = spec_module.generate_spec_from_requests(requests) + + namespace = {"__name__": "generated_capture_form_cli"} + exec(cli_module.generate_python_cli_from_spec(generated_spec), namespace) + + client = namespace["APIClient"]() + rows = client.run({"page": 5, "limit": 15}) + + assert rows == [{"id": "1", "title": "Alpha"}] + assert fake_session.request_calls[0]["data"] == {"page": 5, "size": 15, "keyword": "alpha"} + assert fake_session.request_calls[0]["json"] is None + + +def test_capture_to_spec_to_cli_preserves_raw_payload_mode(monkeypatch): + cli_module = _load_module() + spec_module = _load_spec_module() + fake_session = _FakeRequestSession({"data": {"items": [{"id": "1", "title": "Alpha"}]}}) + fake_requests = types.SimpleNamespace(Session=lambda: fake_session) + monkeypatch.setitem(sys.modules, "requests", fake_requests) + + requests = [ + { + "type": "Fetch", + "method": "POST", + "url": "https://example.com/api/raw-search?page=1", + "origin": "https://example.com", + "pathname": "/api/raw-search", + "query": {"page": "1"}, + "status": 200, + "captureReason": "nonGet", + "requestContentType": "text/plain", + "requestHeaders": {"Content-Type": "text/plain"}, + "requestBodyKind": "text", + "requestBody": "keyword=alpha", + "response": '{"data":{"items":[{"id":"1","title":"Alpha"}]}}', + } + ] + generated_spec = spec_module.generate_spec_from_requests(requests) + + namespace = {"__name__": "generated_capture_raw_cli"} + exec(cli_module.generate_python_cli_from_spec(generated_spec), namespace) + + client = namespace["APIClient"]() + rows = client.run({"page": 8}) + + assert rows == [{"id": "1", "title": "Alpha"}] + assert fake_session.request_calls[0]["params"] == {"page": 8} + assert fake_session.request_calls[0]["data"] == "keyword=alpha" + assert fake_session.request_calls[0]["json"] is None + + def test_main_supports_spec_verify_output(tmp_path, monkeypatch, capsys): module = _load_module() spec_path = tmp_path / "web2cli-spec.json" diff --git a/tests/tool/test_web2cli_generate_spec.py b/tests/tool/test_web2cli_generate_spec.py index 9972c6fc8..bbe9329ad 100644 --- a/tests/tool/test_web2cli_generate_spec.py +++ b/tests/tool/test_web2cli_generate_spec.py @@ -89,6 +89,48 @@ def _multi_operation_requests(): ] +def _form_request(): + return [ + { + "type": "Fetch", + "method": "POST", + "url": "https://example.com/api/search", + "origin": "https://example.com", + "pathname": "/api/search", + "status": 200, + "captureReason": "nonGet", + "requestContentType": "application/x-www-form-urlencoded; charset=UTF-8", + "requestHeaders": { + "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", + "Cookie": "sid=cookie-123", + }, + "requestBodyKind": "urlencoded", + "requestBody": '{\n "page": "1",\n "size": "20",\n "keyword": "alpha"\n}', + "response": '{"data":{"items":[{"id":"1","title":"Alpha"}]}}', + } + ] + + +def _raw_request(): + return [ + { + "type": "Fetch", + "method": "POST", + "url": "https://example.com/api/raw-search?page=1", + "origin": "https://example.com", + "pathname": "/api/raw-search", + "query": {"page": "1"}, + "status": 200, + "captureReason": "nonGet", + "requestContentType": "text/plain", + "requestHeaders": {"Content-Type": "text/plain", "Cookie": "sid=cookie-123"}, + "requestBodyKind": "text", + "requestBody": "keyword=alpha", + "response": '{"data":{"items":[{"id":"1","title":"Alpha"}]}}', + } + ] + + def test_generate_spec_from_requests_picks_primary_collection_endpoint(): module = _load_module() @@ -126,6 +168,27 @@ def test_generate_spec_from_requests_includes_multi_operation_entries(): ] +def test_generate_spec_from_requests_preserves_form_payload_mode(): + module = _load_module() + + spec = module.generate_spec_from_requests(_form_request()) + + assert spec["operation"]["payloadMode"] == "form" + assert spec["operation"]["bodyTemplate"] == {"page": "${page}", "size": "${limit}", "keyword": "alpha"} + assert spec["operation"]["rawBodyTemplate"] == "" + + +def test_generate_spec_from_requests_preserves_raw_payload_mode(): + module = _load_module() + + spec = module.generate_spec_from_requests(_raw_request()) + + assert spec["operation"]["payloadMode"] == "raw" + assert spec["operation"]["bodyTemplate"] == {} + assert spec["operation"]["rawBodyTemplate"] == "keyword=alpha" + assert spec["args"] == [{"name": "page", "type": "int", "default": 1, "help": "Page number"}] + + def test_main_writes_spec_file(tmp_path, monkeypatch, capsys): module = _load_module() input_path = tmp_path / "captured.json" diff --git a/tests/tool/test_web2cli_hook_base.py b/tests/tool/test_web2cli_hook_base.py index 46f73096f..e0d3bc8ec 100644 --- a/tests/tool/test_web2cli_hook_base.py +++ b/tests/tool/test_web2cli_hook_base.py @@ -96,6 +96,11 @@ def _run_node_case(test_logic: str) -> dict: async function fetchImpl() {{ return {{ status: 201, + headers: {{ + get: function(name) {{ + return String(name || "").toLowerCase() === "content-type" ? "application/json" : ""; + }} + }}, clone: function() {{ return {{ text: async function() {{ @@ -223,3 +228,62 @@ def test_hook_base_exposes_debug_state_and_truncates_large_responses(): assert result["debugState"]["lastRequest"]["response"] == result["response"] assert result["debugState"]["lastRequest"]["pathname"] == "/api/debug" assert any("window.__apiCapture.getDebugState()" in line for line in result["logs"]) + + +def test_hook_base_captures_fetch_request_like_body_and_headers(): + result = _run_node_case( + """ + const requestLike = { + url: "https://example.com/api/items/list", + method: "POST", + headers: { + "Content-Type": "application/json", + "X-Trace-Id": "trace-123" + }, + clone: function() { + return { + text: async function() { + return JSON.stringify({ page: 2, size: 50 }); + } + }; + } + }; + + await env.context.window.fetch(requestLike); + + process.stdout.write(JSON.stringify({ + request: env.context.window.__capturedRequests[0] + })); +""" + ) + + assert result["request"]["method"] == "POST" + assert result["request"]["requestHeaders"]["Content-Type"] == "application/json" + assert result["request"]["requestHeaders"]["X-Trace-Id"] == "trace-123" + assert result["request"]["requestBodyKind"] == "json" + assert result["request"]["requestShape"]["$.page"] == "number" + assert result["request"]["requestShape"]["$.size"] == "number" + + +def test_hook_base_uses_content_type_to_parse_urlencoded_string_body(): + result = _run_node_case( + """ + await env.context.window.fetch("https://example.com/api/search", { + method: "POST", + headers: { + "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8" + }, + body: "page=3&size=10&keyword=alpha" + }); + + process.stdout.write(JSON.stringify({ + request: env.context.window.__capturedRequests[0] + })); +""" + ) + + assert result["request"]["method"] == "POST" + assert result["request"]["requestBodyKind"] == "urlencoded" + assert result["request"]["requestBody"] == '{\n "page": "3",\n "size": "10",\n "keyword": "alpha"\n}' + assert result["request"]["requestShape"]["$.page"] == "string" + assert result["request"]["requestShape"]["$.keyword"] == "string" From 81940723117ae312eaed1ec5b52ddb3ae0c0cb3c Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Tue, 26 May 2026 15:00:31 +0800 Subject: [PATCH 2/3] fix(web2cli,browser): tighten cookie/header edge cases Preserve empty cookie values, prefer longer-path cookies in header order, resolve cookie-sourced auth headers without base-url path filtering, and avoid consuming fetch Request bodies without clone(). Restore legacy browser hostname helpers for external compatibility. Co-authored-by: Cursor --- .../skills/web2cli/scripts/generate-cli.py | 14 +-- .../web2cli/scripts/inject-hook-base.js | 13 --- flocks/browser/helpers.py | 16 ++++ tests/tool/test_web2cli_generate_cli.py | 92 ++++++++++++++++++- tests/tool/test_web2cli_hook_base.py | 30 ++++++ 5 files changed, 144 insertions(+), 21 deletions(-) diff --git a/.flocks/plugins/skills/web2cli/scripts/generate-cli.py b/.flocks/plugins/skills/web2cli/scripts/generate-cli.py index 933a5a94f..67295d931 100644 --- a/.flocks/plugins/skills/web2cli/scripts/generate-cli.py +++ b/.flocks/plugins/skills/web2cli/scripts/generate-cli.py @@ -198,7 +198,7 @@ def _build_cookie_header(self, url: str) -> str: for index, cookie in enumerate(self.cookie_items): name = cookie.get("name") value = cookie.get("value") - if not name or value in (None, ""): + if not name or value is None: continue domain = str(cookie.get("domain", "")) if domain and not self._domain_match(host, domain): @@ -215,7 +215,7 @@ def _build_cookie_header(self, url: str) -> str: selected[name] = (score, f"{name}={value}") return "; ".join( - header for _, header in sorted(selected.values(), key=lambda item: item[0][2]) + header for _, header in sorted(selected.values(), key=lambda item: (-item[0][0], item[0][2])) ) def __init__(self, base_url: str = __BASE_URL__, cookie_file: str = "auth-state.json"): @@ -893,7 +893,7 @@ def _select_cookie_header(self, url: str) -> str: for index, cookie in enumerate(self._load_cookie_items(self.auth_state_path)): name = cookie.get("name") value = cookie.get("value") - if not name or value in (None, ""): + if not name or value is None: continue domain = str(cookie.get("domain", "")) if domain and not self._domain_match(host, domain): @@ -928,11 +928,9 @@ def _resolve_cookie_value(self, key: Any) -> str | None: domain = str(cookie.get("domain", "")) if domain and not self._domain_match(host, domain): continue - cookie_path = str(cookie.get("path", "/") or "/") - if not self._path_match(request_path, cookie_path): - continue if cookie.get("secure") and not is_https: continue + cookie_path = str(cookie.get("path", "/") or "/") score = (len(cookie_path), len(domain.lstrip(".")), index) if selected is None or score > selected[0]: selected = (score, str(cookie.get("value", ""))) @@ -998,6 +996,8 @@ def __init__(self, base_url: str = SPEC.get("baseUrl", ""), auth_state: str = "a self.base_url = (base_url or SPEC.get("baseUrl", "")).rstrip("/") self.auth_state_path = auth_state self.auth_state = _load_json(auth_state) if auth_state else {} + if not isinstance(self.auth_state, dict): + self.auth_state = {} self.session = requests.Session() self._apply_auth_state() @@ -1017,7 +1017,7 @@ def _apply_auth_state(self) -> None: value = self._resolve_cookie_value(rule.get("key")) else: value = self._resolve_header_value(self.auth_state, rule) - if value: + if value is not None: self.session.headers[str(rule["name"])] = value def build_request(self, args: Dict[str, Any], entry: Dict[str, Any]) -> Dict[str, Any]: diff --git a/.flocks/plugins/skills/web2cli/scripts/inject-hook-base.js b/.flocks/plugins/skills/web2cli/scripts/inject-hook-base.js index ec49a2dde..747b4649a 100644 --- a/.flocks/plugins/skills/web2cli/scripts/inject-hook-base.js +++ b/.flocks/plugins/skills/web2cli/scripts/inject-hook-base.js @@ -645,19 +645,6 @@ } catch (error) { bodyPromise = Promise.resolve(''); } - } else if (requestLike && typeof requestLike.text === 'function') { - try { - bodyPromise = requestLike.text().then( - function(text) { - return text || ''; - }, - function() { - return ''; - } - ); - } catch (error) { - bodyPromise = Promise.resolve(''); - } } else { bodyPromise = Promise.resolve(''); } diff --git a/flocks/browser/helpers.py b/flocks/browser/helpers.py index ea2cf5be7..ce9e91ba5 100644 --- a/flocks/browser/helpers.py +++ b/flocks/browser/helpers.py @@ -22,6 +22,8 @@ AGENT_WORKSPACE = Path(os.environ.get("BH_AGENT_WORKSPACE", DEFAULT_AGENT_WORKSPACE)).expanduser() NAME = os.environ.get("BU_NAME", "default") INTERNAL = INTERNAL_URL_PREFIXES +# Legacy compatibility for external imports; cookie export no longer uses site-root guessing. +_COMMON_SECOND_LEVEL_SUFFIXES = {"ac", "co", "com", "edu", "gov", "mil", "net", "org"} _COOKIE_IMPORT_FIELDS = { "name", "value", @@ -242,6 +244,20 @@ def _stringify_json(data: Any) -> str: return json.dumps(data, ensure_ascii=False, separators=(",", ":")) +def _site_root_from_hostname(hostname: str) -> str: + labels = [label for label in hostname.lower().strip(".").split(".") if label] + if len(labels) <= 2: + return ".".join(labels) + if len(labels[-1]) == 2 and labels[-2] in _COMMON_SECOND_LEVEL_SUFFIXES: + return ".".join(labels[-3:]) + return ".".join(labels[-2:]) + + +def _domain_matches_site(domain: str | None, site_root: str) -> bool: + normalized = str(domain or "").lower().lstrip(".") + return bool(normalized) and (normalized == site_root or normalized.endswith(f".{site_root}")) + + def _domain_matches_host(domain: str | None, hostname: str) -> bool: normalized_domain = str(domain or "").lower().lstrip(".") normalized_host = str(hostname or "").lower().strip(".") diff --git a/tests/tool/test_web2cli_generate_cli.py b/tests/tool/test_web2cli_generate_cli.py index 8dbadf849..294dc7ed5 100644 --- a/tests/tool/test_web2cli_generate_cli.py +++ b/tests/tool/test_web2cli_generate_cli.py @@ -194,7 +194,7 @@ def test_generated_client_loads_storage_state_cookie_object(tmp_path, monkeypatc "headers": { "Accept": "application/json, text/plain, */*", "Content-Type": "application/json; charset=UTF-8", - "Cookie": "sid=cookie-123; api=cookie-456", + "Cookie": "api=cookie-456; sid=cookie-123", }, } ] @@ -372,6 +372,17 @@ def _header_auth_spec(): return spec +def _cookie_source_header_auth_spec(): + spec = _sample_spec() + spec["strategy"] = "HEADER" + spec["auth"] = { + "stateFile": "auth-state.json", + "requiredCookies": [], + "requiredHeaders": [{"name": "X-CSRF-Token", "source": "cookie", "key": "csrf"}], + } + return spec + + class _FakeResponse: def __init__(self, payload): self._payload = payload @@ -629,6 +640,85 @@ def test_generated_header_strategy_cli_sends_cookie_header_and_required_headers( assert fake_session.cookies.set_calls == [] +def test_generated_header_strategy_cookie_source_ignores_base_url_path(tmp_path, monkeypatch): + module = _load_module() + auth_state = tmp_path / "auth-state.json" + auth_state.write_text( + json.dumps( + { + "cookies": [ + {"name": "sid", "value": "cookie-123", "domain": ".example.com", "path": "/"}, + {"name": "csrf", "value": "csrf-from-cookie", "domain": ".example.com", "path": "/api/items"}, + ] + } + ), + encoding="utf-8", + ) + fake_session = _FakeRequestSession({"data": {"items": [{"id": "1", "title": "Alpha"}]}}) + fake_requests = types.SimpleNamespace(Session=lambda: fake_session) + monkeypatch.setitem(sys.modules, "requests", fake_requests) + + namespace = {"__name__": "generated_cookie_header_cli"} + exec(module.generate_python_cli_from_spec(_cookie_source_header_auth_spec()), namespace) + + client = namespace["APIClient"](auth_state=str(auth_state)) + rows = client.run({"page": 1, "limit": 20}) + + assert rows == [{"id": "1", "title": "Alpha"}] + assert client.session.headers["X-CSRF-Token"] == "csrf-from-cookie" + assert fake_session.request_calls[0]["headers"]["Cookie"] == "sid=cookie-123; csrf=csrf-from-cookie" + + +def test_generated_header_strategy_accepts_empty_cookie_values(tmp_path, monkeypatch): + module = _load_module() + auth_state = tmp_path / "auth-state.json" + auth_state.write_text( + json.dumps( + { + "cookies": [ + {"name": "flag", "value": "", "domain": ".example.com", "path": "/"}, + {"name": "sid", "value": "cookie-123", "domain": ".example.com", "path": "/"}, + ] + } + ), + encoding="utf-8", + ) + fake_session = _FakeRequestSession({"data": {"items": [{"id": "1", "title": "Alpha"}]}}) + fake_requests = types.SimpleNamespace(Session=lambda: fake_session) + monkeypatch.setitem(sys.modules, "requests", fake_requests) + + namespace = {"__name__": "generated_empty_cookie_cli"} + exec(module.generate_python_cli_from_spec(_sample_spec()), namespace) + + client = namespace["APIClient"](auth_state=str(auth_state)) + rows = client.run({"page": 1, "limit": 20}) + + assert rows == [{"id": "1", "title": "Alpha"}] + assert fake_session.request_calls[0]["headers"]["Cookie"] == "flag=; sid=cookie-123" + + +def test_generated_cli_normalizes_non_dict_auth_state_for_headers(tmp_path, monkeypatch): + module = _load_module() + auth_state = tmp_path / "auth-state.json" + auth_state.write_text( + json.dumps([{"name": "sid", "value": "cookie-123", "domain": ".example.com", "path": "/"}]), + encoding="utf-8", + ) + fake_session = _FakeRequestSession({"data": {"items": [{"id": "1", "title": "Alpha"}]}}) + fake_requests = types.SimpleNamespace(Session=lambda: fake_session) + monkeypatch.setitem(sys.modules, "requests", fake_requests) + + namespace = {"__name__": "generated_list_auth_state_cli"} + exec(module.generate_python_cli_from_spec(_header_auth_spec()), namespace) + + client = namespace["APIClient"](auth_state=str(auth_state)) + rows = client.run({"page": 1, "limit": 20}) + + assert rows == [{"id": "1", "title": "Alpha"}] + assert client.auth_state == {} + assert "X-CSRF-Token" not in client.session.headers + + def test_capture_to_spec_to_cli_preserves_form_payload_mode(monkeypatch): cli_module = _load_module() spec_module = _load_spec_module() diff --git a/tests/tool/test_web2cli_hook_base.py b/tests/tool/test_web2cli_hook_base.py index e0d3bc8ec..46c3b2aa7 100644 --- a/tests/tool/test_web2cli_hook_base.py +++ b/tests/tool/test_web2cli_hook_base.py @@ -287,3 +287,33 @@ def test_hook_base_uses_content_type_to_parse_urlencoded_string_body(): assert result["request"]["requestBody"] == '{\n "page": "3",\n "size": "10",\n "keyword": "alpha"\n}' assert result["request"]["requestShape"]["$.page"] == "string" assert result["request"]["requestShape"]["$.keyword"] == "string" + + +def test_hook_base_does_not_consume_request_body_without_clone(): + result = _run_node_case( + """ + let textCalls = 0; + const requestLike = { + url: "https://example.com/api/items/list", + method: "POST", + headers: { + "Content-Type": "application/json" + }, + text: async function() { + textCalls += 1; + return JSON.stringify({ page: 9 }); + } + }; + + await env.context.window.fetch(requestLike); + + process.stdout.write(JSON.stringify({ + textCalls, + request: env.context.window.__capturedRequests[0] + })); +""" + ) + + assert result["textCalls"] == 0 + assert result["request"]["method"] == "POST" + assert result["request"]["requestBodyKind"] == "empty" From c98f78ee174720db1e4e9a8ce3e1c82e23ba5ae2 Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Tue, 26 May 2026 15:15:33 +0800 Subject: [PATCH 3/3] docs(readme): increase Docker shared memory to 4gb Raise the documented --shm-size for browser workloads in Docker run examples. Co-authored-by: Cursor --- README.md | 4 ++-- README_zh.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b7b7f86f4..a2fff0cd0 100644 --- a/README.md +++ b/README.md @@ -142,7 +142,7 @@ docker run -d \ --name flocks \ -p 8000:8000 \ -p 5173:5173 \ - --shm-size 2gb \ + --shm-size 4gb \ -v "${HOME}/.flocks:/home/flocks/.flocks" \ ghcr.io/agentflocks/flocks:latest ``` @@ -153,7 +153,7 @@ docker run -d ` --name flocks ` -p 8000:8000 ` -p 5173:5173 ` - --shm-size 2gb ` + --shm-size 4gb ` -v "${env:USERPROFILE}\.flocks:/home/flocks/.flocks" ` ghcr.io/agentflocks/flocks:latest ``` diff --git a/README_zh.md b/README_zh.md index 08464ebd0..fd0a43e5f 100644 --- a/README_zh.md +++ b/README_zh.md @@ -142,7 +142,7 @@ docker run -d \ -e TZ=Asia/Shanghai \ -p 8000:8000 \ -p 5173:5173 \ - --shm-size 2gb \ + --shm-size 4gb \ -v "${HOME}/.flocks:/home/flocks/.flocks" \ ghcr.io/agentflocks/flocks:latest ``` @@ -154,7 +154,7 @@ docker run -d ` -e TZ=Asia/Shanghai ` -p 8000:8000 ` -p 5173:5173 ` - --shm-size 2gb ` + --shm-size 4gb ` -v "${env:USERPROFILE}\.flocks:/home/flocks/.flocks" ` ghcr.io/agentflocks/flocks:latest ```