From 1f9ef47f0029ff2b9750b399eab98fbeec9b8b33 Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Fri, 8 May 2026 16:00:33 +0800 Subject: [PATCH 01/41] feat: add alert_dedup workflow for security alert deduplication MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 基于 aisoc_mini 的告警去重算法(URI 归一化 + 5-gram Shingling + Jaccard 相似度)移植为 flocks 工作流,包含 5 个线性节点:接收解析 → URI 归一化 → 去重键计算 → 分组去重 → 报告生成。支持严格字段精确匹配与 LSH 字段近似 匹配,输出唯一告警、重复告警及 Markdown 分析报告。 Co-authored-by: Cursor --- .../plugins/workflows/alert_dedup/meta.json | 10 ++ .../workflows/alert_dedup/workflow.json | 80 +++++++++ .../plugins/workflows/alert_dedup/workflow.md | 164 ++++++++++++++++++ 3 files changed, 254 insertions(+) create mode 100644 .flocks/plugins/workflows/alert_dedup/meta.json create mode 100644 .flocks/plugins/workflows/alert_dedup/workflow.json create mode 100644 .flocks/plugins/workflows/alert_dedup/workflow.md diff --git a/.flocks/plugins/workflows/alert_dedup/meta.json b/.flocks/plugins/workflows/alert_dedup/meta.json new file mode 100644 index 000000000..2415aeec9 --- /dev/null +++ b/.flocks/plugins/workflows/alert_dedup/meta.json @@ -0,0 +1,10 @@ +{ + "name": "alert_dedup", + "description": "告警去重工作流:通过 URI 归一化、5-gram Shingling 与 Jaccard 相似度对批量安全告警进行聚类去重,降低告警噪声", + "category": "security", + "status": "active", + "createdBy": null, + "createdAt": 1746691200000, + "updatedAt": 1746691200000, + "id": "alert_dedup" +} diff --git a/.flocks/plugins/workflows/alert_dedup/workflow.json b/.flocks/plugins/workflows/alert_dedup/workflow.json new file mode 100644 index 000000000..c4eea5b06 --- /dev/null +++ b/.flocks/plugins/workflows/alert_dedup/workflow.json @@ -0,0 +1,80 @@ +{ + "name": "alert_dedup", + "description": "Alert deduplication workflow - groups similar security alerts using URI normalization, 5-gram shingling and Jaccard similarity to reduce alert noise.", + "description_cn": "告警去重工作流 - 通过 URI 归一化、5-gram Shingling 与 Jaccard 相似度对相似安全告警进行分组,有效降低告警噪声", + 
"start": "receive_alerts", + "nodes": [ + { + "id": "receive_alerts", + "type": "python", + "description": "接收并校验告警列表,提取去重配置(严格字段、LSH 字段、相似度阈值)", + "code": "import json\n\nalerts_input = inputs.get('alerts', inputs.get('alert_list', []))\nif isinstance(alerts_input, str):\n try:\n alerts_input = json.loads(alerts_input)\n except Exception:\n alerts_input = []\nif isinstance(alerts_input, dict) and 'data' in alerts_input:\n alerts_input = alerts_input.get('data', [])\nif not isinstance(alerts_input, list):\n alerts_input = [alerts_input] if alerts_input else []\n\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nthreshold = float(inputs.get('threshold', 0.7))\nmax_field_len = int(inputs.get('max_field_len', 500))\n\nif not isinstance(strict_fields, list) or not strict_fields:\n strict_fields = ['sip', 'dip']\nif not isinstance(lsh_fields, list) or not lsh_fields:\n lsh_fields = ['req_http_url', 'req_body', 'rsp_body']\n\noutputs['alerts'] = alerts_input\noutputs['total_count'] = len(alerts_input)\noutputs['strict_fields'] = strict_fields\noutputs['lsh_fields'] = lsh_fields\noutputs['threshold'] = threshold\noutputs['max_field_len'] = max_field_len" + }, + { + "id": "normalize_alerts", + "type": "python", + "description": "对 LSH 字段进行 URI 归一化(日期→DATETIME、UUID→UUID、长数字→NUM、路径穿越、NULL 字节、URL 编码等),降低文本噪声", + "code": "import re\n\ndef normalize_uri(text):\n if not text:\n return ''\n text = str(text)\n text = re.sub(r'\\d{4}[-/]\\d{1,2}[-/]\\d{1,2}(?:[T ]\\d{2}:\\d{2}(?::\\d{2})?)?', 'DATETIME', text)\n text = re.sub(r'[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}', 'UUID', text)\n text = re.sub(r'\\b\\d{6,}\\b', 'NUM', text)\n text = re.sub(r'(?:\\.\\./|\\.\\\\\\\\)+', '../', text)\n text = re.sub(r'%00', 'NULL', text)\n text = re.sub(r'(?:%[0-9a-fA-F]{2}){3,}', 'ENCODED', text)\n return text\n\nalerts = inputs.get('alerts', [])\nlsh_fields = 
inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_field_len = int(inputs.get('max_field_len', 500))\n\nnormalized_alerts = []\nfor alert in alerts:\n norm = dict(alert)\n for field in lsh_fields:\n raw = str(norm.get(field) or '')[:max_field_len]\n norm[f'{field}_normalized'] = normalize_uri(raw)\n normalized_alerts.append(norm)\n\noutputs['normalized_alerts'] = normalized_alerts\noutputs['strict_fields'] = inputs.get('strict_fields', ['sip', 'dip'])\noutputs['lsh_fields'] = lsh_fields\noutputs['max_field_len'] = max_field_len\noutputs['threshold'] = inputs.get('threshold', 0.7)" + }, + { + "id": "compute_dedup_keys", + "type": "python", + "description": "为每条告警计算去重键:严格字段精确匹配 + 归一化 LSH 字段的 5-gram Jaccard 近似相似度,相似度超阈值则归入同一簇并复用其 MD5 去重键", + "code": "import hashlib\n\ndef get_shingles(text, k=5):\n text = str(text or '').lower()\n if len(text) < k:\n return frozenset([text]) if text else frozenset()\n return frozenset(text[i:i+k] for i in range(len(text) - k + 1))\n\ndef jaccard(set1, set2):\n if not set1 and not set2:\n return 1.0\n if not set1 or not set2:\n return 0.0\n inter = len(set1 & set2)\n union = len(set1 | set2)\n return inter / union if union > 0 else 0.0\n\ndef md5_key(text):\n return hashlib.md5(text.encode('utf-8')).hexdigest()\n\nalerts = inputs.get('normalized_alerts', [])\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_field_len = int(inputs.get('max_field_len', 500))\nthreshold = float(inputs.get('threshold', 0.7))\n\n# Registry: dedup_key -> (strict_text, shingle_set)\nkey_registry = {}\n\nkeyed_alerts = []\nfor alert in alerts:\n strict_text = '. '.join(str(alert.get(f, ''))[:max_field_len] for f in strict_fields)\n lsh_text = '. 
'.join(\n str(alert.get(f'{f}_normalized', alert.get(f, '')))[:max_field_len]\n for f in lsh_fields\n )\n current_shingles = get_shingles(lsh_text)\n\n # Find existing cluster with same strict prefix and similar LSH content\n matched_key = None\n best_sim = 0.0\n for existing_key, (ex_strict, ex_shingles) in key_registry.items():\n if ex_strict != strict_text:\n continue\n sim = jaccard(current_shingles, ex_shingles)\n if sim >= threshold and sim > best_sim:\n best_sim = sim\n matched_key = existing_key\n\n if matched_key is None:\n raw_key = md5_key(f'{strict_text}. {lsh_text}')\n key_registry[raw_key] = (strict_text, current_shingles)\n canonical_key = raw_key\n else:\n canonical_key = matched_key\n\n alert_out = dict(alert)\n alert_out['dedup_key'] = canonical_key\n keyed_alerts.append(alert_out)\n\noutputs['keyed_alerts'] = keyed_alerts\noutputs['strict_fields'] = strict_fields\noutputs['lsh_fields'] = lsh_fields\noutputs['threshold'] = threshold" + }, + { + "id": "group_by_dedup_key", + "type": "python", + "description": "按去重键分组:每组第一条告警为代表(unique),其余标记为重复(duplicate);输出统计信息", + "code": "from collections import defaultdict\n\nkeyed_alerts = inputs.get('keyed_alerts', [])\n\ngroups = defaultdict(list)\nfor alert in keyed_alerts:\n key = alert.get('dedup_key', 'unknown')\n groups[key].append(alert)\n\nunique_alerts = []\nduplicate_alerts = []\ndedup_groups = []\n\nfor key, group in groups.items():\n rep = dict(group[0])\n rep['dedup_key_already_exists'] = False\n rep['dedup_group_size'] = len(group)\n unique_alerts.append(rep)\n\n dups = []\n for dup in group[1:]:\n dup_alert = dict(dup)\n dup_alert['dedup_key_already_exists'] = True\n dup_alert['dedup_group_size'] = len(group)\n duplicate_alerts.append(dup_alert)\n dups.append(dup_alert)\n\n dedup_groups.append({\n 'dedup_key': key,\n 'count': len(group),\n 'representative': group[0],\n 'duplicates': dups\n })\n\ntotal = len(keyed_alerts)\ndedup_ratio = round(len(duplicate_alerts) / total, 4) if total > 0 else 
0.0\n\ndedup_stats = {\n 'total_count': total,\n 'unique_count': len(unique_alerts),\n 'duplicate_count': len(duplicate_alerts),\n 'dedup_ratio': dedup_ratio,\n 'group_count': len(dedup_groups)\n}\n\noutputs['unique_alerts'] = unique_alerts\noutputs['duplicate_alerts'] = duplicate_alerts\noutputs['dedup_groups'] = dedup_groups\noutputs['dedup_stats'] = dedup_stats\noutputs['keyed_alerts'] = keyed_alerts" + }, + { + "id": "generate_dedup_report", + "type": "python", + "description": "汇总去重结果并生成 Markdown 报告;同步将全量带去重键告警、唯一告警、重复告警及分组统计写入 artifacts/ 目录", + "code": "import os\nimport json\nimport datetime\nfrom flocks.workspace.manager import WorkspaceManager\n\ndedup_stats = inputs.get('dedup_stats', {})\nunique_alerts = inputs.get('unique_alerts', [])\nduplicate_alerts = inputs.get('duplicate_alerts', [])\ndedup_groups = inputs.get('dedup_groups', [])\nkeyed_alerts = inputs.get('keyed_alerts', [])\n\nws = WorkspaceManager.get_instance()\noutput_dir = str(ws.get_workspace_dir() / 'outputs' / datetime.date.today().isoformat())\nartifacts_dir = os.path.join(output_dir, 'artifacts')\nos.makedirs(artifacts_dir, exist_ok=True)\n\ntotal = dedup_stats.get('total_count', 0)\nunique = dedup_stats.get('unique_count', 0)\nduplicates = dedup_stats.get('duplicate_count', 0)\nratio = dedup_stats.get('dedup_ratio', 0.0)\ngroups_count = dedup_stats.get('group_count', 0)\n\nsorted_groups = sorted(dedup_groups, key=lambda g: g.get('count', 0), reverse=True)\ntop_groups = sorted_groups[:10]\n\ntop_groups_md = ''\nfor i, g in enumerate(top_groups, 1):\n rep = g.get('representative', {})\n key = g.get('dedup_key', '')\n count = g.get('count', 0)\n sip = rep.get('sip', rep.get('\\u6e90IP', ''))\n dip = rep.get('dip', rep.get('\\u76ee\\u7684IP', ''))\n alert_type = rep.get('alert_type', rep.get('\\u653b\\u51fb\\u7c7b\\u578b\\u5927\\u7c7b', rep.get('type', '')))\n top_groups_md += f'\\n### \\u7b2c {i} \\u7ec4\\uff08\\u5171 {count} \\u6761\\uff09\\n'\n top_groups_md += f'- \\u53bb\\u91cd\\u952e: 
`{key[:16]}...`\\n'\n if sip:\n top_groups_md += f'- \\u6e90IP: {sip}\\n'\n if dip:\n top_groups_md += f'- \\u76ee\\u7684IP: {dip}\\n'\n if alert_type:\n top_groups_md += f'- \\u544a\\u8b66\\u7c7b\\u578b: {alert_type}\\n'\n\nreport_content = f\"\"\"# \\u544a\\u8b66\\u53bb\\u91cd\\u5206\\u6790\\u62a5\\u544a\n\n## \\u6267\\u884c\\u6458\\u8981\n\n| \\u6307\\u6807 | \\u503c |\n|------|-----|\n| \\u8f93\\u5165\\u544a\\u8b66\\u603b\\u6570 | {total} |\n| \\u53bb\\u91cd\\u540e\\u552f\\u4e00\\u544a\\u8b66\\u6570 | {unique} |\n| \\u91cd\\u590d\\u544a\\u8b66\\u6570 | {duplicates} |\n| \\u53bb\\u91cd\\u7387 | {ratio * 100:.1f}% |\n| \\u53bb\\u91cd\\u5206\\u7ec4\\u6570 | {groups_count} |\n\n> \\u53bb\\u91cd\\u7387 = \\u91cd\\u590d\\u544a\\u8b66\\u6570 / \\u603b\\u544a\\u8b66\\u6570\\uff0c\\u53cd\\u6620\\u544a\\u8b66\\u566a\\u58f0\\u6c34\\u5e73\\u3002\n\n## \\u7b97\\u6cd5\\u8bf4\\u660e\n\n1. **\\u4e25\\u683c\\u5b57\\u6bb5\\u5339\\u914d**\\uff1a\\u5bf9\\u6e90IP/\\u76ee\\u7684IP \\u7b49\\u5173\\u952e\\u5b57\\u6bb5\\u8fdb\\u884c\\u7cbe\\u786e\\u5339\\u914d\n2. **\\u5185\\u5bb9\\u5f52\\u4e00\\u5316**\\uff1a\\u5bf9 URL\\u3001\\u8bf7\\u6c42\\u4f53\\u3001\\u54cd\\u5e94\\u4f53\\u8fdb\\u884c\\u6b63\\u5219\\u5f52\\u4e00\\u5316\\uff08\\u65e5\\u671f\\u2192DATETIME\\u3001UUID\\u2192UUID\\u3001\\u957f\\u6570\\u5b57\\u2192NUM \\u7b49\\uff09\n3. **\\u76f8\\u4f3c\\u5ea6\\u8ba1\\u7b97**\\uff1a\\u4f7f\\u7528 5-gram Shingling + Jaccard \\u76f8\\u4f3c\\u5ea6\\u8fdb\\u884c\\u6a21\\u7cca\\u5339\\u914d\\uff0c\\u9608\\u503c {inputs.get('threshold', 0.7)}\n4. 
**\\u53bb\\u91cd\\u952e**\\uff1a\\u57fa\\u4e8e MD5 \\u54c8\\u5e0c\\u7684\\u552f\\u4e00\\u6807\\u8bc6\\uff0c\\u76f8\\u4f3c\\u5185\\u5bb9\\u6620\\u5c04\\u81f3\\u540c\\u4e00\\u53bb\\u91cd\\u952e\n\n## Top 10 \\u6700\\u5927\\u544a\\u8b66\\u5206\\u7ec4\n{top_groups_md or '\\uff08\\u65e0\\u5206\\u7ec4\\u6570\\u636e\\uff09'}\n\n---\n\n## \\u8f93\\u51fa\\u6587\\u4ef6\n\n- \\u5168\\u91cf\\u5e26\\u53bb\\u91cd\\u952e\\u544a\\u8b66\\uff1a`artifacts/dedup_all_alerts.jsonl`\n- \\u552f\\u4e00\\u544a\\u8b66\\uff08\\u53bb\\u91cd\\u540e\\uff09\\uff1a`artifacts/dedup_unique_alerts.jsonl`\n- \\u91cd\\u590d\\u544a\\u8b66\\uff1a`artifacts/dedup_duplicate_alerts.jsonl`\n- \\u5206\\u7ec4\\u7edf\\u8ba1\\uff1a`artifacts/dedup_groups.json`\n\n---\n\n*\\u751f\\u6210\\u65f6\\u95f4: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*\n\"\"\"\n\nreport_path = os.path.join(output_dir, 'alert_dedup_report.md')\ntool.run('write', filePath=report_path, content=report_content)\n\ndef write_jsonl(path, records):\n lines = '\\n'.join(json.dumps(r, ensure_ascii=False) for r in records)\n tool.run('write', filePath=path, content=lines)\n\nwrite_jsonl(os.path.join(artifacts_dir, 'dedup_all_alerts.jsonl'), keyed_alerts)\nwrite_jsonl(os.path.join(artifacts_dir, 'dedup_unique_alerts.jsonl'), unique_alerts)\nwrite_jsonl(os.path.join(artifacts_dir, 'dedup_duplicate_alerts.jsonl'), duplicate_alerts)\n\ngroups_summary = [{'dedup_key': g['dedup_key'], 'count': g['count']} for g in dedup_groups]\ntool.run('write', filePath=os.path.join(artifacts_dir, 'dedup_groups.json'),\n content=json.dumps(groups_summary, ensure_ascii=False, indent=2))\n\noutputs['report_path'] = report_path\noutputs['unique_alerts'] = unique_alerts\noutputs['duplicate_alerts'] = duplicate_alerts\noutputs['dedup_stats'] = dedup_stats\noutputs['summary'] = (\n f'\\u544a\\u8b66\\u53bb\\u91cd\\u5b8c\\u6210\\uff1a\\u8f93\\u5165 {total} \\u6761\\uff0c'\n f'\\u53bb\\u91cd\\u540e {unique} \\u6761\\u552f\\u4e00\\u544a\\u8b66\\uff0c'\n 
f'{duplicates} \\u6761\\u91cd\\u590d\\uff0c'\n f'\\u53bb\\u91cd\\u7387 {ratio * 100:.1f}%\\u3002\\u62a5\\u544a\\uff1a{report_path}'\n)" + } + ], + "edges": [ + { + "from": "receive_alerts", + "to": "normalize_alerts" + }, + { + "from": "normalize_alerts", + "to": "compute_dedup_keys" + }, + { + "from": "compute_dedup_keys", + "to": "group_by_dedup_key" + }, + { + "from": "group_by_dedup_key", + "to": "generate_dedup_report" + } + ], + "metadata": { + "node_timeout_s": 300, + "sampleInputs": { + "alerts": [ + { + "sip": "1.2.3.4", + "dip": "10.0.0.1", + "req_http_url": "/admin/login.php?id=1 OR 1=1", + "req_body": "username=admin&password=123456", + "rsp_body": "HTTP/1.1 200 OK" + }, + { + "sip": "1.2.3.4", + "dip": "10.0.0.1", + "req_http_url": "/admin/login.php?id=2 OR 2=2", + "req_body": "username=admin&password=654321", + "rsp_body": "HTTP/1.1 200 OK" + } + ], + "strict_fields": ["sip", "dip"], + "lsh_fields": ["req_http_url", "req_body", "rsp_body"], + "threshold": 0.7 + } + } +} diff --git a/.flocks/plugins/workflows/alert_dedup/workflow.md b/.flocks/plugins/workflows/alert_dedup/workflow.md new file mode 100644 index 000000000..5c8ead96d --- /dev/null +++ b/.flocks/plugins/workflows/alert_dedup/workflow.md @@ -0,0 +1,164 @@ +# alert_dedup — 告警去重工作流 + +## 简介 + +`alert_dedup` 是一个安全告警去重工作流,基于 **aisoc_mini** 项目的去重算法移植而来。 +它通过 URI 归一化 + 5-gram Shingling + Jaccard 相似度,将相似告警归入同一去重簇, +有效降低告警噪声,让安全分析师聚焦于真正唯一的威胁事件。 + +## 使用场景 + +- 批量告警分析前的预处理(降噪) +- SIEM/NDR 告警规律性分析 +- 告警风暴抑制(同一攻击模式产生大量重复告警) +- 与 LLM 研判结合:去重后只对唯一告警调用大模型,节省成本 + +## 输入参数 + +| 参数 | 类型 | 默认值 | 说明 | +|------|------|--------|------| +| `alerts` | `list[dict]` | 必填 | 待去重告警列表,每条为 JSON 对象 | +| `strict_fields` | `list[str]` | `["sip", "dip"]` | 严格匹配字段(如源IP/目的IP),这些字段不同则一定不归为同组 | +| `lsh_fields` | `list[str]` | `["req_http_url", "req_body", "rsp_body"]` | 近似匹配字段(URL、请求体、响应体),用于 Jaccard 相似度计算 | +| `threshold` | `float` | `0.7` | Jaccard 相似度阈值,超过此值认为是同类告警 | +| `max_field_len` | `int` | `500` | 字段截断长度,避免超长内容影响性能 | + 
+### 中文字段名支持 + +如果告警使用中文列名(如来自 CSV),可将字段名配置为中文: + +```json +{ + "strict_fields": ["源IP", "目的IP"], + "lsh_fields": ["请求内容", "响应内容", "载荷_decoded"] +} +``` + +## 输出 + +| 字段 | 说明 | +|------|------| +| `unique_alerts` | 去重后的唯一告警列表,每条含 `dedup_key`、`dedup_group_size`、`dedup_key_already_exists=false` | +| `duplicate_alerts` | 被归为重复的告警列表,含 `dedup_key_already_exists=true` | +| `dedup_stats` | 统计信息:总数、唯一数、重复数、去重率、分组数 | +| `report_path` | Markdown 报告路径 | +| `summary` | 单行执行摘要 | + +**告警新增字段说明:** +- `dedup_key`:MD5 去重键,同一去重簇的告警共享相同的值 +- `dedup_key_already_exists`:`true` 表示该告警是重复的 +- `dedup_group_size`:该告警所属分组的总数量 + +## 工作流节点 + +``` +receive_alerts + │ + ▼ +normalize_alerts ← URI 归一化(日期/UUID/长数字/路径穿越/编码字符) + │ + ▼ +compute_dedup_keys ← 严格字段精确匹配 + 5-gram Jaccard 近似相似度 + │ + ▼ +group_by_dedup_key ← 按去重键分组,标记唯一/重复 + │ + ▼ +generate_dedup_report ← 生成报告 + 写出 JSONL/JSON 数据文件 +``` + +### 节点说明 + +| 节点 ID | 类型 | 职责 | +|---------|------|------| +| `receive_alerts` | python | 解析告警输入(支持 JSON 字符串、列表、`{data:[...]}` 嵌套),提取配置 | +| `normalize_alerts` | python | 对 LSH 字段值进行 URI 风格归一化,去除日期/UUID/数字等噪声 | +| `compute_dedup_keys` | python | 5-gram Shingling + Jaccard 相似度计算,生成 MD5 去重键 | +| `group_by_dedup_key` | python | 按去重键分组,首条为代表(unique),其余标记为 duplicate | +| `generate_dedup_report` | python | 生成 Markdown 报告及 JSONL/JSON 数据文件 | + +## 算法说明 + +### 1. URI 归一化(`normalize_alerts`) + +对 URL、请求体等 LSH 字段应用以下替换规则,使内容相同但细节不同的告警在相似度计算上趋近: + +| 模式 | 替换为 | +|------|--------| +| `2024-01-15`、`2024/01/15 14:30` | `DATETIME` | +| `550e8400-e29b-41d4-a716-...` (UUID) | `UUID` | +| 6 位及以上纯数字 | `NUM` | +| `../`、`..\` 路径穿越 | `../` | +| URL 编码的 NULL 字节 `%00` | `NULL` | +| 连续 3 个及以上 URL 编码字符 | `ENCODED` | + +### 2. 去重键计算(`compute_dedup_keys`) + +``` +strict_text = join(strict_fields values) +lsh_text = join(normalized lsh_fields values) + +→ 在已有簇中找 strict_text 相同、Jaccard(lsh_text) ≥ threshold 的簇 +→ 若找到:复用该簇的 dedup_key +→ 若未找到:dedup_key = MD5(strict_text + ". 
" + lsh_text) +``` + +Jaccard 相似度基于 **5-gram** 分词(Character-level shingles)。 + +## 输出文件 + +所有文件写入 `~/.flocks/workspace/outputs/<YYYY-MM-DD>/`: + +``` +outputs/ +└── <YYYY-MM-DD>/ + ├── alert_dedup_report.md # 主报告(Markdown) + └── artifacts/ + ├── dedup_all_alerts.jsonl # 全量带去重键告警 + ├── dedup_unique_alerts.jsonl # 唯一告警(去重代表) + ├── dedup_duplicate_alerts.jsonl # 重复告警 + └── dedup_groups.json # 分组统计(key + count) +``` + +## 示例 + +```json +{ + "alerts": [ + { + "sip": "1.2.3.4", + "dip": "10.0.0.1", + "req_http_url": "/admin/login.php?id=1 OR 1=1", + "req_body": "username=admin&password=123456", + "rsp_body": "HTTP/1.1 200 OK" + }, + { + "sip": "1.2.3.4", + "dip": "10.0.0.1", + "req_http_url": "/admin/login.php?id=2 OR 2=2", + "req_body": "username=admin&password=654321", + "rsp_body": "HTTP/1.1 200 OK" + } + ], + "strict_fields": ["sip", "dip"], + "lsh_fields": ["req_http_url", "req_body", "rsp_body"], + "threshold": 0.7 +} +``` + +上述两条告警的严格字段相同(同源同目),LSH 字段经归一化后相似度高(SQL 注入 payload 结构一致), +因此会被归为同一去重簇,只保留第一条为代表。 + +## 与 aisoc_mini 的对应关系 + +| aisoc_mini 组件 | 本工作流对应节点 | +|-----------------|----------------| +| `LogDecoder.process()` | `normalize_alerts`(子集:URI 归一化) | +| `LogDedup._generate_dedup_key_text()` | `compute_dedup_keys` | +| `LSHProcessor.query_most_similar()` | `compute_dedup_keys`(5-gram Jaccard 简化版) | +| `LogDedup.process()` | `group_by_dedup_key` | +| 报告输出 | `generate_dedup_report` | + +> **注意**:本工作流使用标准库实现相似度计算(无需 `datasketch`), +> 采用精确 Jaccard 而非 MinHash 近似。对于超大批量(>10 万条)告警, +> 建议改用 `datasketch` 的 MinHash LSH 以获得更好性能。 From 1752b5b8cac6340099843c9826e23ae5ae2b0503 Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Fri, 8 May 2026 16:08:16 +0800 Subject: [PATCH 02/41] refactor: align alert_dedup workflow with full LogProcessPipeline (4 stages) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 重写工作流以完整对齐 aisoc_mini 的 LogProcessPipeline 四阶段主流程: - normalize_logs: TDP/Skyeye 字段映射 + 嵌套结构扁平化 - filter_logs: jsonLogic 规则过滤(扫描类/出站/非HTTP告警剔除) - 
dedup_logs: URI 归一化 + 5-gram Jaccard 相似度去重,生成 dedup_key - analyze_unique: 仅对唯一 dedup_key 调用 LLM is_attack 研判并回填重复告警 - generate_report: 输出四阶段统计 Markdown 报告及 JSONL 数据文件 Co-authored-by: Cursor --- .../plugins/workflows/alert_dedup/meta.json | 2 +- .../workflows/alert_dedup/workflow.json | 95 ++++---- .../plugins/workflows/alert_dedup/workflow.md | 223 ++++++++---------- 3 files changed, 151 insertions(+), 169 deletions(-) diff --git a/.flocks/plugins/workflows/alert_dedup/meta.json b/.flocks/plugins/workflows/alert_dedup/meta.json index 2415aeec9..e766918be 100644 --- a/.flocks/plugins/workflows/alert_dedup/meta.json +++ b/.flocks/plugins/workflows/alert_dedup/meta.json @@ -1,6 +1,6 @@ { "name": "alert_dedup", - "description": "告警去重工作流:通过 URI 归一化、5-gram Shingling 与 Jaccard 相似度对批量安全告警进行聚类去重,降低告警噪声", + "description": "告警处理四阶段主流程:归一化(TDP/Skyeye字段映射)→ 过滤(扫描/出站剔除)→ 去重(URI归一化+Jaccard相似度)→ LLM研判(仅对唯一dedup_key调用),完整对齐 aisoc_mini LogProcessPipeline", "category": "security", "status": "active", "createdBy": null, diff --git a/.flocks/plugins/workflows/alert_dedup/workflow.json b/.flocks/plugins/workflows/alert_dedup/workflow.json index c4eea5b06..0b33693e0 100644 --- a/.flocks/plugins/workflows/alert_dedup/workflow.json +++ b/.flocks/plugins/workflows/alert_dedup/workflow.json @@ -1,80 +1,77 @@ { "name": "alert_dedup", - "description": "Alert deduplication workflow - groups similar security alerts using URI normalization, 5-gram shingling and Jaccard similarity to reduce alert noise.", - "description_cn": "告警去重工作流 - 通过 URI 归一化、5-gram Shingling 与 Jaccard 相似度对相似安全告警进行分组,有效降低告警噪声", + "description": "Full 4-stage alert processing pipeline: Normalize → Filter → Dedup → Analyze. 
Mirrors LogProcessPipeline from aisoc_mini: field normalization (TDP/Skyeye), scan-rule filtering, LSH-style deduplication, and LLM-based attack triage on unique alerts only.", + "description_cn": "告警处理四阶段主流程:归一化 → 过滤 → 去重 → 研判分析。对齐 aisoc_mini 的 LogProcessPipeline:字段归一化(TDP/Skyeye 字段映射)→ 规则过滤(扫描/出站/非HTTP剔除)→ URI归一化+Jaccard相似度去重(生成dedup_key)→ 仅对唯一dedup_key做LLM研判并将结果回填重复告警,最终输出完整管道报告。", "start": "receive_alerts", "nodes": [ { "id": "receive_alerts", "type": "python", - "description": "接收并校验告警列表,提取去重配置(严格字段、LSH 字段、相似度阈值)", - "code": "import json\n\nalerts_input = inputs.get('alerts', inputs.get('alert_list', []))\nif isinstance(alerts_input, str):\n try:\n alerts_input = json.loads(alerts_input)\n except Exception:\n alerts_input = []\nif isinstance(alerts_input, dict) and 'data' in alerts_input:\n alerts_input = alerts_input.get('data', [])\nif not isinstance(alerts_input, list):\n alerts_input = [alerts_input] if alerts_input else []\n\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nthreshold = float(inputs.get('threshold', 0.7))\nmax_field_len = int(inputs.get('max_field_len', 500))\n\nif not isinstance(strict_fields, list) or not strict_fields:\n strict_fields = ['sip', 'dip']\nif not isinstance(lsh_fields, list) or not lsh_fields:\n lsh_fields = ['req_http_url', 'req_body', 'rsp_body']\n\noutputs['alerts'] = alerts_input\noutputs['total_count'] = len(alerts_input)\noutputs['strict_fields'] = strict_fields\noutputs['lsh_fields'] = lsh_fields\noutputs['threshold'] = threshold\noutputs['max_field_len'] = max_field_len" + "description": "接收原始告警列表,解析输入格式,提取 Pipeline 配置(日志来源类型、去重字段、LSH 阈值、是否启用各阶段)", + "code": "import json\n\nalerts_input = inputs.get('alerts', inputs.get('alert_list', []))\nif isinstance(alerts_input, str):\n try:\n alerts_input = json.loads(alerts_input)\n except Exception:\n alerts_input = []\nif isinstance(alerts_input, dict) and 'data' in alerts_input:\n 
alerts_input = alerts_input.get('data', [])\nif not isinstance(alerts_input, list):\n alerts_input = [alerts_input] if alerts_input else []\n\nsource_log_type = str(inputs.get('source_log_type', 'tdp')).lower()\nnormalize_enabled = bool(inputs.get('normalize_enabled', True))\nfilter_enabled = bool(inputs.get('filter_enabled', True))\ndedup_enabled = bool(inputs.get('dedup_enabled', True))\nanalyze_enabled = bool(inputs.get('analyze_enabled', True))\n\ndedup_threshold = float(inputs.get('threshold', inputs.get('dedup_threshold', 0.7)))\nstrict_fields = inputs.get('strict_fields', inputs.get('dedup_fields_strict', ['sip', 'dip']))\nlsh_fields = inputs.get('lsh_fields', inputs.get('dedup_fields_lsh', ['req_http_url', 'req_body', 'rsp_body']))\nmax_field_len = int(inputs.get('max_field_len', 500))\nanalyze_max_workers = int(inputs.get('analyze_max_workers', 10))\n\nif not isinstance(strict_fields, list) or not strict_fields:\n strict_fields = ['sip', 'dip']\nif not isinstance(lsh_fields, list) or not lsh_fields:\n lsh_fields = ['req_http_url', 'req_body', 'rsp_body']\n\noutputs['raw_alerts'] = alerts_input\noutputs['stats'] = {'raw_count': len(alerts_input)}\noutputs['source_log_type'] = source_log_type\noutputs['normalize_enabled'] = normalize_enabled\noutputs['filter_enabled'] = filter_enabled\noutputs['dedup_enabled'] = dedup_enabled\noutputs['analyze_enabled'] = analyze_enabled\noutputs['dedup_threshold'] = dedup_threshold\noutputs['strict_fields'] = strict_fields\noutputs['lsh_fields'] = lsh_fields\noutputs['max_field_len'] = max_field_len\noutputs['analyze_max_workers'] = analyze_max_workers" }, { - "id": "normalize_alerts", + "id": "normalize_logs", "type": "python", - "description": "对 LSH 字段进行 URI 归一化(日期→DATETIME、UUID→UUID、长数字→NUM、路径穿越、NULL 字节、URL 编码等),降低文本噪声", - "code": "import re\n\ndef normalize_uri(text):\n if not text:\n return ''\n text = str(text)\n text = re.sub(r'\\d{4}[-/]\\d{1,2}[-/]\\d{1,2}(?:[T ]\\d{2}:\\d{2}(?::\\d{2})?)?', 'DATETIME', text)\n 
text = re.sub(r'[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}', 'UUID', text)\n text = re.sub(r'\\b\\d{6,}\\b', 'NUM', text)\n text = re.sub(r'(?:\\.\\./|\\.\\\\\\\\)+', '../', text)\n text = re.sub(r'%00', 'NULL', text)\n text = re.sub(r'(?:%[0-9a-fA-F]{2}){3,}', 'ENCODED', text)\n return text\n\nalerts = inputs.get('alerts', [])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_field_len = int(inputs.get('max_field_len', 500))\n\nnormalized_alerts = []\nfor alert in alerts:\n norm = dict(alert)\n for field in lsh_fields:\n raw = str(norm.get(field) or '')[:max_field_len]\n norm[f'{field}_normalized'] = normalize_uri(raw)\n normalized_alerts.append(norm)\n\noutputs['normalized_alerts'] = normalized_alerts\noutputs['strict_fields'] = inputs.get('strict_fields', ['sip', 'dip'])\noutputs['lsh_fields'] = lsh_fields\noutputs['max_field_len'] = max_field_len\noutputs['threshold'] = inputs.get('threshold', 0.7)" + "description": "Step 1 — 归一化:将 TDP 或 Skyeye 原始字段映射为统一标准字段(sip/dip/req_http_url/req_body/rsp_body/threat_name 等),扁平化嵌套结构,缺失 id 时自动生成 UUID", + "code": "import uuid\nimport json\n\nHTTP_METHODS = ['GET', 'POST', 'PUT', 'DELETE', 'HEAD', 'OPTIONS', 'PATCH', 'TRACE']\n\nTDP_FIELD_MAP = {\n 'customer_uuid': 'customer_uuid',\n 'device_id': 'device_id',\n 'id': 'id',\n 'time': 'time',\n 'direction': 'direction',\n 'sip': 'net_real_src_ip',\n 'dip': 'net_dest_ip',\n 'sport': 'net_src_port',\n 'dport': 'net_dest_port',\n 'net_type': 'net_type',\n 'net_app_proto': 'net_app_proto',\n 'req_http_url': 'net_http_url',\n 'req_user_agent': 'net_http_reqs_user_agent',\n 'req_host': 'net_http_reqs_host',\n 'req_line': 'net_http_reqs_line',\n 'req_header': 'net_http_reqs_header',\n 'req_body': 'net_http_reqs_body',\n 'req_cookie': 'net_http_reqs_cookie',\n 'req_body_len': 'net_http_reqs_content_length',\n 'rsp_status_code': 'net_http_status',\n 'rsp_line': 'net_http_resp_line',\n 'rsp_header': 
'net_http_resp_header',\n 'rsp_body': 'net_http_resp_body',\n 'rsp_body_len': 'net_http_resp_content_length',\n 'net_bytes_toclient': 'net_bytes_toclient',\n 'net_bytes_toserver': 'net_bytes_toserver',\n 'threat_rule_id': 'threat_suuid',\n 'threat_name': 'threat_name',\n 'threat_msg': 'threat_msg',\n 'threat_ioc': 'threat_ioc',\n 'threat_level': 'threat_level',\n 'threat_severity': 'threat_severity',\n 'threat_phase': 'threat_phase',\n 'threat_type': 'threat_type',\n 'threat_result': 'threat_result',\n 'threat_confidence': 'threat_confidence',\n 'connection_established': 'established',\n 'asset_group_name': 'dest_assets_group_name',\n 'asset_name': 'dest_assets_latestName',\n}\n\nSKYEYE_FIELD_MAP = {\n 'id': 'none',\n 'time': 'time',\n 'direction': 'none',\n 'sip': 'sip',\n 'dip': 'dip',\n 'sport': 'sport',\n 'dport': 'dport',\n 'net_type': 'none',\n 'net_app_proto': 'none',\n 'req_http_url': 'uri',\n 'req_user_agent': 'agent',\n 'req_host': 'host',\n 'req_line': 'none',\n 'req_header': 'req_header',\n 'req_body': 'req_body',\n 'req_cookie': 'none',\n 'req_body_len': 'none',\n 'rsp_status_code': 'rsp_status',\n 'rsp_line': 'none',\n 'rsp_header': 'rsp_header',\n 'rsp_body': 'rsp_body',\n 'rsp_body_len': 'rsp_body_len',\n 'threat_rule_id': 'rule_id',\n 'threat_name': 'vuln_name',\n 'threat_msg': 'vuln_desc',\n 'threat_ioc': 'none',\n 'threat_level': 'none',\n 'threat_severity': 'severity',\n 'threat_phase': 'none',\n 'threat_type': 'vuln_type',\n 'threat_direction': 'none',\n 'threat_result': 'attack_result',\n 'threat_confidence': 'confidence',\n 'threat_tactic_id': 'attck_tactic',\n 'threat_technique_id': 'attck_tech',\n 'connection_established': 'established',\n 'real_attack': 'attack_flag',\n}\n\ndef flatten_dict(d, prefix=''):\n res = {}\n for k, v in d.items():\n if isinstance(v, dict):\n res.update(flatten_dict(v, f'{prefix}{k}_'))\n else:\n res[f'{prefix}{k}'] = v\n return res\n\ndef make_uuid(norm_alert):\n id_str = ''.join(str(v) for v in 
norm_alert.values())\n return str(uuid.uuid3(uuid.NAMESPACE_DNS, id_str))\n\ndef normalize_single(alert, field_map):\n flat = flatten_dict(alert)\n norm = {}\n for std_key, raw_key in field_map.items():\n norm[std_key] = flat.get(raw_key, 'none') if raw_key != 'none' else 'none'\n if norm.get('id') in ('none', None, ''):\n norm['id'] = make_uuid(norm)\n if norm.get('net_type') in ('none', None, ''):\n method = flat.get('method', 'none')\n norm['net_type'] = 'http' if method in HTTP_METHODS else ('none' if method == 'none' else 'other')\n return norm\n\nraw_alerts = inputs.get('raw_alerts', [])\nnormalize_enabled = inputs.get('normalize_enabled', True)\nsource_log_type = inputs.get('source_log_type', 'tdp')\nstats = dict(inputs.get('stats', {}))\n\nif normalize_enabled:\n field_map = SKYEYE_FIELD_MAP if 'skyeye' in source_log_type else TDP_FIELD_MAP\n normalized = [normalize_single(a, field_map) for a in raw_alerts]\nelse:\n normalized = [dict(a) for a in raw_alerts]\n\nstats['normalized_count'] = len(normalized)\noutputs['normalized_alerts'] = normalized\noutputs['stats'] = stats\nfor k in ['source_log_type', 'filter_enabled', 'dedup_enabled', 'analyze_enabled',\n 'dedup_threshold', 'strict_fields', 'lsh_fields', 'max_field_len', 'analyze_max_workers']:\n outputs[k] = inputs.get(k)" }, { - "id": "compute_dedup_keys", + "id": "filter_logs", "type": "python", - "description": "为每条告警计算去重键:严格字段精确匹配 + 归一化 LSH 字段的 5-gram Jaccard 近似相似度,相似度超阈值则归入同一簇并复用其 MD5 去重键", - "code": "import hashlib\n\ndef get_shingles(text, k=5):\n text = str(text or '').lower()\n if len(text) < k:\n return frozenset([text]) if text else frozenset()\n return frozenset(text[i:i+k] for i in range(len(text) - k + 1))\n\ndef jaccard(set1, set2):\n if not set1 and not set2:\n return 1.0\n if not set1 or not set2:\n return 0.0\n inter = len(set1 & set2)\n union = len(set1 | set2)\n return inter / union if union > 0 else 0.0\n\ndef md5_key(text):\n return 
hashlib.md5(text.encode('utf-8')).hexdigest()\n\nalerts = inputs.get('normalized_alerts', [])\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_field_len = int(inputs.get('max_field_len', 500))\nthreshold = float(inputs.get('threshold', 0.7))\n\n# Registry: dedup_key -> (strict_text, shingle_set)\nkey_registry = {}\n\nkeyed_alerts = []\nfor alert in alerts:\n strict_text = '. '.join(str(alert.get(f, ''))[:max_field_len] for f in strict_fields)\n lsh_text = '. '.join(\n str(alert.get(f'{f}_normalized', alert.get(f, '')))[:max_field_len]\n for f in lsh_fields\n )\n current_shingles = get_shingles(lsh_text)\n\n # Find existing cluster with same strict prefix and similar LSH content\n matched_key = None\n best_sim = 0.0\n for existing_key, (ex_strict, ex_shingles) in key_registry.items():\n if ex_strict != strict_text:\n continue\n sim = jaccard(current_shingles, ex_shingles)\n if sim >= threshold and sim > best_sim:\n best_sim = sim\n matched_key = existing_key\n\n if matched_key is None:\n raw_key = md5_key(f'{strict_text}. 
{lsh_text}')\n key_registry[raw_key] = (strict_text, current_shingles)\n canonical_key = raw_key\n else:\n canonical_key = matched_key\n\n alert_out = dict(alert)\n alert_out['dedup_key'] = canonical_key\n keyed_alerts.append(alert_out)\n\noutputs['keyed_alerts'] = keyed_alerts\noutputs['strict_fields'] = strict_fields\noutputs['lsh_fields'] = lsh_fields\noutputs['threshold'] = threshold" + "description": "Step 2 — 过滤:应用业务规则剔除不需研判的告警(纯扫描类且非 WebShell、出站流量、非 HTTP 协议);为每条告警写入 _need_analysis_is_attack/_threat_type 字段,只保留 need_analysis=True 的告警", + "code": "def json_logic(rule, data):\n if not isinstance(rule, dict):\n return rule\n if len(rule) != 1:\n raise ValueError('json_logic: expected single-key dict')\n op, params = next(iter(rule.items()))\n if op == 'and':\n return all(json_logic(p, data) for p in params)\n elif op == 'or':\n return any(json_logic(p, data) for p in params)\n elif op == 'in':\n item = json_logic(params[0], data)\n container = json_logic(params[1], data)\n return item in (container or [])\n elif op == '!':\n return not json_logic(params, data)\n elif op == 'var':\n key = params if isinstance(params, str) else (params[0] if params else '')\n default = None if isinstance(params, str) else (params[1] if len(params) > 1 else None)\n return data.get(key, default)\n return rule\n\n# Rule 1: 扫描类(且非 WebShell)告警 → 不需分析\nrule_is_scan = {\n 'or': [\n {'and': [{'in': ['扫描', {'var': 'threat_name'}]}, {'!': {'in': ['webshell', {'var': 'threat_name'}]}}]},\n {'and': [{'in': ['扫描', {'var': 'threat_type'}]}, {'!': {'in': ['webshell', {'var': 'threat_type'}]}}]}\n ]\n}\n# Rule 2: 入站 + HTTP 流量 → 需要分析\nrule_inbound_http = {\n 'and': [\n {'in': [{'var': 'direction'}, ['in', 'none']]},\n {'in': [{'var': 'net_type'}, ['http', 'none']]}\n ]\n}\n\nnormalized_alerts = inputs.get('normalized_alerts', [])\nfilter_enabled = inputs.get('filter_enabled', True)\nstats = dict(inputs.get('stats', {}))\n\nfiltered = []\nfor alert in normalized_alerts:\n alert = dict(alert)\n 
threat_name = str(alert.get('threat_name', '') or '').lower()\n threat_type = str(alert.get('threat_type', '') or '').lower()\n data = {**alert, 'threat_name': threat_name, 'threat_type': threat_type}\n\n if filter_enabled:\n is_scan = json_logic(rule_is_scan, data)\n is_inbound_http = json_logic(rule_inbound_http, data)\n need_analysis = (not is_scan) and is_inbound_http\n else:\n is_scan = False\n is_inbound_http = True\n need_analysis = True\n\n alert['_need_analysis_is_attack'] = need_analysis\n alert['_is_scan'] = is_scan\n alert['_is_inbound_http'] = is_inbound_http\n alert['_threat_type'] = alert.get('threat_type', '')\n\n if need_analysis:\n filtered.append(alert)\n\nstats['after_filter_count'] = len(filtered)\nstats['filter_removed_count'] = len(normalized_alerts) - len(filtered)\noutputs['filtered_alerts'] = filtered\noutputs['stats'] = stats\nfor k in ['dedup_enabled', 'analyze_enabled', 'dedup_threshold',\n 'strict_fields', 'lsh_fields', 'max_field_len', 'analyze_max_workers']:\n outputs[k] = inputs.get(k)" }, { - "id": "group_by_dedup_key", + "id": "dedup_logs", "type": "python", - "description": "按去重键分组:每组第一条告警为代表(unique),其余标记为重复(duplicate);输出统计信息", - "code": "from collections import defaultdict\n\nkeyed_alerts = inputs.get('keyed_alerts', [])\n\ngroups = defaultdict(list)\nfor alert in keyed_alerts:\n key = alert.get('dedup_key', 'unknown')\n groups[key].append(alert)\n\nunique_alerts = []\nduplicate_alerts = []\ndedup_groups = []\n\nfor key, group in groups.items():\n rep = dict(group[0])\n rep['dedup_key_already_exists'] = False\n rep['dedup_group_size'] = len(group)\n unique_alerts.append(rep)\n\n dups = []\n for dup in group[1:]:\n dup_alert = dict(dup)\n dup_alert['dedup_key_already_exists'] = True\n dup_alert['dedup_group_size'] = len(group)\n duplicate_alerts.append(dup_alert)\n dups.append(dup_alert)\n\n dedup_groups.append({\n 'dedup_key': key,\n 'count': len(group),\n 'representative': group[0],\n 'duplicates': dups\n })\n\ntotal = 
len(keyed_alerts)\ndedup_ratio = round(len(duplicate_alerts) / total, 4) if total > 0 else 0.0\n\ndedup_stats = {\n 'total_count': total,\n 'unique_count': len(unique_alerts),\n 'duplicate_count': len(duplicate_alerts),\n 'dedup_ratio': dedup_ratio,\n 'group_count': len(dedup_groups)\n}\n\noutputs['unique_alerts'] = unique_alerts\noutputs['duplicate_alerts'] = duplicate_alerts\noutputs['dedup_groups'] = dedup_groups\noutputs['dedup_stats'] = dedup_stats\noutputs['keyed_alerts'] = keyed_alerts" + "description": "Step 3 — 去重:对 LSH 字段做 URI 归一化(日期/UUID/长数字/路径穿越/URL 编码替换),以严格字段精确匹配 + 5-gram Jaccard 相似度进行聚类,为每条告警生成 dedup_key(MD5);输出全量带 dedup_key 告警及唯一代表告警列表", + "code": "import re\nimport hashlib\n\ndef normalize_uri(text):\n if not text or str(text) == 'none':\n return str(text or '')\n t = str(text)\n t = re.sub(r'\\d{4}[-/]\\d{1,2}[-/]\\d{1,2}(?:[T ]\\d{2}:\\d{2}(?::\\d{2})?)?', 'DATETIME', t)\n t = re.sub(r'[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}', 'UUID', t)\n t = re.sub(r'\\b\\d{6,}\\b', 'NUM', t)\n t = re.sub(r'(?:\\.\\./|\\.\\\\\\\\)+', '../', t)\n t = re.sub(r'%00', 'NULL', t)\n t = re.sub(r'(?:%[0-9a-fA-F]{2}){3,}', 'ENCODED', t)\n return t\n\ndef shingles(text, k=5):\n t = str(text or '').lower()\n if len(t) < k:\n return frozenset([t]) if t else frozenset()\n return frozenset(t[i:i+k] for i in range(len(t) - k + 1))\n\ndef jaccard(a, b):\n if not a and not b:\n return 1.0\n if not a or not b:\n return 0.0\n return len(a & b) / len(a | b)\n\nfiltered_alerts = inputs.get('filtered_alerts', [])\ndedup_enabled = inputs.get('dedup_enabled', True)\nthreshold = float(inputs.get('dedup_threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_len = int(inputs.get('max_field_len', 500))\nstats = dict(inputs.get('stats', {}))\n\n# registry: dedup_key -> (strict_text, shingle_set)\nregistry = {}\nkeyed = []\n\nfor alert in 
filtered_alerts:\n alert = dict(alert)\n if not dedup_enabled:\n import hashlib as _hl\n raw = ''.join(str(alert.get(f, ''))[:max_len] for f in strict_fields + lsh_fields)\n alert['dedup_key'] = _hl.md5(raw.encode()).hexdigest()\n alert['dedup_key_already_exists'] = False\n keyed.append(alert)\n continue\n\n strict_text = '. '.join(str(alert.get(f, ''))[:max_len] for f in strict_fields)\n lsh_text = '. '.join(\n normalize_uri(str(alert.get(f, ''))[:max_len])\n for f in lsh_fields\n )\n cur_shingles = shingles(lsh_text)\n\n matched_key = None\n best_sim = 0.0\n for existing_key, (ex_strict, ex_shingles) in registry.items():\n if ex_strict != strict_text:\n continue\n sim = jaccard(cur_shingles, ex_shingles)\n if sim >= threshold and sim > best_sim:\n best_sim = sim\n matched_key = existing_key\n\n if matched_key is None:\n raw_key = hashlib.md5(f'{strict_text}. {lsh_text}'.encode('utf-8')).hexdigest()\n registry[raw_key] = (strict_text, cur_shingles)\n canonical = raw_key\n alert['dedup_key_already_exists'] = False\n else:\n canonical = matched_key\n alert['dedup_key_already_exists'] = True\n\n alert['dedup_key'] = canonical\n keyed.append(alert)\n\n# Build unique alert list (first occurrence per dedup_key)\nseen = {}\nunique_alerts = []\nfor a in keyed:\n k = a['dedup_key']\n if k not in seen:\n seen[k] = a\n unique_alerts.append(a)\n\nstats['after_dedup_count'] = len(keyed)\nstats['unique_key_count'] = len(unique_alerts)\nstats['dedup_removed_count'] = len(keyed) - len(unique_alerts)\n\noutputs['deduped_alerts'] = keyed\noutputs['unique_alerts'] = unique_alerts\noutputs['stats'] = stats\nfor k in ['analyze_enabled', 'analyze_max_workers']:\n outputs[k] = inputs.get(k)" }, { - "id": "generate_dedup_report", + "id": "analyze_unique", "type": "python", - "description": "汇总去重结果并生成 Markdown 报告;同步将全量带去重键告警、唯一告警、重复告警及分组统计写入 artifacts/ 目录", - "code": "import os\nimport json\nimport datetime\nfrom flocks.workspace.manager import WorkspaceManager\n\ndedup_stats = 
inputs.get('dedup_stats', {})\nunique_alerts = inputs.get('unique_alerts', [])\nduplicate_alerts = inputs.get('duplicate_alerts', [])\ndedup_groups = inputs.get('dedup_groups', [])\nkeyed_alerts = inputs.get('keyed_alerts', [])\n\nws = WorkspaceManager.get_instance()\noutput_dir = str(ws.get_workspace_dir() / 'outputs' / datetime.date.today().isoformat())\nartifacts_dir = os.path.join(output_dir, 'artifacts')\nos.makedirs(artifacts_dir, exist_ok=True)\n\ntotal = dedup_stats.get('total_count', 0)\nunique = dedup_stats.get('unique_count', 0)\nduplicates = dedup_stats.get('duplicate_count', 0)\nratio = dedup_stats.get('dedup_ratio', 0.0)\ngroups_count = dedup_stats.get('group_count', 0)\n\nsorted_groups = sorted(dedup_groups, key=lambda g: g.get('count', 0), reverse=True)\ntop_groups = sorted_groups[:10]\n\ntop_groups_md = ''\nfor i, g in enumerate(top_groups, 1):\n rep = g.get('representative', {})\n key = g.get('dedup_key', '')\n count = g.get('count', 0)\n sip = rep.get('sip', rep.get('\\u6e90IP', ''))\n dip = rep.get('dip', rep.get('\\u76ee\\u7684IP', ''))\n alert_type = rep.get('alert_type', rep.get('\\u653b\\u51fb\\u7c7b\\u578b\\u5927\\u7c7b', rep.get('type', '')))\n top_groups_md += f'\\n### \\u7b2c {i} \\u7ec4\\uff08\\u5171 {count} \\u6761\\uff09\\n'\n top_groups_md += f'- \\u53bb\\u91cd\\u952e: `{key[:16]}...`\\n'\n if sip:\n top_groups_md += f'- \\u6e90IP: {sip}\\n'\n if dip:\n top_groups_md += f'- \\u76ee\\u7684IP: {dip}\\n'\n if alert_type:\n top_groups_md += f'- \\u544a\\u8b66\\u7c7b\\u578b: {alert_type}\\n'\n\nreport_content = f\"\"\"# \\u544a\\u8b66\\u53bb\\u91cd\\u5206\\u6790\\u62a5\\u544a\n\n## \\u6267\\u884c\\u6458\\u8981\n\n| \\u6307\\u6807 | \\u503c |\n|------|-----|\n| \\u8f93\\u5165\\u544a\\u8b66\\u603b\\u6570 | {total} |\n| \\u53bb\\u91cd\\u540e\\u552f\\u4e00\\u544a\\u8b66\\u6570 | {unique} |\n| \\u91cd\\u590d\\u544a\\u8b66\\u6570 | {duplicates} |\n| \\u53bb\\u91cd\\u7387 | {ratio * 100:.1f}% |\n| \\u53bb\\u91cd\\u5206\\u7ec4\\u6570 | 
{groups_count} |\n\n> \\u53bb\\u91cd\\u7387 = \\u91cd\\u590d\\u544a\\u8b66\\u6570 / \\u603b\\u544a\\u8b66\\u6570\\uff0c\\u53cd\\u6620\\u544a\\u8b66\\u566a\\u58f0\\u6c34\\u5e73\\u3002\n\n## \\u7b97\\u6cd5\\u8bf4\\u660e\n\n1. **\\u4e25\\u683c\\u5b57\\u6bb5\\u5339\\u914d**\\uff1a\\u5bf9\\u6e90IP/\\u76ee\\u7684IP \\u7b49\\u5173\\u952e\\u5b57\\u6bb5\\u8fdb\\u884c\\u7cbe\\u786e\\u5339\\u914d\n2. **\\u5185\\u5bb9\\u5f52\\u4e00\\u5316**\\uff1a\\u5bf9 URL\\u3001\\u8bf7\\u6c42\\u4f53\\u3001\\u54cd\\u5e94\\u4f53\\u8fdb\\u884c\\u6b63\\u5219\\u5f52\\u4e00\\u5316\\uff08\\u65e5\\u671f\\u2192DATETIME\\u3001UUID\\u2192UUID\\u3001\\u957f\\u6570\\u5b57\\u2192NUM \\u7b49\\uff09\n3. **\\u76f8\\u4f3c\\u5ea6\\u8ba1\\u7b97**\\uff1a\\u4f7f\\u7528 5-gram Shingling + Jaccard \\u76f8\\u4f3c\\u5ea6\\u8fdb\\u884c\\u6a21\\u7cca\\u5339\\u914d\\uff0c\\u9608\\u503c {inputs.get('threshold', 0.7)}\n4. **\\u53bb\\u91cd\\u952e**\\uff1a\\u57fa\\u4e8e MD5 \\u54c8\\u5e0c\\u7684\\u552f\\u4e00\\u6807\\u8bc6\\uff0c\\u76f8\\u4f3c\\u5185\\u5bb9\\u6620\\u5c04\\u81f3\\u540c\\u4e00\\u53bb\\u91cd\\u952e\n\n## Top 10 \\u6700\\u5927\\u544a\\u8b66\\u5206\\u7ec4\n{top_groups_md or '\\uff08\\u65e0\\u5206\\u7ec4\\u6570\\u636e\\uff09'}\n\n---\n\n## \\u8f93\\u51fa\\u6587\\u4ef6\n\n- \\u5168\\u91cf\\u5e26\\u53bb\\u91cd\\u952e\\u544a\\u8b66\\uff1a`artifacts/dedup_all_alerts.jsonl`\n- \\u552f\\u4e00\\u544a\\u8b66\\uff08\\u53bb\\u91cd\\u540e\\uff09\\uff1a`artifacts/dedup_unique_alerts.jsonl`\n- \\u91cd\\u590d\\u544a\\u8b66\\uff1a`artifacts/dedup_duplicate_alerts.jsonl`\n- \\u5206\\u7ec4\\u7edf\\u8ba1\\uff1a`artifacts/dedup_groups.json`\n\n---\n\n*\\u751f\\u6210\\u65f6\\u95f4: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*\n\"\"\"\n\nreport_path = os.path.join(output_dir, 'alert_dedup_report.md')\ntool.run('write', filePath=report_path, content=report_content)\n\ndef write_jsonl(path, records):\n lines = '\\n'.join(json.dumps(r, ensure_ascii=False) for r in records)\n tool.run('write', filePath=path, 
content=lines)\n\nwrite_jsonl(os.path.join(artifacts_dir, 'dedup_all_alerts.jsonl'), keyed_alerts)\nwrite_jsonl(os.path.join(artifacts_dir, 'dedup_unique_alerts.jsonl'), unique_alerts)\nwrite_jsonl(os.path.join(artifacts_dir, 'dedup_duplicate_alerts.jsonl'), duplicate_alerts)\n\ngroups_summary = [{'dedup_key': g['dedup_key'], 'count': g['count']} for g in dedup_groups]\ntool.run('write', filePath=os.path.join(artifacts_dir, 'dedup_groups.json'),\n content=json.dumps(groups_summary, ensure_ascii=False, indent=2))\n\noutputs['report_path'] = report_path\noutputs['unique_alerts'] = unique_alerts\noutputs['duplicate_alerts'] = duplicate_alerts\noutputs['dedup_stats'] = dedup_stats\noutputs['summary'] = (\n f'\\u544a\\u8b66\\u53bb\\u91cd\\u5b8c\\u6210\\uff1a\\u8f93\\u5165 {total} \\u6761\\uff0c'\n f'\\u53bb\\u91cd\\u540e {unique} \\u6761\\u552f\\u4e00\\u544a\\u8b66\\uff0c'\n f'{duplicates} \\u6761\\u91cd\\u590d\\uff0c'\n f'\\u53bb\\u91cd\\u7387 {ratio * 100:.1f}%\\u3002\\u62a5\\u544a\\uff1a{report_path}'\n)" - } - ], - "edges": [ - { - "from": "receive_alerts", - "to": "normalize_alerts" - }, - { - "from": "normalize_alerts", - "to": "compute_dedup_keys" - }, - { - "from": "compute_dedup_keys", - "to": "group_by_dedup_key" + "description": "Step 4 — 研判分析:仅对唯一 dedup_key 的代表告警调用 LLM 判断是否为真实攻击(is_attack),然后将结果回填给同组所有重复告警;输出含 is_attack 字段的完整告警列表", + "code": "import json\n\nIS_ATTACK_PROMPT = \"\"\"作为网络安全专家,请分析告警日志,判断是否为成功的攻击事件。\n\n### 告警数据:\n{alert}\n\n### 判定标准:\n**必须同时满足以下条件才判定为\"是\":**\n\n1. **存在明确攻击特征**:\n - 载荷或请求内容包含明确的攻击模式(SQL注入、XSS、命令执行、路径遍历、webshell等)\n - 攻击载荷经解码后确认为恶意代码\n\n2. 
**攻击成功执行且有明确证据**:\n - 命令执行类:响应内容包含命令执行结果(系统信息、用户信息、目录列表等)\n - 数据泄露类:响应内容包含敏感信息(数据库错误信息、用户数据、系统配置等)\n - 文件操作类:响应显示文件上传成功或敏感文件访问成功\n - webshell类:响应包含webshell标识符\n\n### 严格排除(以下必须判定为\"否\"):\n- 所有 HTTP 错误状态码(400/401/403/404/405/500 等)响应\n- 含安全拦截信息(waf/blocked/已拦截/firewall 等)的响应\n- 载荷为 none 且无明确攻击载荷的情况\n- 正常网页内容、域名停放页面、API 正常响应\n\n### 输出要求:\n仅输出\"是\"或\"否\",不要任何解释或其他内容。\"\"\"\n\ndeduped_alerts = inputs.get('deduped_alerts', [])\nunique_alerts = inputs.get('unique_alerts', [])\nanalyze_enabled = inputs.get('analyze_enabled', True)\nstats = dict(inputs.get('stats', {}))\n\nif not analyze_enabled:\n for a in deduped_alerts:\n a['is_attack'] = None\n a['_analysis_skipped'] = True\n stats['analyzed_unique_count'] = 0\n outputs['analyzed_alerts'] = deduped_alerts\n outputs['stats'] = stats\nelse:\n # Step A: LLM analysis on unique dedup_key alerts\n dedup_results = {} # dedup_key -> is_attack bool\n for alert in unique_alerts:\n dedup_key = alert.get('dedup_key', '')\n try:\n alert_str = json.dumps(alert, ensure_ascii=False)[:4000]\n prompt = IS_ATTACK_PROMPT.format(alert=alert_str)\n response = str(llm.ask(prompt)).strip().lower()\n first_line = response.split('\\n')[0]\n is_attack = '是' in first_line or 'true' in first_line or 'yes' in first_line\n except Exception:\n is_attack = False\n dedup_results[dedup_key] = is_attack\n\n # Step B: backfill is_attack for all alerts (including duplicates)\n analyzed = []\n for alert in deduped_alerts:\n a = dict(alert)\n a['is_attack'] = dedup_results.get(a.get('dedup_key', ''), False)\n analyzed.append(a)\n\n attack_count = sum(1 for a in analyzed if a.get('is_attack'))\n stats['analyzed_unique_count'] = len(dedup_results)\n stats['attack_count'] = attack_count\n stats['non_attack_count'] = len(analyzed) - attack_count\n outputs['analyzed_alerts'] = analyzed\n outputs['stats'] = stats" }, { - "from": "group_by_dedup_key", - "to": "generate_dedup_report" + "id": "generate_report", + "type": "python", + "description": "汇总四阶段流水线统计数据,写出最终 
Markdown 报告及各阶段告警的 JSONL 数据文件", + "code": "import os\nimport json\nimport datetime\nfrom flocks.workspace.manager import WorkspaceManager\n\nanalyzed_alerts = inputs.get('analyzed_alerts', [])\nstats = inputs.get('stats', {})\n\nws = WorkspaceManager.get_instance()\noutput_dir = str(ws.get_workspace_dir() / 'outputs' / datetime.date.today().isoformat())\nartifacts_dir = os.path.join(output_dir, 'artifacts')\nos.makedirs(artifacts_dir, exist_ok=True)\n\nraw_count = stats.get('raw_count', 0)\nnorm_count = stats.get('normalized_count', raw_count)\nfilter_count = stats.get('after_filter_count', norm_count)\nfilter_removed = stats.get('filter_removed_count', 0)\ndedup_count = stats.get('after_dedup_count', filter_count)\nunique_key_count = stats.get('unique_key_count', dedup_count)\ndedup_removed = stats.get('dedup_removed_count', 0)\nanalyzed_unique = stats.get('analyzed_unique_count', 0)\nattack_count = stats.get('attack_count', 0)\nnon_attack_count = stats.get('non_attack_count', 0)\n\nattack_alerts = [a for a in analyzed_alerts if a.get('is_attack')]\nnon_attack_alerts = [a for a in analyzed_alerts if not a.get('is_attack') and a.get('is_attack') is not None]\n\nreport = f\"\"\"# 告警处理 Pipeline 报告\n\n## 执行摘要\n\n| 阶段 | 告警数 | 说明 |\n|------|--------|------|\n| 原始输入 | {raw_count} | 接收到的原始告警总数 |\n| Step 1 归一化 | {norm_count} | 字段映射后告警数 |\n| Step 2 过滤后 | {filter_count} | 剔除 {filter_removed} 条(扫描/出站/非HTTP)|\n| Step 3 去重后(唯一簇) | {unique_key_count} | 从 {dedup_count} 条中识别出 {unique_key_count} 个唯一去重键,剔除 {dedup_removed} 条重复 |\n| Step 4 研判(LLM 调用次数) | {analyzed_unique} | 仅对唯一 dedup_key 调用 LLM |\n| 判定为攻击 | {attack_count} | is_attack = True |\n| 判定为非攻击 | {non_attack_count} | is_attack = False |\n\n## 真实攻击告警(Top 20)\n\"\"\"\n\nfor i, a in enumerate(attack_alerts[:20], 1):\n sip = a.get('sip', a.get('\\u6e90IP', ''))\n dip = a.get('dip', a.get('\\u76ee\\u7684IP', ''))\n threat = a.get('threat_name', a.get('\\u5a01\\u80c1\\u540d\\u79f0', a.get('alert_type', '')))\n url = 
a.get('req_http_url', a.get('\\u8bf7\\u6c42\\u5185\\u5bb9', ''))[:80]\n dedup_key = a.get('dedup_key', '')[:16]\n report += f'\\n{i}. **{threat}** | {sip} \\u2192 {dip} | URL: `{url}` | dedup_key: `{dedup_key}...`\\n'\n\nreport += f\"\"\"\n\n---\n\n## 算法说明\n\n1. **归一化**:TDP/Skyeye 字段映射 → 标准字段(sip/dip/req_http_url/req_body/rsp_body/threat_name 等)\n2. **过滤**:剔除扫描类(非 webshell)告警、出站流量、非 HTTP 协议告警\n3. **去重**:URI 归一化(日期→DATETIME/UUID→UUID/长数字→NUM)+ 5-gram Jaccard 相似度聚类\n4. **研判**:LLM 仅分析唯一 dedup_key 的代表告警,结果回填至同簇所有告警\n\n---\n\n*生成时间: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*\n\"\"\"\n\nreport_path = os.path.join(output_dir, 'alert_pipeline_report.md')\ntool.run('write', filePath=report_path, content=report)\n\ndef write_jsonl(path, records):\n tool.run('write', filePath=path,\n content='\\n'.join(json.dumps(r, ensure_ascii=False) for r in records))\n\nwrite_jsonl(os.path.join(artifacts_dir, 'pipeline_all_analyzed.jsonl'), analyzed_alerts)\nwrite_jsonl(os.path.join(artifacts_dir, 'pipeline_attack_alerts.jsonl'), attack_alerts)\nwrite_jsonl(os.path.join(artifacts_dir, 'pipeline_non_attack_alerts.jsonl'), non_attack_alerts)\n\noutputs['report_path'] = report_path\noutputs['analyzed_alerts'] = analyzed_alerts\noutputs['attack_alerts'] = attack_alerts\noutputs['stats'] = stats\noutputs['summary'] = (\n f'Pipeline \\u5b8c\\u6210: \\u8f93\\u5165 {raw_count} \\u6761 \\u2192 '\n f'\\u8fc7\\u6ee4\\u540e {filter_count} \\u6761 \\u2192 '\n f'\\u53bb\\u91cd {unique_key_count} \\u4e2a\\u7c07 \\u2192 '\n f'\\u5224\\u5b9a\\u653b\\u51fb {attack_count} \\u6761\\u3002\\u62a5\\u544a: {report_path}'\n)" } ], + "edges": [ + {"from": "receive_alerts", "to": "normalize_logs"}, + {"from": "normalize_logs", "to": "filter_logs"}, + {"from": "filter_logs", "to": "dedup_logs"}, + {"from": "dedup_logs", "to": "analyze_unique"}, + {"from": "analyze_unique", "to": "generate_report"} + ], "metadata": { - "node_timeout_s": 300, + "node_timeout_s": 600, "sampleInputs": { + 
"source_log_type": "tdp", + "normalize_enabled": true, + "filter_enabled": true, + "dedup_enabled": true, + "analyze_enabled": true, + "threshold": 0.7, + "strict_fields": ["sip", "dip"], + "lsh_fields": ["req_http_url", "req_body", "rsp_body"], "alerts": [ { - "sip": "1.2.3.4", - "dip": "10.0.0.1", - "req_http_url": "/admin/login.php?id=1 OR 1=1", - "req_body": "username=admin&password=123456", - "rsp_body": "HTTP/1.1 200 OK" - }, - { - "sip": "1.2.3.4", - "dip": "10.0.0.1", - "req_http_url": "/admin/login.php?id=2 OR 2=2", - "req_body": "username=admin&password=654321", - "rsp_body": "HTTP/1.1 200 OK" + "net_real_src_ip": "1.2.3.4", + "net_dest_ip": "10.0.0.1", + "direction": "in", + "net_type": "http", + "net_http_url": "/admin/login.php?id=1 OR 1=1", + "net_http_reqs_body": "username=admin&password=123456", + "net_http_resp_body": "root@localhost", + "threat_name": "SQL注入攻击", + "threat_type": "web攻击" } - ], - "strict_fields": ["sip", "dip"], - "lsh_fields": ["req_http_url", "req_body", "rsp_body"], - "threshold": 0.7 + ] } } } diff --git a/.flocks/plugins/workflows/alert_dedup/workflow.md b/.flocks/plugins/workflows/alert_dedup/workflow.md index 5c8ead96d..1ad0dfb4c 100644 --- a/.flocks/plugins/workflows/alert_dedup/workflow.md +++ b/.flocks/plugins/workflows/alert_dedup/workflow.md @@ -1,53 +1,14 @@ -# alert_dedup — 告警去重工作流 +# alert_dedup — 告警处理四阶段 Pipeline 工作流 ## 简介 -`alert_dedup` 是一个安全告警去重工作流,基于 **aisoc_mini** 项目的去重算法移植而来。 -它通过 URI 归一化 + 5-gram Shingling + Jaccard 相似度,将相似告警归入同一去重簇, -有效降低告警噪声,让安全分析师聚焦于真正唯一的威胁事件。 +`alert_dedup` 完整实现了 `aisoc_mini` 项目中 `LogProcessPipeline` 的四阶段主流程: -## 使用场景 - -- 批量告警分析前的预处理(降噪) -- SIEM/NDR 告警规律性分析 -- 告警风暴抑制(同一攻击模式产生大量重复告警) -- 与 LLM 研判结合:去重后只对唯一告警调用大模型,节省成本 - -## 输入参数 - -| 参数 | 类型 | 默认值 | 说明 | -|------|------|--------|------| -| `alerts` | `list[dict]` | 必填 | 待去重告警列表,每条为 JSON 对象 | -| `strict_fields` | `list[str]` | `["sip", "dip"]` | 严格匹配字段(如源IP/目的IP),这些字段不同则一定不归为同组 | -| `lsh_fields` | `list[str]` | `["req_http_url", "req_body", 
"rsp_body"]` | 近似匹配字段(URL、请求体、响应体),用于 Jaccard 相似度计算 | -| `threshold` | `float` | `0.7` | Jaccard 相似度阈值,超过此值认为是同类告警 | -| `max_field_len` | `int` | `500` | 字段截断长度,避免超长内容影响性能 | - -### 中文字段名支持 - -如果告警使用中文列名(如来自 CSV),可将字段名配置为中文: - -```json -{ - "strict_fields": ["源IP", "目的IP"], - "lsh_fields": ["请求内容", "响应内容", "载荷_decoded"] -} +``` +原始告警 → 归一化 → 过滤 → 去重 → 研判分析 ``` -## 输出 - -| 字段 | 说明 | -|------|------| -| `unique_alerts` | 去重后的唯一告警列表,每条含 `dedup_key`、`dedup_group_size`、`dedup_key_already_exists=false` | -| `duplicate_alerts` | 被归为重复的告警列表,含 `dedup_key_already_exists=true` | -| `dedup_stats` | 统计信息:总数、唯一数、重复数、去重率、分组数 | -| `report_path` | Markdown 报告路径 | -| `summary` | 单行执行摘要 | - -**告警新增字段说明:** -- `dedup_key`:MD5 去重键,同一去重簇的告警共享相同的值 -- `dedup_key_already_exists`:`true` 表示该告警是重复的 -- `dedup_group_size`:该告警所属分组的总数量 +每阶段均可通过配置独立开关,支持 TDP 和 Skyeye 两种日志格式。 ## 工作流节点 @@ -55,110 +16,134 @@ receive_alerts │ ▼ -normalize_alerts ← URI 归一化(日期/UUID/长数字/路径穿越/编码字符) +normalize_logs ← Step 1: TDP/Skyeye 字段映射,扁平化嵌套结构 + │ + ▼ +filter_logs ← Step 2: 过滤扫描类/出站/非 HTTP 告警 │ ▼ -compute_dedup_keys ← 严格字段精确匹配 + 5-gram Jaccard 近似相似度 +dedup_logs ← Step 3: URI 归一化 + 5-gram Jaccard 去重,生成 dedup_key │ ▼ -group_by_dedup_key ← 按去重键分组,标记唯一/重复 +analyze_unique ← Step 4: LLM 研判(仅对唯一 dedup_key 调用,结果回填重复告警) │ ▼ -generate_dedup_report ← 生成报告 + 写出 JSONL/JSON 数据文件 +generate_report ← 汇总统计,写出 Markdown 报告与 JSONL 数据文件 ``` -### 节点说明 +## 输入参数 + +| 参数 | 类型 | 默认值 | 说明 | +|------|------|--------|------| +| `alerts` | `list[dict]` | 必填 | 原始告警列表 | +| `source_log_type` | `str` | `"tdp"` | 日志来源类型,`"tdp"` 或 `"skyeye"` | +| `normalize_enabled` | `bool` | `true` | 是否执行字段归一化 | +| `filter_enabled` | `bool` | `true` | 是否执行规则过滤 | +| `dedup_enabled` | `bool` | `true` | 是否执行去重 | +| `analyze_enabled` | `bool` | `true` | 是否执行 LLM 研判 | +| `threshold` | `float` | `0.7` | Jaccard 相似度阈值(去重步骤) | +| `strict_fields` | `list[str]` | `["sip","dip"]` | 严格匹配字段 | +| `lsh_fields` | `list[str]` | `["req_http_url","req_body","rsp_body"]` | 近似匹配字段 | +| 
`max_field_len` | `int` | `500` | 字段截断长度 | -| 节点 ID | 类型 | 职责 | -|---------|------|------| -| `receive_alerts` | python | 解析告警输入(支持 JSON 字符串、列表、`{data:[...]}` 嵌套),提取配置 | -| `normalize_alerts` | python | 对 LSH 字段值进行 URI 风格归一化,去除日期/UUID/数字等噪声 | -| `compute_dedup_keys` | python | 5-gram Shingling + Jaccard 相似度计算,生成 MD5 去重键 | -| `group_by_dedup_key` | python | 按去重键分组,首条为代表(unique),其余标记为 duplicate | -| `generate_dedup_report` | python | 生成 Markdown 报告及 JSONL/JSON 数据文件 | +## 四阶段详解 -## 算法说明 +### Step 1 — 归一化(`normalize_logs`) -### 1. URI 归一化(`normalize_alerts`) +将 TDP 或 Skyeye 原始字段映射为统一标准字段: -对 URL、请求体等 LSH 字段应用以下替换规则,使内容相同但细节不同的告警在相似度计算上趋近: +| 标准字段 | TDP 原始字段 | Skyeye 原始字段 | +|----------|-------------|----------------| +| `sip` | `net_real_src_ip` | `sip` | +| `dip` | `net_dest_ip` | `dip` | +| `req_http_url` | `net_http_url` | `uri` | +| `req_body` | `net_http_reqs_body` | `req_body` | +| `rsp_body` | `net_http_resp_body` | `rsp_body` | +| `threat_name` | `threat_name` | `vuln_name` | +| `direction` | `direction` | *(none)* | +| `net_type` | `net_type` | *(none,自动探测 method)* | -| 模式 | 替换为 | -|------|--------| -| `2024-01-15`、`2024/01/15 14:30` | `DATETIME` | -| `550e8400-e29b-41d4-a716-...` (UUID) | `UUID` | -| 6 位及以上纯数字 | `NUM` | -| `../`、`..\` 路径穿越 | `../` | -| URL 编码的 NULL 字节 `%00` | `NULL` | -| 连续 3 个以上 URL 编码字符 | `ENCODED` | +支持嵌套结构(自动扁平化),缺失 `id` 时自动生成 UUID。 -### 2. 去重键计算(`compute_dedup_keys`) +### Step 2 — 过滤(`filter_logs`) -``` -strict_text = join(strict_fields values) -lsh_text = join(normalized lsh_fields values) +应用两条 jsonLogic 规则: -→ 在已有簇中找 strict_text 相同、Jaccard(lsh_text) ≥ threshold 的簇 -→ 若找到:复用该簇的 dedup_key -→ 若未找到:dedup_key = MD5(strict_text + ". 
" + lsh_text) -``` +| 规则 | 逻辑 | 结果 | +|------|------|------| +| 扫描过滤 | `threat_name/threat_type 含"扫描"` 且 **不含** `"webshell"` | 剔除 | +| 方向+协议 | `direction in ["in","none"]` 且 `net_type in ["http","none"]` | 保留 | -Jaccard 相似度基于 **5-gram** 分词(Character-level shingles)。 +仅保留**不是扫描类**且**满足入站 HTTP 条件**的告警,添加 `_need_analysis_is_attack`、`_is_scan`、`_is_inbound_http` 字段。 -## 输出文件 +### Step 3 — 去重(`dedup_logs`) + +1. **URI 归一化**:对 lsh_fields 字段值做正则替换(日期→`DATETIME`、UUID→`UUID`、6位+数字→`NUM`、路径穿越、URL 编码) +2. **相似度计算**:5-gram Character Shingles + Jaccard 相似度 +3. **聚类规则**:严格字段完全相同 + lsh_fields Jaccard ≥ threshold → 归为同一簇,复用同一 `dedup_key` +4. **去重键**:新簇时用 MD5(`strict_text + ". " + normalized_lsh_text`) 生成 -所有文件写入 `~/.flocks/workspace/outputs//`: +每条告警新增字段: +- `dedup_key`:MD5 哈希串 +- `dedup_key_already_exists`:`true` 表示该告警是重复告警 + +### Step 4 — 研判分析(`analyze_unique`) + +- **LLM 调用策略**:仅对每个 `dedup_key` 的第一条告警(代表)调用 LLM,重复告警直接复用结果 → 节省大量 LLM 调用开销 +- **Prompt**:专业安全研判 Prompt,明确区分"成功攻击"与"扫描/误报/正常流量" +- **输出字段**:每条告警新增 `is_attack: bool` + +## 输出 + +| 字段 | 说明 | +|------|------| +| `analyzed_alerts` | 全量含 `is_attack` 字段的告警列表 | +| `attack_alerts` | 判定为真实攻击的告警子集 | +| `stats` | 各阶段统计:raw/normalized/filtered/dedup/analyzed 计数 | +| `report_path` | 最终 Markdown 报告路径 | +| `summary` | 单行执行摘要 | + +## 输出文件 ``` -outputs/ -└── / - ├── alert_dedup_report.md # 主报告(Markdown) - └── artifacts/ - ├── dedup_all_alerts.jsonl # 全量带去重键告警 - ├── dedup_unique_alerts.jsonl # 唯一告警(去重代表) - ├── dedup_duplicate_alerts.jsonl # 重复告警 - └── dedup_groups.json # 分组统计(key + count) +outputs// +├── alert_pipeline_report.md # 主报告 +└── artifacts/ + ├── pipeline_all_analyzed.jsonl # 全量含 is_attack 告警 + ├── pipeline_attack_alerts.jsonl # 真实攻击告警 + └── pipeline_non_attack_alerts.jsonl # 非攻击/误报告警 ``` -## 示例 +## 与 aisoc_mini 的对应关系 + +| aisoc_mini 类/函数 | 本工作流节点 | +|-------------------|------------| +| `LogNormalization.process()` / `normalize_ndr_log()` | `normalize_logs` | +| `LogFilter.filter()` + `jsonLogic(rule_1, rule_2)` | 
`filter_logs` | +| `LogDedup.process()` + `LSHProcessor` + `normalize_uri()` | `dedup_logs` | +| `LogAnalysis.process_parallel()` | `analyze_unique` | +| `PipelineResult.stats` | `generate_report` | + +## 示例输入 ```json { + "source_log_type": "tdp", + "threshold": 0.7, "alerts": [ { - "sip": "1.2.3.4", - "dip": "10.0.0.1", - "req_http_url": "/admin/login.php?id=1 OR 1=1", - "req_body": "username=admin&password=123456", - "rsp_body": "HTTP/1.1 200 OK" - }, - { - "sip": "1.2.3.4", - "dip": "10.0.0.1", - "req_http_url": "/admin/login.php?id=2 OR 2=2", - "req_body": "username=admin&password=654321", - "rsp_body": "HTTP/1.1 200 OK" + "net_real_src_ip": "1.2.3.4", + "net_dest_ip": "10.0.0.1", + "direction": "in", + "net_type": "http", + "net_http_url": "/admin/login.php?id=1 OR 1=1", + "net_http_reqs_body": "username=admin&password=123456", + "net_http_resp_body": "root@localhost, MySQL 5.7", + "threat_name": "SQL注入攻击", + "threat_type": "web攻击" } - ], - "strict_fields": ["sip", "dip"], - "lsh_fields": ["req_http_url", "req_body", "rsp_body"], - "threshold": 0.7 + ] } ``` -上述两条告警的严格字段相同(同源同目),LSH 字段经归一化后相似度高(SQL 注入 payload 结构一致), -因此会被归为同一去重簇,只保留第一条为代表。 - -## 与 aisoc_mini 的对应关系 - -| aisoc_mini 组件 | 本工作流对应节点 | -|-----------------|----------------| -| `LogDecoder.process()` | `normalize_alerts`(子集:URI 归一化) | -| `LogDedup._generate_dedup_key_text()` | `compute_dedup_keys` | -| `LSHProcessor.query_most_similar()` | `compute_dedup_keys`(5-gram Jaccard 简化版) | -| `LogDedup.process()` | `group_by_dedup_key` | -| 报告输出 | `generate_dedup_report` | - -> **注意**:本工作流使用标准库实现相似度计算(无需 `datasketch`), -> 采用精确 Jaccard 而非 MinHash 近似。对于超大批量(>10 万条)告警, -> 建议改用 `datasketch` 的 MinHash LSH 以获得更好性能。 +> **提示**:`analyze_enabled: false` 可跳过 LLM 调用,仅做去重统计,适合纯降噪场景。 From 746bb83e35cd25412977c64d5a66dfc5ba47485a Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Fri, 8 May 2026 16:12:31 +0800 Subject: [PATCH 03/41] fix(alert_dedup): align filter logic with aisoc_mini LogFilter and parallelize LLM analysis 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 修复审阅中发现的功能性偏差: 1. filter_logs: 完整对齐 LogFilter._get_tdp_process_type() 9 种 process_type 分类 - 修正:HTTP 非扫描告警无论方向(in/out/lateral)都需研判(之前错误地只保留 direction=in/none,会丢失大量本应分析的出站/横向 HTTP 告警) - HTTP 协议判断兼容 application_layer_protocol/net_type/net_app_proto 多字段 - threat_type 取值与原版一致:tdp 取 threat_name,skyeye 取 threat_type - 新增 _process_type、_need_analysis_attack_status 字段 - 统计中加入 filter_process_type_counts 2. analyze_unique: ThreadPoolExecutor 并行调用 LLM(与 LogAnalysis.process_parallel 一致,可通过 analyze_max_workers 配置),单条失败不影响其他 3. dedup_logs: 移除函数体内重复的 import hashlib as _hl 4. 各节点新增轻量 print 进度日志,便于大批量调试 5. workflow.md 同步修正过滤逻辑描述 Co-authored-by: Cursor --- .../workflows/alert_dedup/workflow.json | 8 ++--- .../plugins/workflows/alert_dedup/workflow.md | 30 ++++++++++++++----- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/.flocks/plugins/workflows/alert_dedup/workflow.json b/.flocks/plugins/workflows/alert_dedup/workflow.json index 0b33693e0..c12f37e6d 100644 --- a/.flocks/plugins/workflows/alert_dedup/workflow.json +++ b/.flocks/plugins/workflows/alert_dedup/workflow.json @@ -19,20 +19,20 @@ { "id": "filter_logs", "type": "python", - "description": "Step 2 — 过滤:应用业务规则剔除不需研判的告警(纯扫描类且非 WebShell、出站流量、非 HTTP 协议);为每条告警写入 _need_analysis_is_attack/_threat_type 字段,只保留 need_analysis=True 的告警", - "code": "def json_logic(rule, data):\n if not isinstance(rule, dict):\n return rule\n if len(rule) != 1:\n raise ValueError('json_logic: expected single-key dict')\n op, params = next(iter(rule.items()))\n if op == 'and':\n return all(json_logic(p, data) for p in params)\n elif op == 'or':\n return any(json_logic(p, data) for p in params)\n elif op == 'in':\n item = json_logic(params[0], data)\n container = json_logic(params[1], data)\n return item in (container or [])\n elif op == '!':\n return not json_logic(params, data)\n elif op == 'var':\n key = params if isinstance(params, str) else 
(params[0] if params else '')\n default = None if isinstance(params, str) else (params[1] if len(params) > 1 else None)\n return data.get(key, default)\n return rule\n\n# Rule 1: 扫描类(且非 WebShell)告警 → 不需分析\nrule_is_scan = {\n 'or': [\n {'and': [{'in': ['扫描', {'var': 'threat_name'}]}, {'!': {'in': ['webshell', {'var': 'threat_name'}]}}]},\n {'and': [{'in': ['扫描', {'var': 'threat_type'}]}, {'!': {'in': ['webshell', {'var': 'threat_type'}]}}]}\n ]\n}\n# Rule 2: 入站 + HTTP 流量 → 需要分析\nrule_inbound_http = {\n 'and': [\n {'in': [{'var': 'direction'}, ['in', 'none']]},\n {'in': [{'var': 'net_type'}, ['http', 'none']]}\n ]\n}\n\nnormalized_alerts = inputs.get('normalized_alerts', [])\nfilter_enabled = inputs.get('filter_enabled', True)\nstats = dict(inputs.get('stats', {}))\n\nfiltered = []\nfor alert in normalized_alerts:\n alert = dict(alert)\n threat_name = str(alert.get('threat_name', '') or '').lower()\n threat_type = str(alert.get('threat_type', '') or '').lower()\n data = {**alert, 'threat_name': threat_name, 'threat_type': threat_type}\n\n if filter_enabled:\n is_scan = json_logic(rule_is_scan, data)\n is_inbound_http = json_logic(rule_inbound_http, data)\n need_analysis = (not is_scan) and is_inbound_http\n else:\n is_scan = False\n is_inbound_http = True\n need_analysis = True\n\n alert['_need_analysis_is_attack'] = need_analysis\n alert['_is_scan'] = is_scan\n alert['_is_inbound_http'] = is_inbound_http\n alert['_threat_type'] = alert.get('threat_type', '')\n\n if need_analysis:\n filtered.append(alert)\n\nstats['after_filter_count'] = len(filtered)\nstats['filter_removed_count'] = len(normalized_alerts) - len(filtered)\noutputs['filtered_alerts'] = filtered\noutputs['stats'] = stats\nfor k in ['dedup_enabled', 'analyze_enabled', 'dedup_threshold',\n 'strict_fields', 'lsh_fields', 'max_field_len', 'analyze_max_workers']:\n outputs[k] = inputs.get(k)" + "description": "Step 2 — 过滤(对齐 aisoc_mini LogFilter):标记 process_type(scan/non-scan × http/non-http × 
in/out/lateral 共 9 种);non-scan + HTTP(任意方向)需要分析;TDP 的 threat_type 取 threat_name,Skyeye 取 threat_type", + "code": "normalized_alerts = inputs.get('normalized_alerts', [])\nfilter_enabled = inputs.get('filter_enabled', True)\nsource_log_type = str(inputs.get('source_log_type', 'tdp')).lower()\nstats = dict(inputs.get('stats', {}))\n\ndef is_scan_alert(threat_name):\n tnl = str(threat_name or '').lower()\n return ('扫描' in tnl) and ('webshell' not in tnl)\n\ndef get_threat_type(alert, source):\n if source == 'skyeye':\n return str(alert.get('threat_type', 'general') or 'general')\n elif source == 'tdp':\n return str(alert.get('threat_name', 'general') or 'general')\n return 'general'\n\ndef is_http_protocol(alert):\n # 兼容多种 HTTP 协议字段:原版 LogFilter 用 application_layer_protocol;\n # 归一化后用 net_type;某些样本里 net_app_proto 也表示协议\n candidates = [\n alert.get('application_layer_protocol', ''),\n alert.get('net_type', ''),\n alert.get('net_app_proto', ''),\n ]\n return any('http' in str(c).lower() for c in candidates if c and str(c).lower() != 'none')\n\ndef get_process_type(alert, source):\n threat_name = alert.get('threat_name', '')\n direction = str(alert.get('direction', '') or '').lower()\n is_scan = is_scan_alert(threat_name)\n http = is_http_protocol(alert)\n\n if source == 'skyeye':\n return 'alert_scan_direction_in' if is_scan else 'alert_not_scan_http_direction_in'\n\n if is_scan:\n if direction == 'lateral':\n return 'alert_scan_direction_lateral'\n if direction == 'in':\n return 'alert_scan_direction_in'\n if direction == 'out':\n return 'alert_scan_direction_out'\n return 'alert_scan_direction_in'\n if http:\n if direction == 'lateral':\n return 'alert_not_scan_http_direction_lateral'\n if direction == 'out':\n return 'alert_not_scan_http_direction_out'\n return 'alert_not_scan_http_direction_in'\n if direction == 'lateral':\n return 'alert_not_scan_not_http_direction_lateral'\n if direction == 'out':\n return 'alert_not_scan_not_http_direction_out'\n if direction == 
'in':\n return 'alert_not_scan_not_http_direction_in'\n return 'alert_not_process'\n\nNEED_ANALYSIS_TYPES = {\n 'alert_not_scan_http_direction_in',\n 'alert_not_scan_http_direction_lateral',\n 'alert_not_scan_http_direction_out',\n}\n\nfiltered = []\nprocess_type_counts = {}\nfor alert in normalized_alerts:\n alert = dict(alert)\n if filter_enabled:\n ptype = get_process_type(alert, source_log_type)\n need_analysis = ptype in NEED_ANALYSIS_TYPES\n threat_type = get_threat_type(alert, source_log_type)\n need_attack_status = (ptype == 'alert_not_scan_http_direction_in')\n else:\n ptype = 'alert_filter_disabled'\n need_analysis = True\n threat_type = get_threat_type(alert, source_log_type)\n need_attack_status = True\n\n process_type_counts[ptype] = process_type_counts.get(ptype, 0) + 1\n alert['_process_type'] = ptype\n alert['_need_analysis_is_attack'] = need_analysis\n alert['_need_analysis_attack_status'] = need_attack_status\n alert['_threat_type'] = threat_type\n\n if need_analysis:\n filtered.append(alert)\n\nprint(f'[filter] input={len(normalized_alerts)}, kept={len(filtered)}, removed={len(normalized_alerts) - len(filtered)}')\nprint(f'[filter] process_type_counts={process_type_counts}')\n\nstats['after_filter_count'] = len(filtered)\nstats['filter_removed_count'] = len(normalized_alerts) - len(filtered)\nstats['filter_process_type_counts'] = process_type_counts\noutputs['filtered_alerts'] = filtered\noutputs['stats'] = stats\nfor k in ['dedup_enabled', 'analyze_enabled', 'dedup_threshold',\n 'strict_fields', 'lsh_fields', 'max_field_len', 'analyze_max_workers']:\n outputs[k] = inputs.get(k)" }, { "id": "dedup_logs", "type": "python", "description": "Step 3 — 去重:对 LSH 字段做 URI 归一化(日期/UUID/长数字/路径穿越/URL 编码替换),以严格字段精确匹配 + 5-gram Jaccard 相似度进行聚类,为每条告警生成 dedup_key(MD5);输出全量带 dedup_key 告警及唯一代表告警列表", - "code": "import re\nimport hashlib\n\ndef normalize_uri(text):\n if not text or str(text) == 'none':\n return str(text or '')\n t = str(text)\n t = 
re.sub(r'\\d{4}[-/]\\d{1,2}[-/]\\d{1,2}(?:[T ]\\d{2}:\\d{2}(?::\\d{2})?)?', 'DATETIME', t)\n t = re.sub(r'[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}', 'UUID', t)\n t = re.sub(r'\\b\\d{6,}\\b', 'NUM', t)\n t = re.sub(r'(?:\\.\\./|\\.\\\\\\\\)+', '../', t)\n t = re.sub(r'%00', 'NULL', t)\n t = re.sub(r'(?:%[0-9a-fA-F]{2}){3,}', 'ENCODED', t)\n return t\n\ndef shingles(text, k=5):\n t = str(text or '').lower()\n if len(t) < k:\n return frozenset([t]) if t else frozenset()\n return frozenset(t[i:i+k] for i in range(len(t) - k + 1))\n\ndef jaccard(a, b):\n if not a and not b:\n return 1.0\n if not a or not b:\n return 0.0\n return len(a & b) / len(a | b)\n\nfiltered_alerts = inputs.get('filtered_alerts', [])\ndedup_enabled = inputs.get('dedup_enabled', True)\nthreshold = float(inputs.get('dedup_threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_len = int(inputs.get('max_field_len', 500))\nstats = dict(inputs.get('stats', {}))\n\n# registry: dedup_key -> (strict_text, shingle_set)\nregistry = {}\nkeyed = []\n\nfor alert in filtered_alerts:\n alert = dict(alert)\n if not dedup_enabled:\n import hashlib as _hl\n raw = ''.join(str(alert.get(f, ''))[:max_len] for f in strict_fields + lsh_fields)\n alert['dedup_key'] = _hl.md5(raw.encode()).hexdigest()\n alert['dedup_key_already_exists'] = False\n keyed.append(alert)\n continue\n\n strict_text = '. '.join(str(alert.get(f, ''))[:max_len] for f in strict_fields)\n lsh_text = '. 
'.join(\n normalize_uri(str(alert.get(f, ''))[:max_len])\n for f in lsh_fields\n )\n cur_shingles = shingles(lsh_text)\n\n matched_key = None\n best_sim = 0.0\n for existing_key, (ex_strict, ex_shingles) in registry.items():\n if ex_strict != strict_text:\n continue\n sim = jaccard(cur_shingles, ex_shingles)\n if sim >= threshold and sim > best_sim:\n best_sim = sim\n matched_key = existing_key\n\n if matched_key is None:\n raw_key = hashlib.md5(f'{strict_text}. {lsh_text}'.encode('utf-8')).hexdigest()\n registry[raw_key] = (strict_text, cur_shingles)\n canonical = raw_key\n alert['dedup_key_already_exists'] = False\n else:\n canonical = matched_key\n alert['dedup_key_already_exists'] = True\n\n alert['dedup_key'] = canonical\n keyed.append(alert)\n\n# Build unique alert list (first occurrence per dedup_key)\nseen = {}\nunique_alerts = []\nfor a in keyed:\n k = a['dedup_key']\n if k not in seen:\n seen[k] = a\n unique_alerts.append(a)\n\nstats['after_dedup_count'] = len(keyed)\nstats['unique_key_count'] = len(unique_alerts)\nstats['dedup_removed_count'] = len(keyed) - len(unique_alerts)\n\noutputs['deduped_alerts'] = keyed\noutputs['unique_alerts'] = unique_alerts\noutputs['stats'] = stats\nfor k in ['analyze_enabled', 'analyze_max_workers']:\n outputs[k] = inputs.get(k)" + "code": "import re\nimport hashlib\n\ndef normalize_uri(text):\n if not text or str(text) == 'none':\n return str(text or '')\n t = str(text)\n t = re.sub(r'\\d{4}[-/]\\d{1,2}[-/]\\d{1,2}(?:[T ]\\d{2}:\\d{2}(?::\\d{2})?)?', 'DATETIME', t)\n t = re.sub(r'[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}', 'UUID', t)\n t = re.sub(r'\\b\\d{6,}\\b', 'NUM', t)\n t = re.sub(r'(?:\\.\\./|\\.\\\\\\\\)+', '../', t)\n t = re.sub(r'%00', 'NULL', t)\n t = re.sub(r'(?:%[0-9a-fA-F]{2}){3,}', 'ENCODED', t)\n return t\n\ndef shingles(text, k=5):\n t = str(text or '').lower()\n if len(t) < k:\n return frozenset([t]) if t else frozenset()\n return frozenset(t[i:i+k] for i in range(len(t) 
- k + 1))\n\ndef jaccard(a, b):\n if not a and not b:\n return 1.0\n if not a or not b:\n return 0.0\n return len(a & b) / len(a | b)\n\nfiltered_alerts = inputs.get('filtered_alerts', [])\ndedup_enabled = inputs.get('dedup_enabled', True)\nthreshold = float(inputs.get('dedup_threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_len = int(inputs.get('max_field_len', 500))\nstats = dict(inputs.get('stats', {}))\n\nregistry = {}\nkeyed = []\n\nfor alert in filtered_alerts:\n alert = dict(alert)\n if not dedup_enabled:\n raw = ''.join(str(alert.get(f, ''))[:max_len] for f in strict_fields + lsh_fields)\n alert['dedup_key'] = hashlib.md5(raw.encode('utf-8')).hexdigest()\n alert['dedup_key_already_exists'] = False\n keyed.append(alert)\n continue\n\n strict_text = '. '.join(str(alert.get(f, ''))[:max_len] for f in strict_fields)\n lsh_text = '. '.join(\n normalize_uri(str(alert.get(f, ''))[:max_len])\n for f in lsh_fields\n )\n cur_shingles = shingles(lsh_text)\n\n matched_key = None\n best_sim = 0.0\n for existing_key, (ex_strict, ex_shingles) in registry.items():\n if ex_strict != strict_text:\n continue\n sim = jaccard(cur_shingles, ex_shingles)\n if sim >= threshold and sim > best_sim:\n best_sim = sim\n matched_key = existing_key\n\n if matched_key is None:\n raw_key = hashlib.md5(f'{strict_text}. 
{lsh_text}'.encode('utf-8')).hexdigest()\n registry[raw_key] = (strict_text, cur_shingles)\n canonical = raw_key\n alert['dedup_key_already_exists'] = False\n else:\n canonical = matched_key\n alert['dedup_key_already_exists'] = True\n\n alert['dedup_key'] = canonical\n keyed.append(alert)\n\nseen = {}\nunique_alerts = []\nfor a in keyed:\n k = a['dedup_key']\n if k not in seen:\n seen[k] = a\n unique_alerts.append(a)\n\nprint(f'[dedup] input={len(filtered_alerts)}, unique_keys={len(unique_alerts)}, duplicates={len(keyed) - len(unique_alerts)}, threshold={threshold}')\n\nstats['after_dedup_count'] = len(keyed)\nstats['unique_key_count'] = len(unique_alerts)\nstats['dedup_removed_count'] = len(keyed) - len(unique_alerts)\n\noutputs['deduped_alerts'] = keyed\noutputs['unique_alerts'] = unique_alerts\noutputs['stats'] = stats\nfor k in ['analyze_enabled', 'analyze_max_workers']:\n outputs[k] = inputs.get(k)" }, { "id": "analyze_unique", "type": "python", "description": "Step 4 — 研判分析:仅对唯一 dedup_key 的代表告警调用 LLM 判断是否为真实攻击(is_attack),然后将结果回填给同组所有重复告警;输出含 is_attack 字段的完整告警列表", - "code": "import json\n\nIS_ATTACK_PROMPT = \"\"\"作为网络安全专家,请分析告警日志,判断是否为成功的攻击事件。\n\n### 告警数据:\n{alert}\n\n### 判定标准:\n**必须同时满足以下条件才判定为\"是\":**\n\n1. **存在明确攻击特征**:\n - 载荷或请求内容包含明确的攻击模式(SQL注入、XSS、命令执行、路径遍历、webshell等)\n - 攻击载荷经解码后确认为恶意代码\n\n2. 
**攻击成功执行且有明确证据**:\n - 命令执行类:响应内容包含命令执行结果(系统信息、用户信息、目录列表等)\n - 数据泄露类:响应内容包含敏感信息(数据库错误信息、用户数据、系统配置等)\n - 文件操作类:响应显示文件上传成功或敏感文件访问成功\n - webshell类:响应包含webshell标识符\n\n### 严格排除(以下必须判定为\"否\"):\n- 所有 HTTP 错误状态码(400/401/403/404/405/500 等)响应\n- 含安全拦截信息(waf/blocked/已拦截/firewall 等)的响应\n- 载荷为 none 且无明确攻击载荷的情况\n- 正常网页内容、域名停放页面、API 正常响应\n\n### 输出要求:\n仅输出\"是\"或\"否\",不要任何解释或其他内容。\"\"\"\n\ndeduped_alerts = inputs.get('deduped_alerts', [])\nunique_alerts = inputs.get('unique_alerts', [])\nanalyze_enabled = inputs.get('analyze_enabled', True)\nstats = dict(inputs.get('stats', {}))\n\nif not analyze_enabled:\n for a in deduped_alerts:\n a['is_attack'] = None\n a['_analysis_skipped'] = True\n stats['analyzed_unique_count'] = 0\n outputs['analyzed_alerts'] = deduped_alerts\n outputs['stats'] = stats\nelse:\n # Step A: LLM analysis on unique dedup_key alerts\n dedup_results = {} # dedup_key -> is_attack bool\n for alert in unique_alerts:\n dedup_key = alert.get('dedup_key', '')\n try:\n alert_str = json.dumps(alert, ensure_ascii=False)[:4000]\n prompt = IS_ATTACK_PROMPT.format(alert=alert_str)\n response = str(llm.ask(prompt)).strip().lower()\n first_line = response.split('\\n')[0]\n is_attack = '是' in first_line or 'true' in first_line or 'yes' in first_line\n except Exception:\n is_attack = False\n dedup_results[dedup_key] = is_attack\n\n # Step B: backfill is_attack for all alerts (including duplicates)\n analyzed = []\n for alert in deduped_alerts:\n a = dict(alert)\n a['is_attack'] = dedup_results.get(a.get('dedup_key', ''), False)\n analyzed.append(a)\n\n attack_count = sum(1 for a in analyzed if a.get('is_attack'))\n stats['analyzed_unique_count'] = len(dedup_results)\n stats['attack_count'] = attack_count\n stats['non_attack_count'] = len(analyzed) - attack_count\n outputs['analyzed_alerts'] = analyzed\n outputs['stats'] = stats" + "code": "import json\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\n\nIS_ATTACK_PROMPT = \"\"\"作为网络安全专家,请分析告警日志,判断是否为成功的攻击事件。\n\n### 
告警数据:\n{alert}\n\n### 判定标准:\n**必须同时满足以下条件才判定为\"是\":**\n\n1. **存在明确攻击特征**:\n - 载荷或请求内容包含明确的攻击模式(SQL注入、XSS、命令执行、路径遍历、webshell等)\n - 攻击载荷经解码后确认为恶意代码\n\n2. **攻击成功执行且有明确证据**:\n - 命令执行类:响应内容包含命令执行结果(系统信息、用户信息、目录列表等)\n - 数据泄露类:响应内容包含敏感信息(数据库错误信息、用户数据、系统配置等)\n - 文件操作类:响应显示文件上传成功或敏感文件访问成功\n - webshell类:响应包含webshell标识符\n\n### 严格排除(以下必须判定为\"否\"):\n- 所有 HTTP 错误状态码(400/401/403/404/405/500 等)响应\n- 含安全拦截信息(waf/blocked/已拦截/firewall 等)的响应\n- 载荷为 none 且无明确攻击载荷的情况\n- 正常网页内容、域名停放页面、API 正常响应\n\n### 输出要求:\n仅输出\"是\"或\"否\",不要任何解释或其他内容。\"\"\"\n\ndeduped_alerts = inputs.get('deduped_alerts', [])\nunique_alerts = inputs.get('unique_alerts', [])\nanalyze_enabled = inputs.get('analyze_enabled', True)\nmax_workers = int(inputs.get('analyze_max_workers', 10))\nstats = dict(inputs.get('stats', {}))\n\ndef parse_is_attack(response_text):\n if not response_text:\n return False\n first_line = str(response_text).strip().lower().split('\\n')[0]\n return ('是' in first_line) or ('true' in first_line) or ('yes' in first_line)\n\ndef judge_one(alert):\n dedup_key = alert.get('dedup_key', '')\n try:\n alert_str = json.dumps(alert, ensure_ascii=False)[:4000]\n prompt = IS_ATTACK_PROMPT.format(alert=alert_str)\n response = llm.ask(prompt)\n return dedup_key, parse_is_attack(response), None\n except Exception as e:\n return dedup_key, False, str(e)\n\nif not analyze_enabled:\n for a in deduped_alerts:\n a['is_attack'] = None\n a['_analysis_skipped'] = True\n print('[analyze] disabled, skipping LLM calls')\n stats['analyzed_unique_count'] = 0\n outputs['analyzed_alerts'] = deduped_alerts\n outputs['stats'] = stats\nelse:\n print(f'[analyze] LLM judging {len(unique_alerts)} unique alerts (max_workers={max_workers})')\n dedup_results = {}\n error_count = 0\n with ThreadPoolExecutor(max_workers=max(1, max_workers)) as executor:\n futures = [executor.submit(judge_one, a) for a in unique_alerts]\n for fut in as_completed(futures):\n try:\n dk, is_attack, err = fut.result()\n except Exception as e:\n dk, is_attack, 
err = '', False, str(e)\n if err:\n error_count += 1\n if dk:\n dedup_results[dk] = is_attack\n\n analyzed = []\n for alert in deduped_alerts:\n a = dict(alert)\n a['is_attack'] = dedup_results.get(a.get('dedup_key', ''), False)\n analyzed.append(a)\n\n attack_count = sum(1 for a in analyzed if a.get('is_attack'))\n print(f'[analyze] done: unique_judged={len(dedup_results)}, errors={error_count}, attacks={attack_count}/{len(analyzed)}')\n\n stats['analyzed_unique_count'] = len(dedup_results)\n stats['analyze_error_count'] = error_count\n stats['attack_count'] = attack_count\n stats['non_attack_count'] = len(analyzed) - attack_count\n outputs['analyzed_alerts'] = analyzed\n outputs['stats'] = stats" }, { "id": "generate_report", diff --git a/.flocks/plugins/workflows/alert_dedup/workflow.md b/.flocks/plugins/workflows/alert_dedup/workflow.md index 1ad0dfb4c..0d85c0e06 100644 --- a/.flocks/plugins/workflows/alert_dedup/workflow.md +++ b/.flocks/plugins/workflows/alert_dedup/workflow.md @@ -67,14 +67,28 @@ generate_report ← 汇总统计,写出 Markdown 报告与 JSONL 数据文 ### Step 2 — 过滤(`filter_logs`) -应用两条 jsonLogic 规则: +完整对齐 `aisoc_mini` 的 `LogFilter._get_tdp_process_type()` / `_get_skyeye_process_type()`: -| 规则 | 逻辑 | 结果 | -|------|------|------| -| 扫描过滤 | `threat_name/threat_type 含"扫描"` 且 **不含** `"webshell"` | 剔除 | -| 方向+协议 | `direction in ["in","none"]` 且 `net_type in ["http","none"]` | 保留 | +1. **扫描判定**:`threat_name` 含「扫描」且 **不含** `webshell` → `is_scan = True` +2. **HTTP 判定**:`application_layer_protocol` / `net_type` / `net_app_proto` 任一字段含 `http` → HTTP 协议 +3. 
**process_type 计算**(共 9 种 + 1 种未处理): -仅保留**不是扫描类**且**满足入站 HTTP 条件**的告警,添加 `_need_analysis_is_attack`、`_is_scan`、`_is_inbound_http` 字段。 +| 类别 | direction | 标记 | 是否分析 | +|------|-----------|------|---------| +| 非扫描 + HTTP | in | `alert_not_scan_http_direction_in` | ✅ | +| 非扫描 + HTTP | out | `alert_not_scan_http_direction_out` | ✅ | +| 非扫描 + HTTP | lateral | `alert_not_scan_http_direction_lateral` | ✅ | +| 扫描类 | * | `alert_scan_direction_*` | ❌ | +| 非扫描 + 非HTTP | * | `alert_not_scan_not_http_direction_*` | ❌ | + +> **关键**:HTTP 非扫描告警**无论方向**(in/out/lateral)都需研判,与 aisoc_mini 行为一致。 + +4. **threat_type 取值**(与原版一致): + - skyeye → `threat_type` 字段 + - tdp → `threat_name` 字段(注意:**不是** `threat_type`) + +每条告警新增字段:`_process_type`、`_need_analysis_is_attack`、`_need_analysis_attack_status`、`_threat_type`。 +统计中包含 `filter_process_type_counts` 显示各类告警分布。 ### Step 3 — 去重(`dedup_logs`) @@ -89,8 +103,10 @@ generate_report ← 汇总统计,写出 Markdown 报告与 JSONL 数据文 ### Step 4 — 研判分析(`analyze_unique`) -- **LLM 调用策略**:仅对每个 `dedup_key` 的第一条告警(代表)调用 LLM,重复告警直接复用结果 → 节省大量 LLM 调用开销 +- **并行 LLM 调用**:使用 `ThreadPoolExecutor`(默认 `max_workers=10`,可通过 `analyze_max_workers` 配置),仅对每个 `dedup_key` 的代表告警调用 LLM +- **结果回填**:将 `is_attack` 结果回填给同簇所有重复告警 → 节省大量 LLM 调用开销 - **Prompt**:专业安全研判 Prompt,明确区分"成功攻击"与"扫描/误报/正常流量" +- **错误隔离**:单条告警 LLM 调用失败不影响其他告警,记入 `analyze_error_count` - **输出字段**:每条告警新增 `is_attack: bool` ## 输出 From f8f0c83115d76b31c9570e2636198a0d31d3d356 Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Fri, 8 May 2026 16:35:24 +0800 Subject: [PATCH 04/41] feat(alert_dedup): add explicit branch nodes for log-type and filter-result routing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 用显式 branch 节点替换代码内部的 if-else 路由,拓扑结构变为: receive_alerts → branch_log_type (select_key: source_log_type) label:"tdp" → normalize_tdp → filter_logs label:"skyeye" → normalize_skyeye → filter_logs → branch_has_alerts (select_key: _has_alerts) label:"true" → dedup_logs → analyze_unique → 
generate_report label:"false" → generate_empty_report(无 LLM 调用的快速终点) 修复: - branch_log_type 两条边均使用显式 label ("tdp"/"skyeye"), 使 lint 正确识别为互斥路径,消除 multi_incoming_no_join 报错 - false 分支独立终结于 generate_empty_report, 避免 generate_report 多入边 lint 错误 - 测试验证:TDP/Skyeye 字段映射、has_alerts true/false 四条路径均正确路由 Co-authored-by: Cursor --- .../workflows/alert_dedup/workflow.json | 72 +++++++++++++------ 1 file changed, 49 insertions(+), 23 deletions(-) diff --git a/.flocks/plugins/workflows/alert_dedup/workflow.json b/.flocks/plugins/workflows/alert_dedup/workflow.json index c12f37e6d..3e6e21c5b 100644 --- a/.flocks/plugins/workflows/alert_dedup/workflow.json +++ b/.flocks/plugins/workflows/alert_dedup/workflow.json @@ -1,64 +1,90 @@ { "name": "alert_dedup", - "description": "Full 4-stage alert processing pipeline: Normalize → Filter → Dedup → Analyze. Mirrors LogProcessPipeline from aisoc_mini: field normalization (TDP/Skyeye), scan-rule filtering, LSH-style deduplication, and LLM-based attack triage on unique alerts only.", - "description_cn": "告警处理四阶段主流程:归一化 → 过滤 → 去重 → 研判分析。对齐 aisoc_mini 的 LogProcessPipeline:字段归一化(TDP/Skyeye 字段映射)→ 规则过滤(扫描/出站/非HTTP剔除)→ URI归一化+Jaccard相似度去重(生成dedup_key)→ 仅对唯一dedup_key做LLM研判并将结果回填重复告警,最终输出完整管道报告。", + "description": "Full 4-stage alert processing pipeline with explicit branching: log-type branch (TDP vs Skyeye normalization), filter branch (has-alerts vs empty-result). 
Normalize → Filter → Dedup → Analyze.", + "description_cn": "告警处理四阶段主流程(含显式分支):日志类型分支(TDP/Skyeye 字段映射各走独立节点)→ 过滤分支(有可研判告警走去重+研判路径,无告警直接跳到报告节点)→ 去重 → LLM 研判。", "start": "receive_alerts", "nodes": [ { "id": "receive_alerts", "type": "python", - "description": "接收原始告警列表,解析输入格式,提取 Pipeline 配置(日志来源类型、去重字段、LSH 阈值、是否启用各阶段)", - "code": "import json\n\nalerts_input = inputs.get('alerts', inputs.get('alert_list', []))\nif isinstance(alerts_input, str):\n try:\n alerts_input = json.loads(alerts_input)\n except Exception:\n alerts_input = []\nif isinstance(alerts_input, dict) and 'data' in alerts_input:\n alerts_input = alerts_input.get('data', [])\nif not isinstance(alerts_input, list):\n alerts_input = [alerts_input] if alerts_input else []\n\nsource_log_type = str(inputs.get('source_log_type', 'tdp')).lower()\nnormalize_enabled = bool(inputs.get('normalize_enabled', True))\nfilter_enabled = bool(inputs.get('filter_enabled', True))\ndedup_enabled = bool(inputs.get('dedup_enabled', True))\nanalyze_enabled = bool(inputs.get('analyze_enabled', True))\n\ndedup_threshold = float(inputs.get('threshold', inputs.get('dedup_threshold', 0.7)))\nstrict_fields = inputs.get('strict_fields', inputs.get('dedup_fields_strict', ['sip', 'dip']))\nlsh_fields = inputs.get('lsh_fields', inputs.get('dedup_fields_lsh', ['req_http_url', 'req_body', 'rsp_body']))\nmax_field_len = int(inputs.get('max_field_len', 500))\nanalyze_max_workers = int(inputs.get('analyze_max_workers', 10))\n\nif not isinstance(strict_fields, list) or not strict_fields:\n strict_fields = ['sip', 'dip']\nif not isinstance(lsh_fields, list) or not lsh_fields:\n lsh_fields = ['req_http_url', 'req_body', 'rsp_body']\n\noutputs['raw_alerts'] = alerts_input\noutputs['stats'] = {'raw_count': len(alerts_input)}\noutputs['source_log_type'] = source_log_type\noutputs['normalize_enabled'] = normalize_enabled\noutputs['filter_enabled'] = filter_enabled\noutputs['dedup_enabled'] = dedup_enabled\noutputs['analyze_enabled'] = 
analyze_enabled\noutputs['dedup_threshold'] = dedup_threshold\noutputs['strict_fields'] = strict_fields\noutputs['lsh_fields'] = lsh_fields\noutputs['max_field_len'] = max_field_len\noutputs['analyze_max_workers'] = analyze_max_workers" + "description": "接收原始告警列表,解析输入格式,提取 Pipeline 配置,输出 source_log_type 供后续分支节点路由", + "code": "import json\n\nalerts_input = inputs.get('alerts', inputs.get('alert_list', []))\nif isinstance(alerts_input, str):\n try:\n alerts_input = json.loads(alerts_input)\n except Exception:\n alerts_input = []\nif isinstance(alerts_input, dict) and 'data' in alerts_input:\n alerts_input = alerts_input.get('data', [])\nif not isinstance(alerts_input, list):\n alerts_input = [alerts_input] if alerts_input else []\n\nsource_log_type = str(inputs.get('source_log_type', 'tdp')).lower()\nif source_log_type not in ('tdp', 'skyeye'):\n source_log_type = 'tdp'\n\nfilter_enabled = bool(inputs.get('filter_enabled', True))\ndedup_enabled = bool(inputs.get('dedup_enabled', True))\nanalyze_enabled = bool(inputs.get('analyze_enabled', True))\ndedup_threshold = float(inputs.get('threshold', inputs.get('dedup_threshold', 0.7)))\nstrict_fields = inputs.get('strict_fields', inputs.get('dedup_fields_strict', ['sip', 'dip']))\nlsh_fields = inputs.get('lsh_fields', inputs.get('dedup_fields_lsh', ['req_http_url', 'req_body', 'rsp_body']))\nmax_field_len = int(inputs.get('max_field_len', 500))\nanalyze_max_workers = int(inputs.get('analyze_max_workers', 10))\n\nif not isinstance(strict_fields, list) or not strict_fields:\n strict_fields = ['sip', 'dip']\nif not isinstance(lsh_fields, list) or not lsh_fields:\n lsh_fields = ['req_http_url', 'req_body', 'rsp_body']\n\nprint(f'[receive] source_log_type={source_log_type}, total={len(alerts_input)}')\n\noutputs['raw_alerts'] = alerts_input\noutputs['stats'] = {'raw_count': len(alerts_input)}\noutputs['source_log_type'] = source_log_type\noutputs['filter_enabled'] = filter_enabled\noutputs['dedup_enabled'] = 
dedup_enabled\noutputs['analyze_enabled'] = analyze_enabled\noutputs['dedup_threshold'] = dedup_threshold\noutputs['strict_fields'] = strict_fields\noutputs['lsh_fields'] = lsh_fields\noutputs['max_field_len'] = max_field_len\noutputs['analyze_max_workers'] = analyze_max_workers" }, { - "id": "normalize_logs", + "id": "branch_log_type", + "type": "branch", + "select_key": "source_log_type", + "description": "按 source_log_type 路由:'skyeye' → normalize_skyeye;其他(默认 'tdp')→ normalize_tdp" + }, + { + "id": "normalize_tdp", + "type": "python", + "description": "TDP 字段归一化:将 TDP 原始嵌套字段(net_real_src_ip/net_http_url/threat_name 等)映射为标准字段(sip/dip/req_http_url/threat_name 等)", + "code": "import uuid\n\nHTTP_METHODS = ['GET', 'POST', 'PUT', 'DELETE', 'HEAD', 'OPTIONS', 'PATCH', 'TRACE']\n\nTDP_FIELD_MAP = {\n 'customer_uuid': 'customer_uuid',\n 'device_id': 'device_id',\n 'id': 'id',\n 'time': 'time',\n 'direction': 'direction',\n 'sip': 'net_real_src_ip',\n 'dip': 'net_dest_ip',\n 'sport': 'net_src_port',\n 'dport': 'net_dest_port',\n 'net_type': 'net_type',\n 'net_app_proto': 'net_app_proto',\n 'req_http_url': 'net_http_url',\n 'req_user_agent': 'net_http_reqs_user_agent',\n 'req_host': 'net_http_reqs_host',\n 'req_line': 'net_http_reqs_line',\n 'req_header': 'net_http_reqs_header',\n 'req_body': 'net_http_reqs_body',\n 'req_cookie': 'net_http_reqs_cookie',\n 'req_body_len': 'net_http_reqs_content_length',\n 'rsp_status_code': 'net_http_status',\n 'rsp_line': 'net_http_resp_line',\n 'rsp_header': 'net_http_resp_header',\n 'rsp_body': 'net_http_resp_body',\n 'rsp_body_len': 'net_http_resp_content_length',\n 'net_bytes_toclient': 'net_bytes_toclient',\n 'net_bytes_toserver': 'net_bytes_toserver',\n 'threat_rule_id': 'threat_suuid',\n 'threat_name': 'threat_name',\n 'threat_msg': 'threat_msg',\n 'threat_ioc': 'threat_ioc',\n 'threat_level': 'threat_level',\n 'threat_severity': 'threat_severity',\n 'threat_phase': 'threat_phase',\n 'threat_type': 'threat_type',\n 'threat_result': 
'threat_result',\n 'threat_confidence': 'threat_confidence',\n 'connection_established': 'established',\n 'asset_group_name': 'dest_assets_group_name',\n 'asset_name': 'dest_assets_latestName',\n}\n\ndef flatten_dict(d, prefix=''):\n res = {}\n for k, v in d.items():\n if isinstance(v, dict):\n res.update(flatten_dict(v, f'{prefix}{k}_'))\n else:\n res[f'{prefix}{k}'] = v\n return res\n\ndef make_uuid(norm):\n return str(uuid.uuid3(uuid.NAMESPACE_DNS, ''.join(str(v) for v in norm.values())))\n\ndef normalize_single(alert):\n flat = flatten_dict(alert)\n norm = {}\n for std_key, raw_key in TDP_FIELD_MAP.items():\n norm[std_key] = flat.get(raw_key, 'none')\n if norm.get('id') in ('none', None, ''):\n norm['id'] = make_uuid(norm)\n if norm.get('net_type') in ('none', None, ''):\n method = flat.get('method', 'none')\n norm['net_type'] = 'http' if method in HTTP_METHODS else ('none' if method == 'none' else 'other')\n return norm\n\nraw_alerts = inputs.get('raw_alerts', [])\nstats = dict(inputs.get('stats', {}))\nnormalized = [normalize_single(a) for a in raw_alerts]\nstats['normalized_count'] = len(normalized)\nprint(f'[normalize_tdp] {len(raw_alerts)} -> {len(normalized)}')\n\noutputs['normalized_alerts'] = normalized\noutputs['stats'] = stats\nfor k in ['source_log_type', 'filter_enabled', 'dedup_enabled', 'analyze_enabled',\n 'dedup_threshold', 'strict_fields', 'lsh_fields', 'max_field_len', 'analyze_max_workers']:\n outputs[k] = inputs.get(k)" + }, + { + "id": "normalize_skyeye", "type": "python", - "description": "Step 1 — 归一化:将 TDP 或 Skyeye 原始字段映射为统一标准字段(sip/dip/req_http_url/req_body/rsp_body/threat_name 等),扁平化嵌套结构,缺失 id 时自动生成 UUID", - "code": "import uuid\nimport json\n\nHTTP_METHODS = ['GET', 'POST', 'PUT', 'DELETE', 'HEAD', 'OPTIONS', 'PATCH', 'TRACE']\n\nTDP_FIELD_MAP = {\n 'customer_uuid': 'customer_uuid',\n 'device_id': 'device_id',\n 'id': 'id',\n 'time': 'time',\n 'direction': 'direction',\n 'sip': 'net_real_src_ip',\n 'dip': 'net_dest_ip',\n 'sport': 
'net_src_port',\n 'dport': 'net_dest_port',\n 'net_type': 'net_type',\n 'net_app_proto': 'net_app_proto',\n 'req_http_url': 'net_http_url',\n 'req_user_agent': 'net_http_reqs_user_agent',\n 'req_host': 'net_http_reqs_host',\n 'req_line': 'net_http_reqs_line',\n 'req_header': 'net_http_reqs_header',\n 'req_body': 'net_http_reqs_body',\n 'req_cookie': 'net_http_reqs_cookie',\n 'req_body_len': 'net_http_reqs_content_length',\n 'rsp_status_code': 'net_http_status',\n 'rsp_line': 'net_http_resp_line',\n 'rsp_header': 'net_http_resp_header',\n 'rsp_body': 'net_http_resp_body',\n 'rsp_body_len': 'net_http_resp_content_length',\n 'net_bytes_toclient': 'net_bytes_toclient',\n 'net_bytes_toserver': 'net_bytes_toserver',\n 'threat_rule_id': 'threat_suuid',\n 'threat_name': 'threat_name',\n 'threat_msg': 'threat_msg',\n 'threat_ioc': 'threat_ioc',\n 'threat_level': 'threat_level',\n 'threat_severity': 'threat_severity',\n 'threat_phase': 'threat_phase',\n 'threat_type': 'threat_type',\n 'threat_result': 'threat_result',\n 'threat_confidence': 'threat_confidence',\n 'connection_established': 'established',\n 'asset_group_name': 'dest_assets_group_name',\n 'asset_name': 'dest_assets_latestName',\n}\n\nSKYEYE_FIELD_MAP = {\n 'id': 'none',\n 'time': 'time',\n 'direction': 'none',\n 'sip': 'sip',\n 'dip': 'dip',\n 'sport': 'sport',\n 'dport': 'dport',\n 'net_type': 'none',\n 'net_app_proto': 'none',\n 'req_http_url': 'uri',\n 'req_user_agent': 'agent',\n 'req_host': 'host',\n 'req_line': 'none',\n 'req_header': 'req_header',\n 'req_body': 'req_body',\n 'req_cookie': 'none',\n 'req_body_len': 'none',\n 'rsp_status_code': 'rsp_status',\n 'rsp_line': 'none',\n 'rsp_header': 'rsp_header',\n 'rsp_body': 'rsp_body',\n 'rsp_body_len': 'rsp_body_len',\n 'threat_rule_id': 'rule_id',\n 'threat_name': 'vuln_name',\n 'threat_msg': 'vuln_desc',\n 'threat_ioc': 'none',\n 'threat_level': 'none',\n 'threat_severity': 'severity',\n 'threat_phase': 'none',\n 'threat_type': 'vuln_type',\n 
'threat_direction': 'none',\n 'threat_result': 'attack_result',\n 'threat_confidence': 'confidence',\n 'threat_tactic_id': 'attck_tactic',\n 'threat_technique_id': 'attck_tech',\n 'connection_established': 'established',\n 'real_attack': 'attack_flag',\n}\n\ndef flatten_dict(d, prefix=''):\n res = {}\n for k, v in d.items():\n if isinstance(v, dict):\n res.update(flatten_dict(v, f'{prefix}{k}_'))\n else:\n res[f'{prefix}{k}'] = v\n return res\n\ndef make_uuid(norm_alert):\n id_str = ''.join(str(v) for v in norm_alert.values())\n return str(uuid.uuid3(uuid.NAMESPACE_DNS, id_str))\n\ndef normalize_single(alert, field_map):\n flat = flatten_dict(alert)\n norm = {}\n for std_key, raw_key in field_map.items():\n norm[std_key] = flat.get(raw_key, 'none') if raw_key != 'none' else 'none'\n if norm.get('id') in ('none', None, ''):\n norm['id'] = make_uuid(norm)\n if norm.get('net_type') in ('none', None, ''):\n method = flat.get('method', 'none')\n norm['net_type'] = 'http' if method in HTTP_METHODS else ('none' if method == 'none' else 'other')\n return norm\n\nraw_alerts = inputs.get('raw_alerts', [])\nnormalize_enabled = inputs.get('normalize_enabled', True)\nsource_log_type = inputs.get('source_log_type', 'tdp')\nstats = dict(inputs.get('stats', {}))\n\nif normalize_enabled:\n field_map = SKYEYE_FIELD_MAP if 'skyeye' in source_log_type else TDP_FIELD_MAP\n normalized = [normalize_single(a, field_map) for a in raw_alerts]\nelse:\n normalized = [dict(a) for a in raw_alerts]\n\nstats['normalized_count'] = len(normalized)\noutputs['normalized_alerts'] = normalized\noutputs['stats'] = stats\nfor k in ['source_log_type', 'filter_enabled', 'dedup_enabled', 'analyze_enabled',\n 'dedup_threshold', 'strict_fields', 'lsh_fields', 'max_field_len', 'analyze_max_workers']:\n outputs[k] = inputs.get(k)" + "description": "Skyeye 字段归一化:将 Skyeye 原始字段(uri/agent/host/vuln_name/attack_result 等)映射为标准字段", + "code": "import uuid\n\nHTTP_METHODS = ['GET', 'POST', 'PUT', 'DELETE', 'HEAD', 
'OPTIONS', 'PATCH', 'TRACE']\n\nSKYEYE_FIELD_MAP = {\n 'id': 'none',\n 'time': 'time',\n 'direction': 'none',\n 'sip': 'sip',\n 'dip': 'dip',\n 'sport': 'sport',\n 'dport': 'dport',\n 'net_type': 'none',\n 'net_app_proto': 'none',\n 'req_http_url': 'uri',\n 'req_user_agent': 'agent',\n 'req_host': 'host',\n 'req_line': 'none',\n 'req_header': 'req_header',\n 'req_body': 'req_body',\n 'req_cookie': 'none',\n 'req_body_len': 'none',\n 'rsp_status_code': 'rsp_status',\n 'rsp_line': 'none',\n 'rsp_header': 'rsp_header',\n 'rsp_body': 'rsp_body',\n 'rsp_body_len': 'rsp_body_len',\n 'threat_rule_id': 'rule_id',\n 'threat_name': 'vuln_name',\n 'threat_msg': 'vuln_desc',\n 'threat_ioc': 'none',\n 'threat_level': 'none',\n 'threat_severity': 'severity',\n 'threat_phase': 'none',\n 'threat_type': 'vuln_type',\n 'threat_tactic_id': 'attck_tactic',\n 'threat_technique_id': 'attck_tech',\n 'threat_result': 'attack_result',\n 'threat_confidence': 'confidence',\n 'connection_established': 'established',\n 'real_attack': 'attack_flag',\n}\n\ndef flatten_dict(d, prefix=''):\n res = {}\n for k, v in d.items():\n if isinstance(v, dict):\n res.update(flatten_dict(v, f'{prefix}{k}_'))\n else:\n res[f'{prefix}{k}'] = v\n return res\n\ndef make_uuid(norm):\n return str(uuid.uuid3(uuid.NAMESPACE_DNS, ''.join(str(v) for v in norm.values())))\n\ndef normalize_single(alert):\n flat = flatten_dict(alert)\n norm = {}\n for std_key, raw_key in SKYEYE_FIELD_MAP.items():\n norm[std_key] = flat.get(raw_key, 'none') if raw_key != 'none' else 'none'\n if norm.get('id') in ('none', None, ''):\n norm['id'] = make_uuid(norm)\n if norm.get('net_type') in ('none', None, ''):\n method = flat.get('method', 'none')\n norm['net_type'] = 'http' if method in HTTP_METHODS else ('none' if method == 'none' else 'other')\n return norm\n\nraw_alerts = inputs.get('raw_alerts', [])\nstats = dict(inputs.get('stats', {}))\nnormalized = [normalize_single(a) for a in raw_alerts]\nstats['normalized_count'] = 
len(normalized)\nprint(f'[normalize_skyeye] {len(raw_alerts)} -> {len(normalized)}')\n\noutputs['normalized_alerts'] = normalized\noutputs['stats'] = stats\nfor k in ['source_log_type', 'filter_enabled', 'dedup_enabled', 'analyze_enabled',\n 'dedup_threshold', 'strict_fields', 'lsh_fields', 'max_field_len', 'analyze_max_workers']:\n outputs[k] = inputs.get(k)" }, { "id": "filter_logs", "type": "python", - "description": "Step 2 — 过滤(对齐 aisoc_mini LogFilter):标记 process_type(scan/non-scan × http/non-http × in/out/lateral 共 9 种);non-scan + HTTP(任意方向)需要分析;TDP 的 threat_type 取 threat_name,Skyeye 取 threat_type", - "code": "normalized_alerts = inputs.get('normalized_alerts', [])\nfilter_enabled = inputs.get('filter_enabled', True)\nsource_log_type = str(inputs.get('source_log_type', 'tdp')).lower()\nstats = dict(inputs.get('stats', {}))\n\ndef is_scan_alert(threat_name):\n tnl = str(threat_name or '').lower()\n return ('扫描' in tnl) and ('webshell' not in tnl)\n\ndef get_threat_type(alert, source):\n if source == 'skyeye':\n return str(alert.get('threat_type', 'general') or 'general')\n elif source == 'tdp':\n return str(alert.get('threat_name', 'general') or 'general')\n return 'general'\n\ndef is_http_protocol(alert):\n # 兼容多种 HTTP 协议字段:原版 LogFilter 用 application_layer_protocol;\n # 归一化后用 net_type;某些样本里 net_app_proto 也表示协议\n candidates = [\n alert.get('application_layer_protocol', ''),\n alert.get('net_type', ''),\n alert.get('net_app_proto', ''),\n ]\n return any('http' in str(c).lower() for c in candidates if c and str(c).lower() != 'none')\n\ndef get_process_type(alert, source):\n threat_name = alert.get('threat_name', '')\n direction = str(alert.get('direction', '') or '').lower()\n is_scan = is_scan_alert(threat_name)\n http = is_http_protocol(alert)\n\n if source == 'skyeye':\n return 'alert_scan_direction_in' if is_scan else 'alert_not_scan_http_direction_in'\n\n if is_scan:\n if direction == 'lateral':\n return 'alert_scan_direction_lateral'\n if direction == 
'in':\n return 'alert_scan_direction_in'\n if direction == 'out':\n return 'alert_scan_direction_out'\n return 'alert_scan_direction_in'\n if http:\n if direction == 'lateral':\n return 'alert_not_scan_http_direction_lateral'\n if direction == 'out':\n return 'alert_not_scan_http_direction_out'\n return 'alert_not_scan_http_direction_in'\n if direction == 'lateral':\n return 'alert_not_scan_not_http_direction_lateral'\n if direction == 'out':\n return 'alert_not_scan_not_http_direction_out'\n if direction == 'in':\n return 'alert_not_scan_not_http_direction_in'\n return 'alert_not_process'\n\nNEED_ANALYSIS_TYPES = {\n 'alert_not_scan_http_direction_in',\n 'alert_not_scan_http_direction_lateral',\n 'alert_not_scan_http_direction_out',\n}\n\nfiltered = []\nprocess_type_counts = {}\nfor alert in normalized_alerts:\n alert = dict(alert)\n if filter_enabled:\n ptype = get_process_type(alert, source_log_type)\n need_analysis = ptype in NEED_ANALYSIS_TYPES\n threat_type = get_threat_type(alert, source_log_type)\n need_attack_status = (ptype == 'alert_not_scan_http_direction_in')\n else:\n ptype = 'alert_filter_disabled'\n need_analysis = True\n threat_type = get_threat_type(alert, source_log_type)\n need_attack_status = True\n\n process_type_counts[ptype] = process_type_counts.get(ptype, 0) + 1\n alert['_process_type'] = ptype\n alert['_need_analysis_is_attack'] = need_analysis\n alert['_need_analysis_attack_status'] = need_attack_status\n alert['_threat_type'] = threat_type\n\n if need_analysis:\n filtered.append(alert)\n\nprint(f'[filter] input={len(normalized_alerts)}, kept={len(filtered)}, removed={len(normalized_alerts) - len(filtered)}')\nprint(f'[filter] process_type_counts={process_type_counts}')\n\nstats['after_filter_count'] = len(filtered)\nstats['filter_removed_count'] = len(normalized_alerts) - len(filtered)\nstats['filter_process_type_counts'] = process_type_counts\noutputs['filtered_alerts'] = filtered\noutputs['stats'] = stats\nfor k in ['dedup_enabled', 
'analyze_enabled', 'dedup_threshold',\n 'strict_fields', 'lsh_fields', 'max_field_len', 'analyze_max_workers']:\n outputs[k] = inputs.get(k)" + "description": "Step 2 — 过滤(对齐 aisoc_mini LogFilter):9 种 process_type 分类;non-scan + HTTP(任意方向 in/out/lateral)需分析;输出 _has_alerts 供后续分支节点路由", + "code": "normalized_alerts = inputs.get('normalized_alerts', [])\nfilter_enabled = inputs.get('filter_enabled', True)\nsource_log_type = str(inputs.get('source_log_type', 'tdp')).lower()\nstats = dict(inputs.get('stats', {}))\n\ndef is_scan_alert(threat_name):\n tnl = str(threat_name or '').lower()\n return ('扫描' in tnl) and ('webshell' not in tnl)\n\ndef get_threat_type(alert, source):\n if source == 'skyeye':\n return str(alert.get('threat_type', 'general') or 'general')\n return str(alert.get('threat_name', 'general') or 'general')\n\ndef is_http(alert):\n for field in ('application_layer_protocol', 'net_type', 'net_app_proto'):\n val = str(alert.get(field, '') or '').lower()\n if val and val != 'none' and 'http' in val:\n return True\n return False\n\ndef get_process_type(alert, source):\n threat_name = alert.get('threat_name', '')\n direction = str(alert.get('direction', '') or '').lower()\n scan = is_scan_alert(threat_name)\n http = is_http(alert)\n if source == 'skyeye':\n return 'alert_scan_direction_in' if scan else 'alert_not_scan_http_direction_in'\n if scan:\n return f'alert_scan_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_scan_direction_in'\n if http:\n return f'alert_not_scan_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_scan_http_direction_in'\n return f'alert_not_scan_not_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_process'\n\nNEED_ANALYSIS = {\n 'alert_not_scan_http_direction_in',\n 'alert_not_scan_http_direction_out',\n 'alert_not_scan_http_direction_lateral',\n}\n\nfiltered = []\nprocess_type_counts = {}\nfor alert in normalized_alerts:\n alert = 
dict(alert)\n if filter_enabled:\n ptype = get_process_type(alert, source_log_type)\n need = ptype in NEED_ANALYSIS\n threat_type = get_threat_type(alert, source_log_type)\n need_attack_status = (ptype == 'alert_not_scan_http_direction_in')\n else:\n ptype = 'filter_disabled'\n need = True\n threat_type = get_threat_type(alert, source_log_type)\n need_attack_status = True\n process_type_counts[ptype] = process_type_counts.get(ptype, 0) + 1\n alert['_process_type'] = ptype\n alert['_need_analysis_is_attack'] = need\n alert['_need_analysis_attack_status'] = need_attack_status\n alert['_threat_type'] = threat_type\n if need:\n filtered.append(alert)\n\nhas_alerts = len(filtered) > 0\nprint(f'[filter] input={len(normalized_alerts)}, kept={len(filtered)}, has_alerts={has_alerts}')\nprint(f'[filter] process_type_counts={process_type_counts}')\n\nstats['after_filter_count'] = len(filtered)\nstats['filter_removed_count'] = len(normalized_alerts) - len(filtered)\nstats['filter_process_type_counts'] = process_type_counts\n\noutputs['filtered_alerts'] = filtered\noutputs['_has_alerts'] = has_alerts\noutputs['stats'] = stats\n# 预填 dedup/analyze 空结果,供 false 分支直接到 generate_report 时使用\noutputs['deduped_alerts'] = []\noutputs['unique_alerts'] = []\noutputs['analyzed_alerts'] = []\nfor k in ['dedup_enabled', 'analyze_enabled', 'dedup_threshold',\n 'strict_fields', 'lsh_fields', 'max_field_len', 'analyze_max_workers']:\n outputs[k] = inputs.get(k)" + }, + { + "id": "branch_has_alerts", + "type": "branch", + "select_key": "_has_alerts", + "description": "按 _has_alerts 路由:True → dedup_logs(继续去重+研判);False(默认)→ generate_report(无可研判告警,直接出报告)" }, { "id": "dedup_logs", "type": "python", - "description": "Step 3 — 去重:对 LSH 字段做 URI 归一化(日期/UUID/长数字/路径穿越/URL 编码替换),以严格字段精确匹配 + 5-gram Jaccard 相似度进行聚类,为每条告警生成 dedup_key(MD5);输出全量带 dedup_key 告警及唯一代表告警列表", - "code": "import re\nimport hashlib\n\ndef normalize_uri(text):\n if not text or str(text) == 'none':\n return str(text or '')\n t = str(text)\n 
t = re.sub(r'\\d{4}[-/]\\d{1,2}[-/]\\d{1,2}(?:[T ]\\d{2}:\\d{2}(?::\\d{2})?)?', 'DATETIME', t)\n t = re.sub(r'[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}', 'UUID', t)\n t = re.sub(r'\\b\\d{6,}\\b', 'NUM', t)\n t = re.sub(r'(?:\\.\\./|\\.\\\\\\\\)+', '../', t)\n t = re.sub(r'%00', 'NULL', t)\n t = re.sub(r'(?:%[0-9a-fA-F]{2}){3,}', 'ENCODED', t)\n return t\n\ndef shingles(text, k=5):\n t = str(text or '').lower()\n if len(t) < k:\n return frozenset([t]) if t else frozenset()\n return frozenset(t[i:i+k] for i in range(len(t) - k + 1))\n\ndef jaccard(a, b):\n if not a and not b:\n return 1.0\n if not a or not b:\n return 0.0\n return len(a & b) / len(a | b)\n\nfiltered_alerts = inputs.get('filtered_alerts', [])\ndedup_enabled = inputs.get('dedup_enabled', True)\nthreshold = float(inputs.get('dedup_threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_len = int(inputs.get('max_field_len', 500))\nstats = dict(inputs.get('stats', {}))\n\nregistry = {}\nkeyed = []\n\nfor alert in filtered_alerts:\n alert = dict(alert)\n if not dedup_enabled:\n raw = ''.join(str(alert.get(f, ''))[:max_len] for f in strict_fields + lsh_fields)\n alert['dedup_key'] = hashlib.md5(raw.encode('utf-8')).hexdigest()\n alert['dedup_key_already_exists'] = False\n keyed.append(alert)\n continue\n\n strict_text = '. '.join(str(alert.get(f, ''))[:max_len] for f in strict_fields)\n lsh_text = '. '.join(\n normalize_uri(str(alert.get(f, ''))[:max_len])\n for f in lsh_fields\n )\n cur_shingles = shingles(lsh_text)\n\n matched_key = None\n best_sim = 0.0\n for existing_key, (ex_strict, ex_shingles) in registry.items():\n if ex_strict != strict_text:\n continue\n sim = jaccard(cur_shingles, ex_shingles)\n if sim >= threshold and sim > best_sim:\n best_sim = sim\n matched_key = existing_key\n\n if matched_key is None:\n raw_key = hashlib.md5(f'{strict_text}. 
{lsh_text}'.encode('utf-8')).hexdigest()\n registry[raw_key] = (strict_text, cur_shingles)\n canonical = raw_key\n alert['dedup_key_already_exists'] = False\n else:\n canonical = matched_key\n alert['dedup_key_already_exists'] = True\n\n alert['dedup_key'] = canonical\n keyed.append(alert)\n\nseen = {}\nunique_alerts = []\nfor a in keyed:\n k = a['dedup_key']\n if k not in seen:\n seen[k] = a\n unique_alerts.append(a)\n\nprint(f'[dedup] input={len(filtered_alerts)}, unique_keys={len(unique_alerts)}, duplicates={len(keyed) - len(unique_alerts)}, threshold={threshold}')\n\nstats['after_dedup_count'] = len(keyed)\nstats['unique_key_count'] = len(unique_alerts)\nstats['dedup_removed_count'] = len(keyed) - len(unique_alerts)\n\noutputs['deduped_alerts'] = keyed\noutputs['unique_alerts'] = unique_alerts\noutputs['stats'] = stats\nfor k in ['analyze_enabled', 'analyze_max_workers']:\n outputs[k] = inputs.get(k)" + "description": "Step 3 — 去重:URI 归一化 + 5-gram Jaccard 相似度聚类,为每条告警生成 dedup_key(MD5);输出全量 deduped_alerts 及唯一代表 unique_alerts", + "code": "import re\nimport hashlib\n\ndef normalize_uri(text):\n if not text or str(text) == 'none':\n return str(text or '')\n t = str(text)\n t = re.sub(r'\\d{4}[-/]\\d{1,2}[-/]\\d{1,2}(?:[T ]\\d{2}:\\d{2}(?::\\d{2})?)?', 'DATETIME', t)\n t = re.sub(r'[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}', 'UUID', t)\n t = re.sub(r'\\b\\d{6,}\\b', 'NUM', t)\n t = re.sub(r'(?:\\.\\./|\\.\\\\\\\\)+', '../', t)\n t = re.sub(r'%00', 'NULL', t)\n t = re.sub(r'(?:%[0-9a-fA-F]{2}){3,}', 'ENCODED', t)\n return t\n\ndef shingles(text, k=5):\n t = str(text or '').lower()\n if len(t) < k:\n return frozenset([t]) if t else frozenset()\n return frozenset(t[i:i+k] for i in range(len(t) - k + 1))\n\ndef jaccard(a, b):\n if not a and not b:\n return 1.0\n if not a or not b:\n return 0.0\n return len(a & b) / len(a | b)\n\nfiltered_alerts = inputs.get('filtered_alerts', [])\ndedup_enabled = inputs.get('dedup_enabled', 
True)\nthreshold = float(inputs.get('dedup_threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_len = int(inputs.get('max_field_len', 500))\nstats = dict(inputs.get('stats', {}))\n\nregistry = {}\nkeyed = []\nfor alert in filtered_alerts:\n alert = dict(alert)\n if not dedup_enabled:\n raw = ''.join(str(alert.get(f, ''))[:max_len] for f in strict_fields + lsh_fields)\n alert['dedup_key'] = hashlib.md5(raw.encode('utf-8')).hexdigest()\n alert['dedup_key_already_exists'] = False\n keyed.append(alert)\n continue\n strict_text = '. '.join(str(alert.get(f, ''))[:max_len] for f in strict_fields)\n lsh_text = '. '.join(normalize_uri(str(alert.get(f, ''))[:max_len]) for f in lsh_fields)\n cur_sh = shingles(lsh_text)\n matched_key = None\n best_sim = 0.0\n for ek, (ex_strict, ex_sh) in registry.items():\n if ex_strict != strict_text:\n continue\n sim = jaccard(cur_sh, ex_sh)\n if sim >= threshold and sim > best_sim:\n best_sim = sim\n matched_key = ek\n if matched_key is None:\n raw_key = hashlib.md5(f'{strict_text}. 
{lsh_text}'.encode('utf-8')).hexdigest()\n registry[raw_key] = (strict_text, cur_sh)\n canonical = raw_key\n alert['dedup_key_already_exists'] = False\n else:\n canonical = matched_key\n alert['dedup_key_already_exists'] = True\n alert['dedup_key'] = canonical\n keyed.append(alert)\n\nseen = {}\nunique_alerts = []\nfor a in keyed:\n k = a['dedup_key']\n if k not in seen:\n seen[k] = a\n unique_alerts.append(a)\n\nprint(f'[dedup] input={len(filtered_alerts)}, unique={len(unique_alerts)}, dup={len(keyed)-len(unique_alerts)}')\nstats['after_dedup_count'] = len(keyed)\nstats['unique_key_count'] = len(unique_alerts)\nstats['dedup_removed_count'] = len(keyed) - len(unique_alerts)\noutputs['deduped_alerts'] = keyed\noutputs['unique_alerts'] = unique_alerts\noutputs['stats'] = stats\nfor k in ['analyze_enabled', 'analyze_max_workers']:\n outputs[k] = inputs.get(k)" }, { "id": "analyze_unique", "type": "python", - "description": "Step 4 — 研判分析:仅对唯一 dedup_key 的代表告警调用 LLM 判断是否为真实攻击(is_attack),然后将结果回填给同组所有重复告警;输出含 is_attack 字段的完整告警列表", - "code": "import json\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\n\nIS_ATTACK_PROMPT = \"\"\"作为网络安全专家,请分析告警日志,判断是否为成功的攻击事件。\n\n### 告警数据:\n{alert}\n\n### 判定标准:\n**必须同时满足以下条件才判定为\"是\":**\n\n1. **存在明确攻击特征**:\n - 载荷或请求内容包含明确的攻击模式(SQL注入、XSS、命令执行、路径遍历、webshell等)\n - 攻击载荷经解码后确认为恶意代码\n\n2. 
**攻击成功执行且有明确证据**:\n - 命令执行类:响应内容包含命令执行结果(系统信息、用户信息、目录列表等)\n - 数据泄露类:响应内容包含敏感信息(数据库错误信息、用户数据、系统配置等)\n - 文件操作类:响应显示文件上传成功或敏感文件访问成功\n - webshell类:响应包含webshell标识符\n\n### 严格排除(以下必须判定为\"否\"):\n- 所有 HTTP 错误状态码(400/401/403/404/405/500 等)响应\n- 含安全拦截信息(waf/blocked/已拦截/firewall 等)的响应\n- 载荷为 none 且无明确攻击载荷的情况\n- 正常网页内容、域名停放页面、API 正常响应\n\n### 输出要求:\n仅输出\"是\"或\"否\",不要任何解释或其他内容。\"\"\"\n\ndeduped_alerts = inputs.get('deduped_alerts', [])\nunique_alerts = inputs.get('unique_alerts', [])\nanalyze_enabled = inputs.get('analyze_enabled', True)\nmax_workers = int(inputs.get('analyze_max_workers', 10))\nstats = dict(inputs.get('stats', {}))\n\ndef parse_is_attack(response_text):\n if not response_text:\n return False\n first_line = str(response_text).strip().lower().split('\\n')[0]\n return ('是' in first_line) or ('true' in first_line) or ('yes' in first_line)\n\ndef judge_one(alert):\n dedup_key = alert.get('dedup_key', '')\n try:\n alert_str = json.dumps(alert, ensure_ascii=False)[:4000]\n prompt = IS_ATTACK_PROMPT.format(alert=alert_str)\n response = llm.ask(prompt)\n return dedup_key, parse_is_attack(response), None\n except Exception as e:\n return dedup_key, False, str(e)\n\nif not analyze_enabled:\n for a in deduped_alerts:\n a['is_attack'] = None\n a['_analysis_skipped'] = True\n print('[analyze] disabled, skipping LLM calls')\n stats['analyzed_unique_count'] = 0\n outputs['analyzed_alerts'] = deduped_alerts\n outputs['stats'] = stats\nelse:\n print(f'[analyze] LLM judging {len(unique_alerts)} unique alerts (max_workers={max_workers})')\n dedup_results = {}\n error_count = 0\n with ThreadPoolExecutor(max_workers=max(1, max_workers)) as executor:\n futures = [executor.submit(judge_one, a) for a in unique_alerts]\n for fut in as_completed(futures):\n try:\n dk, is_attack, err = fut.result()\n except Exception as e:\n dk, is_attack, err = '', False, str(e)\n if err:\n error_count += 1\n if dk:\n dedup_results[dk] = is_attack\n\n analyzed = []\n for alert in deduped_alerts:\n a = 
dict(alert)\n a['is_attack'] = dedup_results.get(a.get('dedup_key', ''), False)\n analyzed.append(a)\n\n attack_count = sum(1 for a in analyzed if a.get('is_attack'))\n print(f'[analyze] done: unique_judged={len(dedup_results)}, errors={error_count}, attacks={attack_count}/{len(analyzed)}')\n\n stats['analyzed_unique_count'] = len(dedup_results)\n stats['analyze_error_count'] = error_count\n stats['attack_count'] = attack_count\n stats['non_attack_count'] = len(analyzed) - attack_count\n outputs['analyzed_alerts'] = analyzed\n outputs['stats'] = stats" + "description": "Step 4 — 研判分析:ThreadPoolExecutor 并行调用 LLM,仅对唯一 dedup_key 调用,is_attack 结果回填给同簇所有重复告警", + "code": "import json\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\n\nIS_ATTACK_PROMPT = \"\"\"作为网络安全专家,请分析告警日志,判断是否为成功的攻击事件。\n\n### 告警数据:\n{alert}\n\n### 判定标准:\n**必须同时满足以下条件才判定为\"是\":**\n\n1. **存在明确攻击特征**:\n - 载荷或请求内容包含明确的攻击模式(SQL注入、XSS、命令执行、路径遍历、webshell等)\n - 攻击载荷经解码后确认为恶意代码\n\n2. **攻击成功执行且有明确证据**:\n - 命令执行类:响应内容包含命令执行结果(系统信息、用户信息、目录列表等)\n - 数据泄露类:响应内容包含敏感信息(数据库错误信息、用户数据等)\n - 文件操作类:响应显示文件上传成功或敏感文件访问成功\n - webshell类:响应包含webshell标识符\n\n### 严格排除(以下必须判定为\"否\"):\n- 所有 HTTP 错误状态码(400/401/403/404/405/500 等)响应\n- 含安全拦截信息(waf/blocked/已拦截/firewall 等)的响应\n- 载荷为 none 且无明确攻击载荷的情况\n- 正常网页内容、域名停放页面、API 正常响应\n\n### 输出要求:\n仅输出\"是\"或\"否\",不要任何解释或其他内容。\"\"\"\n\ndef parse_is_attack(text):\n if not text:\n return False\n line = str(text).strip().lower().split('\\n')[0]\n return ('是' in line) or ('true' in line) or ('yes' in line)\n\ndef judge_one(alert):\n dk = alert.get('dedup_key', '')\n try:\n alert_str = json.dumps(alert, ensure_ascii=False)[:4000]\n prompt = IS_ATTACK_PROMPT.format(alert=alert_str)\n return dk, parse_is_attack(llm.ask(prompt)), None\n except Exception as e:\n return dk, False, str(e)\n\ndeduped_alerts = inputs.get('deduped_alerts', [])\nunique_alerts = inputs.get('unique_alerts', [])\nanalyze_enabled = inputs.get('analyze_enabled', True)\nmax_workers = int(inputs.get('analyze_max_workers', 
10))\nstats = dict(inputs.get('stats', {}))\n\nif not analyze_enabled:\n print('[analyze] disabled')\n for a in deduped_alerts:\n a['is_attack'] = None\n a['_analysis_skipped'] = True\n stats['analyzed_unique_count'] = 0\n outputs['analyzed_alerts'] = deduped_alerts\n outputs['stats'] = stats\nelse:\n print(f'[analyze] LLM judging {len(unique_alerts)} unique keys (workers={max_workers})')\n dedup_results = {}\n errors = 0\n with ThreadPoolExecutor(max_workers=max(1, max_workers)) as ex:\n futures = [ex.submit(judge_one, a) for a in unique_alerts]\n for fut in as_completed(futures):\n try:\n dk, is_attack, err = fut.result()\n except Exception as e:\n dk, is_attack, err = '', False, str(e)\n if err:\n errors += 1\n if dk:\n dedup_results[dk] = is_attack\n analyzed = []\n for alert in deduped_alerts:\n a = dict(alert)\n a['is_attack'] = dedup_results.get(a.get('dedup_key', ''), False)\n analyzed.append(a)\n attack_count = sum(1 for a in analyzed if a.get('is_attack'))\n print(f'[analyze] done: judged={len(dedup_results)}, errors={errors}, attacks={attack_count}/{len(analyzed)}')\n stats['analyzed_unique_count'] = len(dedup_results)\n stats['analyze_error_count'] = errors\n stats['attack_count'] = attack_count\n stats['non_attack_count'] = len(analyzed) - attack_count\n outputs['analyzed_alerts'] = analyzed\n outputs['stats'] = stats" + }, + { + "id": "generate_empty_report", + "type": "python", + "description": "过滤后无可研判告警时的快速终点:输出简短报告说明各阶段数量与过滤原因,写出空 JSONL 文件,不调用 LLM", + "code": "import os\nimport json\nimport datetime\nfrom flocks.workspace.manager import WorkspaceManager\n\nstats = inputs.get('stats', {})\nws = WorkspaceManager.get_instance()\noutput_dir = str(ws.get_workspace_dir() / 'outputs' / datetime.date.today().isoformat())\nartifacts_dir = os.path.join(output_dir, 'artifacts')\nos.makedirs(artifacts_dir, exist_ok=True)\n\nraw_count = stats.get('raw_count', 0)\nnorm_count = stats.get('normalized_count', raw_count)\nfilter_count = 
stats.get('after_filter_count', 0)\nfilter_removed = stats.get('filter_removed_count', 0)\nprocess_type_counts = stats.get('filter_process_type_counts', {})\n\nprocess_type_md = ''\nfor pt, cnt in sorted(process_type_counts.items(), key=lambda x: -x[1]):\n kept = '✅ 需研判' if 'not_scan_http' in pt else '❌ 已过滤'\n process_type_md += f'| `{pt}` | {cnt} | {kept} |\\n'\n\nreport = f\"\"\"# 告警处理 Pipeline 报告(无可研判告警)\n\n## 执行摘要\n\n过滤后无可研判告警,Pipeline 在过滤阶段终止。\n\n| 阶段 | 告警数 | 说明 |\n|------|--------|------|\n| 原始输入 | {raw_count} | 接收到的原始告警总数 |\n| Step 1 归一化 | {norm_count} | 字段映射后告警数 |\n| Step 2 过滤后 | {filter_count} | 剔除 {filter_removed} 条(扫描/非HTTP/其他)|\n| Step 3 去重 | — | 无告警,跳过 |\n| Step 4 研判 | — | 无告警,跳过 |\n\n## process_type 分布\n\n| process_type | 数量 | 说明 |\n|-------------|------|------|\n{process_type_md}\n---\n*生成时间: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*\n\"\"\"\n\nreport_path = os.path.join(output_dir, 'alert_pipeline_report.md')\ntool.run('write', filePath=report_path, content=report)\nfor fname in ('pipeline_all_analyzed.jsonl', 'pipeline_attack_alerts.jsonl', 'pipeline_non_attack_alerts.jsonl'):\n tool.run('write', filePath=os.path.join(artifacts_dir, fname), content='')\n\noutputs['report_path'] = report_path\noutputs['analyzed_alerts'] = []\noutputs['attack_alerts'] = []\noutputs['stats'] = stats\noutputs['summary'] = f'Pipeline 完成(无可研判告警): 输入 {raw_count} 条 → 过滤后 {filter_count} 条。报告: {report_path}'" }, { "id": "generate_report", "type": "python", - "description": "汇总四阶段流水线统计数据,写出最终 Markdown 报告及各阶段告警的 JSONL 数据文件", - "code": "import os\nimport json\nimport datetime\nfrom flocks.workspace.manager import WorkspaceManager\n\nanalyzed_alerts = inputs.get('analyzed_alerts', [])\nstats = inputs.get('stats', {})\n\nws = WorkspaceManager.get_instance()\noutput_dir = str(ws.get_workspace_dir() / 'outputs' / datetime.date.today().isoformat())\nartifacts_dir = os.path.join(output_dir, 'artifacts')\nos.makedirs(artifacts_dir, exist_ok=True)\n\nraw_count = 
stats.get('raw_count', 0)\nnorm_count = stats.get('normalized_count', raw_count)\nfilter_count = stats.get('after_filter_count', norm_count)\nfilter_removed = stats.get('filter_removed_count', 0)\ndedup_count = stats.get('after_dedup_count', filter_count)\nunique_key_count = stats.get('unique_key_count', dedup_count)\ndedup_removed = stats.get('dedup_removed_count', 0)\nanalyzed_unique = stats.get('analyzed_unique_count', 0)\nattack_count = stats.get('attack_count', 0)\nnon_attack_count = stats.get('non_attack_count', 0)\n\nattack_alerts = [a for a in analyzed_alerts if a.get('is_attack')]\nnon_attack_alerts = [a for a in analyzed_alerts if not a.get('is_attack') and a.get('is_attack') is not None]\n\nreport = f\"\"\"# 告警处理 Pipeline 报告\n\n## 执行摘要\n\n| 阶段 | 告警数 | 说明 |\n|------|--------|------|\n| 原始输入 | {raw_count} | 接收到的原始告警总数 |\n| Step 1 归一化 | {norm_count} | 字段映射后告警数 |\n| Step 2 过滤后 | {filter_count} | 剔除 {filter_removed} 条(扫描/出站/非HTTP)|\n| Step 3 去重后(唯一簇) | {unique_key_count} | 从 {dedup_count} 条中识别出 {unique_key_count} 个唯一去重键,剔除 {dedup_removed} 条重复 |\n| Step 4 研判(LLM 调用次数) | {analyzed_unique} | 仅对唯一 dedup_key 调用 LLM |\n| 判定为攻击 | {attack_count} | is_attack = True |\n| 判定为非攻击 | {non_attack_count} | is_attack = False |\n\n## 真实攻击告警(Top 20)\n\"\"\"\n\nfor i, a in enumerate(attack_alerts[:20], 1):\n sip = a.get('sip', a.get('\\u6e90IP', ''))\n dip = a.get('dip', a.get('\\u76ee\\u7684IP', ''))\n threat = a.get('threat_name', a.get('\\u5a01\\u80c1\\u540d\\u79f0', a.get('alert_type', '')))\n url = a.get('req_http_url', a.get('\\u8bf7\\u6c42\\u5185\\u5bb9', ''))[:80]\n dedup_key = a.get('dedup_key', '')[:16]\n report += f'\\n{i}. **{threat}** | {sip} \\u2192 {dip} | URL: `{url}` | dedup_key: `{dedup_key}...`\\n'\n\nreport += f\"\"\"\n\n---\n\n## 算法说明\n\n1. **归一化**:TDP/Skyeye 字段映射 → 标准字段(sip/dip/req_http_url/req_body/rsp_body/threat_name 等)\n2. **过滤**:剔除扫描类(非 webshell)告警、出站流量、非 HTTP 协议告警\n3. **去重**:URI 归一化(日期→DATETIME/UUID→UUID/长数字→NUM)+ 5-gram Jaccard 相似度聚类\n4. 
**研判**:LLM 仅分析唯一 dedup_key 的代表告警,结果回填至同簇所有告警\n\n---\n\n*生成时间: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*\n\"\"\"\n\nreport_path = os.path.join(output_dir, 'alert_pipeline_report.md')\ntool.run('write', filePath=report_path, content=report)\n\ndef write_jsonl(path, records):\n tool.run('write', filePath=path,\n content='\\n'.join(json.dumps(r, ensure_ascii=False) for r in records))\n\nwrite_jsonl(os.path.join(artifacts_dir, 'pipeline_all_analyzed.jsonl'), analyzed_alerts)\nwrite_jsonl(os.path.join(artifacts_dir, 'pipeline_attack_alerts.jsonl'), attack_alerts)\nwrite_jsonl(os.path.join(artifacts_dir, 'pipeline_non_attack_alerts.jsonl'), non_attack_alerts)\n\noutputs['report_path'] = report_path\noutputs['analyzed_alerts'] = analyzed_alerts\noutputs['attack_alerts'] = attack_alerts\noutputs['stats'] = stats\noutputs['summary'] = (\n f'Pipeline \\u5b8c\\u6210: \\u8f93\\u5165 {raw_count} \\u6761 \\u2192 '\n f'\\u8fc7\\u6ee4\\u540e {filter_count} \\u6761 \\u2192 '\n f'\\u53bb\\u91cd {unique_key_count} \\u4e2a\\u7c07 \\u2192 '\n f'\\u5224\\u5b9a\\u653b\\u51fb {attack_count} \\u6761\\u3002\\u62a5\\u544a: {report_path}'\n)" + "description": "完整路径终点:汇总四阶段流水线统计数据,写出 Markdown 报告及 JSONL 数据文件(含 is_attack 研判结果)", + "code": "import os\nimport json\nimport datetime\nfrom flocks.workspace.manager import WorkspaceManager\n\nanalyzed_alerts = inputs.get('analyzed_alerts', [])\nstats = inputs.get('stats', {})\n\nws = WorkspaceManager.get_instance()\noutput_dir = str(ws.get_workspace_dir() / 'outputs' / datetime.date.today().isoformat())\nartifacts_dir = os.path.join(output_dir, 'artifacts')\nos.makedirs(artifacts_dir, exist_ok=True)\n\nraw_count = stats.get('raw_count', 0)\nnorm_count = stats.get('normalized_count', raw_count)\nfilter_count = stats.get('after_filter_count', norm_count)\nfilter_removed = stats.get('filter_removed_count', 0)\nprocess_type_counts = stats.get('filter_process_type_counts', {})\nunique_key_count = stats.get('unique_key_count', 0)\ndedup_count = 
stats.get('after_dedup_count', filter_count)\ndedup_removed = stats.get('dedup_removed_count', 0)\nanalyzed_unique = stats.get('analyzed_unique_count', 0)\nattack_count = stats.get('attack_count', 0)\nnon_attack_count = stats.get('non_attack_count', 0)\n\nattack_alerts = [a for a in analyzed_alerts if a.get('is_attack')]\nnon_attack_alerts = [a for a in analyzed_alerts if a.get('is_attack') is False]\n\nprocess_type_md = ''\nfor pt, cnt in sorted(process_type_counts.items(), key=lambda x: -x[1]):\n kept = '✅ 需研判' if 'not_scan_http' in pt else '❌ 已过滤'\n process_type_md += f'| `{pt}` | {cnt} | {kept} |\\n'\n\nreport = f\"\"\"# 告警处理 Pipeline 报告\n\n## 执行摘要\n\n| 阶段 | 告警数 | 说明 |\n|------|--------|------|\n| 原始输入 | {raw_count} | 接收到的原始告警总数 |\n| Step 1 归一化 | {norm_count} | 字段映射后告警数 |\n| Step 2 过滤后 | {filter_count} | 剔除 {filter_removed} 条(扫描/非HTTP/其他)|\n| Step 3 去重唯一簇 | {unique_key_count} | 从 {dedup_count} 条压缩掉 {dedup_removed} 条重复 |\n| Step 4 LLM 调用次数 | {analyzed_unique} | 仅对唯一 dedup_key 调用 |\n| 判定为攻击 | {attack_count} | is_attack = True |\n| 判定为非攻击 | {non_attack_count} | is_attack = False |\n\n## process_type 分布\n\n| process_type | 数量 | 说明 |\n|-------------|------|------|\n{process_type_md}\n## 真实攻击告警(Top 20)\n\"\"\"\n\nfor i, a in enumerate(attack_alerts[:20], 1):\n sip = a.get('sip', '')\n dip = a.get('dip', '')\n threat = str(a.get('threat_name', a.get('alert_type', '')))\n url = str(a.get('req_http_url', ''))[:80]\n dk = a.get('dedup_key', '')[:16]\n report += f'\\n{i}. 
**{threat}** | {sip} → {dip} | `{url}` | key:`{dk}...`\\n'\n\nreport += f\"\"\"\n---\n*生成时间: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*\n\"\"\"\n\nreport_path = os.path.join(output_dir, 'alert_pipeline_report.md')\ntool.run('write', filePath=report_path, content=report)\n\ndef write_jsonl(path, records):\n tool.run('write', filePath=path,\n content='\\n'.join(json.dumps(r, ensure_ascii=False) for r in records))\n\nwrite_jsonl(os.path.join(artifacts_dir, 'pipeline_all_analyzed.jsonl'), analyzed_alerts)\nwrite_jsonl(os.path.join(artifacts_dir, 'pipeline_attack_alerts.jsonl'), attack_alerts)\nwrite_jsonl(os.path.join(artifacts_dir, 'pipeline_non_attack_alerts.jsonl'), non_attack_alerts)\n\noutputs['report_path'] = report_path\noutputs['analyzed_alerts'] = analyzed_alerts\noutputs['attack_alerts'] = attack_alerts\noutputs['stats'] = stats\noutputs['summary'] = (\n f'Pipeline 完成: 输入 {raw_count} 条 → 过滤后 {filter_count} 条 → '\n f'去重 {unique_key_count} 个簇 → 判定攻击 {attack_count} 条。报告: {report_path}'\n)" } ], "edges": [ - {"from": "receive_alerts", "to": "normalize_logs"}, - {"from": "normalize_logs", "to": "filter_logs"}, - {"from": "filter_logs", "to": "dedup_logs"}, - {"from": "dedup_logs", "to": "analyze_unique"}, - {"from": "analyze_unique", "to": "generate_report"} + {"from": "receive_alerts", "to": "branch_log_type", "order": 0}, + {"from": "branch_log_type", "to": "normalize_tdp", "label": "tdp", "order": 0}, + {"from": "branch_log_type", "to": "normalize_skyeye", "label": "skyeye","order": 1}, + {"from": "normalize_tdp", "to": "filter_logs", "order": 0}, + {"from": "normalize_skyeye", "to": "filter_logs", "order": 0}, + {"from": "filter_logs", "to": "branch_has_alerts", "order": 0}, + {"from": "branch_has_alerts", "to": "dedup_logs", "label": "true", "order": 0}, + {"from": "branch_has_alerts", "to": "generate_empty_report", "label": "false", "order": 1}, + {"from": "dedup_logs", "to": "analyze_unique", "order": 0}, + {"from": "analyze_unique", "to": 
"generate_report", "order": 0} ], "metadata": { "node_timeout_s": 600, "sampleInputs": { "source_log_type": "tdp", - "normalize_enabled": true, "filter_enabled": true, "dedup_enabled": true, "analyze_enabled": true, "threshold": 0.7, - "strict_fields": ["sip", "dip"], - "lsh_fields": ["req_http_url", "req_body", "rsp_body"], "alerts": [ { "net_real_src_ip": "1.2.3.4", From 44fdca58c28b45c29ba5405e36f3ee347f9e1b68 Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Fri, 8 May 2026 16:54:56 +0800 Subject: [PATCH 05/41] =?UTF-8?q?refactor(workflow):=20rename=20alert=5Fde?= =?UTF-8?q?dup=20=E2=86=92=20network=5Falert=5Fdedup,=20simplify=20to=20de?= =?UTF-8?q?dup-only?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 目录重命名 alert_dedup → network_alert_dedup - workflow.json / meta.json name & id 改为 network_alert_dedup - 移除 analyze_unique / generate_report / generate_empty_report / branch_has_alerts 四个节点 - 移除 LLM 研判相关参数(analyze_enabled / analyze_max_workers) - dedup_logs 成为终点节点,直接输出 dict: deduped_alerts(全量含 dedup_key)/ unique_alerts(唯一簇)/ stats / dedup_summary - filter_logs 清理预填空结果逻辑,精简 outputs 传递 - workflow.md 更新为三阶段流程说明(归一化→过滤→去重) - lint 无警告,模型验证通过 Co-authored-by: Cursor --- .../plugins/workflows/alert_dedup/meta.json | 10 -- .../workflows/alert_dedup/workflow.json | 103 ----------- .../plugins/workflows/alert_dedup/workflow.md | 165 ------------------ .../workflows/network_alert_dedup/meta.json | 10 ++ .../network_alert_dedup/workflow.json | 74 ++++++++ .../workflows/network_alert_dedup/workflow.md | 126 +++++++++++++ 6 files changed, 210 insertions(+), 278 deletions(-) delete mode 100644 .flocks/plugins/workflows/alert_dedup/meta.json delete mode 100644 .flocks/plugins/workflows/alert_dedup/workflow.json delete mode 100644 .flocks/plugins/workflows/alert_dedup/workflow.md create mode 100644 .flocks/plugins/workflows/network_alert_dedup/meta.json create mode 100644 .flocks/plugins/workflows/network_alert_dedup/workflow.json 
create mode 100644 .flocks/plugins/workflows/network_alert_dedup/workflow.md diff --git a/.flocks/plugins/workflows/alert_dedup/meta.json b/.flocks/plugins/workflows/alert_dedup/meta.json deleted file mode 100644 index e766918be..000000000 --- a/.flocks/plugins/workflows/alert_dedup/meta.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "name": "alert_dedup", - "description": "告警处理四阶段主流程:归一化(TDP/Skyeye字段映射)→ 过滤(扫描/出站剔除)→ 去重(URI归一化+Jaccard相似度)→ LLM研判(仅对唯一dedup_key调用),完整对齐 aisoc_mini LogProcessPipeline", - "category": "security", - "status": "active", - "createdBy": null, - "createdAt": 1746691200000, - "updatedAt": 1746691200000, - "id": "alert_dedup" -} diff --git a/.flocks/plugins/workflows/alert_dedup/workflow.json b/.flocks/plugins/workflows/alert_dedup/workflow.json deleted file mode 100644 index 3e6e21c5b..000000000 --- a/.flocks/plugins/workflows/alert_dedup/workflow.json +++ /dev/null @@ -1,103 +0,0 @@ -{ - "name": "alert_dedup", - "description": "Full 4-stage alert processing pipeline with explicit branching: log-type branch (TDP vs Skyeye normalization), filter branch (has-alerts vs empty-result). 
Normalize → Filter → Dedup → Analyze.", - "description_cn": "告警处理四阶段主流程(含显式分支):日志类型分支(TDP/Skyeye 字段映射各走独立节点)→ 过滤分支(有可研判告警走去重+研判路径,无告警直接跳到报告节点)→ 去重 → LLM 研判。", - "start": "receive_alerts", - "nodes": [ - { - "id": "receive_alerts", - "type": "python", - "description": "接收原始告警列表,解析输入格式,提取 Pipeline 配置,输出 source_log_type 供后续分支节点路由", - "code": "import json\n\nalerts_input = inputs.get('alerts', inputs.get('alert_list', []))\nif isinstance(alerts_input, str):\n try:\n alerts_input = json.loads(alerts_input)\n except Exception:\n alerts_input = []\nif isinstance(alerts_input, dict) and 'data' in alerts_input:\n alerts_input = alerts_input.get('data', [])\nif not isinstance(alerts_input, list):\n alerts_input = [alerts_input] if alerts_input else []\n\nsource_log_type = str(inputs.get('source_log_type', 'tdp')).lower()\nif source_log_type not in ('tdp', 'skyeye'):\n source_log_type = 'tdp'\n\nfilter_enabled = bool(inputs.get('filter_enabled', True))\ndedup_enabled = bool(inputs.get('dedup_enabled', True))\nanalyze_enabled = bool(inputs.get('analyze_enabled', True))\ndedup_threshold = float(inputs.get('threshold', inputs.get('dedup_threshold', 0.7)))\nstrict_fields = inputs.get('strict_fields', inputs.get('dedup_fields_strict', ['sip', 'dip']))\nlsh_fields = inputs.get('lsh_fields', inputs.get('dedup_fields_lsh', ['req_http_url', 'req_body', 'rsp_body']))\nmax_field_len = int(inputs.get('max_field_len', 500))\nanalyze_max_workers = int(inputs.get('analyze_max_workers', 10))\n\nif not isinstance(strict_fields, list) or not strict_fields:\n strict_fields = ['sip', 'dip']\nif not isinstance(lsh_fields, list) or not lsh_fields:\n lsh_fields = ['req_http_url', 'req_body', 'rsp_body']\n\nprint(f'[receive] source_log_type={source_log_type}, total={len(alerts_input)}')\n\noutputs['raw_alerts'] = alerts_input\noutputs['stats'] = {'raw_count': len(alerts_input)}\noutputs['source_log_type'] = source_log_type\noutputs['filter_enabled'] = filter_enabled\noutputs['dedup_enabled'] = 
dedup_enabled\noutputs['analyze_enabled'] = analyze_enabled\noutputs['dedup_threshold'] = dedup_threshold\noutputs['strict_fields'] = strict_fields\noutputs['lsh_fields'] = lsh_fields\noutputs['max_field_len'] = max_field_len\noutputs['analyze_max_workers'] = analyze_max_workers" - }, - { - "id": "branch_log_type", - "type": "branch", - "select_key": "source_log_type", - "description": "按 source_log_type 路由:'skyeye' → normalize_skyeye;其他(默认 'tdp')→ normalize_tdp" - }, - { - "id": "normalize_tdp", - "type": "python", - "description": "TDP 字段归一化:将 TDP 原始嵌套字段(net_real_src_ip/net_http_url/threat_name 等)映射为标准字段(sip/dip/req_http_url/threat_name 等)", - "code": "import uuid\n\nHTTP_METHODS = ['GET', 'POST', 'PUT', 'DELETE', 'HEAD', 'OPTIONS', 'PATCH', 'TRACE']\n\nTDP_FIELD_MAP = {\n 'customer_uuid': 'customer_uuid',\n 'device_id': 'device_id',\n 'id': 'id',\n 'time': 'time',\n 'direction': 'direction',\n 'sip': 'net_real_src_ip',\n 'dip': 'net_dest_ip',\n 'sport': 'net_src_port',\n 'dport': 'net_dest_port',\n 'net_type': 'net_type',\n 'net_app_proto': 'net_app_proto',\n 'req_http_url': 'net_http_url',\n 'req_user_agent': 'net_http_reqs_user_agent',\n 'req_host': 'net_http_reqs_host',\n 'req_line': 'net_http_reqs_line',\n 'req_header': 'net_http_reqs_header',\n 'req_body': 'net_http_reqs_body',\n 'req_cookie': 'net_http_reqs_cookie',\n 'req_body_len': 'net_http_reqs_content_length',\n 'rsp_status_code': 'net_http_status',\n 'rsp_line': 'net_http_resp_line',\n 'rsp_header': 'net_http_resp_header',\n 'rsp_body': 'net_http_resp_body',\n 'rsp_body_len': 'net_http_resp_content_length',\n 'net_bytes_toclient': 'net_bytes_toclient',\n 'net_bytes_toserver': 'net_bytes_toserver',\n 'threat_rule_id': 'threat_suuid',\n 'threat_name': 'threat_name',\n 'threat_msg': 'threat_msg',\n 'threat_ioc': 'threat_ioc',\n 'threat_level': 'threat_level',\n 'threat_severity': 'threat_severity',\n 'threat_phase': 'threat_phase',\n 'threat_type': 'threat_type',\n 'threat_result': 'threat_result',\n 
'threat_confidence': 'threat_confidence',\n 'connection_established': 'established',\n 'asset_group_name': 'dest_assets_group_name',\n 'asset_name': 'dest_assets_latestName',\n}\n\ndef flatten_dict(d, prefix=''):\n res = {}\n for k, v in d.items():\n if isinstance(v, dict):\n res.update(flatten_dict(v, f'{prefix}{k}_'))\n else:\n res[f'{prefix}{k}'] = v\n return res\n\ndef make_uuid(norm):\n return str(uuid.uuid3(uuid.NAMESPACE_DNS, ''.join(str(v) for v in norm.values())))\n\ndef normalize_single(alert):\n flat = flatten_dict(alert)\n norm = {}\n for std_key, raw_key in TDP_FIELD_MAP.items():\n norm[std_key] = flat.get(raw_key, 'none')\n if norm.get('id') in ('none', None, ''):\n norm['id'] = make_uuid(norm)\n if norm.get('net_type') in ('none', None, ''):\n method = flat.get('method', 'none')\n norm['net_type'] = 'http' if method in HTTP_METHODS else ('none' if method == 'none' else 'other')\n return norm\n\nraw_alerts = inputs.get('raw_alerts', [])\nstats = dict(inputs.get('stats', {}))\nnormalized = [normalize_single(a) for a in raw_alerts]\nstats['normalized_count'] = len(normalized)\nprint(f'[normalize_tdp] {len(raw_alerts)} -> {len(normalized)}')\n\noutputs['normalized_alerts'] = normalized\noutputs['stats'] = stats\nfor k in ['source_log_type', 'filter_enabled', 'dedup_enabled', 'analyze_enabled',\n 'dedup_threshold', 'strict_fields', 'lsh_fields', 'max_field_len', 'analyze_max_workers']:\n outputs[k] = inputs.get(k)" - }, - { - "id": "normalize_skyeye", - "type": "python", - "description": "Skyeye 字段归一化:将 Skyeye 原始字段(uri/agent/host/vuln_name/attack_result 等)映射为标准字段", - "code": "import uuid\n\nHTTP_METHODS = ['GET', 'POST', 'PUT', 'DELETE', 'HEAD', 'OPTIONS', 'PATCH', 'TRACE']\n\nSKYEYE_FIELD_MAP = {\n 'id': 'none',\n 'time': 'time',\n 'direction': 'none',\n 'sip': 'sip',\n 'dip': 'dip',\n 'sport': 'sport',\n 'dport': 'dport',\n 'net_type': 'none',\n 'net_app_proto': 'none',\n 'req_http_url': 'uri',\n 'req_user_agent': 'agent',\n 'req_host': 'host',\n 
'req_line': 'none',\n 'req_header': 'req_header',\n 'req_body': 'req_body',\n 'req_cookie': 'none',\n 'req_body_len': 'none',\n 'rsp_status_code': 'rsp_status',\n 'rsp_line': 'none',\n 'rsp_header': 'rsp_header',\n 'rsp_body': 'rsp_body',\n 'rsp_body_len': 'rsp_body_len',\n 'threat_rule_id': 'rule_id',\n 'threat_name': 'vuln_name',\n 'threat_msg': 'vuln_desc',\n 'threat_ioc': 'none',\n 'threat_level': 'none',\n 'threat_severity': 'severity',\n 'threat_phase': 'none',\n 'threat_type': 'vuln_type',\n 'threat_tactic_id': 'attck_tactic',\n 'threat_technique_id': 'attck_tech',\n 'threat_result': 'attack_result',\n 'threat_confidence': 'confidence',\n 'connection_established': 'established',\n 'real_attack': 'attack_flag',\n}\n\ndef flatten_dict(d, prefix=''):\n res = {}\n for k, v in d.items():\n if isinstance(v, dict):\n res.update(flatten_dict(v, f'{prefix}{k}_'))\n else:\n res[f'{prefix}{k}'] = v\n return res\n\ndef make_uuid(norm):\n return str(uuid.uuid3(uuid.NAMESPACE_DNS, ''.join(str(v) for v in norm.values())))\n\ndef normalize_single(alert):\n flat = flatten_dict(alert)\n norm = {}\n for std_key, raw_key in SKYEYE_FIELD_MAP.items():\n norm[std_key] = flat.get(raw_key, 'none') if raw_key != 'none' else 'none'\n if norm.get('id') in ('none', None, ''):\n norm['id'] = make_uuid(norm)\n if norm.get('net_type') in ('none', None, ''):\n method = flat.get('method', 'none')\n norm['net_type'] = 'http' if method in HTTP_METHODS else ('none' if method == 'none' else 'other')\n return norm\n\nraw_alerts = inputs.get('raw_alerts', [])\nstats = dict(inputs.get('stats', {}))\nnormalized = [normalize_single(a) for a in raw_alerts]\nstats['normalized_count'] = len(normalized)\nprint(f'[normalize_skyeye] {len(raw_alerts)} -> {len(normalized)}')\n\noutputs['normalized_alerts'] = normalized\noutputs['stats'] = stats\nfor k in ['source_log_type', 'filter_enabled', 'dedup_enabled', 'analyze_enabled',\n 'dedup_threshold', 'strict_fields', 'lsh_fields', 'max_field_len', 
'analyze_max_workers']:\n outputs[k] = inputs.get(k)" - }, - { - "id": "filter_logs", - "type": "python", - "description": "Step 2 — 过滤(对齐 aisoc_mini LogFilter):9 种 process_type 分类;non-scan + HTTP(任意方向 in/out/lateral)需分析;输出 _has_alerts 供后续分支节点路由", - "code": "normalized_alerts = inputs.get('normalized_alerts', [])\nfilter_enabled = inputs.get('filter_enabled', True)\nsource_log_type = str(inputs.get('source_log_type', 'tdp')).lower()\nstats = dict(inputs.get('stats', {}))\n\ndef is_scan_alert(threat_name):\n tnl = str(threat_name or '').lower()\n return ('扫描' in tnl) and ('webshell' not in tnl)\n\ndef get_threat_type(alert, source):\n if source == 'skyeye':\n return str(alert.get('threat_type', 'general') or 'general')\n return str(alert.get('threat_name', 'general') or 'general')\n\ndef is_http(alert):\n for field in ('application_layer_protocol', 'net_type', 'net_app_proto'):\n val = str(alert.get(field, '') or '').lower()\n if val and val != 'none' and 'http' in val:\n return True\n return False\n\ndef get_process_type(alert, source):\n threat_name = alert.get('threat_name', '')\n direction = str(alert.get('direction', '') or '').lower()\n scan = is_scan_alert(threat_name)\n http = is_http(alert)\n if source == 'skyeye':\n return 'alert_scan_direction_in' if scan else 'alert_not_scan_http_direction_in'\n if scan:\n return f'alert_scan_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_scan_direction_in'\n if http:\n return f'alert_not_scan_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_scan_http_direction_in'\n return f'alert_not_scan_not_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_process'\n\nNEED_ANALYSIS = {\n 'alert_not_scan_http_direction_in',\n 'alert_not_scan_http_direction_out',\n 'alert_not_scan_http_direction_lateral',\n}\n\nfiltered = []\nprocess_type_counts = {}\nfor alert in normalized_alerts:\n alert = dict(alert)\n if filter_enabled:\n ptype = 
get_process_type(alert, source_log_type)\n need = ptype in NEED_ANALYSIS\n threat_type = get_threat_type(alert, source_log_type)\n need_attack_status = (ptype == 'alert_not_scan_http_direction_in')\n else:\n ptype = 'filter_disabled'\n need = True\n threat_type = get_threat_type(alert, source_log_type)\n need_attack_status = True\n process_type_counts[ptype] = process_type_counts.get(ptype, 0) + 1\n alert['_process_type'] = ptype\n alert['_need_analysis_is_attack'] = need\n alert['_need_analysis_attack_status'] = need_attack_status\n alert['_threat_type'] = threat_type\n if need:\n filtered.append(alert)\n\nhas_alerts = len(filtered) > 0\nprint(f'[filter] input={len(normalized_alerts)}, kept={len(filtered)}, has_alerts={has_alerts}')\nprint(f'[filter] process_type_counts={process_type_counts}')\n\nstats['after_filter_count'] = len(filtered)\nstats['filter_removed_count'] = len(normalized_alerts) - len(filtered)\nstats['filter_process_type_counts'] = process_type_counts\n\noutputs['filtered_alerts'] = filtered\noutputs['_has_alerts'] = has_alerts\noutputs['stats'] = stats\n# 预填 dedup/analyze 空结果,供 false 分支直接到 generate_report 时使用\noutputs['deduped_alerts'] = []\noutputs['unique_alerts'] = []\noutputs['analyzed_alerts'] = []\nfor k in ['dedup_enabled', 'analyze_enabled', 'dedup_threshold',\n 'strict_fields', 'lsh_fields', 'max_field_len', 'analyze_max_workers']:\n outputs[k] = inputs.get(k)" - }, - { - "id": "branch_has_alerts", - "type": "branch", - "select_key": "_has_alerts", - "description": "按 _has_alerts 路由:True → dedup_logs(继续去重+研判);False(默认)→ generate_report(无可研判告警,直接出报告)" - }, - { - "id": "dedup_logs", - "type": "python", - "description": "Step 3 — 去重:URI 归一化 + 5-gram Jaccard 相似度聚类,为每条告警生成 dedup_key(MD5);输出全量 deduped_alerts 及唯一代表 unique_alerts", - "code": "import re\nimport hashlib\n\ndef normalize_uri(text):\n if not text or str(text) == 'none':\n return str(text or '')\n t = str(text)\n t = re.sub(r'\\d{4}[-/]\\d{1,2}[-/]\\d{1,2}(?:[T 
]\\d{2}:\\d{2}(?::\\d{2})?)?', 'DATETIME', t)\n t = re.sub(r'[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}', 'UUID', t)\n t = re.sub(r'\\b\\d{6,}\\b', 'NUM', t)\n t = re.sub(r'(?:\\.\\./|\\.\\\\\\\\)+', '../', t)\n t = re.sub(r'%00', 'NULL', t)\n t = re.sub(r'(?:%[0-9a-fA-F]{2}){3,}', 'ENCODED', t)\n return t\n\ndef shingles(text, k=5):\n t = str(text or '').lower()\n if len(t) < k:\n return frozenset([t]) if t else frozenset()\n return frozenset(t[i:i+k] for i in range(len(t) - k + 1))\n\ndef jaccard(a, b):\n if not a and not b:\n return 1.0\n if not a or not b:\n return 0.0\n return len(a & b) / len(a | b)\n\nfiltered_alerts = inputs.get('filtered_alerts', [])\ndedup_enabled = inputs.get('dedup_enabled', True)\nthreshold = float(inputs.get('dedup_threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_len = int(inputs.get('max_field_len', 500))\nstats = dict(inputs.get('stats', {}))\n\nregistry = {}\nkeyed = []\nfor alert in filtered_alerts:\n alert = dict(alert)\n if not dedup_enabled:\n raw = ''.join(str(alert.get(f, ''))[:max_len] for f in strict_fields + lsh_fields)\n alert['dedup_key'] = hashlib.md5(raw.encode('utf-8')).hexdigest()\n alert['dedup_key_already_exists'] = False\n keyed.append(alert)\n continue\n strict_text = '. '.join(str(alert.get(f, ''))[:max_len] for f in strict_fields)\n lsh_text = '. '.join(normalize_uri(str(alert.get(f, ''))[:max_len]) for f in lsh_fields)\n cur_sh = shingles(lsh_text)\n matched_key = None\n best_sim = 0.0\n for ek, (ex_strict, ex_sh) in registry.items():\n if ex_strict != strict_text:\n continue\n sim = jaccard(cur_sh, ex_sh)\n if sim >= threshold and sim > best_sim:\n best_sim = sim\n matched_key = ek\n if matched_key is None:\n raw_key = hashlib.md5(f'{strict_text}. 
{lsh_text}'.encode('utf-8')).hexdigest()\n registry[raw_key] = (strict_text, cur_sh)\n canonical = raw_key\n alert['dedup_key_already_exists'] = False\n else:\n canonical = matched_key\n alert['dedup_key_already_exists'] = True\n alert['dedup_key'] = canonical\n keyed.append(alert)\n\nseen = {}\nunique_alerts = []\nfor a in keyed:\n k = a['dedup_key']\n if k not in seen:\n seen[k] = a\n unique_alerts.append(a)\n\nprint(f'[dedup] input={len(filtered_alerts)}, unique={len(unique_alerts)}, dup={len(keyed)-len(unique_alerts)}')\nstats['after_dedup_count'] = len(keyed)\nstats['unique_key_count'] = len(unique_alerts)\nstats['dedup_removed_count'] = len(keyed) - len(unique_alerts)\noutputs['deduped_alerts'] = keyed\noutputs['unique_alerts'] = unique_alerts\noutputs['stats'] = stats\nfor k in ['analyze_enabled', 'analyze_max_workers']:\n outputs[k] = inputs.get(k)" - }, - { - "id": "analyze_unique", - "type": "python", - "description": "Step 4 — 研判分析:ThreadPoolExecutor 并行调用 LLM,仅对唯一 dedup_key 调用,is_attack 结果回填给同簇所有重复告警", - "code": "import json\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\n\nIS_ATTACK_PROMPT = \"\"\"作为网络安全专家,请分析告警日志,判断是否为成功的攻击事件。\n\n### 告警数据:\n{alert}\n\n### 判定标准:\n**必须同时满足以下条件才判定为\"是\":**\n\n1. **存在明确攻击特征**:\n - 载荷或请求内容包含明确的攻击模式(SQL注入、XSS、命令执行、路径遍历、webshell等)\n - 攻击载荷经解码后确认为恶意代码\n\n2. 
**攻击成功执行且有明确证据**:\n - 命令执行类:响应内容包含命令执行结果(系统信息、用户信息、目录列表等)\n - 数据泄露类:响应内容包含敏感信息(数据库错误信息、用户数据等)\n - 文件操作类:响应显示文件上传成功或敏感文件访问成功\n - webshell类:响应包含webshell标识符\n\n### 严格排除(以下必须判定为\"否\"):\n- 所有 HTTP 错误状态码(400/401/403/404/405/500 等)响应\n- 含安全拦截信息(waf/blocked/已拦截/firewall 等)的响应\n- 载荷为 none 且无明确攻击载荷的情况\n- 正常网页内容、域名停放页面、API 正常响应\n\n### 输出要求:\n仅输出\"是\"或\"否\",不要任何解释或其他内容。\"\"\"\n\ndef parse_is_attack(text):\n if not text:\n return False\n line = str(text).strip().lower().split('\\n')[0]\n return ('是' in line) or ('true' in line) or ('yes' in line)\n\ndef judge_one(alert):\n dk = alert.get('dedup_key', '')\n try:\n alert_str = json.dumps(alert, ensure_ascii=False)[:4000]\n prompt = IS_ATTACK_PROMPT.format(alert=alert_str)\n return dk, parse_is_attack(llm.ask(prompt)), None\n except Exception as e:\n return dk, False, str(e)\n\ndeduped_alerts = inputs.get('deduped_alerts', [])\nunique_alerts = inputs.get('unique_alerts', [])\nanalyze_enabled = inputs.get('analyze_enabled', True)\nmax_workers = int(inputs.get('analyze_max_workers', 10))\nstats = dict(inputs.get('stats', {}))\n\nif not analyze_enabled:\n print('[analyze] disabled')\n for a in deduped_alerts:\n a['is_attack'] = None\n a['_analysis_skipped'] = True\n stats['analyzed_unique_count'] = 0\n outputs['analyzed_alerts'] = deduped_alerts\n outputs['stats'] = stats\nelse:\n print(f'[analyze] LLM judging {len(unique_alerts)} unique keys (workers={max_workers})')\n dedup_results = {}\n errors = 0\n with ThreadPoolExecutor(max_workers=max(1, max_workers)) as ex:\n futures = [ex.submit(judge_one, a) for a in unique_alerts]\n for fut in as_completed(futures):\n try:\n dk, is_attack, err = fut.result()\n except Exception as e:\n dk, is_attack, err = '', False, str(e)\n if err:\n errors += 1\n if dk:\n dedup_results[dk] = is_attack\n analyzed = []\n for alert in deduped_alerts:\n a = dict(alert)\n a['is_attack'] = dedup_results.get(a.get('dedup_key', ''), False)\n analyzed.append(a)\n attack_count = sum(1 for a in analyzed if 
a.get('is_attack'))\n print(f'[analyze] done: judged={len(dedup_results)}, errors={errors}, attacks={attack_count}/{len(analyzed)}')\n stats['analyzed_unique_count'] = len(dedup_results)\n stats['analyze_error_count'] = errors\n stats['attack_count'] = attack_count\n stats['non_attack_count'] = len(analyzed) - attack_count\n outputs['analyzed_alerts'] = analyzed\n outputs['stats'] = stats" - }, - { - "id": "generate_empty_report", - "type": "python", - "description": "过滤后无可研判告警时的快速终点:输出简短报告说明各阶段数量与过滤原因,写出空 JSONL 文件,不调用 LLM", - "code": "import os\nimport json\nimport datetime\nfrom flocks.workspace.manager import WorkspaceManager\n\nstats = inputs.get('stats', {})\nws = WorkspaceManager.get_instance()\noutput_dir = str(ws.get_workspace_dir() / 'outputs' / datetime.date.today().isoformat())\nartifacts_dir = os.path.join(output_dir, 'artifacts')\nos.makedirs(artifacts_dir, exist_ok=True)\n\nraw_count = stats.get('raw_count', 0)\nnorm_count = stats.get('normalized_count', raw_count)\nfilter_count = stats.get('after_filter_count', 0)\nfilter_removed = stats.get('filter_removed_count', 0)\nprocess_type_counts = stats.get('filter_process_type_counts', {})\n\nprocess_type_md = ''\nfor pt, cnt in sorted(process_type_counts.items(), key=lambda x: -x[1]):\n kept = '✅ 需研判' if 'not_scan_http' in pt else '❌ 已过滤'\n process_type_md += f'| `{pt}` | {cnt} | {kept} |\\n'\n\nreport = f\"\"\"# 告警处理 Pipeline 报告(无可研判告警)\n\n## 执行摘要\n\n过滤后无可研判告警,Pipeline 在过滤阶段终止。\n\n| 阶段 | 告警数 | 说明 |\n|------|--------|------|\n| 原始输入 | {raw_count} | 接收到的原始告警总数 |\n| Step 1 归一化 | {norm_count} | 字段映射后告警数 |\n| Step 2 过滤后 | {filter_count} | 剔除 {filter_removed} 条(扫描/非HTTP/其他)|\n| Step 3 去重 | — | 无告警,跳过 |\n| Step 4 研判 | — | 无告警,跳过 |\n\n## process_type 分布\n\n| process_type | 数量 | 说明 |\n|-------------|------|------|\n{process_type_md}\n---\n*生成时间: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*\n\"\"\"\n\nreport_path = os.path.join(output_dir, 'alert_pipeline_report.md')\ntool.run('write', 
filePath=report_path, content=report)\nfor fname in ('pipeline_all_analyzed.jsonl', 'pipeline_attack_alerts.jsonl', 'pipeline_non_attack_alerts.jsonl'):\n tool.run('write', filePath=os.path.join(artifacts_dir, fname), content='')\n\noutputs['report_path'] = report_path\noutputs['analyzed_alerts'] = []\noutputs['attack_alerts'] = []\noutputs['stats'] = stats\noutputs['summary'] = f'Pipeline 完成(无可研判告警): 输入 {raw_count} 条 → 过滤后 {filter_count} 条。报告: {report_path}'" - }, - { - "id": "generate_report", - "type": "python", - "description": "完整路径终点:汇总四阶段流水线统计数据,写出 Markdown 报告及 JSONL 数据文件(含 is_attack 研判结果)", - "code": "import os\nimport json\nimport datetime\nfrom flocks.workspace.manager import WorkspaceManager\n\nanalyzed_alerts = inputs.get('analyzed_alerts', [])\nstats = inputs.get('stats', {})\n\nws = WorkspaceManager.get_instance()\noutput_dir = str(ws.get_workspace_dir() / 'outputs' / datetime.date.today().isoformat())\nartifacts_dir = os.path.join(output_dir, 'artifacts')\nos.makedirs(artifacts_dir, exist_ok=True)\n\nraw_count = stats.get('raw_count', 0)\nnorm_count = stats.get('normalized_count', raw_count)\nfilter_count = stats.get('after_filter_count', norm_count)\nfilter_removed = stats.get('filter_removed_count', 0)\nprocess_type_counts = stats.get('filter_process_type_counts', {})\nunique_key_count = stats.get('unique_key_count', 0)\ndedup_count = stats.get('after_dedup_count', filter_count)\ndedup_removed = stats.get('dedup_removed_count', 0)\nanalyzed_unique = stats.get('analyzed_unique_count', 0)\nattack_count = stats.get('attack_count', 0)\nnon_attack_count = stats.get('non_attack_count', 0)\n\nattack_alerts = [a for a in analyzed_alerts if a.get('is_attack')]\nnon_attack_alerts = [a for a in analyzed_alerts if a.get('is_attack') is False]\n\nprocess_type_md = ''\nfor pt, cnt in sorted(process_type_counts.items(), key=lambda x: -x[1]):\n kept = '✅ 需研判' if 'not_scan_http' in pt else '❌ 已过滤'\n process_type_md += f'| `{pt}` | {cnt} | {kept} |\\n'\n\nreport = 
f\"\"\"# 告警处理 Pipeline 报告\n\n## 执行摘要\n\n| 阶段 | 告警数 | 说明 |\n|------|--------|------|\n| 原始输入 | {raw_count} | 接收到的原始告警总数 |\n| Step 1 归一化 | {norm_count} | 字段映射后告警数 |\n| Step 2 过滤后 | {filter_count} | 剔除 {filter_removed} 条(扫描/非HTTP/其他)|\n| Step 3 去重唯一簇 | {unique_key_count} | 从 {dedup_count} 条压缩掉 {dedup_removed} 条重复 |\n| Step 4 LLM 调用次数 | {analyzed_unique} | 仅对唯一 dedup_key 调用 |\n| 判定为攻击 | {attack_count} | is_attack = True |\n| 判定为非攻击 | {non_attack_count} | is_attack = False |\n\n## process_type 分布\n\n| process_type | 数量 | 说明 |\n|-------------|------|------|\n{process_type_md}\n## 真实攻击告警(Top 20)\n\"\"\"\n\nfor i, a in enumerate(attack_alerts[:20], 1):\n sip = a.get('sip', '')\n dip = a.get('dip', '')\n threat = str(a.get('threat_name', a.get('alert_type', '')))\n url = str(a.get('req_http_url', ''))[:80]\n dk = a.get('dedup_key', '')[:16]\n report += f'\\n{i}. **{threat}** | {sip} → {dip} | `{url}` | key:`{dk}...`\\n'\n\nreport += f\"\"\"\n---\n*生成时间: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*\n\"\"\"\n\nreport_path = os.path.join(output_dir, 'alert_pipeline_report.md')\ntool.run('write', filePath=report_path, content=report)\n\ndef write_jsonl(path, records):\n tool.run('write', filePath=path,\n content='\\n'.join(json.dumps(r, ensure_ascii=False) for r in records))\n\nwrite_jsonl(os.path.join(artifacts_dir, 'pipeline_all_analyzed.jsonl'), analyzed_alerts)\nwrite_jsonl(os.path.join(artifacts_dir, 'pipeline_attack_alerts.jsonl'), attack_alerts)\nwrite_jsonl(os.path.join(artifacts_dir, 'pipeline_non_attack_alerts.jsonl'), non_attack_alerts)\n\noutputs['report_path'] = report_path\noutputs['analyzed_alerts'] = analyzed_alerts\noutputs['attack_alerts'] = attack_alerts\noutputs['stats'] = stats\noutputs['summary'] = (\n f'Pipeline 完成: 输入 {raw_count} 条 → 过滤后 {filter_count} 条 → '\n f'去重 {unique_key_count} 个簇 → 判定攻击 {attack_count} 条。报告: {report_path}'\n)" - } - ], - "edges": [ - {"from": "receive_alerts", "to": "branch_log_type", "order": 0}, - {"from": 
"branch_log_type", "to": "normalize_tdp", "label": "tdp", "order": 0}, - {"from": "branch_log_type", "to": "normalize_skyeye", "label": "skyeye","order": 1}, - {"from": "normalize_tdp", "to": "filter_logs", "order": 0}, - {"from": "normalize_skyeye", "to": "filter_logs", "order": 0}, - {"from": "filter_logs", "to": "branch_has_alerts", "order": 0}, - {"from": "branch_has_alerts", "to": "dedup_logs", "label": "true", "order": 0}, - {"from": "branch_has_alerts", "to": "generate_empty_report", "label": "false", "order": 1}, - {"from": "dedup_logs", "to": "analyze_unique", "order": 0}, - {"from": "analyze_unique", "to": "generate_report", "order": 0} - ], - "metadata": { - "node_timeout_s": 600, - "sampleInputs": { - "source_log_type": "tdp", - "filter_enabled": true, - "dedup_enabled": true, - "analyze_enabled": true, - "threshold": 0.7, - "alerts": [ - { - "net_real_src_ip": "1.2.3.4", - "net_dest_ip": "10.0.0.1", - "direction": "in", - "net_type": "http", - "net_http_url": "/admin/login.php?id=1 OR 1=1", - "net_http_reqs_body": "username=admin&password=123456", - "net_http_resp_body": "root@localhost", - "threat_name": "SQL注入攻击", - "threat_type": "web攻击" - } - ] - } - } -} diff --git a/.flocks/plugins/workflows/alert_dedup/workflow.md b/.flocks/plugins/workflows/alert_dedup/workflow.md deleted file mode 100644 index 0d85c0e06..000000000 --- a/.flocks/plugins/workflows/alert_dedup/workflow.md +++ /dev/null @@ -1,165 +0,0 @@ -# alert_dedup — 告警处理四阶段 Pipeline 工作流 - -## 简介 - -`alert_dedup` 完整实现了 `aisoc_mini` 项目中 `LogProcessPipeline` 的四阶段主流程: - -``` -原始告警 → 归一化 → 过滤 → 去重 → 研判分析 -``` - -每阶段均可通过配置独立开关,支持 TDP 和 Skyeye 两种日志格式。 - -## 工作流节点 - -``` -receive_alerts - │ - ▼ -normalize_logs ← Step 1: TDP/Skyeye 字段映射,扁平化嵌套结构 - │ - ▼ -filter_logs ← Step 2: 过滤扫描类/出站/非 HTTP 告警 - │ - ▼ -dedup_logs ← Step 3: URI 归一化 + 5-gram Jaccard 去重,生成 dedup_key - │ - ▼ -analyze_unique ← Step 4: LLM 研判(仅对唯一 dedup_key 调用,结果回填重复告警) - │ - ▼ -generate_report ← 汇总统计,写出 Markdown 报告与 JSONL 数据文件 -``` - -## 
输入参数 - -| 参数 | 类型 | 默认值 | 说明 | -|------|------|--------|------| -| `alerts` | `list[dict]` | 必填 | 原始告警列表 | -| `source_log_type` | `str` | `"tdp"` | 日志来源类型,`"tdp"` 或 `"skyeye"` | -| `normalize_enabled` | `bool` | `true` | 是否执行字段归一化 | -| `filter_enabled` | `bool` | `true` | 是否执行规则过滤 | -| `dedup_enabled` | `bool` | `true` | 是否执行去重 | -| `analyze_enabled` | `bool` | `true` | 是否执行 LLM 研判 | -| `threshold` | `float` | `0.7` | Jaccard 相似度阈值(去重步骤) | -| `strict_fields` | `list[str]` | `["sip","dip"]` | 严格匹配字段 | -| `lsh_fields` | `list[str]` | `["req_http_url","req_body","rsp_body"]` | 近似匹配字段 | -| `max_field_len` | `int` | `500` | 字段截断长度 | - -## 四阶段详解 - -### Step 1 — 归一化(`normalize_logs`) - -将 TDP 或 Skyeye 原始字段映射为统一标准字段: - -| 标准字段 | TDP 原始字段 | Skyeye 原始字段 | -|----------|-------------|----------------| -| `sip` | `net_real_src_ip` | `sip` | -| `dip` | `net_dest_ip` | `dip` | -| `req_http_url` | `net_http_url` | `uri` | -| `req_body` | `net_http_reqs_body` | `req_body` | -| `rsp_body` | `net_http_resp_body` | `rsp_body` | -| `threat_name` | `threat_name` | `vuln_name` | -| `direction` | `direction` | *(none)* | -| `net_type` | `net_type` | *(none,自动探测 method)* | - -支持嵌套结构(自动扁平化),缺失 `id` 时自动生成 UUID。 - -### Step 2 — 过滤(`filter_logs`) - -完整对齐 `aisoc_mini` 的 `LogFilter._get_tdp_process_type()` / `_get_skyeye_process_type()`: - -1. **扫描判定**:`threat_name` 含「扫描」且 **不含** `webshell` → `is_scan = True` -2. **HTTP 判定**:`application_layer_protocol` / `net_type` / `net_app_proto` 任一字段含 `http` → HTTP 协议 -3. **process_type 计算**(共 9 种 + 1 种未处理): - -| 类别 | direction | 标记 | 是否分析 | -|------|-----------|------|---------| -| 非扫描 + HTTP | in | `alert_not_scan_http_direction_in` | ✅ | -| 非扫描 + HTTP | out | `alert_not_scan_http_direction_out` | ✅ | -| 非扫描 + HTTP | lateral | `alert_not_scan_http_direction_lateral` | ✅ | -| 扫描类 | * | `alert_scan_direction_*` | ❌ | -| 非扫描 + 非HTTP | * | `alert_not_scan_not_http_direction_*` | ❌ | - -> **关键**:HTTP 非扫描告警**无论方向**(in/out/lateral)都需研判,与 aisoc_mini 行为一致。 - -4. 
**threat_type 取值**(与原版一致): - - skyeye → `threat_type` 字段 - - tdp → `threat_name` 字段(注意:**不是** `threat_type`) - -每条告警新增字段:`_process_type`、`_need_analysis_is_attack`、`_need_analysis_attack_status`、`_threat_type`。 -统计中包含 `filter_process_type_counts` 显示各类告警分布。 - -### Step 3 — 去重(`dedup_logs`) - -1. **URI 归一化**:对 lsh_fields 字段值做正则替换(日期→`DATETIME`、UUID→`UUID`、6位+数字→`NUM`、路径穿越、URL 编码) -2. **相似度计算**:5-gram Character Shingles + Jaccard 相似度 -3. **聚类规则**:严格字段完全相同 + lsh_fields Jaccard ≥ threshold → 归为同一簇,复用同一 `dedup_key` -4. **去重键**:新簇时用 MD5(`strict_text + ". " + normalized_lsh_text`) 生成 - -每条告警新增字段: -- `dedup_key`:MD5 哈希串 -- `dedup_key_already_exists`:`true` 表示该告警是重复告警 - -### Step 4 — 研判分析(`analyze_unique`) - -- **并行 LLM 调用**:使用 `ThreadPoolExecutor`(默认 `max_workers=10`,可通过 `analyze_max_workers` 配置),仅对每个 `dedup_key` 的代表告警调用 LLM -- **结果回填**:将 `is_attack` 结果回填给同簇所有重复告警 → 节省大量 LLM 调用开销 -- **Prompt**:专业安全研判 Prompt,明确区分"成功攻击"与"扫描/误报/正常流量" -- **错误隔离**:单条告警 LLM 调用失败不影响其他告警,记入 `analyze_error_count` -- **输出字段**:每条告警新增 `is_attack: bool` - -## 输出 - -| 字段 | 说明 | -|------|------| -| `analyzed_alerts` | 全量含 `is_attack` 字段的告警列表 | -| `attack_alerts` | 判定为真实攻击的告警子集 | -| `stats` | 各阶段统计:raw/normalized/filtered/dedup/analyzed 计数 | -| `report_path` | 最终 Markdown 报告路径 | -| `summary` | 单行执行摘要 | - -## 输出文件 - -``` -outputs// -├── alert_pipeline_report.md # 主报告 -└── artifacts/ - ├── pipeline_all_analyzed.jsonl # 全量含 is_attack 告警 - ├── pipeline_attack_alerts.jsonl # 真实攻击告警 - └── pipeline_non_attack_alerts.jsonl # 非攻击/误报告警 -``` - -## 与 aisoc_mini 的对应关系 - -| aisoc_mini 类/函数 | 本工作流节点 | -|-------------------|------------| -| `LogNormalization.process()` / `normalize_ndr_log()` | `normalize_logs` | -| `LogFilter.filter()` + `jsonLogic(rule_1, rule_2)` | `filter_logs` | -| `LogDedup.process()` + `LSHProcessor` + `normalize_uri()` | `dedup_logs` | -| `LogAnalysis.process_parallel()` | `analyze_unique` | -| `PipelineResult.stats` | `generate_report` | - -## 示例输入 - -```json -{ - "source_log_type": "tdp", - 
"threshold": 0.7, - "alerts": [ - { - "net_real_src_ip": "1.2.3.4", - "net_dest_ip": "10.0.0.1", - "direction": "in", - "net_type": "http", - "net_http_url": "/admin/login.php?id=1 OR 1=1", - "net_http_reqs_body": "username=admin&password=123456", - "net_http_resp_body": "root@localhost, MySQL 5.7", - "threat_name": "SQL注入攻击", - "threat_type": "web攻击" - } - ] -} -``` - -> **提示**:`analyze_enabled: false` 可跳过 LLM 调用,仅做去重统计,适合纯降噪场景。 diff --git a/.flocks/plugins/workflows/network_alert_dedup/meta.json b/.flocks/plugins/workflows/network_alert_dedup/meta.json new file mode 100644 index 000000000..0fa1c72b1 --- /dev/null +++ b/.flocks/plugins/workflows/network_alert_dedup/meta.json @@ -0,0 +1,10 @@ +{ + "name": "network_alert_dedup", + "description": "网络告警去重 Pipeline:归一化(TDP/Skyeye 字段映射)→ 过滤(剔除扫描/非 HTTP 告警)→ 去重(URI 归一化 + 5-gram Jaccard 相似度聚类)。输入 dict,输出 dict(deduped_alerts / unique_alerts / stats)。", + "category": "security", + "status": "active", + "createdBy": null, + "createdAt": 1746691200000, + "updatedAt": 1746777600000, + "id": "network_alert_dedup" +} diff --git a/.flocks/plugins/workflows/network_alert_dedup/workflow.json b/.flocks/plugins/workflows/network_alert_dedup/workflow.json new file mode 100644 index 000000000..959f96a49 --- /dev/null +++ b/.flocks/plugins/workflows/network_alert_dedup/workflow.json @@ -0,0 +1,74 @@ +{ + "name": "network_alert_dedup", + "description": "Network alert deduplication pipeline: normalize (TDP/Skyeye field mapping) → filter (remove scans / non-HTTP) → dedup (URI normalization + 5-gram Jaccard similarity). 
Returns a dict with deduped_alerts, unique_alerts and stats.", + "description_cn": "网络告警去重 Pipeline:归一化(TDP/Skyeye 字段映射,含日志类型分支)→ 过滤(剔除扫描/非 HTTP 告警)→ 去重(URI 归一化 + 5-gram Jaccard 相似度聚类,MD5 dedup_key)。输入 dict,输出 dict(deduped_alerts / unique_alerts / stats)。", + "start": "receive_alerts", + "nodes": [ + { + "id": "receive_alerts", + "type": "python", + "description": "接收原始告警列表,解析输入格式,提取 Pipeline 配置,输出 source_log_type 供后续分支节点路由", + "code": "import json\n\nalerts_input = inputs.get('alerts', inputs.get('alert_list', []))\nif isinstance(alerts_input, str):\n try:\n alerts_input = json.loads(alerts_input)\n except Exception:\n alerts_input = []\nif isinstance(alerts_input, dict) and 'data' in alerts_input:\n alerts_input = alerts_input.get('data', [])\nif not isinstance(alerts_input, list):\n alerts_input = [alerts_input] if alerts_input else []\n\nsource_log_type = str(inputs.get('source_log_type', 'tdp')).lower()\nif source_log_type not in ('tdp', 'skyeye'):\n source_log_type = 'tdp'\n\nfilter_enabled = bool(inputs.get('filter_enabled', True))\ndedup_enabled = bool(inputs.get('dedup_enabled', True))\ndedup_threshold = float(inputs.get('threshold', inputs.get('dedup_threshold', 0.7)))\nstrict_fields = inputs.get('strict_fields', inputs.get('dedup_fields_strict', ['sip', 'dip']))\nlsh_fields = inputs.get('lsh_fields', inputs.get('dedup_fields_lsh', ['req_http_url', 'req_body', 'rsp_body']))\nmax_field_len = int(inputs.get('max_field_len', 500))\n\nif not isinstance(strict_fields, list) or not strict_fields:\n strict_fields = ['sip', 'dip']\nif not isinstance(lsh_fields, list) or not lsh_fields:\n lsh_fields = ['req_http_url', 'req_body', 'rsp_body']\n\nprint(f'[receive] source_log_type={source_log_type}, total={len(alerts_input)}')\n\noutputs['raw_alerts'] = alerts_input\noutputs['stats'] = {'raw_count': len(alerts_input)}\noutputs['source_log_type'] = source_log_type\noutputs['filter_enabled'] = filter_enabled\noutputs['dedup_enabled'] = 
dedup_enabled\noutputs['dedup_threshold'] = dedup_threshold\noutputs['strict_fields'] = strict_fields\noutputs['lsh_fields'] = lsh_fields\noutputs['max_field_len'] = max_field_len" + }, + { + "id": "branch_log_type", + "type": "branch", + "select_key": "source_log_type", + "description": "按 source_log_type 路由:'skyeye' → normalize_skyeye;'tdp'(默认)→ normalize_tdp" + }, + { + "id": "normalize_tdp", + "type": "python", + "description": "TDP 字段归一化:将 TDP 原始嵌套字段(net_real_src_ip/net_http_url/threat_name 等)映射为标准字段(sip/dip/req_http_url/threat_name 等)", + "code": "import uuid\n\nHTTP_METHODS = ['GET', 'POST', 'PUT', 'DELETE', 'HEAD', 'OPTIONS', 'PATCH', 'TRACE']\n\nTDP_FIELD_MAP = {\n 'customer_uuid': 'customer_uuid',\n 'device_id': 'device_id',\n 'id': 'id',\n 'time': 'time',\n 'direction': 'direction',\n 'sip': 'net_real_src_ip',\n 'dip': 'net_dest_ip',\n 'sport': 'net_src_port',\n 'dport': 'net_dest_port',\n 'net_type': 'net_type',\n 'net_app_proto': 'net_app_proto',\n 'req_http_url': 'net_http_url',\n 'req_user_agent': 'net_http_reqs_user_agent',\n 'req_host': 'net_http_reqs_host',\n 'req_line': 'net_http_reqs_line',\n 'req_header': 'net_http_reqs_header',\n 'req_body': 'net_http_reqs_body',\n 'req_cookie': 'net_http_reqs_cookie',\n 'req_body_len': 'net_http_reqs_content_length',\n 'rsp_status_code': 'net_http_status',\n 'rsp_line': 'net_http_resp_line',\n 'rsp_header': 'net_http_resp_header',\n 'rsp_body': 'net_http_resp_body',\n 'rsp_body_len': 'net_http_resp_content_length',\n 'net_bytes_toclient': 'net_bytes_toclient',\n 'net_bytes_toserver': 'net_bytes_toserver',\n 'threat_rule_id': 'threat_suuid',\n 'threat_name': 'threat_name',\n 'threat_msg': 'threat_msg',\n 'threat_ioc': 'threat_ioc',\n 'threat_level': 'threat_level',\n 'threat_severity': 'threat_severity',\n 'threat_phase': 'threat_phase',\n 'threat_type': 'threat_type',\n 'threat_result': 'threat_result',\n 'threat_confidence': 'threat_confidence',\n 'connection_established': 'established',\n 
'asset_group_name': 'dest_assets_group_name',\n 'asset_name': 'dest_assets_latestName',\n}\n\ndef flatten_dict(d, prefix=''):\n res = {}\n for k, v in d.items():\n if isinstance(v, dict):\n res.update(flatten_dict(v, f'{prefix}{k}_'))\n else:\n res[f'{prefix}{k}'] = v\n return res\n\ndef make_uuid(norm):\n return str(uuid.uuid3(uuid.NAMESPACE_DNS, ''.join(str(v) for v in norm.values())))\n\ndef normalize_single(alert):\n flat = flatten_dict(alert)\n norm = {}\n for std_key, raw_key in TDP_FIELD_MAP.items():\n norm[std_key] = flat.get(raw_key, 'none')\n if norm.get('id') in ('none', None, ''):\n norm['id'] = make_uuid(norm)\n if norm.get('net_type') in ('none', None, ''):\n method = flat.get('method', 'none')\n norm['net_type'] = 'http' if method in HTTP_METHODS else ('none' if method == 'none' else 'other')\n return norm\n\nraw_alerts = inputs.get('raw_alerts', [])\nstats = dict(inputs.get('stats', {}))\nnormalized = [normalize_single(a) for a in raw_alerts]\nstats['normalized_count'] = len(normalized)\nprint(f'[normalize_tdp] {len(raw_alerts)} -> {len(normalized)}')\n\noutputs['normalized_alerts'] = normalized\noutputs['stats'] = stats\nfor k in ['source_log_type', 'filter_enabled', 'dedup_enabled',\n 'dedup_threshold', 'strict_fields', 'lsh_fields', 'max_field_len']:\n outputs[k] = inputs.get(k)" + }, + { + "id": "normalize_skyeye", + "type": "python", + "description": "Skyeye 字段归一化:将 Skyeye 原始字段(uri/agent/host/vuln_name/attack_result 等)映射为标准字段", + "code": "import uuid\n\nHTTP_METHODS = ['GET', 'POST', 'PUT', 'DELETE', 'HEAD', 'OPTIONS', 'PATCH', 'TRACE']\n\nSKYEYE_FIELD_MAP = {\n 'id': 'none',\n 'time': 'time',\n 'direction': 'none',\n 'sip': 'sip',\n 'dip': 'dip',\n 'sport': 'sport',\n 'dport': 'dport',\n 'net_type': 'none',\n 'net_app_proto': 'none',\n 'req_http_url': 'uri',\n 'req_user_agent': 'agent',\n 'req_host': 'host',\n 'req_line': 'none',\n 'req_header': 'req_header',\n 'req_body': 'req_body',\n 'req_cookie': 'none',\n 'req_body_len': 'none',\n 
'rsp_status_code': 'rsp_status',\n 'rsp_line': 'none',\n 'rsp_header': 'rsp_header',\n 'rsp_body': 'rsp_body',\n 'rsp_body_len': 'rsp_body_len',\n 'threat_rule_id': 'rule_id',\n 'threat_name': 'vuln_name',\n 'threat_msg': 'vuln_desc',\n 'threat_ioc': 'none',\n 'threat_level': 'none',\n 'threat_severity': 'severity',\n 'threat_phase': 'none',\n 'threat_type': 'vuln_type',\n 'threat_tactic_id': 'attck_tactic',\n 'threat_technique_id': 'attck_tech',\n 'threat_result': 'attack_result',\n 'threat_confidence': 'confidence',\n 'connection_established': 'established',\n 'real_attack': 'attack_flag',\n}\n\ndef flatten_dict(d, prefix=''):\n res = {}\n for k, v in d.items():\n if isinstance(v, dict):\n res.update(flatten_dict(v, f'{prefix}{k}_'))\n else:\n res[f'{prefix}{k}'] = v\n return res\n\ndef make_uuid(norm):\n return str(uuid.uuid3(uuid.NAMESPACE_DNS, ''.join(str(v) for v in norm.values())))\n\ndef normalize_single(alert):\n flat = flatten_dict(alert)\n norm = {}\n for std_key, raw_key in SKYEYE_FIELD_MAP.items():\n norm[std_key] = flat.get(raw_key, 'none') if raw_key != 'none' else 'none'\n if norm.get('id') in ('none', None, ''):\n norm['id'] = make_uuid(norm)\n if norm.get('net_type') in ('none', None, ''):\n method = flat.get('method', 'none')\n norm['net_type'] = 'http' if method in HTTP_METHODS else ('none' if method == 'none' else 'other')\n return norm\n\nraw_alerts = inputs.get('raw_alerts', [])\nstats = dict(inputs.get('stats', {}))\nnormalized = [normalize_single(a) for a in raw_alerts]\nstats['normalized_count'] = len(normalized)\nprint(f'[normalize_skyeye] {len(raw_alerts)} -> {len(normalized)}')\n\noutputs['normalized_alerts'] = normalized\noutputs['stats'] = stats\nfor k in ['source_log_type', 'filter_enabled', 'dedup_enabled',\n 'dedup_threshold', 'strict_fields', 'lsh_fields', 'max_field_len']:\n outputs[k] = inputs.get(k)" + }, + { + "id": "filter_logs", + "type": "python", + "description": "Step 2 — 过滤(对齐 aisoc_mini LogFilter):9 种 process_type 分类;保留 
non-scan + HTTP(任意方向 in/out/lateral)需分析的告警", + "code": "normalized_alerts = inputs.get('normalized_alerts', [])\nfilter_enabled = inputs.get('filter_enabled', True)\nsource_log_type = str(inputs.get('source_log_type', 'tdp')).lower()\nstats = dict(inputs.get('stats', {}))\n\ndef is_scan_alert(threat_name):\n tnl = str(threat_name or '').lower()\n return ('扫描' in tnl) and ('webshell' not in tnl)\n\ndef get_threat_type(alert, source):\n if source == 'skyeye':\n return str(alert.get('threat_type', 'general') or 'general')\n return str(alert.get('threat_name', 'general') or 'general')\n\ndef is_http(alert):\n for field in ('application_layer_protocol', 'net_type', 'net_app_proto'):\n val = str(alert.get(field, '') or '').lower()\n if val and val != 'none' and 'http' in val:\n return True\n return False\n\ndef get_process_type(alert, source):\n threat_name = alert.get('threat_name', '')\n direction = str(alert.get('direction', '') or '').lower()\n scan = is_scan_alert(threat_name)\n http = is_http(alert)\n if source == 'skyeye':\n return 'alert_scan_direction_in' if scan else 'alert_not_scan_http_direction_in'\n if scan:\n return f'alert_scan_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_scan_direction_in'\n if http:\n return f'alert_not_scan_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_scan_http_direction_in'\n return f'alert_not_scan_not_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_process'\n\nNEED_ANALYSIS = {\n 'alert_not_scan_http_direction_in',\n 'alert_not_scan_http_direction_out',\n 'alert_not_scan_http_direction_lateral',\n}\n\nfiltered = []\nprocess_type_counts = {}\nfor alert in normalized_alerts:\n alert = dict(alert)\n if filter_enabled:\n ptype = get_process_type(alert, source_log_type)\n need = ptype in NEED_ANALYSIS\n threat_type = get_threat_type(alert, source_log_type)\n else:\n ptype = 'filter_disabled'\n need = True\n threat_type = 
get_threat_type(alert, source_log_type)\n process_type_counts[ptype] = process_type_counts.get(ptype, 0) + 1\n alert['_process_type'] = ptype\n alert['_threat_type'] = threat_type\n if need:\n filtered.append(alert)\n\nprint(f'[filter] input={len(normalized_alerts)}, kept={len(filtered)}')\nprint(f'[filter] process_type_counts={process_type_counts}')\n\nstats['after_filter_count'] = len(filtered)\nstats['filter_removed_count'] = len(normalized_alerts) - len(filtered)\nstats['filter_process_type_counts'] = process_type_counts\n\noutputs['filtered_alerts'] = filtered\noutputs['stats'] = stats\nfor k in ['dedup_enabled', 'dedup_threshold', 'strict_fields', 'lsh_fields', 'max_field_len']:\n outputs[k] = inputs.get(k)" + }, + { + "id": "dedup_logs", + "type": "python", + "description": "Step 3 — 去重(终点):URI 归一化 + 5-gram Jaccard 相似度聚类,为每条告警生成 dedup_key(MD5);输出 deduped_alerts(全量含重复标记)、unique_alerts(每簇代表) 及 stats", + "code": "import re\nimport hashlib\n\ndef normalize_uri(text):\n if not text or str(text) == 'none':\n return str(text or '')\n t = str(text)\n t = re.sub(r'\\d{4}[-/]\\d{1,2}[-/]\\d{1,2}(?:[T ]\\d{2}:\\d{2}(?::\\d{2})?)?', 'DATETIME', t)\n t = re.sub(r'[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}', 'UUID', t)\n t = re.sub(r'\\b\\d{6,}\\b', 'NUM', t)\n t = re.sub(r'(?:\\.\\./|\\.\\\\\\\\)+', '../', t)\n t = re.sub(r'%00', 'NULL', t)\n t = re.sub(r'(?:%[0-9a-fA-F]{2}){3,}', 'ENCODED', t)\n return t\n\ndef shingles(text, k=5):\n t = str(text or '').lower()\n if len(t) < k:\n return frozenset([t]) if t else frozenset()\n return frozenset(t[i:i+k] for i in range(len(t) - k + 1))\n\ndef jaccard(a, b):\n if not a and not b:\n return 1.0\n if not a or not b:\n return 0.0\n return len(a & b) / len(a | b)\n\nfiltered_alerts = inputs.get('filtered_alerts', [])\ndedup_enabled = inputs.get('dedup_enabled', True)\nthreshold = float(inputs.get('dedup_threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = 
inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_len = int(inputs.get('max_field_len', 500))\nstats = dict(inputs.get('stats', {}))\n\nregistry = {}\nkeyed = []\nfor alert in filtered_alerts:\n alert = dict(alert)\n if not dedup_enabled:\n raw = ''.join(str(alert.get(f, ''))[:max_len] for f in strict_fields + lsh_fields)\n alert['dedup_key'] = hashlib.md5(raw.encode('utf-8')).hexdigest()\n alert['dedup_key_already_exists'] = False\n keyed.append(alert)\n continue\n strict_text = '. '.join(str(alert.get(f, ''))[:max_len] for f in strict_fields)\n lsh_text = '. '.join(normalize_uri(str(alert.get(f, ''))[:max_len]) for f in lsh_fields)\n cur_sh = shingles(lsh_text)\n matched_key = None\n best_sim = 0.0\n for ek, (ex_strict, ex_sh) in registry.items():\n if ex_strict != strict_text:\n continue\n sim = jaccard(cur_sh, ex_sh)\n if sim >= threshold and sim > best_sim:\n best_sim = sim\n matched_key = ek\n if matched_key is None:\n raw_key = hashlib.md5(f'{strict_text}. {lsh_text}'.encode('utf-8')).hexdigest()\n registry[raw_key] = (strict_text, cur_sh)\n canonical = raw_key\n alert['dedup_key_already_exists'] = False\n else:\n canonical = matched_key\n alert['dedup_key_already_exists'] = True\n alert['dedup_key'] = canonical\n keyed.append(alert)\n\nseen = {}\nunique_alerts = []\nfor a in keyed:\n k = a['dedup_key']\n if k not in seen:\n seen[k] = a\n unique_alerts.append(a)\n\ndup_count = len(keyed) - len(unique_alerts)\nprint(f'[dedup] input={len(filtered_alerts)}, unique={len(unique_alerts)}, dup={dup_count}')\n\nstats['after_dedup_count'] = len(keyed)\nstats['unique_key_count'] = len(unique_alerts)\nstats['dedup_removed_count'] = dup_count\nif len(keyed) > 0:\n stats['dedup_ratio'] = round(dup_count / len(keyed), 4)\nelse:\n stats['dedup_ratio'] = 0.0\n\nsummary = (\n f'network_alert_dedup 完成: 输入 {stats.get(\"raw_count\", 0)} 条'\n f' → 归一化 {stats.get(\"normalized_count\", 0)} 条'\n f' → 过滤后 {stats.get(\"after_filter_count\", 0)} 条'\n f' → 去重后 
{len(unique_alerts)} 个唯一簇(压缩率 {stats[\"dedup_ratio\"]:.1%})'\n)\nprint(f'[dedup] {summary}')\n\noutputs['deduped_alerts'] = keyed\noutputs['unique_alerts'] = unique_alerts\noutputs['stats'] = stats\noutputs['dedup_summary'] = summary" + } + ], + "edges": [ + {"from": "receive_alerts", "to": "branch_log_type", "order": 0}, + {"from": "branch_log_type", "to": "normalize_tdp", "label": "tdp", "order": 0}, + {"from": "branch_log_type", "to": "normalize_skyeye", "label": "skyeye", "order": 1}, + {"from": "normalize_tdp", "to": "filter_logs", "order": 0}, + {"from": "normalize_skyeye", "to": "filter_logs", "order": 0}, + {"from": "filter_logs", "to": "dedup_logs", "order": 0} + ], + "metadata": { + "node_timeout_s": 300, + "sampleInputs": { + "source_log_type": "tdp", + "filter_enabled": true, + "dedup_enabled": true, + "threshold": 0.7, + "alerts": [ + { + "net_real_src_ip": "1.2.3.4", + "net_dest_ip": "10.0.0.1", + "direction": "in", + "net_type": "http", + "net_http_url": "/admin/login.php?id=1 OR 1=1", + "net_http_reqs_body": "username=admin&password=123456", + "net_http_resp_body": "root@localhost", + "threat_name": "SQL注入攻击", + "threat_type": "web攻击" + } + ] + } + } +} diff --git a/.flocks/plugins/workflows/network_alert_dedup/workflow.md b/.flocks/plugins/workflows/network_alert_dedup/workflow.md new file mode 100644 index 000000000..0331c039e --- /dev/null +++ b/.flocks/plugins/workflows/network_alert_dedup/workflow.md @@ -0,0 +1,126 @@ +# network_alert_dedup + +网络告警去重 Pipeline,三阶段处理:**归一化 → 过滤 → 去重**。 + +输入 `dict`(原始告警列表 + 配置),输出 `dict`(去重后的告警 + 统计信息),不调用 LLM。 + +## 工作流图 + +``` +receive_alerts + │ +branch_log_type + ├─ tdp ─→ normalize_tdp + └─ skyeye ─→ normalize_skyeye + │ + filter_logs + │ + dedup_logs ◀── 终点,输出 dict +``` + +## 输入参数 + +| 参数 | 类型 | 默认值 | 说明 | +|------|------|--------|------| +| `alerts` | `list[dict]` | — | 原始告警列表(必填) | +| `source_log_type` | `str` | `"tdp"` | 日志来源类型,`"tdp"` 或 `"skyeye"` | +| `filter_enabled` | `bool` | `true` | 是否启用过滤阶段 | +| 
`dedup_enabled` | `bool` | `true` | 是否启用去重阶段(false 时每条告警独立分配 key) | +| `threshold` | `float` | `0.7` | Jaccard 相似度阈值(0–1) | +| `strict_fields` | `list[str]` | `["sip","dip"]` | 严格匹配字段(需完全相同才参与模糊聚类) | +| `lsh_fields` | `list[str]` | `["req_http_url","req_body","rsp_body"]` | 模糊匹配字段(URI 归一化 + Jaccard) | +| `max_field_len` | `int` | `500` | 单字段截断长度 | + +## 输出参数(终点节点 `dedup_logs` 的 outputs) + +| 字段 | 类型 | 说明 | +|------|------|------| +| `deduped_alerts` | `list[dict]` | 全量告警(经过滤),每条含 `dedup_key`(MD5)和 `dedup_key_already_exists`(是否重复) | +| `unique_alerts` | `list[dict]` | 每个 dedup_key 的代表性告警(去重后唯一集合) | +| `stats` | `dict` | 各阶段统计(见下表) | +| `dedup_summary` | `str` | 一行文字摘要 | + +### stats 字段 + +| 字段 | 说明 | +|------|------| +| `raw_count` | 原始输入告警数 | +| `normalized_count` | 归一化后告警数 | +| `after_filter_count` | 过滤后保留数 | +| `filter_removed_count` | 过滤剔除数 | +| `filter_process_type_counts` | 各 process_type 计数 dict | +| `after_dedup_count` | 去重后告警总数(等于 after_filter_count) | +| `unique_key_count` | 唯一 dedup_key 数(簇数) | +| `dedup_removed_count` | 去重压缩的重复条数 | +| `dedup_ratio` | 压缩率(dedup_removed / after_dedup) | + +## 节点说明 + +### receive_alerts +解析输入,支持 `alerts` / `alert_list` 键,支持 JSON 字符串或 `{"data": [...]}` 包装格式,提取 Pipeline 配置参数。 + +### branch_log_type +按 `source_log_type` 路由:`"tdp"` → `normalize_tdp`,`"skyeye"` → `normalize_skyeye`。 + +### normalize_tdp / normalize_skyeye +字段映射,将各来源的原始字段统一为标准字段(`sip`/`dip`/`req_http_url`/`req_body`/`rsp_body`/`threat_name` 等)。对缺失 `id` 的告警使用 UUID v3 生成。 + +**TDP 关键映射(部分)** + +| 标准字段 | TDP 原始字段 | +|----------|-------------| +| `sip` | `net_real_src_ip` | +| `dip` | `net_dest_ip` | +| `req_http_url` | `net_http_url` | +| `req_body` | `net_http_reqs_body` | +| `rsp_body` | `net_http_resp_body` | +| `threat_name` | `threat_name` | + +**Skyeye 关键映射(部分)** + +| 标准字段 | Skyeye 原始字段 | +|----------|----------------| +| `req_http_url` | `uri` | +| `threat_name` | `vuln_name` | +| `threat_type` | `vuln_type` | +| `threat_result` | `attack_result` | + +### 
filter_logs +基于 `process_type` 的 9 类分类过滤(对齐 `aisoc_mini.LogFilter`): + +| process_type | 保留/过滤 | +|-------------|----------| +| `alert_not_scan_http_direction_in` | ✅ 保留 | +| `alert_not_scan_http_direction_out` | ✅ 保留 | +| `alert_not_scan_http_direction_lateral` | ✅ 保留 | +| `alert_scan_direction_*` | ❌ 过滤(扫描类) | +| `alert_not_scan_not_http_*` | ❌ 过滤(非 HTTP) | +| `alert_not_process` | ❌ 过滤(其他) | + +### dedup_logs(终点) + +**URI 归一化**(减少 LSH 字段噪音): + +| 正则模式 | 替换为 | +|---------|--------| +| 日期时间 | `DATETIME` | +| UUID | `UUID` | +| 6 位以上数字 | `NUM` | +| 路径穿越 | `../` | +| `%00` | `NULL` | +| 连续 URL 编码(≥3 组) | `ENCODED` | + +**去重算法**: +1. `strict_fields` 拼接作为严格前缀,不同前缀的告警不归并 +2. 对 `lsh_fields`(URI 归一化后)做 **5-gram shingling** +3. 与已注册簇计算 **Jaccard 相似度**,≥ `threshold` 则归入该簇 +4. 新簇生成 **MD5 dedup_key**;重复告警标记 `dedup_key_already_exists=True` + +## 对应 aisoc_mini 关系 + +| workflow 节点 | aisoc_mini 类/方法 | +|--------------|------------------| +| `normalize_tdp` | `LogNorm.normalize_ndr_log(type='tdp')` | +| `normalize_skyeye` | `LogNorm.normalize_ndr_log(type='skyeye')` | +| `filter_logs` | `LogFilter.filter()` | +| `dedup_logs` | `LogDedup.dedup()` | From edd9d63c224be25d1f791d6ea958b8f5f4f5cb8f Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Fri, 8 May 2026 16:57:01 +0800 Subject: [PATCH 06/41] chore(workflow): remove aisoc_mini references from network_alert_dedup Co-authored-by: Cursor --- .../workflows/network_alert_dedup/workflow.json | 2 +- .../plugins/workflows/network_alert_dedup/workflow.md | 10 +--------- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/.flocks/plugins/workflows/network_alert_dedup/workflow.json b/.flocks/plugins/workflows/network_alert_dedup/workflow.json index 959f96a49..bc59407d0 100644 --- a/.flocks/plugins/workflows/network_alert_dedup/workflow.json +++ b/.flocks/plugins/workflows/network_alert_dedup/workflow.json @@ -31,7 +31,7 @@ { "id": "filter_logs", "type": "python", - "description": "Step 2 — 过滤(对齐 aisoc_mini LogFilter):9 种 
process_type 分类;保留 non-scan + HTTP(任意方向 in/out/lateral)需分析的告警", + "description": "Step 2 — 过滤:9 种 process_type 分类;保留 non-scan + HTTP(任意方向 in/out/lateral)的告警", "code": "normalized_alerts = inputs.get('normalized_alerts', [])\nfilter_enabled = inputs.get('filter_enabled', True)\nsource_log_type = str(inputs.get('source_log_type', 'tdp')).lower()\nstats = dict(inputs.get('stats', {}))\n\ndef is_scan_alert(threat_name):\n tnl = str(threat_name or '').lower()\n return ('扫描' in tnl) and ('webshell' not in tnl)\n\ndef get_threat_type(alert, source):\n if source == 'skyeye':\n return str(alert.get('threat_type', 'general') or 'general')\n return str(alert.get('threat_name', 'general') or 'general')\n\ndef is_http(alert):\n for field in ('application_layer_protocol', 'net_type', 'net_app_proto'):\n val = str(alert.get(field, '') or '').lower()\n if val and val != 'none' and 'http' in val:\n return True\n return False\n\ndef get_process_type(alert, source):\n threat_name = alert.get('threat_name', '')\n direction = str(alert.get('direction', '') or '').lower()\n scan = is_scan_alert(threat_name)\n http = is_http(alert)\n if source == 'skyeye':\n return 'alert_scan_direction_in' if scan else 'alert_not_scan_http_direction_in'\n if scan:\n return f'alert_scan_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_scan_direction_in'\n if http:\n return f'alert_not_scan_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_scan_http_direction_in'\n return f'alert_not_scan_not_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_process'\n\nNEED_ANALYSIS = {\n 'alert_not_scan_http_direction_in',\n 'alert_not_scan_http_direction_out',\n 'alert_not_scan_http_direction_lateral',\n}\n\nfiltered = []\nprocess_type_counts = {}\nfor alert in normalized_alerts:\n alert = dict(alert)\n if filter_enabled:\n ptype = get_process_type(alert, source_log_type)\n need = ptype in NEED_ANALYSIS\n threat_type = 
get_threat_type(alert, source_log_type)\n else:\n ptype = 'filter_disabled'\n need = True\n threat_type = get_threat_type(alert, source_log_type)\n process_type_counts[ptype] = process_type_counts.get(ptype, 0) + 1\n alert['_process_type'] = ptype\n alert['_threat_type'] = threat_type\n if need:\n filtered.append(alert)\n\nprint(f'[filter] input={len(normalized_alerts)}, kept={len(filtered)}')\nprint(f'[filter] process_type_counts={process_type_counts}')\n\nstats['after_filter_count'] = len(filtered)\nstats['filter_removed_count'] = len(normalized_alerts) - len(filtered)\nstats['filter_process_type_counts'] = process_type_counts\n\noutputs['filtered_alerts'] = filtered\noutputs['stats'] = stats\nfor k in ['dedup_enabled', 'dedup_threshold', 'strict_fields', 'lsh_fields', 'max_field_len']:\n outputs[k] = inputs.get(k)" }, { diff --git a/.flocks/plugins/workflows/network_alert_dedup/workflow.md b/.flocks/plugins/workflows/network_alert_dedup/workflow.md index 0331c039e..526634a1c 100644 --- a/.flocks/plugins/workflows/network_alert_dedup/workflow.md +++ b/.flocks/plugins/workflows/network_alert_dedup/workflow.md @@ -86,7 +86,7 @@ branch_log_type | `threat_result` | `attack_result` | ### filter_logs -基于 `process_type` 的 9 类分类过滤(对齐 `aisoc_mini.LogFilter`): +基于 `process_type` 的 9 类分类过滤: | process_type | 保留/过滤 | |-------------|----------| @@ -116,11 +116,3 @@ branch_log_type 3. 与已注册簇计算 **Jaccard 相似度**,≥ `threshold` 则归入该簇 4. 
新簇生成 **MD5 dedup_key**;重复告警标记 `dedup_key_already_exists=True` -## 对应 aisoc_mini 关系 - -| workflow 节点 | aisoc_mini 类/方法 | -|--------------|------------------| -| `normalize_tdp` | `LogNorm.normalize_ndr_log(type='tdp')` | -| `normalize_skyeye` | `LogNorm.normalize_ndr_log(type='skyeye')` | -| `filter_logs` | `LogFilter.filter()` | -| `dedup_logs` | `LogDedup.dedup()` | From 7b2cbd06bd96b7cdcadf6dcfb7feb46b63ee54d6 Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Fri, 8 May 2026 17:04:18 +0800 Subject: [PATCH 07/41] feat(workflow): add branch_has_alerts after filter_logs in network_alert_dedup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - filter_logs 新增 _has_alerts 输出 - 插入 branch_has_alerts 分支节点(select_key: _has_alerts) - true 路径 → dedup_logs(执行去重,终点) - false 路径 → dedup_empty(无告警,直接返回空 dict,终点) - 更新 workflow.md 流程图及节点说明 - lint 无警告,模型验证通过(8 节点 / 8 边) Co-authored-by: Cursor --- .../network_alert_dedup/workflow.json | 24 +++++++++++++++---- .../workflows/network_alert_dedup/workflow.md | 14 ++++++++--- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/.flocks/plugins/workflows/network_alert_dedup/workflow.json b/.flocks/plugins/workflows/network_alert_dedup/workflow.json index bc59407d0..f280c42c3 100644 --- a/.flocks/plugins/workflows/network_alert_dedup/workflow.json +++ b/.flocks/plugins/workflows/network_alert_dedup/workflow.json @@ -31,8 +31,20 @@ { "id": "filter_logs", "type": "python", - "description": "Step 2 — 过滤:9 种 process_type 分类;保留 non-scan + HTTP(任意方向 in/out/lateral)的告警", - "code": "normalized_alerts = inputs.get('normalized_alerts', [])\nfilter_enabled = inputs.get('filter_enabled', True)\nsource_log_type = str(inputs.get('source_log_type', 'tdp')).lower()\nstats = dict(inputs.get('stats', {}))\n\ndef is_scan_alert(threat_name):\n tnl = str(threat_name or '').lower()\n return ('扫描' in tnl) and ('webshell' not in tnl)\n\ndef get_threat_type(alert, source):\n if source == 'skyeye':\n return 
str(alert.get('threat_type', 'general') or 'general')\n return str(alert.get('threat_name', 'general') or 'general')\n\ndef is_http(alert):\n for field in ('application_layer_protocol', 'net_type', 'net_app_proto'):\n val = str(alert.get(field, '') or '').lower()\n if val and val != 'none' and 'http' in val:\n return True\n return False\n\ndef get_process_type(alert, source):\n threat_name = alert.get('threat_name', '')\n direction = str(alert.get('direction', '') or '').lower()\n scan = is_scan_alert(threat_name)\n http = is_http(alert)\n if source == 'skyeye':\n return 'alert_scan_direction_in' if scan else 'alert_not_scan_http_direction_in'\n if scan:\n return f'alert_scan_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_scan_direction_in'\n if http:\n return f'alert_not_scan_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_scan_http_direction_in'\n return f'alert_not_scan_not_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_process'\n\nNEED_ANALYSIS = {\n 'alert_not_scan_http_direction_in',\n 'alert_not_scan_http_direction_out',\n 'alert_not_scan_http_direction_lateral',\n}\n\nfiltered = []\nprocess_type_counts = {}\nfor alert in normalized_alerts:\n alert = dict(alert)\n if filter_enabled:\n ptype = get_process_type(alert, source_log_type)\n need = ptype in NEED_ANALYSIS\n threat_type = get_threat_type(alert, source_log_type)\n else:\n ptype = 'filter_disabled'\n need = True\n threat_type = get_threat_type(alert, source_log_type)\n process_type_counts[ptype] = process_type_counts.get(ptype, 0) + 1\n alert['_process_type'] = ptype\n alert['_threat_type'] = threat_type\n if need:\n filtered.append(alert)\n\nprint(f'[filter] input={len(normalized_alerts)}, kept={len(filtered)}')\nprint(f'[filter] process_type_counts={process_type_counts}')\n\nstats['after_filter_count'] = len(filtered)\nstats['filter_removed_count'] = len(normalized_alerts) - 
len(filtered)\nstats['filter_process_type_counts'] = process_type_counts\n\noutputs['filtered_alerts'] = filtered\noutputs['stats'] = stats\nfor k in ['dedup_enabled', 'dedup_threshold', 'strict_fields', 'lsh_fields', 'max_field_len']:\n outputs[k] = inputs.get(k)" + "description": "Step 2 — 过滤:9 种 process_type 分类;保留 non-scan + HTTP(任意方向 in/out/lateral)的告警;输出 _has_alerts 供后续分支路由", + "code": "normalized_alerts = inputs.get('normalized_alerts', [])\nfilter_enabled = inputs.get('filter_enabled', True)\nsource_log_type = str(inputs.get('source_log_type', 'tdp')).lower()\nstats = dict(inputs.get('stats', {}))\n\ndef is_scan_alert(threat_name):\n tnl = str(threat_name or '').lower()\n return ('扫描' in tnl) and ('webshell' not in tnl)\n\ndef get_threat_type(alert, source):\n if source == 'skyeye':\n return str(alert.get('threat_type', 'general') or 'general')\n return str(alert.get('threat_name', 'general') or 'general')\n\ndef is_http(alert):\n for field in ('application_layer_protocol', 'net_type', 'net_app_proto'):\n val = str(alert.get(field, '') or '').lower()\n if val and val != 'none' and 'http' in val:\n return True\n return False\n\ndef get_process_type(alert, source):\n threat_name = alert.get('threat_name', '')\n direction = str(alert.get('direction', '') or '').lower()\n scan = is_scan_alert(threat_name)\n http = is_http(alert)\n if source == 'skyeye':\n return 'alert_scan_direction_in' if scan else 'alert_not_scan_http_direction_in'\n if scan:\n return f'alert_scan_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_scan_direction_in'\n if http:\n return f'alert_not_scan_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_scan_http_direction_in'\n return f'alert_not_scan_not_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_process'\n\nNEED_ANALYSIS = {\n 'alert_not_scan_http_direction_in',\n 'alert_not_scan_http_direction_out',\n 
'alert_not_scan_http_direction_lateral',\n}\n\nfiltered = []\nprocess_type_counts = {}\nfor alert in normalized_alerts:\n alert = dict(alert)\n if filter_enabled:\n ptype = get_process_type(alert, source_log_type)\n need = ptype in NEED_ANALYSIS\n threat_type = get_threat_type(alert, source_log_type)\n else:\n ptype = 'filter_disabled'\n need = True\n threat_type = get_threat_type(alert, source_log_type)\n process_type_counts[ptype] = process_type_counts.get(ptype, 0) + 1\n alert['_process_type'] = ptype\n alert['_threat_type'] = threat_type\n if need:\n filtered.append(alert)\n\nhas_alerts = len(filtered) > 0\nprint(f'[filter] input={len(normalized_alerts)}, kept={len(filtered)}, has_alerts={has_alerts}')\nprint(f'[filter] process_type_counts={process_type_counts}')\n\nstats['after_filter_count'] = len(filtered)\nstats['filter_removed_count'] = len(normalized_alerts) - len(filtered)\nstats['filter_process_type_counts'] = process_type_counts\n\noutputs['filtered_alerts'] = filtered\noutputs['_has_alerts'] = has_alerts\noutputs['stats'] = stats\nfor k in ['dedup_enabled', 'dedup_threshold', 'strict_fields', 'lsh_fields', 'max_field_len']:\n outputs[k] = inputs.get(k)" + }, + { + "id": "branch_has_alerts", + "type": "branch", + "select_key": "_has_alerts", + "description": "按 _has_alerts 路由:True → dedup_logs(继续去重);False → dedup_empty(无告警,直接返回空结果)" + }, + { + "id": "dedup_empty", + "type": "python", + "description": "过滤后无告警时的终点:直接返回空的去重结果 dict,格式与 dedup_logs 输出保持一致", + "code": "stats = dict(inputs.get('stats', {}))\nstats.setdefault('after_dedup_count', 0)\nstats.setdefault('unique_key_count', 0)\nstats.setdefault('dedup_removed_count', 0)\nstats['dedup_ratio'] = 0.0\n\nsummary = (\n f'network_alert_dedup 完成(无告警): 输入 {stats.get(\"raw_count\", 0)} 条'\n f' → 归一化 {stats.get(\"normalized_count\", 0)} 条'\n f' → 过滤后 0 条,跳过去重'\n)\nprint(f'[dedup_empty] {summary}')\n\noutputs['deduped_alerts'] = []\noutputs['unique_alerts'] = []\noutputs['stats'] = 
stats\noutputs['dedup_summary'] = summary" }, { "id": "dedup_logs", @@ -45,9 +57,11 @@ {"from": "receive_alerts", "to": "branch_log_type", "order": 0}, {"from": "branch_log_type", "to": "normalize_tdp", "label": "tdp", "order": 0}, {"from": "branch_log_type", "to": "normalize_skyeye", "label": "skyeye", "order": 1}, - {"from": "normalize_tdp", "to": "filter_logs", "order": 0}, - {"from": "normalize_skyeye", "to": "filter_logs", "order": 0}, - {"from": "filter_logs", "to": "dedup_logs", "order": 0} + {"from": "normalize_tdp", "to": "filter_logs", "order": 0}, + {"from": "normalize_skyeye", "to": "filter_logs", "order": 0}, + {"from": "filter_logs", "to": "branch_has_alerts", "order": 0}, + {"from": "branch_has_alerts", "to": "dedup_logs", "label": "true", "order": 0}, + {"from": "branch_has_alerts", "to": "dedup_empty", "label": "false", "order": 1} ], "metadata": { "node_timeout_s": 300, diff --git a/.flocks/plugins/workflows/network_alert_dedup/workflow.md b/.flocks/plugins/workflows/network_alert_dedup/workflow.md index 526634a1c..76fd7fd6a 100644 --- a/.flocks/plugins/workflows/network_alert_dedup/workflow.md +++ b/.flocks/plugins/workflows/network_alert_dedup/workflow.md @@ -15,7 +15,9 @@ branch_log_type │ filter_logs │ - dedup_logs ◀── 终点,输出 dict + branch_has_alerts + ├─ true ─→ dedup_logs ◀── 终点,输出 dict + └─ false ─→ dedup_empty ◀── 终点,输出空 dict ``` ## 输入参数 @@ -86,7 +88,7 @@ branch_log_type | `threat_result` | `attack_result` | ### filter_logs -基于 `process_type` 的 9 类分类过滤: +基于 `process_type` 的 9 类分类过滤,输出 `_has_alerts` 布尔值供后续分支路由: | process_type | 保留/过滤 | |-------------|----------| @@ -97,7 +99,13 @@ branch_log_type | `alert_not_scan_not_http_*` | ❌ 过滤(非 HTTP) | | `alert_not_process` | ❌ 过滤(其他) | -### dedup_logs(终点) +### branch_has_alerts +按 `_has_alerts` 路由:`true` → `dedup_logs`;`false` → `dedup_empty`。 + +### dedup_empty(终点 — 无告警路径) +过滤后无告警时直接返回空结果 dict,格式与 `dedup_logs` 输出一致(`deduped_alerts=[]`,`unique_alerts=[]`,stats 补零)。 + +### dedup_logs(终点 — 有告警路径) **URI 
归一化**(减少 LSH 字段噪音): From 4f59e98149ec7b49e867d1631a529ef06fa89e30 Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Fri, 8 May 2026 17:14:01 +0800 Subject: [PATCH 08/41] =?UTF-8?q?refactor(workflow):=20rename=20network=5F?= =?UTF-8?q?alert=5Fdedup=20=E2=86=92=20http=5Falert=5Fdedup?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Cursor --- .../{network_alert_dedup => http_alert_dedup}/meta.json | 4 ++-- .../{network_alert_dedup => http_alert_dedup}/workflow.json | 6 +++--- .../{network_alert_dedup => http_alert_dedup}/workflow.md | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) rename .flocks/plugins/workflows/{network_alert_dedup => http_alert_dedup}/meta.json (86%) rename .flocks/plugins/workflows/{network_alert_dedup => http_alert_dedup}/workflow.json (94%) rename .flocks/plugins/workflows/{network_alert_dedup => http_alert_dedup}/workflow.md (99%) diff --git a/.flocks/plugins/workflows/network_alert_dedup/meta.json b/.flocks/plugins/workflows/http_alert_dedup/meta.json similarity index 86% rename from .flocks/plugins/workflows/network_alert_dedup/meta.json rename to .flocks/plugins/workflows/http_alert_dedup/meta.json index 0fa1c72b1..a9e5e0be8 100644 --- a/.flocks/plugins/workflows/network_alert_dedup/meta.json +++ b/.flocks/plugins/workflows/http_alert_dedup/meta.json @@ -1,10 +1,10 @@ { - "name": "network_alert_dedup", + "name": "http_alert_dedup", "description": "网络告警去重 Pipeline:归一化(TDP/Skyeye 字段映射)→ 过滤(剔除扫描/非 HTTP 告警)→ 去重(URI 归一化 + 5-gram Jaccard 相似度聚类)。输入 dict,输出 dict(deduped_alerts / unique_alerts / stats)。", "category": "security", "status": "active", "createdBy": null, "createdAt": 1746691200000, "updatedAt": 1746777600000, - "id": "network_alert_dedup" + "id": "http_alert_dedup" } diff --git a/.flocks/plugins/workflows/network_alert_dedup/workflow.json b/.flocks/plugins/workflows/http_alert_dedup/workflow.json similarity index 94% rename from 
.flocks/plugins/workflows/network_alert_dedup/workflow.json rename to .flocks/plugins/workflows/http_alert_dedup/workflow.json index f280c42c3..8b36a2cb4 100644 --- a/.flocks/plugins/workflows/network_alert_dedup/workflow.json +++ b/.flocks/plugins/workflows/http_alert_dedup/workflow.json @@ -1,5 +1,5 @@ { - "name": "network_alert_dedup", + "name": "http_alert_dedup", "description": "Network alert deduplication pipeline: normalize (TDP/Skyeye field mapping) → filter (remove scans / non-HTTP) → dedup (URI normalization + 5-gram Jaccard similarity). Returns a dict with deduped_alerts, unique_alerts and stats.", "description_cn": "网络告警去重 Pipeline:归一化(TDP/Skyeye 字段映射,含日志类型分支)→ 过滤(剔除扫描/非 HTTP 告警)→ 去重(URI 归一化 + 5-gram Jaccard 相似度聚类,MD5 dedup_key)。输入 dict,输出 dict(deduped_alerts / unique_alerts / stats)。", "start": "receive_alerts", @@ -44,13 +44,13 @@ "id": "dedup_empty", "type": "python", "description": "过滤后无告警时的终点:直接返回空的去重结果 dict,格式与 dedup_logs 输出保持一致", - "code": "stats = dict(inputs.get('stats', {}))\nstats.setdefault('after_dedup_count', 0)\nstats.setdefault('unique_key_count', 0)\nstats.setdefault('dedup_removed_count', 0)\nstats['dedup_ratio'] = 0.0\n\nsummary = (\n f'network_alert_dedup 完成(无告警): 输入 {stats.get(\"raw_count\", 0)} 条'\n f' → 归一化 {stats.get(\"normalized_count\", 0)} 条'\n f' → 过滤后 0 条,跳过去重'\n)\nprint(f'[dedup_empty] {summary}')\n\noutputs['deduped_alerts'] = []\noutputs['unique_alerts'] = []\noutputs['stats'] = stats\noutputs['dedup_summary'] = summary" + "code": "stats = dict(inputs.get('stats', {}))\nstats.setdefault('after_dedup_count', 0)\nstats.setdefault('unique_key_count', 0)\nstats.setdefault('dedup_removed_count', 0)\nstats['dedup_ratio'] = 0.0\n\nsummary = (\n f'http_alert_dedup 完成(无告警): 输入 {stats.get(\"raw_count\", 0)} 条'\n f' → 归一化 {stats.get(\"normalized_count\", 0)} 条'\n f' → 过滤后 0 条,跳过去重'\n)\nprint(f'[dedup_empty] {summary}')\n\noutputs['deduped_alerts'] = []\noutputs['unique_alerts'] = []\noutputs['stats'] = 
stats\noutputs['dedup_summary'] = summary" }, { "id": "dedup_logs", "type": "python", "description": "Step 3 — 去重(终点):URI 归一化 + 5-gram Jaccard 相似度聚类,为每条告警生成 dedup_key(MD5);输出 deduped_alerts(全量含重复标记)、unique_alerts(每簇代表) 及 stats", - "code": "import re\nimport hashlib\n\ndef normalize_uri(text):\n if not text or str(text) == 'none':\n return str(text or '')\n t = str(text)\n t = re.sub(r'\\d{4}[-/]\\d{1,2}[-/]\\d{1,2}(?:[T ]\\d{2}:\\d{2}(?::\\d{2})?)?', 'DATETIME', t)\n t = re.sub(r'[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}', 'UUID', t)\n t = re.sub(r'\\b\\d{6,}\\b', 'NUM', t)\n t = re.sub(r'(?:\\.\\./|\\.\\\\\\\\)+', '../', t)\n t = re.sub(r'%00', 'NULL', t)\n t = re.sub(r'(?:%[0-9a-fA-F]{2}){3,}', 'ENCODED', t)\n return t\n\ndef shingles(text, k=5):\n t = str(text or '').lower()\n if len(t) < k:\n return frozenset([t]) if t else frozenset()\n return frozenset(t[i:i+k] for i in range(len(t) - k + 1))\n\ndef jaccard(a, b):\n if not a and not b:\n return 1.0\n if not a or not b:\n return 0.0\n return len(a & b) / len(a | b)\n\nfiltered_alerts = inputs.get('filtered_alerts', [])\ndedup_enabled = inputs.get('dedup_enabled', True)\nthreshold = float(inputs.get('dedup_threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_len = int(inputs.get('max_field_len', 500))\nstats = dict(inputs.get('stats', {}))\n\nregistry = {}\nkeyed = []\nfor alert in filtered_alerts:\n alert = dict(alert)\n if not dedup_enabled:\n raw = ''.join(str(alert.get(f, ''))[:max_len] for f in strict_fields + lsh_fields)\n alert['dedup_key'] = hashlib.md5(raw.encode('utf-8')).hexdigest()\n alert['dedup_key_already_exists'] = False\n keyed.append(alert)\n continue\n strict_text = '. '.join(str(alert.get(f, ''))[:max_len] for f in strict_fields)\n lsh_text = '. 
'.join(normalize_uri(str(alert.get(f, ''))[:max_len]) for f in lsh_fields)\n cur_sh = shingles(lsh_text)\n matched_key = None\n best_sim = 0.0\n for ek, (ex_strict, ex_sh) in registry.items():\n if ex_strict != strict_text:\n continue\n sim = jaccard(cur_sh, ex_sh)\n if sim >= threshold and sim > best_sim:\n best_sim = sim\n matched_key = ek\n if matched_key is None:\n raw_key = hashlib.md5(f'{strict_text}. {lsh_text}'.encode('utf-8')).hexdigest()\n registry[raw_key] = (strict_text, cur_sh)\n canonical = raw_key\n alert['dedup_key_already_exists'] = False\n else:\n canonical = matched_key\n alert['dedup_key_already_exists'] = True\n alert['dedup_key'] = canonical\n keyed.append(alert)\n\nseen = {}\nunique_alerts = []\nfor a in keyed:\n k = a['dedup_key']\n if k not in seen:\n seen[k] = a\n unique_alerts.append(a)\n\ndup_count = len(keyed) - len(unique_alerts)\nprint(f'[dedup] input={len(filtered_alerts)}, unique={len(unique_alerts)}, dup={dup_count}')\n\nstats['after_dedup_count'] = len(keyed)\nstats['unique_key_count'] = len(unique_alerts)\nstats['dedup_removed_count'] = dup_count\nif len(keyed) > 0:\n stats['dedup_ratio'] = round(dup_count / len(keyed), 4)\nelse:\n stats['dedup_ratio'] = 0.0\n\nsummary = (\n f'network_alert_dedup 完成: 输入 {stats.get(\"raw_count\", 0)} 条'\n f' → 归一化 {stats.get(\"normalized_count\", 0)} 条'\n f' → 过滤后 {stats.get(\"after_filter_count\", 0)} 条'\n f' → 去重后 {len(unique_alerts)} 个唯一簇(压缩率 {stats[\"dedup_ratio\"]:.1%})'\n)\nprint(f'[dedup] {summary}')\n\noutputs['deduped_alerts'] = keyed\noutputs['unique_alerts'] = unique_alerts\noutputs['stats'] = stats\noutputs['dedup_summary'] = summary" + "code": "import re\nimport hashlib\n\ndef normalize_uri(text):\n if not text or str(text) == 'none':\n return str(text or '')\n t = str(text)\n t = re.sub(r'\\d{4}[-/]\\d{1,2}[-/]\\d{1,2}(?:[T ]\\d{2}:\\d{2}(?::\\d{2})?)?', 'DATETIME', t)\n t = re.sub(r'[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}', 'UUID', t)\n t = 
re.sub(r'\\b\\d{6,}\\b', 'NUM', t)\n t = re.sub(r'(?:\\.\\./|\\.\\\\\\\\)+', '../', t)\n t = re.sub(r'%00', 'NULL', t)\n t = re.sub(r'(?:%[0-9a-fA-F]{2}){3,}', 'ENCODED', t)\n return t\n\ndef shingles(text, k=5):\n t = str(text or '').lower()\n if len(t) < k:\n return frozenset([t]) if t else frozenset()\n return frozenset(t[i:i+k] for i in range(len(t) - k + 1))\n\ndef jaccard(a, b):\n if not a and not b:\n return 1.0\n if not a or not b:\n return 0.0\n return len(a & b) / len(a | b)\n\nfiltered_alerts = inputs.get('filtered_alerts', [])\ndedup_enabled = inputs.get('dedup_enabled', True)\nthreshold = float(inputs.get('dedup_threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_len = int(inputs.get('max_field_len', 500))\nstats = dict(inputs.get('stats', {}))\n\nregistry = {}\nkeyed = []\nfor alert in filtered_alerts:\n alert = dict(alert)\n if not dedup_enabled:\n raw = ''.join(str(alert.get(f, ''))[:max_len] for f in strict_fields + lsh_fields)\n alert['dedup_key'] = hashlib.md5(raw.encode('utf-8')).hexdigest()\n alert['dedup_key_already_exists'] = False\n keyed.append(alert)\n continue\n strict_text = '. '.join(str(alert.get(f, ''))[:max_len] for f in strict_fields)\n lsh_text = '. '.join(normalize_uri(str(alert.get(f, ''))[:max_len]) for f in lsh_fields)\n cur_sh = shingles(lsh_text)\n matched_key = None\n best_sim = 0.0\n for ek, (ex_strict, ex_sh) in registry.items():\n if ex_strict != strict_text:\n continue\n sim = jaccard(cur_sh, ex_sh)\n if sim >= threshold and sim > best_sim:\n best_sim = sim\n matched_key = ek\n if matched_key is None:\n raw_key = hashlib.md5(f'{strict_text}. 
{lsh_text}'.encode('utf-8')).hexdigest()\n registry[raw_key] = (strict_text, cur_sh)\n canonical = raw_key\n alert['dedup_key_already_exists'] = False\n else:\n canonical = matched_key\n alert['dedup_key_already_exists'] = True\n alert['dedup_key'] = canonical\n keyed.append(alert)\n\nseen = {}\nunique_alerts = []\nfor a in keyed:\n k = a['dedup_key']\n if k not in seen:\n seen[k] = a\n unique_alerts.append(a)\n\ndup_count = len(keyed) - len(unique_alerts)\nprint(f'[dedup] input={len(filtered_alerts)}, unique={len(unique_alerts)}, dup={dup_count}')\n\nstats['after_dedup_count'] = len(keyed)\nstats['unique_key_count'] = len(unique_alerts)\nstats['dedup_removed_count'] = dup_count\nif len(keyed) > 0:\n stats['dedup_ratio'] = round(dup_count / len(keyed), 4)\nelse:\n stats['dedup_ratio'] = 0.0\n\nsummary = (\n f'http_alert_dedup 完成: 输入 {stats.get(\"raw_count\", 0)} 条'\n f' → 归一化 {stats.get(\"normalized_count\", 0)} 条'\n f' → 过滤后 {stats.get(\"after_filter_count\", 0)} 条'\n f' → 去重后 {len(unique_alerts)} 个唯一簇(压缩率 {stats[\"dedup_ratio\"]:.1%})'\n)\nprint(f'[dedup] {summary}')\n\noutputs['deduped_alerts'] = keyed\noutputs['unique_alerts'] = unique_alerts\noutputs['stats'] = stats\noutputs['dedup_summary'] = summary" } ], "edges": [ diff --git a/.flocks/plugins/workflows/network_alert_dedup/workflow.md b/.flocks/plugins/workflows/http_alert_dedup/workflow.md similarity index 99% rename from .flocks/plugins/workflows/network_alert_dedup/workflow.md rename to .flocks/plugins/workflows/http_alert_dedup/workflow.md index 76fd7fd6a..3fe34cdf9 100644 --- a/.flocks/plugins/workflows/network_alert_dedup/workflow.md +++ b/.flocks/plugins/workflows/http_alert_dedup/workflow.md @@ -1,4 +1,4 @@ -# network_alert_dedup +# http_alert_dedup 网络告警去重 Pipeline,三阶段处理:**归一化 → 过滤 → 去重**。 From 248aa19bc57129d43cf55464d4281e226f5a6272 Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Fri, 8 May 2026 17:15:40 +0800 Subject: [PATCH 09/41] refactor(workflow): remove branch_has_alerts and dedup_empty 
from http_alert_dedup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit filter_logs 直接连接 dedup_logs,dedup_logs 自然处理空列表输入 Co-authored-by: Cursor --- .../workflows/http_alert_dedup/workflow.json | 20 +++---------------- .../workflows/http_alert_dedup/workflow.md | 14 +++---------- 2 files changed, 6 insertions(+), 28 deletions(-) diff --git a/.flocks/plugins/workflows/http_alert_dedup/workflow.json b/.flocks/plugins/workflows/http_alert_dedup/workflow.json index 8b36a2cb4..e3e481880 100644 --- a/.flocks/plugins/workflows/http_alert_dedup/workflow.json +++ b/.flocks/plugins/workflows/http_alert_dedup/workflow.json @@ -31,20 +31,8 @@ { "id": "filter_logs", "type": "python", - "description": "Step 2 — 过滤:9 种 process_type 分类;保留 non-scan + HTTP(任意方向 in/out/lateral)的告警;输出 _has_alerts 供后续分支路由", - "code": "normalized_alerts = inputs.get('normalized_alerts', [])\nfilter_enabled = inputs.get('filter_enabled', True)\nsource_log_type = str(inputs.get('source_log_type', 'tdp')).lower()\nstats = dict(inputs.get('stats', {}))\n\ndef is_scan_alert(threat_name):\n tnl = str(threat_name or '').lower()\n return ('扫描' in tnl) and ('webshell' not in tnl)\n\ndef get_threat_type(alert, source):\n if source == 'skyeye':\n return str(alert.get('threat_type', 'general') or 'general')\n return str(alert.get('threat_name', 'general') or 'general')\n\ndef is_http(alert):\n for field in ('application_layer_protocol', 'net_type', 'net_app_proto'):\n val = str(alert.get(field, '') or '').lower()\n if val and val != 'none' and 'http' in val:\n return True\n return False\n\ndef get_process_type(alert, source):\n threat_name = alert.get('threat_name', '')\n direction = str(alert.get('direction', '') or '').lower()\n scan = is_scan_alert(threat_name)\n http = is_http(alert)\n if source == 'skyeye':\n return 'alert_scan_direction_in' if scan else 'alert_not_scan_http_direction_in'\n if scan:\n return f'alert_scan_direction_{direction}' if direction in ('in', 
'out', 'lateral') else 'alert_scan_direction_in'\n if http:\n return f'alert_not_scan_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_scan_http_direction_in'\n return f'alert_not_scan_not_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_process'\n\nNEED_ANALYSIS = {\n 'alert_not_scan_http_direction_in',\n 'alert_not_scan_http_direction_out',\n 'alert_not_scan_http_direction_lateral',\n}\n\nfiltered = []\nprocess_type_counts = {}\nfor alert in normalized_alerts:\n alert = dict(alert)\n if filter_enabled:\n ptype = get_process_type(alert, source_log_type)\n need = ptype in NEED_ANALYSIS\n threat_type = get_threat_type(alert, source_log_type)\n else:\n ptype = 'filter_disabled'\n need = True\n threat_type = get_threat_type(alert, source_log_type)\n process_type_counts[ptype] = process_type_counts.get(ptype, 0) + 1\n alert['_process_type'] = ptype\n alert['_threat_type'] = threat_type\n if need:\n filtered.append(alert)\n\nhas_alerts = len(filtered) > 0\nprint(f'[filter] input={len(normalized_alerts)}, kept={len(filtered)}, has_alerts={has_alerts}')\nprint(f'[filter] process_type_counts={process_type_counts}')\n\nstats['after_filter_count'] = len(filtered)\nstats['filter_removed_count'] = len(normalized_alerts) - len(filtered)\nstats['filter_process_type_counts'] = process_type_counts\n\noutputs['filtered_alerts'] = filtered\noutputs['_has_alerts'] = has_alerts\noutputs['stats'] = stats\nfor k in ['dedup_enabled', 'dedup_threshold', 'strict_fields', 'lsh_fields', 'max_field_len']:\n outputs[k] = inputs.get(k)" - }, - { - "id": "branch_has_alerts", - "type": "branch", - "select_key": "_has_alerts", - "description": "按 _has_alerts 路由:True → dedup_logs(继续去重);False → dedup_empty(无告警,直接返回空结果)" - }, - { - "id": "dedup_empty", - "type": "python", - "description": "过滤后无告警时的终点:直接返回空的去重结果 dict,格式与 dedup_logs 输出保持一致", - "code": "stats = dict(inputs.get('stats', {}))\nstats.setdefault('after_dedup_count', 
0)\nstats.setdefault('unique_key_count', 0)\nstats.setdefault('dedup_removed_count', 0)\nstats['dedup_ratio'] = 0.0\n\nsummary = (\n f'http_alert_dedup 完成(无告警): 输入 {stats.get(\"raw_count\", 0)} 条'\n f' → 归一化 {stats.get(\"normalized_count\", 0)} 条'\n f' → 过滤后 0 条,跳过去重'\n)\nprint(f'[dedup_empty] {summary}')\n\noutputs['deduped_alerts'] = []\noutputs['unique_alerts'] = []\noutputs['stats'] = stats\noutputs['dedup_summary'] = summary" + "description": "Step 2 — 过滤:9 种 process_type 分类;保留 non-scan + HTTP(任意方向 in/out/lateral)的告警", + "code": "normalized_alerts = inputs.get('normalized_alerts', [])\nfilter_enabled = inputs.get('filter_enabled', True)\nsource_log_type = str(inputs.get('source_log_type', 'tdp')).lower()\nstats = dict(inputs.get('stats', {}))\n\ndef is_scan_alert(threat_name):\n tnl = str(threat_name or '').lower()\n return ('扫描' in tnl) and ('webshell' not in tnl)\n\ndef get_threat_type(alert, source):\n if source == 'skyeye':\n return str(alert.get('threat_type', 'general') or 'general')\n return str(alert.get('threat_name', 'general') or 'general')\n\ndef is_http(alert):\n for field in ('application_layer_protocol', 'net_type', 'net_app_proto'):\n val = str(alert.get(field, '') or '').lower()\n if val and val != 'none' and 'http' in val:\n return True\n return False\n\ndef get_process_type(alert, source):\n threat_name = alert.get('threat_name', '')\n direction = str(alert.get('direction', '') or '').lower()\n scan = is_scan_alert(threat_name)\n http = is_http(alert)\n if source == 'skyeye':\n return 'alert_scan_direction_in' if scan else 'alert_not_scan_http_direction_in'\n if scan:\n return f'alert_scan_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_scan_direction_in'\n if http:\n return f'alert_not_scan_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_scan_http_direction_in'\n return f'alert_not_scan_not_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 
'alert_not_process'\n\nNEED_ANALYSIS = {\n 'alert_not_scan_http_direction_in',\n 'alert_not_scan_http_direction_out',\n 'alert_not_scan_http_direction_lateral',\n}\n\nfiltered = []\nprocess_type_counts = {}\nfor alert in normalized_alerts:\n alert = dict(alert)\n if filter_enabled:\n ptype = get_process_type(alert, source_log_type)\n need = ptype in NEED_ANALYSIS\n threat_type = get_threat_type(alert, source_log_type)\n else:\n ptype = 'filter_disabled'\n need = True\n threat_type = get_threat_type(alert, source_log_type)\n process_type_counts[ptype] = process_type_counts.get(ptype, 0) + 1\n alert['_process_type'] = ptype\n alert['_threat_type'] = threat_type\n if need:\n filtered.append(alert)\n\nprint(f'[filter] input={len(normalized_alerts)}, kept={len(filtered)}')\nprint(f'[filter] process_type_counts={process_type_counts}')\n\nstats['after_filter_count'] = len(filtered)\nstats['filter_removed_count'] = len(normalized_alerts) - len(filtered)\nstats['filter_process_type_counts'] = process_type_counts\n\noutputs['filtered_alerts'] = filtered\noutputs['stats'] = stats\nfor k in ['dedup_enabled', 'dedup_threshold', 'strict_fields', 'lsh_fields', 'max_field_len']:\n outputs[k] = inputs.get(k)" }, { "id": "dedup_logs", @@ -59,9 +47,7 @@ {"from": "branch_log_type", "to": "normalize_skyeye", "label": "skyeye", "order": 1}, {"from": "normalize_tdp", "to": "filter_logs", "order": 0}, {"from": "normalize_skyeye", "to": "filter_logs", "order": 0}, - {"from": "filter_logs", "to": "branch_has_alerts", "order": 0}, - {"from": "branch_has_alerts", "to": "dedup_logs", "label": "true", "order": 0}, - {"from": "branch_has_alerts", "to": "dedup_empty", "label": "false", "order": 1} + {"from": "filter_logs", "to": "dedup_logs", "order": 0} ], "metadata": { "node_timeout_s": 300, diff --git a/.flocks/plugins/workflows/http_alert_dedup/workflow.md b/.flocks/plugins/workflows/http_alert_dedup/workflow.md index 3fe34cdf9..eb272c3d2 100644 --- 
a/.flocks/plugins/workflows/http_alert_dedup/workflow.md +++ b/.flocks/plugins/workflows/http_alert_dedup/workflow.md @@ -15,9 +15,7 @@ branch_log_type │ filter_logs │ - branch_has_alerts - ├─ true ─→ dedup_logs ◀── 终点,输出 dict - └─ false ─→ dedup_empty ◀── 终点,输出空 dict + dedup_logs ◀── 终点,输出 dict ``` ## 输入参数 @@ -88,7 +86,7 @@ branch_log_type | `threat_result` | `attack_result` | ### filter_logs -基于 `process_type` 的 9 类分类过滤,输出 `_has_alerts` 布尔值供后续分支路由: +基于 `process_type` 的 9 类分类过滤: | process_type | 保留/过滤 | |-------------|----------| @@ -99,13 +97,7 @@ branch_log_type | `alert_not_scan_not_http_*` | ❌ 过滤(非 HTTP) | | `alert_not_process` | ❌ 过滤(其他) | -### branch_has_alerts -按 `_has_alerts` 路由:`true` → `dedup_logs`;`false` → `dedup_empty`。 - -### dedup_empty(终点 — 无告警路径) -过滤后无告警时直接返回空结果 dict,格式与 `dedup_logs` 输出一致(`deduped_alerts=[]`,`unique_alerts=[]`,stats 补零)。 - -### dedup_logs(终点 — 有告警路径) +### dedup_logs(终点) **URI 归一化**(减少 LSH 字段噪音): From 7ef901bd31299fed0b8197df263314ba7b319f25 Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Fri, 8 May 2026 17:45:19 +0800 Subject: [PATCH 10/41] feat(workflow): replace brute-force Jaccard with MinHash LSH in dedup_logs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 使用 datasketch MinHash + MinHashLSH 替换原有 O(n²) 暴力 Jaccard: - NUM_PERM=128, MINHASH_SEED=2024(对齐 lsh_processor.py) - 共享 permutations,对齐 LSHProcessor 初始化方式 - query_most_similar:LSH 快速候选 → 精确 Jaccard 取最相似(对齐原版逻辑) - normalize_uri 对齐 utils.py(DATETIME/UUID/TRAVERSAL/NULL_REPLACED/HEXADECIMAL) - dedup_key = MD5(strict_text + '. 
' + lsh_key),对齐 LogDedup._generate_dedup_key_text - 实测 453 条 TDP 过滤后告警 → 360 个唯一簇,压缩率 20.5% Co-authored-by: Cursor --- .flocks/plugins/workflows/http_alert_dedup/workflow.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.flocks/plugins/workflows/http_alert_dedup/workflow.json b/.flocks/plugins/workflows/http_alert_dedup/workflow.json index e3e481880..d9a4e0b5c 100644 --- a/.flocks/plugins/workflows/http_alert_dedup/workflow.json +++ b/.flocks/plugins/workflows/http_alert_dedup/workflow.json @@ -37,8 +37,8 @@ { "id": "dedup_logs", "type": "python", - "description": "Step 3 — 去重(终点):URI 归一化 + 5-gram Jaccard 相似度聚类,为每条告警生成 dedup_key(MD5);输出 deduped_alerts(全量含重复标记)、unique_alerts(每簇代表) 及 stats", - "code": "import re\nimport hashlib\n\ndef normalize_uri(text):\n if not text or str(text) == 'none':\n return str(text or '')\n t = str(text)\n t = re.sub(r'\\d{4}[-/]\\d{1,2}[-/]\\d{1,2}(?:[T ]\\d{2}:\\d{2}(?::\\d{2})?)?', 'DATETIME', t)\n t = re.sub(r'[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}', 'UUID', t)\n t = re.sub(r'\\b\\d{6,}\\b', 'NUM', t)\n t = re.sub(r'(?:\\.\\./|\\.\\\\\\\\)+', '../', t)\n t = re.sub(r'%00', 'NULL', t)\n t = re.sub(r'(?:%[0-9a-fA-F]{2}){3,}', 'ENCODED', t)\n return t\n\ndef shingles(text, k=5):\n t = str(text or '').lower()\n if len(t) < k:\n return frozenset([t]) if t else frozenset()\n return frozenset(t[i:i+k] for i in range(len(t) - k + 1))\n\ndef jaccard(a, b):\n if not a and not b:\n return 1.0\n if not a or not b:\n return 0.0\n return len(a & b) / len(a | b)\n\nfiltered_alerts = inputs.get('filtered_alerts', [])\ndedup_enabled = inputs.get('dedup_enabled', True)\nthreshold = float(inputs.get('dedup_threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_len = int(inputs.get('max_field_len', 500))\nstats = dict(inputs.get('stats', {}))\n\nregistry = {}\nkeyed = []\nfor alert in 
filtered_alerts:\n alert = dict(alert)\n if not dedup_enabled:\n raw = ''.join(str(alert.get(f, ''))[:max_len] for f in strict_fields + lsh_fields)\n alert['dedup_key'] = hashlib.md5(raw.encode('utf-8')).hexdigest()\n alert['dedup_key_already_exists'] = False\n keyed.append(alert)\n continue\n strict_text = '. '.join(str(alert.get(f, ''))[:max_len] for f in strict_fields)\n lsh_text = '. '.join(normalize_uri(str(alert.get(f, ''))[:max_len]) for f in lsh_fields)\n cur_sh = shingles(lsh_text)\n matched_key = None\n best_sim = 0.0\n for ek, (ex_strict, ex_sh) in registry.items():\n if ex_strict != strict_text:\n continue\n sim = jaccard(cur_sh, ex_sh)\n if sim >= threshold and sim > best_sim:\n best_sim = sim\n matched_key = ek\n if matched_key is None:\n raw_key = hashlib.md5(f'{strict_text}. {lsh_text}'.encode('utf-8')).hexdigest()\n registry[raw_key] = (strict_text, cur_sh)\n canonical = raw_key\n alert['dedup_key_already_exists'] = False\n else:\n canonical = matched_key\n alert['dedup_key_already_exists'] = True\n alert['dedup_key'] = canonical\n keyed.append(alert)\n\nseen = {}\nunique_alerts = []\nfor a in keyed:\n k = a['dedup_key']\n if k not in seen:\n seen[k] = a\n unique_alerts.append(a)\n\ndup_count = len(keyed) - len(unique_alerts)\nprint(f'[dedup] input={len(filtered_alerts)}, unique={len(unique_alerts)}, dup={dup_count}')\n\nstats['after_dedup_count'] = len(keyed)\nstats['unique_key_count'] = len(unique_alerts)\nstats['dedup_removed_count'] = dup_count\nif len(keyed) > 0:\n stats['dedup_ratio'] = round(dup_count / len(keyed), 4)\nelse:\n stats['dedup_ratio'] = 0.0\n\nsummary = (\n f'http_alert_dedup 完成: 输入 {stats.get(\"raw_count\", 0)} 条'\n f' → 归一化 {stats.get(\"normalized_count\", 0)} 条'\n f' → 过滤后 {stats.get(\"after_filter_count\", 0)} 条'\n f' → 去重后 {len(unique_alerts)} 个唯一簇(压缩率 {stats[\"dedup_ratio\"]:.1%})'\n)\nprint(f'[dedup] {summary}')\n\noutputs['deduped_alerts'] = keyed\noutputs['unique_alerts'] = unique_alerts\noutputs['stats'] = 
stats\noutputs['dedup_summary'] = summary" + "description": "Step 3 — 去重(终点):URI 归一化 + MinHash LSH(datasketch,128 排列,5-gram shingle)聚类,对齐 lsh_processor.py;为每条告警生成 dedup_key(MD5);输出 deduped_alerts / unique_alerts / stats", + "code": "import re\nimport hashlib\nfrom datasketch import MinHash, MinHashLSH\n\nMINHASH_SEED = 2024\nNUM_PERM = 128\n\ndef normalize_uri(uri):\n uri = str(uri or '')\n uri = re.sub(r'\\d{4}-\\d{2}-\\d{2}', 'DATETIME', uri)\n uri = re.sub(r'[\\da-f]{8}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{12}', 'UUID', uri)\n uri = re.sub(r'(\\.\\./)+', 'TRAVERSAL', uri)\n uri = re.sub(r'\\bNULL\\b', 'NULL_REPLACED', uri)\n uri = re.sub(r'chr\\$\\d+\\$\\|\\|chr\\$\\d+\\$', 'CHR_SEQUENCE', uri)\n uri = re.sub(r'\\b\\d+={1,2}\\d+\\b', 'NUMBER_COMPARISON', uri)\n uri = re.sub(r'\\b[a-fA-F0-9]{32}\\b', 'HEXADECIMAL CHARACTERS', uri)\n return uri\n\ndef gen_minhash(text, num_perm, seed, permutations):\n shingles = [text[i:i+5] for i in range(len(text) - 4)]\n m = MinHash(num_perm=num_perm, seed=seed, permutations=permutations)\n for s in shingles:\n m.update(s.encode('utf-8'))\n return m\n\nfiltered_alerts = inputs.get('filtered_alerts', [])\ndedup_enabled = inputs.get('dedup_enabled', True)\nthreshold = float(inputs.get('dedup_threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_len = int(inputs.get('max_field_len', 500))\nstats = dict(inputs.get('stats', {}))\n\n# 共享排列参数(对齐 LSHProcessor.__init__ 中的 self.permutations)\n_template_minhash = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED)\n_permutations = _template_minhash.permutations\n\n# 初始化 MinHashLSH(对齐 LSHProcessor.__init__)\nlsh_index = MinHashLSH(threshold=threshold, num_perm=NUM_PERM)\nlsh_cache = {} # int key -> {'minhash': MinHash}\n\ndef query_most_similar(minhash):\n \"\"\"对齐 LSHProcessor.query_most_similar:先用 LSH 找候选,再精确比较 Jaccard 取最相似\"\"\"\n sim_keys = lsh_index.query(minhash)\n if 
sim_keys:\n candidates = sim_keys[:100]\n sims = [minhash.jaccard(lsh_cache[k]['minhash']) for k in candidates]\n return candidates[sims.index(max(sims))]\n else:\n new_key = len(lsh_cache)\n lsh_index.insert(new_key, minhash)\n lsh_cache[new_key] = {'minhash': minhash}\n return new_key\n\ndedup_key_seen = set()\nkeyed = []\nfor alert in filtered_alerts:\n alert = dict(alert)\n text_strict = '. '.join(str(alert.get(f, ''))[:max_len] for f in strict_fields)\n text_lsh = normalize_uri('. '.join(str(alert.get(f, ''))[:max_len] for f in lsh_fields))\n\n if not dedup_enabled:\n raw = f'{text_strict}. {text_lsh}'\n dk = hashlib.md5(raw.encode('utf-8')).hexdigest()\n alert['dedup_key'] = dk\n alert['dedup_key_already_exists'] = dk in dedup_key_seen\n dedup_key_seen.add(dk)\n keyed.append(alert)\n continue\n\n mh = gen_minhash(text_lsh.lower(), NUM_PERM, MINHASH_SEED, _permutations)\n lsh_key = query_most_similar(mh) # int,对应 LSHProcessor 中的缓存 key\n dedup_key_text = f'{text_strict}. {lsh_key}'\n dk = hashlib.md5(dedup_key_text.encode('utf-8')).hexdigest()\n\n alert['dedup_key'] = dk\n alert['dedup_key_already_exists'] = dk in dedup_key_seen\n dedup_key_seen.add(dk)\n keyed.append(alert)\n\nseen = {}\nunique_alerts = []\nfor a in keyed:\n k = a['dedup_key']\n if k not in seen:\n seen[k] = a\n unique_alerts.append(a)\n\ndup_count = len(keyed) - len(unique_alerts)\nprint(f'[dedup] input={len(filtered_alerts)}, unique_clusters={len(unique_alerts)}, deduped={dup_count}')\n\nstats['after_dedup_count'] = len(keyed)\nstats['unique_key_count'] = len(unique_alerts)\nstats['dedup_removed_count'] = dup_count\nstats['dedup_ratio'] = round(dup_count / len(keyed), 4) if keyed else 0.0\n\nsummary = (\n f'http_alert_dedup 完成: 输入 {stats.get(\"raw_count\", 0)} 条'\n f' → 归一化 {stats.get(\"normalized_count\", 0)} 条'\n f' → 过滤后 {stats.get(\"after_filter_count\", 0)} 条'\n f' → 去重后 {len(unique_alerts)} 个唯一簇(压缩率 {stats[\"dedup_ratio\"]:.1%})'\n)\nprint(f'[dedup] 
{summary}')\n\noutputs['deduped_alerts'] = keyed\noutputs['unique_alerts'] = unique_alerts\noutputs['stats'] = stats\noutputs['dedup_summary'] = summary" } ], "edges": [ From 89019d3408a50141d8c321c36d2c4e94a2c99f65 Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Fri, 8 May 2026 18:33:27 +0800 Subject: [PATCH 11/41] refactor(workflow): English comments and expose _lsh_cluster_id in dedup_logs - All Chinese comments replaced with English - cluster_id (int) written to alert['_lsh_cluster_id'] so callers can inspect LSH cluster membership - lsh_cache now stores MinHash directly (not wrapped in dict) for cleaner re-ranking - summary string switched to English Co-authored-by: Cursor --- .flocks/plugins/workflows/http_alert_dedup/workflow.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.flocks/plugins/workflows/http_alert_dedup/workflow.json b/.flocks/plugins/workflows/http_alert_dedup/workflow.json index d9a4e0b5c..bc069cb3e 100644 --- a/.flocks/plugins/workflows/http_alert_dedup/workflow.json +++ b/.flocks/plugins/workflows/http_alert_dedup/workflow.json @@ -38,7 +38,7 @@ "id": "dedup_logs", "type": "python", "description": "Step 3 — 去重(终点):URI 归一化 + MinHash LSH(datasketch,128 排列,5-gram shingle)聚类,对齐 lsh_processor.py;为每条告警生成 dedup_key(MD5);输出 deduped_alerts / unique_alerts / stats", - "code": "import re\nimport hashlib\nfrom datasketch import MinHash, MinHashLSH\n\nMINHASH_SEED = 2024\nNUM_PERM = 128\n\ndef normalize_uri(uri):\n uri = str(uri or '')\n uri = re.sub(r'\\d{4}-\\d{2}-\\d{2}', 'DATETIME', uri)\n uri = re.sub(r'[\\da-f]{8}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{12}', 'UUID', uri)\n uri = re.sub(r'(\\.\\./)+', 'TRAVERSAL', uri)\n uri = re.sub(r'\\bNULL\\b', 'NULL_REPLACED', uri)\n uri = re.sub(r'chr\\$\\d+\\$\\|\\|chr\\$\\d+\\$', 'CHR_SEQUENCE', uri)\n uri = re.sub(r'\\b\\d+={1,2}\\d+\\b', 'NUMBER_COMPARISON', uri)\n uri = re.sub(r'\\b[a-fA-F0-9]{32}\\b', 'HEXADECIMAL CHARACTERS', uri)\n return uri\n\ndef gen_minhash(text, 
num_perm, seed, permutations):\n shingles = [text[i:i+5] for i in range(len(text) - 4)]\n m = MinHash(num_perm=num_perm, seed=seed, permutations=permutations)\n for s in shingles:\n m.update(s.encode('utf-8'))\n return m\n\nfiltered_alerts = inputs.get('filtered_alerts', [])\ndedup_enabled = inputs.get('dedup_enabled', True)\nthreshold = float(inputs.get('dedup_threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_len = int(inputs.get('max_field_len', 500))\nstats = dict(inputs.get('stats', {}))\n\n# 共享排列参数(对齐 LSHProcessor.__init__ 中的 self.permutations)\n_template_minhash = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED)\n_permutations = _template_minhash.permutations\n\n# 初始化 MinHashLSH(对齐 LSHProcessor.__init__)\nlsh_index = MinHashLSH(threshold=threshold, num_perm=NUM_PERM)\nlsh_cache = {} # int key -> {'minhash': MinHash}\n\ndef query_most_similar(minhash):\n \"\"\"对齐 LSHProcessor.query_most_similar:先用 LSH 找候选,再精确比较 Jaccard 取最相似\"\"\"\n sim_keys = lsh_index.query(minhash)\n if sim_keys:\n candidates = sim_keys[:100]\n sims = [minhash.jaccard(lsh_cache[k]['minhash']) for k in candidates]\n return candidates[sims.index(max(sims))]\n else:\n new_key = len(lsh_cache)\n lsh_index.insert(new_key, minhash)\n lsh_cache[new_key] = {'minhash': minhash}\n return new_key\n\ndedup_key_seen = set()\nkeyed = []\nfor alert in filtered_alerts:\n alert = dict(alert)\n text_strict = '. '.join(str(alert.get(f, ''))[:max_len] for f in strict_fields)\n text_lsh = normalize_uri('. '.join(str(alert.get(f, ''))[:max_len] for f in lsh_fields))\n\n if not dedup_enabled:\n raw = f'{text_strict}. 
{text_lsh}'\n dk = hashlib.md5(raw.encode('utf-8')).hexdigest()\n alert['dedup_key'] = dk\n alert['dedup_key_already_exists'] = dk in dedup_key_seen\n dedup_key_seen.add(dk)\n keyed.append(alert)\n continue\n\n mh = gen_minhash(text_lsh.lower(), NUM_PERM, MINHASH_SEED, _permutations)\n lsh_key = query_most_similar(mh) # int,对应 LSHProcessor 中的缓存 key\n dedup_key_text = f'{text_strict}. {lsh_key}'\n dk = hashlib.md5(dedup_key_text.encode('utf-8')).hexdigest()\n\n alert['dedup_key'] = dk\n alert['dedup_key_already_exists'] = dk in dedup_key_seen\n dedup_key_seen.add(dk)\n keyed.append(alert)\n\nseen = {}\nunique_alerts = []\nfor a in keyed:\n k = a['dedup_key']\n if k not in seen:\n seen[k] = a\n unique_alerts.append(a)\n\ndup_count = len(keyed) - len(unique_alerts)\nprint(f'[dedup] input={len(filtered_alerts)}, unique_clusters={len(unique_alerts)}, deduped={dup_count}')\n\nstats['after_dedup_count'] = len(keyed)\nstats['unique_key_count'] = len(unique_alerts)\nstats['dedup_removed_count'] = dup_count\nstats['dedup_ratio'] = round(dup_count / len(keyed), 4) if keyed else 0.0\n\nsummary = (\n f'http_alert_dedup 完成: 输入 {stats.get(\"raw_count\", 0)} 条'\n f' → 归一化 {stats.get(\"normalized_count\", 0)} 条'\n f' → 过滤后 {stats.get(\"after_filter_count\", 0)} 条'\n f' → 去重后 {len(unique_alerts)} 个唯一簇(压缩率 {stats[\"dedup_ratio\"]:.1%})'\n)\nprint(f'[dedup] {summary}')\n\noutputs['deduped_alerts'] = keyed\noutputs['unique_alerts'] = unique_alerts\noutputs['stats'] = stats\noutputs['dedup_summary'] = summary" + "code": "import re\nimport hashlib\nfrom datasketch import MinHash, MinHashLSH\n\nMINHASH_SEED = 2024\nNUM_PERM = 128\n\ndef normalize_uri(uri):\n # Normalize dynamic segments in URIs to reduce noise before shingling.\n # Mirrors utils.normalize_uri() in lsh_processor.py.\n uri = str(uri or '')\n uri = re.sub(r'\\d{4}-\\d{2}-\\d{2}', 'DATETIME', uri)\n uri = re.sub(r'[\\da-f]{8}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{12}', 'UUID', uri)\n uri = re.sub(r'(\\.\\./)+', 
'TRAVERSAL', uri)\n uri = re.sub(r'\\bNULL\\b', 'NULL_REPLACED', uri)\n uri = re.sub(r'chr\\$\\d+\\$\\|\\|chr\\$\\d+\\$', 'CHR_SEQUENCE', uri)\n uri = re.sub(r'\\b\\d+={1,2}\\d+\\b', 'NUMBER_COMPARISON', uri)\n uri = re.sub(r'\\b[a-fA-F0-9]{32}\\b', 'HEXADECIMAL CHARACTERS', uri)\n return uri\n\ndef gen_minhash(text, num_perm, seed, permutations):\n # Build a MinHash signature from 5-gram shingles of the text.\n # Mirrors LSHProcessor.gen_text_minhash(text, k=5).\n shingles = [text[i:i+5] for i in range(len(text) - 4)]\n m = MinHash(num_perm=num_perm, seed=seed, permutations=permutations)\n for s in shingles:\n m.update(s.encode('utf-8'))\n return m\n\nfiltered_alerts = inputs.get('filtered_alerts', [])\ndedup_enabled = inputs.get('dedup_enabled', True)\nthreshold = float(inputs.get('dedup_threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_len = int(inputs.get('max_field_len', 500))\nstats = dict(inputs.get('stats', {}))\n\n# Share permutation parameters across all MinHash objects for consistency.\n# Mirrors LSHProcessor.__init__: self.permutations = MinHash(...).permutations\n_template_minhash = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED)\n_permutations = _template_minhash.permutations\n\n# lsh_index: MinHashLSH band/row index — stores band hashes in memory, supports O(1) candidate lookup.\n# lsh_cache: maps integer cluster key → MinHash object, used for exact Jaccard re-ranking.\n# Both are ephemeral (in-memory per run). Mirrors LSHProcessor.lsh and LSHProcessor.lsh_hash_cache.\nlsh_index = MinHashLSH(threshold=threshold, num_perm=NUM_PERM)\nlsh_cache = {} # int cluster_id -> MinHash\n\ndef query_most_similar(minhash):\n # Mirrors LSHProcessor.query_most_similar():\n # 1. LSH candidate lookup (fast, approximate).\n # 2. Exact Jaccard re-ranking among top-100 candidates.\n # 3. 
If no candidate found, insert as a new cluster and return its id.\n # The returned integer (cluster_id) is stored in lsh_cache and used to build dedup_key_text.\n sim_keys = lsh_index.query(minhash)\n if sim_keys:\n candidates = sim_keys[:100]\n sims = [minhash.jaccard(lsh_cache[k]) for k in candidates]\n return candidates[sims.index(max(sims))]\n else:\n cluster_id = len(lsh_cache)\n lsh_index.insert(cluster_id, minhash)\n lsh_cache[cluster_id] = minhash # stored here; retrieved during re-ranking\n return cluster_id\n\ndedup_key_seen = set()\nkeyed = []\nfor alert in filtered_alerts:\n alert = dict(alert)\n text_strict = '. '.join(str(alert.get(f, ''))[:max_len] for f in strict_fields)\n text_lsh = normalize_uri('. '.join(str(alert.get(f, ''))[:max_len] for f in lsh_fields))\n\n if not dedup_enabled:\n raw = f'{text_strict}. {text_lsh}'\n dk = hashlib.md5(raw.encode('utf-8')).hexdigest()\n alert['dedup_key'] = dk\n alert['dedup_key_already_exists'] = dk in dedup_key_seen\n dedup_key_seen.add(dk)\n keyed.append(alert)\n continue\n\n mh = gen_minhash(text_lsh.lower(), NUM_PERM, MINHASH_SEED, _permutations)\n # cluster_id is an integer index into lsh_cache — it identifies the LSH cluster this alert belongs to.\n # It is written to '_lsh_cluster_id' on the alert so callers can inspect cluster membership.\n cluster_id = query_most_similar(mh)\n alert['_lsh_cluster_id'] = cluster_id\n\n # dedup_key = MD5(strict_fields_text + '.' + cluster_id)\n # Mirrors LogDedup._generate_dedup_key_text / hashlib_gen_md5_str.\n dedup_key_text = f'{text_strict}. 
{cluster_id}'\n dk = hashlib.md5(dedup_key_text.encode('utf-8')).hexdigest()\n alert['dedup_key'] = dk\n alert['dedup_key_already_exists'] = dk in dedup_key_seen\n dedup_key_seen.add(dk)\n keyed.append(alert)\n\nseen = {}\nunique_alerts = []\nfor a in keyed:\n k = a['dedup_key']\n if k not in seen:\n seen[k] = a\n unique_alerts.append(a)\n\ndup_count = len(keyed) - len(unique_alerts)\nprint(f'[dedup] input={len(filtered_alerts)}, unique_clusters={len(unique_alerts)}, deduped={dup_count}')\n\nstats['after_dedup_count'] = len(keyed)\nstats['unique_key_count'] = len(unique_alerts)\nstats['dedup_removed_count'] = dup_count\nstats['dedup_ratio'] = round(dup_count / len(keyed), 4) if keyed else 0.0\n\nsummary = (\n f'http_alert_dedup done: raw={stats.get(\"raw_count\", 0)}'\n f' -> normalized={stats.get(\"normalized_count\", 0)}'\n f' -> filtered={stats.get(\"after_filter_count\", 0)}'\n f' -> unique_clusters={len(unique_alerts)} (compression {stats[\"dedup_ratio\"]:.1%})'\n)\nprint(f'[dedup] {summary}')\n\noutputs['deduped_alerts'] = keyed\noutputs['unique_alerts'] = unique_alerts\noutputs['stats'] = stats\noutputs['dedup_summary'] = summary" } ], "edges": [ From ce4a920b367715adea79aed123246f193dda8f63 Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Fri, 8 May 2026 18:51:16 +0800 Subject: [PATCH 12/41] feat(workflow): persist LSH state to disk in dedup_logs LSH state (lsh_index + lsh_cache) is now saved to and loaded from: ~/.flocks/data/workflows/http_alert_dedup/lsh_state_np128_th{int(threshold*100)}.pkl (e.g. lsh_state_np128_th70.pkl for the default threshold 0.7) - get_lsh_state_path(): builds path via Config().get_data_path() - load_lsh_state(): loads pickle, validates num_perm/threshold params match - dump_lsh_state(): saves after each run; mirrors LogDedup.dump() - Filename encodes NUM_PERM and threshold so param changes auto-reset state - stats gains lsh_total_clusters and lsh_state_path fields - Verified: run2 correctly loads run1 clusters and accumulates new ones Co-authored-by: Cursor --- 
.flocks/plugins/workflows/http_alert_dedup/workflow.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.flocks/plugins/workflows/http_alert_dedup/workflow.json b/.flocks/plugins/workflows/http_alert_dedup/workflow.json index bc069cb3e..d4e9eedca 100644 --- a/.flocks/plugins/workflows/http_alert_dedup/workflow.json +++ b/.flocks/plugins/workflows/http_alert_dedup/workflow.json @@ -38,7 +38,7 @@ "id": "dedup_logs", "type": "python", "description": "Step 3 — 去重(终点):URI 归一化 + MinHash LSH(datasketch,128 排列,5-gram shingle)聚类,对齐 lsh_processor.py;为每条告警生成 dedup_key(MD5);输出 deduped_alerts / unique_alerts / stats", - "code": "import re\nimport hashlib\nfrom datasketch import MinHash, MinHashLSH\n\nMINHASH_SEED = 2024\nNUM_PERM = 128\n\ndef normalize_uri(uri):\n # Normalize dynamic segments in URIs to reduce noise before shingling.\n # Mirrors utils.normalize_uri() in lsh_processor.py.\n uri = str(uri or '')\n uri = re.sub(r'\\d{4}-\\d{2}-\\d{2}', 'DATETIME', uri)\n uri = re.sub(r'[\\da-f]{8}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{12}', 'UUID', uri)\n uri = re.sub(r'(\\.\\./)+', 'TRAVERSAL', uri)\n uri = re.sub(r'\\bNULL\\b', 'NULL_REPLACED', uri)\n uri = re.sub(r'chr\\$\\d+\\$\\|\\|chr\\$\\d+\\$', 'CHR_SEQUENCE', uri)\n uri = re.sub(r'\\b\\d+={1,2}\\d+\\b', 'NUMBER_COMPARISON', uri)\n uri = re.sub(r'\\b[a-fA-F0-9]{32}\\b', 'HEXADECIMAL CHARACTERS', uri)\n return uri\n\ndef gen_minhash(text, num_perm, seed, permutations):\n # Build a MinHash signature from 5-gram shingles of the text.\n # Mirrors LSHProcessor.gen_text_minhash(text, k=5).\n shingles = [text[i:i+5] for i in range(len(text) - 4)]\n m = MinHash(num_perm=num_perm, seed=seed, permutations=permutations)\n for s in shingles:\n m.update(s.encode('utf-8'))\n return m\n\nfiltered_alerts = inputs.get('filtered_alerts', [])\ndedup_enabled = inputs.get('dedup_enabled', True)\nthreshold = float(inputs.get('dedup_threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 
'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_len = int(inputs.get('max_field_len', 500))\nstats = dict(inputs.get('stats', {}))\n\n# Share permutation parameters across all MinHash objects for consistency.\n# Mirrors LSHProcessor.__init__: self.permutations = MinHash(...).permutations\n_template_minhash = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED)\n_permutations = _template_minhash.permutations\n\n# lsh_index: MinHashLSH band/row index — stores band hashes in memory, supports O(1) candidate lookup.\n# lsh_cache: maps integer cluster key → MinHash object, used for exact Jaccard re-ranking.\n# Both are ephemeral (in-memory per run). Mirrors LSHProcessor.lsh and LSHProcessor.lsh_hash_cache.\nlsh_index = MinHashLSH(threshold=threshold, num_perm=NUM_PERM)\nlsh_cache = {} # int cluster_id -> MinHash\n\ndef query_most_similar(minhash):\n # Mirrors LSHProcessor.query_most_similar():\n # 1. LSH candidate lookup (fast, approximate).\n # 2. Exact Jaccard re-ranking among top-100 candidates.\n # 3. If no candidate found, insert as a new cluster and return its id.\n # The returned integer (cluster_id) is stored in lsh_cache and used to build dedup_key_text.\n sim_keys = lsh_index.query(minhash)\n if sim_keys:\n candidates = sim_keys[:100]\n sims = [minhash.jaccard(lsh_cache[k]) for k in candidates]\n return candidates[sims.index(max(sims))]\n else:\n cluster_id = len(lsh_cache)\n lsh_index.insert(cluster_id, minhash)\n lsh_cache[cluster_id] = minhash # stored here; retrieved during re-ranking\n return cluster_id\n\ndedup_key_seen = set()\nkeyed = []\nfor alert in filtered_alerts:\n alert = dict(alert)\n text_strict = '. '.join(str(alert.get(f, ''))[:max_len] for f in strict_fields)\n text_lsh = normalize_uri('. '.join(str(alert.get(f, ''))[:max_len] for f in lsh_fields))\n\n if not dedup_enabled:\n raw = f'{text_strict}. 
{text_lsh}'\n dk = hashlib.md5(raw.encode('utf-8')).hexdigest()\n alert['dedup_key'] = dk\n alert['dedup_key_already_exists'] = dk in dedup_key_seen\n dedup_key_seen.add(dk)\n keyed.append(alert)\n continue\n\n mh = gen_minhash(text_lsh.lower(), NUM_PERM, MINHASH_SEED, _permutations)\n # cluster_id is an integer index into lsh_cache — it identifies the LSH cluster this alert belongs to.\n # It is written to '_lsh_cluster_id' on the alert so callers can inspect cluster membership.\n cluster_id = query_most_similar(mh)\n alert['_lsh_cluster_id'] = cluster_id\n\n # dedup_key = MD5(strict_fields_text + '.' + cluster_id)\n # Mirrors LogDedup._generate_dedup_key_text / hashlib_gen_md5_str.\n dedup_key_text = f'{text_strict}. {cluster_id}'\n dk = hashlib.md5(dedup_key_text.encode('utf-8')).hexdigest()\n alert['dedup_key'] = dk\n alert['dedup_key_already_exists'] = dk in dedup_key_seen\n dedup_key_seen.add(dk)\n keyed.append(alert)\n\nseen = {}\nunique_alerts = []\nfor a in keyed:\n k = a['dedup_key']\n if k not in seen:\n seen[k] = a\n unique_alerts.append(a)\n\ndup_count = len(keyed) - len(unique_alerts)\nprint(f'[dedup] input={len(filtered_alerts)}, unique_clusters={len(unique_alerts)}, deduped={dup_count}')\n\nstats['after_dedup_count'] = len(keyed)\nstats['unique_key_count'] = len(unique_alerts)\nstats['dedup_removed_count'] = dup_count\nstats['dedup_ratio'] = round(dup_count / len(keyed), 4) if keyed else 0.0\n\nsummary = (\n f'http_alert_dedup done: raw={stats.get(\"raw_count\", 0)}'\n f' -> normalized={stats.get(\"normalized_count\", 0)}'\n f' -> filtered={stats.get(\"after_filter_count\", 0)}'\n f' -> unique_clusters={len(unique_alerts)} (compression {stats[\"dedup_ratio\"]:.1%})'\n)\nprint(f'[dedup] {summary}')\n\noutputs['deduped_alerts'] = keyed\noutputs['unique_alerts'] = unique_alerts\noutputs['stats'] = stats\noutputs['dedup_summary'] = summary" + "code": "import os\nimport re\nimport pickle\nimport hashlib\nfrom datasketch import MinHash, MinHashLSH\nfrom 
flocks.config import Config\n\nMINHASH_SEED = 2024\nNUM_PERM = 128\nWORKFLOW_NAME = 'http_alert_dedup'\n\ndef normalize_uri(uri):\n # Normalize dynamic segments in URIs to reduce noise before shingling.\n # Mirrors utils.normalize_uri() in lsh_processor.py.\n uri = str(uri or '')\n uri = re.sub(r'\\d{4}-\\d{2}-\\d{2}', 'DATETIME', uri)\n uri = re.sub(r'[\\da-f]{8}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{12}', 'UUID', uri)\n uri = re.sub(r'(\\.\\./)+', 'TRAVERSAL', uri)\n uri = re.sub(r'\\bNULL\\b', 'NULL_REPLACED', uri)\n uri = re.sub(r'chr\\$\\d+\\$\\|\\|chr\\$\\d+\\$', 'CHR_SEQUENCE', uri)\n uri = re.sub(r'\\b\\d+={1,2}\\d+\\b', 'NUMBER_COMPARISON', uri)\n uri = re.sub(r'\\b[a-fA-F0-9]{32}\\b', 'HEXADECIMAL CHARACTERS', uri)\n return uri\n\ndef gen_minhash(text, permutations):\n # Build a MinHash signature from 5-gram shingles of the text.\n # Mirrors LSHProcessor.gen_text_minhash(text, k=5).\n shingles = [text[i:i+5] for i in range(len(text) - 4)]\n m = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED, permutations=permutations)\n for s in shingles:\n m.update(s.encode('utf-8'))\n return m\n\ndef get_lsh_state_path(threshold):\n # Persist LSH state under ~/.flocks/data/workflows//lsh_state.pkl.\n # Mirrors LogDedup.dump_file — keeps state across restarts.\n data_dir = str(Config().get_data_path())\n state_dir = os.path.join(data_dir, 'workflows', WORKFLOW_NAME)\n os.makedirs(state_dir, exist_ok=True)\n return os.path.join(state_dir, f'lsh_state_np{NUM_PERM}_th{int(threshold*100)}.pkl')\n\ndef load_lsh_state(path, threshold):\n # Load persisted lsh_index + lsh_cache from disk.\n # If file is missing, empty, or incompatible, start fresh.\n if not os.path.exists(path) or os.path.getsize(path) == 0:\n return None, None\n try:\n with open(path, 'rb') as f:\n state = pickle.load(f)\n # Validate parameters match current run.\n if state.get('num_perm') != NUM_PERM or state.get('threshold') != threshold:\n print(f'[dedup] LSH state params mismatch, starting fresh 
(stored: {state.get(\"num_perm\")}/{state.get(\"threshold\")})')\n return None, None\n print(f'[dedup] Loaded LSH state: {len(state[\"lsh_cache\"])} existing clusters from {path}')\n return state['lsh_index'], state['lsh_cache']\n except Exception as e:\n print(f'[dedup] Failed to load LSH state ({e}), starting fresh')\n return None, None\n\ndef dump_lsh_state(path, lsh_index, lsh_cache, threshold):\n # Persist updated lsh_index + lsh_cache to disk.\n # Mirrors LSHProcessor.dump() / LogDedup.dump().\n try:\n state = {'lsh_index': lsh_index, 'lsh_cache': lsh_cache, 'num_perm': NUM_PERM, 'threshold': threshold}\n with open(path, 'wb') as f:\n pickle.dump(state, f)\n print(f'[dedup] LSH state saved: {len(lsh_cache)} clusters -> {path}')\n except Exception as e:\n print(f'[dedup] Failed to save LSH state: {e}')\n\nfiltered_alerts = inputs.get('filtered_alerts', [])\ndedup_enabled = inputs.get('dedup_enabled', True)\nthreshold = float(inputs.get('dedup_threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_len = int(inputs.get('max_field_len', 500))\nstats = dict(inputs.get('stats', {}))\n\n# Share permutation parameters across all MinHash objects for consistency.\n# Mirrors LSHProcessor.__init__: self.permutations = MinHash(...).permutations\n_permutations = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED).permutations\n\n# Load persisted LSH state if available, otherwise initialize fresh.\n# lsh_index: MinHashLSH band/row index — O(1) approximate candidate lookup.\n# lsh_cache: int cluster_id -> MinHash, used for exact Jaccard re-ranking.\nlsh_state_path = get_lsh_state_path(threshold)\nlsh_index, lsh_cache = load_lsh_state(lsh_state_path, threshold)\nif lsh_index is None:\n lsh_index = MinHashLSH(threshold=threshold, num_perm=NUM_PERM)\n lsh_cache = {}\n\ndef query_most_similar(minhash):\n # Mirrors LSHProcessor.query_most_similar():\n # 1. 
LSH candidate lookup (fast, approximate).\n # 2. Exact Jaccard re-ranking among top-100 candidates.\n # 3. If no candidate found, insert as a new cluster and return its id.\n sim_keys = lsh_index.query(minhash)\n if sim_keys:\n candidates = sim_keys[:100]\n sims = [minhash.jaccard(lsh_cache[k]) for k in candidates]\n return candidates[sims.index(max(sims))]\n cluster_id = len(lsh_cache)\n lsh_index.insert(cluster_id, minhash)\n lsh_cache[cluster_id] = minhash\n return cluster_id\n\ndedup_key_seen = set()\nkeyed = []\nfor alert in filtered_alerts:\n alert = dict(alert)\n text_strict = '. '.join(str(alert.get(f, ''))[:max_len] for f in strict_fields)\n text_lsh = normalize_uri('. '.join(str(alert.get(f, ''))[:max_len] for f in lsh_fields))\n\n if not dedup_enabled:\n raw = f'{text_strict}. {text_lsh}'\n dk = hashlib.md5(raw.encode('utf-8')).hexdigest()\n alert['dedup_key'] = dk\n alert['dedup_key_already_exists'] = dk in dedup_key_seen\n dedup_key_seen.add(dk)\n keyed.append(alert)\n continue\n\n mh = gen_minhash(text_lsh.lower(), _permutations)\n # cluster_id identifies the LSH cluster; written to alert for inspection.\n cluster_id = query_most_similar(mh)\n alert['_lsh_cluster_id'] = cluster_id\n\n # dedup_key = MD5(strict_fields_text + '.' + cluster_id)\n # Mirrors LogDedup._generate_dedup_key_text / hashlib_gen_md5_str.\n dk = hashlib.md5(f'{text_strict}. 
{cluster_id}'.encode('utf-8')).hexdigest()\n alert['dedup_key'] = dk\n alert['dedup_key_already_exists'] = dk in dedup_key_seen\n dedup_key_seen.add(dk)\n keyed.append(alert)\n\n# Persist updated LSH state to disk for reuse across runs.\nif dedup_enabled:\n dump_lsh_state(lsh_state_path, lsh_index, lsh_cache, threshold)\n\nseen = {}\nunique_alerts = []\nfor a in keyed:\n k = a['dedup_key']\n if k not in seen:\n seen[k] = a\n unique_alerts.append(a)\n\ndup_count = len(keyed) - len(unique_alerts)\nprint(f'[dedup] input={len(filtered_alerts)}, unique_clusters={len(unique_alerts)}, deduped={dup_count}')\n\nstats['after_dedup_count'] = len(keyed)\nstats['unique_key_count'] = len(unique_alerts)\nstats['dedup_removed_count'] = dup_count\nstats['dedup_ratio'] = round(dup_count / len(keyed), 4) if keyed else 0.0\nstats['lsh_total_clusters'] = len(lsh_cache)\nstats['lsh_state_path'] = lsh_state_path\n\nsummary = (\n f'http_alert_dedup done: raw={stats.get(\"raw_count\", 0)}'\n f' -> normalized={stats.get(\"normalized_count\", 0)}'\n f' -> filtered={stats.get(\"after_filter_count\", 0)}'\n f' -> unique_clusters={len(unique_alerts)} (compression {stats[\"dedup_ratio\"]:.1%})'\n f' | lsh_total={len(lsh_cache)}'\n)\nprint(f'[dedup] {summary}')\n\noutputs['deduped_alerts'] = keyed\noutputs['unique_alerts'] = unique_alerts\noutputs['stats'] = stats\noutputs['dedup_summary'] = summary" } ], "edges": [ From df2f3722df009d1d4b98e2cf7bc91848f463f9d5 Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Fri, 8 May 2026 19:01:52 +0800 Subject: [PATCH 13/41] fix(workflow): harden LSH state persistence in dedup_logs Four correctness fixes plus the datasketch dependency: 1. Persist dedup_key_cache (the set of MD5 keys ever seen) into the same pkl so that 'dedup_key_already_exists' is correct across restarts and batches. Previously it was a per-run local set, breaking cross-batch dedup detection. 2. Atomic write: pickle to .tmp, fsync, then os.replace() over the target. 
A crash mid-write no longer corrupts the persisted state. 3. fcntl.flock(LOCK_EX) on a sibling .lock file serializes load+modify+dump across concurrent workflow runs, eliminating the read-modify-write race that previously caused lost updates. 4. dedup_enabled=False branch now sets _lsh_cluster_id=None so that downstream consumers see a consistent schema regardless of dedup mode. In-batch duplicate detection still works in this branch. Also: warn when persisted cluster count exceeds 100k so operators can rotate state. New stat: lsh_total_dedup_keys. Verified end-to-end: - cross-batch: same alert in run2 reports already_exists=True - corruption: garbage pkl is auto-discarded, no crash - concurrent: 5 parallel processes writing 10 alerts each end up with 50 persisted clusters (not lost to read-modify-write) - disabled: _lsh_cluster_id=None, in-batch duplicate detected Co-authored-by: Cursor --- .../workflows/http_alert_dedup/workflow.json | 4 +- pyproject.toml | 1 + uv.lock | 38 ++++++++++++++++++- 3 files changed, 40 insertions(+), 3 deletions(-) diff --git a/.flocks/plugins/workflows/http_alert_dedup/workflow.json b/.flocks/plugins/workflows/http_alert_dedup/workflow.json index d4e9eedca..8c56a3330 100644 --- a/.flocks/plugins/workflows/http_alert_dedup/workflow.json +++ b/.flocks/plugins/workflows/http_alert_dedup/workflow.json @@ -37,8 +37,8 @@ { "id": "dedup_logs", "type": "python", - "description": "Step 3 — 去重(终点):URI 归一化 + MinHash LSH(datasketch,128 排列,5-gram shingle)聚类,对齐 lsh_processor.py;为每条告警生成 dedup_key(MD5);输出 deduped_alerts / unique_alerts / stats", - "code": "import os\nimport re\nimport pickle\nimport hashlib\nfrom datasketch import MinHash, MinHashLSH\nfrom flocks.config import Config\n\nMINHASH_SEED = 2024\nNUM_PERM = 128\nWORKFLOW_NAME = 'http_alert_dedup'\n\ndef normalize_uri(uri):\n # Normalize dynamic segments in URIs to reduce noise before shingling.\n # Mirrors utils.normalize_uri() in lsh_processor.py.\n uri = str(uri or '')\n uri = 
re.sub(r'\\d{4}-\\d{2}-\\d{2}', 'DATETIME', uri)\n uri = re.sub(r'[\\da-f]{8}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{12}', 'UUID', uri)\n uri = re.sub(r'(\\.\\./)+', 'TRAVERSAL', uri)\n uri = re.sub(r'\\bNULL\\b', 'NULL_REPLACED', uri)\n uri = re.sub(r'chr\\$\\d+\\$\\|\\|chr\\$\\d+\\$', 'CHR_SEQUENCE', uri)\n uri = re.sub(r'\\b\\d+={1,2}\\d+\\b', 'NUMBER_COMPARISON', uri)\n uri = re.sub(r'\\b[a-fA-F0-9]{32}\\b', 'HEXADECIMAL CHARACTERS', uri)\n return uri\n\ndef gen_minhash(text, permutations):\n # Build a MinHash signature from 5-gram shingles of the text.\n # Mirrors LSHProcessor.gen_text_minhash(text, k=5).\n shingles = [text[i:i+5] for i in range(len(text) - 4)]\n m = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED, permutations=permutations)\n for s in shingles:\n m.update(s.encode('utf-8'))\n return m\n\ndef get_lsh_state_path(threshold):\n # Persist LSH state under ~/.flocks/data/workflows//lsh_state.pkl.\n # Mirrors LogDedup.dump_file — keeps state across restarts.\n data_dir = str(Config().get_data_path())\n state_dir = os.path.join(data_dir, 'workflows', WORKFLOW_NAME)\n os.makedirs(state_dir, exist_ok=True)\n return os.path.join(state_dir, f'lsh_state_np{NUM_PERM}_th{int(threshold*100)}.pkl')\n\ndef load_lsh_state(path, threshold):\n # Load persisted lsh_index + lsh_cache from disk.\n # If file is missing, empty, or incompatible, start fresh.\n if not os.path.exists(path) or os.path.getsize(path) == 0:\n return None, None\n try:\n with open(path, 'rb') as f:\n state = pickle.load(f)\n # Validate parameters match current run.\n if state.get('num_perm') != NUM_PERM or state.get('threshold') != threshold:\n print(f'[dedup] LSH state params mismatch, starting fresh (stored: {state.get(\"num_perm\")}/{state.get(\"threshold\")})')\n return None, None\n print(f'[dedup] Loaded LSH state: {len(state[\"lsh_cache\"])} existing clusters from {path}')\n return state['lsh_index'], state['lsh_cache']\n except Exception as e:\n print(f'[dedup] Failed to load LSH 
state ({e}), starting fresh')\n return None, None\n\ndef dump_lsh_state(path, lsh_index, lsh_cache, threshold):\n # Persist updated lsh_index + lsh_cache to disk.\n # Mirrors LSHProcessor.dump() / LogDedup.dump().\n try:\n state = {'lsh_index': lsh_index, 'lsh_cache': lsh_cache, 'num_perm': NUM_PERM, 'threshold': threshold}\n with open(path, 'wb') as f:\n pickle.dump(state, f)\n print(f'[dedup] LSH state saved: {len(lsh_cache)} clusters -> {path}')\n except Exception as e:\n print(f'[dedup] Failed to save LSH state: {e}')\n\nfiltered_alerts = inputs.get('filtered_alerts', [])\ndedup_enabled = inputs.get('dedup_enabled', True)\nthreshold = float(inputs.get('dedup_threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_len = int(inputs.get('max_field_len', 500))\nstats = dict(inputs.get('stats', {}))\n\n# Share permutation parameters across all MinHash objects for consistency.\n# Mirrors LSHProcessor.__init__: self.permutations = MinHash(...).permutations\n_permutations = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED).permutations\n\n# Load persisted LSH state if available, otherwise initialize fresh.\n# lsh_index: MinHashLSH band/row index — O(1) approximate candidate lookup.\n# lsh_cache: int cluster_id -> MinHash, used for exact Jaccard re-ranking.\nlsh_state_path = get_lsh_state_path(threshold)\nlsh_index, lsh_cache = load_lsh_state(lsh_state_path, threshold)\nif lsh_index is None:\n lsh_index = MinHashLSH(threshold=threshold, num_perm=NUM_PERM)\n lsh_cache = {}\n\ndef query_most_similar(minhash):\n # Mirrors LSHProcessor.query_most_similar():\n # 1. LSH candidate lookup (fast, approximate).\n # 2. Exact Jaccard re-ranking among top-100 candidates.\n # 3. 
If no candidate found, insert as a new cluster and return its id.\n sim_keys = lsh_index.query(minhash)\n if sim_keys:\n candidates = sim_keys[:100]\n sims = [minhash.jaccard(lsh_cache[k]) for k in candidates]\n return candidates[sims.index(max(sims))]\n cluster_id = len(lsh_cache)\n lsh_index.insert(cluster_id, minhash)\n lsh_cache[cluster_id] = minhash\n return cluster_id\n\ndedup_key_seen = set()\nkeyed = []\nfor alert in filtered_alerts:\n alert = dict(alert)\n text_strict = '. '.join(str(alert.get(f, ''))[:max_len] for f in strict_fields)\n text_lsh = normalize_uri('. '.join(str(alert.get(f, ''))[:max_len] for f in lsh_fields))\n\n if not dedup_enabled:\n raw = f'{text_strict}. {text_lsh}'\n dk = hashlib.md5(raw.encode('utf-8')).hexdigest()\n alert['dedup_key'] = dk\n alert['dedup_key_already_exists'] = dk in dedup_key_seen\n dedup_key_seen.add(dk)\n keyed.append(alert)\n continue\n\n mh = gen_minhash(text_lsh.lower(), _permutations)\n # cluster_id identifies the LSH cluster; written to alert for inspection.\n cluster_id = query_most_similar(mh)\n alert['_lsh_cluster_id'] = cluster_id\n\n # dedup_key = MD5(strict_fields_text + '.' + cluster_id)\n # Mirrors LogDedup._generate_dedup_key_text / hashlib_gen_md5_str.\n dk = hashlib.md5(f'{text_strict}. 
{cluster_id}'.encode('utf-8')).hexdigest()\n alert['dedup_key'] = dk\n alert['dedup_key_already_exists'] = dk in dedup_key_seen\n dedup_key_seen.add(dk)\n keyed.append(alert)\n\n# Persist updated LSH state to disk for reuse across runs.\nif dedup_enabled:\n dump_lsh_state(lsh_state_path, lsh_index, lsh_cache, threshold)\n\nseen = {}\nunique_alerts = []\nfor a in keyed:\n k = a['dedup_key']\n if k not in seen:\n seen[k] = a\n unique_alerts.append(a)\n\ndup_count = len(keyed) - len(unique_alerts)\nprint(f'[dedup] input={len(filtered_alerts)}, unique_clusters={len(unique_alerts)}, deduped={dup_count}')\n\nstats['after_dedup_count'] = len(keyed)\nstats['unique_key_count'] = len(unique_alerts)\nstats['dedup_removed_count'] = dup_count\nstats['dedup_ratio'] = round(dup_count / len(keyed), 4) if keyed else 0.0\nstats['lsh_total_clusters'] = len(lsh_cache)\nstats['lsh_state_path'] = lsh_state_path\n\nsummary = (\n f'http_alert_dedup done: raw={stats.get(\"raw_count\", 0)}'\n f' -> normalized={stats.get(\"normalized_count\", 0)}'\n f' -> filtered={stats.get(\"after_filter_count\", 0)}'\n f' -> unique_clusters={len(unique_alerts)} (compression {stats[\"dedup_ratio\"]:.1%})'\n f' | lsh_total={len(lsh_cache)}'\n)\nprint(f'[dedup] {summary}')\n\noutputs['deduped_alerts'] = keyed\noutputs['unique_alerts'] = unique_alerts\noutputs['stats'] = stats\noutputs['dedup_summary'] = summary" + "description": "Step 3 — Dedup (terminal): URI normalization + MinHash LSH (datasketch, 128 perms, 5-gram shingles). 
LSH index + dedup_key cache are persisted to ~/.flocks/data/workflows/http_alert_dedup/ with atomic write and fcntl file lock; survives restarts and is safe for concurrent runs.", + "code": "import os\nimport re\nimport pickle\nimport hashlib\nimport fcntl\nfrom datasketch import MinHash, MinHashLSH\nfrom flocks.config import Config\n\nMINHASH_SEED = 2024\nNUM_PERM = 128\nWORKFLOW_NAME = 'http_alert_dedup'\nLSH_CLUSTER_WARN_THRESHOLD = 100000 # Warn when persisted cluster count exceeds this.\n\ndef normalize_uri(uri):\n # Normalize dynamic segments in URIs to reduce noise before shingling.\n # Mirrors utils.normalize_uri() in lsh_processor.py.\n uri = str(uri or '')\n uri = re.sub(r'\\d{4}-\\d{2}-\\d{2}', 'DATETIME', uri)\n uri = re.sub(r'[\\da-f]{8}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{12}', 'UUID', uri)\n uri = re.sub(r'(\\.\\./)+', 'TRAVERSAL', uri)\n uri = re.sub(r'\\bNULL\\b', 'NULL_REPLACED', uri)\n uri = re.sub(r'chr\\$\\d+\\$\\|\\|chr\\$\\d+\\$', 'CHR_SEQUENCE', uri)\n uri = re.sub(r'\\b\\d+={1,2}\\d+\\b', 'NUMBER_COMPARISON', uri)\n uri = re.sub(r'\\b[a-fA-F0-9]{32}\\b', 'HEXADECIMAL CHARACTERS', uri)\n return uri\n\ndef gen_minhash(text, permutations):\n # Build a MinHash signature from 5-gram shingles of the text.\n # Mirrors LSHProcessor.gen_text_minhash(text, k=5).\n shingles = [text[i:i+5] for i in range(len(text) - 4)]\n m = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED, permutations=permutations)\n for s in shingles:\n m.update(s.encode('utf-8'))\n return m\n\ndef get_state_paths(threshold):\n # Persist LSH state under ~/.flocks/data/workflows//.\n # Returns (state_path, lock_path). 
Filename encodes NUM_PERM + threshold so that\n # parameter changes implicitly reset the state.\n data_dir = str(Config().get_data_path())\n state_dir = os.path.join(data_dir, 'workflows', WORKFLOW_NAME)\n os.makedirs(state_dir, exist_ok=True)\n base = os.path.join(state_dir, f'lsh_state_np{NUM_PERM}_th{int(threshold * 100)}')\n return base + '.pkl', base + '.lock'\n\ndef acquire_lock(lock_path):\n # POSIX exclusive file lock — protects load+modify+dump from concurrent\n # workflow runs racing on the same state file.\n fh = open(lock_path, 'w')\n fcntl.flock(fh.fileno(), fcntl.LOCK_EX)\n return fh\n\ndef release_lock(fh):\n try:\n fcntl.flock(fh.fileno(), fcntl.LOCK_UN)\n finally:\n fh.close()\n\ndef load_state(state_path, threshold):\n # Returns (lsh_index, lsh_cache, dedup_key_cache).\n # On any error or parameter mismatch, returns (None, None, None) so the caller\n # initializes fresh state.\n if not os.path.exists(state_path) or os.path.getsize(state_path) == 0:\n return None, None, None\n try:\n with open(state_path, 'rb') as f:\n state = pickle.load(f)\n if state.get('num_perm') != NUM_PERM or state.get('threshold') != threshold:\n print(f'[dedup] state params mismatch (stored np={state.get(\"num_perm\")}, th={state.get(\"threshold\")}), starting fresh')\n return None, None, None\n cache = state['lsh_cache']\n seen = state.get('dedup_key_cache', set())\n print(f'[dedup] loaded state: {len(cache)} clusters, {len(seen)} dedup_keys from {state_path}')\n return state['lsh_index'], cache, seen\n except Exception as e:\n print(f'[dedup] failed to load state ({e}), starting fresh')\n return None, None, None\n\ndef dump_state_atomic(state_path, lsh_index, lsh_cache, dedup_key_cache, threshold):\n # Atomic write: pickle to .tmp, fsync, then os.replace() over the target.\n # Crash mid-write leaves the original file intact instead of a corrupt half-file.\n tmp = state_path + '.tmp'\n try:\n state = {\n 'lsh_index': lsh_index,\n 'lsh_cache': lsh_cache,\n 
'dedup_key_cache': dedup_key_cache,\n 'num_perm': NUM_PERM,\n 'threshold': threshold,\n }\n with open(tmp, 'wb') as f:\n pickle.dump(state, f)\n f.flush()\n os.fsync(f.fileno())\n os.replace(tmp, state_path)\n print(f'[dedup] state saved: {len(lsh_cache)} clusters, {len(dedup_key_cache)} dedup_keys -> {state_path}')\n except Exception as e:\n print(f'[dedup] failed to save state: {e}')\n if os.path.exists(tmp):\n try:\n os.remove(tmp)\n except Exception:\n pass\n\nfiltered_alerts = inputs.get('filtered_alerts', [])\ndedup_enabled = inputs.get('dedup_enabled', True)\nthreshold = float(inputs.get('dedup_threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_len = int(inputs.get('max_field_len', 500))\nstats = dict(inputs.get('stats', {}))\n\n# Shared permutation parameters across all MinHash objects.\n# Mirrors LSHProcessor.__init__: self.permutations = MinHash(...).permutations\n_permutations = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED).permutations\n\nstate_path, lock_path = get_state_paths(threshold)\nlock_fh = acquire_lock(lock_path) if dedup_enabled else None\n\ntry:\n # lsh_index: MinHashLSH band/row index — O(1) approximate candidate lookup.\n # lsh_cache: int cluster_id -> MinHash, used for exact Jaccard re-ranking.\n # dedup_key_cache: set of MD5 dedup_keys ever seen — survives restarts so that\n # dedup_key_already_exists is correct across batches.\n if dedup_enabled:\n lsh_index, lsh_cache, dedup_key_cache = load_state(state_path, threshold)\n if lsh_index is None:\n lsh_index = MinHashLSH(threshold=threshold, num_perm=NUM_PERM)\n lsh_cache = {}\n dedup_key_cache = set()\n else:\n lsh_index, lsh_cache, dedup_key_cache = None, {}, set()\n\n def query_most_similar(minhash):\n # Mirrors LSHProcessor.query_most_similar():\n # 1. LSH candidate lookup (fast, approximate).\n # 2. Exact Jaccard re-ranking among top-100 candidates.\n # 3. 
If no candidate found, insert as a new cluster and return its id.\n sim_keys = lsh_index.query(minhash)\n if sim_keys:\n candidates = sim_keys[:100]\n sims = [minhash.jaccard(lsh_cache[k]) for k in candidates]\n return candidates[sims.index(max(sims))]\n cluster_id = len(lsh_cache)\n lsh_index.insert(cluster_id, minhash)\n lsh_cache[cluster_id] = minhash\n return cluster_id\n\n keyed = []\n for alert in filtered_alerts:\n alert = dict(alert)\n text_strict = '. '.join(str(alert.get(f, ''))[:max_len] for f in strict_fields)\n text_lsh = normalize_uri('. '.join(str(alert.get(f, ''))[:max_len] for f in lsh_fields))\n\n if not dedup_enabled:\n # Dedup disabled: hash raw text, no clustering, no cross-batch tracking.\n dk = hashlib.md5(f'{text_strict}. {text_lsh}'.encode('utf-8')).hexdigest()\n alert['_lsh_cluster_id'] = None\n alert['dedup_key'] = dk\n alert['dedup_key_already_exists'] = dk in dedup_key_cache\n dedup_key_cache.add(dk)\n keyed.append(alert)\n continue\n\n mh = gen_minhash(text_lsh.lower(), _permutations)\n cluster_id = query_most_similar(mh)\n alert['_lsh_cluster_id'] = cluster_id\n\n # dedup_key = MD5(strict_fields_text + '.' + cluster_id)\n # Mirrors LogDedup._generate_dedup_key_text / hashlib_gen_md5_str.\n dk = hashlib.md5(f'{text_strict}. 
{cluster_id}'.encode('utf-8')).hexdigest()\n alert['dedup_key'] = dk\n # Cross-batch awareness: dedup_key_cache is loaded from disk at start.\n alert['dedup_key_already_exists'] = dk in dedup_key_cache\n dedup_key_cache.add(dk)\n keyed.append(alert)\n\n if dedup_enabled:\n if len(lsh_cache) > LSH_CLUSTER_WARN_THRESHOLD:\n print(f'[dedup] WARNING: lsh_cache holds {len(lsh_cache)} clusters '\n f'(>{LSH_CLUSTER_WARN_THRESHOLD}); consider rotating state file at {state_path}')\n dump_state_atomic(state_path, lsh_index, lsh_cache, dedup_key_cache, threshold)\nfinally:\n if lock_fh is not None:\n release_lock(lock_fh)\n\nseen = {}\nunique_alerts = []\nfor a in keyed:\n k = a['dedup_key']\n if k not in seen:\n seen[k] = a\n unique_alerts.append(a)\n\ndup_count = len(keyed) - len(unique_alerts)\nprint(f'[dedup] input={len(filtered_alerts)}, unique_clusters={len(unique_alerts)}, deduped={dup_count}')\n\nstats['after_dedup_count'] = len(keyed)\nstats['unique_key_count'] = len(unique_alerts)\nstats['dedup_removed_count'] = dup_count\nstats['dedup_ratio'] = round(dup_count / len(keyed), 4) if keyed else 0.0\nstats['lsh_total_clusters'] = len(lsh_cache)\nstats['lsh_total_dedup_keys'] = len(dedup_key_cache)\nstats['lsh_state_path'] = state_path\n\nsummary = (\n f'http_alert_dedup done: raw={stats.get(\"raw_count\", 0)}'\n f' -> normalized={stats.get(\"normalized_count\", 0)}'\n f' -> filtered={stats.get(\"after_filter_count\", 0)}'\n f' -> unique_clusters={len(unique_alerts)} (compression {stats[\"dedup_ratio\"]:.1%})'\n f' | persisted_clusters={len(lsh_cache)}, persisted_keys={len(dedup_key_cache)}'\n)\nprint(f'[dedup] {summary}')\n\noutputs['deduped_alerts'] = keyed\noutputs['unique_alerts'] = unique_alerts\noutputs['stats'] = stats\noutputs['dedup_summary'] = summary" } ], "edges": [ diff --git a/pyproject.toml b/pyproject.toml index 26953367a..5ffa925df 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -83,6 +83,7 @@ dependencies = [ # browser "cdp-use>=1.4.5", 
"pillow>=12.2.0", + "datasketch>=1.10.0", ] [dependency-groups] diff --git a/uv.lock b/uv.lock index ae679487d..de0035408 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = "==3.12.*" resolution-markers = [ "sys_platform == 'win32'", @@ -384,6 +384,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/48/ef/0c2f4a8e31018a986949d34a01115dd057bf536905dca38897bacd21fac3/cryptography-46.0.5-cp38-abi3-win_amd64.whl", hash = "sha256:556e106ee01aa13484ce9b0239bca667be5004efb0aabbed28d353df86445595", size = 3467050, upload-time = "2026-02-10T19:18:18.899Z" }, ] +[[package]] +name = "datasketch" +version = "1.10.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "scipy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8d/73/8e9014887f9fca2d785777a0a6186813e4fc7faa24f05fc88c6420624891/datasketch-1.10.0.tar.gz", hash = "sha256:d23aea80ce4c40790ca7a40795659848be92ecc43db80942be26f21e81d24714", size = 91699, upload-time = "2026-04-17T23:06:56.388Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ed/e7/a94668082e078099eb0161635649510aa887690767b779fffe4bdc479913/datasketch-1.10.0-py3-none-any.whl", hash = "sha256:303dd90cda0948a21abba3aaefc9f8528fa12b8204edc5e1ae8b1d7b750234e7", size = 99914, upload-time = "2026-04-17T23:06:54.39Z" }, +] + [[package]] name = "decorator" version = "5.2.1" @@ -508,6 +521,7 @@ dependencies = [ { name = "claude" }, { name = "click" }, { name = "croniter" }, + { name = "datasketch" }, { name = "defusedxml" }, { name = "dingtalk-stream" }, { name = "fastapi" }, @@ -573,6 +587,7 @@ requires-dist = [ { name = "claude", specifier = ">=0.4.11" }, { name = "click", specifier = ">=8.1.7" }, { name = "croniter", specifier = ">=6.0.0" }, + { name = "datasketch", specifier = ">=1.10.0" }, { name = "defusedxml", specifier = ">=0.7.1" }, { name = "dingtalk-stream", specifier = ">=0.20" }, { name = "fastapi", 
specifier = ">=0.109.0" }, @@ -2005,6 +2020,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8f/e8/726643a3ea68c727da31570bde48c7a10f1aa60eddd628d94078fec586ff/ruff-0.15.7-py3-none-win_arm64.whl", hash = "sha256:18e8d73f1c3fdf27931497972250340f92e8c861722161a9caeb89a58ead6ed2", size = 11023304, upload-time = "2026-03-19T16:26:51.669Z" }, ] +[[package]] +name = "scipy" +version = "1.17.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7a/97/5a3609c4f8d58b039179648e62dd220f89864f56f7357f5d4f45c29eb2cc/scipy-1.17.1.tar.gz", hash = "sha256:95d8e012d8cb8816c226aef832200b1d45109ed4464303e997c5b13122b297c0", size = 30573822, upload-time = "2026-02-23T00:26:24.851Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/35/48/b992b488d6f299dbe3f11a20b24d3dda3d46f1a635ede1c46b5b17a7b163/scipy-1.17.1-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:35c3a56d2ef83efc372eaec584314bd0ef2e2f0d2adb21c55e6ad5b344c0dcb8", size = 31610954, upload-time = "2026-02-23T00:17:49.855Z" }, + { url = "https://files.pythonhosted.org/packages/b2/02/cf107b01494c19dc100f1d0b7ac3cc08666e96ba2d64db7626066cee895e/scipy-1.17.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:fcb310ddb270a06114bb64bbe53c94926b943f5b7f0842194d585c65eb4edd76", size = 28172662, upload-time = "2026-02-23T00:18:01.64Z" }, + { url = "https://files.pythonhosted.org/packages/cf/a9/599c28631bad314d219cf9ffd40e985b24d603fc8a2f4ccc5ae8419a535b/scipy-1.17.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:cc90d2e9c7e5c7f1a482c9875007c095c3194b1cfedca3c2f3291cdc2bc7c086", size = 20344366, upload-time = "2026-02-23T00:18:12.015Z" }, + { url = "https://files.pythonhosted.org/packages/35/f5/906eda513271c8deb5af284e5ef0206d17a96239af79f9fa0aebfe0e36b4/scipy-1.17.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:c80be5ede8f3f8eded4eff73cc99a25c388ce98e555b17d31da05287015ffa5b", size = 
22704017, upload-time = "2026-02-23T00:18:21.502Z" }, + { url = "https://files.pythonhosted.org/packages/da/34/16f10e3042d2f1d6b66e0428308ab52224b6a23049cb2f5c1756f713815f/scipy-1.17.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e19ebea31758fac5893a2ac360fedd00116cbb7628e650842a6691ba7ca28a21", size = 32927842, upload-time = "2026-02-23T00:18:35.367Z" }, + { url = "https://files.pythonhosted.org/packages/01/8e/1e35281b8ab6d5d72ebe9911edcdffa3f36b04ed9d51dec6dd140396e220/scipy-1.17.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:02ae3b274fde71c5e92ac4d54bc06c42d80e399fec704383dcd99b301df37458", size = 35235890, upload-time = "2026-02-23T00:18:49.188Z" }, + { url = "https://files.pythonhosted.org/packages/c5/5c/9d7f4c88bea6e0d5a4f1bc0506a53a00e9fcb198de372bfe4d3652cef482/scipy-1.17.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8a604bae87c6195d8b1045eddece0514d041604b14f2727bbc2b3020172045eb", size = 35003557, upload-time = "2026-02-23T00:18:54.74Z" }, + { url = "https://files.pythonhosted.org/packages/65/94/7698add8f276dbab7a9de9fb6b0e02fc13ee61d51c7c3f85ac28b65e1239/scipy-1.17.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:f590cd684941912d10becc07325a3eeb77886fe981415660d9265c4c418d0bea", size = 37625856, upload-time = "2026-02-23T00:19:00.307Z" }, + { url = "https://files.pythonhosted.org/packages/a2/84/dc08d77fbf3d87d3ee27f6a0c6dcce1de5829a64f2eae85a0ecc1f0daa73/scipy-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:41b71f4a3a4cab9d366cd9065b288efc4d4f3c0b37a91a8e0947fb5bd7f31d87", size = 36549682, upload-time = "2026-02-23T00:19:07.67Z" }, + { url = "https://files.pythonhosted.org/packages/bc/98/fe9ae9ffb3b54b62559f52dedaebe204b408db8109a8c66fdd04869e6424/scipy-1.17.1-cp312-cp312-win_arm64.whl", hash = "sha256:f4115102802df98b2b0db3cce5cb9b92572633a1197c77b7553e5203f284a5b3", size = 24547340, upload-time = "2026-02-23T00:19:12.024Z" }, +] + [[package]] name = "shellingham" 
version = "1.5.4" From 02f67c7e21be96d53f71ddb0fe04b93acbace116 Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Fri, 8 May 2026 19:12:32 +0800 Subject: [PATCH 14/41] fix(workflow): cross-platform file lock + cleaner stats in dedup_logs Cross-platform support: - Replace bare 'import fcntl' with platform detection. - POSIX path keeps fcntl.flock(LOCK_EX). - Windows path uses msvcrt.locking(LK_LOCK, 1) on a single byte of the lock file, looping on OSError to wait beyond LK_LOCK's built-in 10-second cap. - Both branches release the lock symmetrically in release_lock(). Review fixes: - 'lsh_state_path', 'lsh_total_clusters', 'lsh_total_dedup_keys' are no longer written to stats when dedup_enabled=False (was misleading; no file is touched in that mode). Added 'dedup_state_persisted' bool for callers to branch on. - Cluster-overflow warning now also checks dedup_key_cache, since pkl size is dominated by per-key entries, not per-cluster. - normalize_uri's UUID regex now uses re.IGNORECASE so uppercase UUIDs map to the same 'UUID' placeholder (and thus the same LSH cluster) as lowercase ones. - Disabled-mode summary clearly states 'no state persisted, in-batch only'. Verified: cross-batch, disabled-mode stats, uppercase UUID dedup, concurrent writers. Co-authored-by: Cursor --- .flocks/plugins/workflows/http_alert_dedup/workflow.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.flocks/plugins/workflows/http_alert_dedup/workflow.json b/.flocks/plugins/workflows/http_alert_dedup/workflow.json index 8c56a3330..3b21b3a8f 100644 --- a/.flocks/plugins/workflows/http_alert_dedup/workflow.json +++ b/.flocks/plugins/workflows/http_alert_dedup/workflow.json @@ -37,8 +37,8 @@ { "id": "dedup_logs", "type": "python", - "description": "Step 3 — Dedup (terminal): URI normalization + MinHash LSH (datasketch, 128 perms, 5-gram shingles). 
LSH index + dedup_key cache are persisted to ~/.flocks/data/workflows/http_alert_dedup/ with atomic write and fcntl file lock; survives restarts and is safe for concurrent runs.", - "code": "import os\nimport re\nimport pickle\nimport hashlib\nimport fcntl\nfrom datasketch import MinHash, MinHashLSH\nfrom flocks.config import Config\n\nMINHASH_SEED = 2024\nNUM_PERM = 128\nWORKFLOW_NAME = 'http_alert_dedup'\nLSH_CLUSTER_WARN_THRESHOLD = 100000 # Warn when persisted cluster count exceeds this.\n\ndef normalize_uri(uri):\n # Normalize dynamic segments in URIs to reduce noise before shingling.\n # Mirrors utils.normalize_uri() in lsh_processor.py.\n uri = str(uri or '')\n uri = re.sub(r'\\d{4}-\\d{2}-\\d{2}', 'DATETIME', uri)\n uri = re.sub(r'[\\da-f]{8}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{12}', 'UUID', uri)\n uri = re.sub(r'(\\.\\./)+', 'TRAVERSAL', uri)\n uri = re.sub(r'\\bNULL\\b', 'NULL_REPLACED', uri)\n uri = re.sub(r'chr\\$\\d+\\$\\|\\|chr\\$\\d+\\$', 'CHR_SEQUENCE', uri)\n uri = re.sub(r'\\b\\d+={1,2}\\d+\\b', 'NUMBER_COMPARISON', uri)\n uri = re.sub(r'\\b[a-fA-F0-9]{32}\\b', 'HEXADECIMAL CHARACTERS', uri)\n return uri\n\ndef gen_minhash(text, permutations):\n # Build a MinHash signature from 5-gram shingles of the text.\n # Mirrors LSHProcessor.gen_text_minhash(text, k=5).\n shingles = [text[i:i+5] for i in range(len(text) - 4)]\n m = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED, permutations=permutations)\n for s in shingles:\n m.update(s.encode('utf-8'))\n return m\n\ndef get_state_paths(threshold):\n # Persist LSH state under ~/.flocks/data/workflows//.\n # Returns (state_path, lock_path). 
Filename encodes NUM_PERM + threshold so that\n # parameter changes implicitly reset the state.\n data_dir = str(Config().get_data_path())\n state_dir = os.path.join(data_dir, 'workflows', WORKFLOW_NAME)\n os.makedirs(state_dir, exist_ok=True)\n base = os.path.join(state_dir, f'lsh_state_np{NUM_PERM}_th{int(threshold * 100)}')\n return base + '.pkl', base + '.lock'\n\ndef acquire_lock(lock_path):\n # POSIX exclusive file lock — protects load+modify+dump from concurrent\n # workflow runs racing on the same state file.\n fh = open(lock_path, 'w')\n fcntl.flock(fh.fileno(), fcntl.LOCK_EX)\n return fh\n\ndef release_lock(fh):\n try:\n fcntl.flock(fh.fileno(), fcntl.LOCK_UN)\n finally:\n fh.close()\n\ndef load_state(state_path, threshold):\n # Returns (lsh_index, lsh_cache, dedup_key_cache).\n # On any error or parameter mismatch, returns (None, None, None) so the caller\n # initializes fresh state.\n if not os.path.exists(state_path) or os.path.getsize(state_path) == 0:\n return None, None, None\n try:\n with open(state_path, 'rb') as f:\n state = pickle.load(f)\n if state.get('num_perm') != NUM_PERM or state.get('threshold') != threshold:\n print(f'[dedup] state params mismatch (stored np={state.get(\"num_perm\")}, th={state.get(\"threshold\")}), starting fresh')\n return None, None, None\n cache = state['lsh_cache']\n seen = state.get('dedup_key_cache', set())\n print(f'[dedup] loaded state: {len(cache)} clusters, {len(seen)} dedup_keys from {state_path}')\n return state['lsh_index'], cache, seen\n except Exception as e:\n print(f'[dedup] failed to load state ({e}), starting fresh')\n return None, None, None\n\ndef dump_state_atomic(state_path, lsh_index, lsh_cache, dedup_key_cache, threshold):\n # Atomic write: pickle to .tmp, fsync, then os.replace() over the target.\n # Crash mid-write leaves the original file intact instead of a corrupt half-file.\n tmp = state_path + '.tmp'\n try:\n state = {\n 'lsh_index': lsh_index,\n 'lsh_cache': lsh_cache,\n 
'dedup_key_cache': dedup_key_cache,\n 'num_perm': NUM_PERM,\n 'threshold': threshold,\n }\n with open(tmp, 'wb') as f:\n pickle.dump(state, f)\n f.flush()\n os.fsync(f.fileno())\n os.replace(tmp, state_path)\n print(f'[dedup] state saved: {len(lsh_cache)} clusters, {len(dedup_key_cache)} dedup_keys -> {state_path}')\n except Exception as e:\n print(f'[dedup] failed to save state: {e}')\n if os.path.exists(tmp):\n try:\n os.remove(tmp)\n except Exception:\n pass\n\nfiltered_alerts = inputs.get('filtered_alerts', [])\ndedup_enabled = inputs.get('dedup_enabled', True)\nthreshold = float(inputs.get('dedup_threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_len = int(inputs.get('max_field_len', 500))\nstats = dict(inputs.get('stats', {}))\n\n# Shared permutation parameters across all MinHash objects.\n# Mirrors LSHProcessor.__init__: self.permutations = MinHash(...).permutations\n_permutations = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED).permutations\n\nstate_path, lock_path = get_state_paths(threshold)\nlock_fh = acquire_lock(lock_path) if dedup_enabled else None\n\ntry:\n # lsh_index: MinHashLSH band/row index — O(1) approximate candidate lookup.\n # lsh_cache: int cluster_id -> MinHash, used for exact Jaccard re-ranking.\n # dedup_key_cache: set of MD5 dedup_keys ever seen — survives restarts so that\n # dedup_key_already_exists is correct across batches.\n if dedup_enabled:\n lsh_index, lsh_cache, dedup_key_cache = load_state(state_path, threshold)\n if lsh_index is None:\n lsh_index = MinHashLSH(threshold=threshold, num_perm=NUM_PERM)\n lsh_cache = {}\n dedup_key_cache = set()\n else:\n lsh_index, lsh_cache, dedup_key_cache = None, {}, set()\n\n def query_most_similar(minhash):\n # Mirrors LSHProcessor.query_most_similar():\n # 1. LSH candidate lookup (fast, approximate).\n # 2. Exact Jaccard re-ranking among top-100 candidates.\n # 3. 
If no candidate found, insert as a new cluster and return its id.\n sim_keys = lsh_index.query(minhash)\n if sim_keys:\n candidates = sim_keys[:100]\n sims = [minhash.jaccard(lsh_cache[k]) for k in candidates]\n return candidates[sims.index(max(sims))]\n cluster_id = len(lsh_cache)\n lsh_index.insert(cluster_id, minhash)\n lsh_cache[cluster_id] = minhash\n return cluster_id\n\n keyed = []\n for alert in filtered_alerts:\n alert = dict(alert)\n text_strict = '. '.join(str(alert.get(f, ''))[:max_len] for f in strict_fields)\n text_lsh = normalize_uri('. '.join(str(alert.get(f, ''))[:max_len] for f in lsh_fields))\n\n if not dedup_enabled:\n # Dedup disabled: hash raw text, no clustering, no cross-batch tracking.\n dk = hashlib.md5(f'{text_strict}. {text_lsh}'.encode('utf-8')).hexdigest()\n alert['_lsh_cluster_id'] = None\n alert['dedup_key'] = dk\n alert['dedup_key_already_exists'] = dk in dedup_key_cache\n dedup_key_cache.add(dk)\n keyed.append(alert)\n continue\n\n mh = gen_minhash(text_lsh.lower(), _permutations)\n cluster_id = query_most_similar(mh)\n alert['_lsh_cluster_id'] = cluster_id\n\n # dedup_key = MD5(strict_fields_text + '.' + cluster_id)\n # Mirrors LogDedup._generate_dedup_key_text / hashlib_gen_md5_str.\n dk = hashlib.md5(f'{text_strict}. 
{cluster_id}'.encode('utf-8')).hexdigest()\n alert['dedup_key'] = dk\n # Cross-batch awareness: dedup_key_cache is loaded from disk at start.\n alert['dedup_key_already_exists'] = dk in dedup_key_cache\n dedup_key_cache.add(dk)\n keyed.append(alert)\n\n if dedup_enabled:\n if len(lsh_cache) > LSH_CLUSTER_WARN_THRESHOLD:\n print(f'[dedup] WARNING: lsh_cache holds {len(lsh_cache)} clusters '\n f'(>{LSH_CLUSTER_WARN_THRESHOLD}); consider rotating state file at {state_path}')\n dump_state_atomic(state_path, lsh_index, lsh_cache, dedup_key_cache, threshold)\nfinally:\n if lock_fh is not None:\n release_lock(lock_fh)\n\nseen = {}\nunique_alerts = []\nfor a in keyed:\n k = a['dedup_key']\n if k not in seen:\n seen[k] = a\n unique_alerts.append(a)\n\ndup_count = len(keyed) - len(unique_alerts)\nprint(f'[dedup] input={len(filtered_alerts)}, unique_clusters={len(unique_alerts)}, deduped={dup_count}')\n\nstats['after_dedup_count'] = len(keyed)\nstats['unique_key_count'] = len(unique_alerts)\nstats['dedup_removed_count'] = dup_count\nstats['dedup_ratio'] = round(dup_count / len(keyed), 4) if keyed else 0.0\nstats['lsh_total_clusters'] = len(lsh_cache)\nstats['lsh_total_dedup_keys'] = len(dedup_key_cache)\nstats['lsh_state_path'] = state_path\n\nsummary = (\n f'http_alert_dedup done: raw={stats.get(\"raw_count\", 0)}'\n f' -> normalized={stats.get(\"normalized_count\", 0)}'\n f' -> filtered={stats.get(\"after_filter_count\", 0)}'\n f' -> unique_clusters={len(unique_alerts)} (compression {stats[\"dedup_ratio\"]:.1%})'\n f' | persisted_clusters={len(lsh_cache)}, persisted_keys={len(dedup_key_cache)}'\n)\nprint(f'[dedup] {summary}')\n\noutputs['deduped_alerts'] = keyed\noutputs['unique_alerts'] = unique_alerts\noutputs['stats'] = stats\noutputs['dedup_summary'] = summary" + "description": "Step 3 — Dedup (terminal): URI normalization + MinHash LSH (datasketch, 128 perms, 5-gram shingles). 
LSH index + dedup_key cache are persisted to ~/.flocks/data/workflows/http_alert_dedup/ with atomic write and a cross-platform exclusive file lock (POSIX fcntl / Windows msvcrt). Survives restarts; safe for concurrent runs. When dedup_enabled=False, no disk state is read or written and cross-batch duplicate detection is intentionally disabled.", + "code": "import os\nimport re\nimport sys\nimport pickle\nimport hashlib\nfrom datasketch import MinHash, MinHashLSH\nfrom flocks.config import Config\n\nIS_WINDOWS = sys.platform == 'win32'\nif IS_WINDOWS:\n import msvcrt # noqa: F401\nelse:\n import fcntl # noqa: F401\n\nMINHASH_SEED = 2024\nNUM_PERM = 128\nWORKFLOW_NAME = 'http_alert_dedup'\nLSH_CLUSTER_WARN_THRESHOLD = 100000 # Warn when persisted cluster or dedup-key count exceeds this.\n\ndef normalize_uri(uri):\n # Normalize dynamic segments in URIs to reduce noise before shingling.\n # Mirrors utils.normalize_uri() in lsh_processor.py.\n uri = str(uri or '')\n uri = re.sub(r'\\d{4}-\\d{2}-\\d{2}', 'DATETIME', uri)\n uri = re.sub(r'[\\da-f]{8}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{12}', 'UUID', uri, flags=re.IGNORECASE)\n uri = re.sub(r'(\\.\\./)+', 'TRAVERSAL', uri)\n uri = re.sub(r'\\bNULL\\b', 'NULL_REPLACED', uri)\n uri = re.sub(r'chr\\$\\d+\\$\\|\\|chr\\$\\d+\\$', 'CHR_SEQUENCE', uri)\n uri = re.sub(r'\\b\\d+={1,2}\\d+\\b', 'NUMBER_COMPARISON', uri)\n uri = re.sub(r'\\b[a-fA-F0-9]{32}\\b', 'HEXADECIMAL CHARACTERS', uri)\n return uri\n\ndef gen_minhash(text, permutations):\n # Build a MinHash signature from 5-gram shingles of the text.\n # Mirrors LSHProcessor.gen_text_minhash(text, k=5).\n shingles = [text[i:i+5] for i in range(len(text) - 4)]\n m = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED, permutations=permutations)\n for s in shingles:\n m.update(s.encode('utf-8'))\n return m\n\ndef get_state_paths(threshold):\n # Persist LSH state under ~/.flocks/data/workflows//.\n # Returns (state_path, lock_path). 
Filename encodes NUM_PERM + threshold so that\n # parameter changes implicitly reset the state.\n data_dir = str(Config().get_data_path())\n state_dir = os.path.join(data_dir, 'workflows', WORKFLOW_NAME)\n os.makedirs(state_dir, exist_ok=True)\n base = os.path.join(state_dir, f'lsh_state_np{NUM_PERM}_th{int(threshold * 100)}')\n return base + '.pkl', base + '.lock'\n\ndef acquire_lock(lock_path):\n # Cross-platform exclusive file lock that serializes load+modify+dump across\n # concurrent workflow runs racing on the same state file.\n # POSIX: fcntl.flock(LOCK_EX) — blocks until acquired.\n # Windows: msvcrt.locking(LK_LOCK) — locks 1 byte; blocks ~10s per call,\n # so we loop until the lock is granted.\n fh = open(lock_path, 'w+')\n if IS_WINDOWS:\n fh.write('L')\n fh.flush()\n fh.seek(0)\n while True:\n try:\n msvcrt.locking(fh.fileno(), msvcrt.LK_LOCK, 1)\n break\n except OSError:\n # LK_LOCK retried internally and still failed; loop and try again.\n continue\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_EX)\n return fh\n\ndef release_lock(fh):\n try:\n if IS_WINDOWS:\n try:\n fh.seek(0)\n msvcrt.locking(fh.fileno(), msvcrt.LK_UNLCK, 1)\n except OSError:\n pass\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_UN)\n finally:\n fh.close()\n\ndef load_state(state_path, threshold):\n # Returns (lsh_index, lsh_cache, dedup_key_cache).\n # On any error or parameter mismatch, returns (None, None, None) so the caller\n # initializes fresh state.\n if not os.path.exists(state_path) or os.path.getsize(state_path) == 0:\n return None, None, None\n try:\n with open(state_path, 'rb') as f:\n state = pickle.load(f)\n if state.get('num_perm') != NUM_PERM or state.get('threshold') != threshold:\n print(f'[dedup] state params mismatch (stored np={state.get(\"num_perm\")}, th={state.get(\"threshold\")}), starting fresh')\n return None, None, None\n cache = state['lsh_cache']\n seen = state.get('dedup_key_cache', set())\n print(f'[dedup] loaded state: {len(cache)} clusters, 
{len(seen)} dedup_keys from {state_path}')\n return state['lsh_index'], cache, seen\n except Exception as e:\n print(f'[dedup] failed to load state ({e}), starting fresh')\n return None, None, None\n\ndef dump_state_atomic(state_path, lsh_index, lsh_cache, dedup_key_cache, threshold):\n # Atomic write: pickle to .tmp, fsync, then os.replace() over the target.\n # Crash mid-write leaves the original file intact instead of a corrupt half-file.\n tmp = state_path + '.tmp'\n try:\n state = {\n 'lsh_index': lsh_index,\n 'lsh_cache': lsh_cache,\n 'dedup_key_cache': dedup_key_cache,\n 'num_perm': NUM_PERM,\n 'threshold': threshold,\n }\n with open(tmp, 'wb') as f:\n pickle.dump(state, f)\n f.flush()\n os.fsync(f.fileno())\n os.replace(tmp, state_path)\n print(f'[dedup] state saved: {len(lsh_cache)} clusters, {len(dedup_key_cache)} dedup_keys -> {state_path}')\n except Exception as e:\n print(f'[dedup] failed to save state: {e}')\n if os.path.exists(tmp):\n try:\n os.remove(tmp)\n except Exception:\n pass\n\nfiltered_alerts = inputs.get('filtered_alerts', [])\ndedup_enabled = inputs.get('dedup_enabled', True)\nthreshold = float(inputs.get('dedup_threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_len = int(inputs.get('max_field_len', 500))\nstats = dict(inputs.get('stats', {}))\n\n# Shared permutation parameters across all MinHash objects.\n# Mirrors LSHProcessor.__init__: self.permutations = MinHash(...).permutations\n_permutations = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED).permutations\n\nstate_path, lock_path = get_state_paths(threshold)\nlock_fh = acquire_lock(lock_path) if dedup_enabled else None\n\ntry:\n # lsh_index: MinHashLSH band/row index — O(1) approximate candidate lookup.\n # lsh_cache: int cluster_id -> MinHash, used for exact Jaccard re-ranking.\n # dedup_key_cache: set of MD5 dedup_keys ever seen — survives restarts so that\n # 
dedup_key_already_exists is correct across batches.\n if dedup_enabled:\n lsh_index, lsh_cache, dedup_key_cache = load_state(state_path, threshold)\n if lsh_index is None:\n lsh_index = MinHashLSH(threshold=threshold, num_perm=NUM_PERM)\n lsh_cache = {}\n dedup_key_cache = set()\n else:\n lsh_index, lsh_cache, dedup_key_cache = None, {}, set()\n\n def query_most_similar(minhash):\n # Mirrors LSHProcessor.query_most_similar():\n # 1. LSH candidate lookup (fast, approximate).\n # 2. Exact Jaccard re-ranking among top-100 candidates.\n # 3. If no candidate found, insert as a new cluster and return its id.\n sim_keys = lsh_index.query(minhash)\n if sim_keys:\n candidates = sim_keys[:100]\n sims = [minhash.jaccard(lsh_cache[k]) for k in candidates]\n return candidates[sims.index(max(sims))]\n cluster_id = len(lsh_cache)\n lsh_index.insert(cluster_id, minhash)\n lsh_cache[cluster_id] = minhash\n return cluster_id\n\n keyed = []\n for alert in filtered_alerts:\n alert = dict(alert)\n text_strict = '. '.join(str(alert.get(f, ''))[:max_len] for f in strict_fields)\n text_lsh = normalize_uri('. '.join(str(alert.get(f, ''))[:max_len] for f in lsh_fields))\n\n if not dedup_enabled:\n # Dedup disabled: hash raw text, no clustering, no cross-batch tracking.\n dk = hashlib.md5(f'{text_strict}. {text_lsh}'.encode('utf-8')).hexdigest()\n alert['_lsh_cluster_id'] = None\n alert['dedup_key'] = dk\n alert['dedup_key_already_exists'] = dk in dedup_key_cache\n dedup_key_cache.add(dk)\n keyed.append(alert)\n continue\n\n mh = gen_minhash(text_lsh.lower(), _permutations)\n cluster_id = query_most_similar(mh)\n alert['_lsh_cluster_id'] = cluster_id\n\n # dedup_key = MD5(strict_fields_text + '.' + cluster_id)\n # Mirrors LogDedup._generate_dedup_key_text / hashlib_gen_md5_str.\n dk = hashlib.md5(f'{text_strict}. 
{cluster_id}'.encode('utf-8')).hexdigest()\n alert['dedup_key'] = dk\n # Cross-batch awareness: dedup_key_cache is loaded from disk at start.\n alert['dedup_key_already_exists'] = dk in dedup_key_cache\n dedup_key_cache.add(dk)\n keyed.append(alert)\n\n if dedup_enabled:\n if len(lsh_cache) > LSH_CLUSTER_WARN_THRESHOLD or len(dedup_key_cache) > LSH_CLUSTER_WARN_THRESHOLD:\n print(f'[dedup] WARNING: persisted state holds {len(lsh_cache)} clusters '\n f'and {len(dedup_key_cache)} dedup_keys (threshold={LSH_CLUSTER_WARN_THRESHOLD}); '\n f'consider rotating state file at {state_path}')\n dump_state_atomic(state_path, lsh_index, lsh_cache, dedup_key_cache, threshold)\nfinally:\n if lock_fh is not None:\n release_lock(lock_fh)\n\nseen = {}\nunique_alerts = []\nfor a in keyed:\n k = a['dedup_key']\n if k not in seen:\n seen[k] = a\n unique_alerts.append(a)\n\ndup_count = len(keyed) - len(unique_alerts)\nprint(f'[dedup] input={len(filtered_alerts)}, unique_clusters={len(unique_alerts)}, deduped={dup_count}')\n\nstats['after_dedup_count'] = len(keyed)\nstats['unique_key_count'] = len(unique_alerts)\nstats['dedup_removed_count'] = dup_count\nstats['dedup_ratio'] = round(dup_count / len(keyed), 4) if keyed else 0.0\nstats['dedup_state_persisted'] = bool(dedup_enabled)\nif dedup_enabled:\n # Only expose persistence-related fields when state is actually read/written.\n stats['lsh_total_clusters'] = len(lsh_cache)\n stats['lsh_total_dedup_keys'] = len(dedup_key_cache)\n stats['lsh_state_path'] = state_path\n\nif dedup_enabled:\n summary = (\n f'http_alert_dedup done: raw={stats.get(\"raw_count\", 0)}'\n f' -> normalized={stats.get(\"normalized_count\", 0)}'\n f' -> filtered={stats.get(\"after_filter_count\", 0)}'\n f' -> unique_clusters={len(unique_alerts)} (compression {stats[\"dedup_ratio\"]:.1%})'\n f' | persisted_clusters={len(lsh_cache)}, persisted_keys={len(dedup_key_cache)}'\n )\nelse:\n summary = (\n f'http_alert_dedup done (dedup_enabled=False, no state persisted): '\n 
f'raw={stats.get(\"raw_count\", 0)}'\n f' -> normalized={stats.get(\"normalized_count\", 0)}'\n f' -> filtered={stats.get(\"after_filter_count\", 0)}'\n f' -> unique={len(unique_alerts)} (in-batch only)'\n )\nprint(f'[dedup] {summary}')\n\noutputs['deduped_alerts'] = keyed\noutputs['unique_alerts'] = unique_alerts\noutputs['stats'] = stats\noutputs['dedup_summary'] = summary" } ], "edges": [ From 56d3478cafe2bdf0301a08a28f10783ad0ff5221 Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Fri, 8 May 2026 19:29:31 +0800 Subject: [PATCH 15/41] fix(run_workflow): handle JSON-encoded string path and bad dict input; add alert_file support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit run_workflow.py: - When workflow is a JSON-decoded non-dict (e.g. the AI wraps the path in extra quotes producing '\"..path..\"'), treat the decoded string as a file path instead of crashing with 'str object has no attribute name'. - Add upfront dict sanity-check: if the dict has no 'start' key it's almost certainly the inputs dict passed to the wrong parameter; return a clear error instead of a Pydantic validation traceback. - Add a comment clarifying that the else-branch delivers a Path, not a str. workflow.py: - Add sync_workflows_from_filesystem() which app.py imports at startup. Triggers the one-time storage→filesystem migration then returns the count of discovered workflows so the startup log entry is informative. http_alert_dedup/workflow.json: - receive_alerts now accepts alert_file (absolute or ~-prefixed path to a JSON file) as an alternative to inlining the alerts list directly in inputs. This removes the need to manually load large log files via bash before running the workflow (the main friction observed in session 询问可用工作流). - sampleInputs annotated with a _comment_alert_file hint. 
Co-authored-by: Cursor --- .../workflows/http_alert_dedup/workflow.json | 3 +- flocks/server/routes/workflow.py | 14 +++++++++ flocks/tool/task/run_workflow.py | 31 ++++++++++++++++--- 3 files changed, 43 insertions(+), 5 deletions(-) diff --git a/.flocks/plugins/workflows/http_alert_dedup/workflow.json b/.flocks/plugins/workflows/http_alert_dedup/workflow.json index 3b21b3a8f..20bce3249 100644 --- a/.flocks/plugins/workflows/http_alert_dedup/workflow.json +++ b/.flocks/plugins/workflows/http_alert_dedup/workflow.json @@ -8,7 +8,7 @@ "id": "receive_alerts", "type": "python", "description": "接收原始告警列表,解析输入格式,提取 Pipeline 配置,输出 source_log_type 供后续分支节点路由", - "code": "import json\n\nalerts_input = inputs.get('alerts', inputs.get('alert_list', []))\nif isinstance(alerts_input, str):\n try:\n alerts_input = json.loads(alerts_input)\n except Exception:\n alerts_input = []\nif isinstance(alerts_input, dict) and 'data' in alerts_input:\n alerts_input = alerts_input.get('data', [])\nif not isinstance(alerts_input, list):\n alerts_input = [alerts_input] if alerts_input else []\n\nsource_log_type = str(inputs.get('source_log_type', 'tdp')).lower()\nif source_log_type not in ('tdp', 'skyeye'):\n source_log_type = 'tdp'\n\nfilter_enabled = bool(inputs.get('filter_enabled', True))\ndedup_enabled = bool(inputs.get('dedup_enabled', True))\ndedup_threshold = float(inputs.get('threshold', inputs.get('dedup_threshold', 0.7)))\nstrict_fields = inputs.get('strict_fields', inputs.get('dedup_fields_strict', ['sip', 'dip']))\nlsh_fields = inputs.get('lsh_fields', inputs.get('dedup_fields_lsh', ['req_http_url', 'req_body', 'rsp_body']))\nmax_field_len = int(inputs.get('max_field_len', 500))\n\nif not isinstance(strict_fields, list) or not strict_fields:\n strict_fields = ['sip', 'dip']\nif not isinstance(lsh_fields, list) or not lsh_fields:\n lsh_fields = ['req_http_url', 'req_body', 'rsp_body']\n\nprint(f'[receive] source_log_type={source_log_type}, 
total={len(alerts_input)}')\n\noutputs['raw_alerts'] = alerts_input\noutputs['stats'] = {'raw_count': len(alerts_input)}\noutputs['source_log_type'] = source_log_type\noutputs['filter_enabled'] = filter_enabled\noutputs['dedup_enabled'] = dedup_enabled\noutputs['dedup_threshold'] = dedup_threshold\noutputs['strict_fields'] = strict_fields\noutputs['lsh_fields'] = lsh_fields\noutputs['max_field_len'] = max_field_len" + "code": "import json\nimport os\n\nalerts_input = inputs.get('alerts', inputs.get('alert_list', []))\n\n# Support alert_file: load JSON from a local file path when alerts list is not given directly.\n# This lets callers pass a file path (e.g. '/Users/foo/Downloads/tdp_logs.json') instead of\n# inlining potentially thousands of alerts into the workflow inputs.\nif not alerts_input:\n alert_file = inputs.get('alert_file', '')\n if alert_file:\n alert_file = os.path.expanduser(str(alert_file))\n try:\n with open(alert_file, 'r', encoding='utf-8') as _f:\n alerts_input = json.load(_f)\n print(f'[receive] loaded {len(alerts_input) if isinstance(alerts_input, list) else 1} alerts from file: {alert_file}')\n except Exception as _e:\n print(f'[receive] WARNING: failed to load alert_file={alert_file!r}: {_e}')\n alerts_input = []\n\nif isinstance(alerts_input, str):\n try:\n alerts_input = json.loads(alerts_input)\n except Exception:\n alerts_input = []\nif isinstance(alerts_input, dict) and 'data' in alerts_input:\n alerts_input = alerts_input.get('data', [])\nif not isinstance(alerts_input, list):\n alerts_input = [alerts_input] if alerts_input else []\n\nsource_log_type = str(inputs.get('source_log_type', 'tdp')).lower()\nif source_log_type not in ('tdp', 'skyeye'):\n source_log_type = 'tdp'\n\nfilter_enabled = bool(inputs.get('filter_enabled', True))\ndedup_enabled = bool(inputs.get('dedup_enabled', True))\ndedup_threshold = float(inputs.get('threshold', inputs.get('dedup_threshold', 0.7)))\nstrict_fields = inputs.get('strict_fields', 
inputs.get('dedup_fields_strict', ['sip', 'dip']))\nlsh_fields = inputs.get('lsh_fields', inputs.get('dedup_fields_lsh', ['req_http_url', 'req_body', 'rsp_body']))\nmax_field_len = int(inputs.get('max_field_len', 500))\n\nif not isinstance(strict_fields, list) or not strict_fields:\n strict_fields = ['sip', 'dip']\nif not isinstance(lsh_fields, list) or not lsh_fields:\n lsh_fields = ['req_http_url', 'req_body', 'rsp_body']\n\nprint(f'[receive] source_log_type={source_log_type}, total={len(alerts_input)}')\n\noutputs['raw_alerts'] = alerts_input\noutputs['stats'] = {'raw_count': len(alerts_input)}\noutputs['source_log_type'] = source_log_type\noutputs['filter_enabled'] = filter_enabled\noutputs['dedup_enabled'] = dedup_enabled\noutputs['dedup_threshold'] = dedup_threshold\noutputs['strict_fields'] = strict_fields\noutputs['lsh_fields'] = lsh_fields\noutputs['max_field_len'] = max_field_len" }, { "id": "branch_log_type", @@ -56,6 +56,7 @@ "filter_enabled": true, "dedup_enabled": true, "threshold": 0.7, + "_comment_alert_file": "Alternative to 'alerts': pass a JSON file path, e.g. alert_file: '/Users/foo/Downloads/tdp_logs.json'", "alerts": [ { "net_real_src_ip": "1.2.3.4", diff --git a/flocks/server/routes/workflow.py b/flocks/server/routes/workflow.py index 99bf56bed..a992d4116 100644 --- a/flocks/server/routes/workflow.py +++ b/flocks/server/routes/workflow.py @@ -349,6 +349,20 @@ def _list_workflows_from_fs() -> List[Dict[str, Any]]: return list(by_id.values()) +async def sync_workflows_from_filesystem() -> int: + """Scan all workflow directories on the filesystem and return the count of discovered workflows. + + Called at server startup (app.py) to surface any filesystem-only workflows early. + The filesystem is already the sole source of truth for workflow definitions, so this + function only needs to scan and count—no data needs to be written anywhere. + It also triggers the one-time storage → filesystem migration as a side-effect. 
+ """ + await _migrate_storage_to_filesystem() + all_data = _list_workflows_from_fs() + log.info("workflow.sync.filesystem", {"count": len(all_data)}) + return len(all_data) + + async def _migrate_storage_to_filesystem() -> None: """One-time migration: move Storage-only workflow definitions to the filesystem. diff --git a/flocks/tool/task/run_workflow.py b/flocks/tool/task/run_workflow.py index 2741d5d7b..9dba679a1 100644 --- a/flocks/tool/task/run_workflow.py +++ b/flocks/tool/task/run_workflow.py @@ -343,12 +343,22 @@ async def run_workflow_tool( workflow_source: Union[Dict[str, Any], Path] if isinstance(workflow, str): raw = workflow.strip() - # If it's a JSON string, try to parse it. + # Try to parse as JSON first (handles JSON-encoded dicts or strings). + parsed = None try: - workflow_source = json.loads(raw) + parsed = json.loads(raw) except json.JSONDecodeError: - # Otherwise treat it as a file path. - p = Path(raw).expanduser() + pass + + if isinstance(parsed, dict): + # Valid workflow JSON object. + workflow_source = parsed + else: + # Either not JSON, or JSON that decoded to a non-dict (e.g. a JSON-encoded + # string like '"/path/to/workflow.json"'). Treat the raw value (or the + # decoded string) as a file path. + file_path_str = parsed if isinstance(parsed, str) else raw + p = Path(file_path_str).expanduser() if p.exists() and p.is_file(): workflow_source = p else: @@ -367,12 +377,25 @@ async def run_workflow_tool( error=f"workflow must be a dictionary or string, got {type(workflow).__name__}" ) + # Sanity-check dict workflows: must have at least a `start` field so we + # surface a clear error instead of a confusing Pydantic validation message. + if isinstance(workflow_source, dict) and "start" not in workflow_source: + return ToolResult( + success=False, + error=( + "Invalid workflow definition: the `start` field is required. 
" + "Make sure you pass the workflow JSON (with `start`, `nodes`, `edges`) " + "as the `workflow` parameter, not the execution inputs." + ) + ) + # Request permission (workflow execution can run arbitrary code) if isinstance(workflow_source, dict): workflow_name = workflow_source.get("name", "unnamed workflow") # Use id if available, otherwise use name or generate a fallback workflow_id = workflow_source.get("id") or workflow_source.get("name") or "unknown" else: + # workflow_source is a Path object here; Path.name gives the filename. workflow_name = workflow_source.name workflow_id = str(workflow_source) From 2580cef69749672ab458c8f851362e857e1208f6 Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Fri, 8 May 2026 19:32:45 +0800 Subject: [PATCH 16/41] fix(run_workflow): split non-dict json.loads branches with clear per-type errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous fix used 'parsed if isinstance(parsed, str) else raw' which silently passed list/int/bool results to Path(), giving a misleading 'file not found' error. Now three explicit branches: - parsed is str → double-encoded path; use decoded value as file path - parsed is None → JSONDecodeError; use raw string as file path (original behaviour) - parsed is other → clear error stating the unexpected JSON type Co-authored-by: Cursor --- flocks/tool/task/run_workflow.py | 35 +++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/flocks/tool/task/run_workflow.py b/flocks/tool/task/run_workflow.py index 9dba679a1..8f1ddf788 100644 --- a/flocks/tool/task/run_workflow.py +++ b/flocks/tool/task/run_workflow.py @@ -353,22 +353,43 @@ async def run_workflow_tool( if isinstance(parsed, dict): # Valid workflow JSON object. workflow_source = parsed - else: - # Either not JSON, or JSON that decoded to a non-dict (e.g. a JSON-encoded - # string like '"/path/to/workflow.json"'). 
Treat the raw value (or the - # decoded string) as a file path. - file_path_str = parsed if isinstance(parsed, str) else raw - p = Path(file_path_str).expanduser() + elif isinstance(parsed, str): + # json.loads decoded a JSON-encoded string, e.g. the AI double-encoded the + # path: workflow='"/path/to/workflow.json"' → parsed='/path/to/workflow.json'. + # Use the decoded string (no surrounding quotes) as the file path. + p = Path(parsed).expanduser() + if p.exists() and p.is_file(): + workflow_source = p + else: + return ToolResult( + success=False, + error=( + f"Workflow file not found: {parsed!r}. " + "Provide a valid workflow JSON file path or a workflow dict." + ) + ) + elif parsed is None: + # json.loads raised JSONDecodeError — raw is not JSON; treat as a plain file path. + p = Path(raw).expanduser() if p.exists() and p.is_file(): workflow_source = p else: return ToolResult( success=False, error=( - "Unsupported workflow string. Provide a workflow JSON string, " + "Unsupported workflow string. Provide a workflow JSON string " "or a valid workflow JSON file path." ) ) + else: + # json.loads returned list / int / bool — not a valid workflow parameter. + return ToolResult( + success=False, + error=( + f"Invalid workflow parameter: expected a workflow dict or a file path string, " + f"got JSON-decoded {type(parsed).__name__} ({parsed!r})." + ) + ) elif isinstance(workflow, dict): workflow_source = workflow else: From 6265dd1f6da61c40a3cf2024250a8a96eae38fcc Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Sat, 9 May 2026 18:28:27 +0800 Subject: [PATCH 17/41] fix(workflow): re-apply workspace path for LSH state (lost in dev merge) The get_state_paths() change to ~/.flocks/workspace/workflows/ was lost when merging dev into feat/alert-dedup-workflow (da58f7b2). 
Re-apply: - Remove top-level `from flocks.config import Config` import - Replace Config().get_data_path() with Config().get_global().data_dir.parent / 'workspace' / 'workflows' - Update node description accordingly Also sync the running release snapshot and restart the service process so the /invoke API immediately uses the new path. Co-authored-by: Cursor --- .flocks/plugins/workflows/http_alert_dedup/workflow.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.flocks/plugins/workflows/http_alert_dedup/workflow.json b/.flocks/plugins/workflows/http_alert_dedup/workflow.json index 20bce3249..122f1d776 100644 --- a/.flocks/plugins/workflows/http_alert_dedup/workflow.json +++ b/.flocks/plugins/workflows/http_alert_dedup/workflow.json @@ -37,8 +37,8 @@ { "id": "dedup_logs", "type": "python", - "description": "Step 3 — Dedup (terminal): URI normalization + MinHash LSH (datasketch, 128 perms, 5-gram shingles). LSH index + dedup_key cache are persisted to ~/.flocks/data/workflows/http_alert_dedup/ with atomic write and a cross-platform exclusive file lock (POSIX fcntl / Windows msvcrt). Survives restarts; safe for concurrent runs. 
When dedup_enabled=False, no disk state is read or written and cross-batch duplicate detection is intentionally disabled.", - "code": "import os\nimport re\nimport sys\nimport pickle\nimport hashlib\nfrom datasketch import MinHash, MinHashLSH\nfrom flocks.config import Config\n\nIS_WINDOWS = sys.platform == 'win32'\nif IS_WINDOWS:\n import msvcrt # noqa: F401\nelse:\n import fcntl # noqa: F401\n\nMINHASH_SEED = 2024\nNUM_PERM = 128\nWORKFLOW_NAME = 'http_alert_dedup'\nLSH_CLUSTER_WARN_THRESHOLD = 100000 # Warn when persisted cluster or dedup-key count exceeds this.\n\ndef normalize_uri(uri):\n # Normalize dynamic segments in URIs to reduce noise before shingling.\n # Mirrors utils.normalize_uri() in lsh_processor.py.\n uri = str(uri or '')\n uri = re.sub(r'\\d{4}-\\d{2}-\\d{2}', 'DATETIME', uri)\n uri = re.sub(r'[\\da-f]{8}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{12}', 'UUID', uri, flags=re.IGNORECASE)\n uri = re.sub(r'(\\.\\./)+', 'TRAVERSAL', uri)\n uri = re.sub(r'\\bNULL\\b', 'NULL_REPLACED', uri)\n uri = re.sub(r'chr\\$\\d+\\$\\|\\|chr\\$\\d+\\$', 'CHR_SEQUENCE', uri)\n uri = re.sub(r'\\b\\d+={1,2}\\d+\\b', 'NUMBER_COMPARISON', uri)\n uri = re.sub(r'\\b[a-fA-F0-9]{32}\\b', 'HEXADECIMAL CHARACTERS', uri)\n return uri\n\ndef gen_minhash(text, permutations):\n # Build a MinHash signature from 5-gram shingles of the text.\n # Mirrors LSHProcessor.gen_text_minhash(text, k=5).\n shingles = [text[i:i+5] for i in range(len(text) - 4)]\n m = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED, permutations=permutations)\n for s in shingles:\n m.update(s.encode('utf-8'))\n return m\n\ndef get_state_paths(threshold):\n # Persist LSH state under ~/.flocks/data/workflows//.\n # Returns (state_path, lock_path). 
Filename encodes NUM_PERM + threshold so that\n # parameter changes implicitly reset the state.\n data_dir = str(Config().get_data_path())\n state_dir = os.path.join(data_dir, 'workflows', WORKFLOW_NAME)\n os.makedirs(state_dir, exist_ok=True)\n base = os.path.join(state_dir, f'lsh_state_np{NUM_PERM}_th{int(threshold * 100)}')\n return base + '.pkl', base + '.lock'\n\ndef acquire_lock(lock_path):\n # Cross-platform exclusive file lock that serializes load+modify+dump across\n # concurrent workflow runs racing on the same state file.\n # POSIX: fcntl.flock(LOCK_EX) — blocks until acquired.\n # Windows: msvcrt.locking(LK_LOCK) — locks 1 byte; blocks ~10s per call,\n # so we loop until the lock is granted.\n fh = open(lock_path, 'w+')\n if IS_WINDOWS:\n fh.write('L')\n fh.flush()\n fh.seek(0)\n while True:\n try:\n msvcrt.locking(fh.fileno(), msvcrt.LK_LOCK, 1)\n break\n except OSError:\n # LK_LOCK retried internally and still failed; loop and try again.\n continue\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_EX)\n return fh\n\ndef release_lock(fh):\n try:\n if IS_WINDOWS:\n try:\n fh.seek(0)\n msvcrt.locking(fh.fileno(), msvcrt.LK_UNLCK, 1)\n except OSError:\n pass\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_UN)\n finally:\n fh.close()\n\ndef load_state(state_path, threshold):\n # Returns (lsh_index, lsh_cache, dedup_key_cache).\n # On any error or parameter mismatch, returns (None, None, None) so the caller\n # initializes fresh state.\n if not os.path.exists(state_path) or os.path.getsize(state_path) == 0:\n return None, None, None\n try:\n with open(state_path, 'rb') as f:\n state = pickle.load(f)\n if state.get('num_perm') != NUM_PERM or state.get('threshold') != threshold:\n print(f'[dedup] state params mismatch (stored np={state.get(\"num_perm\")}, th={state.get(\"threshold\")}), starting fresh')\n return None, None, None\n cache = state['lsh_cache']\n seen = state.get('dedup_key_cache', set())\n print(f'[dedup] loaded state: {len(cache)} clusters, 
{len(seen)} dedup_keys from {state_path}')\n return state['lsh_index'], cache, seen\n except Exception as e:\n print(f'[dedup] failed to load state ({e}), starting fresh')\n return None, None, None\n\ndef dump_state_atomic(state_path, lsh_index, lsh_cache, dedup_key_cache, threshold):\n # Atomic write: pickle to .tmp, fsync, then os.replace() over the target.\n # Crash mid-write leaves the original file intact instead of a corrupt half-file.\n tmp = state_path + '.tmp'\n try:\n state = {\n 'lsh_index': lsh_index,\n 'lsh_cache': lsh_cache,\n 'dedup_key_cache': dedup_key_cache,\n 'num_perm': NUM_PERM,\n 'threshold': threshold,\n }\n with open(tmp, 'wb') as f:\n pickle.dump(state, f)\n f.flush()\n os.fsync(f.fileno())\n os.replace(tmp, state_path)\n print(f'[dedup] state saved: {len(lsh_cache)} clusters, {len(dedup_key_cache)} dedup_keys -> {state_path}')\n except Exception as e:\n print(f'[dedup] failed to save state: {e}')\n if os.path.exists(tmp):\n try:\n os.remove(tmp)\n except Exception:\n pass\n\nfiltered_alerts = inputs.get('filtered_alerts', [])\ndedup_enabled = inputs.get('dedup_enabled', True)\nthreshold = float(inputs.get('dedup_threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_len = int(inputs.get('max_field_len', 500))\nstats = dict(inputs.get('stats', {}))\n\n# Shared permutation parameters across all MinHash objects.\n# Mirrors LSHProcessor.__init__: self.permutations = MinHash(...).permutations\n_permutations = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED).permutations\n\nstate_path, lock_path = get_state_paths(threshold)\nlock_fh = acquire_lock(lock_path) if dedup_enabled else None\n\ntry:\n # lsh_index: MinHashLSH band/row index — O(1) approximate candidate lookup.\n # lsh_cache: int cluster_id -> MinHash, used for exact Jaccard re-ranking.\n # dedup_key_cache: set of MD5 dedup_keys ever seen — survives restarts so that\n # 
dedup_key_already_exists is correct across batches.\n if dedup_enabled:\n lsh_index, lsh_cache, dedup_key_cache = load_state(state_path, threshold)\n if lsh_index is None:\n lsh_index = MinHashLSH(threshold=threshold, num_perm=NUM_PERM)\n lsh_cache = {}\n dedup_key_cache = set()\n else:\n lsh_index, lsh_cache, dedup_key_cache = None, {}, set()\n\n def query_most_similar(minhash):\n # Mirrors LSHProcessor.query_most_similar():\n # 1. LSH candidate lookup (fast, approximate).\n # 2. Exact Jaccard re-ranking among top-100 candidates.\n # 3. If no candidate found, insert as a new cluster and return its id.\n sim_keys = lsh_index.query(minhash)\n if sim_keys:\n candidates = sim_keys[:100]\n sims = [minhash.jaccard(lsh_cache[k]) for k in candidates]\n return candidates[sims.index(max(sims))]\n cluster_id = len(lsh_cache)\n lsh_index.insert(cluster_id, minhash)\n lsh_cache[cluster_id] = minhash\n return cluster_id\n\n keyed = []\n for alert in filtered_alerts:\n alert = dict(alert)\n text_strict = '. '.join(str(alert.get(f, ''))[:max_len] for f in strict_fields)\n text_lsh = normalize_uri('. '.join(str(alert.get(f, ''))[:max_len] for f in lsh_fields))\n\n if not dedup_enabled:\n # Dedup disabled: hash raw text, no clustering, no cross-batch tracking.\n dk = hashlib.md5(f'{text_strict}. {text_lsh}'.encode('utf-8')).hexdigest()\n alert['_lsh_cluster_id'] = None\n alert['dedup_key'] = dk\n alert['dedup_key_already_exists'] = dk in dedup_key_cache\n dedup_key_cache.add(dk)\n keyed.append(alert)\n continue\n\n mh = gen_minhash(text_lsh.lower(), _permutations)\n cluster_id = query_most_similar(mh)\n alert['_lsh_cluster_id'] = cluster_id\n\n # dedup_key = MD5(strict_fields_text + '.' + cluster_id)\n # Mirrors LogDedup._generate_dedup_key_text / hashlib_gen_md5_str.\n dk = hashlib.md5(f'{text_strict}. 
{cluster_id}'.encode('utf-8')).hexdigest()\n alert['dedup_key'] = dk\n # Cross-batch awareness: dedup_key_cache is loaded from disk at start.\n alert['dedup_key_already_exists'] = dk in dedup_key_cache\n dedup_key_cache.add(dk)\n keyed.append(alert)\n\n if dedup_enabled:\n if len(lsh_cache) > LSH_CLUSTER_WARN_THRESHOLD or len(dedup_key_cache) > LSH_CLUSTER_WARN_THRESHOLD:\n print(f'[dedup] WARNING: persisted state holds {len(lsh_cache)} clusters '\n f'and {len(dedup_key_cache)} dedup_keys (threshold={LSH_CLUSTER_WARN_THRESHOLD}); '\n f'consider rotating state file at {state_path}')\n dump_state_atomic(state_path, lsh_index, lsh_cache, dedup_key_cache, threshold)\nfinally:\n if lock_fh is not None:\n release_lock(lock_fh)\n\nseen = {}\nunique_alerts = []\nfor a in keyed:\n k = a['dedup_key']\n if k not in seen:\n seen[k] = a\n unique_alerts.append(a)\n\ndup_count = len(keyed) - len(unique_alerts)\nprint(f'[dedup] input={len(filtered_alerts)}, unique_clusters={len(unique_alerts)}, deduped={dup_count}')\n\nstats['after_dedup_count'] = len(keyed)\nstats['unique_key_count'] = len(unique_alerts)\nstats['dedup_removed_count'] = dup_count\nstats['dedup_ratio'] = round(dup_count / len(keyed), 4) if keyed else 0.0\nstats['dedup_state_persisted'] = bool(dedup_enabled)\nif dedup_enabled:\n # Only expose persistence-related fields when state is actually read/written.\n stats['lsh_total_clusters'] = len(lsh_cache)\n stats['lsh_total_dedup_keys'] = len(dedup_key_cache)\n stats['lsh_state_path'] = state_path\n\nif dedup_enabled:\n summary = (\n f'http_alert_dedup done: raw={stats.get(\"raw_count\", 0)}'\n f' -> normalized={stats.get(\"normalized_count\", 0)}'\n f' -> filtered={stats.get(\"after_filter_count\", 0)}'\n f' -> unique_clusters={len(unique_alerts)} (compression {stats[\"dedup_ratio\"]:.1%})'\n f' | persisted_clusters={len(lsh_cache)}, persisted_keys={len(dedup_key_cache)}'\n )\nelse:\n summary = (\n f'http_alert_dedup done (dedup_enabled=False, no state persisted): '\n 
f'raw={stats.get(\"raw_count\", 0)}'\n f' -> normalized={stats.get(\"normalized_count\", 0)}'\n f' -> filtered={stats.get(\"after_filter_count\", 0)}'\n f' -> unique={len(unique_alerts)} (in-batch only)'\n )\nprint(f'[dedup] {summary}')\n\noutputs['deduped_alerts'] = keyed\noutputs['unique_alerts'] = unique_alerts\noutputs['stats'] = stats\noutputs['dedup_summary'] = summary" + "description": "Step 3 — Dedup (terminal): URI normalization + MinHash LSH (datasketch, 128 perms, 5-gram shingles). LSH index + dedup_key cache are persisted to ~/.flocks/workspace/workflows/http_alert_dedup/ with atomic write and a cross-platform exclusive file lock (POSIX fcntl / Windows msvcrt). Survives restarts; safe for concurrent runs. When dedup_enabled=False, no disk state is read or written and cross-batch duplicate detection is intentionally disabled.", + "code": "import os\nimport re\nimport sys\nimport pickle\nimport hashlib\nfrom datasketch import MinHash, MinHashLSH\nIS_WINDOWS = sys.platform == 'win32'\nif IS_WINDOWS:\n import msvcrt # noqa: F401\nelse:\n import fcntl # noqa: F401\n\nMINHASH_SEED = 2024\nNUM_PERM = 128\nWORKFLOW_NAME = 'http_alert_dedup'\nLSH_CLUSTER_WARN_THRESHOLD = 100000 # Warn when persisted cluster or dedup-key count exceeds this.\n\ndef normalize_uri(uri):\n # Normalize dynamic segments in URIs to reduce noise before shingling.\n # Mirrors utils.normalize_uri() in lsh_processor.py.\n uri = str(uri or '')\n uri = re.sub(r'\\d{4}-\\d{2}-\\d{2}', 'DATETIME', uri)\n uri = re.sub(r'[\\da-f]{8}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{12}', 'UUID', uri, flags=re.IGNORECASE)\n uri = re.sub(r'(\\.\\./)+', 'TRAVERSAL', uri)\n uri = re.sub(r'\\bNULL\\b', 'NULL_REPLACED', uri)\n uri = re.sub(r'chr\\$\\d+\\$\\|\\|chr\\$\\d+\\$', 'CHR_SEQUENCE', uri)\n uri = re.sub(r'\\b\\d+={1,2}\\d+\\b', 'NUMBER_COMPARISON', uri)\n uri = re.sub(r'\\b[a-fA-F0-9]{32}\\b', 'HEXADECIMAL CHARACTERS', uri)\n return uri\n\ndef gen_minhash(text, permutations):\n # Build a MinHash 
signature from 5-gram shingles of the text.\n # Mirrors LSHProcessor.gen_text_minhash(text, k=5).\n shingles = [text[i:i+5] for i in range(len(text) - 4)]\n m = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED, permutations=permutations)\n for s in shingles:\n m.update(s.encode('utf-8'))\n return m\n\ndef get_state_paths(threshold):\n # Persist under ~/.flocks/workspace/workflows// \u2014 the flocks workspace\n # output directory, consistent with where other runtime outputs land.\n from flocks.config import Config\n flocks_root = Config().get_global().data_dir.parent # ~/.flocks\n state_dir = str(flocks_root / 'workspace' / 'workflows' / WORKFLOW_NAME)\n os.makedirs(state_dir, exist_ok=True)\n base = os.path.join(state_dir, f'lsh_state_np{NUM_PERM}_th{int(threshold * 100)}')\n return base + '.pkl', base + '.lock'\n\ndef acquire_lock(lock_path):\n # Cross-platform exclusive file lock that serializes load+modify+dump across\n # concurrent workflow runs racing on the same state file.\n # POSIX: fcntl.flock(LOCK_EX) — blocks until acquired.\n # Windows: msvcrt.locking(LK_LOCK) — locks 1 byte; blocks ~10s per call,\n # so we loop until the lock is granted.\n fh = open(lock_path, 'w+')\n if IS_WINDOWS:\n fh.write('L')\n fh.flush()\n fh.seek(0)\n while True:\n try:\n msvcrt.locking(fh.fileno(), msvcrt.LK_LOCK, 1)\n break\n except OSError:\n # LK_LOCK retried internally and still failed; loop and try again.\n continue\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_EX)\n return fh\n\ndef release_lock(fh):\n try:\n if IS_WINDOWS:\n try:\n fh.seek(0)\n msvcrt.locking(fh.fileno(), msvcrt.LK_UNLCK, 1)\n except OSError:\n pass\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_UN)\n finally:\n fh.close()\n\ndef load_state(state_path, threshold):\n # Returns (lsh_index, lsh_cache, dedup_key_cache).\n # On any error or parameter mismatch, returns (None, None, None) so the caller\n # initializes fresh state.\n if not os.path.exists(state_path) or os.path.getsize(state_path) == 0:\n return 
None, None, None\n try:\n with open(state_path, 'rb') as f:\n state = pickle.load(f)\n if state.get('num_perm') != NUM_PERM or state.get('threshold') != threshold:\n print(f'[dedup] state params mismatch (stored np={state.get(\"num_perm\")}, th={state.get(\"threshold\")}), starting fresh')\n return None, None, None\n cache = state['lsh_cache']\n seen = state.get('dedup_key_cache', set())\n print(f'[dedup] loaded state: {len(cache)} clusters, {len(seen)} dedup_keys from {state_path}')\n return state['lsh_index'], cache, seen\n except Exception as e:\n print(f'[dedup] failed to load state ({e}), starting fresh')\n return None, None, None\n\ndef dump_state_atomic(state_path, lsh_index, lsh_cache, dedup_key_cache, threshold):\n # Atomic write: pickle to .tmp, fsync, then os.replace() over the target.\n # Crash mid-write leaves the original file intact instead of a corrupt half-file.\n tmp = state_path + '.tmp'\n try:\n state = {\n 'lsh_index': lsh_index,\n 'lsh_cache': lsh_cache,\n 'dedup_key_cache': dedup_key_cache,\n 'num_perm': NUM_PERM,\n 'threshold': threshold,\n }\n with open(tmp, 'wb') as f:\n pickle.dump(state, f)\n f.flush()\n os.fsync(f.fileno())\n os.replace(tmp, state_path)\n print(f'[dedup] state saved: {len(lsh_cache)} clusters, {len(dedup_key_cache)} dedup_keys -> {state_path}')\n except Exception as e:\n print(f'[dedup] failed to save state: {e}')\n if os.path.exists(tmp):\n try:\n os.remove(tmp)\n except Exception:\n pass\n\nfiltered_alerts = inputs.get('filtered_alerts', [])\ndedup_enabled = inputs.get('dedup_enabled', True)\nthreshold = float(inputs.get('dedup_threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_len = int(inputs.get('max_field_len', 500))\nstats = dict(inputs.get('stats', {}))\n\n# Shared permutation parameters across all MinHash objects.\n# Mirrors LSHProcessor.__init__: self.permutations = 
MinHash(...).permutations\n_permutations = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED).permutations\n\nstate_path, lock_path = get_state_paths(threshold)\nlock_fh = acquire_lock(lock_path) if dedup_enabled else None\n\ntry:\n # lsh_index: MinHashLSH band/row index — O(1) approximate candidate lookup.\n # lsh_cache: int cluster_id -> MinHash, used for exact Jaccard re-ranking.\n # dedup_key_cache: set of MD5 dedup_keys ever seen — survives restarts so that\n # dedup_key_already_exists is correct across batches.\n if dedup_enabled:\n lsh_index, lsh_cache, dedup_key_cache = load_state(state_path, threshold)\n if lsh_index is None:\n lsh_index = MinHashLSH(threshold=threshold, num_perm=NUM_PERM)\n lsh_cache = {}\n dedup_key_cache = set()\n else:\n lsh_index, lsh_cache, dedup_key_cache = None, {}, set()\n\n def query_most_similar(minhash):\n # Mirrors LSHProcessor.query_most_similar():\n # 1. LSH candidate lookup (fast, approximate).\n # 2. Exact Jaccard re-ranking among top-100 candidates.\n # 3. If no candidate found, insert as a new cluster and return its id.\n sim_keys = lsh_index.query(minhash)\n if sim_keys:\n candidates = sim_keys[:100]\n sims = [minhash.jaccard(lsh_cache[k]) for k in candidates]\n return candidates[sims.index(max(sims))]\n cluster_id = len(lsh_cache)\n lsh_index.insert(cluster_id, minhash)\n lsh_cache[cluster_id] = minhash\n return cluster_id\n\n keyed = []\n for alert in filtered_alerts:\n alert = dict(alert)\n text_strict = '. '.join(str(alert.get(f, ''))[:max_len] for f in strict_fields)\n text_lsh = normalize_uri('. '.join(str(alert.get(f, ''))[:max_len] for f in lsh_fields))\n\n if not dedup_enabled:\n # Dedup disabled: hash raw text, no clustering, no cross-batch tracking.\n dk = hashlib.md5(f'{text_strict}. 
{text_lsh}'.encode('utf-8')).hexdigest()\n alert['_lsh_cluster_id'] = None\n alert['dedup_key'] = dk\n alert['dedup_key_already_exists'] = dk in dedup_key_cache\n dedup_key_cache.add(dk)\n keyed.append(alert)\n continue\n\n mh = gen_minhash(text_lsh.lower(), _permutations)\n cluster_id = query_most_similar(mh)\n alert['_lsh_cluster_id'] = cluster_id\n\n # dedup_key = MD5(strict_fields_text + '.' + cluster_id)\n # Mirrors LogDedup._generate_dedup_key_text / hashlib_gen_md5_str.\n dk = hashlib.md5(f'{text_strict}. {cluster_id}'.encode('utf-8')).hexdigest()\n alert['dedup_key'] = dk\n # Cross-batch awareness: dedup_key_cache is loaded from disk at start.\n alert['dedup_key_already_exists'] = dk in dedup_key_cache\n dedup_key_cache.add(dk)\n keyed.append(alert)\n\n if dedup_enabled:\n if len(lsh_cache) > LSH_CLUSTER_WARN_THRESHOLD or len(dedup_key_cache) > LSH_CLUSTER_WARN_THRESHOLD:\n print(f'[dedup] WARNING: persisted state holds {len(lsh_cache)} clusters '\n f'and {len(dedup_key_cache)} dedup_keys (threshold={LSH_CLUSTER_WARN_THRESHOLD}); '\n f'consider rotating state file at {state_path}')\n dump_state_atomic(state_path, lsh_index, lsh_cache, dedup_key_cache, threshold)\nfinally:\n if lock_fh is not None:\n release_lock(lock_fh)\n\nseen = {}\nunique_alerts = []\nfor a in keyed:\n k = a['dedup_key']\n if k not in seen:\n seen[k] = a\n unique_alerts.append(a)\n\ndup_count = len(keyed) - len(unique_alerts)\nprint(f'[dedup] input={len(filtered_alerts)}, unique_clusters={len(unique_alerts)}, deduped={dup_count}')\n\nstats['after_dedup_count'] = len(keyed)\nstats['unique_key_count'] = len(unique_alerts)\nstats['dedup_removed_count'] = dup_count\nstats['dedup_ratio'] = round(dup_count / len(keyed), 4) if keyed else 0.0\nstats['dedup_state_persisted'] = bool(dedup_enabled)\nif dedup_enabled:\n # Only expose persistence-related fields when state is actually read/written.\n stats['lsh_total_clusters'] = len(lsh_cache)\n stats['lsh_total_dedup_keys'] = len(dedup_key_cache)\n 
stats['lsh_state_path'] = state_path\n\nif dedup_enabled:\n summary = (\n f'http_alert_dedup done: raw={stats.get(\"raw_count\", 0)}'\n f' -> normalized={stats.get(\"normalized_count\", 0)}'\n f' -> filtered={stats.get(\"after_filter_count\", 0)}'\n f' -> unique_clusters={len(unique_alerts)} (compression {stats[\"dedup_ratio\"]:.1%})'\n f' | persisted_clusters={len(lsh_cache)}, persisted_keys={len(dedup_key_cache)}'\n )\nelse:\n summary = (\n f'http_alert_dedup done (dedup_enabled=False, no state persisted): '\n f'raw={stats.get(\"raw_count\", 0)}'\n f' -> normalized={stats.get(\"normalized_count\", 0)}'\n f' -> filtered={stats.get(\"after_filter_count\", 0)}'\n f' -> unique={len(unique_alerts)} (in-batch only)'\n )\nprint(f'[dedup] {summary}')\n\noutputs['deduped_alerts'] = keyed\noutputs['unique_alerts'] = unique_alerts\noutputs['stats'] = stats\noutputs['dedup_summary'] = summary" } ], "edges": [ From 9930b7220f0401616d5dca403e0c8b2583c39aa9 Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Sat, 9 May 2026 18:34:39 +0800 Subject: [PATCH 18/41] feat(workflow): record invocation stats for published-service invoke calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit workflow_center_invoke (POST /workflow-center/{id}/invoke) now records execution stats via create_execution_record + _update_workflow_stats + _record_execution_result, so that UI callCount / successCount / errorCount counters are updated for every /workflow-center call — not only for agent-driven /workflow/{id}/run calls. 
Co-authored-by: Cursor --- flocks/server/routes/workflow.py | 48 ++++++++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/flocks/server/routes/workflow.py b/flocks/server/routes/workflow.py index a636d0267..ed6b8e092 100644 --- a/flocks/server/routes/workflow.py +++ b/flocks/server/routes/workflow.py @@ -1026,21 +1026,59 @@ async def workflow_center_stop(workflow_id: str): @router.post("/workflow-center/{workflow_id}/invoke") async def workflow_center_invoke(workflow_id: str, req: WorkflowCenterInvokeRequest): - """Proxy invoke request to active published workflow service.""" + """Proxy invoke request to active published workflow service. + + Also records execution stats (callCount / successCount / errorCount) so + that the UI invocation counter is updated for every published-service call, + not just agent-driven /run calls. + """ + started = time.time() + exec_data = await create_execution_record( + workflow_id, + input_params=req.inputs or {}, + ) + exec_id = str(exec_data["id"]) try: - return await invoke_published_workflow( + result = await invoke_published_workflow( workflow_id, inputs=req.inputs, timeout_s=req.timeout_s, request_id=req.request_id, ) - except WorkflowNotFoundError as e: - raise HTTPException(status_code=404, detail=str(e)) - except WorkflowNotPublishedError as e: + duration = time.time() - started + raw_status = result.get("status", "SUCCEEDED") if isinstance(result, dict) else "SUCCEEDED" + status_value = _normalize_execution_status(raw_status) + success = status_value == "success" + await _update_workflow_stats(workflow_id, success, duration) + exec_data.update({ + "outputResults": result.get("outputs", {}) if isinstance(result, dict) else {}, + "status": status_value, + "finishedAt": int(time.time() * 1000), + "duration": duration, + "currentPhase": status_value, + }) + await _record_execution_result(workflow_id, exec_id, exec_data) + return result + except (WorkflowNotFoundError, 
WorkflowNotPublishedError) as e: + duration = time.time() - started + await _update_workflow_stats(workflow_id, False, duration) + exec_data.update({"status": "error", "finishedAt": int(time.time() * 1000), + "duration": duration, "errorMessage": str(e)}) + await _record_execution_result(workflow_id, exec_id, exec_data) raise HTTPException(status_code=404, detail=str(e)) except WorkflowCenterError as e: + duration = time.time() - started + await _update_workflow_stats(workflow_id, False, duration) + exec_data.update({"status": "error", "finishedAt": int(time.time() * 1000), + "duration": duration, "errorMessage": str(e)}) + await _record_execution_result(workflow_id, exec_id, exec_data) raise HTTPException(status_code=400, detail=str(e)) except Exception as e: + duration = time.time() - started + await _update_workflow_stats(workflow_id, False, duration) + exec_data.update({"status": "error", "finishedAt": int(time.time() * 1000), + "duration": duration, "errorMessage": str(e)}) + await _record_execution_result(workflow_id, exec_id, exec_data) log.error("workflow.center.invoke.error", {"workflow_id": workflow_id, "error": str(e)}) raise HTTPException(status_code=500, detail=f"Failed to invoke workflow service: {str(e)}") From 276beabcdce8548baba13726754d6272fdc24995 Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Sat, 9 May 2026 20:31:49 +0800 Subject: [PATCH 19/41] feat(workflow): add alert dedup-triage pipeline and harden LSH eviction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New workflow: - Add alert_dedup_triage workflow: chains http_alert_dedup → tdp_alert_triage in a single pipeline; per-alert dedup via MinHash LSH, then LLM triage for first-seen unique alerts; duplicate alerts are annotated with cached triage results from a persisted triage_cache.pkl (FIFO LRU, max_dedup_keys cap). http_alert_dedup improvements: - Upgrade dedup_key_cache from set to ordered dict for FIFO LRU eviction. 
- Add max_dedup_keys input param (default 100 000); oldest entries are evicted before each persist so the state file stays bounded. - Use a monotonic cluster_id counter (_cid_box) instead of len(lsh_cache) so IDs never collide after eviction; remove evicted cluster_ids from the MinHashLSH index to prevent stale query results. - Expose lsh_max_dedup_keys / lsh_evicted_keys / lsh_evicted_clusters in stats. - Forward max_dedup_keys through normalize_* and filter_logs nodes. tdp_alert_triage: - Remove web-log detection branch; workflow now accepts HTTP alerts directly. - Add _strip_think() to all LLM nodes to strip `think`-tag blocks. - Update workflow.md to reflect simplified node structure. Test tools (moved from scripts/ → tests/integration/): - test_http_alert_dedup_stream.py: streaming simulation for http_alert_dedup. - test_alert_dedup_triage_stream.py: end-to-end dedup → triage pipeline test. Co-authored-by: Cursor --- .../alert_dedup_triage/workflow.json | 52 ++++ .../workflows/alert_dedup_triage/workflow.md | 89 +++++++ .../workflows/http_alert_dedup/workflow.json | 57 ++-- .../workflows/tdp_alert_triage/workflow.json | 159 ++++++++--- .../workflows/tdp_alert_triage/workflow.md | 218 +++++++-------- .../test_alert_dedup_triage_stream.py | 248 ++++++++++++++++++ .../test_http_alert_dedup_stream.py | 178 +++++++++++++ 7 files changed, 825 insertions(+), 176 deletions(-) create mode 100644 .flocks/plugins/workflows/alert_dedup_triage/workflow.json create mode 100644 .flocks/plugins/workflows/alert_dedup_triage/workflow.md create mode 100644 tests/integration/test_alert_dedup_triage_stream.py create mode 100755 tests/integration/test_http_alert_dedup_stream.py diff --git a/.flocks/plugins/workflows/alert_dedup_triage/workflow.json b/.flocks/plugins/workflows/alert_dedup_triage/workflow.json new file mode 100644 index 000000000..9250f7034 --- /dev/null +++ b/.flocks/plugins/workflows/alert_dedup_triage/workflow.json @@ -0,0 +1,52 @@ +{ + "name": "alert_dedup_triage", +
"description": "Chained pipeline: http_alert_dedup → tdp_alert_triage. Deduplicates incoming TDP/HTTP alerts with MinHash LSH, then runs LLM-based triage (survey / CVE / payload analysis in parallel) on each unique alert.", + "description_cn": "去重+研判串联工作流:先调用 http_alert_dedup 做 MinHash LSH 去重,再对每条首次出现的唯一告警调用 tdp_alert_triage 进行 LLM 研判;重复告警会从持久化研判缓存(triage_cache.pkl)中回填历史研判结果(stage=duplicate_with_triage)。研判缓存上限由 max_dedup_keys 控制(默认 10W,FIFO 淘汰)。", + "start": "receive_alerts", + "nodes": [ + { + "id": "receive_alerts", + "type": "python", + "description": "解析输入告警列表(与 http_alert_dedup 接口相同):支持 alerts 列表或 alert_file 文件路径,提取去重 / 研判配置参数及服务 URL", + "code": "\nimport json\nimport os\n\nalerts_input = inputs.get('alerts', inputs.get('alert_list', []))\n\nif not alerts_input:\n alert_file = inputs.get('alert_file', '')\n if alert_file:\n alert_file = os.path.expanduser(str(alert_file))\n try:\n with open(alert_file, 'r', encoding='utf-8') as _f:\n alerts_input = json.load(_f)\n print(f'[receive] loaded {len(alerts_input) if isinstance(alerts_input, list) else 1} alerts from file: {alert_file}')\n except Exception as _e:\n print(f'[receive] WARNING: failed to load alert_file={alert_file!r}: {_e}')\n alerts_input = []\n\nif isinstance(alerts_input, str):\n try:\n alerts_input = json.loads(alerts_input)\n except Exception:\n alerts_input = []\nif isinstance(alerts_input, dict) and 'data' in alerts_input:\n alerts_input = alerts_input.get('data', [])\nif not isinstance(alerts_input, list):\n alerts_input = [alerts_input] if alerts_input else []\n\nsource_log_type = str(inputs.get('source_log_type', 'tdp')).lower()\nif source_log_type not in ('tdp', 'skyeye'):\n source_log_type = 'tdp'\n\nfilter_enabled = bool(inputs.get('filter_enabled', True))\ndedup_enabled = bool(inputs.get('dedup_enabled', True))\nthreshold = float(inputs.get('threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 
'rsp_body'])\nmax_field_len = int(inputs.get('max_field_len', 500))\n# Maximum triage-cache entries kept on disk (FIFO eviction, same default as LSH).\nmax_dedup_keys = int(inputs.get('max_dedup_keys', 100000))\nif max_dedup_keys < 1:\n max_dedup_keys = 100000\n\nprint(f'[receive] raw_alerts={len(alerts_input)}, source_log_type={source_log_type}, max_dedup_keys={max_dedup_keys}')\n\noutputs['raw_alerts'] = alerts_input\noutputs['source_log_type'] = source_log_type\noutputs['filter_enabled'] = filter_enabled\noutputs['dedup_enabled'] = dedup_enabled\noutputs['threshold'] = threshold\noutputs['strict_fields'] = strict_fields\noutputs['lsh_fields'] = lsh_fields\noutputs['max_field_len'] = max_field_len\noutputs['max_dedup_keys'] = max_dedup_keys\noutputs['dedup_service_url'] = inputs.get('dedup_service_url', 'http://127.0.0.1:19000')\noutputs['triage_service_url'] = inputs.get('triage_service_url', 'http://127.0.0.1:19001')\n" + }, + { + "id": "dedup_and_triage", + "type": "python", + "description": "核心循环节点:逐条调用 http_alert_dedup 服务去重,对首次出现的唯一告警调用 tdp_alert_triage 服务研判;命中去重的重复告警会回填历史研判结果(从持久化研判缓存 triage_cache.pkl 读取),研判缓存支持 FIFO LRU 淘汰(max_dedup_keys 可调)", + "code": "\nimport json\nimport os\nimport pickle\nimport sys\nimport time\nimport urllib.request\nimport urllib.error\n\nIS_WINDOWS = sys.platform == 'win32'\nif IS_WINDOWS:\n import msvcrt # noqa: F401\nelse:\n import fcntl # noqa: F401\n\nWORKFLOW_NAME = 'alert_dedup_triage'\n\n# ── Triage cache helpers ────────────────────────────────────────\n\ndef _triage_cache_path():\n from flocks.config import Config\n flocks_root = Config().get_global().data_dir.parent # ~/.flocks\n state_dir = flocks_root / 'workspace' / 'workflows' / WORKFLOW_NAME\n state_dir.mkdir(parents=True, exist_ok=True)\n return str(state_dir / 'triage_cache.pkl'), str(state_dir / 'triage_cache.lock')\n\ndef _acquire_lock(lock_path):\n fh = open(lock_path, 'w+')\n if IS_WINDOWS:\n fh.write('L'); fh.flush(); fh.seek(0)\n while True:\n try:\n 
msvcrt.locking(fh.fileno(), msvcrt.LK_LOCK, 1); break\n except OSError:\n continue\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_EX)\n return fh\n\ndef _release_lock(fh):\n try:\n if IS_WINDOWS:\n try:\n fh.seek(0); msvcrt.locking(fh.fileno(), msvcrt.LK_UNLCK, 1)\n except OSError:\n pass\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_UN)\n finally:\n fh.close()\n\ndef _load_triage_cache(cache_path):\n # Returns an ordered dict: dedup_key -> triage_info dict.\n if not os.path.exists(cache_path) or os.path.getsize(cache_path) == 0:\n return {}\n try:\n with open(cache_path, 'rb') as f:\n c = pickle.load(f)\n if not isinstance(c, dict):\n return {}\n print(f'[triage_cache] loaded {len(c)} entries from {cache_path}')\n return c\n except Exception as e:\n print(f'[triage_cache] failed to load ({e}), starting fresh')\n return {}\n\ndef _save_triage_cache(cache_path, cache):\n tmp = cache_path + '.tmp'\n try:\n with open(tmp, 'wb') as f:\n pickle.dump(cache, f)\n f.flush()\n os.fsync(f.fileno())\n os.replace(tmp, cache_path)\n print(f'[triage_cache] saved {len(cache)} entries -> {cache_path}')\n except Exception as e:\n print(f'[triage_cache] failed to save: {e}')\n if os.path.exists(tmp):\n try: os.remove(tmp)\n except Exception: pass\n\ndef _evict_cache(cache, max_keys):\n excess = len(cache) - max_keys\n if excess > 0:\n for k in list(cache.keys())[:excess]:\n del cache[k]\n return excess\n return 0\n\n# ── HTTP helper ─────────────────────────────────────────────────\n\ndef _post(url, payload, timeout):\n req = urllib.request.Request(\n url,\n data=json.dumps(payload).encode(),\n headers={'Content-Type': 'application/json'},\n method='POST',\n )\n t0 = time.time()\n try:\n with urllib.request.urlopen(req, timeout=timeout) as resp:\n return json.loads(resp.read()), round((time.time() - t0) * 1000), None\n except urllib.error.HTTPError as e:\n body = e.read().decode(errors='replace')[:300]\n return None, round((time.time() - t0) * 1000), f'HTTP {e.code}: {body}'\n except 
Exception as e:\n return None, round((time.time() - t0) * 1000), str(e)\n\n# ── Main ────────────────────────────────────────────────────────\n\nfrom pathlib import Path # noqa: E402 (used inside node code)\n\nraw_alerts = inputs.get('raw_alerts', [])\nsource_log_type = inputs.get('source_log_type', 'tdp')\nfilter_enabled = inputs.get('filter_enabled', True)\ndedup_enabled = inputs.get('dedup_enabled', True)\nthreshold = inputs.get('threshold', 0.7)\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_field_len = inputs.get('max_field_len', 500)\nmax_dedup_keys = int(inputs.get('max_dedup_keys', 100000))\ndedup_url = inputs.get('dedup_service_url', 'http://127.0.0.1:19000')\ntriage_url = inputs.get('triage_service_url', 'http://127.0.0.1:19001')\ntriage_timeout = int(inputs.get('triage_timeout_s', 300))\ndedup_timeout = int(inputs.get('dedup_timeout_s', 60))\n\ndedup_base_inputs = {\n 'source_log_type': source_log_type,\n 'filter_enabled': filter_enabled,\n 'dedup_enabled': dedup_enabled,\n 'threshold': threshold,\n 'strict_fields': strict_fields,\n 'lsh_fields': lsh_fields,\n 'max_field_len': max_field_len,\n 'max_dedup_keys': max_dedup_keys,\n}\n\n# Load triage cache (locked to avoid concurrent corruption).\ncache_path, lock_path = _triage_cache_path()\nlock_fh = _acquire_lock(lock_path)\ntry:\n triage_cache = _load_triage_cache(cache_path)\nfinally:\n _release_lock(lock_fh)\n\nresults = []\ntriage_results = []\ncache_dirty = False\n\nstats = {\n 'total_input': len(raw_alerts),\n 'dedup_failed': 0,\n 'filtered_out': 0,\n 'duplicate_skipped': 0,\n 'duplicate_with_triage': 0, # duplicate but returned cached result\n 'triage_invoked': 0,\n 'triage_success': 0,\n 'triage_failed': 0,\n 'verdict_counts': {},\n}\n\nfor i, alert in enumerate(raw_alerts):\n entry = {\n 'alert_index': i,\n 'alert_id': alert.get('id') or alert.get('uuid') or alert.get('behave_uuid'),\n 
'threat_name': (alert.get('threat') or {}).get('name', ''),\n }\n\n # ── Step 1: dedup ───────────────────────────────────────────\n dr, dms, derr = _post(\n f'{dedup_url}/invoke',\n {'inputs': {**dedup_base_inputs, 'alerts': [alert]}},\n dedup_timeout,\n )\n if derr or not dr or dr.get('status') != 'SUCCEEDED':\n stats['dedup_failed'] += 1\n entry.update({'stage': 'dedup_failed', 'dedup_error': derr or dr, 'dedup_ms': dms})\n results.append(entry)\n print(f' [{i+1}/{len(raw_alerts)}] ✗ dedup FAILED ({dms}ms) {str(derr or \"\")[:80]}')\n continue\n\n dout = dr.get('outputs', {})\n unique_alerts = dout.get('unique_alerts', [])\n dstats = dout.get('stats', {})\n entry['dedup_ms'] = dms\n entry['filter_removed'] = dstats.get('filter_removed_count', 0)\n entry['lsh_clusters'] = dstats.get('lsh_total_clusters')\n entry['lsh_dedup_keys'] = dstats.get('lsh_total_dedup_keys')\n\n if not unique_alerts:\n stats['filtered_out'] += 1\n entry.update({'stage': 'filtered_out', 'triage': None})\n results.append(entry)\n print(f' [{i+1}/{len(raw_alerts)}] - filtered_out ({dms}ms)')\n continue\n\n already = bool(unique_alerts[0].get('dedup_key_already_exists'))\n dedup_key = unique_alerts[0].get('dedup_key', '')\n entry['dedup_key'] = dedup_key\n\n if already:\n # Look up the previous triage result for this dedup_key.\n cached_triage = triage_cache.get(dedup_key)\n if cached_triage:\n stats['duplicate_with_triage'] += 1\n verdict = cached_triage.get('attack_verdict', 'unknown')\n stats['verdict_counts'][verdict] = stats['verdict_counts'].get(verdict, 0) + 1\n entry.update({'stage': 'duplicate_with_triage', 'triage': cached_triage})\n results.append(entry)\n triage_results.append({**entry, **cached_triage})\n print(f' [{i+1}/{len(raw_alerts)}] ↩ duplicate+cached ({dms}ms) key={dedup_key[:8]} verdict={verdict}')\n else:\n stats['duplicate_skipped'] += 1\n entry.update({'stage': 'duplicate_skipped', 'triage': None})\n results.append(entry)\n print(f' [{i+1}/{len(raw_alerts)}] - 
duplicate (no cache) ({dms}ms) key={dedup_key[:8]}')\n continue\n\n # ── Step 2: triage (unique, first occurrence) ───────────────\n stats['triage_invoked'] += 1\n tr, tms, terr = _post(\n f'{triage_url}/invoke',\n {'inputs': {'alert_data': alert}},\n triage_timeout,\n )\n if terr or not tr or tr.get('status') != 'SUCCEEDED':\n stats['triage_failed'] += 1\n entry.update({'stage': 'triage_failed', 'triage_ms': tms,\n 'triage_error': terr or tr})\n results.append(entry)\n print(f' [{i+1}/{len(raw_alerts)}] ✗ dedup OK + triage FAILED ({dms}+{tms}ms) {str(terr or \"\")[:80]}')\n continue\n\n stats['triage_success'] += 1\n tout = tr.get('outputs', {})\n verdict = tout.get('attack_verdict', 'unknown')\n stats['verdict_counts'][verdict] = stats['verdict_counts'].get(verdict, 0) + 1\n\n triage_info = {\n 'attack_verdict': verdict,\n 'risk_level': tout.get('risk_level'),\n 'report_title': tout.get('report_title'),\n 'report_path': tout.get('report_path'),\n 'final_report': tout.get('final_report', ''),\n }\n # Store in cache (refresh insertion order so recently-hit keys survive eviction longer).\n if dedup_key in triage_cache:\n del triage_cache[dedup_key]\n triage_cache[dedup_key] = triage_info\n cache_dirty = True\n\n entry.update({'stage': 'triage_done', 'triage_ms': tms, 'triage': triage_info})\n results.append(entry)\n triage_results.append({**entry, **triage_info})\n print(f' [{i+1}/{len(raw_alerts)}] ✓ dedup+triage OK ({dms}+{tms}ms) verdict={verdict} title={(tout.get(\"report_title\") or \"\")[:40]}')\n\n# Persist updated triage cache (only if changed).\nif cache_dirty:\n lock_fh = _acquire_lock(lock_path)\n try:\n evicted = _evict_cache(triage_cache, max_dedup_keys)\n if evicted:\n print(f'[triage_cache] LRU eviction: dropped {evicted} entries (max={max_dedup_keys})')\n _save_triage_cache(cache_path, triage_cache)\n finally:\n _release_lock(lock_fh)\n\nstats['triage_cache_size'] = len(triage_cache)\nprint(f'[pipeline] stats={json.dumps(stats, 
ensure_ascii=False)}')\n\noutputs['results'] = results\noutputs['triage_results'] = triage_results\noutputs['stats'] = stats\n" + }, + { + "id": "generate_summary", + "type": "python", + "description": "汇总节点:聚合所有研判结果,生成 pipeline_summary.md,并输出最高风险告警的研判标签(与 tdp_alert_triage 输出字段兼容)", + "code": "\nimport json\nimport datetime\n\nresults = inputs.get('results', [])\ntriage_results = inputs.get('triage_results', [])\nstats = inputs.get('stats', {})\n\nVERDICT_ORDER = {'attack_success': 5, 'attack': 4, 'attack_failed': 3, 'unknown': 2, 'benign': 1}\n\n# Pick the highest-risk triage result (includes duplicates with cached results).\ntop = None\nfor r in triage_results:\n if top is None:\n top = r\n elif VERDICT_ORDER.get(r.get('attack_verdict', ''), 0) > VERDICT_ORDER.get(top.get('attack_verdict', ''), 0):\n top = r\n\nfinal_report = top.get('final_report', '') if top else ''\nreport_title = top.get('report_title', '') if top else ''\nreport_path = top.get('report_path', '') if top else ''\nattack_verdict = top.get('attack_verdict', '') if top else ''\nrisk_level = top.get('risk_level', '') if top else ''\n\ntoday = datetime.date.today().isoformat()\nverdict_cn = {\n 'attack_success': '攻击成功', 'attack_failed': '攻击失败',\n 'attack': '攻击', 'unknown': '未知', 'benign': '安全',\n}\nstage_cn = {\n 'triage_done': '研判完成',\n 'duplicate_with_triage': '重复(缓存)',\n 'duplicate_skipped': '重复(跳过)',\n 'filtered_out': '已过滤',\n 'dedup_failed': '去重失败',\n 'triage_failed': '研判失败',\n}\n\nrows = []\nfor r in results:\n stage = r.get('stage', '')\n triage = r.get('triage')\n verdict = triage.get('attack_verdict', '-') if isinstance(triage, dict) else '-'\n title = triage.get('report_title', '-') if isinstance(triage, dict) else '-'\n cache_mark = ' ↩' if stage == 'duplicate_with_triage' else ''\n rows.append(\n f\"| {r.get('alert_index', 0) + 1} \"\n f\"| {(r.get('threat_name') or '')[:30]} \"\n f\"| {stage_cn.get(stage, stage)}{cache_mark} \"\n f\"| {verdict_cn.get(verdict, verdict)} \"\n f\"| 
{(title or '')[:30]} |\"\n )\n\nsummary_md = (\n f'# 告警去重研判汇总报告\\n\\n'\n f'**处理日期**: {today}\\n\\n'\n f'## 统计\\n'\n f'- 总输入: {stats.get(\"total_input\", 0)}\\n'\n f'- 新增研判: {stats.get(\"triage_success\", 0)}\\n'\n f'- 重复(含缓存研判): {stats.get(\"duplicate_with_triage\", 0)}\\n'\n f'- 重复(无缓存): {stats.get(\"duplicate_skipped\", 0)}\\n'\n f'- 过滤掉: {stats.get(\"filtered_out\", 0)}\\n'\n f'- 研判失败: {stats.get(\"triage_failed\", 0)}\\n'\n f'- 研判缓存条数: {stats.get(\"triage_cache_size\", 0)}\\n'\n f'- 攻击判定分布: {json.dumps(stats.get(\"verdict_counts\", {}), ensure_ascii=False)}\\n\\n'\n f'## 明细\\n\\n'\n f'| # | 告警类型 | 阶段 | 攻击判定 | 研判标题 |\\n'\n f'|---|---------|------|---------|--------|\\n'\n + '\\n'.join(rows) + '\\n\\n'\n + (f'## 最高风险告警研判报告\\n\\n{final_report}\\n' if final_report else '')\n)\n\ntry:\n out_path = get_path('pipeline_summary.md')\n import os\n os.makedirs(os.path.dirname(str(out_path)), exist_ok=True)\n with open(str(out_path), 'w', encoding='utf-8') as _f:\n _f.write(summary_md)\n summary_path = str(out_path)\n print(f'[summary] written to {summary_path}')\nexcept Exception as _e:\n print(f'[summary] WARNING: could not write summary file: {_e}')\n summary_path = ''\n\nprint(f'[summary] triage_success={stats.get(\"triage_success\", 0)}, '\n f'duplicate_with_triage={stats.get(\"duplicate_with_triage\", 0)}, '\n f'top_verdict={attack_verdict}')\n\noutputs['final_reports'] = [r.get('triage', {}).get('final_report', '') for r in triage_results if isinstance(r.get('triage'), dict)]\noutputs['triage_results'] = triage_results\noutputs['stats'] = stats\noutputs['summary_report'] = summary_md\noutputs['report_path'] = summary_path\noutputs['final_report'] = final_report\noutputs['report_title'] = report_title\noutputs['attack_verdict'] = attack_verdict\noutputs['risk_level'] = risk_level\n" + } + ], + "edges": [ + { + "from": "receive_alerts", + "to": "dedup_and_triage", + "order": 0 + }, + { + "from": "dedup_and_triage", + "to": "generate_summary", + "order": 0 + } + ], + 
"metadata": { + "node_timeout_s": 7200, + "sampleInputs": { + "source_log_type": "tdp", + "filter_enabled": true, + "dedup_enabled": true, + "threshold": 0.7, + "dedup_service_url": "http://127.0.0.1:19000", + "triage_service_url": "http://127.0.0.1:19001", + "triage_timeout_s": 300, + "_comment_alerts": "Pass 'alerts' (list) or 'alert_file' (path to JSON file)", + "max_dedup_keys": 100000 + } + } +} \ No newline at end of file diff --git a/.flocks/plugins/workflows/alert_dedup_triage/workflow.md b/.flocks/plugins/workflows/alert_dedup_triage/workflow.md new file mode 100644 index 000000000..3bfe19533 --- /dev/null +++ b/.flocks/plugins/workflows/alert_dedup_triage/workflow.md @@ -0,0 +1,89 @@ +# 告警去重研判串联工作流 + +## 业务场景 + +将 `http_alert_dedup`(MinHash LSH 去重)与 `tdp_alert_triage`(LLM 研判)串联为单一工作流: + +1. 批量接收 TDP/HTTP 原始告警 +2. 逐条调用 http_alert_dedup 服务去重:跨批次已见告警直接跳过,节省研判算力 +3. 对首次出现的唯一告警调用 tdp_alert_triage 服务进行 LLM 研判(测绘/CVE/payload 并行) +4. 聚合所有结果输出汇总报告及最高风险告警的研判详情 + +## 流程结构 + +``` +receive_alerts (解析输入,与 http_alert_dedup 接口相同) + ↓ +dedup_and_triage (逐条去重 → 唯一告警 → 研判) + ↓ +generate_summary (聚合输出,写 pipeline_summary.md) +``` + +### 内部循环逻辑(dedup_and_triage) + +``` +for each raw_alert: + POST /invoke → http_alert_dedup (port 19000) + ├─ filtered_out → 跳过(非 HTTP / 扫描告警) + ├─ duplicate_skipped → 跳过(跨批次已见) + └─ unique → POST /invoke → tdp_alert_triage (port 19001) + ↓ + collect triage result +``` + +## 节点详情 + +### 1. `receive_alerts` +与 `http_alert_dedup` 接口完全一致: +- 支持 `alerts`(list)或 `alert_file`(文件路径) +- 提取去重配置:`source_log_type`、`filter_enabled`、`dedup_enabled`、`threshold` +- 支持通过 `dedup_service_url` / `triage_service_url` 输入字段覆盖服务地址 + +### 2. `dedup_and_triage` +逐条处理每条原始告警: +- 单条 POST 到 `http_alert_dedup` 服务(保持跨批次 LSH 状态持久化) +- 根据返回的 `dedup_key_already_exists` 判断是否为首次出现 +- 仅对首次出现的告警用**原始 raw alert**(而非归一化字段)调用 `tdp_alert_triage` +- 对所有告警记录处理阶段:`filtered_out` / `duplicate_skipped` / `triage_done` / `dedup_failed` / `triage_failed` + +### 3. 
`generate_summary` +- 聚合所有研判结果,按 `attack_verdict` 风险级别排序 +- 生成 Markdown 汇总表(明细 + 统计)+ 最高风险告警的完整报告 +- 落盘到 `~/.flocks/workspace/outputs//artifacts/pipeline_summary.md` +- 主要输出字段与 `tdp_alert_triage` 兼容(`attack_verdict`、`risk_level`、`report_title`、`final_report`),方便单告警场景直接对接下游 + +## 输入参数 + +| 字段 | 类型 | 默认 | 说明 | +|------|------|------|------| +| `alerts` | list | — | 原始告警列表(与 alert_file 二选一) | +| `alert_file` | string | — | JSON 文件路径(替代 alerts 列表) | +| `source_log_type` | string | `"tdp"` | 日志类型(`tdp` / `skyeye`) | +| `filter_enabled` | bool | `true` | 是否启用告警过滤 | +| `dedup_enabled` | bool | `true` | 是否启用去重(含持久化) | +| `threshold` | float | `0.7` | LSH Jaccard 相似度阈值 | +| `dedup_service_url` | string | `http://127.0.0.1:19000` | http_alert_dedup 服务地址 | +| `triage_service_url` | string | `http://127.0.0.1:19001` | tdp_alert_triage 服务地址 | +| `triage_timeout_s` | int | `300` | 单条研判超时秒数 | +| `dedup_timeout_s` | int | `60` | 单条去重超时秒数 | + +## 输出参数 + +| 字段 | 类型 | 说明 | +|------|------|------| +| `final_report` | string | 最高风险告警的完整 Markdown 报告 | +| `report_title` | string | 最高风险告警的标题 | +| `attack_verdict` | enum | 最高风险告警的判定标签 | +| `risk_level` | enum | 最高风险告警的风险等级 | +| `final_reports` | list | 所有带研判结果告警的报告列表(新研判成功 + 命中研判缓存的重复告警) | +| `triage_results` | list | 所有带研判结果告警的详情(新研判成功 + 命中研判缓存的重复告警,后者 stage=duplicate_with_triage) | +| `summary_report` | string | 汇总 Markdown(统计 + 明细表) | +| `report_path` | string | `pipeline_summary.md` 落盘路径 | +| `stats` | dict | 处理统计(total/filtered/dedup/triage 各计数) | + +## 工程要点 + +- **跨批次去重**:`dedup_and_triage` 每次单条调用 dedup 服务,LSH 状态持久化在 `~/.flocks/workspace/workflows/http_alert_dedup/` 下,跨工作流调用生效 +- **原始告警传给研判**:triage 接收的是原始 raw alert(保留嵌套 `net.http.*` / `threat.*` 字段),而不是归一化后的字段,确保研判提示词能解析完整信息 +- **节点超时**:`node_timeout_s = 7200`,留出足够余量处理大批量告警(每条研判约 50s × N 条) +- **输出兼容性**:`generate_summary` 的主要输出字段与 `tdp_alert_triage` 相同,单告警场景下可无缝替换 diff --git a/.flocks/plugins/workflows/http_alert_dedup/workflow.json b/.flocks/plugins/workflows/http_alert_dedup/workflow.json index 122f1d776..b1f74bdf2 100644 ---
a/.flocks/plugins/workflows/http_alert_dedup/workflow.json +++ b/.flocks/plugins/workflows/http_alert_dedup/workflow.json @@ -1,14 +1,14 @@ { "name": "http_alert_dedup", "description": "Network alert deduplication pipeline: normalize (TDP/Skyeye field mapping) → filter (remove scans / non-HTTP) → dedup (URI normalization + 5-gram Jaccard similarity). Returns a dict with deduped_alerts, unique_alerts and stats.", - "description_cn": "网络告警去重 Pipeline:归一化(TDP/Skyeye 字段映射,含日志类型分支)→ 过滤(剔除扫描/非 HTTP 告警)→ 去重(URI 归一化 + 5-gram Jaccard 相似度聚类,MD5 dedup_key)。输入 dict,输出 dict(deduped_alerts / unique_alerts / stats)。", + "description_cn": "网络告警去重 Pipeline:归一化(TDP/Skyeye 字段映射,含日志类型分支)→ 过滤(剔除扫描/非 HTTP 告警)→ 去重(URI 归一化 + 5-gram MinHash LSH + dedup_key 持久化,FIFO LRU 上限默认 10W、可通过 max_dedup_keys 调整)。输入 dict,输出 dict(deduped_alerts / unique_alerts / stats)。", "start": "receive_alerts", "nodes": [ { "id": "receive_alerts", "type": "python", "description": "接收原始告警列表,解析输入格式,提取 Pipeline 配置,输出 source_log_type 供后续分支节点路由", - "code": "import json\nimport os\n\nalerts_input = inputs.get('alerts', inputs.get('alert_list', []))\n\n# Support alert_file: load JSON from a local file path when alerts list is not given directly.\n# This lets callers pass a file path (e.g. 
'/Users/foo/Downloads/tdp_logs.json') instead of\n# inlining potentially thousands of alerts into the workflow inputs.\nif not alerts_input:\n alert_file = inputs.get('alert_file', '')\n if alert_file:\n alert_file = os.path.expanduser(str(alert_file))\n try:\n with open(alert_file, 'r', encoding='utf-8') as _f:\n alerts_input = json.load(_f)\n print(f'[receive] loaded {len(alerts_input) if isinstance(alerts_input, list) else 1} alerts from file: {alert_file}')\n except Exception as _e:\n print(f'[receive] WARNING: failed to load alert_file={alert_file!r}: {_e}')\n alerts_input = []\n\nif isinstance(alerts_input, str):\n try:\n alerts_input = json.loads(alerts_input)\n except Exception:\n alerts_input = []\nif isinstance(alerts_input, dict) and 'data' in alerts_input:\n alerts_input = alerts_input.get('data', [])\nif not isinstance(alerts_input, list):\n alerts_input = [alerts_input] if alerts_input else []\n\nsource_log_type = str(inputs.get('source_log_type', 'tdp')).lower()\nif source_log_type not in ('tdp', 'skyeye'):\n source_log_type = 'tdp'\n\nfilter_enabled = bool(inputs.get('filter_enabled', True))\ndedup_enabled = bool(inputs.get('dedup_enabled', True))\ndedup_threshold = float(inputs.get('threshold', inputs.get('dedup_threshold', 0.7)))\nstrict_fields = inputs.get('strict_fields', inputs.get('dedup_fields_strict', ['sip', 'dip']))\nlsh_fields = inputs.get('lsh_fields', inputs.get('dedup_fields_lsh', ['req_http_url', 'req_body', 'rsp_body']))\nmax_field_len = int(inputs.get('max_field_len', 500))\n\nif not isinstance(strict_fields, list) or not strict_fields:\n strict_fields = ['sip', 'dip']\nif not isinstance(lsh_fields, list) or not lsh_fields:\n lsh_fields = ['req_http_url', 'req_body', 'rsp_body']\n\nprint(f'[receive] source_log_type={source_log_type}, total={len(alerts_input)}')\n\noutputs['raw_alerts'] = alerts_input\noutputs['stats'] = {'raw_count': len(alerts_input)}\noutputs['source_log_type'] = source_log_type\noutputs['filter_enabled'] = 
filter_enabled\noutputs['dedup_enabled'] = dedup_enabled\noutputs['dedup_threshold'] = dedup_threshold\noutputs['strict_fields'] = strict_fields\noutputs['lsh_fields'] = lsh_fields\noutputs['max_field_len'] = max_field_len" + "code": "\nimport json\nimport os\n\nalerts_input = inputs.get('alerts', inputs.get('alert_list', []))\n\n# Support alert_file: load JSON from a local file path when alerts list is not given directly.\nif not alerts_input:\n alert_file = inputs.get('alert_file', '')\n if alert_file:\n alert_file = os.path.expanduser(str(alert_file))\n try:\n with open(alert_file, 'r', encoding='utf-8') as _f:\n alerts_input = json.load(_f)\n print(f'[receive] loaded {len(alerts_input) if isinstance(alerts_input, list) else 1} alerts from file: {alert_file}')\n except Exception as _e:\n print(f'[receive] WARNING: failed to load alert_file={alert_file!r}: {_e}')\n alerts_input = []\n\nif isinstance(alerts_input, str):\n try:\n alerts_input = json.loads(alerts_input)\n except Exception:\n alerts_input = []\nif isinstance(alerts_input, dict) and 'data' in alerts_input:\n alerts_input = alerts_input.get('data', [])\nif not isinstance(alerts_input, list):\n alerts_input = [alerts_input] if alerts_input else []\n\nsource_log_type = str(inputs.get('source_log_type', 'tdp')).lower()\nif source_log_type not in ('tdp', 'skyeye'):\n source_log_type = 'tdp'\n\nfilter_enabled = bool(inputs.get('filter_enabled', True))\ndedup_enabled = bool(inputs.get('dedup_enabled', True))\ndedup_threshold = float(inputs.get('threshold', inputs.get('dedup_threshold', 0.7)))\nstrict_fields = inputs.get('strict_fields', inputs.get('dedup_fields_strict', ['sip', 'dip']))\nlsh_fields = inputs.get('lsh_fields', inputs.get('dedup_fields_lsh', ['req_http_url', 'req_body', 'rsp_body']))\nmax_field_len = int(inputs.get('max_field_len', 500))\n# Maximum dedup_keys (and LSH clusters) to keep in persisted state.\n# When the cache grows beyond this limit, oldest entries are evicted in FIFO\n# order on 
the next dedup run. Default 100,000 — tunable per request.\nmax_dedup_keys = int(inputs.get('max_dedup_keys', 100000))\nif max_dedup_keys < 1:\n max_dedup_keys = 100000\n\nif not isinstance(strict_fields, list) or not strict_fields:\n strict_fields = ['sip', 'dip']\nif not isinstance(lsh_fields, list) or not lsh_fields:\n lsh_fields = ['req_http_url', 'req_body', 'rsp_body']\n\nprint(f'[receive] source_log_type={source_log_type}, total={len(alerts_input)}, max_dedup_keys={max_dedup_keys}')\n\noutputs['raw_alerts'] = alerts_input\noutputs['stats'] = {'raw_count': len(alerts_input)}\noutputs['source_log_type'] = source_log_type\noutputs['filter_enabled'] = filter_enabled\noutputs['dedup_enabled'] = dedup_enabled\noutputs['dedup_threshold'] = dedup_threshold\noutputs['strict_fields'] = strict_fields\noutputs['lsh_fields'] = lsh_fields\noutputs['max_field_len'] = max_field_len\noutputs['max_dedup_keys'] = max_dedup_keys\n" }, { "id": "branch_log_type", @@ -20,34 +20,60 @@ "id": "normalize_tdp", "type": "python", "description": "TDP 字段归一化:将 TDP 原始嵌套字段(net_real_src_ip/net_http_url/threat_name 等)映射为标准字段(sip/dip/req_http_url/threat_name 等)", - "code": "import uuid\n\nHTTP_METHODS = ['GET', 'POST', 'PUT', 'DELETE', 'HEAD', 'OPTIONS', 'PATCH', 'TRACE']\n\nTDP_FIELD_MAP = {\n 'customer_uuid': 'customer_uuid',\n 'device_id': 'device_id',\n 'id': 'id',\n 'time': 'time',\n 'direction': 'direction',\n 'sip': 'net_real_src_ip',\n 'dip': 'net_dest_ip',\n 'sport': 'net_src_port',\n 'dport': 'net_dest_port',\n 'net_type': 'net_type',\n 'net_app_proto': 'net_app_proto',\n 'req_http_url': 'net_http_url',\n 'req_user_agent': 'net_http_reqs_user_agent',\n 'req_host': 'net_http_reqs_host',\n 'req_line': 'net_http_reqs_line',\n 'req_header': 'net_http_reqs_header',\n 'req_body': 'net_http_reqs_body',\n 'req_cookie': 'net_http_reqs_cookie',\n 'req_body_len': 'net_http_reqs_content_length',\n 'rsp_status_code': 'net_http_status',\n 'rsp_line': 'net_http_resp_line',\n 'rsp_header': 
'net_http_resp_header',\n 'rsp_body': 'net_http_resp_body',\n 'rsp_body_len': 'net_http_resp_content_length',\n 'net_bytes_toclient': 'net_bytes_toclient',\n 'net_bytes_toserver': 'net_bytes_toserver',\n 'threat_rule_id': 'threat_suuid',\n 'threat_name': 'threat_name',\n 'threat_msg': 'threat_msg',\n 'threat_ioc': 'threat_ioc',\n 'threat_level': 'threat_level',\n 'threat_severity': 'threat_severity',\n 'threat_phase': 'threat_phase',\n 'threat_type': 'threat_type',\n 'threat_result': 'threat_result',\n 'threat_confidence': 'threat_confidence',\n 'connection_established': 'established',\n 'asset_group_name': 'dest_assets_group_name',\n 'asset_name': 'dest_assets_latestName',\n}\n\ndef flatten_dict(d, prefix=''):\n res = {}\n for k, v in d.items():\n if isinstance(v, dict):\n res.update(flatten_dict(v, f'{prefix}{k}_'))\n else:\n res[f'{prefix}{k}'] = v\n return res\n\ndef make_uuid(norm):\n return str(uuid.uuid3(uuid.NAMESPACE_DNS, ''.join(str(v) for v in norm.values())))\n\ndef normalize_single(alert):\n flat = flatten_dict(alert)\n norm = {}\n for std_key, raw_key in TDP_FIELD_MAP.items():\n norm[std_key] = flat.get(raw_key, 'none')\n if norm.get('id') in ('none', None, ''):\n norm['id'] = make_uuid(norm)\n if norm.get('net_type') in ('none', None, ''):\n method = flat.get('method', 'none')\n norm['net_type'] = 'http' if method in HTTP_METHODS else ('none' if method == 'none' else 'other')\n return norm\n\nraw_alerts = inputs.get('raw_alerts', [])\nstats = dict(inputs.get('stats', {}))\nnormalized = [normalize_single(a) for a in raw_alerts]\nstats['normalized_count'] = len(normalized)\nprint(f'[normalize_tdp] {len(raw_alerts)} -> {len(normalized)}')\n\noutputs['normalized_alerts'] = normalized\noutputs['stats'] = stats\nfor k in ['source_log_type', 'filter_enabled', 'dedup_enabled',\n 'dedup_threshold', 'strict_fields', 'lsh_fields', 'max_field_len']:\n outputs[k] = inputs.get(k)" + "code": "import uuid\n\nHTTP_METHODS = ['GET', 'POST', 'PUT', 'DELETE', 'HEAD', 
'OPTIONS', 'PATCH', 'TRACE']\n\nTDP_FIELD_MAP = {\n 'customer_uuid': 'customer_uuid',\n 'device_id': 'device_id',\n 'id': 'id',\n 'time': 'time',\n 'direction': 'direction',\n 'sip': 'net_real_src_ip',\n 'dip': 'net_dest_ip',\n 'sport': 'net_src_port',\n 'dport': 'net_dest_port',\n 'net_type': 'net_type',\n 'net_app_proto': 'net_app_proto',\n 'req_http_url': 'net_http_url',\n 'req_user_agent': 'net_http_reqs_user_agent',\n 'req_host': 'net_http_reqs_host',\n 'req_line': 'net_http_reqs_line',\n 'req_header': 'net_http_reqs_header',\n 'req_body': 'net_http_reqs_body',\n 'req_cookie': 'net_http_reqs_cookie',\n 'req_body_len': 'net_http_reqs_content_length',\n 'rsp_status_code': 'net_http_status',\n 'rsp_line': 'net_http_resp_line',\n 'rsp_header': 'net_http_resp_header',\n 'rsp_body': 'net_http_resp_body',\n 'rsp_body_len': 'net_http_resp_content_length',\n 'net_bytes_toclient': 'net_bytes_toclient',\n 'net_bytes_toserver': 'net_bytes_toserver',\n 'threat_rule_id': 'threat_suuid',\n 'threat_name': 'threat_name',\n 'threat_msg': 'threat_msg',\n 'threat_ioc': 'threat_ioc',\n 'threat_level': 'threat_level',\n 'threat_severity': 'threat_severity',\n 'threat_phase': 'threat_phase',\n 'threat_type': 'threat_type',\n 'threat_result': 'threat_result',\n 'threat_confidence': 'threat_confidence',\n 'connection_established': 'established',\n 'asset_group_name': 'dest_assets_group_name',\n 'asset_name': 'dest_assets_latestName',\n}\n\ndef flatten_dict(d, prefix=''):\n res = {}\n for k, v in d.items():\n if isinstance(v, dict):\n res.update(flatten_dict(v, f'{prefix}{k}_'))\n else:\n res[f'{prefix}{k}'] = v\n return res\n\ndef make_uuid(norm):\n return str(uuid.uuid3(uuid.NAMESPACE_DNS, ''.join(str(v) for v in norm.values())))\n\ndef normalize_single(alert):\n flat = flatten_dict(alert)\n norm = {}\n for std_key, raw_key in TDP_FIELD_MAP.items():\n norm[std_key] = flat.get(raw_key, 'none')\n if norm.get('id') in ('none', None, ''):\n norm['id'] = make_uuid(norm)\n if 
norm.get('net_type') in ('none', None, ''):\n method = flat.get('method', 'none')\n norm['net_type'] = 'http' if method in HTTP_METHODS else ('none' if method == 'none' else 'other')\n return norm\n\nraw_alerts = inputs.get('raw_alerts', [])\nstats = dict(inputs.get('stats', {}))\nnormalized = [normalize_single(a) for a in raw_alerts]\nstats['normalized_count'] = len(normalized)\nprint(f'[normalize_tdp] {len(raw_alerts)} -> {len(normalized)}')\n\noutputs['normalized_alerts'] = normalized\noutputs['stats'] = stats\nfor k in ['source_log_type', 'filter_enabled', 'dedup_enabled',\n 'dedup_threshold', 'strict_fields', 'lsh_fields', 'max_field_len',\n 'max_dedup_keys']:\n outputs[k] = inputs.get(k)" }, { "id": "normalize_skyeye", "type": "python", "description": "Skyeye 字段归一化:将 Skyeye 原始字段(uri/agent/host/vuln_name/attack_result 等)映射为标准字段", - "code": "import uuid\n\nHTTP_METHODS = ['GET', 'POST', 'PUT', 'DELETE', 'HEAD', 'OPTIONS', 'PATCH', 'TRACE']\n\nSKYEYE_FIELD_MAP = {\n 'id': 'none',\n 'time': 'time',\n 'direction': 'none',\n 'sip': 'sip',\n 'dip': 'dip',\n 'sport': 'sport',\n 'dport': 'dport',\n 'net_type': 'none',\n 'net_app_proto': 'none',\n 'req_http_url': 'uri',\n 'req_user_agent': 'agent',\n 'req_host': 'host',\n 'req_line': 'none',\n 'req_header': 'req_header',\n 'req_body': 'req_body',\n 'req_cookie': 'none',\n 'req_body_len': 'none',\n 'rsp_status_code': 'rsp_status',\n 'rsp_line': 'none',\n 'rsp_header': 'rsp_header',\n 'rsp_body': 'rsp_body',\n 'rsp_body_len': 'rsp_body_len',\n 'threat_rule_id': 'rule_id',\n 'threat_name': 'vuln_name',\n 'threat_msg': 'vuln_desc',\n 'threat_ioc': 'none',\n 'threat_level': 'none',\n 'threat_severity': 'severity',\n 'threat_phase': 'none',\n 'threat_type': 'vuln_type',\n 'threat_tactic_id': 'attck_tactic',\n 'threat_technique_id': 'attck_tech',\n 'threat_result': 'attack_result',\n 'threat_confidence': 'confidence',\n 'connection_established': 'established',\n 'real_attack': 'attack_flag',\n}\n\ndef flatten_dict(d, 
prefix=''):\n res = {}\n for k, v in d.items():\n if isinstance(v, dict):\n res.update(flatten_dict(v, f'{prefix}{k}_'))\n else:\n res[f'{prefix}{k}'] = v\n return res\n\ndef make_uuid(norm):\n return str(uuid.uuid3(uuid.NAMESPACE_DNS, ''.join(str(v) for v in norm.values())))\n\ndef normalize_single(alert):\n flat = flatten_dict(alert)\n norm = {}\n for std_key, raw_key in SKYEYE_FIELD_MAP.items():\n norm[std_key] = flat.get(raw_key, 'none') if raw_key != 'none' else 'none'\n if norm.get('id') in ('none', None, ''):\n norm['id'] = make_uuid(norm)\n if norm.get('net_type') in ('none', None, ''):\n method = flat.get('method', 'none')\n norm['net_type'] = 'http' if method in HTTP_METHODS else ('none' if method == 'none' else 'other')\n return norm\n\nraw_alerts = inputs.get('raw_alerts', [])\nstats = dict(inputs.get('stats', {}))\nnormalized = [normalize_single(a) for a in raw_alerts]\nstats['normalized_count'] = len(normalized)\nprint(f'[normalize_skyeye] {len(raw_alerts)} -> {len(normalized)}')\n\noutputs['normalized_alerts'] = normalized\noutputs['stats'] = stats\nfor k in ['source_log_type', 'filter_enabled', 'dedup_enabled',\n 'dedup_threshold', 'strict_fields', 'lsh_fields', 'max_field_len']:\n outputs[k] = inputs.get(k)" + "code": "import uuid\n\nHTTP_METHODS = ['GET', 'POST', 'PUT', 'DELETE', 'HEAD', 'OPTIONS', 'PATCH', 'TRACE']\n\nSKYEYE_FIELD_MAP = {\n 'id': 'none',\n 'time': 'time',\n 'direction': 'none',\n 'sip': 'sip',\n 'dip': 'dip',\n 'sport': 'sport',\n 'dport': 'dport',\n 'net_type': 'none',\n 'net_app_proto': 'none',\n 'req_http_url': 'uri',\n 'req_user_agent': 'agent',\n 'req_host': 'host',\n 'req_line': 'none',\n 'req_header': 'req_header',\n 'req_body': 'req_body',\n 'req_cookie': 'none',\n 'req_body_len': 'none',\n 'rsp_status_code': 'rsp_status',\n 'rsp_line': 'none',\n 'rsp_header': 'rsp_header',\n 'rsp_body': 'rsp_body',\n 'rsp_body_len': 'rsp_body_len',\n 'threat_rule_id': 'rule_id',\n 'threat_name': 'vuln_name',\n 'threat_msg': 
'vuln_desc',\n 'threat_ioc': 'none',\n 'threat_level': 'none',\n 'threat_severity': 'severity',\n 'threat_phase': 'none',\n 'threat_type': 'vuln_type',\n 'threat_tactic_id': 'attck_tactic',\n 'threat_technique_id': 'attck_tech',\n 'threat_result': 'attack_result',\n 'threat_confidence': 'confidence',\n 'connection_established': 'established',\n 'real_attack': 'attack_flag',\n}\n\ndef flatten_dict(d, prefix=''):\n res = {}\n for k, v in d.items():\n if isinstance(v, dict):\n res.update(flatten_dict(v, f'{prefix}{k}_'))\n else:\n res[f'{prefix}{k}'] = v\n return res\n\ndef make_uuid(norm):\n return str(uuid.uuid3(uuid.NAMESPACE_DNS, ''.join(str(v) for v in norm.values())))\n\ndef normalize_single(alert):\n flat = flatten_dict(alert)\n norm = {}\n for std_key, raw_key in SKYEYE_FIELD_MAP.items():\n norm[std_key] = flat.get(raw_key, 'none') if raw_key != 'none' else 'none'\n if norm.get('id') in ('none', None, ''):\n norm['id'] = make_uuid(norm)\n if norm.get('net_type') in ('none', None, ''):\n method = flat.get('method', 'none')\n norm['net_type'] = 'http' if method in HTTP_METHODS else ('none' if method == 'none' else 'other')\n return norm\n\nraw_alerts = inputs.get('raw_alerts', [])\nstats = dict(inputs.get('stats', {}))\nnormalized = [normalize_single(a) for a in raw_alerts]\nstats['normalized_count'] = len(normalized)\nprint(f'[normalize_skyeye] {len(raw_alerts)} -> {len(normalized)}')\n\noutputs['normalized_alerts'] = normalized\noutputs['stats'] = stats\nfor k in ['source_log_type', 'filter_enabled', 'dedup_enabled',\n 'dedup_threshold', 'strict_fields', 'lsh_fields', 'max_field_len',\n 'max_dedup_keys']:\n outputs[k] = inputs.get(k)" }, { "id": "filter_logs", "type": "python", "description": "Step 2 — 过滤:9 种 process_type 分类;保留 non-scan + HTTP(任意方向 in/out/lateral)的告警", - "code": "normalized_alerts = inputs.get('normalized_alerts', [])\nfilter_enabled = inputs.get('filter_enabled', True)\nsource_log_type = str(inputs.get('source_log_type', 
'tdp')).lower()\nstats = dict(inputs.get('stats', {}))\n\ndef is_scan_alert(threat_name):\n tnl = str(threat_name or '').lower()\n return ('扫描' in tnl) and ('webshell' not in tnl)\n\ndef get_threat_type(alert, source):\n if source == 'skyeye':\n return str(alert.get('threat_type', 'general') or 'general')\n return str(alert.get('threat_name', 'general') or 'general')\n\ndef is_http(alert):\n for field in ('application_layer_protocol', 'net_type', 'net_app_proto'):\n val = str(alert.get(field, '') or '').lower()\n if val and val != 'none' and 'http' in val:\n return True\n return False\n\ndef get_process_type(alert, source):\n threat_name = alert.get('threat_name', '')\n direction = str(alert.get('direction', '') or '').lower()\n scan = is_scan_alert(threat_name)\n http = is_http(alert)\n if source == 'skyeye':\n return 'alert_scan_direction_in' if scan else 'alert_not_scan_http_direction_in'\n if scan:\n return f'alert_scan_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_scan_direction_in'\n if http:\n return f'alert_not_scan_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_scan_http_direction_in'\n return f'alert_not_scan_not_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_process'\n\nNEED_ANALYSIS = {\n 'alert_not_scan_http_direction_in',\n 'alert_not_scan_http_direction_out',\n 'alert_not_scan_http_direction_lateral',\n}\n\nfiltered = []\nprocess_type_counts = {}\nfor alert in normalized_alerts:\n alert = dict(alert)\n if filter_enabled:\n ptype = get_process_type(alert, source_log_type)\n need = ptype in NEED_ANALYSIS\n threat_type = get_threat_type(alert, source_log_type)\n else:\n ptype = 'filter_disabled'\n need = True\n threat_type = get_threat_type(alert, source_log_type)\n process_type_counts[ptype] = process_type_counts.get(ptype, 0) + 1\n alert['_process_type'] = ptype\n alert['_threat_type'] = threat_type\n if need:\n 
filtered.append(alert)\n\nprint(f'[filter] input={len(normalized_alerts)}, kept={len(filtered)}')\nprint(f'[filter] process_type_counts={process_type_counts}')\n\nstats['after_filter_count'] = len(filtered)\nstats['filter_removed_count'] = len(normalized_alerts) - len(filtered)\nstats['filter_process_type_counts'] = process_type_counts\n\noutputs['filtered_alerts'] = filtered\noutputs['stats'] = stats\nfor k in ['dedup_enabled', 'dedup_threshold', 'strict_fields', 'lsh_fields', 'max_field_len']:\n outputs[k] = inputs.get(k)" + "code": "normalized_alerts = inputs.get('normalized_alerts', [])\nfilter_enabled = inputs.get('filter_enabled', True)\nsource_log_type = str(inputs.get('source_log_type', 'tdp')).lower()\nstats = dict(inputs.get('stats', {}))\n\ndef is_scan_alert(threat_name):\n tnl = str(threat_name or '').lower()\n return ('扫描' in tnl) and ('webshell' not in tnl)\n\ndef get_threat_type(alert, source):\n if source == 'skyeye':\n return str(alert.get('threat_type', 'general') or 'general')\n return str(alert.get('threat_name', 'general') or 'general')\n\ndef is_http(alert):\n for field in ('application_layer_protocol', 'net_type', 'net_app_proto'):\n val = str(alert.get(field, '') or '').lower()\n if val and val != 'none' and 'http' in val:\n return True\n return False\n\ndef get_process_type(alert, source):\n threat_name = alert.get('threat_name', '')\n direction = str(alert.get('direction', '') or '').lower()\n scan = is_scan_alert(threat_name)\n http = is_http(alert)\n if source == 'skyeye':\n return 'alert_scan_direction_in' if scan else 'alert_not_scan_http_direction_in'\n if scan:\n return f'alert_scan_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_scan_direction_in'\n if http:\n return f'alert_not_scan_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_scan_http_direction_in'\n return f'alert_not_scan_not_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 
'alert_not_process'\n\nNEED_ANALYSIS = {\n 'alert_not_scan_http_direction_in',\n 'alert_not_scan_http_direction_out',\n 'alert_not_scan_http_direction_lateral',\n}\n\nfiltered = []\nprocess_type_counts = {}\nfor alert in normalized_alerts:\n alert = dict(alert)\n if filter_enabled:\n ptype = get_process_type(alert, source_log_type)\n need = ptype in NEED_ANALYSIS\n threat_type = get_threat_type(alert, source_log_type)\n else:\n ptype = 'filter_disabled'\n need = True\n threat_type = get_threat_type(alert, source_log_type)\n process_type_counts[ptype] = process_type_counts.get(ptype, 0) + 1\n alert['_process_type'] = ptype\n alert['_threat_type'] = threat_type\n if need:\n filtered.append(alert)\n\nprint(f'[filter] input={len(normalized_alerts)}, kept={len(filtered)}')\nprint(f'[filter] process_type_counts={process_type_counts}')\n\nstats['after_filter_count'] = len(filtered)\nstats['filter_removed_count'] = len(normalized_alerts) - len(filtered)\nstats['filter_process_type_counts'] = process_type_counts\n\noutputs['filtered_alerts'] = filtered\noutputs['stats'] = stats\nfor k in ['dedup_enabled', 'dedup_threshold', 'strict_fields', 'lsh_fields', 'max_field_len',\n 'max_dedup_keys']:\n outputs[k] = inputs.get(k)" }, { "id": "dedup_logs", "type": "python", - "description": "Step 3 — Dedup (terminal): URI normalization + MinHash LSH (datasketch, 128 perms, 5-gram shingles). LSH index + dedup_key cache are persisted to ~/.flocks/workspace/workflows/http_alert_dedup/ with atomic write and a cross-platform exclusive file lock (POSIX fcntl / Windows msvcrt). Survives restarts; safe for concurrent runs. 
When dedup_enabled=False, no disk state is read or written and cross-batch duplicate detection is intentionally disabled.", - "code": "import os\nimport re\nimport sys\nimport pickle\nimport hashlib\nfrom datasketch import MinHash, MinHashLSH\nIS_WINDOWS = sys.platform == 'win32'\nif IS_WINDOWS:\n import msvcrt # noqa: F401\nelse:\n import fcntl # noqa: F401\n\nMINHASH_SEED = 2024\nNUM_PERM = 128\nWORKFLOW_NAME = 'http_alert_dedup'\nLSH_CLUSTER_WARN_THRESHOLD = 100000 # Warn when persisted cluster or dedup-key count exceeds this.\n\ndef normalize_uri(uri):\n # Normalize dynamic segments in URIs to reduce noise before shingling.\n # Mirrors utils.normalize_uri() in lsh_processor.py.\n uri = str(uri or '')\n uri = re.sub(r'\\d{4}-\\d{2}-\\d{2}', 'DATETIME', uri)\n uri = re.sub(r'[\\da-f]{8}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{12}', 'UUID', uri, flags=re.IGNORECASE)\n uri = re.sub(r'(\\.\\./)+', 'TRAVERSAL', uri)\n uri = re.sub(r'\\bNULL\\b', 'NULL_REPLACED', uri)\n uri = re.sub(r'chr\\$\\d+\\$\\|\\|chr\\$\\d+\\$', 'CHR_SEQUENCE', uri)\n uri = re.sub(r'\\b\\d+={1,2}\\d+\\b', 'NUMBER_COMPARISON', uri)\n uri = re.sub(r'\\b[a-fA-F0-9]{32}\\b', 'HEXADECIMAL CHARACTERS', uri)\n return uri\n\ndef gen_minhash(text, permutations):\n # Build a MinHash signature from 5-gram shingles of the text.\n # Mirrors LSHProcessor.gen_text_minhash(text, k=5).\n shingles = [text[i:i+5] for i in range(len(text) - 4)]\n m = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED, permutations=permutations)\n for s in shingles:\n m.update(s.encode('utf-8'))\n return m\n\ndef get_state_paths(threshold):\n # Persist under ~/.flocks/workspace/workflows// \u2014 the flocks workspace\n # output directory, consistent with where other runtime outputs land.\n from flocks.config import Config\n flocks_root = Config().get_global().data_dir.parent # ~/.flocks\n state_dir = str(flocks_root / 'workspace' / 'workflows' / WORKFLOW_NAME)\n os.makedirs(state_dir, exist_ok=True)\n base = os.path.join(state_dir, 
f'lsh_state_np{NUM_PERM}_th{int(threshold * 100)}')\n return base + '.pkl', base + '.lock'\n\ndef acquire_lock(lock_path):\n # Cross-platform exclusive file lock that serializes load+modify+dump across\n # concurrent workflow runs racing on the same state file.\n # POSIX: fcntl.flock(LOCK_EX) — blocks until acquired.\n # Windows: msvcrt.locking(LK_LOCK) — locks 1 byte; blocks ~10s per call,\n # so we loop until the lock is granted.\n fh = open(lock_path, 'w+')\n if IS_WINDOWS:\n fh.write('L')\n fh.flush()\n fh.seek(0)\n while True:\n try:\n msvcrt.locking(fh.fileno(), msvcrt.LK_LOCK, 1)\n break\n except OSError:\n # LK_LOCK retried internally and still failed; loop and try again.\n continue\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_EX)\n return fh\n\ndef release_lock(fh):\n try:\n if IS_WINDOWS:\n try:\n fh.seek(0)\n msvcrt.locking(fh.fileno(), msvcrt.LK_UNLCK, 1)\n except OSError:\n pass\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_UN)\n finally:\n fh.close()\n\ndef load_state(state_path, threshold):\n # Returns (lsh_index, lsh_cache, dedup_key_cache).\n # On any error or parameter mismatch, returns (None, None, None) so the caller\n # initializes fresh state.\n if not os.path.exists(state_path) or os.path.getsize(state_path) == 0:\n return None, None, None\n try:\n with open(state_path, 'rb') as f:\n state = pickle.load(f)\n if state.get('num_perm') != NUM_PERM or state.get('threshold') != threshold:\n print(f'[dedup] state params mismatch (stored np={state.get(\"num_perm\")}, th={state.get(\"threshold\")}), starting fresh')\n return None, None, None\n cache = state['lsh_cache']\n seen = state.get('dedup_key_cache', set())\n print(f'[dedup] loaded state: {len(cache)} clusters, {len(seen)} dedup_keys from {state_path}')\n return state['lsh_index'], cache, seen\n except Exception as e:\n print(f'[dedup] failed to load state ({e}), starting fresh')\n return None, None, None\n\ndef dump_state_atomic(state_path, lsh_index, lsh_cache, dedup_key_cache, 
threshold):\n # Atomic write: pickle to .tmp, fsync, then os.replace() over the target.\n # Crash mid-write leaves the original file intact instead of a corrupt half-file.\n tmp = state_path + '.tmp'\n try:\n state = {\n 'lsh_index': lsh_index,\n 'lsh_cache': lsh_cache,\n 'dedup_key_cache': dedup_key_cache,\n 'num_perm': NUM_PERM,\n 'threshold': threshold,\n }\n with open(tmp, 'wb') as f:\n pickle.dump(state, f)\n f.flush()\n os.fsync(f.fileno())\n os.replace(tmp, state_path)\n print(f'[dedup] state saved: {len(lsh_cache)} clusters, {len(dedup_key_cache)} dedup_keys -> {state_path}')\n except Exception as e:\n print(f'[dedup] failed to save state: {e}')\n if os.path.exists(tmp):\n try:\n os.remove(tmp)\n except Exception:\n pass\n\nfiltered_alerts = inputs.get('filtered_alerts', [])\ndedup_enabled = inputs.get('dedup_enabled', True)\nthreshold = float(inputs.get('dedup_threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_len = int(inputs.get('max_field_len', 500))\nstats = dict(inputs.get('stats', {}))\n\n# Shared permutation parameters across all MinHash objects.\n# Mirrors LSHProcessor.__init__: self.permutations = MinHash(...).permutations\n_permutations = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED).permutations\n\nstate_path, lock_path = get_state_paths(threshold)\nlock_fh = acquire_lock(lock_path) if dedup_enabled else None\n\ntry:\n # lsh_index: MinHashLSH band/row index — O(1) approximate candidate lookup.\n # lsh_cache: int cluster_id -> MinHash, used for exact Jaccard re-ranking.\n # dedup_key_cache: set of MD5 dedup_keys ever seen — survives restarts so that\n # dedup_key_already_exists is correct across batches.\n if dedup_enabled:\n lsh_index, lsh_cache, dedup_key_cache = load_state(state_path, threshold)\n if lsh_index is None:\n lsh_index = MinHashLSH(threshold=threshold, num_perm=NUM_PERM)\n lsh_cache = {}\n dedup_key_cache = set()\n 
else:\n lsh_index, lsh_cache, dedup_key_cache = None, {}, set()\n\n def query_most_similar(minhash):\n # Mirrors LSHProcessor.query_most_similar():\n # 1. LSH candidate lookup (fast, approximate).\n # 2. Exact Jaccard re-ranking among top-100 candidates.\n # 3. If no candidate found, insert as a new cluster and return its id.\n sim_keys = lsh_index.query(minhash)\n if sim_keys:\n candidates = sim_keys[:100]\n sims = [minhash.jaccard(lsh_cache[k]) for k in candidates]\n return candidates[sims.index(max(sims))]\n cluster_id = len(lsh_cache)\n lsh_index.insert(cluster_id, minhash)\n lsh_cache[cluster_id] = minhash\n return cluster_id\n\n keyed = []\n for alert in filtered_alerts:\n alert = dict(alert)\n text_strict = '. '.join(str(alert.get(f, ''))[:max_len] for f in strict_fields)\n text_lsh = normalize_uri('. '.join(str(alert.get(f, ''))[:max_len] for f in lsh_fields))\n\n if not dedup_enabled:\n # Dedup disabled: hash raw text, no clustering, no cross-batch tracking.\n dk = hashlib.md5(f'{text_strict}. {text_lsh}'.encode('utf-8')).hexdigest()\n alert['_lsh_cluster_id'] = None\n alert['dedup_key'] = dk\n alert['dedup_key_already_exists'] = dk in dedup_key_cache\n dedup_key_cache.add(dk)\n keyed.append(alert)\n continue\n\n mh = gen_minhash(text_lsh.lower(), _permutations)\n cluster_id = query_most_similar(mh)\n alert['_lsh_cluster_id'] = cluster_id\n\n # dedup_key = MD5(strict_fields_text + '.' + cluster_id)\n # Mirrors LogDedup._generate_dedup_key_text / hashlib_gen_md5_str.\n dk = hashlib.md5(f'{text_strict}. 
{cluster_id}'.encode('utf-8')).hexdigest()\n alert['dedup_key'] = dk\n # Cross-batch awareness: dedup_key_cache is loaded from disk at start.\n alert['dedup_key_already_exists'] = dk in dedup_key_cache\n dedup_key_cache.add(dk)\n keyed.append(alert)\n\n if dedup_enabled:\n if len(lsh_cache) > LSH_CLUSTER_WARN_THRESHOLD or len(dedup_key_cache) > LSH_CLUSTER_WARN_THRESHOLD:\n print(f'[dedup] WARNING: persisted state holds {len(lsh_cache)} clusters '\n f'and {len(dedup_key_cache)} dedup_keys (threshold={LSH_CLUSTER_WARN_THRESHOLD}); '\n f'consider rotating state file at {state_path}')\n dump_state_atomic(state_path, lsh_index, lsh_cache, dedup_key_cache, threshold)\nfinally:\n if lock_fh is not None:\n release_lock(lock_fh)\n\nseen = {}\nunique_alerts = []\nfor a in keyed:\n k = a['dedup_key']\n if k not in seen:\n seen[k] = a\n unique_alerts.append(a)\n\ndup_count = len(keyed) - len(unique_alerts)\nprint(f'[dedup] input={len(filtered_alerts)}, unique_clusters={len(unique_alerts)}, deduped={dup_count}')\n\nstats['after_dedup_count'] = len(keyed)\nstats['unique_key_count'] = len(unique_alerts)\nstats['dedup_removed_count'] = dup_count\nstats['dedup_ratio'] = round(dup_count / len(keyed), 4) if keyed else 0.0\nstats['dedup_state_persisted'] = bool(dedup_enabled)\nif dedup_enabled:\n # Only expose persistence-related fields when state is actually read/written.\n stats['lsh_total_clusters'] = len(lsh_cache)\n stats['lsh_total_dedup_keys'] = len(dedup_key_cache)\n stats['lsh_state_path'] = state_path\n\nif dedup_enabled:\n summary = (\n f'http_alert_dedup done: raw={stats.get(\"raw_count\", 0)}'\n f' -> normalized={stats.get(\"normalized_count\", 0)}'\n f' -> filtered={stats.get(\"after_filter_count\", 0)}'\n f' -> unique_clusters={len(unique_alerts)} (compression {stats[\"dedup_ratio\"]:.1%})'\n f' | persisted_clusters={len(lsh_cache)}, persisted_keys={len(dedup_key_cache)}'\n )\nelse:\n summary = (\n f'http_alert_dedup done (dedup_enabled=False, no state persisted): '\n 
f'raw={stats.get(\"raw_count\", 0)}'\n f' -> normalized={stats.get(\"normalized_count\", 0)}'\n f' -> filtered={stats.get(\"after_filter_count\", 0)}'\n f' -> unique={len(unique_alerts)} (in-batch only)'\n )\nprint(f'[dedup] {summary}')\n\noutputs['deduped_alerts'] = keyed\noutputs['unique_alerts'] = unique_alerts\noutputs['stats'] = stats\noutputs['dedup_summary'] = summary" + "description": "Step 3 — Dedup (terminal): URI normalization + MinHash LSH (datasketch, 128 perms, 5-gram shingles). LSH index + dedup_key cache are persisted to ~/.flocks/workspace/workflows/http_alert_dedup/ with atomic write and a cross-platform exclusive file lock (POSIX fcntl / Windows msvcrt). FIFO LRU eviction enforces max_dedup_keys (default 100,000, tunable via inputs.max_dedup_keys); cluster_id is monotonically allocated to survive eviction. Survives restarts; safe for concurrent runs. When dedup_enabled=False, no disk state is read or written.", + "code": "\nimport os\nimport re\nimport sys\nimport pickle\nimport hashlib\nfrom datasketch import MinHash, MinHashLSH\nIS_WINDOWS = sys.platform == 'win32'\nif IS_WINDOWS:\n import msvcrt # noqa: F401\nelse:\n import fcntl # noqa: F401\n\nMINHASH_SEED = 2024\nNUM_PERM = 128\nWORKFLOW_NAME = 'http_alert_dedup'\nLSH_CLUSTER_WARN_THRESHOLD = 100000 # Warn when persisted cluster or dedup-key count exceeds this.\n\ndef normalize_uri(uri):\n # Normalize dynamic segments in URIs to reduce noise before shingling.\n uri = str(uri or '')\n uri = re.sub(r'\\d{4}-\\d{2}-\\d{2}', 'DATETIME', uri)\n uri = re.sub(r'[\\da-f]{8}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{12}', 'UUID', uri, flags=re.IGNORECASE)\n uri = re.sub(r'(\\.\\./)+', 'TRAVERSAL', uri)\n uri = re.sub(r'\\bNULL\\b', 'NULL_REPLACED', uri)\n uri = re.sub(r'chr\\$\\d+\\$\\|\\|chr\\$\\d+\\$', 'CHR_SEQUENCE', uri)\n uri = re.sub(r'\\b\\d+={1,2}\\d+\\b', 'NUMBER_COMPARISON', uri)\n uri = re.sub(r'\\b[a-fA-F0-9]{32}\\b', 'HEXADECIMAL CHARACTERS', uri)\n return uri\n\ndef 
gen_minhash(text, permutations):\n # Build a MinHash signature from 5-gram shingles of the text.\n shingles = [text[i:i+5] for i in range(len(text) - 4)]\n m = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED, permutations=permutations)\n for s in shingles:\n m.update(s.encode('utf-8'))\n return m\n\ndef get_state_paths(threshold):\n from flocks.config import Config\n flocks_root = Config().get_global().data_dir.parent # ~/.flocks\n state_dir = str(flocks_root / 'workspace' / 'workflows' / WORKFLOW_NAME)\n os.makedirs(state_dir, exist_ok=True)\n base = os.path.join(state_dir, f'lsh_state_np{NUM_PERM}_th{int(threshold * 100)}')\n return base + '.pkl', base + '.lock'\n\ndef acquire_lock(lock_path):\n fh = open(lock_path, 'w+')\n if IS_WINDOWS:\n fh.write('L')\n fh.flush()\n fh.seek(0)\n while True:\n try:\n msvcrt.locking(fh.fileno(), msvcrt.LK_LOCK, 1)\n break\n except OSError:\n continue\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_EX)\n return fh\n\ndef release_lock(fh):\n try:\n if IS_WINDOWS:\n try:\n fh.seek(0)\n msvcrt.locking(fh.fileno(), msvcrt.LK_UNLCK, 1)\n except OSError:\n pass\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_UN)\n finally:\n fh.close()\n\ndef load_state(state_path, threshold):\n # Returns (lsh_index, lsh_cache_dict, dedup_key_cache_dict, next_cluster_id).\n # On any error or parameter mismatch, returns (None, None, None, 0).\n # Backward-compat: legacy state stored dedup_key_cache as a set; we coerce to dict.\n if not os.path.exists(state_path) or os.path.getsize(state_path) == 0:\n return None, None, None, 0\n try:\n with open(state_path, 'rb') as f:\n state = pickle.load(f)\n if state.get('num_perm') != NUM_PERM or state.get('threshold') != threshold:\n print(f'[dedup] state params mismatch (stored np={state.get(\"num_perm\")}, th={state.get(\"threshold\")}), starting fresh')\n return None, None, None, 0\n cache = state['lsh_cache']\n # Coerce dedup_key_cache to dict (insertion-ordered) for FIFO eviction.\n seen_raw = 
state.get('dedup_key_cache', {})\n if isinstance(seen_raw, set):\n # Legacy format: set has no order. Best-effort: treat as one batch.\n seen = {k: None for k in seen_raw}\n elif isinstance(seen_raw, dict):\n seen = seen_raw\n else:\n seen = {}\n # next_cluster_id: prefer stored, otherwise derive from max key (legacy state).\n next_cid = state.get('next_cluster_id')\n if next_cid is None:\n next_cid = (max(cache.keys()) + 1) if cache else 0\n print(f'[dedup] loaded state: {len(cache)} clusters, {len(seen)} dedup_keys, next_cid={next_cid} from {state_path}')\n return state['lsh_index'], cache, seen, next_cid\n except Exception as e:\n print(f'[dedup] failed to load state ({e}), starting fresh')\n return None, None, None, 0\n\ndef evict_oldest(lsh_index, lsh_cache, dedup_key_cache, max_keys):\n # FIFO eviction to enforce max_keys upper bound on both caches.\n # Returns (evicted_keys, evicted_clusters) for logging.\n evicted_keys = 0\n evicted_clusters = 0\n excess_keys = len(dedup_key_cache) - max_keys\n if excess_keys > 0:\n # dict preserves insertion order in Python 3.7+; iterate to get oldest.\n old_keys = list(dedup_key_cache.keys())[:excess_keys]\n for k in old_keys:\n del dedup_key_cache[k]\n evicted_keys = excess_keys\n excess_clusters = len(lsh_cache) - max_keys\n if excess_clusters > 0:\n old_cids = list(lsh_cache.keys())[:excess_clusters]\n for cid in old_cids:\n # Drop from LSH band index *and* the cache dict, otherwise lsh_index.query\n # would return cluster_ids missing from lsh_cache and trigger KeyError.\n try:\n lsh_index.remove(cid)\n except (KeyError, ValueError):\n pass\n del lsh_cache[cid]\n evicted_clusters = excess_clusters\n return evicted_keys, evicted_clusters\n\ndef dump_state_atomic(state_path, lsh_index, lsh_cache, dedup_key_cache, threshold, next_cluster_id):\n tmp = state_path + '.tmp'\n try:\n state = {\n 'lsh_index': lsh_index,\n 'lsh_cache': lsh_cache,\n 'dedup_key_cache': dedup_key_cache,\n 'next_cluster_id': next_cluster_id,\n 
'num_perm': NUM_PERM,\n 'threshold': threshold,\n }\n with open(tmp, 'wb') as f:\n pickle.dump(state, f)\n f.flush()\n os.fsync(f.fileno())\n os.replace(tmp, state_path)\n print(f'[dedup] state saved: {len(lsh_cache)} clusters, {len(dedup_key_cache)} dedup_keys -> {state_path}')\n except Exception as e:\n print(f'[dedup] failed to save state: {e}')\n if os.path.exists(tmp):\n try:\n os.remove(tmp)\n except Exception:\n pass\n\nfiltered_alerts = inputs.get('filtered_alerts', [])\ndedup_enabled = inputs.get('dedup_enabled', True)\nthreshold = float(inputs.get('dedup_threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_len = int(inputs.get('max_field_len', 500))\nmax_dedup_keys = int(inputs.get('max_dedup_keys', 100000))\nif max_dedup_keys < 1:\n max_dedup_keys = 100000\nstats = dict(inputs.get('stats', {}))\n\n_permutations = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED).permutations\n\nstate_path, lock_path = get_state_paths(threshold)\nlock_fh = acquire_lock(lock_path) if dedup_enabled else None\n\nevicted_keys = 0\nevicted_clusters = 0\n\ntry:\n # lsh_index: MinHashLSH band/row index — O(1) approximate candidate lookup.\n # lsh_cache: cluster_id -> MinHash, used for exact Jaccard re-ranking.\n # dedup_key_cache: ordered dict of MD5 dedup_keys ever seen.\n # next_cluster_id: monotonic counter; survives eviction so cluster_ids never collide.\n if dedup_enabled:\n lsh_index, lsh_cache, dedup_key_cache, next_cluster_id = load_state(state_path, threshold)\n if lsh_index is None:\n lsh_index = MinHashLSH(threshold=threshold, num_perm=NUM_PERM)\n lsh_cache = {}\n dedup_key_cache = {}\n next_cluster_id = 0\n else:\n lsh_index, lsh_cache, dedup_key_cache, next_cluster_id = None, {}, {}, 0\n\n # next_cluster_id wrapped in a 1-element list so the closure can mutate it.\n # Workflow scripts run at module-top; nonlocal/global doesn't reach this scope.\n _cid_box = 
[next_cluster_id]\n def query_most_similar(minhash):\n sim_keys = lsh_index.query(minhash)\n if sim_keys:\n candidates = sim_keys[:100]\n sims = [minhash.jaccard(lsh_cache[k]) for k in candidates]\n return candidates[sims.index(max(sims))]\n cluster_id = _cid_box[0]\n _cid_box[0] += 1\n lsh_index.insert(cluster_id, minhash)\n lsh_cache[cluster_id] = minhash\n return cluster_id\n\n keyed = []\n for alert in filtered_alerts:\n alert = dict(alert)\n text_strict = '. '.join(str(alert.get(f, ''))[:max_len] for f in strict_fields)\n text_lsh = normalize_uri('. '.join(str(alert.get(f, ''))[:max_len] for f in lsh_fields))\n\n if not dedup_enabled:\n dk = hashlib.md5(f'{text_strict}. {text_lsh}'.encode('utf-8')).hexdigest()\n alert['_lsh_cluster_id'] = None\n alert['dedup_key'] = dk\n alert['dedup_key_already_exists'] = dk in dedup_key_cache\n dedup_key_cache[dk] = None\n keyed.append(alert)\n continue\n\n mh = gen_minhash(text_lsh.lower(), _permutations)\n cluster_id = query_most_similar(mh)\n alert['_lsh_cluster_id'] = cluster_id\n\n dk = hashlib.md5(f'{text_strict}. 
{cluster_id}'.encode('utf-8')).hexdigest()\n alert['dedup_key'] = dk\n # Cross-batch awareness: dedup_key_cache loaded from disk at start.\n # Re-insert refreshes insertion order so recently-seen keys survive eviction longer.\n already = dk in dedup_key_cache\n if already:\n del dedup_key_cache[dk]\n dedup_key_cache[dk] = None\n alert['dedup_key_already_exists'] = already\n keyed.append(alert)\n\n if dedup_enabled:\n # Enforce max_dedup_keys upper bound before persisting.\n evicted_keys, evicted_clusters = evict_oldest(\n lsh_index, lsh_cache, dedup_key_cache, max_dedup_keys\n )\n if evicted_keys or evicted_clusters:\n print(f'[dedup] LRU eviction (max_dedup_keys={max_dedup_keys}): '\n f'dropped {evicted_keys} keys, {evicted_clusters} clusters')\n if len(lsh_cache) > LSH_CLUSTER_WARN_THRESHOLD or len(dedup_key_cache) > LSH_CLUSTER_WARN_THRESHOLD:\n print(f'[dedup] WARNING: persisted state holds {len(lsh_cache)} clusters '\n f'and {len(dedup_key_cache)} dedup_keys (warn={LSH_CLUSTER_WARN_THRESHOLD}); '\n f'consider raising max_dedup_keys or rotating state file at {state_path}')\n dump_state_atomic(state_path, lsh_index, lsh_cache, dedup_key_cache, threshold, _cid_box[0])\nfinally:\n if lock_fh is not None:\n release_lock(lock_fh)\n\nseen = {}\nunique_alerts = []\nfor a in keyed:\n k = a['dedup_key']\n if k not in seen:\n seen[k] = a\n unique_alerts.append(a)\n\ndup_count = len(keyed) - len(unique_alerts)\nprint(f'[dedup] input={len(filtered_alerts)}, unique_clusters={len(unique_alerts)}, deduped={dup_count}')\n\nstats['after_dedup_count'] = len(keyed)\nstats['unique_key_count'] = len(unique_alerts)\nstats['dedup_removed_count'] = dup_count\nstats['dedup_ratio'] = round(dup_count / len(keyed), 4) if keyed else 0.0\nstats['dedup_state_persisted'] = bool(dedup_enabled)\nif dedup_enabled:\n stats['lsh_total_clusters'] = len(lsh_cache)\n stats['lsh_total_dedup_keys'] = len(dedup_key_cache)\n stats['lsh_state_path'] = state_path\n stats['lsh_max_dedup_keys'] = 
max_dedup_keys\n stats['lsh_evicted_keys'] = evicted_keys\n stats['lsh_evicted_clusters'] = evicted_clusters\n\nif dedup_enabled:\n summary = (\n f'http_alert_dedup done: raw={stats.get(\"raw_count\", 0)}'\n f' -> normalized={stats.get(\"normalized_count\", 0)}'\n f' -> filtered={stats.get(\"after_filter_count\", 0)}'\n f' -> unique_clusters={len(unique_alerts)} (compression {stats[\"dedup_ratio\"]:.1%})'\n f' | persisted_clusters={len(lsh_cache)}, persisted_keys={len(dedup_key_cache)}, max={max_dedup_keys}'\n )\nelse:\n summary = (\n f'http_alert_dedup done (dedup_enabled=False, no state persisted): '\n f'raw={stats.get(\"raw_count\", 0)}'\n f' -> normalized={stats.get(\"normalized_count\", 0)}'\n f' -> filtered={stats.get(\"after_filter_count\", 0)}'\n f' -> unique={len(unique_alerts)} (in-batch only)'\n )\nprint(f'[dedup] {summary}')\n\noutputs['deduped_alerts'] = keyed\noutputs['unique_alerts'] = unique_alerts\noutputs['stats'] = stats\noutputs['dedup_summary'] = summary\n" } ], "edges": [ - {"from": "receive_alerts", "to": "branch_log_type", "order": 0}, - {"from": "branch_log_type", "to": "normalize_tdp", "label": "tdp", "order": 0}, - {"from": "branch_log_type", "to": "normalize_skyeye", "label": "skyeye", "order": 1}, - {"from": "normalize_tdp", "to": "filter_logs", "order": 0}, - {"from": "normalize_skyeye", "to": "filter_logs", "order": 0}, - {"from": "filter_logs", "to": "dedup_logs", "order": 0} + { + "from": "receive_alerts", + "to": "branch_log_type", + "order": 0 + }, + { + "from": "branch_log_type", + "to": "normalize_tdp", + "label": "tdp", + "order": 0 + }, + { + "from": "branch_log_type", + "to": "normalize_skyeye", + "label": "skyeye", + "order": 1 + }, + { + "from": "normalize_tdp", + "to": "filter_logs", + "order": 0 + }, + { + "from": "normalize_skyeye", + "to": "filter_logs", + "order": 0 + }, + { + "from": "filter_logs", + "to": "dedup_logs", + "order": 0 + } ], "metadata": { "node_timeout_s": 300, @@ -69,7 +95,8 @@ "threat_name": 
"SQL注入攻击", "threat_type": "web攻击" } - ] + ], + "max_dedup_keys": 100000 } } -} +} \ No newline at end of file diff --git a/.flocks/plugins/workflows/tdp_alert_triage/workflow.json b/.flocks/plugins/workflows/tdp_alert_triage/workflow.json index 9464401be..eba7fb845 100644 --- a/.flocks/plugins/workflows/tdp_alert_triage/workflow.json +++ b/.flocks/plugins/workflows/tdp_alert_triage/workflow.json @@ -1,89 +1,168 @@ { "name": "tdp_alert_triage", - "description": "NDR/TDP alert triage workflow - handles nested TDP alert payloads and automates HTTP-focused security investigation.", - "description_cn": "NDR/TDP 告警调查工作流 - 兼容 TDP 嵌套告警结构并自动化研判 HTTP 类安全事件", + "description": "NDR/TDP HTTP alert triage workflow with parallel survey/cve/payload analysis.", + "description_cn": "TDP/NDR HTTP 日志研判工作流(默认输入为 HTTP 日志,跳过类型判别)— 测绘 / 漏洞分析 / 漏洞详情 / 攻击 payload 四节点并行执行", "start": "receive_alert", "nodes": [ { "id": "receive_alert", "type": "python", - "description": "接收并解析 NDR/TDP 告警数据,提取关键字段、HTTP 请求响应和 IOC", - "code": "import json\nimport re\n\nalert_input = inputs.get('alert_data', inputs.get('alert', {}))\n\nif isinstance(alert_input, str):\n try:\n alert_input = json.loads(alert_input)\n except Exception:\n alert_input = {}\n\nif isinstance(alert_input, list):\n alert_data = alert_input[0] if alert_input else {}\nelif isinstance(alert_input, dict) and isinstance(alert_input.get('data'), list):\n alert_data = alert_input.get('data', [])[0] if alert_input.get('data') else {}\nelse:\n alert_data = alert_input if isinstance(alert_input, dict) else {}\n\nnet = alert_data.get('net', {}) or {}\nhttp = net.get('http', {}) or {}\nthreat = alert_data.get('threat', {}) or {}\nassets = alert_data.get('assets', {}) or {}\n\ndef pick(*values):\n for value in values:\n if value not in (None, '', [], {}):\n return value\n return ''\n\nsrc_ip = pick(\n alert_data.get('attacker'),\n alert_data.get('external_ip'),\n net.get('src_ip'),\n net.get('flow_src_ip'),\n alert_data.get('sip'),\n 
alert_data.get('src_ip'),\n alert_data.get('src')\n)\ndst_ip = pick(\n alert_data.get('victim'),\n alert_data.get('machine'),\n alert_data.get('server_ip'),\n net.get('dest_ip'),\n net.get('flow_dest_ip'),\n alert_data.get('dip'),\n alert_data.get('dst_ip'),\n alert_data.get('dst')\n)\nsrc_port = pick(\n net.get('src_port'),\n net.get('flow_src_port'),\n alert_data.get('external_port'),\n alert_data.get('sport'),\n alert_data.get('src_port'),\n 0\n)\ndst_port = pick(\n net.get('dest_port'),\n net.get('flow_dest_port'),\n alert_data.get('server_port'),\n alert_data.get('machine_port'),\n alert_data.get('dport'),\n alert_data.get('dst_port'),\n 0\n)\nprotocol = pick(\n net.get('app_proto'),\n net.get('type'),\n net.get('proto'),\n alert_data.get('net_app_proto'),\n alert_data.get('protocol'),\n alert_data.get('event_type'),\n 'TCP'\n)\nalert_type = pick(\n threat.get('name'),\n alert_data.get('threat_name'),\n alert_data.get('alert_type'),\n threat.get('topic'),\n alert_data.get('type'),\n 'unknown'\n)\nseverity = pick(\n threat.get('severity'),\n alert_data.get('threat_severity'),\n alert_data.get('severity'),\n threat.get('level'),\n alert_data.get('level'),\n 'medium'\n)\n\nreq_line = pick(http.get('reqs_line'), alert_data.get('req_line'))\nreq_header = pick(http.get('reqs_header'), alert_data.get('req_header'))\nreq_body = pick(http.get('req_body'), alert_data.get('req_body'))\nresp_line = pick(http.get('resp_line'), alert_data.get('rsp_line'), alert_data.get('resp_line'))\nresp_header = pick(http.get('resp_header'), alert_data.get('rsp_header'), alert_data.get('resp_header'))\nresp_body = pick(http.get('resp_body'), alert_data.get('rsp_body'), alert_data.get('resp_body'))\nstatus = pick(http.get('status'), alert_data.get('http_status'), 0)\n\nhost = pick(http.get('reqs_host'), alert_data.get('url_host'), http.get('domain'), dst_ip)\nraw_url = pick(http.get('raw_url'), http.get('url'), alert_data.get('url_path'))\nurl = ''\nif host and raw_url:\n scheme = 'https' 
if net.get('is_https') else 'http'\n url = raw_url if str(raw_url).startswith(('http://', 'https://')) else f'{scheme}://{host}{raw_url}'\n\npayload = f'请求行: {req_line}\\n请求头: {req_header}\\n请求体: {req_body}'\nresponse = f'状态行: {resp_line}\\n响应头: {resp_header}\\n响应体: {resp_body}'\n\nvuln_text = '\\n'.join(str(item) for item in [\n threat.get('msg', ''),\n threat.get('topic', ''),\n alert_data.get('data', ''),\n url,\n json.dumps(threat.get('tag', []), ensure_ascii=False)\n] if item)\nvuln_matches = sorted(set(re.findall(r'\\b(?:CVE|CNVD|CNNVD|XVE)-[A-Za-z0-9._-]+\\b', vuln_text, flags=re.I)))\nvuln_id = vuln_matches[0] if vuln_matches else ''\n\niocs = []\nfor candidate in [src_ip, dst_ip]:\n if candidate:\n iocs.append({'type': 'ip', 'value': candidate})\nif url:\n iocs.append({'type': 'url', 'value': url})\nif host and not re.match(r'^\\d{1,3}(?:\\.\\d{1,3}){3}(?::\\d+)?$', str(host)):\n iocs.append({'type': 'domain', 'value': str(host).split(':')[0]})\n\nsearch_text = '\\n'.join([payload, response, str(alert_data.get('data', '')), url])\nurl_pattern = r'https?://[^\\s<>\"]+'\nfor matched_url in re.findall(url_pattern, search_text):\n if matched_url not in [ioc['value'] for ioc in iocs if ioc['type'] == 'url']:\n iocs.append({'type': 'url', 'value': matched_url})\n\ndomain_pattern = r'(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\\.)+(?:com|net|org|io|cn|co|edu|gov|info|biz|xyz|top|cc|tk|ml|ga|cf|gq|pw|ws|site|online|club|shop|live|fun|tech|pro|app|dev|cloud|host|space|vip|tw|hk|jp|kr|ru|uk|de|fr|eu|au|ca)'\nfor domain in re.findall(domain_pattern, search_text):\n if domain not in [ioc['value'] for ioc in iocs if ioc['type'] == 'domain']:\n iocs.append({'type': 'domain', 'value': domain})\n\noutputs['parsed_alert'] = {\n 'src_ip': src_ip,\n 'dst_ip': dst_ip,\n 'src_port': src_port,\n 'dst_port': dst_port,\n 'protocol': protocol,\n 'payload': payload,\n 'response': response,\n 'url': url,\n 'status': status,\n 'alert_type': alert_type,\n 'severity': severity,\n 
'vuln_id': vuln_id,\n 'vuln_candidates': vuln_matches,\n 'threat_result': threat.get('result', ''),\n 'threat_confidence': threat.get('confidence', ''),\n 'threat_msg': threat.get('msg', ''),\n 'failed_by': threat.get('failed_by', []),\n 'asset_ip': assets.get('ip', ''),\n 'asset_name': assets.get('name', []),\n 'iocs': iocs,\n 'raw_alert': alert_data\n}\noutputs['has_vuln_id'] = bool(vuln_id)\noutputs['iocs'] = iocs" + "description": "解析 NDR/TDP 告警,提取 HTTP 请求/响应、IOC、威胁字段,并生成统一 log_text 供下游 LLM 使用", + "code": "\nimport json\nimport re\n\nalert_input = inputs.get('alert_data', inputs.get('alert', {}))\n\nif isinstance(alert_input, str):\n try:\n alert_input = json.loads(alert_input)\n except Exception:\n alert_input = {}\n\nif isinstance(alert_input, list):\n alert_data = alert_input[0] if alert_input else {}\nelif isinstance(alert_input, dict) and isinstance(alert_input.get('data'), list):\n alert_data = alert_input.get('data', [])[0] if alert_input.get('data') else {}\nelse:\n alert_data = alert_input if isinstance(alert_input, dict) else {}\n\nnet = alert_data.get('net', {}) or {}\nhttp = net.get('http', {}) or {}\nthreat = alert_data.get('threat', {}) or {}\nassets = alert_data.get('assets', {}) or {}\n\ndef pick(*values):\n for value in values:\n if value not in (None, '', [], {}):\n return value\n return ''\n\nsrc_ip = pick(alert_data.get('attacker'), alert_data.get('external_ip'),\n net.get('src_ip'), net.get('flow_src_ip'),\n alert_data.get('sip'), alert_data.get('src_ip'), alert_data.get('src'))\ndst_ip = pick(alert_data.get('victim'), alert_data.get('machine'),\n alert_data.get('server_ip'), net.get('dest_ip'), net.get('flow_dest_ip'),\n alert_data.get('dip'), alert_data.get('dst_ip'), alert_data.get('dst'))\nsrc_port = pick(net.get('src_port'), net.get('flow_src_port'),\n alert_data.get('external_port'), alert_data.get('sport'),\n alert_data.get('src_port'), 0)\ndst_port = pick(net.get('dest_port'), net.get('flow_dest_port'),\n 
alert_data.get('server_port'), alert_data.get('machine_port'),\n alert_data.get('dport'), alert_data.get('dst_port'), 0)\nprotocol = pick(net.get('app_proto'), net.get('type'), net.get('proto'),\n alert_data.get('net_app_proto'), alert_data.get('protocol'),\n alert_data.get('event_type'), 'TCP')\nalert_type = pick(threat.get('name'), alert_data.get('threat_name'),\n alert_data.get('alert_type'), threat.get('topic'),\n alert_data.get('type'), 'unknown')\nseverity = pick(threat.get('severity'), alert_data.get('threat_severity'),\n alert_data.get('severity'), threat.get('level'),\n alert_data.get('level'), 'medium')\n\nreq_line = pick(http.get('reqs_line'), alert_data.get('req_line'))\nreq_header = pick(http.get('reqs_header'), alert_data.get('req_header'))\nreq_body = pick(http.get('req_body'), alert_data.get('req_body'))\nresp_line = pick(http.get('resp_line'), alert_data.get('rsp_line'), alert_data.get('resp_line'))\nresp_header = pick(http.get('resp_header'), alert_data.get('rsp_header'), alert_data.get('resp_header'))\nresp_body = pick(http.get('resp_body'), alert_data.get('rsp_body'), alert_data.get('resp_body'))\nstatus = pick(http.get('status'), alert_data.get('http_status'), 0)\n\nhost = pick(http.get('reqs_host'), alert_data.get('url_host'), http.get('domain'), dst_ip)\nraw_url = pick(http.get('raw_url'), http.get('url'), alert_data.get('url_path'))\nurl = ''\nif host and raw_url:\n scheme = 'https' if net.get('is_https') else 'http'\n url = raw_url if str(raw_url).startswith(('http://', 'https://')) else f'{scheme}://{host}{raw_url}'\n\npayload = f'请求行: {req_line}\\n请求头: {req_header}\\n请求体: {req_body}'\nresponse = f'状态行: {resp_line}\\n响应头: {resp_header}\\n响应体: {resp_body}'\n\n# Render the full HTTP transaction once; reused by every prompt downstream.\nlog_text = (\n f'[告警基本信息]\\n'\n f'告警类型: {alert_type}\\n严重级别: {severity}\\n'\n f'源地址: {src_ip}:{src_port}\\n目的地址: {dst_ip}:{dst_port}\\n'\n f'协议: {protocol}\\nURL: {url}\\nHTTP状态码: {status}\\n'\n f'TDP判定: 
{threat.get(\"result\", \"\")}\\nTDP消息: {threat.get(\"msg\", \"\")}\\n\\n'\n f'[HTTP请求内容]\\n{payload}\\n\\n'\n f'[HTTP响应内容]\\n{response}'\n)\n\n# Pre-extract obvious vuln IDs from raw threat fields; the LLM step will refine this later.\nvuln_text = '\\n'.join(str(item) for item in [\n threat.get('msg', ''), threat.get('topic', ''),\n alert_data.get('data', ''), url,\n json.dumps(threat.get('tag', []), ensure_ascii=False),\n] if item)\nvuln_matches = sorted(set(re.findall(r'\\b(?:CVE|CNVD|CNNVD|XVE)-[A-Za-z0-9._-]+\\b', vuln_text, flags=re.I)))\n\niocs = []\nfor candidate in [src_ip, dst_ip]:\n if candidate:\n iocs.append({'type': 'ip', 'value': candidate})\nif url:\n iocs.append({'type': 'url', 'value': url})\nif host and not re.match(r'^\\d{1,3}(?:\\.\\d{1,3}){3}(?::\\d+)?$', str(host)):\n iocs.append({'type': 'domain', 'value': str(host).split(':')[0]})\n\noutputs['parsed_alert'] = {\n 'src_ip': src_ip, 'dst_ip': dst_ip, 'src_port': src_port, 'dst_port': dst_port,\n 'protocol': protocol, 'payload': payload, 'response': response,\n 'url': url, 'status': status,\n 'alert_type': alert_type, 'severity': severity,\n 'vuln_id': vuln_matches[0] if vuln_matches else '',\n 'vuln_candidates': vuln_matches,\n 'threat_result': threat.get('result', ''),\n 'threat_msg': threat.get('msg', ''),\n 'failed_by': threat.get('failed_by', []),\n 'asset_ip': assets.get('ip', ''), 'asset_name': assets.get('name', []),\n 'iocs': iocs, 'raw_alert': alert_data, 'log_text': log_text,\n}\noutputs['log_text'] = log_text\noutputs['iocs'] = iocs\n" }, { - "id": "query_threat_intel", + "id": "prepare_intel", "type": "python", - "description": "查询外部 IOC 的威胁情报,自动跳过内网/保留地址", - "code": "import ipaddress\n\niocs = inputs.get('iocs', [])\nintel_results = []\nseen = set()\n\ndef is_public_ip(value):\n try:\n ip_obj = ipaddress.ip_address(value)\n except Exception:\n return False\n return not (\n ip_obj.is_private\n or ip_obj.is_loopback\n or ip_obj.is_reserved\n or ip_obj.is_link_local\n or 
ip_obj.is_multicast\n or ip_obj.is_unspecified\n )\n\nfor ioc in iocs:\n ioc_type = ioc.get('type', '')\n ioc_value = str(ioc.get('value', '')).strip()\n key = (ioc_type, ioc_value)\n\n if not ioc_value or key in seen:\n continue\n seen.add(key)\n\n if ioc_type == 'ip':\n if not is_public_ip(ioc_value):\n continue\n result = tool.run_safe('threatbook_ip_query', ip=ioc_value)\n if result['success']:\n intel_results.append({\n 'source': 'threatbook',\n 'type': 'ip',\n 'value': ioc_value,\n 'result': result['text']\n })\n vt_result = tool.run_safe('virustotal_ip_query', ip=ioc_value)\n if vt_result['success']:\n intel_results.append({\n 'source': 'virustotal',\n 'type': 'ip',\n 'value': ioc_value,\n 'result': vt_result['text']\n })\n\n elif ioc_type == 'domain':\n result = tool.run_safe('threatbook_domain_query', domain=ioc_value)\n if result['success']:\n intel_results.append({\n 'source': 'threatbook',\n 'type': 'domain',\n 'value': ioc_value,\n 'result': result['text']\n })\n vt_result = tool.run_safe('virustotal_domain_query', domain=ioc_value)\n if vt_result['success']:\n intel_results.append({\n 'source': 'virustotal',\n 'type': 'domain',\n 'value': ioc_value,\n 'result': vt_result['text']\n })\n\n elif ioc_type == 'url':\n result = tool.run_safe('threatbook_url_query', url=ioc_value)\n if result['success']:\n intel_results.append({\n 'source': 'threatbook',\n 'type': 'url',\n 'value': ioc_value,\n 'result': result['text']\n })\n vt_result = tool.run_safe('virustotal_url_query', url=ioc_value)\n if vt_result['success']:\n intel_results.append({\n 'source': 'virustotal',\n 'type': 'url',\n 'value': ioc_value,\n 'result': vt_result['text']\n })\n\noutputs['intel_results'] = intel_results\noutputs['has_intel'] = len(intel_results) > 0" + "description": "并行前的预处理:查询 IP 威胁情报 + CVE 漏洞情报,生成 intel_content / vuln_content 供后续 survey/cve_info 使用", + "code": "\nimport ipaddress\nimport json\n\n# Pre-fetch external intel (IP geo/asset + CVE info) so the parallel `survey` and 
`cve_info`\n# nodes have a concrete `content` block to feed their LLM prompts. Internal/RFC1918 IPs\n# are skipped to avoid wasted upstream calls.\nparsed_alert = inputs.get('parsed_alert', {})\niocs = inputs.get('iocs', parsed_alert.get('iocs', []))\n\ndef is_public_ip(value):\n try:\n ip_obj = ipaddress.ip_address(value)\n except Exception:\n return False\n return not (ip_obj.is_private or ip_obj.is_loopback or ip_obj.is_reserved\n or ip_obj.is_link_local or ip_obj.is_multicast or ip_obj.is_unspecified)\n\nintel_results = []\nseen = set()\nfor ioc in iocs:\n ioc_type = ioc.get('type', '')\n ioc_value = str(ioc.get('value', '')).strip()\n key = (ioc_type, ioc_value)\n if not ioc_value or key in seen:\n continue\n seen.add(key)\n if ioc_type == 'ip':\n if not is_public_ip(ioc_value):\n continue\n r = tool.run_safe('threatbook_ip_query', ip=ioc_value)\n if r['success']:\n intel_results.append({'source': 'threatbook', 'type': 'ip', 'value': ioc_value, 'result': r['text']})\n elif ioc_type == 'domain':\n r = tool.run_safe('threatbook_domain_query', domain=ioc_value)\n if r['success']:\n intel_results.append({'source': 'threatbook', 'type': 'domain', 'value': ioc_value, 'result': r['text']})\n elif ioc_type == 'url':\n r = tool.run_safe('threatbook_url_query', url=ioc_value)\n if r['success']:\n intel_results.append({'source': 'threatbook', 'type': 'url', 'value': ioc_value, 'result': r['text']})\n\n# CVE intel (only if we already extracted a vuln id in receive_alert).\nvuln_info = {}\nvuln_id = parsed_alert.get('vuln_id', '')\nif vuln_id:\n r = tool.run_safe('__mcp_vuln_query', vuln_id=vuln_id)\n if r['success']:\n try:\n obj = r.get('obj')\n if isinstance(obj, str):\n obj = json.loads(obj)\n vuln_info = obj if isinstance(obj, dict) else {'raw_result': r.get('text', '')}\n except Exception:\n vuln_info = {'raw_result': r.get('text', '')}\n\n# Compact text blobs for downstream prompts.\nintel_content = '\\n'.join(\n f\"[{i['source']}/{i['type']}] 
{i['value']}\\n{i['result']}\" for i in intel_results\n) or '(无可用情报数据)'\n\nvuln_content = (\n json.dumps(vuln_info, ensure_ascii=False, indent=2)\n if vuln_info else '(无可用漏洞情报数据)'\n)\n\nprint(f'[prepare_intel] intel_results={len(intel_results)}, vuln_id={vuln_id or \"none\"}')\n\noutputs['parsed_alert'] = parsed_alert\noutputs['log_text'] = inputs.get('log_text', parsed_alert.get('log_text', ''))\noutputs['intel_results'] = intel_results\noutputs['intel_content'] = intel_content\noutputs['vuln_info'] = vuln_info\noutputs['vuln_content'] = vuln_content\n" }, { - "id": "query_vuln", + "id": "survey", "type": "python", - "description": "仅在识别到标准漏洞编号时查询漏洞信息", - "code": "import json\n\nparsed_alert = inputs.get('parsed_alert', {})\nvuln_id = parsed_alert.get('vuln_id', '')\nvuln_info = {}\n\nif vuln_id:\n result = tool.run_safe('__mcp_vuln_query', vuln_id=vuln_id)\n if result['success']:\n try:\n obj = result.get('obj')\n if isinstance(obj, str):\n obj = json.loads(obj)\n vuln_info = obj if isinstance(obj, dict) else {}\n except Exception:\n vuln_info = {'raw_result': result.get('text', '')}\n else:\n vuln_info = {'error': result.get('error', 'Query failed'), 'vuln_id': vuln_id}\n\noutputs['vuln_info'] = vuln_info\noutputs['has_vuln_info'] = bool(vuln_info) and 'error' not in vuln_info" + "description": "测绘:总结 IP 情报中的空间测绘信息(标签、服务、应用资产)", + "code": "import re\n\ndef _strip_think(text):\n # Remove think-tag reasoning blocks some models output (tag brackets are hex-escaped in the pattern).\n text = re.sub(r'\\x3cthink\\x3e[\\s\\S]*?\\x3c/think\\x3e', '', str(text or ''), flags=re.IGNORECASE).strip()\n return text\n\n\n# Summarize spatial-mapping (CMDB-style) info from intel data per IP.\nlog_text = inputs.get('log_text', '')\nintel_content = inputs.get('intel_content', '')\n\nprompt = f'''你是一个专业的Web日志分析专家。请总结以下IP的情报数据中的空间测绘信息。\n1. 如果该IP没有测绘信息,则不列出。\n2. 如果IP有测绘信息,则以简短的语言对该IP的测绘信息进行总结,关键说明ip的标签和测绘信息显示有哪些服务或者应用资产。\n3. 多个IP的测绘信息以无序列表显示,每个ip数据描述占一行数据。\n4. 
不需要生成其他额外的补充信息。\n\n## 情报参考信息\n{intel_content}\n\n## 用户的原始输入日志\n{log_text}\n'''\n\nresult = _strip_think(llm.ask(prompt))\nprint(f'[survey] {len(result)} chars')\noutputs['survey_result'] = result\n" }, { - "id": "analyze_payload", + "id": "cve_related", "type": "python", - "description": "使用 LLM 分析 HTTP 请求负载并按规范落盘", - "code": "import os\nimport datetime\nfrom flocks.workspace.manager import WorkspaceManager\n\nparsed_alert = inputs.get('parsed_alert', {})\npayload = parsed_alert.get('payload', '')\nalert_type = parsed_alert.get('alert_type', 'unknown')\nurl = parsed_alert.get('url', '')\n\nprompt = f\"\"\"你是一位网络安全专家,正在分析 HTTP 请求负载。\n\n告警类型: {alert_type}\nURL: {url}\n请求内容:\n{payload}\n\n请输出不超过 120 字的中文分析,包含:\n1. 该流量更像攻击、扫描、误报还是正常请求\n2. 具体攻击/扫描方式\n3. 攻击意图或合法目的\n\"\"\"\n\nanalysis_result = llm.ask(prompt)\n\nws = WorkspaceManager.get_instance()\noutput_dir = str(ws.get_workspace_dir() / 'outputs' / datetime.date.today().isoformat())\nartifacts_dir = os.path.join(output_dir, 'artifacts')\nos.makedirs(artifacts_dir, exist_ok=True)\ntool.run('write', filePath=os.path.join(artifacts_dir, 'payload_analysis_llm_output.md'), content=analysis_result)\n\noutputs['payload_analysis'] = analysis_result\noutputs['has_payload_analysis'] = bool(analysis_result)" + "description": "关联漏洞:仅从日志文本中提取 CVE/CNVD/CNNVD/XVE 编号", + "code": "import re\n\ndef _strip_think(text):\n # Remove think-tag reasoning blocks some models output (tag brackets are hex-escaped in the pattern).\n text = re.sub(r'\\x3cthink\\x3e[\\s\\S]*?\\x3c/think\\x3e', '', str(text or ''), flags=re.IGNORECASE).strip()\n return text\n\n\n# Extract CVE/CNVD/CNNVD/XVE numbers strictly from the log text.\nlog_text = inputs.get('log_text', '')\n\nprompt = f'''请从以下的日志数据中提取漏洞编号。\n要求:\n1. 仅从日志文本中识别漏洞编号,不要做任何推测。\n2. 如果日志中存在漏洞编号,则用简短语言描述,如:\"日志中存在漏洞编号:CVE-****-****\"。\n3. 
如果日志中不存在漏洞编号,则输出:\"日志中无关联漏洞情报\"。\n\n日志数据如下:\n{log_text}\n'''\n\nresult = _strip_think(llm.ask(prompt))\nprint(f'[cve_related] {result[:80]}')\noutputs['cve_related_result'] = result\n" }, { - "id": "analyze_response", + "id": "cve_info", "type": "python", - "description": "结合 HTTP 响应和 TDP 判定字段判断攻击是否成功", - "code": "import os\nimport re\nimport json\nimport datetime\nfrom flocks.workspace.manager import WorkspaceManager\n\nparsed_alert = inputs.get('parsed_alert', {})\npayload = parsed_alert.get('payload', '')\nresponse = parsed_alert.get('response', '')\nstatus = parsed_alert.get('status', 0)\nfailed_by = parsed_alert.get('failed_by', [])\nthreat_result = parsed_alert.get('threat_result', '')\n\nprompt = f\"\"\"你是一位网络安全专家,正在分析 HTTP 请求和响应来判断攻击是否成功。\n\n请求内容:\n{payload}\n\n响应内容:\n{response}\n\n补充信号:\n- HTTP 状态码: {status}\n- TDP result: {threat_result}\n- failed_by: {failed_by}\n\n请严格输出 JSON,对象中必须包含以下字段:\n{{\n \\\"is_attack\\\": true/false,\n \\\"attack_success\\\": true/false,\n \\\"summary\\\": \\\"不超过120字的中文结论\\\",\n \\\"success_evidence\\\": [\\\"字符串\\\"],\n \\\"reason\\\": \\\"简要原因\\\"\n}}\n\"\"\"\n\nanalysis_raw = llm.ask(prompt)\nanalysis_obj = {}\ntry:\n matched = re.search(r'\\{.*\\}', analysis_raw, re.S)\n if matched:\n analysis_obj = json.loads(matched.group(0))\nexcept Exception:\n analysis_obj = {}\n\nstatus_int = 0\ntry:\n status_int = int(status)\nexcept Exception:\n status_int = 0\n\ndef fallback_success():\n if any('http_status_4' in item or 'http_status_5' in item for item in failed_by):\n return False\n if status_int >= 400:\n return False\n if threat_result in ('success', 'succeeded'):\n return True\n return False\n\nattack_success = bool(analysis_obj.get('attack_success')) if analysis_obj else fallback_success()\nsummary = analysis_obj.get('summary') if analysis_obj else ''\nreason = analysis_obj.get('reason') if analysis_obj else ''\nsuccess_evidence = analysis_obj.get('success_evidence') if analysis_obj else []\nif not isinstance(success_evidence, 
list):\n success_evidence = [str(success_evidence)] if success_evidence else []\n\nresponse_analysis = summary or analysis_raw\nif reason:\n response_analysis += f'\\n原因: {reason}'\nif success_evidence:\n response_analysis += '\\n成功标志: ' + ';'.join(str(item) for item in success_evidence if item)\n\nws = WorkspaceManager.get_instance()\noutput_dir = str(ws.get_workspace_dir() / 'outputs' / datetime.date.today().isoformat())\nartifacts_dir = os.path.join(output_dir, 'artifacts')\nos.makedirs(artifacts_dir, exist_ok=True)\ntool.run('write', filePath=os.path.join(artifacts_dir, 'response_analysis_llm_output.md'), content=response_analysis)\n\noutputs['response_analysis'] = response_analysis\noutputs['has_response_analysis'] = bool(response_analysis)\noutputs['attack_success'] = attack_success" + "description": "漏洞详情:基于 vuln_content 输出关联漏洞的基本信息(不含修复建议)", + "code": "import re\n\ndef _strip_think(text):\n # Remove think-tag reasoning blocks some models output (tag brackets are hex-escaped in the pattern).\n text = re.sub(r'\\x3cthink\\x3e[\\s\\S]*?\\x3c/think\\x3e', '', str(text or ''), flags=re.IGNORECASE).strip()\n return text\n\n\n# Summarize basic CVE info from intel content (no remediation).\nlog_text = inputs.get('log_text', '')\nvuln_content = inputs.get('vuln_content', '')\n\nprompt = f'''你是一个专业的Web日志分析专家。参考情报信息中的漏洞数据,简要说明关联的CVE漏洞信息。\n1. 不要输出任何解释说明,只输出漏洞基本信息。不需要生成漏洞的处置建议或修复措施等。\n\n## 情报参考信息\n{vuln_content}\n\n## 用户的原始输入日志\n{log_text}\n'''\n\nresult = _strip_think(llm.ask(prompt))\nprint(f'[cve_info] {len(result)} chars')\noutputs['cve_info_result'] = result\n" }, { - "id": "join_results", + "id": "payload_analysis", + "type": "python", + "description": "攻击 payload:分析日志中是否包含攻击负载并给出判定依据,不分析意图与影响", + "code": "import re\n\ndef _strip_think(text):\n # Remove think-tag reasoning blocks some models output (tag brackets are hex-escaped in the pattern).\n text = re.sub(r'\\x3cthink\\x3e[\\s\\S]*?\\x3c/think\\x3e', '', str(text or ''), flags=re.IGNORECASE).strip()\n return text\n\n\n# Payload-only analysis. 
Per prompt rules, no intent/impact discussion.\nlog_text = inputs.get('log_text', '')\n\nprompt = f'''你是一个专业的Web日志分析专家。根据用户输入的日志进行攻击负载分析。\n1. 首先分析日志中是否包含攻击负载,并给出判定依据。\n2. 不要进行攻击意图分析、攻击影响分析。\n3. 用简短的语言在一段话中进行描述。\n\n## 用户的原始输入日志:\n{log_text}\n'''\n\nresult = _strip_think(llm.ask(prompt))\nprint(f'[payload_analysis] {len(result)} chars')\noutputs['payload_analysis_result'] = result\n" + }, + { + "id": "attack_analysis_result", "type": "python", "join": true, - "description": "等待并归一化并行节点输出,供最终报告使用", - "code": "outputs['parsed_alert'] = inputs.get('parsed_alert', {})\noutputs['intel_results'] = inputs.get('intel_results', [])\noutputs['vuln_info'] = inputs.get('vuln_info', {})\noutputs['payload_analysis'] = inputs.get('payload_analysis', '')\noutputs['response_analysis'] = inputs.get('response_analysis', '')\noutputs['attack_success'] = inputs.get('attack_success', False)" + "description": "攻击分析结果(4 个并行节点的 join 点):按五分类标准产出长文本判定", + "code": "import re\n\ndef _strip_think(text):\n # Remove think-tag reasoning blocks some models output (tag brackets are hex-escaped in the pattern).\n text = re.sub(r'\\x3cthink\\x3e[\\s\\S]*?\\x3c/think\\x3e', '', str(text or ''), flags=re.IGNORECASE).strip()\n return text\n\n\n# Join point for the 4 parallel branches; produces the long-form attack-status reasoning\n# that report/verdict/title nodes consume.\nimport os\nimport datetime\nfrom flocks.workspace.manager import WorkspaceManager\n\nlog_text = inputs.get('log_text', '')\n\nprompt = f'''你是一名专业且经验丰富的网络安全分析师和Web日志分析专家,你对HTTP协议以及Web攻击有着深入的理解,并且你能够快速识别和应对各种网络威胁。你的任务是对提供的HTTP请求与响应内容进行详细的专业分析,并判断日志请求的攻击状态。\n\n请严格遵循以下指令进行思考和分析:\n1. 攻击状态只能从以下情况中选择一种:[\"攻击成功\", \"攻击失败\", \"攻击\", \"未知\", \"安全\"]。\n2. 从日志中提取出\"HTTP请求内容\"和\"HTTP响应内容\"。请注意,HTTP请求内容和HTTP响应内容是分开的,请不要混淆,有些日志中没有包含HTTP响应内容,请不要将HTTP请求内容和HTTP响应内容混淆。分析后请你记住哪些是HTTP请求内容,哪些是HTTP响应内容。\n3. 请检查HTTP响应状态码,2xx或者3xx状态码都代表本次HTTP请求成功,4xx或者5xx状态码大多数情况下都代表请求失败,只有在请求成功的情况下才能对攻击是否成功进行后续判断。\n\n各攻击状态的定义以及判定标准:\n1. 
攻击成功:\n(1) 首先分析日志中是否含有清晰的\"HTTP响应内容\",如果日志中没有\"HTTP响应内容\",则肯定不属于攻击成功。\n(2) 如果日志中未提供\"HTTP响应内容\",即使HTTP请求内容中包含攻击者预期的结果,也不能判定为攻击成功。\n(3) 从日志中提取出\"HTTP请求内容\"和\"HTTP响应内容\"。请深入分析\"HTTP响应内容\",并判定其是否为\"HTTP请求内容\"攻击成功时的预期结果,这是判定攻击成功的强依据。请注意,HTTP响应码200仅表示网络连接成功,不代表攻击成功。\n(4) 分析HTTP请求内容和HTTP响应内容,只有当HTTP响应内容中明确包含攻击载荷在目标机器上成功执行的证据,并且HTTP请求内容中包含攻击载荷的特征,则判定为\"攻击成功\"。\n(5) 请注意:攻击成功的判定必须包含HTTP响应内容。如果不包含HTTP响应内容,则肯定不属于攻击成功。\n(6) 请注意:如果不包含HTTP响应内容,即使HTTP请求内容是攻击,这也不属于攻击成功。\n2. 攻击失败:\n(1) 分析HTTP请求内容和HTTP响应内容,如果HTTP响应内容中明确包含攻击载荷在目标机器上执行失败或者被阻止的证据,并且HTTP请求内容中包含攻击载荷的特征,则判定为\"攻击失败\"。\n(2) 攻击失败的判定必须包含HTTP响应内容。如果不包含HTTP响应内容,则肯定不属于攻击失败。\n3. 攻击:\n(1) 在\"HTTP请求内容\"或\"HTTP响应内容\"中发现任何证明存在攻击意图的证据,即可判定为存在攻击行为。但如果不符合上述的攻击成功或者攻击失败的标准,则\"攻击状态\"为\"攻击\"。\n(2) 请注意:如果日志中只提供了\"HTTP请求内容\",且没有提供\"HTTP响应内容\",且HTTP的请求内容分析中是包含攻击行为的,则\"攻击状态\"为\"攻击\"。\n4. 未知:\n(1) 如果不能100%确定HTTP通信的攻击结果,那么请在\"攻击状态\"处给出\"未知\"。\n(2) 请注意:如果在你给的判定原因中存在\"可能\"等不确定词汇,都代表你不能对你的结论100%确定,那么请在\"攻击状态\"处给出\"未知\"。\n5. 安全:\n(1) 如果\"HTTP请求内容\"和\"HTTP响应内容\"中都没有任何攻击意图的证据,那么请在\"攻击状态\"处给出\"安全\"。\n\n## 日志内容\n{log_text}\n\n## 输出要求\n请按下列结构输出(中文):\n1. 攻击状态: [攻击成功/攻击失败/攻击/未知/安全]\n2. 判定依据: 简要说明请求与响应的关键证据\n3. 
详细分析: 不超过200字\n'''\n\nresult = _strip_think(llm.ask(prompt))\nprint(f'[attack_analysis_result] {len(result)} chars')\n\n# Persist intermediate artifact for traceability and downstream re-use.\nws = WorkspaceManager.get_instance()\noutput_dir = str(ws.get_workspace_dir() / 'outputs' / datetime.date.today().isoformat())\nartifacts_dir = os.path.join(output_dir, 'artifacts')\nos.makedirs(artifacts_dir, exist_ok=True)\ntool.run('write',\n filePath=os.path.join(artifacts_dir, 'attack_analysis_result.md'),\n content=result)\n\n# Forward all upstream parallel outputs so report/verdict/title nodes receive them.\noutputs['attack_analysis_result'] = result\noutputs['parsed_alert'] = inputs.get('parsed_alert', {})\noutputs['log_text'] = log_text\noutputs['intel_results'] = inputs.get('intel_results', [])\noutputs['intel_content'] = inputs.get('intel_content', '')\noutputs['vuln_info'] = inputs.get('vuln_info', {})\noutputs['vuln_content'] = inputs.get('vuln_content', '')\noutputs['survey_result'] = inputs.get('survey_result', '')\noutputs['cve_related_result'] = inputs.get('cve_related_result', '')\noutputs['cve_info_result'] = inputs.get('cve_info_result', '')\noutputs['payload_analysis_result'] = inputs.get('payload_analysis_result', '')\n" + }, + { + "id": "attack_verdict", + "type": "python", + "description": "攻击判定:将攻击分析结果归一化为 5 个标签之一(attack_success/attack_failed/attack/unknown/benign)", + "code": "import re\n\ndef _strip_think(text):\n # Remove ... 
reasoning blocks some models output.\n text = re.sub(r'\\x3cthink\\x3e[\\s\\S]*?\\x3c/think\\x3e', '', str(text or ''), flags=re.IGNORECASE).strip()\n return text\n\n\n# Convert the upstream long-form reasoning into a single verdict label.\nattack_analysis_result = inputs.get('attack_analysis_result', '')\n\nprompt = f'''你是一个专业的Web日志分析专家。请据参考信息,直接输出攻击判定类别:\nattack_success:表示攻击成功。\nattack_failed:表示攻击失败。\nattack:表示日志内容是攻击。\nunknown:表示未知。\nbenign:表示安全。\n不额外输出任何其他信息,包括解释、判定依据等。\n\n## 日志分析结果:\n{attack_analysis_result}\n'''\n\nraw = _strip_think(llm.ask(prompt)).strip().lower()\n# Be defensive: LLM may add quotes / punctuation around the label.\nallowed = ('attack_success', 'attack_failed', 'attack', 'unknown', 'benign')\nverdict = next((v for v in allowed if v in raw), 'unknown')\nprint(f'[attack_verdict] raw={raw!r} -> verdict={verdict}')\n\noutputs['attack_verdict'] = verdict\n# Forward everything for the next nodes.\nfor k in ('parsed_alert', 'log_text', 'intel_results', 'intel_content',\n 'vuln_info', 'vuln_content', 'survey_result', 'cve_related_result',\n 'cve_info_result', 'payload_analysis_result', 'attack_analysis_result'):\n outputs[k] = inputs.get(k)\n" + }, + { + "id": "report_title", + "type": "python", + "description": "报告标题:基于攻击类型 + 判定结果生成不超过 30 字的中文报告标题", + "code": "import re\n\ndef _strip_think(text):\n # Remove think-tag reasoning blocks some models output (tag brackets are hex-escaped in the pattern).\n text = re.sub(r'\\x3cthink\\x3e[\\s\\S]*?\\x3c/think\\x3e', '', str(text or ''), flags=re.IGNORECASE).strip()\n return text\n\n\n# Pithy title; encodes attack type + verdict outcome.\nattack_analysis_result = inputs.get('attack_analysis_result', '')\nattack_verdict = inputs.get('attack_verdict', 'unknown')\nparsed_alert = inputs.get('parsed_alert', {})\n\nprompt = f'''你是一个专业的Web日志分析专家。请基于以下分析结果,生成一份不超过 30 字的中文报告标题。\n要求:\n1. 标题必须能体现\"攻击类型\"或\"攻击结果分析的结论\"。\n2. 不要带书名号、引号或其他标点。\n3. 
只输出标题本身,不要任何解释或说明。\n\n## 攻击类型\n{parsed_alert.get('alert_type', 'unknown')}\n\n## 攻击判定\n{attack_verdict}\n\n## 攻击分析结果\n{attack_analysis_result}\n'''\n\nraw = _strip_think(llm.ask(prompt)).strip()\ntitle = raw.splitlines()[0].strip(' \"\\'《》[]【】') if raw else f'{parsed_alert.get(\"alert_type\", \"未知告警\")} - {attack_verdict}'\nprint(f'[report_title] {title}')\n\noutputs['report_title'] = title\nfor k in ('parsed_alert', 'log_text', 'intel_results', 'intel_content',\n 'vuln_info', 'vuln_content', 'survey_result', 'cve_related_result',\n 'cve_info_result', 'payload_analysis_result', 'attack_analysis_result',\n 'attack_verdict'):\n outputs[k] = inputs.get(k)\n" }, { "id": "generate_report", "type": "python", - "description": "汇总 IOC、威胁情报、漏洞、请求响应分析并生成最终报告", - "code": "import os\nimport json\nimport datetime\nfrom flocks.workspace.manager import WorkspaceManager\n\nparsed_alert = inputs.get('parsed_alert', {})\nintel_results = inputs.get('intel_results', [])\nvuln_info = inputs.get('vuln_info', {})\npayload_analysis = inputs.get('payload_analysis', '')\nresponse_analysis = inputs.get('response_analysis', '')\nattack_success = inputs.get('attack_success', False)\n\ndef severity_to_text(value):\n mapping = {0: 'low', 1: 'low', 2: 'medium', 3: 'high', 4: 'critical'}\n if isinstance(value, int):\n return mapping.get(value, str(value))\n if str(value).isdigit():\n return mapping.get(int(value), str(value))\n return str(value) or 'unknown'\n\nseverity_text = severity_to_text(parsed_alert.get('severity', 'unknown'))\nreport_content = f\"\"\"# NDR/TDP 告警分析报告\n\n## 执行摘要\n\n**告警类型**: {parsed_alert.get('alert_type', 'unknown')}\n**严重级别**: {severity_text}\n**攻击是否成功**: {'是' if attack_success else '否'}\n**URL**: {parsed_alert.get('url', 'N/A')}\n\n### 主要发现\n- 源 IP: {parsed_alert.get('src_ip', 'N/A')}\n- 目的 IP: {parsed_alert.get('dst_ip', 'N/A')}\n- 协议: {parsed_alert.get('protocol', 'N/A')}\n- 端口: {parsed_alert.get('src_port', 'N/A')} -> {parsed_alert.get('dst_port', 'N/A')}\n- TDP 判定: 
{parsed_alert.get('threat_result', 'N/A')}\n- 失败信号: {', '.join(parsed_alert.get('failed_by', [])) or '无'}\n\n---\n\n## 详细分析\n\n### 1. IOC\n```json\n{json.dumps(parsed_alert.get('iocs', []), ensure_ascii=False, indent=2)}\n```\n\n### 2. 威胁情报查询结果\n\"\"\"\n\nif intel_results:\n for intel in intel_results:\n report_content += f\"\\n**{intel['source']} - {intel['type']}**: {intel['value']}\\n```\\n{intel['result']}\\n```\\n\"\nelse:\n report_content += '\\n未查询到可用外部情报,或 IOC 仅包含内网/保留地址。\\n'\n\nreport_content += '\\n### 3. 漏洞信息\\n'\nif vuln_info:\n report_content += f\"\\n```json\\n{json.dumps(vuln_info, ensure_ascii=False, indent=2)}\\n```\\n\"\nelse:\n report_content += '\\n未识别到标准漏洞编号,跳过漏洞查询。\\n'\n\nreport_content += f\"\"\"\n### 4. 攻击负载分析\n{payload_analysis or '无'}\n\n### 5. 响应包分析\n{response_analysis or '无'}\n\n---\n\n## 风险评估\n\n- **风险等级**: {'High' if attack_success else 'Medium'}\n- **攻击成功**: {'是 - 建议紧急响应' if attack_success else '否 - 当前更像扫描或失败尝试'}\n\n---\n\n## 建议与行动项\n\n1. {'立即阻断源 IP 通信并检查目标主机是否已落地 WebShell。' if attack_success else '结合 Web/WAF/主机日志复核该请求,确认是否仅为扫描或误报。'}\n2. 检查目标主机 {parsed_alert.get('dst_ip', 'N/A')} 对应 Web 服务和目录访问日志。\n3. 如 URL 涉及敏感上传/脚本路径,建议核查文件完整性与最近变更。\n4. 
将该源地址与其他探测、爆破、上传类告警做关联分析。\n\n---\n\n## 数据来源\n\n- 威胁情报: ThreatBook, VirusTotal\n- 漏洞信息: Vulnerability Database\n- 分析: LLM-based analysis\n\"\"\"\n\nws = WorkspaceManager.get_instance()\noutput_dir = str(ws.get_workspace_dir() / 'outputs' / datetime.date.today().isoformat())\nartifacts_dir = os.path.join(output_dir, 'artifacts')\nos.makedirs(artifacts_dir, exist_ok=True)\nreport_path = os.path.join(artifacts_dir, 'final_report.md')\ntool.run('write', filePath=report_path, content=report_content)\n\noutputs['final_report'] = report_content\noutputs['report_path'] = report_path\noutputs['risk_level'] = 'High' if attack_success else 'Medium'\noutputs['action_required'] = '紧急响应' if attack_success else '持续监控'\noutputs['action_details'] = '建议立即阻断源 IP 通信,隔离受影响系统,并进行深入调查' if attack_success else '建议记录该告警,结合 Web/WAF/主机日志持续监控并复核'" + "description": "输出报告:汇总所有分析结果生成最终 markdown 报告,落盘到 ~/.flocks/workspace/outputs//artifacts/final_report.md", + "code": "\n# Final markdown that aggregates every prior step. 
Persists to disk.\nimport os\nimport json\nimport datetime\nfrom flocks.workspace.manager import WorkspaceManager\n\nparsed_alert = inputs.get('parsed_alert', {})\nlog_text = inputs.get('log_text', '')\nintel_results = inputs.get('intel_results', [])\nvuln_info = inputs.get('vuln_info', {})\nsurvey_result = inputs.get('survey_result', '')\ncve_related_result = inputs.get('cve_related_result', '')\ncve_info_result = inputs.get('cve_info_result', '')\npayload_analysis_result = inputs.get('payload_analysis_result', '')\nattack_analysis_result = inputs.get('attack_analysis_result', '')\nattack_verdict = inputs.get('attack_verdict', 'unknown')\nreport_title = inputs.get('report_title', '')\n\nverdict_cn_map = {\n 'attack_success': '攻击成功',\n 'attack_failed': '攻击失败',\n 'attack': '攻击',\n 'unknown': '未知',\n 'benign': '安全',\n}\nverdict_cn = verdict_cn_map.get(attack_verdict, attack_verdict)\n\n# Risk level driven by the verdict (not by individual evidence) for consistency with the label.\nrisk_level = {\n 'attack_success': 'High',\n 'attack_failed': 'Medium',\n 'attack': 'Medium',\n 'unknown': 'Medium',\n 'benign': 'Low',\n}.get(attack_verdict, 'Medium')\n\n# Use the LLM-generated title; fall back to a deterministic synthesis if the LLM returned empty.\nif not report_title:\n report_title = f'{parsed_alert.get(\"alert_type\", \"Web日志告警\")} - {verdict_cn}'\n\nintel_md = ''\nif intel_results:\n for intel in intel_results:\n intel_md += f\"\\n**{intel['source']} / {intel['type']}**: {intel['value']}\\n```\\n{intel['result']}\\n```\\n\"\nelse:\n intel_md = '\\n(未查询到外部威胁情报)\\n'\n\nvuln_md = (\n f\"\\n```json\\n{json.dumps(vuln_info, ensure_ascii=False, indent=2)}\\n```\\n\"\n if vuln_info else '\\n(未查询到漏洞详情)\\n'\n)\n\n# Title encodes verdict; report intentionally omits a wall-clock timestamp.\nreport = (\n f'# {report_title}\\n\\n'\n f'**攻击判定**: {verdict_cn} (`{attack_verdict}`)\\n'\n f'**风险等级**: {risk_level}\\n'\n f'**告警类型**: {parsed_alert.get(\"alert_type\", \"unknown\")}\\n'\n 
f'**源 / 目的**: {parsed_alert.get(\"src_ip\", \"N/A\")}:{parsed_alert.get(\"src_port\", \"N/A\")} → '\n f'{parsed_alert.get(\"dst_ip\", \"N/A\")}:{parsed_alert.get(\"dst_port\", \"N/A\")}\\n'\n f'**URL**: {parsed_alert.get(\"url\", \"N/A\")}\\n\\n'\n '---\\n\\n'\n '## 1. 日志类型分析\\n'\n 'Web日志(已通过 log_type_analysis 校验)\\n\\n'\n '## 2. 测绘信息\\n'\n f'{survey_result or \"(无)\"}\\n\\n'\n '## 3. 关联漏洞分析\\n'\n f'{cve_related_result or \"(无)\"}\\n\\n'\n '## 4. 漏洞详情\\n'\n f'{cve_info_result or \"(无)\"}\\n\\n'\n '## 5. 攻击 Payload 分析\\n'\n f'{payload_analysis_result or \"(无)\"}\\n\\n'\n '## 6. 攻击分析结果\\n'\n f'{attack_analysis_result or \"(无)\"}\\n\\n'\n '## 7. 威胁情报\\n'\n f'{intel_md}\\n'\n '## 8. 漏洞情报原始数据\\n'\n f'{vuln_md}\\n'\n '## 9. 原始日志\\n'\n f'```\\n{log_text}\\n```\\n'\n)\n\nws = WorkspaceManager.get_instance()\noutput_dir = str(ws.get_workspace_dir() / 'outputs' / datetime.date.today().isoformat())\nartifacts_dir = os.path.join(output_dir, 'artifacts')\nos.makedirs(artifacts_dir, exist_ok=True)\nreport_path = os.path.join(artifacts_dir, 'final_report.md')\ntool.run('write', filePath=report_path, content=report)\n\nprint(f'[generate_report] verdict={attack_verdict}, title={report_title}, path={report_path}')\n\noutputs['final_report'] = report\noutputs['report_path'] = report_path\noutputs['report_title'] = report_title\noutputs['attack_verdict'] = attack_verdict\noutputs['risk_level'] = risk_level\n" } ], "edges": [ { "from": "receive_alert", - "to": "query_threat_intel" + "to": "prepare_intel", + "order": 0 }, { - "from": "receive_alert", - "to": "query_vuln" + "from": "prepare_intel", + "to": "survey", + "order": 0 }, { - "from": "receive_alert", - "to": "analyze_payload" + "from": "prepare_intel", + "to": "cve_related", + "order": 1 }, { - "from": "receive_alert", - "to": "analyze_response" + "from": "prepare_intel", + "to": "cve_info", + "order": 2 + }, + { + "from": "prepare_intel", + "to": "payload_analysis", + "order": 3 + }, + { + "from": "survey", + "to": 
"attack_analysis_result", + "order": 0 + }, + { + "from": "cve_related", + "to": "attack_analysis_result", + "order": 1 }, { - "from": "query_threat_intel", - "to": "join_results" + "from": "cve_info", + "to": "attack_analysis_result", + "order": 2 }, { - "from": "query_vuln", - "to": "join_results" + "from": "payload_analysis", + "to": "attack_analysis_result", + "order": 3 }, { - "from": "analyze_payload", - "to": "join_results" + "from": "attack_analysis_result", + "to": "attack_verdict", + "order": 0 }, { - "from": "analyze_response", - "to": "join_results" + "from": "attack_verdict", + "to": "report_title", + "order": 0 }, { - "from": "join_results", - "to": "generate_report" + "from": "report_title", + "to": "generate_report", + "order": 0 + } + ], + "metadata": { + "node_timeout_s": 600, + "sampleInputs": { + "alert_data": { + "data": [ + { + "attacker": "1.2.3.4", + "victim": "10.0.0.1", + "external_port": 50000, + "machine_port": 80, + "url_host": "vuln.example.com", + "url_path": "/admin/login.php?id=1 OR 1=1", + "net": { + "src_port": 50000, + "dest_port": 80, + "app_proto": "http", + "http": { + "reqs_line": "GET /admin/login.php?id=1 OR 1=1 HTTP/1.1", + "reqs_header": "Host: vuln.example.com\nUser-Agent: sqlmap/1.6", + "req_body": "", + "resp_line": "HTTP/1.1 200 OK", + "resp_header": "Content-Type: text/html", + "resp_body": "You have an error in your SQL syntax", + "status": 200 + } + }, + "threat": { + "name": "SQL注入攻击", + "severity": "high", + "msg": "CVE-2017-12615 detected", + "result": "success" + } + } + ] + } } - ] -} + } +} \ No newline at end of file diff --git a/.flocks/plugins/workflows/tdp_alert_triage/workflow.md b/.flocks/plugins/workflows/tdp_alert_triage/workflow.md index 21f4368f8..9cd394ce4 100644 --- a/.flocks/plugins/workflows/tdp_alert_triage/workflow.md +++ b/.flocks/plugins/workflows/tdp_alert_triage/workflow.md @@ -1,124 +1,100 @@ -# NDR/TDP 告警调查工作流 +# TDP/NDR Web 日志研判工作流 ## 业务场景 -对 NDR/TDP 告警进行自动化研判分析。当前工作流重点兼容 TDP 
检索结果这类嵌套结构输入,例如顶层 `data` 数组、`net.http` 请求响应字段以及 `threat` 判定信息,并生成结构化分析报告。 - -## 流程步骤 - -### 1. 接收告警数据 -- **描述**: 接收并解析 NDR/TDP 告警或网络流量日志,提取关键字段(源IP、目的IP、端口、协议、HTTP 请求/响应、IOC 等) -- **工具/模型**: Tool-driven -- **输入**: `alert_data` - 告警 JSON 数据,支持扁平结构或 TDP 风格的 `{ "data": [ ... ] }` -- **输出**: `parsed_alert` - 解析后的告警数据字典 -- **处理逻辑**: - - 自动展开顶层 `data[0]`,兼容直接传入单条告警 - - 优先从 `threat`、`net.http`、`attacker/victim`、`external_ip/server_ip` 等 TDP 字段提取 src/dst、URL、请求与响应 - - 从 `threat.msg`、`threat.tag` 等字段中识别标准漏洞编号(如 `CVE-*`) - - 提取 IOC(IP、域名、URL),并保留原始告警用于后续分析 - -### 2. 威胁情报查询(并行) -- **描述**: 使用多源威胁情报查询告警中涉及的外部 IP、域名、URL 等指标 -- **工具/模型**: Tool-driven -- **输入**: `parsed_alert` - 解析后的告警数据 -- **输出**: `intel_results` - 威胁情报查询结果汇总 -- **处理逻辑**: - - 遍历告警中的 IOC(IP、域名、URL) - - 自动去重,并跳过内网/保留地址,避免对 `127.0.0.1`、RFC1918 地址做无意义情报查询 - - 使用 `threatbook_ip_query`、`threatbook_domain_query`、`threatbook_url_query` 查询 - - 使用 `virustotal_ip_query`、`virustotal_domain_query`、`virustotal_url_query` 做补充查询 - - 汇总所有情报结果 - -### 3. 漏洞信息查询(并行) -- **描述**: 仅在识别到标准漏洞编号时查询漏洞信息(CVE/CNVD/CNNVD/XVE) -- **工具/模型**: Tool-driven -- **输入**: `parsed_alert` - 可能包含漏洞ID -- **输出**: `vuln_info` - 漏洞详细信息 -- **处理逻辑**: - - 从 `threat.msg`、`threat.tag`、URL 等文本中提取漏洞ID(如 `CVE-2021-xxx`) - - 仅当存在标准漏洞编号时才调用 `__mcp_vuln_query` - - 获取漏洞描述、影响产品、修复方案、POC 等信息 - - 无漏洞ID时返回空结果 - -### 4. 攻击负载分析(并行) -- **描述**: 使用 LLM 分析 HTTP 请求负载,识别攻击/扫描手法与意图 -- **工具/模型**: LLM-driven -- **输入**: `parsed_alert` - 包含 payload -- **输出**: `payload_analysis` - 攻击负载分析结果 -- **处理逻辑**: - - 提取 HTTP 请求行、请求头、请求体 - - 使用 LLM 分析该流量更像攻击、扫描、误报还是正常请求 - - 识别具体攻击/扫描方式和意图 - - **必须落盘**: 将 LLM 分析结果写入 `~/.flocks/workspace/outputs//artifacts/payload_analysis_llm_output.md` - -### 5. 
响应包分析与攻击成功判定(并行) -- **描述**: 结合服务器响应包和 TDP 判定字段,判断攻击是否成功 -- **工具/模型**: LLM-driven -- **输入**: `parsed_alert` - 包含请求和响应 -- **输出**: `response_analysis` - 响应分析结果, `attack_success` - 攻击是否成功 -- **处理逻辑**: - - 提取请求包和响应包内容 - - 将 `HTTP status`、`threat.result`、`threat.failed_by` 一并作为判定信号 - - 优先让 LLM 结构化输出成功/失败结论,解析失败时再使用规则兜底 - - **必须落盘**: 将分析结果写入 `~/.flocks/workspace/outputs//artifacts/response_analysis_llm_output.md` - -### 6. 汇聚并行结果 -- **描述**: 使用 `join=true` 等待并行节点全部完成,再把结果归一化后传给报告节点 -- **工具/模型**: Tool-driven -- **输入**: `intel_results`、`vuln_info`、`payload_analysis`、`response_analysis`、`attack_success` -- **输出**: 归一化后的统一上下文 -- **处理逻辑**: - - 等待 4 个并行节点全部完成 - - 透传并规整报告节点所需字段 - - 避免多个并行分支直接汇聚到写文件节点,满足 workflow 引擎约束 - -### 7. 生成分析报告 -- **描述**: 综合以上分析结果,生成结构化分析报告 -- **工具/模型**: LLM-driven -- **输入**: 所有前序步骤的输出(intel_results, vuln_info, payload_analysis, response_analysis, attack_success) -- **输出**: `final_report` - 完整分析报告 -- **处理逻辑**: - - 汇总情报查询结果 - - 汇总漏洞信息 - - 汇总攻击负载分析和响应分析 - - 根据 `attack_success` 和 TDP 失败信号生成风险等级 - - 生成结构化报告,包含:摘要、IOC、情报、漏洞、分析、风险评估、建议 - - **必须落盘**: 将报告写入 `~/.flocks/workspace/outputs//artifacts/final_report.md` - -## 并行执行设计 - -步骤 2、3、4、5 为并行节点,同时执行以提升效率: -- query_threat_intel: 威胁情报查询 -- query_vuln: 漏洞信息查询 -- analyze_payload: 攻击负载分析 -- analyze_response: 响应包分析与攻击成功判定 - -所有并行节点执行完成后,先汇聚到 `join_results`,再进入 `generate_report` 生成最终报告。 - -## 报告结构 - -### 执行摘要 -- 告警概述 -- 主要发现 -- 风险等级 - -### 详细分析 -- 告警详情 -- 威胁情报结果 -- 漏洞信息(如有) -- 攻击负载分析 -- 响应分析 - -### 关键发现 -- IOC 列表 -- 攻击手法描述 -- 是否成功判定 - -### 风险评估 -- 风险等级 -- 影响范围 - -### 建议与行动项 -- 紧急处置建议 -- 长期加固建议 -- 需要关联分析的系统 +对 NDR/TDP 上送的 HTTP 日志告警进行标准化研判(默认输入已为 HTTP 日志,无需类型判别):收集情报 → 并行分析(测绘 / 漏洞 / 漏洞详情 / payload)→ 综合给出攻击判定与最终报告。其中**测绘 / 漏洞分析 / 漏洞详情 / 攻击 payload** 四步并行执行以缩短端到端时延。 + +## 流程结构 + +``` +receive_alert (告警解析) + │ + ▼ +prepare_intel (查询 IP 威胁情报 + CVE 漏洞情报) + │ + ▼ +┌────────────────── 并行 4 节点 ──────────────────┐ +│ survey (测绘) │ +│ cve_related (从日志提取 CVE 编号) │ +│ cve_info (展示 CVE 漏洞信息) │ +│ payload_analysis (攻击 payload 分析) │ 
+└──────────────────────────────────────────────────┘ + │ + ▼ +attack_analysis_result (攻击分析结果,join 节点) + │ + ▼ +attack_verdict (攻击判定:5 类标签) + │ + ▼ +report_title (报告标题) + │ + ▼ +generate_report (输出最终 Markdown 报告) +``` + +## 节点详情 + +### 1. `receive_alert` +解析 NDR/TDP 告警 JSON,兼容顶层 `data` 数组与扁平结构。提取 `src_ip/dst_ip/sport/dport/protocol`、HTTP 请求/响应、URL、IOC 列表,以及预扫的 CVE/CNVD/CNNVD/XVE 编号。生成统一的 `log_text` 文本块供下游所有 LLM prompt 使用。 + +### 2. `prepare_intel` +并行块的预处理: +- 对外网 IP 调用 `threatbook_ip_query`(自动跳过 RFC1918 / 回环 / 保留地址) +- 对域名/URL 调用 `threatbook_domain_query` / `threatbook_url_query` +- 若 `receive_alert` 提取到 CVE,调用 `__mcp_vuln_query` 获取详情 + +输出 `intel_content` 与 `vuln_content` 文本块,供 `survey` 与 `cve_info` 的 LLM prompt 直接使用。 + +### 3-6. 并行节点 + +| 节点 | 职责 | 关键约束 | +|------|------|---------| +| `survey` | 总结 IP 情报中的空间测绘信息(标签 + 服务 + 应用资产) | 多 IP 以无序列表显示,无测绘信息则不列出 | +| `cve_related` | 仅从日志文本提取漏洞编号 | 不做任何推测,无编号则输出"日志中无关联漏洞情报" | +| `cve_info` | 基于 vuln_content 输出 CVE 基本信息 | 不输出修复建议、不带额外解释说明 | +| `payload_analysis` | 分析日志中是否包含攻击载荷及判定依据 | 不做攻击意图分析、不做攻击影响分析 | + +4 个节点同时执行,各自独立产出结果。 + +### 7. `attack_analysis_result`(join) +`join: true` —— 等待 4 个并行节点全部完成。按"攻击成功 / 攻击失败 / 攻击 / 未知 / 安全"五分类标准进行长文本判定,输出"攻击状态 + 判定依据 + 详细分析"。同时把所有上游结果透传到下游。落盘到 `attack_analysis_result.md`。 + +### 8. `attack_verdict` +将上一节点的长文本归一化为 5 个标签之一:`attack_success` / `attack_failed` / `attack` / `unknown` / `benign`。LLM 输出做 token 容错。 + +### 9. `report_title` +基于攻击类型 + 判定结果生成 ≤30 字中文标题。对返回内容做引号/括号清理。LLM 失败时回退到 ` - ` 模板。 + +### 10. 
`generate_report` +汇总所有分析结果生成最终 Markdown 报告,9 个章节:执行摘要 → 日志类型 → 测绘 → 关联漏洞 → 漏洞详情 → payload → 攻击分析 → 威胁情报 → 原始日志。落盘到: +``` +~/.flocks/workspace/outputs//artifacts/final_report.md +``` +报告标题包含攻击类型/结果,正文不包含时间戳。 + +## 输入参数 + +```json +{ + "alert_data": "TDP 告警 JSON(list / dict / 嵌套 data 结构均支持)" +} +``` + +## 输出参数 + +| 字段 | 类型 | 说明 | +|------|------|------| +| `final_report` | string | 完整 Markdown 报告 | +| `report_path` | string | 报告文件路径 | +| `report_title` | string | 报告标题 | +| `attack_verdict` | enum | `attack_success` / `attack_failed` / `attack` / `unknown` / `benign` | +| `risk_level` | enum | `High` / `Medium` / `Low` | + +## 工程要点 + +- **并行扇出/扇入**:`prepare_intel` 是唯一 fan-out 起点;`attack_analysis_result` 用 `join: true` 作为唯一 fan-in 汇聚点,符合 flocks workflow 引擎的 lint 要求。 +- **LLM 推理块清洗**:所有 LLM 节点都会用 `_strip_think()` 去除 `...` 推理块,避免模型内部思考过程污染输出。 +- **LLM 容错**:所有调用 `llm.ask` 的节点都对返回结果做了正则提取与回退处理,单一 LLM 输出格式偏差不会让整个工作流失败。 +- **节点超时**:`metadata.node_timeout_s = 600`,留给最慢的 LLM 推理足够时间。 +- **报告落盘**:使用 `WorkspaceManager.get_workspace_dir()` 解析输出根目录,所有产物统一落到 flocks 工作区下的 `outputs//artifacts/`。 diff --git a/tests/integration/test_alert_dedup_triage_stream.py b/tests/integration/test_alert_dedup_triage_stream.py new file mode 100644 index 000000000..b87706f7e --- /dev/null +++ b/tests/integration/test_alert_dedup_triage_stream.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python3 +# NOTE: standalone manual integration test — not a pytest test, run directly with python3. 
+""" +手动集成测试工具:两阶段流式 pipeline(dedup → triage) + +逐条读取 ~/Downloads/tdp_logs.json,对每条告警: + 1) 调用 http_alert_dedup(POST /workflow-center/http_alert_dedup/invoke) + - 返回 unique_alerts 为空 -> 被过滤掉,跳过 triage + - 返回 unique_alerts[0].dedup_key_already_exists == True -> 跨批次重复,跳过 triage + - 否则 -> 视为"首次出现的可分析告警",转 step 2 + 2) 调用 tdp_alert_triage(POST /workflow-center//invoke) + - 把原始告警作为 alert_data 传入,触发 LLM 研判流水线(测绘/CVE/payload 并行) + +输出 JSONL 到 ~/.flocks/workspace/outputs//,每条记录包含: + {batch, alert_index, dedup: {...}, triage: {verdict, risk, title, report_path} | None, reason} +末尾追加一行 _summary 汇总。 + +用法: + python3 scripts/stream_pipeline_dedup_triage.py [--input FILE] [--limit N] [--delay SEC] + +如果只想跑一小批做端到端验证: + python3 scripts/stream_pipeline_dedup_triage.py --limit 3 --triage-limit 1 +""" + +import argparse +import json +import os +import sys +import time +import urllib.request +import urllib.error +from datetime import datetime +from pathlib import Path + +# ---------- API endpoints ---------- +# dedup: via main server proxy (records UI metrics) +DEDUP_URL = "http://127.0.0.1:8000/api/workflow-center/http_alert_dedup/invoke" +DEDUP_KEY = "Yw5WQxIL2bgDSL1RH0XO4yolu30GYrQ9bsfLHSmWVfk" + +# triage: call the published service directly (avoids 30 s proxy timeout) +TRIAGE_URL = "http://127.0.0.1:19001/invoke" +TRIAGE_KEY = "8e23f1ad036c4f73960925923d04e9a1edf8fcaf3d6b4461b5d2ced7e0956267" + +DEDUP_BASE_INPUTS = { + "source_log_type": "tdp", + "filter_enabled": True, + "dedup_enabled": True, + "threshold": 0.7, +} + + +def _post(url: str, api_key: str, payload: dict, timeout: int) -> tuple[dict, int]: + """POST JSON to a flocks /invoke endpoint; return (response_dict, elapsed_ms).""" + # Services use X-API-Key; main proxy uses Authorization: Bearer. 
+ if "127.0.0.1:8000" in url: + auth_header = {"Authorization": f"Bearer {api_key}"} + else: + auth_header = {"X-API-Key": api_key} + req = urllib.request.Request( + url, + data=json.dumps(payload).encode(), + headers={"Content-Type": "application/json", **auth_header}, + method="POST", + ) + t0 = time.time() + try: + with urllib.request.urlopen(req, timeout=timeout) as resp: + return json.loads(resp.read()), round((time.time() - t0) * 1000) + except urllib.error.HTTPError as e: + body = e.read().decode(errors="replace")[:500] + return {"status": "FAILED", "error": f"HTTP {e.code}: {body}"}, round((time.time() - t0) * 1000) + except Exception as e: + return {"status": "FAILED", "error": str(e)}, round((time.time() - t0) * 1000) + + +def call_dedup(alert: dict, timeout: int) -> tuple[dict, int]: + payload = {"inputs": {**DEDUP_BASE_INPUTS, "alerts": [alert]}} + return _post(DEDUP_URL, DEDUP_KEY, payload, timeout) + + +def call_triage(alert: dict, timeout: int) -> tuple[dict, int]: + """Call triage service directly on port 19001 — no proxy timeout.""" + payload = {"inputs": {"alert_data": alert}} + return _post(TRIAGE_URL, TRIAGE_KEY, payload, timeout) + + +def default_output_path() -> Path: + ts = datetime.now().strftime("%Y%m%d_%H%M%S") + out_dir = Path.home() / ".flocks" / "workspace" / "outputs" / datetime.now().strftime("%Y-%m-%d") + out_dir.mkdir(parents=True, exist_ok=True) + return out_dir / f"{ts}_pipeline_dedup_triage.jsonl" + + +def main() -> None: + p = argparse.ArgumentParser(description="Streaming pipeline: dedup -> triage") + p.add_argument("--input", default=str(Path.home() / "Downloads" / "tdp_logs.json"), + help="Input JSON file (top-level list)") + p.add_argument("--limit", type=int, default=0, + help="Process only the first N alerts (0 = all)") + p.add_argument("--triage-limit", type=int, default=0, + help="Stop after triggering N successful triage runs (0 = unlimited). 
" + "Useful to avoid burning LLM credits during smoke tests.") + p.add_argument("--delay", type=float, default=0.0, + help="Delay (seconds) between alerts") + p.add_argument("--dedup-timeout", type=int, default=60) + p.add_argument("--triage-timeout", type=int, default=600) + p.add_argument("--output", default=None, help="Output JSONL path") + args = p.parse_args() + + src = Path(args.input).expanduser() + if not src.exists(): + print(f"[ERROR] input not found: {src}", file=sys.stderr) + sys.exit(1) + + with open(src, "r", encoding="utf-8") as f: + records = json.load(f) + if isinstance(records, dict): + records = records.get("data", records.get("alerts", records.get("logs", []))) + if not isinstance(records, list): + print("[ERROR] expected top-level list", file=sys.stderr) + sys.exit(1) + + if args.limit > 0: + records = records[: args.limit] + + out_path = Path(args.output).expanduser() if args.output else default_output_path() + out_path.parent.mkdir(parents=True, exist_ok=True) + + total = len(records) + print(f"[stream] input: {src} count={total}") + print(f"[stream] output: {out_path}") + print(f"[stream] dedup: {DEDUP_URL}") + print(f"[stream] triage: {TRIAGE_URL}") + print("-" * 80) + + summary = { + "total_input": total, + "dedup_success": 0, + "dedup_failed": 0, + "filtered_out": 0, + "duplicate_skipped": 0, + "triage_invoked": 0, + "triage_success": 0, + "triage_failed": 0, + "verdict_counts": {}, + "started_at": datetime.now().isoformat(), + } + + with open(out_path, "w", encoding="utf-8") as f_out: + for i, alert in enumerate(records): + entry = {"alert_index": i, "alert_id": alert.get("id") or alert.get("uuid"), + "threat_name": (alert.get("threat") or {}).get("name", ""), + "src_ip": alert.get("attacker"), "dst_ip": alert.get("victim")} + + # ---------- step 1: dedup ---------- + dr, dms = call_dedup(alert, args.dedup_timeout) + ds = dr.get("status", "UNKNOWN") + if ds != "SUCCEEDED": + summary["dedup_failed"] += 1 + entry["dedup"] = {"status": ds, 
"elapsed_ms": dms, "error": dr.get("error", "")[:300]} + entry["reason"] = "dedup_failed" + entry["triage"] = None + f_out.write(json.dumps(entry, ensure_ascii=False) + "\n"); f_out.flush() + print(f" [{i+1:3d}/{total}] ✗ dedup FAILED ({dms}ms) {dr.get('error','')[:80]}") + continue + + summary["dedup_success"] += 1 + outs = dr.get("outputs", {}) + stats = outs.get("stats", {}) + ua = outs.get("unique_alerts", []) + entry["dedup"] = {"status": ds, "elapsed_ms": dms, + "filter_removed": stats.get("filter_removed_count", 0), + "after_filter": stats.get("after_filter_count", 0), + "unique_alerts": len(ua), + "lsh_clusters": stats.get("lsh_total_clusters"), + "lsh_dedup_keys": stats.get("lsh_total_dedup_keys")} + + if not ua: + summary["filtered_out"] += 1 + entry["reason"] = "filtered_out" + entry["triage"] = None + f_out.write(json.dumps(entry, ensure_ascii=False) + "\n"); f_out.flush() + print(f" [{i+1:3d}/{total}] - dedup OK ({dms:>4d}ms) filtered_out (kept=0)") + continue + + already = bool(ua[0].get("dedup_key_already_exists")) + entry["dedup"]["dedup_key"] = ua[0].get("dedup_key") + entry["dedup"]["dedup_key_already_exists"] = already + + if already: + summary["duplicate_skipped"] += 1 + entry["reason"] = "duplicate_skipped" + entry["triage"] = None + f_out.write(json.dumps(entry, ensure_ascii=False) + "\n"); f_out.flush() + print(f" [{i+1:3d}/{total}] - dedup OK ({dms:>4d}ms) duplicate (key={ua[0].get('dedup_key','')[:8]})") + continue + + # ---------- step 2: triage (only first-seen unique alerts) ---------- + if args.triage_limit > 0 and summary["triage_success"] >= args.triage_limit: + entry["reason"] = "triage_limit_reached" + entry["triage"] = None + f_out.write(json.dumps(entry, ensure_ascii=False) + "\n"); f_out.flush() + print(f" [{i+1:3d}/{total}] - dedup OK ({dms:>4d}ms) triage limit reached, skip") + continue + + summary["triage_invoked"] += 1 + tr, tms = call_triage(alert, args.triage_timeout) + ts_ = tr.get("status", "UNKNOWN") + if ts_ != 
"SUCCEEDED": + summary["triage_failed"] += 1 + entry["reason"] = "triage_failed" + entry["triage"] = {"status": ts_, "elapsed_ms": tms, "error": tr.get("error", "")[:300]} + f_out.write(json.dumps(entry, ensure_ascii=False) + "\n"); f_out.flush() + print(f" [{i+1:3d}/{total}] ✗ dedup OK + triage FAILED ({dms}+{tms}ms) {tr.get('error','')[:80]}") + continue + + summary["triage_success"] += 1 + tout = tr.get("outputs", {}) + verdict = tout.get("attack_verdict", "unknown") + summary["verdict_counts"][verdict] = summary["verdict_counts"].get(verdict, 0) + 1 + entry["reason"] = "triage_done" + entry["triage"] = {"status": ts_, "elapsed_ms": tms, + "attack_verdict": verdict, + "risk_level": tout.get("risk_level"), + "report_title": tout.get("report_title"), + "report_path": tout.get("report_path")} + f_out.write(json.dumps(entry, ensure_ascii=False) + "\n"); f_out.flush() + print(f" [{i+1:3d}/{total}] ✓ dedup OK + triage OK ({dms}+{tms}ms) " + f"verdict={verdict} title={(tout.get('report_title') or '')[:30]}") + + if args.delay > 0: + time.sleep(args.delay) + + summary["finished_at"] = datetime.now().isoformat() + f_out.write(json.dumps({"_summary": summary}, ensure_ascii=False) + "\n") + + print("-" * 80) + print(f"[done] dedup_success / failed : {summary['dedup_success']} / {summary['dedup_failed']}") + print(f"[done] filtered_out : {summary['filtered_out']}") + print(f"[done] duplicate_skipped : {summary['duplicate_skipped']}") + print(f"[done] triage_invoked : {summary['triage_invoked']}") + print(f"[done] triage_success / failed : {summary['triage_success']} / {summary['triage_failed']}") + print(f"[done] verdict_counts : {summary['verdict_counts']}") + print(f"[done] output : {out_path}") + + +if __name__ == "__main__": + main() diff --git a/tests/integration/test_http_alert_dedup_stream.py b/tests/integration/test_http_alert_dedup_stream.py new file mode 100755 index 000000000..2cb239888 --- /dev/null +++ b/tests/integration/test_http_alert_dedup_stream.py @@ -0,0 
+1,178 @@ +#!/usr/bin/env python3 +# NOTE: standalone manual integration test — not a pytest test, run directly with python3. +""" +手动集成测试工具:流式模拟脚本,逐条读取 tdp_logs.json,逐条 POST 到 http_alert_dedup 的 +/invoke 接口,汇总去重结果写入 output 文件。需要 flocks 服务运行,http_alert_dedup 工作流已发布。 + +用法: + python3 scripts/stream_tdp_invoke.py [--input FILE] [--batch-size N] [--delay SEC] [--output FILE] + +默认: + --input ~/Downloads/tdp_logs.json + --batch-size 1 每次发送的告警条数(1 = 严格逐条) + --delay 0.0 每批次之间的间隔秒数(模拟流速) + --output ~/.flocks/workspace/outputs/_tdp_invoke.jsonl +""" + +import argparse +import json +import os +import sys +import time +import urllib.request +import urllib.error +from datetime import datetime +from pathlib import Path + +API_URL = "http://127.0.0.1:8000/api/workflow-center/http_alert_dedup/invoke" +API_KEY = "Yw5WQxIL2bgDSL1RH0XO4yolu30GYrQ9bsfLHSmWVfk" +WORKFLOW_INPUTS_BASE = { + "source_log_type": "tdp", + "filter_enabled": True, + "dedup_enabled": True, + "threshold": 0.7, +} + + +def post_invoke(alerts: list) -> dict: + payload = json.dumps({"inputs": {**WORKFLOW_INPUTS_BASE, "alerts": alerts}}).encode() + req = urllib.request.Request( + API_URL, + data=payload, + headers={ + "Content-Type": "application/json", + "Authorization": f"Bearer {API_KEY}", + }, + method="POST", + ) + try: + with urllib.request.urlopen(req, timeout=60) as resp: + return json.loads(resp.read()) + except urllib.error.HTTPError as e: + body = e.read().decode(errors="replace") + return {"error": f"HTTP {e.code}: {body}", "status": "FAILED"} + except Exception as e: + return {"error": str(e), "status": "FAILED"} + + +def default_output_path() -> Path: + ts = datetime.now().strftime("%Y%m%d_%H%M%S") + out_dir = Path.home() / ".flocks" / "workspace" / "outputs" / datetime.now().strftime("%Y-%m-%d") + out_dir.mkdir(parents=True, exist_ok=True) + return out_dir / f"{ts}_tdp_invoke.jsonl" + + +def main(): + parser = argparse.ArgumentParser(description="流式模拟:逐条将 TDP 告警发送至 /invoke") + 
parser.add_argument("--input", default=str(Path.home() / "Downloads" / "tdp_logs.json"), + help="输入 JSON 文件路径(list 格式)") + parser.add_argument("--batch-size", type=int, default=1, + help="每次请求发送的告警条数,默认 1(逐条流式)") + parser.add_argument("--delay", type=float, default=0.0, + help="每批之间等待秒数,默认 0(无延迟)") + parser.add_argument("--output", default=None, + help="输出 JSONL 文件路径,默认写入 ~/.flocks/workspace/outputs/") + args = parser.parse_args() + + input_path = Path(args.input).expanduser() + if not input_path.exists(): + print(f"[ERROR] 文件不存在: {input_path}", file=sys.stderr) + sys.exit(1) + + with open(input_path, "r", encoding="utf-8") as f: + records = json.load(f) + + if isinstance(records, dict): + records = records.get("data", records.get("alerts", records.get("logs", []))) + if not isinstance(records, list): + print("[ERROR] 文件格式错误:期望顶层为 JSON 数组", file=sys.stderr) + sys.exit(1) + + total = len(records) + batch_size = max(1, args.batch_size) + output_path = Path(args.output).expanduser() if args.output else default_output_path() + output_path.parent.mkdir(parents=True, exist_ok=True) + + print(f"[stream] 输入: {input_path} 共 {total} 条") + print(f"[stream] batch_size={batch_size} delay={args.delay}s") + print(f"[stream] 输出: {output_path}") + print(f"[stream] API: {API_URL}") + print("-" * 60) + + summary = { + "total_input": total, + "total_batches": 0, + "total_unique": 0, + "total_deduped": 0, + "total_filtered_out": 0, + "failed_batches": 0, + "started_at": datetime.now().isoformat(), + } + + with open(output_path, "w", encoding="utf-8") as out_f: + batch_idx = 0 + for start in range(0, total, batch_size): + batch = records[start: start + batch_size] + batch_idx += 1 + t0 = time.time() + result = post_invoke(batch) + elapsed = round(time.time() - t0, 3) + + status = result.get("status", "UNKNOWN") + outputs = result.get("outputs", {}) + stats = outputs.get("stats", {}) + + log_entry = { + "batch": batch_idx, + "record_start": start, + "record_end": start + len(batch) - 1, 
+ "status": status, + "elapsed_ms": round(elapsed * 1000), + "unique_alerts": len(outputs.get("unique_alerts", [])), + "deduped_alerts": len(outputs.get("deduped_alerts", [])), + "stats": stats, + "error": result.get("error"), + "dedup_summary": outputs.get("dedup_summary", ""), + } + out_f.write(json.dumps(log_entry, ensure_ascii=False) + "\n") + out_f.flush() + + summary["total_batches"] += 1 + if status == "SUCCEEDED": + summary["total_unique"] += log_entry["unique_alerts"] + summary["total_deduped"] += log_entry["deduped_alerts"] + summary["total_filtered_out"] += stats.get("filter_removed_count", 0) + else: + summary["failed_batches"] += 1 + + progress = f"{start + len(batch)}/{total}" + indicator = "✓" if status == "SUCCEEDED" else "✗" + print( + f" {indicator} batch {batch_idx:4d} [{progress:>9s}] " + f"unique={log_entry['unique_alerts']:3d} " + f"deduped={log_entry['deduped_alerts']:3d} " + f"{elapsed*1000:.0f}ms" + + (f" ERR: {result.get('error','')[:60]}" if status != "SUCCEEDED" else "") + ) + + if args.delay > 0 and start + batch_size < total: + time.sleep(args.delay) + + summary["finished_at"] = datetime.now().isoformat() + + print("-" * 60) + print(f"[done] 批次总数: {summary['total_batches']}") + print(f"[done] 失败批次: {summary['failed_batches']}") + print(f"[done] 累计输入: {summary['total_input']}") + print(f"[done] 累计过滤掉: {summary['total_filtered_out']}") + print(f"[done] 累计去重后: {summary['total_unique']}") + print(f"[done] 累计去重前: {summary['total_deduped']}") + print(f"[done] 输出文件: {output_path}") + + # 末尾追加一行汇总 + with open(output_path, "a", encoding="utf-8") as out_f: + out_f.write(json.dumps({"_summary": summary}, ensure_ascii=False) + "\n") + + +if __name__ == "__main__": + main() From 5b86c29f3ef2ba90dca8b6a5d96ee4b48608ed29 Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Sat, 9 May 2026 21:29:51 +0800 Subject: [PATCH 20/41] feat(workflow): add syslog ingestion trigger and integration tab - Add flocks/syslog package (constants, parser, listener, 
manager) to receive UDP/TCP syslog messages and trigger workflow execution via the Channel-GatewayManager pattern - Expose POST/GET /workflow/{id}/syslog-config API endpoints; start/stop all enabled listeners in the FastAPI lifespan - Add SyslogConfig TS interface and saveSyslogConfig/getSyslogConfig API methods to the frontend workflow client - Extract PublishSection, KafkaSection, SyslogSection from RunTab into a new IntegrationTab component; wire it as a fourth "Integration" tab in RightPanel so the Run tab only shows test-run and execution history - Add tabIntegration i18n keys (zh-CN / en-US) and syslogActive badge shown when listener is enabled and collapsed Co-authored-by: Cursor --- flocks/server/app.py | 18 + flocks/server/routes/workflow.py | 73 +++ flocks/syslog/__init__.py | 6 + flocks/syslog/constants.py | 3 + flocks/syslog/listener.py | 118 +++++ flocks/syslog/manager.py | 153 ++++++ flocks/syslog/parser.py | 196 ++++++++ webui/src/api/workflow.ts | 25 + webui/src/locales/en-US/workflow.json | 11 + webui/src/locales/zh-CN/workflow.json | 11 + webui/src/pages/WorkflowDetail/RightPanel.tsx | 15 +- .../WorkflowDetail/tabs/IntegrationTab.tsx | 460 ++++++++++++++++++ .../pages/WorkflowDetail/tabs/RunTab.test.tsx | 12 + .../src/pages/WorkflowDetail/tabs/RunTab.tsx | 253 +--------- 14 files changed, 1099 insertions(+), 255 deletions(-) create mode 100644 flocks/syslog/__init__.py create mode 100644 flocks/syslog/constants.py create mode 100644 flocks/syslog/listener.py create mode 100644 flocks/syslog/manager.py create mode 100644 flocks/syslog/parser.py create mode 100644 webui/src/pages/WorkflowDetail/tabs/IntegrationTab.tsx diff --git a/flocks/server/app.py b/flocks/server/app.py index a3edd3610..e9722106e 100644 --- a/flocks/server/app.py +++ b/flocks/server/app.py @@ -229,6 +229,15 @@ async def lifespan(app: FastAPI): except Exception as e: log.warning("channel.gateway.start_failed", {"error": str(e)}) + # Start syslog listeners for workflows with 
syslog enabled + try: + from flocks.syslog.manager import default_manager as default_syslog_manager + + await default_syslog_manager.start_all() + log.info("syslog.manager.started") + except Exception as e: + log.warning("syslog.manager.start_failed", {"error": str(e)}) + try: from flocks.updater.updater import recover_upgrade_state @@ -274,6 +283,15 @@ async def lifespan(app: FastAPI): except Exception as e: log.warning("channel.gateway.stop_failed", {"error": str(e)}) + # Stop syslog listeners + try: + from flocks.syslog.manager import default_manager as default_syslog_manager + + await default_syslog_manager.stop_all() + log.info("syslog.manager.stopped") + except Exception as e: + log.warning("syslog.manager.stop_failed", {"error": str(e)}) + # Stop Task Center try: from flocks.task.manager import TaskManager diff --git a/flocks/server/routes/workflow.py b/flocks/server/routes/workflow.py index ed6b8e092..822e106cd 100644 --- a/flocks/server/routes/workflow.py +++ b/flocks/server/routes/workflow.py @@ -44,6 +44,7 @@ read_workflow_from_fs as shared_read_workflow_from_fs, workflow_scan_dirs as _all_scan_dirs, ) +from flocks.syslog.constants import WORKFLOW_SYSLOG_CONFIG_PREFIX from flocks.workflow.execution_store import ( create_execution_record, normalize_execution_status as _normalize_execution_status, @@ -447,6 +448,10 @@ def _workflow_stats_key(workflow_id: str) -> str: return f"workflow/{workflow_id}/stats" +def _syslog_config_key(workflow_id: str) -> str: + return f"{WORKFLOW_SYSLOG_CONFIG_PREFIX}{workflow_id}" + + async def _run_workflow_execution_task( *, workflow_id: str, @@ -811,6 +816,17 @@ async def delete_workflow(workflow_id: str): except Exception: pass + try: + from flocks.syslog.manager import default_manager as _syslog_default_manager + + await _syslog_default_manager.stop_workflow(workflow_id) + except Exception: + pass + try: + await Storage.remove(_syslog_config_key(workflow_id)) + except Storage.NotFoundError: + pass + 
log.info("workflow.deleted", {"id": workflow_id}) await publish_event("workflow.deleted", {"id": workflow_id}) return None @@ -1385,6 +1401,19 @@ class KafkaConfigRequest(BaseModel): outputTopic: Optional[str] = None +class SyslogConfigRequest(BaseModel): + """Per-workflow syslog listener configuration (experimental).""" + + model_config = ConfigDict(populate_by_name=True) + + enabled: bool = False + protocol: str = "udp" + host: str = "0.0.0.0" + port: int = 5140 + msg_format: str = Field("auto", alias="format") + input_key: str = Field("syslog_message", alias="inputKey") + + @router.post("/workflow/{workflow_id}/publish") async def publish_workflow_as_api(workflow_id: str): """ @@ -1559,6 +1588,50 @@ async def get_kafka_config(workflow_id: str): raise HTTPException(status_code=500, detail=f"Failed to get Kafka config: {str(e)}") +@router.post("/workflow/{workflow_id}/syslog-config") +async def save_syslog_config(workflow_id: str, req: SyslogConfigRequest): + """ + Save syslog listener configuration for a workflow. + When enabled, starts UDP/TCP listener and passes parsed messages to workflow inputs. 
+ """ + try: + if not _read_workflow_from_fs(workflow_id): + raise HTTPException(status_code=404, detail=f"Workflow not found: {workflow_id}") + + config = { + "workflowId": workflow_id, + "enabled": req.enabled, + "protocol": req.protocol, + "host": req.host, + "port": req.port, + "format": req.msg_format, + "inputKey": req.input_key, + "updatedAt": int(time.time() * 1000), + } + await Storage.write(_syslog_config_key(workflow_id), config) + + from flocks.syslog.manager import default_manager as _syslog_default_manager + + await _syslog_default_manager.restart_workflow(workflow_id) + return {"ok": True} + except HTTPException: + raise + except Exception as e: + log.error("workflow.syslog_config.save.error", {"id": workflow_id, "error": str(e)}) + raise HTTPException(status_code=500, detail=f"Failed to save syslog config: {str(e)}") + + +@router.get("/workflow/{workflow_id}/syslog-config") +async def get_syslog_config(workflow_id: str): + """Get saved syslog configuration for a workflow.""" + try: + config = await Storage.read(_syslog_config_key(workflow_id)) + return config + except Exception as e: + log.error("workflow.syslog_config.get.error", {"id": workflow_id, "error": str(e)}) + raise HTTPException(status_code=500, detail=f"Failed to get syslog config: {str(e)}") + + # ============================================================================= # API Endpoints - Run Single Node # ============================================================================= diff --git a/flocks/syslog/__init__.py b/flocks/syslog/__init__.py new file mode 100644 index 000000000..c3b804c64 --- /dev/null +++ b/flocks/syslog/__init__.py @@ -0,0 +1,6 @@ +"""Syslog ingestion for workflow triggers (UDP/TCP listeners).""" + +from flocks.syslog.constants import WORKFLOW_SYSLOG_CONFIG_PREFIX +from flocks.syslog.manager import SyslogManager, default_manager + +__all__ = ["SyslogManager", "default_manager", "WORKFLOW_SYSLOG_CONFIG_PREFIX"] diff --git a/flocks/syslog/constants.py 
b/flocks/syslog/constants.py new file mode 100644 index 000000000..717b8d693 --- /dev/null +++ b/flocks/syslog/constants.py @@ -0,0 +1,3 @@ +"""Storage key prefix for per-workflow syslog config (must match server routes).""" + +WORKFLOW_SYSLOG_CONFIG_PREFIX = "workflow_syslog_config/" diff --git a/flocks/syslog/listener.py b/flocks/syslog/listener.py new file mode 100644 index 000000000..7e5af3074 --- /dev/null +++ b/flocks/syslog/listener.py @@ -0,0 +1,118 @@ +"""Asyncio UDP/TCP syslog listeners.""" + +from __future__ import annotations + +import asyncio +from typing import Awaitable, Callable, Union + +from flocks.syslog.parser import parse_syslog + +OnSyslogMessage = Callable[[dict], Union[None, Awaitable[None]]] + + +class SyslogUDPProtocol(asyncio.DatagramProtocol): + """Receive syslog datagrams and invoke async callback with parsed dict.""" + + def __init__( + self, + on_message: OnSyslogMessage, + format_hint: str, + ) -> None: + self._on_message = on_message + self._format_hint = format_hint + + def datagram_received(self, data: bytes, _addr) -> None: # noqa: ANN001 + text = data.decode("utf-8", errors="replace") + parsed = parse_syslog(text, self._format_hint) + try: + loop = asyncio.get_running_loop() + except RuntimeError: + return + loop.create_task(self._safe_dispatch(parsed)) + + async def _safe_dispatch(self, parsed: dict) -> None: + try: + res = self._on_message(parsed) + if asyncio.iscoroutine(res): + await res + except Exception: + # Logged by caller / manager + pass + + +async def run_udp_syslog_server( + host: str, + port: int, + format_hint: str, + on_message: OnSyslogMessage, + *, + abort_event: asyncio.Event, +) -> None: + loop = asyncio.get_running_loop() + transport, _protocol = await loop.create_datagram_endpoint( + lambda: SyslogUDPProtocol(on_message, format_hint), + local_addr=(host, port), + ) + try: + await abort_event.wait() + finally: + transport.close() + + +async def _handle_tcp_client( + reader: asyncio.StreamReader, + writer: 
asyncio.StreamWriter, + format_hint: str, + on_message: OnSyslogMessage, +) -> None: + try: + while True: + line = await reader.readline() + if not line: + break + text = line.decode("utf-8", errors="replace").strip() + if not text: + continue + parsed = parse_syslog(text, format_hint) + try: + res = on_message(parsed) + if asyncio.iscoroutine(res): + await res + except Exception: + pass + finally: + try: + writer.close() + await writer.wait_closed() + except Exception: + pass + + +async def run_tcp_syslog_server( + host: str, + port: int, + format_hint: str, + on_message: OnSyslogMessage, + *, + abort_event: asyncio.Event, +) -> None: + async def handle_client( + reader: asyncio.StreamReader, + writer: asyncio.StreamWriter, + ) -> None: + await _handle_tcp_client(reader, writer, format_hint, on_message) + + server = await asyncio.start_server(handle_client, host, port) + serve_task: asyncio.Task[None] | None = None + try: + serve_task = asyncio.create_task(server.serve_forever()) + await abort_event.wait() + finally: + if serve_task and not serve_task.done(): + serve_task.cancel() + try: + await serve_task + except asyncio.CancelledError: + pass + server.close() + await server.wait_closed() diff --git a/flocks/syslog/manager.py b/flocks/syslog/manager.py new file mode 100644 index 000000000..925d43eb8 --- /dev/null +++ b/flocks/syslog/manager.py @@ -0,0 +1,153 @@ +"""Lifecycle manager for syslog listeners → workflow runs.""" + +from __future__ import annotations + +import asyncio +from typing import Any, Dict, Optional + +from flocks.storage.storage import Storage +from flocks.utils.log import Log +from flocks.workflow.fs_store import read_workflow_from_fs +from flocks.workflow.runner import run_workflow + +from flocks.syslog.constants import WORKFLOW_SYSLOG_CONFIG_PREFIX +from flocks.syslog.listener import run_tcp_syslog_server, run_udp_syslog_server + +log = Log.create(service="syslog.manager") + + +class SyslogManager: + """One async listener task per workflow 
id (when enabled).""" + + def __init__(self) -> None: + self._tasks: dict[str, asyncio.Task] = {} + self._abort_events: dict[str, asyncio.Event] = {} + + @staticmethod + def _config_key(workflow_id: str) -> str: + return f"{WORKFLOW_SYSLOG_CONFIG_PREFIX}{workflow_id}" + + async def start_all(self) -> None: + try: + keys = await Storage.list_keys(WORKFLOW_SYSLOG_CONFIG_PREFIX) + except Exception as exc: + log.warning("syslog.list_keys_failed", {"error": str(exc)}) + return + + for key in keys: + if not key.startswith(WORKFLOW_SYSLOG_CONFIG_PREFIX): + continue + workflow_id = key[len(WORKFLOW_SYSLOG_CONFIG_PREFIX) :] + if not workflow_id: + continue + try: + data = await Storage.read(key) + except Exception as exc: + log.warning("syslog.config_read_failed", {"key": key, "error": str(exc)}) + continue + if isinstance(data, dict) and data.get("enabled"): + await self.restart_workflow(workflow_id) + + async def stop_all(self) -> None: + for workflow_id in list(self._tasks.keys()): + await self.stop_workflow(workflow_id) + + async def stop_workflow(self, workflow_id: str) -> None: + ev = self._abort_events.pop(workflow_id, None) + if ev is not None: + ev.set() + task = self._tasks.pop(workflow_id, None) + if task is not None and not task.done(): + task.cancel() + try: + await task + except asyncio.CancelledError: + pass + + async def restart_workflow(self, workflow_id: str) -> None: + await self.stop_workflow(workflow_id) + key = self._config_key(workflow_id) + try: + data = await Storage.read(key) + except Exception as exc: + log.warning("syslog.restart_read_failed", {"workflow_id": workflow_id, "error": str(exc)}) + return + if not isinstance(data, dict) or not data.get("enabled"): + return + + abort = asyncio.Event() + self._abort_events[workflow_id] = abort + task = asyncio.create_task( + self._listener_loop(workflow_id, data, abort), + name=f"syslog-{workflow_id}", + ) + self._tasks[workflow_id] = task + log.info("syslog.listener_scheduled", {"workflow_id": 
workflow_id}) + + async def _listener_loop( + self, + workflow_id: str, + config: Dict[str, Any], + abort: asyncio.Event, + ) -> None: + host = str(config.get("host") or "0.0.0.0") + port = int(config.get("port") or 5140) + protocol = str(config.get("protocol") or "udp").lower() + format_hint = str(config.get("format") or "auto") + input_key = str(config.get("inputKey") or "syslog_message") + + async def on_msg(parsed: dict) -> None: + await self._trigger_workflow(workflow_id, parsed, input_key) + + try: + if protocol == "tcp": + await run_tcp_syslog_server( + host, + port, + format_hint, + on_msg, + abort_event=abort, + ) + else: + await run_udp_syslog_server( + host, + port, + format_hint, + on_msg, + abort_event=abort, + ) + except asyncio.CancelledError: + raise + except OSError as exc: + log.error( + "syslog.bind_failed", + {"workflow_id": workflow_id, "error": str(exc), "host": host, "port": port, "protocol": protocol}, + ) + except Exception as exc: + log.error("syslog.listener_error", {"workflow_id": workflow_id, "error": str(exc)}) + + async def _trigger_workflow(self, workflow_id: str, syslog_msg: dict, input_key: str) -> None: + data = read_workflow_from_fs(workflow_id) + if not data: + log.warning("syslog.workflow_not_found", {"workflow_id": workflow_id}) + return + workflow_json = data.get("workflowJson") + if not workflow_json: + log.warning("syslog.workflow_json_missing", {"workflow_id": workflow_id}) + return + inputs = {input_key: syslog_msg} + try: + await asyncio.to_thread( + run_workflow, + workflow=workflow_json, + inputs=inputs, + trace=False, + ) + except Exception as exc: + log.error( + "syslog.workflow_run_failed", + {"workflow_id": workflow_id, "error": str(exc)}, + ) + + +default_manager = SyslogManager() diff --git a/flocks/syslog/parser.py b/flocks/syslog/parser.py new file mode 100644 index 000000000..7a8377251 --- /dev/null +++ b/flocks/syslog/parser.py @@ -0,0 +1,196 @@ +"""Parse syslog lines (RFC 5424 and BSD / RFC 3164 style) 
without external deps.""" + +from __future__ import annotations + +import re +from datetime import datetime +from typing import Any, Dict, Optional + +_PRI_RE = re.compile(r"^<(\d{1,3})>") +# After stripping PRI: MMM DD hh:mm:ss hostname tag: msg +_RFC3164_REST_RE = re.compile( + r"^([A-Za-z]{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2})\s+(\S+)\s*(.*)$", + re.DOTALL, +) + + +def _pri_parts(pri: int) -> tuple[int, int]: + facility = pri >> 3 + severity = pri & 7 + return facility, severity + + +def _normalize_ts(ts: Optional[str]) -> str: + if not ts: + return "" + ts = ts.strip() + # RFC5424 full-date + if "T" in ts: + try: + # Zulu + if ts.endswith("Z"): + return datetime.fromisoformat(ts.replace("Z", "+00:00")).isoformat() + return datetime.fromisoformat(ts).isoformat() + except ValueError: + return ts + # RFC3164: Oct 11 22:14:15 (no year — use current year best-effort) + try: + now = datetime.now() + dt = datetime.strptime(f"{now.year} {ts}", "%Y %b %d %H:%M:%S") + return dt.isoformat() + except ValueError: + return ts + + +def parse_syslog(raw: str, format_hint: str = "auto") -> Dict[str, Any]: + """ + Parse one syslog payload into a dict suitable for workflow inputs. 
+ + format_hint: "auto" | "rfc3164" | "rfc5424" + """ + text = raw.decode("utf-8", errors="replace") if isinstance(raw, (bytes, bytearray)) else raw + text = text.strip() + if not text: + return { + "raw": text, + "facility": 0, + "severity": 0, + "timestamp": "", + "hostname": "", + "app_name": "", + "message": "", + "format": "empty", + } + + m_pri = _PRI_RE.match(text) + if not m_pri: + return { + "raw": text, + "facility": 0, + "severity": 0, + "timestamp": "", + "hostname": "", + "app_name": "", + "message": text, + "format": "unparsed", + } + + pri = int(m_pri.group(1)) + facility, severity = _pri_parts(pri) + rest = text[m_pri.end() :] + + if format_hint == "rfc3164": + return _parse_rfc3164(rest, raw=text, facility=facility, severity=severity) + if format_hint == "rfc5424": + return _parse_rfc5424(rest, raw=text, facility=facility, severity=severity) + + # auto: RFC5424 if second token is a digit version + if rest and rest[0].isdigit(): + first_space = rest.find(" ") + if first_space > 0 and rest[:first_space].isdigit(): + return _parse_rfc5424(rest, raw=text, facility=facility, severity=severity) + + return _parse_rfc3164(rest, raw=text, facility=facility, severity=severity) + + +def _next_rfc5424_token(s: str) -> tuple[str, str]: + """Pop one syslog field from *s*; structured data may start with '['.""" + s = s.lstrip() + if not s: + return "", "" + if s[0] == "[": + depth = 0 + for j, c in enumerate(s): + if c == "[": + depth += 1 + elif c == "]": + depth -= 1 + if depth == 0: + return s[: j + 1], s[j + 1 :].lstrip() + return s, "" + sp = s.find(" ") + if sp == -1: + return s, "" + return s[:sp], s[sp + 1 :].lstrip() + + +def _parse_rfc5424( + rest: str, + *, + raw: str, + facility: int, + severity: int, +) -> Dict[str, Any]: + s = rest.lstrip() + if not s: + return _parse_rfc3164(rest, raw=raw, facility=facility, severity=severity) + + i = 0 + while i < len(s) and s[i].isdigit(): + i += 1 + version = s[:i].strip() + s = s[i:].lstrip() + if not 
version.isdigit(): + return _parse_rfc3164(rest, raw=raw, facility=facility, severity=severity) + + ts, s = _next_rfc5424_token(s) + hostname, s = _next_rfc5424_token(s) + app_name, s = _next_rfc5424_token(s) + _procid, s = _next_rfc5424_token(s) + _msgid, s = _next_rfc5424_token(s) + _sdata, s = _next_rfc5424_token(s) + msg = s.strip() + + return { + "raw": raw, + "facility": facility, + "severity": severity, + "timestamp": _normalize_ts(ts), + "hostname": hostname if hostname != "-" else "", + "app_name": app_name if app_name != "-" else "", + "message": msg, + "format": "rfc5424", + } + + +def _parse_rfc3164( + rest: str, + *, + raw: str, + facility: int, + severity: int, +) -> Dict[str, Any]: + m = _RFC3164_REST_RE.match(rest.strip()) + if m: + ts = m.group(1) + hostname = m.group(2) + remainder = (m.group(3) or "").strip() + app_name = "" + message = remainder + # TAG: message (tag is alphanumeric, often "sshd" or "su") + if remainder and ":" in remainder: + tag, _, body = remainder.partition(":") + if tag and " " not in tag and tag.isprintable(): + app_name = tag.strip() + message = body.strip() + return { + "raw": raw, + "facility": facility, + "severity": severity, + "timestamp": _normalize_ts(ts), + "hostname": hostname, + "app_name": app_name, + "message": message, + "format": "rfc3164", + } + + return { + "raw": raw, + "facility": facility, + "severity": severity, + "timestamp": "", + "hostname": "", + "app_name": "", + "message": rest.strip(), + "format": "rfc3164", + } diff --git a/webui/src/api/workflow.ts b/webui/src/api/workflow.ts index 8899c2aa0..5a770a704 100644 --- a/webui/src/api/workflow.ts +++ b/webui/src/api/workflow.ts @@ -147,6 +147,18 @@ export interface WorkflowService { containerName?: string; } +/** Saved syslog listener config (per workflow). 
*/ +export interface SyslogConfig { + workflowId?: string; + enabled?: boolean; + protocol?: string; + host?: string; + port?: number; + format?: string; + inputKey?: string; + updatedAt?: number; +} + export const workflowAPI = { list: (params?: { category?: string; status?: string; excludeId?: string }) => client.get('/api/workflow', { params }), @@ -239,6 +251,19 @@ export const workflowAPI = { outputTopic?: string; } | null>(`/api/workflow/${id}/kafka-config`), + saveSyslogConfig: (id: string, config: { + enabled?: boolean; + protocol?: string; + host?: string; + port?: number; + format?: string; + inputKey?: string; + }) => + client.post<{ ok: boolean }>(`/api/workflow/${id}/syslog-config`, config), + + getSyslogConfig: (id: string) => + client.get(`/api/workflow/${id}/syslog-config`), + runNode: (id: string, data: { nodeId: string; inputs?: Record }) => client.post(`/api/workflow/${id}/run-node`, { node_id: data.nodeId, inputs: data.inputs ?? {} }), diff --git a/webui/src/locales/en-US/workflow.json b/webui/src/locales/en-US/workflow.json index 6bffeb0c8..303565d56 100644 --- a/webui/src/locales/en-US/workflow.json +++ b/webui/src/locales/en-US/workflow.json @@ -58,6 +58,7 @@ "tabOverview": "Overview", "tabChat": "AI Edit", "tabRun": "Run", + "tabIntegration": "Integration", "renderError": "Component render error", "deleteWorkflow": "Delete Workflow", "deleteConfirmTitle": "Delete Workflow", @@ -204,6 +205,16 @@ "savedConfig": "Saved", "saveConfig": "Save Configuration", "kafkaHint": "Kafka integration is under development, configuration will take effect in a future version", + "syslogSection": "Syslog input", + "syslogExperimental": "(Experimental)", + "syslogEnabled": "Enable listener", + "syslogProtocol": "Protocol", + "syslogHost": "Bind address", + "syslogPort": "Port", + "syslogFormat": "Parse format", + "syslogInputKey": "Inputs key", + "syslogHint": "When enabled, Flocks listens for syslog on the given address/port and passes parsed payloads to 
workflow inputs (default key: syslog_message).", + "syslogActive": "Listening", "historySection": "Execution History", "noHistory": "No execution records", "noOutput": "No output data", diff --git a/webui/src/locales/zh-CN/workflow.json b/webui/src/locales/zh-CN/workflow.json index 74e852323..2524d9925 100644 --- a/webui/src/locales/zh-CN/workflow.json +++ b/webui/src/locales/zh-CN/workflow.json @@ -58,6 +58,7 @@ "tabOverview": "概览", "tabChat": "AI 编辑", "tabRun": "运行", + "tabIntegration": "集成", "renderError": "组件渲染出错", "deleteWorkflow": "删除工作流", "deleteConfirmTitle": "删除工作流", @@ -204,6 +205,16 @@ "savedConfig": "已保存", "saveConfig": "保存配置", "kafkaHint": "Kafka 集成功能开发中,配置将在后续版本生效", + "syslogSection": "Syslog 输入", + "syslogExperimental": "(实验性)", + "syslogEnabled": "启用监听", + "syslogProtocol": "协议", + "syslogHost": "监听地址", + "syslogPort": "端口", + "syslogFormat": "解析格式", + "syslogInputKey": "Inputs 键名", + "syslogHint": "开启后 Flocks 在指定地址/端口接收 syslog,解析结果写入工作流 inputs(默认键名 syslog_message)。", + "syslogActive": "监听中", "historySection": "执行历史", "noHistory": "暂无执行记录", "noOutput": "无输出数据", diff --git a/webui/src/pages/WorkflowDetail/RightPanel.tsx b/webui/src/pages/WorkflowDetail/RightPanel.tsx index 8c0db61ab..dd6b371e9 100644 --- a/webui/src/pages/WorkflowDetail/RightPanel.tsx +++ b/webui/src/pages/WorkflowDetail/RightPanel.tsx @@ -6,6 +6,7 @@ import { useConfirm } from '@/components/common/ConfirmDialog'; import OverviewTab from './tabs/OverviewTab'; import ChatTab from './tabs/ChatTab'; import RunTab from './tabs/RunTab'; +import IntegrationTab from './tabs/IntegrationTab'; // ───────────────────────────────────────────── // Error boundary helpers @@ -59,7 +60,7 @@ class TabErrorBoundary extends Component< // RightPanel // ───────────────────────────────────────────── -type TabId = 'chat' | 'overview' | 'run'; +type TabId = 'chat' | 'overview' | 'run' | 'integration'; interface RightPanelProps { workflow: Workflow; @@ -107,9 +108,10 @@ export default function RightPanel({ 
}; const TABS: { id: TabId; label: string }[] = [ - { id: 'overview', label: t('detail.rightPanel.tabOverview') }, - { id: 'chat', label: t('detail.rightPanel.tabChat') }, - { id: 'run', label: t('detail.rightPanel.tabRun') }, + { id: 'overview', label: t('detail.rightPanel.tabOverview') }, + { id: 'chat', label: t('detail.rightPanel.tabChat') }, + { id: 'run', label: t('detail.rightPanel.tabRun') }, + { id: 'integration', label: t('detail.rightPanel.tabIntegration') }, ]; return ( @@ -158,6 +160,11 @@ export default function RightPanel({ /> )} + {activeTab === 'integration' && ( + + + + )} {/* 底部删除按钮 */} diff --git a/webui/src/pages/WorkflowDetail/tabs/IntegrationTab.tsx b/webui/src/pages/WorkflowDetail/tabs/IntegrationTab.tsx new file mode 100644 index 000000000..7290db1c0 --- /dev/null +++ b/webui/src/pages/WorkflowDetail/tabs/IntegrationTab.tsx @@ -0,0 +1,460 @@ +import { useState, useEffect, useCallback } from 'react'; +import { + Loader2, Globe, StopCircle, Check, ChevronDown, ChevronRight, + AlertCircle, Wifi, Server, +} from 'lucide-react'; +import { useTranslation } from 'react-i18next'; +import { + workflowAPI, + Workflow, + WorkflowService, +} from '@/api/workflow'; +import CopyButton from '@/components/common/CopyButton'; +import WorkflowStatusBadge from '@/components/common/WorkflowStatusBadge'; +import { extractErrorMessage } from '@/utils/error'; + +export interface IntegrationTabProps { + workflow: Workflow; +} + +// ───────────────────────────────────────────── +// 共享 SectionHeader +// ───────────────────────────────────────────── +function SectionHeader({ + title, + expanded, + onToggle, + badge, +}: { + title: string; + expanded: boolean; + onToggle: () => void; + badge?: React.ReactNode; +}) { + return ( + + ); +} + +// ───────────────────────────────────────────── +// 发布为 API +// ───────────────────────────────────────────── +function PublishSection({ workflowId }: { workflowId: string }) { + const { t } = useTranslation('workflow'); + const 
[expanded, setExpanded] = useState(true); + const [service, setService] = useState(null); + const [loadingService, setLoadingService] = useState(true); + const [publishing, setPublishing] = useState(false); + const [stopping, setStopping] = useState(false); + const [error, setError] = useState(''); + const [apiKeyVisible, setApiKeyVisible] = useState(false); + + const fetchService = useCallback(async () => { + try { + const res = await workflowAPI.getService(workflowId); + setService(res.data); + } catch { + setService(null); + } finally { + setLoadingService(false); + } + }, [workflowId]); + + useEffect(() => { + fetchService(); + }, [fetchService]); + + const handlePublish = async () => { + setError(''); + setPublishing(true); + try { + const res = await workflowAPI.publish(workflowId); + setService(res.data); + } catch (err: unknown) { + setError(extractErrorMessage(err, t('detail.run.publishFailed'))); + } finally { + setPublishing(false); + } + }; + + const handleUnpublish = async () => { + setError(''); + setStopping(true); + try { + await workflowAPI.unpublish(workflowId); + await fetchService(); + } catch (err: unknown) { + setError(extractErrorMessage(err, t('detail.run.stopFailed'))); + } finally { + setStopping(false); + } + }; + + const maskedKey = (key?: string) => { + if (!key) return '***'; + return apiKeyVisible ? key : `${key.slice(0, 4)}${'*'.repeat(Math.max(0, key.length - 8))}${key.slice(-4)}`; + }; + + const badge = service && ; + + return ( +
+ setExpanded(v => !v)} + badge={badge} + /> + {expanded && ( +
+ {loadingService ? ( +
+ +
+ ) : service && service.status !== 'stopped' ? ( +
+
+ +
+ {service.invokeUrl ?? ''} + +
+
+
+ +
+ + {maskedKey(service.apiKey)} + + + +
+
+
+ +
+
{`curl -X POST ${service.invokeUrl ?? ''} \\
+  -H "Content-Type: application/json" \\
+  -H "X-API-Key: ${service.apiKey ?? ''}" \\
+  -d '{"inputs": {}}'`}
+
+ +
+
+
+ +
+ ) : ( +
+

+ {t('detail.run.publishDesc')} +

+ + {publishing && ( +

{t('detail.run.dockerStarting')}

+ )} +
+ )} + {error && ( +
+ + {error} +
+ )} +
+ )} +
+ ); +} + +// ───────────────────────────────────────────── +// Kafka 配置 +// ───────────────────────────────────────────── +function KafkaSection({ workflowId }: { workflowId: string }) { + const { t } = useTranslation('workflow'); + const [expanded, setExpanded] = useState(false); + const [saving, setSaving] = useState(false); + const [saved, setSaved] = useState(false); + const [inputBroker, setInputBroker] = useState(''); + const [inputTopic, setInputTopic] = useState(''); + const [inputGroupId, setInputGroupId] = useState(''); + const [outputBroker, setOutputBroker] = useState(''); + const [outputTopic, setOutputTopic] = useState(''); + + useEffect(() => { + workflowAPI.getKafkaConfig(workflowId).then(res => { + if (res.data) { + setInputBroker(res.data.inputBroker || ''); + setInputTopic(res.data.inputTopic || ''); + setInputGroupId(res.data.inputGroupId || ''); + setOutputBroker(res.data.outputBroker || ''); + setOutputTopic(res.data.outputTopic || ''); + } + }).catch(() => {}); + }, [workflowId]); + + const handleSave = async () => { + setSaving(true); + setSaved(false); + try { + await workflowAPI.saveKafkaConfig(workflowId, { + inputBroker, inputTopic, inputGroupId, outputBroker, outputTopic, + }); + setSaved(true); + setTimeout(() => setSaved(false), 2000); + } catch { + // ignore - stub endpoint may return 501 + } finally { + setSaving(false); + } + }; + + const inputField = (label: string, value: string, onChange: (v: string) => void, placeholder: string) => ( +
+ + onChange(e.target.value)} + placeholder={placeholder} + className="w-full text-xs border border-gray-200 rounded-lg px-3 py-1.5 focus:outline-none focus:ring-1 focus:ring-red-500" + /> +
+ ); + + return ( +
+ setExpanded(v => !v)} + badge={{t('detail.run.kafkaExperimental')}} + /> + {expanded && ( +
+
+

+ {t('detail.run.inputConfig')} +

+ {inputField('Broker', inputBroker, setInputBroker, 'localhost:9092')} + {inputField('Topic', inputTopic, setInputTopic, 'workflow-input')} + {inputField('Consumer Group', inputGroupId, setInputGroupId, 'flocks-consumer')} +
+
+

+ {t('detail.run.outputConfig')} +

+ {inputField('Broker', outputBroker, setOutputBroker, 'localhost:9092')} + {inputField('Topic', outputTopic, setOutputTopic, 'workflow-output')} +
+ +

{t('detail.run.kafkaHint')}

+
+ )} +
+ ); +} + +// ───────────────────────────────────────────── +// Syslog 配置 +// ───────────────────────────────────────────── +function SyslogSection({ workflowId }: { workflowId: string }) { + const { t } = useTranslation('workflow'); + const [expanded, setExpanded] = useState(false); + const [saving, setSaving] = useState(false); + const [saved, setSaved] = useState(false); + const [enabled, setEnabled] = useState(false); + const [protocol, setProtocol] = useState('udp'); + const [host, setHost] = useState('0.0.0.0'); + const [port, setPort] = useState('5140'); + const [format, setFormat] = useState('auto'); + const [inputKey, setInputKey] = useState('syslog_message'); + + // 摘要行:已启用时在折叠标题旁显示 + const summaryBadge = enabled && !expanded ? ( + + {protocol.toUpperCase()} {host}:{port} · {t('detail.run.syslogActive')} + + ) : ( + {t('detail.run.syslogExperimental')} + ); + + useEffect(() => { + workflowAPI.getSyslogConfig(workflowId).then(res => { + if (res.data) { + setEnabled(!!res.data.enabled); + setProtocol(res.data.protocol || 'udp'); + setHost(res.data.host || '0.0.0.0'); + setPort(String(res.data.port ?? 5140)); + setFormat(res.data.format || 'auto'); + setInputKey(res.data.inputKey || 'syslog_message'); + } + }).catch(() => {}); + }, [workflowId]); + + const handleSave = async () => { + setSaving(true); + setSaved(false); + try { + await workflowAPI.saveSyslogConfig(workflowId, { + enabled, + protocol, + host, + port: Number.parseInt(port, 10) || 5140, + format, + inputKey, + }); + setSaved(true); + setTimeout(() => setSaved(false), 2000); + } catch { + // ignore + } finally { + setSaving(false); + } + }; + + const inputField = (label: string, value: string, onChange: (v: string) => void, placeholder: string) => ( +
+ + onChange(e.target.value)} + placeholder={placeholder} + className="w-full text-xs border border-gray-200 rounded-lg px-3 py-1.5 focus:outline-none focus:ring-1 focus:ring-red-500" + /> +
+ ); + + return ( +
+ setExpanded(v => !v)} + badge={summaryBadge} + /> + {expanded && ( +
+
+ setEnabled(e.target.checked)} + className="rounded border-gray-300 text-red-600 focus:ring-red-500" + /> + +
+
+

+ {t('detail.run.inputConfig')} +

+
+ + +
+ {inputField(t('detail.run.syslogHost'), host, setHost, '0.0.0.0')} + {inputField(t('detail.run.syslogPort'), port, setPort, '5140')} +
+ + +
+ {inputField(t('detail.run.syslogInputKey'), inputKey, setInputKey, 'syslog_message')} +
+ +

{t('detail.run.syslogHint')}

+
+ )} +
+ ); +} + +// ───────────────────────────────────────────── +// 主组件 +// ───────────────────────────────────────────── +export default function IntegrationTab({ workflow }: IntegrationTabProps) { + return ( +
+ + + +
+ ); +} diff --git a/webui/src/pages/WorkflowDetail/tabs/RunTab.test.tsx b/webui/src/pages/WorkflowDetail/tabs/RunTab.test.tsx index 82b47e28b..8709af1e7 100644 --- a/webui/src/pages/WorkflowDetail/tabs/RunTab.test.tsx +++ b/webui/src/pages/WorkflowDetail/tabs/RunTab.test.tsx @@ -17,6 +17,8 @@ const { workflowAPI } = vi.hoisted(() => ({ unpublish: vi.fn(), getKafkaConfig: vi.fn(), saveKafkaConfig: vi.fn(), + getSyslogConfig: vi.fn(), + saveSyslogConfig: vi.fn(), getHistory: vi.fn(), }, })); @@ -74,6 +76,15 @@ vi.mock('react-i18next', () => ({ 'detail.run.savedConfig': '已保存', 'detail.run.saveConfig': '保存配置', 'detail.run.kafkaHint': 'hint', + 'detail.run.syslogSection': 'Syslog', + 'detail.run.syslogExperimental': '实验性', + 'detail.run.syslogEnabled': '启用', + 'detail.run.syslogProtocol': '协议', + 'detail.run.syslogHost': '地址', + 'detail.run.syslogPort': '端口', + 'detail.run.syslogFormat': '格式', + 'detail.run.syslogInputKey': '键名', + 'detail.run.syslogHint': 'syslog hint', 'detail.run.historySection': '执行历史', 'detail.run.noHistory': '暂无执行记录', 'detail.run.noOutput': '无输出数据', @@ -130,6 +141,7 @@ describe('RunTab', () => { workflowAPI.saveSampleInputs.mockResolvedValue({ data: { ok: true } }); workflowAPI.getService.mockResolvedValue({ data: null }); workflowAPI.getKafkaConfig.mockResolvedValue({ data: null }); + workflowAPI.getSyslogConfig.mockResolvedValue({ data: null }); workflowAPI.getHistory.mockResolvedValue({ data: [] }); workflowAPI.run.mockResolvedValue({ data: { diff --git a/webui/src/pages/WorkflowDetail/tabs/RunTab.tsx b/webui/src/pages/WorkflowDetail/tabs/RunTab.tsx index 0852c0ff0..b1f35cbb3 100644 --- a/webui/src/pages/WorkflowDetail/tabs/RunTab.tsx +++ b/webui/src/pages/WorkflowDetail/tabs/RunTab.tsx @@ -1,14 +1,13 @@ import { useState, useEffect, useCallback, useRef } from 'react'; import { - Loader2, ChevronDown, ChevronRight, Globe, StopCircle, - Check, Clock, CheckCircle, XCircle, AlertCircle, Wifi, FlaskConical, + Loader2, ChevronDown, ChevronRight, 
StopCircle, + Clock, CheckCircle, XCircle, AlertCircle, FlaskConical, } from 'lucide-react'; import { useTranslation } from 'react-i18next'; import { workflowAPI, Workflow, WorkflowExecution, - WorkflowService, WorkflowJSON, } from '@/api/workflow'; import CopyButton from '@/components/common/CopyButton'; @@ -484,252 +483,6 @@ function TestSection({ ); } -// ───────────────────────────────────────────── -// 区块2:发布为 API -// ───────────────────────────────────────────── -function PublishSection({ workflowId }: { workflowId: string }) { - const { t } = useTranslation('workflow'); - const [expanded, setExpanded] = useState(true); - const [service, setService] = useState(null); - const [loadingService, setLoadingService] = useState(true); - const [publishing, setPublishing] = useState(false); - const [stopping, setStopping] = useState(false); - const [error, setError] = useState(''); - const [apiKeyVisible, setApiKeyVisible] = useState(false); - - const fetchService = useCallback(async () => { - try { - const res = await workflowAPI.getService(workflowId); - setService(res.data); - } catch { - setService(null); - } finally { - setLoadingService(false); - } - }, [workflowId]); - - useEffect(() => { - fetchService(); - }, [fetchService]); - - const handlePublish = async () => { - setError(''); - setPublishing(true); - try { - const res = await workflowAPI.publish(workflowId); - setService(res.data); - } catch (err: unknown) { - setError(extractErrorMessage(err, t('detail.run.publishFailed'))); - } finally { - setPublishing(false); - } - }; - - const handleUnpublish = async () => { - setError(''); - setStopping(true); - try { - await workflowAPI.unpublish(workflowId); - await fetchService(); - } catch (err: unknown) { - setError(extractErrorMessage(err, t('detail.run.stopFailed'))); - } finally { - setStopping(false); - } - }; - - const maskedKey = (key?: string) => { - if (!key) return '***'; - return apiKeyVisible ? 
key : `${key.slice(0, 4)}${'*'.repeat(Math.max(0, key.length - 8))}${key.slice(-4)}`; - }; - - const badge = service && ( - - ); - - return ( -
- setExpanded(v => !v)} badge={badge} /> - {expanded && ( -
- {loadingService ? ( -
- -
- ) : service && service.status !== 'stopped' ? ( -
-
- -
- {service.invokeUrl ?? ''} - -
-
-
- -
- - {maskedKey(service.apiKey)} - - - -
-
-
- -
-
{`curl -X POST ${service.invokeUrl ?? ''} \\
-  -H "Content-Type: application/json" \\
-  -H "X-API-Key: ${service.apiKey ?? ''}" \\
-  -d '{"inputs": {}}'`}
-
- -
-
-
- -
- ) : ( -
-

- {t('detail.run.publishDesc')} -

- - {publishing && ( -

{t('detail.run.dockerStarting')}

- )} -
- )} - {error && ( -
- - {error} -
- )} -
- )} -
- ); -} - -// ───────────────────────────────────────────── -// 区块3:Kafka 配置 -// ───────────────────────────────────────────── -function KafkaSection({ workflowId }: { workflowId: string }) { - const { t } = useTranslation('workflow'); - const [expanded, setExpanded] = useState(false); - const [saving, setSaving] = useState(false); - const [saved, setSaved] = useState(false); - const [inputBroker, setInputBroker] = useState(''); - const [inputTopic, setInputTopic] = useState(''); - const [inputGroupId, setInputGroupId] = useState(''); - const [outputBroker, setOutputBroker] = useState(''); - const [outputTopic, setOutputTopic] = useState(''); - - useEffect(() => { - workflowAPI.getKafkaConfig(workflowId).then(res => { - if (res.data) { - setInputBroker(res.data.inputBroker || ''); - setInputTopic(res.data.inputTopic || ''); - setInputGroupId(res.data.inputGroupId || ''); - setOutputBroker(res.data.outputBroker || ''); - setOutputTopic(res.data.outputTopic || ''); - } - }).catch(() => {}); - }, [workflowId]); - - const handleSave = async () => { - setSaving(true); - setSaved(false); - try { - await workflowAPI.saveKafkaConfig(workflowId, { - inputBroker, inputTopic, inputGroupId, outputBroker, outputTopic, - }); - setSaved(true); - setTimeout(() => setSaved(false), 2000); - } catch { - // ignore - stub endpoint may return 501 - } finally { - setSaving(false); - } - }; - - const inputField = (label: string, value: string, onChange: (v: string) => void, placeholder: string) => ( -
- - onChange(e.target.value)} - placeholder={placeholder} - className="w-full text-xs border border-gray-200 rounded-lg px-3 py-1.5 focus:outline-none focus:ring-1 focus:ring-red-500" - /> -
- ); - - return ( -
- setExpanded(v => !v)} - badge={{t('detail.run.kafkaExperimental')}} - /> - {expanded && ( -
-
-

- {t('detail.run.inputConfig')} -

- {inputField('Broker', inputBroker, setInputBroker, 'localhost:9092')} - {inputField('Topic', inputTopic, setInputTopic, 'workflow-input')} - {inputField('Consumer Group', inputGroupId, setInputGroupId, 'flocks-consumer')} -
-
-

- {t('detail.run.outputConfig')} -

- {inputField('Broker', outputBroker, setOutputBroker, 'localhost:9092')} - {inputField('Topic', outputTopic, setOutputTopic, 'workflow-output')} -
- -

{t('detail.run.kafkaHint')}

-
- )} -
- ); -} - // ───────────────────────────────────────────── // 单步详情组件 // ───────────────────────────────────────────── @@ -1036,8 +789,6 @@ export default function RunTab({ onExecutionChange={onLatestExecutionChange} onExecutionSettled={onExecutionSettled} /> - - Date: Sat, 9 May 2026 21:31:17 +0800 Subject: [PATCH 21/41] =?UTF-8?q?refactor(ingest):=20rename=20flocks/syslo?= =?UTF-8?q?g=20=E2=86=92=20flocks/ingest/syslog?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Group data-ingestion triggers under a shared flocks/ingest/ namespace so that future connectors (kafka, webhook, …) live alongside syslog as sibling sub-packages rather than top-level packages. - Move flocks/syslog/* → flocks/ingest/syslog/* - Add flocks/ingest/__init__.py (empty package marker) - Update all internal imports (flocks.syslog → flocks.ingest.syslog) in listener.py, manager.py, __init__.py, server/app.py and server/routes/workflow.py Co-authored-by: Cursor --- flocks/ingest/__init__.py | 0 flocks/{ => ingest}/syslog/__init__.py | 4 ++-- flocks/{ => ingest}/syslog/constants.py | 0 flocks/{ => ingest}/syslog/listener.py | 2 +- flocks/{ => ingest}/syslog/manager.py | 4 ++-- flocks/{ => ingest}/syslog/parser.py | 0 flocks/server/app.py | 4 ++-- flocks/server/routes/workflow.py | 6 +++--- 8 files changed, 10 insertions(+), 10 deletions(-) create mode 100644 flocks/ingest/__init__.py rename flocks/{ => ingest}/syslog/__init__.py (50%) rename flocks/{ => ingest}/syslog/constants.py (100%) rename flocks/{ => ingest}/syslog/listener.py (98%) rename flocks/{ => ingest}/syslog/manager.py (97%) rename flocks/{ => ingest}/syslog/parser.py (100%) diff --git a/flocks/ingest/__init__.py b/flocks/ingest/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/flocks/syslog/__init__.py b/flocks/ingest/syslog/__init__.py similarity index 50% rename from flocks/syslog/__init__.py rename to flocks/ingest/syslog/__init__.py index c3b804c64..5e83880ce 
100644 --- a/flocks/syslog/__init__.py +++ b/flocks/ingest/syslog/__init__.py @@ -1,6 +1,6 @@ """Syslog ingestion for workflow triggers (UDP/TCP listeners).""" -from flocks.syslog.constants import WORKFLOW_SYSLOG_CONFIG_PREFIX -from flocks.syslog.manager import SyslogManager, default_manager +from flocks.ingest.syslog.constants import WORKFLOW_SYSLOG_CONFIG_PREFIX +from flocks.ingest.syslog.manager import SyslogManager, default_manager __all__ = ["SyslogManager", "default_manager", "WORKFLOW_SYSLOG_CONFIG_PREFIX"] diff --git a/flocks/syslog/constants.py b/flocks/ingest/syslog/constants.py similarity index 100% rename from flocks/syslog/constants.py rename to flocks/ingest/syslog/constants.py diff --git a/flocks/syslog/listener.py b/flocks/ingest/syslog/listener.py similarity index 98% rename from flocks/syslog/listener.py rename to flocks/ingest/syslog/listener.py index 7e5af3074..decce3cda 100644 --- a/flocks/syslog/listener.py +++ b/flocks/ingest/syslog/listener.py @@ -5,7 +5,7 @@ import asyncio from typing import Awaitable, Callable, Union -from flocks.syslog.parser import parse_syslog +from flocks.ingest.syslog.parser import parse_syslog OnSyslogMessage = Callable[[dict], Union[None, Awaitable[None]]] diff --git a/flocks/syslog/manager.py b/flocks/ingest/syslog/manager.py similarity index 97% rename from flocks/syslog/manager.py rename to flocks/ingest/syslog/manager.py index 925d43eb8..c00ea2c99 100644 --- a/flocks/syslog/manager.py +++ b/flocks/ingest/syslog/manager.py @@ -10,8 +10,8 @@ from flocks.workflow.fs_store import read_workflow_from_fs from flocks.workflow.runner import run_workflow -from flocks.syslog.constants import WORKFLOW_SYSLOG_CONFIG_PREFIX -from flocks.syslog.listener import run_tcp_syslog_server, run_udp_syslog_server +from flocks.ingest.syslog.constants import WORKFLOW_SYSLOG_CONFIG_PREFIX +from flocks.ingest.syslog.listener import run_tcp_syslog_server, run_udp_syslog_server log = Log.create(service="syslog.manager") diff --git 
a/flocks/syslog/parser.py b/flocks/ingest/syslog/parser.py similarity index 100% rename from flocks/syslog/parser.py rename to flocks/ingest/syslog/parser.py diff --git a/flocks/server/app.py b/flocks/server/app.py index e9722106e..183ea8d60 100644 --- a/flocks/server/app.py +++ b/flocks/server/app.py @@ -231,7 +231,7 @@ async def lifespan(app: FastAPI): # Start syslog listeners for workflows with syslog enabled try: - from flocks.syslog.manager import default_manager as default_syslog_manager + from flocks.ingest.syslog.manager import default_manager as default_syslog_manager await default_syslog_manager.start_all() log.info("syslog.manager.started") @@ -285,7 +285,7 @@ async def lifespan(app: FastAPI): # Stop syslog listeners try: - from flocks.syslog.manager import default_manager as default_syslog_manager + from flocks.ingest.syslog.manager import default_manager as default_syslog_manager await default_syslog_manager.stop_all() log.info("syslog.manager.stopped") diff --git a/flocks/server/routes/workflow.py b/flocks/server/routes/workflow.py index 822e106cd..a60c81ccc 100644 --- a/flocks/server/routes/workflow.py +++ b/flocks/server/routes/workflow.py @@ -44,7 +44,7 @@ read_workflow_from_fs as shared_read_workflow_from_fs, workflow_scan_dirs as _all_scan_dirs, ) -from flocks.syslog.constants import WORKFLOW_SYSLOG_CONFIG_PREFIX +from flocks.ingest.syslog.constants import WORKFLOW_SYSLOG_CONFIG_PREFIX from flocks.workflow.execution_store import ( create_execution_record, normalize_execution_status as _normalize_execution_status, @@ -817,7 +817,7 @@ async def delete_workflow(workflow_id: str): pass try: - from flocks.syslog.manager import default_manager as _syslog_default_manager + from flocks.ingest.syslog.manager import default_manager as _syslog_default_manager await _syslog_default_manager.stop_workflow(workflow_id) except Exception: @@ -1610,7 +1610,7 @@ async def save_syslog_config(workflow_id: str, req: SyslogConfigRequest): } await 
Storage.write(_syslog_config_key(workflow_id), config) - from flocks.syslog.manager import default_manager as _syslog_default_manager + from flocks.ingest.syslog.manager import default_manager as _syslog_default_manager await _syslog_default_manager.restart_workflow(workflow_id) return {"ok": True} From 18e6521648349db3b8a059047015337dea1ca438 Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Sun, 10 May 2026 16:31:15 +0800 Subject: [PATCH 22/41] feat(alert_dedup_triage): support syslog real-time input mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit receive_alerts node now handles three input sources in priority order: 1. syslog_message (injected by flocks syslog listener, RFC3164/5424) - TDP alert JSON parsed from syslog_message.message - Syslog metadata (hostname, severity, timestamp…) attached to alert under _syslog_meta for traceability; does not affect dedup/triage logic 2. alerts (batch list, existing) 3. alert_file (local JSON path, existing) Also adds syslog-config API documentation and example to workflow.md and syslog_message sample to metadata.sampleInputs. Co-authored-by: Cursor --- .../alert_dedup_triage/workflow.json | 21 +++-- .../workflows/alert_dedup_triage/workflow.md | 83 ++++++++++++++++--- 2 files changed, 86 insertions(+), 18 deletions(-) diff --git a/.flocks/plugins/workflows/alert_dedup_triage/workflow.json b/.flocks/plugins/workflows/alert_dedup_triage/workflow.json index 9250f7034..e356ef05d 100644 --- a/.flocks/plugins/workflows/alert_dedup_triage/workflow.json +++ b/.flocks/plugins/workflows/alert_dedup_triage/workflow.json @@ -1,14 +1,14 @@ { "name": "alert_dedup_triage", - "description": "Chained pipeline: http_alert_dedup → tdp_alert_triage. 
Deduplicates incoming TDP/HTTP alerts with MinHash LSH, then runs LLM-based triage (survey / CVE / payload analysis in parallel) on each unique alert.", - "description_cn": "去重+研判串联工作流:先调用 http_alert_dedup 做 MinHash LSH 去重,再对每条首次出现的唯一告警调用 tdp_alert_triage 进行 LLM 研判;重复告警会从持久化研判缓存(triage_cache.pkl)中回填历史研判结果(stage=duplicate_with_triage)。研判缓存上限由 max_dedup_keys 控制(默认 10W,FIFO 淘汰)。", + "description": "Chained pipeline: http_alert_dedup → tdp_alert_triage. Accepts alerts via syslog (real-time, single message), alerts list (batch), or alert_file path. Deduplicates with MinHash LSH; runs LLM triage on first-seen unique alerts; returns cached triage results for duplicates.", + "description_cn": "去重+研判串联工作流。支持三种输入模式:syslog 实时单条(配置 syslog-config 后自动触发)、alerts 批次列表、alert_file 文件路径。先调用 http_alert_dedup 做 MinHash LSH 去重,再对首次出现的唯一告警调用 tdp_alert_triage 进行 LLM 研判;重复告警从持久化研判缓存(triage_cache.pkl)中回填历史结果。研判缓存上限由 max_dedup_keys 控制(默认 10W,FIFO 淘汰)。", "start": "receive_alerts", "nodes": [ { "id": "receive_alerts", "type": "python", - "description": "解析输入告警列表(与 http_alert_dedup 接口相同):支持 alerts 列表或 alert_file 文件路径,提取去重 / 研判配置参数及服务 URL", - "code": "\nimport json\nimport os\n\nalerts_input = inputs.get('alerts', inputs.get('alert_list', []))\n\nif not alerts_input:\n alert_file = inputs.get('alert_file', '')\n if alert_file:\n alert_file = os.path.expanduser(str(alert_file))\n try:\n with open(alert_file, 'r', encoding='utf-8') as _f:\n alerts_input = json.load(_f)\n print(f'[receive] loaded {len(alerts_input) if isinstance(alerts_input, list) else 1} alerts from file: {alert_file}')\n except Exception as _e:\n print(f'[receive] WARNING: failed to load alert_file={alert_file!r}: {_e}')\n alerts_input = []\n\nif isinstance(alerts_input, str):\n try:\n alerts_input = json.loads(alerts_input)\n except Exception:\n alerts_input = []\nif isinstance(alerts_input, dict) and 'data' in alerts_input:\n alerts_input = alerts_input.get('data', [])\nif not isinstance(alerts_input, list):\n alerts_input = 
[alerts_input] if alerts_input else []\n\nsource_log_type = str(inputs.get('source_log_type', 'tdp')).lower()\nif source_log_type not in ('tdp', 'skyeye'):\n source_log_type = 'tdp'\n\nfilter_enabled = bool(inputs.get('filter_enabled', True))\ndedup_enabled = bool(inputs.get('dedup_enabled', True))\nthreshold = float(inputs.get('threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_field_len = int(inputs.get('max_field_len', 500))\n# Maximum triage-cache entries kept on disk (FIFO eviction, same default as LSH).\nmax_dedup_keys = int(inputs.get('max_dedup_keys', 100000))\nif max_dedup_keys < 1:\n max_dedup_keys = 100000\n\nprint(f'[receive] raw_alerts={len(alerts_input)}, source_log_type={source_log_type}, max_dedup_keys={max_dedup_keys}')\n\noutputs['raw_alerts'] = alerts_input\noutputs['source_log_type'] = source_log_type\noutputs['filter_enabled'] = filter_enabled\noutputs['dedup_enabled'] = dedup_enabled\noutputs['threshold'] = threshold\noutputs['strict_fields'] = strict_fields\noutputs['lsh_fields'] = lsh_fields\noutputs['max_field_len'] = max_field_len\noutputs['max_dedup_keys'] = max_dedup_keys\noutputs['dedup_service_url'] = inputs.get('dedup_service_url', 'http://127.0.0.1:19000')\noutputs['triage_service_url'] = inputs.get('triage_service_url', 'http://127.0.0.1:19001')\n" + "description": "解析输入告警。支持三种输入模式(优先级由高到低):① syslog_message(flocks syslog 监听注入,message 字段为 TDP JSON);② alerts(告警列表,批次调用);③ alert_file(本地 JSON 文件路径)。syslog 元数据(hostname/severity 等)附加到告警的 _syslog_meta 字段供溯源。", + "code": "\nimport json\nimport os\n\n# ── 输入来源优先级:syslog_message > alerts > alert_file ────────\n# syslog 模式:flocks ingest/syslog/manager.py 调用本工作流时,\n# 会把 parse_syslog() 的返回值以 inputKey(默认 syslog_message)注入。\n# 解析结构:{raw, facility, severity, timestamp, hostname, app_name, message, format}\n# TDP 告警 JSON 存放在 message 字段。\n\nalerts_input = []\ninput_mode = 
'unknown'\n\nsyslog_msg = inputs.get('syslog_message') or inputs.get('syslog')\nif syslog_msg and isinstance(syslog_msg, dict):\n raw_text = syslog_msg.get('message', '')\n if raw_text:\n try:\n alert = json.loads(raw_text)\n # 将 syslog 元数据附加到告警,方便后续溯源(不影响去重/研判逻辑)\n alert['_syslog_meta'] = {\n 'hostname': syslog_msg.get('hostname', ''),\n 'app_name': syslog_msg.get('app_name', ''),\n 'timestamp': syslog_msg.get('timestamp', ''),\n 'severity': syslog_msg.get('severity'),\n 'facility': syslog_msg.get('facility'),\n 'format': syslog_msg.get('format', ''),\n }\n alerts_input = [alert]\n input_mode = 'syslog'\n print(f'[receive] syslog mode: host={syslog_msg.get(\"hostname\")!r} '\n f'app={syslog_msg.get(\"app_name\")!r} '\n f'severity={syslog_msg.get(\"severity\")} '\n f'format={syslog_msg.get(\"format\")!r}')\n except (json.JSONDecodeError, TypeError) as _e:\n print(f'[receive] WARNING: syslog.message is not valid JSON ({_e}), '\n f'raw={raw_text[:120]!r}')\n else:\n print('[receive] WARNING: syslog_message.message is empty, skipping')\n\nif not alerts_input:\n alerts_input = inputs.get('alerts', inputs.get('alert_list', []))\n if alerts_input:\n input_mode = 'alerts'\n\nif not alerts_input:\n alert_file = inputs.get('alert_file', '')\n if alert_file:\n alert_file = os.path.expanduser(str(alert_file))\n try:\n with open(alert_file, 'r', encoding='utf-8') as _f:\n alerts_input = json.load(_f)\n input_mode = 'alert_file'\n print(f'[receive] file mode: loaded {len(alerts_input) if isinstance(alerts_input, list) else 1} alerts from {alert_file}')\n except Exception as _e:\n print(f'[receive] WARNING: failed to load alert_file={alert_file!r}: {_e}')\n alerts_input = []\n\nif isinstance(alerts_input, str):\n try:\n alerts_input = json.loads(alerts_input)\n except Exception:\n alerts_input = []\nif isinstance(alerts_input, dict) and 'data' in alerts_input:\n alerts_input = alerts_input.get('data', [])\nif not isinstance(alerts_input, list):\n alerts_input = [alerts_input] if 
alerts_input else []\n\nif not alerts_input:\n print('[receive] WARNING: no alerts to process (syslog_message, alerts, alert_file all empty)')\n\nsource_log_type = str(inputs.get('source_log_type', 'tdp')).lower()\nif source_log_type not in ('tdp', 'skyeye'):\n source_log_type = 'tdp'\n\nfilter_enabled = bool(inputs.get('filter_enabled', True))\ndedup_enabled = bool(inputs.get('dedup_enabled', True))\nthreshold = float(inputs.get('threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_field_len = int(inputs.get('max_field_len', 500))\nmax_dedup_keys = int(inputs.get('max_dedup_keys', 100000))\nif max_dedup_keys < 1:\n max_dedup_keys = 100000\n\nprint(f'[receive] input_mode={input_mode} raw_alerts={len(alerts_input)} '\n f'source_log_type={source_log_type} max_dedup_keys={max_dedup_keys}')\n\noutputs['raw_alerts'] = alerts_input\noutputs['input_mode'] = input_mode\noutputs['source_log_type'] = source_log_type\noutputs['filter_enabled'] = filter_enabled\noutputs['dedup_enabled'] = dedup_enabled\noutputs['threshold'] = threshold\noutputs['strict_fields'] = strict_fields\noutputs['lsh_fields'] = lsh_fields\noutputs['max_field_len'] = max_field_len\noutputs['max_dedup_keys'] = max_dedup_keys\noutputs['dedup_service_url'] = inputs.get('dedup_service_url', 'http://127.0.0.1:19000')\noutputs['triage_service_url'] = inputs.get('triage_service_url', 'http://127.0.0.1:19001')\n" }, { "id": "dedup_and_triage", @@ -46,7 +46,18 @@ "triage_service_url": "http://127.0.0.1:19001", "triage_timeout_s": 300, "_comment_alerts": "Pass 'alerts' (list) or 'alert_file' (path to JSON file)", - "max_dedup_keys": 100000 + "max_dedup_keys": 100000, + "_comment_syslog": "Syslog mode: configure via POST /api/workflow/{id}/syslog-config {enabled:true, protocol:'udp', port:5140, inputKey:'syslog_message'}. 
The syslog listener parses RFC3164/5424 and injects the result as 'syslog_message'; TDP alert JSON must be in the syslog message body.", + "syslog_message": { + "raw": "<134>May 10 16:00:00 tdp-sensor tdp: {\"id\":\"AZtRkZkzj\",\"net\":{...}}", + "facility": 16, + "severity": 6, + "timestamp": "2026-05-10T16:00:00", + "hostname": "tdp-sensor", + "app_name": "tdp", + "message": "{\"id\":\"AZtRkZkzj\",\"net\":{\"http\":{\"url\":\"/admin\"}},\"threat\":{\"name\":\"SQL注入\"}}", + "format": "rfc3164" + } } } } \ No newline at end of file diff --git a/.flocks/plugins/workflows/alert_dedup_triage/workflow.md b/.flocks/plugins/workflows/alert_dedup_triage/workflow.md index 3bfe19533..06cec5866 100644 --- a/.flocks/plugins/workflows/alert_dedup_triage/workflow.md +++ b/.flocks/plugins/workflows/alert_dedup_triage/workflow.md @@ -4,7 +4,7 @@ 将 `http_alert_dedup`(MinHash LSH 去重)与 `tdp_alert_triage`(LLM 研判)串联为单一工作流: -1. 批量接收 TDP/HTTP 原始告警 +1. 接收 TDP/HTTP 原始告警(支持 syslog 实时单条 / 批量列表 / 文件三种模式) 2. 逐条调用 http_alert_dedup 服务去重:跨批次已见告警直接跳过,节省研判算力 3. 对首次出现的唯一告警调用 tdp_alert_triage 服务进行 LLM 研判(测绘/CVE/payload 并行) 4. 聚合所有结果输出汇总报告及最高风险告警的研判详情 @@ -12,9 +12,9 @@ ## 流程结构 ``` -receive_alerts (解析输入,与 http_alert_dedup 接口相同) +receive_alerts (解析输入:syslog_message / alerts 列表 / alert_file 文件) ↓ -dedup_and_triage (逐条去重 → 唯一告警 → 研判) +dedup_and_triage (逐条去重 → 唯一告警 → 研判 → 缓存回填) ↓ generate_summary (聚合输出,写 pipeline_summary.md) ``` @@ -24,18 +24,26 @@ generate_summary (聚合输出,写 pipeline_summary.md) ``` for each raw_alert: POST /invoke → http_alert_dedup (port 19000) - ├─ filtered_out → 跳过(非 HTTP / 扫描告警) - ├─ duplicate_skipped → 跳过(跨批次已见) - └─ unique → POST /invoke → tdp_alert_triage (port 19001) - ↓ - collect triage result + ├─ filtered_out → 跳过(非 HTTP / 扫描告警) + ├─ duplicate_with_triage → 回填历史研判缓存(triage_cache.pkl) + ├─ duplicate_skipped → 跳过(跨批次已见且无缓存) + └─ unique → POST /invoke → tdp_alert_triage (port 19001) + ↓ + collect & persist triage result ``` ## 节点详情 ### 1. 
`receive_alerts` -与 `http_alert_dedup` 接口完全一致: -- 支持 `alerts`(list)或 `alert_file`(文件路径) + +支持三种输入模式(优先级由高到低): + +| 模式 | 触发条件 | 说明 | +|------|---------|------| +| **syslog** | `syslog_message` 字段存在 | flocks syslog 监听器注入,TDP 告警 JSON 在 `.message` 字段;syslog 元数据附加到告警的 `_syslog_meta` 字段 | +| **alerts** | `alerts` 或 `alert_list` 字段为非空列表 | 批量调用,直接传入告警列表 | +| **alert_file** | `alert_file` 为本地 JSON 文件路径 | 离线测试 / 批处理场景 | + - 提取去重配置:`source_log_type`、`filter_enabled`、`dedup_enabled`、`threshold` - 支持通过 `dedup_service_url` / `triage_service_url` 输入字段覆盖服务地址 @@ -56,12 +64,14 @@ for each raw_alert: | 字段 | 类型 | 默认 | 说明 | |------|------|------|------| +| `syslog_message` | dict | — | syslog 解析结果(由 flocks 监听器注入),TDP JSON 在 `.message` 字段 | | `alerts` | list | — | 原始告警列表(与 alert_file 二选一) | | `alert_file` | string | — | JSON 文件路径(替代 alerts 列表) | | `source_log_type` | string | `"tdp"` | 日志类型(`tdp` / `skyeye`) | | `filter_enabled` | bool | `true` | 是否启用告警过滤 | | `dedup_enabled` | bool | `true` | 是否启用去重(含持久化) | | `threshold` | float | `0.7` | LSH Jaccard 相似度阈值 | +| `max_dedup_keys` | int | `100000` | LSH hash + 研判缓存最大条数,超出后 FIFO 淘汰 | | `dedup_service_url` | string | `http://127.0.0.1:19000` | http_alert_dedup 服务地址 | | `triage_service_url` | string | `http://127.0.0.1:19001` | tdp_alert_triage 服务地址 | | `triage_timeout_s` | int | `300` | 单条研判超时秒数 | @@ -81,9 +91,56 @@ for each raw_alert: | `report_path` | string | `pipeline_summary.md` 落盘路径 | | `stats` | dict | 处理统计(total/filtered/dedup/triage 各计数) | +## Syslog 接入配置 + +flocks 内置了 RFC 3164 / RFC 5424 syslog 监听器(UDP + TCP,默认端口 5140)。只需通过 API 为本工作流开启监听,即可实现 TDP 实时告警接入。 + +### 启用 syslog 监听 + +```bash +curl -X POST http://127.0.0.1:8000/api/workflow/alert_dedup_triage/syslog-config \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer " \ + -d '{ + "enabled": true, + "protocol": "udp", + "host": "0.0.0.0", + "port": 5140, + "inputKey": "syslog_message" + }' +``` + +| 参数 | 说明 | +|------|------| +| `protocol` | `udp`(推荐)或 `tcp` | +| 
`port` | syslog 监听端口(默认 5140) | +| `inputKey` | 注入到工作流 inputs 的键名,本工作流固定读取 `syslog_message` | + +### TDP 设备 syslog 转发格式 + +TDP 传感器/探针将告警以 syslog 方式推送时,**消息体(MSG 字段)必须是合法 JSON 格式的 TDP 告警对象**,例如: + +``` +<134>May 10 16:00:00 tdp-sensor tdp: {"id":"AZtRk...","net":{"http":{"url":"/admin"}},"threat":{"name":"SQL注入"}} +``` + +- `receive_alerts` 节点会从 `syslog_message.message` 字段提取并解析该 JSON +- syslog 元数据(`hostname`、`severity`、`timestamp` 等)附加到告警的 `_syslog_meta` 字段,可供后续节点溯源但不参与去重计算 + +### 查询当前配置 + +```bash +curl http://127.0.0.1:8000/api/workflow/alert_dedup_triage/syslog-config \ + -H "Authorization: Bearer " +``` + +--- + ## 工程要点 -- **跨批次去重**:`dedup_and_triage` 每次单条调用 dedup 服务,LSH 状态持久化在 `~/.flocks/workspace/workflows/http_alert_dedup/` 下,跨工作流调用生效 -- **原始告警传给研判**:triage 接收的是原始 raw alert(保留嵌套 `net.http.*` / `threat.*` 字段),而不是归一化后的字段,确保研判提示词能解析完整信息 +- **三种输入模式**:syslog 实时单条(最高优先级)→ alerts 批次列表 → alert_file 文件,`receive_alerts` 自动检测并切换,`input_mode` 字段记录实际生效模式 +- **跨批次去重**:`dedup_and_triage` 每次单条调用 dedup 服务,LSH 状态持久化在 `~/.flocks/workspace/workflows/http_alert_dedup/` 下,syslog 实时模式与批次模式共享同一 LSH 状态 +- **研判缓存回填**:重复告警从 `~/.flocks/workspace/workflows/alert_dedup_triage/triage_cache.pkl` 读取历史研判结果(`stage=duplicate_with_triage`),实时 syslog 模式下可做到秒级响应 +- **原始告警传给研判**:triage 接收的是原始 raw alert(保留嵌套 `net.http.*` / `threat.*` 字段),syslog 模式下包含 `_syslog_meta` 附加字段 - **节点超时**:`node_timeout_s = 7200`,留出足够余量处理大批量告警(每条研判约 50s × N 条) -- **输出兼容性**:`generate_summary` 的主要输出字段与 `tdp_alert_triage` 相同,单告警场景下可无缝替换 +- **输出兼容性**:`generate_summary` 的主要输出字段与 `tdp_alert_triage` 相同,单告警(syslog)场景下可无缝替换 From a2ea64a48a64ff7d7654bacb832feb1ccbb03623 Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Sun, 10 May 2026 17:12:48 +0800 Subject: [PATCH 23/41] refactor(alert_dedup_triage): embed sub-workflow calls via engine, remove HTTP service dependency Replace direct HTTP POST calls to published service ports (19000/19001) with in-process embedded invocations using flocks.workflow.runner.run_workflow and 
flocks.workflow.fs_store.workflow_scan_dirs. - _invoke_workflow(): locates sub-workflow.json via workflow_scan_dirs(), then calls run_workflow() directly in the same process - Removes urllib/HTTP helpers, dedup_service_url, triage_service_url inputs - Adds dedup_workflow_id / triage_workflow_id inputs (default: http_alert_dedup, tdp_alert_triage) so callers can override the sub-workflow IDs if needed - http_alert_dedup and tdp_alert_triage no longer need to be published as running services for alert_dedup_triage to function Co-authored-by: Cursor --- .../workflows/alert_dedup_triage/workflow.json | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/.flocks/plugins/workflows/alert_dedup_triage/workflow.json b/.flocks/plugins/workflows/alert_dedup_triage/workflow.json index e356ef05d..9c809f225 100644 --- a/.flocks/plugins/workflows/alert_dedup_triage/workflow.json +++ b/.flocks/plugins/workflows/alert_dedup_triage/workflow.json @@ -7,14 +7,14 @@ { "id": "receive_alerts", "type": "python", - "description": "解析输入告警。支持三种输入模式(优先级由高到低):① syslog_message(flocks syslog 监听注入,message 字段为 TDP JSON);② alerts(告警列表,批次调用);③ alert_file(本地 JSON 文件路径)。syslog 元数据(hostname/severity 等)附加到告警的 _syslog_meta 字段供溯源。", - "code": "\nimport json\nimport os\n\n# ── 输入来源优先级:syslog_message > alerts > alert_file ────────\n# syslog 模式:flocks ingest/syslog/manager.py 调用本工作流时,\n# 会把 parse_syslog() 的返回值以 inputKey(默认 syslog_message)注入。\n# 解析结构:{raw, facility, severity, timestamp, hostname, app_name, message, format}\n# TDP 告警 JSON 存放在 message 字段。\n\nalerts_input = []\ninput_mode = 'unknown'\n\nsyslog_msg = inputs.get('syslog_message') or inputs.get('syslog')\nif syslog_msg and isinstance(syslog_msg, dict):\n raw_text = syslog_msg.get('message', '')\n if raw_text:\n try:\n alert = json.loads(raw_text)\n # 将 syslog 元数据附加到告警,方便后续溯源(不影响去重/研判逻辑)\n alert['_syslog_meta'] = {\n 'hostname': syslog_msg.get('hostname', ''),\n 'app_name': syslog_msg.get('app_name', ''),\n 'timestamp': 
syslog_msg.get('timestamp', ''),\n 'severity': syslog_msg.get('severity'),\n 'facility': syslog_msg.get('facility'),\n 'format': syslog_msg.get('format', ''),\n }\n alerts_input = [alert]\n input_mode = 'syslog'\n print(f'[receive] syslog mode: host={syslog_msg.get(\"hostname\")!r} '\n f'app={syslog_msg.get(\"app_name\")!r} '\n f'severity={syslog_msg.get(\"severity\")} '\n f'format={syslog_msg.get(\"format\")!r}')\n except (json.JSONDecodeError, TypeError) as _e:\n print(f'[receive] WARNING: syslog.message is not valid JSON ({_e}), '\n f'raw={raw_text[:120]!r}')\n else:\n print('[receive] WARNING: syslog_message.message is empty, skipping')\n\nif not alerts_input:\n alerts_input = inputs.get('alerts', inputs.get('alert_list', []))\n if alerts_input:\n input_mode = 'alerts'\n\nif not alerts_input:\n alert_file = inputs.get('alert_file', '')\n if alert_file:\n alert_file = os.path.expanduser(str(alert_file))\n try:\n with open(alert_file, 'r', encoding='utf-8') as _f:\n alerts_input = json.load(_f)\n input_mode = 'alert_file'\n print(f'[receive] file mode: loaded {len(alerts_input) if isinstance(alerts_input, list) else 1} alerts from {alert_file}')\n except Exception as _e:\n print(f'[receive] WARNING: failed to load alert_file={alert_file!r}: {_e}')\n alerts_input = []\n\nif isinstance(alerts_input, str):\n try:\n alerts_input = json.loads(alerts_input)\n except Exception:\n alerts_input = []\nif isinstance(alerts_input, dict) and 'data' in alerts_input:\n alerts_input = alerts_input.get('data', [])\nif not isinstance(alerts_input, list):\n alerts_input = [alerts_input] if alerts_input else []\n\nif not alerts_input:\n print('[receive] WARNING: no alerts to process (syslog_message, alerts, alert_file all empty)')\n\nsource_log_type = str(inputs.get('source_log_type', 'tdp')).lower()\nif source_log_type not in ('tdp', 'skyeye'):\n source_log_type = 'tdp'\n\nfilter_enabled = bool(inputs.get('filter_enabled', True))\ndedup_enabled = bool(inputs.get('dedup_enabled', 
True))\nthreshold = float(inputs.get('threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_field_len = int(inputs.get('max_field_len', 500))\nmax_dedup_keys = int(inputs.get('max_dedup_keys', 100000))\nif max_dedup_keys < 1:\n max_dedup_keys = 100000\n\nprint(f'[receive] input_mode={input_mode} raw_alerts={len(alerts_input)} '\n f'source_log_type={source_log_type} max_dedup_keys={max_dedup_keys}')\n\noutputs['raw_alerts'] = alerts_input\noutputs['input_mode'] = input_mode\noutputs['source_log_type'] = source_log_type\noutputs['filter_enabled'] = filter_enabled\noutputs['dedup_enabled'] = dedup_enabled\noutputs['threshold'] = threshold\noutputs['strict_fields'] = strict_fields\noutputs['lsh_fields'] = lsh_fields\noutputs['max_field_len'] = max_field_len\noutputs['max_dedup_keys'] = max_dedup_keys\noutputs['dedup_service_url'] = inputs.get('dedup_service_url', 'http://127.0.0.1:19000')\noutputs['triage_service_url'] = inputs.get('triage_service_url', 'http://127.0.0.1:19001')\n" + "description": "解析输入告警,支持三种模式(优先级:syslog_message > alerts > alert_file)。输出子工作流 ID(dedup_workflow_id / triage_workflow_id),由 dedup_and_triage 节点内嵌调用,无需发布服务。", + "code": "\nimport json\nimport os\n\n# Input priority: syslog_message > alerts > alert_file\n# syslog mode: flocks ingest/syslog/manager.py injects parsed syslog dict as inputKey (default: syslog_message)\n# Structure: {raw, facility, severity, timestamp, hostname, app_name, message, format}\n# TDP alert JSON is in the message field.\n\nalerts_input = []\ninput_mode = 'unknown'\n\nsyslog_msg = inputs.get('syslog_message') or inputs.get('syslog')\nif syslog_msg and isinstance(syslog_msg, dict):\n raw_text = syslog_msg.get('message', '')\n if raw_text:\n try:\n alert = json.loads(raw_text)\n alert['_syslog_meta'] = {\n 'hostname': syslog_msg.get('hostname', ''),\n 'app_name': syslog_msg.get('app_name', ''),\n 'timestamp': 
syslog_msg.get('timestamp', ''),\n 'severity': syslog_msg.get('severity'),\n 'facility': syslog_msg.get('facility'),\n 'format': syslog_msg.get('format', ''),\n }\n alerts_input = [alert]\n input_mode = 'syslog'\n print(f'[receive] syslog mode: host={syslog_msg.get(\"hostname\")!r} '\n f'app={syslog_msg.get(\"app_name\")!r} '\n f'severity={syslog_msg.get(\"severity\")} '\n f'format={syslog_msg.get(\"format\")!r}')\n except (json.JSONDecodeError, TypeError) as _e:\n print(f'[receive] WARNING: syslog.message is not valid JSON ({_e}), '\n f'raw={raw_text[:120]!r}')\n else:\n print('[receive] WARNING: syslog_message.message is empty, skipping')\n\nif not alerts_input:\n alerts_input = inputs.get('alerts', inputs.get('alert_list', []))\n if alerts_input:\n input_mode = 'alerts'\n\nif not alerts_input:\n alert_file = inputs.get('alert_file', '')\n if alert_file:\n alert_file = os.path.expanduser(str(alert_file))\n try:\n with open(alert_file, 'r', encoding='utf-8') as _f:\n alerts_input = json.load(_f)\n input_mode = 'alert_file'\n print(f'[receive] file mode: loaded {len(alerts_input) if isinstance(alerts_input, list) else 1} alerts from {alert_file}')\n except Exception as _e:\n print(f'[receive] WARNING: failed to load alert_file={alert_file!r}: {_e}')\n alerts_input = []\n\nif isinstance(alerts_input, str):\n try:\n alerts_input = json.loads(alerts_input)\n except Exception:\n alerts_input = []\nif isinstance(alerts_input, dict) and 'data' in alerts_input:\n alerts_input = alerts_input.get('data', [])\nif not isinstance(alerts_input, list):\n alerts_input = [alerts_input] if alerts_input else []\n\nif not alerts_input:\n print('[receive] WARNING: no alerts to process (syslog_message, alerts, alert_file all empty)')\n\nsource_log_type = str(inputs.get('source_log_type', 'tdp')).lower()\nif source_log_type not in ('tdp', 'skyeye'):\n source_log_type = 'tdp'\n\nfilter_enabled = bool(inputs.get('filter_enabled', True))\ndedup_enabled = bool(inputs.get('dedup_enabled', 
True))\nthreshold = float(inputs.get('threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_field_len = int(inputs.get('max_field_len', 500))\nmax_dedup_keys = int(inputs.get('max_dedup_keys', 100000))\nif max_dedup_keys < 1:\n max_dedup_keys = 100000\n\nprint(f'[receive] input_mode={input_mode} raw_alerts={len(alerts_input)} '\n f'source_log_type={source_log_type} max_dedup_keys={max_dedup_keys}')\n\noutputs['raw_alerts'] = alerts_input\noutputs['input_mode'] = input_mode\noutputs['source_log_type'] = source_log_type\noutputs['filter_enabled'] = filter_enabled\noutputs['dedup_enabled'] = dedup_enabled\noutputs['threshold'] = threshold\noutputs['strict_fields'] = strict_fields\noutputs['lsh_fields'] = lsh_fields\noutputs['max_field_len'] = max_field_len\noutputs['max_dedup_keys'] = max_dedup_keys\n# Sub-workflow IDs: engine embeds them directly; no service URL needed\noutputs['dedup_workflow_id'] = inputs.get('dedup_workflow_id', 'http_alert_dedup')\noutputs['triage_workflow_id'] = inputs.get('triage_workflow_id', 'tdp_alert_triage')\n" }, { "id": "dedup_and_triage", "type": "python", - "description": "核心循环节点:逐条调用 http_alert_dedup 服务去重,对首次出现的唯一告警调用 tdp_alert_triage 服务研判;命中去重的重复告警会回填历史研判结果(从持久化研判缓存 triage_cache.pkl 读取),研判缓存支持 FIFO LRU 淘汰(max_dedup_keys 可调)", - "code": "\nimport json\nimport os\nimport pickle\nimport sys\nimport time\nimport urllib.request\nimport urllib.error\n\nIS_WINDOWS = sys.platform == 'win32'\nif IS_WINDOWS:\n import msvcrt # noqa: F401\nelse:\n import fcntl # noqa: F401\n\nWORKFLOW_NAME = 'alert_dedup_triage'\n\n# ── Triage cache helpers ────────────────────────────────────────\n\ndef _triage_cache_path():\n from flocks.config import Config\n flocks_root = Config().get_global().data_dir.parent # ~/.flocks\n state_dir = flocks_root / 'workspace' / 'workflows' / WORKFLOW_NAME\n state_dir.mkdir(parents=True, exist_ok=True)\n return 
str(state_dir / 'triage_cache.pkl'), str(state_dir / 'triage_cache.lock')\n\ndef _acquire_lock(lock_path):\n fh = open(lock_path, 'w+')\n if IS_WINDOWS:\n fh.write('L'); fh.flush(); fh.seek(0)\n while True:\n try:\n msvcrt.locking(fh.fileno(), msvcrt.LK_LOCK, 1); break\n except OSError:\n continue\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_EX)\n return fh\n\ndef _release_lock(fh):\n try:\n if IS_WINDOWS:\n try:\n fh.seek(0); msvcrt.locking(fh.fileno(), msvcrt.LK_UNLCK, 1)\n except OSError:\n pass\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_UN)\n finally:\n fh.close()\n\ndef _load_triage_cache(cache_path):\n # Returns an ordered dict: dedup_key -> triage_info dict.\n if not os.path.exists(cache_path) or os.path.getsize(cache_path) == 0:\n return {}\n try:\n with open(cache_path, 'rb') as f:\n c = pickle.load(f)\n if not isinstance(c, dict):\n return {}\n print(f'[triage_cache] loaded {len(c)} entries from {cache_path}')\n return c\n except Exception as e:\n print(f'[triage_cache] failed to load ({e}), starting fresh')\n return {}\n\ndef _save_triage_cache(cache_path, cache):\n tmp = cache_path + '.tmp'\n try:\n with open(tmp, 'wb') as f:\n pickle.dump(cache, f)\n f.flush()\n os.fsync(f.fileno())\n os.replace(tmp, cache_path)\n print(f'[triage_cache] saved {len(cache)} entries -> {cache_path}')\n except Exception as e:\n print(f'[triage_cache] failed to save: {e}')\n if os.path.exists(tmp):\n try: os.remove(tmp)\n except Exception: pass\n\ndef _evict_cache(cache, max_keys):\n excess = len(cache) - max_keys\n if excess > 0:\n for k in list(cache.keys())[:excess]:\n del cache[k]\n return excess\n return 0\n\n# ── HTTP helper ─────────────────────────────────────────────────\n\ndef _post(url, payload, timeout):\n req = urllib.request.Request(\n url,\n data=json.dumps(payload).encode(),\n headers={'Content-Type': 'application/json'},\n method='POST',\n )\n t0 = time.time()\n try:\n with urllib.request.urlopen(req, timeout=timeout) as resp:\n return 
json.loads(resp.read()), round((time.time() - t0) * 1000), None\n except urllib.error.HTTPError as e:\n body = e.read().decode(errors='replace')[:300]\n return None, round((time.time() - t0) * 1000), f'HTTP {e.code}: {body}'\n except Exception as e:\n return None, round((time.time() - t0) * 1000), str(e)\n\n# ── Main ────────────────────────────────────────────────────────\n\nfrom pathlib import Path # noqa: E402 (used inside node code)\n\nraw_alerts = inputs.get('raw_alerts', [])\nsource_log_type = inputs.get('source_log_type', 'tdp')\nfilter_enabled = inputs.get('filter_enabled', True)\ndedup_enabled = inputs.get('dedup_enabled', True)\nthreshold = inputs.get('threshold', 0.7)\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_field_len = inputs.get('max_field_len', 500)\nmax_dedup_keys = int(inputs.get('max_dedup_keys', 100000))\ndedup_url = inputs.get('dedup_service_url', 'http://127.0.0.1:19000')\ntriage_url = inputs.get('triage_service_url', 'http://127.0.0.1:19001')\ntriage_timeout = int(inputs.get('triage_timeout_s', 300))\ndedup_timeout = int(inputs.get('dedup_timeout_s', 60))\n\ndedup_base_inputs = {\n 'source_log_type': source_log_type,\n 'filter_enabled': filter_enabled,\n 'dedup_enabled': dedup_enabled,\n 'threshold': threshold,\n 'strict_fields': strict_fields,\n 'lsh_fields': lsh_fields,\n 'max_field_len': max_field_len,\n 'max_dedup_keys': max_dedup_keys,\n}\n\n# Load triage cache (locked to avoid concurrent corruption).\ncache_path, lock_path = _triage_cache_path()\nlock_fh = _acquire_lock(lock_path)\ntry:\n triage_cache = _load_triage_cache(cache_path)\nfinally:\n _release_lock(lock_fh)\n\nresults = []\ntriage_results = []\ncache_dirty = False\n\nstats = {\n 'total_input': len(raw_alerts),\n 'dedup_failed': 0,\n 'filtered_out': 0,\n 'duplicate_skipped': 0,\n 'duplicate_with_triage': 0, # duplicate but returned cached result\n 'triage_invoked': 0,\n 
'triage_success': 0,\n 'triage_failed': 0,\n 'verdict_counts': {},\n}\n\nfor i, alert in enumerate(raw_alerts):\n entry = {\n 'alert_index': i,\n 'alert_id': alert.get('id') or alert.get('uuid') or alert.get('behave_uuid'),\n 'threat_name': (alert.get('threat') or {}).get('name', ''),\n }\n\n # ── Step 1: dedup ───────────────────────────────────────────\n dr, dms, derr = _post(\n f'{dedup_url}/invoke',\n {'inputs': {**dedup_base_inputs, 'alerts': [alert]}},\n dedup_timeout,\n )\n if derr or not dr or dr.get('status') != 'SUCCEEDED':\n stats['dedup_failed'] += 1\n entry.update({'stage': 'dedup_failed', 'dedup_error': derr or dr, 'dedup_ms': dms})\n results.append(entry)\n print(f' [{i+1}/{len(raw_alerts)}] ✗ dedup FAILED ({dms}ms) {str(derr or \"\")[:80]}')\n continue\n\n dout = dr.get('outputs', {})\n unique_alerts = dout.get('unique_alerts', [])\n dstats = dout.get('stats', {})\n entry['dedup_ms'] = dms\n entry['filter_removed'] = dstats.get('filter_removed_count', 0)\n entry['lsh_clusters'] = dstats.get('lsh_total_clusters')\n entry['lsh_dedup_keys'] = dstats.get('lsh_total_dedup_keys')\n\n if not unique_alerts:\n stats['filtered_out'] += 1\n entry.update({'stage': 'filtered_out', 'triage': None})\n results.append(entry)\n print(f' [{i+1}/{len(raw_alerts)}] - filtered_out ({dms}ms)')\n continue\n\n already = bool(unique_alerts[0].get('dedup_key_already_exists'))\n dedup_key = unique_alerts[0].get('dedup_key', '')\n entry['dedup_key'] = dedup_key\n\n if already:\n # Look up the previous triage result for this dedup_key.\n cached_triage = triage_cache.get(dedup_key)\n if cached_triage:\n stats['duplicate_with_triage'] += 1\n verdict = cached_triage.get('attack_verdict', 'unknown')\n stats['verdict_counts'][verdict] = stats['verdict_counts'].get(verdict, 0) + 1\n entry.update({'stage': 'duplicate_with_triage', 'triage': cached_triage})\n results.append(entry)\n triage_results.append({**entry, **cached_triage})\n print(f' [{i+1}/{len(raw_alerts)}] ↩ duplicate+cached 
({dms}ms) key={dedup_key[:8]} verdict={verdict}')\n else:\n stats['duplicate_skipped'] += 1\n entry.update({'stage': 'duplicate_skipped', 'triage': None})\n results.append(entry)\n print(f' [{i+1}/{len(raw_alerts)}] - duplicate (no cache) ({dms}ms) key={dedup_key[:8]}')\n continue\n\n # ── Step 2: triage (unique, first occurrence) ───────────────\n stats['triage_invoked'] += 1\n tr, tms, terr = _post(\n f'{triage_url}/invoke',\n {'inputs': {'alert_data': alert}},\n triage_timeout,\n )\n if terr or not tr or tr.get('status') != 'SUCCEEDED':\n stats['triage_failed'] += 1\n entry.update({'stage': 'triage_failed', 'triage_ms': tms,\n 'triage_error': terr or tr})\n results.append(entry)\n print(f' [{i+1}/{len(raw_alerts)}] ✗ dedup OK + triage FAILED ({dms}+{tms}ms) {str(terr or \"\")[:80]}')\n continue\n\n stats['triage_success'] += 1\n tout = tr.get('outputs', {})\n verdict = tout.get('attack_verdict', 'unknown')\n stats['verdict_counts'][verdict] = stats['verdict_counts'].get(verdict, 0) + 1\n\n triage_info = {\n 'attack_verdict': verdict,\n 'risk_level': tout.get('risk_level'),\n 'report_title': tout.get('report_title'),\n 'report_path': tout.get('report_path'),\n 'final_report': tout.get('final_report', ''),\n }\n # Store in cache (refresh insertion order so recently-hit keys survive eviction longer).\n if dedup_key in triage_cache:\n del triage_cache[dedup_key]\n triage_cache[dedup_key] = triage_info\n cache_dirty = True\n\n entry.update({'stage': 'triage_done', 'triage_ms': tms, 'triage': triage_info})\n results.append(entry)\n triage_results.append({**entry, **triage_info})\n print(f' [{i+1}/{len(raw_alerts)}] ✓ dedup+triage OK ({dms}+{tms}ms) verdict={verdict} title={(tout.get(\"report_title\") or \"\")[:40]}')\n\n# Persist updated triage cache (only if changed).\nif cache_dirty:\n lock_fh = _acquire_lock(lock_path)\n try:\n evicted = _evict_cache(triage_cache, max_dedup_keys)\n if evicted:\n print(f'[triage_cache] LRU eviction: dropped {evicted} entries 
(max={max_dedup_keys})')\n _save_triage_cache(cache_path, triage_cache)\n finally:\n _release_lock(lock_fh)\n\nstats['triage_cache_size'] = len(triage_cache)\nprint(f'[pipeline] stats={json.dumps(stats, ensure_ascii=False)}')\n\noutputs['results'] = results\noutputs['triage_results'] = triage_results\noutputs['stats'] = stats\n" + "description": "核心循环节点:逐条内嵌调用 http_alert_dedup 工作流去重(不依赖发布服务端口),对首次出现的唯一告警内嵌调用 tdp_alert_triage 工作流研判;重复告警从 triage_cache.pkl 回填历史研判结果(FIFO LRU,max_dedup_keys 可调)。", + "code": "\nimport json\nimport os\nimport pickle\nimport sys\nimport time\n\nIS_WINDOWS = sys.platform == 'win32'\nif IS_WINDOWS:\n import msvcrt # noqa: F401\nelse:\n import fcntl # noqa: F401\n\nWORKFLOW_NAME = 'alert_dedup_triage'\n\n# ── Triage cache helpers ──────────────────────────────────────────────────────\n\ndef _triage_cache_path():\n from flocks.config import Config\n flocks_root = Config().get_global().data_dir.parent\n state_dir = flocks_root / 'workspace' / 'workflows' / WORKFLOW_NAME\n state_dir.mkdir(parents=True, exist_ok=True)\n return str(state_dir / 'triage_cache.pkl'), str(state_dir / 'triage_cache.lock')\n\ndef _acquire_lock(lock_path):\n fh = open(lock_path, 'w+')\n if IS_WINDOWS:\n fh.write('L'); fh.flush(); fh.seek(0)\n while True:\n try:\n msvcrt.locking(fh.fileno(), msvcrt.LK_LOCK, 1); break\n except OSError:\n continue\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_EX)\n return fh\n\ndef _release_lock(fh):\n try:\n if IS_WINDOWS:\n try:\n fh.seek(0); msvcrt.locking(fh.fileno(), msvcrt.LK_UNLCK, 1)\n except OSError:\n pass\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_UN)\n finally:\n fh.close()\n\ndef _load_triage_cache(cache_path):\n if not os.path.exists(cache_path) or os.path.getsize(cache_path) == 0:\n return {}\n try:\n with open(cache_path, 'rb') as f:\n c = pickle.load(f)\n if not isinstance(c, dict):\n return {}\n print(f'[triage_cache] loaded {len(c)} entries from {cache_path}')\n return c\n except Exception as e:\n 
print(f'[triage_cache] failed to load ({e}), starting fresh')\n return {}\n\ndef _save_triage_cache(cache_path, cache):\n tmp = cache_path + '.tmp'\n try:\n with open(tmp, 'wb') as f:\n pickle.dump(cache, f)\n f.flush()\n os.fsync(f.fileno())\n os.replace(tmp, cache_path)\n print(f'[triage_cache] saved {len(cache)} entries -> {cache_path}')\n except Exception as e:\n print(f'[triage_cache] failed to save: {e}')\n if os.path.exists(tmp):\n try: os.remove(tmp)\n except Exception: pass\n\ndef _evict_cache(cache, max_keys):\n excess = len(cache) - max_keys\n if excess > 0:\n for k in list(cache.keys())[:excess]:\n del cache[k]\n return excess\n return 0\n\n# ── Embedded sub-workflow invocation (no HTTP, no published service needed) ──\n\ndef _find_workflow_path(workflow_id):\n from flocks.workflow.fs_store import workflow_scan_dirs\n for root, _ in workflow_scan_dirs():\n p = root / workflow_id / 'workflow.json'\n if p.exists():\n return p\n return None\n\ndef _invoke_workflow(workflow_id, wf_inputs, timeout_s):\n from flocks.workflow.runner import run_workflow as _run_wf\n t0 = time.time()\n try:\n wf_path = _find_workflow_path(workflow_id)\n if not wf_path:\n return None, round((time.time() - t0) * 1000), f'Workflow not found: {workflow_id!r}'\n result = _run_wf(\n workflow=wf_path,\n inputs=wf_inputs,\n timeout_s=float(timeout_s),\n ensure_requirements=False,\n )\n ms = round((time.time() - t0) * 1000)\n if result.status == 'SUCCEEDED':\n return {'status': 'SUCCEEDED', 'outputs': result.outputs}, ms, None\n else:\n return None, ms, result.error or f'status={result.status}'\n except Exception as e:\n return None, round((time.time() - t0) * 1000), str(e)\n\n# ── Main ─────────────────────────────────────────────────────────────────────\n\nraw_alerts = inputs.get('raw_alerts', [])\nsource_log_type = inputs.get('source_log_type', 'tdp')\nfilter_enabled = inputs.get('filter_enabled', True)\ndedup_enabled = inputs.get('dedup_enabled', True)\nthreshold = 
inputs.get('threshold', 0.7)\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_field_len = inputs.get('max_field_len', 500)\nmax_dedup_keys = int(inputs.get('max_dedup_keys', 100000))\ndedup_wf_id = inputs.get('dedup_workflow_id', 'http_alert_dedup')\ntriage_wf_id = inputs.get('triage_workflow_id', 'tdp_alert_triage')\ntriage_timeout_s = int(inputs.get('triage_timeout_s', 300))\ndedup_timeout_s = int(inputs.get('dedup_timeout_s', 60))\n\ndedup_base_inputs = {\n 'source_log_type': source_log_type,\n 'filter_enabled': filter_enabled,\n 'dedup_enabled': dedup_enabled,\n 'threshold': threshold,\n 'strict_fields': strict_fields,\n 'lsh_fields': lsh_fields,\n 'max_field_len': max_field_len,\n 'max_dedup_keys': max_dedup_keys,\n}\n\ncache_path, lock_path = _triage_cache_path()\nlock_fh = _acquire_lock(lock_path)\ntry:\n triage_cache = _load_triage_cache(cache_path)\nfinally:\n _release_lock(lock_fh)\n\nresults = []\ntriage_results = []\ncache_dirty = False\n\nstats = {\n 'total_input': len(raw_alerts),\n 'dedup_failed': 0,\n 'filtered_out': 0,\n 'duplicate_skipped': 0,\n 'duplicate_with_triage': 0,\n 'triage_invoked': 0,\n 'triage_success': 0,\n 'triage_failed': 0,\n 'verdict_counts': {},\n}\n\nfor i, alert in enumerate(raw_alerts):\n entry = {\n 'alert_index': i,\n 'alert_id': alert.get('id') or alert.get('uuid') or alert.get('behave_uuid'),\n 'threat_name': (alert.get('threat') or {}).get('name', ''),\n }\n\n # Step 1: dedup via embedded http_alert_dedup workflow\n dr, dms, derr = _invoke_workflow(\n dedup_wf_id,\n {**dedup_base_inputs, 'alerts': [alert]},\n dedup_timeout_s,\n )\n if derr or not dr or dr.get('status') != 'SUCCEEDED':\n stats['dedup_failed'] += 1\n entry.update({'stage': 'dedup_failed', 'dedup_error': derr or dr, 'dedup_ms': dms})\n results.append(entry)\n print(f' [{i+1}/{len(raw_alerts)}] dedup FAILED ({dms}ms) {str(derr or \"\")[:80]}')\n continue\n\n dout 
= dr.get('outputs', {})\n unique_alerts = dout.get('unique_alerts', [])\n dstats = dout.get('stats', {})\n entry['dedup_ms'] = dms\n entry['filter_removed'] = dstats.get('filter_removed_count', 0)\n entry['lsh_clusters'] = dstats.get('lsh_total_clusters')\n entry['lsh_dedup_keys'] = dstats.get('lsh_total_dedup_keys')\n\n if not unique_alerts:\n stats['filtered_out'] += 1\n entry.update({'stage': 'filtered_out', 'triage': None})\n results.append(entry)\n print(f' [{i+1}/{len(raw_alerts)}] - filtered_out ({dms}ms)')\n continue\n\n already = bool(unique_alerts[0].get('dedup_key_already_exists'))\n dedup_key = unique_alerts[0].get('dedup_key', '')\n entry['dedup_key'] = dedup_key\n\n if already:\n cached_triage = triage_cache.get(dedup_key)\n if cached_triage:\n stats['duplicate_with_triage'] += 1\n verdict = cached_triage.get('attack_verdict', 'unknown')\n stats['verdict_counts'][verdict] = stats['verdict_counts'].get(verdict, 0) + 1\n entry.update({'stage': 'duplicate_with_triage', 'triage': cached_triage})\n results.append(entry)\n triage_results.append({**entry, **cached_triage})\n print(f' [{i+1}/{len(raw_alerts)}] duplicate+cached ({dms}ms) key={dedup_key[:8]} verdict={verdict}')\n else:\n stats['duplicate_skipped'] += 1\n entry.update({'stage': 'duplicate_skipped', 'triage': None})\n results.append(entry)\n print(f' [{i+1}/{len(raw_alerts)}] - duplicate (no cache) ({dms}ms) key={dedup_key[:8]}')\n continue\n\n # Step 2: triage via embedded tdp_alert_triage workflow (unique alerts only)\n stats['triage_invoked'] += 1\n tr, tms, terr = _invoke_workflow(\n triage_wf_id,\n {'alert_data': alert},\n triage_timeout_s,\n )\n if terr or not tr or tr.get('status') != 'SUCCEEDED':\n stats['triage_failed'] += 1\n entry.update({'stage': 'triage_failed', 'triage_ms': tms,\n 'triage_error': terr or tr})\n results.append(entry)\n print(f' [{i+1}/{len(raw_alerts)}] dedup OK + triage FAILED ({dms}+{tms}ms) {str(terr or \"\")[:80]}')\n continue\n\n stats['triage_success'] += 1\n 
tout = tr.get('outputs', {})\n verdict = tout.get('attack_verdict', 'unknown')\n stats['verdict_counts'][verdict] = stats['verdict_counts'].get(verdict, 0) + 1\n\n triage_info = {\n 'attack_verdict': verdict,\n 'risk_level': tout.get('risk_level'),\n 'report_title': tout.get('report_title'),\n 'report_path': tout.get('report_path'),\n 'final_report': tout.get('final_report', ''),\n }\n if dedup_key in triage_cache:\n del triage_cache[dedup_key]\n triage_cache[dedup_key] = triage_info\n cache_dirty = True\n\n entry.update({'stage': 'triage_done', 'triage_ms': tms, 'triage': triage_info})\n results.append(entry)\n triage_results.append({**entry, **triage_info})\n print(f' [{i+1}/{len(raw_alerts)}] dedup+triage OK ({dms}+{tms}ms) verdict={verdict} title={(tout.get(\"report_title\") or \"\")[:40]}')\n\nif cache_dirty:\n lock_fh = _acquire_lock(lock_path)\n try:\n evicted = _evict_cache(triage_cache, max_dedup_keys)\n if evicted:\n print(f'[triage_cache] LRU eviction: dropped {evicted} entries (max={max_dedup_keys})')\n _save_triage_cache(cache_path, triage_cache)\n finally:\n _release_lock(lock_fh)\n\nstats['triage_cache_size'] = len(triage_cache)\nprint(f'[pipeline] stats={json.dumps(stats, ensure_ascii=False)}')\n\noutputs['results'] = results\noutputs['triage_results'] = triage_results\noutputs['stats'] = stats\n" }, { "id": "generate_summary", @@ -42,8 +42,6 @@ "filter_enabled": true, "dedup_enabled": true, "threshold": 0.7, - "dedup_service_url": "http://127.0.0.1:19000", - "triage_service_url": "http://127.0.0.1:19001", "triage_timeout_s": 300, "_comment_alerts": "Pass 'alerts' (list) or 'alert_file' (path to JSON file)", "max_dedup_keys": 100000, @@ -57,7 +55,10 @@ "app_name": "tdp", "message": "{\"id\":\"AZtRkZkzj\",\"net\":{\"http\":{\"url\":\"/admin\"}},\"threat\":{\"name\":\"SQL注入\"}}", "format": "rfc3164" - } + }, + "dedup_workflow_id": "http_alert_dedup", + "triage_workflow_id": "tdp_alert_triage", + "_comment_ids": "Sub-workflow IDs used for embedded 
invocation. No published service required - the engine locates and runs these workflows directly." } } } \ No newline at end of file From 3f49f2b74f4856b07a1bcdc3c6e33124f7c11004 Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Sun, 10 May 2026 17:14:50 +0800 Subject: [PATCH 24/41] style(alert_dedup_triage): translate all comments and UI strings to English - Node description fields: translated to English - generate_summary: verdict_label / stage_label dicts and summary_md template converted from Chinese to English - Code comments in receive_alerts and dedup_and_triage were already English - description_cn field and LLM prompt content in tdp_alert_triage left as-is (intentionally Chinese for LLM interaction) Co-authored-by: Cursor --- .../plugins/workflows/alert_dedup_triage/workflow.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.flocks/plugins/workflows/alert_dedup_triage/workflow.json b/.flocks/plugins/workflows/alert_dedup_triage/workflow.json index 9c809f225..d2e6b521e 100644 --- a/.flocks/plugins/workflows/alert_dedup_triage/workflow.json +++ b/.flocks/plugins/workflows/alert_dedup_triage/workflow.json @@ -7,20 +7,20 @@ { "id": "receive_alerts", "type": "python", - "description": "解析输入告警,支持三种模式(优先级:syslog_message > alerts > alert_file)。输出子工作流 ID(dedup_workflow_id / triage_workflow_id),由 dedup_and_triage 节点内嵌调用,无需发布服务。", + "description": "Parse incoming alerts. Supports three input modes in priority order: syslog_message (injected by flocks syslog listener; TDP alert JSON in .message field), alerts (list, batch), alert_file (local JSON path). 
Outputs sub-workflow IDs for embedded invocation by dedup_and_triage.", "code": "\nimport json\nimport os\n\n# Input priority: syslog_message > alerts > alert_file\n# syslog mode: flocks ingest/syslog/manager.py injects parsed syslog dict as inputKey (default: syslog_message)\n# Structure: {raw, facility, severity, timestamp, hostname, app_name, message, format}\n# TDP alert JSON is in the message field.\n\nalerts_input = []\ninput_mode = 'unknown'\n\nsyslog_msg = inputs.get('syslog_message') or inputs.get('syslog')\nif syslog_msg and isinstance(syslog_msg, dict):\n raw_text = syslog_msg.get('message', '')\n if raw_text:\n try:\n alert = json.loads(raw_text)\n alert['_syslog_meta'] = {\n 'hostname': syslog_msg.get('hostname', ''),\n 'app_name': syslog_msg.get('app_name', ''),\n 'timestamp': syslog_msg.get('timestamp', ''),\n 'severity': syslog_msg.get('severity'),\n 'facility': syslog_msg.get('facility'),\n 'format': syslog_msg.get('format', ''),\n }\n alerts_input = [alert]\n input_mode = 'syslog'\n print(f'[receive] syslog mode: host={syslog_msg.get(\"hostname\")!r} '\n f'app={syslog_msg.get(\"app_name\")!r} '\n f'severity={syslog_msg.get(\"severity\")} '\n f'format={syslog_msg.get(\"format\")!r}')\n except (json.JSONDecodeError, TypeError) as _e:\n print(f'[receive] WARNING: syslog.message is not valid JSON ({_e}), '\n f'raw={raw_text[:120]!r}')\n else:\n print('[receive] WARNING: syslog_message.message is empty, skipping')\n\nif not alerts_input:\n alerts_input = inputs.get('alerts', inputs.get('alert_list', []))\n if alerts_input:\n input_mode = 'alerts'\n\nif not alerts_input:\n alert_file = inputs.get('alert_file', '')\n if alert_file:\n alert_file = os.path.expanduser(str(alert_file))\n try:\n with open(alert_file, 'r', encoding='utf-8') as _f:\n alerts_input = json.load(_f)\n input_mode = 'alert_file'\n print(f'[receive] file mode: loaded {len(alerts_input) if isinstance(alerts_input, list) else 1} alerts from {alert_file}')\n except Exception as _e:\n 
print(f'[receive] WARNING: failed to load alert_file={alert_file!r}: {_e}')\n alerts_input = []\n\nif isinstance(alerts_input, str):\n try:\n alerts_input = json.loads(alerts_input)\n except Exception:\n alerts_input = []\nif isinstance(alerts_input, dict) and 'data' in alerts_input:\n alerts_input = alerts_input.get('data', [])\nif not isinstance(alerts_input, list):\n alerts_input = [alerts_input] if alerts_input else []\n\nif not alerts_input:\n print('[receive] WARNING: no alerts to process (syslog_message, alerts, alert_file all empty)')\n\nsource_log_type = str(inputs.get('source_log_type', 'tdp')).lower()\nif source_log_type not in ('tdp', 'skyeye'):\n source_log_type = 'tdp'\n\nfilter_enabled = bool(inputs.get('filter_enabled', True))\ndedup_enabled = bool(inputs.get('dedup_enabled', True))\nthreshold = float(inputs.get('threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_field_len = int(inputs.get('max_field_len', 500))\nmax_dedup_keys = int(inputs.get('max_dedup_keys', 100000))\nif max_dedup_keys < 1:\n max_dedup_keys = 100000\n\nprint(f'[receive] input_mode={input_mode} raw_alerts={len(alerts_input)} '\n f'source_log_type={source_log_type} max_dedup_keys={max_dedup_keys}')\n\noutputs['raw_alerts'] = alerts_input\noutputs['input_mode'] = input_mode\noutputs['source_log_type'] = source_log_type\noutputs['filter_enabled'] = filter_enabled\noutputs['dedup_enabled'] = dedup_enabled\noutputs['threshold'] = threshold\noutputs['strict_fields'] = strict_fields\noutputs['lsh_fields'] = lsh_fields\noutputs['max_field_len'] = max_field_len\noutputs['max_dedup_keys'] = max_dedup_keys\n# Sub-workflow IDs: engine embeds them directly; no service URL needed\noutputs['dedup_workflow_id'] = inputs.get('dedup_workflow_id', 'http_alert_dedup')\noutputs['triage_workflow_id'] = inputs.get('triage_workflow_id', 'tdp_alert_triage')\n" }, { "id": "dedup_and_triage", 
"type": "python", - "description": "核心循环节点:逐条内嵌调用 http_alert_dedup 工作流去重(不依赖发布服务端口),对首次出现的唯一告警内嵌调用 tdp_alert_triage 工作流研判;重复告警从 triage_cache.pkl 回填历史研判结果(FIFO LRU,max_dedup_keys 可调)。", + "description": "Core loop: for each alert, invoke http_alert_dedup in-process (no published service required) to deduplicate; for first-seen unique alerts invoke tdp_alert_triage in-process for LLM triage. Duplicate alerts are served from triage_cache.pkl (FIFO LRU, max_dedup_keys configurable).", "code": "\nimport json\nimport os\nimport pickle\nimport sys\nimport time\n\nIS_WINDOWS = sys.platform == 'win32'\nif IS_WINDOWS:\n import msvcrt # noqa: F401\nelse:\n import fcntl # noqa: F401\n\nWORKFLOW_NAME = 'alert_dedup_triage'\n\n# ── Triage cache helpers ──────────────────────────────────────────────────────\n\ndef _triage_cache_path():\n from flocks.config import Config\n flocks_root = Config().get_global().data_dir.parent\n state_dir = flocks_root / 'workspace' / 'workflows' / WORKFLOW_NAME\n state_dir.mkdir(parents=True, exist_ok=True)\n return str(state_dir / 'triage_cache.pkl'), str(state_dir / 'triage_cache.lock')\n\ndef _acquire_lock(lock_path):\n fh = open(lock_path, 'w+')\n if IS_WINDOWS:\n fh.write('L'); fh.flush(); fh.seek(0)\n while True:\n try:\n msvcrt.locking(fh.fileno(), msvcrt.LK_LOCK, 1); break\n except OSError:\n continue\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_EX)\n return fh\n\ndef _release_lock(fh):\n try:\n if IS_WINDOWS:\n try:\n fh.seek(0); msvcrt.locking(fh.fileno(), msvcrt.LK_UNLCK, 1)\n except OSError:\n pass\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_UN)\n finally:\n fh.close()\n\ndef _load_triage_cache(cache_path):\n if not os.path.exists(cache_path) or os.path.getsize(cache_path) == 0:\n return {}\n try:\n with open(cache_path, 'rb') as f:\n c = pickle.load(f)\n if not isinstance(c, dict):\n return {}\n print(f'[triage_cache] loaded {len(c)} entries from {cache_path}')\n return c\n except Exception as e:\n print(f'[triage_cache] failed to 
load ({e}), starting fresh')\n return {}\n\ndef _save_triage_cache(cache_path, cache):\n tmp = cache_path + '.tmp'\n try:\n with open(tmp, 'wb') as f:\n pickle.dump(cache, f)\n f.flush()\n os.fsync(f.fileno())\n os.replace(tmp, cache_path)\n print(f'[triage_cache] saved {len(cache)} entries -> {cache_path}')\n except Exception as e:\n print(f'[triage_cache] failed to save: {e}')\n if os.path.exists(tmp):\n try: os.remove(tmp)\n except Exception: pass\n\ndef _evict_cache(cache, max_keys):\n excess = len(cache) - max_keys\n if excess > 0:\n for k in list(cache.keys())[:excess]:\n del cache[k]\n return excess\n return 0\n\n# ── Embedded sub-workflow invocation (no HTTP, no published service needed) ──\n\ndef _find_workflow_path(workflow_id):\n from flocks.workflow.fs_store import workflow_scan_dirs\n for root, _ in workflow_scan_dirs():\n p = root / workflow_id / 'workflow.json'\n if p.exists():\n return p\n return None\n\ndef _invoke_workflow(workflow_id, wf_inputs, timeout_s):\n from flocks.workflow.runner import run_workflow as _run_wf\n t0 = time.time()\n try:\n wf_path = _find_workflow_path(workflow_id)\n if not wf_path:\n return None, round((time.time() - t0) * 1000), f'Workflow not found: {workflow_id!r}'\n result = _run_wf(\n workflow=wf_path,\n inputs=wf_inputs,\n timeout_s=float(timeout_s),\n ensure_requirements=False,\n )\n ms = round((time.time() - t0) * 1000)\n if result.status == 'SUCCEEDED':\n return {'status': 'SUCCEEDED', 'outputs': result.outputs}, ms, None\n else:\n return None, ms, result.error or f'status={result.status}'\n except Exception as e:\n return None, round((time.time() - t0) * 1000), str(e)\n\n# ── Main ─────────────────────────────────────────────────────────────────────\n\nraw_alerts = inputs.get('raw_alerts', [])\nsource_log_type = inputs.get('source_log_type', 'tdp')\nfilter_enabled = inputs.get('filter_enabled', True)\ndedup_enabled = inputs.get('dedup_enabled', True)\nthreshold = inputs.get('threshold', 0.7)\nstrict_fields = 
inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_field_len = inputs.get('max_field_len', 500)\nmax_dedup_keys = int(inputs.get('max_dedup_keys', 100000))\ndedup_wf_id = inputs.get('dedup_workflow_id', 'http_alert_dedup')\ntriage_wf_id = inputs.get('triage_workflow_id', 'tdp_alert_triage')\ntriage_timeout_s = int(inputs.get('triage_timeout_s', 300))\ndedup_timeout_s = int(inputs.get('dedup_timeout_s', 60))\n\ndedup_base_inputs = {\n 'source_log_type': source_log_type,\n 'filter_enabled': filter_enabled,\n 'dedup_enabled': dedup_enabled,\n 'threshold': threshold,\n 'strict_fields': strict_fields,\n 'lsh_fields': lsh_fields,\n 'max_field_len': max_field_len,\n 'max_dedup_keys': max_dedup_keys,\n}\n\ncache_path, lock_path = _triage_cache_path()\nlock_fh = _acquire_lock(lock_path)\ntry:\n triage_cache = _load_triage_cache(cache_path)\nfinally:\n _release_lock(lock_fh)\n\nresults = []\ntriage_results = []\ncache_dirty = False\n\nstats = {\n 'total_input': len(raw_alerts),\n 'dedup_failed': 0,\n 'filtered_out': 0,\n 'duplicate_skipped': 0,\n 'duplicate_with_triage': 0,\n 'triage_invoked': 0,\n 'triage_success': 0,\n 'triage_failed': 0,\n 'verdict_counts': {},\n}\n\nfor i, alert in enumerate(raw_alerts):\n entry = {\n 'alert_index': i,\n 'alert_id': alert.get('id') or alert.get('uuid') or alert.get('behave_uuid'),\n 'threat_name': (alert.get('threat') or {}).get('name', ''),\n }\n\n # Step 1: dedup via embedded http_alert_dedup workflow\n dr, dms, derr = _invoke_workflow(\n dedup_wf_id,\n {**dedup_base_inputs, 'alerts': [alert]},\n dedup_timeout_s,\n )\n if derr or not dr or dr.get('status') != 'SUCCEEDED':\n stats['dedup_failed'] += 1\n entry.update({'stage': 'dedup_failed', 'dedup_error': derr or dr, 'dedup_ms': dms})\n results.append(entry)\n print(f' [{i+1}/{len(raw_alerts)}] dedup FAILED ({dms}ms) {str(derr or \"\")[:80]}')\n continue\n\n dout = dr.get('outputs', {})\n unique_alerts = 
dout.get('unique_alerts', [])\n dstats = dout.get('stats', {})\n entry['dedup_ms'] = dms\n entry['filter_removed'] = dstats.get('filter_removed_count', 0)\n entry['lsh_clusters'] = dstats.get('lsh_total_clusters')\n entry['lsh_dedup_keys'] = dstats.get('lsh_total_dedup_keys')\n\n if not unique_alerts:\n stats['filtered_out'] += 1\n entry.update({'stage': 'filtered_out', 'triage': None})\n results.append(entry)\n print(f' [{i+1}/{len(raw_alerts)}] - filtered_out ({dms}ms)')\n continue\n\n already = bool(unique_alerts[0].get('dedup_key_already_exists'))\n dedup_key = unique_alerts[0].get('dedup_key', '')\n entry['dedup_key'] = dedup_key\n\n if already:\n cached_triage = triage_cache.get(dedup_key)\n if cached_triage:\n stats['duplicate_with_triage'] += 1\n verdict = cached_triage.get('attack_verdict', 'unknown')\n stats['verdict_counts'][verdict] = stats['verdict_counts'].get(verdict, 0) + 1\n entry.update({'stage': 'duplicate_with_triage', 'triage': cached_triage})\n results.append(entry)\n triage_results.append({**entry, **cached_triage})\n print(f' [{i+1}/{len(raw_alerts)}] duplicate+cached ({dms}ms) key={dedup_key[:8]} verdict={verdict}')\n else:\n stats['duplicate_skipped'] += 1\n entry.update({'stage': 'duplicate_skipped', 'triage': None})\n results.append(entry)\n print(f' [{i+1}/{len(raw_alerts)}] - duplicate (no cache) ({dms}ms) key={dedup_key[:8]}')\n continue\n\n # Step 2: triage via embedded tdp_alert_triage workflow (unique alerts only)\n stats['triage_invoked'] += 1\n tr, tms, terr = _invoke_workflow(\n triage_wf_id,\n {'alert_data': alert},\n triage_timeout_s,\n )\n if terr or not tr or tr.get('status') != 'SUCCEEDED':\n stats['triage_failed'] += 1\n entry.update({'stage': 'triage_failed', 'triage_ms': tms,\n 'triage_error': terr or tr})\n results.append(entry)\n print(f' [{i+1}/{len(raw_alerts)}] dedup OK + triage FAILED ({dms}+{tms}ms) {str(terr or \"\")[:80]}')\n continue\n\n stats['triage_success'] += 1\n tout = tr.get('outputs', {})\n verdict = 
tout.get('attack_verdict', 'unknown')\n stats['verdict_counts'][verdict] = stats['verdict_counts'].get(verdict, 0) + 1\n\n triage_info = {\n 'attack_verdict': verdict,\n 'risk_level': tout.get('risk_level'),\n 'report_title': tout.get('report_title'),\n 'report_path': tout.get('report_path'),\n 'final_report': tout.get('final_report', ''),\n }\n if dedup_key in triage_cache:\n del triage_cache[dedup_key]\n triage_cache[dedup_key] = triage_info\n cache_dirty = True\n\n entry.update({'stage': 'triage_done', 'triage_ms': tms, 'triage': triage_info})\n results.append(entry)\n triage_results.append({**entry, **triage_info})\n print(f' [{i+1}/{len(raw_alerts)}] dedup+triage OK ({dms}+{tms}ms) verdict={verdict} title={(tout.get(\"report_title\") or \"\")[:40]}')\n\nif cache_dirty:\n lock_fh = _acquire_lock(lock_path)\n try:\n evicted = _evict_cache(triage_cache, max_dedup_keys)\n if evicted:\n print(f'[triage_cache] LRU eviction: dropped {evicted} entries (max={max_dedup_keys})')\n _save_triage_cache(cache_path, triage_cache)\n finally:\n _release_lock(lock_fh)\n\nstats['triage_cache_size'] = len(triage_cache)\nprint(f'[pipeline] stats={json.dumps(stats, ensure_ascii=False)}')\n\noutputs['results'] = results\noutputs['triage_results'] = triage_results\noutputs['stats'] = stats\n" }, { "id": "generate_summary", "type": "python", - "description": "汇总节点:聚合所有研判结果,生成 pipeline_summary.md,并输出最高风险告警的研判标签(与 tdp_alert_triage 输出字段兼容)", - "code": "\nimport json\nimport datetime\n\nresults = inputs.get('results', [])\ntriage_results = inputs.get('triage_results', [])\nstats = inputs.get('stats', {})\n\nVERDICT_ORDER = {'attack_success': 5, 'attack': 4, 'attack_failed': 3, 'unknown': 2, 'benign': 1}\n\n# Pick the highest-risk triage result (includes duplicates with cached results).\ntop = None\nfor r in triage_results:\n if top is None:\n top = r\n elif VERDICT_ORDER.get(r.get('attack_verdict', ''), 0) > VERDICT_ORDER.get(top.get('attack_verdict', ''), 0):\n top = r\n\nfinal_report = 
top.get('final_report', '') if top else ''\nreport_title = top.get('report_title', '') if top else ''\nreport_path = top.get('report_path', '') if top else ''\nattack_verdict = top.get('attack_verdict', '') if top else ''\nrisk_level = top.get('risk_level', '') if top else ''\n\ntoday = datetime.date.today().isoformat()\nverdict_cn = {\n 'attack_success': '攻击成功', 'attack_failed': '攻击失败',\n 'attack': '攻击', 'unknown': '未知', 'benign': '安全',\n}\nstage_cn = {\n 'triage_done': '研判完成',\n 'duplicate_with_triage': '重复(缓存)',\n 'duplicate_skipped': '重复(跳过)',\n 'filtered_out': '已过滤',\n 'dedup_failed': '去重失败',\n 'triage_failed': '研判失败',\n}\n\nrows = []\nfor r in results:\n stage = r.get('stage', '')\n triage = r.get('triage')\n verdict = triage.get('attack_verdict', '-') if isinstance(triage, dict) else '-'\n title = triage.get('report_title', '-') if isinstance(triage, dict) else '-'\n cache_mark = ' ↩' if stage == 'duplicate_with_triage' else ''\n rows.append(\n f\"| {r.get('alert_index', 0) + 1} \"\n f\"| {(r.get('threat_name') or '')[:30]} \"\n f\"| {stage_cn.get(stage, stage)}{cache_mark} \"\n f\"| {verdict_cn.get(verdict, verdict)} \"\n f\"| {(title or '')[:30]} |\"\n )\n\nsummary_md = (\n f'# 告警去重研判汇总报告\\n\\n'\n f'**处理日期**: {today}\\n\\n'\n f'## 统计\\n'\n f'- 总输入: {stats.get(\"total_input\", 0)}\\n'\n f'- 新增研判: {stats.get(\"triage_success\", 0)}\\n'\n f'- 重复(含缓存研判): {stats.get(\"duplicate_with_triage\", 0)}\\n'\n f'- 重复(无缓存): {stats.get(\"duplicate_skipped\", 0)}\\n'\n f'- 过滤掉: {stats.get(\"filtered_out\", 0)}\\n'\n f'- 研判失败: {stats.get(\"triage_failed\", 0)}\\n'\n f'- 研判缓存条数: {stats.get(\"triage_cache_size\", 0)}\\n'\n f'- 攻击判定分布: {json.dumps(stats.get(\"verdict_counts\", {}), ensure_ascii=False)}\\n\\n'\n f'## 明细\\n\\n'\n f'| # | 告警类型 | 阶段 | 攻击判定 | 研判标题 |\\n'\n f'|---|---------|------|---------|--------|\\n'\n + '\\n'.join(rows) + '\\n\\n'\n + (f'## 最高风险告警研判报告\\n\\n{final_report}\\n' if final_report else '')\n)\n\ntry:\n out_path = get_path('pipeline_summary.md')\n 
import os\n os.makedirs(os.path.dirname(str(out_path)), exist_ok=True)\n with open(str(out_path), 'w', encoding='utf-8') as _f:\n _f.write(summary_md)\n summary_path = str(out_path)\n print(f'[summary] written to {summary_path}')\nexcept Exception as _e:\n print(f'[summary] WARNING: could not write summary file: {_e}')\n summary_path = ''\n\nprint(f'[summary] triage_success={stats.get(\"triage_success\", 0)}, '\n f'duplicate_with_triage={stats.get(\"duplicate_with_triage\", 0)}, '\n f'top_verdict={attack_verdict}')\n\noutputs['final_reports'] = [r.get('triage', {}).get('final_report', '') for r in triage_results if isinstance(r.get('triage'), dict)]\noutputs['triage_results'] = triage_results\noutputs['stats'] = stats\noutputs['summary_report'] = summary_md\noutputs['report_path'] = summary_path\noutputs['final_report'] = final_report\noutputs['report_title'] = report_title\noutputs['attack_verdict'] = attack_verdict\noutputs['risk_level'] = risk_level\n" + "description": "Aggregate all triage results, write pipeline_summary.md, and expose the highest-risk alert's triage labels (attack_verdict, risk_level, report_title, final_report) for downstream compatibility with tdp_alert_triage outputs.", + "code": "\nimport json\nimport datetime\n\nresults = inputs.get('results', [])\ntriage_results = inputs.get('triage_results', [])\nstats = inputs.get('stats', {})\n\nVERDICT_ORDER = {'attack_success': 5, 'attack': 4, 'attack_failed': 3, 'unknown': 2, 'benign': 1}\n\n# Pick the highest-risk triage result (includes duplicates with cached results).\ntop = None\nfor r in triage_results:\n if top is None:\n top = r\n elif VERDICT_ORDER.get(r.get('attack_verdict', ''), 0) > VERDICT_ORDER.get(top.get('attack_verdict', ''), 0):\n top = r\n\nfinal_report = top.get('final_report', '') if top else ''\nreport_title = top.get('report_title', '') if top else ''\nreport_path = top.get('report_path', '') if top else ''\nattack_verdict = top.get('attack_verdict', '') if top else 
''\nrisk_level = top.get('risk_level', '') if top else ''\n\ntoday = datetime.date.today().isoformat()\nverdict_label = {\n 'attack_success': 'Attack Success',\n 'attack_failed': 'Attack Failed',\n 'attack': 'Attack',\n 'unknown': 'Unknown',\n 'benign': 'Benign',\n}\nstage_label = {\n 'triage_done': 'Triaged',\n 'duplicate_with_triage': 'Duplicate (cached)',\n 'duplicate_skipped': 'Duplicate (skipped)',\n 'filtered_out': 'Filtered',\n 'dedup_failed': 'Dedup Failed',\n 'triage_failed': 'Triage Failed',\n}\n\nrows = []\nfor r in results:\n stage = r.get('stage', '')\n triage = r.get('triage')\n verdict = triage.get('attack_verdict', '-') if isinstance(triage, dict) else '-'\n title = triage.get('report_title', '-') if isinstance(triage, dict) else '-'\n cache_mark = ' (cached)' if stage == 'duplicate_with_triage' else ''\n rows.append(\n f\"| {r.get('alert_index', 0) + 1} \"\n f\"| {(r.get('threat_name') or '')[:30]} \"\n f\"| {stage_label.get(stage, stage)}{cache_mark} \"\n f\"| {verdict_label.get(verdict, verdict)} \"\n f\"| {(title or '')[:30]} |\"\n )\n\nsummary_md = (\n f'# Alert Dedup & Triage Summary\\n\\n'\n f'**Date**: {today}\\n\\n'\n f'## Statistics\\n'\n f'- Total input: {stats.get(\"total_input\", 0)}\\n'\n f'- New triages: {stats.get(\"triage_success\", 0)}\\n'\n f'- Duplicates (cached triage): {stats.get(\"duplicate_with_triage\", 0)}\\n'\n f'- Duplicates (no cache): {stats.get(\"duplicate_skipped\", 0)}\\n'\n f'- Filtered out: {stats.get(\"filtered_out\", 0)}\\n'\n f'- Triage failed: {stats.get(\"triage_failed\", 0)}\\n'\n f'- Triage cache size: {stats.get(\"triage_cache_size\", 0)}\\n'\n f'- Verdict distribution: {json.dumps(stats.get(\"verdict_counts\", {}), ensure_ascii=False)}\\n\\n'\n f'## Details\\n\\n'\n f'| # | Threat | Stage | Verdict | Report Title |\\n'\n f'|---|--------|-------|---------|--------------|\\n'\n + '\\n'.join(rows) + '\\n\\n'\n + (f'## Top-Risk Alert Report\\n\\n{final_report}\\n' if final_report else '')\n)\n\ntry:\n out_path 
= get_path('pipeline_summary.md')\n import os\n os.makedirs(os.path.dirname(str(out_path)), exist_ok=True)\n with open(str(out_path), 'w', encoding='utf-8') as _f:\n _f.write(summary_md)\n summary_path = str(out_path)\n print(f'[summary] written to {summary_path}')\nexcept Exception as _e:\n print(f'[summary] WARNING: could not write summary file: {_e}')\n summary_path = ''\n\nprint(f'[summary] triage_success={stats.get(\"triage_success\", 0)}, '\n f'duplicate_with_triage={stats.get(\"duplicate_with_triage\", 0)}, '\n f'top_verdict={attack_verdict}')\n\noutputs['final_reports'] = [r.get('triage', {}).get('final_report', '') for r in triage_results if isinstance(r.get('triage'), dict)]\noutputs['triage_results'] = triage_results\noutputs['stats'] = stats\noutputs['summary_report'] = summary_md\noutputs['report_path'] = summary_path\noutputs['final_report'] = final_report\noutputs['report_title'] = report_title\noutputs['attack_verdict'] = attack_verdict\noutputs['risk_level'] = risk_level\n" } ], "edges": [ From ce8ef7737c04e54a885473dcad6df79b394120b8 Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Sun, 10 May 2026 17:24:49 +0800 Subject: [PATCH 25/41] fix(generate_summary): remove duplicate '(cached)' label in summary table Co-authored-by: Cursor --- .flocks/plugins/workflows/alert_dedup_triage/workflow.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.flocks/plugins/workflows/alert_dedup_triage/workflow.json b/.flocks/plugins/workflows/alert_dedup_triage/workflow.json index d2e6b521e..5ed3c5d3c 100644 --- a/.flocks/plugins/workflows/alert_dedup_triage/workflow.json +++ b/.flocks/plugins/workflows/alert_dedup_triage/workflow.json @@ -20,7 +20,7 @@ "id": "generate_summary", "type": "python", "description": "Aggregate all triage results, write pipeline_summary.md, and expose the highest-risk alert's triage labels (attack_verdict, risk_level, report_title, final_report) for downstream compatibility with tdp_alert_triage outputs.", - "code": 
"\nimport json\nimport datetime\n\nresults = inputs.get('results', [])\ntriage_results = inputs.get('triage_results', [])\nstats = inputs.get('stats', {})\n\nVERDICT_ORDER = {'attack_success': 5, 'attack': 4, 'attack_failed': 3, 'unknown': 2, 'benign': 1}\n\n# Pick the highest-risk triage result (includes duplicates with cached results).\ntop = None\nfor r in triage_results:\n if top is None:\n top = r\n elif VERDICT_ORDER.get(r.get('attack_verdict', ''), 0) > VERDICT_ORDER.get(top.get('attack_verdict', ''), 0):\n top = r\n\nfinal_report = top.get('final_report', '') if top else ''\nreport_title = top.get('report_title', '') if top else ''\nreport_path = top.get('report_path', '') if top else ''\nattack_verdict = top.get('attack_verdict', '') if top else ''\nrisk_level = top.get('risk_level', '') if top else ''\n\ntoday = datetime.date.today().isoformat()\nverdict_label = {\n 'attack_success': 'Attack Success',\n 'attack_failed': 'Attack Failed',\n 'attack': 'Attack',\n 'unknown': 'Unknown',\n 'benign': 'Benign',\n}\nstage_label = {\n 'triage_done': 'Triaged',\n 'duplicate_with_triage': 'Duplicate (cached)',\n 'duplicate_skipped': 'Duplicate (skipped)',\n 'filtered_out': 'Filtered',\n 'dedup_failed': 'Dedup Failed',\n 'triage_failed': 'Triage Failed',\n}\n\nrows = []\nfor r in results:\n stage = r.get('stage', '')\n triage = r.get('triage')\n verdict = triage.get('attack_verdict', '-') if isinstance(triage, dict) else '-'\n title = triage.get('report_title', '-') if isinstance(triage, dict) else '-'\n cache_mark = ' (cached)' if stage == 'duplicate_with_triage' else ''\n rows.append(\n f\"| {r.get('alert_index', 0) + 1} \"\n f\"| {(r.get('threat_name') or '')[:30]} \"\n f\"| {stage_label.get(stage, stage)}{cache_mark} \"\n f\"| {verdict_label.get(verdict, verdict)} \"\n f\"| {(title or '')[:30]} |\"\n )\n\nsummary_md = (\n f'# Alert Dedup & Triage Summary\\n\\n'\n f'**Date**: {today}\\n\\n'\n f'## Statistics\\n'\n f'- Total input: {stats.get(\"total_input\", 
0)}\\n'\n f'- New triages: {stats.get(\"triage_success\", 0)}\\n'\n f'- Duplicates (cached triage): {stats.get(\"duplicate_with_triage\", 0)}\\n'\n f'- Duplicates (no cache): {stats.get(\"duplicate_skipped\", 0)}\\n'\n f'- Filtered out: {stats.get(\"filtered_out\", 0)}\\n'\n f'- Triage failed: {stats.get(\"triage_failed\", 0)}\\n'\n f'- Triage cache size: {stats.get(\"triage_cache_size\", 0)}\\n'\n f'- Verdict distribution: {json.dumps(stats.get(\"verdict_counts\", {}), ensure_ascii=False)}\\n\\n'\n f'## Details\\n\\n'\n f'| # | Threat | Stage | Verdict | Report Title |\\n'\n f'|---|--------|-------|---------|--------------|\\n'\n + '\\n'.join(rows) + '\\n\\n'\n + (f'## Top-Risk Alert Report\\n\\n{final_report}\\n' if final_report else '')\n)\n\ntry:\n out_path = get_path('pipeline_summary.md')\n import os\n os.makedirs(os.path.dirname(str(out_path)), exist_ok=True)\n with open(str(out_path), 'w', encoding='utf-8') as _f:\n _f.write(summary_md)\n summary_path = str(out_path)\n print(f'[summary] written to {summary_path}')\nexcept Exception as _e:\n print(f'[summary] WARNING: could not write summary file: {_e}')\n summary_path = ''\n\nprint(f'[summary] triage_success={stats.get(\"triage_success\", 0)}, '\n f'duplicate_with_triage={stats.get(\"duplicate_with_triage\", 0)}, '\n f'top_verdict={attack_verdict}')\n\noutputs['final_reports'] = [r.get('triage', {}).get('final_report', '') for r in triage_results if isinstance(r.get('triage'), dict)]\noutputs['triage_results'] = triage_results\noutputs['stats'] = stats\noutputs['summary_report'] = summary_md\noutputs['report_path'] = summary_path\noutputs['final_report'] = final_report\noutputs['report_title'] = report_title\noutputs['attack_verdict'] = attack_verdict\noutputs['risk_level'] = risk_level\n" + "code": "\nimport json\nimport datetime\n\nresults = inputs.get('results', [])\ntriage_results = inputs.get('triage_results', [])\nstats = inputs.get('stats', {})\n\nVERDICT_ORDER = {'attack_success': 5, 'attack': 4, 
'attack_failed': 3, 'unknown': 2, 'benign': 1}\n\n# Pick the highest-risk triage result (includes duplicates with cached results).\ntop = None\nfor r in triage_results:\n if top is None:\n top = r\n elif VERDICT_ORDER.get(r.get('attack_verdict', ''), 0) > VERDICT_ORDER.get(top.get('attack_verdict', ''), 0):\n top = r\n\nfinal_report = top.get('final_report', '') if top else ''\nreport_title = top.get('report_title', '') if top else ''\nreport_path = top.get('report_path', '') if top else ''\nattack_verdict = top.get('attack_verdict', '') if top else ''\nrisk_level = top.get('risk_level', '') if top else ''\n\ntoday = datetime.date.today().isoformat()\nverdict_label = {\n 'attack_success': 'Attack Success',\n 'attack_failed': 'Attack Failed',\n 'attack': 'Attack',\n 'unknown': 'Unknown',\n 'benign': 'Benign',\n}\nstage_label = {\n 'triage_done': 'Triaged',\n 'duplicate_with_triage': 'Duplicate (cached)',\n 'duplicate_skipped': 'Duplicate (skipped)',\n 'filtered_out': 'Filtered',\n 'dedup_failed': 'Dedup Failed',\n 'triage_failed': 'Triage Failed',\n}\n\nrows = []\nfor r in results:\n stage = r.get('stage', '')\n triage = r.get('triage')\n verdict = triage.get('attack_verdict', '-') if isinstance(triage, dict) else '-'\n title = triage.get('report_title', '-') if isinstance(triage, dict) else '-'\n cache_mark = '' # stage_label already includes cache indicator\n rows.append(\n f\"| {r.get('alert_index', 0) + 1} \"\n f\"| {(r.get('threat_name') or '')[:30]} \"\n f\"| {stage_label.get(stage, stage)}{cache_mark} \"\n f\"| {verdict_label.get(verdict, verdict)} \"\n f\"| {(title or '')[:30]} |\"\n )\n\nsummary_md = (\n f'# Alert Dedup & Triage Summary\\n\\n'\n f'**Date**: {today}\\n\\n'\n f'## Statistics\\n'\n f'- Total input: {stats.get(\"total_input\", 0)}\\n'\n f'- New triages: {stats.get(\"triage_success\", 0)}\\n'\n f'- Duplicates (cached triage): {stats.get(\"duplicate_with_triage\", 0)}\\n'\n f'- Duplicates (no cache): {stats.get(\"duplicate_skipped\", 0)}\\n'\n 
f'- Filtered out: {stats.get(\"filtered_out\", 0)}\\n'\n f'- Triage failed: {stats.get(\"triage_failed\", 0)}\\n'\n f'- Triage cache size: {stats.get(\"triage_cache_size\", 0)}\\n'\n f'- Verdict distribution: {json.dumps(stats.get(\"verdict_counts\", {}), ensure_ascii=False)}\\n\\n'\n f'## Details\\n\\n'\n f'| # | Threat | Stage | Verdict | Report Title |\\n'\n f'|---|--------|-------|---------|--------------|\\n'\n + '\\n'.join(rows) + '\\n\\n'\n + (f'## Top-Risk Alert Report\\n\\n{final_report}\\n' if final_report else '')\n)\n\ntry:\n out_path = get_path('pipeline_summary.md')\n import os\n os.makedirs(os.path.dirname(str(out_path)), exist_ok=True)\n with open(str(out_path), 'w', encoding='utf-8') as _f:\n _f.write(summary_md)\n summary_path = str(out_path)\n print(f'[summary] written to {summary_path}')\nexcept Exception as _e:\n print(f'[summary] WARNING: could not write summary file: {_e}')\n summary_path = ''\n\nprint(f'[summary] triage_success={stats.get(\"triage_success\", 0)}, '\n f'duplicate_with_triage={stats.get(\"duplicate_with_triage\", 0)}, '\n f'top_verdict={attack_verdict}')\n\noutputs['final_reports'] = [r.get('triage', {}).get('final_report', '') for r in triage_results if isinstance(r.get('triage'), dict)]\noutputs['triage_results'] = triage_results\noutputs['stats'] = stats\noutputs['summary_report'] = summary_md\noutputs['report_path'] = summary_path\noutputs['final_report'] = final_report\noutputs['report_title'] = report_title\noutputs['attack_verdict'] = attack_verdict\noutputs['risk_level'] = risk_level\n" } ], "edges": [ From af6d1426f905afe4e1a49ddbb463b3351fa74929 Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Sun, 10 May 2026 18:55:38 +0800 Subject: [PATCH 26/41] feat(alert_dedup_triage): skip summary in syslog mode, generate full report in batch mode Add branch_output_mode node after dedup_and_triage: - input_mode == 'syslog' -> direct_output: returns single-alert triage result directly (one-liner summary_report, no table, no file 
write) - input_mode == 'alerts' | 'alert_file' -> generate_summary: full statistics table + top-risk report written to pipeline_summary.md Both paths emit identical output field names for downstream compatibility. Co-authored-by: Cursor --- .../alert_dedup_triage/workflow.json | 29 +++++++++++++++++-- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/.flocks/plugins/workflows/alert_dedup_triage/workflow.json b/.flocks/plugins/workflows/alert_dedup_triage/workflow.json index 5ed3c5d3c..37701ac17 100644 --- a/.flocks/plugins/workflows/alert_dedup_triage/workflow.json +++ b/.flocks/plugins/workflows/alert_dedup_triage/workflow.json @@ -1,7 +1,7 @@ { "name": "alert_dedup_triage", - "description": "Chained pipeline: http_alert_dedup → tdp_alert_triage. Accepts alerts via syslog (real-time, single message), alerts list (batch), or alert_file path. Deduplicates with MinHash LSH; runs LLM triage on first-seen unique alerts; returns cached triage results for duplicates.", - "description_cn": "去重+研判串联工作流。支持三种输入模式:syslog 实时单条(配置 syslog-config 后自动触发)、alerts 批次列表、alert_file 文件路径。先调用 http_alert_dedup 做 MinHash LSH 去重,再对首次出现的唯一告警调用 tdp_alert_triage 进行 LLM 研判;重复告警从持久化研判缓存(triage_cache.pkl)中回填历史结果。研判缓存上限由 max_dedup_keys 控制(默认 10W,FIFO 淘汰)。", + "description": "Chained pipeline: http_alert_dedup -> tdp_alert_triage. Supports syslog (real-time single alert), alerts list (batch), or alert_file path. Deduplicates with MinHash LSH; runs LLM triage on first-seen unique alerts; returns cached triage for duplicates. 
Syslog mode returns triage result directly; batch mode generates a full summary report.", + "description_cn": "去重+研判串联工作流。支持三种输入模式:syslog 实时单条、alerts 批次列表、alert_file 文件路径。syslog 模式下跳过汇总节点,直接返回单条研判结果;批次模式生成完整汇总报告。重复告警从 triage_cache.pkl 中回填历史研判结果(FIFO LRU,max_dedup_keys 可调)。", "start": "receive_alerts", "nodes": [ { @@ -16,6 +16,18 @@ "description": "Core loop: for each alert, invoke http_alert_dedup in-process (no published service required) to deduplicate; for first-seen unique alerts invoke tdp_alert_triage in-process for LLM triage. Duplicate alerts are served from triage_cache.pkl (FIFO LRU, max_dedup_keys configurable).", "code": "\nimport json\nimport os\nimport pickle\nimport sys\nimport time\n\nIS_WINDOWS = sys.platform == 'win32'\nif IS_WINDOWS:\n import msvcrt # noqa: F401\nelse:\n import fcntl # noqa: F401\n\nWORKFLOW_NAME = 'alert_dedup_triage'\n\n# ── Triage cache helpers ──────────────────────────────────────────────────────\n\ndef _triage_cache_path():\n from flocks.config import Config\n flocks_root = Config().get_global().data_dir.parent\n state_dir = flocks_root / 'workspace' / 'workflows' / WORKFLOW_NAME\n state_dir.mkdir(parents=True, exist_ok=True)\n return str(state_dir / 'triage_cache.pkl'), str(state_dir / 'triage_cache.lock')\n\ndef _acquire_lock(lock_path):\n fh = open(lock_path, 'w+')\n if IS_WINDOWS:\n fh.write('L'); fh.flush(); fh.seek(0)\n while True:\n try:\n msvcrt.locking(fh.fileno(), msvcrt.LK_LOCK, 1); break\n except OSError:\n continue\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_EX)\n return fh\n\ndef _release_lock(fh):\n try:\n if IS_WINDOWS:\n try:\n fh.seek(0); msvcrt.locking(fh.fileno(), msvcrt.LK_UNLCK, 1)\n except OSError:\n pass\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_UN)\n finally:\n fh.close()\n\ndef _load_triage_cache(cache_path):\n if not os.path.exists(cache_path) or os.path.getsize(cache_path) == 0:\n return {}\n try:\n with open(cache_path, 'rb') as f:\n c = pickle.load(f)\n if not isinstance(c, dict):\n 
return {}\n print(f'[triage_cache] loaded {len(c)} entries from {cache_path}')\n return c\n except Exception as e:\n print(f'[triage_cache] failed to load ({e}), starting fresh')\n return {}\n\ndef _save_triage_cache(cache_path, cache):\n tmp = cache_path + '.tmp'\n try:\n with open(tmp, 'wb') as f:\n pickle.dump(cache, f)\n f.flush()\n os.fsync(f.fileno())\n os.replace(tmp, cache_path)\n print(f'[triage_cache] saved {len(cache)} entries -> {cache_path}')\n except Exception as e:\n print(f'[triage_cache] failed to save: {e}')\n if os.path.exists(tmp):\n try: os.remove(tmp)\n except Exception: pass\n\ndef _evict_cache(cache, max_keys):\n excess = len(cache) - max_keys\n if excess > 0:\n for k in list(cache.keys())[:excess]:\n del cache[k]\n return excess\n return 0\n\n# ── Embedded sub-workflow invocation (no HTTP, no published service needed) ──\n\ndef _find_workflow_path(workflow_id):\n from flocks.workflow.fs_store import workflow_scan_dirs\n for root, _ in workflow_scan_dirs():\n p = root / workflow_id / 'workflow.json'\n if p.exists():\n return p\n return None\n\ndef _invoke_workflow(workflow_id, wf_inputs, timeout_s):\n from flocks.workflow.runner import run_workflow as _run_wf\n t0 = time.time()\n try:\n wf_path = _find_workflow_path(workflow_id)\n if not wf_path:\n return None, round((time.time() - t0) * 1000), f'Workflow not found: {workflow_id!r}'\n result = _run_wf(\n workflow=wf_path,\n inputs=wf_inputs,\n timeout_s=float(timeout_s),\n ensure_requirements=False,\n )\n ms = round((time.time() - t0) * 1000)\n if result.status == 'SUCCEEDED':\n return {'status': 'SUCCEEDED', 'outputs': result.outputs}, ms, None\n else:\n return None, ms, result.error or f'status={result.status}'\n except Exception as e:\n return None, round((time.time() - t0) * 1000), str(e)\n\n# ── Main ─────────────────────────────────────────────────────────────────────\n\nraw_alerts = inputs.get('raw_alerts', [])\nsource_log_type = inputs.get('source_log_type', 'tdp')\nfilter_enabled = 
inputs.get('filter_enabled', True)\ndedup_enabled = inputs.get('dedup_enabled', True)\nthreshold = inputs.get('threshold', 0.7)\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_field_len = inputs.get('max_field_len', 500)\nmax_dedup_keys = int(inputs.get('max_dedup_keys', 100000))\ndedup_wf_id = inputs.get('dedup_workflow_id', 'http_alert_dedup')\ntriage_wf_id = inputs.get('triage_workflow_id', 'tdp_alert_triage')\ntriage_timeout_s = int(inputs.get('triage_timeout_s', 300))\ndedup_timeout_s = int(inputs.get('dedup_timeout_s', 60))\n\ndedup_base_inputs = {\n 'source_log_type': source_log_type,\n 'filter_enabled': filter_enabled,\n 'dedup_enabled': dedup_enabled,\n 'threshold': threshold,\n 'strict_fields': strict_fields,\n 'lsh_fields': lsh_fields,\n 'max_field_len': max_field_len,\n 'max_dedup_keys': max_dedup_keys,\n}\n\ncache_path, lock_path = _triage_cache_path()\nlock_fh = _acquire_lock(lock_path)\ntry:\n triage_cache = _load_triage_cache(cache_path)\nfinally:\n _release_lock(lock_fh)\n\nresults = []\ntriage_results = []\ncache_dirty = False\n\nstats = {\n 'total_input': len(raw_alerts),\n 'dedup_failed': 0,\n 'filtered_out': 0,\n 'duplicate_skipped': 0,\n 'duplicate_with_triage': 0,\n 'triage_invoked': 0,\n 'triage_success': 0,\n 'triage_failed': 0,\n 'verdict_counts': {},\n}\n\nfor i, alert in enumerate(raw_alerts):\n entry = {\n 'alert_index': i,\n 'alert_id': alert.get('id') or alert.get('uuid') or alert.get('behave_uuid'),\n 'threat_name': (alert.get('threat') or {}).get('name', ''),\n }\n\n # Step 1: dedup via embedded http_alert_dedup workflow\n dr, dms, derr = _invoke_workflow(\n dedup_wf_id,\n {**dedup_base_inputs, 'alerts': [alert]},\n dedup_timeout_s,\n )\n if derr or not dr or dr.get('status') != 'SUCCEEDED':\n stats['dedup_failed'] += 1\n entry.update({'stage': 'dedup_failed', 'dedup_error': derr or dr, 'dedup_ms': dms})\n results.append(entry)\n 
print(f' [{i+1}/{len(raw_alerts)}] dedup FAILED ({dms}ms) {str(derr or \"\")[:80]}')\n continue\n\n dout = dr.get('outputs', {})\n unique_alerts = dout.get('unique_alerts', [])\n dstats = dout.get('stats', {})\n entry['dedup_ms'] = dms\n entry['filter_removed'] = dstats.get('filter_removed_count', 0)\n entry['lsh_clusters'] = dstats.get('lsh_total_clusters')\n entry['lsh_dedup_keys'] = dstats.get('lsh_total_dedup_keys')\n\n if not unique_alerts:\n stats['filtered_out'] += 1\n entry.update({'stage': 'filtered_out', 'triage': None})\n results.append(entry)\n print(f' [{i+1}/{len(raw_alerts)}] - filtered_out ({dms}ms)')\n continue\n\n already = bool(unique_alerts[0].get('dedup_key_already_exists'))\n dedup_key = unique_alerts[0].get('dedup_key', '')\n entry['dedup_key'] = dedup_key\n\n if already:\n cached_triage = triage_cache.get(dedup_key)\n if cached_triage:\n stats['duplicate_with_triage'] += 1\n verdict = cached_triage.get('attack_verdict', 'unknown')\n stats['verdict_counts'][verdict] = stats['verdict_counts'].get(verdict, 0) + 1\n entry.update({'stage': 'duplicate_with_triage', 'triage': cached_triage})\n results.append(entry)\n triage_results.append({**entry, **cached_triage})\n print(f' [{i+1}/{len(raw_alerts)}] duplicate+cached ({dms}ms) key={dedup_key[:8]} verdict={verdict}')\n else:\n stats['duplicate_skipped'] += 1\n entry.update({'stage': 'duplicate_skipped', 'triage': None})\n results.append(entry)\n print(f' [{i+1}/{len(raw_alerts)}] - duplicate (no cache) ({dms}ms) key={dedup_key[:8]}')\n continue\n\n # Step 2: triage via embedded tdp_alert_triage workflow (unique alerts only)\n stats['triage_invoked'] += 1\n tr, tms, terr = _invoke_workflow(\n triage_wf_id,\n {'alert_data': alert},\n triage_timeout_s,\n )\n if terr or not tr or tr.get('status') != 'SUCCEEDED':\n stats['triage_failed'] += 1\n entry.update({'stage': 'triage_failed', 'triage_ms': tms,\n 'triage_error': terr or tr})\n results.append(entry)\n print(f' [{i+1}/{len(raw_alerts)}] dedup OK + 
triage FAILED ({dms}+{tms}ms) {str(terr or \"\")[:80]}')\n continue\n\n stats['triage_success'] += 1\n tout = tr.get('outputs', {})\n verdict = tout.get('attack_verdict', 'unknown')\n stats['verdict_counts'][verdict] = stats['verdict_counts'].get(verdict, 0) + 1\n\n triage_info = {\n 'attack_verdict': verdict,\n 'risk_level': tout.get('risk_level'),\n 'report_title': tout.get('report_title'),\n 'report_path': tout.get('report_path'),\n 'final_report': tout.get('final_report', ''),\n }\n if dedup_key in triage_cache:\n del triage_cache[dedup_key]\n triage_cache[dedup_key] = triage_info\n cache_dirty = True\n\n entry.update({'stage': 'triage_done', 'triage_ms': tms, 'triage': triage_info})\n results.append(entry)\n triage_results.append({**entry, **triage_info})\n print(f' [{i+1}/{len(raw_alerts)}] dedup+triage OK ({dms}+{tms}ms) verdict={verdict} title={(tout.get(\"report_title\") or \"\")[:40]}')\n\nif cache_dirty:\n lock_fh = _acquire_lock(lock_path)\n try:\n evicted = _evict_cache(triage_cache, max_dedup_keys)\n if evicted:\n print(f'[triage_cache] LRU eviction: dropped {evicted} entries (max={max_dedup_keys})')\n _save_triage_cache(cache_path, triage_cache)\n finally:\n _release_lock(lock_fh)\n\nstats['triage_cache_size'] = len(triage_cache)\nprint(f'[pipeline] stats={json.dumps(stats, ensure_ascii=False)}')\n\noutputs['results'] = results\noutputs['triage_results'] = triage_results\noutputs['stats'] = stats\n" }, + { + "id": "branch_output_mode", + "type": "branch", + "select_key": "input_mode", + "description": "Route output based on input_mode: syslog (single alert) -> direct_output; batch (alerts list or file) -> generate_summary." + }, + { + "id": "direct_output", + "type": "python", + "description": "Syslog / single-alert output: extract triage result directly without generating a full summary table. 
Outputs are field-compatible with generate_summary.", + "code": "\nimport json\nimport datetime\n\nresults = inputs.get('results', [])\ntriage_results = inputs.get('triage_results', [])\nstats = inputs.get('stats', {})\n\n# In syslog mode there is exactly one alert; pull its triage info directly.\nentry = results[0] if results else {}\ntriage = entry.get('triage') or {}\n\nattack_verdict = triage.get('attack_verdict', '')\nrisk_level = triage.get('risk_level', '')\n# report_title may be present with an explicit None value, and dict.get only applies its\n# default to missing keys; coerce to str so the [:40] slice in the log line cannot raise.\nreport_title = triage.get('report_title') or ''\nfinal_report = triage.get('final_report', '')\nreport_path = triage.get('report_path', '')\nstage = entry.get('stage', '')\n\n# Minimal one-liner summary (no full table needed for a single alert).\nverdict_label = {\n 'attack_success': 'Attack Success',\n 'attack_failed': 'Attack Failed',\n 'attack': 'Attack',\n 'unknown': 'Unknown',\n 'benign': 'Benign',\n}\nsummary_report = (\n f'# {report_title or \"Alert Triage Result\"}\\n\\n'\n f'**Date**: {datetime.date.today().isoformat()} '\n f'**Stage**: {stage} '\n f'**Verdict**: {verdict_label.get(attack_verdict, attack_verdict or \"-\")} '\n f'**Risk**: {risk_level or \"-\"}\\n'\n)\n\nprint(f'[direct_output] stage={stage} verdict={attack_verdict} title={report_title[:40]!r}')\n\noutputs['final_reports'] = [final_report] if final_report else []\noutputs['triage_results'] = triage_results\noutputs['results'] = results\noutputs['stats'] = stats\noutputs['summary_report'] = summary_report\noutputs['report_path'] = report_path\noutputs['final_report'] = final_report\noutputs['report_title'] = report_title\noutputs['attack_verdict'] = attack_verdict\noutputs['risk_level'] = risk_level\n" + }, { "id": "generate_summary", "type": "python", @@ -31,8 +43,19 @@ }, { "from": "dedup_and_triage", - "to": "generate_summary", + "to": "branch_output_mode", "order": 0 + }, + { + "from": "branch_output_mode", + "to": "direct_output", + "order": 0, + "label": "syslog" + }, + { + "from": "branch_output_mode", + "to":
"generate_summary", + "order": 1 } ], "metadata": { From 1f94e80eab8d67f0a7973136dffe6bdc07f9d831 Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Sun, 10 May 2026 19:09:59 +0800 Subject: [PATCH 27/41] fix(ingest/syslog): persist execution records and stats for syslog-triggered runs Syslog-triggered workflow runs were invisible in the WebUI history panel and not counted in the workflow stats card because `_trigger_workflow` called the runner directly, bypassing both the execution record write and the call counter update. Run records were never written, and `callCount` only ever reflected HTTP-triggered runs. This change makes the syslog path go through the same persistence helpers as the HTTP API path so all trigger sources are uniformly visible in the UI: - syslog/manager: wrap each run with create_execution_record / record_execution_result; write the same fields the WebUI consumes (`outputResults`, `errorMessage`, `duration`, `executionLog`, `currentNodeId`, `currentPhase`, `currentStepIndex`); tag input params with `_trigger=syslog` for source identification. - workflow/execution_store: move the workflow stats counter update into `record_execution_result` so every persisted result automatically increments callCount/successCount/errorCount/totalRuntime/avgRuntime, regardless of trigger source. - server/routes/workflow: remove the now-redundant `_update_workflow_stats` calls from the HTTP run/invoke paths and drop the orphaned helper to avoid double counting. 
Co-authored-by: Cursor --- flocks/ingest/syslog/manager.py | 46 +++++++++++++++++++++++++-- flocks/server/routes/workflow.py | 23 -------------- flocks/workflow/execution_store.py | 51 +++++++++++++++++++++++++++++- 3 files changed, 93 insertions(+), 27 deletions(-) diff --git a/flocks/ingest/syslog/manager.py b/flocks/ingest/syslog/manager.py index c00ea2c99..6984d322b 100644 --- a/flocks/ingest/syslog/manager.py +++ b/flocks/ingest/syslog/manager.py @@ -3,10 +3,16 @@ from __future__ import annotations import asyncio -from typing import Any, Dict, Optional +import time +from typing import Any, Dict from flocks.storage.storage import Storage from flocks.utils.log import Log +from flocks.workflow.execution_store import ( + create_execution_record, + record_execution_result, + resolve_execution_outcome, +) from flocks.workflow.fs_store import read_workflow_from_fs from flocks.workflow.runner import run_workflow @@ -136,18 +142,52 @@ async def _trigger_workflow(self, workflow_id: str, syslog_msg: dict, input_key: log.warning("syslog.workflow_json_missing", {"workflow_id": workflow_id}) return inputs = {input_key: syslog_msg} + + exec_data = await create_execution_record( + workflow_id, + input_params={"_trigger": "syslog", **inputs}, + ) + exec_id = exec_data["id"] + start_time = time.time() + try: - await asyncio.to_thread( + result = await asyncio.to_thread( run_workflow, workflow=workflow_json, inputs=inputs, trace=False, ) + status, error_msg = resolve_execution_outcome(result) + duration = time.time() - start_time + exec_data.update({ + "status": status, + "outputResults": result.outputs if isinstance(result.outputs, dict) else {}, + "finishedAt": int(time.time() * 1000), + "duration": duration, + "errorMessage": error_msg, + "executionLog": list(result.history or []), + "currentNodeId": result.last_node_id, + "currentPhase": status, + "currentStepIndex": result.steps, + }) except Exception as exc: + duration = time.time() - start_time log.error( 
"syslog.workflow_run_failed", - {"workflow_id": workflow_id, "error": str(exc)}, + {"workflow_id": workflow_id, "exec_id": exec_id, "error": str(exc)}, ) + exec_data.update({ + "status": "error", + "errorMessage": str(exc), + "finishedAt": int(time.time() * 1000), + "duration": duration, + "currentPhase": "error", + }) + finally: + try: + await record_execution_result(workflow_id, exec_id, exec_data) + except Exception as exc: + log.warning("syslog.exec_record_failed", {"exec_id": exec_id, "error": str(exc)}) default_manager = SyslogManager() diff --git a/flocks/server/routes/workflow.py b/flocks/server/routes/workflow.py index a60c81ccc..ad6cae1b2 100644 --- a/flocks/server/routes/workflow.py +++ b/flocks/server/routes/workflow.py @@ -527,11 +527,6 @@ def _on_step_complete(step_result) -> None: "currentStepIndex": result.steps, }) - if status_value == "success": - await _update_workflow_stats(workflow_id, True, duration) - elif status_value in {"error", "timeout"}: - await _update_workflow_stats(workflow_id, False, duration) - await _record_execution_result(workflow_id, exec_id, current_data) log.info("workflow.executed", { "id": workflow_id, @@ -550,8 +545,6 @@ def _on_step_complete(step_result) -> None: "executionLog": list(step_history), "currentPhase": "cancelled" if cancel_event.is_set() else "error", }) - if current_data["status"] == "error": - await _update_workflow_stats(workflow_id, False, duration) await _record_execution_result(workflow_id, exec_id, current_data) log.error("workflow.execute.error", { "id": workflow_id, @@ -592,18 +585,6 @@ async def _get_workflow_stats(workflow_id: str) -> Dict[str, Any]: return dict(_DEFAULT_STATS) -async def _update_workflow_stats(workflow_id: str, success: bool, duration: float) -> None: - """Update workflow statistics""" - stats = await _get_workflow_stats(workflow_id) - stats["callCount"] += 1 - if success: - stats["successCount"] += 1 - else: - stats["errorCount"] += 1 - stats["totalRuntime"] += duration - await 
Storage.write(_workflow_stats_key(workflow_id), stats) - - # ============================================================================= # API Endpoints - Workflow CRUD # ============================================================================= @@ -1065,7 +1046,6 @@ async def workflow_center_invoke(workflow_id: str, req: WorkflowCenterInvokeRequ raw_status = result.get("status", "SUCCEEDED") if isinstance(result, dict) else "SUCCEEDED" status_value = _normalize_execution_status(raw_status) success = status_value == "success" - await _update_workflow_stats(workflow_id, success, duration) exec_data.update({ "outputResults": result.get("outputs", {}) if isinstance(result, dict) else {}, "status": status_value, @@ -1077,21 +1057,18 @@ async def workflow_center_invoke(workflow_id: str, req: WorkflowCenterInvokeRequ return result except (WorkflowNotFoundError, WorkflowNotPublishedError) as e: duration = time.time() - started - await _update_workflow_stats(workflow_id, False, duration) exec_data.update({"status": "error", "finishedAt": int(time.time() * 1000), "duration": duration, "errorMessage": str(e)}) await _record_execution_result(workflow_id, exec_id, exec_data) raise HTTPException(status_code=404, detail=str(e)) except WorkflowCenterError as e: duration = time.time() - started - await _update_workflow_stats(workflow_id, False, duration) exec_data.update({"status": "error", "finishedAt": int(time.time() * 1000), "duration": duration, "errorMessage": str(e)}) await _record_execution_result(workflow_id, exec_id, exec_data) raise HTTPException(status_code=400, detail=str(e)) except Exception as e: duration = time.time() - started - await _update_workflow_stats(workflow_id, False, duration) exec_data.update({"status": "error", "finishedAt": int(time.time() * 1000), "duration": duration, "errorMessage": str(e)}) await _record_execution_result(workflow_id, exec_id, exec_data) diff --git a/flocks/workflow/execution_store.py b/flocks/workflow/execution_store.py index 
04ffdc610..9d300205a 100644 --- a/flocks/workflow/execution_store.py +++ b/flocks/workflow/execution_store.py @@ -11,6 +11,43 @@ from flocks.workflow.runner import RunWorkflowResult +def _workflow_stats_key(workflow_id: str) -> str: + return f"workflow/{workflow_id}/stats" + + +_DEFAULT_STATS: Dict[str, Any] = { + "callCount": 0, + "successCount": 0, + "errorCount": 0, + "totalRuntime": 0.0, + "avgRuntime": 0.0, + "thumbsUp": 0, + "thumbsDown": 0, +} + + +async def _update_workflow_stats(workflow_id: str, success: bool, duration: float) -> None: + """Increment workflow call/success/error counters and update avgRuntime.""" + try: + key = _workflow_stats_key(workflow_id) + try: + stats: Dict[str, Any] = await Storage.read(key) or dict(_DEFAULT_STATS) + except Exception: + stats = dict(_DEFAULT_STATS) + stats["callCount"] = stats.get("callCount", 0) + 1 + if success: + stats["successCount"] = stats.get("successCount", 0) + 1 + else: + stats["errorCount"] = stats.get("errorCount", 0) + 1 + total = stats.get("totalRuntime", 0.0) + duration + stats["totalRuntime"] = total + call_count = stats["callCount"] + stats["avgRuntime"] = (total / call_count) if call_count > 0 else 0.0 + await Storage.write(key, stats) + except Exception: + pass + + def workflow_execution_key(exec_id: str) -> str: """Return the storage key for one workflow execution.""" return f"workflow_execution/{exec_id}" @@ -98,8 +135,20 @@ async def record_execution_result( exec_id: str, exec_data: Dict[str, Any], ) -> None: - """Persist the final execution record and audit trail.""" + """Persist the final execution record, audit trail, and workflow stats.""" await Storage.write(workflow_execution_key(exec_id), exec_data) + + # Update call/success/error counters so all trigger paths (HTTP, syslog, etc.) + # are reflected in the UI stats panel. 
+ status = exec_data.get("status", "error") + duration = exec_data.get("duration") + if not isinstance(duration, (int, float)): + finished_at = exec_data.get("finishedAt", int(time.time() * 1000)) + started_at = exec_data.get("startedAt", finished_at) + duration = max(0.0, (finished_at - started_at) / 1000.0) + if status != "cancelled": # cancelled runs are not failures; the HTTP path never counted them + await _update_workflow_stats(workflow_id, status == "success", float(duration)) + try: await Recorder.record_workflow_execution( exec_id=exec_id, From db1d260dfbfaf6c2c76cca74ccb6729151823e2c Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Sun, 10 May 2026 22:14:10 +0800 Subject: [PATCH 28/41] feat(workflows): support mixed TDP/Skyeye batches and flat-format TDP triage http_alert_dedup: - Replace branch_log_type + normalize_tdp + normalize_skyeye nodes with a single unified normalize node; each alert is individually classified via field signatures (nested net dict / behave_uuid for TDP; uri / vuln_name / attack_result for Skyeye) before field mapping, enabling mixed-type batches in a single invocation. - Update filter_logs to read per-alert _source_type set by normalize instead of the batch-level source_log_type, ensuring correct threat/process type classification in mixed batches. - source_log_type is retained as an optional batch-level fallback hint when per-alert detection is inconclusive. alert_dedup_triage: - Implement 4-priority source_log_type resolution in receive_alerts: (1) explicit input param, (2) syslog app_name/hostname hint, (3) JSON field auto-detection on first alert, (4) default 'tdp'. - Emit source_log_type_reason for traceability. tdp_alert_triage: - Extend receive_alert pick() lookups to cover flat TDP field names (net_real_src_ip, net_dest_ip, net_http_url, net_http_reqs_body, net_http_resp_body, net_http_status, etc.) alongside nested TDP and normalized schema, fixing empty-payload LLM analysis for pre-flattened TDP alerts arriving via alert_dedup_triage.
Co-authored-by: Cursor --- .../alert_dedup_triage/workflow.json | 4 +- .../workflows/http_alert_dedup/workflow.json | 52 +++++-------------- .../workflows/tdp_alert_triage/workflow.json | 4 +- 3 files changed, 16 insertions(+), 44 deletions(-) diff --git a/.flocks/plugins/workflows/alert_dedup_triage/workflow.json b/.flocks/plugins/workflows/alert_dedup_triage/workflow.json index 37701ac17..d252426e6 100644 --- a/.flocks/plugins/workflows/alert_dedup_triage/workflow.json +++ b/.flocks/plugins/workflows/alert_dedup_triage/workflow.json @@ -7,8 +7,8 @@ { "id": "receive_alerts", "type": "python", - "description": "Parse incoming alerts. Supports three input modes in priority order: syslog_message (injected by flocks syslog listener; TDP alert JSON in .message field), alerts (list, batch), alert_file (local JSON path). Outputs sub-workflow IDs for embedded invocation by dedup_and_triage.", - "code": "\nimport json\nimport os\n\n# Input priority: syslog_message > alerts > alert_file\n# syslog mode: flocks ingest/syslog/manager.py injects parsed syslog dict as inputKey (default: syslog_message)\n# Structure: {raw, facility, severity, timestamp, hostname, app_name, message, format}\n# TDP alert JSON is in the message field.\n\nalerts_input = []\ninput_mode = 'unknown'\n\nsyslog_msg = inputs.get('syslog_message') or inputs.get('syslog')\nif syslog_msg and isinstance(syslog_msg, dict):\n raw_text = syslog_msg.get('message', '')\n if raw_text:\n try:\n alert = json.loads(raw_text)\n alert['_syslog_meta'] = {\n 'hostname': syslog_msg.get('hostname', ''),\n 'app_name': syslog_msg.get('app_name', ''),\n 'timestamp': syslog_msg.get('timestamp', ''),\n 'severity': syslog_msg.get('severity'),\n 'facility': syslog_msg.get('facility'),\n 'format': syslog_msg.get('format', ''),\n }\n alerts_input = [alert]\n input_mode = 'syslog'\n print(f'[receive] syslog mode: host={syslog_msg.get(\"hostname\")!r} '\n f'app={syslog_msg.get(\"app_name\")!r} '\n 
f'severity={syslog_msg.get(\"severity\")} '\n f'format={syslog_msg.get(\"format\")!r}')\n except (json.JSONDecodeError, TypeError) as _e:\n print(f'[receive] WARNING: syslog.message is not valid JSON ({_e}), '\n f'raw={raw_text[:120]!r}')\n else:\n print('[receive] WARNING: syslog_message.message is empty, skipping')\n\nif not alerts_input:\n alerts_input = inputs.get('alerts', inputs.get('alert_list', []))\n if alerts_input:\n input_mode = 'alerts'\n\nif not alerts_input:\n alert_file = inputs.get('alert_file', '')\n if alert_file:\n alert_file = os.path.expanduser(str(alert_file))\n try:\n with open(alert_file, 'r', encoding='utf-8') as _f:\n alerts_input = json.load(_f)\n input_mode = 'alert_file'\n print(f'[receive] file mode: loaded {len(alerts_input) if isinstance(alerts_input, list) else 1} alerts from {alert_file}')\n except Exception as _e:\n print(f'[receive] WARNING: failed to load alert_file={alert_file!r}: {_e}')\n alerts_input = []\n\nif isinstance(alerts_input, str):\n try:\n alerts_input = json.loads(alerts_input)\n except Exception:\n alerts_input = []\nif isinstance(alerts_input, dict) and 'data' in alerts_input:\n alerts_input = alerts_input.get('data', [])\nif not isinstance(alerts_input, list):\n alerts_input = [alerts_input] if alerts_input else []\n\nif not alerts_input:\n print('[receive] WARNING: no alerts to process (syslog_message, alerts, alert_file all empty)')\n\nsource_log_type = str(inputs.get('source_log_type', 'tdp')).lower()\nif source_log_type not in ('tdp', 'skyeye'):\n source_log_type = 'tdp'\n\nfilter_enabled = bool(inputs.get('filter_enabled', True))\ndedup_enabled = bool(inputs.get('dedup_enabled', True))\nthreshold = float(inputs.get('threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_field_len = int(inputs.get('max_field_len', 500))\nmax_dedup_keys = int(inputs.get('max_dedup_keys', 100000))\nif max_dedup_keys 
< 1:\n max_dedup_keys = 100000\n\nprint(f'[receive] input_mode={input_mode} raw_alerts={len(alerts_input)} '\n f'source_log_type={source_log_type} max_dedup_keys={max_dedup_keys}')\n\noutputs['raw_alerts'] = alerts_input\noutputs['input_mode'] = input_mode\noutputs['source_log_type'] = source_log_type\noutputs['filter_enabled'] = filter_enabled\noutputs['dedup_enabled'] = dedup_enabled\noutputs['threshold'] = threshold\noutputs['strict_fields'] = strict_fields\noutputs['lsh_fields'] = lsh_fields\noutputs['max_field_len'] = max_field_len\noutputs['max_dedup_keys'] = max_dedup_keys\n# Sub-workflow IDs: engine embeds them directly; no service URL needed\noutputs['dedup_workflow_id'] = inputs.get('dedup_workflow_id', 'http_alert_dedup')\noutputs['triage_workflow_id'] = inputs.get('triage_workflow_id', 'tdp_alert_triage')\n" + "description": "Parse incoming alerts (syslog_message / alerts list / alert_file). Resolves source_log_type in priority order: (1) explicit input param, (2) syslog app_name/hostname hint (contains 'tdp' or 'skyeye'), (3) JSON field auto-detection (TDP: nested net dict/behave_uuid; Skyeye: uri/vuln_name/attack_result), (4) default 'tdp'. 
Logs detection reason for traceability.", + "code": "\nimport json\nimport os\n\n# Input priority: syslog_message > alerts > alert_file\nalerts_input = []\ninput_mode = 'unknown'\n_syslog_msg = None\n\nsyslog_msg = inputs.get('syslog_message') or inputs.get('syslog')\nif syslog_msg and isinstance(syslog_msg, dict):\n raw_text = syslog_msg.get('message', '')\n if raw_text:\n try:\n alert = json.loads(raw_text)\n alert['_syslog_meta'] = {\n 'hostname': syslog_msg.get('hostname', ''),\n 'app_name': syslog_msg.get('app_name', ''),\n 'timestamp': syslog_msg.get('timestamp', ''),\n 'severity': syslog_msg.get('severity'),\n 'facility': syslog_msg.get('facility'),\n 'format': syslog_msg.get('format', ''),\n }\n alerts_input = [alert]\n input_mode = 'syslog'\n _syslog_msg = syslog_msg\n print(f'[receive] syslog mode: host={syslog_msg.get(\"hostname\")!r} '\n f'app={syslog_msg.get(\"app_name\")!r} '\n f'severity={syslog_msg.get(\"severity\")} '\n f'format={syslog_msg.get(\"format\")!r}')\n except (json.JSONDecodeError, TypeError) as _e:\n print(f'[receive] WARNING: syslog.message not valid JSON ({_e}), '\n f'raw={raw_text[:120]!r}')\n else:\n print('[receive] WARNING: syslog_message.message is empty, skipping')\n\nif not alerts_input:\n alerts_input = inputs.get('alerts', inputs.get('alert_list', []))\n if alerts_input:\n input_mode = 'alerts'\n\nif not alerts_input:\n alert_file = inputs.get('alert_file', '')\n if alert_file:\n alert_file = os.path.expanduser(str(alert_file))\n try:\n with open(alert_file, 'r', encoding='utf-8') as _f:\n alerts_input = json.load(_f)\n input_mode = 'alert_file'\n print(f'[receive] file mode: loaded {len(alerts_input) if isinstance(alerts_input, list) else 1} alerts')\n except Exception as _e:\n print(f'[receive] WARNING: failed to load alert_file={alert_file!r}: {_e}')\n alerts_input = []\n\nif isinstance(alerts_input, str):\n try:\n alerts_input = json.loads(alerts_input)\n except Exception:\n alerts_input = []\nif isinstance(alerts_input, 
dict) and 'data' in alerts_input:\n alerts_input = alerts_input.get('data', [])\nif not isinstance(alerts_input, list):\n alerts_input = [alerts_input] if alerts_input else []\n\nif not alerts_input:\n print('[receive] WARNING: no alerts (syslog_message, alerts, alert_file all empty)')\n\n# ── Source log type resolution ────────────────────────────────────────────────\n# Priority:\n# 1. Explicit input param source_log_type (always respected)\n# 2. Syslog metadata: app_name / hostname contains 'skyeye' or 'tdp'\n# 3. JSON field signature on first alert:\n# TDP -> nested 'net' dict, 'behave_uuid', 'flow_id'\n# Skyeye -> 'uri', 'vuln_name', 'attack_result', 'attack_flag'\n# 4. Default 'tdp'\n\ndef _detect_from_syslog_meta(sm):\n for field in ('app_name', 'hostname'):\n val = str(sm.get(field, '') or '').lower()\n if 'skyeye' in val:\n return 'skyeye', f'syslog.{field}={sm.get(field)!r}'\n if 'tdp' in val:\n return 'tdp', f'syslog.{field}={sm.get(field)!r}'\n return None, None\n\ndef _detect_from_alert_json(alert):\n if not isinstance(alert, dict):\n return None, None\n if isinstance(alert.get('net'), dict):\n return 'tdp', 'alert has nested net dict (TDP)'\n if any(k in alert for k in ('behave_uuid', 'flow_id')):\n return 'tdp', 'alert has behave_uuid/flow_id (TDP)'\n if any(k in alert for k in ('uri', 'vuln_name', 'attack_result', 'attack_flag')):\n return 'skyeye', 'alert has uri/vuln_name/attack_result (Skyeye)'\n return None, None\n\nexplicit_type = str(inputs.get('source_log_type', '') or '').lower()\nif explicit_type in ('tdp', 'skyeye'):\n source_log_type = explicit_type\n source_log_type_reason = 'explicit input parameter'\nelif input_mode == 'syslog' and _syslog_msg:\n source_log_type, reason = _detect_from_syslog_meta(_syslog_msg)\n if source_log_type:\n source_log_type_reason = f'syslog metadata: {reason}'\n else:\n first_alert = alerts_input[0] if alerts_input else {}\n source_log_type, reason = _detect_from_alert_json(first_alert)\n if source_log_type:\n 
source_log_type_reason = f'JSON field detection: {reason}'\n else:\n source_log_type = 'tdp'\n source_log_type_reason = 'fallback default (no hint in syslog meta or alert JSON)'\nelse:\n # Batch inputs (alerts / alert_file) carry no syslog metadata, but priority 3\n # (JSON field signature on the first alert) still applies before the default.\n source_log_type, reason = _detect_from_alert_json(alerts_input[0] if alerts_input else {})\n source_log_type_reason = f'JSON field detection: {reason}' if source_log_type else 'default'\n source_log_type = source_log_type or 'tdp'\n\nprint(f'[receive] source_log_type={source_log_type!r} reason={source_log_type_reason!r}')\n\nfilter_enabled = bool(inputs.get('filter_enabled', True))\ndedup_enabled = bool(inputs.get('dedup_enabled', True))\nthreshold = float(inputs.get('threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_field_len = int(inputs.get('max_field_len', 500))\nmax_dedup_keys = int(inputs.get('max_dedup_keys', 100000))\nif max_dedup_keys < 1:\n max_dedup_keys = 100000\n\nprint(f'[receive] input_mode={input_mode} raw_alerts={len(alerts_input)} max_dedup_keys={max_dedup_keys}')\n\noutputs['raw_alerts'] = alerts_input\noutputs['input_mode'] = input_mode\noutputs['source_log_type'] = source_log_type\noutputs['source_log_type_reason'] = source_log_type_reason\noutputs['filter_enabled'] = filter_enabled\noutputs['dedup_enabled'] = dedup_enabled\noutputs['threshold'] = threshold\noutputs['strict_fields'] = strict_fields\noutputs['lsh_fields'] = lsh_fields\noutputs['max_field_len'] = max_field_len\noutputs['max_dedup_keys'] = max_dedup_keys\noutputs['dedup_workflow_id'] = inputs.get('dedup_workflow_id', 'http_alert_dedup')\noutputs['triage_workflow_id'] = inputs.get('triage_workflow_id', 'tdp_alert_triage')\n" }, { "id": "dedup_and_triage", diff --git a/.flocks/plugins/workflows/http_alert_dedup/workflow.json b/.flocks/plugins/workflows/http_alert_dedup/workflow.json index b1f74bdf2..df630592d 100644 --- a/.flocks/plugins/workflows/http_alert_dedup/workflow.json +++ b/.flocks/plugins/workflows/http_alert_dedup/workflow.json @@ -1,38 +1,26 @@ { "name": "http_alert_dedup", - "description": "Network alert deduplication pipeline: normalize
(TDP/Skyeye field mapping) → filter (remove scans / non-HTTP) → dedup (URI normalization + 5-gram Jaccard similarity). Returns a dict with deduped_alerts, unique_alerts and stats.", - "description_cn": "网络告警去重 Pipeline:归一化(TDP/Skyeye 字段映射,含日志类型分支)→ 过滤(剔除扫描/非 HTTP 告警)→ 去重(URI 归一化 + 5-gram MinHash LSH + dedup_key 持久化,FIFO LRU 上限默认 10W、可通过 max_dedup_keys 调整)。输入 dict,输出 dict(deduped_alerts / unique_alerts / stats)。", + "description": "Network alert deduplication pipeline: normalize (per-alert TDP/Skyeye auto-detection + field mapping, mixed batches supported) -> filter (remove scans / non-HTTP) -> dedup (URI normalization + 5-gram Jaccard MinHash LSH). Returns deduped_alerts, unique_alerts and stats.", + "description_cn": "网络告警去重 Pipeline:归一化(按每条告警自动识别 TDP/Skyeye 并字段映射,支持单批次混合)→ 过滤(剔除扫描/非 HTTP 告警)→ 去重(URI 归一化 + 5-gram MinHash LSH + dedup_key 持久化,FIFO LRU 上限默认 10W、可通过 max_dedup_keys 调整)。输入 dict,输出 dict(deduped_alerts / unique_alerts / stats)。", "start": "receive_alerts", "nodes": [ { "id": "receive_alerts", "type": "python", - "description": "接收原始告警列表,解析输入格式,提取 Pipeline 配置,输出 source_log_type 供后续分支节点路由", + "description": "Parse input (alerts list / alert_file path), extract pipeline configuration, and forward source_log_type as a batch-level hint used by normalize when per-alert field detection is inconclusive.", "code": "\nimport json\nimport os\n\nalerts_input = inputs.get('alerts', inputs.get('alert_list', []))\n\n# Support alert_file: load JSON from a local file path when alerts list is not given directly.\nif not alerts_input:\n alert_file = inputs.get('alert_file', '')\n if alert_file:\n alert_file = os.path.expanduser(str(alert_file))\n try:\n with open(alert_file, 'r', encoding='utf-8') as _f:\n alerts_input = json.load(_f)\n print(f'[receive] loaded {len(alerts_input) if isinstance(alerts_input, list) else 1} alerts from file: {alert_file}')\n except Exception as _e:\n print(f'[receive] WARNING: failed to load alert_file={alert_file!r}: {_e}')\n alerts_input = 
[]\n\nif isinstance(alerts_input, str):\n try:\n alerts_input = json.loads(alerts_input)\n except Exception:\n alerts_input = []\nif isinstance(alerts_input, dict) and 'data' in alerts_input:\n alerts_input = alerts_input.get('data', [])\nif not isinstance(alerts_input, list):\n alerts_input = [alerts_input] if alerts_input else []\n\nsource_log_type = str(inputs.get('source_log_type', 'tdp')).lower()\nif source_log_type not in ('tdp', 'skyeye'):\n source_log_type = 'tdp'\n\nfilter_enabled = bool(inputs.get('filter_enabled', True))\ndedup_enabled = bool(inputs.get('dedup_enabled', True))\ndedup_threshold = float(inputs.get('threshold', inputs.get('dedup_threshold', 0.7)))\nstrict_fields = inputs.get('strict_fields', inputs.get('dedup_fields_strict', ['sip', 'dip']))\nlsh_fields = inputs.get('lsh_fields', inputs.get('dedup_fields_lsh', ['req_http_url', 'req_body', 'rsp_body']))\nmax_field_len = int(inputs.get('max_field_len', 500))\n# Maximum dedup_keys (and LSH clusters) to keep in persisted state.\n# When the cache grows beyond this limit, oldest entries are evicted in FIFO\n# order on the next dedup run. 
Default 100,000 — tunable per request.\nmax_dedup_keys = int(inputs.get('max_dedup_keys', 100000))\nif max_dedup_keys < 1:\n max_dedup_keys = 100000\n\nif not isinstance(strict_fields, list) or not strict_fields:\n strict_fields = ['sip', 'dip']\nif not isinstance(lsh_fields, list) or not lsh_fields:\n lsh_fields = ['req_http_url', 'req_body', 'rsp_body']\n\nprint(f'[receive] source_log_type={source_log_type}, total={len(alerts_input)}, max_dedup_keys={max_dedup_keys}')\n\noutputs['raw_alerts'] = alerts_input\noutputs['stats'] = {'raw_count': len(alerts_input)}\noutputs['source_log_type'] = source_log_type\noutputs['filter_enabled'] = filter_enabled\noutputs['dedup_enabled'] = dedup_enabled\noutputs['dedup_threshold'] = dedup_threshold\noutputs['strict_fields'] = strict_fields\noutputs['lsh_fields'] = lsh_fields\noutputs['max_field_len'] = max_field_len\noutputs['max_dedup_keys'] = max_dedup_keys\n" }, { - "id": "branch_log_type", - "type": "branch", - "select_key": "source_log_type", - "description": "按 source_log_type 路由:'skyeye' → normalize_skyeye;'tdp'(默认)→ normalize_tdp" - }, - { - "id": "normalize_tdp", + "id": "normalize", "type": "python", - "description": "TDP 字段归一化:将 TDP 原始嵌套字段(net_real_src_ip/net_http_url/threat_name 等)映射为标准字段(sip/dip/req_http_url/threat_name 等)", - "code": "import uuid\n\nHTTP_METHODS = ['GET', 'POST', 'PUT', 'DELETE', 'HEAD', 'OPTIONS', 'PATCH', 'TRACE']\n\nTDP_FIELD_MAP = {\n 'customer_uuid': 'customer_uuid',\n 'device_id': 'device_id',\n 'id': 'id',\n 'time': 'time',\n 'direction': 'direction',\n 'sip': 'net_real_src_ip',\n 'dip': 'net_dest_ip',\n 'sport': 'net_src_port',\n 'dport': 'net_dest_port',\n 'net_type': 'net_type',\n 'net_app_proto': 'net_app_proto',\n 'req_http_url': 'net_http_url',\n 'req_user_agent': 'net_http_reqs_user_agent',\n 'req_host': 'net_http_reqs_host',\n 'req_line': 'net_http_reqs_line',\n 'req_header': 'net_http_reqs_header',\n 'req_body': 'net_http_reqs_body',\n 'req_cookie': 'net_http_reqs_cookie',\n 
'req_body_len': 'net_http_reqs_content_length',\n 'rsp_status_code': 'net_http_status',\n 'rsp_line': 'net_http_resp_line',\n 'rsp_header': 'net_http_resp_header',\n 'rsp_body': 'net_http_resp_body',\n 'rsp_body_len': 'net_http_resp_content_length',\n 'net_bytes_toclient': 'net_bytes_toclient',\n 'net_bytes_toserver': 'net_bytes_toserver',\n 'threat_rule_id': 'threat_suuid',\n 'threat_name': 'threat_name',\n 'threat_msg': 'threat_msg',\n 'threat_ioc': 'threat_ioc',\n 'threat_level': 'threat_level',\n 'threat_severity': 'threat_severity',\n 'threat_phase': 'threat_phase',\n 'threat_type': 'threat_type',\n 'threat_result': 'threat_result',\n 'threat_confidence': 'threat_confidence',\n 'connection_established': 'established',\n 'asset_group_name': 'dest_assets_group_name',\n 'asset_name': 'dest_assets_latestName',\n}\n\ndef flatten_dict(d, prefix=''):\n res = {}\n for k, v in d.items():\n if isinstance(v, dict):\n res.update(flatten_dict(v, f'{prefix}{k}_'))\n else:\n res[f'{prefix}{k}'] = v\n return res\n\ndef make_uuid(norm):\n return str(uuid.uuid3(uuid.NAMESPACE_DNS, ''.join(str(v) for v in norm.values())))\n\ndef normalize_single(alert):\n flat = flatten_dict(alert)\n norm = {}\n for std_key, raw_key in TDP_FIELD_MAP.items():\n norm[std_key] = flat.get(raw_key, 'none')\n if norm.get('id') in ('none', None, ''):\n norm['id'] = make_uuid(norm)\n if norm.get('net_type') in ('none', None, ''):\n method = flat.get('method', 'none')\n norm['net_type'] = 'http' if method in HTTP_METHODS else ('none' if method == 'none' else 'other')\n return norm\n\nraw_alerts = inputs.get('raw_alerts', [])\nstats = dict(inputs.get('stats', {}))\nnormalized = [normalize_single(a) for a in raw_alerts]\nstats['normalized_count'] = len(normalized)\nprint(f'[normalize_tdp] {len(raw_alerts)} -> {len(normalized)}')\n\noutputs['normalized_alerts'] = normalized\noutputs['stats'] = stats\nfor k in ['source_log_type', 'filter_enabled', 'dedup_enabled',\n 'dedup_threshold', 'strict_fields', 
'lsh_fields', 'max_field_len',\n 'max_dedup_keys']:\n outputs[k] = inputs.get(k)" - }, - { - "id": "normalize_skyeye", - "type": "python", - "description": "Skyeye 字段归一化:将 Skyeye 原始字段(uri/agent/host/vuln_name/attack_result 等)映射为标准字段", - "code": "import uuid\n\nHTTP_METHODS = ['GET', 'POST', 'PUT', 'DELETE', 'HEAD', 'OPTIONS', 'PATCH', 'TRACE']\n\nSKYEYE_FIELD_MAP = {\n 'id': 'none',\n 'time': 'time',\n 'direction': 'none',\n 'sip': 'sip',\n 'dip': 'dip',\n 'sport': 'sport',\n 'dport': 'dport',\n 'net_type': 'none',\n 'net_app_proto': 'none',\n 'req_http_url': 'uri',\n 'req_user_agent': 'agent',\n 'req_host': 'host',\n 'req_line': 'none',\n 'req_header': 'req_header',\n 'req_body': 'req_body',\n 'req_cookie': 'none',\n 'req_body_len': 'none',\n 'rsp_status_code': 'rsp_status',\n 'rsp_line': 'none',\n 'rsp_header': 'rsp_header',\n 'rsp_body': 'rsp_body',\n 'rsp_body_len': 'rsp_body_len',\n 'threat_rule_id': 'rule_id',\n 'threat_name': 'vuln_name',\n 'threat_msg': 'vuln_desc',\n 'threat_ioc': 'none',\n 'threat_level': 'none',\n 'threat_severity': 'severity',\n 'threat_phase': 'none',\n 'threat_type': 'vuln_type',\n 'threat_tactic_id': 'attck_tactic',\n 'threat_technique_id': 'attck_tech',\n 'threat_result': 'attack_result',\n 'threat_confidence': 'confidence',\n 'connection_established': 'established',\n 'real_attack': 'attack_flag',\n}\n\ndef flatten_dict(d, prefix=''):\n res = {}\n for k, v in d.items():\n if isinstance(v, dict):\n res.update(flatten_dict(v, f'{prefix}{k}_'))\n else:\n res[f'{prefix}{k}'] = v\n return res\n\ndef make_uuid(norm):\n return str(uuid.uuid3(uuid.NAMESPACE_DNS, ''.join(str(v) for v in norm.values())))\n\ndef normalize_single(alert):\n flat = flatten_dict(alert)\n norm = {}\n for std_key, raw_key in SKYEYE_FIELD_MAP.items():\n norm[std_key] = flat.get(raw_key, 'none') if raw_key != 'none' else 'none'\n if norm.get('id') in ('none', None, ''):\n norm['id'] = make_uuid(norm)\n if norm.get('net_type') in ('none', None, ''):\n method = 
flat.get('method', 'none')\n norm['net_type'] = 'http' if method in HTTP_METHODS else ('none' if method == 'none' else 'other')\n return norm\n\nraw_alerts = inputs.get('raw_alerts', [])\nstats = dict(inputs.get('stats', {}))\nnormalized = [normalize_single(a) for a in raw_alerts]\nstats['normalized_count'] = len(normalized)\nprint(f'[normalize_skyeye] {len(raw_alerts)} -> {len(normalized)}')\n\noutputs['normalized_alerts'] = normalized\noutputs['stats'] = stats\nfor k in ['source_log_type', 'filter_enabled', 'dedup_enabled',\n 'dedup_threshold', 'strict_fields', 'lsh_fields', 'max_field_len',\n 'max_dedup_keys']:\n outputs[k] = inputs.get(k)" + "description": "Normalize TDP and Skyeye alerts into a unified schema — supports mixed batches. Each alert is individually classified via field signatures: TDP (nested net dict / behave_uuid / net_real_src_ip) or Skyeye (uri / vuln_name / attack_result / attack_flag). Falls back to the batch-level source_log_type hint (or tdp default) when no signature is found.", + "code": "\nimport uuid\n\nHTTP_METHODS = ['GET', 'POST', 'PUT', 'DELETE', 'HEAD', 'OPTIONS', 'PATCH', 'TRACE']\n\nTDP_FIELD_MAP = {\n 'customer_uuid': 'customer_uuid',\n 'device_id': 'device_id',\n 'id': 'id',\n 'time': 'time',\n 'direction': 'direction',\n 'sip': 'net_real_src_ip',\n 'dip': 'net_dest_ip',\n 'sport': 'net_src_port',\n 'dport': 'net_dest_port',\n 'net_type': 'net_type',\n 'net_app_proto': 'net_app_proto',\n 'req_http_url': 'net_http_url',\n 'req_user_agent':'net_http_reqs_user_agent',\n 'req_host': 'net_http_reqs_host',\n 'req_line': 'net_http_reqs_line',\n 'req_header': 'net_http_reqs_header',\n 'req_body': 'net_http_reqs_body',\n 'req_cookie': 'net_http_reqs_cookie',\n 'req_body_len': 'net_http_reqs_content_length',\n 'rsp_status_code': 'net_http_status',\n 'rsp_line': 'net_http_resp_line',\n 'rsp_header': 'net_http_resp_header',\n 'rsp_body': 'net_http_resp_body',\n 'rsp_body_len': 'net_http_resp_content_length',\n 'net_bytes_toclient': 
'net_bytes_toclient',\n 'net_bytes_toserver': 'net_bytes_toserver',\n 'threat_rule_id': 'threat_suuid',\n 'threat_name': 'threat_name',\n 'threat_msg': 'threat_msg',\n 'threat_ioc': 'threat_ioc',\n 'threat_level': 'threat_level',\n 'threat_severity': 'threat_severity',\n 'threat_phase': 'threat_phase',\n 'threat_type': 'threat_type',\n 'threat_result': 'threat_result',\n 'threat_confidence': 'threat_confidence',\n 'connection_established': 'established',\n 'asset_group_name': 'dest_assets_group_name',\n 'asset_name': 'dest_assets_latestName',\n}\n\nSKYEYE_FIELD_MAP = {\n 'id': 'none',\n 'time': 'time',\n 'direction': 'none',\n 'sip': 'sip',\n 'dip': 'dip',\n 'sport': 'sport',\n 'dport': 'dport',\n 'net_type': 'none',\n 'net_app_proto': 'none',\n 'req_http_url': 'uri',\n 'req_user_agent':'agent',\n 'req_host': 'host',\n 'req_line': 'none',\n 'req_header': 'req_header',\n 'req_body': 'req_body',\n 'req_cookie': 'none',\n 'req_body_len': 'none',\n 'rsp_status_code': 'rsp_status',\n 'rsp_line': 'none',\n 'rsp_header': 'rsp_header',\n 'rsp_body': 'rsp_body',\n 'rsp_body_len': 'rsp_body_len',\n 'threat_rule_id': 'rule_id',\n 'threat_name': 'vuln_name',\n 'threat_msg': 'vuln_desc',\n 'threat_ioc': 'none',\n 'threat_level': 'none',\n 'threat_severity': 'severity',\n 'threat_phase': 'none',\n 'threat_type': 'vuln_type',\n 'threat_tactic_id': 'attck_tactic',\n 'threat_technique_id': 'attck_tech',\n 'threat_result': 'attack_result',\n 'threat_confidence': 'confidence',\n 'connection_established': 'established',\n 'real_attack': 'attack_flag',\n}\n\ndef flatten_dict(d, prefix=''):\n res = {}\n for k, v in d.items():\n if isinstance(v, dict):\n res.update(flatten_dict(v, f'{prefix}{k}_'))\n else:\n res[f'{prefix}{k}'] = v\n return res\n\ndef make_uuid(norm):\n return str(uuid.uuid3(uuid.NAMESPACE_DNS, ''.join(str(v) for v in norm.values())))\n\ndef detect_alert_type(alert, batch_hint):\n # Per-alert source type detection.\n # Priority: JSON field signatures > batch_hint 
fallback.\n #\n # TDP signatures (raw or pre-flattened):\n # - nested 'net' dict (e.g. net.http.url)\n # - 'behave_uuid' or 'flow_id' key\n # - pre-flattened TDP keys: 'net_real_src_ip', 'net_http_url', 'threat_suuid'\n #\n # Skyeye signatures:\n # - 'uri', 'vuln_name', 'attack_result', 'attack_flag'\n if isinstance(alert.get('net'), dict):\n return 'tdp'\n if any(k in alert for k in ('behave_uuid', 'flow_id')):\n return 'tdp'\n if any(k in alert for k in ('net_real_src_ip', 'net_http_url', 'threat_suuid')):\n return 'tdp'\n if any(k in alert for k in ('uri', 'vuln_name', 'attack_result', 'attack_flag')):\n return 'skyeye'\n # No clear signature: use batch-level hint (or default 'tdp')\n return batch_hint\n\ndef normalize_single(alert, source_type):\n flat = flatten_dict(alert)\n field_map = TDP_FIELD_MAP if source_type == 'tdp' else SKYEYE_FIELD_MAP\n norm = {}\n for std_key, raw_key in field_map.items():\n norm[std_key] = flat.get(raw_key, 'none') if raw_key != 'none' else 'none'\n if norm.get('id') in ('none', None, ''):\n norm['id'] = make_uuid(norm)\n if norm.get('net_type') in ('none', None, ''):\n method = flat.get('method', 'none')\n norm['net_type'] = 'http' if method in HTTP_METHODS else ('none' if method == 'none' else 'other')\n norm['_source_type'] = source_type # carry detection result for downstream use\n return norm\n\nraw_alerts = inputs.get('raw_alerts', [])\nstats = dict(inputs.get('stats', {}))\n# batch_hint: explicit caller param or 'tdp'. Used only when per-alert\n# field detection is inconclusive (e.g. 
already-normalised data).\nbatch_hint = str(inputs.get('source_log_type', 'tdp') or 'tdp').lower()\nif batch_hint not in ('tdp', 'skyeye'):\n batch_hint = 'tdp'\n\ntype_counts = {'tdp': 0, 'skyeye': 0}\nnormalized = []\nfor alert in raw_alerts:\n src_type = detect_alert_type(alert, batch_hint)\n type_counts[src_type] = type_counts.get(src_type, 0) + 1\n normalized.append(normalize_single(alert, src_type))\n\nstats['normalized_count'] = len(normalized)\nstats['normalize_type_counts'] = type_counts\nprint(f'[normalize] {len(raw_alerts)} alerts -> {len(normalized)} normalized '\n f'(tdp={type_counts.get(\"tdp\",0)}, skyeye={type_counts.get(\"skyeye\",0)}, '\n f'batch_hint={batch_hint!r})')\n\noutputs['normalized_alerts'] = normalized\noutputs['stats'] = stats\nfor k in ['source_log_type', 'filter_enabled', 'dedup_enabled',\n 'dedup_threshold', 'strict_fields', 'lsh_fields', 'max_field_len',\n 'max_dedup_keys']:\n outputs[k] = inputs.get(k)\n" }, { "id": "filter_logs", "type": "python", - "description": "Step 2 — 过滤:9 种 process_type 分类;保留 non-scan + HTTP(任意方向 in/out/lateral)的告警", - "code": "normalized_alerts = inputs.get('normalized_alerts', [])\nfilter_enabled = inputs.get('filter_enabled', True)\nsource_log_type = str(inputs.get('source_log_type', 'tdp')).lower()\nstats = dict(inputs.get('stats', {}))\n\ndef is_scan_alert(threat_name):\n tnl = str(threat_name or '').lower()\n return ('扫描' in tnl) and ('webshell' not in tnl)\n\ndef get_threat_type(alert, source):\n if source == 'skyeye':\n return str(alert.get('threat_type', 'general') or 'general')\n return str(alert.get('threat_name', 'general') or 'general')\n\ndef is_http(alert):\n for field in ('application_layer_protocol', 'net_type', 'net_app_proto'):\n val = str(alert.get(field, '') or '').lower()\n if val and val != 'none' and 'http' in val:\n return True\n return False\n\ndef get_process_type(alert, source):\n threat_name = alert.get('threat_name', '')\n direction = str(alert.get('direction', '') or 
'').lower()\n scan = is_scan_alert(threat_name)\n http = is_http(alert)\n if source == 'skyeye':\n return 'alert_scan_direction_in' if scan else 'alert_not_scan_http_direction_in'\n if scan:\n return f'alert_scan_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_scan_direction_in'\n if http:\n return f'alert_not_scan_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_scan_http_direction_in'\n return f'alert_not_scan_not_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_process'\n\nNEED_ANALYSIS = {\n 'alert_not_scan_http_direction_in',\n 'alert_not_scan_http_direction_out',\n 'alert_not_scan_http_direction_lateral',\n}\n\nfiltered = []\nprocess_type_counts = {}\nfor alert in normalized_alerts:\n alert = dict(alert)\n if filter_enabled:\n ptype = get_process_type(alert, source_log_type)\n need = ptype in NEED_ANALYSIS\n threat_type = get_threat_type(alert, source_log_type)\n else:\n ptype = 'filter_disabled'\n need = True\n threat_type = get_threat_type(alert, source_log_type)\n process_type_counts[ptype] = process_type_counts.get(ptype, 0) + 1\n alert['_process_type'] = ptype\n alert['_threat_type'] = threat_type\n if need:\n filtered.append(alert)\n\nprint(f'[filter] input={len(normalized_alerts)}, kept={len(filtered)}')\nprint(f'[filter] process_type_counts={process_type_counts}')\n\nstats['after_filter_count'] = len(filtered)\nstats['filter_removed_count'] = len(normalized_alerts) - len(filtered)\nstats['filter_process_type_counts'] = process_type_counts\n\noutputs['filtered_alerts'] = filtered\noutputs['stats'] = stats\nfor k in ['dedup_enabled', 'dedup_threshold', 'strict_fields', 'lsh_fields', 'max_field_len',\n 'max_dedup_keys']:\n outputs[k] = inputs.get(k)" + "description": "Filter alerts using per-alert _source_type (mixed batches supported). Classifies into 9 process_types and keeps non-scan + HTTP alerts (direction in/out/lateral). 
For Skyeye, direction is normalised to inbound.", + "code": "\nnormalized_alerts = inputs.get('normalized_alerts', [])\nfilter_enabled = inputs.get('filter_enabled', True)\nbatch_hint = str(inputs.get('source_log_type', 'tdp') or 'tdp').lower()\nif batch_hint not in ('tdp', 'skyeye'):\n batch_hint = 'tdp'\nstats = dict(inputs.get('stats', {}))\n\ndef is_scan_alert(threat_name):\n tnl = str(threat_name or '').lower()\n return ('扫描' in tnl) and ('webshell' not in tnl)\n\ndef get_threat_type(alert):\n # Per-alert source detection: use the _source_type tag added by normalize\n # node (written even when fallback was used). Defaults to batch_hint when\n # the field is missing for any reason (e.g. legacy upstream).\n src = alert.get('_source_type') or batch_hint\n if src == 'skyeye':\n return str(alert.get('threat_type', 'general') or 'general')\n return str(alert.get('threat_name', 'general') or 'general')\n\ndef is_http(alert):\n for field in ('application_layer_protocol', 'net_type', 'net_app_proto'):\n val = str(alert.get(field, '') or '').lower()\n if val and val != 'none' and 'http' in val:\n return True\n return False\n\ndef get_process_type(alert):\n src = alert.get('_source_type') or batch_hint\n threat_name = alert.get('threat_name', '')\n direction = str(alert.get('direction', '') or '').lower()\n scan = is_scan_alert(threat_name)\n http = is_http(alert)\n if src == 'skyeye':\n # Skyeye direction is not reliable; assume inbound for routing.\n return 'alert_scan_direction_in' if scan else 'alert_not_scan_http_direction_in'\n if scan:\n return f'alert_scan_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_scan_direction_in'\n if http:\n return f'alert_not_scan_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_scan_http_direction_in'\n return f'alert_not_scan_not_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_process'\n\nNEED_ANALYSIS = {\n 
'alert_not_scan_http_direction_in',\n 'alert_not_scan_http_direction_out',\n 'alert_not_scan_http_direction_lateral',\n}\n\nfiltered = []\nprocess_type_counts = {}\nfor alert in normalized_alerts:\n alert = dict(alert)\n if filter_enabled:\n ptype = get_process_type(alert)\n need = ptype in NEED_ANALYSIS\n threat_type = get_threat_type(alert)\n else:\n ptype = 'filter_disabled'\n need = True\n threat_type = get_threat_type(alert)\n process_type_counts[ptype] = process_type_counts.get(ptype, 0) + 1\n alert['_process_type'] = ptype\n alert['_threat_type'] = threat_type\n if need:\n filtered.append(alert)\n\nprint(f'[filter] input={len(normalized_alerts)}, kept={len(filtered)}')\nprint(f'[filter] process_type_counts={process_type_counts}')\n\nstats['after_filter_count'] = len(filtered)\nstats['filter_removed_count'] = len(normalized_alerts) - len(filtered)\nstats['filter_process_type_counts'] = process_type_counts\n\noutputs['filtered_alerts'] = filtered\noutputs['stats'] = stats\nfor k in ['dedup_enabled', 'dedup_threshold', 'strict_fields', 'lsh_fields',\n 'max_field_len', 'max_dedup_keys']:\n outputs[k] = inputs.get(k)\n" }, { "id": "dedup_logs", @@ -44,28 +32,11 @@ "edges": [ { "from": "receive_alerts", - "to": "branch_log_type", - "order": 0 - }, - { - "from": "branch_log_type", - "to": "normalize_tdp", - "label": "tdp", - "order": 0 - }, - { - "from": "branch_log_type", - "to": "normalize_skyeye", - "label": "skyeye", - "order": 1 - }, - { - "from": "normalize_tdp", - "to": "filter_logs", + "to": "normalize", "order": 0 }, { - "from": "normalize_skyeye", + "from": "normalize", "to": "filter_logs", "order": 0 }, @@ -96,7 +67,8 @@ "threat_type": "web攻击" } ], - "max_dedup_keys": 100000 + "max_dedup_keys": 100000, + "_comment_source_log_type": "Optional. Explicit type hint used only when per-alert field detection is inconclusive. Values: 'tdp' (default) or 'skyeye'. Mixed batches are handled automatically." 
} } } \ No newline at end of file diff --git a/.flocks/plugins/workflows/tdp_alert_triage/workflow.json b/.flocks/plugins/workflows/tdp_alert_triage/workflow.json index eba7fb845..3e21eeaf0 100644 --- a/.flocks/plugins/workflows/tdp_alert_triage/workflow.json +++ b/.flocks/plugins/workflows/tdp_alert_triage/workflow.json @@ -7,8 +7,8 @@ { "id": "receive_alert", "type": "python", - "description": "解析 NDR/TDP 告警,提取 HTTP 请求/响应、IOC、威胁字段,并生成统一 log_text 供下游 LLM 使用", - "code": "\nimport json\nimport re\n\nalert_input = inputs.get('alert_data', inputs.get('alert', {}))\n\nif isinstance(alert_input, str):\n try:\n alert_input = json.loads(alert_input)\n except Exception:\n alert_input = {}\n\nif isinstance(alert_input, list):\n alert_data = alert_input[0] if alert_input else {}\nelif isinstance(alert_input, dict) and isinstance(alert_input.get('data'), list):\n alert_data = alert_input.get('data', [])[0] if alert_input.get('data') else {}\nelse:\n alert_data = alert_input if isinstance(alert_input, dict) else {}\n\nnet = alert_data.get('net', {}) or {}\nhttp = net.get('http', {}) or {}\nthreat = alert_data.get('threat', {}) or {}\nassets = alert_data.get('assets', {}) or {}\n\ndef pick(*values):\n for value in values:\n if value not in (None, '', [], {}):\n return value\n return ''\n\nsrc_ip = pick(alert_data.get('attacker'), alert_data.get('external_ip'),\n net.get('src_ip'), net.get('flow_src_ip'),\n alert_data.get('sip'), alert_data.get('src_ip'), alert_data.get('src'))\ndst_ip = pick(alert_data.get('victim'), alert_data.get('machine'),\n alert_data.get('server_ip'), net.get('dest_ip'), net.get('flow_dest_ip'),\n alert_data.get('dip'), alert_data.get('dst_ip'), alert_data.get('dst'))\nsrc_port = pick(net.get('src_port'), net.get('flow_src_port'),\n alert_data.get('external_port'), alert_data.get('sport'),\n alert_data.get('src_port'), 0)\ndst_port = pick(net.get('dest_port'), net.get('flow_dest_port'),\n alert_data.get('server_port'), alert_data.get('machine_port'),\n 
alert_data.get('dport'), alert_data.get('dst_port'), 0)\nprotocol = pick(net.get('app_proto'), net.get('type'), net.get('proto'),\n alert_data.get('net_app_proto'), alert_data.get('protocol'),\n alert_data.get('event_type'), 'TCP')\nalert_type = pick(threat.get('name'), alert_data.get('threat_name'),\n alert_data.get('alert_type'), threat.get('topic'),\n alert_data.get('type'), 'unknown')\nseverity = pick(threat.get('severity'), alert_data.get('threat_severity'),\n alert_data.get('severity'), threat.get('level'),\n alert_data.get('level'), 'medium')\n\nreq_line = pick(http.get('reqs_line'), alert_data.get('req_line'))\nreq_header = pick(http.get('reqs_header'), alert_data.get('req_header'))\nreq_body = pick(http.get('req_body'), alert_data.get('req_body'))\nresp_line = pick(http.get('resp_line'), alert_data.get('rsp_line'), alert_data.get('resp_line'))\nresp_header = pick(http.get('resp_header'), alert_data.get('rsp_header'), alert_data.get('resp_header'))\nresp_body = pick(http.get('resp_body'), alert_data.get('rsp_body'), alert_data.get('resp_body'))\nstatus = pick(http.get('status'), alert_data.get('http_status'), 0)\n\nhost = pick(http.get('reqs_host'), alert_data.get('url_host'), http.get('domain'), dst_ip)\nraw_url = pick(http.get('raw_url'), http.get('url'), alert_data.get('url_path'))\nurl = ''\nif host and raw_url:\n scheme = 'https' if net.get('is_https') else 'http'\n url = raw_url if str(raw_url).startswith(('http://', 'https://')) else f'{scheme}://{host}{raw_url}'\n\npayload = f'请求行: {req_line}\\n请求头: {req_header}\\n请求体: {req_body}'\nresponse = f'状态行: {resp_line}\\n响应头: {resp_header}\\n响应体: {resp_body}'\n\n# Render the full HTTP transaction once; reused by every prompt downstream.\nlog_text = (\n f'[告警基本信息]\\n'\n f'告警类型: {alert_type}\\n严重级别: {severity}\\n'\n f'源地址: {src_ip}:{src_port}\\n目的地址: {dst_ip}:{dst_port}\\n'\n f'协议: {protocol}\\nURL: {url}\\nHTTP状态码: {status}\\n'\n f'TDP判定: {threat.get(\"result\", \"\")}\\nTDP消息: {threat.get(\"msg\", 
\"\")}\\n\\n'\n f'[HTTP请求内容]\\n{payload}\\n\\n'\n f'[HTTP响应内容]\\n{response}'\n)\n\n# Pre-extract obvious vuln IDs from raw threat fields; the LLM step will refine this later.\nvuln_text = '\\n'.join(str(item) for item in [\n threat.get('msg', ''), threat.get('topic', ''),\n alert_data.get('data', ''), url,\n json.dumps(threat.get('tag', []), ensure_ascii=False),\n] if item)\nvuln_matches = sorted(set(re.findall(r'\\b(?:CVE|CNVD|CNNVD|XVE)-[A-Za-z0-9._-]+\\b', vuln_text, flags=re.I)))\n\niocs = []\nfor candidate in [src_ip, dst_ip]:\n if candidate:\n iocs.append({'type': 'ip', 'value': candidate})\nif url:\n iocs.append({'type': 'url', 'value': url})\nif host and not re.match(r'^\\d{1,3}(?:\\.\\d{1,3}){3}(?::\\d+)?$', str(host)):\n iocs.append({'type': 'domain', 'value': str(host).split(':')[0]})\n\noutputs['parsed_alert'] = {\n 'src_ip': src_ip, 'dst_ip': dst_ip, 'src_port': src_port, 'dst_port': dst_port,\n 'protocol': protocol, 'payload': payload, 'response': response,\n 'url': url, 'status': status,\n 'alert_type': alert_type, 'severity': severity,\n 'vuln_id': vuln_matches[0] if vuln_matches else '',\n 'vuln_candidates': vuln_matches,\n 'threat_result': threat.get('result', ''),\n 'threat_msg': threat.get('msg', ''),\n 'failed_by': threat.get('failed_by', []),\n 'asset_ip': assets.get('ip', ''), 'asset_name': assets.get('name', []),\n 'iocs': iocs, 'raw_alert': alert_data, 'log_text': log_text,\n}\noutputs['log_text'] = log_text\noutputs['iocs'] = iocs\n" + "description": "解析告警,支持三种原始格式:嵌套 TDP(net.http.url)、扁平 TDP(net_real_src_ip/net_http_url/net_http_reqs_body 等)、归一化 schema(sip/dip/req_http_url/req_body 等)。提取 HTTP 请求/响应、IOC、威胁字段,生成统一 log_text 供下游 LLM 使用。", + "code": "\nimport json\nimport re\n\nalert_input = inputs.get('alert_data', inputs.get('alert', {}))\n\nif isinstance(alert_input, str):\n try:\n alert_input = json.loads(alert_input)\n except Exception:\n alert_input = {}\n\nif isinstance(alert_input, list):\n alert_data = alert_input[0] if alert_input 
else {}\nelif isinstance(alert_input, dict) and isinstance(alert_input.get('data'), list):\n alert_data = alert_input.get('data', [])[0] if alert_input.get('data') else {}\nelse:\n alert_data = alert_input if isinstance(alert_input, dict) else {}\n\nnet = alert_data.get('net', {}) or {}\nhttp = net.get('http', {}) or {}\nthreat = alert_data.get('threat', {}) or {}\nassets = alert_data.get('assets', {}) or {}\n\ndef pick(*values):\n for value in values:\n if value not in (None, '', [], {}):\n return value\n return ''\n\n# Support three raw formats:\n# 1. Nested TDP: net.src_ip / net.http.reqs_line etc.\n# 2. Flat TDP: net_real_src_ip / net_http_url / net_http_reqs_body etc.\n# 3. Normalized: sip / dip / req_http_url / req_body / rsp_body etc.\nsrc_ip = pick(\n alert_data.get('attacker'), alert_data.get('external_ip'),\n net.get('src_ip'), net.get('flow_src_ip'),\n alert_data.get('net_real_src_ip'),\n alert_data.get('sip'), alert_data.get('src_ip'), alert_data.get('src'),\n)\ndst_ip = pick(\n alert_data.get('victim'), alert_data.get('machine'),\n alert_data.get('server_ip'), net.get('dest_ip'), net.get('flow_dest_ip'),\n alert_data.get('net_dest_ip'),\n alert_data.get('dip'), alert_data.get('dst_ip'), alert_data.get('dst'),\n)\nsrc_port = pick(\n net.get('src_port'), net.get('flow_src_port'),\n alert_data.get('external_port'), alert_data.get('net_src_port'),\n alert_data.get('sport'), alert_data.get('src_port'), 0,\n)\ndst_port = pick(\n net.get('dest_port'), net.get('flow_dest_port'),\n alert_data.get('server_port'), alert_data.get('machine_port'),\n alert_data.get('net_dest_port'),\n alert_data.get('dport'), alert_data.get('dst_port'), 0,\n)\nprotocol = pick(\n net.get('app_proto'), net.get('type'), net.get('proto'),\n alert_data.get('net_app_proto'), alert_data.get('protocol'),\n alert_data.get('event_type'), 'TCP',\n)\nalert_type = pick(\n threat.get('name'), alert_data.get('threat_name'),\n alert_data.get('vuln_name'),\n alert_data.get('alert_type'), 
threat.get('topic'),\n alert_data.get('type'), 'unknown',\n)\nseverity = pick(\n threat.get('severity'), alert_data.get('threat_severity'),\n alert_data.get('severity'), threat.get('level'),\n alert_data.get('level'), 'medium',\n)\n\nreq_line = pick(\n http.get('reqs_line'),\n alert_data.get('req_line'), alert_data.get('net_http_reqs_line'),\n)\nreq_header = pick(\n http.get('reqs_header'),\n alert_data.get('req_header'), alert_data.get('net_http_reqs_header'),\n)\nreq_body = pick(\n http.get('req_body'),\n alert_data.get('req_body'), alert_data.get('net_http_reqs_body'),\n)\nresp_line = pick(\n http.get('resp_line'),\n alert_data.get('rsp_line'), alert_data.get('resp_line'),\n alert_data.get('net_http_resp_line'),\n)\nresp_header = pick(\n http.get('resp_header'),\n alert_data.get('rsp_header'), alert_data.get('resp_header'),\n alert_data.get('net_http_resp_header'),\n)\nresp_body = pick(\n http.get('resp_body'),\n alert_data.get('rsp_body'), alert_data.get('resp_body'),\n alert_data.get('net_http_resp_body'),\n)\nstatus = pick(\n http.get('status'),\n alert_data.get('http_status'), alert_data.get('net_http_status'),\n alert_data.get('rsp_status_code'), 0,\n)\n\nhost = pick(\n http.get('reqs_host'), alert_data.get('url_host'), http.get('domain'),\n alert_data.get('req_host'), alert_data.get('net_http_reqs_host'),\n dst_ip,\n)\nraw_url = pick(\n http.get('raw_url'), http.get('url'),\n alert_data.get('url_path'),\n alert_data.get('net_http_url'), alert_data.get('req_http_url'),\n alert_data.get('uri'),\n)\nurl = ''\nif host and raw_url:\n scheme = 'https' if net.get('is_https') else 'http'\n url = raw_url if str(raw_url).startswith(('http://', 'https://')) else f'{scheme}://{host}{raw_url}'\nelif raw_url and str(raw_url).startswith(('http://', 'https://')):\n url = raw_url\nelif raw_url:\n # Relative path only — keep as-is for log readability\n url = raw_url\n\npayload = f'请求行: {req_line}\\n请求头: {req_header}\\n请求体: {req_body}'\nresponse = f'状态行: {resp_line}\\n响应头: 
{resp_header}\\n响应体: {resp_body}'\n\n# threat_result / threat_msg: also check flat TDP field names\nthreat_result = pick(threat.get('result'), alert_data.get('threat_result'))\nthreat_msg = pick(threat.get('msg'), alert_data.get('threat_msg'))\n\nlog_text = (\n f'[告警基本信息]\\n'\n f'告警类型: {alert_type}\\n严重级别: {severity}\\n'\n f'源地址: {src_ip}:{src_port}\\n目的地址: {dst_ip}:{dst_port}\\n'\n f'协议: {protocol}\\nURL: {url}\\nHTTP状态码: {status}\\n'\n f'TDP判定: {threat_result}\\nTDP消息: {threat_msg}\\n\\n'\n f'[HTTP请求内容]\\n{payload}\\n\\n'\n f'[HTTP响应内容]\\n{response}'\n)\n\nvuln_text = '\\n'.join(str(item) for item in [\n threat_msg, threat.get('topic', ''),\n alert_data.get('data', ''), url,\n json.dumps(threat.get('tag', []), ensure_ascii=False),\n] if item)\nvuln_matches = sorted(set(re.findall(r'\\b(?:CVE|CNVD|CNNVD|XVE)-[A-Za-z0-9._-]+\\b', vuln_text, flags=re.I)))\n\niocs = []\nfor candidate in [src_ip, dst_ip]:\n if candidate:\n iocs.append({'type': 'ip', 'value': candidate})\nif url:\n iocs.append({'type': 'url', 'value': url})\nif host and not re.match(r'^\\d{1,3}(?:\\.\\d{1,3}){3}(?::\\d+)?$', str(host)):\n iocs.append({'type': 'domain', 'value': str(host).split(':')[0]})\n\noutputs['parsed_alert'] = {\n 'src_ip': src_ip, 'dst_ip': dst_ip, 'src_port': src_port, 'dst_port': dst_port,\n 'protocol': protocol, 'payload': payload, 'response': response,\n 'url': url, 'status': status,\n 'alert_type': alert_type, 'severity': severity,\n 'vuln_id': vuln_matches[0] if vuln_matches else '',\n 'vuln_candidates': vuln_matches,\n 'threat_result': threat_result,\n 'threat_msg': threat_msg,\n 'failed_by': threat.get('failed_by', []),\n 'asset_ip': assets.get('ip', ''), 'asset_name': assets.get('name', []),\n 'iocs': iocs, 'raw_alert': alert_data, 'log_text': log_text,\n}\noutputs['log_text'] = log_text\noutputs['iocs'] = iocs\n" }, { "id": "prepare_intel", From 1b5e36a21ada23f23538ba4f4c8422f976cb7ccf Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Tue, 12 May 2026 13:09:45 +0800 
Subject: [PATCH 29/41] feat(ingest/workflow): add stream_alert_dedup workflow and iso3164 syslog support - Add stream_alert_dedup workflow: streaming single-record deduplication via syslog UDP input (RFC3164/5424/iso3164), with MinHash LSH (128 perms, 5-gram shingles, Jaccard threshold 0.7). Output is original alert JSON enriched with dedup_key and is_duplicate fields, written to date-partitioned JSONL files (~/.flocks/workspace/workflows/stream_alert_dedup/YYYY-MM-DD/) with a max of 10,000 records per file and a timestamped file header. - Fix syslog parser to handle non-standard iso3164 format used by TDP devices: ISO_TS HOSTNAME APP[PID]: msg (ISO 8601 timestamp, no RFC5424 version number). Adds _ISO3164_REST_RE regex, _parse_iso3164() handler, and auto- detection in parse_syslog(). Result carries format="iso3164" for traceability. Existing rfc3164 and rfc5424 parsing is unchanged. Co-authored-by: Cursor --- .../stream_alert_dedup/workflow.json | 84 +++++++++++ .../workflows/stream_alert_dedup/workflow.md | 138 ++++++++++++++++++ flocks/ingest/syslog/parser.py | 35 ++++- 3 files changed, 256 insertions(+), 1 deletion(-) create mode 100644 .flocks/plugins/workflows/stream_alert_dedup/workflow.json create mode 100644 .flocks/plugins/workflows/stream_alert_dedup/workflow.md diff --git a/.flocks/plugins/workflows/stream_alert_dedup/workflow.json b/.flocks/plugins/workflows/stream_alert_dedup/workflow.json new file mode 100644 index 000000000..6fec44154 --- /dev/null +++ b/.flocks/plugins/workflows/stream_alert_dedup/workflow.json @@ -0,0 +1,84 @@ +{ + "name": "stream_alert_dedup", + "description": "Streaming-friendly HTTP alert deduplication pipeline: supports syslog real-time single alerts, alerts list, or alert_file path. Normalizes (TDP/Skyeye auto-detection, mixed batches) -> filters (remove scans / non-HTTP) -> deduplicates (URI normalization + 5-gram Jaccard MinHash LSH). 
Each output alert carries the full normalized fields plus dedup annotation: dedup_key (MD5), is_duplicate (cross-batch), _lsh_cluster_id, _source_type, _process_type. Results appended to JSONL files: ~/.flocks/workspace/workflows/stream_alert_dedup/YYYY-MM-DD/dedup_result_NNN.jsonl (max 10,000 records per file; each file starts with a timestamp header line).", + "description_cn": "流式 HTTP 告警去重 Pipeline。支持三种输入:syslog 实时单条、alerts 批次列表、alert_file 文件路径。处理流程:归一化 → 过滤 → 去重(URI 归一化 + 5-gram MinHash LSH,跨批次持久化,FIFO LRU)。输出告警保留全部归一化字段,追加去重字段:dedup_key、is_duplicate、_lsh_cluster_id 等。结果追加写入 ~/.flocks/workspace/workflows/stream_alert_dedup/YYYY-MM-DD/dedup_result_NNN.jsonl,每文件最多 10,000 条(不含首行 header),超出时自动新建序号文件,每个文件首行为含时间戳的 header JSON 行。", + "start": "receive_alert", + "nodes": [ + { + "id": "receive_alert", + "type": "python", + "description": "Parse incoming alert(s): syslog_message (single alert, RFC3164/5424) > alerts list > alert_file. Auto-detects source_log_type (TDP/Skyeye) from syslog app_name/hostname, then JSON field signatures, then defaults to 'tdp'.", + "code": "\nimport json\nimport os\n\n# Input priority: syslog_message > alerts > alert_file\nalerts_input = []\ninput_mode = 'unknown'\n_syslog_msg = None\n\nsyslog_msg = inputs.get('syslog_message') or inputs.get('syslog')\nif syslog_msg and isinstance(syslog_msg, dict):\n raw_text = syslog_msg.get('message', '')\n if raw_text:\n try:\n alert = json.loads(raw_text)\n alert['_syslog_meta'] = {\n 'hostname': syslog_msg.get('hostname', ''),\n 'app_name': syslog_msg.get('app_name', ''),\n 'timestamp': syslog_msg.get('timestamp', ''),\n 'severity': syslog_msg.get('severity'),\n 'facility': syslog_msg.get('facility'),\n 'format': syslog_msg.get('format', ''),\n }\n alerts_input = [alert]\n input_mode = 'syslog'\n _syslog_msg = syslog_msg\n print(f'[receive] syslog mode: host={syslog_msg.get(\"hostname\")!r} '\n f'app={syslog_msg.get(\"app_name\")!r} '\n f'severity={syslog_msg.get(\"severity\")} '\n 
f'format={syslog_msg.get(\"format\")!r}')\n except (json.JSONDecodeError, TypeError) as _e:\n print(f'[receive] WARNING: syslog.message not valid JSON ({_e}), '\n f'raw={raw_text[:120]!r}')\n else:\n print('[receive] WARNING: syslog_message.message is empty, skipping')\n\nif not alerts_input:\n alerts_input = inputs.get('alerts', inputs.get('alert_list', []))\n if alerts_input:\n input_mode = 'alerts'\n\nif not alerts_input:\n alert_file = inputs.get('alert_file', '')\n if alert_file:\n alert_file = os.path.expanduser(str(alert_file))\n try:\n with open(alert_file, 'r', encoding='utf-8') as _f:\n alerts_input = json.load(_f)\n input_mode = 'alert_file'\n print(f'[receive] file mode: loaded {len(alerts_input) if isinstance(alerts_input, list) else 1} alerts')\n except Exception as _e:\n print(f'[receive] WARNING: failed to load alert_file={alert_file!r}: {_e}')\n alerts_input = []\n\nif isinstance(alerts_input, str):\n try:\n alerts_input = json.loads(alerts_input)\n except Exception:\n alerts_input = []\nif isinstance(alerts_input, dict) and 'data' in alerts_input:\n alerts_input = alerts_input.get('data', [])\nif not isinstance(alerts_input, list):\n alerts_input = [alerts_input] if alerts_input else []\n\nif not alerts_input:\n print('[receive] WARNING: no alerts (syslog_message, alerts, alert_file all empty)')\n\n# ── Source log type resolution ────────────────────────────────────────────────\ndef _detect_from_syslog_meta(sm):\n for field in ('app_name', 'hostname'):\n val = str(sm.get(field, '') or '').lower()\n if 'skyeye' in val:\n return 'skyeye', f'syslog.{field}={sm.get(field)!r}'\n if 'tdp' in val:\n return 'tdp', f'syslog.{field}={sm.get(field)!r}'\n return None, None\n\ndef _detect_from_alert_json(alert):\n if not isinstance(alert, dict):\n return None, None\n if isinstance(alert.get('net'), dict):\n return 'tdp', 'alert has nested net dict (TDP)'\n if any(k in alert for k in ('behave_uuid', 'flow_id')):\n return 'tdp', 'alert has behave_uuid/flow_id 
(TDP)'\n if any(k in alert for k in ('net_real_src_ip', 'net_http_url', 'threat_suuid')):\n return 'tdp', 'alert has pre-flattened TDP fields'\n if any(k in alert for k in ('uri', 'vuln_name', 'attack_result', 'attack_flag')):\n return 'skyeye', 'alert has uri/vuln_name/attack_result (Skyeye)'\n return None, None\n\nexplicit_type = str(inputs.get('source_log_type', '') or '').lower()\nif explicit_type in ('tdp', 'skyeye'):\n source_log_type = explicit_type\n source_log_type_reason = 'explicit input parameter'\nelif input_mode == 'syslog' and _syslog_msg:\n source_log_type, reason = _detect_from_syslog_meta(_syslog_msg)\n if source_log_type:\n source_log_type_reason = f'syslog metadata: {reason}'\n else:\n first_alert = alerts_input[0] if alerts_input else {}\n source_log_type, reason = _detect_from_alert_json(first_alert)\n if source_log_type:\n source_log_type_reason = f'JSON field detection: {reason}'\n else:\n source_log_type = 'tdp'\n source_log_type_reason = 'fallback default'\nelse:\n first_alert = alerts_input[0] if alerts_input else {}\n source_log_type, reason = _detect_from_alert_json(first_alert)\n if source_log_type:\n source_log_type_reason = f'JSON field detection: {reason}'\n else:\n source_log_type = 'tdp'\n source_log_type_reason = 'default'\n\nprint(f'[receive] source_log_type={source_log_type!r} reason={source_log_type_reason!r}')\n\nfilter_enabled = bool(inputs.get('filter_enabled', True))\ndedup_enabled = bool(inputs.get('dedup_enabled', True))\nthreshold = float(inputs.get('threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_field_len = int(inputs.get('max_field_len', 500))\nmax_dedup_keys = int(inputs.get('max_dedup_keys', 100000))\nif max_dedup_keys < 1:\n max_dedup_keys = 100000\n\nprint(f'[receive] input_mode={input_mode} raw_alerts={len(alerts_input)} '\n f'filter_enabled={filter_enabled} dedup_enabled={dedup_enabled} '\n 
f'max_dedup_keys={max_dedup_keys}')\n\noutputs['raw_alerts'] = alerts_input\noutputs['input_mode'] = input_mode\noutputs['source_log_type'] = source_log_type\noutputs['source_log_type_reason'] = source_log_type_reason\noutputs['filter_enabled'] = filter_enabled\noutputs['dedup_enabled'] = dedup_enabled\noutputs['dedup_threshold'] = threshold\noutputs['strict_fields'] = strict_fields\noutputs['lsh_fields'] = lsh_fields\noutputs['max_field_len'] = max_field_len\noutputs['max_dedup_keys'] = max_dedup_keys\noutputs['stats'] = {'raw_count': len(alerts_input)}\n" + }, + { + "id": "normalize", + "type": "python", + "description": "Normalize TDP and Skyeye alerts into a unified schema. Per-alert type detection via field signatures; falls back to batch_hint. Carries _syslog_meta and _source_type to downstream nodes.", + "code": "\nimport uuid\n\nHTTP_METHODS = ['GET', 'POST', 'PUT', 'DELETE', 'HEAD', 'OPTIONS', 'PATCH', 'TRACE']\n\nTDP_FIELD_MAP = {\n 'customer_uuid': 'customer_uuid',\n 'device_id': 'device_id',\n 'id': 'id',\n 'time': 'time',\n 'direction': 'direction',\n 'sip': 'net_real_src_ip',\n 'dip': 'net_dest_ip',\n 'sport': 'net_src_port',\n 'dport': 'net_dest_port',\n 'net_type': 'net_type',\n 'net_app_proto': 'net_app_proto',\n 'req_http_url': 'net_http_url',\n 'req_user_agent':'net_http_reqs_user_agent',\n 'req_host': 'net_http_reqs_host',\n 'req_line': 'net_http_reqs_line',\n 'req_header': 'net_http_reqs_header',\n 'req_body': 'net_http_reqs_body',\n 'req_cookie': 'net_http_reqs_cookie',\n 'req_body_len': 'net_http_reqs_content_length',\n 'rsp_status_code': 'net_http_status',\n 'rsp_line': 'net_http_resp_line',\n 'rsp_header': 'net_http_resp_header',\n 'rsp_body': 'net_http_resp_body',\n 'rsp_body_len': 'net_http_resp_content_length',\n 'net_bytes_toclient': 'net_bytes_toclient',\n 'net_bytes_toserver': 'net_bytes_toserver',\n 'threat_rule_id': 'threat_suuid',\n 'threat_name': 'threat_name',\n 'threat_msg': 'threat_msg',\n 'threat_ioc': 'threat_ioc',\n 
'threat_level': 'threat_level',\n 'threat_severity': 'threat_severity',\n 'threat_phase': 'threat_phase',\n 'threat_type': 'threat_type',\n 'threat_result': 'threat_result',\n 'threat_confidence': 'threat_confidence',\n 'connection_established': 'established',\n 'asset_group_name': 'dest_assets_group_name',\n 'asset_name': 'dest_assets_latestName',\n}\n\nSKYEYE_FIELD_MAP = {\n 'id': 'none',\n 'time': 'time',\n 'direction': 'none',\n 'sip': 'sip',\n 'dip': 'dip',\n 'sport': 'sport',\n 'dport': 'dport',\n 'net_type': 'none',\n 'net_app_proto': 'none',\n 'req_http_url': 'uri',\n 'req_user_agent':'agent',\n 'req_host': 'host',\n 'req_line': 'none',\n 'req_header': 'req_header',\n 'req_body': 'req_body',\n 'req_cookie': 'none',\n 'req_body_len': 'none',\n 'rsp_status_code': 'rsp_status',\n 'rsp_line': 'none',\n 'rsp_header': 'rsp_header',\n 'rsp_body': 'rsp_body',\n 'rsp_body_len': 'rsp_body_len',\n 'threat_rule_id': 'rule_id',\n 'threat_name': 'vuln_name',\n 'threat_msg': 'vuln_desc',\n 'threat_ioc': 'none',\n 'threat_level': 'none',\n 'threat_severity': 'severity',\n 'threat_phase': 'none',\n 'threat_type': 'vuln_type',\n 'threat_tactic_id': 'attck_tactic',\n 'threat_technique_id': 'attck_tech',\n 'threat_result': 'attack_result',\n 'threat_confidence': 'confidence',\n 'connection_established': 'established',\n 'real_attack': 'attack_flag',\n}\n\ndef flatten_dict(d, prefix=''):\n res = {}\n for k, v in d.items():\n if isinstance(v, dict):\n res.update(flatten_dict(v, f'{prefix}{k}_'))\n else:\n res[f'{prefix}{k}'] = v\n return res\n\ndef make_uuid(norm):\n return str(uuid.uuid3(uuid.NAMESPACE_DNS, ''.join(str(v) for v in norm.values())))\n\ndef detect_alert_type(alert, batch_hint):\n if isinstance(alert.get('net'), dict):\n return 'tdp'\n if any(k in alert for k in ('behave_uuid', 'flow_id')):\n return 'tdp'\n if any(k in alert for k in ('net_real_src_ip', 'net_http_url', 'threat_suuid')):\n return 'tdp'\n if any(k in alert for k in ('uri', 'vuln_name', 
'attack_result', 'attack_flag')):\n return 'skyeye'\n return batch_hint\n\ndef normalize_single(alert, source_type):\n flat = flatten_dict(alert)\n field_map = TDP_FIELD_MAP if source_type == 'tdp' else SKYEYE_FIELD_MAP\n norm = {}\n for std_key, raw_key in field_map.items():\n norm[std_key] = flat.get(raw_key, 'none') if raw_key != 'none' else 'none'\n if norm.get('id') in ('none', None, ''):\n norm['id'] = make_uuid(norm)\n if norm.get('net_type') in ('none', None, ''):\n method = flat.get('method', 'none')\n norm['net_type'] = 'http' if method in HTTP_METHODS else ('none' if method == 'none' else 'other')\n norm['_source_type'] = source_type\n # Carry syslog metadata if present\n if '_syslog_meta' in alert:\n norm['_syslog_meta'] = alert['_syslog_meta']\n return norm\n\nraw_alerts = inputs.get('raw_alerts', [])\nstats = dict(inputs.get('stats', {}))\nbatch_hint = str(inputs.get('source_log_type', 'tdp') or 'tdp').lower()\nif batch_hint not in ('tdp', 'skyeye'):\n batch_hint = 'tdp'\n\ntype_counts = {'tdp': 0, 'skyeye': 0}\nnormalized = []\nfor alert in raw_alerts:\n src_type = detect_alert_type(alert, batch_hint)\n type_counts[src_type] = type_counts.get(src_type, 0) + 1\n normalized.append(normalize_single(alert, src_type))\n\nstats['normalized_count'] = len(normalized)\nstats['normalize_type_counts'] = type_counts\nprint(f'[normalize] {len(raw_alerts)} alerts -> {len(normalized)} normalized '\n f'(tdp={type_counts.get(\"tdp\",0)}, skyeye={type_counts.get(\"skyeye\",0)}, '\n f'batch_hint={batch_hint!r})')\n\noutputs['normalized_alerts'] = normalized\noutputs['stats'] = stats\nfor k in ['input_mode', 'source_log_type', 'filter_enabled', 'dedup_enabled',\n 'dedup_threshold', 'strict_fields', 'lsh_fields', 'max_field_len', 'max_dedup_keys']:\n outputs[k] = inputs.get(k)\n" + }, + { + "id": "filter_logs", + "type": "python", + "description": "Filter: classify into 9 process_types, keep non-scan HTTP alerts (direction in/out/lateral). 
Adds _process_type and _threat_type fields. When filter_enabled=False, all alerts pass through.", + "code": "\nnormalized_alerts = inputs.get('normalized_alerts', [])\nfilter_enabled = inputs.get('filter_enabled', True)\nbatch_hint = str(inputs.get('source_log_type', 'tdp') or 'tdp').lower()\nif batch_hint not in ('tdp', 'skyeye'):\n batch_hint = 'tdp'\nstats = dict(inputs.get('stats', {}))\n\ndef is_scan_alert(threat_name):\n tnl = str(threat_name or '').lower()\n return ('扫描' in tnl) and ('webshell' not in tnl)\n\ndef get_threat_type(alert):\n src = alert.get('_source_type') or batch_hint\n if src == 'skyeye':\n return str(alert.get('threat_type', 'general') or 'general')\n return str(alert.get('threat_name', 'general') or 'general')\n\ndef is_http(alert):\n for field in ('application_layer_protocol', 'net_type', 'net_app_proto'):\n val = str(alert.get(field, '') or '').lower()\n if val and val != 'none' and 'http' in val:\n return True\n return False\n\ndef get_process_type(alert):\n src = alert.get('_source_type') or batch_hint\n threat_name = alert.get('threat_name', '')\n direction = str(alert.get('direction', '') or '').lower()\n scan = is_scan_alert(threat_name)\n http = is_http(alert)\n if src == 'skyeye':\n return 'alert_scan_direction_in' if scan else 'alert_not_scan_http_direction_in'\n if scan:\n return f'alert_scan_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_scan_direction_in'\n if http:\n return f'alert_not_scan_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_scan_http_direction_in'\n return f'alert_not_scan_not_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_process'\n\nNEED_ANALYSIS = {\n 'alert_not_scan_http_direction_in',\n 'alert_not_scan_http_direction_out',\n 'alert_not_scan_http_direction_lateral',\n}\n\nfiltered = []\nprocess_type_counts = {}\nfor alert in normalized_alerts:\n alert = dict(alert)\n if filter_enabled:\n ptype = 
get_process_type(alert)\n need = ptype in NEED_ANALYSIS\n threat_type = get_threat_type(alert)\n else:\n ptype = 'filter_disabled'\n need = True\n threat_type = get_threat_type(alert)\n process_type_counts[ptype] = process_type_counts.get(ptype, 0) + 1\n alert['_process_type'] = ptype\n alert['_threat_type'] = threat_type\n if need:\n filtered.append(alert)\n\nprint(f'[filter] input={len(normalized_alerts)}, kept={len(filtered)}')\nprint(f'[filter] process_type_counts={process_type_counts}')\n\nstats['after_filter_count'] = len(filtered)\nstats['filter_removed_count'] = len(normalized_alerts) - len(filtered)\nstats['filter_process_type_counts'] = process_type_counts\n\noutputs['filtered_alerts'] = filtered\noutputs['stats'] = stats\nfor k in ['input_mode', 'dedup_enabled', 'dedup_threshold', 'strict_fields',\n 'lsh_fields', 'max_field_len', 'max_dedup_keys']:\n outputs[k] = inputs.get(k)\n" + }, + { + "id": "dedup_and_write", + "type": "python", + "description": "Dedup (terminal): URI normalization + MinHash LSH (128 perms, 5-gram). LSH state persisted to ~/.flocks/workspace/workflows/stream_alert_dedup/ (atomic write, file lock, FIFO LRU eviction). Each output alert = normalized fields + dedup_key + is_duplicate + _lsh_cluster_id. Appends enriched alerts to JSONL files under ~/.flocks/workspace/workflows/stream_alert_dedup/<YYYY-MM-DD>/dedup_result_NNN.jsonl. 
Each new file begins with a header line {_type:file_header, created_at, ...}; max 10,000 alert records per file, auto-increments sequence number on rollover.", + "code": "\nimport os\nimport re\nimport sys\nimport json\nimport pickle\nimport hashlib\nimport datetime\nfrom datasketch import MinHash, MinHashLSH\n\nIS_WINDOWS = sys.platform == 'win32'\nif IS_WINDOWS:\n import msvcrt # noqa: F401\nelse:\n import fcntl # noqa: F401\n\nMINHASH_SEED = 2024\nNUM_PERM = 128\nWORKFLOW_NAME = 'stream_alert_dedup'\nLSH_CLUSTER_WARN_THRESHOLD = 100000\n\ndef normalize_uri(uri):\n uri = str(uri or '')\n uri = re.sub(r'\\d{4}-\\d{2}-\\d{2}', 'DATETIME', uri)\n uri = re.sub(r'[\\da-f]{8}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{12}', 'UUID', uri, flags=re.IGNORECASE)\n uri = re.sub(r'(\\.\\./)+', 'TRAVERSAL', uri)\n uri = re.sub(r'\\bNULL\\b', 'NULL_REPLACED', uri)\n uri = re.sub(r'chr\\$\\d+\\$\\|\\|chr\\$\\d+\\$', 'CHR_SEQUENCE', uri)\n uri = re.sub(r'\\b\\d+={1,2}\\d+\\b', 'NUMBER_COMPARISON', uri)\n uri = re.sub(r'\\b[a-fA-F0-9]{32}\\b', 'HEXADECIMAL CHARACTERS', uri)\n return uri\n\ndef gen_minhash(text, permutations):\n shingles = [text[i:i+5] for i in range(len(text) - 4)]\n m = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED, permutations=permutations)\n for s in shingles:\n m.update(s.encode('utf-8'))\n return m\n\ndef get_state_paths(threshold):\n from flocks.config import Config\n flocks_root = Config().get_global().data_dir.parent\n state_dir = str(flocks_root / 'workspace' / 'workflows' / WORKFLOW_NAME)\n os.makedirs(state_dir, exist_ok=True)\n base = os.path.join(state_dir, f'lsh_state_np{NUM_PERM}_th{int(threshold * 100)}')\n return base + '.pkl', base + '.lock'\n\ndef get_output_dir():\n from flocks.config import Config\n from pathlib import Path\n flocks_root = Config().get_global().data_dir.parent\n date_str = datetime.datetime.now().strftime('%Y-%m-%d')\n out_dir = flocks_root / 'workspace' / 'workflows' / WORKFLOW_NAME / date_str\n out_dir.mkdir(parents=True, 
exist_ok=True)\n return str(out_dir)\n\ndef acquire_lock(lock_path):\n fh = open(lock_path, 'w+')\n if IS_WINDOWS:\n fh.write('L'); fh.flush(); fh.seek(0)\n while True:\n try:\n msvcrt.locking(fh.fileno(), msvcrt.LK_LOCK, 1); break\n except OSError:\n continue\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_EX)\n return fh\n\ndef release_lock(fh):\n try:\n if IS_WINDOWS:\n try:\n fh.seek(0); msvcrt.locking(fh.fileno(), msvcrt.LK_UNLCK, 1)\n except OSError:\n pass\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_UN)\n finally:\n fh.close()\n\ndef load_state(state_path, threshold):\n if not os.path.exists(state_path) or os.path.getsize(state_path) == 0:\n return None, None, None, 0\n try:\n with open(state_path, 'rb') as f:\n state = pickle.load(f)\n if state.get('num_perm') != NUM_PERM or state.get('threshold') != threshold:\n print(f'[dedup] state params mismatch, starting fresh')\n return None, None, None, 0\n cache = state['lsh_cache']\n seen_raw = state.get('dedup_key_cache', {})\n seen = {k: None for k in seen_raw} if isinstance(seen_raw, set) else (seen_raw if isinstance(seen_raw, dict) else {})\n next_cid = state.get('next_cluster_id') or ((max(cache.keys()) + 1) if cache else 0)\n print(f'[dedup] loaded state: {len(cache)} clusters, {len(seen)} dedup_keys, next_cid={next_cid}')\n return state['lsh_index'], cache, seen, next_cid\n except Exception as e:\n print(f'[dedup] failed to load state ({e}), starting fresh')\n return None, None, None, 0\n\ndef evict_oldest(lsh_index, lsh_cache, dedup_key_cache, max_keys):\n evicted_keys = evicted_clusters = 0\n excess = len(dedup_key_cache) - max_keys\n if excess > 0:\n for k in list(dedup_key_cache.keys())[:excess]:\n del dedup_key_cache[k]\n evicted_keys = excess\n excess = len(lsh_cache) - max_keys\n if excess > 0:\n for cid in list(lsh_cache.keys())[:excess]:\n try: lsh_index.remove(cid)\n except (KeyError, ValueError): pass\n del lsh_cache[cid]\n evicted_clusters = excess\n return evicted_keys, evicted_clusters\n\ndef 
dump_state_atomic(state_path, lsh_index, lsh_cache, dedup_key_cache, threshold, next_cluster_id):\n tmp = state_path + '.tmp'\n try:\n state = {\n 'lsh_index': lsh_index, 'lsh_cache': lsh_cache,\n 'dedup_key_cache': dedup_key_cache, 'next_cluster_id': next_cluster_id,\n 'num_perm': NUM_PERM, 'threshold': threshold,\n }\n with open(tmp, 'wb') as f:\n pickle.dump(state, f); f.flush(); os.fsync(f.fileno())\n os.replace(tmp, state_path)\n print(f'[dedup] state saved: {len(lsh_cache)} clusters, {len(dedup_key_cache)} dedup_keys')\n except Exception as e:\n print(f'[dedup] failed to save state: {e}')\n if os.path.exists(tmp):\n try: os.remove(tmp)\n except Exception: pass\n\n# ── Main ──────────────────────────────────────────────────────────────────────\n\nfiltered_alerts = inputs.get('filtered_alerts', [])\ninput_mode = inputs.get('input_mode', 'unknown')\ndedup_enabled = inputs.get('dedup_enabled', True)\nthreshold = float(inputs.get('dedup_threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_len = int(inputs.get('max_field_len', 500))\nmax_dedup_keys = int(inputs.get('max_dedup_keys', 100000))\nif max_dedup_keys < 1:\n max_dedup_keys = 100000\nstats = dict(inputs.get('stats', {}))\n\n_permutations = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED).permutations\n\nstate_path, lock_path = get_state_paths(threshold)\nlock_fh = acquire_lock(lock_path) if dedup_enabled else None\n\nevicted_keys = evicted_clusters = 0\n\ntry:\n if dedup_enabled:\n lsh_index, lsh_cache, dedup_key_cache, next_cluster_id = load_state(state_path, threshold)\n if lsh_index is None:\n lsh_index = MinHashLSH(threshold=threshold, num_perm=NUM_PERM)\n lsh_cache = {}\n dedup_key_cache = {}\n next_cluster_id = 0\n else:\n lsh_index, lsh_cache, dedup_key_cache, next_cluster_id = None, {}, {}, 0\n\n _cid_box = [next_cluster_id]\n def query_most_similar(minhash):\n sim_keys = 
lsh_index.query(minhash)\n if sim_keys:\n candidates = sim_keys[:100]\n sims = [minhash.jaccard(lsh_cache[k]) for k in candidates]\n return candidates[sims.index(max(sims))]\n cluster_id = _cid_box[0]\n _cid_box[0] += 1\n lsh_index.insert(cluster_id, minhash)\n lsh_cache[cluster_id] = minhash\n return cluster_id\n\n enriched = []\n for alert in filtered_alerts:\n alert = dict(alert)\n text_strict = '. '.join(str(alert.get(f, ''))[:max_len] for f in strict_fields)\n text_lsh = normalize_uri('. '.join(str(alert.get(f, ''))[:max_len] for f in lsh_fields))\n\n if not dedup_enabled:\n dk = hashlib.md5(f'{text_strict}. {text_lsh}'.encode('utf-8')).hexdigest()\n alert['_lsh_cluster_id'] = None\n alert['dedup_key'] = dk\n alert['is_duplicate'] = dk in dedup_key_cache\n dedup_key_cache[dk] = None\n enriched.append(alert)\n continue\n\n mh = gen_minhash(text_lsh.lower(), _permutations)\n cluster_id = query_most_similar(mh)\n alert['_lsh_cluster_id'] = cluster_id\n\n dk = hashlib.md5(f'{text_strict}. 
{cluster_id}'.encode('utf-8')).hexdigest()\n alert['dedup_key'] = dk\n already = dk in dedup_key_cache\n if already:\n del dedup_key_cache[dk]\n dedup_key_cache[dk] = None\n alert['is_duplicate'] = already\n enriched.append(alert)\n\n if dedup_enabled:\n evicted_keys, evicted_clusters = evict_oldest(lsh_index, lsh_cache, dedup_key_cache, max_dedup_keys)\n if evicted_keys or evicted_clusters:\n print(f'[dedup] LRU eviction: dropped {evicted_keys} keys, {evicted_clusters} clusters')\n if len(lsh_cache) > LSH_CLUSTER_WARN_THRESHOLD or len(dedup_key_cache) > LSH_CLUSTER_WARN_THRESHOLD:\n print(f'[dedup] WARNING: persisted state holds {len(lsh_cache)} clusters '\n f'and {len(dedup_key_cache)} dedup_keys (warn={LSH_CLUSTER_WARN_THRESHOLD})')\n dump_state_atomic(state_path, lsh_index, lsh_cache, dedup_key_cache, threshold, _cid_box[0])\nfinally:\n if lock_fh is not None:\n release_lock(lock_fh)\n\n# Unique alerts: one representative per dedup_key (first seen)\nseen_keys = {}\nunique_alerts = []\nfor a in enriched:\n k = a['dedup_key']\n if k not in seen_keys:\n seen_keys[k] = a\n unique_alerts.append(a)\n\ndup_count = len(enriched) - len(unique_alerts)\nprint(f'[dedup] input={len(filtered_alerts)}, enriched={len(enriched)}, unique={len(unique_alerts)}, duplicates={dup_count}')\n\nstats['after_dedup_count'] = len(enriched)\nstats['unique_key_count'] = len(unique_alerts)\nstats['dedup_removed_count'] = dup_count\nstats['dedup_ratio'] = round(dup_count / len(enriched), 4) if enriched else 0.0\nstats['dedup_state_persisted'] = bool(dedup_enabled)\nif dedup_enabled:\n stats['lsh_total_clusters'] = len(lsh_cache)\n stats['lsh_total_dedup_keys'] = len(dedup_key_cache)\n stats['lsh_max_dedup_keys'] = max_dedup_keys\n stats['lsh_evicted_keys'] = evicted_keys\n stats['lsh_evicted_clusters'] = evicted_clusters\n\nif dedup_enabled:\n summary = (\n f'stream_alert_dedup done: raw={stats.get(\"raw_count\", 0)}'\n f' -> normalized={stats.get(\"normalized_count\", 0)}'\n f' -> 
filtered={stats.get(\"after_filter_count\", 0)}'\n f' -> enriched={len(enriched)}, unique={len(unique_alerts)} (compression {stats[\"dedup_ratio\"]:.1%})'\n f' | clusters={len(lsh_cache)}, keys={len(dedup_key_cache)}, max={max_dedup_keys}'\n )\nelse:\n summary = (\n f'stream_alert_dedup done (dedup_enabled=False): '\n f'raw={stats.get(\"raw_count\", 0)}'\n f' -> filtered={stats.get(\"after_filter_count\", 0)}'\n f' -> enriched={len(enriched)}'\n )\nprint(f'[dedup] {summary}')\n\n# ── Write enriched alerts to JSONL files (max 10,000 records per file) ────────\n# File naming: dedup_result_001.jsonl, 002.jsonl, ...\n# Each file starts with a header line: {\"_type\":\"file_header\",\"created_at\":...}\n# Subsequent lines: one enriched alert per line (no header line counted).\n# When the current file reaches MAX_RECORDS_PER_FILE, a new numbered file is created.\n\nMAX_RECORDS_PER_FILE = 10000\n_JSONL_PREFIX = 'dedup_result'\n\ndef _count_alert_lines(file_path):\n count = 0\n try:\n with open(file_path, 'r', encoding='utf-8') as _f:\n for _line in _f:\n _s = _line.strip()\n if _s and '\"_type\"' not in _s:\n count += 1\n except Exception:\n pass\n return count\n\ndef _find_active_file(out_dir):\n import glob\n pattern = os.path.join(out_dir, _JSONL_PREFIX + '_*.jsonl')\n existing = sorted(glob.glob(pattern))\n if not existing:\n return None, 0, 0\n latest = existing[-1]\n basename = os.path.basename(latest)\n try:\n seq = int(basename.replace(_JSONL_PREFIX + '_', '').replace('.jsonl', ''))\n except ValueError:\n seq = len(existing)\n count = _count_alert_lines(latest)\n return latest, count, seq\n\ndef _write_jsonl(out_dir, alerts, now):\n written = []\n active_path, active_count, seq = _find_active_file(out_dir)\n remaining = list(alerts)\n\n while remaining:\n available = MAX_RECORDS_PER_FILE - active_count\n if available <= 0 or active_path is None:\n seq += 1\n active_path = os.path.join(out_dir, f'{_JSONL_PREFIX}_{seq:03d}.jsonl')\n active_count = 0\n available = 
MAX_RECORDS_PER_FILE\n header = {\n '_type': 'file_header',\n 'created_at': now.isoformat(),\n 'date': now.strftime('%Y-%m-%d'),\n 'workflow': WORKFLOW_NAME,\n 'seq': seq,\n }\n with open(active_path, 'w', encoding='utf-8') as _hf:\n _hf.write(json.dumps(header, ensure_ascii=False) + '\\n')\n\n batch = remaining[:available]\n remaining = remaining[available:]\n\n with open(active_path, 'a', encoding='utf-8') as _af:\n for _alert in batch:\n _af.write(json.dumps(_alert, ensure_ascii=False) + '\\n')\n\n active_count += len(batch)\n if active_path not in written:\n written.append(active_path)\n\n if remaining:\n active_path = None\n active_count = 0\n\n return written\n\n_now = datetime.datetime.now()\ntry:\n _out_dir = get_output_dir()\n _written_paths = _write_jsonl(_out_dir, enriched, _now)\n _out_path = _written_paths[-1] if _written_paths else ''\n print(f'[dedup] wrote {len(enriched)} records -> {_written_paths}')\n stats['output_path'] = _out_path\n stats['output_paths'] = _written_paths\n outputs['output_path'] = _out_path\n outputs['output_paths'] = _written_paths\nexcept Exception as _we:\n import traceback\n print(f'[dedup] WARNING: failed to write JSONL: {_we}\\n{traceback.format_exc()}')\n _out_path = ''\n outputs['output_path'] = ''\n outputs['output_paths'] = []\n\n# ── Outputs ───────────────────────────────────────────────────────────────────\noutputs['enriched_alerts'] = enriched\noutputs['unique_alerts'] = unique_alerts\noutputs['stats'] = stats\noutputs['dedup_summary'] = summary\noutputs['input_mode'] = input_mode\n\n# Convenience fields for single-alert / syslog callers\nif enriched:\n outputs['dedup_key'] = enriched[0].get('dedup_key', '')\n outputs['is_duplicate'] = enriched[0].get('is_duplicate', False)\nelse:\n outputs['dedup_key'] = ''\n outputs['is_duplicate'] = False\n" + } + ], + "edges": [ + { + "from": "receive_alert", + "to": "normalize", + "order": 0 + }, + { + "from": "normalize", + "to": "filter_logs", + "order": 0 + }, + { + 
"from": "filter_logs", + "to": "dedup_and_write", + "order": 0 + } + ], + "metadata": { + "node_timeout_s": 300, + "sampleInputs": { + "source_log_type": "tdp", + "filter_enabled": true, + "dedup_enabled": true, + "threshold": 0.7, + "max_dedup_keys": 100000, + "_comment_syslog": "Syslog mode: POST /api/workflow/{id}/syslog-config {enabled:true, protocol:'udp', port:5140, inputKey:'syslog_message'}. The TDP/Skyeye alert JSON must be in syslog message body.", + "syslog_message": { + "raw": "<134>May 12 10:00:00 tdp-sensor tdp: {\"id\":\"AZtRkZkzj\",\"net\":{}}", + "facility": 16, + "severity": 6, + "timestamp": "2026-05-12T10:00:00", + "hostname": "tdp-sensor", + "app_name": "tdp", + "message": "{\"id\":\"AZtRkZkzj\",\"net\":{\"http\":{\"url\":\"/admin\"}},\"threat\":{\"name\":\"SQL注入\"}}", + "format": "rfc3164" + }, + "_comment_batch": "Or pass 'alerts' (list) or 'alert_file' (path to JSON file)", + "alerts": [ + { + "net_real_src_ip": "1.2.3.4", + "net_dest_ip": "10.0.0.1", + "direction": "in", + "net_type": "http", + "net_http_url": "/admin/login.php?id=1 OR 1=1", + "net_http_reqs_body": "username=admin&password=123456", + "net_http_resp_body": "root@localhost", + "threat_name": "SQL注入攻击", + "threat_type": "web攻击" + } + ] + } + } +} \ No newline at end of file diff --git a/.flocks/plugins/workflows/stream_alert_dedup/workflow.md b/.flocks/plugins/workflows/stream_alert_dedup/workflow.md new file mode 100644 index 000000000..e5acdee43 --- /dev/null +++ b/.flocks/plugins/workflows/stream_alert_dedup/workflow.md @@ -0,0 +1,138 @@ +# stream_alert_dedup + +流式 HTTP 告警去重 Pipeline,三阶段处理:**归一化 → 过滤 → 去重**。 + +与 `http_alert_dedup` 的核心区别: +1. **流式单条输入**:支持 syslog 实时单条(`syslog_message`),也兼容批次列表与文件 +2. **输出为原始数据增强**:每条输出告警 = 归一化字段 + 去重字段(`dedup_key`、`is_duplicate`、`_lsh_cluster_id` 等) +3. 
**结果落盘**:每次执行自动将结果写入 `~/.flocks/workspace/workflows/stream_alert_dedup/<YYYY-MM-DD>/` + +## 工作流图 + +``` +receive_alert + │ + normalize + │ + filter_logs + │ +dedup_and_write ◀── 终点,输出增强告警 + 写日期目录 JSONL +``` + +## 输入参数 + +| 参数 | 类型 | 默认值 | 说明 | +|------|------|--------|------| +| `syslog_message` | `dict` | — | Syslog 消息体(优先级最高,单条流式) | +| `alerts` | `list[dict]` | — | 原始告警列表(批次模式) | +| `alert_file` | `str` | — | JSON 文件路径(文件模式) | +| `source_log_type` | `str` | 自动识别 | 来源类型 `"tdp"` 或 `"skyeye"`,不填则自动检测 | +| `filter_enabled` | `bool` | `true` | 是否启用过滤阶段 | +| `dedup_enabled` | `bool` | `true` | 是否启用跨批次去重(false 时仅批内去重) | +| `threshold` | `float` | `0.7` | Jaccard 相似度阈值(0–1) | +| `strict_fields` | `list[str]` | `["sip","dip"]` | 严格匹配字段 | +| `lsh_fields` | `list[str]` | `["req_http_url","req_body","rsp_body"]` | 模糊匹配字段(URI 归一化 + MinHash) | +| `max_field_len` | `int` | `500` | 单字段截断长度 | +| `max_dedup_keys` | `int` | `100000` | FIFO LRU 上限(持久化 dedup_key 最大数量) | + +### syslog_message 格式 + +Flocks syslog 监听器解析 RFC3164 / RFC5424 后注入的结构体,TDP/Skyeye 原始 JSON 须在 `message` 字段内: + +```json +{ + "hostname": "tdp-sensor", + "app_name": "tdp", + "timestamp": "2026-05-12T10:00:00", + "severity": 6, + "facility": 16, + "format": "rfc3164", + "message": "{\"id\":\"AZtRkZkzj\",\"net\":{\"http\":{\"url\":\"/admin\"}},\"threat\":{\"name\":\"SQL注入\"}}" +} +``` + +> 开启 syslog 接收:`POST /api/workflow/{id}/syslog-config {"enabled":true,"protocol":"udp","port":5140,"inputKey":"syslog_message"}` + +## 输出参数 + +| 字段 | 类型 | 说明 | +|------|------|------| +| `enriched_alerts` | `list[dict]` | 过滤后全量告警,每条含完整归一化字段 + 去重字段 | +| `unique_alerts` | `list[dict]` | 每个 dedup_key 的代表性告警(首次出现) | +| `dedup_key` | `str` | 第一条告警的 dedup_key(syslog 单条场景直接使用) | +| `is_duplicate` | `bool` | 第一条告警是否为跨批次重复(syslog 单条场景直接使用) | +| `output_path` | `str` | 当次写入的最后一个 JSONL 文件路径 | +| `output_paths` | `list[str]` | 本次写入涉及的所有文件路径(批量超阈值时跨多个文件) | +| `stats` | `dict` | 各阶段统计(见下表) | +| `dedup_summary` | `str` | 一行文字摘要 | +| `input_mode` | `str` | 
输入模式:`syslog` / `alerts` / `alert_file` | + +### 每条 enriched_alert 的增强字段 + +| 字段 | 说明 | +|------|------| +| `dedup_key` | MD5 去重键(`strict_fields + cluster_id` 的哈希) | +| `is_duplicate` | 该 dedup_key 此前是否已出现过(含本批次内更早的告警;启用持久化时也含历史批次) | +| `_lsh_cluster_id` | MinHash LSH 簇 ID | +| `_source_type` | 识别出的来源类型(`tdp` / `skyeye`) | +| `_process_type` | 过滤分类(如 `alert_not_scan_http_direction_in`) | +| `_threat_type` | 威胁类型字符串 | +| `_syslog_meta` | syslog 元数据(仅 syslog 模式下存在) | + +### stats 字段 + +| 字段 | 说明 | +|------|------| +| `raw_count` | 原始输入告警数 | +| `normalized_count` | 归一化后告警数 | +| `after_filter_count` | 过滤后保留数 | +| `filter_removed_count` | 过滤剔除数 | +| `after_dedup_count` | 去重处理总数(= after_filter_count) | +| `unique_key_count` | 唯一 dedup_key 数 | +| `dedup_removed_count` | 批内重复数 | +| `dedup_ratio` | 批内压缩率 | +| `output_path` | 结果文件路径 | + +## 结果文件格式 + +写入路径:`~/.flocks/workspace/workflows/stream_alert_dedup/<YYYY-MM-DD>/dedup_result_NNN.jsonl` + +- **JSONL 格式**:每行一个 JSON 对象 +- **首行**:`file_header`(含时间戳,不计入告警条数) +- **后续行**:每行一条 enriched_alert +- **分卷规则**:每文件最多 **10,000 条**告警(不含 header 行),超出时自动创建 `dedup_result_002.jsonl`、`003.jsonl`… + +```jsonl +{"_type": "file_header", "created_at": "2026-05-12T10:00:00.123456", "date": "2026-05-12", "workflow": "stream_alert_dedup", "seq": 1} +{"sip": "1.2.3.4", "dip": "10.0.0.1", "req_http_url": "/admin/login.php?id=1 OR 1=1", "threat_name": "SQL注入攻击", "_source_type": "tdp", "_process_type": "alert_not_scan_http_direction_in", "dedup_key": "a3f9...", "is_duplicate": false, "_lsh_cluster_id": 42} +{"sip": "5.6.7.8", "dip": "10.0.0.2", ...} +``` + +`output_path` 输出字段为当次写入的**最后一个**文件路径;`output_paths` 为本次写入涉及的所有文件路径列表(批量超过分卷阈值时可能跨多个文件)。 + +## 节点说明 + +### receive_alert +解析三种输入格式(syslog > alerts > alert_file)。从以下来源按优先级解析 `source_log_type`: +1. 显式 `source_log_type` 参数 +2. Syslog `app_name` / `hostname` 中含 `tdp` 或 `skyeye` +3. 告警 JSON 字段签名(TDP: 嵌套 net 字典 / behave_uuid;Skyeye: uri / vuln_name) +4. 
默认 `tdp` + +### normalize +字段映射统一为标准 schema(`sip`/`dip`/`req_http_url`/`req_body`/`rsp_body`/`threat_name` 等),自动检测每条告警类型,支持混合批次。保留 `_syslog_meta`。 + +### filter_logs +基于 `process_type` 9 分类过滤,保留非扫描 HTTP 告警(`in`/`out`/`lateral` 方向)。`filter_enabled=False` 时全量透传。 + +### dedup_and_write(终点) + +**去重算法**(与 http_alert_dedup 相同): +1. `strict_fields` 拼接作为精确前缀 +2. `lsh_fields` URI 归一化后做 **5-gram shingling** +3. MinHash LSH(128 permutations)近似 Jaccard 相似度聚类,阈值 ≥ `threshold` +4. `dedup_key = MD5(strict_prefix + cluster_id)`;`is_duplicate=True` 表示历史已见 + +**持久化**:LSH 状态存于 `~/.flocks/workspace/workflows/stream_alert_dedup/lsh_state_np128_th*.pkl`,原子写 + 文件锁,FIFO LRU 上限 `max_dedup_keys`,可跨批次/跨进程复用。 + +> **注意**:`stream_alert_dedup` 维护独立的 LSH 状态,与 `http_alert_dedup` 不共享去重历史。如需共享历史,可修改 `WORKFLOW_NAME = 'http_alert_dedup'`(同时共享 dedup_key 空间)。 diff --git a/flocks/ingest/syslog/parser.py b/flocks/ingest/syslog/parser.py index 7a8377251..270891350 100644 --- a/flocks/ingest/syslog/parser.py +++ b/flocks/ingest/syslog/parser.py @@ -12,6 +12,14 @@ r"^([A-Za-z]{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2})\s+(\S+)\s*(.*)$", re.DOTALL, ) +# Non-standard but common: ISO_TS HOSTNAME APP[PID]: msg (no RFC5424 version) +_ISO3164_REST_RE = re.compile( + r"^(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:[+-]\d{2}:\d{2}|Z)?)" # ISO timestamp + r"\s+(\S+)" # hostname + r"\s+(\S+?)\s*:\s*" # app_name/tag: + r"([\s\S]*)$", # message + re.DOTALL, +) def _pri_parts(pri: int) -> tuple[int, int]: @@ -84,11 +92,16 @@ def parse_syslog(raw: str, format_hint: str = "auto") -> Dict[str, Any]: if format_hint == "rfc5424": return _parse_rfc5424(rest, raw=text, facility=facility, severity=severity) - # auto: RFC5424 if second token is a digit version + # auto: RFC5424 if second token is a single digit version number if rest and rest[0].isdigit(): first_space = rest.find(" ") if first_space > 0 and rest[:first_space].isdigit(): return _parse_rfc5424(rest, raw=text, facility=facility, severity=severity) + # Non-standard: ISO_TS 
HOSTNAME APP[PID]: msg (no version number) + if first_space > 0 and "T" in rest[:first_space]: + m_iso = _ISO3164_REST_RE.match(rest) + if m_iso: + return _parse_iso3164(m_iso, raw=text, facility=facility, severity=severity) return _parse_rfc3164(rest, raw=text, facility=facility, severity=severity) @@ -153,6 +166,26 @@ def _parse_rfc5424( } +def _parse_iso3164( + m: "re.Match[str]", + *, + raw: str, + facility: int, + severity: int, +) -> Dict[str, Any]: + """Handle non-standard ISO_TS HOSTNAME APP[PID]: msg (no RFC5424 version).""" + return { + "raw": raw, + "facility": facility, + "severity": severity, + "timestamp": _normalize_ts(m.group(1)), + "hostname": m.group(2), + "app_name": m.group(3), + "message": m.group(4).strip(), + "format": "iso3164", + } + + def _parse_rfc3164( rest: str, *, From 97c72fc2843b3c27a55c3592d28015b9b4c063d7 Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Tue, 12 May 2026 16:03:34 +0800 Subject: [PATCH 30/41] feat(workflows): add tdp_alert_pull_dedup workflow for TDP API polling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a long-running workflow that actively polls TDP attack alerts via the tdp_log_search tool (TDP v3.3.10) instead of relying on syslog ingestion, reusing the stream_alert_dedup pipeline (normalize → filter → MinHash LSH dedup) at each iteration. Key behavior: - Single python node runs `while True` with configurable pull_interval_s (default 60s), max_iterations and max_runtime_s for stop control; node_timeout_s is raised to 30 days so the node can run indefinitely. - Time cursor at ~/.flocks/workflows/tdp_alert_pull_dedup/cursor.json is persisted atomically each round; restarts resume from next_from with no gaps and no overlap. TDP failures do not advance the cursor so the same window is retried on the next round. 
- Enriched alerts are appended to JSONL files under ~/.flocks/workflows/tdp_alert_pull_dedup//alerts_NNN.jsonl, with a file_header line and a 10,000-records-per-file rollover. - LSH state is persisted independently from stream_alert_dedup at ~/.flocks/workflows/tdp_alert_pull_dedup/lsh_state_np128_th*.pkl with atomic write + file lock and FIFO LRU eviction (max_dedup_keys). - Robust TDP response unwrapping accepts list / {"log":[...]} / {"list":[...]} / {"data":[...]} / nested {data: {list: [...]}} shapes. Layout follows existing workflows (workflow.json + workflow.md); the companion _node_pull_dedup_loop.py source file plus _build_workflow.py script keep the embedded node code readable and regenerable. Both underscore-prefixed files are ignored by the workflow scanner which only picks up workflow.json. Verified with Workflow.from_dict + workflow_lint (0 issues) and compile_workflow_file inside the flocks venv. Co-authored-by: Cursor --- .../tdp_alert_pull_dedup/_build_workflow.py | 93 +++ .../_node_pull_dedup_loop.py | 670 ++++++++++++++++++ .../tdp_alert_pull_dedup/workflow.json | 48 ++ .../tdp_alert_pull_dedup/workflow.md | 219 ++++++ 4 files changed, 1030 insertions(+) create mode 100644 .flocks/plugins/workflows/tdp_alert_pull_dedup/_build_workflow.py create mode 100644 .flocks/plugins/workflows/tdp_alert_pull_dedup/_node_pull_dedup_loop.py create mode 100644 .flocks/plugins/workflows/tdp_alert_pull_dedup/workflow.json create mode 100644 .flocks/plugins/workflows/tdp_alert_pull_dedup/workflow.md diff --git a/.flocks/plugins/workflows/tdp_alert_pull_dedup/_build_workflow.py b/.flocks/plugins/workflows/tdp_alert_pull_dedup/_build_workflow.py new file mode 100644 index 000000000..5f58c3b33 --- /dev/null +++ b/.flocks/plugins/workflows/tdp_alert_pull_dedup/_build_workflow.py @@ -0,0 +1,93 @@ +"""Build workflow.json for tdp_alert_pull_dedup. 
+ +Run: python _build_workflow.py + +Reads the pull_dedup_loop node code from _node_pull_dedup_loop.py and +serializes a fully-valid workflow.json next to it. +""" + +from __future__ import annotations + +import json +import os + +HERE = os.path.dirname(os.path.abspath(__file__)) + + +def read_code(name: str) -> str: + with open(os.path.join(HERE, name), "r", encoding="utf-8") as f: + return f.read() + + +workflow = { + "name": "tdp_alert_pull_dedup", + "description": ( + "Long-running TDP alert puller + deduper. Each iteration calls the " + "tdp_log_search tool to fetch attack-level HTTP alerts in a moving time " + "window, normalizes them, filters non-HTTP/scan noise, deduplicates via " + "URI-normalized 5-gram MinHash LSH (persistent across iterations / runs), " + "and appends enriched alerts to JSONL files under " + "~/.flocks/workflows/tdp_alert_pull_dedup//alerts_NNN.jsonl. " + "A persistent time cursor at ~/.flocks/workflows/tdp_alert_pull_dedup/cursor.json " + "guarantees no gaps and no overlap across restarts." 
+ ), + "description_cn": ( + "长时间运行的 TDP 告警拉取 + 去重 Pipeline。单个 python 节点内 while 循环:每轮调用 " + "tdp_log_search 拉取一个时间窗口内的攻击级 HTTP 告警 → 归一化 → 过滤(去扫描/非HTTP)→ " + "URI 归一化 + 5-gram MinHash LSH 去重(持久化 LSH 状态,跨轮次/跨进程共享)→ 追加写入 " + "~/.flocks/workflows/tdp_alert_pull_dedup//alerts_NNN.jsonl,每文件 10,000 条上限。" + "时间游标 ~/.flocks/workflows/tdp_alert_pull_dedup/cursor.json 持久化,重启可无重叠续拉。" + "通过 pull_interval_s / max_iterations / max_runtime_s 控制循环节奏与停止条件。" + ), + "start": "pull_dedup_loop", + "nodes": [ + { + "id": "pull_dedup_loop", + "type": "python", + "description": ( + "持续拉取 TDP 告警的主循环节点。内部 while 循环执行:调用 tdp_log_search → 归一化 → " + "过滤 → LSH 去重 → 写盘。time_from 来自持久化游标(首次回退 initial_lookback_s 秒)," + "time_to=当前时间,保证窗口连续无重叠。" + "停止条件:max_iterations / max_runtime_s 任一达到即返回;外部取消(如 SIGINT 或节点超时)也会优雅退出。" + ), + "code": read_code("_node_pull_dedup_loop.py"), + } + ], + "edges": [], + "metadata": { + "node_timeout_s": 2592000, + "sampleInputs": { + "pull_interval_s": 60, + "initial_lookback_s": 300, + "max_iterations": 0, + "max_runtime_s": 0, + "batch_size": 1000, + "net_data_types": ["attack"], + "sql": "threat.level = 'attack'", + "assets_group": [], + "filter_enabled": True, + "dedup_enabled": True, + "threshold": 0.7, + "strict_fields": ["sip", "dip"], + "lsh_fields": ["req_http_url", "req_body", "rsp_body"], + "max_field_len": 500, + "max_dedup_keys": 100000, + "reset_cursor": False, + "log_progress_every": 1, + "_comment_runtime": ( + "node_timeout_s 默认 30 天(2,592,000s),适合长时间持续运行;" + "若想短跑测试,把 max_iterations 调小或设 max_runtime_s 即可。" + ), + "_comment_path": ( + "输出落盘根目录:~/.flocks/workflows/tdp_alert_pull_dedup//alerts_NNN.jsonl;" + "时间游标:~/.flocks/workflows/tdp_alert_pull_dedup/cursor.json;" + "LSH 持久化:~/.flocks/workflows/tdp_alert_pull_dedup/lsh_state_np128_th{int(threshold*100)}.pkl" + ), + }, + }, +} + +with open(os.path.join(HERE, "workflow.json"), "w", encoding="utf-8") as f: + json.dump(workflow, f, ensure_ascii=False, indent=2) + +print(f"wrote {os.path.join(HERE, 
'workflow.json')}") diff --git a/.flocks/plugins/workflows/tdp_alert_pull_dedup/_node_pull_dedup_loop.py b/.flocks/plugins/workflows/tdp_alert_pull_dedup/_node_pull_dedup_loop.py new file mode 100644 index 000000000..01e8da7e5 --- /dev/null +++ b/.flocks/plugins/workflows/tdp_alert_pull_dedup/_node_pull_dedup_loop.py @@ -0,0 +1,670 @@ + +import os +import re +import sys +import json +import time +import pickle +import hashlib +import datetime +import traceback +from pathlib import Path + +from datasketch import MinHash, MinHashLSH + +IS_WINDOWS = sys.platform == 'win32' +if IS_WINDOWS: + import msvcrt # noqa: F401 +else: + import fcntl # noqa: F401 + +WORKFLOW_NAME = 'tdp_alert_pull_dedup' +MINHASH_SEED = 2024 +NUM_PERM = 128 +LSH_CLUSTER_WARN_THRESHOLD = 100000 +MAX_RECORDS_PER_FILE = 10000 +_JSONL_PREFIX = 'alerts' + +HTTP_METHODS = ['GET', 'POST', 'PUT', 'DELETE', 'HEAD', 'OPTIONS', 'PATCH', 'TRACE'] + +TDP_FIELD_MAP = { + 'customer_uuid': 'customer_uuid', + 'device_id': 'device_id', + 'id': 'id', + 'time': 'time', + 'direction': 'direction', + 'sip': 'net_real_src_ip', + 'dip': 'net_dest_ip', + 'sport': 'net_src_port', + 'dport': 'net_dest_port', + 'net_type': 'net_type', + 'net_app_proto': 'net_app_proto', + 'req_http_url': 'net_http_url', + 'req_user_agent':'net_http_reqs_user_agent', + 'req_host': 'net_http_reqs_host', + 'req_line': 'net_http_reqs_line', + 'req_header': 'net_http_reqs_header', + 'req_body': 'net_http_reqs_body', + 'req_cookie': 'net_http_reqs_cookie', + 'req_body_len': 'net_http_reqs_content_length', + 'rsp_status_code': 'net_http_status', + 'rsp_line': 'net_http_resp_line', + 'rsp_header': 'net_http_resp_header', + 'rsp_body': 'net_http_resp_body', + 'rsp_body_len': 'net_http_resp_content_length', + 'net_bytes_toclient': 'net_bytes_toclient', + 'net_bytes_toserver': 'net_bytes_toserver', + 'threat_rule_id': 'threat_suuid', + 'threat_name': 'threat_name', + 'threat_msg': 'threat_msg', + 'threat_ioc': 'threat_ioc', + 'threat_level': 
'threat_level', + 'threat_severity': 'threat_severity', + 'threat_phase': 'threat_phase', + 'threat_type': 'threat_type', + 'threat_result': 'threat_result', + 'threat_confidence': 'threat_confidence', + 'connection_established': 'established', + 'asset_group_name': 'dest_assets_group_name', + 'asset_name': 'dest_assets_latestName', +} + +NEED_ANALYSIS = { + 'alert_not_scan_http_direction_in', + 'alert_not_scan_http_direction_out', + 'alert_not_scan_http_direction_lateral', +} + +# ── Paths ───────────────────────────────────────────────────────────────────── + +def get_workflow_root(): + from flocks.config import Config + flocks_root = Config().get_global().data_dir.parent # ~/.flocks + root = Path(flocks_root) / 'workflows' / WORKFLOW_NAME + root.mkdir(parents=True, exist_ok=True) + return root + + +def get_state_paths(threshold): + base = str(get_workflow_root() / f'lsh_state_np{NUM_PERM}_th{int(threshold * 100)}') + return base + '.pkl', base + '.lock' + + +def get_cursor_path(): + return str(get_workflow_root() / 'cursor.json') + + +def get_output_dir(now): + date_str = now.strftime('%Y-%m-%d') + out_dir = get_workflow_root() / date_str + out_dir.mkdir(parents=True, exist_ok=True) + return str(out_dir) + + +# ── File locking ────────────────────────────────────────────────────────────── + +def acquire_lock(lock_path): + fh = open(lock_path, 'w+') + if IS_WINDOWS: + fh.write('L'); fh.flush(); fh.seek(0) + while True: + try: + msvcrt.locking(fh.fileno(), msvcrt.LK_LOCK, 1); break + except OSError: + continue + else: + fcntl.flock(fh.fileno(), fcntl.LOCK_EX) + return fh + + +def release_lock(fh): + try: + if IS_WINDOWS: + try: + fh.seek(0); msvcrt.locking(fh.fileno(), msvcrt.LK_UNLCK, 1) + except OSError: + pass + else: + fcntl.flock(fh.fileno(), fcntl.LOCK_UN) + finally: + fh.close() + + +# ── LSH state ───────────────────────────────────────────────────────────────── + +def load_state(state_path, threshold): + if not state_path or not os.path.exists(state_path) 
or os.path.getsize(state_path) == 0: + return None, None, None, 0 + try: + with open(state_path, 'rb') as f: + state = pickle.load(f) + if state.get('num_perm') != NUM_PERM or state.get('threshold') != threshold: + print(f'[dedup] state params mismatch, starting fresh') + return None, None, None, 0 + cache = state['lsh_cache'] + seen_raw = state.get('dedup_key_cache', {}) + seen = {k: None for k in seen_raw} if isinstance(seen_raw, set) else (seen_raw if isinstance(seen_raw, dict) else {}) + next_cid = state.get('next_cluster_id') or ((max(cache.keys()) + 1) if cache else 0) + return state['lsh_index'], cache, seen, next_cid + except Exception as e: + print(f'[dedup] failed to load state ({e}), starting fresh') + return None, None, None, 0 + + +def evict_oldest(lsh_index, lsh_cache, dedup_key_cache, max_keys): + evicted_keys = evicted_clusters = 0 + excess = len(dedup_key_cache) - max_keys + if excess > 0: + for k in list(dedup_key_cache.keys())[:excess]: + del dedup_key_cache[k] + evicted_keys = excess + excess = len(lsh_cache) - max_keys + if excess > 0: + for cid in list(lsh_cache.keys())[:excess]: + try: lsh_index.remove(cid) + except (KeyError, ValueError): pass + del lsh_cache[cid] + evicted_clusters = excess + return evicted_keys, evicted_clusters + + +def dump_state_atomic(state_path, lsh_index, lsh_cache, dedup_key_cache, threshold, next_cluster_id): + tmp = state_path + '.tmp' + try: + state = { + 'lsh_index': lsh_index, 'lsh_cache': lsh_cache, + 'dedup_key_cache': dedup_key_cache, 'next_cluster_id': next_cluster_id, + 'num_perm': NUM_PERM, 'threshold': threshold, + } + with open(tmp, 'wb') as f: + pickle.dump(state, f); f.flush(); os.fsync(f.fileno()) + os.replace(tmp, state_path) + except Exception as e: + print(f'[dedup] failed to save state: {e}') + if os.path.exists(tmp): + try: os.remove(tmp) + except Exception: pass + + +# ── Cursor ──────────────────────────────────────────────────────────────────── + +def load_cursor(): + p = get_cursor_path() + 
if not os.path.exists(p): + return None + try: + with open(p, 'r', encoding='utf-8') as f: + return json.load(f) + except Exception: + return None + + +def save_cursor(cursor): + p = get_cursor_path() + tmp = p + '.tmp' + try: + with open(tmp, 'w', encoding='utf-8') as f: + json.dump(cursor, f, ensure_ascii=False) + f.flush(); os.fsync(f.fileno()) + os.replace(tmp, p) + except Exception as e: + print(f'[cursor] save failed: {e}') + + +# ── Normalize ───────────────────────────────────────────────────────────────── + +def flatten_dict(d, prefix=''): + res = {} + for k, v in d.items(): + if isinstance(v, dict): + res.update(flatten_dict(v, f'{prefix}{k}_')) + else: + res[f'{prefix}{k}'] = v + return res + + +def normalize_single(alert): + import uuid as _uuid + if not isinstance(alert, dict): + return None + flat = flatten_dict(alert) + norm = {} + for std_key, raw_key in TDP_FIELD_MAP.items(): + norm[std_key] = flat.get(raw_key, 'none') if raw_key != 'none' else 'none' + if norm.get('id') in ('none', None, ''): + norm['id'] = str(_uuid.uuid3(_uuid.NAMESPACE_DNS, ''.join(str(v) for v in norm.values()))) + if norm.get('net_type') in ('none', None, ''): + method = flat.get('method', 'none') + norm['net_type'] = 'http' if method in HTTP_METHODS else ('none' if method == 'none' else 'other') + norm['_source_type'] = 'tdp' + return norm + + +# ── Filter ──────────────────────────────────────────────────────────────────── + +def is_scan_alert(threat_name): + tnl = str(threat_name or '').lower() + return ('扫描' in tnl) and ('webshell' not in tnl) + + +def is_http(alert): + for field in ('application_layer_protocol', 'net_type', 'net_app_proto'): + val = str(alert.get(field, '') or '').lower() + if val and val != 'none' and 'http' in val: + return True + return False + + +def get_process_type(alert): + threat_name = alert.get('threat_name', '') + direction = str(alert.get('direction', '') or '').lower() + scan = is_scan_alert(threat_name) + http = is_http(alert) + if scan: + 
return f'alert_scan_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_scan_direction_in' + if http: + return f'alert_not_scan_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_scan_http_direction_in' + return f'alert_not_scan_not_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_process' + + +# ── Dedup helpers ───────────────────────────────────────────────────────────── + +def normalize_uri(uri): + uri = str(uri or '') + uri = re.sub(r'\d{4}-\d{2}-\d{2}', 'DATETIME', uri) + uri = re.sub(r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}', 'UUID', uri, flags=re.IGNORECASE) + uri = re.sub(r'(\.\./)+', 'TRAVERSAL', uri) + uri = re.sub(r'\bNULL\b', 'NULL_REPLACED', uri) + uri = re.sub(r'chr\$\d+\$\|\|chr\$\d+\$', 'CHR_SEQUENCE', uri) + uri = re.sub(r'\b\d+={1,2}\d+\b', 'NUMBER_COMPARISON', uri) + uri = re.sub(r'\b[a-fA-F0-9]{32}\b', 'HEXADECIMAL CHARACTERS', uri) + return uri + + +def gen_minhash(text, permutations): + shingles = [text[i:i+5] for i in range(len(text) - 4)] + m = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED, permutations=permutations) + for s in shingles: + m.update(s.encode('utf-8')) + return m + + +# ── TDP response unwrapping ─────────────────────────────────────────────────── + +def extract_alerts_from_response(resp): + """tdp_log_search returns the inner 'data' field from TDP API (already unwrapped). 
+ + Tolerate several shapes: + - list → used as-is + - dict with 'log' / 'logs' / 'list' / 'data' / 'records' / 'items' key + (possibly nested one level) + """ + if resp is None: + return [] + if isinstance(resp, list): + return list(resp) + if isinstance(resp, dict): + for key in ('log', 'logs', 'list', 'data', 'records', 'items', 'hits'): + v = resp.get(key) + if isinstance(v, list): + return list(v) + if isinstance(v, dict): + for sub in ('list', 'data', 'records', 'items', 'hits'): + sv = v.get(sub) + if isinstance(sv, list): + return list(sv) + return [] + + +# ── JSONL writer ────────────────────────────────────────────────────────────── + +def _count_alert_lines(file_path): + count = 0 + try: + with open(file_path, 'r', encoding='utf-8') as _f: + for _line in _f: + _s = _line.strip() + if _s and '"_type"' not in _s: + count += 1 + except Exception: + pass + return count + + +def _find_active_file(out_dir): + import glob + pattern = os.path.join(out_dir, _JSONL_PREFIX + '_*.jsonl') + existing = sorted(glob.glob(pattern)) + if not existing: + return None, 0, 0 + latest = existing[-1] + basename = os.path.basename(latest) + try: + seq = int(basename.replace(_JSONL_PREFIX + '_', '').replace('.jsonl', '')) + except ValueError: + seq = len(existing) + count = _count_alert_lines(latest) + return latest, count, seq + + +def _write_jsonl(out_dir, alerts, now): + written = [] + active_path, active_count, seq = _find_active_file(out_dir) + remaining = list(alerts) + while remaining: + available = MAX_RECORDS_PER_FILE - active_count + if available <= 0 or active_path is None: + seq += 1 + active_path = os.path.join(out_dir, f'{_JSONL_PREFIX}_{seq:03d}.jsonl') + active_count = 0 + available = MAX_RECORDS_PER_FILE + header = { + '_type': 'file_header', + 'created_at': now.isoformat(), + 'date': now.strftime('%Y-%m-%d'), + 'workflow': WORKFLOW_NAME, + 'seq': seq, + } + with open(active_path, 'w', encoding='utf-8') as _hf: + _hf.write(json.dumps(header, ensure_ascii=False) 
+ '\n') + batch = remaining[:available] + remaining = remaining[available:] + with open(active_path, 'a', encoding='utf-8') as _af: + for _alert in batch: + _af.write(json.dumps(_alert, ensure_ascii=False, default=str) + '\n') + active_count += len(batch) + if active_path not in written: + written.append(active_path) + if remaining: + active_path = None + active_count = 0 + return written + + +# ── Inputs ──────────────────────────────────────────────────────────────────── + +pull_interval_s = float(inputs.get('pull_interval_s', 60)) +initial_lookback_s = int(inputs.get('initial_lookback_s', 300)) +max_iterations = int(inputs.get('max_iterations', 0)) # 0 = infinite +max_runtime_s = float(inputs.get('max_runtime_s', 0)) # 0 = no time limit +batch_size = int(inputs.get('batch_size', 1000)) +net_data_types = inputs.get('net_data_types', ['attack']) +if isinstance(net_data_types, str): + net_data_types = [s.strip() for s in net_data_types.split(',') if s.strip()] +sql_filter = str(inputs.get('sql', "threat.level = 'attack'") or "threat.level = 'attack'") +assets_group = inputs.get('assets_group') or [] +filter_enabled = bool(inputs.get('filter_enabled', True)) +dedup_enabled = bool(inputs.get('dedup_enabled', True)) +threshold = float(inputs.get('threshold', 0.7)) +strict_fields = inputs.get('strict_fields', ['sip', 'dip']) +lsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body']) +max_field_len = int(inputs.get('max_field_len', 500)) +max_dedup_keys = int(inputs.get('max_dedup_keys', 100000)) +reset_cursor = bool(inputs.get('reset_cursor', False)) +log_progress_every = max(1, int(inputs.get('log_progress_every', 1))) + +if max_dedup_keys < 1: + max_dedup_keys = 100000 +if pull_interval_s < 0.1: + pull_interval_s = 0.1 +if batch_size < 1: + batch_size = 1 +if batch_size > 10000: + batch_size = 10000 + +print(f'[init] workflow={WORKFLOW_NAME}') +print(f'[init] pull_interval_s={pull_interval_s}, initial_lookback_s={initial_lookback_s}, ' + 
f'batch_size={batch_size}, max_iterations={max_iterations}, max_runtime_s={max_runtime_s}') +print(f'[init] sql={sql_filter!r}, net_data_types={net_data_types}, assets_group={list(assets_group) if assets_group else []}') +print(f'[init] filter_enabled={filter_enabled}, dedup_enabled={dedup_enabled}, ' + f'threshold={threshold}, max_dedup_keys={max_dedup_keys}') +print(f'[init] output_root={get_workflow_root()}') + +# ── Cursor init ─────────────────────────────────────────────────────────────── + +now_ts = int(time.time()) +if reset_cursor: + cur = None + print('[cursor] reset_cursor=True, starting from initial_lookback_s') +else: + cur = load_cursor() + +if cur and isinstance(cur.get('next_from'), int): + last_to = int(cur['next_from']) + print(f'[cursor] resumed: next_from={last_to} ({datetime.datetime.fromtimestamp(last_to)})') +else: + last_to = now_ts - initial_lookback_s + print(f'[cursor] fresh start: next_from={last_to} ({datetime.datetime.fromtimestamp(last_to)})') + +# ── MinHash permutations (init once) ────────────────────────────────────────── + +_permutations = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED).permutations +state_path, lock_path = (get_state_paths(threshold) if dedup_enabled else (None, None)) + +# ── Aggregate stats ─────────────────────────────────────────────────────────── + +stats_all = { + 'iterations': 0, + 'pulls_succeeded': 0, + 'pulls_failed': 0, + 'raw_total': 0, + 'normalized_total': 0, + 'filtered_total': 0, + 'enriched_total': 0, + 'unique_total': 0, + 'duplicates_total': 0, + 'written_files': [], + 'last_window_from': last_to, + 'last_window_to': None, + 'last_error': None, +} + +start_t = time.time() +iter_cnt = 0 +stop_reason = 'completed' + +# ── Main loop ───────────────────────────────────────────────────────────────── + +try: + while True: + iter_cnt += 1 + stats_all['iterations'] = iter_cnt + + if max_iterations and iter_cnt > max_iterations: + stop_reason = f'reached max_iterations={max_iterations}' + print(f'[loop] 
{stop_reason}') + break + if max_runtime_s and (time.time() - start_t) > max_runtime_s: + stop_reason = f'reached max_runtime_s={max_runtime_s}' + print(f'[loop] {stop_reason}') + break + + time_to_ts = int(time.time()) + time_from = last_to + if time_to_ts <= time_from: + # window not advanced yet (e.g., very short pull_interval); sleep and retry + time.sleep(pull_interval_s) + continue + + stats_all['last_window_from'] = time_from + stats_all['last_window_to'] = time_to_ts + + # ── Pull from TDP ───────────────────────────────────────────────────── + tdp_kwargs = { + 'action': 'search', + 'time_from': time_from, + 'time_to': time_to_ts, + 'net_data_type': list(net_data_types), + 'sql': sql_filter, + 'size': batch_size, + } + if assets_group: + tdp_kwargs['assets_group'] = list(assets_group) + + try: + resp = tool.run('tdp_log_search', **tdp_kwargs) + stats_all['pulls_succeeded'] += 1 + except Exception as _e: + stats_all['pulls_failed'] += 1 + stats_all['last_error'] = f'tdp_log_search failed: {_e}' + print(f'[pull] iter={iter_cnt}: tdp_log_search failed: {_e}') + # Do NOT advance the cursor on failure: we'll retry the same window next round. 
+ time.sleep(pull_interval_s) + continue + + raw_alerts = extract_alerts_from_response(resp) + if iter_cnt % log_progress_every == 0: + print(f'[pull] iter={iter_cnt}: window=[{time_from},{time_to_ts}] ' + f'({datetime.datetime.fromtimestamp(time_from)} → ' + f'{datetime.datetime.fromtimestamp(time_to_ts)}), raw={len(raw_alerts)}') + stats_all['raw_total'] += len(raw_alerts) + + # ── Normalize ───────────────────────────────────────────────────────── + normalized = [] + for a in raw_alerts: + n = normalize_single(a) + if n is not None: + normalized.append(n) + stats_all['normalized_total'] += len(normalized) + + # ── Filter ──────────────────────────────────────────────────────────── + if filter_enabled: + filtered = [] + for a in normalized: + a = dict(a) + ptype = get_process_type(a) + a['_process_type'] = ptype + a['_threat_type'] = str(a.get('threat_name', 'general') or 'general') + if ptype in NEED_ANALYSIS: + filtered.append(a) + else: + filtered = [ + {**a, + '_process_type': 'filter_disabled', + '_threat_type': str(a.get('threat_name', 'general') or 'general')} + for a in normalized + ] + stats_all['filtered_total'] += len(filtered) + + # ── Dedup ───────────────────────────────────────────────────────────── + enriched = [] + if dedup_enabled and filtered: + lock_fh = acquire_lock(lock_path) + try: + lsh_index, lsh_cache, dedup_key_cache, next_cluster_id = load_state(state_path, threshold) + if lsh_index is None: + lsh_index = MinHashLSH(threshold=threshold, num_perm=NUM_PERM) + lsh_cache, dedup_key_cache, next_cluster_id = {}, {}, 0 + cid_box = [next_cluster_id] + for a in filtered: + a = dict(a) + text_strict = '. '.join(str(a.get(f, ''))[:max_field_len] for f in strict_fields) + text_lsh = normalize_uri('. 
'.join(str(a.get(f, ''))[:max_field_len] for f in lsh_fields)) + mh = gen_minhash(text_lsh.lower(), _permutations) + sim_keys = lsh_index.query(mh) + if sim_keys: + cands = sim_keys[:100] + sims = [mh.jaccard(lsh_cache[k]) for k in cands] + cluster_id = cands[sims.index(max(sims))] + else: + cluster_id = cid_box[0] + cid_box[0] += 1 + lsh_index.insert(cluster_id, mh) + lsh_cache[cluster_id] = mh + a['_lsh_cluster_id'] = cluster_id + dk = hashlib.md5(f'{text_strict}. {cluster_id}'.encode('utf-8')).hexdigest() + a['dedup_key'] = dk + already = dk in dedup_key_cache + if already: + del dedup_key_cache[dk] + dedup_key_cache[dk] = None + a['is_duplicate'] = already + enriched.append(a) + evict_oldest(lsh_index, lsh_cache, dedup_key_cache, max_dedup_keys) + if len(lsh_cache) > LSH_CLUSTER_WARN_THRESHOLD or len(dedup_key_cache) > LSH_CLUSTER_WARN_THRESHOLD: + print(f'[dedup] WARNING: persisted state holds {len(lsh_cache)} clusters and ' + f'{len(dedup_key_cache)} dedup_keys (warn={LSH_CLUSTER_WARN_THRESHOLD})') + dump_state_atomic(state_path, lsh_index, lsh_cache, dedup_key_cache, threshold, cid_box[0]) + finally: + release_lock(lock_fh) + else: + for a in filtered: + a = dict(a) + text_strict = '. '.join(str(a.get(f, ''))[:max_field_len] for f in strict_fields) + text_lsh = '. '.join(str(a.get(f, ''))[:max_field_len] for f in lsh_fields) + dk = hashlib.md5(f'{text_strict}. 
{text_lsh}'.encode('utf-8')).hexdigest() + a['_lsh_cluster_id'] = None + a['dedup_key'] = dk + a['is_duplicate'] = False + enriched.append(a) + + # Unique within this batch (first-seen by dedup_key) + seen_keys = set() + unique_count = 0 + for a in enriched: + k = a.get('dedup_key') + if k not in seen_keys: + seen_keys.add(k) + unique_count += 1 + dup_count = len(enriched) - unique_count + stats_all['enriched_total'] += len(enriched) + stats_all['unique_total'] += unique_count + stats_all['duplicates_total'] += dup_count + + if enriched and iter_cnt % log_progress_every == 0: + print(f'[dedup] iter={iter_cnt}: enriched={len(enriched)}, ' + f'unique={unique_count}, duplicates={dup_count}') + + # ── Write to disk ───────────────────────────────────────────────────── + if enriched: + try: + _now = datetime.datetime.now() + out_dir = get_output_dir(_now) + written_paths = _write_jsonl(out_dir, enriched, _now) + for p in written_paths: + if p not in stats_all['written_files']: + stats_all['written_files'].append(p) + if iter_cnt % log_progress_every == 0: + print(f'[write] iter={iter_cnt}: {len(enriched)} → {written_paths[-1] if written_paths else ""}') + except Exception as _we: + stats_all['last_error'] = f'write failed: {_we}' + print(f'[write] iter={iter_cnt}: failed: {_we}\n{traceback.format_exc()}') + + # ── Advance cursor ──────────────────────────────────────────────────── + last_to = time_to_ts + save_cursor({ + 'next_from': last_to, + 'updated_at': datetime.datetime.now().isoformat(), + 'iter': iter_cnt, + 'workflow': WORKFLOW_NAME, + }) + + time.sleep(pull_interval_s) + +except KeyboardInterrupt: + stop_reason = 'KeyboardInterrupt' + print('[loop] interrupted by user') +except Exception as _loop_err: + stop_reason = f'unhandled error: {_loop_err}' + stats_all['last_error'] = f'unhandled: {_loop_err}' + print(f'[loop] unhandled error: {_loop_err}\n{traceback.format_exc()}') + +# ── Outputs ─────────────────────────────────────────────────────────────────── + 
+summary = ( + f'{WORKFLOW_NAME} done: iters={stats_all["iterations"]}, ' + f'pulls(ok={stats_all["pulls_succeeded"]}, fail={stats_all["pulls_failed"]}), ' + f'raw={stats_all["raw_total"]}, normalized={stats_all["normalized_total"]}, ' + f'filtered={stats_all["filtered_total"]}, enriched={stats_all["enriched_total"]}, ' + f'unique={stats_all["unique_total"]} (compression ' + f'{(stats_all["duplicates_total"] / stats_all["enriched_total"]) if stats_all["enriched_total"] else 0:.1%}), ' + f'files_written={len(stats_all["written_files"])}, stop={stop_reason}' +) +print(f'[done] {summary}') + +outputs['stats'] = stats_all +outputs['summary'] = summary +outputs['stop_reason'] = stop_reason +outputs['final_cursor'] = last_to +outputs['output_paths'] = list(stats_all['written_files']) +outputs['output_path'] = stats_all['written_files'][-1] if stats_all['written_files'] else '' diff --git a/.flocks/plugins/workflows/tdp_alert_pull_dedup/workflow.json b/.flocks/plugins/workflows/tdp_alert_pull_dedup/workflow.json new file mode 100644 index 000000000..5b2ba7374 --- /dev/null +++ b/.flocks/plugins/workflows/tdp_alert_pull_dedup/workflow.json @@ -0,0 +1,48 @@ +{ + "name": "tdp_alert_pull_dedup", + "description": "Long-running TDP alert puller + deduper. Each iteration calls the tdp_log_search tool to fetch attack-level HTTP alerts in a moving time window, normalizes them, filters non-HTTP/scan noise, deduplicates via URI-normalized 5-gram MinHash LSH (persistent across iterations / runs), and appends enriched alerts to JSONL files under ~/.flocks/workflows/tdp_alert_pull_dedup//alerts_NNN.jsonl. 
A persistent time cursor at ~/.flocks/workflows/tdp_alert_pull_dedup/cursor.json guarantees no gaps and no overlap across restarts.", + "description_cn": "长时间运行的 TDP 告警拉取 + 去重 Pipeline。单个 python 节点内 while 循环:每轮调用 tdp_log_search 拉取一个时间窗口内的攻击级 HTTP 告警 → 归一化 → 过滤(去扫描/非HTTP)→ URI 归一化 + 5-gram MinHash LSH 去重(持久化 LSH 状态,跨轮次/跨进程共享)→ 追加写入 ~/.flocks/workflows/tdp_alert_pull_dedup//alerts_NNN.jsonl,每文件 10,000 条上限。时间游标 ~/.flocks/workflows/tdp_alert_pull_dedup/cursor.json 持久化,重启可无重叠续拉。通过 pull_interval_s / max_iterations / max_runtime_s 控制循环节奏与停止条件。", + "start": "pull_dedup_loop", + "nodes": [ + { + "id": "pull_dedup_loop", + "type": "python", + "description": "持续拉取 TDP 告警的主循环节点。内部 while 循环执行:调用 tdp_log_search → 归一化 → 过滤 → LSH 去重 → 写盘。time_from 来自持久化游标(首次回退 initial_lookback_s 秒),time_to=当前时间,保证窗口连续无重叠。停止条件:max_iterations / max_runtime_s 任一达到即返回;外部取消(如 SIGINT 或节点超时)也会优雅退出。", + "code": "\nimport os\nimport re\nimport sys\nimport json\nimport time\nimport pickle\nimport hashlib\nimport datetime\nimport traceback\nfrom pathlib import Path\n\nfrom datasketch import MinHash, MinHashLSH\n\nIS_WINDOWS = sys.platform == 'win32'\nif IS_WINDOWS:\n import msvcrt # noqa: F401\nelse:\n import fcntl # noqa: F401\n\nWORKFLOW_NAME = 'tdp_alert_pull_dedup'\nMINHASH_SEED = 2024\nNUM_PERM = 128\nLSH_CLUSTER_WARN_THRESHOLD = 100000\nMAX_RECORDS_PER_FILE = 10000\n_JSONL_PREFIX = 'alerts'\n\nHTTP_METHODS = ['GET', 'POST', 'PUT', 'DELETE', 'HEAD', 'OPTIONS', 'PATCH', 'TRACE']\n\nTDP_FIELD_MAP = {\n 'customer_uuid': 'customer_uuid',\n 'device_id': 'device_id',\n 'id': 'id',\n 'time': 'time',\n 'direction': 'direction',\n 'sip': 'net_real_src_ip',\n 'dip': 'net_dest_ip',\n 'sport': 'net_src_port',\n 'dport': 'net_dest_port',\n 'net_type': 'net_type',\n 'net_app_proto': 'net_app_proto',\n 'req_http_url': 'net_http_url',\n 'req_user_agent':'net_http_reqs_user_agent',\n 'req_host': 'net_http_reqs_host',\n 'req_line': 'net_http_reqs_line',\n 'req_header': 'net_http_reqs_header',\n 'req_body': 
'net_http_reqs_body',\n 'req_cookie': 'net_http_reqs_cookie',\n 'req_body_len': 'net_http_reqs_content_length',\n 'rsp_status_code': 'net_http_status',\n 'rsp_line': 'net_http_resp_line',\n 'rsp_header': 'net_http_resp_header',\n 'rsp_body': 'net_http_resp_body',\n 'rsp_body_len': 'net_http_resp_content_length',\n 'net_bytes_toclient': 'net_bytes_toclient',\n 'net_bytes_toserver': 'net_bytes_toserver',\n 'threat_rule_id': 'threat_suuid',\n 'threat_name': 'threat_name',\n 'threat_msg': 'threat_msg',\n 'threat_ioc': 'threat_ioc',\n 'threat_level': 'threat_level',\n 'threat_severity': 'threat_severity',\n 'threat_phase': 'threat_phase',\n 'threat_type': 'threat_type',\n 'threat_result': 'threat_result',\n 'threat_confidence': 'threat_confidence',\n 'connection_established': 'established',\n 'asset_group_name': 'dest_assets_group_name',\n 'asset_name': 'dest_assets_latestName',\n}\n\nNEED_ANALYSIS = {\n 'alert_not_scan_http_direction_in',\n 'alert_not_scan_http_direction_out',\n 'alert_not_scan_http_direction_lateral',\n}\n\n# ── Paths ─────────────────────────────────────────────────────────────────────\n\ndef get_workflow_root():\n from flocks.config import Config\n flocks_root = Config().get_global().data_dir.parent # ~/.flocks\n root = Path(flocks_root) / 'workflows' / WORKFLOW_NAME\n root.mkdir(parents=True, exist_ok=True)\n return root\n\n\ndef get_state_paths(threshold):\n base = str(get_workflow_root() / f'lsh_state_np{NUM_PERM}_th{int(threshold * 100)}')\n return base + '.pkl', base + '.lock'\n\n\ndef get_cursor_path():\n return str(get_workflow_root() / 'cursor.json')\n\n\ndef get_output_dir(now):\n date_str = now.strftime('%Y-%m-%d')\n out_dir = get_workflow_root() / date_str\n out_dir.mkdir(parents=True, exist_ok=True)\n return str(out_dir)\n\n\n# ── File locking ──────────────────────────────────────────────────────────────\n\ndef acquire_lock(lock_path):\n fh = open(lock_path, 'w+')\n if IS_WINDOWS:\n fh.write('L'); fh.flush(); fh.seek(0)\n while True:\n 
try:\n msvcrt.locking(fh.fileno(), msvcrt.LK_LOCK, 1); break\n except OSError:\n continue\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_EX)\n return fh\n\n\ndef release_lock(fh):\n try:\n if IS_WINDOWS:\n try:\n fh.seek(0); msvcrt.locking(fh.fileno(), msvcrt.LK_UNLCK, 1)\n except OSError:\n pass\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_UN)\n finally:\n fh.close()\n\n\n# ── LSH state ─────────────────────────────────────────────────────────────────\n\ndef load_state(state_path, threshold):\n if not state_path or not os.path.exists(state_path) or os.path.getsize(state_path) == 0:\n return None, None, None, 0\n try:\n with open(state_path, 'rb') as f:\n state = pickle.load(f)\n if state.get('num_perm') != NUM_PERM or state.get('threshold') != threshold:\n print(f'[dedup] state params mismatch, starting fresh')\n return None, None, None, 0\n cache = state['lsh_cache']\n seen_raw = state.get('dedup_key_cache', {})\n seen = {k: None for k in seen_raw} if isinstance(seen_raw, set) else (seen_raw if isinstance(seen_raw, dict) else {})\n next_cid = state.get('next_cluster_id') or ((max(cache.keys()) + 1) if cache else 0)\n return state['lsh_index'], cache, seen, next_cid\n except Exception as e:\n print(f'[dedup] failed to load state ({e}), starting fresh')\n return None, None, None, 0\n\n\ndef evict_oldest(lsh_index, lsh_cache, dedup_key_cache, max_keys):\n evicted_keys = evicted_clusters = 0\n excess = len(dedup_key_cache) - max_keys\n if excess > 0:\n for k in list(dedup_key_cache.keys())[:excess]:\n del dedup_key_cache[k]\n evicted_keys = excess\n excess = len(lsh_cache) - max_keys\n if excess > 0:\n for cid in list(lsh_cache.keys())[:excess]:\n try: lsh_index.remove(cid)\n except (KeyError, ValueError): pass\n del lsh_cache[cid]\n evicted_clusters = excess\n return evicted_keys, evicted_clusters\n\n\ndef dump_state_atomic(state_path, lsh_index, lsh_cache, dedup_key_cache, threshold, next_cluster_id):\n tmp = state_path + '.tmp'\n try:\n state = {\n 'lsh_index': 
lsh_index, 'lsh_cache': lsh_cache,\n 'dedup_key_cache': dedup_key_cache, 'next_cluster_id': next_cluster_id,\n 'num_perm': NUM_PERM, 'threshold': threshold,\n }\n with open(tmp, 'wb') as f:\n pickle.dump(state, f); f.flush(); os.fsync(f.fileno())\n os.replace(tmp, state_path)\n except Exception as e:\n print(f'[dedup] failed to save state: {e}')\n if os.path.exists(tmp):\n try: os.remove(tmp)\n except Exception: pass\n\n\n# ── Cursor ────────────────────────────────────────────────────────────────────\n\ndef load_cursor():\n p = get_cursor_path()\n if not os.path.exists(p):\n return None\n try:\n with open(p, 'r', encoding='utf-8') as f:\n return json.load(f)\n except Exception:\n return None\n\n\ndef save_cursor(cursor):\n p = get_cursor_path()\n tmp = p + '.tmp'\n try:\n with open(tmp, 'w', encoding='utf-8') as f:\n json.dump(cursor, f, ensure_ascii=False)\n f.flush(); os.fsync(f.fileno())\n os.replace(tmp, p)\n except Exception as e:\n print(f'[cursor] save failed: {e}')\n\n\n# ── Normalize ─────────────────────────────────────────────────────────────────\n\ndef flatten_dict(d, prefix=''):\n res = {}\n for k, v in d.items():\n if isinstance(v, dict):\n res.update(flatten_dict(v, f'{prefix}{k}_'))\n else:\n res[f'{prefix}{k}'] = v\n return res\n\n\ndef normalize_single(alert):\n import uuid as _uuid\n if not isinstance(alert, dict):\n return None\n flat = flatten_dict(alert)\n norm = {}\n for std_key, raw_key in TDP_FIELD_MAP.items():\n norm[std_key] = flat.get(raw_key, 'none') if raw_key != 'none' else 'none'\n if norm.get('id') in ('none', None, ''):\n norm['id'] = str(_uuid.uuid3(_uuid.NAMESPACE_DNS, ''.join(str(v) for v in norm.values())))\n if norm.get('net_type') in ('none', None, ''):\n method = flat.get('method', 'none')\n norm['net_type'] = 'http' if method in HTTP_METHODS else ('none' if method == 'none' else 'other')\n norm['_source_type'] = 'tdp'\n return norm\n\n\n# ── Filter ────────────────────────────────────────────────────────────────────\n\ndef 
is_scan_alert(threat_name):\n tnl = str(threat_name or '').lower()\n return ('扫描' in tnl) and ('webshell' not in tnl)\n\n\ndef is_http(alert):\n for field in ('application_layer_protocol', 'net_type', 'net_app_proto'):\n val = str(alert.get(field, '') or '').lower()\n if val and val != 'none' and 'http' in val:\n return True\n return False\n\n\ndef get_process_type(alert):\n threat_name = alert.get('threat_name', '')\n direction = str(alert.get('direction', '') or '').lower()\n scan = is_scan_alert(threat_name)\n http = is_http(alert)\n if scan:\n return f'alert_scan_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_scan_direction_in'\n if http:\n return f'alert_not_scan_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_scan_http_direction_in'\n return f'alert_not_scan_not_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_process'\n\n\n# ── Dedup helpers ─────────────────────────────────────────────────────────────\n\ndef normalize_uri(uri):\n uri = str(uri or '')\n uri = re.sub(r'\\d{4}-\\d{2}-\\d{2}', 'DATETIME', uri)\n uri = re.sub(r'[\\da-f]{8}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{12}', 'UUID', uri, flags=re.IGNORECASE)\n uri = re.sub(r'(\\.\\./)+', 'TRAVERSAL', uri)\n uri = re.sub(r'\\bNULL\\b', 'NULL_REPLACED', uri)\n uri = re.sub(r'chr\\$\\d+\\$\\|\\|chr\\$\\d+\\$', 'CHR_SEQUENCE', uri)\n uri = re.sub(r'\\b\\d+={1,2}\\d+\\b', 'NUMBER_COMPARISON', uri)\n uri = re.sub(r'\\b[a-fA-F0-9]{32}\\b', 'HEXADECIMAL CHARACTERS', uri)\n return uri\n\n\ndef gen_minhash(text, permutations):\n shingles = [text[i:i+5] for i in range(len(text) - 4)]\n m = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED, permutations=permutations)\n for s in shingles:\n m.update(s.encode('utf-8'))\n return m\n\n\n# ── TDP response unwrapping ───────────────────────────────────────────────────\n\ndef extract_alerts_from_response(resp):\n \"\"\"tdp_log_search returns the inner 'data' field from 
TDP API (already unwrapped).\n\n Tolerate several shapes:\n - list → used as-is\n - dict with 'log' / 'logs' / 'list' / 'data' / 'records' / 'items' key\n (possibly nested one level)\n \"\"\"\n if resp is None:\n return []\n if isinstance(resp, list):\n return list(resp)\n if isinstance(resp, dict):\n for key in ('log', 'logs', 'list', 'data', 'records', 'items', 'hits'):\n v = resp.get(key)\n if isinstance(v, list):\n return list(v)\n if isinstance(v, dict):\n for sub in ('list', 'data', 'records', 'items', 'hits'):\n sv = v.get(sub)\n if isinstance(sv, list):\n return list(sv)\n return []\n\n\n# ── JSONL writer ──────────────────────────────────────────────────────────────\n\ndef _count_alert_lines(file_path):\n count = 0\n try:\n with open(file_path, 'r', encoding='utf-8') as _f:\n for _line in _f:\n _s = _line.strip()\n if _s and '\"_type\"' not in _s:\n count += 1\n except Exception:\n pass\n return count\n\n\ndef _find_active_file(out_dir):\n import glob\n pattern = os.path.join(out_dir, _JSONL_PREFIX + '_*.jsonl')\n existing = sorted(glob.glob(pattern))\n if not existing:\n return None, 0, 0\n latest = existing[-1]\n basename = os.path.basename(latest)\n try:\n seq = int(basename.replace(_JSONL_PREFIX + '_', '').replace('.jsonl', ''))\n except ValueError:\n seq = len(existing)\n count = _count_alert_lines(latest)\n return latest, count, seq\n\n\ndef _write_jsonl(out_dir, alerts, now):\n written = []\n active_path, active_count, seq = _find_active_file(out_dir)\n remaining = list(alerts)\n while remaining:\n available = MAX_RECORDS_PER_FILE - active_count\n if available <= 0 or active_path is None:\n seq += 1\n active_path = os.path.join(out_dir, f'{_JSONL_PREFIX}_{seq:03d}.jsonl')\n active_count = 0\n available = MAX_RECORDS_PER_FILE\n header = {\n '_type': 'file_header',\n 'created_at': now.isoformat(),\n 'date': now.strftime('%Y-%m-%d'),\n 'workflow': WORKFLOW_NAME,\n 'seq': seq,\n }\n with open(active_path, 'w', encoding='utf-8') as _hf:\n 
_hf.write(json.dumps(header, ensure_ascii=False) + '\\n')\n batch = remaining[:available]\n remaining = remaining[available:]\n with open(active_path, 'a', encoding='utf-8') as _af:\n for _alert in batch:\n _af.write(json.dumps(_alert, ensure_ascii=False, default=str) + '\\n')\n active_count += len(batch)\n if active_path not in written:\n written.append(active_path)\n if remaining:\n active_path = None\n active_count = 0\n return written\n\n\n# ── Inputs ────────────────────────────────────────────────────────────────────\n\npull_interval_s = float(inputs.get('pull_interval_s', 60))\ninitial_lookback_s = int(inputs.get('initial_lookback_s', 300))\nmax_iterations = int(inputs.get('max_iterations', 0)) # 0 = infinite\nmax_runtime_s = float(inputs.get('max_runtime_s', 0)) # 0 = no time limit\nbatch_size = int(inputs.get('batch_size', 1000))\nnet_data_types = inputs.get('net_data_types', ['attack'])\nif isinstance(net_data_types, str):\n net_data_types = [s.strip() for s in net_data_types.split(',') if s.strip()]\nsql_filter = str(inputs.get('sql', \"threat.level = 'attack'\") or \"threat.level = 'attack'\")\nassets_group = inputs.get('assets_group') or []\nfilter_enabled = bool(inputs.get('filter_enabled', True))\ndedup_enabled = bool(inputs.get('dedup_enabled', True))\nthreshold = float(inputs.get('threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_field_len = int(inputs.get('max_field_len', 500))\nmax_dedup_keys = int(inputs.get('max_dedup_keys', 100000))\nreset_cursor = bool(inputs.get('reset_cursor', False))\nlog_progress_every = max(1, int(inputs.get('log_progress_every', 1)))\n\nif max_dedup_keys < 1:\n max_dedup_keys = 100000\nif pull_interval_s < 0.1:\n pull_interval_s = 0.1\nif batch_size < 1:\n batch_size = 1\nif batch_size > 10000:\n batch_size = 10000\n\nprint(f'[init] workflow={WORKFLOW_NAME}')\nprint(f'[init] 
pull_interval_s={pull_interval_s}, initial_lookback_s={initial_lookback_s}, '\n f'batch_size={batch_size}, max_iterations={max_iterations}, max_runtime_s={max_runtime_s}')\nprint(f'[init] sql={sql_filter!r}, net_data_types={net_data_types}, assets_group={list(assets_group) if assets_group else []}')\nprint(f'[init] filter_enabled={filter_enabled}, dedup_enabled={dedup_enabled}, '\n f'threshold={threshold}, max_dedup_keys={max_dedup_keys}')\nprint(f'[init] output_root={get_workflow_root()}')\n\n# ── Cursor init ───────────────────────────────────────────────────────────────\n\nnow_ts = int(time.time())\nif reset_cursor:\n cur = None\n print('[cursor] reset_cursor=True, starting from initial_lookback_s')\nelse:\n cur = load_cursor()\n\nif cur and isinstance(cur.get('next_from'), int):\n last_to = int(cur['next_from'])\n print(f'[cursor] resumed: next_from={last_to} ({datetime.datetime.fromtimestamp(last_to)})')\nelse:\n last_to = now_ts - initial_lookback_s\n print(f'[cursor] fresh start: next_from={last_to} ({datetime.datetime.fromtimestamp(last_to)})')\n\n# ── MinHash permutations (init once) ──────────────────────────────────────────\n\n_permutations = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED).permutations\nstate_path, lock_path = (get_state_paths(threshold) if dedup_enabled else (None, None))\n\n# ── Aggregate stats ───────────────────────────────────────────────────────────\n\nstats_all = {\n 'iterations': 0,\n 'pulls_succeeded': 0,\n 'pulls_failed': 0,\n 'raw_total': 0,\n 'normalized_total': 0,\n 'filtered_total': 0,\n 'enriched_total': 0,\n 'unique_total': 0,\n 'duplicates_total': 0,\n 'written_files': [],\n 'last_window_from': last_to,\n 'last_window_to': None,\n 'last_error': None,\n}\n\nstart_t = time.time()\niter_cnt = 0\nstop_reason = 'completed'\n\n# ── Main loop ─────────────────────────────────────────────────────────────────\n\ntry:\n while True:\n iter_cnt += 1\n stats_all['iterations'] = iter_cnt\n\n if max_iterations and iter_cnt > 
max_iterations:\n stop_reason = f'reached max_iterations={max_iterations}'\n print(f'[loop] {stop_reason}')\n break\n if max_runtime_s and (time.time() - start_t) > max_runtime_s:\n stop_reason = f'reached max_runtime_s={max_runtime_s}'\n print(f'[loop] {stop_reason}')\n break\n\n time_to_ts = int(time.time())\n time_from = last_to\n if time_to_ts <= time_from:\n # window not advanced yet (e.g., very short pull_interval); sleep and retry\n time.sleep(pull_interval_s)\n continue\n\n stats_all['last_window_from'] = time_from\n stats_all['last_window_to'] = time_to_ts\n\n # ── Pull from TDP ─────────────────────────────────────────────────────\n tdp_kwargs = {\n 'action': 'search',\n 'time_from': time_from,\n 'time_to': time_to_ts,\n 'net_data_type': list(net_data_types),\n 'sql': sql_filter,\n 'size': batch_size,\n }\n if assets_group:\n tdp_kwargs['assets_group'] = list(assets_group)\n\n try:\n resp = tool.run('tdp_log_search', **tdp_kwargs)\n stats_all['pulls_succeeded'] += 1\n except Exception as _e:\n stats_all['pulls_failed'] += 1\n stats_all['last_error'] = f'tdp_log_search failed: {_e}'\n print(f'[pull] iter={iter_cnt}: tdp_log_search failed: {_e}')\n # Do NOT advance the cursor on failure: we'll retry the same window next round.\n time.sleep(pull_interval_s)\n continue\n\n raw_alerts = extract_alerts_from_response(resp)\n if iter_cnt % log_progress_every == 0:\n print(f'[pull] iter={iter_cnt}: window=[{time_from},{time_to_ts}] '\n f'({datetime.datetime.fromtimestamp(time_from)} → '\n f'{datetime.datetime.fromtimestamp(time_to_ts)}), raw={len(raw_alerts)}')\n stats_all['raw_total'] += len(raw_alerts)\n\n # ── Normalize ─────────────────────────────────────────────────────────\n normalized = []\n for a in raw_alerts:\n n = normalize_single(a)\n if n is not None:\n normalized.append(n)\n stats_all['normalized_total'] += len(normalized)\n\n # ── Filter ────────────────────────────────────────────────────────────\n if filter_enabled:\n filtered = []\n for a in 
normalized:\n a = dict(a)\n ptype = get_process_type(a)\n a['_process_type'] = ptype\n a['_threat_type'] = str(a.get('threat_name', 'general') or 'general')\n if ptype in NEED_ANALYSIS:\n filtered.append(a)\n else:\n filtered = [\n {**a,\n '_process_type': 'filter_disabled',\n '_threat_type': str(a.get('threat_name', 'general') or 'general')}\n for a in normalized\n ]\n stats_all['filtered_total'] += len(filtered)\n\n # ── Dedup ─────────────────────────────────────────────────────────────\n enriched = []\n if dedup_enabled and filtered:\n lock_fh = acquire_lock(lock_path)\n try:\n lsh_index, lsh_cache, dedup_key_cache, next_cluster_id = load_state(state_path, threshold)\n if lsh_index is None:\n lsh_index = MinHashLSH(threshold=threshold, num_perm=NUM_PERM)\n lsh_cache, dedup_key_cache, next_cluster_id = {}, {}, 0\n cid_box = [next_cluster_id]\n for a in filtered:\n a = dict(a)\n text_strict = '. '.join(str(a.get(f, ''))[:max_field_len] for f in strict_fields)\n text_lsh = normalize_uri('. '.join(str(a.get(f, ''))[:max_field_len] for f in lsh_fields))\n mh = gen_minhash(text_lsh.lower(), _permutations)\n sim_keys = lsh_index.query(mh)\n if sim_keys:\n cands = sim_keys[:100]\n sims = [mh.jaccard(lsh_cache[k]) for k in cands]\n cluster_id = cands[sims.index(max(sims))]\n else:\n cluster_id = cid_box[0]\n cid_box[0] += 1\n lsh_index.insert(cluster_id, mh)\n lsh_cache[cluster_id] = mh\n a['_lsh_cluster_id'] = cluster_id\n dk = hashlib.md5(f'{text_strict}. 
{cluster_id}'.encode('utf-8')).hexdigest()\n a['dedup_key'] = dk\n already = dk in dedup_key_cache\n if already:\n del dedup_key_cache[dk]\n dedup_key_cache[dk] = None\n a['is_duplicate'] = already\n enriched.append(a)\n evict_oldest(lsh_index, lsh_cache, dedup_key_cache, max_dedup_keys)\n if len(lsh_cache) > LSH_CLUSTER_WARN_THRESHOLD or len(dedup_key_cache) > LSH_CLUSTER_WARN_THRESHOLD:\n print(f'[dedup] WARNING: persisted state holds {len(lsh_cache)} clusters and '\n f'{len(dedup_key_cache)} dedup_keys (warn={LSH_CLUSTER_WARN_THRESHOLD})')\n dump_state_atomic(state_path, lsh_index, lsh_cache, dedup_key_cache, threshold, cid_box[0])\n finally:\n release_lock(lock_fh)\n else:\n for a in filtered:\n a = dict(a)\n text_strict = '. '.join(str(a.get(f, ''))[:max_field_len] for f in strict_fields)\n text_lsh = '. '.join(str(a.get(f, ''))[:max_field_len] for f in lsh_fields)\n dk = hashlib.md5(f'{text_strict}. {text_lsh}'.encode('utf-8')).hexdigest()\n a['_lsh_cluster_id'] = None\n a['dedup_key'] = dk\n a['is_duplicate'] = False\n enriched.append(a)\n\n # Unique within this batch (first-seen by dedup_key)\n seen_keys = set()\n unique_count = 0\n for a in enriched:\n k = a.get('dedup_key')\n if k not in seen_keys:\n seen_keys.add(k)\n unique_count += 1\n dup_count = len(enriched) - unique_count\n stats_all['enriched_total'] += len(enriched)\n stats_all['unique_total'] += unique_count\n stats_all['duplicates_total'] += dup_count\n\n if enriched and iter_cnt % log_progress_every == 0:\n print(f'[dedup] iter={iter_cnt}: enriched={len(enriched)}, '\n f'unique={unique_count}, duplicates={dup_count}')\n\n # ── Write to disk ─────────────────────────────────────────────────────\n if enriched:\n try:\n _now = datetime.datetime.now()\n out_dir = get_output_dir(_now)\n written_paths = _write_jsonl(out_dir, enriched, _now)\n for p in written_paths:\n if p not in stats_all['written_files']:\n stats_all['written_files'].append(p)\n if iter_cnt % log_progress_every == 0:\n 
print(f'[write] iter={iter_cnt}: {len(enriched)} → {written_paths[-1] if written_paths else \"\"}')\n except Exception as _we:\n stats_all['last_error'] = f'write failed: {_we}'\n print(f'[write] iter={iter_cnt}: failed: {_we}\\n{traceback.format_exc()}')\n\n # ── Advance cursor ────────────────────────────────────────────────────\n last_to = time_to_ts\n save_cursor({\n 'next_from': last_to,\n 'updated_at': datetime.datetime.now().isoformat(),\n 'iter': iter_cnt,\n 'workflow': WORKFLOW_NAME,\n })\n\n time.sleep(pull_interval_s)\n\nexcept KeyboardInterrupt:\n stop_reason = 'KeyboardInterrupt'\n print('[loop] interrupted by user')\nexcept Exception as _loop_err:\n stop_reason = f'unhandled error: {_loop_err}'\n stats_all['last_error'] = f'unhandled: {_loop_err}'\n print(f'[loop] unhandled error: {_loop_err}\\n{traceback.format_exc()}')\n\n# ── Outputs ───────────────────────────────────────────────────────────────────\n\nsummary = (\n f'{WORKFLOW_NAME} done: iters={stats_all[\"iterations\"]}, '\n f'pulls(ok={stats_all[\"pulls_succeeded\"]}, fail={stats_all[\"pulls_failed\"]}), '\n f'raw={stats_all[\"raw_total\"]}, normalized={stats_all[\"normalized_total\"]}, '\n f'filtered={stats_all[\"filtered_total\"]}, enriched={stats_all[\"enriched_total\"]}, '\n f'unique={stats_all[\"unique_total\"]} (compression '\n f'{(stats_all[\"duplicates_total\"] / stats_all[\"enriched_total\"]) if stats_all[\"enriched_total\"] else 0:.1%}), '\n f'files_written={len(stats_all[\"written_files\"])}, stop={stop_reason}'\n)\nprint(f'[done] {summary}')\n\noutputs['stats'] = stats_all\noutputs['summary'] = summary\noutputs['stop_reason'] = stop_reason\noutputs['final_cursor'] = last_to\noutputs['output_paths'] = list(stats_all['written_files'])\noutputs['output_path'] = stats_all['written_files'][-1] if stats_all['written_files'] else ''\n" + } + ], + "edges": [], + "metadata": { + "node_timeout_s": 2592000, + "sampleInputs": { + "pull_interval_s": 60, + "initial_lookback_s": 300, + 
"max_iterations": 0, + "max_runtime_s": 0, + "batch_size": 1000, + "net_data_types": [ + "attack" + ], + "sql": "threat.level = 'attack'", + "assets_group": [], + "filter_enabled": true, + "dedup_enabled": true, + "threshold": 0.7, + "strict_fields": [ + "sip", + "dip" + ], + "lsh_fields": [ + "req_http_url", + "req_body", + "rsp_body" + ], + "max_field_len": 500, + "max_dedup_keys": 100000, + "reset_cursor": false, + "log_progress_every": 1, + "_comment_runtime": "node_timeout_s 默认 30 天(2,592,000s),适合长时间持续运行;若想短跑测试,把 max_iterations 调小或设 max_runtime_s 即可。", + "_comment_path": "输出落盘根目录:~/.flocks/workflows/tdp_alert_pull_dedup//alerts_NNN.jsonl;时间游标:~/.flocks/workflows/tdp_alert_pull_dedup/cursor.json;LSH 持久化:~/.flocks/workflows/tdp_alert_pull_dedup/lsh_state_np128_th{int(threshold*100)}.pkl" + } + } +} \ No newline at end of file diff --git a/.flocks/plugins/workflows/tdp_alert_pull_dedup/workflow.md b/.flocks/plugins/workflows/tdp_alert_pull_dedup/workflow.md new file mode 100644 index 000000000..23ad2cafa --- /dev/null +++ b/.flocks/plugins/workflows/tdp_alert_pull_dedup/workflow.md @@ -0,0 +1,219 @@ +# tdp_alert_pull_dedup + +**TDP 告警持续拉取 + 去重 Pipeline** + +参考 `stream_alert_dedup` 的处理流水线,把"数据源"从 syslog 监听换成主动调用 TDP v3.3.10 的 `tdp_log_search` 工具拉取。 + +工作流启动后,单个节点内部以 `while` 循环持续运行:每轮从 TDP 拉取一个时间窗口的告警 → 归一化 → 过滤(去扫描/非HTTP)→ MinHash LSH 去重 → 追加写入按日期切分的 JSONL 文件,直到达到 `max_iterations` / `max_runtime_s` 任一停止条件,或被外部取消。 + +## 工作流图 + +``` +pull_dedup_loop (循环节点;启动后持续运行) + │ + ├─ tool.run('tdp_log_search', ...) 
── TDP 告警拉取 + ├─ normalize ── 字段映射到统一 schema + ├─ filter_logs ── 9 分类,保留非扫描 HTTP 告警 + ├─ dedup ── URI 归一化 + 5-gram MinHash LSH + └─ append JSONL ── 按日期 + 序号分卷写盘 + ▲ + │ + advance time cursor (持久化) +``` + +## 输入参数 + +| 参数 | 类型 | 默认值 | 说明 | +|------|------|--------|------| +| `pull_interval_s` | `float` | `60` | 每两次拉取之间的休眠秒数(不含 TDP 响应时间) | +| `initial_lookback_s` | `int` | `300` | 首次启动(无持久化游标时)从 `now - initial_lookback_s` 开始 | +| `max_iterations` | `int` | `0` | 最大循环次数,`0` 表示无限循环直到外部取消 | +| `max_runtime_s` | `float` | `0` | 最长运行时长(秒),`0` 表示无限制 | +| `batch_size` | `int` | `1000` | 单次 TDP 拉取的最大告警数(映射到 `size`,上限 10000) | +| `net_data_types` | `list[str]` | `["attack"]` | 传给 `tdp_log_search` 的 `net_data_type`,可选 `attack` / `risk` / `action` | +| `sql` | `str` | `"threat.level = 'attack'"` | TDP 过滤表达式(**不是完整 SQL**),用于过滤拉取范围 | +| `assets_group` | `list[int]` | `[]` | 业务组 ID 列表,可选 | +| `filter_enabled` | `bool` | `true` | 是否启用 9 分类过滤(去扫描 / 仅留 HTTP) | +| `dedup_enabled` | `bool` | `true` | 是否启用 LSH 去重(关闭后仅记录原始 dedup_key、不跨批次感知) | +| `threshold` | `float` | `0.7` | Jaccard 相似度阈值 | +| `strict_fields` | `list[str]` | `["sip","dip"]` | 精确匹配字段(拼接进 dedup_key) | +| `lsh_fields` | `list[str]` | `["req_http_url","req_body","rsp_body"]` | 模糊匹配字段(URI 归一化后做 MinHash) | +| `max_field_len` | `int` | `500` | 单字段截断长度 | +| `max_dedup_keys` | `int` | `100000` | 去重状态容量上限:dedup_key 缓存与 LSH 簇缓存各自最多保留的条目数,超出后逐出最旧条目 | +| `reset_cursor` | `bool` | `false` | `true` 时忽略已有游标,重新从 `now - initial_lookback_s` 开始 | +| `log_progress_every` | `int` | `1` | 每隔 N 轮打印一次进度日志(避免日志过于频繁) | + +> ⚠️ 启动时**不要**传 `time_from` / `time_to`,工作流会自己用游标推进。要从指定时间开始重拉,把 `reset_cursor` 设为 `true` 并调整 `initial_lookback_s`。 + +## 输出参数 + +工作流执行结束(达到停止条件或被取消)后写入: + +| 字段 | 类型 | 说明 | +|------|------|------| +| `summary` | `str` | 一行摘要(iters / pulls / raw / unique / files / stop_reason) | +| `stop_reason` | `str` | 退出原因:`completed` / `reached max_iterations=N` / `reached max_runtime_s=X` / `KeyboardInterrupt` / `unhandled error: ...` | +| `final_cursor` | `int` |
最后一次成功推进到的时间戳(下次启动从此处继续) | +| `output_paths` | `list[str]` | 本次运行写入的所有 JSONL 文件路径 | +| `output_path` | `str` | 最后写入的 JSONL 文件路径(便于单值消费) | +| `stats` | `dict` | 完整统计(见下表) | + +### stats 字段 + +| 字段 | 说明 | +|------|------| +| `iterations` | 实际执行的循环轮次数 | +| `pulls_succeeded` / `pulls_failed` | TDP API 调用成功 / 失败次数 | +| `raw_total` | 从 TDP 拉到的原始告警数总和 | +| `normalized_total` | 归一化后告警数总和 | +| `filtered_total` | 过滤后保留的告警数总和(filter_enabled=true 时) | +| `enriched_total` | 经过去重处理的告警总数(含重复) | +| `unique_total` | 各轮批内唯一 dedup_key 数的累加(按轮统计,不跨轮合并) | +| `duplicates_total` | 各轮批内重复(dedup_key 与同批更早告警相同)的告警数累加;跨轮重复请看每条告警的 `is_duplicate` | +| `written_files` | 本次运行追加写入的所有文件路径列表 | +| `last_window_from` / `last_window_to` | 最近一次拉取的时间窗口 | +| `last_error` | 最近一次错误描述(无错误时为 `null`) | + +## 文件落盘 + +### 告警结果(每轮追加) + +``` +~/.flocks/workflows/tdp_alert_pull_dedup/<YYYY-MM-DD>/alerts_NNN.jsonl +``` + +- **JSONL 格式**:每行一个 JSON 对象。 +- **首行**:`{"_type":"file_header", "created_at":..., "date":..., "workflow":"tdp_alert_pull_dedup", "seq":N}`(不计入告警条数)。 +- **后续行**:每行一条 enriched_alert(归一化字段 + 去重字段)。 +- **分卷规则**:每文件最多 **10,000 条**告警,超出时自动新建 `alerts_002.jsonl`、`alerts_003.jsonl`… +- **跨天滚动**:每轮检测当前日期,自动写入新的 `<YYYY-MM-DD>/` 目录。 + +### 时间游标(断点续传) + +``` +~/.flocks/workflows/tdp_alert_pull_dedup/cursor.json +``` + +```json +{ + "next_from": 1715501234, + "updated_at": "2026-05-12T15:43:54.123456", + "iter": 42, + "workflow": "tdp_alert_pull_dedup" +} +``` + +- 每轮处理结束后原子写入(先写临时文件再 `os.replace`)。注意:写盘失败只会记入 `stats.last_error`,游标仍会推进。 +- 重启工作流时自动加载,继续从 `next_from` 推进;相邻窗口首尾相接(上一轮的 `time_to` 即下一轮的 `time_from`)。 +- TDP 调用失败时不推进游标,下一轮会从同一个 `time_from` 重试。 + +### LSH 去重状态 + +``` +~/.flocks/workflows/tdp_alert_pull_dedup/lsh_state_np128_th{int(threshold*100)}.pkl +~/.flocks/workflows/tdp_alert_pull_dedup/lsh_state_np128_th{int(threshold*100)}.lock +``` + +- 原子写入 + 文件锁(跨进程安全)。 +- 淘汰策略:dedup_key 缓存与 LSH 簇缓存各自超过 `max_dedup_keys` 条后逐出最旧条目(dedup_key 命中时会被刷新为最新,近似 LRU;簇按创建顺序 FIFO)。 +- 不同 `threshold` 互相独立(避免不同阈值之间状态混淆)。 + +### 每条 enriched_alert 的增强字段 + +| 字段 | 说明 | +|------|------| +| `dedup_key` | MD5 去重键(`strict_fields + cluster_id` 的哈希;`dedup_enabled=false` 时为 `strict_fields + lsh_fields` 原文的哈希) | +| `is_duplicate` | 该 dedup_key 此前是否已出现过(含本轮更早的告警与历史轮次,跨轮持久化感知);`dedup_enabled=false` 时恒为 `false` | +| `_lsh_cluster_id` | MinHash LSH 簇 ID(`dedup_enabled=false` 时为 `null`) | +|
`_source_type` | 固定为 `tdp`(数据源) | +| `_process_type` | 过滤分类(如 `alert_not_scan_http_direction_in`) | +| `_threat_type` | 威胁类型字符串(同 `threat_name`) | + +## 节点说明 + +### `pull_dedup_loop`(唯一节点,长时间运行) + +`type: python`,`metadata.node_timeout_s = 2,592,000`(30 天)。 + +主循环步骤(每轮): +1. **时间窗口计算**:`time_from = 上次 time_to`(首次为 `now - initial_lookback_s`),`time_to = 当前时间戳`。 +2. **TDP 拉取**:`tool.run('tdp_log_search', action='search', time_from, time_to, net_data_type, sql, size)`,失败时 `pulls_failed++` 且**不**推进游标,下轮重试同一窗口。 +3. **响应解包**:自动识别 `list` / `{"log":[...]}` / `{"list":[...]}` / `{"data":[...]}` 等常见 TDP 返回结构。 +4. **归一化**:仅 TDP,复用 `stream_alert_dedup` 的 `TDP_FIELD_MAP`(嵌套字段也支持)。 +5. **过滤**:9 分类,保留 `alert_not_scan_http_direction_{in|out|lateral}`。 +6. **去重**:URI 归一化 + 5-gram MinHash LSH,跨轮 / 跨进程持久化。 +7. **写盘**:JSONL 追加,达到 10,000 条自动滚卷。 +8. **推进游标**:成功完成后 atomic 写入 `cursor.json`。 +9. **休眠**:`pull_interval_s` 秒。 + +退出条件: +- `iter > max_iterations`(且 `max_iterations > 0`) +- `elapsed > max_runtime_s`(且 `max_runtime_s > 0`) +- `KeyboardInterrupt` / 节点取消 +- 不可恢复异常(已被 catch,会写入 `stats.last_error`) + +## 与 stream_alert_dedup 的差异 + +| 维度 | stream_alert_dedup | tdp_alert_pull_dedup | +|------|--------------------|----------------------| +| 数据来源 | syslog 监听 / `alerts` / `alert_file` 三选一 | 主动调用 `tdp_log_search` 工具 | +| 触发方式 | 外部事件驱动(每收到一条触发一次工作流) | 工作流自身长时间运行(while 循环) | +| 多源支持 | TDP + Skyeye 自动识别 | 仅 TDP(数据来源固定) | +| 时间游标 | 无(事件驱动无需游标) | 持久化游标,断点续传 | +| 落盘路径 | `~/.flocks/workspace/workflows/stream_alert_dedup/<YYYY-MM-DD>/dedup_result_NNN.jsonl` | `~/.flocks/workflows/tdp_alert_pull_dedup/<YYYY-MM-DD>/alerts_NNN.jsonl` | +| LSH 状态 | `~/.flocks/workspace/workflows/stream_alert_dedup/lsh_state_*.pkl` | `~/.flocks/workflows/tdp_alert_pull_dedup/lsh_state_*.pkl` | + +> 两个工作流维护**独立**的 LSH 状态与去重历史,不会互相干扰。 + +## 运行方式 + +### 1. 通过 webui 启动 + +打开 webui → Workflows → `tdp_alert_pull_dedup` → 点击运行;可在右侧 RunTab 调整默认 `sampleInputs`。 + +### 2.
通过 API 启动 + +```bash +curl -s -X POST http://localhost:8000/api/workflow/tdp_alert_pull_dedup/run \ + -H 'Content-Type: application/json' \ + -d '{ + "inputs": { + "pull_interval_s": 60, + "initial_lookback_s": 600, + "max_iterations": 0, + "batch_size": 500, + "net_data_types": ["attack"], + "sql": "threat.level = '\''attack'\''", + "filter_enabled": true, + "dedup_enabled": true, + "threshold": 0.7 + } + }' +``` + +### 3. 短跑测试 + +```json +{ + "max_iterations": 5, + "pull_interval_s": 5, + "initial_lookback_s": 86400, + "reset_cursor": true +} +``` + +跑 5 轮:首轮拉取过去 24 小时的告警,之后每轮只拉取上一轮结束以来的新窗口;忽略已有游标,便于快速验证 pipeline。 + +## 前置条件 + +1. **TDP 凭据已配置**:`tdp_api_key` / `tdp_secret` / `tdp_host` 已通过 secrets 或 `api_services.tdp_api.base_url` 配置。可用 `python -c "from flocks.tool import ToolRegistry; ToolRegistry.init(); print(ToolRegistry.get('tdp_log_search'))"` 验证工具已注册。 +2. **`datasketch` 依赖**:和 `stream_alert_dedup` 共享,已在 flocks 项目依赖中。 +3. **写盘权限**:用户对 `~/.flocks/workflows/` 目录有读写权限。 + +## 工程要点 + +- **节点超时**:`node_timeout_s = 2,592,000` (30 天)。如需更长运行时间可调高 metadata,或拆成多次有限轮次执行(搭配 cron scheduler)。 +- **TDP 调用失败时的语义**:不推进游标,下次从**同一 `time_from`** 重试(`time_to` 取重试时的当前时间,窗口会相应变长),避免丢数据。但若 TDP 长时间不可用,建议外部监控 `stats.pulls_failed`。 +- **time_from = 上次 time_to**:闭区间还是开区间取决于 TDP 服务端实现。如观察到边界重复,开启 dedup 时这些告警会被标记为重复(`is_duplicate=true`,仍会写盘);若不开启 dedup,建议手动 `+1` 偏移。 +- **路径根目录**:通过 `Config().get_global().data_dir.parent` 解析 `~/.flocks`,避免硬编码用户目录。 +- **不依赖 syslog/Kafka**:与 `stream_alert_dedup` 解耦;如需同时跑两套去重,记得它们**不共享** LSH 历史。 From 4a73d7b2774e0e74cbac36c4656eac63b9208a20 Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Tue, 12 May 2026 10:14:21 +0800 Subject: [PATCH 31/41] fix(windows-installer): require elevation for installer shortcuts Route installer-created desktop and Start menu launchers through an elevated PowerShell helper so Windows prompts for UAC before starting Flocks. This keeps the shared flocks.cmd wrapper as the real entrypoint while preventing updater failures caused by insufficient permissions.
Co-authored-by: Cursor --- packaging/windows/flocks-setup.iss | 11 +++--- packaging/windows/start-flocks-elevated.ps1 | 16 +++++++++ .../test_browser_runtime_configuration.py | 36 ++++++++++++------- 3 files changed, 46 insertions(+), 17 deletions(-) create mode 100644 packaging/windows/start-flocks-elevated.ps1 diff --git a/packaging/windows/flocks-setup.iss b/packaging/windows/flocks-setup.iss index bccc59779..aa1643c68 100644 --- a/packaging/windows/flocks-setup.iss +++ b/packaging/windows/flocks-setup.iss @@ -49,13 +49,14 @@ Root: HKCU; Subkey: "Environment"; ValueType: string; ValueName: "FLOCKS_INSTALL Root: HKCU; Subkey: "Environment"; ValueType: string; ValueName: "FLOCKS_REPO_ROOT"; ValueData: "{app}\flocks"; Flags: uninsdeletevalue Root: HKCU; Subkey: "Environment"; ValueType: string; ValueName: "FLOCKS_NODE_HOME"; ValueData: "{app}\tools\node"; Flags: uninsdeletevalue -; Shortcuts intentionally target the same wrapper path that `scripts\install.ps1` -; writes, so the Start menu / desktop icon and `flocks start` typed in a new -; terminal are strictly equivalent across all install flows. +; Installer-created launch shortcuts intentionally go through a tiny elevation +; helper first, then invoke the same `%USERPROFILE%\.local\bin\flocks.cmd` +; wrapper that `scripts\install.ps1` writes. This keeps the app entrypoint +; consistent while letting Windows prompt for UAC on shortcut launches. 
[Icons] -Name: "{autoprograms}\{#MyAppName}\Start Flocks"; Filename: "{%USERPROFILE}\.local\bin\flocks.cmd"; Parameters: "start"; WorkingDir: "{%USERPROFILE}" +Name: "{autoprograms}\{#MyAppName}\Start Flocks"; Filename: "powershell.exe"; Parameters: "-NoProfile -ExecutionPolicy Bypass -WindowStyle Hidden -File ""{app}\flocks\packaging\windows\start-flocks-elevated.ps1"""; WorkingDir: "{%USERPROFILE}" Name: "{autoprograms}\{#MyAppName}\Flocks repository"; Filename: "{app}\flocks"; WorkingDir: "{app}\flocks" -Name: "{userdesktop}\{#MyAppName}"; Filename: "{%USERPROFILE}\.local\bin\flocks.cmd"; Parameters: "start"; WorkingDir: "{%USERPROFILE}"; Tasks: desktopicon +Name: "{userdesktop}\{#MyAppName}"; Filename: "powershell.exe"; Parameters: "-NoProfile -ExecutionPolicy Bypass -WindowStyle Hidden -File ""{app}\flocks\packaging\windows\start-flocks-elevated.ps1"""; WorkingDir: "{%USERPROFILE}"; Tasks: desktopicon [Run] Filename: "powershell.exe"; Parameters: "-NoProfile -ExecutionPolicy Bypass -File ""{app}\flocks\packaging\windows\bootstrap-windows.ps1"" -InstallRoot ""{app}"""; StatusMsg: "Setting up Python and JavaScript dependencies..."; Flags: runascurrentuser waituntilterminated diff --git a/packaging/windows/start-flocks-elevated.ps1 b/packaging/windows/start-flocks-elevated.ps1 new file mode 100644 index 000000000..2a7d5c145 --- /dev/null +++ b/packaging/windows/start-flocks-elevated.ps1 @@ -0,0 +1,16 @@ +[CmdletBinding()] +param() + +$wrapperPath = Join-Path $HOME ".local\bin\flocks.cmd" +if (-not (Test-Path -LiteralPath $wrapperPath)) { + throw "Flocks launcher not found: $wrapperPath" +} + +$cmdPath = $env:ComSpec +if ([string]::IsNullOrWhiteSpace($cmdPath)) { + $cmdPath = "cmd.exe" +} + +# Route installer-created shortcuts through UAC, but keep the real app entrypoint +# on the shared flocks.cmd wrapper so shortcut launches match terminal launches. 
+Start-Process -FilePath $cmdPath -ArgumentList @("/c", "`"$wrapperPath`" start") -WorkingDirectory $HOME -WindowStyle Hidden -Verb RunAs diff --git a/tests/scripts/test_browser_runtime_configuration.py b/tests/scripts/test_browser_runtime_configuration.py index 24fe47560..9b275fdeb 100644 --- a/tests/scripts/test_browser_runtime_configuration.py +++ b/tests/scripts/test_browser_runtime_configuration.py @@ -87,11 +87,9 @@ def test_inno_setup_points_to_packaging_bootstrap() -> None: assert "scripts\\bootstrap-windows.ps1" not in iss -def test_inno_shortcuts_point_to_user_local_bin_wrapper() -> None: - """Start-menu and desktop shortcuts must match the CLI wrapper location that - `scripts/install.ps1` writes, so `flocks start` triggered from the shortcut - and from a freshly opened terminal are strictly equivalent across all - install flows (source, one-liner, bundled installer).""" +def test_inno_shortcuts_point_to_elevated_launcher() -> None: + """Installer-created launch shortcuts should route through the elevated + launcher so clicking them triggers UAC before running the shared wrapper.""" iss = (PACKAGING_WINDOWS_DIR / "flocks-setup.iss").read_text(encoding="utf-8") icons_section_idx = iss.find("[Icons]") @@ -99,7 +97,8 @@ def test_inno_shortcuts_point_to_user_local_bin_wrapper() -> None: assert icons_section_idx != -1 and run_section_idx != -1 icons_block = iss[icons_section_idx:run_section_idx] - expected_target = "{%USERPROFILE}\\.local\\bin\\flocks.cmd" + expected_target = 'Filename: "powershell.exe"' + expected_script = 'start-flocks-elevated.ps1' start_menu_lines = [ line for line in icons_block.splitlines() @@ -108,14 +107,27 @@ def test_inno_shortcuts_point_to_user_local_bin_wrapper() -> None: assert start_menu_lines, "expected Start Flocks + desktop shortcut entries" for line in start_menu_lines: assert expected_target in line, ( - f"shortcut must target the shared wrapper path; got: {line}" + f"shortcut must target PowerShell launcher; got: {line}" ) 
- assert 'Parameters: "start"' in line + assert expected_script in line + assert "-WindowStyle Hidden" in line - # Guard against accidentally re-introducing a shortcut to {app}\bin, which - # would point to a non-existent file because install.ps1 writes the wrapper - # under %USERPROFILE%\.local\bin. - assert "{app}\\bin\\flocks.cmd" not in icons_block + # Guard against accidentally re-introducing direct shortcut launches that + # bypass the UAC prompt. + assert "{%USERPROFILE}\\.local\\bin\\flocks.cmd" not in icons_block + + +def test_windows_elevated_launcher_runs_shared_wrapper_as_admin() -> None: + """The elevation helper should re-use the shared CLI wrapper and request + Administrator rights via Start-Process.""" + script = (PACKAGING_WINDOWS_DIR / "start-flocks-elevated.ps1").read_text( + encoding="utf-8-sig" + ) + + assert 'Join-Path $HOME ".local\\bin\\flocks.cmd"' in script + assert "Start-Process" in script + assert "-Verb RunAs" in script + assert "`\"$wrapperPath`\" start" in script def test_inno_finish_page_reminds_user_to_reopen_terminal() -> None: From a8b4156b763f9d6d9df785c015d11576f43c9899 Mon Sep 17 00:00:00 2001 From: xiami Date: Tue, 12 May 2026 12:54:57 +0800 Subject: [PATCH 32/41] feat(windows): bundle python-build-standalone runtime in installer staging (#256) - Pin CPython 3.12.12 in versions.manifest.json with standalone archive metadata - Download and extract install-only tarball in build-staging.ps1 (mirror env support) - Configure PATH and UV_* env vars in bootstrap-windows.ps1 for bundled Python - Assert bundled Python in Windows packaging CI workflows - Extend manifest and bootstrap script contract tests --- .../workflows/windows-packaging-publish.yml | 10 ++++ .github/workflows/windows-packaging.yml | 10 ++++ packaging/windows/bootstrap-windows.ps1 | 31 ++++++++++ packaging/windows/build-staging.ps1 | 58 +++++++++++++++++++ packaging/windows/versions.manifest.json | 5 ++ tests/packaging/test_windows_manifest.py | 11 ++++ 
.../test_browser_runtime_configuration.py | 16 +++++ 7 files changed, 141 insertions(+) diff --git a/.github/workflows/windows-packaging-publish.yml b/.github/workflows/windows-packaging-publish.yml index f491883a9..b2817d5dc 100644 --- a/.github/workflows/windows-packaging-publish.yml +++ b/.github/workflows/windows-packaging-publish.yml @@ -47,6 +47,7 @@ jobs: -RepoRoot "${{ github.workspace }}" ` -AppVersion $appVersion $uvExe = Join-Path $out "tools/uv/uv.exe" + $pythonExe = Join-Path $out "tools/python/python.exe" $nodeExe = Join-Path $out "tools/node/node.exe" $chromeExe = Get-ChildItem -Path (Join-Path $out "tools/chrome") -Recurse -Filter "chrome.exe" -File -ErrorAction SilentlyContinue | Where-Object { $_.FullName -match "chrome-win" } | @@ -54,6 +55,9 @@ jobs: if (-not (Test-Path $uvExe)) { throw "uv executable not found in staging: $uvExe" } + if (-not (Test-Path $pythonExe)) { + throw "python executable not found in staging: $pythonExe" + } if (-not (Test-Path $nodeExe)) { throw "node executable not found in staging: $nodeExe" } @@ -61,6 +65,7 @@ jobs: throw "chrome executable not found in staging under tools/chrome" } $uvVersion = (& $uvExe --version).Trim() + $pythonVersion = (& $pythonExe --version).Trim() $nodeVersion = (& $nodeExe --version).Trim() $chromeVersion = (Get-Item -LiteralPath $chromeExe.FullName).VersionInfo.ProductVersion if ([string]::IsNullOrWhiteSpace($chromeVersion)) { @@ -71,6 +76,8 @@ jobs: } Write-Host "[runtime] pinned uv version: $($manifest.uv.version)" Write-Host "[runtime] bundled uv version: $uvVersion" + Write-Host "[runtime] pinned python version: $($manifest.python.version)" + Write-Host "[runtime] bundled python version: $pythonVersion" Write-Host "[runtime] pinned node version: $($manifest.nodejs.version)" Write-Host "[runtime] bundled node version: $nodeVersion" Write-Host "[runtime] pinned chrome version: $($manifest.chrome_for_testing.version)" @@ -78,6 +85,9 @@ jobs: if ($uvVersion -notmatch ("^uv\s+" + 
[regex]::Escape($manifest.uv.version) + "(\s|$)")) { throw "Bundled uv version does not match pinned version in manifest" } + if ($pythonVersion -ne ("Python " + $manifest.python.version)) { + throw "Bundled python version does not match pinned version in manifest" + } if ($nodeVersion -ne ("v" + $manifest.nodejs.version)) { throw "Bundled node version does not match pinned version in manifest" } diff --git a/.github/workflows/windows-packaging.yml b/.github/workflows/windows-packaging.yml index 1e595d9f6..ea37b51c5 100644 --- a/.github/workflows/windows-packaging.yml +++ b/.github/workflows/windows-packaging.yml @@ -42,6 +42,7 @@ jobs: $manifest = Get-Content -Path $manifestPath -Raw -Encoding utf8 | ConvertFrom-Json & "${{ github.workspace }}/packaging/windows/build-installer.ps1" -OutputDir $out -RepoRoot "${{ github.workspace }}" $uvExe = Join-Path $out "tools/uv/uv.exe" + $pythonExe = Join-Path $out "tools/python/python.exe" $nodeExe = Join-Path $out "tools/node/node.exe" $chromeExe = Get-ChildItem -Path (Join-Path $out "tools/chrome") -Recurse -Filter "chrome.exe" -File -ErrorAction SilentlyContinue | Where-Object { $_.FullName -match "chrome-win" } | @@ -49,6 +50,9 @@ jobs: if (-not (Test-Path $uvExe)) { throw "uv executable not found in staging: $uvExe" } + if (-not (Test-Path $pythonExe)) { + throw "python executable not found in staging: $pythonExe" + } if (-not (Test-Path $nodeExe)) { throw "node executable not found in staging: $nodeExe" } @@ -56,6 +60,7 @@ jobs: throw "chrome executable not found in staging under tools/chrome" } $uvVersion = (& $uvExe --version).Trim() + $pythonVersion = (& $pythonExe --version).Trim() $nodeVersion = (& $nodeExe --version).Trim() $chromeVersion = (Get-Item -LiteralPath $chromeExe.FullName).VersionInfo.ProductVersion if ([string]::IsNullOrWhiteSpace($chromeVersion)) { @@ -66,6 +71,8 @@ jobs: } Write-Host "[runtime] pinned uv version: $($manifest.uv.version)" Write-Host "[runtime] bundled uv version: $uvVersion" + 
Write-Host "[runtime] pinned python version: $($manifest.python.version)" + Write-Host "[runtime] bundled python version: $pythonVersion" Write-Host "[runtime] pinned node version: $($manifest.nodejs.version)" Write-Host "[runtime] bundled node version: $nodeVersion" Write-Host "[runtime] pinned chrome version: $($manifest.chrome_for_testing.version)" @@ -73,6 +80,9 @@ jobs: if ($uvVersion -notmatch ("^uv\s+" + [regex]::Escape($manifest.uv.version) + "(\s|$)")) { throw "Bundled uv version does not match pinned version in manifest" } + if ($pythonVersion -ne ("Python " + $manifest.python.version)) { + throw "Bundled python version does not match pinned version in manifest" + } if ($nodeVersion -ne ("v" + $manifest.nodejs.version)) { throw "Bundled node version does not match pinned version in manifest" } diff --git a/packaging/windows/bootstrap-windows.ps1 b/packaging/windows/bootstrap-windows.ps1 index e8ace3dbd..268c89076 100644 --- a/packaging/windows/bootstrap-windows.ps1 +++ b/packaging/windows/bootstrap-windows.ps1 @@ -67,6 +67,23 @@ function Add-UserPathEntryIfMissing { } } +function Add-ProcessPathEntryIfMissing { + param([string]$Entry) + + if ([string]::IsNullOrWhiteSpace($Entry)) { return } + + $processPath = $env:Path + if ([string]::IsNullOrWhiteSpace($processPath)) { + $env:Path = $Entry + return + } + + $existing = $processPath -split ';' | Where-Object { ($_.TrimEnd('\','/')).ToLower() -eq $Entry.TrimEnd('\','/').ToLower() } + if (-not $existing) { + $env:Path = "$Entry;$processPath" + } +} + function Resolve-ChromeExecutablePath { param([string]$BrowserRoot) @@ -107,6 +124,20 @@ else { Write-Host "[flocks-bootstrap] warning: bundled node not found at $bundledNode" -ForegroundColor Yellow } +$bundledPythonDir = Join-Path $InstallRoot "tools\python" +$bundledPython = Join-Path $bundledPythonDir "python.exe" +if (Test-Path $bundledPython) { + Add-ProcessPathEntryIfMissing -Entry $bundledPythonDir + $env:FLOCKS_BUNDLED_PYTHON = $bundledPython + 
$env:UV_PYTHON = $bundledPython + $env:UV_PYTHON_DOWNLOADS = "never" + $env:UV_NO_MANAGED_PYTHON = "1" + Write-Host "[flocks-bootstrap] configured bundled Python runtime: $bundledPython" +} +else { + Write-Host "[flocks-bootstrap] warning: bundled Python runtime not found at $bundledPython" -ForegroundColor Yellow +} + # 2) Expose bundled Chrome for Testing under ~/.flocks/browser so install.ps1's # Resolve-ChromeForTestingPath finds it and skips the real download. # Prefer a directory junction (fast, no disk duplication) and fall back to copy. diff --git a/packaging/windows/build-staging.ps1 b/packaging/windows/build-staging.ps1 index 175bb6357..ddee15d47 100644 --- a/packaging/windows/build-staging.ps1 +++ b/packaging/windows/build-staging.ps1 @@ -157,11 +157,35 @@ function Get-OrDownloadFileFromCandidates { throw "Failed to download $Label" } +function Expand-TarGzArchive { + param( + [Parameter(Mandatory = $true)][string]$ArchivePath, + [Parameter(Mandatory = $true)][string]$DestinationPath + ) + + $tarExe = Get-Command tar.exe -ErrorAction SilentlyContinue + if (-not $tarExe) { + throw "tar.exe is required to extract $ArchivePath" + } + + Remove-PathWithRetry -Path $DestinationPath + New-Item -ItemType Directory -Path $DestinationPath -Force | Out-Null + + & $tarExe.Source -xzf $ArchivePath -C $DestinationPath + if ($LASTEXITCODE -ne 0) { + throw "tar.exe failed to extract $ArchivePath with exit code $LASTEXITCODE" + } + $global:LASTEXITCODE = 0 +} + Write-Host "[build-staging] RepoRoot: $RepoRoot" Write-Host "[build-staging] OutputDir: $OutputDir" $manifest = Read-Manifest -Path $ManifestPath $uvVersion = $manifest.uv.version +$pythonVersion = $manifest.python.version +$pythonStandaloneRelease = $manifest.python.python_build_standalone_release +$pythonArchiveName = $manifest.python.windows_archive_name $nodeVersion = $manifest.nodejs.version $nodeSuffix = $manifest.nodejs.windows_zip_suffix $cacheRoot = Resolve-CacheRoot -RepoRoot $RepoRoot -CacheRootOverride 
$CacheRoot @@ -170,11 +194,13 @@ Write-Host "[build-staging] CacheRoot: $cacheRoot" Ensure-EmptyDir -Path $OutputDir $toolsUv = Join-Path $OutputDir "tools\uv" +$toolsPython = Join-Path $OutputDir "tools\python" $toolsNode = Join-Path $OutputDir "tools\node" $toolsChrome = Join-Path $OutputDir "tools\chrome" $flocksDest = Join-Path $OutputDir "flocks" New-Item -ItemType Directory -Path $toolsUv -Force | Out-Null +New-Item -ItemType Directory -Path $toolsPython -Force | Out-Null New-Item -ItemType Directory -Path $toolsNode -Force | Out-Null New-Item -ItemType Directory -Path $toolsChrome -Force | Out-Null @@ -185,6 +211,38 @@ $uvZip = Join-Path $cacheRoot "downloads\uv-$uvVersion-$uvZipName" Get-OrDownloadFile -Url $uvUrl -CachePath $uvZip -Label "uv $uvVersion" Expand-Archive -Path $uvZip -DestinationPath $toolsUv -Force +# Python runtime (python-build-standalone install-only archive) +$pythonArchiveNameEscaped = [Uri]::EscapeDataString($pythonArchiveName) +$pythonMirrorBase = $env:FLOCKS_PYTHON_STANDALONE_MIRROR_BASE_URL +$pythonUrls = @() +if (-not [string]::IsNullOrWhiteSpace($pythonMirrorBase)) { + $mirrorBase = $pythonMirrorBase.TrimEnd('/') + $pythonUrls += "$mirrorBase/$pythonStandaloneRelease/$pythonArchiveNameEscaped" + Write-Host "[build-staging] Added python-build-standalone mirror candidate from FLOCKS_PYTHON_STANDALONE_MIRROR_BASE_URL" +} +$pythonUrls += "https://github.com/astral-sh/python-build-standalone/releases/download/$pythonStandaloneRelease/$pythonArchiveNameEscaped" +$pythonArchive = Join-Path $cacheRoot "downloads\python-$pythonVersion-$pythonStandaloneRelease-$pythonArchiveName" +Get-OrDownloadFileFromCandidates -Urls $pythonUrls -CachePath $pythonArchive -Label "Python $pythonVersion" + +$pythonExtract = Join-Path $env:TEMP "python-extract-$pythonVersion-$pythonStandaloneRelease" +Expand-TarGzArchive -ArchivePath $pythonArchive -DestinationPath $pythonExtract +$pythonExe = Get-ChildItem -Path $pythonExtract -Recurse -Filter "python.exe" 
-File -ErrorAction SilentlyContinue | + Where-Object { $_.DirectoryName -notmatch '\\DLLs($|\\)' } | + Select-Object -First 1 +if (-not $pythonExe) { + throw "python.exe not found after extracting bundled Python runtime" +} +$pythonSource = $pythonExe.Directory.FullName +robocopy $pythonSource $toolsPython /E /NFL /NDL /NJH /NJS /nc /ns /np | Out-Null +if ($LASTEXITCODE -ge 8) { + throw "robocopy failed while copying bundled Python with exit code $LASTEXITCODE" +} +$global:LASTEXITCODE = 0 +Remove-PathWithRetry -Path $pythonExtract +if (-not (Test-Path (Join-Path $toolsPython "python.exe"))) { + throw "Bundled Python runtime missing python.exe under tools\python after extraction" +} + # Node.js official zip (portable) $nodeZipName = "node-v$nodeVersion-$nodeSuffix.zip" $nodeUrl = "https://nodejs.org/dist/v$nodeVersion/$nodeZipName" diff --git a/packaging/windows/versions.manifest.json b/packaging/windows/versions.manifest.json index 1d420759b..d26eaf5c6 100644 --- a/packaging/windows/versions.manifest.json +++ b/packaging/windows/versions.manifest.json @@ -3,6 +3,11 @@ "uv": { "version": "0.9.15" }, + "python": { + "version": "3.12.12", + "python_build_standalone_release": "20251202", + "windows_archive_name": "cpython-3.12.12+20251202-x86_64-pc-windows-msvc-install_only_stripped.tar.gz" + }, "nodejs": { "version": "24.14.0", "windows_zip_suffix": "win-x64" diff --git a/tests/packaging/test_windows_manifest.py b/tests/packaging/test_windows_manifest.py index 78aea3d1a..55bff6da6 100644 --- a/tests/packaging/test_windows_manifest.py +++ b/tests/packaging/test_windows_manifest.py @@ -14,3 +14,14 @@ def test_windows_bundled_uv_supports_python_downloads_json_url() -> None: manifest = json.loads(WINDOWS_MANIFEST.read_text(encoding="utf-8")) assert _parse_version(manifest["uv"]["version"]) >= (0, 7, 3) + + +def test_windows_manifest_pins_bundled_python_runtime() -> None: + manifest = json.loads(WINDOWS_MANIFEST.read_text(encoding="utf-8")) + + python = manifest["python"] 
+ assert _parse_version(python["version"]) >= (3, 12, 0) + assert python["python_build_standalone_release"].isdigit() + assert python["windows_archive_name"].endswith(".tar.gz") + assert "install_only" in python["windows_archive_name"] + assert "windows-msvc" in python["windows_archive_name"] diff --git a/tests/scripts/test_browser_runtime_configuration.py b/tests/scripts/test_browser_runtime_configuration.py index 9b275fdeb..92cd17385 100644 --- a/tests/scripts/test_browser_runtime_configuration.py +++ b/tests/scripts/test_browser_runtime_configuration.py @@ -68,10 +68,16 @@ def test_powershell_bootstrap_wires_bundled_toolchain() -> None: """packaging/windows/bootstrap-windows.ps1 is the single place that bridges the bundled layout to install.ps1.""" script = (PACKAGING_WINDOWS_DIR / "bootstrap-windows.ps1").read_text(encoding="utf-8-sig") + assert "Add-ProcessPathEntryIfMissing" in script assert "Resolve-ChromeExecutablePath" in script assert "FLOCKS_SKIP_ADMIN_CHECK" in script assert "FLOCKS_BROWSER_EXECUTABLE_OVERRIDE" in script + assert "FLOCKS_BUNDLED_PYTHON" in script + assert "UV_PYTHON" in script + assert "UV_PYTHON_DOWNLOADS" in script + assert "UV_NO_MANAGED_PYTHON" in script assert "tools\\uv" in script + assert "tools\\python" in script assert "tools\\node" in script assert "tools\\chrome" in script assert ".flocks\\browser" in script @@ -79,6 +85,16 @@ def test_powershell_bootstrap_wires_bundled_toolchain() -> None: assert 'scripts\\install_zh.ps1' in script +def test_build_staging_bundles_python_runtime() -> None: + script = (PACKAGING_WINDOWS_DIR / "build-staging.ps1").read_text(encoding="utf-8-sig") + + assert "tools\\python" in script + assert "python-build-standalone" in script + assert "python.exe" in script + assert "tar.exe" in script + assert "FLOCKS_PYTHON_STANDALONE_MIRROR_BASE_URL" in script + + def test_inno_setup_points_to_packaging_bootstrap() -> None: """flocks-setup.iss must invoke the bootstrap from its new packaging location.""" iss 
= (PACKAGING_WINDOWS_DIR / "flocks-setup.iss").read_text(encoding="utf-8") From 10498b8a777ee6ea99140a4e2dacfc2b7892e41b Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Tue, 12 May 2026 14:26:12 +0800 Subject: [PATCH 33/41] fix(channel_message): attach Bearer API token on local HTTP send _http_session_send was posting to /api/channel/session-send without an Authorization header, so the server-side auth middleware rejected it as a non-browser request and returned HTTP 401. Read the API token from the secret manager (server_api_token) and inject it as Authorization: Bearer <token>. If no token is configured locally and the server still returns 401, silently fall back to the in-process delivery path so the tool keeps working in unauthenticated setups. Co-authored-by: Cursor --- flocks/tool/channel/channel_message.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/flocks/tool/channel/channel_message.py b/flocks/tool/channel/channel_message.py index 7b591d8a4..8fe9faf20 100644 --- a/flocks/tool/channel/channel_message.py +++ b/flocks/tool/channel/channel_message.py @@ -38,6 +38,16 @@ def _normalize_channel_type(channel_type: str | None) -> str | None: return lower +def _get_api_token() -> str | None: + """Read the server API token from the secret manager (non-async, best-effort).""" + try: + from flocks.security import get_secret_manager + token = get_secret_manager().get("server_api_token") + return token.strip() if token and token.strip() else None + except Exception: + return None + + async def _http_session_send( port: int, session_id: str, @@ -60,10 +70,16 @@ async def _http_session_send( if media_url: payload["media_url"] = media_url + headers: dict[str, str] = {} + api_token = _get_api_token() + if api_token: + headers["Authorization"] = f"Bearer {api_token}" + async with httpx.AsyncClient() as client: resp = await client.post( f"http://localhost:{port}/api/channel/session-send", json=payload, + headers=headers, timeout=10.0, ) body =
resp.json() @@ -76,6 +92,10 @@ async def _http_session_send( f"ids: {body.get('message_ids', [])}" ), ) + # 401 without a token means the server requires auth but none is configured; + # fall back to the in-process path so the tool still works. + if resp.status_code == 401 and not api_token: + return None return ToolResult( success=False, error=f"Send failed (HTTP {resp.status_code}): {body.get('detail', body)}", From c13ef5946b9b1d4fc5e49685ad826903e4b896f1 Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Tue, 12 May 2026 14:30:21 +0800 Subject: [PATCH 34/41] refactor(channel_message): reuse API_TOKEN_SECRET_ID and clarify fallback - Import API_TOKEN_SECRET_ID from flocks.server.auth instead of hardcoding "server_api_token" so the client and server-side auth middleware cannot drift out of sync and silently start failing 401. - Refine the comment on the 401 fallback: distinguish "client did not obtain a token" from "server has no token configured", and make it explicit that when we DID send a token but it was rejected we do not fall back, surfacing the server detail so misconfiguration is visible. Co-authored-by: Cursor --- flocks/tool/channel/channel_message.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/flocks/tool/channel/channel_message.py b/flocks/tool/channel/channel_message.py index 8fe9faf20..30b2ec790 100644 --- a/flocks/tool/channel/channel_message.py +++ b/flocks/tool/channel/channel_message.py @@ -39,10 +39,17 @@ def _normalize_channel_type(channel_type: str | None) -> str | None: def _get_api_token() -> str | None: - """Read the server API token from the secret manager (non-async, best-effort).""" + """Read the server API token from the secret manager (non-async, best-effort). + + Reuses ``API_TOKEN_SECRET_ID`` from ``flocks.server.auth`` so that the + secret id stays in lockstep with what the server-side auth middleware + expects; if those drift apart the request will silently start failing + with 401. 
+ """ try: from flocks.security import get_secret_manager - token = get_secret_manager().get("server_api_token") + from flocks.server.auth import API_TOKEN_SECRET_ID + token = get_secret_manager().get(API_TOKEN_SECRET_ID) return token.strip() if token and token.strip() else None except Exception: return None @@ -92,8 +99,12 @@ async def _http_session_send( f"ids: {body.get('message_ids', [])}" ), ) - # 401 without a token means the server requires auth but none is configured; - # fall back to the in-process path so the tool still works. + # 401 + we had no token to present: either the secret is unset + # or this process can't read it. Either way, the in-process + # path bypasses HTTP auth and can still deliver the message, + # so we fall back instead of surfacing an error. + # (If we DID send a token and it was rejected, fall through + # and report the server's detail so misconfiguration is visible.) if resp.status_code == 401 and not api_token: return None return ToolResult( From b33ba1869174ad89464a02ac45fdecbe890fb49b Mon Sep 17 00:00:00 2001 From: xiami Date: Tue, 12 May 2026 16:01:56 +0800 Subject: [PATCH 35/41] docs: add contributing guide (#257) Document the contribution workflow in English, link it from the READMEs, and allow docs markdown files to be tracked in git. 
--- .gitignore | 2 - README.md | 6 +- README_zh.md | 6 +- docs/CONTRIBUTING.md | 197 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 207 insertions(+), 4 deletions(-) create mode 100644 docs/CONTRIBUTING.md diff --git a/.gitignore b/.gitignore index 45739c7f0..4d720f9ce 100644 --- a/.gitignore +++ b/.gitignore @@ -100,7 +100,6 @@ tmp/ # Documentation docs/_build/ -docs/*.md site/ # Node.js (TUI) @@ -108,7 +107,6 @@ node_modules/ tui/node_modules/ bun.lockb .bun/ -docs/* !docs/CHANGELOG.md # TUI build diff --git a/README.md b/README.md index 94e25de0f..bfa4c058d 100644 --- a/README.md +++ b/README.md @@ -306,6 +306,10 @@ Scan the QR code with **WeChat** to join our official discussion group. ![WeCom official community QR code](assets/community-wecom-qr.png) -## 6. License +## 6. Contributing + +See [`docs/CONTRIBUTING.md`](docs/CONTRIBUTING.md) for development setup, coding standards, testing expectations, and Pull Request guidelines. + +## 7. License Apache License 2.0 diff --git a/README_zh.md b/README_zh.md index a0d88a8d2..2e28c1459 100644 --- a/README_zh.md +++ b/README_zh.md @@ -271,6 +271,10 @@ flocks start --server-host 127.0.0.1 --webui-host 0.0.0.0 ![企业微信官方交流群二维码](assets/community-wecom-qr.png) -## 6. 开源协议 +## 6. 参与贡献 + +开发环境、代码规范、测试要求和 Pull Request 流程请参考 [`docs/CONTRIBUTING.md`](docs/CONTRIBUTING.md)。 + +## 7. 开源协议 Apache License 2.0 diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md new file mode 100644 index 000000000..67b9a581d --- /dev/null +++ b/docs/CONTRIBUTING.md @@ -0,0 +1,197 @@ +# Contributing Guide + +Thank you for contributing to `flocks`. We welcome bug fixes, documentation improvements, tests, UX polish, new features, and other well-scoped changes that make the project better. + +This guide explains how to contribute in a way that is easy to review, maintain, and merge. 
+ +## Ways to Contribute + +You can contribute by: + +- reporting bugs with clear reproduction steps +- proposing features or design improvements +- improving documentation, examples, and developer experience +- fixing issues and adding regression coverage +- improving the WebUI, CLI, workflows, plugins, tools, or platform integrations + +If your change is large or affects architecture, public behavior, or user workflows, please open an Issue first so the direction can be discussed before implementation starts. + +## Before You Start + +Before writing code, please: + +1. Search existing Issues and Pull Requests to avoid duplicate work. +2. Confirm the scope for larger features, refactors, or behavior changes. +3. Keep each contribution focused on one topic whenever possible. + +## Development Environment + +The main development stack for `flocks` currently includes: + +- Python `3.12` +- `uv` for Python environment and dependency management +- Node.js `22+` +- `npm` for frontend dependencies + +Recommended setup: + +```bash +uv sync --group dev +cd webui && npm ci +``` + +If you work on browser-related features, you may also need the browser runtime dependencies described in the project README. + +## Common Commands + +Use `uv run` for Python-related commands whenever possible. + +### Backend / Python + +```bash +uv run ruff check . +uv run pytest +``` + +If your change is scoped to a smaller area, run the most relevant tests first: + +```bash +uv run pytest tests/session +uv run pytest tests/cli/test_service_manager.py +``` + +### Frontend / WebUI + +```bash +cd webui +npm run lint +npm run build +``` + +If your change touches both Python and frontend code, please run checks for both parts. + +## Coding Standards + +Please make sure your changes follow the repository conventions: + +- Follow the Google Python Style Guide for Python code. +- Use `ruff` for linting and formatting-related checks. +- New features and bug fixes must include or update tests. 
+- Keep all test code under `tests/`. +- Except for the repository root `README.md`, feature guides, usage docs, and summary markdown files should go under `docs/`. +- Run Python commands with `uv run`, or from the project's active virtual environment. +- Any `.ps1` file in scripts must use **UTF-8 with BOM** encoding and **CRLF** line endings. + +Please also follow these general principles: + +- Keep changes focused and avoid unrelated refactors. +- Add type hints, error handling, and regression coverage where they meaningfully improve maintainability. +- Introduce new dependencies only when necessary, and explain why they are needed. +- Add brief comments for non-obvious logic, but avoid low-value commentary. + +## Branching and Commits + +Create your working branch from the latest `dev` branch. Do not develop directly on `main`, and do not open contribution PRs against `main` unless a maintainer explicitly asks for it. + +Suggested branch naming examples: + +- `feat/add-session-export` +- `fix/webui-login-redirect` +- `docs/contributing-guide` +- `refactor/mcp-client-cache` +- `test/add-workflow-route-cases` + +Write commit messages in clear English. A Conventional Commits style is recommended: + +```text +feat(cli): add service restart timeout option +fix(auth): preserve session after password reset +docs: add contributing guide +test(session): cover runner retry path +``` + +A good commit should: + +- focus on one main change +- describe intent clearly in the title +- include extra context in the body when behavior, compatibility, or motivation needs explanation + +## Testing Expectations + +Please validate your change according to its scope: + +- Documentation changes: verify links, commands, filenames, and paths. +- Python changes: run the relevant tests; for shared infrastructure changes, run broader coverage. +- Frontend changes: run at least `npm run lint` and `npm run build`. 
+- Cross-cutting changes: include enough automated or manual verification to show that the change works as intended. + +If you are fixing a bug, prefer adding a regression test that reproduces the issue before or alongside the fix. + +## Pull Request Guidelines + +All contribution PRs for `flocks` should target the `dev` branch. + +When opening a PR, make it easy for reviewers to understand: + +1. What problem the change solves. +2. What the scope of the change is. +3. Why the chosen approach is appropriate. +4. How you validated the change. +5. Whether there are compatibility, migration, or configuration impacts. + +If the PR changes UI or interaction behavior, include screenshots, recordings, or a clear before/after explanation. + +Recommended PR description template: + +```markdown +## Summary +- ... + +## Why +- ... + +## Test Plan +- [x] uv run pytest ... +- [x] npm run lint +- [ ] Manual verification +``` + +Please keep PRs as small and focused as practical. Multiple reviewable PRs are usually easier to merge than one large mixed change. + +## Issue Reporting + +This repository already provides GitHub Issue templates. Please choose the most appropriate template and include enough detail to make triage efficient: + +- Bug reports: reproduction steps, expected behavior, actual behavior, logs, and version information +- Feature requests: motivation, proposed solution, alternatives considered, and expected impact +- Plugin / tool requests: target use case, inputs, outputs, and relevant constraints + +High-quality Issues significantly improve response time and implementation quality. + +## Security Issues + +If you discover a security vulnerability or any issue that could expose users or deployments to risk, please do not disclose sensitive details in a public Issue. Contact the maintainers through an approved private channel first, then coordinate on disclosure after a fix is available. 
+ +## Communication + +Please keep communication respectful, specific, and constructive: + +- discuss the problem, not the person +- provide evidence and context, not just conclusions +- stay open to review feedback, and split changes if needed + +We strongly prefer incremental, testable, reviewable contributions over large rewrites. + +## Pre-PR Checklist + +Before opening a PR, please confirm: + +- [ ] the change is focused and does not include unrelated edits +- [ ] code, naming, and documentation style match the repository +- [ ] new features or bug fixes include appropriate tests +- [ ] relevant local checks have passed +- [ ] the PR clearly explains background, approach, and validation +- [ ] the PR targets `dev` +- [ ] any new markdown documentation has been added under `docs/` when applicable + +Thank you for helping improve `flocks`. From 2b312bc078cd0c25406053dc9a72aa92c0d612c2 Mon Sep 17 00:00:00 2001 From: xiami Date: Tue, 12 May 2026 17:12:30 +0800 Subject: [PATCH 36/41] feat(server,webui): phased startup, route timing, session/tools UX (#258) * feat(server,webui): phased startup, route timing, session/tools UX - Run server startup in timed phases; defer non-critical work to background tasks and cancel them on shutdown; log blocking startup duration - Add duration logging for session list, tools list/refresh, task dashboard APIs - Export workflow filesystem sync helper for startup - WebUI: stop auto-selecting first session; enable live/SSE only with session id; reorder tools refresh vs list; reduce loading flicker on polling hooks; drop redundant task queue refetch from Task page - TUI: inline tsconfig compiler options (drop extends) - Add useTools tests and adjust Session page tests * fix(mcp): owner task for session lifecycle and serialized RPC - Run remote/stdio MCP I/O in one asyncio task with a command queue so list/call/read/disconnect share the same ClientSession context - Use streamable_http_client transport factory pattern; improve cleanup and 
pending-command failure handling on disconnect - Update MCP client/SSE tests for owner-task wiring; add same-task assertions - Simplify browser-use SKILL CDP fallback instructions * fix(mcp): normalize client timeout when config is missing or invalid Default to 30s for None/non-numeric/<=0 values so connect wait_for uses a finite bound; add regression test for timeout=None. * feat(server): log slow route timings at INFO; fix(mcp) stdio stderr lifecycle - Add log_route_timing helper: fast requests emit DEBUG, >=300ms emit INFO - Wire session list, task dashboard routes, and tools list/refresh to helper - MCP: wrap stdio stderr TemporaryFile in context manager; drop unused _streams_context; add regression test ensuring stderr file closes on failure - Clarify SSE-first shutdown comment in app lifespan --- .flocks/plugins/skills/browser-use/SKILL.md | 6 +- flocks/mcp/client.py | 729 ++++++++++++-------- flocks/server/app.py | 251 +++++-- flocks/server/routes/_timing.py | 28 + flocks/server/routes/session.py | 12 +- flocks/server/routes/task_entities.py | 29 +- flocks/server/routes/tool.py | 16 +- flocks/server/routes/workflow.py | 13 +- tests/mcp/test_mcp_client.py | 170 ++++- tests/mcp/test_mcp_client_sse.py | 411 ++++++----- tests/server/routes/test_route_timing.py | 53 ++ tui/tsconfig.json | 7 +- webui/src/components/common/SessionChat.tsx | 2 +- webui/src/hooks/useTasks.ts | 20 +- webui/src/hooks/useTools.test.tsx | 64 ++ webui/src/hooks/useTools.ts | 32 +- webui/src/pages/Session/index.test.tsx | 10 +- webui/src/pages/Session/index.tsx | 9 +- webui/src/pages/Task/index.tsx | 4 +- 19 files changed, 1290 insertions(+), 576 deletions(-) create mode 100644 flocks/server/routes/_timing.py create mode 100644 tests/server/routes/test_route_timing.py create mode 100644 webui/src/hooks/useTools.test.tsx diff --git a/.flocks/plugins/skills/browser-use/SKILL.md b/.flocks/plugins/skills/browser-use/SKILL.md index 8edb33335..1f41fefe2 100644 --- 
a/.flocks/plugins/skills/browser-use/SKILL.md +++ b/.flocks/plugins/skills/browser-use/SKILL.md @@ -68,11 +68,7 @@ browser: not connected — 请确保 Chrome / Chromium / Edge 已打开,然后 说明当前环境不适合 `CDP 直连`。此时要: 1. 明确告诉用户是哪一项不满足,提示需要做什么操作才能达到要求 -2. 切换到 `agent-browser` 模式 -3. 立即阅读: - - `references/agent-browser.md` - -不要继续尝试 CDP。 +2. 提示用户切换到 `agent-browser` 模式 ## 执行规则 diff --git a/flocks/mcp/client.py b/flocks/mcp/client.py index 8b14d9a26..b8e709ed3 100644 --- a/flocks/mcp/client.py +++ b/flocks/mcp/client.py @@ -1,21 +1,26 @@ """ MCP Client Wrapper -Client implementation based on official MCP SDK, supporting Streamable HTTP and SSE transports +Client implementation based on official MCP SDK, supporting Streamable HTTP and SSE transports. """ import asyncio +import contextlib import os import tempfile +from contextlib import asynccontextmanager +from dataclasses import dataclass, field from pathlib import Path -from typing import Optional, Dict, Any, List, Literal +from typing import Any, Dict, List, Literal, Optional + +import httpx from mcp import ClientSession -from mcp.client.streamable_http import streamablehttp_client from mcp.client.sse import sse_client -from mcp.client.stdio import stdio_client, StdioServerParameters +from mcp.client.stdio import StdioServerParameters, stdio_client +from mcp.client.streamable_http import streamable_http_client -from flocks.mcp.types import McpToolDef, McpResource -from flocks.mcp.utils import build_mcp_headers, build_mcp_url, resolve_env_var +from flocks.mcp.types import McpResource, McpToolDef +from flocks.mcp.utils import build_mcp_headers, build_mcp_url from flocks.utils.log import Log log = Log.create(service="mcp.client") @@ -43,6 +48,24 @@ def _extract_root_cause(exc: BaseException) -> str: return str(exc) +def _normalize_timeout(timeout: object) -> float: + """Normalize optional timeout inputs to a positive float.""" + try: + value = float(timeout) # type: ignore[arg-type] + except (TypeError, ValueError): + return 30.0 + 
return value if value > 0 else 30.0 + + +@dataclass(slots=True) +class _ClientCommand: + """A serialized request executed by the MCP owner task.""" + + action: Literal["list_tools", "call_tool", "list_resources", "read_resource", "disconnect"] + payload: Dict[str, Any] = field(default_factory=dict) + response: asyncio.Future[Any] | None = None + + class McpClient: """ MCP Client - Wraps official SDK @@ -88,13 +111,15 @@ def __init__( self.env = env self.auth_config = auth_config self.transport = transport - self.timeout = timeout + self.timeout = _normalize_timeout(timeout) self.session: Optional[ClientSession] = None self._streams = None - self._streams_context = None self._connected = False self._transport_type: Optional[str] = None + self._command_queue: asyncio.Queue[_ClientCommand] | None = None + self._owner_task: asyncio.Task[None] | None = None + self._owner_error: BaseException | None = None async def connect(self) -> None: """ @@ -107,18 +132,105 @@ async def connect(self) -> None: if self._connected: log.warn("mcp.client.already_connected", {"server": self.name}) return - - if self.server_type in ("remote", "sse"): - # Both "remote" and "sse" use auto-detection: - # try Streamable HTTP first, fall back to SSE. - # This handles servers that only support one transport. 
- await self._connect_remote() - elif self.server_type in ("local", "stdio"): - await self._connect_local() - else: - raise ValueError(f"Unknown server type: {self.server_type}") - - async def _connect_remote(self) -> None: + + if self._owner_task and not self._owner_task.done(): + log.warn("mcp.client.connect_in_progress", {"server": self.name}) + return + + loop = asyncio.get_running_loop() + startup_future: asyncio.Future[None] = loop.create_future() + self._owner_error = None + self._command_queue = asyncio.Queue() + + owner_task = asyncio.create_task( + self._run_connection_owner(startup_future), + name=f"mcp-owner:{self.name}", + ) + owner_task.add_done_callback(self._handle_owner_task_done) + self._owner_task = owner_task + + try: + await asyncio.wait_for(startup_future, timeout=self.timeout + 1.0) + except Exception: + await self._cancel_owner_task() + self._reset_runtime_state() + raise + + async def _run_connection_owner(self, startup_future: asyncio.Future[None]) -> None: + """Own the entire MCP session lifecycle inside one asyncio task.""" + try: + if self.server_type in ("remote", "sse"): + await self._connect_remote(startup_future) + elif self.server_type in ("local", "stdio"): + await self._connect_local(startup_future) + else: + raise ValueError(f"Unknown server type: {self.server_type}") + except Exception as exc: + if not startup_future.done(): + startup_future.set_exception(exc) + else: + await self._fail_pending_commands( + RuntimeError(f"Connection lost: {self.name}: {_extract_root_cause(exc)}") + ) + raise + finally: + if not startup_future.done(): + startup_future.set_exception( + RuntimeError(f"Connection closed before initialization: {self.name}") + ) + self._connected = False + self.session = None + self._streams = None + self._transport_type = None + await self._fail_pending_commands(RuntimeError(f"Client not connected: {self.name}")) + + def _handle_owner_task_done(self, task: asyncio.Task[None]) -> None: + """Retrieve background task 
exceptions so asyncio does not emit warnings.""" + try: + owner_error = task.exception() + except asyncio.CancelledError: + owner_error = None + + self._owner_error = owner_error + if self._owner_task is task: + self._owner_task = None + + if owner_error is not None: + log.error("mcp.client.owner_task_error", { + "server": self.name, + "error": _extract_root_cause(owner_error), + }) + + async def _cancel_owner_task(self) -> None: + """Cancel and await the owner task if it is still running.""" + owner_task = self._owner_task + if owner_task is None: + return + if not owner_task.done(): + owner_task.cancel() + try: + await owner_task + except asyncio.CancelledError: + return + except Exception as exc: + # Connection startup may already have failed; preserve the error so + # callers can finish cleanup and still surface the root cause. + if self._owner_error is None: + self._owner_error = exc + + def _reset_runtime_state(self, clear_owner_error: bool = False) -> None: + """Reset local state after disconnects or failed startups.""" + self.session = None + self._streams = None + self._connected = False + self._transport_type = None + self._command_queue = None + if self._owner_task is not None and self._owner_task.done(): + self._owner_task = None + if clear_owner_error: + self._owner_error = None + + async def _connect_remote(self, startup_future: asyncio.Future[None]) -> None: """Connect to a remote server using the configured transport strategy.""" full_url = build_mcp_url(self.url, self.auth_config) request_headers = build_mcp_headers(self.headers, self.auth_config) @@ -129,7 +241,7 @@ async def _connect_remote(self) -> None: "type": "remote", "strategy": "streamable_http_only", }) - await self._connect_streamable_http_only(full_url, request_headers) + await self._connect_streamable_http_only(full_url, request_headers, startup_future) return if self.transport == "sse": @@ -138,7 +250,7 @@ async def _connect_remote(self) -> None: "type": "remote", "strategy": 
"sse_only", }) - await self._connect_sse_only(full_url, request_headers) + await self._connect_sse_only(full_url, request_headers, startup_future) return log.info("mcp.client.connecting", { @@ -146,163 +258,261 @@ async def _connect_remote(self) -> None: "type": "remote", "strategy": "streamable_http_then_sse", }) - await self._connect_auto(full_url, request_headers) + await self._connect_auto(full_url, request_headers, startup_future) async def _connect_streamable_http_only( - self, full_url: str, headers: Optional[Dict[str, str]] + self, + full_url: str, + headers: Optional[Dict[str, str]], + startup_future: asyncio.Future[None], ) -> None: """Connect using only Streamable HTTP.""" try: - await self._do_connect_streamable_http(full_url, headers) - self._transport_type = "streamable_http" - except asyncio.TimeoutError: - await self._cleanup_connection() + await self._run_remote_transport( + transport_name="streamable_http", + full_url=full_url, + headers=headers, + startup_future=startup_future, + transport_factory=self._create_streamable_http_streams, + ) + except asyncio.TimeoutError as exc: log.error("mcp.client.timeout", { "server": self.name, "transport": "streamable_http", }) - raise RuntimeError(f"Connection timeout: {self.name}") - except Exception as e: - root_cause = _extract_root_cause(e) - await self._cleanup_connection() - raise RuntimeError(f"Connection failed: {self.name}: {root_cause}") + raise RuntimeError(f"Connection timeout: {self.name}") from exc + except Exception as exc: + raise RuntimeError(f"Connection failed: {self.name}: {_extract_root_cause(exc)}") from exc async def _connect_sse_only( - self, full_url: str, headers: Optional[Dict[str, str]] + self, + full_url: str, + headers: Optional[Dict[str, str]], + startup_future: asyncio.Future[None], ) -> None: """Connect using only SSE.""" try: - await self._do_connect_sse(full_url, headers) - self._transport_type = "sse" - except asyncio.TimeoutError: - await self._cleanup_connection() + await 
self._run_remote_transport( + transport_name="sse", + full_url=full_url, + headers=headers, + startup_future=startup_future, + transport_factory=self._create_sse_streams, + ) + except asyncio.TimeoutError as exc: log.error("mcp.client.timeout", { "server": self.name, "transport": "sse", }) - raise RuntimeError(f"Connection timeout: {self.name}") - except Exception as e: - root_cause = _extract_root_cause(e) - await self._cleanup_connection() - raise RuntimeError(f"Connection failed: {self.name}: {root_cause}") + raise RuntimeError(f"Connection timeout: {self.name}") from exc + except Exception as exc: + raise RuntimeError(f"Connection failed: {self.name}: {_extract_root_cause(exc)}") from exc async def _connect_auto( - self, full_url: str, headers: Optional[Dict[str, str]] + self, + full_url: str, + headers: Optional[Dict[str, str]], + startup_future: asyncio.Future[None], ) -> None: """Connect using auto-detection: HTTP first, then SSE.""" try: - await self._do_connect_streamable_http(full_url, headers) - self._transport_type = "streamable_http" + await self._run_remote_transport( + transport_name="streamable_http", + full_url=full_url, + headers=headers, + startup_future=startup_future, + transport_factory=self._create_streamable_http_streams, + ) return - except asyncio.TimeoutError: - await self._cleanup_connection() + except asyncio.TimeoutError as exc: log.error("mcp.client.timeout", { "server": self.name, "transport": "streamable_http", }) - raise RuntimeError(f"Connection timeout: {self.name}") - except Exception as e: + raise RuntimeError(f"Connection timeout: {self.name}") from exc + except Exception as exc: + if startup_future.done(): + raise log.info("mcp.client.streamable_http_failed", { "server": self.name, - "error": str(e), + "error": _extract_root_cause(exc), "fallback": "sse", }) - await self._cleanup_connection() try: - await self._do_connect_sse(full_url, headers) - self._transport_type = "sse" - return - except Exception as e: - root_cause = 
_extract_root_cause(e) + await self._run_remote_transport( + transport_name="sse", + full_url=full_url, + headers=headers, + startup_future=startup_future, + transport_factory=self._create_sse_streams, + ) + except Exception as exc: + root_cause = _extract_root_cause(exc) log.error("mcp.client.all_transports_failed", { "server": self.name, "error": root_cause, }) - await self._cleanup_connection() - raise RuntimeError(f"Connection failed: {self.name}: {root_cause}") - - async def _do_connect_streamable_http( - self, full_url: str, headers: Optional[Dict[str, str]] = None + raise RuntimeError(f"Connection failed: {self.name}: {root_cause}") from exc + + async def _run_remote_transport( + self, + transport_name: Literal["streamable_http", "sse"], + full_url: str, + headers: Optional[Dict[str, str]], + startup_future: asyncio.Future[None], + transport_factory, ) -> None: - """Perform Streamable HTTP connection. - - Raises: - asyncio.TimeoutError: If connection or initialization times out - Exception: Any other connection error - """ - self._streams_context = streamablehttp_client( - full_url, - headers=headers, - timeout=self.timeout, - ) - self._streams = await self._streams_context.__aenter__() - read_stream, write_stream, _ = self._streams - - self.session = ClientSession(read_stream, write_stream) - await self.session.__aenter__() - init_result = await asyncio.wait_for( - self.session.initialize(), - timeout=self.timeout - ) - - self._connected = True - log.info("mcp.client.connected", { - "server": self.name, - "transport": "streamable_http", - "protocol_version": getattr(init_result, 'protocolVersion', 'unknown'), - "server_info": getattr(init_result, 'serverInfo', {}) - }) - - async def _do_connect_sse( - self, full_url: str, headers: Optional[Dict[str, str]] = None + """Run one remote transport from startup until disconnect.""" + async with transport_factory(full_url, headers) as streams: + self._streams = streams + await 
self._run_connected_session(transport_name, streams, startup_future) + + @asynccontextmanager + async def _create_streamable_http_streams( + self, + full_url: str, + headers: Optional[Dict[str, str]], + ): + """Create a modern Streamable HTTP transport context.""" + timeout = httpx.Timeout(self.timeout, read=60 * 5) + async with httpx.AsyncClient(headers=headers, timeout=timeout) as http_client: + async with streamable_http_client(full_url, http_client=http_client) as streams: + yield streams + + @asynccontextmanager + async def _create_sse_streams( + self, + full_url: str, + headers: Optional[Dict[str, str]], + ): + """Create an SSE transport context.""" + async with sse_client(full_url, headers=headers, timeout=self.timeout) as streams: + yield streams + + async def _run_connected_session( + self, + transport_name: Literal["streamable_http", "sse", "stdio"], + streams, + startup_future: asyncio.Future[None], ) -> None: - """Perform SSE connection. - - Raises: - asyncio.TimeoutError: If connection or initialization times out - Exception: Any other connection error - """ - self._streams_context = sse_client( - full_url, - headers=headers, - timeout=self.timeout, - ) - self._streams = await self._streams_context.__aenter__() - read_stream, write_stream = self._streams - - self.session = ClientSession(read_stream, write_stream) - await self.session.__aenter__() - init_result = await asyncio.wait_for( - self.session.initialize(), - timeout=self.timeout - ) - - self._connected = True - log.info("mcp.client.connected", { - "server": self.name, - "transport": "sse", - "protocol_version": getattr(init_result, 'protocolVersion', 'unknown'), - "server_info": getattr(init_result, 'serverInfo', {}) - }) - - async def _cleanup_connection(self) -> None: - """Clean up connection resources after a failed attempt""" - if self.session: + """Initialize the MCP session and then serve queued commands.""" + if len(streams) == 3: + read_stream, write_stream, _ = streams + else: + 
read_stream, write_stream = streams + + async with ClientSession(read_stream, write_stream) as session: + self.session = session + init_result = await asyncio.wait_for(session.initialize(), timeout=self.timeout) + self._connected = True + self._transport_type = transport_name + if not startup_future.done(): + startup_future.set_result(None) + log.info("mcp.client.connected", { + "server": self.name, + "transport": transport_name, + "protocol_version": getattr(init_result, "protocolVersion", "unknown"), + "server_info": getattr(init_result, "serverInfo", {}), + }) + await self._serve_commands(session) + + async def _serve_commands(self, session: ClientSession) -> None: + """Process serialized commands until disconnect is requested.""" + if self._command_queue is None: + raise RuntimeError(f"Command queue not initialized: {self.name}") + + while True: + command = await self._command_queue.get() + if command.action == "disconnect": + if command.response is not None and not command.response.done(): + command.response.set_result(None) + return + try: - await self.session.__aexit__(None, None, None) - except Exception: - pass - if self._streams_context: + result = await self._execute_command(session, command) + except Exception as exc: + if command.response is not None and not command.response.done(): + command.response.set_exception(exc) + else: + if command.response is not None and not command.response.done(): + command.response.set_result(result) + + async def _execute_command(self, session: ClientSession, command: _ClientCommand) -> Any: + """Execute one queued MCP command inside the owner task.""" + if command.action == "list_tools": + result = await asyncio.wait_for(session.list_tools(), timeout=self.timeout) + tools = [McpToolDef.from_sdk(tool) for tool in result.tools] + log.debug("mcp.client.tools_listed", { + "server": self.name, + "count": len(tools), + }) + return tools + + if command.action == "call_tool": + tool_name = command.payload["name"] try: - await 
self._streams_context.__aexit__(None, None, None) - except Exception: - pass - self.session = None - self._streams = None - self._streams_context = None - self._connected = False - self._transport_type = None + result = await asyncio.wait_for( + session.call_tool(name=tool_name, arguments=command.payload["arguments"]), + timeout=self.timeout, + ) + except asyncio.TimeoutError as exc: + log.error("mcp.client.call_timeout", { + "server": self.name, + "tool": tool_name, + }) + from concurrent.futures import TimeoutError as _FuturesTimeoutError + + raise _FuturesTimeoutError(f"MCP工具调用超时 ({self.timeout}s): {tool_name}") from exc + + log.debug("mcp.client.tool_called", { + "server": self.name, + "tool": tool_name, + }) + return result + + if command.action == "list_resources": + result = await asyncio.wait_for(session.list_resources(), timeout=self.timeout) + resources = [ + McpResource( + name=resource.name, + uri=resource.uri, + description=getattr(resource, "description", None), + mime_type=getattr(resource, "mimeType", None), + server=self.name, + ) + for resource in result.resources + ] + log.debug("mcp.client.resources_listed", { + "server": self.name, + "count": len(resources), + }) + return resources + + if command.action == "read_resource": + uri = command.payload["uri"] + result = await asyncio.wait_for(session.read_resource(uri=uri), timeout=self.timeout) + log.debug("mcp.client.resource_read", { + "server": self.name, + "uri": uri, + }) + return result + + raise ValueError(f"Unknown MCP command: {command.action}") + + async def _fail_pending_commands(self, error: Exception) -> None: + """Fail any queued commands when the owner task exits.""" + if self._command_queue is None: + return + + while True: + try: + command = self._command_queue.get_nowait() + except asyncio.QueueEmpty: + return + + if command.response is not None and not command.response.done(): + command.response.set_exception(error) @staticmethod def _read_stderr(stderr_file) -> str: @@ -325,7 
+535,13 @@ def _flocks_mcp_prefix() -> str: prefix.mkdir(parents=True, exist_ok=True) return str(prefix) - async def _connect_local(self) -> None: + @asynccontextmanager + async def _create_stdio_streams(self, server_params: StdioServerParameters, stderr_file): + """Create stdio transport streams.""" + async with stdio_client(server_params, errlog=stderr_file) as streams: + yield streams + + async def _connect_local(self, startup_future: asyncio.Future[None]) -> None: """Connect to local server via Stdio transport.""" if not self.command: raise ValueError(f"No command specified for local server: {self.name}") @@ -374,90 +590,63 @@ async def _connect_local(self) -> None: "args": args, }) - stderr_file = tempfile.TemporaryFile(mode="w+") - try: - self._streams_context = stdio_client(server_params, errlog=stderr_file) - self._streams = await self._streams_context.__aenter__() - read_stream, write_stream = self._streams - - self.session = ClientSession(read_stream, write_stream) - await self.session.__aenter__() - init_result = await asyncio.wait_for( - self.session.initialize(), - timeout=self.timeout, - ) - - self._connected = True - self._transport_type = "stdio" - log.info("mcp.client.connected", { - "server": self.name, - "transport": "stdio", - "protocol_version": getattr(init_result, 'protocolVersion', 'unknown'), - "server_info": getattr(init_result, 'serverInfo', {}), - }) - except asyncio.TimeoutError: - stderr_output = self._read_stderr(stderr_file) - await self._cleanup_connection() - log.error("mcp.client.timeout", { - "server": self.name, - "transport": "stdio", - "stderr": stderr_output, - }) - detail = f"Connection timeout (stdio): {self.name}" - if stderr_output: - detail += f"\nServer stderr:\n{stderr_output}" - raise RuntimeError(detail) - except Exception as e: - stderr_output = self._read_stderr(stderr_file) - root_cause = _extract_root_cause(e) - await self._cleanup_connection() - log.error("mcp.client.stdio_failed", { - "server": self.name, - 
"error": root_cause, - "stderr": stderr_output, - }) - detail = f"Stdio connection failed: {self.name}: {root_cause}" - if stderr_output: - detail += f"\nServer stderr:\n{stderr_output}" - raise RuntimeError(detail) - finally: - stderr_file.close() + # Keep stderr capture lifetime explicit: the file must outlive the stdio + # transport context, but should close immediately once the attempt ends. + with tempfile.TemporaryFile(mode="w+") as stderr_file: + try: + async with self._create_stdio_streams(server_params, stderr_file) as streams: + self._streams = streams + await self._run_connected_session("stdio", streams, startup_future) + except asyncio.TimeoutError as exc: + stderr_output = self._read_stderr(stderr_file) + log.error("mcp.client.timeout", { + "server": self.name, + "transport": "stdio", + "stderr": stderr_output, + }) + detail = f"Connection timeout (stdio): {self.name}" + if stderr_output: + detail += f"\nServer stderr:\n{stderr_output}" + raise RuntimeError(detail) from exc + except Exception as exc: + stderr_output = self._read_stderr(stderr_file) + root_cause = _extract_root_cause(exc) + log.error("mcp.client.stdio_failed", { + "server": self.name, + "error": root_cause, + "stderr": stderr_output, + }) + detail = f"Stdio connection failed: {self.name}: {root_cause}" + if stderr_output: + detail += f"\nServer stderr:\n{stderr_output}" + raise RuntimeError(detail) async def disconnect(self) -> None: """Disconnect from server""" - if not self._connected: + owner_task = self._owner_task + if owner_task is None and not self._connected: return - + try: - # Close session first - if self.session: - try: - await self.session.__aexit__(None, None, None) - except Exception as e: - log.warn("mcp.client.session_close_error", { - "server": self.name, - "error": str(e) - }) - - # Then close streams - if self._streams_context: - try: - await self._streams_context.__aexit__(None, None, None) - except Exception as e: - log.warn("mcp.client.streams_close_error", { - 
"server": self.name, - "error": str(e) - }) - except Exception as e: + if owner_task is not None and not owner_task.done() and self._command_queue is not None: + response = asyncio.get_running_loop().create_future() + await self._command_queue.put(_ClientCommand(action="disconnect", response=response)) + await response + elif owner_task is not None and not owner_task.done(): + owner_task.cancel() + + if owner_task is not None: + with contextlib.suppress(asyncio.CancelledError): + await owner_task + except Exception as exc: log.error("mcp.client.disconnect_error", { "server": self.name, - "error": str(e) + "error": str(exc), }) finally: - self._connected = False - self.session = None - self._streams = None - self._streams_context = None + self._reset_runtime_state(clear_owner_error=True) + if self._owner_task is owner_task: + self._owner_task = None log.info("mcp.client.disconnected", {"server": self.name}) async def list_tools(self) -> List[McpToolDef]: @@ -470,24 +659,13 @@ async def list_tools(self) -> List[McpToolDef]: Raises: RuntimeError: If not connected """ - if not self._connected or not self.session: - raise RuntimeError(f"Client not connected: {self.name}") - try: - result = await asyncio.wait_for( - self.session.list_tools(), - timeout=self.timeout - ) - tools = [McpToolDef.from_sdk(tool) for tool in result.tools] - log.debug("mcp.client.tools_listed", { - "server": self.name, - "count": len(tools) - }) - return tools - except Exception as e: + result = await self._submit_command("list_tools") + return result + except Exception as exc: log.error("mcp.client.list_tools_error", { "server": self.name, - "error": str(e) + "error": str(exc), }) raise @@ -505,31 +683,13 @@ async def call_tool(self, name: str, arguments: Dict[str, Any]) -> Any: Raises: RuntimeError: If not connected """ - if not self._connected or not self.session: - raise RuntimeError(f"Client not connected: {self.name}") - try: - result = await asyncio.wait_for( - 
self.session.call_tool(name=name, arguments=arguments), - timeout=self.timeout - ) - log.debug("mcp.client.tool_called", { - "server": self.name, - "tool": name - }) - return result - except asyncio.TimeoutError: - log.error("mcp.client.call_timeout", { - "server": self.name, - "tool": name - }) - from concurrent.futures import TimeoutError as _FuturesTimeoutError - raise _FuturesTimeoutError(f"MCP工具调用超时 ({self.timeout}s): {name}") - except Exception as e: + return await self._submit_command("call_tool", name=name, arguments=arguments) + except Exception as exc: log.error("mcp.client.call_error", { "server": self.name, "tool": name, - "error": str(e) + "error": str(exc), }) raise @@ -543,32 +703,13 @@ async def list_resources(self) -> List[McpResource]: Raises: RuntimeError: If not connected """ - if not self._connected or not self.session: - raise RuntimeError(f"Client not connected: {self.name}") - try: - result = await asyncio.wait_for( - self.session.list_resources(), - timeout=self.timeout - ) - resources = [] - for r in result.resources: - resources.append(McpResource( - name=r.name, - uri=r.uri, - description=getattr(r, 'description', None), - mime_type=getattr(r, 'mimeType', None), - server=self.name - )) - log.debug("mcp.client.resources_listed", { - "server": self.name, - "count": len(resources) - }) - return resources - except Exception as e: + result = await self._submit_command("list_resources") + return result + except Exception as exc: log.error("mcp.client.list_resources_error", { "server": self.name, - "error": str(e) + "error": str(exc), }) raise @@ -585,26 +726,42 @@ async def read_resource(self, uri: str) -> Any: Raises: RuntimeError: If not connected """ - if not self._connected or not self.session: - raise RuntimeError(f"Client not connected: {self.name}") - try: - result = await asyncio.wait_for( - self.session.read_resource(uri=uri), - timeout=self.timeout - ) - log.debug("mcp.client.resource_read", { - "server": self.name, - "uri": uri - }) 
- return result - except Exception as e: + return await self._submit_command("read_resource", uri=uri) + except Exception as exc: log.error("mcp.client.read_resource_error", { "server": self.name, "uri": uri, - "error": str(e) + "error": str(exc), }) raise + + async def _submit_command(self, action: str, **payload: Any) -> Any: + """Send a serialized command to the owner task.""" + if not self._connected or self._command_queue is None: + if self._owner_error is not None: + raise RuntimeError( + f"Client not connected: {self.name}: {_extract_root_cause(self._owner_error)}" + ) from self._owner_error + raise RuntimeError(f"Client not connected: {self.name}") + + owner_task = self._owner_task + if owner_task is None: + if self._owner_error is not None: + raise RuntimeError( + f"Client not connected: {self.name}: {_extract_root_cause(self._owner_error)}" + ) from self._owner_error + raise RuntimeError(f"Client not connected: {self.name}") + + response = asyncio.get_running_loop().create_future() + command = _ClientCommand(action=action, payload=payload, response=response) + await self._command_queue.put(command) + + if owner_task.done() and not response.done(): + owner_error = self._owner_error or RuntimeError(f"Client not connected: {self.name}") + response.set_exception(owner_error) + + return await response @property def is_connected(self) -> bool: diff --git a/flocks/server/app.py b/flocks/server/app.py index 183ea8d60..fa713c757 100644 --- a/flocks/server/app.py +++ b/flocks/server/app.py @@ -5,10 +5,12 @@ """ import asyncio +import inspect import os import time +from types import SimpleNamespace from pathlib import Path -from typing import Optional +from typing import Any, Callable, Optional from contextlib import asynccontextmanager from fastapi import FastAPI, Request, Response, status from fastapi.middleware.cors import CORSMiddleware @@ -43,6 +45,60 @@ # Lifespan context manager for startup/shutdown +async def _maybe_await(result: Any) -> Any: + """Await 
values that are awaitable and return plain values unchanged.""" + if inspect.isawaitable(result): + return await result + return result + + +async def _run_startup_phase( + log, + phase: str, + fn: Callable[[], Any], +) -> Any: + """Execute one startup phase and emit structured timing logs.""" + started_at = time.perf_counter() + try: + result = await _maybe_await(fn()) + except Exception as exc: + duration_ms = int((time.perf_counter() - started_at) * 1000) + log.warning("server.startup.phase", { + "phase": phase, + "status": "failed", + "duration_ms": duration_ms, + "error": str(exc), + }) + raise + + duration_ms = int((time.perf_counter() - started_at) * 1000) + log.info("server.startup.phase", { + "phase": phase, + "status": "completed", + "duration_ms": duration_ms, + }) + return result + + +def _schedule_startup_phase( + app: FastAPI, + log, + phase: str, + fn: Callable[[], Any], +) -> None: + """Run a non-critical startup phase in the background after app is ready.""" + + async def _runner() -> None: + try: + await _run_startup_phase(log, phase, fn) + except Exception: + # _run_startup_phase already logged the failure. 
+ return + + task = asyncio.create_task(_runner(), name=f"startup:{phase}") + app.state.startup_background_tasks.append(task) + + @asynccontextmanager async def lifespan(app: FastAPI): """Handle application lifecycle""" @@ -51,13 +107,21 @@ async def lifespan(app: FastAPI): await Log.init(print=False, dev=False, level=LogLevel.INFO) log = Log.create(service="server") + if not hasattr(app, "state") or app.state is None: + app.state = SimpleNamespace() + app.state.startup_background_tasks = [] + startup_started_at = time.perf_counter() # Startup log.info("server.startup", {"version": "0.2.0"}) try: from flocks.updater.updater import cleanup_replaced_files - await asyncio.to_thread(cleanup_replaced_files) + await _run_startup_phase( + log, + "updater.cleanup_leftovers", + lambda: asyncio.to_thread(cleanup_replaced_files), + ) log.info("updater.leftovers.cleaned") except Exception as e: log.warning("updater.leftovers.cleanup_failed", {"error": str(e)}) @@ -65,13 +129,21 @@ async def lifespan(app: FastAPI): try: from flocks.updater.updater import _get_repo_root, _refresh_global_cli_entry - await asyncio.to_thread(_refresh_global_cli_entry, _get_repo_root()) + await _run_startup_phase( + log, + "cli.refresh_global_entry", + lambda: asyncio.to_thread(_refresh_global_cli_entry, _get_repo_root()), + ) log.info("cli.global_entry.refreshed") except Exception as e: log.warning("cli.global_entry.refresh_failed", {"error": str(e)}) try: - init_observability() + await _run_startup_phase( + log, + "observability.init", + init_observability, + ) log.info("observability.initialized") except Exception as e: log.warning("observability.init_failed", {"error": str(e)}) @@ -79,7 +151,11 @@ async def lifespan(app: FastAPI): # Ensure config files exist (copy from examples if needed) try: from flocks.config.config_writer import ensure_config_files - ensure_config_files() + await _run_startup_phase( + log, + "config.ensure_files", + ensure_config_files, + ) log.info("config.files.checked") 
except Exception as e: log.warning("config.files.check_failed", {"error": str(e)}) @@ -89,7 +165,11 @@ async def lifespan(app: FastAPI): # ``_v`` once the plugin declares a version. try: from flocks.config.api_versioning import migrate_api_services - actions = migrate_api_services() + actions = await _run_startup_phase( + log, + "config.migrate_api_services", + migrate_api_services, + ) copied = [k for k, v in actions.items() if v == "copied"] if copied: log.info("config.api_services.migrated", {"copied": copied}) @@ -97,31 +177,42 @@ async def lifespan(app: FastAPI): log.warning("config.api_services.migrate_failed", {"error": str(e)}) # Initialize storage - await Storage.init() + await _run_startup_phase(log, "storage.init", Storage.init) log.info("storage.initialized") # Initialize local auth/account tables - await AuthService.init() + await _run_startup_phase(log, "auth.init", AuthService.init) log.info("auth.initialized") # Best-effort migration: old sessions default to admin ownership. # The migration itself is idempotent (guarded by a persisted marker), # but we still skip loading users when the marker is already set # to avoid unnecessary DB + session scans on every startup. 
- try: + async def _migrate_legacy_sessions_to_admin() -> None: marker = await Storage.get("auth:migration:legacy_session_owner_to_admin", dict) - if not (marker and marker.get("done")): - if await AuthService.has_users(): - users = await AuthService.list_users() - admin = next((u for u in users if u.role == "admin"), None) - if admin: - await AuthService.migrate_legacy_sessions_to_admin(admin.id) - except Exception as e: - log.warning("auth.legacy_sessions.migration_failed", {"error": str(e)}) + if marker and marker.get("done"): + return + if not await AuthService.has_users(): + return + users = await AuthService.list_users() + admin = next((u for u in users if u.role == "admin"), None) + if admin: + await AuthService.migrate_legacy_sessions_to_admin(admin.id) + + _schedule_startup_phase( + app, + log, + "auth.migrate_legacy_session_owner", + _migrate_legacy_sessions_to_admin, + ) # Setup question handler for real user interaction from flocks.tool.question_handler import setup_api_question_handler - setup_api_question_handler() + await _run_startup_phase( + log, + "question_handler.setup", + setup_api_question_handler, + ) log.info("question_handler.initialized") # Register built-in hooks if memory is enabled @@ -129,7 +220,11 @@ async def lifespan(app: FastAPI): config = await Config.get() if config.memory.enabled: from flocks.hooks.builtin import register_builtin_hooks - register_builtin_hooks() + await _run_startup_phase( + log, + "hooks.register_builtin", + register_builtin_hooks, + ) log.info("hooks.registered") except Exception as e: # Hook registration failure should not stop server startup @@ -138,25 +233,47 @@ async def lifespan(app: FastAPI): # Migrate env-var credentials to .secret.json (idempotent) try: from flocks.provider.credential import migrate_env_credentials - migrated = migrate_env_credentials() - if migrated > 0: - log.info("credential.env_migration.done", {"migrated": migrated}) + + def _migrate_env_credentials_phase() -> None: + migrated = 
migrate_env_credentials() + if migrated > 0: + log.info("credential.env_migration.done", {"migrated": migrated}) + + _schedule_startup_phase( + app, + log, + "credential.migrate_env_credentials", + _migrate_env_credentials_phase, + ) except Exception as e: log.warning("credential.env_migration.failed", {"error": str(e)}) # Sync new catalog models into flocks.json for existing providers (idempotent) try: from flocks.provider.model_catalog import sync_catalog_models_to_config - synced = sync_catalog_models_to_config() - if synced > 0: - log.info("catalog.model_sync.done", {"models_added": synced}) + + def _sync_catalog_models_phase() -> None: + synced = sync_catalog_models_to_config() + if synced > 0: + log.info("catalog.model_sync.done", {"models_added": synced}) + + _schedule_startup_phase( + app, + log, + "catalog.sync_models_to_config", + _sync_catalog_models_phase, + ) except Exception as e: log.warning("catalog.model_sync.failed", {"error": str(e)}) # Load custom providers from flocks.json into runtime try: from flocks.server.routes.custom_provider import load_custom_providers_on_startup - await load_custom_providers_on_startup() + await _run_startup_phase( + log, + "custom_providers.load", + load_custom_providers_on_startup, + ) log.info("custom_providers.loaded") except Exception as e: log.warning("custom_providers.load.failed", {"error": str(e)}) @@ -165,23 +282,31 @@ async def lifespan(app: FastAPI): # after a service restart, without requiring manual UI reconnection. 
try: from flocks.mcp import MCP - await MCP.init() - log.info("mcp.initialized") + + _schedule_startup_phase(app, log, "mcp.init", MCP.init) except Exception as e: log.warning("mcp.init_failed", {"error": str(e)}) # Sync workflows from .flocks/workflow/ filesystem into Storage try: from flocks.server.routes.workflow import sync_workflows_from_filesystem - imported = await sync_workflows_from_filesystem() - log.info("workflow.sync.done", {"imported": imported}) + + async def _sync_workflows_phase() -> None: + imported = await sync_workflows_from_filesystem() + log.info("workflow.sync.done", {"imported": imported}) + + _schedule_startup_phase(app, log, "workflow.sync_filesystem", _sync_workflows_phase) except Exception as e: log.warning("workflow.sync.failed", {"error": str(e)}) # Start Task Center (scheduler + queue executor) try: from flocks.task.manager import TaskManager - await TaskManager.start() + await _run_startup_phase( + log, + "task_manager.start", + TaskManager.start, + ) log.info("task_manager.started") except Exception as e: from flocks.task.manager import TaskManager @@ -191,41 +316,61 @@ async def lifespan(app: FastAPI): # Seed built-in scheduled tasks from .flocks/plugins/tasks/*.json (idempotent) try: from flocks.task.plugin import seed_tasks_from_plugin - seeded = await seed_tasks_from_plugin() - if seeded: - log.info("task.plugin.seeded", {"count": seeded}) + + async def _seed_tasks_phase() -> None: + seeded = await seed_tasks_from_plugin() + if seeded: + log.info("task.plugin.seeded", {"count": seeded}) + + _schedule_startup_phase(app, log, "task.seed_plugin_specs", _seed_tasks_phase) except Exception as e: log.warning("task.plugin.seed_failed", {"error": str(e)}) # Start Skill file watcher (auto-invalidate cache on SKILL.md changes) try: from flocks.skill.skill import Skill - Skill.start_watcher() - log.info("skill.watcher.initialized") + + def _start_skill_watcher() -> None: + Skill.start_watcher() + log.info("skill.watcher.initialized") + + 
_schedule_startup_phase(app, log, "skill.watcher.start", _start_skill_watcher) except Exception as e: log.warning("skill.watcher.init_failed", {"error": str(e)}) # Start Agent file watcher (auto-invalidate cache on plugin agent changes) try: from flocks.agent.registry import Agent - Agent.start_watcher() - log.info("agent.watcher.initialized") + + def _start_agent_watcher() -> None: + Agent.start_watcher() + log.info("agent.watcher.initialized") + + _schedule_startup_phase(app, log, "agent.watcher.start", _start_agent_watcher) except Exception as e: log.warning("agent.watcher.init_failed", {"error": str(e)}) # Start Tool file watcher (auto-reload plugin tools on file changes) try: from flocks.tool.registry import ToolRegistry - ToolRegistry.start_watcher() - log.info("tool.watcher.initialized") + + def _start_tool_watcher() -> None: + ToolRegistry.start_watcher() + log.info("tool.watcher.initialized") + + _schedule_startup_phase(app, log, "tool.watcher.start", _start_tool_watcher) except Exception as e: log.warning("tool.watcher.init_failed", {"error": str(e)}) # Start Channel Gateway (connect enabled IM channels) try: from flocks.channel.gateway.manager import default_manager - await default_manager.start_all() - log.info("channel.gateway.started") + + async def _start_channel_gateway() -> None: + await default_manager.start_all() + log.info("channel.gateway.started") + + _schedule_startup_phase(app, log, "channel.gateway.start", _start_channel_gateway) except Exception as e: log.warning("channel.gateway.start_failed", {"error": str(e)}) @@ -241,14 +386,32 @@ async def lifespan(app: FastAPI): try: from flocks.updater.updater import recover_upgrade_state - await asyncio.to_thread(recover_upgrade_state) + await _run_startup_phase( + log, + "updater.recover_upgrade_state", + lambda: asyncio.to_thread(recover_upgrade_state), + ) log.info("updater.recovery.checked") except Exception as e: log.warning("updater.recovery.failed", {"error": str(e)}) + blocking_startup_ms = 
int((time.perf_counter() - startup_started_at) * 1000) + log.info("server.startup.ready", { + "blocking_duration_ms": blocking_startup_ms, + "background_tasks": len(app.state.startup_background_tasks), + }) + yield - # --- Graceful shutdown: notify SSE clients FIRST --- + background_tasks = list(getattr(app.state, "startup_background_tasks", [])) + for task in background_tasks: + if not task.done(): + task.cancel() + if background_tasks: + await asyncio.gather(*background_tasks, return_exceptions=True) + + # Notify SSE clients before stopping sessions, MCP transports, and other + # long-lived runtime services so browser listeners see the shutdown event. try: from flocks.server.routes.event import EventBroadcaster broadcaster = EventBroadcaster.get() diff --git a/flocks/server/routes/_timing.py b/flocks/server/routes/_timing.py new file mode 100644 index 000000000..f0c8e883a --- /dev/null +++ b/flocks/server/routes/_timing.py @@ -0,0 +1,28 @@ +"""Helpers for route timing logs.""" + +from __future__ import annotations + +import time +from typing import Any + +from flocks.utils.log import Logger + +DEFAULT_SLOW_ROUTE_LOG_THRESHOLD_MS = 300 + + +def log_route_timing( + logger: Logger, + event: str, + *, + started_at: float, + extra: dict[str, Any] | None = None, + slow_threshold_ms: int = DEFAULT_SLOW_ROUTE_LOG_THRESHOLD_MS, +) -> int: + """Log route timings at INFO only when a request is slow enough.""" + duration_ms = int((time.perf_counter() - started_at) * 1000) + payload = {"duration_ms": duration_ms, **(extra or {})} + if duration_ms >= slow_threshold_ms: + logger.info(event, payload) + else: + logger.debug(event, payload) + return duration_ms diff --git a/flocks/server/routes/session.py b/flocks/server/routes/session.py index db40d3972..e2d04c205 100644 --- a/flocks/server/routes/session.py +++ b/flocks/server/routes/session.py @@ -15,6 +15,7 @@ from pydantic import BaseModel, Field, ConfigDict from flocks.auth.context import get_current_auth_user +from 
flocks.server.routes._timing import log_route_timing from flocks.session.session import Session, SessionInfo as SessionModel from flocks.session.policy import SessionPolicy from flocks.utils.log import Log @@ -206,6 +207,7 @@ async def list_sessions( category: Optional[str] = Query(None, description="Filter by category: user or task"), ) -> List[SessionResponse]: """List all sessions with optional filters""" + started_at = time.perf_counter() _current_user = require_user(request) all_sessions = await Session.list_all() @@ -235,7 +237,15 @@ async def list_sessions( if limit is not None and len(filtered) >= limit: break - return [_session_to_response(s) for s in filtered] + response = [_session_to_response(s) for s in filtered] + log_route_timing(log, "session.list.complete", started_at=started_at, extra={ + "count": len(response), + "roots": roots, + "limit": limit, + "search": bool(search), + "category": category, + }) + return response @router.post( diff --git a/flocks/server/routes/task_entities.py b/flocks/server/routes/task_entities.py index 62e01abb3..69170c4dd 100644 --- a/flocks/server/routes/task_entities.py +++ b/flocks/server/routes/task_entities.py @@ -1,13 +1,17 @@ """Execution-centric task scheduler/execution routes.""" from enum import Enum +import time from typing import List, Optional, Type from fastapi import APIRouter, HTTPException, Query, status from pydantic import BaseModel, ConfigDict, Field +from flocks.server.routes._timing import log_route_timing +from flocks.utils.log import Log router = APIRouter() +log = Log.create(service="task-routes") class SchedulerCreateRequest(BaseModel): @@ -143,21 +147,40 @@ def _parse_task_type(task_type: str) -> str: async def get_task_system_notice(): from flocks.task.manager import TaskManager - return await TaskManager.get_task_page_notice() + started_at = time.perf_counter() + notice = await TaskManager.get_task_page_notice() + log_route_timing(log, "task.notice.complete", started_at=started_at, extra={ + 
"has_notice": bool(notice), + }) + return notice @router.get("/task-system/dashboard") async def task_dashboard(): from flocks.task.manager import TaskManager - return await TaskManager.dashboard() + started_at = time.perf_counter() + payload = await TaskManager.dashboard() + log_route_timing(log, "task.dashboard.complete", started_at=started_at, extra={ + "running": payload.get("running"), + "queued": payload.get("queued"), + "scheduled_active": payload.get("scheduled_active"), + }) + return payload @router.get("/task-system/queue/status") async def task_queue_status(): from flocks.task.manager import TaskManager - return await TaskManager.queue_status() + started_at = time.perf_counter() + payload = await TaskManager.queue_status() + log_route_timing(log, "task.queue_status.complete", started_at=started_at, extra={ + "queued": payload.get("queued"), + "running": payload.get("running"), + "paused": payload.get("paused"), + }) + return payload @router.post("/task-system/queue/pause") diff --git a/flocks/server/routes/tool.py b/flocks/server/routes/tool.py index 6fcc5f303..dc1a0f19f 100644 --- a/flocks/server/routes/tool.py +++ b/flocks/server/routes/tool.py @@ -3,11 +3,13 @@ """ import asyncio +import time from typing import List, Optional, Dict, Any from fastapi import APIRouter, Depends, HTTPException, status from pydantic import BaseModel, Field from flocks.server.auth import require_admin +from flocks.server.routes._timing import log_route_timing from flocks.utils.log import Log from flocks.config.config_writer import ConfigWriter from flocks.permission.next import DeniedError, PermissionNext @@ -439,6 +441,7 @@ async def list_tools( List of tool information """ # Initialize registry if needed + started_at = time.perf_counter() ToolRegistry.init() # Parse category filter @@ -458,7 +461,12 @@ async def list_tools( # Apply source filter if specified if source: result = [t for t in result if t.source == source] - + + log_route_timing(log, "tools.list.complete", 
started_at=started_at, extra={ + "count": len(result), + "category": category, + "source": source, + }) return result @@ -764,6 +772,7 @@ async def refresh_tools(_admin: object = Depends(require_admin)): This is the batch counterpart to the single-tool ``/{name}/reload`` endpoint. """ + started_at = time.perf_counter() ToolRegistry.init() errors: list[str] = [] @@ -783,7 +792,10 @@ async def refresh_tools(_admin: object = Depends(require_admin)): errors.append(f"plugin: {e}") tool_count = len(ToolRegistry.all_tool_ids()) - log.info("tools.refresh.done", {"tool_count": tool_count, "errors": len(errors)}) + log_route_timing(log, "tools.refresh.done", started_at=started_at, extra={ + "tool_count": tool_count, + "errors": len(errors), + }) if errors: return RefreshResponse( diff --git a/flocks/server/routes/workflow.py b/flocks/server/routes/workflow.py index ad6cae1b2..a127c2f77 100644 --- a/flocks/server/routes/workflow.py +++ b/flocks/server/routes/workflow.py @@ -366,17 +366,14 @@ def _list_workflows_from_fs() -> List[Dict[str, Any]]: async def sync_workflows_from_filesystem() -> int: - """Scan all workflow directories on the filesystem and return the count of discovered workflows. + """Best-effort startup sync for filesystem-backed workflows. - Called at server startup (app.py) to surface any filesystem-only workflows early. - The filesystem is already the sole source of truth for workflow definitions, so this - function only needs to scan and count—no data needs to be written anywhere. - It also triggers the one-time storage → filesystem migration as a side-effect. + The filesystem is the source of truth for workflow definitions. Startup only + needs to migrate any legacy Storage-only records to disk and report how many + workflows are currently discoverable from the configured workflow roots. 
""" await _migrate_storage_to_filesystem() - all_data = _list_workflows_from_fs() - log.info("workflow.sync.filesystem", {"count": len(all_data)}) - return len(all_data) + return len(_list_workflows_from_fs()) async def _migrate_storage_to_filesystem() -> None: diff --git a/tests/mcp/test_mcp_client.py b/tests/mcp/test_mcp_client.py index 5eff5df2c..4bdc162ab 100644 --- a/tests/mcp/test_mcp_client.py +++ b/tests/mcp/test_mcp_client.py @@ -1,82 +1,178 @@ +import asyncio +from contextlib import asynccontextmanager +from types import MethodType +from unittest.mock import AsyncMock + import pytest +import flocks.mcp.client as mcp_client_module from flocks.mcp.client import McpClient class TestMcpClientTransportSelection: @pytest.mark.asyncio - async def test_connect_uses_sse_only_when_transport_is_sse(self, monkeypatch: pytest.MonkeyPatch): + async def test_connect_routes_remote_servers_to_remote_owner( + self, + monkeypatch: pytest.MonkeyPatch, + ): calls: list[str] = [] - async def fake_http(*args, **kwargs): - calls.append("http") - - async def fake_sse(*args, **kwargs): - calls.append("sse") + async def fake_remote(startup_future): + calls.append("remote") + startup_future.set_result(None) client = McpClient( name="demo", server_type="remote", - url="https://example.com/sse", - transport="sse", + url="https://example.com/mcp", ) - monkeypatch.setattr(client, "_do_connect_streamable_http", fake_http) - monkeypatch.setattr(client, "_do_connect_sse", fake_sse) + monkeypatch.setattr(client, "_connect_remote", fake_remote) await client.connect() - assert calls == ["sse"] - assert client._transport_type == "sse" + assert calls == ["remote"] @pytest.mark.asyncio - async def test_connect_uses_http_only_when_transport_is_http(self, monkeypatch: pytest.MonkeyPatch): + async def test_connect_routes_stdio_servers_to_local_owner( + self, + monkeypatch: pytest.MonkeyPatch, + ): calls: list[str] = [] - async def fake_http(*args, **kwargs): - calls.append("http") + async def 
fake_local(startup_future): + calls.append("local") + startup_future.set_result(None) + + client = McpClient( + name="demo", + server_type="stdio", + command=["python", "-m", "demo"], + ) + monkeypatch.setattr(client, "_connect_local", fake_local) + + await client.connect() + + assert calls == ["local"] - async def fake_sse(*args, **kwargs): - calls.append("sse") + @pytest.mark.asyncio + async def test_timeout_none_defaults_to_safe_float( + self, + monkeypatch: pytest.MonkeyPatch, + ): + observed: list[float] = [] + + async def fake_remote(startup_future): + observed.append(client.timeout) + startup_future.set_result(None) client = McpClient( name="demo", server_type="remote", url="https://example.com/mcp", - transport="http", + timeout=None, ) - monkeypatch.setattr(client, "_do_connect_streamable_http", fake_http) - monkeypatch.setattr(client, "_do_connect_sse", fake_sse) + monkeypatch.setattr(client, "_connect_remote", fake_remote) await client.connect() - assert calls == ["http"] - assert client._transport_type == "streamable_http" + assert observed == [30.0] @pytest.mark.asyncio - async def test_connect_auto_falls_back_to_sse_after_http_failure(self, monkeypatch: pytest.MonkeyPatch): - calls: list[str] = [] + async def test_unknown_type_raises_value_error(self): + client = McpClient( + name="demo", + server_type="websocket", + url="wss://example.com", + ) - async def fake_http(*args, **kwargs): - calls.append("http") - raise RuntimeError("HTTP 405") + with pytest.raises(ValueError, match="Unknown server type: websocket"): + await client.connect() - async def fake_sse(*args, **kwargs): - calls.append("sse") + @pytest.mark.asyncio + async def test_failed_connect_cleans_up_owner_runtime_state(self): + client = McpClient( + name="demo", + server_type="websocket", + url="wss://example.com", + ) - async def fake_cleanup(): - return None + with pytest.raises(ValueError, match="Unknown server type: websocket"): + await client.connect() + assert client._connected is 
False + assert client._command_queue is None + assert client._owner_task is None + assert isinstance(client._owner_error, ValueError) + + @pytest.mark.asyncio + async def test_already_connected_skips_new_owner_task( + self, + monkeypatch: pytest.MonkeyPatch, + ): client = McpClient( name="demo", server_type="remote", url="https://example.com/mcp", - transport="auto", ) - monkeypatch.setattr(client, "_do_connect_streamable_http", fake_http) - monkeypatch.setattr(client, "_do_connect_sse", fake_sse) - monkeypatch.setattr(client, "_cleanup_connection", fake_cleanup) + client._connected = True + fake_owner = AsyncMock() + monkeypatch.setattr(client, "_run_connection_owner", fake_owner) await client.connect() - assert calls == ["http", "sse"] - assert client._transport_type == "sse" + fake_owner.assert_not_called() + + @pytest.mark.asyncio + async def test_connect_local_closes_stderr_file_on_failure( + self, + monkeypatch: pytest.MonkeyPatch, + ): + class _FakeTempFile: + def __init__(self) -> None: + self.closed = False + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb) -> bool: + self.close() + return False + + def seek(self, _offset: int) -> None: + return None + + def read(self, _size: int = -1) -> str: + return "stdio stderr" + + def close(self) -> None: + self.closed = True + + fake_stderr = _FakeTempFile() + client = McpClient( + name="demo", + server_type="stdio", + command=["python", "-m", "demo"], + ) + + @asynccontextmanager + async def broken_stdio(self, _server_params, stderr_file): + assert stderr_file is fake_stderr + raise RuntimeError("spawn failed") + yield + + monkeypatch.setattr( + mcp_client_module.tempfile, + "TemporaryFile", + lambda mode="w+": fake_stderr, + ) + monkeypatch.setattr( + client, + "_create_stdio_streams", + MethodType(broken_stdio, client), + ) + + startup_future = asyncio.get_running_loop().create_future() + with pytest.raises(RuntimeError, match="Stdio connection failed"): + await 
client._connect_local(startup_future) + + assert fake_stderr.closed is True diff --git a/tests/mcp/test_mcp_client_sse.py b/tests/mcp/test_mcp_client_sse.py index a0da9feed..479a32a44 100644 --- a/tests/mcp/test_mcp_client_sse.py +++ b/tests/mcp/test_mcp_client_sse.py @@ -1,220 +1,243 @@ -""" -Tests for MCP Client SSE transport support - -Verifies that McpClient correctly handles: -- remote / sse server type (auto-detect: Streamable HTTP -> SSE fallback) -- Timeout does NOT fall back (avoids double wait) -- Unknown server types (raises ValueError) -- Error message extraction from ExceptionGroups -""" +"""Tests for MCP client remote transport lifecycle and fallback behavior.""" import asyncio +from contextlib import asynccontextmanager +from types import MethodType, SimpleNamespace import pytest -from unittest.mock import AsyncMock, MagicMock, patch -from flocks.mcp.client import McpClient, _extract_root_cause - - -class TestMcpClientServerTypes: - """Test McpClient server type routing""" - - def test_init_sse_type(self): - """SSE type should be accepted""" - client = McpClient( - name="test-sse", - server_type="sse", - url="https://example.com/sse", - ) - assert client.server_type == "sse" - assert client.url == "https://example.com/sse" - - def test_init_remote_type(self): - """Remote type should be accepted""" - client = McpClient( - name="test-remote", - server_type="remote", - url="https://example.com/mcp", - ) - assert client.server_type == "remote" - - @pytest.mark.asyncio - async def test_unknown_type_raises_value_error(self): - """Unknown server type should raise ValueError""" - client = McpClient( - name="test-bad", - server_type="websocket", - url="wss://example.com", - ) - with pytest.raises(ValueError, match="Unknown server type: websocket"): - await client.connect() - - @pytest.mark.asyncio - async def test_sse_type_uses_auto_detect(self): - """SSE type should use _connect_remote (auto-detect) same as remote""" - client = McpClient( - name="test", - 
server_type="sse", - url="https://example.com/sse", - ) - client._connect_remote = AsyncMock() - await client.connect() - client._connect_remote.assert_called_once() - @pytest.mark.asyncio - async def test_remote_type_calls_connect_remote(self): - """Remote type should call _connect_remote""" - client = McpClient( - name="test", - server_type="remote", - url="https://example.com/mcp", - ) - client._connect_remote = AsyncMock() - await client.connect() - client._connect_remote.assert_called_once() +import flocks.mcp.client as mcp_client_module +from flocks.mcp.client import McpClient, _extract_root_cause - @pytest.mark.asyncio - async def test_stdio_type_calls_connect_local(self): - """Stdio type attempts connection (raises RuntimeError on failure)""" - client = McpClient( - name="test", - server_type="stdio", - url=None, - command=["python", "-m", "some_server"], - ) - # Stdio connection will fail since 'some_server' doesn't exist - with pytest.raises((NotImplementedError, RuntimeError)): - await client.connect() - @pytest.mark.asyncio - async def test_already_connected_skips(self): - """Already connected client should skip reconnection""" - client = McpClient( - name="test", - server_type="sse", - url="https://example.com/sse", - ) - client._connected = True - client._connect_remote = AsyncMock() - await client.connect() - client._connect_remote.assert_not_called() +def _make_session_class( + *, + events: dict[str, object] | None = None, + tool_result: object | None = None, + tools: list[object] | None = None, + resources: list[object] | None = None, +): + class FakeSession: + def __init__(self, read_stream, write_stream): + self.read_stream = read_stream + self.write_stream = write_stream + + async def __aenter__(self): + if events is not None: + events["session_enter_task"] = asyncio.current_task() + return self + + async def __aexit__(self, exc_type, exc, tb): + if events is not None: + events["session_exit_task"] = asyncio.current_task() + return False + + 
async def initialize(self): + return SimpleNamespace(protocolVersion="2026-05-12", serverInfo={"name": "demo"}) + + async def list_tools(self): + return SimpleNamespace(tools=tools or []) + + async def call_tool(self, name, arguments): + if events is not None: + events["call_tool_task"] = asyncio.current_task() + if isinstance(tool_result, Exception): + raise tool_result + if tool_result is not None: + return tool_result + return {"name": name, "arguments": arguments} + + async def list_resources(self): + return SimpleNamespace(resources=resources or []) + + async def read_resource(self, uri): + return {"uri": uri} + + return FakeSession + + +def _make_remote_transport_factory( + label: str, + *, + streams: tuple[object, ...] | None = None, + error: Exception | None = None, + events: dict[str, object] | None = None, + captures: list[tuple[str, str, dict | None]] | None = None, +): + if streams is None: + if label == "http": + streams = ("read", "write", lambda: None) + else: + streams = ("read", "write") + + @asynccontextmanager + async def factory(self, url, headers): + if captures is not None: + captures.append((label, url, headers)) + if error is not None: + raise error + if events is not None: + events[f"{label}_enter_task"] = asyncio.current_task() + try: + yield streams + finally: + if events is not None: + events[f"{label}_exit_task"] = asyncio.current_task() + + return factory + + +def _bind_method(monkeypatch: pytest.MonkeyPatch, client: McpClient, name: str, method) -> None: + monkeypatch.setattr(client, name, MethodType(method, client)) class TestMcpClientRemoteFallback: - """Test remote type fallback from Streamable HTTP to SSE""" - @pytest.mark.asyncio - async def test_remote_falls_back_to_sse(self): - """Remote type should fall back to SSE when Streamable HTTP fails""" + async def test_remote_falls_back_to_sse(self, monkeypatch: pytest.MonkeyPatch): client = McpClient( name="test-remote", server_type="remote", url="https://mcp.example.com/mcp", 
timeout=10.0, ) + monkeypatch.setattr(mcp_client_module, "ClientSession", _make_session_class()) - # Mock _do_connect_streamable_http to fail - client._do_connect_streamable_http = AsyncMock( - side_effect=RuntimeError("Streamable HTTP not supported") + _bind_method( + monkeypatch, + client, + "_create_streamable_http_streams", + _make_remote_transport_factory("http", error=RuntimeError("HTTP failed")), + ) + _bind_method( + monkeypatch, + client, + "_create_sse_streams", + _make_remote_transport_factory("sse"), ) - # Mock _do_connect_sse to succeed - async def mark_connected(url, headers=None): - client._connected = True - client._do_connect_sse = AsyncMock(side_effect=mark_connected) await client.connect() - client._do_connect_streamable_http.assert_called_once() - client._do_connect_sse.assert_called_once() assert client._transport_type == "sse" + await client.disconnect() @pytest.mark.asyncio - async def test_remote_streamable_http_success_no_sse(self): - """Remote type should not try SSE if Streamable HTTP succeeds""" + async def test_remote_streamable_http_success_no_sse(self, monkeypatch: pytest.MonkeyPatch): client = McpClient( name="test-remote", server_type="remote", url="https://mcp.example.com/mcp", timeout=10.0, ) - - async def mark_connected(url, headers=None): - client._connected = True - client._do_connect_streamable_http = AsyncMock(side_effect=mark_connected) - client._do_connect_sse = AsyncMock() + captures: list[tuple[str, str, dict | None]] = [] + monkeypatch.setattr(mcp_client_module, "ClientSession", _make_session_class()) + + _bind_method( + monkeypatch, + client, + "_create_streamable_http_streams", + _make_remote_transport_factory("http", captures=captures), + ) + _bind_method( + monkeypatch, + client, + "_create_sse_streams", + _make_remote_transport_factory("sse", captures=captures), + ) await client.connect() - client._do_connect_streamable_http.assert_called_once() - client._do_connect_sse.assert_not_called() assert 
client._transport_type == "streamable_http" + assert [label for label, _, _ in captures] == ["http"] + await client.disconnect() @pytest.mark.asyncio - async def test_remote_both_fail_raises(self): - """Remote type should raise RuntimeError if both transports fail""" + async def test_remote_both_fail_raises(self, monkeypatch: pytest.MonkeyPatch): client = McpClient( name="test-remote", server_type="remote", url="https://mcp.example.com/mcp", timeout=10.0, ) + monkeypatch.setattr(mcp_client_module, "ClientSession", _make_session_class()) - client._do_connect_streamable_http = AsyncMock( - side_effect=RuntimeError("HTTP failed") + _bind_method( + monkeypatch, + client, + "_create_streamable_http_streams", + _make_remote_transport_factory("http", error=RuntimeError("HTTP failed")), ) - client._do_connect_sse = AsyncMock( - side_effect=RuntimeError("SSE failed") + _bind_method( + monkeypatch, + client, + "_create_sse_streams", + _make_remote_transport_factory("sse", error=RuntimeError("SSE failed")), ) with pytest.raises(RuntimeError, match="Connection failed.*SSE failed"): await client.connect() @pytest.mark.asyncio - async def test_sse_type_also_tries_streamable_http_first(self): - """SSE type uses same auto-detect strategy as remote (Streamable HTTP first)""" + async def test_sse_type_also_tries_streamable_http_first(self, monkeypatch: pytest.MonkeyPatch): client = McpClient( name="test-sse", server_type="sse", url="https://mcp.example.com/mcp", timeout=10.0, ) - - async def mark_connected(url, headers=None): - client._connected = True - client._do_connect_streamable_http = AsyncMock(side_effect=mark_connected) - client._do_connect_sse = AsyncMock() + captures: list[tuple[str, str, dict | None]] = [] + monkeypatch.setattr(mcp_client_module, "ClientSession", _make_session_class()) + + _bind_method( + monkeypatch, + client, + "_create_streamable_http_streams", + _make_remote_transport_factory("http", captures=captures), + ) + _bind_method( + monkeypatch, + client, + 
"_create_sse_streams", + _make_remote_transport_factory("sse", captures=captures), + ) await client.connect() - # "sse" and "remote" share the same auto-detect logic - client._do_connect_streamable_http.assert_called_once() - client._do_connect_sse.assert_not_called() assert client._transport_type == "streamable_http" + assert [label for label, _, _ in captures] == ["http"] + await client.disconnect() @pytest.mark.asyncio - async def test_timeout_does_not_fall_back(self): - """Streamable HTTP timeout should NOT fall back to SSE (avoids double wait)""" + async def test_timeout_does_not_fall_back(self, monkeypatch: pytest.MonkeyPatch): client = McpClient( name="test-timeout", server_type="remote", url="https://mcp.example.com/mcp", timeout=10.0, ) - - client._do_connect_streamable_http = AsyncMock( - side_effect=asyncio.TimeoutError() + captures: list[tuple[str, str, dict | None]] = [] + monkeypatch.setattr(mcp_client_module, "ClientSession", _make_session_class()) + + _bind_method( + monkeypatch, + client, + "_create_streamable_http_streams", + _make_remote_transport_factory("http", error=asyncio.TimeoutError()), + ) + _bind_method( + monkeypatch, + client, + "_create_sse_streams", + _make_remote_transport_factory("sse", captures=captures), ) - client._do_connect_sse = AsyncMock() with pytest.raises(RuntimeError, match="Connection timeout"): await client.connect() - # SSE should NOT have been attempted - client._do_connect_sse.assert_not_called() + assert captures == [] assert client._transport_type is None @pytest.mark.asyncio - async def test_remote_passes_resolved_headers_to_transports(self): - """Remote connection should pass config and auth headers to SDK transports""" + async def test_remote_passes_resolved_headers_to_transports(self, monkeypatch: pytest.MonkeyPatch): client = McpClient( name="test-headers", server_type="remote", @@ -228,15 +251,21 @@ async def test_remote_passes_resolved_headers_to_transports(self): }, timeout=10.0, ) - - 
client._do_connect_streamable_http = AsyncMock( - side_effect=RuntimeError("HTTP failed") + captures: list[tuple[str, str, dict | None]] = [] + monkeypatch.setattr(mcp_client_module, "ClientSession", _make_session_class()) + + _bind_method( + monkeypatch, + client, + "_create_streamable_http_streams", + _make_remote_transport_factory("http", captures=captures, error=RuntimeError("HTTP failed")), + ) + _bind_method( + monkeypatch, + client, + "_create_sse_streams", + _make_remote_transport_factory("sse", captures=captures), ) - - async def mark_connected(url, headers): - client._connected = True - - client._do_connect_sse = AsyncMock(side_effect=mark_connected) await client.connect() @@ -244,46 +273,104 @@ async def mark_connected(url, headers): "Api-Key": "token123", "Authorization": "Bearer abc", } - client._do_connect_streamable_http.assert_called_once_with( - "https://mcp.example.com/mcp", - expected_headers, + assert captures == [ + ("http", "https://mcp.example.com/mcp", expected_headers), + ("sse", "https://mcp.example.com/mcp", expected_headers), + ] + await client.disconnect() + + @pytest.mark.asyncio + async def test_disconnect_closes_streams_and_session_in_owner_task( + self, + monkeypatch: pytest.MonkeyPatch, + ): + client = McpClient( + name="test-owner", + server_type="remote", + url="https://mcp.example.com/mcp", + ) + events: dict[str, object] = {} + monkeypatch.setattr(mcp_client_module, "ClientSession", _make_session_class(events=events)) + + _bind_method( + monkeypatch, + client, + "_create_streamable_http_streams", + _make_remote_transport_factory("http", events=events), ) - client._do_connect_sse.assert_called_once_with( - "https://mcp.example.com/mcp", - expected_headers, + _bind_method( + monkeypatch, + client, + "_create_sse_streams", + _make_remote_transport_factory("sse", events=events), ) + await client.connect() + await client.disconnect() -class TestExtractRootCause: - """Test _extract_root_cause helper function""" + assert 
events["http_enter_task"] is events["http_exit_task"] + assert events["session_enter_task"] is events["session_exit_task"] + + @pytest.mark.asyncio + async def test_call_tool_runs_through_owner_task(self, monkeypatch: pytest.MonkeyPatch): + client = McpClient( + name="test-call", + server_type="remote", + url="https://mcp.example.com/mcp", + ) + events: dict[str, object] = {} + monkeypatch.setattr( + mcp_client_module, + "ClientSession", + _make_session_class(events=events), + ) + _bind_method( + monkeypatch, + client, + "_create_streamable_http_streams", + _make_remote_transport_factory("http"), + ) + _bind_method( + monkeypatch, + client, + "_create_sse_streams", + _make_remote_transport_factory("sse"), + ) + + await client.connect() + result = await client.call_tool("demo_tool", {"value": 1}) + await client.disconnect() + + assert result == {"name": "demo_tool", "arguments": {"value": 1}} + assert events["call_tool_task"] is events["session_enter_task"] + + +class TestExtractRootCause: def test_simple_exception(self): - """Simple exception returns its message""" assert _extract_root_cause(RuntimeError("simple error")) == "simple error" def test_exception_group(self): - """ExceptionGroup should unwrap to the root cause""" inner = RuntimeError("real error") group = ExceptionGroup("group", [inner]) assert _extract_root_cause(group) == "real error" def test_nested_exception_group(self): - """Nested ExceptionGroups should be fully unwrapped""" inner = ValueError("deep error") group1 = ExceptionGroup("inner group", [inner]) group2 = ExceptionGroup("outer group", [group1]) assert _extract_root_cause(group2) == "deep error" def test_http_status_error(self): - """HTTP status errors should show status code""" - # Simulate httpx.HTTPStatusError class MockResponse: status_code = 401 + class MockRequest: url = "https://example.com/mcp?apikey=secret123" + exc = Exception("HTTP error") exc.response = MockResponse() exc.request = MockRequest() result = _extract_root_cause(exc) 
assert "401" in result - assert "secret" not in result # URL should be masked + assert "secret" not in result diff --git a/tests/server/routes/test_route_timing.py b/tests/server/routes/test_route_timing.py new file mode 100644 index 000000000..86fa81a5e --- /dev/null +++ b/tests/server/routes/test_route_timing.py @@ -0,0 +1,53 @@ +import pytest + +from flocks.server.routes import _timing as timing_module + + +class _Recorder: + def __init__(self) -> None: + self.debug_calls: list[tuple[str, dict]] = [] + self.info_calls: list[tuple[str, dict]] = [] + + def debug(self, message, extra=None) -> None: + self.debug_calls.append((message, extra or {})) + + def info(self, message, extra=None) -> None: + self.info_calls.append((message, extra or {})) + + +def test_log_route_timing_uses_debug_below_threshold(monkeypatch: pytest.MonkeyPatch) -> None: + logger = _Recorder() + monkeypatch.setattr(timing_module.time, "perf_counter", lambda: 100.2) + + duration_ms = timing_module.log_route_timing( + logger, + "session.list.complete", + started_at=100.0, + extra={"count": 2}, + slow_threshold_ms=300, + ) + + assert 199 <= duration_ms <= 200 + assert logger.info_calls == [] + assert logger.debug_calls == [ + ("session.list.complete", {"duration_ms": duration_ms, "count": 2}), + ] + + +def test_log_route_timing_uses_info_at_threshold(monkeypatch: pytest.MonkeyPatch) -> None: + logger = _Recorder() + monkeypatch.setattr(timing_module.time, "perf_counter", lambda: 200.3) + + duration_ms = timing_module.log_route_timing( + logger, + "task.dashboard.complete", + started_at=200.0, + extra={"running": 1}, + slow_threshold_ms=300, + ) + + assert 299 <= duration_ms <= 300 + assert logger.debug_calls == [] + assert logger.info_calls == [ + ("task.dashboard.complete", {"duration_ms": duration_ms, "running": 1}), + ] diff --git a/tui/tsconfig.json b/tui/tsconfig.json index 77a008120..746eed59c 100644 --- a/tui/tsconfig.json +++ b/tui/tsconfig.json @@ -1,7 +1,12 @@ { "$schema": 
"https://json.schemastore.org/tsconfig", - "extends": "@tsconfig/bun/tsconfig.json", "compilerOptions": { + "target": "ESNext", + "module": "ESNext", + "moduleResolution": "bundler", + "allowImportingTsExtensions": true, + "resolveJsonModule": true, + "noEmit": true, "jsx": "preserve", "jsxImportSource": "@opentui/solid", "lib": ["ESNext", "DOM", "DOM.Iterable"], diff --git a/webui/src/components/common/SessionChat.tsx b/webui/src/components/common/SessionChat.tsx index 517beb82b..2d1adc785 100644 --- a/webui/src/components/common/SessionChat.tsx +++ b/webui/src/components/common/SessionChat.tsx @@ -521,7 +521,7 @@ export default function SessionChat({ const hasUserMessage = useMemo(() => messages.some((m) => m.role === 'user'), [messages]); - const sseEnabled = live || isStreaming || !hideInput; + const sseEnabled = Boolean(sessionId) && (live || isStreaming || !hideInput); const handleSSEEvent = useCallback( (event: SSEChatEvent) => { diff --git a/webui/src/hooks/useTasks.ts b/webui/src/hooks/useTasks.ts index 5f94e5e4d..d8189ce4a 100644 --- a/webui/src/hooks/useTasks.ts +++ b/webui/src/hooks/useTasks.ts @@ -22,10 +22,11 @@ export function useTaskSchedulers( const [loading, setLoading] = useState(true); const [error, setError] = useState(null); const tasksRef = useRef([]); + const initializedRef = useRef(false); const fetchTasks = useCallback(async () => { try { - setLoading(true); + if (!initializedRef.current) setLoading(true); setError(null); const response = await taskAPI.listSchedulers(filters); const data = response.data; @@ -39,6 +40,7 @@ export function useTaskSchedulers( setTotal(0); } finally { setLoading(false); + initializedRef.current = true; } }, [ filters?.status, @@ -87,10 +89,11 @@ export function useTaskExecutions( const [loading, setLoading] = useState(true); const [error, setError] = useState(null); const tasksRef = useRef([]); + const initializedRef = useRef(false); const fetchTasks = useCallback(async () => { try { - setLoading(true); + if 
(!initializedRef.current) setLoading(true); setError(null); const response = await taskAPI.listExecutions(filters); const data = response.data; @@ -104,6 +107,7 @@ export function useTaskExecutions( setTotal(0); } finally { setLoading(false); + initializedRef.current = true; } }, [ filters?.status, @@ -173,10 +177,11 @@ export function useTaskDashboard(options?: { pollInterval?: number }) { const [counts, setCounts] = useState(null); const [loading, setLoading] = useState(true); const [error, setError] = useState(null); + const initializedRef = useRef(false); const fetchDashboard = useCallback(async () => { try { - setLoading(true); + if (!initializedRef.current) setLoading(true); setError(null); const response = await taskAPI.dashboard(); setCounts(response.data); @@ -184,6 +189,7 @@ export function useTaskDashboard(options?: { pollInterval?: number }) { setError(err.message || 'Failed to fetch dashboard'); } finally { setLoading(false); + initializedRef.current = true; } }, []); @@ -232,10 +238,11 @@ export function useQueueStatus(options?: { pollInterval?: number }) { const [queueStatus, setQueueStatus] = useState(null); const [loading, setLoading] = useState(true); const [error, setError] = useState(null); + const initializedRef = useRef(false); const fetchQueueStatus = useCallback(async () => { try { - setLoading(true); + if (!initializedRef.current) setLoading(true); setError(null); const response = await taskAPI.queueStatus(); setQueueStatus(response.data); @@ -243,6 +250,7 @@ export function useQueueStatus(options?: { pollInterval?: number }) { setError(err.message || 'Failed to fetch queue status'); } finally { setLoading(false); + initializedRef.current = true; } }, []); @@ -263,10 +271,11 @@ export function useTaskSystemNotice() { const [notice, setNotice] = useState(null); const [loading, setLoading] = useState(false); const [error, setError] = useState(null); + const initializedRef = useRef(false); const fetchNotice = useCallback(async () => { try { - 
setLoading(true); + if (!initializedRef.current) setLoading(true); setError(null); const response = await taskAPI.getSystemNotice(); setNotice(response.data ?? null); @@ -274,6 +283,7 @@ export function useTaskSystemNotice() { setError(err.message || 'Failed to fetch system notice'); } finally { setLoading(false); + initializedRef.current = true; } }, []); diff --git a/webui/src/hooks/useTools.test.tsx b/webui/src/hooks/useTools.test.tsx new file mode 100644 index 000000000..ac6fd9e76 --- /dev/null +++ b/webui/src/hooks/useTools.test.tsx @@ -0,0 +1,64 @@ +import { renderHook, waitFor } from '@testing-library/react'; +import { beforeEach, describe, expect, it, vi } from 'vitest'; + +import { useTools } from './useTools'; + +const { listMock, refreshMock } = vi.hoisted(() => ({ + listMock: vi.fn(), + refreshMock: vi.fn(), +})); + +vi.mock('@/api/tool', () => ({ + toolAPI: { + list: listMock, + refresh: refreshMock, + }, +})); + +function deferred() { + let resolve!: (value: T | PromiseLike) => void; + const promise = new Promise((res) => { + resolve = res; + }); + return { promise, resolve }; +} + +describe('useTools', () => { + beforeEach(() => { + vi.clearAllMocks(); + }); + + it('renders the tool list before the background refresh completes', async () => { + const refreshDeferred = deferred<{ data: { status: string } }>(); + + listMock.mockResolvedValue({ + data: [ + { + name: 'tool-alpha', + description: 'alpha tool', + category: 'custom', + source: 'custom', + enabled: true, + }, + ], + }); + refreshMock.mockReturnValue(refreshDeferred.promise); + + const { result } = renderHook(() => useTools()); + + await waitFor(() => { + expect(result.current.loading).toBe(false); + }); + + expect(result.current.tools).toHaveLength(1); + expect(result.current.tools[0].name).toBe('tool-alpha'); + expect(listMock).toHaveBeenCalledTimes(1); + expect(refreshMock).toHaveBeenCalledTimes(1); + + refreshDeferred.resolve({ data: { status: 'success' } }); + + await waitFor(() => { + 
expect(listMock).toHaveBeenCalledTimes(2); + }); + }); +}); diff --git a/webui/src/hooks/useTools.ts b/webui/src/hooks/useTools.ts index 8cebfc843..f4119b2f9 100644 --- a/webui/src/hooks/useTools.ts +++ b/webui/src/hooks/useTools.ts @@ -6,17 +6,19 @@ export function useTools() { const [loading, setLoading] = useState(true); const [error, setError] = useState(null); const lastRefreshRef = useRef(0); + const initializedRef = useRef(false); const fetchTools = useCallback(async (showLoading = false) => { try { - if (showLoading) setLoading(true); + if (showLoading && !initializedRef.current) setLoading(true); setError(null); const response = await toolAPI.list(); setTools(Array.isArray(response.data) ? response.data : []); } catch (err: any) { setError(err.message || 'Failed to fetch tools'); } finally { - if (showLoading) setLoading(false); + if (showLoading && !initializedRef.current) setLoading(false); + initializedRef.current = true; } }, []); @@ -31,18 +33,34 @@ export function useTools() { }, [fetchTools]); useEffect(() => { + let cancelled = false; + const init = async () => { - try { await toolAPI.refresh(); } catch { /* ignore */ } - lastRefreshRef.current = Date.now(); await fetchTools(true); + if (cancelled) return; + + try { + await toolAPI.refresh(); + if (cancelled) return; + lastRefreshRef.current = Date.now(); + await fetchTools(false); + } catch { + /* ignore */ + } }; - init(); + + void init(); const onVisible = () => { - if (document.visibilityState === 'visible') refreshAndFetch(); + if (document.visibilityState === 'visible') { + void refreshAndFetch(); + } }; document.addEventListener('visibilitychange', onVisible); - return () => document.removeEventListener('visibilitychange', onVisible); + return () => { + cancelled = true; + document.removeEventListener('visibilitychange', onVisible); + }; }, [fetchTools, refreshAndFetch]); return { diff --git a/webui/src/pages/Session/index.test.tsx b/webui/src/pages/Session/index.test.tsx index 
e48265f47..d6e9c72bc 100644 --- a/webui/src/pages/Session/index.test.tsx +++ b/webui/src/pages/Session/index.test.tsx @@ -264,6 +264,12 @@ describe('SessionPage session actions menu', () => { expect(global.confirm).toHaveBeenCalledWith('confirmDelete'); }); + it('does not auto-select the first session on initial load', () => { + renderSessionPage(); + + expect(screen.getByTestId('session-chat')).toHaveTextContent('no-session'); + }); + it('syncs selected session when query param changes after mount', async () => { const user = userEvent.setup(); @@ -294,9 +300,7 @@ describe('SessionPage session actions menu', () => { , ); - await waitFor(() => { - expect(screen.getByTestId('session-chat')).toHaveTextContent('session-1'); - }); + expect(screen.getByTestId('session-chat')).toHaveTextContent('no-session'); await user.click(screen.getByRole('button', { name: 'go-session-2' })); diff --git a/webui/src/pages/Session/index.tsx b/webui/src/pages/Session/index.tsx index 8f32fbfa7..dacc354db 100644 --- a/webui/src/pages/Session/index.tsx +++ b/webui/src/pages/Session/index.tsx @@ -99,13 +99,6 @@ export default function SessionPage() { } }, [searchParams, selectedSessionId, setSearchParams]); - // Auto select first session - useEffect(() => { - if (!selectedSessionId && sessions.length > 0) { - setSelectedSessionId(sessions[0].id); - } - }, [sessions, selectedSessionId]); - // Close agent dropdown on outside click useEffect(() => { if (!showAgentOptions) return; @@ -617,7 +610,7 @@ export default function SessionPage() { { refetchDashboard(); - refetchQueue(); }; const forceRemountSections = () => { From 241527253d30ca1a2432d44abc53abb0fdcb9d8d Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Tue, 12 May 2026 17:24:37 +0800 Subject: [PATCH 37/41] chore: bump package version to v2026.5.12 Co-authored-by: Cursor --- pyproject.toml | 2 +- uv.lock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6d87f8c24..a7ec57737 100644 
--- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "flocks" -version = "v2026.5.9" +version = "v2026.5.12" description = "AI-Native SecOps platform with multi-agent collaboration" authors = [ {name = "Flocks Team", email = "team@example.com"} diff --git a/uv.lock b/uv.lock index a2cd269f2..b932376ea 100644 --- a/uv.lock +++ b/uv.lock @@ -509,7 +509,7 @@ wheels = [ [[package]] name = "flocks" -version = "2026.5.9" +version = "2026.5.12" source = { editable = "." } dependencies = [ { name = "aiofiles" }, From bba801b09a1e1e151c2e38468f36334ddb3697b3 Mon Sep 17 00:00:00 2001 From: xiami Date: Wed, 13 May 2026 14:00:07 +0800 Subject: [PATCH 38/41] docs(skills): web2cli flow, capture path, and browser experience in skills (#263) - browser-use: document product browser experience and link new reference - Add browser-experience-in-skill.md for workflow and templates - web2cli: output under outputs/web2cli, iteration step 11, auth troubleshooting - cli-in-skill: browser-workflow.md scope and writing guide --- .flocks/plugins/skills/browser-use/SKILL.md | 15 +++ .../references/browser-experience-in-skill.md | 118 ++++++++++++++++++ .flocks/plugins/skills/web2cli/SKILL.md | 34 +++-- .../skills/web2cli/references/cli-in-skill.md | 22 +++- 4 files changed, 167 insertions(+), 22 deletions(-) create mode 100644 .flocks/plugins/skills/browser-use/references/browser-experience-in-skill.md diff --git a/.flocks/plugins/skills/browser-use/SKILL.md b/.flocks/plugins/skills/browser-use/SKILL.md index 1f41fefe2..51c513987 100644 --- a/.flocks/plugins/skills/browser-use/SKILL.md +++ b/.flocks/plugins/skills/browser-use/SKILL.md @@ -75,7 +75,22 @@ browser: not connected — 请确保 Chrome / Chromium / Edge 已打开,然后 1. 模式一旦确定,立即只读取对应的 reference。 2. 
不要同时加载 `references/cdp-direct.md` 和 `references/agent-browser.md`。 +## 产品经验Skill + +把特定产品页/网站的浏览器操作经验,沉淀到对应产品 skill,实现可复用。 + +适合沉淀的经验包括: + +- 已确认某产品的稳定登录的方法 +- 更稳定的页面进入方式,例如“优先直接拼 URL,不走菜单” +- 表格、筛选器、分页、弹窗、下载、详情展开等可靠操作路径 +- 某站点特有的等待条件、重渲染特征、虚拟列表/SPA 交互怪癖 +- 特定操作的成功经验,失败案例(特定操作失败 2 次以上,最终成功的经验) + +具体怎么沉淀到 产品skill,请阅读 `references/browser-experience-in-skill.md`。 + ## References +- `references/browser-experience-in-skill.md`:如何把浏览器经验沉淀到产品 skill,以及推荐记录模板 - `references/cdp-direct.md`:以 `flocks browser` 作为 CDP 直连入口的启动方式、API、页面探索策略、错误处理 - `references/agent-browser.md`:agent-browser 的使用说明、错误处理等 diff --git a/.flocks/plugins/skills/browser-use/references/browser-experience-in-skill.md b/.flocks/plugins/skills/browser-use/references/browser-experience-in-skill.md new file mode 100644 index 000000000..5834212ec --- /dev/null +++ b/.flocks/plugins/skills/browser-use/references/browser-experience-in-skill.md @@ -0,0 +1,118 @@ +# 浏览器经验记录到 Skill 的说明 + +把特定产品页/网站的浏览器操作经验,沉淀到对应产品 skill,实现可复用。 + +## 目标文件 + +浏览器经验统一沉淀到目标产品 skill 的 `references/browser-workflow.md`。 + +该文件用于记录某一产品的浏览器相关长期知识,包括登录操作、页面入口、稳定操作方式、等待条件、认证恢复流程,以及 CLI 与浏览器的切换边界。 + +## 适用场景 + +出现以下任一情况时,应补充或更新目标产品的 `references/browser-workflow.md`: + +- 已确认某产品的稳定登录的方法 +- 已验证更稳定的页面进入方式,例如“优先直接拼 URL,不走菜单” +- 已验证表格、筛选器、分页、弹窗、下载、上传、详情展开等操作的稳定路径 +- 已验证某站点特有的等待条件、重渲染行为、虚拟列表、iframe 或 SPA 交互特征 +- 特定操作的成功经验,失败案例(特定操作失败 2 次以上,最终成功的经验) + + +## 创建新Skill + +如果当前仓库里还没有对应产品 skill,就按下面的最小结构创建(如果已经有就跳过这一步): + +```text +$HOME/.flocks/plugins/skills/-use/ +├── SKILL.md +└── references/ + └── browser-workflow.md +``` + +其中 `SKILL.md` 必须遵守 Flocks 的标准 skill 格式: + +- 文件开头必须是 YAML frontmatter,第一行必须为 `---` +- frontmatter 至少包含 `name` 和 `description` +- `name` 使用稳定的 skill 标识,推荐与目录名一致,例如 `-use` +- frontmatter 结束后,再写正文标题、触发条件、模式判断和使用说明 + +最小模板示例: + +```md +--- +name: test-use +description: 用于查询 Test 测试平台数据,…… +--- + +# Test Use + +## 触发条件 + +- 用户提到 Test 平台 +- 用户需要查询 Test 数据 +``` + +## 记录原则 + +### 1. 
记录成功经验和禁止的失败操作,不记录任务流水账 + +应记录可跨任务复用的稳定经验,例如固定入口、稳定操作路径、稳定等待条件、固定恢复流程。 + +不应记录一次性过程描述,例如“先点 A 失败,再点 B 成功”这类仅对单次会话成立的临时过程。 + +### 2. 记录产品知识,不重复通用浏览器说明 + +通用模式判断、底层命令、浏览器 API 用法,保留在 `browser-use` 自身的 reference 中。 + +产品 skill 的 `references/browser-workflow.md` 只记录该产品自己的页面结构、操作特征和恢复策略。 + +### 3. 记录可执行经验,不记录敏感数据 + +可以记录固定路径、页面结构、等待条件、认证文件位置、恢复步骤。 + +不得记录 cookie、token、密码、短信码、TOTP、个人数据,也不得记录一次性的 `@eN` ref、临时 tab id、临时 selector 或像素坐标。 + +## 建议结构 + +目标产品的 `references/browser-workflow.md` 建议至少包含以下部分: + +```md +# <产品名> 浏览器工作流 + +## 零、登录认证 +- state 文件路径 +- 首次登录方式 +- session 失效恢复 + +## 一、页面入口 +- 首页 / 列表 / 详情 / 导出页 URL + +## 二、浏览器操作 +- 该网站特定的操作经验 +- 稳定定位方式 +- 容易失败的组件及替代方案 + +## 三、重要提醒 +- 不要无限重试 +- 哪些高风险按钮需要用户确认 +- 何时回退 CLI 或请求用户接管 +``` + +## 不应记录的内容 + +以下内容不应沉淀到产品 skill: + +- cookie、token、密码、短信码、TOTP、个人数据 +- 一次性的 `@eN` ref、临时 tab id、临时 selector、临时像素坐标 +- 单次任务过程中的临时试错记录 +- 只在某一次页面状态下成立、无法稳定复用的偶然现象 +- 所有浏览器操作的通用技巧 + +## 与 `web2cli` 的关系 + +当产品同时存在 CLI 与浏览器、或其他说明等多条路径时,职责划分如下: + +- `cli-reference.md`:记录 CLI 参数、命令示例和输出字段 +- `browser-workflow.md`:记录登录、state、页面入口、浏览器操作经验、等待条件和认证恢复流程 +- `SKILL.md`:记录高层触发条件、模式判断,以及“何时优先 CLI、何时退回浏览器”的说明 diff --git a/.flocks/plugins/skills/web2cli/SKILL.md b/.flocks/plugins/skills/web2cli/SKILL.md index 8dbf12187..d1e5120cb 100644 --- a/.flocks/plugins/skills/web2cli/SKILL.md +++ b/.flocks/plugins/skills/web2cli/SKILL.md @@ -8,14 +8,6 @@ required: browser-use > 正式开始前,先明确需要操作的网站或tab -## 使用的资源 - -- 默认注入脚本:`scripts/inject-hook-base.js` -- CLI 生成器:`.flocks/plugins/skills/web2cli/scripts/generate-cli.py` -- 支持两种模式: - - `MODE=agent-browser` - - `MODE=cdp-direct` - ## 模式选择 ### `agent-browser` @@ -40,15 +32,14 @@ browser: not connected — 请确保 Chrome / Chromium / Edge 已打开,然后 ## 输出目录约定 -捕获产生的文件统一落到 `~/.flocks/workspace/outputs//web2cli//`。 +捕获产生的文件统一落到 `~/.flocks/workspace/outputs/web2cli//`。 开始前先准备目录: ```bash MODE="${MODE:-cdp-direct}" CAPTURE_NAME="" -TODAY="$(date +%F)" -CAPTURE_ROOT="$HOME/.flocks/workspace/outputs/$TODAY/web2cli/$CAPTURE_NAME" 
+CAPTURE_ROOT="$HOME/.flocks/workspace/outputs/web2cli/$CAPTURE_NAME" WEB2CLI_SKILL=".flocks/plugins/skills/web2cli" mkdir -p "$CAPTURE_ROOT/captures" ``` @@ -307,7 +298,7 @@ if target_id: ) ``` -将 cookie 和 localStorage 保存为后续 CLI 调用的认证输入。`flocks browser state save` 会输出更接近标准 `storageState` 的交换格式:cookies 放在顶层,origin 级 localStorage 放在 `origins[]`。保存时会尽量覆盖当前站点下的多子域 cookie,适合知乎这类依赖跨子域登录态的 CLI。认证文件包含敏感值,只能写入 `$CAPTURE_ROOT` 这类工作区输出目录,不要写入代码仓库。 +将 cookie 和 localStorage 保存为后续 CLI 调用的认证输入。 ### 7. 分析捕获的 web API @@ -421,13 +412,16 @@ else: `cdp-direct` 必须保留用户原有的 tab 不受影响。 +### 11. 迭代与 skill 沉淀 + +将 CLI 按 `references/cli-in-skill.md` 集成为 skill; + +### 12. summary -### 11. summary +总结当前生成的 CLI 工具有哪些能力,然后可提示用户下一步操作: -总结当前 生成 的CLI 工具有哪些能力,然后可提示用户下一步操作: -- 精简或修正CLI -- 进一步丰富 CLI 工具,重新开始 web2cli标准流程 -- 保存为对应的 skill 方便后续操作(进入此操作后,需要阅读references) +- 精简或修正 CLI +- 若仍需扩展能力或沉淀为 skill,回到步骤 11 ## 故障处理 @@ -455,10 +449,10 @@ else: 5. 修改sameOriginOnly 参数 6. 以上方法都不可行时,按照Hook 注入报错的原则,自定义hook.js -### 认证失效 +### CLI认证失效 -- `agent-browser`:重新登录后再次执行保存状态命令。 -- `cdp-direct`:重新登录后再次执行保存认证状态。 +- 登录状态有效:利用已有知识和查找公开资料尝试解决。 +- 登录状态失效:重新登录后再次执行保存状态命令。 ## Reference - references/cli-in-skill.md 将生成的 CLI 集成到 skill 中使用 diff --git a/.flocks/plugins/skills/web2cli/references/cli-in-skill.md b/.flocks/plugins/skills/web2cli/references/cli-in-skill.md index 6210c3644..ff35e27c8 100644 --- a/.flocks/plugins/skills/web2cli/references/cli-in-skill.md +++ b/.flocks/plugins/skills/web2cli/references/cli-in-skill.md @@ -40,7 +40,7 @@ cp "$CAPTURE_ROOT/auth-state.json" \ 1. 在 `scripts/config.py` 中把认证状态默认值指向 `~/.flocks/browser//auth-state.json` 2. 在 `references/cli-reference.md` 中写清楚 CLI 用法、环境变量和示例 -3. 在 `references/browser-workflow.md` 中写清楚浏览器登录与保存 state 的流程 +3. 在 `references/browser-workflow.md` 中写清楚浏览器登录、保存 state、页面入口、稳定操作方式、等待条件和认证恢复流程 4. 
在 `SKILL.md` 中说明什么时候优先走 CLI,什么时候退回浏览器 推荐的配置写法: @@ -122,11 +122,29 @@ description: 用于查询 Test 测试平台数据,支持通过 CLI 快速查 - `SKILL.md`:定义触发条件、模式判断、总入口说明 - `scripts/_cli.py`:承载生成并整理后的 CLI 能力 - `scripts/config.py`:集中管理 `BASE_URL`、`AUTH_STATE_FILE`、超时、SSL 等默认配置 -- `references/browser-workflow.md`:写浏览器登录、保存 state、认证恢复流程 +- `references/browser-workflow.md`:统一写浏览器登录、保存 state、页面入口、具体操作经验、等待条件与认证恢复流程 - `references/cli-reference.md`:写 CLI 参数、命令示例、常见查询 新 skill 的原则也一样:先把生成的 CLI 改成稳定文件名,再把临时 `auth-state.json` 切换到全局默认位置 `~/.flocks/browser//auth-state.json`。 +## `browser-workflow.md` 写作指南 + +`references/browser-workflow.md` 是产品 skill 里统一承载浏览器经验的单文件。凡是已经验证、后续还会复用的浏览器操作经验,都应该优先沉淀到这里,而不是散落在临时对话里。 + +推荐写入以下内容: + +- 固定的登录入口、首页、详情页、导出页 URL +- 默认 state 路径,例如 `~/.flocks/browser//auth-state.json` +- 认证失效识别与恢复步骤 +- CLI 与浏览器的分工边界,例如“列表查询优先 CLI,详情预览/导出/人工登录走浏览器” +- 特定操作的成功经验,失败案例(特定操作失败 2 次以上,最终成功的经验) + +不要写入: + +- cookie、token、密码、短信码、TOTP 等敏感信息 +- 一次性的 `@eN` ref、临时 tab id、临时 selector、像素坐标 +- 本次任务的操作流水账 + ## 认证失败怎么处理 CLI 调用出现以下情况时,优先按认证失效处理: From a1a324d9f237e6cc97cc7e671b9fb11206c4f175 Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Thu, 14 May 2026 11:29:35 +0800 Subject: [PATCH 39/41] fix(workflow,plugin): stop watcher reload loop and harden execution path Plugin/agent/skill file watchers used watchdog's ``on_any_event`` and re-fired on ``opened``/``closed``/``closed_no_write`` events. Each reload opens every YAML/Python file, which retriggers the watcher and creates a self-sustaining loop that scanned plugins every 2-3 s, kept ToolRegistry and Agent caches thrashing, and silently amplified syslog-driven load. Only react to actual content events (modified/created/deleted/moved) and ignore __pycache__/dotfile noise. Workflow execution path: - Serialize per-workflow stats RMW with an asyncio.Lock so concurrent syslog/HTTP completions no longer drop callCount/successCount. 
- Run Recorder audit + history trim as background tasks so the syslog dispatcher releases its semaphore slot without waiting on SQLite. - Tolerate a missing/non-dict execution record in step progress and finish/error paths instead of crashing on ``NoneType.update``. - Add done_callback cleanup so ``_active_workflow_executions`` cannot leak when a run is cancelled before reaching its own ``finally``. Startup hygiene: - Skip built-in hook registration safely when ``config.memory`` is None. - Migrate-legacy-sessions: drop the bogus ``dict`` model arg to ``Storage.get`` and make ``Storage.get/list_entries`` tolerate non-Pydantic types via ``json.loads`` fallback. - Provider auto-register: opencode entry referenced the missing ``FlocksCompatProvider`` symbol on the submodule; use the real ``OpenCodeProvider`` class. - command_loader: fix ``flocks_global_dir`` NameError typo. - mcp: demote benign ``mcp.already_initialized`` warn to debug. Log noise: - ``plugin.tool.duplicate``: silently skip idempotent re-scans of the same plugin source; only warn on genuine cross-source collisions. - ``agent.toolset.tool_missing``: demote to debug for built-in agents that declare optional ``lsp_*``/``ast_grep_search`` tools. 
Co-authored-by: Cursor --- flocks/agent/registry.py | 9 ++ flocks/agent/toolset.py | 6 +- flocks/command/command_loader.py | 2 +- flocks/ingest/syslog/listener.py | 30 ++++--- flocks/ingest/syslog/manager.py | 123 ++++++++++++++++++++++--- flocks/mcp/server.py | 6 +- flocks/provider/provider.py | 2 +- flocks/server/app.py | 32 +++++-- flocks/server/routes/workflow.py | 57 +++++++++--- flocks/skill/skill.py | 8 ++ flocks/storage/storage.py | 10 ++- flocks/tool/registry.py | 51 +++++++++-- flocks/workflow/execution_store.py | 140 ++++++++++++++++++++++++----- 13 files changed, 397 insertions(+), 79 deletions(-) diff --git a/flocks/agent/registry.py b/flocks/agent/registry.py index 874470edf..b73fd517a 100644 --- a/flocks/agent/registry.py +++ b/flocks/agent/registry.py @@ -621,10 +621,19 @@ def start(self) -> None: watcher = self + # Only react to actual content-mutation events. Without this guard the + # ``opened``/``closed``/``closed_no_write`` events that watchdog emits + # whenever any code (including the agent loader itself) reads + # ``agent.yaml`` / ``*.md`` would re-trigger cache invalidation on every + # access, causing a self-sustaining reload loop. 
+ _RELOAD_EVENT_TYPES = frozenset({"modified", "created", "deleted", "moved"}) + class _Handler(FileSystemEventHandler): def on_any_event(self, event: FileSystemEvent) -> None: if event.is_directory: return + if getattr(event, "event_type", "") not in _RELOAD_EVENT_TYPES: + return src = getattr(event, "src_path", "") or "" fname = os.path.basename(src) if fname == "agent.yaml" or src.endswith(".md"): diff --git a/flocks/agent/toolset.py b/flocks/agent/toolset.py index 5607f2683..20b1e4f16 100644 --- a/flocks/agent/toolset.py +++ b/flocks/agent/toolset.py @@ -47,7 +47,11 @@ def normalize_declared_tool_names( matches = [raw_name] if raw_name in available else [] if not matches: - log.warn("agent.toolset.tool_missing", {"tool": raw_name}) + # Built-in agent definitions (librarian, metis, …) declare optional + # tools such as ``lsp_*`` / ``ast_grep_search`` that ship in separate + # binaries; they are gracefully skipped when not installed. Treat + # this as informational only to avoid flooding operational logs. 
+ log.debug("agent.toolset.tool_missing", {"tool": raw_name}) continue for match in matches: diff --git a/flocks/command/command_loader.py b/flocks/command/command_loader.py index 90ec5ee87..6f0b64b02 100644 --- a/flocks/command/command_loader.py +++ b/flocks/command/command_loader.py @@ -103,7 +103,7 @@ def discover_commands() -> Dict[str, CommandInfo]: try: from flocks.utils.compat import get_flocks_config_dir - opencode_global_dir = str(get_flocks_config_dir(binary="opencode") / "command") + flocks_global_dir = str(get_flocks_config_dir(binary="opencode") / "command") sources.append(("flocks-global", flocks_global_dir, "**/*.md")) except Exception as e: log.warn("command.flocks_dir.error", {"error": str(e)}) diff --git a/flocks/ingest/syslog/listener.py b/flocks/ingest/syslog/listener.py index decce3cda..7212f372d 100644 --- a/flocks/ingest/syslog/listener.py +++ b/flocks/ingest/syslog/listener.py @@ -11,7 +11,12 @@ class SyslogUDPProtocol(asyncio.DatagramProtocol): - """Receive syslog datagrams and invoke async callback with parsed dict.""" + """Receive syslog datagrams and invoke async callback with parsed dict. + + The *on_message* callback is expected to be non-blocking (e.g. a queue + put_nowait). This protocol deliberately does NOT create unbounded asyncio + tasks on every datagram — the caller owns concurrency control. + """ def __init__( self, @@ -24,19 +29,24 @@ def __init__( def datagram_received(self, data: bytes, _addr) -> None: # noqa: ANN001 text = data.decode("utf-8", errors="replace") parsed = parse_syslog(text, self._format_hint) - try: - loop = asyncio.get_running_loop() - except RuntimeError: - return - loop.create_task(self._safe_dispatch(parsed)) - - async def _safe_dispatch(self, parsed: dict) -> None: try: res = self._on_message(parsed) + # If the callback returns a coroutine (legacy path), schedule it + # but only once — do NOT create_task without bound. 
if asyncio.iscoroutine(res): - await res + try: + loop = asyncio.get_running_loop() + except RuntimeError: + return + loop.create_task(self._safe_await(res)) + except Exception: + pass + + @staticmethod + async def _safe_await(coro) -> None: # noqa: ANN001 + try: + await coro except Exception: - # Logged by caller / manager pass diff --git a/flocks/ingest/syslog/manager.py b/flocks/ingest/syslog/manager.py index 6984d322b..040545dd3 100644 --- a/flocks/ingest/syslog/manager.py +++ b/flocks/ingest/syslog/manager.py @@ -21,6 +21,11 @@ log = Log.create(service="syslog.manager") +# Maximum concurrent workflow executions per workflow to avoid FD exhaustion and SQLite write contention +_MAX_CONCURRENT_EXECUTIONS = 8 +# Maximum number of buffered syslog messages per workflow; excess messages are dropped with a warning +_MAX_QUEUE_SIZE = 200 + class SyslogManager: """One async listener task per workflow id (when enabled).""" @@ -28,6 +33,12 @@ class SyslogManager: def __init__(self) -> None: self._tasks: dict[str, asyncio.Task] = {} self._abort_events: dict[str, asyncio.Event] = {} + # Per-workflow semaphore to cap concurrent executions + self._semaphores: dict[str, asyncio.Semaphore] = {} + # Per-workflow bounded message queue for backpressure + self._queues: dict[str, asyncio.Queue] = {} + # Per-workflow queue consumer task + self._consumer_tasks: dict[str, asyncio.Task] = {} @staticmethod def _config_key(workflow_id: str) -> str: @@ -69,6 +80,16 @@ async def stop_workflow(self, workflow_id: str) -> None: await task except asyncio.CancelledError: pass + # Stop the queue consumer task + consumer = self._consumer_tasks.pop(workflow_id, None) + if consumer is not None and not consumer.done(): + consumer.cancel() + try: + await consumer + except asyncio.CancelledError: + pass + self._semaphores.pop(workflow_id, None) + self._queues.pop(workflow_id, None) async def restart_workflow(self, workflow_id: str) -> None: await self.stop_workflow(workflow_id) @@ -81,10 +102,35 @@ 
async def restart_workflow(self, workflow_id: str) -> None: if not isinstance(data, dict) or not data.get("enabled"): return + # Load and cache the workflow JSON once; avoids a disk read per message + wf_data = read_workflow_from_fs(workflow_id) + if not wf_data: + log.warning("syslog.workflow_not_found_on_start", {"workflow_id": workflow_id}) + return + workflow_json = wf_data.get("workflowJson") + if not workflow_json: + log.warning("syslog.workflow_json_missing_on_start", {"workflow_id": workflow_id}) + return + + # Set up concurrency control resources + self._semaphores[workflow_id] = asyncio.Semaphore(_MAX_CONCURRENT_EXECUTIONS) + queue: asyncio.Queue = asyncio.Queue(maxsize=_MAX_QUEUE_SIZE) + self._queues[workflow_id] = queue + abort = asyncio.Event() self._abort_events[workflow_id] = abort + + input_key = str(data.get("inputKey") or "syslog_message") + + # Start the consumer that drains the queue and dispatches executions under the semaphore + consumer = asyncio.create_task( + self._queue_consumer(workflow_id, workflow_json, input_key, queue, abort), + name=f"syslog-consumer-{workflow_id}", + ) + self._consumer_tasks[workflow_id] = consumer + task = asyncio.create_task( - self._listener_loop(workflow_id, data, abort), + self._listener_loop(workflow_id, data, queue, abort), name=f"syslog-{workflow_id}", ) self._tasks[workflow_id] = task @@ -94,16 +140,25 @@ async def _listener_loop( self, workflow_id: str, config: Dict[str, Any], + queue: asyncio.Queue, abort: asyncio.Event, ) -> None: host = str(config.get("host") or "0.0.0.0") port = int(config.get("port") or 5140) protocol = str(config.get("protocol") or "udp").lower() format_hint = str(config.get("format") or "auto") - input_key = str(config.get("inputKey") or "syslog_message") - async def on_msg(parsed: dict) -> None: - await self._trigger_workflow(workflow_id, parsed, input_key) + # NOTE: keep this callback synchronous so the UDP protocol layer can + # invoke it inline from datagram_received() without 
creating an + # asyncio task per packet. That preserves the queue-based backpressure. + def on_msg(parsed: dict) -> None: + try: + queue.put_nowait(parsed) + except asyncio.QueueFull: + log.warning("syslog.queue_full_dropped", { + "workflow_id": workflow_id, + "queue_size": queue.qsize(), + }) try: if protocol == "tcp": @@ -132,15 +187,57 @@ async def on_msg(parsed: dict) -> None: except Exception as exc: log.error("syslog.listener_error", {"workflow_id": workflow_id, "error": str(exc)}) - async def _trigger_workflow(self, workflow_id: str, syslog_msg: dict, input_key: str) -> None: - data = read_workflow_from_fs(workflow_id) - if not data: - log.warning("syslog.workflow_not_found", {"workflow_id": workflow_id}) - return - workflow_json = data.get("workflowJson") - if not workflow_json: - log.warning("syslog.workflow_json_missing", {"workflow_id": workflow_id}) - return + async def _queue_consumer( + self, + workflow_id: str, + workflow_json: Any, + input_key: str, + queue: asyncio.Queue, + abort: asyncio.Event, + ) -> None: + """Drain the message queue and dispatch executions bounded by the semaphore.""" + semaphore = self._semaphores[workflow_id] + pending: set[asyncio.Task] = set() + + async def _dispatch(m: dict) -> None: + async with semaphore: + await self._trigger_workflow(workflow_id, workflow_json, m, input_key) + + try: + while not abort.is_set(): + try: + # Poll with a short timeout so we can react to abort promptly + msg = await asyncio.wait_for(queue.get(), timeout=0.5) + except asyncio.TimeoutError: + continue + + t = asyncio.create_task(_dispatch(msg)) + pending.add(t) + t.add_done_callback(pending.discard) + except asyncio.CancelledError: + pass + finally: + # Best-effort drain: wait briefly for in-flight dispatches so their + # final Storage writes complete; cancel anything still stuck so we + # don't leak tasks on shutdown. 
+ if pending: + try: + await asyncio.wait_for( + asyncio.gather(*pending, return_exceptions=True), + timeout=5.0, + ) + except (asyncio.TimeoutError, asyncio.CancelledError): + for t in list(pending): + if not t.done(): + t.cancel() + + async def _trigger_workflow( + self, + workflow_id: str, + workflow_json: Any, + syslog_msg: dict, + input_key: str, + ) -> None: inputs = {input_key: syslog_msg} exec_data = await create_execution_record( diff --git a/flocks/mcp/server.py b/flocks/mcp/server.py index 65b511616..059ef372b 100644 --- a/flocks/mcp/server.py +++ b/flocks/mcp/server.py @@ -60,7 +60,11 @@ async def init(self) -> None: Servers that fail to connect will be retried in the background with exponential backoff. """ if self._initialized: - log.warn("mcp.already_initialized") + # ``MCP.init`` is invoked from both the global server lifespan and + # the per-instance bootstrap on startup. The guard above keeps the + # call idempotent, so this is informational only and should not + # surface as a warning in operational logs. 
+ log.debug("mcp.already_initialized") return log.info("mcp.initializing") diff --git a/flocks/provider/provider.py b/flocks/provider/provider.py index f83ad7fe5..4537ca1a5 100644 --- a/flocks/provider/provider.py +++ b/flocks/provider/provider.py @@ -218,7 +218,7 @@ def _ensure_initialized(cls): ("github-copilot", "flocks.provider.sdk.github_copilot", "GitHubCopilotProvider"), ("github-copilot-enterprise", "flocks.provider.sdk.github_copilot", "GitHubCopilotEnterpriseProvider"), ("vercel", "flocks.provider.sdk.vercel", "VercelProvider"), - ("opencode", "flocks.provider.sdk.opencode", "FlocksCompatProvider"), + ("opencode", "flocks.provider.sdk.opencode", "OpenCodeProvider"), ("sap-ai-core", "flocks.provider.sdk.sap_ai_core", "SAPAICoreProvider"), ("cloudflare-ai-gateway", "flocks.provider.sdk.cloudflare_gateway", "CloudflareGatewayProvider"), # Added in Batch 7 - Final providers diff --git a/flocks/server/app.py b/flocks/server/app.py index fa713c757..f7dc8fb32 100644 --- a/flocks/server/app.py +++ b/flocks/server/app.py @@ -189,8 +189,12 @@ async def lifespan(app: FastAPI): # but we still skip loading users when the marker is already set # to avoid unnecessary DB + session scans on every startup. async def _migrate_legacy_sessions_to_admin() -> None: - marker = await Storage.get("auth:migration:legacy_session_owner_to_admin", dict) - if marker and marker.get("done"): + # ``Storage.get`` interprets a non-``None`` ``model`` argument as a + # Pydantic model and calls ``model_validate_json``. Passing the + # builtin ``dict`` type therefore raised ``AttributeError``; omit the + # model so the value is decoded with ``json.loads``. 
+ marker = await Storage.get("auth:migration:legacy_session_owner_to_admin") + if isinstance(marker, dict) and marker.get("done"): return if not await AuthService.has_users(): return @@ -218,7 +222,11 @@ async def _migrate_legacy_sessions_to_admin() -> None: # Register built-in hooks if memory is enabled try: config = await Config.get() - if config.memory.enabled: + # ``config.memory`` may be ``None`` when the memory system is not + # configured at all; in that case there is nothing to register. + memory_cfg = getattr(config, "memory", None) + memory_enabled = bool(getattr(memory_cfg, "enabled", False)) if memory_cfg else False + if memory_enabled: from flocks.hooks.builtin import register_builtin_hooks await _run_startup_phase( log, @@ -374,12 +382,24 @@ async def _start_channel_gateway() -> None: except Exception as e: log.warning("channel.gateway.start_failed", {"error": str(e)}) - # Start syslog listeners for workflows with syslog enabled + # Start syslog listeners for workflows with syslog enabled. + # Use a background task with a short delay so the main startup path is not + # blocked and to break the crash-restart loop where an immediate syslog + # flood would bring the server back down before it is fully ready. try: from flocks.ingest.syslog.manager import default_manager as default_syslog_manager - await default_syslog_manager.start_all() - log.info("syslog.manager.started") + async def _delayed_syslog_start() -> None: + # Wait for storage and tool registry to be fully initialised before + # resuming syslog listeners. 
+ await asyncio.sleep(3) + try: + await default_syslog_manager.start_all() + log.info("syslog.manager.started") + except Exception as exc: + log.warning("syslog.manager.start_failed", {"error": str(exc)}) + + _schedule_startup_phase(app, log, "syslog.manager.start", _delayed_syslog_start) except Exception as e: log.warning("syslog.manager.start_failed", {"error": str(e)}) diff --git a/flocks/server/routes/workflow.py b/flocks/server/routes/workflow.py index a127c2f77..db95920d1 100644 --- a/flocks/server/routes/workflow.py +++ b/flocks/server/routes/workflow.py @@ -465,10 +465,24 @@ async def _run_workflow_execution_task( loop = asyncio.get_running_loop() def _write_progress(update_fields: Dict[str, Any]) -> None: + # Called from the workflow-engine worker thread on every step + # start/complete. Step events for a single execution are issued + # serially by the engine, so no extra lock is needed beyond the + # caller's invariant — but we must still tolerate transient + # ``Storage.read`` failures (e.g. SQLite contention) without + # corrupting ``current`` with a non-dict result. try: - current = asyncio.run_coroutine_threadsafe(Storage.read(exec_key), loop).result(timeout=5) + current = asyncio.run_coroutine_threadsafe( + Storage.read(exec_key), loop + ).result(timeout=5) + if not isinstance(current, dict): + # Execution record was trimmed mid-run or never persisted; + # rebuild a minimal payload so the write still goes through. 
+ current = {"id": exec_id, "workflowId": workflow_id} current.update(update_fields) - asyncio.run_coroutine_threadsafe(Storage.write(exec_key, current), loop).result(timeout=5) + asyncio.run_coroutine_threadsafe( + Storage.write(exec_key, current), loop + ).result(timeout=5) except Exception as exc: log.warning("workflow.step_progress.write_failed", { "exec_id": exec_id, @@ -510,6 +524,11 @@ def _on_step_complete(step_result) -> None: duration = time.time() - start_time current_data = await Storage.read(exec_key) + if not isinstance(current_data, dict): + # Defensive: the execution record could be missing if it was + # trimmed/cleaned up mid-run. Rebuild a baseline so the final + # status write still succeeds rather than blowing up. + current_data = {"id": exec_id, "workflowId": workflow_id} status_value, error_message = _resolve_execution_outcome(result) current_data.update({ "outputResults": result.outputs, @@ -534,6 +553,8 @@ def _on_step_complete(step_result) -> None: except Exception as exc: duration = time.time() - start_time current_data = await Storage.read(exec_key) + if not isinstance(current_data, dict): + current_data = {"id": exec_id, "workflowId": workflow_id} current_data.update({ "status": "cancelled" if cancel_event.is_set() else "error", "finishedAt": int(time.time() * 1000), @@ -855,13 +876,21 @@ async def run_workflow_endpoint(workflow_id: str, req: WorkflowRunRequest): exec_id=exec_id, cancel_event=cancel_event, tool_context=tool_context, - ) + ), + name=f"workflow-run-{exec_id}", ) _active_workflow_executions[exec_id] = ActiveWorkflowExecution( workflow_id=workflow_id, task=task, cancel_event=cancel_event, ) + # Guarantee cleanup of the registry entry even when the task is + # cancelled or fails before reaching its own ``finally`` block (e.g. + # if the event loop is shutting down). This prevents the ``Active*`` + # map from growing forever when tasks are abandoned. 
+ def _cleanup_active(_t: asyncio.Task, _eid: str = exec_id) -> None: + _active_workflow_executions.pop(_eid, None) + task.add_done_callback(_cleanup_active) log.info("workflow.execution.started", { "id": workflow_id, @@ -1113,29 +1142,31 @@ async def get_workflow_history( ): """ Get workflow execution history - + Returns list of recent executions for this workflow. """ try: if not _read_workflow_from_fs(workflow_id): raise HTTPException(status_code=404, detail=f"Workflow not found: {workflow_id}") - all_exec_keys = await Storage.list("workflow_execution/") + # 单次查询批量读取所有 execution 记录,避免 N 次单独 read 导致超长耗时 + all_entries = await Storage.list_entries("workflow_execution/") executions = [] - - for key in all_exec_keys: + for _key, exec_data in all_entries: try: - exec_data = await Storage.read(key) - if exec_data.get("workflowId") == workflow_id: - executions.append(WorkflowExecutionResponse(**exec_data)) + if not isinstance(exec_data, dict): + continue + if exec_data.get("workflowId") != workflow_id: + continue + executions.append(WorkflowExecutionResponse(**exec_data)) except Exception as e: - log.warning("workflow.history.skip", {"key": key, "error": str(e)}) + log.warning("workflow.history.skip", {"key": _key, "error": str(e)}) continue - + # Sort by start time (newest first) and limit executions.sort(key=lambda e: e.startedAt, reverse=True) executions = executions[:limit] - + log.info("workflow.history", {"id": workflow_id, "count": len(executions)}) return executions except HTTPException: diff --git a/flocks/skill/skill.py b/flocks/skill/skill.py index 16242d3fc..1e854b924 100644 --- a/flocks/skill/skill.py +++ b/flocks/skill/skill.py @@ -482,10 +482,18 @@ def start(self) -> None: watcher = self + # Only react to actual content-mutation events. 
watchdog emits + # ``opened``/``closed``/``closed_no_write`` events whenever any code + # (including the skill loader itself) reads ``SKILL.md`` files, which + # would otherwise cause a self-sustaining cache-invalidation loop. + _RELOAD_EVENT_TYPES = frozenset({"modified", "created", "deleted", "moved"}) + class _Handler(FileSystemEventHandler): def on_any_event(self, event: FileSystemEvent): if event.is_directory: return + if getattr(event, "event_type", "") not in _RELOAD_EVENT_TYPES: + return src = getattr(event, "src_path", "") or "" if src.endswith("SKILL.md"): watcher._schedule_clear() diff --git a/flocks/storage/storage.py b/flocks/storage/storage.py index 8573a1864..734883e06 100644 --- a/flocks/storage/storage.py +++ b/flocks/storage/storage.py @@ -447,10 +447,12 @@ async def get(cls, key: str, model: Optional[Type[T]] = None) -> Optional[T | An value_str, value_type = row - if model is not None: + if model is not None and hasattr(model, "model_validate_json"): return model.model_validate_json(value_str) - else: - return json.loads(value_str) + # Fall back to a plain JSON decode when no Pydantic model is supplied + # (or when callers accidentally pass a builtin container type such as + # ``dict``/``list``, which is not a Pydantic model). 
+ return json.loads(value_str) @classmethod async def delete(cls, key: str) -> bool: @@ -538,7 +540,7 @@ async def list_entries( entries: List[Tuple[str, T | Any]] = [] for key, value_str in rows: - if model is not None: + if model is not None and hasattr(model, "model_validate_json"): value = model.model_validate_json(value_str) else: value = json.loads(value_str) diff --git a/flocks/tool/registry.py b/flocks/tool/registry.py index 423035043..be20ec338 100644 --- a/flocks/tool/registry.py +++ b/flocks/tool/registry.py @@ -1095,8 +1095,21 @@ def _consume_tools(items: list, source: str) -> None: for spec in items: # YAML factory produces Tool instances directly if isinstance(spec, Tool): - if spec.info.name in cls._tools: - log.warn("plugin.tool.duplicate", {"source": source, "name": spec.info.name}) + existing = cls._tools.get(spec.info.name) + if existing is not None: + # ``PluginLoader.load_all()`` is invoked by multiple + # subsystems (ToolRegistry, Agent registry, etc.). A + # re-scan that re-encounters the same plugin file is + # idempotent and should not produce a noisy warning; + # only flag genuine name collisions from a different + # source. + existing_source = getattr(existing.info, "source", None) + if existing_source not in (None, "plugin_yaml", "plugin_py"): + log.warn("plugin.tool.duplicate", { + "source": source, + "name": spec.info.name, + "existing_source": existing_source, + }) continue if spec.info.source is None: spec.info.source = "plugin_yaml" @@ -1115,8 +1128,18 @@ def _consume_tools(items: list, source: str) -> None: "spec_keys": list(spec.keys()), }) continue - if name in cls._tools: - log.warn("plugin.tool.duplicate", {"source": source, "name": name}) + existing = cls._tools.get(name) + if existing is not None: + # Idempotent re-scan: same plugin source discovered again + # via another ``PluginLoader.load_all()`` pass. Only warn + # on genuine cross-source collisions. 
+ existing_source = getattr(existing.info, "source", None) + if existing_source not in (None, "plugin_yaml", "plugin_py"): + log.warn("plugin.tool.duplicate", { + "source": source, + "name": name, + "existing_source": existing_source, + }) continue if isinstance(handler, str): @@ -1523,13 +1546,29 @@ def start(self) -> None: watcher = self + # Only react to events that change file CONTENT. watchdog also emits + # ``opened``/``closed``/``closed_no_write`` events whenever any process + # (including this one) reads a YAML/Python file, and ``refresh_plugin_tools`` + # itself opens every plugin tool file on every reload. Listening to those + # access events creates an infinite reload feedback loop where the watcher + # endlessly re-triggers itself every ~debounce-window seconds. + _RELOAD_EVENT_TYPES = frozenset({"modified", "created", "deleted", "moved"}) + class _Handler(FileSystemEventHandler): def on_any_event(self, event: FileSystemEvent) -> None: if event.is_directory: return + if getattr(event, "event_type", "") not in _RELOAD_EVENT_TYPES: + return src = getattr(event, "src_path", "") or "" - if src.endswith(".yaml") or src.endswith(".py"): - watcher._schedule_refresh() + if not (src.endswith(".yaml") or src.endswith(".py")): + return + # Ignore Python bytecode / temp / hidden files that get touched + # during normal imports but never carry plugin definitions. 
+ fname = os.path.basename(src) + if fname.startswith(".") or fname.startswith("_") or "/__pycache__/" in src: + return + watcher._schedule_refresh() handler = _Handler() observer = Observer() diff --git a/flocks/workflow/execution_store.py b/flocks/workflow/execution_store.py index 9d300205a..cf8bc3091 100644 --- a/flocks/workflow/execution_store.py +++ b/flocks/workflow/execution_store.py @@ -2,14 +2,40 @@ from __future__ import annotations +import asyncio import time import uuid from typing import Any, Dict, Optional from flocks.session.recorder import Recorder from flocks.storage.storage import Storage +from flocks.utils.log import Log from flocks.workflow.runner import RunWorkflowResult +log = Log.create(service="workflow.execution_store") + +# Maximum number of execution history records retained per workflow. +# Older records are pruned automatically to prevent a syslog flood from bloating Storage. +_MAX_EXECUTION_HISTORY_PER_WORKFLOW = 500 +# Trim is an O(N) scan over all workflow_execution rows; only run it every Nth +# call per workflow to amortise the cost under high syslog throughput. +_TRIM_CHECK_INTERVAL = 50 +_trim_counters: Dict[str, int] = {} + +# Per-workflow lock to serialize read-modify-write of stats. Concurrent +# executions of the same workflow (e.g. syslog-triggered runs with +# semaphore=8) would otherwise race on ``Storage.read → mutate → write`` +# and silently lose counter increments. 
+_stats_locks: Dict[str, asyncio.Lock] = {} + + +def _get_stats_lock(workflow_id: str) -> asyncio.Lock: + lock = _stats_locks.get(workflow_id) + if lock is None: + lock = asyncio.Lock() + _stats_locks[workflow_id] = lock + return lock + def _workflow_stats_key(workflow_id: str) -> str: return f"workflow/{workflow_id}/stats" @@ -27,25 +53,34 @@ def _workflow_stats_key(workflow_id: str) -> str: async def _update_workflow_stats(workflow_id: str, success: bool, duration: float) -> None: - """Increment workflow call/success/error counters and update avgRuntime.""" - try: - key = _workflow_stats_key(workflow_id) + """Increment workflow call/success/error counters and update avgRuntime. + + Serialised per workflow to keep concurrent updates from clobbering each + other (read → mutate → write race). + """ + lock = _get_stats_lock(workflow_id) + async with lock: try: - stats: Dict[str, Any] = await Storage.read(key) or dict(_DEFAULT_STATS) - except Exception: - stats = dict(_DEFAULT_STATS) - stats["callCount"] = stats.get("callCount", 0) + 1 - if success: - stats["successCount"] = stats.get("successCount", 0) + 1 - else: - stats["errorCount"] = stats.get("errorCount", 0) + 1 - total = stats.get("totalRuntime", 0.0) + duration - stats["totalRuntime"] = total - call_count = stats["callCount"] - stats["avgRuntime"] = (total / call_count) if call_count > 0 else 0.0 - await Storage.write(key, stats) - except Exception: - pass + key = _workflow_stats_key(workflow_id) + try: + stats: Dict[str, Any] = await Storage.read(key) or dict(_DEFAULT_STATS) + except Exception: + stats = dict(_DEFAULT_STATS) + stats["callCount"] = stats.get("callCount", 0) + 1 + if success: + stats["successCount"] = stats.get("successCount", 0) + 1 + else: + stats["errorCount"] = stats.get("errorCount", 0) + 1 + total = stats.get("totalRuntime", 0.0) + duration + stats["totalRuntime"] = total + call_count = stats["callCount"] + stats["avgRuntime"] = (total / call_count) if call_count > 0 else 0.0 + await 
Storage.write(key, stats) + except Exception as exc: + log.warning("workflow.stats.update_failed", { + "workflow_id": workflow_id, + "error": str(exc), + }) def workflow_execution_key(exec_id: str) -> str: @@ -149,11 +184,70 @@ async def record_execution_result( duration = max(0.0, (finished_at - started_at) / 1000.0) await _update_workflow_stats(workflow_id, success, float(duration)) + # Recorder writes to its own SQLite tables and can be slow under load. + # Run it as a background task so the syslog/HTTP dispatcher can release the + # concurrency slot immediately instead of waiting on session-history I/O. try: - await Recorder.record_workflow_execution( - exec_id=exec_id, - workflow_id=workflow_id, - run_result=exec_data, - ) + async def _record_audit() -> None: + try: + await Recorder.record_workflow_execution( + exec_id=exec_id, + workflow_id=workflow_id, + run_result=exec_data, + ) + except Exception as exc: + log.debug("workflow.audit.record_failed", { + "exec_id": exec_id, + "error": str(exc), + }) + + asyncio.create_task(_record_audit(), name=f"audit-{exec_id}") + except RuntimeError: + # No running loop (e.g. unit tests) — best-effort sync fallback. + try: + await Recorder.record_workflow_execution( + exec_id=exec_id, + workflow_id=workflow_id, + run_result=exec_data, + ) + except Exception: + pass + + # Prune old execution records when the per-workflow limit is exceeded. + # Throttled by a per-workflow counter to amortise the O(N) storage scan. + try: + counter = _trim_counters.get(workflow_id, 0) + 1 + _trim_counters[workflow_id] = counter + if counter >= _TRIM_CHECK_INTERVAL: + _trim_counters[workflow_id] = 0 + # Run trim in the background as well; it scans all execution rows + # and we don't want to delay the caller. 
+ try: + asyncio.create_task( + _trim_execution_history(workflow_id), + name=f"trim-{workflow_id}", + ) + except RuntimeError: + await _trim_execution_history(workflow_id) except Exception: pass + + +async def _trim_execution_history(workflow_id: str) -> None: + """Delete the oldest execution records once the per-workflow cap is exceeded.""" + all_entries = await Storage.list_entries("workflow_execution/") + wf_entries = [ + (key, data) + for key, data in all_entries + if isinstance(data, dict) and data.get("workflowId") == workflow_id + ] + if len(wf_entries) <= _MAX_EXECUTION_HISTORY_PER_WORKFLOW: + return + # Sort ascending by startedAt and remove the oldest excess records + wf_entries.sort(key=lambda kd: kd[1].get("startedAt", 0)) + excess = len(wf_entries) - _MAX_EXECUTION_HISTORY_PER_WORKFLOW + for key, _ in wf_entries[:excess]: + try: + await Storage.remove(key) + except Exception: + pass From 9c53a5647865affb5c1f2bfa0a8fe0cc07c92f0a Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Thu, 14 May 2026 11:40:38 +0800 Subject: [PATCH 40/41] chore(workflows): drop legacy dedup workflows and rewrite tdp_alert_triage - Remove four obsolete workflows that have been superseded by the current alert-triage pipeline: alert_dedup_triage, http_alert_dedup, stream_alert_dedup, tdp_alert_pull_dedup. - Rewrite tdp_alert_triage workflow (definition + docs) as the canonical "NDR/TDP alert investigation" workflow, replacing the previous HTTP-log oriented variant. 
Co-authored-by: Cursor --- .../alert_dedup_triage/workflow.json | 87 --- .../workflows/alert_dedup_triage/workflow.md | 146 ---- .../workflows/http_alert_dedup/meta.json | 10 - .../workflows/http_alert_dedup/workflow.json | 74 -- .../workflows/http_alert_dedup/workflow.md | 118 --- .../stream_alert_dedup/workflow.json | 84 --- .../workflows/stream_alert_dedup/workflow.md | 138 ---- .../tdp_alert_pull_dedup/_build_workflow.py | 93 --- .../_node_pull_dedup_loop.py | 670 ------------------ .../tdp_alert_pull_dedup/workflow.json | 48 -- .../tdp_alert_pull_dedup/workflow.md | 219 ------ .../workflows/tdp_alert_triage/workflow.json | 159 ++--- .../workflows/tdp_alert_triage/workflow.md | 218 +++--- 13 files changed, 161 insertions(+), 1903 deletions(-) delete mode 100644 .flocks/plugins/workflows/alert_dedup_triage/workflow.json delete mode 100644 .flocks/plugins/workflows/alert_dedup_triage/workflow.md delete mode 100644 .flocks/plugins/workflows/http_alert_dedup/meta.json delete mode 100644 .flocks/plugins/workflows/http_alert_dedup/workflow.json delete mode 100644 .flocks/plugins/workflows/http_alert_dedup/workflow.md delete mode 100644 .flocks/plugins/workflows/stream_alert_dedup/workflow.json delete mode 100644 .flocks/plugins/workflows/stream_alert_dedup/workflow.md delete mode 100644 .flocks/plugins/workflows/tdp_alert_pull_dedup/_build_workflow.py delete mode 100644 .flocks/plugins/workflows/tdp_alert_pull_dedup/_node_pull_dedup_loop.py delete mode 100644 .flocks/plugins/workflows/tdp_alert_pull_dedup/workflow.json delete mode 100644 .flocks/plugins/workflows/tdp_alert_pull_dedup/workflow.md diff --git a/.flocks/plugins/workflows/alert_dedup_triage/workflow.json b/.flocks/plugins/workflows/alert_dedup_triage/workflow.json deleted file mode 100644 index d252426e6..000000000 --- a/.flocks/plugins/workflows/alert_dedup_triage/workflow.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "name": "alert_dedup_triage", - "description": "Chained pipeline: http_alert_dedup -> 
tdp_alert_triage. Supports syslog (real-time single alert), alerts list (batch), or alert_file path. Deduplicates with MinHash LSH; runs LLM triage on first-seen unique alerts; returns cached triage for duplicates. Syslog mode returns triage result directly; batch mode generates a full summary report.", - "description_cn": "去重+研判串联工作流。支持三种输入模式:syslog 实时单条、alerts 批次列表、alert_file 文件路径。syslog 模式下跳过汇总节点,直接返回单条研判结果;批次模式生成完整汇总报告。重复告警从 triage_cache.pkl 中回填历史研判结果(FIFO LRU,max_dedup_keys 可调)。", - "start": "receive_alerts", - "nodes": [ - { - "id": "receive_alerts", - "type": "python", - "description": "Parse incoming alerts (syslog_message / alerts list / alert_file). Resolves source_log_type in priority order: (1) explicit input param, (2) syslog app_name/hostname hint (contains 'tdp' or 'skyeye'), (3) JSON field auto-detection (TDP: nested net dict/behave_uuid; Skyeye: uri/vuln_name/attack_result), (4) default 'tdp'. Logs detection reason for traceability.", - "code": "\nimport json\nimport os\n\n# Input priority: syslog_message > alerts > alert_file\nalerts_input = []\ninput_mode = 'unknown'\n_syslog_msg = None\n\nsyslog_msg = inputs.get('syslog_message') or inputs.get('syslog')\nif syslog_msg and isinstance(syslog_msg, dict):\n raw_text = syslog_msg.get('message', '')\n if raw_text:\n try:\n alert = json.loads(raw_text)\n alert['_syslog_meta'] = {\n 'hostname': syslog_msg.get('hostname', ''),\n 'app_name': syslog_msg.get('app_name', ''),\n 'timestamp': syslog_msg.get('timestamp', ''),\n 'severity': syslog_msg.get('severity'),\n 'facility': syslog_msg.get('facility'),\n 'format': syslog_msg.get('format', ''),\n }\n alerts_input = [alert]\n input_mode = 'syslog'\n _syslog_msg = syslog_msg\n print(f'[receive] syslog mode: host={syslog_msg.get(\"hostname\")!r} '\n f'app={syslog_msg.get(\"app_name\")!r} '\n f'severity={syslog_msg.get(\"severity\")} '\n f'format={syslog_msg.get(\"format\")!r}')\n except (json.JSONDecodeError, TypeError) as _e:\n print(f'[receive] WARNING: 
syslog.message not valid JSON ({_e}), '\n f'raw={raw_text[:120]!r}')\n else:\n print('[receive] WARNING: syslog_message.message is empty, skipping')\n\nif not alerts_input:\n alerts_input = inputs.get('alerts', inputs.get('alert_list', []))\n if alerts_input:\n input_mode = 'alerts'\n\nif not alerts_input:\n alert_file = inputs.get('alert_file', '')\n if alert_file:\n alert_file = os.path.expanduser(str(alert_file))\n try:\n with open(alert_file, 'r', encoding='utf-8') as _f:\n alerts_input = json.load(_f)\n input_mode = 'alert_file'\n print(f'[receive] file mode: loaded {len(alerts_input) if isinstance(alerts_input, list) else 1} alerts')\n except Exception as _e:\n print(f'[receive] WARNING: failed to load alert_file={alert_file!r}: {_e}')\n alerts_input = []\n\nif isinstance(alerts_input, str):\n try:\n alerts_input = json.loads(alerts_input)\n except Exception:\n alerts_input = []\nif isinstance(alerts_input, dict) and 'data' in alerts_input:\n alerts_input = alerts_input.get('data', [])\nif not isinstance(alerts_input, list):\n alerts_input = [alerts_input] if alerts_input else []\n\nif not alerts_input:\n print('[receive] WARNING: no alerts (syslog_message, alerts, alert_file all empty)')\n\n# ── Source log type resolution ────────────────────────────────────────────────\n# Priority:\n# 1. Explicit input param source_log_type (always respected)\n# 2. Syslog metadata: app_name / hostname contains 'skyeye' or 'tdp'\n# 3. JSON field signature on first alert:\n# TDP -> nested 'net' dict, 'behave_uuid', 'flow_id'\n# Skyeye -> 'uri', 'vuln_name', 'attack_result', 'attack_flag'\n# 4. 
Default 'tdp'\n\ndef _detect_from_syslog_meta(sm):\n for field in ('app_name', 'hostname'):\n val = str(sm.get(field, '') or '').lower()\n if 'skyeye' in val:\n return 'skyeye', f'syslog.{field}={sm.get(field)!r}'\n if 'tdp' in val:\n return 'tdp', f'syslog.{field}={sm.get(field)!r}'\n return None, None\n\ndef _detect_from_alert_json(alert):\n if not isinstance(alert, dict):\n return None, None\n if isinstance(alert.get('net'), dict):\n return 'tdp', 'alert has nested net dict (TDP)'\n if any(k in alert for k in ('behave_uuid', 'flow_id')):\n return 'tdp', 'alert has behave_uuid/flow_id (TDP)'\n if any(k in alert for k in ('uri', 'vuln_name', 'attack_result', 'attack_flag')):\n return 'skyeye', 'alert has uri/vuln_name/attack_result (Skyeye)'\n return None, None\n\nexplicit_type = str(inputs.get('source_log_type', '') or '').lower()\nif explicit_type in ('tdp', 'skyeye'):\n source_log_type = explicit_type\n source_log_type_reason = 'explicit input parameter'\nelif input_mode == 'syslog' and _syslog_msg:\n source_log_type, reason = _detect_from_syslog_meta(_syslog_msg)\n if source_log_type:\n source_log_type_reason = f'syslog metadata: {reason}'\n else:\n first_alert = alerts_input[0] if alerts_input else {}\n source_log_type, reason = _detect_from_alert_json(first_alert)\n if source_log_type:\n source_log_type_reason = f'JSON field detection: {reason}'\n else:\n source_log_type = 'tdp'\n source_log_type_reason = 'fallback default (no hint in syslog meta or alert JSON)'\nelse:\n source_log_type = 'tdp'\n source_log_type_reason = 'default'\n\nprint(f'[receive] source_log_type={source_log_type!r} reason={source_log_type_reason!r}')\n\nfilter_enabled = bool(inputs.get('filter_enabled', True))\ndedup_enabled = bool(inputs.get('dedup_enabled', True))\nthreshold = float(inputs.get('threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_field_len = 
int(inputs.get('max_field_len', 500))\nmax_dedup_keys = int(inputs.get('max_dedup_keys', 100000))\nif max_dedup_keys < 1:\n max_dedup_keys = 100000\n\nprint(f'[receive] input_mode={input_mode} raw_alerts={len(alerts_input)} max_dedup_keys={max_dedup_keys}')\n\noutputs['raw_alerts'] = alerts_input\noutputs['input_mode'] = input_mode\noutputs['source_log_type'] = source_log_type\noutputs['source_log_type_reason'] = source_log_type_reason\noutputs['filter_enabled'] = filter_enabled\noutputs['dedup_enabled'] = dedup_enabled\noutputs['threshold'] = threshold\noutputs['strict_fields'] = strict_fields\noutputs['lsh_fields'] = lsh_fields\noutputs['max_field_len'] = max_field_len\noutputs['max_dedup_keys'] = max_dedup_keys\noutputs['dedup_workflow_id'] = inputs.get('dedup_workflow_id', 'http_alert_dedup')\noutputs['triage_workflow_id'] = inputs.get('triage_workflow_id', 'tdp_alert_triage')\n" - }, - { - "id": "dedup_and_triage", - "type": "python", - "description": "Core loop: for each alert, invoke http_alert_dedup in-process (no published service required) to deduplicate; for first-seen unique alerts invoke tdp_alert_triage in-process for LLM triage. 
Duplicate alerts are served from triage_cache.pkl (FIFO LRU, max_dedup_keys configurable).", - "code": "\nimport json\nimport os\nimport pickle\nimport sys\nimport time\n\nIS_WINDOWS = sys.platform == 'win32'\nif IS_WINDOWS:\n import msvcrt # noqa: F401\nelse:\n import fcntl # noqa: F401\n\nWORKFLOW_NAME = 'alert_dedup_triage'\n\n# ── Triage cache helpers ──────────────────────────────────────────────────────\n\ndef _triage_cache_path():\n from flocks.config import Config\n flocks_root = Config().get_global().data_dir.parent\n state_dir = flocks_root / 'workspace' / 'workflows' / WORKFLOW_NAME\n state_dir.mkdir(parents=True, exist_ok=True)\n return str(state_dir / 'triage_cache.pkl'), str(state_dir / 'triage_cache.lock')\n\ndef _acquire_lock(lock_path):\n fh = open(lock_path, 'w+')\n if IS_WINDOWS:\n fh.write('L'); fh.flush(); fh.seek(0)\n while True:\n try:\n msvcrt.locking(fh.fileno(), msvcrt.LK_LOCK, 1); break\n except OSError:\n continue\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_EX)\n return fh\n\ndef _release_lock(fh):\n try:\n if IS_WINDOWS:\n try:\n fh.seek(0); msvcrt.locking(fh.fileno(), msvcrt.LK_UNLCK, 1)\n except OSError:\n pass\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_UN)\n finally:\n fh.close()\n\ndef _load_triage_cache(cache_path):\n if not os.path.exists(cache_path) or os.path.getsize(cache_path) == 0:\n return {}\n try:\n with open(cache_path, 'rb') as f:\n c = pickle.load(f)\n if not isinstance(c, dict):\n return {}\n print(f'[triage_cache] loaded {len(c)} entries from {cache_path}')\n return c\n except Exception as e:\n print(f'[triage_cache] failed to load ({e}), starting fresh')\n return {}\n\ndef _save_triage_cache(cache_path, cache):\n tmp = cache_path + '.tmp'\n try:\n with open(tmp, 'wb') as f:\n pickle.dump(cache, f)\n f.flush()\n os.fsync(f.fileno())\n os.replace(tmp, cache_path)\n print(f'[triage_cache] saved {len(cache)} entries -> {cache_path}')\n except Exception as e:\n print(f'[triage_cache] failed to save: {e}')\n if 
os.path.exists(tmp):\n try: os.remove(tmp)\n except Exception: pass\n\ndef _evict_cache(cache, max_keys):\n excess = len(cache) - max_keys\n if excess > 0:\n for k in list(cache.keys())[:excess]:\n del cache[k]\n return excess\n return 0\n\n# ── Embedded sub-workflow invocation (no HTTP, no published service needed) ──\n\ndef _find_workflow_path(workflow_id):\n from flocks.workflow.fs_store import workflow_scan_dirs\n for root, _ in workflow_scan_dirs():\n p = root / workflow_id / 'workflow.json'\n if p.exists():\n return p\n return None\n\ndef _invoke_workflow(workflow_id, wf_inputs, timeout_s):\n from flocks.workflow.runner import run_workflow as _run_wf\n t0 = time.time()\n try:\n wf_path = _find_workflow_path(workflow_id)\n if not wf_path:\n return None, round((time.time() - t0) * 1000), f'Workflow not found: {workflow_id!r}'\n result = _run_wf(\n workflow=wf_path,\n inputs=wf_inputs,\n timeout_s=float(timeout_s),\n ensure_requirements=False,\n )\n ms = round((time.time() - t0) * 1000)\n if result.status == 'SUCCEEDED':\n return {'status': 'SUCCEEDED', 'outputs': result.outputs}, ms, None\n else:\n return None, ms, result.error or f'status={result.status}'\n except Exception as e:\n return None, round((time.time() - t0) * 1000), str(e)\n\n# ── Main ─────────────────────────────────────────────────────────────────────\n\nraw_alerts = inputs.get('raw_alerts', [])\nsource_log_type = inputs.get('source_log_type', 'tdp')\nfilter_enabled = inputs.get('filter_enabled', True)\ndedup_enabled = inputs.get('dedup_enabled', True)\nthreshold = inputs.get('threshold', 0.7)\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_field_len = inputs.get('max_field_len', 500)\nmax_dedup_keys = int(inputs.get('max_dedup_keys', 100000))\ndedup_wf_id = inputs.get('dedup_workflow_id', 'http_alert_dedup')\ntriage_wf_id = inputs.get('triage_workflow_id', 'tdp_alert_triage')\ntriage_timeout_s = 
int(inputs.get('triage_timeout_s', 300))\ndedup_timeout_s = int(inputs.get('dedup_timeout_s', 60))\n\ndedup_base_inputs = {\n 'source_log_type': source_log_type,\n 'filter_enabled': filter_enabled,\n 'dedup_enabled': dedup_enabled,\n 'threshold': threshold,\n 'strict_fields': strict_fields,\n 'lsh_fields': lsh_fields,\n 'max_field_len': max_field_len,\n 'max_dedup_keys': max_dedup_keys,\n}\n\ncache_path, lock_path = _triage_cache_path()\nlock_fh = _acquire_lock(lock_path)\ntry:\n triage_cache = _load_triage_cache(cache_path)\nfinally:\n _release_lock(lock_fh)\n\nresults = []\ntriage_results = []\ncache_dirty = False\n\nstats = {\n 'total_input': len(raw_alerts),\n 'dedup_failed': 0,\n 'filtered_out': 0,\n 'duplicate_skipped': 0,\n 'duplicate_with_triage': 0,\n 'triage_invoked': 0,\n 'triage_success': 0,\n 'triage_failed': 0,\n 'verdict_counts': {},\n}\n\nfor i, alert in enumerate(raw_alerts):\n entry = {\n 'alert_index': i,\n 'alert_id': alert.get('id') or alert.get('uuid') or alert.get('behave_uuid'),\n 'threat_name': (alert.get('threat') or {}).get('name', ''),\n }\n\n # Step 1: dedup via embedded http_alert_dedup workflow\n dr, dms, derr = _invoke_workflow(\n dedup_wf_id,\n {**dedup_base_inputs, 'alerts': [alert]},\n dedup_timeout_s,\n )\n if derr or not dr or dr.get('status') != 'SUCCEEDED':\n stats['dedup_failed'] += 1\n entry.update({'stage': 'dedup_failed', 'dedup_error': derr or dr, 'dedup_ms': dms})\n results.append(entry)\n print(f' [{i+1}/{len(raw_alerts)}] dedup FAILED ({dms}ms) {str(derr or \"\")[:80]}')\n continue\n\n dout = dr.get('outputs', {})\n unique_alerts = dout.get('unique_alerts', [])\n dstats = dout.get('stats', {})\n entry['dedup_ms'] = dms\n entry['filter_removed'] = dstats.get('filter_removed_count', 0)\n entry['lsh_clusters'] = dstats.get('lsh_total_clusters')\n entry['lsh_dedup_keys'] = dstats.get('lsh_total_dedup_keys')\n\n if not unique_alerts:\n stats['filtered_out'] += 1\n entry.update({'stage': 'filtered_out', 'triage': None})\n 
results.append(entry)\n print(f' [{i+1}/{len(raw_alerts)}] - filtered_out ({dms}ms)')\n continue\n\n already = bool(unique_alerts[0].get('dedup_key_already_exists'))\n dedup_key = unique_alerts[0].get('dedup_key', '')\n entry['dedup_key'] = dedup_key\n\n if already:\n cached_triage = triage_cache.get(dedup_key)\n if cached_triage:\n stats['duplicate_with_triage'] += 1\n verdict = cached_triage.get('attack_verdict', 'unknown')\n stats['verdict_counts'][verdict] = stats['verdict_counts'].get(verdict, 0) + 1\n entry.update({'stage': 'duplicate_with_triage', 'triage': cached_triage})\n results.append(entry)\n triage_results.append({**entry, **cached_triage})\n print(f' [{i+1}/{len(raw_alerts)}] duplicate+cached ({dms}ms) key={dedup_key[:8]} verdict={verdict}')\n else:\n stats['duplicate_skipped'] += 1\n entry.update({'stage': 'duplicate_skipped', 'triage': None})\n results.append(entry)\n print(f' [{i+1}/{len(raw_alerts)}] - duplicate (no cache) ({dms}ms) key={dedup_key[:8]}')\n continue\n\n # Step 2: triage via embedded tdp_alert_triage workflow (unique alerts only)\n stats['triage_invoked'] += 1\n tr, tms, terr = _invoke_workflow(\n triage_wf_id,\n {'alert_data': alert},\n triage_timeout_s,\n )\n if terr or not tr or tr.get('status') != 'SUCCEEDED':\n stats['triage_failed'] += 1\n entry.update({'stage': 'triage_failed', 'triage_ms': tms,\n 'triage_error': terr or tr})\n results.append(entry)\n print(f' [{i+1}/{len(raw_alerts)}] dedup OK + triage FAILED ({dms}+{tms}ms) {str(terr or \"\")[:80]}')\n continue\n\n stats['triage_success'] += 1\n tout = tr.get('outputs', {})\n verdict = tout.get('attack_verdict', 'unknown')\n stats['verdict_counts'][verdict] = stats['verdict_counts'].get(verdict, 0) + 1\n\n triage_info = {\n 'attack_verdict': verdict,\n 'risk_level': tout.get('risk_level'),\n 'report_title': tout.get('report_title'),\n 'report_path': tout.get('report_path'),\n 'final_report': tout.get('final_report', ''),\n }\n if dedup_key in triage_cache:\n del 
triage_cache[dedup_key]\n triage_cache[dedup_key] = triage_info\n cache_dirty = True\n\n entry.update({'stage': 'triage_done', 'triage_ms': tms, 'triage': triage_info})\n results.append(entry)\n triage_results.append({**entry, **triage_info})\n print(f' [{i+1}/{len(raw_alerts)}] dedup+triage OK ({dms}+{tms}ms) verdict={verdict} title={(tout.get(\"report_title\") or \"\")[:40]}')\n\nif cache_dirty:\n lock_fh = _acquire_lock(lock_path)\n try:\n evicted = _evict_cache(triage_cache, max_dedup_keys)\n if evicted:\n print(f'[triage_cache] LRU eviction: dropped {evicted} entries (max={max_dedup_keys})')\n _save_triage_cache(cache_path, triage_cache)\n finally:\n _release_lock(lock_fh)\n\nstats['triage_cache_size'] = len(triage_cache)\nprint(f'[pipeline] stats={json.dumps(stats, ensure_ascii=False)}')\n\noutputs['results'] = results\noutputs['triage_results'] = triage_results\noutputs['stats'] = stats\n" - }, - { - "id": "branch_output_mode", - "type": "branch", - "select_key": "input_mode", - "description": "Route output based on input_mode: syslog (single alert) -> direct_output; batch (alerts list or file) -> generate_summary." - }, - { - "id": "direct_output", - "type": "python", - "description": "Syslog / single-alert output: extract triage result directly without generating a full summary table. 
Outputs are field-compatible with generate_summary.", - "code": "\nimport json\nimport datetime\n\nresults = inputs.get('results', [])\ntriage_results = inputs.get('triage_results', [])\nstats = inputs.get('stats', {})\n\n# In syslog mode there is exactly one alert; pull its triage info directly.\nentry = results[0] if results else {}\ntriage = entry.get('triage') or {}\n\nattack_verdict = triage.get('attack_verdict', '')\nrisk_level = triage.get('risk_level', '')\nreport_title = triage.get('report_title', '')\nfinal_report = triage.get('final_report', '')\nreport_path = triage.get('report_path', '')\nstage = entry.get('stage', '')\n\n# Minimal one-liner summary (no full table needed for a single alert).\nverdict_label = {\n 'attack_success': 'Attack Success',\n 'attack_failed': 'Attack Failed',\n 'attack': 'Attack',\n 'unknown': 'Unknown',\n 'benign': 'Benign',\n}\nsummary_report = (\n f'# {report_title or \"Alert Triage Result\"}\\n\\n'\n f'**Date**: {datetime.date.today().isoformat()} '\n f'**Stage**: {stage} '\n f'**Verdict**: {verdict_label.get(attack_verdict, attack_verdict or \"-\")} '\n f'**Risk**: {risk_level or \"-\"}\\n'\n)\n\nprint(f'[direct_output] stage={stage} verdict={attack_verdict} title={report_title[:40]!r}')\n\noutputs['final_reports'] = [final_report] if final_report else []\noutputs['triage_results'] = triage_results\noutputs['results'] = results\noutputs['stats'] = stats\noutputs['summary_report'] = summary_report\noutputs['report_path'] = report_path\noutputs['final_report'] = final_report\noutputs['report_title'] = report_title\noutputs['attack_verdict'] = attack_verdict\noutputs['risk_level'] = risk_level\n" - }, - { - "id": "generate_summary", - "type": "python", - "description": "Aggregate all triage results, write pipeline_summary.md, and expose the highest-risk alert's triage labels (attack_verdict, risk_level, report_title, final_report) for downstream compatibility with tdp_alert_triage outputs.", - "code": "\nimport json\nimport 
datetime\n\nresults = inputs.get('results', [])\ntriage_results = inputs.get('triage_results', [])\nstats = inputs.get('stats', {})\n\nVERDICT_ORDER = {'attack_success': 5, 'attack': 4, 'attack_failed': 3, 'unknown': 2, 'benign': 1}\n\n# Pick the highest-risk triage result (includes duplicates with cached results).\ntop = None\nfor r in triage_results:\n if top is None:\n top = r\n elif VERDICT_ORDER.get(r.get('attack_verdict', ''), 0) > VERDICT_ORDER.get(top.get('attack_verdict', ''), 0):\n top = r\n\nfinal_report = top.get('final_report', '') if top else ''\nreport_title = top.get('report_title', '') if top else ''\nreport_path = top.get('report_path', '') if top else ''\nattack_verdict = top.get('attack_verdict', '') if top else ''\nrisk_level = top.get('risk_level', '') if top else ''\n\ntoday = datetime.date.today().isoformat()\nverdict_label = {\n 'attack_success': 'Attack Success',\n 'attack_failed': 'Attack Failed',\n 'attack': 'Attack',\n 'unknown': 'Unknown',\n 'benign': 'Benign',\n}\nstage_label = {\n 'triage_done': 'Triaged',\n 'duplicate_with_triage': 'Duplicate (cached)',\n 'duplicate_skipped': 'Duplicate (skipped)',\n 'filtered_out': 'Filtered',\n 'dedup_failed': 'Dedup Failed',\n 'triage_failed': 'Triage Failed',\n}\n\nrows = []\nfor r in results:\n stage = r.get('stage', '')\n triage = r.get('triage')\n verdict = triage.get('attack_verdict', '-') if isinstance(triage, dict) else '-'\n title = triage.get('report_title', '-') if isinstance(triage, dict) else '-'\n cache_mark = '' # stage_label already includes cache indicator\n rows.append(\n f\"| {r.get('alert_index', 0) + 1} \"\n f\"| {(r.get('threat_name') or '')[:30]} \"\n f\"| {stage_label.get(stage, stage)}{cache_mark} \"\n f\"| {verdict_label.get(verdict, verdict)} \"\n f\"| {(title or '')[:30]} |\"\n )\n\nsummary_md = (\n f'# Alert Dedup & Triage Summary\\n\\n'\n f'**Date**: {today}\\n\\n'\n f'## Statistics\\n'\n f'- Total input: {stats.get(\"total_input\", 0)}\\n'\n f'- New triages: 
{stats.get(\"triage_success\", 0)}\\n'\n f'- Duplicates (cached triage): {stats.get(\"duplicate_with_triage\", 0)}\\n'\n f'- Duplicates (no cache): {stats.get(\"duplicate_skipped\", 0)}\\n'\n f'- Filtered out: {stats.get(\"filtered_out\", 0)}\\n'\n f'- Triage failed: {stats.get(\"triage_failed\", 0)}\\n'\n f'- Triage cache size: {stats.get(\"triage_cache_size\", 0)}\\n'\n f'- Verdict distribution: {json.dumps(stats.get(\"verdict_counts\", {}), ensure_ascii=False)}\\n\\n'\n f'## Details\\n\\n'\n f'| # | Threat | Stage | Verdict | Report Title |\\n'\n f'|---|--------|-------|---------|--------------|\\n'\n + '\\n'.join(rows) + '\\n\\n'\n + (f'## Top-Risk Alert Report\\n\\n{final_report}\\n' if final_report else '')\n)\n\ntry:\n out_path = get_path('pipeline_summary.md')\n import os\n os.makedirs(os.path.dirname(str(out_path)), exist_ok=True)\n with open(str(out_path), 'w', encoding='utf-8') as _f:\n _f.write(summary_md)\n summary_path = str(out_path)\n print(f'[summary] written to {summary_path}')\nexcept Exception as _e:\n print(f'[summary] WARNING: could not write summary file: {_e}')\n summary_path = ''\n\nprint(f'[summary] triage_success={stats.get(\"triage_success\", 0)}, '\n f'duplicate_with_triage={stats.get(\"duplicate_with_triage\", 0)}, '\n f'top_verdict={attack_verdict}')\n\noutputs['final_reports'] = [r.get('triage', {}).get('final_report', '') for r in triage_results if isinstance(r.get('triage'), dict)]\noutputs['triage_results'] = triage_results\noutputs['stats'] = stats\noutputs['summary_report'] = summary_md\noutputs['report_path'] = summary_path\noutputs['final_report'] = final_report\noutputs['report_title'] = report_title\noutputs['attack_verdict'] = attack_verdict\noutputs['risk_level'] = risk_level\n" - } - ], - "edges": [ - { - "from": "receive_alerts", - "to": "dedup_and_triage", - "order": 0 - }, - { - "from": "dedup_and_triage", - "to": "branch_output_mode", - "order": 0 - }, - { - "from": "branch_output_mode", - "to": "direct_output", - 
"order": 0, - "label": "syslog" - }, - { - "from": "branch_output_mode", - "to": "generate_summary", - "order": 1 - } - ], - "metadata": { - "node_timeout_s": 7200, - "sampleInputs": { - "source_log_type": "tdp", - "filter_enabled": true, - "dedup_enabled": true, - "threshold": 0.7, - "triage_timeout_s": 300, - "_comment_alerts": "Pass 'alerts' (list) or 'alert_file' (path to JSON file)", - "max_dedup_keys": 100000, - "_comment_syslog": "Syslog mode: configure via POST /api/workflow/{id}/syslog-config {enabled:true, protocol:'udp', port:5140, inputKey:'syslog_message'}. The syslog listener parses RFC3164/5424 and injects the result as 'syslog_message'; TDP alert JSON must be in the syslog message body.", - "syslog_message": { - "raw": "<134>May 10 16:00:00 tdp-sensor tdp: {\"id\":\"AZtRkZkzj\",\"net\":{...}}", - "facility": 16, - "severity": 6, - "timestamp": "2026-05-10T16:00:00", - "hostname": "tdp-sensor", - "app_name": "tdp", - "message": "{\"id\":\"AZtRkZkzj\",\"net\":{\"http\":{\"url\":\"/admin\"}},\"threat\":{\"name\":\"SQL注入\"}}", - "format": "rfc3164" - }, - "dedup_workflow_id": "http_alert_dedup", - "triage_workflow_id": "tdp_alert_triage", - "_comment_ids": "Sub-workflow IDs used for embedded invocation. No published service required - the engine locates and runs these workflows directly." - } - } -} \ No newline at end of file diff --git a/.flocks/plugins/workflows/alert_dedup_triage/workflow.md b/.flocks/plugins/workflows/alert_dedup_triage/workflow.md deleted file mode 100644 index 06cec5866..000000000 --- a/.flocks/plugins/workflows/alert_dedup_triage/workflow.md +++ /dev/null @@ -1,146 +0,0 @@ -# 告警去重研判串联工作流 - -## 业务场景 - -将 `http_alert_dedup`(MinHash LSH 去重)与 `tdp_alert_triage`(LLM 研判)串联为单一工作流: - -1. 接收 TDP/HTTP 原始告警(支持 syslog 实时单条 / 批量列表 / 文件三种模式) -2. 逐条调用 http_alert_dedup 服务去重:跨批次已见告警直接跳过,节省研判算力 -3. 对首次出现的唯一告警调用 tdp_alert_triage 服务进行 LLM 研判(测绘/CVE/payload 并行) -4. 
聚合所有结果输出汇总报告及最高风险告警的研判详情 - -## 流程结构 - -``` -receive_alerts (解析输入:syslog_message / alerts 列表 / alert_file 文件) - ↓ -dedup_and_triage (逐条去重 → 唯一告警 → 研判 → 缓存回填) - ↓ -generate_summary (聚合输出,写 pipeline_summary.md) -``` - -### 内部循环逻辑(dedup_and_triage) - -``` -for each raw_alert: - POST /invoke → http_alert_dedup (port 19000) - ├─ filtered_out → 跳过(非 HTTP / 扫描告警) - ├─ duplicate_with_triage → 回填历史研判缓存(triage_cache.pkl) - ├─ duplicate_skipped → 跳过(跨批次已见且无缓存) - └─ unique → POST /invoke → tdp_alert_triage (port 19001) - ↓ - collect & persist triage result -``` - -## 节点详情 - -### 1. `receive_alerts` - -支持三种输入模式(优先级由高到低): - -| 模式 | 触发条件 | 说明 | -|------|---------|------| -| **syslog** | `syslog_message` 字段存在 | flocks syslog 监听器注入,TDP 告警 JSON 在 `.message` 字段;syslog 元数据附加到告警的 `_syslog_meta` 字段 | -| **alerts** | `alerts` 或 `alert_list` 字段为非空列表 | 批量调用,直接传入告警列表 | -| **alert_file** | `alert_file` 为本地 JSON 文件路径 | 离线测试 / 批处理场景 | - -- 提取去重配置:`source_log_type`、`filter_enabled`、`dedup_enabled`、`threshold` -- 支持通过 `dedup_service_url` / `triage_service_url` 输入字段覆盖服务地址 - -### 2. `dedup_and_triage` -逐条处理每条原始告警: -- 单条 POST 到 `http_alert_dedup` 服务(保持跨批次 LSH 状态持久化) -- 根据返回的 `dedup_key_already_exists` 判断是否为首次出现 -- 仅对首次出现的告警用**原始 raw alert**(而非归一化字段)调用 `tdp_alert_triage` -- 对所有告警记录处理阶段:`filtered_out` / `duplicate_skipped` / `triage_done` / `dedup_failed` / `triage_failed` - -### 3. 
`generate_summary` -- 聚合所有研判结果,按 `attack_verdict` 风险级别排序 -- 生成 Markdown 汇总表(明细 + 统计)+ 最高风险告警的完整报告 -- 落盘到 `~/.flocks/workspace/outputs//artifacts/pipeline_summary.md` -- 主要输出字段与 `tdp_alert_triage` 兼容(`attack_verdict`、`risk_level`、`report_title`、`final_report`),方便单告警场景直接对接下游 - -## 输入参数 - -| 字段 | 类型 | 默认 | 说明 | -|------|------|------|------| -| `syslog_message` | dict | — | syslog 解析结果(由 flocks 监听器注入),TDP JSON 在 `.message` 字段 | -| `alerts` | list | — | 原始告警列表(与 alert_file 二选一) | -| `alert_file` | string | — | JSON 文件路径(替代 alerts 列表) | -| `source_log_type` | string | `"tdp"` | 日志类型(`tdp` / `skyeye`) | -| `filter_enabled` | bool | `true` | 是否启用告警过滤 | -| `dedup_enabled` | bool | `true` | 是否启用去重(含持久化) | -| `threshold` | float | `0.7` | LSH Jaccard 相似度阈值 | -| `max_dedup_keys` | int | `100000` | LSH hash + 研判缓存最大条数,超出后 FIFO 淘汰 | -| `dedup_service_url` | string | `http://127.0.0.1:19000` | http_alert_dedup 服务地址 | -| `triage_service_url` | string | `http://127.0.0.1:19001` | tdp_alert_triage 服务地址 | -| `triage_timeout_s` | int | `300` | 单条研判超时秒数 | -| `dedup_timeout_s` | int | `60` | 单条去重超时秒数 | - -## 输出参数 - -| 字段 | 类型 | 说明 | -|------|------|------| -| `final_report` | string | 最高风险告警的完整 Markdown 报告 | -| `report_title` | string | 最高风险告警的标题 | -| `attack_verdict` | enum | 最高风险告警的判定标签 | -| `risk_level` | enum | 最高风险告警的风险等级 | -| `final_reports` | list | 所有研判成功告警的报告列表 | -| `triage_results` | list | 所有研判成功告警的详情 | -| `summary_report` | string | 汇总 Markdown(统计 + 明细表) | -| `report_path` | string | `pipeline_summary.md` 落盘路径 | -| `stats` | dict | 处理统计(total/filtered/dedup/triage 各计数) | - -## Syslog 接入配置 - -flocks 内置了 RFC 3164 / RFC 5424 syslog 监听器(UDP + TCP,默认端口 5140)。只需通过 API 为本工作流开启监听,即可实现 TDP 实时告警接入。 - -### 启用 syslog 监听 - -```bash -curl -X POST http://127.0.0.1:8000/api/workflow/alert_dedup_triage/syslog-config \ - -H "Content-Type: application/json" \ - -H "Authorization: Bearer " \ - -d '{ - "enabled": true, - "protocol": "udp", - "host": "0.0.0.0", - "port": 5140, - "inputKey": 
"syslog_message" - }' -``` - -| 参数 | 说明 | -|------|------| -| `protocol` | `udp`(推荐)或 `tcp` | -| `port` | syslog 监听端口(默认 5140) | -| `inputKey` | 注入到工作流 inputs 的键名,本工作流固定读取 `syslog_message` | - -### TDP 设备 syslog 转发格式 - -TDP 传感器/探针将告警以 syslog 方式推送时,**消息体(MSG 字段)必须是合法 JSON 格式的 TDP 告警对象**,例如: - -``` -<134>May 10 16:00:00 tdp-sensor tdp: {"id":"AZtRk...","net":{"http":{"url":"/admin"}},"threat":{"name":"SQL注入"}} -``` - -- `receive_alerts` 节点会从 `syslog_message.message` 字段提取并解析该 JSON -- syslog 元数据(`hostname`、`severity`、`timestamp` 等)附加到告警的 `_syslog_meta` 字段,可供后续节点溯源但不参与去重计算 - -### 查询当前配置 - -```bash -curl http://127.0.0.1:8000/api/workflow/alert_dedup_triage/syslog-config \ - -H "Authorization: Bearer " -``` - ---- - -## 工程要点 - -- **三种输入模式**:syslog 实时单条(最高优先级)→ alerts 批次列表 → alert_file 文件,`receive_alerts` 自动检测并切换,`input_mode` 字段记录实际生效模式 -- **跨批次去重**:`dedup_and_triage` 每次单条调用 dedup 服务,LSH 状态持久化在 `~/.flocks/workspace/workflows/http_alert_dedup/` 下,syslog 实时模式与批次模式共享同一 LSH 状态 -- **研判缓存回填**:重复告警从 `~/.flocks/workspace/workflows/alert_dedup_triage/triage_cache.pkl` 读取历史研判结果(`stage=duplicate_with_triage`),实时 syslog 模式下可做到秒级响应 -- **原始告警传给研判**:triage 接收的是原始 raw alert(保留嵌套 `net.http.*` / `threat.*` 字段),syslog 模式下包含 `_syslog_meta` 附加字段 -- **节点超时**:`node_timeout_s = 7200`,留出足够余量处理大批量告警(每条研判约 50s × N 条) -- **输出兼容性**:`generate_summary` 的主要输出字段与 `tdp_alert_triage` 相同,单告警(syslog)场景下可无缝替换 diff --git a/.flocks/plugins/workflows/http_alert_dedup/meta.json b/.flocks/plugins/workflows/http_alert_dedup/meta.json deleted file mode 100644 index a9e5e0be8..000000000 --- a/.flocks/plugins/workflows/http_alert_dedup/meta.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "name": "http_alert_dedup", - "description": "网络告警去重 Pipeline:归一化(TDP/Skyeye 字段映射)→ 过滤(剔除扫描/非 HTTP 告警)→ 去重(URI 归一化 + 5-gram Jaccard 相似度聚类)。输入 dict,输出 dict(deduped_alerts / unique_alerts / stats)。", - "category": "security", - "status": "active", - "createdBy": null, - "createdAt": 1746691200000, - "updatedAt": 1746777600000, - "id": 
"http_alert_dedup" -} diff --git a/.flocks/plugins/workflows/http_alert_dedup/workflow.json b/.flocks/plugins/workflows/http_alert_dedup/workflow.json deleted file mode 100644 index df630592d..000000000 --- a/.flocks/plugins/workflows/http_alert_dedup/workflow.json +++ /dev/null @@ -1,74 +0,0 @@ -{ - "name": "http_alert_dedup", - "description": "Network alert deduplication pipeline: normalize (per-alert TDP/Skyeye auto-detection + field mapping, mixed batches supported) -> filter (remove scans / non-HTTP) -> dedup (URI normalization + 5-gram Jaccard MinHash LSH). Returns deduped_alerts, unique_alerts and stats.", - "description_cn": "网络告警去重 Pipeline:归一化(按每条告警自动识别 TDP/Skyeye 并字段映射,支持单批次混合)→ 过滤(剔除扫描/非 HTTP 告警)→ 去重(URI 归一化 + 5-gram MinHash LSH + dedup_key 持久化,FIFO LRU 上限默认 10W、可通过 max_dedup_keys 调整)。输入 dict,输出 dict(deduped_alerts / unique_alerts / stats)。", - "start": "receive_alerts", - "nodes": [ - { - "id": "receive_alerts", - "type": "python", - "description": "Parse input (alerts list / alert_file path), extract pipeline configuration, and forward source_log_type as a batch-level hint used by normalize when per-alert field detection is inconclusive.", - "code": "\nimport json\nimport os\n\nalerts_input = inputs.get('alerts', inputs.get('alert_list', []))\n\n# Support alert_file: load JSON from a local file path when alerts list is not given directly.\nif not alerts_input:\n alert_file = inputs.get('alert_file', '')\n if alert_file:\n alert_file = os.path.expanduser(str(alert_file))\n try:\n with open(alert_file, 'r', encoding='utf-8') as _f:\n alerts_input = json.load(_f)\n print(f'[receive] loaded {len(alerts_input) if isinstance(alerts_input, list) else 1} alerts from file: {alert_file}')\n except Exception as _e:\n print(f'[receive] WARNING: failed to load alert_file={alert_file!r}: {_e}')\n alerts_input = []\n\nif isinstance(alerts_input, str):\n try:\n alerts_input = json.loads(alerts_input)\n except Exception:\n alerts_input = []\nif 
isinstance(alerts_input, dict) and 'data' in alerts_input:\n alerts_input = alerts_input.get('data', [])\nif not isinstance(alerts_input, list):\n alerts_input = [alerts_input] if alerts_input else []\n\nsource_log_type = str(inputs.get('source_log_type', 'tdp')).lower()\nif source_log_type not in ('tdp', 'skyeye'):\n source_log_type = 'tdp'\n\nfilter_enabled = bool(inputs.get('filter_enabled', True))\ndedup_enabled = bool(inputs.get('dedup_enabled', True))\ndedup_threshold = float(inputs.get('threshold', inputs.get('dedup_threshold', 0.7)))\nstrict_fields = inputs.get('strict_fields', inputs.get('dedup_fields_strict', ['sip', 'dip']))\nlsh_fields = inputs.get('lsh_fields', inputs.get('dedup_fields_lsh', ['req_http_url', 'req_body', 'rsp_body']))\nmax_field_len = int(inputs.get('max_field_len', 500))\n# Maximum dedup_keys (and LSH clusters) to keep in persisted state.\n# When the cache grows beyond this limit, oldest entries are evicted in FIFO\n# order on the next dedup run. Default 100,000 — tunable per request.\nmax_dedup_keys = int(inputs.get('max_dedup_keys', 100000))\nif max_dedup_keys < 1:\n max_dedup_keys = 100000\n\nif not isinstance(strict_fields, list) or not strict_fields:\n strict_fields = ['sip', 'dip']\nif not isinstance(lsh_fields, list) or not lsh_fields:\n lsh_fields = ['req_http_url', 'req_body', 'rsp_body']\n\nprint(f'[receive] source_log_type={source_log_type}, total={len(alerts_input)}, max_dedup_keys={max_dedup_keys}')\n\noutputs['raw_alerts'] = alerts_input\noutputs['stats'] = {'raw_count': len(alerts_input)}\noutputs['source_log_type'] = source_log_type\noutputs['filter_enabled'] = filter_enabled\noutputs['dedup_enabled'] = dedup_enabled\noutputs['dedup_threshold'] = dedup_threshold\noutputs['strict_fields'] = strict_fields\noutputs['lsh_fields'] = lsh_fields\noutputs['max_field_len'] = max_field_len\noutputs['max_dedup_keys'] = max_dedup_keys\n" - }, - { - "id": "normalize", - "type": "python", - "description": "Normalize TDP and Skyeye 
alerts into a unified schema — supports mixed batches. Each alert is individually classified via field signatures: TDP (nested net dict / behave_uuid / net_real_src_ip) or Skyeye (uri / vuln_name / attack_result / attack_flag). Falls back to the batch-level source_log_type hint (or tdp default) when no signature is found.", - "code": "\nimport uuid\n\nHTTP_METHODS = ['GET', 'POST', 'PUT', 'DELETE', 'HEAD', 'OPTIONS', 'PATCH', 'TRACE']\n\nTDP_FIELD_MAP = {\n 'customer_uuid': 'customer_uuid',\n 'device_id': 'device_id',\n 'id': 'id',\n 'time': 'time',\n 'direction': 'direction',\n 'sip': 'net_real_src_ip',\n 'dip': 'net_dest_ip',\n 'sport': 'net_src_port',\n 'dport': 'net_dest_port',\n 'net_type': 'net_type',\n 'net_app_proto': 'net_app_proto',\n 'req_http_url': 'net_http_url',\n 'req_user_agent':'net_http_reqs_user_agent',\n 'req_host': 'net_http_reqs_host',\n 'req_line': 'net_http_reqs_line',\n 'req_header': 'net_http_reqs_header',\n 'req_body': 'net_http_reqs_body',\n 'req_cookie': 'net_http_reqs_cookie',\n 'req_body_len': 'net_http_reqs_content_length',\n 'rsp_status_code': 'net_http_status',\n 'rsp_line': 'net_http_resp_line',\n 'rsp_header': 'net_http_resp_header',\n 'rsp_body': 'net_http_resp_body',\n 'rsp_body_len': 'net_http_resp_content_length',\n 'net_bytes_toclient': 'net_bytes_toclient',\n 'net_bytes_toserver': 'net_bytes_toserver',\n 'threat_rule_id': 'threat_suuid',\n 'threat_name': 'threat_name',\n 'threat_msg': 'threat_msg',\n 'threat_ioc': 'threat_ioc',\n 'threat_level': 'threat_level',\n 'threat_severity': 'threat_severity',\n 'threat_phase': 'threat_phase',\n 'threat_type': 'threat_type',\n 'threat_result': 'threat_result',\n 'threat_confidence': 'threat_confidence',\n 'connection_established': 'established',\n 'asset_group_name': 'dest_assets_group_name',\n 'asset_name': 'dest_assets_latestName',\n}\n\nSKYEYE_FIELD_MAP = {\n 'id': 'none',\n 'time': 'time',\n 'direction': 'none',\n 'sip': 'sip',\n 'dip': 'dip',\n 'sport': 'sport',\n 'dport': 
'dport',\n 'net_type': 'none',\n 'net_app_proto': 'none',\n 'req_http_url': 'uri',\n 'req_user_agent':'agent',\n 'req_host': 'host',\n 'req_line': 'none',\n 'req_header': 'req_header',\n 'req_body': 'req_body',\n 'req_cookie': 'none',\n 'req_body_len': 'none',\n 'rsp_status_code': 'rsp_status',\n 'rsp_line': 'none',\n 'rsp_header': 'rsp_header',\n 'rsp_body': 'rsp_body',\n 'rsp_body_len': 'rsp_body_len',\n 'threat_rule_id': 'rule_id',\n 'threat_name': 'vuln_name',\n 'threat_msg': 'vuln_desc',\n 'threat_ioc': 'none',\n 'threat_level': 'none',\n 'threat_severity': 'severity',\n 'threat_phase': 'none',\n 'threat_type': 'vuln_type',\n 'threat_tactic_id': 'attck_tactic',\n 'threat_technique_id': 'attck_tech',\n 'threat_result': 'attack_result',\n 'threat_confidence': 'confidence',\n 'connection_established': 'established',\n 'real_attack': 'attack_flag',\n}\n\ndef flatten_dict(d, prefix=''):\n res = {}\n for k, v in d.items():\n if isinstance(v, dict):\n res.update(flatten_dict(v, f'{prefix}{k}_'))\n else:\n res[f'{prefix}{k}'] = v\n return res\n\ndef make_uuid(norm):\n return str(uuid.uuid3(uuid.NAMESPACE_DNS, ''.join(str(v) for v in norm.values())))\n\ndef detect_alert_type(alert, batch_hint):\n # Per-alert source type detection.\n # Priority: JSON field signatures > batch_hint fallback.\n #\n # TDP signatures (raw or pre-flattened):\n # - nested 'net' dict (e.g. 
net.http.url)\n # - 'behave_uuid' or 'flow_id' key\n # - pre-flattened TDP keys: 'net_real_src_ip', 'net_http_url', 'threat_suuid'\n #\n # Skyeye signatures:\n # - 'uri', 'vuln_name', 'attack_result', 'attack_flag'\n if isinstance(alert.get('net'), dict):\n return 'tdp'\n if any(k in alert for k in ('behave_uuid', 'flow_id')):\n return 'tdp'\n if any(k in alert for k in ('net_real_src_ip', 'net_http_url', 'threat_suuid')):\n return 'tdp'\n if any(k in alert for k in ('uri', 'vuln_name', 'attack_result', 'attack_flag')):\n return 'skyeye'\n # No clear signature: use batch-level hint (or default 'tdp')\n return batch_hint\n\ndef normalize_single(alert, source_type):\n flat = flatten_dict(alert)\n field_map = TDP_FIELD_MAP if source_type == 'tdp' else SKYEYE_FIELD_MAP\n norm = {}\n for std_key, raw_key in field_map.items():\n norm[std_key] = flat.get(raw_key, 'none') if raw_key != 'none' else 'none'\n if norm.get('id') in ('none', None, ''):\n norm['id'] = make_uuid(norm)\n if norm.get('net_type') in ('none', None, ''):\n method = flat.get('method', 'none')\n norm['net_type'] = 'http' if method in HTTP_METHODS else ('none' if method == 'none' else 'other')\n norm['_source_type'] = source_type # carry detection result for downstream use\n return norm\n\nraw_alerts = inputs.get('raw_alerts', [])\nstats = dict(inputs.get('stats', {}))\n# batch_hint: explicit caller param or 'tdp'. Used only when per-alert\n# field detection is inconclusive (e.g. 
already-normalised data).\nbatch_hint = str(inputs.get('source_log_type', 'tdp') or 'tdp').lower()\nif batch_hint not in ('tdp', 'skyeye'):\n batch_hint = 'tdp'\n\ntype_counts = {'tdp': 0, 'skyeye': 0}\nnormalized = []\nfor alert in raw_alerts:\n src_type = detect_alert_type(alert, batch_hint)\n type_counts[src_type] = type_counts.get(src_type, 0) + 1\n normalized.append(normalize_single(alert, src_type))\n\nstats['normalized_count'] = len(normalized)\nstats['normalize_type_counts'] = type_counts\nprint(f'[normalize] {len(raw_alerts)} alerts -> {len(normalized)} normalized '\n f'(tdp={type_counts.get(\"tdp\",0)}, skyeye={type_counts.get(\"skyeye\",0)}, '\n f'batch_hint={batch_hint!r})')\n\noutputs['normalized_alerts'] = normalized\noutputs['stats'] = stats\nfor k in ['source_log_type', 'filter_enabled', 'dedup_enabled',\n 'dedup_threshold', 'strict_fields', 'lsh_fields', 'max_field_len',\n 'max_dedup_keys']:\n outputs[k] = inputs.get(k)\n" - }, - { - "id": "filter_logs", - "type": "python", - "description": "Filter alerts using per-alert _source_type (mixed batches supported). Classifies into 9 process_types and keeps non-scan + HTTP alerts (direction in/out/lateral). For Skyeye, direction is normalised to inbound.", - "code": "\nnormalized_alerts = inputs.get('normalized_alerts', [])\nfilter_enabled = inputs.get('filter_enabled', True)\nbatch_hint = str(inputs.get('source_log_type', 'tdp') or 'tdp').lower()\nif batch_hint not in ('tdp', 'skyeye'):\n batch_hint = 'tdp'\nstats = dict(inputs.get('stats', {}))\n\ndef is_scan_alert(threat_name):\n tnl = str(threat_name or '').lower()\n return ('扫描' in tnl) and ('webshell' not in tnl)\n\ndef get_threat_type(alert):\n # Per-alert source detection: use the _source_type tag added by normalize\n # node (written even when fallback was used). Defaults to batch_hint when\n # the field is missing for any reason (e.g. 
legacy upstream).\n src = alert.get('_source_type') or batch_hint\n if src == 'skyeye':\n return str(alert.get('threat_type', 'general') or 'general')\n return str(alert.get('threat_name', 'general') or 'general')\n\ndef is_http(alert):\n for field in ('application_layer_protocol', 'net_type', 'net_app_proto'):\n val = str(alert.get(field, '') or '').lower()\n if val and val != 'none' and 'http' in val:\n return True\n return False\n\ndef get_process_type(alert):\n src = alert.get('_source_type') or batch_hint\n threat_name = alert.get('threat_name', '')\n direction = str(alert.get('direction', '') or '').lower()\n scan = is_scan_alert(threat_name)\n http = is_http(alert)\n if src == 'skyeye':\n # Skyeye direction is not reliable; assume inbound for routing.\n return 'alert_scan_direction_in' if scan else 'alert_not_scan_http_direction_in'\n if scan:\n return f'alert_scan_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_scan_direction_in'\n if http:\n return f'alert_not_scan_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_scan_http_direction_in'\n return f'alert_not_scan_not_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_process'\n\nNEED_ANALYSIS = {\n 'alert_not_scan_http_direction_in',\n 'alert_not_scan_http_direction_out',\n 'alert_not_scan_http_direction_lateral',\n}\n\nfiltered = []\nprocess_type_counts = {}\nfor alert in normalized_alerts:\n alert = dict(alert)\n if filter_enabled:\n ptype = get_process_type(alert)\n need = ptype in NEED_ANALYSIS\n threat_type = get_threat_type(alert)\n else:\n ptype = 'filter_disabled'\n need = True\n threat_type = get_threat_type(alert)\n process_type_counts[ptype] = process_type_counts.get(ptype, 0) + 1\n alert['_process_type'] = ptype\n alert['_threat_type'] = threat_type\n if need:\n filtered.append(alert)\n\nprint(f'[filter] input={len(normalized_alerts)}, kept={len(filtered)}')\nprint(f'[filter] 
process_type_counts={process_type_counts}')\n\nstats['after_filter_count'] = len(filtered)\nstats['filter_removed_count'] = len(normalized_alerts) - len(filtered)\nstats['filter_process_type_counts'] = process_type_counts\n\noutputs['filtered_alerts'] = filtered\noutputs['stats'] = stats\nfor k in ['dedup_enabled', 'dedup_threshold', 'strict_fields', 'lsh_fields',\n 'max_field_len', 'max_dedup_keys']:\n outputs[k] = inputs.get(k)\n" - }, - { - "id": "dedup_logs", - "type": "python", - "description": "Step 3 — Dedup (terminal): URI normalization + MinHash LSH (datasketch, 128 perms, 5-gram shingles). LSH index + dedup_key cache are persisted to ~/.flocks/workspace/workflows/http_alert_dedup/ with atomic write and a cross-platform exclusive file lock (POSIX fcntl / Windows msvcrt). FIFO LRU eviction enforces max_dedup_keys (default 100,000, tunable via inputs.max_dedup_keys); cluster_id is monotonically allocated to survive eviction. Survives restarts; safe for concurrent runs. When dedup_enabled=False, no disk state is read or written.", - "code": "\nimport os\nimport re\nimport sys\nimport pickle\nimport hashlib\nfrom datasketch import MinHash, MinHashLSH\nIS_WINDOWS = sys.platform == 'win32'\nif IS_WINDOWS:\n import msvcrt # noqa: F401\nelse:\n import fcntl # noqa: F401\n\nMINHASH_SEED = 2024\nNUM_PERM = 128\nWORKFLOW_NAME = 'http_alert_dedup'\nLSH_CLUSTER_WARN_THRESHOLD = 100000 # Warn when persisted cluster or dedup-key count exceeds this.\n\ndef normalize_uri(uri):\n # Normalize dynamic segments in URIs to reduce noise before shingling.\n uri = str(uri or '')\n uri = re.sub(r'\\d{4}-\\d{2}-\\d{2}', 'DATETIME', uri)\n uri = re.sub(r'[\\da-f]{8}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{12}', 'UUID', uri, flags=re.IGNORECASE)\n uri = re.sub(r'(\\.\\./)+', 'TRAVERSAL', uri)\n uri = re.sub(r'\\bNULL\\b', 'NULL_REPLACED', uri)\n uri = re.sub(r'chr\\$\\d+\\$\\|\\|chr\\$\\d+\\$', 'CHR_SEQUENCE', uri)\n uri = re.sub(r'\\b\\d+={1,2}\\d+\\b', 'NUMBER_COMPARISON', 
uri)\n uri = re.sub(r'\\b[a-fA-F0-9]{32}\\b', 'HEXADECIMAL CHARACTERS', uri)\n return uri\n\ndef gen_minhash(text, permutations):\n # Build a MinHash signature from 5-gram shingles of the text.\n shingles = [text[i:i+5] for i in range(len(text) - 4)]\n m = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED, permutations=permutations)\n for s in shingles:\n m.update(s.encode('utf-8'))\n return m\n\ndef get_state_paths(threshold):\n from flocks.config import Config\n flocks_root = Config().get_global().data_dir.parent # ~/.flocks\n state_dir = str(flocks_root / 'workspace' / 'workflows' / WORKFLOW_NAME)\n os.makedirs(state_dir, exist_ok=True)\n base = os.path.join(state_dir, f'lsh_state_np{NUM_PERM}_th{int(threshold * 100)}')\n return base + '.pkl', base + '.lock'\n\ndef acquire_lock(lock_path):\n fh = open(lock_path, 'w+')\n if IS_WINDOWS:\n fh.write('L')\n fh.flush()\n fh.seek(0)\n while True:\n try:\n msvcrt.locking(fh.fileno(), msvcrt.LK_LOCK, 1)\n break\n except OSError:\n continue\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_EX)\n return fh\n\ndef release_lock(fh):\n try:\n if IS_WINDOWS:\n try:\n fh.seek(0)\n msvcrt.locking(fh.fileno(), msvcrt.LK_UNLCK, 1)\n except OSError:\n pass\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_UN)\n finally:\n fh.close()\n\ndef load_state(state_path, threshold):\n # Returns (lsh_index, lsh_cache_dict, dedup_key_cache_dict, next_cluster_id).\n # On any error or parameter mismatch, returns (None, None, None, 0).\n # Backward-compat: legacy state stored dedup_key_cache as a set; we coerce to dict.\n if not os.path.exists(state_path) or os.path.getsize(state_path) == 0:\n return None, None, None, 0\n try:\n with open(state_path, 'rb') as f:\n state = pickle.load(f)\n if state.get('num_perm') != NUM_PERM or state.get('threshold') != threshold:\n print(f'[dedup] state params mismatch (stored np={state.get(\"num_perm\")}, th={state.get(\"threshold\")}), starting fresh')\n return None, None, None, 0\n cache = state['lsh_cache']\n # Coerce 
dedup_key_cache to dict (insertion-ordered) for FIFO eviction.\n seen_raw = state.get('dedup_key_cache', {})\n if isinstance(seen_raw, set):\n # Legacy format: set has no order. Best-effort: treat as one batch.\n seen = {k: None for k in seen_raw}\n elif isinstance(seen_raw, dict):\n seen = seen_raw\n else:\n seen = {}\n # next_cluster_id: prefer stored, otherwise derive from max key (legacy state).\n next_cid = state.get('next_cluster_id')\n if next_cid is None:\n next_cid = (max(cache.keys()) + 1) if cache else 0\n print(f'[dedup] loaded state: {len(cache)} clusters, {len(seen)} dedup_keys, next_cid={next_cid} from {state_path}')\n return state['lsh_index'], cache, seen, next_cid\n except Exception as e:\n print(f'[dedup] failed to load state ({e}), starting fresh')\n return None, None, None, 0\n\ndef evict_oldest(lsh_index, lsh_cache, dedup_key_cache, max_keys):\n # FIFO eviction to enforce max_keys upper bound on both caches.\n # Returns (evicted_keys, evicted_clusters) for logging.\n evicted_keys = 0\n evicted_clusters = 0\n excess_keys = len(dedup_key_cache) - max_keys\n if excess_keys > 0:\n # dict preserves insertion order in Python 3.7+; iterate to get oldest.\n old_keys = list(dedup_key_cache.keys())[:excess_keys]\n for k in old_keys:\n del dedup_key_cache[k]\n evicted_keys = excess_keys\n excess_clusters = len(lsh_cache) - max_keys\n if excess_clusters > 0:\n old_cids = list(lsh_cache.keys())[:excess_clusters]\n for cid in old_cids:\n # Drop from LSH band index *and* the cache dict, otherwise lsh_index.query\n # would return cluster_ids missing from lsh_cache and trigger KeyError.\n try:\n lsh_index.remove(cid)\n except (KeyError, ValueError):\n pass\n del lsh_cache[cid]\n evicted_clusters = excess_clusters\n return evicted_keys, evicted_clusters\n\ndef dump_state_atomic(state_path, lsh_index, lsh_cache, dedup_key_cache, threshold, next_cluster_id):\n tmp = state_path + '.tmp'\n try:\n state = {\n 'lsh_index': lsh_index,\n 'lsh_cache': lsh_cache,\n 
'dedup_key_cache': dedup_key_cache,\n 'next_cluster_id': next_cluster_id,\n 'num_perm': NUM_PERM,\n 'threshold': threshold,\n }\n with open(tmp, 'wb') as f:\n pickle.dump(state, f)\n f.flush()\n os.fsync(f.fileno())\n os.replace(tmp, state_path)\n print(f'[dedup] state saved: {len(lsh_cache)} clusters, {len(dedup_key_cache)} dedup_keys -> {state_path}')\n except Exception as e:\n print(f'[dedup] failed to save state: {e}')\n if os.path.exists(tmp):\n try:\n os.remove(tmp)\n except Exception:\n pass\n\nfiltered_alerts = inputs.get('filtered_alerts', [])\ndedup_enabled = inputs.get('dedup_enabled', True)\nthreshold = float(inputs.get('dedup_threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_len = int(inputs.get('max_field_len', 500))\nmax_dedup_keys = int(inputs.get('max_dedup_keys', 100000))\nif max_dedup_keys < 1:\n max_dedup_keys = 100000\nstats = dict(inputs.get('stats', {}))\n\n_permutations = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED).permutations\n\nstate_path, lock_path = get_state_paths(threshold)\nlock_fh = acquire_lock(lock_path) if dedup_enabled else None\n\nevicted_keys = 0\nevicted_clusters = 0\n\ntry:\n # lsh_index: MinHashLSH band/row index — O(1) approximate candidate lookup.\n # lsh_cache: cluster_id -> MinHash, used for exact Jaccard re-ranking.\n # dedup_key_cache: ordered dict of MD5 dedup_keys ever seen.\n # next_cluster_id: monotonic counter; survives eviction so cluster_ids never collide.\n if dedup_enabled:\n lsh_index, lsh_cache, dedup_key_cache, next_cluster_id = load_state(state_path, threshold)\n if lsh_index is None:\n lsh_index = MinHashLSH(threshold=threshold, num_perm=NUM_PERM)\n lsh_cache = {}\n dedup_key_cache = {}\n next_cluster_id = 0\n else:\n lsh_index, lsh_cache, dedup_key_cache, next_cluster_id = None, {}, {}, 0\n\n # next_cluster_id wrapped in a 1-element list so the closure can mutate it.\n # Workflow scripts 
run at module-top; nonlocal/global doesn't reach this scope.\n _cid_box = [next_cluster_id]\n def query_most_similar(minhash):\n sim_keys = lsh_index.query(minhash)\n if sim_keys:\n candidates = sim_keys[:100]\n sims = [minhash.jaccard(lsh_cache[k]) for k in candidates]\n return candidates[sims.index(max(sims))]\n cluster_id = _cid_box[0]\n _cid_box[0] += 1\n lsh_index.insert(cluster_id, minhash)\n lsh_cache[cluster_id] = minhash\n return cluster_id\n\n keyed = []\n for alert in filtered_alerts:\n alert = dict(alert)\n text_strict = '. '.join(str(alert.get(f, ''))[:max_len] for f in strict_fields)\n text_lsh = normalize_uri('. '.join(str(alert.get(f, ''))[:max_len] for f in lsh_fields))\n\n if not dedup_enabled:\n dk = hashlib.md5(f'{text_strict}. {text_lsh}'.encode('utf-8')).hexdigest()\n alert['_lsh_cluster_id'] = None\n alert['dedup_key'] = dk\n alert['dedup_key_already_exists'] = dk in dedup_key_cache\n dedup_key_cache[dk] = None\n keyed.append(alert)\n continue\n\n mh = gen_minhash(text_lsh.lower(), _permutations)\n cluster_id = query_most_similar(mh)\n alert['_lsh_cluster_id'] = cluster_id\n\n dk = hashlib.md5(f'{text_strict}. 
{cluster_id}'.encode('utf-8')).hexdigest()\n alert['dedup_key'] = dk\n # Cross-batch awareness: dedup_key_cache loaded from disk at start.\n # Re-insert refreshes insertion order so recently-seen keys survive eviction longer.\n already = dk in dedup_key_cache\n if already:\n del dedup_key_cache[dk]\n dedup_key_cache[dk] = None\n alert['dedup_key_already_exists'] = already\n keyed.append(alert)\n\n if dedup_enabled:\n # Enforce max_dedup_keys upper bound before persisting.\n evicted_keys, evicted_clusters = evict_oldest(\n lsh_index, lsh_cache, dedup_key_cache, max_dedup_keys\n )\n if evicted_keys or evicted_clusters:\n print(f'[dedup] LRU eviction (max_dedup_keys={max_dedup_keys}): '\n f'dropped {evicted_keys} keys, {evicted_clusters} clusters')\n if len(lsh_cache) > LSH_CLUSTER_WARN_THRESHOLD or len(dedup_key_cache) > LSH_CLUSTER_WARN_THRESHOLD:\n print(f'[dedup] WARNING: persisted state holds {len(lsh_cache)} clusters '\n f'and {len(dedup_key_cache)} dedup_keys (warn={LSH_CLUSTER_WARN_THRESHOLD}); '\n f'consider raising max_dedup_keys or rotating state file at {state_path}')\n dump_state_atomic(state_path, lsh_index, lsh_cache, dedup_key_cache, threshold, _cid_box[0])\nfinally:\n if lock_fh is not None:\n release_lock(lock_fh)\n\nseen = {}\nunique_alerts = []\nfor a in keyed:\n k = a['dedup_key']\n if k not in seen:\n seen[k] = a\n unique_alerts.append(a)\n\ndup_count = len(keyed) - len(unique_alerts)\nprint(f'[dedup] input={len(filtered_alerts)}, unique_clusters={len(unique_alerts)}, deduped={dup_count}')\n\nstats['after_dedup_count'] = len(keyed)\nstats['unique_key_count'] = len(unique_alerts)\nstats['dedup_removed_count'] = dup_count\nstats['dedup_ratio'] = round(dup_count / len(keyed), 4) if keyed else 0.0\nstats['dedup_state_persisted'] = bool(dedup_enabled)\nif dedup_enabled:\n stats['lsh_total_clusters'] = len(lsh_cache)\n stats['lsh_total_dedup_keys'] = len(dedup_key_cache)\n stats['lsh_state_path'] = state_path\n stats['lsh_max_dedup_keys'] = 
max_dedup_keys\n stats['lsh_evicted_keys'] = evicted_keys\n stats['lsh_evicted_clusters'] = evicted_clusters\n\nif dedup_enabled:\n summary = (\n f'http_alert_dedup done: raw={stats.get(\"raw_count\", 0)}'\n f' -> normalized={stats.get(\"normalized_count\", 0)}'\n f' -> filtered={stats.get(\"after_filter_count\", 0)}'\n f' -> unique_clusters={len(unique_alerts)} (compression {stats[\"dedup_ratio\"]:.1%})'\n f' | persisted_clusters={len(lsh_cache)}, persisted_keys={len(dedup_key_cache)}, max={max_dedup_keys}'\n )\nelse:\n summary = (\n f'http_alert_dedup done (dedup_enabled=False, no state persisted): '\n f'raw={stats.get(\"raw_count\", 0)}'\n f' -> normalized={stats.get(\"normalized_count\", 0)}'\n f' -> filtered={stats.get(\"after_filter_count\", 0)}'\n f' -> unique={len(unique_alerts)} (in-batch only)'\n )\nprint(f'[dedup] {summary}')\n\noutputs['deduped_alerts'] = keyed\noutputs['unique_alerts'] = unique_alerts\noutputs['stats'] = stats\noutputs['dedup_summary'] = summary\n" - } - ], - "edges": [ - { - "from": "receive_alerts", - "to": "normalize", - "order": 0 - }, - { - "from": "normalize", - "to": "filter_logs", - "order": 0 - }, - { - "from": "filter_logs", - "to": "dedup_logs", - "order": 0 - } - ], - "metadata": { - "node_timeout_s": 300, - "sampleInputs": { - "source_log_type": "tdp", - "filter_enabled": true, - "dedup_enabled": true, - "threshold": 0.7, - "_comment_alert_file": "Alternative to 'alerts': pass a JSON file path, e.g. alert_file: '/Users/foo/Downloads/tdp_logs.json'", - "alerts": [ - { - "net_real_src_ip": "1.2.3.4", - "net_dest_ip": "10.0.0.1", - "direction": "in", - "net_type": "http", - "net_http_url": "/admin/login.php?id=1 OR 1=1", - "net_http_reqs_body": "username=admin&password=123456", - "net_http_resp_body": "root@localhost", - "threat_name": "SQL注入攻击", - "threat_type": "web攻击" - } - ], - "max_dedup_keys": 100000, - "_comment_source_log_type": "Optional. Explicit type hint used only when per-alert field detection is inconclusive. 
Values: 'tdp' (default) or 'skyeye'. Mixed batches are handled automatically." - } - } -} \ No newline at end of file diff --git a/.flocks/plugins/workflows/http_alert_dedup/workflow.md b/.flocks/plugins/workflows/http_alert_dedup/workflow.md deleted file mode 100644 index eb272c3d2..000000000 --- a/.flocks/plugins/workflows/http_alert_dedup/workflow.md +++ /dev/null @@ -1,118 +0,0 @@ -# http_alert_dedup - -网络告警去重 Pipeline,三阶段处理:**归一化 → 过滤 → 去重**。 - -输入 `dict`(原始告警列表 + 配置),输出 `dict`(去重后的告警 + 统计信息),不调用 LLM。 - -## 工作流图 - -``` -receive_alerts - │ -branch_log_type - ├─ tdp ─→ normalize_tdp - └─ skyeye ─→ normalize_skyeye - │ - filter_logs - │ - dedup_logs ◀── 终点,输出 dict -``` - -## 输入参数 - -| 参数 | 类型 | 默认值 | 说明 | -|------|------|--------|------| -| `alerts` | `list[dict]` | — | 原始告警列表(必填) | -| `source_log_type` | `str` | `"tdp"` | 日志来源类型,`"tdp"` 或 `"skyeye"` | -| `filter_enabled` | `bool` | `true` | 是否启用过滤阶段 | -| `dedup_enabled` | `bool` | `true` | 是否启用去重阶段(false 时每条告警独立分配 key) | -| `threshold` | `float` | `0.7` | Jaccard 相似度阈值(0–1) | -| `strict_fields` | `list[str]` | `["sip","dip"]` | 严格匹配字段(需完全相同才参与模糊聚类) | -| `lsh_fields` | `list[str]` | `["req_http_url","req_body","rsp_body"]` | 模糊匹配字段(URI 归一化 + Jaccard) | -| `max_field_len` | `int` | `500` | 单字段截断长度 | - -## 输出参数(终点节点 `dedup_logs` 的 outputs) - -| 字段 | 类型 | 说明 | -|------|------|------| -| `deduped_alerts` | `list[dict]` | 全量告警(经过滤),每条含 `dedup_key`(MD5)和 `dedup_key_already_exists`(是否重复) | -| `unique_alerts` | `list[dict]` | 每个 dedup_key 的代表性告警(去重后唯一集合) | -| `stats` | `dict` | 各阶段统计(见下表) | -| `dedup_summary` | `str` | 一行文字摘要 | - -### stats 字段 - -| 字段 | 说明 | -|------|------| -| `raw_count` | 原始输入告警数 | -| `normalized_count` | 归一化后告警数 | -| `after_filter_count` | 过滤后保留数 | -| `filter_removed_count` | 过滤剔除数 | -| `filter_process_type_counts` | 各 process_type 计数 dict | -| `after_dedup_count` | 去重后告警总数(等于 after_filter_count) | -| `unique_key_count` | 唯一 dedup_key 数(簇数) | -| `dedup_removed_count` | 去重压缩的重复条数 | -| `dedup_ratio` | 
压缩率(dedup_removed / after_dedup) | - -## 节点说明 - -### receive_alerts -解析输入,支持 `alerts` / `alert_list` 键,支持 JSON 字符串或 `{"data": [...]}` 包装格式,提取 Pipeline 配置参数。 - -### branch_log_type -按 `source_log_type` 路由:`"tdp"` → `normalize_tdp`,`"skyeye"` → `normalize_skyeye`。 - -### normalize_tdp / normalize_skyeye -字段映射,将各来源的原始字段统一为标准字段(`sip`/`dip`/`req_http_url`/`req_body`/`rsp_body`/`threat_name` 等)。对缺失 `id` 的告警使用 UUID v3 生成。 - -**TDP 关键映射(部分)** - -| 标准字段 | TDP 原始字段 | -|----------|-------------| -| `sip` | `net_real_src_ip` | -| `dip` | `net_dest_ip` | -| `req_http_url` | `net_http_url` | -| `req_body` | `net_http_reqs_body` | -| `rsp_body` | `net_http_resp_body` | -| `threat_name` | `threat_name` | - -**Skyeye 关键映射(部分)** - -| 标准字段 | Skyeye 原始字段 | -|----------|----------------| -| `req_http_url` | `uri` | -| `threat_name` | `vuln_name` | -| `threat_type` | `vuln_type` | -| `threat_result` | `attack_result` | - -### filter_logs -基于 `process_type` 的 9 类分类过滤: - -| process_type | 保留/过滤 | -|-------------|----------| -| `alert_not_scan_http_direction_in` | ✅ 保留 | -| `alert_not_scan_http_direction_out` | ✅ 保留 | -| `alert_not_scan_http_direction_lateral` | ✅ 保留 | -| `alert_scan_direction_*` | ❌ 过滤(扫描类) | -| `alert_not_scan_not_http_*` | ❌ 过滤(非 HTTP) | -| `alert_not_process` | ❌ 过滤(其他) | - -### dedup_logs(终点) - -**URI 归一化**(减少 LSH 字段噪音): - -| 正则模式 | 替换为 | -|---------|--------| -| 日期时间 | `DATETIME` | -| UUID | `UUID` | -| 6 位以上数字 | `NUM` | -| 路径穿越 | `../` | -| `%00` | `NULL` | -| 连续 URL 编码(≥3 组) | `ENCODED` | - -**去重算法**: -1. `strict_fields` 拼接作为严格前缀,不同前缀的告警不归并 -2. 对 `lsh_fields`(URI 归一化后)做 **5-gram shingling** -3. 与已注册簇计算 **Jaccard 相似度**,≥ `threshold` 则归入该簇 -4. 
新簇生成 **MD5 dedup_key**;重复告警标记 `dedup_key_already_exists=True` - diff --git a/.flocks/plugins/workflows/stream_alert_dedup/workflow.json b/.flocks/plugins/workflows/stream_alert_dedup/workflow.json deleted file mode 100644 index 6fec44154..000000000 --- a/.flocks/plugins/workflows/stream_alert_dedup/workflow.json +++ /dev/null @@ -1,84 +0,0 @@ -{ - "name": "stream_alert_dedup", - "description": "Streaming-friendly HTTP alert deduplication pipeline: supports syslog real-time single alerts, alerts list, or alert_file path. Normalizes (TDP/Skyeye auto-detection, mixed batches) -> filters (remove scans / non-HTTP) -> deduplicates (URI normalization + 5-gram Jaccard MinHash LSH). Each output alert carries the full normalized fields plus dedup annotation: dedup_key (MD5), is_duplicate (cross-batch), _lsh_cluster_id, _source_type, _process_type. Results appended to JSONL files: ~/.flocks/workspace/workflows/stream_alert_dedup//dedup_result_NNN.jsonl (max 10,000 records per file; each file starts with a timestamp header line).", - "description_cn": "流式 HTTP 告警去重 Pipeline。支持三种输入:syslog 实时单条、alerts 批次列表、alert_file 文件路径。处理流程:归一化 → 过滤 → 去重(URI 归一化 + 5-gram MinHash LSH,跨批次持久化,FIFO LRU)。输出告警保留全部归一化字段,追加去重字段:dedup_key、is_duplicate、_lsh_cluster_id 等。结果追加写入 ~/.flocks/workspace/workflows/stream_alert_dedup//dedup_result_NNN.jsonl,每文件最多 10,000 条(不含首行 header),超出时自动新建序号文件,每个文件首行为含时间戳的 header JSON 行。", - "start": "receive_alert", - "nodes": [ - { - "id": "receive_alert", - "type": "python", - "description": "Parse incoming alert(s): syslog_message (single alert, RFC3164/5424) > alerts list > alert_file. 
Auto-detects source_log_type (TDP/Skyeye) from syslog app_name/hostname, then JSON field signatures, then defaults to 'tdp'.", - "code": "\nimport json\nimport os\n\n# Input priority: syslog_message > alerts > alert_file\nalerts_input = []\ninput_mode = 'unknown'\n_syslog_msg = None\n\nsyslog_msg = inputs.get('syslog_message') or inputs.get('syslog')\nif syslog_msg and isinstance(syslog_msg, dict):\n raw_text = syslog_msg.get('message', '')\n if raw_text:\n try:\n alert = json.loads(raw_text)\n alert['_syslog_meta'] = {\n 'hostname': syslog_msg.get('hostname', ''),\n 'app_name': syslog_msg.get('app_name', ''),\n 'timestamp': syslog_msg.get('timestamp', ''),\n 'severity': syslog_msg.get('severity'),\n 'facility': syslog_msg.get('facility'),\n 'format': syslog_msg.get('format', ''),\n }\n alerts_input = [alert]\n input_mode = 'syslog'\n _syslog_msg = syslog_msg\n print(f'[receive] syslog mode: host={syslog_msg.get(\"hostname\")!r} '\n f'app={syslog_msg.get(\"app_name\")!r} '\n f'severity={syslog_msg.get(\"severity\")} '\n f'format={syslog_msg.get(\"format\")!r}')\n except (json.JSONDecodeError, TypeError) as _e:\n print(f'[receive] WARNING: syslog.message not valid JSON ({_e}), '\n f'raw={raw_text[:120]!r}')\n else:\n print('[receive] WARNING: syslog_message.message is empty, skipping')\n\nif not alerts_input:\n alerts_input = inputs.get('alerts', inputs.get('alert_list', []))\n if alerts_input:\n input_mode = 'alerts'\n\nif not alerts_input:\n alert_file = inputs.get('alert_file', '')\n if alert_file:\n alert_file = os.path.expanduser(str(alert_file))\n try:\n with open(alert_file, 'r', encoding='utf-8') as _f:\n alerts_input = json.load(_f)\n input_mode = 'alert_file'\n print(f'[receive] file mode: loaded {len(alerts_input) if isinstance(alerts_input, list) else 1} alerts')\n except Exception as _e:\n print(f'[receive] WARNING: failed to load alert_file={alert_file!r}: {_e}')\n alerts_input = []\n\nif isinstance(alerts_input, str):\n try:\n alerts_input = 
json.loads(alerts_input)\n except Exception:\n alerts_input = []\nif isinstance(alerts_input, dict) and 'data' in alerts_input:\n alerts_input = alerts_input.get('data', [])\nif not isinstance(alerts_input, list):\n alerts_input = [alerts_input] if alerts_input else []\n\nif not alerts_input:\n print('[receive] WARNING: no alerts (syslog_message, alerts, alert_file all empty)')\n\n# ── Source log type resolution ────────────────────────────────────────────────\ndef _detect_from_syslog_meta(sm):\n for field in ('app_name', 'hostname'):\n val = str(sm.get(field, '') or '').lower()\n if 'skyeye' in val:\n return 'skyeye', f'syslog.{field}={sm.get(field)!r}'\n if 'tdp' in val:\n return 'tdp', f'syslog.{field}={sm.get(field)!r}'\n return None, None\n\ndef _detect_from_alert_json(alert):\n if not isinstance(alert, dict):\n return None, None\n if isinstance(alert.get('net'), dict):\n return 'tdp', 'alert has nested net dict (TDP)'\n if any(k in alert for k in ('behave_uuid', 'flow_id')):\n return 'tdp', 'alert has behave_uuid/flow_id (TDP)'\n if any(k in alert for k in ('net_real_src_ip', 'net_http_url', 'threat_suuid')):\n return 'tdp', 'alert has pre-flattened TDP fields'\n if any(k in alert for k in ('uri', 'vuln_name', 'attack_result', 'attack_flag')):\n return 'skyeye', 'alert has uri/vuln_name/attack_result (Skyeye)'\n return None, None\n\nexplicit_type = str(inputs.get('source_log_type', '') or '').lower()\nif explicit_type in ('tdp', 'skyeye'):\n source_log_type = explicit_type\n source_log_type_reason = 'explicit input parameter'\nelif input_mode == 'syslog' and _syslog_msg:\n source_log_type, reason = _detect_from_syslog_meta(_syslog_msg)\n if source_log_type:\n source_log_type_reason = f'syslog metadata: {reason}'\n else:\n first_alert = alerts_input[0] if alerts_input else {}\n source_log_type, reason = _detect_from_alert_json(first_alert)\n if source_log_type:\n source_log_type_reason = f'JSON field detection: {reason}'\n else:\n source_log_type = 'tdp'\n 
source_log_type_reason = 'fallback default'\nelse:\n first_alert = alerts_input[0] if alerts_input else {}\n source_log_type, reason = _detect_from_alert_json(first_alert)\n if source_log_type:\n source_log_type_reason = f'JSON field detection: {reason}'\n else:\n source_log_type = 'tdp'\n source_log_type_reason = 'default'\n\nprint(f'[receive] source_log_type={source_log_type!r} reason={source_log_type_reason!r}')\n\nfilter_enabled = bool(inputs.get('filter_enabled', True))\ndedup_enabled = bool(inputs.get('dedup_enabled', True))\nthreshold = float(inputs.get('threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_field_len = int(inputs.get('max_field_len', 500))\nmax_dedup_keys = int(inputs.get('max_dedup_keys', 100000))\nif max_dedup_keys < 1:\n max_dedup_keys = 100000\n\nprint(f'[receive] input_mode={input_mode} raw_alerts={len(alerts_input)} '\n f'filter_enabled={filter_enabled} dedup_enabled={dedup_enabled} '\n f'max_dedup_keys={max_dedup_keys}')\n\noutputs['raw_alerts'] = alerts_input\noutputs['input_mode'] = input_mode\noutputs['source_log_type'] = source_log_type\noutputs['source_log_type_reason'] = source_log_type_reason\noutputs['filter_enabled'] = filter_enabled\noutputs['dedup_enabled'] = dedup_enabled\noutputs['dedup_threshold'] = threshold\noutputs['strict_fields'] = strict_fields\noutputs['lsh_fields'] = lsh_fields\noutputs['max_field_len'] = max_field_len\noutputs['max_dedup_keys'] = max_dedup_keys\noutputs['stats'] = {'raw_count': len(alerts_input)}\n" - }, - { - "id": "normalize", - "type": "python", - "description": "Normalize TDP and Skyeye alerts into a unified schema. Per-alert type detection via field signatures; falls back to batch_hint. 
Carries _syslog_meta and _source_type to downstream nodes.", - "code": "\nimport uuid\n\nHTTP_METHODS = ['GET', 'POST', 'PUT', 'DELETE', 'HEAD', 'OPTIONS', 'PATCH', 'TRACE']\n\nTDP_FIELD_MAP = {\n 'customer_uuid': 'customer_uuid',\n 'device_id': 'device_id',\n 'id': 'id',\n 'time': 'time',\n 'direction': 'direction',\n 'sip': 'net_real_src_ip',\n 'dip': 'net_dest_ip',\n 'sport': 'net_src_port',\n 'dport': 'net_dest_port',\n 'net_type': 'net_type',\n 'net_app_proto': 'net_app_proto',\n 'req_http_url': 'net_http_url',\n 'req_user_agent':'net_http_reqs_user_agent',\n 'req_host': 'net_http_reqs_host',\n 'req_line': 'net_http_reqs_line',\n 'req_header': 'net_http_reqs_header',\n 'req_body': 'net_http_reqs_body',\n 'req_cookie': 'net_http_reqs_cookie',\n 'req_body_len': 'net_http_reqs_content_length',\n 'rsp_status_code': 'net_http_status',\n 'rsp_line': 'net_http_resp_line',\n 'rsp_header': 'net_http_resp_header',\n 'rsp_body': 'net_http_resp_body',\n 'rsp_body_len': 'net_http_resp_content_length',\n 'net_bytes_toclient': 'net_bytes_toclient',\n 'net_bytes_toserver': 'net_bytes_toserver',\n 'threat_rule_id': 'threat_suuid',\n 'threat_name': 'threat_name',\n 'threat_msg': 'threat_msg',\n 'threat_ioc': 'threat_ioc',\n 'threat_level': 'threat_level',\n 'threat_severity': 'threat_severity',\n 'threat_phase': 'threat_phase',\n 'threat_type': 'threat_type',\n 'threat_result': 'threat_result',\n 'threat_confidence': 'threat_confidence',\n 'connection_established': 'established',\n 'asset_group_name': 'dest_assets_group_name',\n 'asset_name': 'dest_assets_latestName',\n}\n\nSKYEYE_FIELD_MAP = {\n 'id': 'none',\n 'time': 'time',\n 'direction': 'none',\n 'sip': 'sip',\n 'dip': 'dip',\n 'sport': 'sport',\n 'dport': 'dport',\n 'net_type': 'none',\n 'net_app_proto': 'none',\n 'req_http_url': 'uri',\n 'req_user_agent':'agent',\n 'req_host': 'host',\n 'req_line': 'none',\n 'req_header': 'req_header',\n 'req_body': 'req_body',\n 'req_cookie': 'none',\n 'req_body_len': 'none',\n 
'rsp_status_code': 'rsp_status',\n 'rsp_line': 'none',\n 'rsp_header': 'rsp_header',\n 'rsp_body': 'rsp_body',\n 'rsp_body_len': 'rsp_body_len',\n 'threat_rule_id': 'rule_id',\n 'threat_name': 'vuln_name',\n 'threat_msg': 'vuln_desc',\n 'threat_ioc': 'none',\n 'threat_level': 'none',\n 'threat_severity': 'severity',\n 'threat_phase': 'none',\n 'threat_type': 'vuln_type',\n 'threat_tactic_id': 'attck_tactic',\n 'threat_technique_id': 'attck_tech',\n 'threat_result': 'attack_result',\n 'threat_confidence': 'confidence',\n 'connection_established': 'established',\n 'real_attack': 'attack_flag',\n}\n\ndef flatten_dict(d, prefix=''):\n res = {}\n for k, v in d.items():\n if isinstance(v, dict):\n res.update(flatten_dict(v, f'{prefix}{k}_'))\n else:\n res[f'{prefix}{k}'] = v\n return res\n\ndef make_uuid(norm):\n return str(uuid.uuid3(uuid.NAMESPACE_DNS, ''.join(str(v) for v in norm.values())))\n\ndef detect_alert_type(alert, batch_hint):\n if isinstance(alert.get('net'), dict):\n return 'tdp'\n if any(k in alert for k in ('behave_uuid', 'flow_id')):\n return 'tdp'\n if any(k in alert for k in ('net_real_src_ip', 'net_http_url', 'threat_suuid')):\n return 'tdp'\n if any(k in alert for k in ('uri', 'vuln_name', 'attack_result', 'attack_flag')):\n return 'skyeye'\n return batch_hint\n\ndef normalize_single(alert, source_type):\n flat = flatten_dict(alert)\n field_map = TDP_FIELD_MAP if source_type == 'tdp' else SKYEYE_FIELD_MAP\n norm = {}\n for std_key, raw_key in field_map.items():\n norm[std_key] = flat.get(raw_key, 'none') if raw_key != 'none' else 'none'\n if norm.get('id') in ('none', None, ''):\n norm['id'] = make_uuid(norm)\n if norm.get('net_type') in ('none', None, ''):\n method = flat.get('method', 'none')\n norm['net_type'] = 'http' if method in HTTP_METHODS else ('none' if method == 'none' else 'other')\n norm['_source_type'] = source_type\n # Carry syslog metadata if present\n if '_syslog_meta' in alert:\n norm['_syslog_meta'] = alert['_syslog_meta']\n return 
norm\n\nraw_alerts = inputs.get('raw_alerts', [])\nstats = dict(inputs.get('stats', {}))\nbatch_hint = str(inputs.get('source_log_type', 'tdp') or 'tdp').lower()\nif batch_hint not in ('tdp', 'skyeye'):\n batch_hint = 'tdp'\n\ntype_counts = {'tdp': 0, 'skyeye': 0}\nnormalized = []\nfor alert in raw_alerts:\n src_type = detect_alert_type(alert, batch_hint)\n type_counts[src_type] = type_counts.get(src_type, 0) + 1\n normalized.append(normalize_single(alert, src_type))\n\nstats['normalized_count'] = len(normalized)\nstats['normalize_type_counts'] = type_counts\nprint(f'[normalize] {len(raw_alerts)} alerts -> {len(normalized)} normalized '\n f'(tdp={type_counts.get(\"tdp\",0)}, skyeye={type_counts.get(\"skyeye\",0)}, '\n f'batch_hint={batch_hint!r})')\n\noutputs['normalized_alerts'] = normalized\noutputs['stats'] = stats\nfor k in ['input_mode', 'source_log_type', 'filter_enabled', 'dedup_enabled',\n 'dedup_threshold', 'strict_fields', 'lsh_fields', 'max_field_len', 'max_dedup_keys']:\n outputs[k] = inputs.get(k)\n" - }, - { - "id": "filter_logs", - "type": "python", - "description": "Filter: classify into 9 process_types, keep non-scan HTTP alerts (direction in/out/lateral). Adds _process_type and _threat_type fields. 
When filter_enabled=False, all alerts pass through.", - "code": "\nnormalized_alerts = inputs.get('normalized_alerts', [])\nfilter_enabled = inputs.get('filter_enabled', True)\nbatch_hint = str(inputs.get('source_log_type', 'tdp') or 'tdp').lower()\nif batch_hint not in ('tdp', 'skyeye'):\n batch_hint = 'tdp'\nstats = dict(inputs.get('stats', {}))\n\ndef is_scan_alert(threat_name):\n tnl = str(threat_name or '').lower()\n return ('扫描' in tnl) and ('webshell' not in tnl)\n\ndef get_threat_type(alert):\n src = alert.get('_source_type') or batch_hint\n if src == 'skyeye':\n return str(alert.get('threat_type', 'general') or 'general')\n return str(alert.get('threat_name', 'general') or 'general')\n\ndef is_http(alert):\n for field in ('application_layer_protocol', 'net_type', 'net_app_proto'):\n val = str(alert.get(field, '') or '').lower()\n if val and val != 'none' and 'http' in val:\n return True\n return False\n\ndef get_process_type(alert):\n src = alert.get('_source_type') or batch_hint\n threat_name = alert.get('threat_name', '')\n direction = str(alert.get('direction', '') or '').lower()\n scan = is_scan_alert(threat_name)\n http = is_http(alert)\n if src == 'skyeye':\n return 'alert_scan_direction_in' if scan else 'alert_not_scan_http_direction_in'\n if scan:\n return f'alert_scan_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_scan_direction_in'\n if http:\n return f'alert_not_scan_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_scan_http_direction_in'\n return f'alert_not_scan_not_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_process'\n\nNEED_ANALYSIS = {\n 'alert_not_scan_http_direction_in',\n 'alert_not_scan_http_direction_out',\n 'alert_not_scan_http_direction_lateral',\n}\n\nfiltered = []\nprocess_type_counts = {}\nfor alert in normalized_alerts:\n alert = dict(alert)\n if filter_enabled:\n ptype = get_process_type(alert)\n need = ptype in 
NEED_ANALYSIS\n threat_type = get_threat_type(alert)\n else:\n ptype = 'filter_disabled'\n need = True\n threat_type = get_threat_type(alert)\n process_type_counts[ptype] = process_type_counts.get(ptype, 0) + 1\n alert['_process_type'] = ptype\n alert['_threat_type'] = threat_type\n if need:\n filtered.append(alert)\n\nprint(f'[filter] input={len(normalized_alerts)}, kept={len(filtered)}')\nprint(f'[filter] process_type_counts={process_type_counts}')\n\nstats['after_filter_count'] = len(filtered)\nstats['filter_removed_count'] = len(normalized_alerts) - len(filtered)\nstats['filter_process_type_counts'] = process_type_counts\n\noutputs['filtered_alerts'] = filtered\noutputs['stats'] = stats\nfor k in ['input_mode', 'dedup_enabled', 'dedup_threshold', 'strict_fields',\n 'lsh_fields', 'max_field_len', 'max_dedup_keys']:\n outputs[k] = inputs.get(k)\n" - }, - { - "id": "dedup_and_write", - "type": "python", - "description": "Dedup (terminal): URI normalization + MinHash LSH (128 perms, 5-gram). LSH state persisted to ~/.flocks/workspace/workflows/stream_alert_dedup/ (atomic write, file lock, FIFO LRU eviction). Each output alert = normalized fields + dedup_key + is_duplicate + _lsh_cluster_id. Appends enriched alerts to JSONL files under ~/.flocks/workspace/workflows/stream_alert_dedup//dedup_result_NNN.jsonl. 
Each new file begins with a header line {_type:file_header, created_at, ...}; max 10,000 alert records per file, auto-increments sequence number on rollover.", - "code": "\nimport os\nimport re\nimport sys\nimport json\nimport pickle\nimport hashlib\nimport datetime\nfrom datasketch import MinHash, MinHashLSH\n\nIS_WINDOWS = sys.platform == 'win32'\nif IS_WINDOWS:\n import msvcrt # noqa: F401\nelse:\n import fcntl # noqa: F401\n\nMINHASH_SEED = 2024\nNUM_PERM = 128\nWORKFLOW_NAME = 'stream_alert_dedup'\nLSH_CLUSTER_WARN_THRESHOLD = 100000\n\ndef normalize_uri(uri):\n uri = str(uri or '')\n uri = re.sub(r'\\d{4}-\\d{2}-\\d{2}', 'DATETIME', uri)\n uri = re.sub(r'[\\da-f]{8}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{12}', 'UUID', uri, flags=re.IGNORECASE)\n uri = re.sub(r'(\\.\\./)+', 'TRAVERSAL', uri)\n uri = re.sub(r'\\bNULL\\b', 'NULL_REPLACED', uri)\n uri = re.sub(r'chr\\$\\d+\\$\\|\\|chr\\$\\d+\\$', 'CHR_SEQUENCE', uri)\n uri = re.sub(r'\\b\\d+={1,2}\\d+\\b', 'NUMBER_COMPARISON', uri)\n uri = re.sub(r'\\b[a-fA-F0-9]{32}\\b', 'HEXADECIMAL CHARACTERS', uri)\n return uri\n\ndef gen_minhash(text, permutations):\n shingles = [text[i:i+5] for i in range(len(text) - 4)]\n m = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED, permutations=permutations)\n for s in shingles:\n m.update(s.encode('utf-8'))\n return m\n\ndef get_state_paths(threshold):\n from flocks.config import Config\n flocks_root = Config().get_global().data_dir.parent\n state_dir = str(flocks_root / 'workspace' / 'workflows' / WORKFLOW_NAME)\n os.makedirs(state_dir, exist_ok=True)\n base = os.path.join(state_dir, f'lsh_state_np{NUM_PERM}_th{int(threshold * 100)}')\n return base + '.pkl', base + '.lock'\n\ndef get_output_dir():\n from flocks.config import Config\n from pathlib import Path\n flocks_root = Config().get_global().data_dir.parent\n date_str = datetime.datetime.now().strftime('%Y-%m-%d')\n out_dir = flocks_root / 'workspace' / 'workflows' / WORKFLOW_NAME / date_str\n out_dir.mkdir(parents=True, 
exist_ok=True)\n return str(out_dir)\n\ndef acquire_lock(lock_path):\n fh = open(lock_path, 'w+')\n if IS_WINDOWS:\n fh.write('L'); fh.flush(); fh.seek(0)\n while True:\n try:\n msvcrt.locking(fh.fileno(), msvcrt.LK_LOCK, 1); break\n except OSError:\n continue\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_EX)\n return fh\n\ndef release_lock(fh):\n try:\n if IS_WINDOWS:\n try:\n fh.seek(0); msvcrt.locking(fh.fileno(), msvcrt.LK_UNLCK, 1)\n except OSError:\n pass\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_UN)\n finally:\n fh.close()\n\ndef load_state(state_path, threshold):\n if not os.path.exists(state_path) or os.path.getsize(state_path) == 0:\n return None, None, None, 0\n try:\n with open(state_path, 'rb') as f:\n state = pickle.load(f)\n if state.get('num_perm') != NUM_PERM or state.get('threshold') != threshold:\n print(f'[dedup] state params mismatch, starting fresh')\n return None, None, None, 0\n cache = state['lsh_cache']\n seen_raw = state.get('dedup_key_cache', {})\n seen = {k: None for k in seen_raw} if isinstance(seen_raw, set) else (seen_raw if isinstance(seen_raw, dict) else {})\n next_cid = state.get('next_cluster_id') or ((max(cache.keys()) + 1) if cache else 0)\n print(f'[dedup] loaded state: {len(cache)} clusters, {len(seen)} dedup_keys, next_cid={next_cid}')\n return state['lsh_index'], cache, seen, next_cid\n except Exception as e:\n print(f'[dedup] failed to load state ({e}), starting fresh')\n return None, None, None, 0\n\ndef evict_oldest(lsh_index, lsh_cache, dedup_key_cache, max_keys):\n evicted_keys = evicted_clusters = 0\n excess = len(dedup_key_cache) - max_keys\n if excess > 0:\n for k in list(dedup_key_cache.keys())[:excess]:\n del dedup_key_cache[k]\n evicted_keys = excess\n excess = len(lsh_cache) - max_keys\n if excess > 0:\n for cid in list(lsh_cache.keys())[:excess]:\n try: lsh_index.remove(cid)\n except (KeyError, ValueError): pass\n del lsh_cache[cid]\n evicted_clusters = excess\n return evicted_keys, evicted_clusters\n\ndef 
dump_state_atomic(state_path, lsh_index, lsh_cache, dedup_key_cache, threshold, next_cluster_id):\n tmp = state_path + '.tmp'\n try:\n state = {\n 'lsh_index': lsh_index, 'lsh_cache': lsh_cache,\n 'dedup_key_cache': dedup_key_cache, 'next_cluster_id': next_cluster_id,\n 'num_perm': NUM_PERM, 'threshold': threshold,\n }\n with open(tmp, 'wb') as f:\n pickle.dump(state, f); f.flush(); os.fsync(f.fileno())\n os.replace(tmp, state_path)\n print(f'[dedup] state saved: {len(lsh_cache)} clusters, {len(dedup_key_cache)} dedup_keys')\n except Exception as e:\n print(f'[dedup] failed to save state: {e}')\n if os.path.exists(tmp):\n try: os.remove(tmp)\n except Exception: pass\n\n# ── Main ──────────────────────────────────────────────────────────────────────\n\nfiltered_alerts = inputs.get('filtered_alerts', [])\ninput_mode = inputs.get('input_mode', 'unknown')\ndedup_enabled = inputs.get('dedup_enabled', True)\nthreshold = float(inputs.get('dedup_threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_len = int(inputs.get('max_field_len', 500))\nmax_dedup_keys = int(inputs.get('max_dedup_keys', 100000))\nif max_dedup_keys < 1:\n max_dedup_keys = 100000\nstats = dict(inputs.get('stats', {}))\n\n_permutations = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED).permutations\n\nstate_path, lock_path = get_state_paths(threshold)\nlock_fh = acquire_lock(lock_path) if dedup_enabled else None\n\nevicted_keys = evicted_clusters = 0\n\ntry:\n if dedup_enabled:\n lsh_index, lsh_cache, dedup_key_cache, next_cluster_id = load_state(state_path, threshold)\n if lsh_index is None:\n lsh_index = MinHashLSH(threshold=threshold, num_perm=NUM_PERM)\n lsh_cache = {}\n dedup_key_cache = {}\n next_cluster_id = 0\n else:\n lsh_index, lsh_cache, dedup_key_cache, next_cluster_id = None, {}, {}, 0\n\n _cid_box = [next_cluster_id]\n def query_most_similar(minhash):\n sim_keys = 
lsh_index.query(minhash)\n if sim_keys:\n candidates = sim_keys[:100]\n sims = [minhash.jaccard(lsh_cache[k]) for k in candidates]\n return candidates[sims.index(max(sims))]\n cluster_id = _cid_box[0]\n _cid_box[0] += 1\n lsh_index.insert(cluster_id, minhash)\n lsh_cache[cluster_id] = minhash\n return cluster_id\n\n enriched = []\n for alert in filtered_alerts:\n alert = dict(alert)\n text_strict = '. '.join(str(alert.get(f, ''))[:max_len] for f in strict_fields)\n text_lsh = normalize_uri('. '.join(str(alert.get(f, ''))[:max_len] for f in lsh_fields))\n\n if not dedup_enabled:\n dk = hashlib.md5(f'{text_strict}. {text_lsh}'.encode('utf-8')).hexdigest()\n alert['_lsh_cluster_id'] = None\n alert['dedup_key'] = dk\n alert['is_duplicate'] = dk in dedup_key_cache\n dedup_key_cache[dk] = None\n enriched.append(alert)\n continue\n\n mh = gen_minhash(text_lsh.lower(), _permutations)\n cluster_id = query_most_similar(mh)\n alert['_lsh_cluster_id'] = cluster_id\n\n dk = hashlib.md5(f'{text_strict}. 
{cluster_id}'.encode('utf-8')).hexdigest()\n alert['dedup_key'] = dk\n already = dk in dedup_key_cache\n if already:\n del dedup_key_cache[dk]\n dedup_key_cache[dk] = None\n alert['is_duplicate'] = already\n enriched.append(alert)\n\n if dedup_enabled:\n evicted_keys, evicted_clusters = evict_oldest(lsh_index, lsh_cache, dedup_key_cache, max_dedup_keys)\n if evicted_keys or evicted_clusters:\n print(f'[dedup] LRU eviction: dropped {evicted_keys} keys, {evicted_clusters} clusters')\n if len(lsh_cache) > LSH_CLUSTER_WARN_THRESHOLD or len(dedup_key_cache) > LSH_CLUSTER_WARN_THRESHOLD:\n print(f'[dedup] WARNING: persisted state holds {len(lsh_cache)} clusters '\n f'and {len(dedup_key_cache)} dedup_keys (warn={LSH_CLUSTER_WARN_THRESHOLD})')\n dump_state_atomic(state_path, lsh_index, lsh_cache, dedup_key_cache, threshold, _cid_box[0])\nfinally:\n if lock_fh is not None:\n release_lock(lock_fh)\n\n# Unique alerts: one representative per dedup_key (first seen)\nseen_keys = {}\nunique_alerts = []\nfor a in enriched:\n k = a['dedup_key']\n if k not in seen_keys:\n seen_keys[k] = a\n unique_alerts.append(a)\n\ndup_count = len(enriched) - len(unique_alerts)\nprint(f'[dedup] input={len(filtered_alerts)}, enriched={len(enriched)}, unique={len(unique_alerts)}, duplicates={dup_count}')\n\nstats['after_dedup_count'] = len(enriched)\nstats['unique_key_count'] = len(unique_alerts)\nstats['dedup_removed_count'] = dup_count\nstats['dedup_ratio'] = round(dup_count / len(enriched), 4) if enriched else 0.0\nstats['dedup_state_persisted'] = bool(dedup_enabled)\nif dedup_enabled:\n stats['lsh_total_clusters'] = len(lsh_cache)\n stats['lsh_total_dedup_keys'] = len(dedup_key_cache)\n stats['lsh_max_dedup_keys'] = max_dedup_keys\n stats['lsh_evicted_keys'] = evicted_keys\n stats['lsh_evicted_clusters'] = evicted_clusters\n\nif dedup_enabled:\n summary = (\n f'stream_alert_dedup done: raw={stats.get(\"raw_count\", 0)}'\n f' -> normalized={stats.get(\"normalized_count\", 0)}'\n f' -> 
filtered={stats.get(\"after_filter_count\", 0)}'\n f' -> enriched={len(enriched)}, unique={len(unique_alerts)} (compression {stats[\"dedup_ratio\"]:.1%})'\n f' | clusters={len(lsh_cache)}, keys={len(dedup_key_cache)}, max={max_dedup_keys}'\n )\nelse:\n summary = (\n f'stream_alert_dedup done (dedup_enabled=False): '\n f'raw={stats.get(\"raw_count\", 0)}'\n f' -> filtered={stats.get(\"after_filter_count\", 0)}'\n f' -> enriched={len(enriched)}'\n )\nprint(f'[dedup] {summary}')\n\n# ── Write enriched alerts to JSONL files (max 10,000 records per file) ────────\n# File naming: dedup_result_001.jsonl, 002.jsonl, ...\n# Each file starts with a header line: {\"_type\":\"file_header\",\"created_at\":...}\n# Subsequent lines: one enriched alert per line (no header line counted).\n# When the current file reaches MAX_RECORDS_PER_FILE, a new numbered file is created.\n\nMAX_RECORDS_PER_FILE = 10000\n_JSONL_PREFIX = 'dedup_result'\n\ndef _count_alert_lines(file_path):\n count = 0\n try:\n with open(file_path, 'r', encoding='utf-8') as _f:\n for _line in _f:\n _s = _line.strip()\n if _s and '\"_type\"' not in _s:\n count += 1\n except Exception:\n pass\n return count\n\ndef _find_active_file(out_dir):\n import glob\n pattern = os.path.join(out_dir, _JSONL_PREFIX + '_*.jsonl')\n existing = sorted(glob.glob(pattern))\n if not existing:\n return None, 0, 0\n latest = existing[-1]\n basename = os.path.basename(latest)\n try:\n seq = int(basename.replace(_JSONL_PREFIX + '_', '').replace('.jsonl', ''))\n except ValueError:\n seq = len(existing)\n count = _count_alert_lines(latest)\n return latest, count, seq\n\ndef _write_jsonl(out_dir, alerts, now):\n written = []\n active_path, active_count, seq = _find_active_file(out_dir)\n remaining = list(alerts)\n\n while remaining:\n available = MAX_RECORDS_PER_FILE - active_count\n if available <= 0 or active_path is None:\n seq += 1\n active_path = os.path.join(out_dir, f'{_JSONL_PREFIX}_{seq:03d}.jsonl')\n active_count = 0\n available = 
MAX_RECORDS_PER_FILE\n header = {\n '_type': 'file_header',\n 'created_at': now.isoformat(),\n 'date': now.strftime('%Y-%m-%d'),\n 'workflow': WORKFLOW_NAME,\n 'seq': seq,\n }\n with open(active_path, 'w', encoding='utf-8') as _hf:\n _hf.write(json.dumps(header, ensure_ascii=False) + '\\n')\n\n batch = remaining[:available]\n remaining = remaining[available:]\n\n with open(active_path, 'a', encoding='utf-8') as _af:\n for _alert in batch:\n _af.write(json.dumps(_alert, ensure_ascii=False) + '\\n')\n\n active_count += len(batch)\n if active_path not in written:\n written.append(active_path)\n\n if remaining:\n active_path = None\n active_count = 0\n\n return written\n\n_now = datetime.datetime.now()\ntry:\n _out_dir = get_output_dir()\n _written_paths = _write_jsonl(_out_dir, enriched, _now)\n _out_path = _written_paths[-1] if _written_paths else ''\n print(f'[dedup] wrote {len(enriched)} records -> {_written_paths}')\n stats['output_path'] = _out_path\n stats['output_paths'] = _written_paths\n outputs['output_path'] = _out_path\n outputs['output_paths'] = _written_paths\nexcept Exception as _we:\n import traceback\n print(f'[dedup] WARNING: failed to write JSONL: {_we}\\n{traceback.format_exc()}')\n _out_path = ''\n outputs['output_path'] = ''\n outputs['output_paths'] = []\n\n# ── Outputs ───────────────────────────────────────────────────────────────────\noutputs['enriched_alerts'] = enriched\noutputs['unique_alerts'] = unique_alerts\noutputs['stats'] = stats\noutputs['dedup_summary'] = summary\noutputs['input_mode'] = input_mode\n\n# Convenience fields for single-alert / syslog callers\nif enriched:\n outputs['dedup_key'] = enriched[0].get('dedup_key', '')\n outputs['is_duplicate'] = enriched[0].get('is_duplicate', False)\nelse:\n outputs['dedup_key'] = ''\n outputs['is_duplicate'] = False\n" - } - ], - "edges": [ - { - "from": "receive_alert", - "to": "normalize", - "order": 0 - }, - { - "from": "normalize", - "to": "filter_logs", - "order": 0 - }, - { - 
"from": "filter_logs", - "to": "dedup_and_write", - "order": 0 - } - ], - "metadata": { - "node_timeout_s": 300, - "sampleInputs": { - "source_log_type": "tdp", - "filter_enabled": true, - "dedup_enabled": true, - "threshold": 0.7, - "max_dedup_keys": 100000, - "_comment_syslog": "Syslog mode: POST /api/workflow/{id}/syslog-config {enabled:true, protocol:'udp', port:5140, inputKey:'syslog_message'}. The TDP/Skyeye alert JSON must be in syslog message body.", - "syslog_message": { - "raw": "<134>May 12 10:00:00 tdp-sensor tdp: {\"id\":\"AZtRkZkzj\",\"net\":{}}", - "facility": 16, - "severity": 6, - "timestamp": "2026-05-12T10:00:00", - "hostname": "tdp-sensor", - "app_name": "tdp", - "message": "{\"id\":\"AZtRkZkzj\",\"net\":{\"http\":{\"url\":\"/admin\"}},\"threat\":{\"name\":\"SQL注入\"}}", - "format": "rfc3164" - }, - "_comment_batch": "Or pass 'alerts' (list) or 'alert_file' (path to JSON file)", - "alerts": [ - { - "net_real_src_ip": "1.2.3.4", - "net_dest_ip": "10.0.0.1", - "direction": "in", - "net_type": "http", - "net_http_url": "/admin/login.php?id=1 OR 1=1", - "net_http_reqs_body": "username=admin&password=123456", - "net_http_resp_body": "root@localhost", - "threat_name": "SQL注入攻击", - "threat_type": "web攻击" - } - ] - } - } -} \ No newline at end of file diff --git a/.flocks/plugins/workflows/stream_alert_dedup/workflow.md b/.flocks/plugins/workflows/stream_alert_dedup/workflow.md deleted file mode 100644 index e5acdee43..000000000 --- a/.flocks/plugins/workflows/stream_alert_dedup/workflow.md +++ /dev/null @@ -1,138 +0,0 @@ -# stream_alert_dedup - -流式 HTTP 告警去重 Pipeline,三阶段处理:**归一化 → 过滤 → 去重**。 - -与 `http_alert_dedup` 的核心区别: -1. **流式单条输入**:支持 syslog 实时单条(`syslog_message`),也兼容批次列表与文件 -2. **输出为原始数据增强**:每条输出告警 = 归一化字段 + 去重字段(`dedup_key`、`is_duplicate`、`_lsh_cluster_id` 等) -3. 
**结果落盘**:每次执行自动将结果写入 `~/.flocks/workspace/workflows/stream_alert_dedup//` - -## 工作流图 - -``` -receive_alert - │ - normalize - │ - filter_logs - │ -dedup_and_write ◀── 终点,输出增强告警 + 写日期目录 JSON -``` - -## 输入参数 - -| 参数 | 类型 | 默认值 | 说明 | -|------|------|--------|------| -| `syslog_message` | `dict` | — | Syslog 消息体(优先级最高,单条流式) | -| `alerts` | `list[dict]` | — | 原始告警列表(批次模式) | -| `alert_file` | `str` | — | JSON 文件路径(文件模式) | -| `source_log_type` | `str` | 自动识别 | 来源类型 `"tdp"` 或 `"skyeye"`,不填则自动检测 | -| `filter_enabled` | `bool` | `true` | 是否启用过滤阶段 | -| `dedup_enabled` | `bool` | `true` | 是否启用跨批次去重(false 时仅批内去重) | -| `threshold` | `float` | `0.7` | Jaccard 相似度阈值(0–1) | -| `strict_fields` | `list[str]` | `["sip","dip"]` | 严格匹配字段 | -| `lsh_fields` | `list[str]` | `["req_http_url","req_body","rsp_body"]` | 模糊匹配字段(URI 归一化 + MinHash) | -| `max_field_len` | `int` | `500` | 单字段截断长度 | -| `max_dedup_keys` | `int` | `100000` | FIFO LRU 上限(持久化 dedup_key 最大数量) | - -### syslog_message 格式 - -Flocks syslog 监听器解析 RFC3164 / RFC5424 后注入的结构体,TDP/Skyeye 原始 JSON 须在 `message` 字段内: - -```json -{ - "hostname": "tdp-sensor", - "app_name": "tdp", - "timestamp": "2026-05-12T10:00:00", - "severity": 6, - "facility": 16, - "format": "rfc3164", - "message": "{\"id\":\"AZtRkZkzj\",\"net\":{\"http\":{\"url\":\"/admin\"}},\"threat\":{\"name\":\"SQL注入\"}}" -} -``` - -> 开启 syslog 接收:`POST /api/workflow/{id}/syslog-config {"enabled":true,"protocol":"udp","port":5140,"inputKey":"syslog_message"}` - -## 输出参数 - -| 字段 | 类型 | 说明 | -|------|------|------| -| `enriched_alerts` | `list[dict]` | 过滤后全量告警,每条含完整归一化字段 + 去重字段 | -| `unique_alerts` | `list[dict]` | 每个 dedup_key 的代表性告警(首次出现) | -| `dedup_key` | `str` | 第一条告警的 dedup_key(syslog 单条场景直接使用) | -| `is_duplicate` | `bool` | 第一条告警是否为跨批次重复(syslog 单条场景直接使用) | -| `output_path` | `str` | 当次写入的最后一个 JSONL 文件路径 | -| `output_paths` | `list[str]` | 本次写入涉及的所有文件路径(批量超阈值时跨多个文件) | -| `stats` | `dict` | 各阶段统计(见下表) | -| `dedup_summary` | `str` | 一行文字摘要 | -| `input_mode` | `str` | 
输入模式:`syslog` / `alerts` / `alert_file` | - -### 每条 enriched_alert 的增强字段 - -| 字段 | 说明 | -|------|------| -| `dedup_key` | MD5 去重键(`strict_fields + cluster_id` 的哈希) | -| `is_duplicate` | 是否已在历史批次中出现过(跨批次持久化感知) | -| `_lsh_cluster_id` | MinHash LSH 簇 ID | -| `_source_type` | 识别出的来源类型(`tdp` / `skyeye`) | -| `_process_type` | 过滤分类(如 `alert_not_scan_http_direction_in`) | -| `_threat_type` | 威胁类型字符串 | -| `_syslog_meta` | syslog 元数据(仅 syslog 模式下存在) | - -### stats 字段 - -| 字段 | 说明 | -|------|------| -| `raw_count` | 原始输入告警数 | -| `normalized_count` | 归一化后告警数 | -| `after_filter_count` | 过滤后保留数 | -| `filter_removed_count` | 过滤剔除数 | -| `after_dedup_count` | 去重处理总数(= after_filter_count) | -| `unique_key_count` | 唯一 dedup_key 数 | -| `dedup_removed_count` | 批内重复数 | -| `dedup_ratio` | 批内压缩率 | -| `output_path` | 结果文件路径 | - -## 结果文件格式 - -写入路径:`~/.flocks/workspace/workflows/stream_alert_dedup//dedup_result_NNN.jsonl` - -- **JSONL 格式**:每行一个 JSON 对象 -- **首行**:`file_header`(含时间戳,不计入告警条数) -- **后续行**:每行一条 enriched_alert -- **分卷规则**:每文件最多 **10,000 条**告警(不含 header 行),超出时自动创建 `dedup_result_002.jsonl`、`003.jsonl`… - -```jsonl -{"_type": "file_header", "created_at": "2026-05-12T10:00:00.123456", "date": "2026-05-12", "workflow": "stream_alert_dedup", "seq": 1} -{"sip": "1.2.3.4", "dip": "10.0.0.1", "req_http_url": "/admin/login.php?id=1 OR 1=1", "threat_name": "SQL注入攻击", "_source_type": "tdp", "_process_type": "alert_not_scan_http_direction_in", "dedup_key": "a3f9...", "is_duplicate": false, "_lsh_cluster_id": 42} -{"sip": "5.6.7.8", "dip": "10.0.0.2", ...} -``` - -`output_path` 输出字段为当次写入的**最后一个**文件路径;`output_paths` 为本次写入涉及的所有文件路径列表(批量超过分卷阈值时可能跨多个文件)。 - -## 节点说明 - -### receive_alert -解析三种输入格式(syslog > alerts > alert_file)。从以下来源按优先级解析 `source_log_type`: -1. 显式 `source_log_type` 参数 -2. Syslog `app_name` / `hostname` 中含 `tdp` 或 `skyeye` -3. 告警 JSON 字段签名(TDP: 嵌套 net 字典 / behave_uuid;Skyeye: uri / vuln_name) -4. 
默认 `tdp` - -### normalize -字段映射统一为标准 schema(`sip`/`dip`/`req_http_url`/`req_body`/`rsp_body`/`threat_name` 等),自动检测每条告警类型,支持混合批次。保留 `_syslog_meta`。 - -### filter_logs -基于 `process_type` 9 分类过滤,保留非扫描 HTTP 告警(`in`/`out`/`lateral` 方向)。`filter_enabled=False` 时全量透传。 - -### dedup_and_write(终点) - -**去重算法**(与 http_alert_dedup 相同): -1. `strict_fields` 拼接作为精确前缀 -2. `lsh_fields` URI 归一化后做 **5-gram shingling** -3. MinHash LSH(128 permutations)近似 Jaccard 相似度聚类,阈值 ≥ `threshold` -4. `dedup_key = MD5(strict_prefix + cluster_id)`;`is_duplicate=True` 表示历史已见 - -**持久化**:LSH 状态存于 `~/.flocks/workspace/workflows/stream_alert_dedup/lsh_state_np128_th*.pkl`,原子写 + 文件锁,FIFO LRU 上限 `max_dedup_keys`,可跨批次/跨进程复用。 - -> **注意**:`stream_alert_dedup` 维护独立的 LSH 状态,与 `http_alert_dedup` 不共享去重历史。如需共享历史,可修改 `WORKFLOW_NAME = 'http_alert_dedup'`(同时共享 dedup_key 空间)。 diff --git a/.flocks/plugins/workflows/tdp_alert_pull_dedup/_build_workflow.py b/.flocks/plugins/workflows/tdp_alert_pull_dedup/_build_workflow.py deleted file mode 100644 index 5f58c3b33..000000000 --- a/.flocks/plugins/workflows/tdp_alert_pull_dedup/_build_workflow.py +++ /dev/null @@ -1,93 +0,0 @@ -"""Build workflow.json for tdp_alert_pull_dedup. - -Run: python _build_workflow.py - -Reads the pull_dedup_loop node code from _node_pull_dedup_loop.py and -serializes a fully-valid workflow.json next to it. -""" - -from __future__ import annotations - -import json -import os - -HERE = os.path.dirname(os.path.abspath(__file__)) - - -def read_code(name: str) -> str: - with open(os.path.join(HERE, name), "r", encoding="utf-8") as f: - return f.read() - - -workflow = { - "name": "tdp_alert_pull_dedup", - "description": ( - "Long-running TDP alert puller + deduper. 
Each iteration calls the " - "tdp_log_search tool to fetch attack-level HTTP alerts in a moving time " - "window, normalizes them, filters non-HTTP/scan noise, deduplicates via " - "URI-normalized 5-gram MinHash LSH (persistent across iterations / runs), " - "and appends enriched alerts to JSONL files under " - "~/.flocks/workflows/tdp_alert_pull_dedup//alerts_NNN.jsonl. " - "A persistent time cursor at ~/.flocks/workflows/tdp_alert_pull_dedup/cursor.json " - "guarantees no gaps and no overlap across restarts." - ), - "description_cn": ( - "长时间运行的 TDP 告警拉取 + 去重 Pipeline。单个 python 节点内 while 循环:每轮调用 " - "tdp_log_search 拉取一个时间窗口内的攻击级 HTTP 告警 → 归一化 → 过滤(去扫描/非HTTP)→ " - "URI 归一化 + 5-gram MinHash LSH 去重(持久化 LSH 状态,跨轮次/跨进程共享)→ 追加写入 " - "~/.flocks/workflows/tdp_alert_pull_dedup//alerts_NNN.jsonl,每文件 10,000 条上限。" - "时间游标 ~/.flocks/workflows/tdp_alert_pull_dedup/cursor.json 持久化,重启可无重叠续拉。" - "通过 pull_interval_s / max_iterations / max_runtime_s 控制循环节奏与停止条件。" - ), - "start": "pull_dedup_loop", - "nodes": [ - { - "id": "pull_dedup_loop", - "type": "python", - "description": ( - "持续拉取 TDP 告警的主循环节点。内部 while 循环执行:调用 tdp_log_search → 归一化 → " - "过滤 → LSH 去重 → 写盘。time_from 来自持久化游标(首次回退 initial_lookback_s 秒)," - "time_to=当前时间,保证窗口连续无重叠。" - "停止条件:max_iterations / max_runtime_s 任一达到即返回;外部取消(如 SIGINT 或节点超时)也会优雅退出。" - ), - "code": read_code("_node_pull_dedup_loop.py"), - } - ], - "edges": [], - "metadata": { - "node_timeout_s": 2592000, - "sampleInputs": { - "pull_interval_s": 60, - "initial_lookback_s": 300, - "max_iterations": 0, - "max_runtime_s": 0, - "batch_size": 1000, - "net_data_types": ["attack"], - "sql": "threat.level = 'attack'", - "assets_group": [], - "filter_enabled": True, - "dedup_enabled": True, - "threshold": 0.7, - "strict_fields": ["sip", "dip"], - "lsh_fields": ["req_http_url", "req_body", "rsp_body"], - "max_field_len": 500, - "max_dedup_keys": 100000, - "reset_cursor": False, - "log_progress_every": 1, - "_comment_runtime": ( - "node_timeout_s 默认 30 
天(2,592,000s),适合长时间持续运行;" - "若想短跑测试,把 max_iterations 调小或设 max_runtime_s 即可。" - ), - "_comment_path": ( - "输出落盘根目录:~/.flocks/workflows/tdp_alert_pull_dedup//alerts_NNN.jsonl;" - "时间游标:~/.flocks/workflows/tdp_alert_pull_dedup/cursor.json;" - "LSH 持久化:~/.flocks/workflows/tdp_alert_pull_dedup/lsh_state_np128_th{int(threshold*100)}.pkl" - ), - }, - }, -} - -with open(os.path.join(HERE, "workflow.json"), "w", encoding="utf-8") as f: - json.dump(workflow, f, ensure_ascii=False, indent=2) - -print(f"wrote {os.path.join(HERE, 'workflow.json')}") diff --git a/.flocks/plugins/workflows/tdp_alert_pull_dedup/_node_pull_dedup_loop.py b/.flocks/plugins/workflows/tdp_alert_pull_dedup/_node_pull_dedup_loop.py deleted file mode 100644 index 01e8da7e5..000000000 --- a/.flocks/plugins/workflows/tdp_alert_pull_dedup/_node_pull_dedup_loop.py +++ /dev/null @@ -1,670 +0,0 @@ - -import os -import re -import sys -import json -import time -import pickle -import hashlib -import datetime -import traceback -from pathlib import Path - -from datasketch import MinHash, MinHashLSH - -IS_WINDOWS = sys.platform == 'win32' -if IS_WINDOWS: - import msvcrt # noqa: F401 -else: - import fcntl # noqa: F401 - -WORKFLOW_NAME = 'tdp_alert_pull_dedup' -MINHASH_SEED = 2024 -NUM_PERM = 128 -LSH_CLUSTER_WARN_THRESHOLD = 100000 -MAX_RECORDS_PER_FILE = 10000 -_JSONL_PREFIX = 'alerts' - -HTTP_METHODS = ['GET', 'POST', 'PUT', 'DELETE', 'HEAD', 'OPTIONS', 'PATCH', 'TRACE'] - -TDP_FIELD_MAP = { - 'customer_uuid': 'customer_uuid', - 'device_id': 'device_id', - 'id': 'id', - 'time': 'time', - 'direction': 'direction', - 'sip': 'net_real_src_ip', - 'dip': 'net_dest_ip', - 'sport': 'net_src_port', - 'dport': 'net_dest_port', - 'net_type': 'net_type', - 'net_app_proto': 'net_app_proto', - 'req_http_url': 'net_http_url', - 'req_user_agent':'net_http_reqs_user_agent', - 'req_host': 'net_http_reqs_host', - 'req_line': 'net_http_reqs_line', - 'req_header': 'net_http_reqs_header', - 'req_body': 'net_http_reqs_body', - 
'req_cookie': 'net_http_reqs_cookie', - 'req_body_len': 'net_http_reqs_content_length', - 'rsp_status_code': 'net_http_status', - 'rsp_line': 'net_http_resp_line', - 'rsp_header': 'net_http_resp_header', - 'rsp_body': 'net_http_resp_body', - 'rsp_body_len': 'net_http_resp_content_length', - 'net_bytes_toclient': 'net_bytes_toclient', - 'net_bytes_toserver': 'net_bytes_toserver', - 'threat_rule_id': 'threat_suuid', - 'threat_name': 'threat_name', - 'threat_msg': 'threat_msg', - 'threat_ioc': 'threat_ioc', - 'threat_level': 'threat_level', - 'threat_severity': 'threat_severity', - 'threat_phase': 'threat_phase', - 'threat_type': 'threat_type', - 'threat_result': 'threat_result', - 'threat_confidence': 'threat_confidence', - 'connection_established': 'established', - 'asset_group_name': 'dest_assets_group_name', - 'asset_name': 'dest_assets_latestName', -} - -NEED_ANALYSIS = { - 'alert_not_scan_http_direction_in', - 'alert_not_scan_http_direction_out', - 'alert_not_scan_http_direction_lateral', -} - -# ── Paths ───────────────────────────────────────────────────────────────────── - -def get_workflow_root(): - from flocks.config import Config - flocks_root = Config().get_global().data_dir.parent # ~/.flocks - root = Path(flocks_root) / 'workflows' / WORKFLOW_NAME - root.mkdir(parents=True, exist_ok=True) - return root - - -def get_state_paths(threshold): - base = str(get_workflow_root() / f'lsh_state_np{NUM_PERM}_th{int(threshold * 100)}') - return base + '.pkl', base + '.lock' - - -def get_cursor_path(): - return str(get_workflow_root() / 'cursor.json') - - -def get_output_dir(now): - date_str = now.strftime('%Y-%m-%d') - out_dir = get_workflow_root() / date_str - out_dir.mkdir(parents=True, exist_ok=True) - return str(out_dir) - - -# ── File locking ────────────────────────────────────────────────────────────── - -def acquire_lock(lock_path): - fh = open(lock_path, 'w+') - if IS_WINDOWS: - fh.write('L'); fh.flush(); fh.seek(0) - while True: - try: - 
msvcrt.locking(fh.fileno(), msvcrt.LK_LOCK, 1); break - except OSError: - continue - else: - fcntl.flock(fh.fileno(), fcntl.LOCK_EX) - return fh - - -def release_lock(fh): - try: - if IS_WINDOWS: - try: - fh.seek(0); msvcrt.locking(fh.fileno(), msvcrt.LK_UNLCK, 1) - except OSError: - pass - else: - fcntl.flock(fh.fileno(), fcntl.LOCK_UN) - finally: - fh.close() - - -# ── LSH state ───────────────────────────────────────────────────────────────── - -def load_state(state_path, threshold): - if not state_path or not os.path.exists(state_path) or os.path.getsize(state_path) == 0: - return None, None, None, 0 - try: - with open(state_path, 'rb') as f: - state = pickle.load(f) - if state.get('num_perm') != NUM_PERM or state.get('threshold') != threshold: - print(f'[dedup] state params mismatch, starting fresh') - return None, None, None, 0 - cache = state['lsh_cache'] - seen_raw = state.get('dedup_key_cache', {}) - seen = {k: None for k in seen_raw} if isinstance(seen_raw, set) else (seen_raw if isinstance(seen_raw, dict) else {}) - next_cid = state.get('next_cluster_id') or ((max(cache.keys()) + 1) if cache else 0) - return state['lsh_index'], cache, seen, next_cid - except Exception as e: - print(f'[dedup] failed to load state ({e}), starting fresh') - return None, None, None, 0 - - -def evict_oldest(lsh_index, lsh_cache, dedup_key_cache, max_keys): - evicted_keys = evicted_clusters = 0 - excess = len(dedup_key_cache) - max_keys - if excess > 0: - for k in list(dedup_key_cache.keys())[:excess]: - del dedup_key_cache[k] - evicted_keys = excess - excess = len(lsh_cache) - max_keys - if excess > 0: - for cid in list(lsh_cache.keys())[:excess]: - try: lsh_index.remove(cid) - except (KeyError, ValueError): pass - del lsh_cache[cid] - evicted_clusters = excess - return evicted_keys, evicted_clusters - - -def dump_state_atomic(state_path, lsh_index, lsh_cache, dedup_key_cache, threshold, next_cluster_id): - tmp = state_path + '.tmp' - try: - state = { - 'lsh_index': 
lsh_index, 'lsh_cache': lsh_cache, - 'dedup_key_cache': dedup_key_cache, 'next_cluster_id': next_cluster_id, - 'num_perm': NUM_PERM, 'threshold': threshold, - } - with open(tmp, 'wb') as f: - pickle.dump(state, f); f.flush(); os.fsync(f.fileno()) - os.replace(tmp, state_path) - except Exception as e: - print(f'[dedup] failed to save state: {e}') - if os.path.exists(tmp): - try: os.remove(tmp) - except Exception: pass - - -# ── Cursor ──────────────────────────────────────────────────────────────────── - -def load_cursor(): - p = get_cursor_path() - if not os.path.exists(p): - return None - try: - with open(p, 'r', encoding='utf-8') as f: - return json.load(f) - except Exception: - return None - - -def save_cursor(cursor): - p = get_cursor_path() - tmp = p + '.tmp' - try: - with open(tmp, 'w', encoding='utf-8') as f: - json.dump(cursor, f, ensure_ascii=False) - f.flush(); os.fsync(f.fileno()) - os.replace(tmp, p) - except Exception as e: - print(f'[cursor] save failed: {e}') - - -# ── Normalize ───────────────────────────────────────────────────────────────── - -def flatten_dict(d, prefix=''): - res = {} - for k, v in d.items(): - if isinstance(v, dict): - res.update(flatten_dict(v, f'{prefix}{k}_')) - else: - res[f'{prefix}{k}'] = v - return res - - -def normalize_single(alert): - import uuid as _uuid - if not isinstance(alert, dict): - return None - flat = flatten_dict(alert) - norm = {} - for std_key, raw_key in TDP_FIELD_MAP.items(): - norm[std_key] = flat.get(raw_key, 'none') if raw_key != 'none' else 'none' - if norm.get('id') in ('none', None, ''): - norm['id'] = str(_uuid.uuid3(_uuid.NAMESPACE_DNS, ''.join(str(v) for v in norm.values()))) - if norm.get('net_type') in ('none', None, ''): - method = flat.get('method', 'none') - norm['net_type'] = 'http' if method in HTTP_METHODS else ('none' if method == 'none' else 'other') - norm['_source_type'] = 'tdp' - return norm - - -# ── Filter ──────────────────────────────────────────────────────────────────── - -def 
is_scan_alert(threat_name): - tnl = str(threat_name or '').lower() - return ('扫描' in tnl) and ('webshell' not in tnl) - - -def is_http(alert): - for field in ('application_layer_protocol', 'net_type', 'net_app_proto'): - val = str(alert.get(field, '') or '').lower() - if val and val != 'none' and 'http' in val: - return True - return False - - -def get_process_type(alert): - threat_name = alert.get('threat_name', '') - direction = str(alert.get('direction', '') or '').lower() - scan = is_scan_alert(threat_name) - http = is_http(alert) - if scan: - return f'alert_scan_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_scan_direction_in' - if http: - return f'alert_not_scan_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_scan_http_direction_in' - return f'alert_not_scan_not_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_process' - - -# ── Dedup helpers ───────────────────────────────────────────────────────────── - -def normalize_uri(uri): - uri = str(uri or '') - uri = re.sub(r'\d{4}-\d{2}-\d{2}', 'DATETIME', uri) - uri = re.sub(r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}', 'UUID', uri, flags=re.IGNORECASE) - uri = re.sub(r'(\.\./)+', 'TRAVERSAL', uri) - uri = re.sub(r'\bNULL\b', 'NULL_REPLACED', uri) - uri = re.sub(r'chr\$\d+\$\|\|chr\$\d+\$', 'CHR_SEQUENCE', uri) - uri = re.sub(r'\b\d+={1,2}\d+\b', 'NUMBER_COMPARISON', uri) - uri = re.sub(r'\b[a-fA-F0-9]{32}\b', 'HEXADECIMAL CHARACTERS', uri) - return uri - - -def gen_minhash(text, permutations): - shingles = [text[i:i+5] for i in range(len(text) - 4)] - m = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED, permutations=permutations) - for s in shingles: - m.update(s.encode('utf-8')) - return m - - -# ── TDP response unwrapping ─────────────────────────────────────────────────── - -def extract_alerts_from_response(resp): - """tdp_log_search returns the inner 'data' field from TDP API (already unwrapped). 
- - Tolerate several shapes: - - list → used as-is - - dict with 'log' / 'logs' / 'list' / 'data' / 'records' / 'items' key - (possibly nested one level) - """ - if resp is None: - return [] - if isinstance(resp, list): - return list(resp) - if isinstance(resp, dict): - for key in ('log', 'logs', 'list', 'data', 'records', 'items', 'hits'): - v = resp.get(key) - if isinstance(v, list): - return list(v) - if isinstance(v, dict): - for sub in ('list', 'data', 'records', 'items', 'hits'): - sv = v.get(sub) - if isinstance(sv, list): - return list(sv) - return [] - - -# ── JSONL writer ────────────────────────────────────────────────────────────── - -def _count_alert_lines(file_path): - count = 0 - try: - with open(file_path, 'r', encoding='utf-8') as _f: - for _line in _f: - _s = _line.strip() - if _s and '"_type"' not in _s: - count += 1 - except Exception: - pass - return count - - -def _find_active_file(out_dir): - import glob - pattern = os.path.join(out_dir, _JSONL_PREFIX + '_*.jsonl') - existing = sorted(glob.glob(pattern)) - if not existing: - return None, 0, 0 - latest = existing[-1] - basename = os.path.basename(latest) - try: - seq = int(basename.replace(_JSONL_PREFIX + '_', '').replace('.jsonl', '')) - except ValueError: - seq = len(existing) - count = _count_alert_lines(latest) - return latest, count, seq - - -def _write_jsonl(out_dir, alerts, now): - written = [] - active_path, active_count, seq = _find_active_file(out_dir) - remaining = list(alerts) - while remaining: - available = MAX_RECORDS_PER_FILE - active_count - if available <= 0 or active_path is None: - seq += 1 - active_path = os.path.join(out_dir, f'{_JSONL_PREFIX}_{seq:03d}.jsonl') - active_count = 0 - available = MAX_RECORDS_PER_FILE - header = { - '_type': 'file_header', - 'created_at': now.isoformat(), - 'date': now.strftime('%Y-%m-%d'), - 'workflow': WORKFLOW_NAME, - 'seq': seq, - } - with open(active_path, 'w', encoding='utf-8') as _hf: - _hf.write(json.dumps(header, ensure_ascii=False) 
+ '\n') - batch = remaining[:available] - remaining = remaining[available:] - with open(active_path, 'a', encoding='utf-8') as _af: - for _alert in batch: - _af.write(json.dumps(_alert, ensure_ascii=False, default=str) + '\n') - active_count += len(batch) - if active_path not in written: - written.append(active_path) - if remaining: - active_path = None - active_count = 0 - return written - - -# ── Inputs ──────────────────────────────────────────────────────────────────── - -pull_interval_s = float(inputs.get('pull_interval_s', 60)) -initial_lookback_s = int(inputs.get('initial_lookback_s', 300)) -max_iterations = int(inputs.get('max_iterations', 0)) # 0 = infinite -max_runtime_s = float(inputs.get('max_runtime_s', 0)) # 0 = no time limit -batch_size = int(inputs.get('batch_size', 1000)) -net_data_types = inputs.get('net_data_types', ['attack']) -if isinstance(net_data_types, str): - net_data_types = [s.strip() for s in net_data_types.split(',') if s.strip()] -sql_filter = str(inputs.get('sql', "threat.level = 'attack'") or "threat.level = 'attack'") -assets_group = inputs.get('assets_group') or [] -filter_enabled = bool(inputs.get('filter_enabled', True)) -dedup_enabled = bool(inputs.get('dedup_enabled', True)) -threshold = float(inputs.get('threshold', 0.7)) -strict_fields = inputs.get('strict_fields', ['sip', 'dip']) -lsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body']) -max_field_len = int(inputs.get('max_field_len', 500)) -max_dedup_keys = int(inputs.get('max_dedup_keys', 100000)) -reset_cursor = bool(inputs.get('reset_cursor', False)) -log_progress_every = max(1, int(inputs.get('log_progress_every', 1))) - -if max_dedup_keys < 1: - max_dedup_keys = 100000 -if pull_interval_s < 0.1: - pull_interval_s = 0.1 -if batch_size < 1: - batch_size = 1 -if batch_size > 10000: - batch_size = 10000 - -print(f'[init] workflow={WORKFLOW_NAME}') -print(f'[init] pull_interval_s={pull_interval_s}, initial_lookback_s={initial_lookback_s}, ' - 
f'batch_size={batch_size}, max_iterations={max_iterations}, max_runtime_s={max_runtime_s}') -print(f'[init] sql={sql_filter!r}, net_data_types={net_data_types}, assets_group={list(assets_group) if assets_group else []}') -print(f'[init] filter_enabled={filter_enabled}, dedup_enabled={dedup_enabled}, ' - f'threshold={threshold}, max_dedup_keys={max_dedup_keys}') -print(f'[init] output_root={get_workflow_root()}') - -# ── Cursor init ─────────────────────────────────────────────────────────────── - -now_ts = int(time.time()) -if reset_cursor: - cur = None - print('[cursor] reset_cursor=True, starting from initial_lookback_s') -else: - cur = load_cursor() - -if cur and isinstance(cur.get('next_from'), int): - last_to = int(cur['next_from']) - print(f'[cursor] resumed: next_from={last_to} ({datetime.datetime.fromtimestamp(last_to)})') -else: - last_to = now_ts - initial_lookback_s - print(f'[cursor] fresh start: next_from={last_to} ({datetime.datetime.fromtimestamp(last_to)})') - -# ── MinHash permutations (init once) ────────────────────────────────────────── - -_permutations = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED).permutations -state_path, lock_path = (get_state_paths(threshold) if dedup_enabled else (None, None)) - -# ── Aggregate stats ─────────────────────────────────────────────────────────── - -stats_all = { - 'iterations': 0, - 'pulls_succeeded': 0, - 'pulls_failed': 0, - 'raw_total': 0, - 'normalized_total': 0, - 'filtered_total': 0, - 'enriched_total': 0, - 'unique_total': 0, - 'duplicates_total': 0, - 'written_files': [], - 'last_window_from': last_to, - 'last_window_to': None, - 'last_error': None, -} - -start_t = time.time() -iter_cnt = 0 -stop_reason = 'completed' - -# ── Main loop ───────────────────────────────────────────────────────────────── - -try: - while True: - iter_cnt += 1 - stats_all['iterations'] = iter_cnt - - if max_iterations and iter_cnt > max_iterations: - stop_reason = f'reached max_iterations={max_iterations}' - print(f'[loop] 
{stop_reason}') - break - if max_runtime_s and (time.time() - start_t) > max_runtime_s: - stop_reason = f'reached max_runtime_s={max_runtime_s}' - print(f'[loop] {stop_reason}') - break - - time_to_ts = int(time.time()) - time_from = last_to - if time_to_ts <= time_from: - # window not advanced yet (e.g., very short pull_interval); sleep and retry - time.sleep(pull_interval_s) - continue - - stats_all['last_window_from'] = time_from - stats_all['last_window_to'] = time_to_ts - - # ── Pull from TDP ───────────────────────────────────────────────────── - tdp_kwargs = { - 'action': 'search', - 'time_from': time_from, - 'time_to': time_to_ts, - 'net_data_type': list(net_data_types), - 'sql': sql_filter, - 'size': batch_size, - } - if assets_group: - tdp_kwargs['assets_group'] = list(assets_group) - - try: - resp = tool.run('tdp_log_search', **tdp_kwargs) - stats_all['pulls_succeeded'] += 1 - except Exception as _e: - stats_all['pulls_failed'] += 1 - stats_all['last_error'] = f'tdp_log_search failed: {_e}' - print(f'[pull] iter={iter_cnt}: tdp_log_search failed: {_e}') - # Do NOT advance the cursor on failure: we'll retry the same window next round. 
- time.sleep(pull_interval_s) - continue - - raw_alerts = extract_alerts_from_response(resp) - if iter_cnt % log_progress_every == 0: - print(f'[pull] iter={iter_cnt}: window=[{time_from},{time_to_ts}] ' - f'({datetime.datetime.fromtimestamp(time_from)} → ' - f'{datetime.datetime.fromtimestamp(time_to_ts)}), raw={len(raw_alerts)}') - stats_all['raw_total'] += len(raw_alerts) - - # ── Normalize ───────────────────────────────────────────────────────── - normalized = [] - for a in raw_alerts: - n = normalize_single(a) - if n is not None: - normalized.append(n) - stats_all['normalized_total'] += len(normalized) - - # ── Filter ──────────────────────────────────────────────────────────── - if filter_enabled: - filtered = [] - for a in normalized: - a = dict(a) - ptype = get_process_type(a) - a['_process_type'] = ptype - a['_threat_type'] = str(a.get('threat_name', 'general') or 'general') - if ptype in NEED_ANALYSIS: - filtered.append(a) - else: - filtered = [ - {**a, - '_process_type': 'filter_disabled', - '_threat_type': str(a.get('threat_name', 'general') or 'general')} - for a in normalized - ] - stats_all['filtered_total'] += len(filtered) - - # ── Dedup ───────────────────────────────────────────────────────────── - enriched = [] - if dedup_enabled and filtered: - lock_fh = acquire_lock(lock_path) - try: - lsh_index, lsh_cache, dedup_key_cache, next_cluster_id = load_state(state_path, threshold) - if lsh_index is None: - lsh_index = MinHashLSH(threshold=threshold, num_perm=NUM_PERM) - lsh_cache, dedup_key_cache, next_cluster_id = {}, {}, 0 - cid_box = [next_cluster_id] - for a in filtered: - a = dict(a) - text_strict = '. '.join(str(a.get(f, ''))[:max_field_len] for f in strict_fields) - text_lsh = normalize_uri('. 
'.join(str(a.get(f, ''))[:max_field_len] for f in lsh_fields)) - mh = gen_minhash(text_lsh.lower(), _permutations) - sim_keys = lsh_index.query(mh) - if sim_keys: - cands = sim_keys[:100] - sims = [mh.jaccard(lsh_cache[k]) for k in cands] - cluster_id = cands[sims.index(max(sims))] - else: - cluster_id = cid_box[0] - cid_box[0] += 1 - lsh_index.insert(cluster_id, mh) - lsh_cache[cluster_id] = mh - a['_lsh_cluster_id'] = cluster_id - dk = hashlib.md5(f'{text_strict}. {cluster_id}'.encode('utf-8')).hexdigest() - a['dedup_key'] = dk - already = dk in dedup_key_cache - if already: - del dedup_key_cache[dk] - dedup_key_cache[dk] = None - a['is_duplicate'] = already - enriched.append(a) - evict_oldest(lsh_index, lsh_cache, dedup_key_cache, max_dedup_keys) - if len(lsh_cache) > LSH_CLUSTER_WARN_THRESHOLD or len(dedup_key_cache) > LSH_CLUSTER_WARN_THRESHOLD: - print(f'[dedup] WARNING: persisted state holds {len(lsh_cache)} clusters and ' - f'{len(dedup_key_cache)} dedup_keys (warn={LSH_CLUSTER_WARN_THRESHOLD})') - dump_state_atomic(state_path, lsh_index, lsh_cache, dedup_key_cache, threshold, cid_box[0]) - finally: - release_lock(lock_fh) - else: - for a in filtered: - a = dict(a) - text_strict = '. '.join(str(a.get(f, ''))[:max_field_len] for f in strict_fields) - text_lsh = '. '.join(str(a.get(f, ''))[:max_field_len] for f in lsh_fields) - dk = hashlib.md5(f'{text_strict}. 
{text_lsh}'.encode('utf-8')).hexdigest() - a['_lsh_cluster_id'] = None - a['dedup_key'] = dk - a['is_duplicate'] = False - enriched.append(a) - - # Unique within this batch (first-seen by dedup_key) - seen_keys = set() - unique_count = 0 - for a in enriched: - k = a.get('dedup_key') - if k not in seen_keys: - seen_keys.add(k) - unique_count += 1 - dup_count = len(enriched) - unique_count - stats_all['enriched_total'] += len(enriched) - stats_all['unique_total'] += unique_count - stats_all['duplicates_total'] += dup_count - - if enriched and iter_cnt % log_progress_every == 0: - print(f'[dedup] iter={iter_cnt}: enriched={len(enriched)}, ' - f'unique={unique_count}, duplicates={dup_count}') - - # ── Write to disk ───────────────────────────────────────────────────── - if enriched: - try: - _now = datetime.datetime.now() - out_dir = get_output_dir(_now) - written_paths = _write_jsonl(out_dir, enriched, _now) - for p in written_paths: - if p not in stats_all['written_files']: - stats_all['written_files'].append(p) - if iter_cnt % log_progress_every == 0: - print(f'[write] iter={iter_cnt}: {len(enriched)} → {written_paths[-1] if written_paths else ""}') - except Exception as _we: - stats_all['last_error'] = f'write failed: {_we}' - print(f'[write] iter={iter_cnt}: failed: {_we}\n{traceback.format_exc()}') - - # ── Advance cursor ──────────────────────────────────────────────────── - last_to = time_to_ts - save_cursor({ - 'next_from': last_to, - 'updated_at': datetime.datetime.now().isoformat(), - 'iter': iter_cnt, - 'workflow': WORKFLOW_NAME, - }) - - time.sleep(pull_interval_s) - -except KeyboardInterrupt: - stop_reason = 'KeyboardInterrupt' - print('[loop] interrupted by user') -except Exception as _loop_err: - stop_reason = f'unhandled error: {_loop_err}' - stats_all['last_error'] = f'unhandled: {_loop_err}' - print(f'[loop] unhandled error: {_loop_err}\n{traceback.format_exc()}') - -# ── Outputs ─────────────────────────────────────────────────────────────────── - 
-summary = ( - f'{WORKFLOW_NAME} done: iters={stats_all["iterations"]}, ' - f'pulls(ok={stats_all["pulls_succeeded"]}, fail={stats_all["pulls_failed"]}), ' - f'raw={stats_all["raw_total"]}, normalized={stats_all["normalized_total"]}, ' - f'filtered={stats_all["filtered_total"]}, enriched={stats_all["enriched_total"]}, ' - f'unique={stats_all["unique_total"]} (compression ' - f'{(stats_all["duplicates_total"] / stats_all["enriched_total"]) if stats_all["enriched_total"] else 0:.1%}), ' - f'files_written={len(stats_all["written_files"])}, stop={stop_reason}' -) -print(f'[done] {summary}') - -outputs['stats'] = stats_all -outputs['summary'] = summary -outputs['stop_reason'] = stop_reason -outputs['final_cursor'] = last_to -outputs['output_paths'] = list(stats_all['written_files']) -outputs['output_path'] = stats_all['written_files'][-1] if stats_all['written_files'] else '' diff --git a/.flocks/plugins/workflows/tdp_alert_pull_dedup/workflow.json b/.flocks/plugins/workflows/tdp_alert_pull_dedup/workflow.json deleted file mode 100644 index 5b2ba7374..000000000 --- a/.flocks/plugins/workflows/tdp_alert_pull_dedup/workflow.json +++ /dev/null @@ -1,48 +0,0 @@ -{ - "name": "tdp_alert_pull_dedup", - "description": "Long-running TDP alert puller + deduper. Each iteration calls the tdp_log_search tool to fetch attack-level HTTP alerts in a moving time window, normalizes them, filters non-HTTP/scan noise, deduplicates via URI-normalized 5-gram MinHash LSH (persistent across iterations / runs), and appends enriched alerts to JSONL files under ~/.flocks/workflows/tdp_alert_pull_dedup//alerts_NNN.jsonl. 
A persistent time cursor at ~/.flocks/workflows/tdp_alert_pull_dedup/cursor.json guarantees no gaps and no overlap across restarts.", - "description_cn": "长时间运行的 TDP 告警拉取 + 去重 Pipeline。单个 python 节点内 while 循环:每轮调用 tdp_log_search 拉取一个时间窗口内的攻击级 HTTP 告警 → 归一化 → 过滤(去扫描/非HTTP)→ URI 归一化 + 5-gram MinHash LSH 去重(持久化 LSH 状态,跨轮次/跨进程共享)→ 追加写入 ~/.flocks/workflows/tdp_alert_pull_dedup//alerts_NNN.jsonl,每文件 10,000 条上限。时间游标 ~/.flocks/workflows/tdp_alert_pull_dedup/cursor.json 持久化,重启可无重叠续拉。通过 pull_interval_s / max_iterations / max_runtime_s 控制循环节奏与停止条件。", - "start": "pull_dedup_loop", - "nodes": [ - { - "id": "pull_dedup_loop", - "type": "python", - "description": "持续拉取 TDP 告警的主循环节点。内部 while 循环执行:调用 tdp_log_search → 归一化 → 过滤 → LSH 去重 → 写盘。time_from 来自持久化游标(首次回退 initial_lookback_s 秒),time_to=当前时间,保证窗口连续无重叠。停止条件:max_iterations / max_runtime_s 任一达到即返回;外部取消(如 SIGINT 或节点超时)也会优雅退出。", - "code": "\nimport os\nimport re\nimport sys\nimport json\nimport time\nimport pickle\nimport hashlib\nimport datetime\nimport traceback\nfrom pathlib import Path\n\nfrom datasketch import MinHash, MinHashLSH\n\nIS_WINDOWS = sys.platform == 'win32'\nif IS_WINDOWS:\n import msvcrt # noqa: F401\nelse:\n import fcntl # noqa: F401\n\nWORKFLOW_NAME = 'tdp_alert_pull_dedup'\nMINHASH_SEED = 2024\nNUM_PERM = 128\nLSH_CLUSTER_WARN_THRESHOLD = 100000\nMAX_RECORDS_PER_FILE = 10000\n_JSONL_PREFIX = 'alerts'\n\nHTTP_METHODS = ['GET', 'POST', 'PUT', 'DELETE', 'HEAD', 'OPTIONS', 'PATCH', 'TRACE']\n\nTDP_FIELD_MAP = {\n 'customer_uuid': 'customer_uuid',\n 'device_id': 'device_id',\n 'id': 'id',\n 'time': 'time',\n 'direction': 'direction',\n 'sip': 'net_real_src_ip',\n 'dip': 'net_dest_ip',\n 'sport': 'net_src_port',\n 'dport': 'net_dest_port',\n 'net_type': 'net_type',\n 'net_app_proto': 'net_app_proto',\n 'req_http_url': 'net_http_url',\n 'req_user_agent':'net_http_reqs_user_agent',\n 'req_host': 'net_http_reqs_host',\n 'req_line': 'net_http_reqs_line',\n 'req_header': 'net_http_reqs_header',\n 'req_body': 
'net_http_reqs_body',\n 'req_cookie': 'net_http_reqs_cookie',\n 'req_body_len': 'net_http_reqs_content_length',\n 'rsp_status_code': 'net_http_status',\n 'rsp_line': 'net_http_resp_line',\n 'rsp_header': 'net_http_resp_header',\n 'rsp_body': 'net_http_resp_body',\n 'rsp_body_len': 'net_http_resp_content_length',\n 'net_bytes_toclient': 'net_bytes_toclient',\n 'net_bytes_toserver': 'net_bytes_toserver',\n 'threat_rule_id': 'threat_suuid',\n 'threat_name': 'threat_name',\n 'threat_msg': 'threat_msg',\n 'threat_ioc': 'threat_ioc',\n 'threat_level': 'threat_level',\n 'threat_severity': 'threat_severity',\n 'threat_phase': 'threat_phase',\n 'threat_type': 'threat_type',\n 'threat_result': 'threat_result',\n 'threat_confidence': 'threat_confidence',\n 'connection_established': 'established',\n 'asset_group_name': 'dest_assets_group_name',\n 'asset_name': 'dest_assets_latestName',\n}\n\nNEED_ANALYSIS = {\n 'alert_not_scan_http_direction_in',\n 'alert_not_scan_http_direction_out',\n 'alert_not_scan_http_direction_lateral',\n}\n\n# ── Paths ─────────────────────────────────────────────────────────────────────\n\ndef get_workflow_root():\n from flocks.config import Config\n flocks_root = Config().get_global().data_dir.parent # ~/.flocks\n root = Path(flocks_root) / 'workflows' / WORKFLOW_NAME\n root.mkdir(parents=True, exist_ok=True)\n return root\n\n\ndef get_state_paths(threshold):\n base = str(get_workflow_root() / f'lsh_state_np{NUM_PERM}_th{int(threshold * 100)}')\n return base + '.pkl', base + '.lock'\n\n\ndef get_cursor_path():\n return str(get_workflow_root() / 'cursor.json')\n\n\ndef get_output_dir(now):\n date_str = now.strftime('%Y-%m-%d')\n out_dir = get_workflow_root() / date_str\n out_dir.mkdir(parents=True, exist_ok=True)\n return str(out_dir)\n\n\n# ── File locking ──────────────────────────────────────────────────────────────\n\ndef acquire_lock(lock_path):\n fh = open(lock_path, 'w+')\n if IS_WINDOWS:\n fh.write('L'); fh.flush(); fh.seek(0)\n while True:\n 
try:\n msvcrt.locking(fh.fileno(), msvcrt.LK_LOCK, 1); break\n except OSError:\n continue\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_EX)\n return fh\n\n\ndef release_lock(fh):\n try:\n if IS_WINDOWS:\n try:\n fh.seek(0); msvcrt.locking(fh.fileno(), msvcrt.LK_UNLCK, 1)\n except OSError:\n pass\n else:\n fcntl.flock(fh.fileno(), fcntl.LOCK_UN)\n finally:\n fh.close()\n\n\n# ── LSH state ─────────────────────────────────────────────────────────────────\n\ndef load_state(state_path, threshold):\n if not state_path or not os.path.exists(state_path) or os.path.getsize(state_path) == 0:\n return None, None, None, 0\n try:\n with open(state_path, 'rb') as f:\n state = pickle.load(f)\n if state.get('num_perm') != NUM_PERM or state.get('threshold') != threshold:\n print(f'[dedup] state params mismatch, starting fresh')\n return None, None, None, 0\n cache = state['lsh_cache']\n seen_raw = state.get('dedup_key_cache', {})\n seen = {k: None for k in seen_raw} if isinstance(seen_raw, set) else (seen_raw if isinstance(seen_raw, dict) else {})\n next_cid = state.get('next_cluster_id') or ((max(cache.keys()) + 1) if cache else 0)\n return state['lsh_index'], cache, seen, next_cid\n except Exception as e:\n print(f'[dedup] failed to load state ({e}), starting fresh')\n return None, None, None, 0\n\n\ndef evict_oldest(lsh_index, lsh_cache, dedup_key_cache, max_keys):\n evicted_keys = evicted_clusters = 0\n excess = len(dedup_key_cache) - max_keys\n if excess > 0:\n for k in list(dedup_key_cache.keys())[:excess]:\n del dedup_key_cache[k]\n evicted_keys = excess\n excess = len(lsh_cache) - max_keys\n if excess > 0:\n for cid in list(lsh_cache.keys())[:excess]:\n try: lsh_index.remove(cid)\n except (KeyError, ValueError): pass\n del lsh_cache[cid]\n evicted_clusters = excess\n return evicted_keys, evicted_clusters\n\n\ndef dump_state_atomic(state_path, lsh_index, lsh_cache, dedup_key_cache, threshold, next_cluster_id):\n tmp = state_path + '.tmp'\n try:\n state = {\n 'lsh_index': 
lsh_index, 'lsh_cache': lsh_cache,\n 'dedup_key_cache': dedup_key_cache, 'next_cluster_id': next_cluster_id,\n 'num_perm': NUM_PERM, 'threshold': threshold,\n }\n with open(tmp, 'wb') as f:\n pickle.dump(state, f); f.flush(); os.fsync(f.fileno())\n os.replace(tmp, state_path)\n except Exception as e:\n print(f'[dedup] failed to save state: {e}')\n if os.path.exists(tmp):\n try: os.remove(tmp)\n except Exception: pass\n\n\n# ── Cursor ────────────────────────────────────────────────────────────────────\n\ndef load_cursor():\n p = get_cursor_path()\n if not os.path.exists(p):\n return None\n try:\n with open(p, 'r', encoding='utf-8') as f:\n return json.load(f)\n except Exception:\n return None\n\n\ndef save_cursor(cursor):\n p = get_cursor_path()\n tmp = p + '.tmp'\n try:\n with open(tmp, 'w', encoding='utf-8') as f:\n json.dump(cursor, f, ensure_ascii=False)\n f.flush(); os.fsync(f.fileno())\n os.replace(tmp, p)\n except Exception as e:\n print(f'[cursor] save failed: {e}')\n\n\n# ── Normalize ─────────────────────────────────────────────────────────────────\n\ndef flatten_dict(d, prefix=''):\n res = {}\n for k, v in d.items():\n if isinstance(v, dict):\n res.update(flatten_dict(v, f'{prefix}{k}_'))\n else:\n res[f'{prefix}{k}'] = v\n return res\n\n\ndef normalize_single(alert):\n import uuid as _uuid\n if not isinstance(alert, dict):\n return None\n flat = flatten_dict(alert)\n norm = {}\n for std_key, raw_key in TDP_FIELD_MAP.items():\n norm[std_key] = flat.get(raw_key, 'none') if raw_key != 'none' else 'none'\n if norm.get('id') in ('none', None, ''):\n norm['id'] = str(_uuid.uuid3(_uuid.NAMESPACE_DNS, ''.join(str(v) for v in norm.values())))\n if norm.get('net_type') in ('none', None, ''):\n method = flat.get('method', 'none')\n norm['net_type'] = 'http' if method in HTTP_METHODS else ('none' if method == 'none' else 'other')\n norm['_source_type'] = 'tdp'\n return norm\n\n\n# ── Filter ────────────────────────────────────────────────────────────────────\n\ndef 
is_scan_alert(threat_name):\n tnl = str(threat_name or '').lower()\n return ('扫描' in tnl) and ('webshell' not in tnl)\n\n\ndef is_http(alert):\n for field in ('application_layer_protocol', 'net_type', 'net_app_proto'):\n val = str(alert.get(field, '') or '').lower()\n if val and val != 'none' and 'http' in val:\n return True\n return False\n\n\ndef get_process_type(alert):\n threat_name = alert.get('threat_name', '')\n direction = str(alert.get('direction', '') or '').lower()\n scan = is_scan_alert(threat_name)\n http = is_http(alert)\n if scan:\n return f'alert_scan_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_scan_direction_in'\n if http:\n return f'alert_not_scan_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_scan_http_direction_in'\n return f'alert_not_scan_not_http_direction_{direction}' if direction in ('in', 'out', 'lateral') else 'alert_not_process'\n\n\n# ── Dedup helpers ─────────────────────────────────────────────────────────────\n\ndef normalize_uri(uri):\n uri = str(uri or '')\n uri = re.sub(r'\\d{4}-\\d{2}-\\d{2}', 'DATETIME', uri)\n uri = re.sub(r'[\\da-f]{8}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{4}-[\\da-f]{12}', 'UUID', uri, flags=re.IGNORECASE)\n uri = re.sub(r'(\\.\\./)+', 'TRAVERSAL', uri)\n uri = re.sub(r'\\bNULL\\b', 'NULL_REPLACED', uri)\n uri = re.sub(r'chr\\$\\d+\\$\\|\\|chr\\$\\d+\\$', 'CHR_SEQUENCE', uri)\n uri = re.sub(r'\\b\\d+={1,2}\\d+\\b', 'NUMBER_COMPARISON', uri)\n uri = re.sub(r'\\b[a-fA-F0-9]{32}\\b', 'HEXADECIMAL CHARACTERS', uri)\n return uri\n\n\ndef gen_minhash(text, permutations):\n shingles = [text[i:i+5] for i in range(len(text) - 4)]\n m = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED, permutations=permutations)\n for s in shingles:\n m.update(s.encode('utf-8'))\n return m\n\n\n# ── TDP response unwrapping ───────────────────────────────────────────────────\n\ndef extract_alerts_from_response(resp):\n \"\"\"tdp_log_search returns the inner 'data' field from 
TDP API (already unwrapped).\n\n Tolerate several shapes:\n - list → used as-is\n - dict with 'log' / 'logs' / 'list' / 'data' / 'records' / 'items' key\n (possibly nested one level)\n \"\"\"\n if resp is None:\n return []\n if isinstance(resp, list):\n return list(resp)\n if isinstance(resp, dict):\n for key in ('log', 'logs', 'list', 'data', 'records', 'items', 'hits'):\n v = resp.get(key)\n if isinstance(v, list):\n return list(v)\n if isinstance(v, dict):\n for sub in ('list', 'data', 'records', 'items', 'hits'):\n sv = v.get(sub)\n if isinstance(sv, list):\n return list(sv)\n return []\n\n\n# ── JSONL writer ──────────────────────────────────────────────────────────────\n\ndef _count_alert_lines(file_path):\n count = 0\n try:\n with open(file_path, 'r', encoding='utf-8') as _f:\n for _line in _f:\n _s = _line.strip()\n if _s and '\"_type\"' not in _s:\n count += 1\n except Exception:\n pass\n return count\n\n\ndef _find_active_file(out_dir):\n import glob\n pattern = os.path.join(out_dir, _JSONL_PREFIX + '_*.jsonl')\n existing = sorted(glob.glob(pattern))\n if not existing:\n return None, 0, 0\n latest = existing[-1]\n basename = os.path.basename(latest)\n try:\n seq = int(basename.replace(_JSONL_PREFIX + '_', '').replace('.jsonl', ''))\n except ValueError:\n seq = len(existing)\n count = _count_alert_lines(latest)\n return latest, count, seq\n\n\ndef _write_jsonl(out_dir, alerts, now):\n written = []\n active_path, active_count, seq = _find_active_file(out_dir)\n remaining = list(alerts)\n while remaining:\n available = MAX_RECORDS_PER_FILE - active_count\n if available <= 0 or active_path is None:\n seq += 1\n active_path = os.path.join(out_dir, f'{_JSONL_PREFIX}_{seq:03d}.jsonl')\n active_count = 0\n available = MAX_RECORDS_PER_FILE\n header = {\n '_type': 'file_header',\n 'created_at': now.isoformat(),\n 'date': now.strftime('%Y-%m-%d'),\n 'workflow': WORKFLOW_NAME,\n 'seq': seq,\n }\n with open(active_path, 'w', encoding='utf-8') as _hf:\n 
_hf.write(json.dumps(header, ensure_ascii=False) + '\\n')\n batch = remaining[:available]\n remaining = remaining[available:]\n with open(active_path, 'a', encoding='utf-8') as _af:\n for _alert in batch:\n _af.write(json.dumps(_alert, ensure_ascii=False, default=str) + '\\n')\n active_count += len(batch)\n if active_path not in written:\n written.append(active_path)\n if remaining:\n active_path = None\n active_count = 0\n return written\n\n\n# ── Inputs ────────────────────────────────────────────────────────────────────\n\npull_interval_s = float(inputs.get('pull_interval_s', 60))\ninitial_lookback_s = int(inputs.get('initial_lookback_s', 300))\nmax_iterations = int(inputs.get('max_iterations', 0)) # 0 = infinite\nmax_runtime_s = float(inputs.get('max_runtime_s', 0)) # 0 = no time limit\nbatch_size = int(inputs.get('batch_size', 1000))\nnet_data_types = inputs.get('net_data_types', ['attack'])\nif isinstance(net_data_types, str):\n net_data_types = [s.strip() for s in net_data_types.split(',') if s.strip()]\nsql_filter = str(inputs.get('sql', \"threat.level = 'attack'\") or \"threat.level = 'attack'\")\nassets_group = inputs.get('assets_group') or []\nfilter_enabled = bool(inputs.get('filter_enabled', True))\ndedup_enabled = bool(inputs.get('dedup_enabled', True))\nthreshold = float(inputs.get('threshold', 0.7))\nstrict_fields = inputs.get('strict_fields', ['sip', 'dip'])\nlsh_fields = inputs.get('lsh_fields', ['req_http_url', 'req_body', 'rsp_body'])\nmax_field_len = int(inputs.get('max_field_len', 500))\nmax_dedup_keys = int(inputs.get('max_dedup_keys', 100000))\nreset_cursor = bool(inputs.get('reset_cursor', False))\nlog_progress_every = max(1, int(inputs.get('log_progress_every', 1)))\n\nif max_dedup_keys < 1:\n max_dedup_keys = 100000\nif pull_interval_s < 0.1:\n pull_interval_s = 0.1\nif batch_size < 1:\n batch_size = 1\nif batch_size > 10000:\n batch_size = 10000\n\nprint(f'[init] workflow={WORKFLOW_NAME}')\nprint(f'[init] 
pull_interval_s={pull_interval_s}, initial_lookback_s={initial_lookback_s}, '\n f'batch_size={batch_size}, max_iterations={max_iterations}, max_runtime_s={max_runtime_s}')\nprint(f'[init] sql={sql_filter!r}, net_data_types={net_data_types}, assets_group={list(assets_group) if assets_group else []}')\nprint(f'[init] filter_enabled={filter_enabled}, dedup_enabled={dedup_enabled}, '\n f'threshold={threshold}, max_dedup_keys={max_dedup_keys}')\nprint(f'[init] output_root={get_workflow_root()}')\n\n# ── Cursor init ───────────────────────────────────────────────────────────────\n\nnow_ts = int(time.time())\nif reset_cursor:\n cur = None\n print('[cursor] reset_cursor=True, starting from initial_lookback_s')\nelse:\n cur = load_cursor()\n\nif cur and isinstance(cur.get('next_from'), int):\n last_to = int(cur['next_from'])\n print(f'[cursor] resumed: next_from={last_to} ({datetime.datetime.fromtimestamp(last_to)})')\nelse:\n last_to = now_ts - initial_lookback_s\n print(f'[cursor] fresh start: next_from={last_to} ({datetime.datetime.fromtimestamp(last_to)})')\n\n# ── MinHash permutations (init once) ──────────────────────────────────────────\n\n_permutations = MinHash(num_perm=NUM_PERM, seed=MINHASH_SEED).permutations\nstate_path, lock_path = (get_state_paths(threshold) if dedup_enabled else (None, None))\n\n# ── Aggregate stats ───────────────────────────────────────────────────────────\n\nstats_all = {\n 'iterations': 0,\n 'pulls_succeeded': 0,\n 'pulls_failed': 0,\n 'raw_total': 0,\n 'normalized_total': 0,\n 'filtered_total': 0,\n 'enriched_total': 0,\n 'unique_total': 0,\n 'duplicates_total': 0,\n 'written_files': [],\n 'last_window_from': last_to,\n 'last_window_to': None,\n 'last_error': None,\n}\n\nstart_t = time.time()\niter_cnt = 0\nstop_reason = 'completed'\n\n# ── Main loop ─────────────────────────────────────────────────────────────────\n\ntry:\n while True:\n iter_cnt += 1\n stats_all['iterations'] = iter_cnt\n\n if max_iterations and iter_cnt > 
max_iterations:\n stop_reason = f'reached max_iterations={max_iterations}'\n print(f'[loop] {stop_reason}')\n break\n if max_runtime_s and (time.time() - start_t) > max_runtime_s:\n stop_reason = f'reached max_runtime_s={max_runtime_s}'\n print(f'[loop] {stop_reason}')\n break\n\n time_to_ts = int(time.time())\n time_from = last_to\n if time_to_ts <= time_from:\n # window not advanced yet (e.g., very short pull_interval); sleep and retry\n time.sleep(pull_interval_s)\n continue\n\n stats_all['last_window_from'] = time_from\n stats_all['last_window_to'] = time_to_ts\n\n # ── Pull from TDP ─────────────────────────────────────────────────────\n tdp_kwargs = {\n 'action': 'search',\n 'time_from': time_from,\n 'time_to': time_to_ts,\n 'net_data_type': list(net_data_types),\n 'sql': sql_filter,\n 'size': batch_size,\n }\n if assets_group:\n tdp_kwargs['assets_group'] = list(assets_group)\n\n try:\n resp = tool.run('tdp_log_search', **tdp_kwargs)\n stats_all['pulls_succeeded'] += 1\n except Exception as _e:\n stats_all['pulls_failed'] += 1\n stats_all['last_error'] = f'tdp_log_search failed: {_e}'\n print(f'[pull] iter={iter_cnt}: tdp_log_search failed: {_e}')\n # Do NOT advance the cursor on failure: we'll retry the same window next round.\n time.sleep(pull_interval_s)\n continue\n\n raw_alerts = extract_alerts_from_response(resp)\n if iter_cnt % log_progress_every == 0:\n print(f'[pull] iter={iter_cnt}: window=[{time_from},{time_to_ts}] '\n f'({datetime.datetime.fromtimestamp(time_from)} → '\n f'{datetime.datetime.fromtimestamp(time_to_ts)}), raw={len(raw_alerts)}')\n stats_all['raw_total'] += len(raw_alerts)\n\n # ── Normalize ─────────────────────────────────────────────────────────\n normalized = []\n for a in raw_alerts:\n n = normalize_single(a)\n if n is not None:\n normalized.append(n)\n stats_all['normalized_total'] += len(normalized)\n\n # ── Filter ────────────────────────────────────────────────────────────\n if filter_enabled:\n filtered = []\n for a in 
normalized:\n a = dict(a)\n ptype = get_process_type(a)\n a['_process_type'] = ptype\n a['_threat_type'] = str(a.get('threat_name', 'general') or 'general')\n if ptype in NEED_ANALYSIS:\n filtered.append(a)\n else:\n filtered = [\n {**a,\n '_process_type': 'filter_disabled',\n '_threat_type': str(a.get('threat_name', 'general') or 'general')}\n for a in normalized\n ]\n stats_all['filtered_total'] += len(filtered)\n\n # ── Dedup ─────────────────────────────────────────────────────────────\n enriched = []\n if dedup_enabled and filtered:\n lock_fh = acquire_lock(lock_path)\n try:\n lsh_index, lsh_cache, dedup_key_cache, next_cluster_id = load_state(state_path, threshold)\n if lsh_index is None:\n lsh_index = MinHashLSH(threshold=threshold, num_perm=NUM_PERM)\n lsh_cache, dedup_key_cache, next_cluster_id = {}, {}, 0\n cid_box = [next_cluster_id]\n for a in filtered:\n a = dict(a)\n text_strict = '. '.join(str(a.get(f, ''))[:max_field_len] for f in strict_fields)\n text_lsh = normalize_uri('. '.join(str(a.get(f, ''))[:max_field_len] for f in lsh_fields))\n mh = gen_minhash(text_lsh.lower(), _permutations)\n sim_keys = lsh_index.query(mh)\n if sim_keys:\n cands = sim_keys[:100]\n sims = [mh.jaccard(lsh_cache[k]) for k in cands]\n cluster_id = cands[sims.index(max(sims))]\n else:\n cluster_id = cid_box[0]\n cid_box[0] += 1\n lsh_index.insert(cluster_id, mh)\n lsh_cache[cluster_id] = mh\n a['_lsh_cluster_id'] = cluster_id\n dk = hashlib.md5(f'{text_strict}. 
{cluster_id}'.encode('utf-8')).hexdigest()\n a['dedup_key'] = dk\n already = dk in dedup_key_cache\n if already:\n del dedup_key_cache[dk]\n dedup_key_cache[dk] = None\n a['is_duplicate'] = already\n enriched.append(a)\n evict_oldest(lsh_index, lsh_cache, dedup_key_cache, max_dedup_keys)\n if len(lsh_cache) > LSH_CLUSTER_WARN_THRESHOLD or len(dedup_key_cache) > LSH_CLUSTER_WARN_THRESHOLD:\n print(f'[dedup] WARNING: persisted state holds {len(lsh_cache)} clusters and '\n f'{len(dedup_key_cache)} dedup_keys (warn={LSH_CLUSTER_WARN_THRESHOLD})')\n dump_state_atomic(state_path, lsh_index, lsh_cache, dedup_key_cache, threshold, cid_box[0])\n finally:\n release_lock(lock_fh)\n else:\n for a in filtered:\n a = dict(a)\n text_strict = '. '.join(str(a.get(f, ''))[:max_field_len] for f in strict_fields)\n text_lsh = '. '.join(str(a.get(f, ''))[:max_field_len] for f in lsh_fields)\n dk = hashlib.md5(f'{text_strict}. {text_lsh}'.encode('utf-8')).hexdigest()\n a['_lsh_cluster_id'] = None\n a['dedup_key'] = dk\n a['is_duplicate'] = False\n enriched.append(a)\n\n # Unique within this batch (first-seen by dedup_key)\n seen_keys = set()\n unique_count = 0\n for a in enriched:\n k = a.get('dedup_key')\n if k not in seen_keys:\n seen_keys.add(k)\n unique_count += 1\n dup_count = len(enriched) - unique_count\n stats_all['enriched_total'] += len(enriched)\n stats_all['unique_total'] += unique_count\n stats_all['duplicates_total'] += dup_count\n\n if enriched and iter_cnt % log_progress_every == 0:\n print(f'[dedup] iter={iter_cnt}: enriched={len(enriched)}, '\n f'unique={unique_count}, duplicates={dup_count}')\n\n # ── Write to disk ─────────────────────────────────────────────────────\n if enriched:\n try:\n _now = datetime.datetime.now()\n out_dir = get_output_dir(_now)\n written_paths = _write_jsonl(out_dir, enriched, _now)\n for p in written_paths:\n if p not in stats_all['written_files']:\n stats_all['written_files'].append(p)\n if iter_cnt % log_progress_every == 0:\n 
print(f'[write] iter={iter_cnt}: {len(enriched)} → {written_paths[-1] if written_paths else \"\"}')\n except Exception as _we:\n stats_all['last_error'] = f'write failed: {_we}'\n print(f'[write] iter={iter_cnt}: failed: {_we}\\n{traceback.format_exc()}')\n\n # ── Advance cursor ────────────────────────────────────────────────────\n last_to = time_to_ts\n save_cursor({\n 'next_from': last_to,\n 'updated_at': datetime.datetime.now().isoformat(),\n 'iter': iter_cnt,\n 'workflow': WORKFLOW_NAME,\n })\n\n time.sleep(pull_interval_s)\n\nexcept KeyboardInterrupt:\n stop_reason = 'KeyboardInterrupt'\n print('[loop] interrupted by user')\nexcept Exception as _loop_err:\n stop_reason = f'unhandled error: {_loop_err}'\n stats_all['last_error'] = f'unhandled: {_loop_err}'\n print(f'[loop] unhandled error: {_loop_err}\\n{traceback.format_exc()}')\n\n# ── Outputs ───────────────────────────────────────────────────────────────────\n\nsummary = (\n f'{WORKFLOW_NAME} done: iters={stats_all[\"iterations\"]}, '\n f'pulls(ok={stats_all[\"pulls_succeeded\"]}, fail={stats_all[\"pulls_failed\"]}), '\n f'raw={stats_all[\"raw_total\"]}, normalized={stats_all[\"normalized_total\"]}, '\n f'filtered={stats_all[\"filtered_total\"]}, enriched={stats_all[\"enriched_total\"]}, '\n f'unique={stats_all[\"unique_total\"]} (compression '\n f'{(stats_all[\"duplicates_total\"] / stats_all[\"enriched_total\"]) if stats_all[\"enriched_total\"] else 0:.1%}), '\n f'files_written={len(stats_all[\"written_files\"])}, stop={stop_reason}'\n)\nprint(f'[done] {summary}')\n\noutputs['stats'] = stats_all\noutputs['summary'] = summary\noutputs['stop_reason'] = stop_reason\noutputs['final_cursor'] = last_to\noutputs['output_paths'] = list(stats_all['written_files'])\noutputs['output_path'] = stats_all['written_files'][-1] if stats_all['written_files'] else ''\n" - } - ], - "edges": [], - "metadata": { - "node_timeout_s": 2592000, - "sampleInputs": { - "pull_interval_s": 60, - "initial_lookback_s": 300, - 
"max_iterations": 0, - "max_runtime_s": 0, - "batch_size": 1000, - "net_data_types": [ - "attack" - ], - "sql": "threat.level = 'attack'", - "assets_group": [], - "filter_enabled": true, - "dedup_enabled": true, - "threshold": 0.7, - "strict_fields": [ - "sip", - "dip" - ], - "lsh_fields": [ - "req_http_url", - "req_body", - "rsp_body" - ], - "max_field_len": 500, - "max_dedup_keys": 100000, - "reset_cursor": false, - "log_progress_every": 1, - "_comment_runtime": "node_timeout_s 默认 30 天(2,592,000s),适合长时间持续运行;若想短跑测试,把 max_iterations 调小或设 max_runtime_s 即可。", - "_comment_path": "输出落盘根目录:~/.flocks/workflows/tdp_alert_pull_dedup//alerts_NNN.jsonl;时间游标:~/.flocks/workflows/tdp_alert_pull_dedup/cursor.json;LSH 持久化:~/.flocks/workflows/tdp_alert_pull_dedup/lsh_state_np128_th{int(threshold*100)}.pkl" - } - } -} \ No newline at end of file diff --git a/.flocks/plugins/workflows/tdp_alert_pull_dedup/workflow.md b/.flocks/plugins/workflows/tdp_alert_pull_dedup/workflow.md deleted file mode 100644 index 23ad2cafa..000000000 --- a/.flocks/plugins/workflows/tdp_alert_pull_dedup/workflow.md +++ /dev/null @@ -1,219 +0,0 @@ -# tdp_alert_pull_dedup - -**TDP 告警持续拉取 + 去重 Pipeline** - -参考 `stream_alert_dedup` 的处理流水线,把"数据源"从 syslog 监听换成主动调用 TDP v3.3.10 的 `tdp_log_search` 工具拉取。 - -工作流启动后,单个节点内部以 `while` 循环持续运行:每轮从 TDP 拉取一个时间窗口的告警 → 归一化 → 过滤(去扫描/非HTTP)→ MinHash LSH 去重 → 追加写入按日期切分的 JSONL 文件,直到达到 `max_iterations` / `max_runtime_s` 任一停止条件,或被外部取消。 - -## 工作流图 - -``` -pull_dedup_loop (循环节点;启动后持续运行) - │ - ├─ tool.run('tdp_log_search', ...) 
── TDP 告警拉取 - ├─ normalize ── 字段映射到统一 schema - ├─ filter_logs ── 9 分类,保留非扫描 HTTP 告警 - ├─ dedup ── URI 归一化 + 5-gram MinHash LSH - └─ append JSONL ── 按日期 + 序号分卷写盘 - ▲ - │ - advance time cursor (持久化) -``` - -## 输入参数 - -| 参数 | 类型 | 默认值 | 说明 | -|------|------|--------|------| -| `pull_interval_s` | `float` | `60` | 每两次拉取之间的休眠秒数(不含 TDP 响应时间) | -| `initial_lookback_s` | `int` | `300` | 首次启动(无持久化游标时)从 `now - initial_lookback_s` 开始 | -| `max_iterations` | `int` | `0` | 最大循环次数,`0` 表示无限循环直到外部取消 | -| `max_runtime_s` | `float` | `0` | 最长运行时长,`0` 表示无限制 | -| `batch_size` | `int` | `1000` | 单次 TDP 拉取的最大告警数(映射到 `size`,上限 10000) | -| `net_data_types` | `list[str]` | `["attack"]` | 传给 `tdp_log_search` 的 `net_data_type`,可选 `attack` / `risk` / `action` | -| `sql` | `str` | `"threat.level = 'attack'"` | TDP 过滤表达式(**不是完整 SQL**),用于过滤拉取范围 | -| `assets_group` | `list[int]` | `[]` | 业务组 ID 列表,可选 | -| `filter_enabled` | `bool` | `true` | 是否启用 9 分类过滤(去扫描 / 仅留 HTTP) | -| `dedup_enabled` | `bool` | `true` | 是否启用 LSH 去重(关闭后仅记录原始 dedup_key、不跨批次感知) | -| `threshold` | `float` | `0.7` | Jaccard 相似度阈值 | -| `strict_fields` | `list[str]` | `["sip","dip"]` | 精确匹配字段(拼接进 dedup_key) | -| `lsh_fields` | `list[str]` | `["req_http_url","req_body","rsp_body"]` | 模糊匹配字段(URI 归一化后做 MinHash) | -| `max_field_len` | `int` | `500` | 单字段截断长度 | -| `max_dedup_keys` | `int` | `100000` | LSH 状态 FIFO LRU 上限 | -| `reset_cursor` | `bool` | `false` | `true` 时忽略已有游标,重新从 `now - initial_lookback_s` 开始 | -| `log_progress_every` | `int` | `1` | 每隔 N 轮打印一次进度日志(避免日志过于频繁) | - -> ⚠️ 启动时**不要**传 `time_from` / `time_to`,工作流会自己用游标推进。要从指定时间开始重拉,把 `reset_cursor` 设为 `true` 并调整 `initial_lookback_s`。 - -## 输出参数 - -工作流执行结束(达到停止条件或被取消)后写入: - -| 字段 | 类型 | 说明 | -|------|------|------| -| `summary` | `str` | 一行摘要(iters / pulls / raw / unique / files / stop_reason) | -| `stop_reason` | `str` | 退出原因:`completed` / `reached max_iterations=N` / `reached max_runtime_s=X` / `KeyboardInterrupt` / `unhandled error: ...` | -| `final_cursor` | `int` | 
最后一次成功推进到的时间戳(下次启动从此处继续) | -| `output_paths` | `list[str]` | 本次运行写入的所有 JSONL 文件路径 | -| `output_path` | `str` | 最后写入的 JSONL 文件路径(便于单值消费) | -| `stats` | `dict` | 完整统计(见下表) | - -### stats 字段 - -| 字段 | 说明 | -|------|------| -| `iterations` | 实际执行的循环轮次数 | -| `pulls_succeeded` / `pulls_failed` | TDP API 调用成功 / 失败次数 | -| `raw_total` | 从 TDP 拉到的原始告警数总和 | -| `normalized_total` | 归一化后告警数总和 | -| `filtered_total` | 过滤后保留的告警数总和(filter_enabled=true 时) | -| `enriched_total` | 经过去重处理的告警总数(含重复) | -| `unique_total` | 唯一 dedup_key 数总和 | -| `duplicates_total` | 被识别为重复的告警数 | -| `written_files` | 本次运行追加写入的所有文件路径列表 | -| `last_window_from` / `last_window_to` | 最近一次拉取的时间窗口 | -| `last_error` | 最近一次错误描述(无错误时为 `null`) | - -## 文件落盘 - -### 告警结果(每轮追加) - -``` -~/.flocks/workflows/tdp_alert_pull_dedup//alerts_NNN.jsonl -``` - -- **JSONL 格式**:每行一个 JSON 对象。 -- **首行**:`{"_type":"file_header", "created_at":..., "date":..., "workflow":"tdp_alert_pull_dedup", "seq":N}`(不计入告警条数)。 -- **后续行**:每行一条 enriched_alert(归一化字段 + 去重字段)。 -- **分卷规则**:每文件最多 **10,000 条**告警,超出时自动新建 `alerts_002.jsonl`、`003.jsonl`… -- **跨天滚动**:每轮检测当前日期,自动写入新的 `/` 目录。 - -### 时间游标(断点续传) - -``` -~/.flocks/workflows/tdp_alert_pull_dedup/cursor.json -``` - -```json -{ - "next_from": 1715501234, - "updated_at": "2026-05-12T15:43:54.123456", - "iter": 42, - "workflow": "tdp_alert_pull_dedup" -} -``` - -- 每轮成功完成后原子写入。 -- 重启工作流时自动加载,继续从 `next_from` 推进,**无重叠也无空洞**。 -- TDP 调用失败时不推进游标,下一轮会重试同一个时间窗口。 - -### LSH 去重状态 - -``` -~/.flocks/workflows/tdp_alert_pull_dedup/lsh_state_np128_th{int(threshold*100)}.pkl -~/.flocks/workflows/tdp_alert_pull_dedup/lsh_state_np128_th{int(threshold*100)}.lock -``` - -- 原子写入 + 文件锁(跨进程安全)。 -- FIFO LRU 淘汰:达到 `max_dedup_keys` 阈值后逐出最早条目。 -- 不同 `threshold` 互相独立(避免不同阈值之间状态混淆)。 - -### 每条 enriched_alert 的增强字段 - -| 字段 | 说明 | -|------|------| -| `dedup_key` | MD5 去重键(`strict_fields + cluster_id` 的哈希) | -| `is_duplicate` | 是否已在历史轮次中出现过(跨轮持久化感知) | -| `_lsh_cluster_id` | MinHash LSH 簇 ID(`dedup_enabled=false` 时为 `null`) | -| 
`_source_type` | 固定为 `tdp`(数据源) | -| `_process_type` | 过滤分类(如 `alert_not_scan_http_direction_in`) | -| `_threat_type` | 威胁类型字符串(同 `threat_name`) | - -## 节点说明 - -### `pull_dedup_loop`(唯一节点,长时间运行) - -`type: python`,`metadata.node_timeout_s = 2,592,000`(30 天)。 - -主循环步骤(每轮): -1. **时间窗口计算**:`time_from = 上次 time_to`(首次为 `now - initial_lookback_s`),`time_to = 当前时间戳`。 -2. **TDP 拉取**:`tool.run('tdp_log_search', action='search', time_from, time_to, net_data_type, sql, size)`,失败时 `pulls_failed++` 且**不**推进游标,下轮重试同一窗口。 -3. **响应解包**:自动识别 `list` / `{"log":[...]}` / `{"list":[...]}` / `{"data":[...]}` 等常见 TDP 返回结构。 -4. **归一化**:仅 TDP,复用 `stream_alert_dedup` 的 `TDP_FIELD_MAP`(嵌套字段也支持)。 -5. **过滤**:9 分类,保留 `alert_not_scan_http_direction_{in|out|lateral}`。 -6. **去重**:URI 归一化 + 5-gram MinHash LSH,跨轮 / 跨进程持久化。 -7. **写盘**:JSONL 追加,达到 10,000 条自动滚卷。 -8. **推进游标**:成功完成后 atomic 写入 `cursor.json`。 -9. **休眠**:`pull_interval_s` 秒。 - -退出条件: -- `iter > max_iterations`(且 `max_iterations > 0`) -- `elapsed > max_runtime_s`(且 `max_runtime_s > 0`) -- `KeyboardInterrupt` / 节点取消 -- 不可恢复异常(已被 catch,会写入 `stats.last_error`) - -## 与 stream_alert_dedup 的差异 - -| 维度 | stream_alert_dedup | tdp_alert_pull_dedup | -|------|--------------------|----------------------| -| 数据来源 | syslog 监听 / `alerts` / `alert_file` 三选一 | 主动调用 `tdp_log_search` 工具 | -| 触发方式 | 外部事件驱动(每收到一条触发一次工作流) | 工作流自身长时间运行(while 循环) | -| 多源支持 | TDP + Skyeye 自动识别 | 仅 TDP(数据来源固定) | -| 时间游标 | 无(事件驱动无需游标) | 持久化游标,断点续传 | -| 落盘路径 | `~/.flocks/workspace/workflows/stream_alert_dedup//dedup_result_NNN.jsonl` | `~/.flocks/workflows/tdp_alert_pull_dedup//alerts_NNN.jsonl` | -| LSH 状态 | `~/.flocks/workspace/workflows/stream_alert_dedup/lsh_state_*.pkl` | `~/.flocks/workflows/tdp_alert_pull_dedup/lsh_state_*.pkl` | - -> 两个工作流维护**独立**的 LSH 状态与去重历史,不会互相干扰。 - -## 运行方式 - -### 1. 通过 webui 启动 - -打开 webui → Workflows → `tdp_alert_pull_dedup` → 点击运行;可在右侧 RunTab 调整默认 `sampleInputs`。 - -### 2. 
通过 API 启动 - -```bash -curl -s -X POST http://localhost:8000/api/workflow/tdp_alert_pull_dedup/run \ - -H 'Content-Type: application/json' \ - -d '{ - "inputs": { - "pull_interval_s": 60, - "initial_lookback_s": 600, - "max_iterations": 0, - "batch_size": 500, - "net_data_types": ["attack"], - "sql": "threat.level = '\''attack'\''", - "filter_enabled": true, - "dedup_enabled": true, - "threshold": 0.7 - } - }' -``` - -### 3. 短跑测试 - -```json -{ - "max_iterations": 5, - "pull_interval_s": 5, - "initial_lookback_s": 86400, - "reset_cursor": true -} -``` - -跑 5 轮、每轮拉取过去 24 小时的告警、忽略已有游标,便于快速验证 pipeline。 - -## 前置条件 - -1. **TDP 凭据已配置**:`tdp_api_key` / `tdp_secret` / `tdp_host` 已通过 secrets 或 `api_services.tdp_api.base_url` 配置。可用 `python -c "from flocks.tool import ToolRegistry; ToolRegistry.init(); print(ToolRegistry.get('tdp_log_search'))"` 验证工具已注册。 -2. **`datasketch` 依赖**:和 `stream_alert_dedup` 共享,已在 flocks 项目依赖中。 -3. **写盘权限**:用户对 `~/.flocks/workflows/` 目录有读写权限。 - -## 工程要点 - -- **节点超时**:`node_timeout_s = 2,592,000` (30 天)。如需更长运行时间可调高 metadata,或拆成多次有限轮次执行(搭配 cron scheduler)。 -- **TDP 调用失败时的语义**:不推进游标,下次重试**同一时间窗口**,避免丢数据。但若 TDP 长时间不可用,建议外部监控 `stats.pulls_failed`。 -- **time_from = 上次 time_to**:闭区间还是开区间取决于 TDP 服务端实现。如观察到边界重复,可在 dedup 阶段被 LSH 自动去掉;若不开启 dedup,建议手动 `+1` 偏移。 -- **路径根目录**:通过 `Config().get_global().data_dir.parent` 解析 `~/.flocks`,避免硬编码用户目录。 -- **不依赖 syslog/Kafka**:与 `stream_alert_dedup` 解耦;如需同时跑两套去重,记得它们**不共享** LSH 历史。 diff --git a/.flocks/plugins/workflows/tdp_alert_triage/workflow.json b/.flocks/plugins/workflows/tdp_alert_triage/workflow.json index 3e21eeaf0..9464401be 100644 --- a/.flocks/plugins/workflows/tdp_alert_triage/workflow.json +++ b/.flocks/plugins/workflows/tdp_alert_triage/workflow.json @@ -1,168 +1,89 @@ { "name": "tdp_alert_triage", - "description": "NDR/TDP HTTP alert triage workflow with parallel survey/cve/payload analysis.", - "description_cn": "TDP/NDR HTTP 日志研判工作流(默认输入为 HTTP 日志,跳过类型判别)— 测绘 / 漏洞分析 / 漏洞详情 / 攻击 payload 四节点并行执行", + 
"description": "NDR/TDP alert triage workflow - handles nested TDP alert payloads and automates HTTP-focused security investigation.", + "description_cn": "NDR/TDP 告警调查工作流 - 兼容 TDP 嵌套告警结构并自动化研判 HTTP 类安全事件", "start": "receive_alert", "nodes": [ { "id": "receive_alert", "type": "python", - "description": "解析告警,支持三种原始格式:嵌套 TDP(net.http.url)、扁平 TDP(net_real_src_ip/net_http_url/net_http_reqs_body 等)、归一化 schema(sip/dip/req_http_url/req_body 等)。提取 HTTP 请求/响应、IOC、威胁字段,生成统一 log_text 供下游 LLM 使用。", - "code": "\nimport json\nimport re\n\nalert_input = inputs.get('alert_data', inputs.get('alert', {}))\n\nif isinstance(alert_input, str):\n try:\n alert_input = json.loads(alert_input)\n except Exception:\n alert_input = {}\n\nif isinstance(alert_input, list):\n alert_data = alert_input[0] if alert_input else {}\nelif isinstance(alert_input, dict) and isinstance(alert_input.get('data'), list):\n alert_data = alert_input.get('data', [])[0] if alert_input.get('data') else {}\nelse:\n alert_data = alert_input if isinstance(alert_input, dict) else {}\n\nnet = alert_data.get('net', {}) or {}\nhttp = net.get('http', {}) or {}\nthreat = alert_data.get('threat', {}) or {}\nassets = alert_data.get('assets', {}) or {}\n\ndef pick(*values):\n for value in values:\n if value not in (None, '', [], {}):\n return value\n return ''\n\n# Support three raw formats:\n# 1. Nested TDP: net.src_ip / net.http.reqs_line etc.\n# 2. Flat TDP: net_real_src_ip / net_http_url / net_http_reqs_body etc.\n# 3. 
Normalized: sip / dip / req_http_url / req_body / rsp_body etc.\nsrc_ip = pick(\n alert_data.get('attacker'), alert_data.get('external_ip'),\n net.get('src_ip'), net.get('flow_src_ip'),\n alert_data.get('net_real_src_ip'),\n alert_data.get('sip'), alert_data.get('src_ip'), alert_data.get('src'),\n)\ndst_ip = pick(\n alert_data.get('victim'), alert_data.get('machine'),\n alert_data.get('server_ip'), net.get('dest_ip'), net.get('flow_dest_ip'),\n alert_data.get('net_dest_ip'),\n alert_data.get('dip'), alert_data.get('dst_ip'), alert_data.get('dst'),\n)\nsrc_port = pick(\n net.get('src_port'), net.get('flow_src_port'),\n alert_data.get('external_port'), alert_data.get('net_src_port'),\n alert_data.get('sport'), alert_data.get('src_port'), 0,\n)\ndst_port = pick(\n net.get('dest_port'), net.get('flow_dest_port'),\n alert_data.get('server_port'), alert_data.get('machine_port'),\n alert_data.get('net_dest_port'),\n alert_data.get('dport'), alert_data.get('dst_port'), 0,\n)\nprotocol = pick(\n net.get('app_proto'), net.get('type'), net.get('proto'),\n alert_data.get('net_app_proto'), alert_data.get('protocol'),\n alert_data.get('event_type'), 'TCP',\n)\nalert_type = pick(\n threat.get('name'), alert_data.get('threat_name'),\n alert_data.get('vuln_name'),\n alert_data.get('alert_type'), threat.get('topic'),\n alert_data.get('type'), 'unknown',\n)\nseverity = pick(\n threat.get('severity'), alert_data.get('threat_severity'),\n alert_data.get('severity'), threat.get('level'),\n alert_data.get('level'), 'medium',\n)\n\nreq_line = pick(\n http.get('reqs_line'),\n alert_data.get('req_line'), alert_data.get('net_http_reqs_line'),\n)\nreq_header = pick(\n http.get('reqs_header'),\n alert_data.get('req_header'), alert_data.get('net_http_reqs_header'),\n)\nreq_body = pick(\n http.get('req_body'),\n alert_data.get('req_body'), alert_data.get('net_http_reqs_body'),\n)\nresp_line = pick(\n http.get('resp_line'),\n alert_data.get('rsp_line'), alert_data.get('resp_line'),\n 
alert_data.get('net_http_resp_line'),\n)\nresp_header = pick(\n http.get('resp_header'),\n alert_data.get('rsp_header'), alert_data.get('resp_header'),\n alert_data.get('net_http_resp_header'),\n)\nresp_body = pick(\n http.get('resp_body'),\n alert_data.get('rsp_body'), alert_data.get('resp_body'),\n alert_data.get('net_http_resp_body'),\n)\nstatus = pick(\n http.get('status'),\n alert_data.get('http_status'), alert_data.get('net_http_status'),\n alert_data.get('rsp_status_code'), 0,\n)\n\nhost = pick(\n http.get('reqs_host'), alert_data.get('url_host'), http.get('domain'),\n alert_data.get('req_host'), alert_data.get('net_http_reqs_host'),\n dst_ip,\n)\nraw_url = pick(\n http.get('raw_url'), http.get('url'),\n alert_data.get('url_path'),\n alert_data.get('net_http_url'), alert_data.get('req_http_url'),\n alert_data.get('uri'),\n)\nurl = ''\nif host and raw_url:\n scheme = 'https' if net.get('is_https') else 'http'\n url = raw_url if str(raw_url).startswith(('http://', 'https://')) else f'{scheme}://{host}{raw_url}'\nelif raw_url and str(raw_url).startswith(('http://', 'https://')):\n url = raw_url\nelif raw_url:\n # Relative path only — keep as-is for log readability\n url = raw_url\n\npayload = f'请求行: {req_line}\\n请求头: {req_header}\\n请求体: {req_body}'\nresponse = f'状态行: {resp_line}\\n响应头: {resp_header}\\n响应体: {resp_body}'\n\n# threat_result / threat_msg: also check flat TDP field names\nthreat_result = pick(threat.get('result'), alert_data.get('threat_result'))\nthreat_msg = pick(threat.get('msg'), alert_data.get('threat_msg'))\n\nlog_text = (\n f'[告警基本信息]\\n'\n f'告警类型: {alert_type}\\n严重级别: {severity}\\n'\n f'源地址: {src_ip}:{src_port}\\n目的地址: {dst_ip}:{dst_port}\\n'\n f'协议: {protocol}\\nURL: {url}\\nHTTP状态码: {status}\\n'\n f'TDP判定: {threat_result}\\nTDP消息: {threat_msg}\\n\\n'\n f'[HTTP请求内容]\\n{payload}\\n\\n'\n f'[HTTP响应内容]\\n{response}'\n)\n\nvuln_text = '\\n'.join(str(item) for item in [\n threat_msg, threat.get('topic', ''),\n alert_data.get('data', ''), url,\n 
json.dumps(threat.get('tag', []), ensure_ascii=False),\n] if item)\nvuln_matches = sorted(set(re.findall(r'\\b(?:CVE|CNVD|CNNVD|XVE)-[A-Za-z0-9._-]+\\b', vuln_text, flags=re.I)))\n\niocs = []\nfor candidate in [src_ip, dst_ip]:\n if candidate:\n iocs.append({'type': 'ip', 'value': candidate})\nif url:\n iocs.append({'type': 'url', 'value': url})\nif host and not re.match(r'^\\d{1,3}(?:\\.\\d{1,3}){3}(?::\\d+)?$', str(host)):\n iocs.append({'type': 'domain', 'value': str(host).split(':')[0]})\n\noutputs['parsed_alert'] = {\n 'src_ip': src_ip, 'dst_ip': dst_ip, 'src_port': src_port, 'dst_port': dst_port,\n 'protocol': protocol, 'payload': payload, 'response': response,\n 'url': url, 'status': status,\n 'alert_type': alert_type, 'severity': severity,\n 'vuln_id': vuln_matches[0] if vuln_matches else '',\n 'vuln_candidates': vuln_matches,\n 'threat_result': threat_result,\n 'threat_msg': threat_msg,\n 'failed_by': threat.get('failed_by', []),\n 'asset_ip': assets.get('ip', ''), 'asset_name': assets.get('name', []),\n 'iocs': iocs, 'raw_alert': alert_data, 'log_text': log_text,\n}\noutputs['log_text'] = log_text\noutputs['iocs'] = iocs\n" + "description": "接收并解析 NDR/TDP 告警数据,提取关键字段、HTTP 请求响应和 IOC", + "code": "import json\nimport re\n\nalert_input = inputs.get('alert_data', inputs.get('alert', {}))\n\nif isinstance(alert_input, str):\n try:\n alert_input = json.loads(alert_input)\n except Exception:\n alert_input = {}\n\nif isinstance(alert_input, list):\n alert_data = alert_input[0] if alert_input else {}\nelif isinstance(alert_input, dict) and isinstance(alert_input.get('data'), list):\n alert_data = alert_input.get('data', [])[0] if alert_input.get('data') else {}\nelse:\n alert_data = alert_input if isinstance(alert_input, dict) else {}\n\nnet = alert_data.get('net', {}) or {}\nhttp = net.get('http', {}) or {}\nthreat = alert_data.get('threat', {}) or {}\nassets = alert_data.get('assets', {}) or {}\n\ndef pick(*values):\n for value in values:\n if value not in (None, 
'', [], {}):\n return value\n return ''\n\nsrc_ip = pick(\n alert_data.get('attacker'),\n alert_data.get('external_ip'),\n net.get('src_ip'),\n net.get('flow_src_ip'),\n alert_data.get('sip'),\n alert_data.get('src_ip'),\n alert_data.get('src')\n)\ndst_ip = pick(\n alert_data.get('victim'),\n alert_data.get('machine'),\n alert_data.get('server_ip'),\n net.get('dest_ip'),\n net.get('flow_dest_ip'),\n alert_data.get('dip'),\n alert_data.get('dst_ip'),\n alert_data.get('dst')\n)\nsrc_port = pick(\n net.get('src_port'),\n net.get('flow_src_port'),\n alert_data.get('external_port'),\n alert_data.get('sport'),\n alert_data.get('src_port'),\n 0\n)\ndst_port = pick(\n net.get('dest_port'),\n net.get('flow_dest_port'),\n alert_data.get('server_port'),\n alert_data.get('machine_port'),\n alert_data.get('dport'),\n alert_data.get('dst_port'),\n 0\n)\nprotocol = pick(\n net.get('app_proto'),\n net.get('type'),\n net.get('proto'),\n alert_data.get('net_app_proto'),\n alert_data.get('protocol'),\n alert_data.get('event_type'),\n 'TCP'\n)\nalert_type = pick(\n threat.get('name'),\n alert_data.get('threat_name'),\n alert_data.get('alert_type'),\n threat.get('topic'),\n alert_data.get('type'),\n 'unknown'\n)\nseverity = pick(\n threat.get('severity'),\n alert_data.get('threat_severity'),\n alert_data.get('severity'),\n threat.get('level'),\n alert_data.get('level'),\n 'medium'\n)\n\nreq_line = pick(http.get('reqs_line'), alert_data.get('req_line'))\nreq_header = pick(http.get('reqs_header'), alert_data.get('req_header'))\nreq_body = pick(http.get('req_body'), alert_data.get('req_body'))\nresp_line = pick(http.get('resp_line'), alert_data.get('rsp_line'), alert_data.get('resp_line'))\nresp_header = pick(http.get('resp_header'), alert_data.get('rsp_header'), alert_data.get('resp_header'))\nresp_body = pick(http.get('resp_body'), alert_data.get('rsp_body'), alert_data.get('resp_body'))\nstatus = pick(http.get('status'), alert_data.get('http_status'), 0)\n\nhost = 
pick(http.get('reqs_host'), alert_data.get('url_host'), http.get('domain'), dst_ip)\nraw_url = pick(http.get('raw_url'), http.get('url'), alert_data.get('url_path'))\nurl = ''\nif host and raw_url:\n scheme = 'https' if net.get('is_https') else 'http'\n url = raw_url if str(raw_url).startswith(('http://', 'https://')) else f'{scheme}://{host}{raw_url}'\n\npayload = f'请求行: {req_line}\\n请求头: {req_header}\\n请求体: {req_body}'\nresponse = f'状态行: {resp_line}\\n响应头: {resp_header}\\n响应体: {resp_body}'\n\nvuln_text = '\\n'.join(str(item) for item in [\n threat.get('msg', ''),\n threat.get('topic', ''),\n alert_data.get('data', ''),\n url,\n json.dumps(threat.get('tag', []), ensure_ascii=False)\n] if item)\nvuln_matches = sorted(set(re.findall(r'\\b(?:CVE|CNVD|CNNVD|XVE)-[A-Za-z0-9._-]+\\b', vuln_text, flags=re.I)))\nvuln_id = vuln_matches[0] if vuln_matches else ''\n\niocs = []\nfor candidate in [src_ip, dst_ip]:\n if candidate:\n iocs.append({'type': 'ip', 'value': candidate})\nif url:\n iocs.append({'type': 'url', 'value': url})\nif host and not re.match(r'^\\d{1,3}(?:\\.\\d{1,3}){3}(?::\\d+)?$', str(host)):\n iocs.append({'type': 'domain', 'value': str(host).split(':')[0]})\n\nsearch_text = '\\n'.join([payload, response, str(alert_data.get('data', '')), url])\nurl_pattern = r'https?://[^\\s<>\"]+'\nfor matched_url in re.findall(url_pattern, search_text):\n if matched_url not in [ioc['value'] for ioc in iocs if ioc['type'] == 'url']:\n iocs.append({'type': 'url', 'value': matched_url})\n\ndomain_pattern = r'(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\\.)+(?:com|net|org|io|cn|co|edu|gov|info|biz|xyz|top|cc|tk|ml|ga|cf|gq|pw|ws|site|online|club|shop|live|fun|tech|pro|app|dev|cloud|host|space|vip|tw|hk|jp|kr|ru|uk|de|fr|eu|au|ca)'\nfor domain in re.findall(domain_pattern, search_text):\n if domain not in [ioc['value'] for ioc in iocs if ioc['type'] == 'domain']:\n iocs.append({'type': 'domain', 'value': domain})\n\noutputs['parsed_alert'] = {\n 'src_ip': src_ip,\n 
'dst_ip': dst_ip,\n 'src_port': src_port,\n 'dst_port': dst_port,\n 'protocol': protocol,\n 'payload': payload,\n 'response': response,\n 'url': url,\n 'status': status,\n 'alert_type': alert_type,\n 'severity': severity,\n 'vuln_id': vuln_id,\n 'vuln_candidates': vuln_matches,\n 'threat_result': threat.get('result', ''),\n 'threat_confidence': threat.get('confidence', ''),\n 'threat_msg': threat.get('msg', ''),\n 'failed_by': threat.get('failed_by', []),\n 'asset_ip': assets.get('ip', ''),\n 'asset_name': assets.get('name', []),\n 'iocs': iocs,\n 'raw_alert': alert_data\n}\noutputs['has_vuln_id'] = bool(vuln_id)\noutputs['iocs'] = iocs" }, { - "id": "prepare_intel", + "id": "query_threat_intel", "type": "python", - "description": "并行前的预处理:查询 IP 威胁情报 + CVE 漏洞情报,生成 intel_content / vuln_content 供后续 survey/cve_info 使用", - "code": "\nimport ipaddress\nimport json\n\n# Pre-fetch external intel (IP geo/asset + CVE info) so the parallel `survey` and `cve_info`\n# nodes have a concrete `content` block to feed their LLM prompts. 
Internal/RFC1918 IPs\n# are skipped to avoid wasted upstream calls.\nparsed_alert = inputs.get('parsed_alert', {})\niocs = inputs.get('iocs', parsed_alert.get('iocs', []))\n\ndef is_public_ip(value):\n try:\n ip_obj = ipaddress.ip_address(value)\n except Exception:\n return False\n return not (ip_obj.is_private or ip_obj.is_loopback or ip_obj.is_reserved\n or ip_obj.is_link_local or ip_obj.is_multicast or ip_obj.is_unspecified)\n\nintel_results = []\nseen = set()\nfor ioc in iocs:\n ioc_type = ioc.get('type', '')\n ioc_value = str(ioc.get('value', '')).strip()\n key = (ioc_type, ioc_value)\n if not ioc_value or key in seen:\n continue\n seen.add(key)\n if ioc_type == 'ip':\n if not is_public_ip(ioc_value):\n continue\n r = tool.run_safe('threatbook_ip_query', ip=ioc_value)\n if r['success']:\n intel_results.append({'source': 'threatbook', 'type': 'ip', 'value': ioc_value, 'result': r['text']})\n elif ioc_type == 'domain':\n r = tool.run_safe('threatbook_domain_query', domain=ioc_value)\n if r['success']:\n intel_results.append({'source': 'threatbook', 'type': 'domain', 'value': ioc_value, 'result': r['text']})\n elif ioc_type == 'url':\n r = tool.run_safe('threatbook_url_query', url=ioc_value)\n if r['success']:\n intel_results.append({'source': 'threatbook', 'type': 'url', 'value': ioc_value, 'result': r['text']})\n\n# CVE intel (only if we already extracted a vuln id in receive_alert).\nvuln_info = {}\nvuln_id = parsed_alert.get('vuln_id', '')\nif vuln_id:\n r = tool.run_safe('__mcp_vuln_query', vuln_id=vuln_id)\n if r['success']:\n try:\n obj = r.get('obj')\n if isinstance(obj, str):\n obj = json.loads(obj)\n vuln_info = obj if isinstance(obj, dict) else {'raw_result': r.get('text', '')}\n except Exception:\n vuln_info = {'raw_result': r.get('text', '')}\n\n# Compact text blobs for downstream prompts.\nintel_content = '\\n'.join(\n f\"[{i['source']}/{i['type']}] {i['value']}\\n{i['result']}\" for i in intel_results\n) or '(无可用情报数据)'\n\nvuln_content = (\n 
json.dumps(vuln_info, ensure_ascii=False, indent=2)\n if vuln_info else '(无可用漏洞情报数据)'\n)\n\nprint(f'[prepare_intel] intel_results={len(intel_results)}, vuln_id={vuln_id or \"none\"}')\n\noutputs['parsed_alert'] = parsed_alert\noutputs['log_text'] = inputs.get('log_text', parsed_alert.get('log_text', ''))\noutputs['intel_results'] = intel_results\noutputs['intel_content'] = intel_content\noutputs['vuln_info'] = vuln_info\noutputs['vuln_content'] = vuln_content\n" + "description": "查询外部 IOC 的威胁情报,自动跳过内网/保留地址", + "code": "import ipaddress\n\niocs = inputs.get('iocs', [])\nintel_results = []\nseen = set()\n\ndef is_public_ip(value):\n try:\n ip_obj = ipaddress.ip_address(value)\n except Exception:\n return False\n return not (\n ip_obj.is_private\n or ip_obj.is_loopback\n or ip_obj.is_reserved\n or ip_obj.is_link_local\n or ip_obj.is_multicast\n or ip_obj.is_unspecified\n )\n\nfor ioc in iocs:\n ioc_type = ioc.get('type', '')\n ioc_value = str(ioc.get('value', '')).strip()\n key = (ioc_type, ioc_value)\n\n if not ioc_value or key in seen:\n continue\n seen.add(key)\n\n if ioc_type == 'ip':\n if not is_public_ip(ioc_value):\n continue\n result = tool.run_safe('threatbook_ip_query', ip=ioc_value)\n if result['success']:\n intel_results.append({\n 'source': 'threatbook',\n 'type': 'ip',\n 'value': ioc_value,\n 'result': result['text']\n })\n vt_result = tool.run_safe('virustotal_ip_query', ip=ioc_value)\n if vt_result['success']:\n intel_results.append({\n 'source': 'virustotal',\n 'type': 'ip',\n 'value': ioc_value,\n 'result': vt_result['text']\n })\n\n elif ioc_type == 'domain':\n result = tool.run_safe('threatbook_domain_query', domain=ioc_value)\n if result['success']:\n intel_results.append({\n 'source': 'threatbook',\n 'type': 'domain',\n 'value': ioc_value,\n 'result': result['text']\n })\n vt_result = tool.run_safe('virustotal_domain_query', domain=ioc_value)\n if vt_result['success']:\n intel_results.append({\n 'source': 'virustotal',\n 'type': 'domain',\n 
'value': ioc_value,\n 'result': vt_result['text']\n })\n\n elif ioc_type == 'url':\n result = tool.run_safe('threatbook_url_query', url=ioc_value)\n if result['success']:\n intel_results.append({\n 'source': 'threatbook',\n 'type': 'url',\n 'value': ioc_value,\n 'result': result['text']\n })\n vt_result = tool.run_safe('virustotal_url_query', url=ioc_value)\n if vt_result['success']:\n intel_results.append({\n 'source': 'virustotal',\n 'type': 'url',\n 'value': ioc_value,\n 'result': vt_result['text']\n })\n\noutputs['intel_results'] = intel_results\noutputs['has_intel'] = len(intel_results) > 0" }, { - "id": "survey", + "id": "query_vuln", "type": "python", - "description": "测绘:总结 IP 情报中的空间测绘信息(标签、服务、应用资产)", - "code": "import re\n\ndef _strip_think(text):\n # Remove ... reasoning blocks some models output.\n text = re.sub(r'[\\s\\S]*?', '', str(text or ''), flags=re.IGNORECASE).strip()\n return text\n\n\n# Summarize spatial-mapping (CMDB-style) info from intel data per IP.\nlog_text = inputs.get('log_text', '')\nintel_content = inputs.get('intel_content', '')\n\nprompt = f'''你是一个专业的Web日志分析专家。请总结以下IP的情报数据中的空间测绘信息。\n1. 如果该IP没有测绘信息,则不列出。\n2. 如果IP有测绘信息,则以简短的语言对该IP的测绘信息进行总结,关键说明ip的标签和测绘信息显示有哪些服务或者应用资产。\n3. 多个IP的测绘信息以无序列表显示,每个ip数据描述占一行数据。\n4. 
不需要生成其他额外的补充信息。\n\n## 情报参考信息\n{intel_content}\n\n## 用户的原始输入日志\n{log_text}\n'''\n\nresult = _strip_think(llm.ask(prompt))\nprint(f'[survey] {len(result)} chars')\noutputs['survey_result'] = result\n" + "description": "仅在识别到标准漏洞编号时查询漏洞信息", + "code": "import json\n\nparsed_alert = inputs.get('parsed_alert', {})\nvuln_id = parsed_alert.get('vuln_id', '')\nvuln_info = {}\n\nif vuln_id:\n result = tool.run_safe('__mcp_vuln_query', vuln_id=vuln_id)\n if result['success']:\n try:\n obj = result.get('obj')\n if isinstance(obj, str):\n obj = json.loads(obj)\n vuln_info = obj if isinstance(obj, dict) else {}\n except Exception:\n vuln_info = {'raw_result': result.get('text', '')}\n else:\n vuln_info = {'error': result.get('error', 'Query failed'), 'vuln_id': vuln_id}\n\noutputs['vuln_info'] = vuln_info\noutputs['has_vuln_info'] = bool(vuln_info) and 'error' not in vuln_info" }, { - "id": "cve_related", + "id": "analyze_payload", "type": "python", - "description": "关联漏洞:仅从日志文本中提取 CVE/CNVD/CNNVD/XVE 编号", - "code": "import re\n\ndef _strip_think(text):\n # Remove ... reasoning blocks some models output.\n text = re.sub(r'[\\s\\S]*?', '', str(text or ''), flags=re.IGNORECASE).strip()\n return text\n\n\n# Extract CVE/CNVD/CNNVD/XVE numbers strictly from the log text.\nlog_text = inputs.get('log_text', '')\n\nprompt = f'''请从以下的日志数据中提取漏洞编号。\n要求:\n1. 仅从日志文本中识别漏洞编号,不要做任何推测。\n2. 如果日志中存在漏洞编号,则用简短语言描述,如:\"日志中存在漏洞编号:CVE-****-****\"。\n3. 
如果日志中不存在漏洞编号,则输出:\"日志中无关联漏洞情报\"。\n\n日志数据如下:\n{log_text}\n'''\n\nresult = _strip_think(llm.ask(prompt))\nprint(f'[cve_related] {result[:80]}')\noutputs['cve_related_result'] = result\n" + "description": "使用 LLM 分析 HTTP 请求负载并按规范落盘", + "code": "import os\nimport datetime\nfrom flocks.workspace.manager import WorkspaceManager\n\nparsed_alert = inputs.get('parsed_alert', {})\npayload = parsed_alert.get('payload', '')\nalert_type = parsed_alert.get('alert_type', 'unknown')\nurl = parsed_alert.get('url', '')\n\nprompt = f\"\"\"你是一位网络安全专家,正在分析 HTTP 请求负载。\n\n告警类型: {alert_type}\nURL: {url}\n请求内容:\n{payload}\n\n请输出不超过 120 字的中文分析,包含:\n1. 该流量更像攻击、扫描、误报还是正常请求\n2. 具体攻击/扫描方式\n3. 攻击意图或合法目的\n\"\"\"\n\nanalysis_result = llm.ask(prompt)\n\nws = WorkspaceManager.get_instance()\noutput_dir = str(ws.get_workspace_dir() / 'outputs' / datetime.date.today().isoformat())\nartifacts_dir = os.path.join(output_dir, 'artifacts')\nos.makedirs(artifacts_dir, exist_ok=True)\ntool.run('write', filePath=os.path.join(artifacts_dir, 'payload_analysis_llm_output.md'), content=analysis_result)\n\noutputs['payload_analysis'] = analysis_result\noutputs['has_payload_analysis'] = bool(analysis_result)" }, { - "id": "cve_info", + "id": "analyze_response", "type": "python", - "description": "漏洞详情:基于 vuln_content 输出关联漏洞的基本信息(不含修复建议)", - "code": "import re\n\ndef _strip_think(text):\n # Remove ... reasoning blocks some models output.\n text = re.sub(r'[\\s\\S]*?', '', str(text or ''), flags=re.IGNORECASE).strip()\n return text\n\n\n# Summarize basic CVE info from intel content (no remediation).\nlog_text = inputs.get('log_text', '')\nvuln_content = inputs.get('vuln_content', '')\n\nprompt = f'''你是一个专业的Web日志分析专家。参考情报信息中的漏洞数据,简要说明关联的CVE漏洞信息。\n1. 
不要输出任何解释说明,只输出漏洞基本信息。不需要生成漏洞的处置建议或修复措施等。\n\n## 情报参考信息\n{vuln_content}\n\n## 用户的原始输入日志\n{log_text}\n'''\n\nresult = _strip_think(llm.ask(prompt))\nprint(f'[cve_info] {len(result)} chars')\noutputs['cve_info_result'] = result\n" + "description": "结合 HTTP 响应和 TDP 判定字段判断攻击是否成功", + "code": "import os\nimport re\nimport json\nimport datetime\nfrom flocks.workspace.manager import WorkspaceManager\n\nparsed_alert = inputs.get('parsed_alert', {})\npayload = parsed_alert.get('payload', '')\nresponse = parsed_alert.get('response', '')\nstatus = parsed_alert.get('status', 0)\nfailed_by = parsed_alert.get('failed_by', [])\nthreat_result = parsed_alert.get('threat_result', '')\n\nprompt = f\"\"\"你是一位网络安全专家,正在分析 HTTP 请求和响应来判断攻击是否成功。\n\n请求内容:\n{payload}\n\n响应内容:\n{response}\n\n补充信号:\n- HTTP 状态码: {status}\n- TDP result: {threat_result}\n- failed_by: {failed_by}\n\n请严格输出 JSON,对象中必须包含以下字段:\n{{\n \\\"is_attack\\\": true/false,\n \\\"attack_success\\\": true/false,\n \\\"summary\\\": \\\"不超过120字的中文结论\\\",\n \\\"success_evidence\\\": [\\\"字符串\\\"],\n \\\"reason\\\": \\\"简要原因\\\"\n}}\n\"\"\"\n\nanalysis_raw = llm.ask(prompt)\nanalysis_obj = {}\ntry:\n matched = re.search(r'\\{.*\\}', analysis_raw, re.S)\n if matched:\n analysis_obj = json.loads(matched.group(0))\nexcept Exception:\n analysis_obj = {}\n\nstatus_int = 0\ntry:\n status_int = int(status)\nexcept Exception:\n status_int = 0\n\ndef fallback_success():\n if any('http_status_4' in item or 'http_status_5' in item for item in failed_by):\n return False\n if status_int >= 400:\n return False\n if threat_result in ('success', 'succeeded'):\n return True\n return False\n\nattack_success = bool(analysis_obj.get('attack_success')) if analysis_obj else fallback_success()\nsummary = analysis_obj.get('summary') if analysis_obj else ''\nreason = analysis_obj.get('reason') if analysis_obj else ''\nsuccess_evidence = analysis_obj.get('success_evidence') if analysis_obj else []\nif not isinstance(success_evidence, list):\n success_evidence = 
[str(success_evidence)] if success_evidence else []\n\nresponse_analysis = summary or analysis_raw\nif reason:\n response_analysis += f'\\n原因: {reason}'\nif success_evidence:\n response_analysis += '\\n成功标志: ' + ';'.join(str(item) for item in success_evidence if item)\n\nws = WorkspaceManager.get_instance()\noutput_dir = str(ws.get_workspace_dir() / 'outputs' / datetime.date.today().isoformat())\nartifacts_dir = os.path.join(output_dir, 'artifacts')\nos.makedirs(artifacts_dir, exist_ok=True)\ntool.run('write', filePath=os.path.join(artifacts_dir, 'response_analysis_llm_output.md'), content=response_analysis)\n\noutputs['response_analysis'] = response_analysis\noutputs['has_response_analysis'] = bool(response_analysis)\noutputs['attack_success'] = attack_success" }, { - "id": "payload_analysis", - "type": "python", - "description": "攻击 payload:分析日志中是否包含攻击负载并给出判定依据,不分析意图与影响", - "code": "import re\n\ndef _strip_think(text):\n # Remove ... reasoning blocks some models output.\n text = re.sub(r'[\\s\\S]*?', '', str(text or ''), flags=re.IGNORECASE).strip()\n return text\n\n\n# Payload-only analysis. Per prompt rules, no intent/impact discussion.\nlog_text = inputs.get('log_text', '')\n\nprompt = f'''你是一个专业的Web日志分析专家。根据用户输入的日志进行攻击负载分析。\n1. 首先分析日志中是否包含攻击负载,并给出判定依据。\n2. 不要进行攻击意图分析、攻击影响分析。\n3. 用简短的语言在一段话中进行描述。\n\n## 用户的原始输入日志:\n{log_text}\n'''\n\nresult = _strip_think(llm.ask(prompt))\nprint(f'[payload_analysis] {len(result)} chars')\noutputs['payload_analysis_result'] = result\n" - }, - { - "id": "attack_analysis_result", + "id": "join_results", "type": "python", "join": true, - "description": "攻击分析结果(4 个并行节点的 join 点):按五分类标准产出长文本判定", - "code": "import re\n\ndef _strip_think(text):\n # Remove ... 
reasoning blocks some models output.\n text = re.sub(r'[\\s\\S]*?', '', str(text or ''), flags=re.IGNORECASE).strip()\n return text\n\n\n# Join point for the 4 parallel branches; produces the long-form attack-status reasoning\n# that report/verdict/title nodes consume.\nimport os\nimport datetime\nfrom flocks.workspace.manager import WorkspaceManager\n\nlog_text = inputs.get('log_text', '')\n\nprompt = f'''你是一名专业且经验丰富的网络安全分析师和Web日志分析专家,你对HTTP协议以及Web攻击有着深入的理解,并且你能够快速识别和应对各种网络威胁。你的任务是对提供的HTTP请求与响应内容进行详细的专业分析,并判断日志请求的攻击状态。\n\n请严格遵循以下指令进行思考和分析:\n1. 攻击状态只能从以下情况中选择一种:[\"攻击成功\", \"攻击失败\", \"攻击\", \"未知\", \"安全\"]。\n2. 从日志中提取出\"HTTP请求内容\"和\"HTTP响应内容\"。请注意,HTTP请求内容和HTTP响应内容是分开的,请不要混淆,有些日志中没有包含HTTP响应内容,请不要将HTTP请求内容和HTTP响应内容混淆。分析后请你记住哪些是HTTP请求内容,哪些是HTTP响应内容。\n3. 请检查HTTP响应状态码,2xx或者3xx状态码都代表本次HTTP请求成功,4xx或者5xx状态码大多数情况下都代表请求失败,只有在请求成功的情况下才能对攻击是否成功进行后续判断。\n\n各攻击状态的定义以及判定标准:\n1. 攻击成功:\n(1) 首先分析日志中是否含有清晰的\"HTTP响应内容\",如果日志中没有\"HTTP响应内容\",则肯定不属于攻击成功。\n(2) 如果日志中未提供\"HTTP响应内容\",即使HTTP请求内容中包含攻击者预期的结果,也不能判定为攻击成功。\n(3) 从日志中提取出\"HTTP请求内容\"和\"HTTP响应内容\"。请深入分析\"HTTP响应内容\",并判定其是否为\"HTTP请求内容\"攻击成功时的预期结果,这是判定攻击成功的强依据。请注意,HTTP响应码200仅表示网络连接成功,不代表攻击攻击成功。\n(4) 分析HTTP请求内容和HTTP响应内容,只有当HTTP响应内容中明确包含攻击载荷在目标机器上成功执行的证据,并且HTTP请求内容中包含攻击载荷的特征,则判定为\"攻击成功\"。\n(5) 请注意:攻击成功的判定必须包含HTTP响应内容。如果不包含HTTP响应内容,则肯定不属于攻击成功。\n(6) 请注意:如果不包含HTTP响应内容,即使HTTP请求内容是攻击,这也不属于攻击成功。\n2. 攻击失败:\n(1) 分析HTTP请求内容和HTTP响应内容,如果HTTP响应内容中明确包含攻击载荷在目标机器上执行失败或者被阻止的证据,并且HTTP请求内容中包含攻击载荷的特征,则判定为\"攻击失败\"。\n(2) 攻击失败的判定必须包含HTTP响应内容。如果不包含HTTP响应内容,则肯定不属于攻击失败。\n3. 攻击:\n(1) 在\"HTTP请求内容\"或\"HTTP响应内容\"中发现任何证明存在攻击意图的证据,即可判定为存在攻击行为。但如果不符合上述的攻击成功或者攻击失败的标准,则\"攻击状态\"为\"攻击\"。\n(2) 请注意:如果日志中只提供了\"HTTP请求内容\",且没有提供\"HTTP响应内容\",且HTTP的请求内容分析中是包含攻击行为的,则\"攻击状态\"为\"攻击\"。\n4. 未知:\n(1) 如果不能100%确定HTTP通信的攻击结果,那么请在\"攻击状态\"处给出\"未知\"。\n(2) 请注意:如果在你给的判定原因中存在\"可能\"等不确定词汇,都代表你不能对你的结论100%确定,那么请在\"攻击状态\"处给出\"未知\"。\n5. 安全:\n(1) 如果\"HTTP请求内容\"和\"HTTP响应内容\"中都没有任何攻击意图的证据,那么请在\"攻击状态\"处给出\"安全\"。\n\n## 日志内容\n{log_text}\n\n## 输出要求\n请按下列结构输出(中文):\n1. 攻击状态: [攻击成功/攻击失败/攻击/未知/安全]\n2. 
判定依据: 简要说明请求与响应的关键证据\n3. 详细分析: 不超过200字\n'''\n\nresult = _strip_think(llm.ask(prompt))\nprint(f'[attack_analysis_result] {len(result)} chars')\n\n# Persist intermediate artifact for traceability and downstream re-use.\nws = WorkspaceManager.get_instance()\noutput_dir = str(ws.get_workspace_dir() / 'outputs' / datetime.date.today().isoformat())\nartifacts_dir = os.path.join(output_dir, 'artifacts')\nos.makedirs(artifacts_dir, exist_ok=True)\ntool.run('write',\n filePath=os.path.join(artifacts_dir, 'attack_analysis_result.md'),\n content=result)\n\n# Forward all upstream parallel outputs so report/verdict/title nodes receive them.\noutputs['attack_analysis_result'] = result\noutputs['parsed_alert'] = inputs.get('parsed_alert', {})\noutputs['log_text'] = log_text\noutputs['intel_results'] = inputs.get('intel_results', [])\noutputs['intel_content'] = inputs.get('intel_content', '')\noutputs['vuln_info'] = inputs.get('vuln_info', {})\noutputs['vuln_content'] = inputs.get('vuln_content', '')\noutputs['survey_result'] = inputs.get('survey_result', '')\noutputs['cve_related_result'] = inputs.get('cve_related_result', '')\noutputs['cve_info_result'] = inputs.get('cve_info_result', '')\noutputs['payload_analysis_result'] = inputs.get('payload_analysis_result', '')\n" - }, - { - "id": "attack_verdict", - "type": "python", - "description": "攻击判定:将攻击分析结果归一化为 5 个标签之一(attack_success/attack_failed/attack/unknown/benign)", - "code": "import re\n\ndef _strip_think(text):\n # Remove ... 
reasoning blocks some models output.\n text = re.sub(r'[\\s\\S]*?', '', str(text or ''), flags=re.IGNORECASE).strip()\n return text\n\n\n# Convert the upstream long-form reasoning into a single verdict label.\nattack_analysis_result = inputs.get('attack_analysis_result', '')\n\nprompt = f'''你是一个专业的Web日志分析专家。请据参考信息,直接输出攻击判定类别:\nattack_success:表示攻击成功。\nattack_failed:表示攻击失败。\nattack:表示是日志内容是攻击。\nunknown:表示未知。\nbenign:是安全。\n不额外输出任何其他信息,包括解释、判定依据等。\n\n## 日志分析结果:\n{attack_analysis_result}\n'''\n\nraw = _strip_think(llm.ask(prompt)).strip().lower()\n# Be defensive: LLM may add quotes / punctuation around the label.\nallowed = ('attack_success', 'attack_failed', 'attack', 'unknown', 'benign')\nverdict = next((v for v in allowed if v in raw), 'unknown')\nprint(f'[attack_verdict] raw={raw!r} -> verdict={verdict}')\n\noutputs['attack_verdict'] = verdict\n# Forward everything for the next nodes.\nfor k in ('parsed_alert', 'log_text', 'intel_results', 'intel_content',\n 'vuln_info', 'vuln_content', 'survey_result', 'cve_related_result',\n 'cve_info_result', 'payload_analysis_result', 'attack_analysis_result'):\n outputs[k] = inputs.get(k)\n" - }, - { - "id": "report_title", - "type": "python", - "description": "报告标题:基于攻击类型 + 判定结果生成不超过 30 字的中文报告标题", - "code": "import re\n\ndef _strip_think(text):\n # Remove ... reasoning blocks some models output.\n text = re.sub(r'[\\s\\S]*?', '', str(text or ''), flags=re.IGNORECASE).strip()\n return text\n\n\n# Pithy title; encodes attack type + verdict outcome.\nattack_analysis_result = inputs.get('attack_analysis_result', '')\nattack_verdict = inputs.get('attack_verdict', 'unknown')\nparsed_alert = inputs.get('parsed_alert', {})\n\nprompt = f'''你是一个专业的Web日志分析专家。请基于以下分析结果,生成一份不超过 30 字的中文报告标题。\n要求:\n1. 标题必须能体现\"攻击类型\"或\"攻击结果分析的结论\"。\n2. 不要带书名号、引号或其他标点。\n3. 
只输出标题本身,不要任何解释或说明。\n\n## 攻击类型\n{parsed_alert.get('alert_type', 'unknown')}\n\n## 攻击判定\n{attack_verdict}\n\n## 攻击分析结果\n{attack_analysis_result}\n'''\n\nraw = _strip_think(llm.ask(prompt)).strip()\ntitle = raw.splitlines()[0].strip(' \"\\'《》[]【】') if raw else f'{parsed_alert.get(\"alert_type\", \"未知告警\")} - {attack_verdict}'\nprint(f'[report_title] {title}')\n\noutputs['report_title'] = title\nfor k in ('parsed_alert', 'log_text', 'intel_results', 'intel_content',\n 'vuln_info', 'vuln_content', 'survey_result', 'cve_related_result',\n 'cve_info_result', 'payload_analysis_result', 'attack_analysis_result',\n 'attack_verdict'):\n outputs[k] = inputs.get(k)\n" + "description": "等待并归一化并行节点输出,供最终报告使用", + "code": "outputs['parsed_alert'] = inputs.get('parsed_alert', {})\noutputs['intel_results'] = inputs.get('intel_results', [])\noutputs['vuln_info'] = inputs.get('vuln_info', {})\noutputs['payload_analysis'] = inputs.get('payload_analysis', '')\noutputs['response_analysis'] = inputs.get('response_analysis', '')\noutputs['attack_success'] = inputs.get('attack_success', False)" }, { "id": "generate_report", "type": "python", - "description": "输出报告:汇总所有分析结果生成最终 markdown 报告,落盘到 ~/.flocks/workspace/outputs//artifacts/final_report.md", - "code": "\n# Final markdown that aggregates every prior step. 
Persists to disk.\nimport os\nimport json\nimport datetime\nfrom flocks.workspace.manager import WorkspaceManager\n\nparsed_alert = inputs.get('parsed_alert', {})\nlog_text = inputs.get('log_text', '')\nintel_results = inputs.get('intel_results', [])\nvuln_info = inputs.get('vuln_info', {})\nsurvey_result = inputs.get('survey_result', '')\ncve_related_result = inputs.get('cve_related_result', '')\ncve_info_result = inputs.get('cve_info_result', '')\npayload_analysis_result = inputs.get('payload_analysis_result', '')\nattack_analysis_result = inputs.get('attack_analysis_result', '')\nattack_verdict = inputs.get('attack_verdict', 'unknown')\nreport_title = inputs.get('report_title', '')\n\nverdict_cn_map = {\n 'attack_success': '攻击成功',\n 'attack_failed': '攻击失败',\n 'attack': '攻击',\n 'unknown': '未知',\n 'benign': '安全',\n}\nverdict_cn = verdict_cn_map.get(attack_verdict, attack_verdict)\n\n# Risk level driven by the verdict (not by individual evidence) for consistency with the label.\nrisk_level = {\n 'attack_success': 'High',\n 'attack_failed': 'Medium',\n 'attack': 'Medium',\n 'unknown': 'Medium',\n 'benign': 'Low',\n}.get(attack_verdict, 'Medium')\n\n# Use the LLM-generated title; fall back to a deterministic synthesis if the LLM returned empty.\nif not report_title:\n report_title = f'{parsed_alert.get(\"alert_type\", \"Web日志告警\")} - {verdict_cn}'\n\nintel_md = ''\nif intel_results:\n for intel in intel_results:\n intel_md += f\"\\n**{intel['source']} / {intel['type']}**: {intel['value']}\\n```\\n{intel['result']}\\n```\\n\"\nelse:\n intel_md = '\\n(未查询到外部威胁情报)\\n'\n\nvuln_md = (\n f\"\\n```json\\n{json.dumps(vuln_info, ensure_ascii=False, indent=2)}\\n```\\n\"\n if vuln_info else '\\n(未查询到漏洞详情)\\n'\n)\n\n# Title encodes verdict; report intentionally omits a wall-clock timestamp.\nreport = (\n f'# {report_title}\\n\\n'\n f'**攻击判定**: {verdict_cn} (`{attack_verdict}`)\\n'\n f'**风险等级**: {risk_level}\\n'\n f'**告警类型**: {parsed_alert.get(\"alert_type\", \"unknown\")}\\n'\n 
f'**源 / 目的**: {parsed_alert.get(\"src_ip\", \"N/A\")}:{parsed_alert.get(\"src_port\", \"N/A\")} → '\n f'{parsed_alert.get(\"dst_ip\", \"N/A\")}:{parsed_alert.get(\"dst_port\", \"N/A\")}\\n'\n f'**URL**: {parsed_alert.get(\"url\", \"N/A\")}\\n\\n'\n '---\\n\\n'\n '## 1. 日志类型分析\\n'\n 'Web日志(已通过 log_type_analysis 校验)\\n\\n'\n '## 2. 测绘信息\\n'\n f'{survey_result or \"(无)\"}\\n\\n'\n '## 3. 关联漏洞分析\\n'\n f'{cve_related_result or \"(无)\"}\\n\\n'\n '## 4. 漏洞详情\\n'\n f'{cve_info_result or \"(无)\"}\\n\\n'\n '## 5. 攻击 Payload 分析\\n'\n f'{payload_analysis_result or \"(无)\"}\\n\\n'\n '## 6. 攻击分析结果\\n'\n f'{attack_analysis_result or \"(无)\"}\\n\\n'\n '## 7. 威胁情报\\n'\n f'{intel_md}\\n'\n '## 8. 漏洞情报原始数据\\n'\n f'{vuln_md}\\n'\n '## 9. 原始日志\\n'\n f'```\\n{log_text}\\n```\\n'\n)\n\nws = WorkspaceManager.get_instance()\noutput_dir = str(ws.get_workspace_dir() / 'outputs' / datetime.date.today().isoformat())\nartifacts_dir = os.path.join(output_dir, 'artifacts')\nos.makedirs(artifacts_dir, exist_ok=True)\nreport_path = os.path.join(artifacts_dir, 'final_report.md')\ntool.run('write', filePath=report_path, content=report)\n\nprint(f'[generate_report] verdict={attack_verdict}, title={report_title}, path={report_path}')\n\noutputs['final_report'] = report\noutputs['report_path'] = report_path\noutputs['report_title'] = report_title\noutputs['attack_verdict'] = attack_verdict\noutputs['risk_level'] = risk_level\n" + "description": "汇总 IOC、威胁情报、漏洞、请求响应分析并生成最终报告", + "code": "import os\nimport json\nimport datetime\nfrom flocks.workspace.manager import WorkspaceManager\n\nparsed_alert = inputs.get('parsed_alert', {})\nintel_results = inputs.get('intel_results', [])\nvuln_info = inputs.get('vuln_info', {})\npayload_analysis = inputs.get('payload_analysis', '')\nresponse_analysis = inputs.get('response_analysis', '')\nattack_success = inputs.get('attack_success', False)\n\ndef severity_to_text(value):\n mapping = {0: 'low', 1: 'low', 2: 'medium', 3: 'high', 4: 'critical'}\n if isinstance(value, 
int):\n return mapping.get(value, str(value))\n if str(value).isdigit():\n return mapping.get(int(value), str(value))\n return str(value) or 'unknown'\n\nseverity_text = severity_to_text(parsed_alert.get('severity', 'unknown'))\nreport_content = f\"\"\"# NDR/TDP 告警分析报告\n\n## 执行摘要\n\n**告警类型**: {parsed_alert.get('alert_type', 'unknown')}\n**严重级别**: {severity_text}\n**攻击是否成功**: {'是' if attack_success else '否'}\n**URL**: {parsed_alert.get('url', 'N/A')}\n\n### 主要发现\n- 源 IP: {parsed_alert.get('src_ip', 'N/A')}\n- 目的 IP: {parsed_alert.get('dst_ip', 'N/A')}\n- 协议: {parsed_alert.get('protocol', 'N/A')}\n- 端口: {parsed_alert.get('src_port', 'N/A')} -> {parsed_alert.get('dst_port', 'N/A')}\n- TDP 判定: {parsed_alert.get('threat_result', 'N/A')}\n- 失败信号: {', '.join(parsed_alert.get('failed_by', [])) or '无'}\n\n---\n\n## 详细分析\n\n### 1. IOC\n```json\n{json.dumps(parsed_alert.get('iocs', []), ensure_ascii=False, indent=2)}\n```\n\n### 2. 威胁情报查询结果\n\"\"\"\n\nif intel_results:\n for intel in intel_results:\n report_content += f\"\\n**{intel['source']} - {intel['type']}**: {intel['value']}\\n```\\n{intel['result']}\\n```\\n\"\nelse:\n report_content += '\\n未查询到可用外部情报,或 IOC 仅包含内网/保留地址。\\n'\n\nreport_content += '\\n### 3. 漏洞信息\\n'\nif vuln_info:\n report_content += f\"\\n```json\\n{json.dumps(vuln_info, ensure_ascii=False, indent=2)}\\n```\\n\"\nelse:\n report_content += '\\n未识别到标准漏洞编号,跳过漏洞查询。\\n'\n\nreport_content += f\"\"\"\n### 4. 攻击负载分析\n{payload_analysis or '无'}\n\n### 5. 响应包分析\n{response_analysis or '无'}\n\n---\n\n## 风险评估\n\n- **风险等级**: {'High' if attack_success else 'Medium'}\n- **攻击成功**: {'是 - 建议紧急响应' if attack_success else '否 - 当前更像扫描或失败尝试'}\n\n---\n\n## 建议与行动项\n\n1. {'立即阻断源 IP 通信并检查目标主机是否已落地 WebShell。' if attack_success else '结合 Web/WAF/主机日志复核该请求,确认是否仅为扫描或误报。'}\n2. 检查目标主机 {parsed_alert.get('dst_ip', 'N/A')} 对应 Web 服务和目录访问日志。\n3. 如 URL 涉及敏感上传/脚本路径,建议核查文件完整性与最近变更。\n4. 
将该源地址与其他探测、爆破、上传类告警做关联分析。\n\n---\n\n## 数据来源\n\n- 威胁情报: ThreatBook, VirusTotal\n- 漏洞信息: Vulnerability Database\n- 分析: LLM-based analysis\n\"\"\"\n\nws = WorkspaceManager.get_instance()\noutput_dir = str(ws.get_workspace_dir() / 'outputs' / datetime.date.today().isoformat())\nartifacts_dir = os.path.join(output_dir, 'artifacts')\nos.makedirs(artifacts_dir, exist_ok=True)\nreport_path = os.path.join(artifacts_dir, 'final_report.md')\ntool.run('write', filePath=report_path, content=report_content)\n\noutputs['final_report'] = report_content\noutputs['report_path'] = report_path\noutputs['risk_level'] = 'High' if attack_success else 'Medium'\noutputs['action_required'] = '紧急响应' if attack_success else '持续监控'\noutputs['action_details'] = '建议立即阻断源 IP 通信,隔离受影响系统,并进行深入调查' if attack_success else '建议记录该告警,结合 Web/WAF/主机日志持续监控并复核'" } ], "edges": [ { "from": "receive_alert", - "to": "prepare_intel", - "order": 0 - }, - { - "from": "prepare_intel", - "to": "survey", - "order": 0 - }, - { - "from": "prepare_intel", - "to": "cve_related", - "order": 1 - }, - { - "from": "prepare_intel", - "to": "cve_info", - "order": 2 + "to": "query_threat_intel" }, { - "from": "prepare_intel", - "to": "payload_analysis", - "order": 3 + "from": "receive_alert", + "to": "query_vuln" }, { - "from": "survey", - "to": "attack_analysis_result", - "order": 0 + "from": "receive_alert", + "to": "analyze_payload" }, { - "from": "cve_related", - "to": "attack_analysis_result", - "order": 1 + "from": "receive_alert", + "to": "analyze_response" }, { - "from": "cve_info", - "to": "attack_analysis_result", - "order": 2 + "from": "query_threat_intel", + "to": "join_results" }, { - "from": "payload_analysis", - "to": "attack_analysis_result", - "order": 3 + "from": "query_vuln", + "to": "join_results" }, { - "from": "attack_analysis_result", - "to": "attack_verdict", - "order": 0 + "from": "analyze_payload", + "to": "join_results" }, { - "from": "attack_verdict", - "to": "report_title", - "order": 0 + "from": 
"analyze_response", + "to": "join_results" }, { - "from": "report_title", - "to": "generate_report", - "order": 0 - } - ], - "metadata": { - "node_timeout_s": 600, - "sampleInputs": { - "alert_data": { - "data": [ - { - "attacker": "1.2.3.4", - "victim": "10.0.0.1", - "external_port": 50000, - "machine_port": 80, - "url_host": "vuln.example.com", - "url_path": "/admin/login.php?id=1 OR 1=1", - "net": { - "src_port": 50000, - "dest_port": 80, - "app_proto": "http", - "http": { - "reqs_line": "GET /admin/login.php?id=1 OR 1=1 HTTP/1.1", - "reqs_header": "Host: vuln.example.com\nUser-Agent: sqlmap/1.6", - "req_body": "", - "resp_line": "HTTP/1.1 200 OK", - "resp_header": "Content-Type: text/html", - "resp_body": "You have an error in your SQL syntax", - "status": 200 - } - }, - "threat": { - "name": "SQL注入攻击", - "severity": "high", - "msg": "CVE-2017-12615 detected", - "result": "success" - } - } - ] - } + "from": "join_results", + "to": "generate_report" } - } -} \ No newline at end of file + ] +} diff --git a/.flocks/plugins/workflows/tdp_alert_triage/workflow.md b/.flocks/plugins/workflows/tdp_alert_triage/workflow.md index 9cd394ce4..21f4368f8 100644 --- a/.flocks/plugins/workflows/tdp_alert_triage/workflow.md +++ b/.flocks/plugins/workflows/tdp_alert_triage/workflow.md @@ -1,100 +1,124 @@ -# TDP/NDR Web 日志研判工作流 +# NDR/TDP 告警调查工作流 ## 业务场景 -对 NDR/TDP 上送的 HTTP 日志告警进行标准化研判(默认输入已为 HTTP 日志,无需类型判别):收集情报 → 并行分析(测绘 / 漏洞 / 漏洞详情 / payload)→ 综合给出攻击判定与最终报告。其中**测绘 / 漏洞分析 / 漏洞详情 / 攻击 payload** 四步并行执行以缩短端到端时延。 - -## 流程结构 - -``` -receive_alert (告警解析) - │ - ▼ -prepare_intel (查询 IP 威胁情报 + CVE 漏洞情报) - │ - ▼ -┌────────────────── 并行 4 节点 ──────────────────┐ -│ survey (测绘) │ -│ cve_related (从日志提取 CVE 编号) │ -│ cve_info (展示 CVE 漏洞信息) │ -│ payload_analysis (攻击 payload 分析) │ -└──────────────────────────────────────────────────┘ - │ - ▼ -attack_analysis_result (攻击分析结果,join 节点) - │ - ▼ -attack_verdict (攻击判定:5 类标签) - │ - ▼ -report_title (报告标题) - │ - ▼ -generate_report (输出最终 Markdown 报告) -``` 
- -## 节点详情 - -### 1. `receive_alert` -解析 NDR/TDP 告警 JSON,兼容顶层 `data` 数组与扁平结构。提取 `src_ip/dst_ip/sport/dport/protocol`、HTTP 请求/响应、URL、IOC 列表,以及预扫的 CVE/CNVD/CNNVD/XVE 编号。生成统一的 `log_text` 文本块供下游所有 LLM prompt 使用。 - -### 2. `prepare_intel` -并行块的预处理: -- 对外网 IP 调用 `threatbook_ip_query`(自动跳过 RFC1918 / 回环 / 保留地址) -- 对域名/URL 调用 `threatbook_domain_query` / `threatbook_url_query` -- 若 `receive_alert` 提取到 CVE,调用 `__mcp_vuln_query` 获取详情 - -输出 `intel_content` 与 `vuln_content` 文本块,供 `survey` 与 `cve_info` 的 LLM prompt 直接使用。 - -### 3-6. 并行节点 - -| 节点 | 职责 | 关键约束 | -|------|------|---------| -| `survey` | 总结 IP 情报中的空间测绘信息(标签 + 服务 + 应用资产) | 多 IP 以无序列表显示,无测绘信息则不列出 | -| `cve_related` | 仅从日志文本提取漏洞编号 | 不做任何推测,无编号则输出"日志中无关联漏洞情报" | -| `cve_info` | 基于 vuln_content 输出 CVE 基本信息 | 不输出修复建议、不带额外解释说明 | -| `payload_analysis` | 分析日志中是否包含攻击载荷及判定依据 | 不做攻击意图分析、不做攻击影响分析 | - -4 个节点同时执行,各自独立产出结果。 - -### 7. `attack_analysis_result`(join) -`join: true` —— 等待 4 个并行节点全部完成。按"攻击成功 / 攻击失败 / 攻击 / 未知 / 安全"五分类标准进行长文本判定,输出"攻击状态 + 判定依据 + 详细分析"。同时把所有上游结果透传到下游。落盘到 `attack_analysis_result.md`。 - -### 8. `attack_verdict` -将上一节点的长文本归一化为 5 个标签之一:`attack_success` / `attack_failed` / `attack` / `unknown` / `benign`。LLM 输出做 token 容错。 - -### 9. `report_title` -基于攻击类型 + 判定结果生成 ≤30 字中文标题。对返回内容做引号/括号清理。LLM 失败时回退到 ` - ` 模板。 - -### 10. 
`generate_report` -汇总所有分析结果生成最终 Markdown 报告,9 个章节:执行摘要 → 日志类型 → 测绘 → 关联漏洞 → 漏洞详情 → payload → 攻击分析 → 威胁情报 → 原始日志。落盘到: -``` -~/.flocks/workspace/outputs//artifacts/final_report.md -``` -报告标题包含攻击类型/结果,正文不包含时间戳。 - -## 输入参数 - -```json -{ - "alert_data": "TDP 告警 JSON(list / dict / 嵌套 data 结构均支持)" -} -``` - -## 输出参数 - -| 字段 | 类型 | 说明 | -|------|------|------| -| `final_report` | string | 完整 Markdown 报告 | -| `report_path` | string | 报告文件路径 | -| `report_title` | string | 报告标题 | -| `attack_verdict` | enum | `attack_success` / `attack_failed` / `attack` / `unknown` / `benign` | -| `risk_level` | enum | `High` / `Medium` / `Low` | - -## 工程要点 - -- **并行扇出/扇入**:`prepare_intel` 是唯一 fan-out 起点;`attack_analysis_result` 用 `join: true` 作为唯一 fan-in 汇聚点,符合 flocks workflow 引擎的 lint 要求。 -- **LLM 推理块清洗**:所有 LLM 节点都会用 `_strip_think()` 去除 `...` 推理块,避免模型内部思考过程污染输出。 -- **LLM 容错**:所有调用 `llm.ask` 的节点都对返回结果做了正则提取与回退处理,单一 LLM 输出格式偏差不会让整个工作流失败。 -- **节点超时**:`metadata.node_timeout_s = 600`,留给最慢的 LLM 推理足够时间。 -- **报告落盘**:使用 `WorkspaceManager.get_workspace_dir()` 解析输出根目录,所有产物统一落到 flocks 工作区下的 `outputs//artifacts/`。 +对 NDR/TDP 告警进行自动化研判分析。当前工作流重点兼容 TDP 检索结果这类嵌套结构输入,例如顶层 `data` 数组、`net.http` 请求响应字段以及 `threat` 判定信息,并生成结构化分析报告。 + +## 流程步骤 + +### 1. 接收告警数据 +- **描述**: 接收并解析 NDR/TDP 告警或网络流量日志,提取关键字段(源IP、目的IP、端口、协议、HTTP 请求/响应、IOC 等) +- **工具/模型**: Tool-driven +- **输入**: `alert_data` - 告警 JSON 数据,支持扁平结构或 TDP 风格的 `{ "data": [ ... ] }` +- **输出**: `parsed_alert` - 解析后的告警数据字典 +- **处理逻辑**: + - 自动展开顶层 `data[0]`,兼容直接传入单条告警 + - 优先从 `threat`、`net.http`、`attacker/victim`、`external_ip/server_ip` 等 TDP 字段提取 src/dst、URL、请求与响应 + - 从 `threat.msg`、`threat.tag` 等字段中识别标准漏洞编号(如 `CVE-*`) + - 提取 IOC(IP、域名、URL),并保留原始告警用于后续分析 + +### 2. 
威胁情报查询(并行) +- **描述**: 使用多源威胁情报查询告警中涉及的外部 IP、域名、URL 等指标 +- **工具/模型**: Tool-driven +- **输入**: `parsed_alert` - 解析后的告警数据 +- **输出**: `intel_results` - 威胁情报查询结果汇总 +- **处理逻辑**: + - 遍历告警中的 IOC(IP、域名、URL) + - 自动去重,并跳过内网/保留地址,避免对 `127.0.0.1`、RFC1918 地址做无意义情报查询 + - 使用 `threatbook_ip_query`、`threatbook_domain_query`、`threatbook_url_query` 查询 + - 使用 `virustotal_ip_query`、`virustotal_domain_query`、`virustotal_url_query` 做补充查询 + - 汇总所有情报结果 + +### 3. 漏洞信息查询(并行) +- **描述**: 仅在识别到标准漏洞编号时查询漏洞信息(CVE/CNVD/CNNVD/XVE) +- **工具/模型**: Tool-driven +- **输入**: `parsed_alert` - 可能包含漏洞ID +- **输出**: `vuln_info` - 漏洞详细信息 +- **处理逻辑**: + - 从 `threat.msg`、`threat.tag`、URL 等文本中提取漏洞ID(如 `CVE-2021-xxx`) + - 仅当存在标准漏洞编号时才调用 `__mcp_vuln_query` + - 获取漏洞描述、影响产品、修复方案、POC 等信息 + - 无漏洞ID时返回空结果 + +### 4. 攻击负载分析(并行) +- **描述**: 使用 LLM 分析 HTTP 请求负载,识别攻击/扫描手法与意图 +- **工具/模型**: LLM-driven +- **输入**: `parsed_alert` - 包含 payload +- **输出**: `payload_analysis` - 攻击负载分析结果 +- **处理逻辑**: + - 提取 HTTP 请求行、请求头、请求体 + - 使用 LLM 分析该流量更像攻击、扫描、误报还是正常请求 + - 识别具体攻击/扫描方式和意图 + - **必须落盘**: 将 LLM 分析结果写入 `~/.flocks/workspace/outputs//artifacts/payload_analysis_llm_output.md` + +### 5. 响应包分析与攻击成功判定(并行) +- **描述**: 结合服务器响应包和 TDP 判定字段,判断攻击是否成功 +- **工具/模型**: LLM-driven +- **输入**: `parsed_alert` - 包含请求和响应 +- **输出**: `response_analysis` - 响应分析结果, `attack_success` - 攻击是否成功 +- **处理逻辑**: + - 提取请求包和响应包内容 + - 将 `HTTP status`、`threat.result`、`threat.failed_by` 一并作为判定信号 + - 优先让 LLM 结构化输出成功/失败结论,解析失败时再使用规则兜底 + - **必须落盘**: 将分析结果写入 `~/.flocks/workspace/outputs//artifacts/response_analysis_llm_output.md` + +### 6. 汇聚并行结果 +- **描述**: 使用 `join=true` 等待并行节点全部完成,再把结果归一化后传给报告节点 +- **工具/模型**: Tool-driven +- **输入**: `intel_results`、`vuln_info`、`payload_analysis`、`response_analysis`、`attack_success` +- **输出**: 归一化后的统一上下文 +- **处理逻辑**: + - 等待 4 个并行节点全部完成 + - 透传并规整报告节点所需字段 + - 避免多个并行分支直接汇聚到写文件节点,满足 workflow 引擎约束 + +### 7. 
生成分析报告 +- **描述**: 综合以上分析结果,生成结构化分析报告 +- **工具/模型**: LLM-driven +- **输入**: 所有前序步骤的输出(intel_results, vuln_info, payload_analysis, response_analysis, attack_success) +- **输出**: `final_report` - 完整分析报告 +- **处理逻辑**: + - 汇总情报查询结果 + - 汇总漏洞信息 + - 汇总攻击负载分析和响应分析 + - 根据 `attack_success` 和 TDP 失败信号生成风险等级 + - 生成结构化报告,包含:摘要、IOC、情报、漏洞、分析、风险评估、建议 + - **必须落盘**: 将报告写入 `~/.flocks/workspace/outputs//artifacts/final_report.md` + +## 并行执行设计 + +步骤 2、3、4、5 为并行节点,同时执行以提升效率: +- query_threat_intel: 威胁情报查询 +- query_vuln: 漏洞信息查询 +- analyze_payload: 攻击负载分析 +- analyze_response: 响应包分析与攻击成功判定 + +所有并行节点执行完成后,先汇聚到 `join_results`,再进入 `generate_report` 生成最终报告。 + +## 报告结构 + +### 执行摘要 +- 告警概述 +- 主要发现 +- 风险等级 + +### 详细分析 +- 告警详情 +- 威胁情报结果 +- 漏洞信息(如有) +- 攻击负载分析 +- 响应分析 + +### 关键发现 +- IOC 列表 +- 攻击手法描述 +- 是否成功判定 + +### 风险评估 +- 风险等级 +- 影响范围 + +### 建议与行动项 +- 紧急处置建议 +- 长期加固建议 +- 需要关联分析的系统 From a0fcab4b30bcf71d689f64bb684c7ed5007f3c05 Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Thu, 14 May 2026 14:52:49 +0800 Subject: [PATCH 41/41] =?UTF-8?q?fix(syslog/watcher):=20address=20PR-267?= =?UTF-8?q?=20review=20=E2=80=94=20backpressure,=20atomic-save,=20bind-err?= =?UTF-8?q?or=20reporting?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## fix: syslog ingest — replace semaphore+create_task with a bounded worker pool The previous design drained the bounded queue immediately by spawning an `asyncio.Task` per message; the semaphore was only acquired inside those tasks, so pending coroutines could grow without bound under a syslog flood. Replace with a fixed pool of `_MAX_CONCURRENT_EXECUTIONS` `_worker_loop` coroutines that each `await queue.get()` serially. The pool size is now the *only* concurrency knob; total in-flight workflow runs is exactly the pool size, making backpressure a hard invariant rather than a soft hint. 
## fix(watcher): check dest_path on atomic-save events in tool/agent/skill watchers Editors (vim, VS Code "useAtomicSave", …) persist edits by writing a sibling temp file then renaming it onto the real target. watchdog reports this as a `moved` event where `src_path` is the throwaway name and `dest_path` is the actual `tool.yaml` / `agent.yaml` / `SKILL.md`. Filtering only by `src_path` (the previous behaviour) silently skipped these valid updates. Extract the path-matching logic into module-level predicates `_tool_event_should_reload`, `_agent_event_should_reload`, and `_skill_event_should_reload` that inspect both endpoints before deciding to skip an event. ## fix(syslog): save-config endpoint surfaces bind failures synchronously `restart_workflow` now blocks (up to `_BIND_WAIT_TIMEOUT_S = 3 s`) until the socket either binds successfully or raises `OSError`. Bind failures are recorded in `_listener_status` with `state="failed"` and propagated back to `POST /api/workflow/{id}/syslog-config` as `409 Conflict`, so the UI sees an actionable error instead of a false "Listening" badge. Add `GET /api/workflow/{id}/syslog-status` to expose the runtime state (binding / listening / failed / stopped, queue depth, worker count) independently of the persisted config. Update `IntegrationTab.tsx` to derive the "Listening" indicator from the runtime `syslogStatus.state` rather than the saved `enabled` field, add an amber "binding…" indicator, red error banners on failure, and a live queue-depth readout while listening. 
## test: add pytest-runnable regression tests for all three areas - `tests/ingest/test_syslog_manager_backpressure.py` — worker pool bounds in-flight dispatches, queue drops on full, stop_workflow cancels pool cleanly - `tests/ingest/test_syslog_manager_bind_failure.py` — restart_workflow reports state="failed" on port conflict; state="stopped" when disabled - `tests/tool/test_watcher_atomic_save.py` — all three watcher predicates accept dest_path on rename and reject unrelated / hidden / pycache paths 15 new tests, all passing. Co-authored-by: Cursor --- flocks/agent/registry.py | 25 +- flocks/ingest/syslog/manager.py | 286 +++++++++++++----- flocks/server/routes/workflow.py | 37 ++- flocks/skill/skill.py | 17 +- flocks/tool/registry.py | 44 ++- tests/ingest/__init__.py | 0 .../test_syslog_manager_backpressure.py | 155 ++++++++++ .../test_syslog_manager_bind_failure.py | 106 +++++++ tests/tool/test_watcher_atomic_save.py | 107 +++++++ webui/src/api/workflow.ts | 20 +- .../WorkflowDetail/tabs/IntegrationTab.tsx | 98 +++++- 11 files changed, 788 insertions(+), 107 deletions(-) create mode 100644 tests/ingest/__init__.py create mode 100644 tests/ingest/test_syslog_manager_backpressure.py create mode 100644 tests/ingest/test_syslog_manager_bind_failure.py create mode 100644 tests/tool/test_watcher_atomic_save.py diff --git a/flocks/agent/registry.py b/flocks/agent/registry.py index b73fd517a..e1643b541 100644 --- a/flocks/agent/registry.py +++ b/flocks/agent/registry.py @@ -579,6 +579,27 @@ def unregister(cls, name: str) -> bool: # --------------------------------------------------------------------------- +def _agent_event_should_reload(event: object) -> bool: + """Return True if a watchdog event should invalidate the agent cache. + + Mirrors ``flocks.tool.registry._tool_event_should_reload``: atomic-save + editors surface the real target via ``dest_path``, so we inspect both + endpoints before deciding to skip. 
+ """ + candidates = [] + src = getattr(event, "src_path", "") or "" + if src: + candidates.append(src) + dest = getattr(event, "dest_path", "") or "" + if dest: + candidates.append(dest) + for path in candidates: + fname = os.path.basename(path) + if fname == "agent.yaml" or path.endswith(".md"): + return True + return False + + class AgentFileWatcher: """Watch plugin agent directories and auto-invalidate the Agent cache on change. @@ -634,9 +655,7 @@ def on_any_event(self, event: FileSystemEvent) -> None: return if getattr(event, "event_type", "") not in _RELOAD_EVENT_TYPES: return - src = getattr(event, "src_path", "") or "" - fname = os.path.basename(src) - if fname == "agent.yaml" or src.endswith(".md"): + if _agent_event_should_reload(event): watcher._schedule_invalidate() handler = _Handler() diff --git a/flocks/ingest/syslog/manager.py b/flocks/ingest/syslog/manager.py index 040545dd3..79438dd1a 100644 --- a/flocks/ingest/syslog/manager.py +++ b/flocks/ingest/syslog/manager.py @@ -4,7 +4,7 @@ import asyncio import time -from typing import Any, Dict +from typing import Any, Dict, List from flocks.storage.storage import Storage from flocks.utils.log import Log @@ -25,20 +25,43 @@ _MAX_CONCURRENT_EXECUTIONS = 8 # Maximum number of buffered syslog messages per workflow; excess messages are dropped with a warning _MAX_QUEUE_SIZE = 200 +# Maximum time we wait for the listener to either bind successfully or fail +# during ``restart_workflow``. Any value <0.5s makes the call too aggressive +# under busy event-loops; anything >5s would make the HTTP save endpoint feel +# hung when the user makes a typo. +_BIND_WAIT_TIMEOUT_S = 3.0 class SyslogManager: - """One async listener task per workflow id (when enabled).""" + """One async listener task per workflow id (when enabled). 
+ + The listener / consumer fan-out is built around bounded primitives so a + syslog flood cannot translate into unbounded asyncio.Task growth: + + * A bounded ``asyncio.Queue`` (``_MAX_QUEUE_SIZE``) absorbs spikes; the + listener uses ``put_nowait`` and drops excess messages with a warning. + * A fixed pool of ``_MAX_CONCURRENT_EXECUTIONS`` worker coroutines drains + the queue and runs ``_trigger_workflow`` serially per worker. This is + stronger than a per-task ``Semaphore``: the previous design called + ``create_task`` for every queued message and only awaited the semaphore + *inside* the task, which let pending coroutines accumulate without + bound while the queue was emptied immediately. + """ def __init__(self) -> None: self._tasks: dict[str, asyncio.Task] = {} self._abort_events: dict[str, asyncio.Event] = {} - # Per-workflow semaphore to cap concurrent executions - self._semaphores: dict[str, asyncio.Semaphore] = {} # Per-workflow bounded message queue for backpressure self._queues: dict[str, asyncio.Queue] = {} - # Per-workflow queue consumer task - self._consumer_tasks: dict[str, asyncio.Task] = {} + # Per-workflow fixed worker pool draining the queue + self._worker_pools: dict[str, List[asyncio.Task]] = {} + # Per-workflow listener runtime status for the syslog-status API. + # Possible state values: "binding" | "listening" | "failed" | "stopped". + self._listener_status: dict[str, Dict[str, Any]] = {} + # Per-workflow event signalled when the listener has either bound + # successfully or failed. Used by ``restart_workflow`` so the HTTP + # save endpoint can report bind failures synchronously. 
+ self._listener_ready: dict[str, asyncio.Event] = {} @staticmethod def _config_key(workflow_id: str) -> str: @@ -69,6 +92,25 @@ async def stop_all(self) -> None: for workflow_id in list(self._tasks.keys()): await self.stop_workflow(workflow_id) + def get_listener_status(self, workflow_id: str) -> Dict[str, Any]: + """Return a snapshot of the listener runtime state for ``workflow_id``. + + Result shape:: + + {"state": "binding|listening|failed|stopped", "error": "..." | None, + "host": "...", "port": 5140, "protocol": "udp|tcp", + "queueSize": 12, "queueCapacity": 200, "workerCount": 8} + """ + status = dict(self._listener_status.get(workflow_id) or {"state": "stopped"}) + q = self._queues.get(workflow_id) + if q is not None: + status["queueSize"] = q.qsize() + status["queueCapacity"] = q.maxsize + pool = self._worker_pools.get(workflow_id) + if pool is not None: + status["workerCount"] = sum(1 for t in pool if not t.done()) + return status + async def stop_workflow(self, workflow_id: str) -> None: ev = self._abort_events.pop(workflow_id, None) if ev is not None: @@ -80,61 +122,120 @@ async def stop_workflow(self, workflow_id: str) -> None: await task except asyncio.CancelledError: pass - # Stop the queue consumer task - consumer = self._consumer_tasks.pop(workflow_id, None) - if consumer is not None and not consumer.done(): - consumer.cancel() + # Cancel all worker pool tasks; pop first so callers observing a + # stopped listener see an empty pool immediately. 
+ pool = self._worker_pools.pop(workflow_id, None) + if pool: + for w in pool: + if not w.done(): + w.cancel() try: - await consumer - except asyncio.CancelledError: + await asyncio.wait_for( + asyncio.gather(*pool, return_exceptions=True), + timeout=5.0, + ) + except (asyncio.TimeoutError, asyncio.CancelledError): pass - self._semaphores.pop(workflow_id, None) self._queues.pop(workflow_id, None) - - async def restart_workflow(self, workflow_id: str) -> None: + self._listener_ready.pop(workflow_id, None) + if workflow_id in self._listener_status: + self._listener_status[workflow_id] = {"state": "stopped", "error": None} + + async def restart_workflow(self, workflow_id: str) -> Dict[str, Any]: + """Restart the listener and return its post-bind runtime status. + + This call blocks until the underlying socket either binds successfully, + the bind fails (OSError such as ``EADDRINUSE``), or + ``_BIND_WAIT_TIMEOUT_S`` elapses. Callers (e.g. the HTTP + ``save_syslog_config`` route) can therefore surface bind errors to the + user instead of silently leaving the listener in a failed state. 
+ """ await self.stop_workflow(workflow_id) key = self._config_key(workflow_id) try: data = await Storage.read(key) except Exception as exc: log.warning("syslog.restart_read_failed", {"workflow_id": workflow_id, "error": str(exc)}) - return + return {"state": "failed", "error": str(exc)} if not isinstance(data, dict) or not data.get("enabled"): - return + self._listener_status[workflow_id] = {"state": "stopped", "error": None} + return {"state": "stopped", "error": None} # Load and cache the workflow JSON once; avoids a disk read per message wf_data = read_workflow_from_fs(workflow_id) if not wf_data: + err = "workflow_not_found" + self._listener_status[workflow_id] = {"state": "failed", "error": err} log.warning("syslog.workflow_not_found_on_start", {"workflow_id": workflow_id}) - return + return {"state": "failed", "error": err} workflow_json = wf_data.get("workflowJson") if not workflow_json: + err = "workflow_json_missing" + self._listener_status[workflow_id] = {"state": "failed", "error": err} log.warning("syslog.workflow_json_missing_on_start", {"workflow_id": workflow_id}) - return + return {"state": "failed", "error": err} - # Set up concurrency control resources - self._semaphores[workflow_id] = asyncio.Semaphore(_MAX_CONCURRENT_EXECUTIONS) queue: asyncio.Queue = asyncio.Queue(maxsize=_MAX_QUEUE_SIZE) self._queues[workflow_id] = queue abort = asyncio.Event() self._abort_events[workflow_id] = abort + ready = asyncio.Event() + self._listener_ready[workflow_id] = ready + + host = str(data.get("host") or "0.0.0.0") + port = int(data.get("port") or 5140) + protocol = str(data.get("protocol") or "udp").lower() + self._listener_status[workflow_id] = { + "state": "binding", + "error": None, + "host": host, + "port": port, + "protocol": protocol, + } + input_key = str(data.get("inputKey") or "syslog_message") - # Start the consumer that drains the queue and dispatches executions under the semaphore - consumer = asyncio.create_task( - 
self._queue_consumer(workflow_id, workflow_json, input_key, queue, abort), - name=f"syslog-consumer-{workflow_id}", - ) - self._consumer_tasks[workflow_id] = consumer + # Spin up a fixed worker pool: exactly _MAX_CONCURRENT_EXECUTIONS + # coroutines drain the queue. pending tasks cannot exceed this number, + # which is the actual backpressure invariant we want. + workers: List[asyncio.Task] = [] + for i in range(_MAX_CONCURRENT_EXECUTIONS): + workers.append( + asyncio.create_task( + self._worker_loop(workflow_id, workflow_json, input_key, queue, abort), + name=f"syslog-worker-{workflow_id}-{i}", + ) + ) + self._worker_pools[workflow_id] = workers task = asyncio.create_task( - self._listener_loop(workflow_id, data, queue, abort), + self._listener_loop(workflow_id, data, queue, abort, ready), name=f"syslog-{workflow_id}", ) self._tasks[workflow_id] = task + + # Wait briefly for the listener to bind (or fail) so the caller can + # decide whether to surface a 502/Conflict instead of pretending the + # listener is up. + try: + await asyncio.wait_for(ready.wait(), timeout=_BIND_WAIT_TIMEOUT_S) + except asyncio.TimeoutError: + # Listener hasn't reported bind result; treat as best-effort + # "scheduled" so we don't tear it down on slow boxes, but mark the + # state explicitly so the UI can show "pending". 
+ current = self._listener_status.get(workflow_id) or {} + if current.get("state") == "binding": + self._listener_status[workflow_id] = { + **current, + "state": "binding", + "error": "bind_pending_timeout", + } + log.warning("syslog.bind_pending_timeout", {"workflow_id": workflow_id}) + log.info("syslog.listener_scheduled", {"workflow_id": workflow_id}) + return self.get_listener_status(workflow_id) async def _listener_loop( self, @@ -142,6 +243,7 @@ async def _listener_loop( config: Dict[str, Any], queue: asyncio.Queue, abort: asyncio.Event, + ready: asyncio.Event, ) -> None: host = str(config.get("host") or "0.0.0.0") port = int(config.get("port") or 5140) @@ -160,34 +262,73 @@ def on_msg(parsed: dict) -> None: "queue_size": queue.qsize(), }) + async def _bind_and_serve() -> None: + """Bind the socket synchronously then mark the listener ready. + + ``run_udp_syslog_server`` / ``run_tcp_syslog_server`` create the + endpoint at the top of their body and then await abort; we wrap + them with a tiny helper so we can flip the ``ready`` flag + *after* the bind has succeeded. Bind failures are caught by the + outer ``try`` below and reported back as ``state="failed"``. + """ + # We rely on the underlying asyncio APIs raising OSError before + # they yield control, so wrapping the call alone is enough. We + # additionally schedule a single-shot "mark ready" task that + # runs on the next event-loop tick — by which point the bind has + # either succeeded or raised. + mark_task = asyncio.create_task(_mark_ready_after_bind()) + try: + if protocol == "tcp": + await run_tcp_syslog_server(host, port, format_hint, on_msg, abort_event=abort) + else: + await run_udp_syslog_server(host, port, format_hint, on_msg, abort_event=abort) + finally: + if not mark_task.done(): + mark_task.cancel() + + async def _mark_ready_after_bind() -> None: + # Give the bind one event-loop tick to complete (or raise) so we + # don't claim "listening" before the socket actually exists. 
+ await asyncio.sleep(0) + if not ready.is_set(): + self._listener_status[workflow_id] = { + "state": "listening", + "error": None, + "host": host, + "port": port, + "protocol": protocol, + } + ready.set() + try: - if protocol == "tcp": - await run_tcp_syslog_server( - host, - port, - format_hint, - on_msg, - abort_event=abort, - ) - else: - await run_udp_syslog_server( - host, - port, - format_hint, - on_msg, - abort_event=abort, - ) + await _bind_and_serve() except asyncio.CancelledError: raise except OSError as exc: + self._listener_status[workflow_id] = { + "state": "failed", + "error": str(exc), + "host": host, + "port": port, + "protocol": protocol, + } + ready.set() log.error( "syslog.bind_failed", {"workflow_id": workflow_id, "error": str(exc), "host": host, "port": port, "protocol": protocol}, ) except Exception as exc: + self._listener_status[workflow_id] = { + "state": "failed", + "error": str(exc), + "host": host, + "port": port, + "protocol": protocol, + } + ready.set() log.error("syslog.listener_error", {"workflow_id": workflow_id, "error": str(exc)}) - async def _queue_consumer( + async def _worker_loop( self, workflow_id: str, workflow_json: Any, @@ -195,41 +336,28 @@ async def _queue_consumer( queue: asyncio.Queue, abort: asyncio.Event, ) -> None: - """Drain the message queue and dispatch executions bounded by the semaphore.""" - semaphore = self._semaphores[workflow_id] - pending: set[asyncio.Task] = set() - - async def _dispatch(m: dict) -> None: - async with semaphore: - await self._trigger_workflow(workflow_id, workflow_json, m, input_key) + """One worker drains the queue serially. 
- try: - while not abort.is_set(): - try: - # Poll with a short timeout so we can react to abort promptly - msg = await asyncio.wait_for(queue.get(), timeout=0.5) - except asyncio.TimeoutError: - continue - - t = asyncio.create_task(_dispatch(msg)) - pending.add(t) - t.add_done_callback(pending.discard) - except asyncio.CancelledError: - pass - finally: - # Best-effort drain: wait briefly for in-flight dispatches so their - # final Storage writes complete; cancel anything still stuck so we - # don't leak tasks on shutdown. - if pending: - try: - await asyncio.wait_for( - asyncio.gather(*pending, return_exceptions=True), - timeout=5.0, - ) - except (asyncio.TimeoutError, asyncio.CancelledError): - for t in list(pending): - if not t.done(): - t.cancel() + The worker pool size is the *only* concurrency knob; we deliberately + do not spawn additional asyncio.Tasks per message so the total number + of in-flight workflow runs is exactly ``_MAX_CONCURRENT_EXECUTIONS``. + """ + while not abort.is_set(): + try: + msg = await asyncio.wait_for(queue.get(), timeout=0.5) + except asyncio.TimeoutError: + continue + except asyncio.CancelledError: + return + try: + await self._trigger_workflow(workflow_id, workflow_json, msg, input_key) + except asyncio.CancelledError: + return + except Exception as exc: + log.warning( + "syslog.worker_dispatch_failed", + {"workflow_id": workflow_id, "error": str(exc)}, + ) async def _trigger_workflow( self, diff --git a/flocks/server/routes/workflow.py b/flocks/server/routes/workflow.py index db95920d1..8f77ee3f7 100644 --- a/flocks/server/routes/workflow.py +++ b/flocks/server/routes/workflow.py @@ -1597,7 +1597,12 @@ async def get_kafka_config(workflow_id: str): async def save_syslog_config(workflow_id: str, req: SyslogConfigRequest): """ Save syslog listener configuration for a workflow. - When enabled, starts UDP/TCP listener and passes parsed messages to workflow inputs. 
+ + When ``enabled`` is true, this also (re)starts the UDP/TCP listener and + blocks until the underlying socket has either bound successfully or the + bind has failed (e.g. ``EADDRINUSE``, invalid host). Bind failures are + surfaced as ``409 Conflict`` so the UI can show an actionable error + instead of falsely claiming "Listening". """ try: if not _read_workflow_from_fs(workflow_id): @@ -1617,8 +1622,15 @@ async def save_syslog_config(workflow_id: str, req: SyslogConfigRequest): from flocks.ingest.syslog.manager import default_manager as _syslog_default_manager - await _syslog_default_manager.restart_workflow(workflow_id) - return {"ok": True} + status = await _syslog_default_manager.restart_workflow(workflow_id) + state = (status or {}).get("state") + if req.enabled and state == "failed": + err = (status or {}).get("error") or "listener_bind_failed" + raise HTTPException( + status_code=409, + detail=f"Syslog listener failed to bind: {err}", + ) + return {"ok": True, "listener": status} except HTTPException: raise except Exception as e: @@ -1637,6 +1649,25 @@ async def get_syslog_config(workflow_id: str): raise HTTPException(status_code=500, detail=f"Failed to get syslog config: {str(e)}") +@router.get("/workflow/{workflow_id}/syslog-status") +async def get_syslog_status(workflow_id: str): + """Return the *runtime* status of the syslog listener for a workflow. + + This reflects the actual bind state (binding/listening/failed/stopped) and + queue depth, so the UI can show whether a saved-but-not-yet-bound listener + is actually running. The persisted config (``/syslog-config``) only + captures *intent*, which is why the UI must consult this endpoint to + truthfully render "Listening". 
+ """ + try: + from flocks.ingest.syslog.manager import default_manager as _syslog_default_manager + + return _syslog_default_manager.get_listener_status(workflow_id) + except Exception as e: + log.error("workflow.syslog_status.get.error", {"id": workflow_id, "error": str(e)}) + raise HTTPException(status_code=500, detail=f"Failed to get syslog status: {str(e)}") + + # ============================================================================= # API Endpoints - Run Single Node # ============================================================================= diff --git a/flocks/skill/skill.py b/flocks/skill/skill.py index 1e854b924..003682d3d 100644 --- a/flocks/skill/skill.py +++ b/flocks/skill/skill.py @@ -447,6 +447,20 @@ def stop_watcher(cls) -> None: cls._watcher = None +def _skill_event_should_reload(event: object) -> bool: + """Return True if a watchdog event affects a ``SKILL.md`` file. + + Atomic-save flows rename a temp file onto the real ``SKILL.md``; we have + to consult both ``src_path`` and ``dest_path`` so the watcher reloads on + those renames as well. 
+ """ + for attr in ("src_path", "dest_path"): + path = getattr(event, attr, "") or "" + if path.endswith("SKILL.md"): + return True + return False + + class SkillFileWatcher: """ Watches skill directories for SKILL.md changes and auto-invalidates @@ -494,8 +508,7 @@ def on_any_event(self, event: FileSystemEvent): return if getattr(event, "event_type", "") not in _RELOAD_EVENT_TYPES: return - src = getattr(event, "src_path", "") or "" - if src.endswith("SKILL.md"): + if _skill_event_should_reload(event): watcher._schedule_clear() handler = _Handler() diff --git a/flocks/tool/registry.py b/flocks/tool/registry.py index be20ec338..c41f3b22c 100644 --- a/flocks/tool/registry.py +++ b/flocks/tool/registry.py @@ -1495,6 +1495,42 @@ def _register_dynamic_tools(cls) -> None: # --------------------------------------------------------------------------- +def _tool_event_should_reload(event: object) -> bool: + """Return True if a watchdog filesystem event should trigger a plugin reload. + + Atomic-save editors (vim, VS Code "useAtomicSave", many GUI tools, …) + persist edits by writing a sibling temp file then ``rename`` ing it onto + the real target. watchdog surfaces this as a ``moved`` event whose + ``src_path`` is the throwaway temp filename and whose ``dest_path`` is the + real ``tool.yaml`` / ``*.py``. Filtering only by ``src_path`` (the + pre-fix behaviour) misses the real edit entirely, so we have to inspect + both endpoints. + + Exposed at module scope so it can be unit-tested without spinning up + ``watchdog.observers.Observer`` against a temp directory. 
+ """ + candidate_paths: List[str] = [] + src = getattr(event, "src_path", "") or "" + if src: + candidate_paths.append(src) + dest = getattr(event, "dest_path", "") or "" + if dest: + candidate_paths.append(dest) + if not candidate_paths: + return False + + for path in candidate_paths: + if not (path.endswith(".yaml") or path.endswith(".py")): + continue + fname = os.path.basename(path) + # Ignore Python bytecode / temp / hidden files that get touched + # during normal imports but never carry plugin definitions. + if fname.startswith(".") or fname.startswith("_") or "/__pycache__/" in path: + continue + return True + return False + + class ToolFileWatcher: """Watch plugin tool directories and auto-reload plugin tools on change. @@ -1560,13 +1596,7 @@ def on_any_event(self, event: FileSystemEvent) -> None: return if getattr(event, "event_type", "") not in _RELOAD_EVENT_TYPES: return - src = getattr(event, "src_path", "") or "" - if not (src.endswith(".yaml") or src.endswith(".py")): - return - # Ignore Python bytecode / temp / hidden files that get touched - # during normal imports but never carry plugin definitions. - fname = os.path.basename(src) - if fname.startswith(".") or fname.startswith("_") or "/__pycache__/" in src: + if not _tool_event_should_reload(event): return watcher._schedule_refresh() diff --git a/tests/ingest/__init__.py b/tests/ingest/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/ingest/test_syslog_manager_backpressure.py b/tests/ingest/test_syslog_manager_backpressure.py new file mode 100644 index 000000000..7bc859b15 --- /dev/null +++ b/tests/ingest/test_syslog_manager_backpressure.py @@ -0,0 +1,155 @@ +"""Regression tests for the syslog → workflow backpressure pipeline. + +These tests exercise ``SyslogManager`` in isolation (no UDP/TCP sockets) by +driving the bounded queue directly. They verify two invariants that the +previous semaphore-based design did *not* guarantee: + +1. 
Under a sustained burst the number of in-flight workflow dispatches is + bounded by ``_MAX_CONCURRENT_EXECUTIONS`` — not by the number of messages + the listener has shoved into the queue. +2. The bounded queue itself rejects excess messages via ``QueueFull`` so the + listener can drop+log instead of growing the consumer's pending-task set. + +These tests deliberately *do not* rely on networking; the listener loop is +covered by a separate route-level test that exercises the bind failure path. +""" + +from __future__ import annotations + +import asyncio + +import pytest + +from flocks.ingest.syslog import manager as syslog_manager + + +@pytest.mark.asyncio +async def test_worker_pool_bounds_in_flight_dispatches(monkeypatch: pytest.MonkeyPatch) -> None: + """The fixed worker pool must cap concurrent ``_trigger_workflow`` calls. + + We replace ``_trigger_workflow`` with an instrumented coroutine that + increments a counter on entry and asserts it never exceeds the worker + pool size before exiting. Then we feed N messages (much larger than the + pool) into the queue and let the workers drain them. + """ + + manager = syslog_manager.SyslogManager() + pool_size = syslog_manager._MAX_CONCURRENT_EXECUTIONS + + in_flight = 0 + max_in_flight = 0 + completed = 0 + lock = asyncio.Lock() + + async def _fake_trigger(workflow_id, workflow_json, msg, input_key): # noqa: ANN001 + nonlocal in_flight, max_in_flight, completed + async with lock: + in_flight += 1 + if in_flight > max_in_flight: + max_in_flight = in_flight + # Hold the worker briefly so a true concurrency violation would be + # observable; we cooperate with the event loop with a small sleep. 
+ await asyncio.sleep(0.01) + async with lock: + in_flight -= 1 + completed += 1 + + monkeypatch.setattr(manager, "_trigger_workflow", _fake_trigger) + + workflow_id = "test-wf" + queue: asyncio.Queue = asyncio.Queue(maxsize=syslog_manager._MAX_QUEUE_SIZE) + abort = asyncio.Event() + + # Wire the manager up the same way ``restart_workflow`` would, minus the + # listener task itself (which would try to bind a real socket). + manager._queues[workflow_id] = queue + manager._abort_events[workflow_id] = abort + workers = [ + asyncio.create_task( + manager._worker_loop(workflow_id, {}, "syslog_message", queue, abort), + name=f"test-worker-{i}", + ) + for i in range(pool_size) + ] + manager._worker_pools[workflow_id] = workers + + # Burst-fill the queue with more work than the pool can do at once. + burst_size = pool_size * 6 + for i in range(burst_size): + queue.put_nowait({"_seq": i, "_trigger": "test"}) + + # Wait for the workers to drain the queue. + deadline = asyncio.get_event_loop().time() + 5.0 + while completed < burst_size and asyncio.get_event_loop().time() < deadline: + await asyncio.sleep(0.02) + + abort.set() + for w in workers: + w.cancel() + await asyncio.gather(*workers, return_exceptions=True) + + assert completed == burst_size, f"expected {burst_size} dispatches, got {completed}" + assert max_in_flight <= pool_size, ( + f"in-flight dispatches exceeded worker pool size: " + f"max_in_flight={max_in_flight}, pool_size={pool_size}" + ) + + +@pytest.mark.asyncio +async def test_bounded_queue_drops_excess_on_full() -> None: + """``put_nowait`` must raise ``QueueFull`` once capacity is reached. + + This is the contract the synchronous ``on_msg`` callback relies on; the + listener catches ``QueueFull`` and emits ``syslog.queue_full_dropped`` + instead of growing the queue unboundedly. 
+ """ + + queue: asyncio.Queue = asyncio.Queue(maxsize=4) + for i in range(4): + queue.put_nowait({"_seq": i}) + with pytest.raises(asyncio.QueueFull): + queue.put_nowait({"_seq": 99}) + assert queue.qsize() == 4 + + +@pytest.mark.asyncio +async def test_stop_workflow_cancels_worker_pool() -> None: + """``stop_workflow`` must cancel and drain the worker pool cleanly. + + Leaking worker tasks would re-introduce the symptom the worker-pool + refactor was designed to prevent (orphan coroutines holding queue + references after the listener has stopped). + """ + + manager = syslog_manager.SyslogManager() + workflow_id = "test-wf-stop" + queue: asyncio.Queue = asyncio.Queue(maxsize=8) + abort = asyncio.Event() + manager._queues[workflow_id] = queue + manager._abort_events[workflow_id] = abort + manager._listener_status[workflow_id] = {"state": "listening", "error": None} + + async def _noop_trigger(*args, **kwargs): # noqa: ANN001, D401 + return None + + manager._trigger_workflow = _noop_trigger # type: ignore[assignment] + + workers = [ + asyncio.create_task( + manager._worker_loop(workflow_id, {}, "syslog_message", queue, abort), + name=f"stop-worker-{i}", + ) + for i in range(3) + ] + manager._worker_pools[workflow_id] = workers + + # Let workers loop once. + await asyncio.sleep(0.05) + + await manager.stop_workflow(workflow_id) + + for w in workers: + assert w.done(), "stop_workflow must terminate every worker in the pool" + assert workflow_id not in manager._worker_pools + assert workflow_id not in manager._queues + assert manager._listener_status[workflow_id]["state"] == "stopped" diff --git a/tests/ingest/test_syslog_manager_bind_failure.py b/tests/ingest/test_syslog_manager_bind_failure.py new file mode 100644 index 000000000..ecd621fcb --- /dev/null +++ b/tests/ingest/test_syslog_manager_bind_failure.py @@ -0,0 +1,106 @@ +"""Regression test for the bind-failure path of ``SyslogManager.restart_workflow``. 
+ +The HTTP ``POST /api/workflow/{id}/syslog-config`` endpoint relies on +``restart_workflow`` synchronously reporting the listener's terminal state so +the route can return ``409 Conflict`` instead of falsely claiming success. + +We reproduce the failure by binding our own UDP socket on a chosen port and +then asking ``SyslogManager`` to start a listener for the same host/port; the +``OSError`` raised inside ``_listener_loop`` must surface as +``state == "failed"`` in the returned status. +""" + +from __future__ import annotations + +import asyncio +import socket + +import pytest + +from flocks.ingest.syslog import manager as syslog_manager + + +def _find_busy_udp_port() -> tuple[socket.socket, int]: + """Bind a UDP socket on a free port and return it (still bound).""" + sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + sock.bind(("127.0.0.1", 0)) + port = sock.getsockname()[1] + return sock, port + + +@pytest.mark.asyncio +async def test_restart_workflow_reports_failure_on_port_conflict( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Restarting a listener on a busy port must yield state="failed".""" + busy_sock, busy_port = _find_busy_udp_port() + try: + workflow_id = "wf-bind-fail" + config = { + "workflowId": workflow_id, + "enabled": True, + "protocol": "udp", + "host": "127.0.0.1", + "port": busy_port, + "format": "auto", + "inputKey": "syslog_message", + } + + async def _fake_storage_read(key: str): # noqa: ANN001 + if key == syslog_manager.SyslogManager._config_key(workflow_id): + return config + return None + + def _fake_read_workflow_from_fs(wid: str): # noqa: ANN001 + return { + "id": wid, + "workflowJson": { + "start": "n1", + "nodes": [{"id": "n1", "type": "python", "code": "result = {'ok': True}"}], + "edges": [], + }, + } + + # Patch the *module-level* names ``manager.py`` looks up at call time. 
+ monkeypatch.setattr(syslog_manager.Storage, "read", _fake_storage_read) + monkeypatch.setattr(syslog_manager, "read_workflow_from_fs", _fake_read_workflow_from_fs) + + manager = syslog_manager.SyslogManager() + try: + status = await manager.restart_workflow(workflow_id) + assert status["state"] == "failed", ( + f"expected state='failed' on busy port, got {status!r}" + ) + assert status.get("error"), "failed status must include an error message" + assert status["port"] == busy_port + finally: + await manager.stop_workflow(workflow_id) + finally: + busy_sock.close() + + +@pytest.mark.asyncio +async def test_restart_workflow_returns_stopped_when_disabled( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """A saved-but-disabled config must report state="stopped".""" + workflow_id = "wf-disabled" + config = { + "workflowId": workflow_id, + "enabled": False, + "protocol": "udp", + "host": "127.0.0.1", + "port": 9999, + "format": "auto", + "inputKey": "syslog_message", + } + + async def _fake_storage_read(key: str): # noqa: ANN001 + return config + + monkeypatch.setattr(syslog_manager.Storage, "read", _fake_storage_read) + + manager = syslog_manager.SyslogManager() + status = await manager.restart_workflow(workflow_id) + assert status == {"state": "stopped", "error": None} + assert manager.get_listener_status(workflow_id) == {"state": "stopped", "error": None} diff --git a/tests/tool/test_watcher_atomic_save.py b/tests/tool/test_watcher_atomic_save.py new file mode 100644 index 000000000..e88c6a053 --- /dev/null +++ b/tests/tool/test_watcher_atomic_save.py @@ -0,0 +1,107 @@ +"""Regression tests for atomic-save handling in plugin/agent/skill watchers. + +Atomic-save editors persist edits by writing a sibling temp file and then +``rename``-ing it onto the real target. watchdog surfaces this as a +``FileMovedEvent`` whose ``src_path`` is the temp filename and whose +``dest_path`` is the actual ``tool.yaml`` / ``agent.yaml`` / ``SKILL.md``. 
+ +These tests pin down the contract enforced by the three module-level +predicates that the watcher event handlers delegate to. +""" + +from __future__ import annotations + +from types import SimpleNamespace + +from flocks.tool.registry import _tool_event_should_reload +from flocks.agent.registry import _agent_event_should_reload +from flocks.skill.skill import _skill_event_should_reload + + +def _move_event(src: str, dest: str) -> SimpleNamespace: + return SimpleNamespace(event_type="moved", src_path=src, dest_path=dest, is_directory=False) + + +def _modify_event(path: str) -> SimpleNamespace: + return SimpleNamespace(event_type="modified", src_path=path, dest_path="", is_directory=False) + + +# --------------------------------------------------------------------------- +# Tool watcher predicate +# --------------------------------------------------------------------------- + + +def test_tool_watcher_accepts_dest_path_on_atomic_save() -> None: + """A rename of ```` -> ``tool.yaml`` must trigger a reload.""" + evt = _move_event( + src="/repo/.flocks/plugins/tools/api/foo/.tool.yaml.swp", + dest="/repo/.flocks/plugins/tools/api/foo/tool.yaml", + ) + assert _tool_event_should_reload(evt) is True + + +def test_tool_watcher_accepts_python_atomic_save() -> None: + evt = _move_event( + src="/repo/.flocks/plugins/tools/python/foo/.tool.py.4321~", + dest="/repo/.flocks/plugins/tools/python/foo/tool.py", + ) + assert _tool_event_should_reload(evt) is True + + +def test_tool_watcher_rejects_irrelevant_paths() -> None: + assert _tool_event_should_reload(_modify_event("/repo/.flocks/plugins/tools/api/foo/README")) is False + assert _tool_event_should_reload(_modify_event("/repo/.flocks/plugins/tools/api/foo/__pycache__/x.py")) is False + assert _tool_event_should_reload(_modify_event("/repo/.flocks/plugins/tools/api/foo/.hidden.yaml")) is False + assert _tool_event_should_reload(_modify_event("/repo/.flocks/plugins/tools/api/foo/_tmp.yaml")) is False + + +def 
test_tool_watcher_accepts_direct_modify_on_yaml() -> None: + evt = _modify_event("/repo/.flocks/plugins/tools/api/foo/tool.yaml") + assert _tool_event_should_reload(evt) is True + + +# --------------------------------------------------------------------------- +# Agent watcher predicate +# --------------------------------------------------------------------------- + + +def test_agent_watcher_accepts_dest_path_on_atomic_save() -> None: + evt = _move_event( + src="/repo/.flocks/plugins/agents/foo/.agent.yaml.swp", + dest="/repo/.flocks/plugins/agents/foo/agent.yaml", + ) + assert _agent_event_should_reload(evt) is True + + +def test_agent_watcher_accepts_md_via_dest_path() -> None: + evt = _move_event( + src="/repo/.flocks/plugins/agents/foo/.AGENT.md.tmp", + dest="/repo/.flocks/plugins/agents/foo/AGENT.md", + ) + assert _agent_event_should_reload(evt) is True + + +def test_agent_watcher_rejects_unrelated_paths() -> None: + evt = _modify_event("/repo/.flocks/plugins/agents/foo/README.txt") + assert _agent_event_should_reload(evt) is False + + +# --------------------------------------------------------------------------- +# Skill watcher predicate +# --------------------------------------------------------------------------- + + +def test_skill_watcher_accepts_skill_md_via_dest_path() -> None: + evt = _move_event( + src="/repo/.flocks/plugins/skills/foo/.SKILL.md.swap", + dest="/repo/.flocks/plugins/skills/foo/SKILL.md", + ) + assert _skill_event_should_reload(evt) is True + + +def test_skill_watcher_accepts_direct_modify() -> None: + assert _skill_event_should_reload(_modify_event("/repo/.flocks/plugins/skills/foo/SKILL.md")) is True + + +def test_skill_watcher_rejects_non_skill_files() -> None: + assert _skill_event_should_reload(_modify_event("/repo/.flocks/plugins/skills/foo/notes.md")) is False diff --git a/webui/src/api/workflow.ts b/webui/src/api/workflow.ts index 5a770a704..e73b32507 100644 --- a/webui/src/api/workflow.ts +++ b/webui/src/api/workflow.ts @@ 
-159,6 +159,18 @@ export interface SyslogConfig { updatedAt?: number; } +/** Runtime state of the syslog listener (independent from saved config). */ +export interface SyslogListenerStatus { + state: 'binding' | 'listening' | 'failed' | 'stopped'; + error?: string | null; + host?: string; + port?: number; + protocol?: string; + queueSize?: number; + queueCapacity?: number; + workerCount?: number; +} + export const workflowAPI = { list: (params?: { category?: string; status?: string; excludeId?: string }) => client.get('/api/workflow', { params }), @@ -259,11 +271,17 @@ export const workflowAPI = { format?: string; inputKey?: string; }) => - client.post<{ ok: boolean }>(`/api/workflow/${id}/syslog-config`, config), + client.post<{ ok: boolean; listener?: SyslogListenerStatus }>( + `/api/workflow/${id}/syslog-config`, + config, + ), getSyslogConfig: (id: string) => client.get(`/api/workflow/${id}/syslog-config`), + getSyslogStatus: (id: string) => + client.get(`/api/workflow/${id}/syslog-status`), + runNode: (id: string, data: { nodeId: string; inputs?: Record }) => client.post(`/api/workflow/${id}/run-node`, { node_id: data.nodeId, inputs: data.inputs ?? 
{} }), diff --git a/webui/src/pages/WorkflowDetail/tabs/IntegrationTab.tsx b/webui/src/pages/WorkflowDetail/tabs/IntegrationTab.tsx index 7290db1c0..71e8666a7 100644 --- a/webui/src/pages/WorkflowDetail/tabs/IntegrationTab.tsx +++ b/webui/src/pages/WorkflowDetail/tabs/IntegrationTab.tsx @@ -8,6 +8,7 @@ import { workflowAPI, Workflow, WorkflowService, + SyslogListenerStatus, } from '@/api/workflow'; import CopyButton from '@/components/common/CopyButton'; import WorkflowStatusBadge from '@/components/common/WorkflowStatusBadge'; @@ -316,15 +317,51 @@ function SyslogSection({ workflowId }: { workflowId: string }) { const [port, setPort] = useState('5140'); const [format, setFormat] = useState('auto'); const [inputKey, setInputKey] = useState('syslog_message'); + // Runtime listener state (independent from saved config) — only this should + // drive the "Listening" indicator, otherwise a bind failure leaves the UI + // falsely showing the listener as active. + const [listener, setListener] = useState(null); + const [saveError, setSaveError] = useState(''); - // 摘要行:已启用时在折叠标题旁显示 - const summaryBadge = enabled && !expanded ? ( - - {protocol.toUpperCase()} {host}:{port} · {t('detail.run.syslogActive')} - - ) : ( - {t('detail.run.syslogExperimental')} - ); + const refreshStatus = useCallback(async () => { + try { + const res = await workflowAPI.getSyslogStatus(workflowId); + setListener(res.data); + } catch { + // ignore — older backend / transient failure: UI will show "unknown" + } + }, [workflowId]); + + const isListening = listener?.state === 'listening'; + const isBinding = listener?.state === 'binding'; + const isFailed = listener?.state === 'failed'; + + // 摘要行:仅当后端真正报告 listening 时才显示绿色 active + let summaryBadge: React.ReactNode; + if (isListening) { + summaryBadge = ( + + {(listener?.protocol || protocol).toUpperCase()} {listener?.host || host}:{listener?.port ?? 
port} + {' · '}{t('detail.run.syslogActive')} + + ); + } else if (enabled && isBinding) { + summaryBadge = ( + + {protocol.toUpperCase()} {host}:{port} · binding… + + ); + } else if (enabled && isFailed) { + summaryBadge = ( + + {protocol.toUpperCase()} {host}:{port} · {listener?.error || 'failed'} + + ); + } else { + summaryBadge = ( + {t('detail.run.syslogExperimental')} + ); + } useEffect(() => { workflowAPI.getSyslogConfig(workflowId).then(res => { @@ -337,13 +374,25 @@ function SyslogSection({ workflowId }: { workflowId: string }) { setInputKey(res.data.inputKey || 'syslog_message'); } }).catch(() => {}); - }, [workflowId]); + refreshStatus(); + }, [workflowId, refreshStatus]); + + // While "binding" we poll briefly so the UI converges on the real state + // without forcing the user to refresh. + useEffect(() => { + if (!isBinding) return; + const handle = window.setInterval(() => { + refreshStatus(); + }, 1500); + return () => window.clearInterval(handle); + }, [isBinding, refreshStatus]); const handleSave = async () => { setSaving(true); setSaved(false); + setSaveError(''); try { - await workflowAPI.saveSyslogConfig(workflowId, { + const res = await workflowAPI.saveSyslogConfig(workflowId, { enabled, protocol, host, @@ -351,10 +400,16 @@ function SyslogSection({ workflowId }: { workflowId: string }) { format, inputKey, }); + if (res.data?.listener) { + setListener(res.data.listener); + } else { + refreshStatus(); + } setSaved(true); setTimeout(() => setSaved(false), 2000); - } catch { - // ignore + } catch (err: unknown) { + setSaveError(extractErrorMessage(err, t('detail.run.savingConfig'))); + refreshStatus(); } finally { setSaving(false); } @@ -439,6 +494,25 @@ function SyslogSection({ workflowId }: { workflowId: string }) { ) : null} {saving ? t('detail.run.savingConfig') : saved ? t('detail.run.savedConfig') : t('detail.run.saveConfig')} + {saveError && ( +
+ + {saveError} +
+ )} + {enabled && isFailed && !saveError && ( +
+ + + Listener failed to bind: {listener?.error || 'unknown error'} + +
+ )} + {enabled && isListening && typeof listener?.queueSize === 'number' && ( +

+ queue {listener.queueSize}/{listener.queueCapacity ?? '?'} · workers {listener.workerCount ?? '?'} +

+ )}

{t('detail.run.syslogHint')}

)}