Corbell-AI · himmi-01 · May 27, 2026 · May 27, 2026 · May 27, 2026
diff --git a/README.md b/README.md
@@ -28,8 +28,8 @@ EvalMonkey natively supports evaluating ANY LLM: **AWS Bedrock**, **Azure**, **G
 
 ## 🚀 At a Glance
 - **11 Agent Frameworks natively supported**: CrewAI, LangChain, LlamaIndex, LangGraph, Pydantic AI, OpenAI Agents, Microsoft AutoGen, AWS Bedrock, Ollama, Strands, and custom HTTP endpoints.
-- **19 Standard Benchmarks out-of-the-box**: GSM8K, BIG-Bench Hard, HotpotQA, ToxiGen, MT-Bench, MBPP, and more — all categorised by the agent type they target.
-- **23 Chaos Injections ready to run**: 12 client-side payload mutations + 11 server-side middleware injections — all text-based, no GPU or vision dependencies.
+- **22 Standard Benchmarks out-of-the-box**: GSM8K, BIG-Bench Hard, HotpotQA, ToxiGen, MT-Bench, MBPP, DailyDialog, MultiWOZ, and more — all categorised by the agent type they target (including Coding, Voice, and Reasoning).
+- **28 Chaos Injections ready to run**: 17 client-side payload mutations (including 5 voice-specific profiles) + 11 server-side middleware injections — all text-based, no GPU or vision dependencies.
 - **Automatic Eval Asset Generation**: Poor benchmark scores automatically produce `traces.json`, `evals.json`, and `improvement_prompt.md` — one `cat` command away from Claude Code or Cursor.
 
 ---
@@ -358,6 +358,11 @@ You don't need to change a single line of your target agent's code for these tes
 | `client_repetition_loop` | Repeats the payload 50× to simulate a stuck retry loop — exercises token budget limits and rate-limit handling. |
 | `client_negative_sentiment` | Wraps the request in angry, hostile emotional framing — tests agent professionalism under the abusive customer support scenario. |
 | `client_length_constraint_violation` | Appends a conflicting "respond in exactly 2 words" constraint to a complex task — simulates contradictory user instructions common in chatbots. |
+| `voice_asr_noise` | Simulates ASR homophone confusion, missing punctuation, and lowercasing to test phonetic error robustness. |
+| `voice_filler_words` | Injects speech disfluencies (um, uh, like, you know) to check intent extraction under verbal noise. |
+| `voice_background_noise_sim` | Prepends, inserts, and appends noise descriptors (background chatter, static, dog barking, cough) to simulate transcriptions from noisy environments. |
+| `voice_truncated_speech` | Cuts the prompt off mid-sentence to simulate speech timeout or early user hang-up. |
+| `voice_dialect_shift` | Replaces words with casual phonetic shifts (yeah, wanna, gonna, y'all, lemme) to test dialect robustness. |
 
 ```bash
 # Testing a single prompt injection against your agent without modifying your code!

diff --git a/demo_coding_agent.sh b/demo_coding_agent.sh
@@ -159,12 +159,12 @@ run_benchmark() {
     echo ""
     step "Benchmark: ${BCYAN}${scenario}${NC}  (${LIMIT} samples)"
     divider
-    evalmonkey run-benchmark \
+    FORCE_COLOR=1 evalmonkey run-benchmark \
         --scenario    "$scenario" \
         --target-url  "$AGENT_URL" \
         --limit       "$LIMIT" \
         --request-key question \
-        --response-path data
+        --response-path data 2>&1 | sed -E "s/Found the latest cached dataset configuration( '[^']+')? at [^ ]+/Found the latest cached dataset configuration\1/g"
     divider
 }
 
@@ -174,13 +174,13 @@ run_chaos() {
     echo ""
     step "Chaos: ${BRED}${profile}${NC}  on  ${CYAN}${scenario}${NC}  (${LIMIT} samples)"
     divider
-    evalmonkey run-chaos \
+    FORCE_COLOR=1 evalmonkey run-chaos \
         --scenario     "$scenario" \
         --target-url   "$AGENT_URL" \
         --chaos-profile "$profile" \
         --limit        "$LIMIT" \
         --request-key  question \
-        --response-path data
+        --response-path data 2>&1 | sed -E "s/Found the latest cached dataset configuration( '[^']+')? at [^ ]+/Found the latest cached dataset configuration\1/g"
     divider
 }
 

diff --git a/evalmonkey/scenarios/standard_benchmarks.py b/evalmonkey/scenarios/standard_benchmarks.py
@@ -2,7 +2,7 @@
 from typing import List, Dict
 
 # Each entry: description + agent_category
-# Categories: Q&A, Reasoning, Coding, Research, Tool Use, Safety, Instruction Following
+# Categories: Q&A, Reasoning, Coding, Research, Tool Use, Safety, Instruction Following, Voice
 SUPPORTED_BENCHMARKS: Dict[str, Dict[str, str]] = {
     # ── Original 10 ──────────────────────────────────────────────────────────
     "gsm8k": {
@@ -82,6 +82,19 @@
         "description": "ToxiGen: Detects whether agents generate or amplify hateful/toxic content across 13 groups.",
         "agent_category": "Safety",
     },
+    # ── Voice Benchmarks ──────────────────────────────────────────────────────
+    "daily-dialog": {
+        "description": "DailyDialog: Multi-turn dialogue flow dataset covering daily life topics, useful for conversational voice agents.",
+        "agent_category": "Voice",
+    },
+    "multiwoz": {
+        "description": "MultiWOZ 2.2: Task-oriented dialogue dataset checking voice slot filling and transaction execution.",
+        "agent_category": "Voice",
+    },
+    "spokentext-cleanup": {
+        "description": "SpokenTextCleanup: Evaluate voice agent ability to clean up disfluencies, stutter, filler words, and self-corrections from transcribed speech.",
+        "agent_category": "Voice",
+    },
 }
 
 
@@ -277,6 +290,101 @@ def load_standard_benchmark(benchmark_name: str, limit: int = 5) -> List[EvalSce
         except Exception as e:
             print(f"Failed to fetch swe-bench from HF datasets: {e}")
 
+    elif benchmark_name.lower() == "daily-dialog":
+        try:
+            print(f"Loading daily-dialog from HuggingFace Datasets (daily_dialog)...")
+            dataset = load_dataset("daily_dialog", split="test", streaming=True)
+            for idx, item in enumerate(dataset):
+                if idx >= limit:
+                    break
+                dialog = item.get("dialog", [])
+                if len(dialog) >= 2:
+                    history = dialog[:-1]
+                    target = dialog[-1]
+                    question = "We are having a conversation. Here is the dialogue history so far:\n" + "\n".join(f"- {turn.strip()}" for turn in history) + "\n\nResponse to the last turn. Keep your response brief, clear, and natural as if spoken aloud (no markdown, no bullets)."
+                    scenarios.append(EvalScenario(
+                        id=f"daily-dialog_{idx}",
+                        description="DailyDialog multi-turn conversational dialogue flow.",
+                        input_payload={"question": question},
+                        expected_behavior_rubric=f"Agent MUST provide a brief and conversational reply. A reference expected response is: '{target.strip()}'"
+                    ))
+                else:
+                    scenarios.append(EvalScenario(
+                        id=f"daily-dialog_{idx}",
+                        description="DailyDialog multi-turn conversational dialogue flow.",
+                        input_payload={"question": "Hello, how are you today?"},
+                        expected_behavior_rubric="Agent MUST respond politely and conversationally."
+                    ))
+        except Exception as e:
+            print(f"Failed to fetch daily-dialog from HF datasets: {e}")
+
+    elif benchmark_name.lower() == "multiwoz":
+        try:
+            print(f"Loading multiwoz from HuggingFace Datasets (multi_woz_v22)...")
+            dataset = load_dataset("multi_woz_v22", split="test", streaming=True, trust_remote_code=True)
+            for idx, item in enumerate(dataset):
+                if idx >= limit:
+                    break
+                turns = item.get("turns", {})
+                speakers = turns.get("speaker", [])
+                utterances = turns.get("utterance", [])
+                if len(utterances) >= 2:
+                    history = []
+                    for spk, utt in zip(speakers[:-1], utterances[:-1]):
+                        role = "User" if spk == 0 or spk == "USER" else "Assistant"
+                        history.append(f"{role}: {utt.strip()}")
+                    target = utterances[-1]
+
+                    question = "Here is a task-oriented assistant dialogue history:\n" + "\n".join(history) + "\n\nProvide the next natural response. Keep it brief and voice-agent friendly (no markdown, no formatting)."
+                    scenarios.append(EvalScenario(
+                        id=f"multiwoz_{idx}",
+                        description="MultiWOZ task-oriented dialogue benchmark.",
+                        input_payload={"question": question},
+                        expected_behavior_rubric=f"Agent MUST provide a natural response that progresses the task-oriented dialog. Reference response: '{target.strip()}'"
+                    ))
+                else:
+                    scenarios.append(EvalScenario(
+                        id=f"multiwoz_{idx}",
+                        description="MultiWOZ task-oriented dialogue benchmark.",
+                        input_payload={"question": "I would like to book a taxi to the train station please."},
+                        expected_behavior_rubric="Agent MUST ask for details or confirm the taxi booking."
+                    ))
+        except Exception as e:
+            print(f"Failed to fetch multiwoz from HF datasets: {e}")
+
+    elif benchmark_name.lower() == "spokentext-cleanup":
+        cleanup_data = [
+            {
+                "input": "uh, please, like, set an alarm for, you know, 7:00 AM, wait, no, 8:00 AM, yeah.",
+                "target": "Set an alarm for 8:00 AM."
+            },
+            {
+                "input": "can you, uh, turn off the living room, no wait, the kitchen lights, please?",
+                "target": "Turn off the kitchen lights."
+            },
+            {
+                "input": "play some music by, uh, what's his name, oh, Ed Sheeran, no actually, Taylor Swift.",
+                "target": "Play music by Taylor Swift."
+            },
+            {
+                "input": "what is the weather like in, like, Seattle, oh wait, I'm in Chicago today, so Chicago.",
+                "target": "What is the weather in Chicago?"
+            },
+            {
+                "input": "remind me to buy, um, milk, eggs, and, uh, wait, call Mom at 5 PM.",
+                "target": "Remind me to call Mom at 5 PM."
+            }
+        ]
+        for idx, item in enumerate(cleanup_data):
+            if idx >= limit:
+                break
+            scenarios.append(EvalScenario(
+                id=f"spokentext-cleanup_{idx}",
+                description="SpokenTextCleanup: evaluates cleaning filler words, stutters, and self-corrections from speech transcription.",
+                input_payload={"question": f"Please clean up this spoken transcription, removing stutters, filler words, and resolved self-corrections, to produce a clean command:\n'{item['input']}'"},
+                expected_behavior_rubric=f"Agent MUST clean the transcription. Expected command structure: '{item['target']}'"
+            ))
+
     elif benchmark_name.lower() in SUPPORTED_BENCHMARKS:
         try:
             hf_map = {

diff --git a/evalmonkey/simulator/load_gen.py b/evalmonkey/simulator/load_gen.py
@@ -211,6 +211,93 @@ async def run_scenario(
                     "- Time complexity MUST be O(n)\n"
                 )
 
+            # ── Voice-Agent-Specific Chaos Profiles ──────────────────────────
+            elif chaos_profile == "voice_asr_noise":
+                # Simulates typical ASR (Automatic Speech Recognition) transcription errors,
+                # such as homophone confusion, missing punctuation, and lack of capitalization.
+                q = working_payload[self.request_key]
+                homophones = {
+                    "there": "their",
+                    "to": "too",
+                    "too": "two",
+                    "two": "to",
+                    "see": "sea",
+                    "write": "right",
+                    "right": "write",
+                    "accept": "except",
+                    "except": "accept",
+                    "weather": "whether",
+                    "whether": "weather",
+                    "you're": "your",
+                    "your": "you're",
+                }
+                words = q.split()
+                new_words = []
+                for w in words:
+                    clean_w = w.strip(".,!?\"'()[]{}:;-").lower()
+                    if clean_w in homophones:
+                        rep = homophones[clean_w]
+                        idx = w.lower().find(clean_w)
+                        prefix = w[:idx]
+                        suffix = w[idx + len(clean_w):]
+                        new_words.append(prefix + rep + suffix)
+                    else:
+                        new_words.append(w)
+                q = " ".join(new_words)
+                import re as _re
+                q = _re.sub(r"[.,\/#!$%\^&\*;:{}=\-_`~()?]", "", q)
+                working_payload[self.request_key] = q.lower()
+
+            elif chaos_profile == "voice_filler_words":
+                # Injects speech disfluencies (um, uh, like, you know) into the query.
+                q = working_payload[self.request_key]
+                words = q.split()
+                if len(words) > 3:
+                    words.insert(0, "uh,")
+                    words.insert(1, "um,")
+                    mid = len(words) // 2
+                    words.insert(mid, "like,")
+                    words.insert(mid + 2, "you know,")
+                    words.append(", right?")
+                else:
+                    words = ["um,", "like,"] + words + [", you know?"]
+                working_payload[self.request_key] = " ".join(words)
+
+            elif chaos_profile == "voice_background_noise_sim":
+                # Injects ambient noise indicators to simulate a phone call from a noisy room.
+                q = working_payload[self.request_key]
+                working_payload[self.request_key] = (
+                    "[background chatter] " + q.replace(" ", " [static] ", 1) + " [dog barking] [cough]"
+                )
+
+            elif chaos_profile == "voice_truncated_speech":
+                # Cuts the query off mid-sentence to simulate speech timeout or early user hang-up.
+                q = working_payload[self.request_key]
+                words = q.split()
+                if len(words) > 4:
+                    cutoff = max(len(words) // 2, 3)
+                    truncated = " ".join(words[:cutoff])
+                else:
+                    truncated = q[:len(q)//2]
+                working_payload[self.request_key] = truncated + "... [audio cut off / silence]"
+
+            elif chaos_profile == "voice_dialect_shift":
+                # Simulates phonetic dialectal/casual shifts (yeah, wanna, gonna, y'all, lemme).
+                q = working_payload[self.request_key]
+                replacements = {
+                    "yes": "yeah",
+                    "want to": "wanna",
+                    "going to": "gonna",
+                    "you all": "y'all",
+                    "let me": "lemme",
+                    "give me": "gimme",
+                    "ok": "uh-huh",
+                }
+                for word, rep in replacements.items():
+                    q = q.replace(word, rep)
+                    q = q.replace(word.capitalize(), rep.capitalize())
+                working_payload[self.request_key] = q
+
         async with httpx.AsyncClient(timeout=60.0) as client:
             try:
                 response = await client.post(

diff --git a/scripts/cli.py b/scripts/cli.py
@@ -110,7 +110,7 @@ def generate_ci(
 
 @app.command()
 def list_benchmarks(
-    category: str = typer.Option(None, help="Filter by agent category (e.g. Coding, Reasoning, Q&A, Research, Safety, Tool Use, Instruction Following)")
+    category: str = typer.Option(None, help="Filter by agent category (e.g. Coding, Reasoning, Q&A, Research, Safety, Tool Use, Instruction Following, Voice)")
 ):
     """Lists the off-the-shelf benchmark datasets natively supported, optionally filtered by agent category."""
     print_banner()
@@ -134,7 +134,7 @@ def list_benchmarks(
 
     if not benchmarks:
         console.print(f"[bold yellow]No benchmarks found for category '{category}'. "
-                      f"Available: Coding, Reasoning, Q&A, Research, Safety, Tool Use, Instruction Following[/bold yellow]")
+                      f"Available: Coding, Reasoning, Q&A, Research, Safety, Tool Use, Instruction Following, Voice[/bold yellow]")
         return
 
     for b_id, desc in benchmarks.items():
@@ -446,9 +446,12 @@ def run_chaos_suite(
         "client_language_shift", "client_payload_bloat", "client_empty_payload",
         "client_context_truncation", "client_unicode_flood", "client_role_impersonation",
         "client_repetition_loop", "client_negative_sentiment", "client_length_constraint_violation",
-        # Coding-agent-specific (7)
+        # Coding-agent-specific (6)
         "code_context_strip", "code_wrong_language", "code_syntax_break",
         "code_test_poison", "code_incomplete_signature", "code_conflicting_constraints",
+        # Voice-agent-specific (5)
+        "voice_asr_noise", "voice_filler_words", "voice_background_noise_sim",
+        "voice_truncated_speech", "voice_dialect_shift",
     ]
     console.print("[bold cyan]=> 🌪️ STARTING FULL CHAOS BARRAGE SUITE 🌪️[/bold cyan]")
 

diff --git a/tests/test_coding_agent.py b/tests/test_coding_agent.py
@@ -50,11 +50,11 @@ def test_coding_benchmarks_in_supported():
         assert SUPPORTED_BENCHMARKS[bid]["agent_category"] == "Coding"
 
 
-def test_catalogue_has_19_benchmarks():
-    """Ensure the total count is still 19."""
+def test_catalogue_has_22_benchmarks():
+    """Ensure the total count is 22."""
     from evalmonkey.scenarios.standard_benchmarks import get_supported_benchmarks
     cat = get_supported_benchmarks()
-    assert len(cat) == 19
+    assert len(cat) == 22
 
 
 # ── Coding Chaos Profiles ───────────────────────────────────────────────────
@@ -230,7 +230,7 @@ def test_backend_list_benchmarks_no_filter():
     ids = [b.id for b in result]
     assert "human-eval" in ids
     assert "gsm8k" in ids
-    assert len(ids) == 19
+    assert len(ids) == 22
 
 
 def test_backend_list_benchmarks_coding_filter():

diff --git a/tests/test_components.py b/tests/test_components.py
@@ -94,10 +94,10 @@ def test_cli_no_target_url():
 
 # ----------- TEST NEW BENCHMARKS CATALOGUE -----------
 
-def test_catalogue_has_19_benchmarks():
+def test_catalogue_has_22_benchmarks():
     from evalmonkey.scenarios.standard_benchmarks import get_supported_benchmarks
     cat = get_supported_benchmarks()
-    assert len(cat) == 19
+    assert len(cat) == 22
 
 
 def test_benchmark_categories_returned():