AgentLab/src/agentlab/analyze/agent_xray.py at 74a00951b40232586cce2b21a3f565de25b958b0 · ServiceNow/AgentLab · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import base64
import os
import traceback
from copy import deepcopy
from io import BytesIO
from logging import warning
from pathlib import Path

import gradio as gr
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from attr import dataclass
from langchain.schema import BaseMessage, HumanMessage
from openai import OpenAI
from openai.types.responses import ResponseFunctionToolCall
from PIL import Image

from agentlab.analyze import inspect_results
from agentlab.analyze.overlay_utils import annotate_action
from agentlab.experiments.exp_utils import RESULTS_DIR
from agentlab.experiments.loop import ExpResult, StepInfo
from agentlab.experiments.study import get_most_recent_study
from agentlab.llm.chat_api import make_system_message, make_user_message
from agentlab.llm.llm_utils import BaseMessage as AgentLabBaseMessage
from agentlab.llm.llm_utils import Discussion
from agentlab.llm.response_api import MessageBuilder
from agentlab.llm.response_api import ToolCalls

# Initial value shown in the experiment-directory dropdown.
select_dir_instructions = "Select Experiment Directory"
# Dotted column names of the results dataframe.
# NOTE(review): presumably the agent name, task name and task seed of each run;
# only the task name / seed keys are used in the part of the file reviewed here.
AGENT_NAME_KEY = "agent.agent_name"
TASK_NAME_KEY = "env.task_name"
TASK_SEED_KEY = "env.task_seed"


def display_table(df: pd.DataFrame):
    """Return a copy of ``df`` whose column and index names are prepared for display.

    The input dataframe is left untouched; the names of the copy are passed
    through ``clean_column_names``.
    """
    table = df.copy()
    wrapped_columns = clean_column_names(table.columns)
    table.columns = wrapped_columns
    wrapped_index_names = clean_column_names(table.index.names)
    table.index.names = wrapped_index_names
    return table


def remove_args_from_col(df: pd.DataFrame):
    """Strip the ``"_args"`` marker from the column and index names of ``df``.

    The dataframe is modified in place and also returned, so the call can be
    chained.

    Labels that are not strings (e.g. the ``None`` name of an unnamed index, or
    integer column labels) are kept as they are instead of raising.

    Args:
        df: the dataframe whose labels are renamed.

    Returns:
        The same dataframe object, with renamed columns and index names.
    """

    def _strip(label):
        # Only strings can carry the marker; anything else is passed through.
        return label.replace("_args", "") if isinstance(label, str) else label

    df.columns = [_strip(col) for col in df.columns]
    df.index.names = [_strip(name) for name in df.index.names]
    return df


def clean_column_names(col_list):
    """Return display-friendly versions of the given column or index names.

    A newline is inserted after every ``"."`` so that long dotted names can
    wrap inside table headers. Entries that are not strings (e.g. the ``None``
    name of an unnamed index) are returned unchanged instead of raising.

    Args:
        col_list: iterable of names.

    Returns:
        A new list with one entry per input name.
    """
    return [col.replace(".", ".\n") if isinstance(col, str) else col for col in col_list]


class ClickMapper:
    """Map a horizontal pixel coordinate on an axes to a time value or a step index."""

    def __init__(self, ax: plt.Axes, step_times: list[float]):
        self.ax, self.step_times = ax, step_times

    def to_time(self, x_pix_coord):
        """Convert ``x_pix_coord`` through the inverse of the axes' data transform.

        Only the x component of the transformed point is returned.
        """
        pixel_point = (x_pix_coord, 0)
        time_coord, _unused_y = self.ax.transData.inverted().transform(pixel_point)
        return time_coord

    def to_step(self, x_pix_coord):
        """Return the ``np.searchsorted`` position of the clicked time in ``step_times``.

        NOTE(review): this assumes ``step_times`` is sorted in increasing order.
        """
        return np.searchsorted(self.step_times, self.to_time(x_pix_coord))


@dataclass
class EpisodeId:
    """Identifier of one episode: an (agent, task, seed) triplet."""

    agent_id: str = None  # identifier of the selected agent
    task_name: str = None  # name of the selected task
    seed: int = None  # seed of the selected run of that task


@dataclass
class StepId:
    """Identifier of one step of an episode."""

    episode_id: EpisodeId = None  # the episode the step belongs to
    step: int = None  # index of the step inside the episode


@dataclass
class Info:
    """Mutable state of the viewer, shared by the Gradio callbacks through a module global."""

    results_dir: Path = None  # to root directory of all experiments
    study_dirs: Path = None  # the path of the currently selected experiment
    result_df: pd.DataFrame = None  # the raw loaded df
    agent_df: pd.DataFrame = None  # the df filtered for selected agent
    tasks_df: pd.DataFrame = None  # the unique tasks for selected agent
    exp_result: ExpResult = None  # the selected episode
    click_mapper: ClickMapper = None  # mapping from profiler click to step
    step: int = None  # currently selected step
    active_tab: str = "Screenshot"  # currently selected observation tab
    agent_id_keys: list[str] = None  # the list of columns identifying an agent

    def update_exp_result(self, episode_id: EpisodeId):
        """Load the episode designated by ``episode_id`` into ``self.exp_result``.

        When the selection is incomplete (no dataframe loaded, or no task name
        or seed in ``episode_id``) the current episode is cleared and nothing
        else happens. Otherwise the matching row of ``agent_df`` is looked up,
        its ``exp_dir`` is opened as an ``ExpResult`` and the current step is
        reset to 0.

        Args:
            episode_id: the episode to load.

        Raises:
            ValueError: if no row matches the requested task name and seed.
        """
        if (
            self.result_df is None
            or self.agent_df is None
            or episode_id.task_name is None
            or episode_id.seed is None
        ):
            # Nothing can be looked up yet: clear the selection and stop here
            # instead of falling through to the dataframe query below.
            self.exp_result = None
            return

        # find unique row for task_name and seed
        result_df = self.agent_df.reset_index(inplace=False)
        sub_df = result_df[
            (result_df[TASK_NAME_KEY] == episode_id.task_name)
            & (result_df[TASK_SEED_KEY] == episode_id.seed)
        ]
        if len(sub_df) == 0:
            self.exp_result = None
            raise ValueError(
                f"Could not find task_name: {episode_id.task_name} and seed: {episode_id.seed}"
            )

        if len(sub_df) > 1:
            warning(
                f"Found multiple rows for task_name: {episode_id.task_name} and seed: {episode_id.seed}. Using the first one."
            )

        exp_dir = sub_df.iloc[0]["exp_dir"]
        print(exp_dir)
        self.exp_result = ExpResult(exp_dir)
        self.step = 0

    def get_agent_id(self, row: pd.Series):
        """Return the ``(key, value)`` pairs of ``row`` for every key in ``agent_id_keys``."""
        return [(key, row[key]) for key in self.agent_id_keys]

    def filter_agent_id(self, agent_id: list[tuple]):
        """Store in ``self.agent_df`` the rows of ``result_df`` that match ``agent_id``.

        Args:
            agent_id: ``(column, value)`` pairs; a row is kept only when it
                equals ``value`` in every listed column.
        """
        agent_df = self.result_df.reset_index(inplace=False)
        agent_df.set_index(TASK_NAME_KEY, inplace=True)

        for col, val in agent_id:
            # Column names may come from a displayed table, where a newline was
            # inserted after each dot; undo that to get the real column name.
            col = col.replace(".\n", ".")
            agent_df = agent_df[agent_df[col] == val]
        self.agent_df = agent_df


# Module-level viewer state; the callbacks in this file read and update it
# through ``global info``.
info = Info()


# Custom CSS passed to ``gr.Blocks`` in ``run_gradio``: caps the height of the
# markdown / error-report / code views and lets code and table headers wrap.
css = """
.my-markdown {
    max-height: 400px;
    overflow-y: auto;
}
.error-report {
    max-height: 700px;
    overflow-y: auto;
}
.my-code-view {
    max-height: 300px;
    overflow-y: auto;
}
code {
    white-space: pre-wrap;
}
th {
    white-space: normal !important;
    word-wrap: break-word !important;
}
"""


def run_gradio(results_dir: Path):
    """
    Build the Agent X-Ray Gradio interface and launch it.

    The function stores ``results_dir`` in the module-level ``info`` state,
    declares all widgets, wires their events to the callbacks of this module
    and finally starts the server.

    Two environment variables are read at launch time:
    ``AGENTXRAY_SHARE_GRADIO`` ("true" enables Gradio sharing) and
    ``AGENTXRAY_APP_PORT`` (server port, converted to ``int`` when set).

    Args:
        results_dir: directory whose content is offered in the
            experiment-directory dropdown.
    """
    global info
    info.results_dir = results_dir

    with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
        agent_id = gr.State(value=None)
        episode_id = gr.State(value=EpisodeId())
        agent_task_id = gr.State(value=None)
        step_id = gr.State(value=None)

        # Invisible textbox used as a channel for keyboard shortcuts: the
        # javascript registered at the end of this function writes into it.
        hidden_key_input = gr.Textbox(visible=False, elem_id="key_capture")

        with gr.Accordion("Help", open=False):
            gr.Markdown(
                """\
# Agent X-Ray

1. **Select your experiment directory**. You may refresh the list of directories by
clicking the refresh button.

2. **Select your episode**: Chose a triplet (agent, task, seed).

    1. **Select Agent**: Click on a row of the table to select your agent

    2. **Select Task**: Select the task you want to analyze, this will trigger
       an update of the available seeds.

    3. **Select the Seed**: You might have multiple repetition for a given task,
       you will be able to select the seed you want to analyze.

3. **Select the step**: Once your episode is selected, you can select the step
   by clicking on the profiling image. This will trigger the update of the the
   information on the corresponding step.

4. **Select a tab**: You can select different visualization by clicking on the tabs.
"""
            )
        with gr.Row():
            exp_dir_choice = gr.Dropdown(
                choices=get_directory_contents(results_dir),
                value=select_dir_instructions,
                multiselect=True,
                label="Experiment Directory",
                show_label=False,
                scale=6,
                container=False,
            )
            refresh_button = gr.Button("↺", scale=0, size="sm")

        with gr.Tabs():
            with gr.Tab("Select Agent"):
                with gr.Accordion("Agent Selector (click for help)", open=False):
                    gr.Markdown(
                        """\
    Click on a row to select an agent. It will trigger the update of other
    fields.

    The update mechanism is somewhat flacky, please help figure out why (or is it just gradio?).
    """
                    )
                agent_table = gr.DataFrame(max_height=500, show_label=False, interactive=False)
            with gr.Tab("Select Task and Seed", id="Select Task"):
                with gr.Row():
                    with gr.Column(scale=4):
                        with gr.Row():  # combining the title (help) and the refresh button
                            with gr.Accordion("Task Selector (click for help)", open=False):
                                gr.Markdown(
                                    """\
        Click on a row to select a task. It will trigger the update of other fields.

        The update mechanism is somewhat flacky, please help figure out why (or is it just gradio?).
        """
                                )
                            refresh_results_button = gr.Button("↺", scale=0, size="sm")

                        task_table = gr.DataFrame(
                            max_height=500,
                            show_label=False,
                            interactive=False,
                            elem_id="task_table",
                        )

                    with gr.Column(scale=2):
                        with gr.Accordion("Seed Selector (click for help)", open=False):
                            gr.Markdown(
                                """\
    Click on a row to select a seed. It will trigger the update of other fields.

    The update mechanism is somewhat flacky, please help figure out why (or is it just gradio?).
    """
                            )

                        seed_table = gr.DataFrame(
                            max_height=500,
                            show_label=False,
                            interactive=False,
                            elem_id="seed_table",
                        )

            with gr.Tab("Constants and Variables"):
                with gr.Row():
                    with gr.Column(scale=2):
                        with gr.Accordion("Constants", open=False):
                            gr.Markdown(
                                """\
    Constants are the parameters that are the same for **all** episodes of
    **all** agents. They are displayed as a table with the name and value of the
    constant."""
                            )
                        constants = gr.DataFrame(
                            max_height=500, show_label=False, interactive=False
                        )
                    with gr.Column(scale=2):
                        with gr.Accordion("Variables", open=False):
                            gr.Markdown(
                                """\
    Variables are the parameters that can change between episodes of an agent.
    They are displayed as a table with the name, value and count of unique
    values. A maximum of 3 different values are displayed."""
                            )
                        variables = gr.DataFrame(
                            max_height=500, show_label=False, interactive=False
                        )
            with gr.Tab("Global Stats"):
                global_stats = gr.DataFrame(max_height=500, show_label=False, interactive=False)

            with gr.Tab("Error Report"):
                error_report = gr.Markdown(elem_classes="error-report", show_copy_button=True)
        with gr.Row():
            episode_info = gr.Markdown(label="Episode Info", elem_classes="my-markdown")
            action_info = gr.Markdown(label="Action Info", elem_classes="my-markdown")
            state_error = gr.Markdown(label="Next Step Error", elem_classes="my-markdown")

        profiling_gr = gr.Image(
            label="Profiling", show_label=False, interactive=False, show_download_button=False
        )

        gr.HTML(
            """
<style>
    .code-container {
        height: 700px;  /* Set the desired height */
        overflow: auto;  /* Enable scrolling */
    }
</style>
"""
        )
        with gr.Tabs() as tabs:
            code_args = dict(interactive=False, elem_classes=["code-container"], show_label=False)
            with gr.Tab("Screenshot") as tab_screenshot:
                som_or_not = gr.Dropdown(
                    choices=["Raw Screenshots", "SOM Screenshots"],
                    label="Screenshot Type",
                    value="Raw Screenshots",
                    show_label=False,
                    container=False,
                    interactive=True,
                    scale=0,
                )
                screenshot = gr.Image(
                    show_label=False, interactive=False, show_download_button=False
                )

            with gr.Tab("Screenshot Pair") as tab_screenshot_pair:
                with gr.Row():
                    screenshot1 = gr.Image(
                        show_label=False, interactive=False, show_download_button=False
                    )
                    screenshot2 = gr.Image(
                        show_label=False, interactive=False, show_download_button=False
                    )
            with gr.Tab("Screenshot Gallery") as tab_screenshot_gallery:
                screenshot_gallery = gr.Gallery(
                    columns=2,
                    show_download_button=False,
                    show_label=False,
                    object_fit="contain",
                    preview=True,
                )

            with gr.Tab("DOM HTML") as tab_html:
                html_code = gr.Code(language="html", **code_args)

            with gr.Tab("Pruned DOM HTML") as tab_pruned_html:
                pruned_html_code = gr.Code(language="html", **code_args)

            with gr.Tab("AXTree") as tab_axtree:
                axtree_code = gr.Markdown()

            with gr.Tab("Chat Messages") as tab_chat:
                chat_messages = gr.Markdown()

            with gr.Tab("Task Error") as tab_error:
                task_error = gr.Markdown()

            with gr.Tab("Logs") as tab_logs:
                logs = gr.Code(language=None, **code_args)

            with gr.Tab("Stats") as tab_stats:
                stats = gr.DataFrame(max_height=500, show_label=False, interactive=False)

            with gr.Tab("Agent Info HTML") as tab_agent_info_html:
                with gr.Row():
                    screenshot1_agent = gr.Image(
                        show_label=False, interactive=False, show_download_button=False
                    )
                    screenshot2_agent = gr.Image(
                        show_label=False, interactive=False, show_download_button=False
                    )
                agent_info_html = gr.HTML()

            with gr.Tab("Agent Info MD") as tab_agent_info_md:
                agent_info_md = gr.Markdown()

            with gr.Tab("Prompt tests") as tab_prompt_tests:
                with gr.Row():
                    prompt_markdown = gr.Textbox(
                        value="",
                        label="",
                        show_label=False,
                        interactive=False,
                        elem_id="prompt_markdown",
                    )
                    with gr.Column():
                        prompt_tests_textbox = gr.Textbox(
                            value="",
                            label="",
                            show_label=False,
                            interactive=True,
                            elem_id="prompt_tests_textbox",
                        )
                        submit_button = gr.Button(value="Submit")
                    result_box = gr.Textbox(
                        value="", label="Result", show_label=True, interactive=False
                    )

                # Define the interaction
                submit_button.click(
                    fn=submit_action, inputs=prompt_tests_textbox, outputs=result_box
                )

        # Handle Events #
        # ===============#

        refresh_button.click(
            fn=refresh_exp_dir_choices, inputs=exp_dir_choice, outputs=exp_dir_choice
        )

        refresh_results_button.click(
            fn=refresh_exp_dir_choices, inputs=exp_dir_choice, outputs=exp_dir_choice
        )

        exp_dir_choice.change(
            fn=new_exp_dir,
            inputs=exp_dir_choice,
            outputs=[agent_table, agent_id, constants, variables, global_stats, error_report],
        )

        agent_table.select(fn=on_select_agent, inputs=agent_table, outputs=[agent_id])
        task_table.select(fn=on_select_task, inputs=[task_table, agent_id], outputs=agent_task_id)

        # Selection cascade: agent -> task -> seed -> episode -> step.
        agent_id.change(fn=new_agent_id, inputs=agent_id, outputs=[task_table, agent_task_id])
        agent_task_id.change(
            fn=update_seeds, inputs=agent_task_id, outputs=[seed_table, episode_id]
        )
        # seed_gr.change(fn=on_select_seed, inputs=[seed_gr, task_name], outputs=[episode_id])
        seed_table.select(on_select_seed, inputs=[seed_table, agent_task_id], outputs=episode_id)
        step_id.change(fn=update_step_info, outputs=[episode_info, action_info, state_error])
        episode_id.change(fn=new_episode, inputs=[episode_id], outputs=[profiling_gr, step_id])
        profiling_gr.select(select_step, inputs=[episode_id], outputs=step_id)

        # Update all tabs on step change, but only actually update the active
        # tab. This helps keeping the UI responsive when selecting a new step.
        step_id.change(
            fn=if_active("Screenshot")(update_screenshot),
            inputs=som_or_not,
            outputs=screenshot,
        )
        step_id.change(
            fn=if_active("Screenshot Pair", 2)(update_screenshot_pair),
            inputs=som_or_not,
            outputs=[screenshot1, screenshot2],
        )
        step_id.change(
            fn=if_active("Screenshot Gallery")(update_screenshot_gallery),
            inputs=som_or_not,
            outputs=[screenshot_gallery],
        )
        screenshot_gallery.select(fn=gallery_step_change, inputs=episode_id, outputs=step_id)
        step_id.change(fn=if_active("DOM HTML")(update_html), outputs=html_code)
        step_id.change(
            fn=if_active("Pruned DOM HTML")(update_pruned_html), outputs=pruned_html_code
        )
        step_id.change(fn=if_active("AXTree")(update_axtree), outputs=axtree_code)
        step_id.change(fn=if_active("Chat Messages")(update_chat_messages), outputs=chat_messages)
        step_id.change(fn=if_active("Task Error")(update_task_error), outputs=task_error)
        step_id.change(fn=if_active("Logs")(update_logs), outputs=logs)
        step_id.change(fn=if_active("Stats")(update_stats), outputs=stats)
        step_id.change(
            fn=if_active("Agent Info HTML", 3)(update_agent_info_html),
            outputs=[agent_info_html, screenshot1_agent, screenshot2_agent],
        )
        step_id.change(fn=if_active("Agent Info MD")(update_agent_info_md), outputs=agent_info_md)
        step_id.change(
            fn=if_active("Prompt tests", 2)(update_prompt_tests),
            outputs=[prompt_markdown, prompt_tests_textbox],
        )

        # In order to handle tabs that were not visible when step was changed,
        # we need to update them individually when the tab is selected
        tab_screenshot.select(fn=update_screenshot, inputs=som_or_not, outputs=screenshot)
        tab_screenshot_pair.select(
            fn=update_screenshot_pair, inputs=som_or_not, outputs=[screenshot1, screenshot2]
        )
        tab_screenshot_gallery.select(
            fn=update_screenshot_gallery, inputs=som_or_not, outputs=[screenshot_gallery]
        )
        tab_html.select(fn=update_html, outputs=html_code)
        tab_pruned_html.select(fn=update_pruned_html, outputs=pruned_html_code)
        tab_axtree.select(fn=update_axtree, outputs=axtree_code)
        tab_chat.select(fn=update_chat_messages, outputs=chat_messages)
        tab_error.select(fn=update_task_error, outputs=task_error)
        tab_logs.select(fn=update_logs, outputs=logs)
        tab_stats.select(fn=update_stats, outputs=stats)
        # update_agent_info_html returns three values (page and two
        # screenshots), so it needs the same three outputs as the
        # corresponding step_id.change handler above.
        tab_agent_info_html.select(
            fn=update_agent_info_html,
            outputs=[agent_info_html, screenshot1_agent, screenshot2_agent],
        )
        tab_agent_info_md.select(fn=update_agent_info_md, outputs=agent_info_md)
        tab_prompt_tests.select(
            fn=update_prompt_tests, outputs=[prompt_markdown, prompt_tests_textbox]
        )

        som_or_not.change(fn=update_screenshot, inputs=som_or_not, outputs=screenshot)

        # keep track of active tab
        tabs.select(tab_select)

        demo.load(fn=refresh_exp_dir_choices, inputs=exp_dir_choice, outputs=exp_dir_choice)

        # Client-side listener: Ctrl/Cmd + Left/Right arrow writes the shortcut
        # name into the hidden textbox, which triggers handle_key_event below.
        demo.load(
            None,
            None,
            None,
            js="""
    function() {
        document.addEventListener('keydown', function(e) {
            if ((e.key === 'ArrowLeft' || e.key === 'ArrowRight') && (e.metaKey || e.ctrlKey)) {
                e.preventDefault();
                const hiddenInput = document.querySelector('#key_capture input, #key_capture textarea');
                if (hiddenInput) {
                    let event = e.key === 'ArrowLeft' ? 'Cmd+Left' : 'Cmd+Right';
                    hiddenInput.value = event;
                    hiddenInput.dispatchEvent(new Event('input', {bubbles: true}));
                }
            }
        });
    }
        """,
        )
        hidden_key_input.change(
            handle_key_event,
            inputs=[hidden_key_input, step_id],
            outputs=[hidden_key_input, step_id],
        )

    demo.queue()

    do_share = os.getenv("AGENTXRAY_SHARE_GRADIO", "false").lower() == "true"
    port = os.getenv("AGENTXRAY_APP_PORT", None)
    if isinstance(port, str):
        port = int(port)
    demo.launch(server_port=port, share=do_share)


def handle_key_event(key_event, step_id: StepId):
    """Move the selected step backward or forward in response to a keyboard shortcut.

    The handler is wired to two outputs (the hidden textbox and the step
    state), so every return path yields exactly two values.

    Args:
        key_event: content of the hidden textbox; ``"Cmd+Left"`` or
            ``"Cmd+Right"`` when a shortcut was pressed, empty when the textbox
            was cleared.
        step_id: the currently selected step, or ``None`` when no step has
            been selected yet.

    Returns:
        ``(textbox_value, step_id)``. The textbox is reset to ``""`` whenever
        the event was consumed; unknown events leave both outputs untouched.
    """
    global info

    if not key_event:
        return ("", step_id)

    go_left = key_event.startswith("Cmd+Left")
    go_right = key_event.startswith("Cmd+Right")
    if not (go_left or go_right):
        # One no-op update per output; a single gr.update() would not match
        # the two outputs registered for this event.
        return (gr.update(), gr.update())

    if step_id is None or step_id.step is None or info.exp_result is None:
        # No episode or step selected yet: consume the key press, change nothing.
        return ("", step_id)

    if go_left:
        step = max(0, step_id.step - 1)
    else:
        # Upper bound kept from the original code (len - 2), clamped at 0 so
        # that very short episodes never produce a negative step.
        # NOTE(review): presumably the last entry of steps_info is not meant
        # to be reachable with the keyboard; confirm.
        step = max(0, min(len(info.exp_result.steps_info) - 2, step_id.step + 1))

    info.step = step
    return ("", StepId(episode_id=step_id.episode_id, step=step))


def tab_select(evt: gr.SelectData):
    """Remember the value carried by a tab-selection event as the active tab."""
    selected_tab = evt.value
    info.active_tab = selected_tab


def if_active(tab_name, n_out=1):
    """Build a decorator that runs a callback only while ``tab_name`` is the active tab.

    When another tab is active the wrapped callback is not called; instead one
    ``gr.update()`` is returned per output (a bare one for ``n_out == 1``, a
    tuple for ``n_out > 1``, and ``None`` for any other ``n_out``).

    Args:
        tab_name: value compared against ``info.active_tab``.
        n_out: number of outputs the callback is wired to.
    """

    def decorator(fn):
        def wrapper(*args, **kwargs):
            if info.active_tab != tab_name:
                # print("skipping: ", fn.__name__)
                if n_out == 1:
                    return gr.update()
                if n_out > 1:
                    return (gr.update(),) * n_out
                return None
            # print("updating: ", fn.__name__)
            return fn(*args, **kwargs)

        return wrapper

    return decorator


def update_screenshot(som_or_not: str):
    """Return the annotated screenshot of the current step for the chosen screenshot type."""
    image, _action = get_screenshot(info, som_or_not=som_or_not, annotate=True)
    return image


def get_screenshot(
    info: Info, step: int = None, som_or_not: str = "Raw Screenshots", annotate: bool = False
):
    """Load the screenshot of one step, optionally annotated with the step's action.

    Args:
        info: viewer state holding the loaded episode (``info.exp_result``).
        step: index of the step; defaults to ``info.step``.
        som_or_not: ``"SOM Screenshots"`` selects the SOM variant, any other
            value the raw screenshot.
        annotate: when true, ``annotate_action`` is called with the screenshot,
            the step's action and the ``extra_element_properties`` of its
            observation.

    Returns:
        ``(img, action_colored)``. ``action_colored`` is ``None`` without
        annotation, the result of ``annotate_action`` on success, or the plain
        action string if annotation failed. ``(None, None)`` is returned when
        no episode or step is selected, or when the step or its screenshot
        does not exist.
    """
    if info.exp_result is None:
        # No episode loaded yet (e.g. a tab was opened before any selection).
        return None, None
    if step is None:
        step = info.step
    if step is None:
        return None, None
    try:
        step_info = info.exp_result.steps_info[step]
        is_som = som_or_not == "SOM Screenshots"
        img = info.exp_result.get_screenshot(step, som=is_som)
        if annotate:
            action_str = step_info.action
            properties = step_info.obs.get("extra_element_properties", None)
            try:
                action_colored = annotate_action(
                    img, action_string=action_str, properties=properties
                )
            except Exception as e:
                # Annotation is best effort: keep the screenshot and fall back
                # to the raw action string.
                warning(f"Failed to annotate action: {e}")
                action_colored = action_str
        else:
            action_colored = None
        return img, action_colored
    except (FileNotFoundError, IndexError):
        return None, None


def update_screenshot_pair(som_or_not: str):
    """Return the screenshots of the current step (annotated) and of the following step."""
    current_step = info.step
    current_img, _ = get_screenshot(info, current_step, som_or_not, annotate=True)
    next_img, _ = get_screenshot(info, current_step + 1, som_or_not)
    return current_img, next_img


def update_screenshot_gallery(som_or_not: str):
    """Build a gallery with one labelled screenshot per step, positioned on the current step."""
    n_steps = len(info.exp_result.steps_info)

    # One (image, caption) pair per step, in step order.
    labelled_shots = [
        (get_screenshot(info, step=idx, som_or_not=som_or_not)[0], f"Step {idx}")
        for idx in range(n_steps)
    ]

    return gr.Gallery(
        value=labelled_shots,
        columns=2,
        show_download_button=False,
        show_label=False,
        object_fit="contain",
        preview=True,
        selected_index=info.step,
    )


def gallery_step_change(evt: gr.SelectData, episode_id: EpisodeId):
    """Make the gallery item that was clicked the current step and return its ``StepId``."""
    selected_index = evt.index
    info.step = selected_index
    return StepId(episode_id=episode_id, step=selected_index)


def update_html():
    """Return the ``dom_txt`` observation obtained from ``get_obs`` ("No DOM HTML" as default)."""
    dom_html = get_obs(key="dom_txt", default="No DOM HTML")
    return dom_html


def update_pruned_html():
    """Return the ``pruned_html`` observation obtained from ``get_obs`` ("No Pruned HTML" as default)."""
    pruned_html = get_obs(key="pruned_html", default="No Pruned HTML")
    return pruned_html


def update_axtree():
    """Return the ``axtree_txt`` observation wrapped in a markdown code fence."""
    axtree = get_obs(key="axtree_txt", default="No AXTree")
    fence = "```"
    return "\n".join([fence, f"{axtree}", fence])


def dict_to_markdown(d: dict):
    """
    Render a dictionary as markdown, descending into nested dictionaries.

    Nested dictionaries and lists get a ``###`` heading named after their key,
    dictionaries inside a list get a ``#### Item <index>`` heading, other list
    entries and scalar values become bullet points.

    Args:
        d (dict): the dictionary to render.

    Returns:
        str: the markdown text; ``"No Data"`` for an empty dictionary. For a
        non-dict argument, ``""`` if it is a ``ToolCalls`` instance, otherwise
        its ``repr`` (after logging a warning).
    """
    if not isinstance(d, dict):
        if isinstance(d, ToolCalls):
            # ToolCalls rendered by to_markdown method.
            return ""
        warning(f"Expected dict, got {type(d)}")
        return repr(d)
    if not d:
        return "No Data"

    chunks = []
    for key, value in d.items():
        if isinstance(value, dict):
            chunks.append(f"### {key}\n{dict_to_markdown(value)}\n")
        elif isinstance(value, list):
            chunks.append(f"### {key}\n")
            for idx, entry in enumerate(value):
                if isinstance(entry, dict):
                    chunks.append(f"#### Item {idx}\n{dict_to_markdown(entry)}\n")
                else:
                    chunks.append(f"- {entry}\n")
        else:
            chunks.append(f"- **{key}**: {value}\n")
    return "".join(chunks)


def dict_msg_to_markdown(d: dict):
    if "role" not in d:
        return dict_to_markdown(d)
    parts = []
    for item in d["content"]:

        if hasattr(item, "dict"):
            item = item.dict()

        match item["type"]:
            case "image":
                parts.append(f"![Image]({item['image']})")
            case "text":
                parts.append(f"\n```\n{item['text']}\n```\n")
            case "tool_use":
                tool_use = _format_tool_call(item["name"], item["input"], item["id"])
                parts.append(f"\n```\n{tool_use}\n```\n")
            case _:
                parts.append(f"\n```\n{str(item)}\n```\n")

    markdown = f"### {d["role"].capitalize()}\n"
    markdown += "\n".join(parts)
    return markdown


def _format_tool_call(name: str, input: str, call_id: str):
    """
    Build the one-line text describing a tool call: its name, its input and its call id.
    """
    rendered = f"Tool Call: {name}  `{input}` (call_id: {call_id})"
    return rendered


def format_chat_message(message: BaseMessage | MessageBuilder | dict):
    """
    Turn one chat message into markdown, dispatching on the message type.

    ``BaseMessage`` yields its ``content``, ``MessageBuilder`` its
    ``to_markdown()``, dictionaries go through ``dict_msg_to_markdown`` and
    ``ResponseFunctionToolCall`` is rendered as a "Tool Use" section. Anything
    else is converted with ``str``.
    """
    if isinstance(message, BaseMessage):
        return message.content
    if isinstance(message, MessageBuilder):
        return message.to_markdown()
    if isinstance(message, dict):
        return dict_msg_to_markdown(message)
    if isinstance(message, ResponseFunctionToolCall):  # type: ignore[return]
        tool_call_text = _format_tool_call(message.name, message.arguments, message.call_id)
        return f"### Tool Use\n```\n{tool_call_text}\n```\n"
    return str(message)


def update_chat_messages():
    """Return the chat messages of the current step rendered as markdown.

    A ``Discussion`` is rendered through its own ``to_markdown``; a list is
    rendered message by message with ``format_chat_message`` and joined with
    blank lines.

    Returns:
        The markdown text, or ``None`` when no episode is loaded, when the
        current step cannot be read (same handling as the other step
        updaters), or when the stored messages are of another type.
    """
    global info
    if info.exp_result is None:
        return None
    try:
        agent_info = info.exp_result.steps_info[info.step].agent_info
    except (FileNotFoundError, IndexError):
        return None

    chat_messages = agent_info.get("chat_messages", ["No Chat Messages"])
    if isinstance(chat_messages, Discussion):
        return chat_messages.to_markdown()

    if isinstance(chat_messages, list):
        return "\n\n".join(format_chat_message(m) for m in chat_messages)

    return None


def update_task_error():
    """Return the ``stack_trace`` entry of the episode summary, formatted with ``code``.

    ``"No Task Error"`` is returned when reading it raises ``FileNotFoundError``.
    """
    try:
        trace = info.exp_result.summary_info.get("stack_trace", None)
        return f"{code(trace)}"
    except FileNotFoundError:
        return "No Task Error"


def update_logs():
    """
    Return the logs of the current experiment as a string.

    Falls back to the text "No Logs" when reading them raises
    FileNotFoundError.
    """
    global info
    try:
        logs = info.exp_result.logs
        return f"{logs}"
    except FileNotFoundError:
        return "No Logs"


def update_stats():
    """
    Return the stats of the currently selected step as a two-column DataFrame.

    Each entry of the step's ``stats`` mapping becomes one row with the
    columns "name" and "value". Returns None when the data cannot be found
    (FileNotFoundError) or the step index is out of range (IndexError).
    """
    global info
    try:
        step_info = info.exp_result.steps_info[info.step]
        rows = step_info.stats.items()
        return pd.DataFrame(rows, columns=["name", "value"])
    except (FileNotFoundError, IndexError):
        return None


def update_agent_info_md():
    """
    Return the markdown page stored in the current step's agent info.

    Looks up "markdown_page" first and then the older "markup_page" key; the
    first non-None value wins. When neither is set, a short hint telling the
    user which attribute to fill is returned instead. Returns None when the
    step data cannot be found or the step index is out of range.
    """
    global info
    placeholder = """Fill up markdown_page attribute in AgentInfo to display here."""
    try:
        agent_info = info.exp_result.steps_info[info.step].agent_info
        # "markup_page" is the legacy key. TODO: remove in a while
        for key in ("markdown_page", "markup_page"):
            page = agent_info.get(key, None)
            if page is not None:
                return page
        return placeholder
    except (FileNotFoundError, IndexError):
        return None


def update_agent_info_html():
    """
    Return the agent's HTML page together with two screenshots.

    Returns:
        A ``(page, s1, s2)`` tuple where ``s1`` and ``s2`` are the screenshots
        fetched for the current and the next step, and ``page`` is either the
        agent's "html_page" wrapped in an iframe or a short hint when no page
        is available. ``(None, None, None)`` is returned when the data cannot
        be found (FileNotFoundError) or an index is out of range (IndexError).
    """
    global info
    try:
        # Screenshots from current and next step; the accompanying action
        # string is not needed here.
        s1, _ = get_screenshot(info, info.step, False)
        s2, _ = get_screenshot(info, info.step + 1, False)
        agent_info = info.exp_result.steps_info[info.step].agent_info

        # Default to None rather than a placeholder list: a list would be
        # handed to _page_to_iframe, which calls ``.encode`` on it and raises
        # an AttributeError that is not caught below.
        page = agent_info.get("html_page", None)
        if page is None:
            page = """Fill up html_page attribute in AgentInfo to display here."""
        else:
            page = _page_to_iframe(page)
        return page, s1, s2

    except (FileNotFoundError, IndexError):
        return None, None, None


def _page_to_iframe(page: str):
    """
    Embed an HTML document in an iframe using a base64 ``data:`` URL.

    The page is encoded as UTF-8, base64-encoded, and used as the ``src`` of a
    full-width, 1000px-high, borderless iframe with a white background.
    """
    payload = base64.b64encode(page.encode("utf-8")).decode("ascii")

    # Assemble the iframe markup line by line; the data URL goes in ``src``.
    return (
        "\n"
        f'<iframe src="data:text/html;base64,{payload}"\n'
        '        style="width: 100%; height: 1000px; border: none; background-color: white;">\n'
        "</iframe>\n"
    )


def submit_action(input_text):
    """
    Re-run the current step's prompt with a user-edited message.

    Takes a deep copy of the first two chat messages of the selected step
    (expected to be system then user), replaces the content of the second one
    with ``input_text``, sends both to the "gpt-4o-mini" chat completion
    endpoint and returns the text of the first choice.

    Raises:
        AssertionError: if the second message is not a user message.
        ValueError: if the second message is neither a BaseMessage nor a dict.
    """
    global info
    agent_info = info.exp_result.steps_info[info.step].agent_info
    recorded = agent_info.get("chat_messages", ["No Chat Messages"])
    messages = deepcopy(recorded[:2])  # copy so the stored messages stay untouched

    user_message = messages[1]
    if isinstance(user_message, BaseMessage):  # TODO remove once langchain is deprecated
        assert isinstance(user_message, HumanMessage), "Second message should be user"
        system_message = messages[0]
        messages = [
            make_system_message(system_message.content),
            make_user_message(user_message.content),
        ]
    elif isinstance(user_message, dict):
        assert user_message.get("role", None) == "user", "Second message should be user"
    else:
        raise ValueError("Chat messages should be a list of BaseMessage or dict")

    client = OpenAI()
    messages[1]["content"] = input_text
    completion = client.chat.completions.create(model="gpt-4o-mini", messages=messages)
    return completion.choices[0].message.content


def update_prompt_tests():
    """
    Return the second chat message of the current step, twice.

    When that message is a dict, its "content" entry is used (or the text
    "No Content" if the key is absent); otherwise the message object itself is
    returned. The same value fills both slots of the returned tuple.
    """
    global info
    step_info = info.exp_result.steps_info[info.step]
    messages = step_info.agent_info.get("chat_messages", ["No Chat Messages"])
    second = messages[1]
    prompt = second.get("content", "No Content") if isinstance(second, dict) else second
    return prompt, prompt


def select_step(episode_id: EpisodeId, evt: gr.SelectData):
    """
    Handle a selection event by switching the viewer to the matching step.

    The first component of the event index is translated to a step number by
    the click mapper, stored on the global ``info`` and returned wrapped in a
    StepId together with ``episode_id``.
    """
    global info
    clicked_position = evt.index[0]
    selected_step = info.click_mapper.to_step(clicked_position)
    info.step = selected_step
    return StepId(episode_id, selected_step)


def update_step_info():
    """
    Collect the three markdown panels describing the current step.

    Returns a list with, in order, the episode info, the action info and the
    error reported after the action.
    """
    global info
    # Call order is preserved: get_episode_info may adjust info.step first.
    episode_md = get_episode_info(info)
    action_md = get_action_info(info)
    error_md = get_state_error(info)
    return [episode_md, action_md, error_md]


def get_obs(key: str, default=None):
    """
    Look up ``key`` in the observation of the currently selected step.

    Returns ``default`` when the observation has no such key.
    """
    global info
    current_step = info.exp_result.steps_info[info.step]
    return current_step.obs.get(key, default)


def code(txt):
    """
    Wrap ``txt`` in a fenced markdown code block.

    The value is interpolated as-is, so non-string values are rendered with
    their default string formatting.
    """
    fence = "```"
    return f"{fence}\n{txt}\n{fence}"


def get_episode_info(info: Info):
    """
    Build the markdown summary of the episode at the currently selected step.

    The summary lists the task name and seed, the step counter with the
    cumulative reward, the goal, the task info, the terminated/truncated flags
    and the experiment directory. ``info.step`` is clamped to the last
    available step as a side effect.

    Returns:
        The markdown string; a short notice when the episode has no steps; or
        an error message with the formatted traceback if anything fails while
        gathering the data.
    """
    try:
        exp_result = info.exp_result
        env_args = exp_result.exp_args.env_args
        steps_info = exp_result.steps_info
        n_steps = len(steps_info)

        # NOTE: the clamp runs before the emptiness check, so an episode
        # without steps leaves info.step at -1.
        if info.step >= n_steps:
            info.step = n_steps - 1
        if n_steps == 0:
            return "No steps were taken in this episode."

        step_info = steps_info[info.step]

        try:
            goal = step_info.obs["goal_object"]
        except KeyError:
            goal = None

        try:
            cum_reward = exp_result.summary_info["cum_reward"]
        except FileNotFoundError:
            cum_reward = np.nan

        exp_dir = exp_result.exp_dir
        exp_dir_str = f"{exp_dir.parent.name}/{exp_dir.name}"
        last_step = n_steps - 1

        return f"""\
### {env_args.task_name} (seed: {env_args.task_seed})
### Step {info.step} / {last_step} (Reward: {cum_reward:.1f})

**Goal:**

{code(str(AgentLabBaseMessage("", goal)))}

**Task info:**

{code(step_info.task_info)}

**Terminated or Truncated:**
{code(f"Terminated: {step_info.terminated}, Truncated: {step_info.truncated}")}

**exp_dir:**

<small style="line-height: 1; margin: 0; padding: 0;">{code(exp_dir_str)}</small>"""
    except Exception:
        # Best-effort UI helper: surface the failure in the panel instead of raising.
        return f"""\
**Error while getting episode info**
{code(traceback.format_exc())}"""


def get_action_info(info: Info):
    """
    Build the markdown describing the action taken at the current step.

    The action string comes from ``get_screenshot``; when the agent info has a
    "think" entry it is appended in a fenced block. A short notice is returned
    instead when the episode has no steps or the step index is out of bounds.
    """
    steps_info = info.exp_result.steps_info
    # Called before the bounds checks on purpose: it also updates click_mapper.
    _, action_str = get_screenshot(info, step=info.step, annotate=True)

    n_steps = len(steps_info)
    if n_steps == 0:
        return "No steps were taken"
    if n_steps <= info.step:
        return f"Step {info.step} is out of bounds. The episode has {len(steps_info)} steps."

    sections = [f"**Action:**\n\n{action_str}\n"]

    think = steps_info[info.step].agent_info.get("think", None)
    if think is not None:
        sections.append(f"\n**Think:**\n\n{code(think)}")
    return "".join(sections)


def get_state_error(state: Info):
    """
    Build the markdown showing the error produced by the current action.

    The error is read from "last_action_error" in the observation of the
    *next* step. When there is no next step, no observation, or the message is
    None or empty, "No Error" is shown instead.
    """
    try:
        following_step = state.exp_result.steps_info[state.step + 1]
        err_msg = following_step.obs.get("last_action_error", None)
    except (IndexError, AttributeError):
        err_msg = None

    shown = "No Error" if err_msg is None or len(err_msg) == 0 else err_msg
    return f"**Step error after action:**\n\n{code(shown)}"


def get_seeds_df(result_df: pd.DataFrame, task_name: str):
    """
    Summarize every run of ``task_name`` as one row per seed.

    Args:
        result_df: results table; its index is reset so index levels become
            regular columns before filtering.
        task_name: value matched against the TASK_NAME_KEY column.

    Returns:
        The result of applying a row-wise extraction that yields the columns
        "seed", "reward", "err" and "n_steps" for each matching row.
    """
    result_df = result_df.reset_index(inplace=False)
    result_df = result_df[result_df[TASK_NAME_KEY] == task_name]

    def extract_columns(row: pd.Series):
        err_msg = row.get("err_msg", None)
        return pd.Series(
            {
                "seed": row[TASK_SEED_KEY],
                "reward": row.get("cum_reward", None),
                # bool(NaN) is True, so a missing err_msg stored as NaN must be
                # excluded explicitly or the run would be flagged as errored.
                "err": bool(err_msg) and not pd.isna(err_msg),
                "n_steps": row.get("n_steps", None),
            }
        )

    seed_df = result_df.apply(extract_columns, axis=1)
    return seed_df


def on_select_agent(evt: gr.SelectData, df: pd.DataFrame):
    # TODO try to find a clever way to solve the sort bug here