# self-reflection/source/9_analyze_keywords.py at main · ruio248/self-reflection · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import os
import pandas as pd
import matplotlib.pyplot as plt

# Directory whose sub-folders are scanned for keyword files.
root_folder_path = "../data/reflections"

# Table of parsed keywords; starts out empty and is filled by the loader below.
keywords = pd.DataFrame()

# Substrings stripped from every keyword line, applied in this exact order.
# Longer phrases come before the shorter ones they contain ("incorrectly"
# before "incorrect", "of the question" before "the question") so the longer
# match is removed as a whole.
remove_list = (
    # Bullet and LaTeX array markup
    ["* ", "$\\begin{array}{r}", "\\end{array}$"]
    # Generic "something went wrong" wording
    + ["error", "incorrectly", "incorrect", "incomplete", "lack of"]
    # References to the thing being answered
    + [
        "of the question",
        "of the problem",
        "the question",
        "the problem",
        "the argument",
    ]
)

# Exact-match canonicalisation applied after the removals above: several
# spellings of the same idea are folded into one label.
remap_list = {
    "": "N/A",
    "misinterpretation": "interpretation",
    "misreading": "reading",
    "misunderstanding": "understanding",
    **dict.fromkeys(
        (
            "attention to detail",
            "attention to details",
            "inattention to detail",
            "inattention",
        ),
        "attention to detail",
    ),
    "knowledge gap": "knowledge",
    **dict.fromkeys(
        (
            "failure to consider all possibilities",
            "failure to consider all possible scenarios",
        ),
        "consideration",
    ),
    **dict.fromkeys(("logical", "logical reasoning"), "logic"),
}

def _normalize_keyword(raw_line):
    """Return the canonical keyword for one raw line of a keywords file.

    The line is stripped and lower-cased, every "- " is deleted, each entry of
    ``remove_list`` is deleted in order, the result is stripped again, and an
    exact match in ``remap_list`` is replaced by its mapped label.
    """
    keyword = raw_line.strip().lower().replace("- ", "")
    for fragment in remove_list:
        keyword = keyword.replace(fragment, "")
    keyword = keyword.strip()
    return remap_list.get(keyword, keyword)


# Rows are collected in a plain list and turned into a DataFrame once at the
# end. Appending to a DataFrame row by row copies the whole frame each time
# and relied on the private ``DataFrame._append``.
keyword_rows = []

for folder_name in os.listdir(root_folder_path):

    folder_path = os.path.join(root_folder_path, folder_name)

    # Folder names are expected to look like "<model> - <agent> - <exam>".
    # Stray entries (e.g. ".DS_Store", plain files) are skipped instead of
    # raising IndexError / NotADirectoryError.
    folder_name_parts = folder_name.split(" - ")
    if len(folder_name_parts) < 3 or not os.path.isdir(folder_path):
        continue

    model_name = folder_name_parts[0]
    agent_name = folder_name_parts[1]

    # Only the output of the "keywords" agent is analysed here.
    if agent_name != "keywords":
        continue

    for file_name in os.listdir(folder_path):

        file_path = os.path.join(folder_path, file_name)
        if not file_name.endswith(".txt"):
            continue

        # The check runs on the full path, so the marker may appear in either
        # the folder name or the file name.
        if "comprehensive-100" not in file_path:
            continue

        print(file_path)

        # Explicit encoding so the result does not depend on the platform
        # default.
        with open(file_path, "r", encoding="utf-8") as file:
            # "depth" is the 0-based line index within the file; header lines
            # are skipped but still counted.
            for i, line in enumerate(file):
                if line.startswith("Error Keywords:"):
                    continue

                keyword_rows.append({
                    "model_name": model_name,
                    "keyword": _normalize_keyword(line),
                    "depth": i})

# Fixed columns keep the later ``keywords["depth"]`` lookup valid even when
# no file matched.
keywords = pd.DataFrame(
    keyword_rows,
    columns=["model_name", "keyword", "depth"])

# Filter in only top-level keywords: rows whose "depth" (the 0-based line
# index recorded by the loader) is 1. The original called
# ``keywords.reset_index()`` and discarded the result, which did nothing;
# the index is now actually renumbered after filtering.
keywords = keywords[keywords["depth"] == 1].reset_index(drop=True)

# Group the unique keywords per model, count them, and average the depth
unique_keywords = (
    keywords
    .groupby(["model_name", "keyword"])
    .agg(
        count=("keyword", "count"),
        depth=("depth", "mean"))
    .reset_index()
)

# Sort by count, largest first. A stable sort keeps tied rows in the
# (model_name, keyword) order produced by groupby, so the rows that make the
# top-n cut are the same on every run.
unique_keywords = unique_keywords.sort_values(
    by="count", ascending=False, kind="mergesort")

# Get the top n (model_name, keyword) pairs
top_n = 100
top_keywords = unique_keywords.head(top_n)

# # Plot the top n keywords
# plt.figure(figsize=(10, 5))
# plt.barh(
#     top_keywords["keyword"],
#     top_keywords["count"])
# plt.title(f"Top {top_n} Error Keywords")
# plt.xlabel("Count")
# plt.ylabel("Keyword")
# plt.gca().invert_yaxis()
# plt.subplots_adjust(left=0.2)
# plt.show()

# Plot the top n keywords as a stacked bar chart: one row per keyword, one
# coloured segment per model_name.
#
# Previously every bar started at 0, so when several models shared a keyword
# their bars were drawn on top of each other and the gray base layer was
# always hidden. Here each model's segment starts where the previous one ends.

# Models in order of first appearance in top_keywords; this fixes both the
# colour assignment (C0, C1, ...) and the stacking order.
model_order = list(top_keywords["model_name"].unique())

# keyword x model table of counts, 0 where a model has no row for a keyword.
stacked_counts = (
    top_keywords
    .groupby(["keyword", "model_name"])["count"]
    .sum()
    .unstack(fill_value=0)
    .reindex(columns=model_order, fill_value=0)
)

# Order rows by combined count, largest first (stable sort for ties).
row_totals = stacked_counts.sum(axis=1).sort_values(
    ascending=False, kind="mergesort")
stacked_counts = stacked_counts.loc[row_totals.index]

# Left edge of each segment = total width of the segments drawn before it.
segment_starts = stacked_counts.cumsum(axis=1) - stacked_counts

plt.figure(figsize=(10, 10))
for i, model_name in enumerate(model_order):
    plt.barh(
        stacked_counts.index,
        stacked_counts[model_name],
        left=segment_starts[model_name],
        color=f"C{i}",
        label=model_name)
plt.title(f"Top {top_n} Error Keywords by Model")
plt.xlabel("Count")
plt.ylabel("Keyword")
# Categories are placed bottom-up in order of first use; invert so the
# largest total ends up at the top.
plt.gca().invert_yaxis()
plt.subplots_adjust(left=0.2)
plt.legend(title="Model")
plt.show()