Skip to content

Commit f2938db

Browse files
committed
Lots of updates based on interactive testing
1 parent fe0f786 commit f2938db

10 files changed

Lines changed: 1065 additions & 85 deletions

File tree

Cargo.lock

Lines changed: 295 additions & 48 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,8 @@ image = "0.25.5"
1818
byteorder = "1.4"
1919
clap = { version = "4.0", features = ["derive"] }
2020
evdev = { version = "0.13", features = ["tokio"] }
21-
resvg = "0.45"
21+
resvg = "0.47"
22+
svg2polylines = "0.8"
2223
dotenv = "0.15"
2324
imageproc = "0.25.0"
2425
rust-embed = { version = "8.5.0", features = ["include-exclude", "compression"] }
@@ -39,6 +40,10 @@ path = "src/lib.rs"
3940
name = "ghostwriter"
4041
path = "src/main.rs"
4142

43+
[[bin]]
44+
name = "experiment"
45+
path = "src/bin/experiment.rs"
46+
4247
[profile.release]
4348
# strip = true # Automatically strip symbols from the binary.
4449
# opt-level = "z" # Optimize for size.

prompts/general.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
11
{
2-
"prompt": "You are a helpful assistant named Ghostwriter. You live inside of a reMarkable brand eInk notepad, which has a 768x1024 px sized screen which can only display grayscale. Your input is the current content of the screen, which may contain content written by the user or previously written by you (the assistant). Look at this content, interpret it, and respond to the content. The content will contain handwritten notes, diagrams, and maybe typewritten text. Respond by calling a tool. Call draw_text to output text which will be sent using simulated keyboard input. Call draw_svg to respond with an SVG drawing which will be drawn on top of the existing content. Try to place the output on the screen at coordinates that make sense. If you need to place text at a very specific location, you should output an SVG instead of keyboard text.",
2+
"prompt": "You are a helpful assistant named Ghostwriter. You live inside of a reMarkable brand eInk notepad, which has a 768x1024 px sized screen which can only display grayscale. Your input is the current content of the screen, which may contain content written by the user or previously written by you (the assistant). Look at this content, interpret it, and respond to the content. The content will contain handwritten notes, diagrams, and maybe typewritten text. Respond by calling a tool. Call draw_text to output text which will be sent using simulated keyboard input. Call draw_svg to respond with an SVG drawing which will be drawn on top of the existing content. Try to place the output on the screen at coordinates that make sense. If you need to place text at a very specific location, you should output an SVG instead of keyboard text. If your output is entirely text, then draw_text is better -- it will automatically be positioned below the lowest content on the page.",
33
"tools": ["draw_text", "draw_svg"]
44
}

run_eval.sh

Lines changed: 36 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -11,22 +11,36 @@ results="$outdir_base/results.md"
1111

1212
scenarios=($(ls evaluations))
1313

14-
attempt_count=3
14+
attempt_count=1 # Usually 3
1515

1616
declare -A test_case_params
1717

18-
test_case_params["claude_sonnet_latest_no_seg"]="--model claude-3-5-sonnet-latest"
19-
test_case_params["claude_sonnet_latest_with_seg"]="--apply-segmentation --model claude-3-5-sonnet-latest"
20-
test_case_params["gpt-4o-mini_no_seg"]="--model gpt-4o-mini"
21-
test_case_params["gpt-4o_with_seg"]="--apply-segmentation --model gpt-4o-mini"
22-
test_case_params["gpt-4o-mini_no_seg"]="--model gpt-4o"
23-
test_case_params["gpt-4o_with_seg"]="--apply-segmentation --model gpt-4o"
24-
test_case_params["gemini-2-flash_no_seg"]="--model gemini-2.0-flash-exp"
25-
test_case_params["gemini-2-flash_with_seg"]="--apply-segmentation --model gemini-2.0-flash-exp"
18+
test_case_params["claude_haiku_4.5"]="--model claude-haiku-4-5"
19+
test_case_params["claude_sonnet_4.5"]="--model claude-sonnet-4-5"
20+
test_case_params["claude_opus_4.6"]="--model claude-opus-4-6"
21+
22+
test_case_params["gemini-3-flash"]="--model gemini-3-flash-preview"
23+
test_case_params["gemini-3-pro"]="--model gemini-3-pro-preview"
24+
25+
test_case_params["gpt-5-nano"]="--model gpt-5-nano"
26+
test_case_params["gpt-5-mini"]="--model gpt-5-mini"
27+
test_case_params["gpt-5.2"]="--model gpt-5.2"
28+
# test_case_params["gpt-5.2-codex"]="--model gpt-5.2-codex"
29+
30+
31+
# Old retired test cases
32+
# test_case_params["claude_sonnet_latest_no_seg"]="--model claude-3-5-sonnet-latest"
33+
# test_case_params["claude_sonnet_latest_with_seg"]="--apply-segmentation --model claude-3-5-sonnet-latest"
34+
# test_case_params["gpt-4o-mini_no_seg"]="--model gpt-4o-mini"
35+
# test_case_params["gpt-4o_with_seg"]="--apply-segmentation --model gpt-4o-mini"
36+
# test_case_params["gpt-4o-mini_no_seg"]="--model gpt-4o"
37+
# test_case_params["gpt-4o_with_seg"]="--apply-segmentation --model gpt-4o"
38+
# test_case_params["gemini-2-flash_no_seg"]="--model gemini-2.0-flash-exp"
39+
# test_case_params["gemini-2-flash_with_seg"]="--apply-segmentation --model gemini-2.0-flash-exp"
2640
# test_case_params["gemini-1206-flash_no_seg"]="--model gemini-exp-1206"
2741
# test_case_params["gemini-1206-flash_with_seg"]="--apply-segmentation --model gemini-exp-1206"
28-
test_case_params["gemini-1.5-pro_no_seg"]="--model gemini-1.5-pro"
29-
test_case_params["gemini-1.5-pro_with_seg"]="--apply-segmentation --model gemini-1.5-pro"
42+
# test_case_params["gemini-1.5-pro_no_seg"]="--model gemini-1.5-pro"
43+
# test_case_params["gemini-1.5-pro_with_seg"]="--apply-segmentation --model gemini-1.5-pro"
3044

3145
echo "# Ghostwriter evaluation results $datetime" > $results
3246
echo "" >> $results
@@ -61,6 +75,8 @@ for scenario in "${scenarios[@]}"; do
6175
# Run the test case
6276
echo "Running scenario $scenario with params $params attempt $attempt"
6377

78+
start_time=$(date +%s%N)
79+
6480
./target/release/ghostwriter \
6581
--input-png evaluations/$scenario/input.png \
6682
--save-screenshot $outdir/input.png \
@@ -73,6 +89,10 @@ for scenario in "${scenarios[@]}"; do
7389
--no-trigger \
7490
$params
7591

92+
end_time=$(date +%s%N)
93+
elapsed_ms=$(( (end_time - start_time) / 1000000 ))
94+
elapsed_s=$(printf "%.1f" "$(echo "$elapsed_ms / 1000" | bc -l)")
95+
7696
# Create a merged image with the new part in red
7797
if [ -f $outdir/result.png ]; then
7898
convert \
@@ -94,8 +114,11 @@ for scenario in "${scenarios[@]}"; do
94114
echo '```' >> $results
95115
fi
96116

97-
echo "Sleeping for 10 seconds to avoid rate limiting"
98-
sleep 10
117+
echo " (${elapsed_s}s)" >> $results
118+
119+
echo "Completed in ${elapsed_s}s"
120+
# echo "Sleeping for 10 seconds to avoid rate limiting"
121+
# sleep 10
99122

100123
done
101124

0 commit comments

Comments
 (0)