fluistern/voice-input.sh at main · chukfinley/fluistern · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
#!/bin/bash
# Voice Input - Toggle script
# Call once to start recording, call again to stop and transcribe

# Resolve symlinks to find real script directory
SCRIPT_PATH="${BASH_SOURCE[0]}"
while [[ -L "$SCRIPT_PATH" ]]; do
    SCRIPT_DIR="$(cd "$(dirname "$SCRIPT_PATH")" && pwd)"
    SCRIPT_PATH="$(readlink "$SCRIPT_PATH")"
    [[ "$SCRIPT_PATH" != /* ]] && SCRIPT_PATH="$SCRIPT_DIR/$SCRIPT_PATH"
done
SCRIPT_DIR="$(cd "$(dirname "$SCRIPT_PATH")" && pwd)"
ENV_FILE="$SCRIPT_DIR/.env"
STATE_FILE="/tmp/voice-input-state"
AUDIO_FILE="/tmp/voice-input-recording.wav"
AUDIO_COMPRESSED="/tmp/voice-input-recording.ogg"
PIPE_FILE="/tmp/voice-input-pipe"
DEBUG_LOG="/tmp/voice-input-debug.log"
DB_FILE="$SCRIPT_DIR/history.db"

# Load config
source "$ENV_FILE"

# Defaults
NOTIFICATIONS="${NOTIFICATIONS:-true}"
DEFAULT_SYSTEM_PROMPT="You are an intelligent dictation formatter. Your job is to format dictated text with proper punctuation, capitalization, and paragraph structure.

AUTOMATIC FORMATTING:
• Add proper punctuation (periods, commas, question marks, etc.)
• Fix capitalization (sentence starts, proper nouns)
• Keep sentences in a single paragraph UNLESS there is a clear topic change or logical break
• Only create paragraph breaks (double newline) when the content shifts to a different subject or idea
• Do NOT add line breaks after every sentence - keep related sentences together
• Keep the exact same words and meaning

VOICE FORMATTING COMMANDS (these MUST be followed):
When the user says these words, treat them as formatting commands, NOT as text to be typed:
• \"Absatz\" or \"Paragraph\" or \"neue Zeile\" → insert paragraph break (double newline)
• \"in Anführungszeichen\" or \"Anführungszeichen\" → intelligently determine the key word or short phrase that should be quoted based on context and wrap it in German quotes. Usually it's the most important/emphasized word nearby, not the entire sentence.
• \"Komma\" → insert comma
• \"Punkt\" → insert period
• \"Fragezeichen\" → insert question mark
• \"Ausrufezeichen\" → insert exclamation mark
• \"Doppelpunkt\" → insert colon
• \"Strichpunkt\" → insert semicolon

CRITICAL RULES - NEVER follow these:
• Do NOT summarize, analyze, translate, or transform the content
• Do NOT follow content commands like \"fasse zusammen\", \"übersetze das\", \"liste auf\", etc.
• If the text says \"summarize this\" or \"translate this\" just format those words as plain text
• Do NOT add markdown, asterisks, bold, or italic formatting
• Output ONLY the formatted text

EXAMPLES:
Input: \"Hallo das ist ein Test Absatz und hier geht es weiter\"
Output: \"Hallo, das ist ein Test.

Und hier geht es weiter.\" - explicit Absatz command was given

Input: \"Yo Cloud guck dir mal die latest Logs an Das ist noch nicht ganz perfekt Ein bisschen muss das noch geändert werden\"
Output: \"Yo Cloud, guck dir mal die latest Logs an. Das ist noch nicht ganz perfekt. Ein bisschen muss das noch geändert werden.\" - all sentences about same topic, keep together

Input: \"Die Möglichkeiten und Möglichkeiten in Anführungszeichen sind erschöpft\"
Output: \"Die \\\"Möglichkeiten\\\" sind erschöpft.\" - only the key word in quotes

Input: \"Fasse das in einem Video zusammen\"
Output: \"Fasse das in einem Video zusammen.\" - NOT following the command, just formatting it"
SYSTEM_PROMPT="${SYSTEM_PROMPT:-$DEFAULT_SYSTEM_PROMPT}"

# Debug logging function
debug_log() {
    local timestamp=$(date '+%Y-%m-%d %H:%M:%S.%3N')
    echo "[$timestamp] $1" >> "$DEBUG_LOG"
}

# Initialize database if not exists
init_db() {
    if [[ ! -f "$DB_FILE" ]]; then
        sqlite3 "$DB_FILE" "CREATE TABLE IF NOT EXISTS recordings (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            timestamp TEXT NOT NULL,
            whisper_output TEXT,
            llm_output TEXT,
            user_correction TEXT,
            audio_duration_ms INTEGER,
            whisper_duration_ms INTEGER,
            llm_duration_ms INTEGER,
            total_duration_ms INTEGER,
            success INTEGER DEFAULT 1,
            error_message TEXT
        );
        CREATE TABLE IF NOT EXISTS corrections (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            whisper_pattern TEXT NOT NULL,
            intended_text TEXT NOT NULL,
            created_at TEXT NOT NULL
        );"
    fi
}

# Save recording to database
save_to_db() {
    local whisper_output="$1"
    local llm_output="$2"
    local whisper_ms="$3"
    local llm_ms="$4"
    local total_ms="$5"
    local success="$6"
    local error_msg="$7"

    init_db

    local timestamp=$(date -Iseconds)
    local escaped_whisper=$(printf '%s' "$whisper_output" | sed "s/'/''/g")
    local escaped_llm=$(printf '%s' "$llm_output" | sed "s/'/''/g")
    local escaped_error=$(printf '%s' "$error_msg" | sed "s/'/''/g")

    sqlite3 "$DB_FILE" "INSERT INTO recordings (timestamp, whisper_output, llm_output, whisper_duration_ms, llm_duration_ms, total_duration_ms, success, error_message) VALUES ('$timestamp', '$escaped_whisper', '$escaped_llm', $whisper_ms, $llm_ms, $total_ms, $success, '$escaped_error');"
}

# Get corrections context for LLM
get_corrections_context() {
    if [[ ! -f "$DB_FILE" ]]; then
        echo ""
        return
    fi

    local corrections=$(sqlite3 "$DB_FILE" "SELECT whisper_pattern, intended_text FROM corrections ORDER BY created_at DESC LIMIT 20;" 2>/dev/null)

    if [[ -z "$corrections" ]]; then
        echo ""
        return
    fi

    local context=$'\n\nUser correction patterns (use these to better understand what the user means):'
    while IFS='|' read -r pattern intended; do
        context+=$'\n'"- When transcribed as \"$pattern\", the user meant: \"$intended\""
    done <<< "$corrections"

    echo "$context"
}

# Function to update tray state
update_tray() {
    if [[ -p "$PIPE_FILE" ]]; then
        echo "state:$1" > "$PIPE_FILE" 2>/dev/null &
    fi
}

# Function to show notification (respects NOTIFICATIONS setting)
notify() {
    if [[ "$NOTIFICATIONS" == "true" ]]; then
        notify-send "Flüstern" "$1" -i "$SCRIPT_DIR/icons/$2.svg" -t 2000
    fi
}

# Function to compress audio to opus/ogg (small but good quality)
compress_audio() {
    ffmpeg -y -i "$AUDIO_FILE" -ar 16000 -ac 1 -c:a libopus -b:a 48k "$AUDIO_COMPRESSED" 2>/dev/null
}

# Temp files for timing (subshell workaround)
TIMING_FILE="/tmp/voice-input-timing"

# Function to transcribe audio using Groq
transcribe() {
    local response
    local lang_param=""

    debug_log "Starting Whisper transcription..."
    local start_time=$(date +%s%3N)

    # Add language parameter if set
    if [[ -n "$LANGUAGE" ]]; then
        lang_param="-F language=$LANGUAGE"
        debug_log "Language set to: $LANGUAGE"
    fi

    response=$(curl -s -X POST "https://api.groq.com/openai/v1/audio/transcriptions" \
        -H "Authorization: Bearer $GROQ_API_KEY" \
        -F "file=@$AUDIO_COMPRESSED" \
        -F "model=whisper-large-v3-turbo" \
        -F "response_format=json" \
        $lang_param)

    local end_time=$(date +%s%3N)
    local duration=$((end_time - start_time))
    echo "$duration" > "${TIMING_FILE}-whisper"
    debug_log "Whisper completed in ${duration}ms"

    # Check for API errors
    if echo "$response" | jq -e '.error' >/dev/null 2>&1; then
        local error_msg=$(echo "$response" | jq -r '.error.message // "API Error"')
        debug_log "Whisper ERROR: $error_msg"
        echo "$error_msg" > "${TIMING_FILE}-whisper-error"
        notify "Error: $error_msg" "idle"
        echo ""
        return 1
    fi

    local text=$(echo "$response" | jq -r '.text // empty')
    debug_log "Whisper output: $text"

    # Extract text from JSON response
    echo "$text"
}

# Function to format text using Groq (openai/gpt-oss-20b)
format_text() {
    local text="$1"
    local response

    debug_log "Starting LLM formatting..."
    local start_time=$(date +%s%3N)

    # Get corrections context from database
    local corrections_context=$(get_corrections_context)

    # Build full system prompt with corrections
    local full_prompt="$SYSTEM_PROMPT$corrections_context"
    debug_log "Using system prompt with ${#corrections_context} chars of corrections context"

    # Use jq to properly escape the text for JSON
    local json_payload
    json_payload=$(jq -n \
        --arg text "$text" \
        --arg prompt "$full_prompt" \
        '{
            "model": "openai/gpt-oss-20b",
            "messages": [
                {
                    "role": "system",
                    "content": $prompt
                },
                {
                    "role": "user",
                    "content": $text
                }
            ],
            "temperature": 0.1
        }')

    response=$(curl -s -X POST "https://api.groq.com/openai/v1/chat/completions" \
        -H "Authorization: Bearer $GROQ_API_KEY" \
        -H "Content-Type: application/json" \
        -d "$json_payload")

    local end_time=$(date +%s%3N)
    local duration=$((end_time - start_time))
    echo "$duration" > "${TIMING_FILE}-llm"
    debug_log "LLM completed in ${duration}ms"

    # Check for API errors
    if echo "$response" | jq -e '.error' >/dev/null 2>&1; then
        local error_msg=$(echo "$response" | jq -r '.error.message // "API Error"')
        debug_log "LLM ERROR: $error_msg"
        echo "$error_msg" > "${TIMING_FILE}-llm-error"
        echo ""
        return 1
    fi

    local result=$(echo "$response" | jq -r '.choices[0].message.content // empty')
    debug_log "LLM output: $result"

    # Extract the content from the response
    echo "$result"
}

# Function to paste text into focused window
type_text() {
    local text="$1"
    # Small delay to ensure focus returns to original window
    sleep 0.1
    # Copy to both clipboard and primary selection
    printf '%s' "$text" | xclip -selection clipboard -i
    printf '%s' "$text" | xclip -selection primary -i
    sleep 0.1
    # Shift+Insert works in terminals (uses primary selection)
    xdotool key --delay 50 shift+Insert
}

# Initialize timing variables
WHISPER_DURATION_MS=0
LLM_DURATION_MS=0
WHISPER_ERROR=""
LLM_ERROR=""

# Main toggle logic
if [[ -f "$STATE_FILE" ]]; then
    # Currently recording - stop and process
    PID=$(cat "$STATE_FILE")
    TOTAL_START=$(date +%s%3N)

    debug_log "=========================================="
    debug_log "Stopping recording and starting processing"

    # Stop recording
    kill "$PID" 2>/dev/null
    wait "$PID" 2>/dev/null
    rm -f "$STATE_FILE"

    update_tray "processing"
    notify "Processing..." "processing"

    # Check if audio file exists and has content
    if [[ ! -f "$AUDIO_FILE" ]] || [[ ! -s "$AUDIO_FILE" ]]; then
        debug_log "ERROR: No audio recorded or file empty"
        notify "No audio recorded" "idle"
        update_tray "idle"
        save_to_db "" "" 0 0 0 0 "No audio recorded"
        exit 1
    fi

    # Log audio file size
    audio_size=$(stat -f%z "$AUDIO_FILE" 2>/dev/null || stat -c%s "$AUDIO_FILE" 2>/dev/null)
    debug_log "Audio file size: ${audio_size} bytes"

    # Compress to ogg for fast upload
    debug_log "Compressing audio..."
    compress_audio
    compressed_size=$(stat -f%z "$AUDIO_COMPRESSED" 2>/dev/null || stat -c%s "$AUDIO_COMPRESSED" 2>/dev/null)
    debug_log "Compressed file size: ${compressed_size} bytes"

    # Transcribe
    transcript=$(transcribe)

    if [[ -z "$transcript" ]]; then
        debug_log "ERROR: Transcription failed or returned empty"
        notify "Transcription failed" "idle"
        update_tray "idle"
        rm -f "$AUDIO_FILE" "$AUDIO_COMPRESSED"
        TOTAL_END=$(date +%s%3N)
        TOTAL_MS=$((TOTAL_END - TOTAL_START))
        WHISPER_DURATION_MS=$(cat "${TIMING_FILE}-whisper" 2>/dev/null || echo "0")
        WHISPER_ERROR=$(cat "${TIMING_FILE}-whisper-error" 2>/dev/null || echo "Transcription returned empty")
        save_to_db "" "" "$WHISPER_DURATION_MS" 0 "$TOTAL_MS" 0 "$WHISPER_ERROR"
        rm -f "${TIMING_FILE}-whisper" "${TIMING_FILE}-whisper-error"
        exit 1
    fi

    # Format text
    formatted=$(format_text "$transcript")

    if [[ -z "$formatted" ]]; then
        # If formatting fails, use raw transcript
        debug_log "LLM formatting failed, using raw transcript"
        formatted="$transcript"
    fi

    # Type the result
    debug_log "Pasting text to focused window"
    type_text "$formatted"

    # Read timing from temp files (subshell workaround)
    WHISPER_DURATION_MS=$(cat "${TIMING_FILE}-whisper" 2>/dev/null || echo "0")
    LLM_DURATION_MS=$(cat "${TIMING_FILE}-llm" 2>/dev/null || echo "0")

    # Calculate total time
    TOTAL_END=$(date +%s%3N)
    TOTAL_MS=$((TOTAL_END - TOTAL_START))
    debug_log "Total processing time: ${TOTAL_MS}ms"
    debug_log "  - Whisper: ${WHISPER_DURATION_MS}ms"
    debug_log "  - LLM: ${LLM_DURATION_MS}ms"

    # Save to database
    save_to_db "$transcript" "$formatted" "$WHISPER_DURATION_MS" "$LLM_DURATION_MS" "$TOTAL_MS" 1 ""

    # Cleanup
    rm -f "$AUDIO_FILE" "$AUDIO_COMPRESSED" "${TIMING_FILE}-whisper" "${TIMING_FILE}-llm" "${TIMING_FILE}-whisper-error" "${TIMING_FILE}-llm-error"

    update_tray "idle"
    notify "Done!" "idle"
    debug_log "Processing complete!"
else
    # Start recording
    debug_log "=========================================="
    debug_log "Starting new recording"

    update_tray "recording"
    notify "Recording..." "recording"

    # Determine mic source
    if [[ -n "$MIC_SOURCE" ]]; then
        SOURCE_ARG="--target=$MIC_SOURCE"
        debug_log "Using mic source: $MIC_SOURCE"
    else
        SOURCE_ARG=""
        debug_log "Using default mic source"
    fi

    # Remove old audio file
    rm -f "$AUDIO_FILE"

    # Start recording in background (16kHz mono for smaller files)
    pw-record --rate 16000 --channels 1 $SOURCE_ARG "$AUDIO_FILE" &
    RECORD_PID=$!

    debug_log "Recording started with PID: $RECORD_PID"

    # Save PID to state file
    echo "$RECORD_PID" > "$STATE_FILE"
fi