fluistern/voice-input.sh at 6fd2792d711c06c3f9c92c33ff8b12572fbf0d90 · chukfinley/fluistern · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#!/bin/bash
# Voice Input - Toggle script
# Call once to start recording, call again to stop and transcribe

# Find the directory containing the real script file by following any chain
# of symlinks that leads to it.
SCRIPT_PATH="${BASH_SOURCE[0]}"
until [[ ! -L "$SCRIPT_PATH" ]]; do
    # Remember where this link lives so a relative target can be resolved
    SCRIPT_DIR="$(cd "$(dirname "$SCRIPT_PATH")" && pwd)"
    SCRIPT_PATH="$(readlink "$SCRIPT_PATH")"
    case "$SCRIPT_PATH" in
        /*) ;;                                        # absolute target: use as-is
        *) SCRIPT_PATH="$SCRIPT_DIR/$SCRIPT_PATH" ;;  # relative to the link's directory
    esac
done
SCRIPT_DIR="$(cd "$(dirname "$SCRIPT_PATH")" && pwd)"

# User configuration lives next to the script
ENV_FILE="$SCRIPT_DIR/.env"

# Fixed locations shared between the "start" and "stop" invocations
STATE_FILE="/tmp/voice-input-state"                 # PID of the running recorder
AUDIO_FILE="/tmp/voice-input-recording.wav"         # raw capture
AUDIO_COMPRESSED="/tmp/voice-input-recording.ogg"   # encoded copy that gets uploaded
PIPE_FILE="/tmp/voice-input-pipe"                   # FIFO that receives state updates

# Load config
. "$ENV_FILE"

# Defaults: notifications stay on unless the config sets another value
: "${NOTIFICATIONS:=true}"

# Publish a state change on the tray pipe.
# Arguments: $1 - state name (this script sends "recording", "processing"
#                 and "idle")
# Writes the line "state:<name>" to $PIPE_FILE, but only when a FIFO already
# exists at that path. The write is backgrounded, so a FIFO with no reader
# attached does not stall the caller.
update_tray() {
    [[ -p "$PIPE_FILE" ]] || return 0
    printf 'state:%s\n' "$1" > "$PIPE_FILE" 2>/dev/null &
}

# Show a desktop notification titled "Flüstern".
# Arguments: $1 - message body
#            $2 - icon name; resolved to $SCRIPT_DIR/icons/<name>.svg
# Does nothing unless NOTIFICATIONS is exactly "true".
notify() {
    [[ "$NOTIFICATIONS" == "true" ]] || return 0

    local icon_path="$SCRIPT_DIR/icons/$2.svg"
    notify-send "Flüstern" "$1" -i "$icon_path" -t 2000
}

# Re-encode $AUDIO_FILE into $AUDIO_COMPRESSED as 16 kHz mono Opus at 48k.
# ffmpeg's diagnostics are discarded; its exit status is the function's
# return value.
compress_audio() {
    local -a encode_opts=(-ar 16000 -ac 1 -c:a libopus -b:a 48k)

    # -y: overwrite an existing output file without prompting
    ffmpeg -y -i "$AUDIO_FILE" "${encode_opts[@]}" "$AUDIO_COMPRESSED" 2>/dev/null
}

# Upload the compressed recording to Groq's transcription endpoint and print
# the recognised text.
# Globals:  GROQ_API_KEY     (read) bearer token for the API
#           AUDIO_COMPRESSED (read) path of the file to upload
#           LANGUAGE         (read, optional) sent as the "language" form field
# Outputs:  the transcript on stdout; nothing useful on failure
# Returns:  0 on success, 1 on a transport failure or an API error reply
transcribe() {
    local response error_msg
    local -a lang_args=()

    # Keep the optional field in an array so its value is passed as exactly
    # one argument, without word splitting or glob expansion.
    if [[ -n "$LANGUAGE" ]]; then
        lang_args=(-F "language=$LANGUAGE")
    fi

    # curl exits non-zero when the request itself could not be completed
    # (network error, unreadable upload file, ...); there is no reply to parse.
    if ! response=$(curl -s -X POST "https://api.groq.com/openai/v1/audio/transcriptions" \
        -H "Authorization: Bearer $GROQ_API_KEY" \
        -F "file=@$AUDIO_COMPRESSED" \
        -F "model=whisper-large-v3-turbo" \
        -F "response_format=json" \
        "${lang_args[@]}"); then
        echo ""
        return 1
    fi

    # Check for API errors
    if jq -e '.error' >/dev/null 2>&1 <<<"$response"; then
        # Declared separately from the assignment so a jq failure is not
        # masked; fall back to a generic text when no message can be read.
        error_msg=$(jq -r '.error.message // "API Error"' 2>/dev/null <<<"$response")
        notify "Error: ${error_msg:-API Error}" "idle"
        echo ""
        return 1
    fi

    # Extract text from JSON response
    jq -r '.text // empty' <<<"$response"
}

# Send a raw transcript through Groq's chat-completions endpoint (model
# openai/gpt-oss-20b) to add punctuation and fix capitalization.
# Arguments: $1 - transcript text
# Globals:   GROQ_API_KEY (read)
# Outputs:   the formatted text on stdout; nothing useful when the API
#            reports an error
# Returns:   1 when the reply carries an error, otherwise the status of the
#            final jq extraction
format_text() {
    local raw_text="$1"
    local system_prompt='You are a dictation formatter. Add proper punctuation (periods, commas, question marks) and fix capitalization (sentence starts, proper nouns). Do NOT add any markdown, asterisks, bold, or formatting. Output the plain corrected text only, nothing else.'
    local request_body api_reply

    # Let jq assemble the request so both strings are JSON-escaped correctly
    request_body=$(jq -n \
        --arg system "$system_prompt" \
        --arg text "$raw_text" \
        '{
            model: "openai/gpt-oss-20b",
            messages: [
                {role: "system", content: $system},
                {role: "user", content: $text}
            ],
            temperature: 0.1
        }')

    api_reply=$(curl -s -X POST "https://api.groq.com/openai/v1/chat/completions" \
        -H "Authorization: Bearer $GROQ_API_KEY" \
        -H "Content-Type: application/json" \
        -d "$request_body")

    # A reply with a truthy .error member counts as a failure
    if echo "$api_reply" | jq -e '.error' >/dev/null 2>&1; then
        echo ""
        return 1
    fi

    # First choice's message content; prints nothing when it is missing
    echo "$api_reply" | jq -r '.choices[0].message.content // empty'
}

# Insert text into the focused window by loading it into the X selections
# and sending a paste keystroke.
# Arguments: $1 - text to paste
type_text() {
    local payload="$1"
    local selection

    # Brief pause so focus can return to the original window
    sleep 0.1

    # Fill both the clipboard and the primary selection with the same text
    for selection in clipboard primary; do
        printf '%s' "$payload" | xclip -selection "$selection" -i
    done

    sleep 0.1

    # Shift+Insert works in terminals (uses primary selection)
    xdotool key --delay 50 shift+Insert
}

# Main toggle logic: an existing $STATE_FILE means a recording is in progress,
# so this invocation stops it and processes the audio; otherwise it starts one.
if [[ -f "$STATE_FILE" ]]; then
    # Currently recording - stop and process
    PID=$(<"$STATE_FILE")
    # Drop the state file right away so a further key press during processing
    # does not try to stop the same recording a second time.
    rm -f "$STATE_FILE"

    # Stop recording. Only signal a well-formed PID so an empty or corrupt
    # state file cannot hand garbage to kill.
    if [[ "$PID" =~ ^[0-9]+$ ]]; then
        kill "$PID" 2>/dev/null
        # The recorder was launched by an earlier invocation of this script,
        # so it is not a child of this shell and `wait` cannot block on it.
        # Poll until it has exited (at most ~5 s) to give it time to finish
        # writing the audio file before that file is read.
        for _ in {1..50}; do
            kill -0 "$PID" 2>/dev/null || break
            sleep 0.1
        done
    fi

    update_tray "processing"
    notify "Processing..." "processing"

    # Check that the audio file exists and has content (-s covers both)
    if [[ ! -s "$AUDIO_FILE" ]]; then
        notify "No audio recorded" "idle"
        update_tray "idle"
        exit 1
    fi

    # Compress to Opus/Ogg for fast upload. Remove any leftover output first
    # and stop if encoding fails, so an old recording is never uploaded.
    rm -f "$AUDIO_COMPRESSED"
    if ! compress_audio || [[ ! -s "$AUDIO_COMPRESSED" ]]; then
        notify "Audio compression failed" "idle"
        update_tray "idle"
        rm -f "$AUDIO_FILE" "$AUDIO_COMPRESSED"
        exit 1
    fi

    # Transcribe
    transcript=$(transcribe)

    if [[ -z "$transcript" ]]; then
        notify "Transcription failed" "idle"
        update_tray "idle"
        rm -f "$AUDIO_FILE" "$AUDIO_COMPRESSED"
        exit 1
    fi

    # Format text
    formatted=$(format_text "$transcript")

    if [[ -z "$formatted" ]]; then
        # If formatting fails, use raw transcript
        formatted="$transcript"
    fi

    # Type the result
    type_text "$formatted"

    # Cleanup
    rm -f "$AUDIO_FILE" "$AUDIO_COMPRESSED"

    update_tray "idle"
    notify "Done!" "idle"
else
    # Start recording
    update_tray "recording"
    notify "Recording..." "recording"

    # Recorder options (16kHz mono for smaller files). Kept in an array so
    # the optional mic source is passed as a single argument.
    RECORD_ARGS=(--rate 16000 --channels 1)
    if [[ -n "$MIC_SOURCE" ]]; then
        RECORD_ARGS+=("--target=$MIC_SOURCE")
    fi

    # Remove old audio file
    rm -f "$AUDIO_FILE"

    # Start recording in background
    pw-record "${RECORD_ARGS[@]}" "$AUDIO_FILE" &
    RECORD_PID=$!

    # Save PID to state file so the next invocation can stop the recorder
    echo "$RECORD_PID" > "$STATE_FILE"
fi