-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvoice-input.sh
More file actions
executable file
·195 lines (162 loc) · 5.44 KB
/
voice-input.sh
File metadata and controls
executable file
·195 lines (162 loc) · 5.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#!/bin/bash
# Voice Input - Toggle script
# Call once to start recording, call again to stop and transcribe
# Work out where the real script file lives. BASH_SOURCE[0] may be a symlink
# (or a chain of symlinks), so each link is followed until a non-link path is
# reached. A relative link target is interpreted relative to the directory
# that contains the link.
SCRIPT_PATH="${BASH_SOURCE[0]}"
until [[ ! -L "$SCRIPT_PATH" ]]; do
  SCRIPT_DIR="$(cd "$(dirname "$SCRIPT_PATH")" && pwd)"
  SCRIPT_PATH="$(readlink "$SCRIPT_PATH")"
  case "$SCRIPT_PATH" in
    /*) ;;                                        # absolute target: use as-is
    *) SCRIPT_PATH="$SCRIPT_DIR/$SCRIPT_PATH" ;;  # relative target: anchor it
  esac
done
# Absolute directory of the resolved script; the .env file and the icons are
# looked up relative to it.
SCRIPT_DIR="$(cd "$(dirname "$SCRIPT_PATH")" && pwd)"
# Runtime files used by the script. They are assigned before the .env file is
# sourced, so a value set in .env replaces the one given here.
STATE_FILE="/tmp/voice-input-state"                # recorder PID while a recording is running
PIPE_FILE="/tmp/voice-input-pipe"                  # named pipe that receives "state:<name>" lines
AUDIO_FILE="/tmp/voice-input-recording.wav"        # raw recording
AUDIO_COMPRESSED="/tmp/voice-input-recording.ogg"  # compressed copy that is uploaded

# User configuration lives in a .env file next to the resolved script.
# GROQ_API_KEY, LANGUAGE, MIC_SOURCE and NOTIFICATIONS are read later but never
# assigned in this script, so they are presumably expected to come from this
# file or from the environment.
ENV_FILE="$SCRIPT_DIR/.env"
source "$ENV_FILE"

# Notifications stay enabled unless NOTIFICATIONS was set to something else.
: "${NOTIFICATIONS:=true}"
# Report a state change through the tray pipe.
# Globals:   PIPE_FILE (read)
# Arguments: $1 - state name; written to the pipe as the line "state:<name>"
# Returns:   0
# Nothing is written unless PIPE_FILE is an existing named pipe. The write is
# started in the background, so this function does not wait for a reader.
update_tray() {
  local new_state="$1"
  [[ -p "$PIPE_FILE" ]] || return 0
  printf 'state:%s\n' "$new_state" > "$PIPE_FILE" 2>/dev/null &
}
# Show a desktop notification via notify-send with a 2000 ms timeout.
# Globals:   NOTIFICATIONS (read) - anything other than "true" silences output
#            SCRIPT_DIR (read)    - icons are taken from $SCRIPT_DIR/icons
# Arguments: $1 - message body
#            $2 - icon name, without directory or ".svg" suffix
# Returns:   notify-send's exit status, or 0 when notifications are disabled
notify() {
  local message="$1"
  local icon_name="$2"
  [[ "$NOTIFICATIONS" == "true" ]] || return 0
  notify-send "Flüstern" "$message" -i "${SCRIPT_DIR}/icons/${icon_name}.svg" -t 2000
}
# Re-encode the recording with the Opus codec so the upload is small.
# Globals: AUDIO_FILE (read)       - input file
#          AUDIO_COMPRESSED (read) - output path; an existing file is overwritten
# Returns: ffmpeg's exit status; ffmpeg's stderr output is discarded
compress_audio() {
  local -a encode_opts=(
    -ar 16000      # 16 kHz sample rate
    -ac 1          # mono
    -c:a libopus   # Opus encoder
    -b:a 48k       # 48 kbit/s audio bitrate
  )
  ffmpeg -y -i "$AUDIO_FILE" "${encode_opts[@]}" "$AUDIO_COMPRESSED" 2>/dev/null
}
# Upload the compressed recording to the Groq transcription endpoint
# (model whisper-large-v3-turbo) and print the recognised text.
# Globals:   GROQ_API_KEY (read)     - sent as a Bearer token
#            AUDIO_COMPRESSED (read) - file that is uploaded
#            LANGUAGE (read)         - optional; sent as the "language" form field
# Outputs:   the transcript on stdout; an empty line on failure
# Returns:   1 if curl fails or the API response contains an "error" member
#            (its message is also shown through notify); otherwise the exit
#            status of the final jq extraction
# NOTE(review): the API key is passed on curl's command line, so it is visible
# in the process list while the request is running.
transcribe() {
  local response
  local error_msg
  # Optional form fields are kept in an array so a LANGUAGE value always
  # reaches curl as exactly one argument (no word splitting, no globbing).
  local -a lang_args=()

  if [[ -n "${LANGUAGE:-}" ]]; then
    lang_args=(-F "language=$LANGUAGE")
  fi

  if ! response=$(curl -s -X POST "https://api.groq.com/openai/v1/audio/transcriptions" \
    -H "Authorization: Bearer $GROQ_API_KEY" \
    -F "file=@$AUDIO_COMPRESSED" \
    -F "model=whisper-large-v3-turbo" \
    -F "response_format=json" \
    "${lang_args[@]}"); then
    # curl itself failed (network problem, unreadable upload file, ...).
    echo ""
    return 1
  fi

  # Check for API errors
  if jq -e '.error' <<<"$response" >/dev/null 2>&1; then
    # Declaration and assignment are separate so a jq failure is not masked;
    # fall back to a generic message when none can be extracted.
    error_msg=$(jq -r '.error.message // "API Error"' <<<"$response" 2>/dev/null) \
      || error_msg="API Error"
    notify "Error: ${error_msg:-API Error}" "idle"
    echo ""
    return 1
  fi

  # Extract text from JSON response (prints nothing when .text is absent)
  jq -r '.text // empty' <<<"$response"
}
# Send a raw transcript to the Groq chat-completions endpoint (model
# openai/gpt-oss-20b) with a system prompt asking for punctuation and
# capitalization fixes, and print the model's reply.
# Globals:   GROQ_API_KEY (read) - sent as a Bearer token
# Arguments: $1 - raw transcript
# Outputs:   the reply text on stdout; an empty line if the response contains
#            an "error" member
# Returns:   1 on API error, otherwise the exit status of the final jq call
format_text() {
  local raw_text="$1"
  local request_body
  local api_reply
  local system_prompt="You are a dictation formatter. Add proper punctuation (periods, commas, question marks) and fix capitalization (sentence starts, proper nouns). Do NOT add any markdown, asterisks, bold, or formatting. Output the plain corrected text only, nothing else."

  # jq assembles the request so that every string is JSON-escaped correctly.
  request_body=$(jq -n \
    --arg model "openai/gpt-oss-20b" \
    --arg system "$system_prompt" \
    --arg text "$raw_text" \
    '{
      model: $model,
      messages: [
        {role: "system", content: $system},
        {role: "user", content: $text}
      ],
      temperature: 0.1
    }')

  api_reply=$(curl -s -X POST "https://api.groq.com/openai/v1/chat/completions" \
    -H "Authorization: Bearer $GROQ_API_KEY" \
    -H "Content-Type: application/json" \
    -d "$request_body")

  # An "error" member in the response means the request was rejected.
  if jq -e '.error' <<<"$api_reply" >/dev/null 2>&1; then
    echo ""
    return 1
  fi

  # First choice's message content; prints nothing when it is missing.
  jq -r '.choices[0].message.content // empty' <<<"$api_reply"
}
# Paste text into the currently focused window.
# Arguments: $1 - text to paste
# Returns:   xdotool's exit status
# The text is placed on both X selections and then Shift+Insert is sent.
type_text() {
  local payload="$1"
  local selection

  # Short pause so focus can return to the original window.
  sleep 0.1

  # Fill the clipboard and the primary selection with the same text.
  for selection in clipboard primary; do
    printf '%s' "$payload" | xclip -selection "$selection" -i
  done
  sleep 0.1

  # Shift+Insert works in terminals (uses primary selection).
  xdotool key --delay 50 shift+Insert
}
# Main toggle logic.
# STATE_FILE holds the recorder's PID while a recording is running, and its
# presence selects the mode:
#   - present: stop the recorder, then transcribe, format and paste the text
#   - absent:  start a new background recording and remember its PID

# Block until the process with the given PID no longer exists.
# Arguments: $1 - PID to watch
#            $2 - maximum number of 50 ms polls (default 100, about 5 s)
# Returns:   0 once the process is gone, 1 if it is still alive afterwards
# The `wait` builtin cannot be used here: the recorder was started by an
# earlier invocation of this script, and a shell can only wait for its own
# children.
wait_for_pid_exit() {
  local pid="$1"
  local polls_left="${2:-100}"
  while kill -0 "$pid" 2>/dev/null; do
    (( polls_left-- > 0 )) || return 1
    sleep 0.05
  done
  return 0
}

if [[ -f "$STATE_FILE" ]]; then
  # Currently recording - stop and process
  PID=$(cat "$STATE_FILE")

  # Stop recording. Only a positive integer is accepted as a PID, so a corrupt
  # state file cannot turn into `kill` with arbitrary arguments (or `kill 0`).
  if [[ "$PID" =~ ^[1-9][0-9]*$ ]]; then
    kill "$PID" 2>/dev/null
    # Let the recorder exit before the audio file is read; presumably it
    # finishes writing the file on the way out. Continue even on timeout.
    wait_for_pid_exit "$PID"
  fi
  rm -f "$STATE_FILE"

  update_tray "processing"
  notify "Processing..." "processing"

  # Check if audio file exists and has content
  if [[ ! -f "$AUDIO_FILE" || ! -s "$AUDIO_FILE" ]]; then
    notify "No audio recorded" "idle"
    update_tray "idle"
    exit 1
  fi

  # Compress to Opus/Ogg for fast upload; without the compressed file there is
  # nothing to send, so stop here instead of attempting the upload.
  if ! compress_audio; then
    notify "Audio compression failed" "idle"
    update_tray "idle"
    rm -f "$AUDIO_FILE" "$AUDIO_COMPRESSED"
    exit 1
  fi

  # Transcribe
  transcript=$(transcribe)
  if [[ -z "$transcript" ]]; then
    notify "Transcription failed" "idle"
    update_tray "idle"
    rm -f "$AUDIO_FILE" "$AUDIO_COMPRESSED"
    exit 1
  fi

  # Format text; if formatting fails or returns nothing, use the raw transcript
  formatted=$(format_text "$transcript")
  if [[ -z "$formatted" ]]; then
    formatted="$transcript"
  fi

  # Type the result
  type_text "$formatted"

  # Cleanup
  rm -f "$AUDIO_FILE" "$AUDIO_COMPRESSED"
  update_tray "idle"
  notify "Done!" "idle"
else
  # Start recording
  update_tray "recording"
  notify "Recording..." "recording"

  # Determine mic source. An array keeps a MIC_SOURCE containing spaces or
  # glob characters as a single argument, and expands to nothing when unset.
  SOURCE_ARG=()
  if [[ -n "${MIC_SOURCE:-}" ]]; then
    SOURCE_ARG=("--target=$MIC_SOURCE")
  fi

  # Remove old audio file
  rm -f "$AUDIO_FILE"

  # Start recording in background (16kHz mono for smaller files)
  pw-record --rate 16000 --channels 1 "${SOURCE_ARG[@]}" "$AUDIO_FILE" &
  RECORD_PID=$!

  # Save PID to state file so the next invocation can stop the recorder
  echo "$RECORD_PID" > "$STATE_FILE"
fi