Skip to content

Commit bc6fa29

Browse files
Add scripts/troubleshoot.py for system state debugging
This script gathers system resources, Docker/Consul/Nomad status, and logs from failed Nomad allocations into a single report file. It integrates existing analysis scripts for a comprehensive view of the system state. Co-authored-by: LokiMetaSmith <5054116+LokiMetaSmith@users.noreply.github.com>
1 parent 90c659b commit bc6fa29

File tree

1 file changed

+153
-0
lines changed

1 file changed

+153
-0
lines changed

scripts/troubleshoot.py

Lines changed: 153 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,153 @@
1+
#!/usr/bin/env python3
2+
import os
3+
import sys
4+
import subprocess
5+
import datetime
6+
import getpass
7+
import json
8+
import urllib.request
9+
import urllib.error
10+
import tempfile
11+
import shutil
12+
13+
def run_command(command, shell=False, timeout=30):
    """Run a command and return its combined stdout/stderr as report text.

    Args:
        command: An argument list (executed directly) or a string. A string
            is always executed through the shell, whatever *shell* says.
        shell: Forwarded to ``subprocess.run`` for list commands.
        timeout: Seconds to wait before giving up on the command.

    Returns:
        A text block made of a ``$ <command>`` line followed by the captured
        output. A trailing ``[exit code: N]`` line is added when the command
        exits non-zero. On timeout, whatever output was captured so far is
        kept and followed by the error line. Any other failure yields a
        single ``Error running command ...`` line. This function never
        raises: it is a best-effort collector for a diagnostic report.
    """
    if isinstance(command, list):
        cmd_str = " ".join(map(str, command))
    else:
        cmd_str = command
        shell = True

    print(f"Running: {cmd_str}...")
    try:
        result = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            shell=shell,
            universal_newlines=True,
            # Logs may contain bytes that are invalid in the locale encoding;
            # substitute them instead of losing the entire output.
            errors="replace",
            timeout=timeout,
        )
    except subprocess.TimeoutExpired as e:
        partial = e.output or ""
        # The partial output can be raw bytes even in text mode.
        if isinstance(partial, bytes):
            partial = partial.decode("utf-8", errors="replace")
        return f"$ {cmd_str}\n{partial}\nError running command '{command}': {e}\n"
    except Exception as e:
        # Deliberately broad: one failing command must not abort the report.
        return f"Error running command '{command}': {e}\n"

    report = f"$ {cmd_str}\n{result.stdout}\n"
    if result.returncode != 0:
        report += f"[exit code: {result.returncode}]\n"
    return report
36+
37+
def get_nomad_allocations(url=None, timeout=10):
    """Fetch the allocation list from the Nomad HTTP API.

    Args:
        url: Full URL of the allocations endpoint. When omitted it is built
            from the ``NOMAD_ADDR`` environment variable (the same variable
            the ``nomad`` CLI honours), falling back to
            ``http://localhost:4646``.
        timeout: Socket timeout in seconds, so an unresponsive agent cannot
            hang the report.

    Returns:
        The decoded JSON list of allocations, or ``[]`` when the request
        fails, the status is not 200, or the body is not a JSON list.
        Errors are printed, never raised.
    """
    if url is None:
        base = os.environ.get("NOMAD_ADDR") or "http://localhost:4646"
        url = base.rstrip("/") + "/v1/allocations"

    headers = {}
    token = os.environ.get("NOMAD_TOKEN")
    if token:
        # ACL-enabled clusters reject unauthenticated requests.
        headers["X-Nomad-Token"] = token

    try:
        request = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(request, timeout=timeout) as response:
            if response.status == 200:
                allocations = json.loads(response.read().decode('utf-8'))
                if isinstance(allocations, list):
                    return allocations
                print("Error fetching allocations: unexpected response format")
    except Exception as e:
        # Deliberately broad: the report should continue without allocations.
        print(f"Error fetching allocations: {e}")
    return []
47+
48+
def section(title):
    """Return *title* framed by two 80-character ``=`` rules as a report heading."""
    rule = "=" * 80
    # Leading and trailing empty items yield the surrounding newlines.
    return "\n".join(["", rule, f" {title}", rule, ""])
50+
51+
def _write_failed_alloc_logs(f, allocs, limit=5):
    """Write stderr tails for the *limit* most recently modified failed allocations to *f*."""
    failed_allocs = [a for a in allocs if a.get('ClientStatus') == 'failed']
    # `or 0` keeps the sort working when ModifyTime is missing or null.
    failed_allocs.sort(key=lambda a: a.get('ModifyTime') or 0, reverse=True)
    top_failures = failed_allocs[:limit]

    if not top_failures:
        f.write("No failed allocations found.\n")
        return

    for alloc in top_failures:
        alloc_id = alloc.get('ID')
        job_id = alloc.get('JobID')
        if not alloc_id:
            # Without an ID there is nothing to pass to `nomad alloc logs`.
            f.write(f"\n--- Skipping allocation without an ID (Job: {job_id}) ---\n")
            continue
        short_id = alloc_id[:8]

        f.write(f"\n--- Logs for Allocation {short_id} (Job: {job_id}) ---\n")

        # The key may be present with a null value, so a `.get` default
        # alone is not enough.
        task_states = alloc.get('TaskStates') or {}
        for task_name, state in task_states.items():
            if state.get('Failed') or state.get('State') == 'dead':
                f.write(f"Task: {task_name} (State: {state.get('State')})\n")
                f.write(f"Fetching stderr for task '{task_name}'...\n")
                f.write(run_command(["nomad", "alloc", "logs", "-stderr", "-tail", "-n", "100", alloc_id, task_name]))


def main():
    """Write a timestamped troubleshooting report into the current directory.

    The report contains, in order: a header (date, hostname, user), system
    resource commands, Docker status, Consul status plus a dry run of
    ``prune_consul_services.py``, Nomad status, the output of
    ``analyze_nomad_allocs.py`` over the current allocations, and stderr
    tails of the most recent failed allocations. Both helper scripts are
    looked up next to this file and reported as missing if absent.
    """
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")
    report_file = f"troubleshoot_report_{timestamp}.txt"

    print(f"Generating troubleshooting report: {report_file}")

    # Explicit UTF-8 so command output with non-ASCII characters cannot
    # abort the report under a restrictive locale.
    with open(report_file, 'w', encoding='utf-8', errors='replace') as f:
        # Header
        f.write(section("TROUBLESHOOTING REPORT"))
        f.write(f"Date: {datetime.datetime.now()}\n")
        f.write(f"Hostname: {os.uname().nodename}\n")
        try:
            user = getpass.getuser()
        except Exception:
            user = os.environ.get('USER', 'unknown')
        f.write(f"User: {user}\n")

        # System Info
        f.write(section("SYSTEM RESOURCES"))
        f.write(run_command("uptime"))
        f.write(run_command("free -h"))
        f.write(run_command("df -h"))

        # Docker
        f.write(section("DOCKER STATUS"))
        f.write(run_command("docker ps -a"))

        # Consul
        f.write(section("CONSUL STATUS"))
        f.write(run_command("consul members"))
        f.write(run_command("consul catalog services"))

        f.write("\n--- Stale Services Analysis ---\n")
        # Run prune_consul_services.py in dry-run mode
        script_dir = os.path.dirname(os.path.abspath(__file__))
        prune_script = os.path.join(script_dir, "prune_consul_services.py")
        if os.path.exists(prune_script):
            f.write(run_command([sys.executable, prune_script, "--dry-run"]))
        else:
            f.write(f"Script not found: {prune_script}\n")

        # Nomad
        f.write(section("NOMAD STATUS"))
        f.write(run_command("nomad server members"))
        f.write(run_command("nomad node status"))
        f.write(run_command("nomad job status"))

        # Nomad Analysis
        f.write(section("NOMAD JOB ANALYSIS"))
        analyze_script = os.path.join(script_dir, "analyze_nomad_allocs.py")

        allocs = get_nomad_allocations()
        if allocs:
            # The analyzer script reads allocations from a JSON file path.
            fd, temp_path = tempfile.mkstemp(suffix=".json")
            try:
                with os.fdopen(fd, 'w') as tmp:
                    json.dump(allocs, tmp)

                if os.path.exists(analyze_script):
                    f.write(run_command([sys.executable, analyze_script, temp_path]))
                else:
                    f.write(f"Script not found: {analyze_script}\n")
            finally:
                os.remove(temp_path)

            # Enhanced Log Capture for Failed Allocs
            f.write(section("RECENT FAILED ALLOCATION LOGS"))
            _write_failed_alloc_logs(f, allocs)
        else:
            f.write("Could not retrieve allocations for analysis.\n")

    print(f"Report generation complete. Output saved to: {report_file}")
151+
152+
# Generate the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)