|
| 1 | +#!/usr/bin/env python3 |
| 2 | +import os |
| 3 | +import sys |
| 4 | +import subprocess |
| 5 | +import datetime |
| 6 | +import getpass |
| 7 | +import json |
| 8 | +import urllib.request |
| 9 | +import urllib.error |
| 10 | +import tempfile |
| 11 | +import shutil |
| 12 | + |
def run_command(command, shell=False, timeout=30):
    """Run a command and return its combined output formatted for the report.

    Args:
        command: An argument sequence (executed directly) or a single string.
            A string is always executed through the shell, whatever the value
            of ``shell``.
        shell: Forwarded to ``subprocess.run`` for non-string commands.
        timeout: Seconds to wait before the command is abandoned.

    Returns:
        A string made of a ``$ <command>`` line, the combined stdout/stderr and
        a trailing blank line.  When the command exits with a non-zero status
        an ``[exit code: N]`` line is appended.  Any failure (missing binary,
        timeout, ...) is returned as an ``Error running command ...`` string
        rather than raised, so one broken tool cannot abort the whole report.
    """
    # Fallback display value so the error path always has something printable.
    cmd_str = str(command)
    try:
        if isinstance(command, str):
            # A plain string needs the shell to be split into arguments.
            shell = True
        else:
            cmd_str = " ".join(str(part) for part in command)

        print(f"Running: {cmd_str}...")
        result = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            shell=shell,
            universal_newlines=True,
            # Log tails may contain arbitrary bytes; never lose the whole
            # output because of one undecodable character.
            errors="replace",
            timeout=timeout,
        )
    except subprocess.TimeoutExpired as e:
        # Keep whatever was captured before the timeout; depending on the
        # platform/Python version it may still be raw bytes.
        partial = e.output or ""
        if isinstance(partial, bytes):
            partial = partial.decode("utf-8", errors="replace")
        message = f"Error running command '{cmd_str}': {e}\n"
        if partial:
            message += f"{partial}\n"
        return message
    except Exception as e:
        # Deliberately broad: the report is best-effort and must keep going.
        return f"Error running command '{cmd_str}': {e}\n"

    report = f"$ {cmd_str}\n{result.stdout}\n"
    if result.returncode != 0:
        # A failing command is exactly what a troubleshooting report needs
        # to highlight, so make the status explicit.
        report += f"[exit code: {result.returncode}]\n"
    return report
| 36 | + |
def get_nomad_allocations(url=None, timeout=10):
    """Fetch the allocation list from the Nomad HTTP API.

    Args:
        url: Full URL of the allocations endpoint.  When omitted it is built
            from the ``NOMAD_ADDR`` environment variable (the same variable the
            ``nomad`` CLI honours), falling back to ``http://localhost:4646``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON list of allocations, or ``[]`` when the request
        fails, the status is not 200, or the payload is not a list.  Errors
        are printed, never raised.
    """
    try:
        if url is None:
            base = os.environ.get("NOMAD_ADDR") or "http://localhost:4646"
            url = base.rstrip("/") + "/v1/allocations"

        request = urllib.request.Request(url)
        # ACL-enabled clusters reject anonymous requests; reuse the CLI token.
        token = os.environ.get("NOMAD_TOKEN")
        if token:
            request.add_header("X-Nomad-Token", token)

        # Without a timeout an unresponsive agent would hang the report.
        with urllib.request.urlopen(request, timeout=timeout) as response:
            if response.status == 200:
                payload = json.loads(response.read().decode('utf-8'))
                if isinstance(payload, list):
                    return payload
                print("Error fetching allocations: unexpected response payload")
    except Exception as e:
        # Best-effort: the caller treats an empty list as "no data available".
        print(f"Error fetching allocations: {e}")
    return []
| 47 | + |
def section(title):
    """Return a banner for *title*: an 80-character rule above and below it."""
    rule = "=" * 80
    return "\n{0}\n {1}\n{0}\n".format(rule, title)
| 50 | + |
def _run_alloc_analyzer(f, allocs, analyze_script):
    """Dump *allocs* to a temporary JSON file and run the analyzer script on it."""
    if not os.path.exists(analyze_script):
        f.write(f"Script not found: {analyze_script}\n")
        return

    fd, temp_path = tempfile.mkstemp(suffix=".json")
    try:
        with os.fdopen(fd, 'w') as tmp:
            json.dump(allocs, tmp)
        f.write(run_command([sys.executable, analyze_script, temp_path]))
    finally:
        # The dump is only needed while the analyzer runs.
        os.remove(temp_path)


def _write_failed_alloc_logs(f, allocs, limit=5):
    """Write stderr tails for the *limit* most recently modified failed allocations."""
    failed_allocs = [
        a for a in allocs
        if isinstance(a, dict) and a.get('ClientStatus') == 'failed'
    ]
    # `or 0` also covers a key that is present but null.
    failed_allocs.sort(key=lambda a: a.get('ModifyTime') or 0, reverse=True)
    top_failures = failed_allocs[:limit]

    if not top_failures:
        f.write("No failed allocations found.\n")
        return

    for alloc in top_failures:
        alloc_id = alloc.get('ID')
        job_id = alloc.get('JobID')
        if not alloc_id:
            # Without an ID there is nothing to ask `nomad alloc logs` for.
            f.write(f"\n--- Skipping allocation without an ID (Job: {job_id}) ---\n")
            continue
        alloc_id = str(alloc_id)
        short_id = alloc_id[:8]

        f.write(f"\n--- Logs for Allocation {short_id} (Job: {job_id}) ---\n")

        # `.get(key, {})` does not help when the key is present but null,
        # which would make `.items()` abort the whole report.
        task_states = alloc.get('TaskStates') or {}
        for task_name, state in task_states.items():
            state = state or {}
            if state.get('Failed') or state.get('State') == 'dead':
                f.write(f"Task: {task_name} (State: {state.get('State')})\n")
                f.write(f"Fetching stderr for task '{task_name}'...\n")
                f.write(run_command(
                    ["nomad", "alloc", "logs", "-stderr", "-tail", "-n", "100",
                     alloc_id, task_name]
                ))


def main():
    """Generate a timestamped troubleshooting report in the current directory.

    The report collects host resource usage, Docker, Consul and Nomad status,
    the output of the sibling ``prune_consul_services.py`` (dry run) and
    ``analyze_nomad_allocs.py`` scripts when they exist next to this file, and
    the last 100 stderr lines of tasks in the five most recently modified
    failed Nomad allocations.
    """
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")
    report_file = f"troubleshoot_report_{timestamp}.txt"

    print(f"Generating troubleshooting report: {report_file}")

    # Explicit encoding: command output may contain non-ASCII characters that
    # the locale's default encoding cannot represent.
    with open(report_file, 'w', encoding='utf-8', errors='replace') as f:
        # Header
        f.write(section("TROUBLESHOOTING REPORT"))
        f.write(f"Date: {datetime.datetime.now()}\n")
        f.write(f"Hostname: {os.uname().nodename}\n")
        try:
            user = getpass.getuser()
        except Exception:
            user = os.environ.get('USER', 'unknown')
        f.write(f"User: {user}\n")

        # System Info
        f.write(section("SYSTEM RESOURCES"))
        f.write(run_command("uptime"))
        f.write(run_command("free -h"))
        f.write(run_command("df -h"))

        # Docker
        f.write(section("DOCKER STATUS"))
        f.write(run_command("docker ps -a"))

        # Consul
        f.write(section("CONSUL STATUS"))
        f.write(run_command("consul members"))
        f.write(run_command("consul catalog services"))

        f.write("\n--- Stale Services Analysis ---\n")
        # Run prune_consul_services.py in dry-run mode
        script_dir = os.path.dirname(os.path.abspath(__file__))
        prune_script = os.path.join(script_dir, "prune_consul_services.py")
        if os.path.exists(prune_script):
            f.write(run_command([sys.executable, prune_script, "--dry-run"]))
        else:
            f.write(f"Script not found: {prune_script}\n")

        # Nomad
        f.write(section("NOMAD STATUS"))
        f.write(run_command("nomad server members"))
        f.write(run_command("nomad node status"))
        f.write(run_command("nomad job status"))

        # Nomad Analysis
        f.write(section("NOMAD JOB ANALYSIS"))
        analyze_script = os.path.join(script_dir, "analyze_nomad_allocs.py")

        allocs = get_nomad_allocations()
        if allocs:
            _run_alloc_analyzer(f, allocs, analyze_script)

            # Enhanced Log Capture for Failed Allocs
            f.write(section("RECENT FAILED ALLOCATION LOGS"))
            _write_failed_alloc_logs(f, allocs)
        else:
            f.write("Could not retrieve allocations for analysis.\n")

    print(f"Report generation complete. Output saved to: {report_file}")
| 151 | + |
# Generate the report only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
0 commit comments