-
Notifications
You must be signed in to change notification settings - Fork 538
Description
What's wrong?
Hello,
I'm running Alloy 1.8.2 (and have tried previous versions while debugging). On 2 of 8 nodes, Alloy gets stuck after a few minutes, and it does not recover even after a few hours of runtime.
There are no error messages in the logs. But logs are still generated, for example:
ts=2025-04-26T14:25:41.171363493Z level=info msg="skipping update of position for a file which does not currently exist" component_path=/ component_id=loki.source.file.pods component=tailer
Creation of the support package hangs indefinitely.
Requests to some endpoints hang indefinitely, for example /api/v0/web/components. Edit: Config reload hangs as well.
Attached is a goroutines and profile pprof extract of the affected node. profiles.zip
Alloy is running as daemonset, collecting local logs from files directly.
Please advise for further debug steps.
Steps to reproduce
Unclear; it happens on 2 of 8 nodes, probably due to some environmental side effect.
System information
Rocky Linux 9.4 5.14.0-427.24.1.el9_4.x86_64
Software version
Grafana Alloy 1.8.2
Configuration
// Discover pods through the Kubernetes API.
discovery.kubernetes "pods" {
  role = "pod"

  // Only keep pods scheduled on the current node (Alloy runs as a DaemonSet).
  // The node name comes from the HOSTNAME environment variable, with
  // constants.hostname as the fallback.
  // NOTE(review): this assumes HOSTNAME holds the node name, not the pod
  // name — confirm against the DaemonSet spec.
  selectors {
    field = "spec.nodeName=" + coalesce(env("HOSTNAME"), constants.hostname)
    role  = "pod"
  }
}
// Copy selected Kubernetes discovery metadata onto plain target labels.
discovery.relabel "pods" {
  targets = discovery.kubernetes.pods.targets

  // --- Basic node / namespace / pod / container labels ---
  // The node label makes it possible to group logs by node, which helps when
  // tracking restarting DaemonSets or debugging node-related issues.
  rule {
    source_labels = ["__meta_kubernetes_pod_node_name"]
    target_label  = "node"
    action        = "replace"
  }

  rule {
    source_labels = ["__meta_kubernetes_namespace"]
    target_label  = "namespace"
    action        = "replace"
  }

  rule {
    source_labels = ["__meta_kubernetes_pod_name"]
    target_label  = "pod"
    action        = "replace"
  }

  rule {
    source_labels = ["__meta_kubernetes_pod_container_name"]
    target_label  = "container"
    action        = "replace"
  }

  // --- Extended app.kubernetes.io labels (name, component, instance, role) ---
  rule {
    source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]
    target_label  = "app"
    action        = "replace"
  }

  rule {
    source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_component"]
    target_label  = "component"
    action        = "replace"
  }

  rule {
    source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_instance"]
    target_label  = "app_instance"
    action        = "replace"
  }

  rule {
    source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_role"]
    target_label  = "role"
    action        = "replace"
  }
}
// Add the Loki-specific labels and the __path__ glob used to locate each
// container's log files on disk.
discovery.relabel "loki_pods" {
  targets = discovery.relabel.pods.output

  // --- Loki labels taken from pod annotations ---
  rule {
    source_labels = ["__meta_kubernetes_pod_annotation_loki_io_tenant"]
    target_label  = "tenant"
    action        = "replace"
  }

  rule {
    source_labels = ["__meta_kubernetes_pod_annotation_loki_io_parser"]
    target_label  = "parser"
    action        = "replace"
  }

  // --- File path of the pod's logs ---
  // The source labels are joined as "<pod uid>/<container name>" and
  // substituted for $1 in the glob below.
  // On-disk layout: {namespace}_{pod}_{uid}/{container_name}/{0.log|1.log|1.log.20241205-140004}
  // The log format is CRI, which is handled later in loki.process.
  rule {
    source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"]
    separator     = "/"
    target_label  = "__path__"
    replacement   = "/var/log/pods/*$1/*.log"
    action        = "replace"
  }

  // When the kubernetes.io/config.hash annotation is present, rebuild
  // __path__ from "<config hash>/<container name>" instead of the pod UID.
  // NOTE(review): this has the shape of the static-pod rule found in common
  // Kubernetes log-scraping examples; the original author was unsure whether
  // it is required — confirm how log directories are named on these nodes.
  rule {
    source_labels = [
      "__meta_kubernetes_pod_annotationpresent_kubernetes_io_config_hash",
      "__meta_kubernetes_pod_annotation_kubernetes_io_config_hash",
      "__meta_kubernetes_pod_container_name",
    ]
    separator    = "/"
    regex        = "true/(.*)"
    target_label = "__path__"
    replacement  = "/var/log/pods/*$1/*.log"
  }
}
// Expand each target's __path__ glob into the files that actually exist.
local.file_match "pods" {
  path_targets = discovery.relabel.loki_pods.output
}
// Tail the matched log files from the local filesystem and hand the entries
// to the shared relabel stage.
loki.source.file "pods" {
  forward_to = [loki.relabel.pods.receiver]
  targets    = local.file_match.pods.targets
}
// Accept log entries over HTTP on port 3100 and feed them into the same
// relabel stage as the file-based logs.
loki.source.api "loki_api" {
  forward_to = [loki.relabel.pods.receiver]

  http {
    listen_port = 3100
  }
}
// Strip unwanted labels from log entries before they are processed.
loki.relabel "pods" {
  forward_to = [loki.process.main.receiver]

  // Drop labels whose name matches "__meta_" or "app" followed by at least
  // one more character.
  // NOTE(review): if matching is fully anchored, the bare "app" label is kept
  // while e.g. "app_instance" is dropped — confirm this is intended.
  rule {
    regex  = "(__meta_|app).+"
    action = "labeldrop"
  }

  // Drop a fixed set of label names plus anything starting with "pod_".
  rule {
    regex  = "controller_revision_hash|helm_sh_chart|k8s_app|pod_.+"
    action = "labeldrop"
  }
}
// Parse log lines that were read directly from the pods' log files.
// NOTE(review): loki.relabel.main is not defined in the visible part of this
// configuration — presumably it is declared elsewhere; confirm.
loki.process "main" {
  // The CRI stage generates the content, stream and timestamp values.
  stage.cri { }

  forward_to = [loki.relabel.main.receiver]
}
Logs