内容目录
1.部署prometheus
创建存储类并设置默认
helm repo add nfs-subdir-external-provisioner https://kubernetes-sigs.github.io/nfs-subdir-external-provisioner/
helm repo update
# 前置条件:需要先部署好 NFS 服务端(可搜索站内 NFS 教程)
helm install nfs-provisioner nfs-subdir-external-provisioner/nfs-subdir-external-provisioner \
--set nfs.server=192.168.163.139 \
--set nfs.path=/data/share
kubectl patch storageclass nfs-client \
-p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'
添加 prometheus-community 仓库
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update
创建监控命名空间
kubectl create namespace monitoring
部署 prometheus(不带 Grafana,方便后续独立部署 Grafana 并接入其他数据源)
helm install prometheus prometheus-community/prometheus \
--namespace monitoring
接下来继续部署 Loki(日志收集)和 Grafana(可视化)。
下面给出一套完整、可直接执行的命令。
2. 安装 Loki-stack(grafana+loki+promtail 日志系统)
- 添加 Helm 仓库
helm repo add grafana https://grafana.github.io/helm-charts
helm repo update
编辑安装配置
vim loki-stack-values.yaml
grafana:
  enabled: true
  service:
    type: NodePort
    nodePort: 31200 #grafana 访问端口,自行设置
loki:
  persistence:
    enabled: true
    storageClassName: nfs-client
    size: 5Gi #自行设置大小,不要超过nfs大小
  tableManager:
    retention_deletes_enabled: true
    retention_period: 168h # 日志保存时长自行设置
promtail:
  enabled: true
  serviceAccount:
    create: true
    name: loki-promtail
  clusterRole:
    enabled: true
    rules:
      - apiGroups: [""]
        resources: ["pods", "namespaces"]
        verbs: ["get","list","watch"]
      - apiGroups: [""]
        resources: ["pods/log"]
        verbs: ["get","list","watch"]
  config:
    clients:
      - url: http://loki:3100/loki/api/v1/push
    positions:
      filename: /run/promtail/positions.yaml
    scrape_configs:
      - job_name: kubernetes-pods
        kubernetes_sd_configs:
          - role: pod
            namespaces: {} # 空对象表示所有命名空间
        relabel_configs:
          - source_labels: [__meta_kubernetes_namespace]
            target_label: namespace
          - source_labels: [__meta_kubernetes_pod_name]
            target_label: pod
          - source_labels: [__meta_kubernetes_pod_container_name]
            target_label: container
          - source_labels: [__meta_kubernetes_pod_label_app]
            target_label: app
安装
helm upgrade --install loki grafana/loki-stack \
-n monitoring --create-namespace \
-f loki-stack-values.yaml
获取 Grafana 的 admin 密码(使用本节末尾的 kubectl get secret 命令)
登录 Grafana 后默认已带有 Loki 数据源
Prometheus 数据源需要额外手动添加:
URL: http://prometheus-server.monitoring.svc.cluster.local
kubectl get secret --namespace monitoring loki-grafana -o jsonpath="{.data.admin-password}" | base64 --decode
常见问题:
k8s-sidecar 镜像拉取超时失败时,可在对应节点上手动拉取:
ctr -n k8s.io images pull docker.io/kiwigrid/k8s-sidecar:1.19.2
孤儿 Pod 的产生
原因:之前在 Kuboard 面板中对这些资源做过操作
卸载 release 后,Grafana 的 Pod 仍然存在:
root@u1:~/k8s_yaml# helm uninstall loki -n monitoring
release "loki" uninstalled
root@u1:~/k8s_yaml# kubectl get pod -n monitoring
NAME READY STATUS RESTARTS AGE
loki-grafana-bcd96655d-hbps5 2/2 Running 0 18h
检查
- Grafana 部署仍在;
- 但 Helm 已丢失这个 release 的记录(即 “孤儿资源”);
- 所以 helm uninstall loki 已经无法再回收这些对象。
root@u1:~/k8s_yaml# helm list -n monitoring
NAME        NAMESPACE   REVISION  UPDATED                                  STATUS    CHART               APP VERSION
prometheus  monitoring  1         2025-09-24 07:49:58.007039569 +0000 UTC  deployed  prometheus-27.39.0  v3.6.0
root@u1:~/k8s_yaml# kubectl get deployment loki-grafana -n monitoring -o jsonpath='{.metadata.labels}'
{"app.kubernetes.io/instance":"loki","app.kubernetes.io/managed-by":"Helm","app.kubernetes.io/name":"grafana","app.kubernetes.io/version":"10.3.3","helm.sh/chart":"grafana-6.43.5","k8s.kuboard.cn/name":"loki-grafana"}root@u1:~/k8s_yaml#
孤儿pod删除
kubectl delete deployment loki-grafana -n monitoring
kubectl delete svc loki-grafana -n monitoring
kubectl delete secret loki-grafana -n monitoring --ignore-not-found
kubectl delete configmap loki-grafana -n monitoring --ignore-not-found
kubectl delete pvc -l app.kubernetes.io/instance=loki -n monitoring --ignore-not-found