三、OpenFaaS社区版添加CPU和内存指标
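商业版有pod_cpu_usage_seconds_total和pod_memory_working_set_bytes这两个cpu和内存指标,社区版没有。不过可以另辟蹊径,通过K8S内置的cadvisor来获取:让prometheus收集cadvisor计算的数据,就可以通过container_cpu_usage_seconds_total和container_memory_working_set_bytes获取函数pod的cpu和内存指标数据。如果不嫌麻烦,prometheus配置好抓取cadvisor的数据就能做到获取函数pod的cpu和内存指标数据。不过也可以封装到openfaas里,直接在gateway里查询一遍搬到自己的指标上。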
商业版有如下指标
pod_cpu_usage_seconds_total
和pod_memory_working_set_bytes
,社区版没有。不过可以另辟蹊径,通过K8S内置的cadvisor
来获取。
在prometheus来收集
cadvisor
计算的数据,可以通过container_cpu_usage_seconds_total
和container_memory_working_set_bytes
获取函数pod的cpu和内存指标数据。
如果不嫌麻烦,prometheus配置好抓取cadvisor的数据就能做到获取函数pod的cpu和内存指标数据。不过也可以封装到openfaas里,直接在gateway里查询一遍搬到自己的指标上。
修改prometheus配置
1. 修改prometheus-rbac.yml
openfaas自带的prometheus权限是命名空间级别的(Role),不能收集各节点的数据,所以需要改一下权限:将Role改成ClusterRole,RoleBinding改成ClusterRoleBinding,同时在resources一栏添加 nodes、nodes/proxy。此举把prometheus变成集群级别的了,如果只是自己测试的话无所谓。
---
# ServiceAccount used by the OpenFaaS Prometheus deployment.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: openfaas-prometheus
  namespace: "openfaas"
  labels:
    app: openfaas
    component: prometheus
---
# Cluster-wide read access. ClusterRole is a cluster-scoped resource, so it
# carries no metadata.namespace. "nodes" and "nodes/proxy" are what allow
# Prometheus to reach each node's cAdvisor endpoint through the API server.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: openfaas-prometheus
  labels:
    app: openfaas
    component: prometheus
rules:
  - apiGroups: [""]
    resources:
      - services
      - endpoints
      - pods
      - nodes
      - nodes/proxy
    verbs: ["get", "list", "watch"]
---
# Converted from the former openfaas-fn Role. As a ClusterRole it grants the
# same rules as openfaas-prometheus above; it is kept so existing references
# to the name keep working.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: openfaas-prometheus-fn
  labels:
    app: openfaas
    component: prometheus
rules:
  - apiGroups: [""]
    resources:
      - services
      - endpoints
      - pods
      - nodes
      - nodes/proxy
    verbs: ["get", "list", "watch"]
---
# Binds the ClusterRole to the Prometheus ServiceAccount. The binding itself
# is cluster-scoped (no metadata.namespace); only the ServiceAccount subject
# is namespaced.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: openfaas-prometheus
  labels:
    app: openfaas
    component: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: openfaas-prometheus
subjects:
  - kind: ServiceAccount
    name: openfaas-prometheus
    namespace: "openfaas"
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: openfaas-prometheus-fn
  labels:
    app: openfaas
    component: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: openfaas-prometheus-fn
subjects:
  - kind: ServiceAccount
    name: openfaas-prometheus
    namespace: "openfaas"
2. 修改prometheus-cfg.yml
在prometheus-cfg.yml里面,将prometheus.yml的scrape_configs里添加一个job,用于抓取cadvisor的数据。
# Scrape job for cAdvisor metrics of every node, proxied through the
# Kubernetes API server. Add this entry under scrape_configs in
# prometheus.yml, indented to match the existing jobs.
- job_name: 'kubernetes-cadvisor'
  scheme: https
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  kubernetes_sd_configs:
    - role: node
  relabel_configs:
    # Send every scrape to the API server instead of the node itself.
    - target_label: __address__
      replacement: kubernetes.default.svc:443
    # Build the per-node proxy path to the cAdvisor metrics endpoint.
    - source_labels: [__meta_kubernetes_node_name]
      regex: (.+)
      replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
      target_label: __metrics_path__
    # Copy all Kubernetes node labels onto the scraped series.
    - action: labelmap
      regex: __meta_kubernetes_node_label_(.+)
    - source_labels: [__meta_kubernetes_node_name]
      action: replace
      target_label: node
    # NOTE(review): this reads a node label literally named "node"; on nodes
    # without that label node_name stays empty. Confirm this is intended.
    - source_labels: [__meta_kubernetes_node_label_node]
      action: replace
      target_label: node_name
3. 重新部署
# Apply the updated RBAC objects and the Prometheus configuration.
kubectl apply -f prometheus-rbac.yml
kubectl apply -f prometheus-cfg.yml
# Force-replace the Prometheus deployment (kubectl deletes and re-creates it),
# presumably so the new pod starts with the updated configuration.
kubectl replace --force -f prometheus-dep.yml
修改gateway源码
1. 修改metrics.go
修改gateway/metrics/metrics.go,加俩指标。这里都使用GaugeVec类型,因为只是抓取cadvisor的数据来设置到我们自定义的指标上。
// MetricOptions groups the Prometheus collectors that the gateway exposes.
type MetricOptions struct {
	GatewayFunctionInvocation        *prometheus.CounterVec
	GatewayFunctionsHistogram        *prometheus.HistogramVec
	GatewayFunctionInvocationStarted *prometheus.CounterVec
	ServiceReplicasGauge             *prometheus.GaugeVec
	ServiceMetrics                   *ServiceMetricOptions
	// CPU and memory metrics added for the community edition.
	PodCpuUsageSecondsTotal  *prometheus.GaugeVec
	PodMemoryWorkingSetBytes *prometheus.GaugeVec
}
// BuildMetricsOptions creates the gateway's Prometheus collectors and returns
// them bundled in a MetricOptions value.
func BuildMetricsOptions() MetricOptions {
	// (existing code omitted in this excerpt)

	// Added: gauge for CPU usage, labelled by function name. Subsystem "pod"
	// and this Name correspond to the metric pod_cpu_usage_seconds_total.
	podCpuUsageSecondsTotal := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: "pod",
			Name:      "cpu_usage_seconds_total",
			Help:      "CPU seconds consumed by all the replicas of a given function.",
		},
		[]string{"function_name"},
	)
	// Added: gauge for memory usage, labelled by function name
	// (pod_memory_working_set_bytes).
	podMemoryWorkingSetBytes := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: "pod",
			Name:      "memory_working_set_bytes",
			Help:      "Bytes of RAM consumed by all the replicas of a given function",
		},
		[]string{"function_name"},
	)
	metricsOptions := MetricOptions{
		GatewayFunctionsHistogram:        gatewayFunctionsHistogram,
		GatewayFunctionInvocation:        gatewayFunctionInvocation,
		ServiceReplicasGauge:             serviceReplicas,
		ServiceMetrics:                   serviceMetricOptions,
		GatewayFunctionInvocationStarted: gatewayFunctionInvocationStarted,
		// Added: wire in the two new gauges.
		PodCpuUsageSecondsTotal:  podCpuUsageSecondsTotal,
		PodMemoryWorkingSetBytes: podMemoryWorkingSetBytes,
	}
	return metricsOptions
}
2. 修改prometheus_query.go
修改gateway/metrics/prometheus_query.go,修改查询结果的结构体,因为查出来的和原本的数据不一样,得加俩参数。
// VectorQueryResponse is the decoded result of a Prometheus vector query.
type VectorQueryResponse struct {
	Data struct {
		Result []struct {
			// Metric holds the label values of one returned series.
			Metric struct {
				Code         string `json:"code"`
				FunctionName string `json:"function_name"`
				// Added: labels returned by the cAdvisor queries.
				Container string `json:"container"`
				Namespace string `json:"namespace"`
			}
			// Value is the raw sample; the code in calc reads Value[1] as a
			// string and parses it as a float.
			Value []interface{} `json:"value"`
		}
	}
}
3. 修改exporter.go
修改gateway/metrics/exporter.go。注意,修改完,exporter_test.go会报错,因为它也调了NewExporter,可以注释掉里面的函数。
// Exporter holds the gateway's metric collectors together with the state
// needed to populate them.
type Exporter struct {
	metricOptions     MetricOptions
	services          []types.FunctionStatus
	credentials       *auth.BasicAuthCredentials
	FunctionNamespace string
	// Added: used to query Prometheus.
	prometheusQuery PrometheusQueryFetcher
}
// NewExporter builds an Exporter from the given metric options, credentials
// and function namespace. The trailing prometheusQuery parameter is the new
// addition: it is stored so the exporter can query Prometheus.
func NewExporter(options MetricOptions, credentials *auth.BasicAuthCredentials, namespace string, prometheusQuery PrometheusQueryFetcher) *Exporter {
	exporter := new(Exporter)

	exporter.metricOptions = options
	// Start with an empty, non-nil list of services.
	exporter.services = []types.FunctionStatus{}
	exporter.credentials = credentials
	exporter.FunctionNamespace = namespace
	// Added: keep the Prometheus query client.
	exporter.prometheusQuery = prometheusQuery

	return exporter
}
// Describe sends the descriptors of the exporter's collectors to ch.
func (e *Exporter) Describe(ch chan<- *prometheus.Desc) {
	// (existing code omitted in this excerpt)

	// Added: describe the new CPU and memory gauges.
	e.metricOptions.PodCpuUsageSecondsTotal.Describe(ch)
	e.metricOptions.PodMemoryWorkingSetBytes.Describe(ch)
}
// Collect refreshes the per-function gauges and sends every metric of the
// exporter to ch.
func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
	e.metricOptions.GatewayFunctionsHistogram.Collect(ch)
	e.metricOptions.GatewayFunctionInvocationStarted.Collect(ch)

	// Drop all per-function series before repopulating them. Without the
	// reset, a function that has been deleted would keep reporting its last
	// CPU/memory value forever.
	e.metricOptions.ServiceReplicasGauge.Reset()
	e.metricOptions.PodCpuUsageSecondsTotal.Reset()
	e.metricOptions.PodMemoryWorkingSetBytes.Reset()

	for _, service := range e.services {
		serviceName := service.Name
		if len(service.Namespace) > 0 {
			serviceName = fmt.Sprintf("%s.%s", service.Name, service.Namespace)
		}

		e.metricOptions.ServiceReplicasGauge.WithLabelValues(serviceName).Set(float64(service.Replicas))

		// Seed the new gauges with 0 so that a known function without any
		// running instance still has a data point.
		e.metricOptions.PodCpuUsageSecondsTotal.WithLabelValues(serviceName).Set(0)
		e.metricOptions.PodMemoryWorkingSetBytes.WithLabelValues(serviceName).Set(0)
	}

	// Overwrite the seeded zeros with the values queried from Prometheus.
	e.calc()

	e.metricOptions.PodCpuUsageSecondsTotal.Collect(ch)
	e.metricOptions.PodMemoryWorkingSetBytes.Collect(ch)
	e.metricOptions.GatewayFunctionInvocation.Collect(ch)
	e.metricOptions.ServiceReplicasGauge.Collect(ch)
	e.metricOptions.ServiceMetrics.Counter.Collect(ch)
	e.metricOptions.ServiceMetrics.Histogram.Collect(ch)
}
// calc is a new function (place it at the bottom of exporter.go). It queries
// Prometheus for the cAdvisor CPU and memory series of the function
// namespace and copies the results onto the exporter's own gauges.
//
// The two queries are independent: a failure of one is logged and does not
// prevent the other from running.
func (e *Exporter) calc() {
	// Query the namespace the exporter was configured with; fall back to the
	// default function namespace when none was given.
	namespace := e.FunctionNamespace
	if namespace == "" {
		namespace = "openfaas-fn"
	}

	const queryFormat = `sum by(container, namespace) (%s{image!="",namespace=%q, container!="POD"})`
	cpuQuery := fmt.Sprintf(queryFormat, "container_cpu_usage_seconds_total", namespace)
	memoryQuery := fmt.Sprintf(queryFormat, "container_memory_working_set_bytes", namespace)

	e.mirrorQuery("q1", cpuQuery, e.metricOptions.PodCpuUsageSecondsTotal)
	e.mirrorQuery("q2", memoryQuery, e.metricOptions.PodMemoryWorkingSetBytes)
}

// mirrorQuery runs query against Prometheus and sets one series on gauge per
// result, labelled "<container>.<namespace>". name identifies the query in
// log messages. Results whose sample cannot be parsed are logged and skipped.
func (e *Exporter) mirrorQuery(name, query string, gauge *prometheus.GaugeVec) {
	results, err := e.prometheusQuery.Fetch(url.QueryEscape(query))
	if err != nil {
		log.Printf("Error querying %s: %s\n", name, err.Error())
		return
	}

	for _, v := range results.Data.Result {
		functionName := fmt.Sprintf("%s.%s", v.Metric.Container, v.Metric.Namespace)

		value, err := parseVectorSample(v.Value)
		if err != nil {
			log.Printf("Error parsing %s result for %s: %s\n", name, functionName, err.Error())
			continue
		}
		gauge.WithLabelValues(functionName).Set(value)
	}
}

// parseVectorSample extracts the float value from a query sample, which is
// expected to hold the value as a string in its second element. It returns an
// error instead of panicking when the sample does not have that shape.
func parseVectorSample(sample []interface{}) (float64, error) {
	if len(sample) < 2 {
		return 0, fmt.Errorf("sample has %d elements, want 2", len(sample))
	}
	text, ok := sample[1].(string)
	if !ok {
		return 0, fmt.Errorf("sample value has type %T, want string", sample[1])
	}
	return strconv.ParseFloat(text, 64)
}
4. 修改main.go
// This line originally sits further down in main.go; move it up so that
// prometheusQuery exists before the exporter is created.
// NOTE(review): &http.Client{} sets no Timeout, and calc is called from
// Collect, so a hanging Prometheus query would block the metrics scrape.
// Consider setting a timeout.
prometheusQuery := metrics.NewPrometheusQuery(config.PrometheusHost, config.PrometheusPort, &http.Client{})
// Same call as before, with prometheusQuery added as the last argument.
exporter := metrics.NewExporter(metricsOptions, credentials, config.Namespace, prometheusQuery)
部署和结果
之后重新打包部署gateway就行。查询可以看到cpu指标、memory指标和原本的一样。
更多推荐
所有评论(0)