AWS Industry Week에 다녀왔다.
보안과 비용 최적화에 대한 챌린지를 이겨내야 하는 운명이다 보니 Kubecost와 Karpenter를 통한 실질적인 사례가 눈에 띄었다.

OpenShift에 Helm chart를 이용해서 Kubecost를 구성해 보자.
개발자 카탈로그에서 Helm repository(https://kubecost.github.io/cost-analyzer/)를 추가한다.

cost-analyzer를 검색하여 Install 버튼을 클릭한다.

OpenShift 관련 설정(`openshift.enabled: true`, `scc`)과 각 컴포넌트의 nodeAffinity를 추가하였다. 적용한 values는 다음과 같다.
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- worker01.supermoon.it
awsstore:
annotations: {}
createServiceAccount: false
imageNameAndVersion: 'gcr.io/kubecost1/awsstore:latest'
nodeSelector: {}
priorityClassName: ''
useAwsStore: false
clusterController:
actionConfigs:
clusterRightsize: null
clusterTurndown: []
containerRightsize: null
namespaceTurndown: null
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- worker01.supermoon.it
annotations: {}
enabled: false
image:
repository: gcr.io/kubecost1/cluster-controller
tag: v0.16.9
imagePullPolicy: IfNotPresent
kubescaler:
defaultResizeAll: false
namespaceTurndown:
rbac:
enabled: true
nodeSelector: {}
priorityClassName: ''
resources: {}
tolerations: []
costEventsAudit:
enabled: false
diagnostics:
collectHelmValues: false
deployment:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- worker01.supermoon.it
annotations: {}
containerSecurityContext: {}
enabled: false
env: {}
labels: {}
nodeSelector: {}
resources:
requests:
cpu: 10m
memory: 20Mi
securityContext: {}
tolerations: []
enabled: true
keepDiagnosticHistory: false
pollingInterval: 300s
primary:
enabled: false
readonly: false
retention: 7d
etlUtils:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- worker01.supermoon.it
annotations: {}
enabled: false
env: {}
fullImageName: null
nodeSelector: {}
resources: {}
tolerations: []
extraObjects: []
extraVolumeMounts: []
extraVolumes: []
federatedETL:
agentOnly: false
federatedCluster: false
readOnlyPrimary: false
redirectS3Backup: false
useMultiClusterDB: false
forecasting:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- worker01.supermoon.it
annotations: {}
enabled: true
env:
GUNICORN_CMD_ARGS: '--log-level info -t 1200'
fullImageName: 'gcr.io/kubecost1/kubecost-modeling:v0.1.18'
imagePullPolicy: IfNotPresent
livenessProbe:
enabled: true
failureThreshold: 200
initialDelaySeconds: 10
periodSeconds: 10
nodeSelector: {}
priority:
enabled: false
name: ''
readinessProbe:
enabled: true
failureThreshold: 200
initialDelaySeconds: 10
periodSeconds: 10
resources:
limits:
cpu: 1500m
memory: 1Gi
requests:
cpu: 200m
memory: 300Mi
tolerations: []
global:
additionalLabels: {}
ammsp:
aadAuthProxy:
aadClientId: $<AZURE_MANAGED_IDENTITY_CLIENT_ID>
aadTenantId: $<AZURE_MANAGED_IDENTITY_TENANT_ID>
audience: 'https://prometheus.monitor.azure.com/.default'
enabled: false
identityType: userAssigned
image: $<IMAGE>
imagePullPolicy: IfNotPresent
name: aad-auth-proxy
port: 8081
enabled: false
prometheusServerEndpoint: 'http://localhost:8081/'
queryEndpoint: $<AMMSP_QUERY_ENDPOINT>
remoteWriteService: $<AMMSP_METRICS_INGESTION_ENDPOINT>
amp:
enabled: false
prometheusServerEndpoint: 'http://localhost:8005/workspaces/<workspaceId>/'
remoteWriteService: >-
https://aps-workspaces.us-west-2.amazonaws.com/workspaces/<workspaceId>/api/v1/remote_write
sigv4:
region: us-west-2
annotations: {}
assetReports:
enabled: false
reports:
- accumulate: false
aggregateBy: type
filters:
- property: cluster
value: cluster-one
title: Example Asset Report 0
window: today
cloudCostReports:
enabled: false
reports:
- accumulate: false
aggregateBy: service
title: Cloud Cost Report 0
window: today
containerSecurityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
privileged: false
readOnlyRootFilesystem: true
gcpstore:
enabled: false
gmp:
enabled: false
gmpProxy:
enabled: false
image: 'gke.gcr.io/prometheus-engine/frontend:v0.4.1-gke.0'
imagePullPolicy: IfNotPresent
name: gmp-proxy
port: 8085
projectId: YOUR_PROJECT_ID
prometheusServerEndpoint: 'http://localhost:8085/'
grafana:
domainName: cost-analyzer-grafana.default.svc
enabled: true
proxy: true
scheme: http
integrations:
postgres:
databaseHost: ''
databaseName: ''
databasePassword: ''
databasePort: ''
databaseSecretName: ''
databaseUser: ''
enabled: false
queryConfigs:
allocations: []
assets: []
cloudCosts: []
runInterval: 12h
turbonomic:
clientId: ''
clientSecret: ''
enabled: false
host: ''
insecureClient: false
role: ''
mimirProxy:
annotations: {}
enabled: false
image: nginxinc/nginx-unprivileged
mimirEndpoint: $mimir_endpoint
name: mimir-proxy
orgIdentifier: $your_tenant_ID
port: 8085
notifications:
alertmanager:
enabled: false
fqdn: 'http://cost-analyzer-prometheus-server.default.svc'
platforms:
cicd:
enabled: false
skipSanityChecks: false
openshift:
enabled: true
route:
annotations: {}
enabled: false
scc:
networkCosts: true
nodeExporter: true
securityContext:
runAsNonRoot: true
seccompProfile:
type: RuntimeDefault
podAnnotations: {}
prometheus:
enabled: true
fqdn: 'http://cost-analyzer-prometheus-server.default.svc'
savedReports:
enabled: false
reports:
- accumulate: false
aggregateBy: namespace
chartDisplay: category
filters:
- key: cluster
operator: ':'
value: dev
idle: separate
rate: cumulative
title: Example Saved Report 0
window: today
- accumulate: false
aggregateBy: controllerKind
chartDisplay: category
filters:
- key: namespace
operator: '!:'
value: kubecost
idle: share
rate: monthly
title: Example Saved Report 1
window: month
- accumulate: true
aggregateBy: service
chartDisplay: category
filters: []
idle: hide
rate: daily
title: Example Saved Report 2
window: '2020-11-11T00:00:00Z,2020-12-09T23:59:59Z'
securityContext:
fsGroup: 1001
fsGroupChangePolicy: OnRootMismatch
runAsGroup: 1001
runAsNonRoot: true
runAsUser: 1001
seccompProfile:
type: RuntimeDefault
grafana:
adminPassword: strongpassword
adminUser: admin
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- worker01.supermoon.it
annotations: {}
dashboardProviders: {}
dashboards: {}
dashboardsConfigMaps: {}
deploymentStrategy: RollingUpdate
downloadDashboardsImage:
pullPolicy: IfNotPresent
repository: curlimages/curl
tag: latest
env: {}
envFromSecret: ''
extraSecretMounts: []
grafana.ini:
analytics:
check_for_updates: true
auth.anonymous:
enabled: true
org_name: Main Org.
org_role: Editor
grafana_net:
url: 'https://grafana.net'
log:
mode: console
paths:
data: /var/lib/grafana/data
logs: /var/log/grafana
plugins: /var/lib/grafana/plugins
provisioning: /etc/grafana/provisioning
server:
root_url: '%(protocol)s://%(domain)s:%(http_port)s/grafana'
serve_from_sub_path: false
image:
pullPolicy: IfNotPresent
repository: cgr.dev/chainguard/grafana
tag: latest
livenessProbe:
failureThreshold: 10
httpGet:
path: /api/health
port: 3000
initialDelaySeconds: 60
timeoutSeconds: 30
nodeSelector: {}
persistence:
enabled: false
plugins: []
podAnnotations: {}
priorityClassName: ''
rbac:
create: true
readinessProbe:
httpGet:
path: /api/health
port: 3000
replicas: 1
resources: {}
securityContext: {}
service:
annotations: {}
labels: {}
port: 80
type: ClusterIP
serviceAccount:
create: true
name: ''
sidecar:
dashboards:
annotations: {}
enabled: true
error_throttle_sleep: 0
folder: /tmp/dashboards
label: grafana_dashboard
labelValue: '1'
datasources:
enabled: false
error_throttle_sleep: 0
label: grafana_datasource
image:
pullPolicy: IfNotPresent
repository: cgr.dev/chainguard/k8s-sidecar
tag: latest
resources: {}
tolerations: []
ingress:
annotations: null
enabled: false
hosts:
- cost-analyzer.local
labels: null
pathType: ImplementationSpecific
paths:
- /
tls: []
initChownData:
resources: {}
initChownDataImage: busybox
kubecostAdmissionController:
caBundle: '${CA_BUNDLE}'
enabled: false
secretName: webhook-server-tls
kubecostAggregator:
aggregatorDbStorage:
storageClass: ''
storageRequest: 128Gi
cloudCost:
readinessProbe:
enabled: true
failureThreshold: 200
initialDelaySeconds: 10
periodSeconds: 10
resources: {}
containerResourceUsageRetentionDays: 1
dbConcurrentIngestionCount: 1
dbMemoryLimit: 0GB
dbReadThreads: 1
dbTrimMemoryOnClose: true
dbWriteMemoryLimit: 0GB
dbWriteThreads: 1
deployMethod: singlepod
enabled: false
etlDailyStoreDurationDays: 91
etlHourlyStoreDurationHours: 49
imagePullPolicy: IfNotPresent
jaeger:
enabled: false
image: jaegertracing/all-in-one
imageVersion: latest
logLevel: info
numDBCopyPartitions: 25
persistentConfigsStorage:
storageClass: ''
storageRequest: 1Gi
readinessProbe:
enabled: true
failureThreshold: 200
initialDelaySeconds: 10
periodSeconds: 10
replicas: 1
resources: {}
service:
labels: {}
stagingEmptyDirSizeLimit: 2Gi
kubecostDeployment:
annotations: {}
labels: {}
replicas: 1
kubecostFrontend:
deployMethod: singlepod
deploymentStrategy: {}
enabled: true
haReplicas: 2
image: gcr.io/kubecost1/frontend
imagePullPolicy: IfNotPresent
ipv6:
enabled: true
livenessProbe:
enabled: true
failureThreshold: 6
initialDelaySeconds: 1
periodSeconds: 5
readinessProbe:
enabled: true
failureThreshold: 6
initialDelaySeconds: 1
periodSeconds: 5
resources:
requests:
cpu: 10m
memory: 55Mi
useDefaultFqdn: false
kubecostMetrics: null
kubecostModel:
allocation: null
containerStatsEnabled: true
etlDailyStoreDurationDays: 91
etlHourlyStoreDurationHours: 49
etlReadOnlyMode: false
extraArgs: []
extraPorts: []
image: gcr.io/kubecost1/cost-model
imagePullPolicy: IfNotPresent
livenessProbe:
enabled: true
failureThreshold: 200
initialDelaySeconds: 10
periodSeconds: 10
logLevel: info
maxQueryConcurrency: 5
plugins:
enabled: false
enabledPlugins: []
existingCustomSecret:
enabled: false
name: ''
folder: /opt/opencost/plugin
install:
enabled: false
fullImageName: 'curlimages/curl:latest'
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsNonRoot: true
runAsUser: 1001
seccompProfile:
type: RuntimeDefault
secretName: kubecost-plugin-secret
readinessProbe:
enabled: true
failureThreshold: 200
initialDelaySeconds: 10
periodSeconds: 10
resources:
requests:
cpu: 200m
memory: 55Mi
utcOffset: '+00:00'
kubecostToken: null
networkCosts:
additionalLabels: {}
additionalSecurityContext: {}
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- worker01.supermoon.it
annotations: {}
config:
destinations:
cross-region: []
direct-classification: []
in-region: []
in-zone:
- 127.0.0.0/8
- 169.254.0.0/16
- 10.0.0.0/8
- 172.16.0.0/12
- 192.168.0.0/16
internet: []
services:
amazon-web-services: true
azure-cloud-services: true
google-cloud-services: true
enabled: false
extraArgs: []
healthCheckProbes: {}
image:
repository: gcr.io/kubecost1/kubecost-network-costs
tag: v0.17.6
imagePullPolicy: IfNotPresent
logLevel: info
nodeSelector: {}
podMonitor:
additionalLabels: {}
enabled: false
port: 3001
priorityClassName: ''
prometheusScrape: false
resources:
limits:
cpu: 500m
requests:
cpu: 50m
memory: 20Mi
service:
annotations: {}
labels: {}
tolerations: []
trafficLogging: true
updateStrategy:
type: RollingUpdate
nodeSelector: {}
oidc:
authURL: ''
clientID: ''
clientSecret: ''
discoveryURL: ''
enabled: false
existingCustomSecret:
enabled: false
name: ''
hostedDomain: ''
loginRedirectURL: ''
rbac:
enabled: false
secretName: kubecost-oidc-secret
skipOnlineTokenValidation: false
useClientSecretPost: false
persistentVolume:
annotations: {}
enabled: true
labels: {}
size: 32Gi
pricingCsv:
enabled: false
location:
URI: 's3://kc-csv-test/pricing_schema.csv'
csvAccessCredentials: pricing-schema-access-secret
provider: AWS
region: us-east-1
priority:
enabled: false
name: ''
prometheus:
alertRelabelConfigs: null
alertmanager:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- worker01.supermoon.it
annotations: {}
baseURL: 'http://localhost:9093'
configFileName: alertmanager.yml
configFromSecret: ''
configMapOverrideName: ''
enabled: false
extraArgs: {}
extraEnv: {}
extraSecretMounts: []
image:
pullPolicy: IfNotPresent
repository: cgr.dev/chainguard/prometheus-alertmanager
tag: latest
ingress:
annotations: {}
enabled: false
extraLabels: {}
extraPaths: []
hosts: []
tls: []
name: alertmanager
nodeSelector: {}
persistentVolume:
accessModes:
- ReadWriteOnce
annotations: {}
enabled: true
existingClaim: ''
mountPath: /data
size: 2Gi
subPath: ''
podAnnotations: {}
podDisruptionBudget:
enabled: false
maxUnavailable: 1
podLabels: {}
prefixURL: ''
priorityClassName: ''
replicaCount: 1
resources: {}
securityContext:
fsGroup: 1001
runAsGroup: 1001
runAsNonRoot: true
runAsUser: 1001
service:
annotations: {}
clusterIP: ''
externalIPs: []
labels: {}
loadBalancerIP: ''
loadBalancerSourceRanges: []
servicePort: 80
sessionAffinity: None
type: ClusterIP
statefulSet:
annotations: {}
enabled: false
headless:
annotations: {}
labels: {}
servicePort: 80
podManagementPolicy: OrderedReady
strategy:
rollingUpdate: null
type: Recreate
tolerations: []
alertmanagerFiles:
alertmanager.yml:
global: {}
receivers:
- name: default-receiver
route:
group_interval: 5m
group_wait: 10s
receiver: default-receiver
repeat_interval: 3h
configmapReload:
alertmanager:
enabled: false
extraArgs: {}
extraConfigmapMounts: []
extraVolumeDirs: []
image:
pullPolicy: IfNotPresent
repository: cgr.dev/chainguard/prometheus-config-reloader
tag: latest
name: configmap-reload
resources: {}
prometheus:
containerSecurityContext: {}
enabled: false
extraArgs: {}
extraConfigmapMounts: []
extraVolumeDirs: []
image:
pullPolicy: IfNotPresent
repository: cgr.dev/chainguard/prometheus-config-reloader
tag: latest
name: configmap-reload
resources: {}
extraScrapeConfigs: >
- job_name: kubecost
honor_labels: true
scrape_interval: 1m
scrape_timeout: 60s
metrics_path: /metrics
scheme: http
dns_sd_configs:
- names:
- {{ template "cost-analyzer.serviceName" . }}
type: 'A'
port: 9003
- job_name: kubecost-networking
kubernetes_sd_configs:
- role: pod
relabel_configs:
# Scrape only the targets matching the following metadata
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_instance]
action: keep
regex: kubecost
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
action: keep
regex: network-costs
- job_name: kubecost-aggregator
scrape_interval: 1m
scrape_timeout: 60s
metrics_path: /metrics
scheme: http
dns_sd_configs:
- names:
- {{ template "aggregator.serviceName" . }}
type: 'A'
{{- if or .Values.saml.enabled .Values.oidc.enabled }}
port: 9008
{{- else }}
port: 9004
{{- end }}
## Enables scraping of NVIDIA GPU metrics via dcgm-exporter. Scrapes all
## endpoints which contain "dcgm-exporter" in labels "app",
## "app.kubernetes.io/component", or "app.kubernetes.io/name" with a case
## insensitive match.
## Refs:
## https://github.com/NVIDIA/gpu-operator/blob/d4316a415bbd684ce8416a88042305fc1a093aa4/assets/state-dcgm-exporter/0600_service.yaml#L7
## https://github.com/NVIDIA/dcgm-exporter/blob/54fd1ca137c66511a87a720390613680b9bdabdd/deployment/templates/service.yaml#L23
- job_name: kubecost-dcgm-exporter
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- source_labels: [__meta_kubernetes_pod_label_app, __meta_kubernetes_pod_label_app_kubernetes_io_component, __meta_kubernetes_pod_label_app_kubernetes_io_name]
action: keep
regex: (?i)(.*dcgm-exporter.*|.*dcgm-exporter.*|.*dcgm-exporter.*)
imagePullSecrets: null
networkPolicy:
enabled: false
nodeExporter:
annotations: {}
dnsPolicy: ClusterFirstWithHostNet
enabled: false
extraArgs: {}
extraConfigmapMounts: []
extraHostPathMounts: []
hostNetwork: true
hostPID: true
image:
pullPolicy: IfNotPresent
repository: prom/node-exporter
tag: v1.8.2
name: node-exporter
nodeSelector: {}
pod:
labels: {}
podAnnotations: {}
podDisruptionBudget:
enabled: false
maxUnavailable: 1
priorityClassName: ''
resources: {}
securityContext: {}
service:
annotations:
prometheus.io/scrape: 'true'
clusterIP: None
externalIPs: []
hostPort: 9100
labels: {}
loadBalancerIP: ''
loadBalancerSourceRanges: []
servicePort: 9100
type: ClusterIP
tolerations: []
updateStrategy:
type: RollingUpdate
rbac:
create: true
server:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- worker01.supermoon.it
alertmanagers: []
annotations: {}
baseURL: ''
configMapOverrideName: ''
configPath: /etc/config/prometheus.yml
containerSecurityContext: {}
emptyDir:
sizeLimit: ''
enabled: true
env: []
extraArgs:
query.max-concurrency: 1
query.max-samples: 100000000
extraConfigmapMounts: []
extraFlags:
- web.enable-lifecycle
extraHostPathMounts: []
extraInitContainers: []
extraSecretMounts: []
extraVolumeMounts: []
extraVolumes: []
global:
evaluation_interval: 1m
external_labels:
cluster_id: cluster-one
scrape_interval: 1m
scrape_timeout: 60s
image:
pullPolicy: IfNotPresent
repository: cgr.dev/chainguard/prometheus
tag: latest
ingress:
annotations: {}
enabled: false
extraLabels: {}
extraPaths: []
hosts: []
pathType: Prefix
tls: []
livenessProbeFailureThreshold: 3
livenessProbeInitialDelay: 5
livenessProbeSuccessThreshold: 1
livenessProbeTimeout: 3
name: server
nodeSelector: {}
persistentVolume:
accessModes:
- ReadWriteOnce
annotations: {}
enabled: true
existingClaim: ''
mountPath: /data
size: 32Gi
subPath: ''
podAnnotations: {}
podDisruptionBudget:
enabled: false
maxUnavailable: 1
podLabels: {}
prefixURL: ''
priorityClassName: ''
readinessProbeFailureThreshold: 3
readinessProbeInitialDelay: 5
readinessProbeSuccessThreshold: 1
readinessProbeTimeout: 3
remoteRead: {}
remoteWrite: {}
replicaCount: 1
resources: {}
retention: 97h
securityContext: {}
service:
annotations: {}
clusterIP: ''
externalIPs: []
gRPC:
enabled: false
servicePort: 10901
labels: {}
loadBalancerIP: ''
loadBalancerSourceRanges: []
servicePort: 80
sessionAffinity: None
statefulsetReplica:
enabled: false
replica: 0
type: ClusterIP
sidecarContainers: null
statefulSet:
annotations: {}
enabled: false
headless:
annotations: {}
labels: {}
servicePort: 80
labels: {}
podManagementPolicy: OrderedReady
strategy:
rollingUpdate: null
type: Recreate
terminationGracePeriodSeconds: 300
tolerations: []
verticalAutoscaler:
enabled: false
serverFiles:
alerting_rules.yml: {}
prometheus.yml:
rule_files:
- /etc/config/recording_rules.yml
- /etc/config/alerting_rules.yml
scrape_configs:
- job_name: prometheus
static_configs:
- targets:
- 'localhost:9090'
- bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
job_name: kubernetes-nodes-cadvisor
kubernetes_sd_configs:
- role: node
metric_relabel_configs:
- action: keep
regex: >-
(container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_errors_total|container_network_transmit_errors_total|container_network_receive_packets_dropped_total|container_network_transmit_packets_dropped_total|container_memory_usage_bytes|container_cpu_cfs_throttled_periods_total|container_cpu_cfs_periods_total|container_fs_usage_bytes|container_fs_limit_bytes|container_cpu_cfs_periods_total|container_fs_inodes_free|container_fs_inodes_total|container_fs_usage_bytes|container_fs_limit_bytes|container_cpu_cfs_throttled_periods_total|container_cpu_cfs_periods_total|container_network_receive_bytes_total|container_network_transmit_bytes_total|container_fs_inodes_free|container_fs_inodes_total|container_fs_usage_bytes|container_fs_limit_bytes|container_spec_cpu_shares|container_spec_memory_limit_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total|container_fs_reads_bytes_total|container_network_receive_bytes_total|container_fs_writes_bytes_total|container_fs_reads_bytes_total|cadvisor_version_info|kubecost_pv_info)
source_labels:
- __name__
- action: replace
regex: (.+)
source_labels:
- container
target_label: container_name
- action: replace
regex: (.+)
source_labels:
- pod
target_label: pod_name
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- replacement: 'kubernetes.default.svc:443'
target_label: __address__
- regex: (.+)
replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor
source_labels:
- __meta_kubernetes_node_name
target_label: __metrics_path__
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
- bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
job_name: kubernetes-nodes
kubernetes_sd_configs:
- role: node
metric_relabel_configs:
- action: keep
regex: (kubelet_volume_stats_used_bytes)
source_labels:
- __name__
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- replacement: 'kubernetes.default.svc:443'
target_label: __address__
- regex: (.+)
replacement: /api/v1/nodes/$1/proxy/metrics
source_labels:
- __meta_kubernetes_node_name
target_label: __metrics_path__
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
- job_name: kubernetes-service-endpoints
kubernetes_sd_configs:
- role: endpoints
metric_relabel_configs:
- action: keep
regex: >-
(container_cpu_allocation|container_cpu_usage_seconds_total|container_fs_limit_bytes|container_fs_writes_bytes_total|container_gpu_allocation|container_memory_allocation_bytes|container_memory_usage_bytes|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total|DCGM_FI_DEV_GPU_UTIL|deployment_match_labels|kube_daemonset_status_desired_number_scheduled|kube_daemonset_status_number_ready|kube_deployment_spec_replicas|kube_deployment_status_replicas|kube_deployment_status_replicas_available|kube_job_status_failed|kube_namespace_annotations|kube_namespace_labels|kube_node_info|kube_node_labels|kube_node_status_allocatable|kube_node_status_allocatable_cpu_cores|kube_node_status_allocatable_memory_bytes|kube_node_status_capacity|kube_node_status_capacity_cpu_cores|kube_node_status_capacity_memory_bytes|kube_node_status_condition|kube_persistentvolume_capacity_bytes|kube_persistentvolume_status_phase|kube_persistentvolumeclaim_info|kube_persistentvolumeclaim_resource_requests_storage_bytes|kube_pod_container_info|kube_pod_container_resource_limits|kube_pod_container_resource_limits_cpu_cores|kube_pod_container_resource_limits_memory_bytes|kube_pod_container_resource_requests|kube_pod_container_resource_requests_cpu_cores|kube_pod_container_resource_requests_memory_bytes|kube_pod_container_status_restarts_total|kube_pod_container_status_running|kube_pod_container_status_terminated_reason|kube_pod_labels|kube_pod_owner|kube_pod_status_phase|kube_replicaset_owner|kube_statefulset_replicas|kube_statefulset_status_replicas|kubecost_cluster_info|kubecost_cluster_management_cost|kubecost_cluster_memory_working_set_bytes|kubecost_load_balancer_cost|kubecost_network_internet_egress_cost|kubecost_network_region_egress_cost|kubecost_network_zone_egress_cost|kubecost_node_is_spot|kubecost_pod_network_egress_bytes_total|node_cpu_hourly_cost|node_cpu_seconds_total|node_disk_reads_completed|node_disk_reads_completed_total|node_disk_writes_completed|node_disk_writes_completed_total|node_filesystem_device_error|node_gpu_count|node_gpu_hourly_cost|node_memory_Buffers_bytes|node_memory_Cached_bytes|node_memory_MemAvailable_bytes|node_memory_MemFree_bytes|node_memory_MemTotal_bytes|node_network_transmit_bytes_total|node_ram_hourly_cost|node_total_hourly_cost|pod_pvc_allocation|pv_hourly_cost|service_selector_labels|statefulSet_match_labels|kubecost_pv_info|up)
source_labels:
- __name__
relabel_configs:
- action: keep
regex: true
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scrape
- action: keep
regex: (.*node-exporter|kubecost-network-costs)
source_labels:
- __meta_kubernetes_endpoints_name
- action: replace
regex: (https?)
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scheme
target_label: __scheme__
- action: replace
regex: (.+)
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_path
target_label: __metrics_path__
- action: replace
regex: '([^:]+)(?::\d+)?;(\d+)'
replacement: '$1:$2'
source_labels:
- __address__
- __meta_kubernetes_service_annotation_prometheus_io_port
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- action: replace
source_labels:
- __meta_kubernetes_namespace
target_label: kubernetes_namespace
- action: replace
source_labels:
- __meta_kubernetes_service_name
target_label: kubernetes_name
- action: replace
source_labels:
- __meta_kubernetes_pod_node_name
target_label: kubernetes_node
recording_rules.yml: {}
rules:
groups:
- name: CPU
rules:
- expr: 'sum(rate(container_cpu_usage_seconds_total{container!=""}[5m]))'
record: 'cluster:cpu_usage:rate5m'
- expr: 'rate(container_cpu_usage_seconds_total{container!=""}[5m])'
record: 'cluster:cpu_usage_nosum:rate5m'
- expr: >-
avg(irate(container_cpu_usage_seconds_total{container!="POD",
container!=""}[5m])) by (container,pod,namespace)
record: kubecost_container_cpu_usage_irate
- expr: >-
sum(container_memory_working_set_bytes{container!="POD",container!=""})
by (container,pod,namespace)
record: kubecost_container_memory_working_set_bytes
- expr: >-
sum(container_memory_working_set_bytes{container!="POD",container!=""})
record: kubecost_cluster_memory_working_set_bytes
- name: Savings
rules:
- expr: >-
sum(avg(kube_pod_owner{owner_kind!="DaemonSet"}) by (pod) *
sum(container_cpu_allocation) by (pod))
labels:
daemonset: 'false'
record: kubecost_savings_cpu_allocation
- expr: >-
sum(avg(kube_pod_owner{owner_kind="DaemonSet"}) by (pod) *
sum(container_cpu_allocation) by (pod)) / sum(kube_node_info)
labels:
daemonset: 'true'
record: kubecost_savings_cpu_allocation
- expr: >-
sum(avg(kube_pod_owner{owner_kind!="DaemonSet"}) by (pod) *
sum(container_memory_allocation_bytes) by (pod))
labels:
daemonset: 'false'
record: kubecost_savings_memory_allocation_bytes
- expr: >-
sum(avg(kube_pod_owner{owner_kind="DaemonSet"}) by (pod) *
sum(container_memory_allocation_bytes) by (pod)) /
sum(kube_node_info)
labels:
daemonset: 'true'
record: kubecost_savings_memory_allocation_bytes
serviceAccounts:
alertmanager:
create: true
name: null
nodeExporter:
create: true
name: null
server:
annotations: {}
create: true
name: null
prometheusRule:
additionalLabels: {}
enabled: false
reporting:
errorReporting: true
logCollection: true
productAnalytics: true
valuesReporting: true
saml:
enabled: false
rbac:
enabled: false
service:
annotations: {}
labels: {}
nodePort: {}
port: 9090
sessionAffinity:
enabled: false
timeoutSeconds: 10800
targetPort: 9090
type: ClusterIP
serviceAccount:
annotations: {}
create: true
serviceMonitor:
additionalLabels: {}
aggregatorMetrics:
additionalLabels: {}
enabled: false
interval: 1m
metricRelabelings: []
relabelings:
- action: replace
sourceLabels:
- __meta_kubernetes_namespace
targetLabel: namespace
scrapeTimeout: 10s
enabled: false
interval: 1m
metricRelabelings: []
networkCosts:
additionalLabels: {}
enabled: false
interval: 1m
metricRelabelings: []
relabelings: []
scrapeTimeout: 10s
relabelings: []
scrapeTimeout: 10s
sigV4Proxy:
extraEnv: null
host: aps-workspaces.us-west-2.amazonaws.com
image: 'public.ecr.aws/aws-observability/aws-sigv4-proxy:latest'
imagePullPolicy: IfNotPresent
name: aps
port: 8005
region: us-west-2
resources: {}
supportNFS: false
systemProxy:
enabled: false
httpProxyUrl: ''
httpsProxyUrl: ''
noProxy: ''
tolerations: []
topologySpreadConstraints: []
upgrade:
toV2: false
이제 route를 설정한 뒤 접속해 보자.


파드 사이징에 대한 추천까지 잘 표시되는 것을 볼 수 있다.
AWS에서는 Karpenter와 함께 25%까지 비용 절감이 되었다니, OpenShift에서의 활용 방안을 고민해야겠지만
보다 객관적인 지표를 제공하는 운영 최적화 도구로서 우선 사용해 봐야겠다.

728x90
'CloudNative > Observability & Analysis' 카테고리의 다른 글
| pinpoint plugin config (2) | 2024.11.24 |
|---|---|
| otel (0) | 2024.11.22 |
| springboot log pipeline: EFK 로그 전처리와 opensearch (0) | 2024.11.07 |
| springboot log pipeline: EFK setup (opensearch) (1) | 2024.10.28 |
| jib maven 으로 pinpoint agent 배포 (0) | 2024.10.18 |