1、创建自愈套餐
首先在套餐管理中创建自愈套餐:套餐类型选择 HTTP 回调,自定义套餐名称,并填入回调地址(可以是外部地址,但需保证自愈后台与回调地址之间的网络连通)。这里我使用的回调地址是自建的 SaaS 应用。
这里需要特别注意:故障自愈会把告警信息作为参数,以 POST 方式向回调地址发起请求,参数为 JSON 格式,完整的参数示例如下:
{
"alarm_def_id": 11,
"source_time": "2019-11-14T06:54:00",
"source_type": "ALERT",
"alarm_type": "cpu_7",
"ip": "192.168.1.73",
"origin_alarm": {
"_match_info": {
"cc_plat_id": 0,
"cc_idc_unit": "",
"category": "",
"topo_id": {},
"alarm_process": "",
"alarm_context": "",
"alarm_responsible": "",
"alarm_desc": "CPU总使用率: 当前指标值(14.69%) >= (2.0%)",
"cc_biz_id": 3,
"cc_app_module": [
"tomcat_监控",
"服务"
],
"topo_names": {},
"cc_equipment": "",
"alarm_type": [
"cpu_7"
],
"cc_set_category": "",
"cc_company_id": 0,
"alarm_attr_id": 19,
"alarm_time": "2019-11-14 06:54:00",
"cc_link_net_device": "",
"source_type": "ALERT",
"host": "192.168.1.73",
"cc_topo_set": [
"系统"
],
"alarm_def_id": 11,
"cc_set_service_state": "",
"solution": 41,
"alarm_port": "",
"cc_set_envi_type": "",
"source_id": "97ce9a40c8fc88ce3aade709d43b8107"
},
"anomaly_id": "97ce9a40c8fc88ce3aade709d43b8107",
"monitor_field_alias": "usage",
"monitor_source_info": {
"count_freq": 60,
"dimensions": [
"ip",
"plat_id",
"company_id"
],
"fields": [
{
"default_value": null,
"processor": null,
"description": "",
"origins": null,
"tags": null,
"created_at": "2019-07-29 16:41:38",
"updated_at": "2019-07-29 16:41:38",
"is_dimension": false,
"filter": null,
"field": "timestamp",
"field_index": 11,
"processor_args": null,
"id": 724,
"type": "timestamp",
"result_table_id": "3_system_cpu_summary",
"unit": null
},
{
"default_value": null,
"processor": null,
"description": "",
"origins": null,
"tags": null,
"created_at": "2019-07-29 16:41:38",
"updated_at": "2019-07-29 16:41:38",
"is_dimension": false,
"filter": null,
"field": "usage",
"field_index": 3,
"processor_args": null,
"id": 716,
"type": "double",
"result_table_id": "3_system_cpu_summary",
"unit": null
},
{
"default_value": null,
"processor": null,
"description": "",
"origins": null,
"tags": null,
"created_at": "2019-07-29 16:41:38",
"updated_at": "2019-07-29 16:41:38",
"is_dimension": true,
"filter": null,
"field": "ip",
"field_index": 0,
"processor_args": null,
"id": 713,
"type": "string",
"result_table_id": "3_system_cpu_summary",
"unit": null
},
{
"default_value": null,
"processor": null,
"description": "",
"origins": null,
"tags": null,
"created_at": "2019-07-29 16:41:38",
"updated_at": "2019-07-29 16:41:38",
"is_dimension": true,
"filter": null,
"field": "plat_id",
"field_index": 9,
"processor_args": null,
"id": 722,
"type": "int",
"result_table_id": "3_system_cpu_summary",
"unit": null
},
{
"default_value": null,
"processor": null,
"description": "",
"origins": null,
"tags": null,
"created_at": "2019-07-29 16:41:38",
"updated_at": "2019-07-29 16:41:38",
"is_dimension": true,
"filter": null,
"field": "company_id",
"field_index": 10,
"processor_args": null,
"id": 723,
"type": "int",
"result_table_id": "3_system_cpu_summary",
"unit": null
}
],
"monitor_field": "usage",
"unit_conversion": 1,
"where_sql": "",
"monitor_result_table_id": "3_system_cpu_summary",
"values": [
"timestamp",
"usage"
],
"aggregator": "max",
"biz_id": "3",
"id": "3_system_cpu_summary",
"unit": "%",
"description": ""
},
"source_alarm_inst_id": 3471204,
"monitor_target": "7",
"value": 14.69,
"monitor_tag": "",
"monitor_name": "CPU总使用率",
"unit": "%",
"extra_info": {
"check_value": 14.69
},
"conversion": 1,
"monitor_level": 2,
"dimensions": {
"cc_app_module": [
56,
75
],
"ip": "192.168.1.73",
"company_id": "0",
"cc_topo_module": [
56,
75
],
"plat_id": "0",
"cc_topo_set": [
11
],
"bk_topo_node": [
"biz|3",
"module|56",
"module|75",
"set|11"
]
},
"scenario": "performance",
"extend_message": "",
"anomaly_message": "当前指标值(14.69%) >= (2.0%)",
"record_id": "307bf1dd752eb7475ce2b11b35019c99",
"src_type": "BKMONITOR",
"monitor_source_name": "CPU总使用率",
"bk_match_info": {
"cc_plat_id": 0,
"converge": [
19
],
"monitor_target": "7",
"monitor_name": "CPU总使用率",
"category": "performance",
"monitor_level": 2,
"event_time": "2019-11-14 06:55:51",
"collect": 1,
"alarm_desc": "当前指标值(14.69%) >= (2.0%)",
"monitor_source_name": "CPU总使用率",
"alarm_dimension": {
"business": [
"73服务"
],
"ip": "192.168.1.73",
"company_id": "0",
"集群": [
"系统"
],
"模块": [
"tomcat_监控",
"服务"
],
"biz_id": 3,
"plat_id": "0"
},
"cc_app_module": [
56,
75
],
"notice": 19,
"alarm_type": [
"cpu"
],
"cc_company_id": 0,
"alarm_attr_id": 19,
"alarm_time": "2019-11-14 06:54:00",
"source_type": "JUNGLE_ALERT",
"host": "192.168.1.73",
"biz_id": 3,
"alarm_source_id": 19,
"cc_topo_set": [
11
],
"solution": 0,
"cc_biz_id": 3,
"monitor_indicator": "usage",
"source_id": "97ce9a40c8fc88ce3aade709d43b8107",
"match_dimension": {
"cc_app_module": [
56,
75
],
"ip": "192.168.1.73",
"company_id": "0",
"cc_topo_module": [
56,
75
],
"plat_id": "0",
"cc_topo_set": [
11
],
"bk_topo_node": [
"biz|3",
"module|56",
"module|75",
"set|11"
]
}
},
"count_freq": 60,
"monitor_source_type": "TSDATA",
"anomaly_level": "",
"biz_id": 3,
"monitor_source_id": "3_system_cpu_summary",
"condition": [
[
{
"field": "ip",
"method": "eq",
"value": [
{
"ip": "192.168.1.73",
"bk_cloud_id": 0
}
]
}
]
],
"alarm_def_id": 19,
"dimensions_alias": {
"business": [
"73服务"
],
"ip": "192.168.1.73",
"company_id": "0",
"集群": [
"系统"
],
"模块": [
"tomcat_监控",
"服务"
],
"biz_id": 3,
"plat_id": "0"
},
"monitor_id": 19,
"monitor_field": "usage",
"strategy_contexts": [
{
"strategy_name": "ThresholdStrategy",
"anomaly_message": "当前指标值(14.69%) >= (2.0%)",
"strategy_option": {
"threshold": 2,
"message": "当前指标值(${metric|value}${metric|unit}) ${method} (${threshold}${metric|unit})",
"method": "gte"
}
}
],
"monitor_desc": "CPU总使用率",
"values": {
"usage": 14.69,
"timestamp": 1573714440
},
"monitor_type": "cpu",
"dt_event_time": 1573714440,
"anomaly_time": "2019-11-14 06:55:51"
},
"cc_biz_id": 3
}
这样,我们就可以在回调接口中从上述 JSON 里取出所需的参数(例如 ip、alarm_type、cc_biz_id 等)。
2、接入自愈
选择告警类型,并选择上面创建的自愈套餐。
3、自愈详情
在自愈详情中查看回调执行结果。(建议在回调接口中尽量写清异常处理的返回信息,方便排查问题。)
Comments (0)