云和安全管理服务专家新钛云服 金恩原创

一、CloudWatch服务安装
Amazon Linux 2系统安装Agent
#!/bin/bash
# Install the Amazon CloudWatch agent on Amazon Linux 2, write its
# configuration (log collection + host metrics) and start the service.
set -e  # stop at the first failing step instead of starting a half-configured agent

# -U installs or upgrades, so re-running the script does not fail when the
# package is already present.
sudo rpm -Uvh https://s3.amazonaws.com/amazoncloudwatch-agent/amazon_linux/amd64/latest/amazon-cloudwatch-agent.rpm

# Write (overwrite, not append) the agent configuration. Appending would
# concatenate two JSON documents on a second run and break the agent.
# The heredoc delimiter is quoted so the ${aws:...} placeholders are passed
# through literally for the agent to resolve.
sudo tee /opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json << 'EOF'
{
  "logs": {
    "logs_collected": {
      "files": {
        "collect_list": [
          {
            "file_path": "/logArchive/hcaextension/info*.log",
            "log_group_name": "RGC-Prod-3in1oven",
            "log_stream_name": "info.logs"
          },
          {
            "file_path": "/logArchive/hcaextension/http*.log",
            "log_group_name": "RGC-Prod-3in1oven",
            "log_stream_name": "http.logs"
          }
        ]
      }
    }
  },
  "metrics": {
    "aggregation_dimensions": [["InstanceId"]],
    "append_dimensions": {
      "AutoScalingGroupName": "${aws:AutoScalingGroupName}",
      "ImageId": "${aws:ImageId}",
      "InstanceId": "${aws:InstanceId}",
      "InstanceType": "${aws:InstanceType}"
    },
    "metrics_collected": {
      "cpu": {
        "measurement": ["cpu_usage_idle", "cpu_usage_iowait", "cpu_usage_user", "cpu_usage_system"],
        "metrics_collection_interval": 180,
        "resources": ["*"],
        "totalcpu": false
      },
      "disk": {
        "measurement": ["used_percent"],
        "metrics_collection_interval": 180,
        "resources": ["/"]
      },
      "diskio": {
        "measurement": ["io_time", "write_bytes", "read_bytes", "writes", "reads"],
        "metrics_collection_interval": 180,
        "resources": ["/"]
      },
      "mem": {
        "measurement": ["mem_used_percent"],
        "metrics_collection_interval": 180
      },
      "netstat": {
        "measurement": ["tcp_established", "tcp_time_wait"],
        "metrics_collection_interval": 180
      },
      "statsd": {
        "metrics_aggregation_interval": 60,
        "metrics_collection_interval": 180,
        "service_address": ":8125"
      },
      "swap": {
        "measurement": ["swap_used_percent"],
        "metrics_collection_interval": 180
      }
    }
  }
}
EOF

# Load the configuration into the agent and (re)start it.
sudo /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -s -c file:/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json

sudo systemctl restart amazon-cloudwatch-agent.service
sudo systemctl enable amazon-cloudwatch-agent.service
二、AWS-CLI批量下发监控
前提条件:本机安装awscli工具
需要修改的是区域信息、ip_list(实例ID列表)和sns_arn信息
通过脚本自动在CloudWatch上添加监控配置EC2监控
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Batch-create CloudWatch alarms for a list of EC2 instances via the AWS CLI.

Before running, adjust the CLI path and region list in ``Contants`` and the
SNS topic ARN / instance-id list in the ``__main__`` section at the bottom.
"""
import json
import os
import subprocess

# 1. Configure the CLI path and the regions to process.
Contants = {
    "AWSCLI": '"C:\\Program Files\\Amazon\\AWSCLI\\bin\\aws.exe" --output json',
    "AWSREGION": ['eu-central-1'],  # Frankfurt
}


class CreateDict(dict):
    """Dictionary that creates a nested ``CreateDict`` when a missing key is read."""

    def __getitem__(self, item):
        try:
            return dict.__getitem__(self, item)
        except KeyError:
            # Store the new child so that chained assignments
            # (d['a']['b'] = 1) persist in the parent.
            value = self[item] = type(self)()
            return value


#########################################################################################################
# Alarm definitions
def _build_alarm_command(name, action, metric, namespace, threshold,
                         dimensions, treat_missing_data, unit=None):
    """Return the ``aws cloudwatch put-metric-alarm`` command line for one alarm.

    Every alarm uses the Average statistic over 60-second periods and fires
    when 3 out of 3 evaluated datapoints are >= ``threshold``. The same
    ``action`` ARN is used for both the ALARM and the OK transition.

    ``dimensions`` is a sequence of ``(name, value)`` pairs.

    NOTE: the result is a shell command string built from the instance name;
    names containing double quotes or shell metacharacters are not escaped.
    """
    print("#####开始配置 %s#####" % metric)
    options = [
        ('--alarm-name', '"AWS_EC2_{0}_{1}"'.format(name, metric)),
        ('--alarm-description', '"aws ec2 {0}"'.format(metric)),
        ('--metric-name', metric),
        ('--namespace', namespace),
        ('--statistic', 'Average'),
        ('--period', '60'),
        ('--threshold', str(threshold)),
        ('--evaluation-periods', '3'),
        ('--datapoints-to-alarm', '3'),
        ('--comparison-operator', 'GreaterThanOrEqualToThreshold'),
        ('--treat-missing-data', treat_missing_data),
        ('--alarm-actions', '"{0}"'.format(action)),
        ('--ok-actions', '"{0}"'.format(action)),
    ]
    if unit is not None:
        options.append(('--unit', unit))
    # --dimensions must appear exactly once; all pairs go into that one option.
    options.append(('--dimensions', ' '.join(
        '"Name={0},Value={1}"'.format(key, value) for key, value in dimensions)))
    return '{0} cloudwatch put-metric-alarm {1}'.format(
        Contants['AWSCLI'],
        ' '.join('{0} {1}'.format(flag, value) for flag, value in options))


def getCPUUtilizationComm(name, action, instance_id):
    """Command for a CPUUtilization alarm: average >= 80% for 3 checks in 3 minutes."""
    return _build_alarm_command(
        name, action, 'CPUUtilization', 'AWS/EC2', 80,
        [('InstanceId', instance_id)], 'notBreaching', unit='Percent')


def getmem_used_percentComm(name, action, instance_id, instancetype, imageid):
    """Command for a mem_used_percent (CWAgent) alarm: average >= 80% for 3 checks in 3 minutes."""
    return _build_alarm_command(
        name, action, 'mem_used_percent', 'CWAgent', 80,
        [('InstanceId', instance_id),
         ('ImageId', imageid),
         ('InstanceType', instancetype)],
        'missing')


def getdisk_used_percentComm(name, action, instance_id, instancetype, imageid,
                             device='nvme0n1p1', fstype='ext4', path='/'):
    """Command for a disk_used_percent (CWAgent) alarm: average >= 80% for 3 checks in 3 minutes.

    ``device``, ``fstype`` and ``path`` cannot be discovered by this script;
    look up the values reported for the instance in the CloudWatch console
    and pass them in when they differ from the defaults.
    """
    return _build_alarm_command(
        name, action, 'disk_used_percent', 'CWAgent', 80,
        [('InstanceId', instance_id),
         ('ImageId', imageid),
         ('InstanceType', instancetype),
         ('device', device),
         ('fstype', fstype),
         ('path', path)],
        'missing')


def getNetworkInComm(name, action, instance_id):
    """Command for a NetworkIn alarm: average >= 5,000,000 for 3 checks in 3 minutes."""
    return _build_alarm_command(
        name, action, 'NetworkIn', 'AWS/EC2', 5000000,
        [('InstanceId', instance_id)], 'notBreaching')


def getNetworkOutComm(name, action, instance_id):
    """Command for a NetworkOut alarm: average >= 5,000,000 for 3 checks in 3 minutes."""
    return _build_alarm_command(
        name, action, 'NetworkOut', 'AWS/EC2', 5000000,
        [('InstanceId', instance_id)], 'notBreaching')


def execCommand(comm):
    """Run ``comm`` through the shell, echoing the command and its exit status.

    Returns the combined output of the command, or ``None`` when running it
    raised an exception (the exception is printed, not propagated).
    """
    try:
        print(comm)
        (status, stdout) = subprocess.getstatusoutput(comm)
        print(status)
        return stdout
    except Exception as e:
        print(e)
        return None


def _instance_info(instance):
    """Map one ``describe-instances`` instance record to the dict used by ``add_alert``.

    The display name is the value of the ``Name`` tag; instances without a
    (non-empty) ``Name`` tag fall back to their instance id.
    """
    info = {
        'id': instance['InstanceId'],
        'imageid': instance['ImageId'],
        'instancetype': instance['InstanceType'],
        'name': instance['InstanceId'],
    }
    # Untagged instances have no 'Tags' key at all.
    for tag in instance.get('Tags', []):
        if tag['Key'] == 'Name' and tag['Value']:
            info['name'] = tag['Value']
            break
    return info


def getAll(get_server_id_list):
    """Describe every EC2 instance in the region and keep those listed in ``get_server_id_list``.

    Returns a list of dicts with the keys ``id``, ``imageid``,
    ``instancetype`` and ``name``.
    """
    comm1 = "%s ec2 describe-instances" % Contants['AWSCLI']
    all_data = json.loads(execCommand(comm1))
    wanted = set(get_server_id_list)
    instance_list_modify = []
    for reservation in all_data['Reservations']:
        for instance in reservation['Instances']:
            # One fresh dict per instance; a reservation may hold several.
            info = _instance_info(instance)
            print(info)
            if info['id'] in wanted:
                instance_list_modify.append(info)
    print(instance_list_modify)
    return instance_list_modify


def add_alert(data, action):
    """Create the alarms for every instance dict in ``data``, notifying the ``action`` ARN."""
    for i in data:
        instance_id = i['id']
        name = i['name']
        imageid = i['imageid']
        instancetype = i['instancetype']
        print(instance_id, name, imageid, instancetype)
        execCommand(getCPUUtilizationComm(name, action, instance_id))
        # Uncomment the lines below to create the corresponding alarms as well.
        # execCommand(getNetworkInComm(name, action, instance_id))
        # execCommand(getNetworkOutComm(name, action, instance_id))
        # execCommand(getmem_used_percentComm(name, action, instance_id, instancetype, imageid))
        # execCommand(getdisk_used_percentComm(name, action, instance_id, instancetype, imageid))


def get_server_info(instance_list):
    """Look up each instance id in ``instance_list`` and return its alarm-relevant details.

    Returns a list of dicts with the keys ``id``, ``imageid``,
    ``instancetype`` and ``name`` (``Name`` tag, or the instance id when the
    instance has no ``Name`` tag).
    """
    server_info = []
    for i in instance_list:
        # Use the configured CLI (path, output format and --region) so the
        # lookup targets the same region the alarms are created in.
        cmd = "{0} ec2 describe-instances --instance-ids {1}".format(Contants['AWSCLI'], i)
        print(cmd)
        with os.popen(cmd) as pipe:
            data = pipe.read()
        json_str = json.loads(data)
        for reservation in json_str["Reservations"]:
            for instance in reservation["Instances"]:
                server_dict = _instance_info(instance)
                if i == server_dict["id"]:
                    print(server_dict)
                    server_info.append(server_dict)
    return server_info


if __name__ == '__main__':
    # 2. Configure the SNS topic ARN and the ids of the instances to monitor.
    sns_arn = "arn:aws:sns:eu-central-1:643xxxxx:xxxx-CloudWatch-Lambda-DingTalk"
    ip_list = ["i-010bxxxx", "i-00xxxxx"]
    cli = Contants['AWSCLI']
    for i in Contants['AWSREGION']:
        print('[Region] ', i)
        Contants['AWSCLI'] = cli + ' --region ' + i
        add_alert(get_server_info(ip_list), sns_arn)
三、Amazon SNS创建主题
创建SNS主题,并关联Lambda钉钉通知函数
四、Lambda钉钉函数通知脚本
上传如下脚本,通过CloudWatch调试EC2设定的规则来触发告警测试
# -*- coding: utf-8 -*-
# python 3.8
# Creation time: 2021/11/18
"""AWS Lambda handler that forwards CloudWatch alarm SNS notifications to a DingTalk robot."""
import base64
import datetime
import hashlib
import hmac
import json
import os
import time
import urllib.parse

import requests


def _sign(timestamp, secret):
    """Return the URL-encoded HMAC-SHA256 signature DingTalk expects for ``timestamp``."""
    string_to_sign = '{}\n{}'.format(timestamp, secret)
    hmac_code = hmac.new(secret.encode('utf-8'),
                         string_to_sign.encode('utf-8'),
                         digestmod=hashlib.sha256).digest()
    return urllib.parse.quote_plus(base64.b64encode(hmac_code))


def _instance_id(dimensions):
    """Pick the instance id out of the alarm's dimension list.

    Prefers the dimension named ``InstanceId``. When none is present, falls
    back to the first dimension value that is not a filesystem path (disk
    alarms may list the ``path`` dimension first). Returns ``''`` when the
    list has no usable value.
    """
    for dimension in dimensions:
        if dimension.get('name') == 'InstanceId':
            return dimension.get('value', '')
    for dimension in dimensions:
        value = dimension.get('value', '')
        if not value.startswith('/'):
            return value
    return ''


def lambda_handler(event, context):
    """Send the CloudWatch alarm carried by the first SNS record of ``event`` to DingTalk.

    Returns the DingTalk API response body decoded as text.
    """
    headers = {'Content-Type': 'application/json;charset=utf-8'}
    # Credentials may be supplied through Lambda environment variables; the
    # literals are kept as fallbacks for the original in-code configuration.
    token = os.environ.get('DINGTALK_TOKEN', 'ca5533c8cb976c21')
    secret = os.environ.get('DINGTALK_SECRET', 'SEC8d1a31ec5e8e91')
    timestamp = str(round(time.time() * 1000))
    sign = _sign(timestamp, secret)

    # get url
    api_url = "https://oapi.dingtalk.com/robot/send?access_token={}&timestamp={}&sign={}".format(
        token, timestamp, sign)

    # msg setting
    message = event['Records'][0]['Sns']
    Subject = message.get('Subject') or ''
    sns_message = json.loads(message['Message'])
    # Report the time in UTC+8 regardless of the runtime's local timezone.
    utc8 = datetime.timezone(datetime.timedelta(hours=8))
    current_time = datetime.datetime.now(utc8).strftime('%Y-%m-%d %H:%M:%S')

    # NOTE(review): the original title literals were empty (presumably lost
    # when the script was published); these are placeholders - adjust as needed.
    if "ALARM" in Subject:
        title = 'AWS告警'
    elif "OK" in Subject:
        title = 'AWS告警恢复'
    else:
        title = 'AWS通知'

    _value = _instance_id(sns_message['Trigger']['Dimensions'])
    content = "### {title}".format(title=title) + \
        "\n> #### **时间**: " + current_time + \
        "\n> #### **状态**: " + sns_message['OldStateValue'] + " => " + sns_message['NewStateValue'] + \
        "\n> #### **告警名称**: " + sns_message['AlarmName'] + \
        "\n> #### **账户ID**: " + sns_message['AWSAccountId'] + \
        "\n> #### **AWS区域**: " + sns_message['Region'] + \
        "\n> #### **描述**: " + (sns_message['AlarmDescription'] or '') + \
        "\n> #### **产品资源**: " + sns_message['Trigger']['Namespace'] + \
        "\n> #### **实例ID**: " + _value + \
        "\n> #### **指标名称**: " + sns_message['Trigger']['MetricName'] + \
        "\n> #### **报警详情**: " + sns_message['NewStateReason']

    msg = {
        "msgtype": "markdown",
        "markdown": {"title": title, "text": content},
        "at": {"isAtAll": True},
    }

    # request
    request = requests.post(url=api_url, data=json.dumps(msg), headers=headers,
                            timeout=10).content.decode("utf8")
    return request
AWS子账户权限调试工具
https://policysim.aws.amazon.com/