#encoding=utf-8 import json import codecs,os import time,datetime import traceback from datetime import datetime, timedelta import calendar import codecs from esUtil import EsUtil from file_helper import write_large_file,get_file_content,TRACE_PATH from dashboard_data_conversion import find_region_by_code,jobnum_region_dict from uebaMetricsAnalysis.utils.ext_logging import logger,logger_cron,get_clean_file_path from collections import defaultdict from ext_logging import logger_trace size = 9999#根据实际情况调整 DATA_TYPE = { "IP": 1, "ACCOUNT": 2, "INTERFACE": 3, "MENU": 4, } ## IP维度 def get_ip_group_data(index,startTime,endTime): query_body={ "size": 0, "query": { "bool": { "filter": [ {"range": {"timestamp": {"gt": startTime,"lte": endTime}}}, {"bool": { "must_not": [ {"match_phrase": {"account": ""}} ] }} ] } }, "aggs": { "composite_buckets": { "composite": { "size": size, "sources": [ {"sip": { "terms": {"field": "sip"} }}, {"trojan_type": { "terms": { "field": "trojan_type"}}} ] } } } } after_key = None es_util_instance = EsUtil() datas=[] while True: if after_key: query_body["aggs"]["composite_buckets"]["composite"]["after"] = after_key response = es_util_instance.search(index,query_body) for bucket in response["aggregations"]["composite_buckets"]["buckets"]: data = { "data_type": DATA_TYPE.get("IP"), "count": bucket['doc_count'], "jobnum": bucket['key']['trojan_type'] , "ip":bucket['key']['sip'] } datas.append(data) after_key = bucket["key"] if not response["aggregations"]["composite_buckets"].get("after_key"): break after_key = response["aggregations"]["composite_buckets"]["after_key"] return datas ## 账号维度 def get_account_group_data(index,startTime,endTime): query_body={ "size": 0, "query": { "bool": { "filter": [ {"range": {"timestamp": {"gt": startTime,"lte": endTime}}}, {"bool": { "must_not": [ {"match_phrase": {"account": ""}} ] }} ] } }, "aggs": { "composite_buckets": { "composite": { "size": size, "sources": [ {"account": { "terms": {"field": "account"} }}, {"trojan_type": { "terms": { "field": "trojan_type"}}} ] } } } } after_key = None es_util_instance = EsUtil() datas=[] while True: if after_key: query_body["aggs"]["composite_buckets"]["composite"]["after"] = after_key response = es_util_instance.search(index,query_body) for bucket in response["aggregations"]["composite_buckets"]["buckets"]: data = { "data_type": DATA_TYPE.get("ACCOUNT"), "account": bucket['key']['account'], "count": bucket['doc_count'], "jobnum": bucket['key']['trojan_type'] } datas.append(data) after_key = bucket["key"] if not response["aggregations"]["composite_buckets"].get("after_key"): break after_key = response["aggregations"]["composite_buckets"]["after_key"] return datas ## 接口维度 def get_interface_group_data(index,startTime,endTime): query_body={ "size": 0, "query": { "bool": { "filter": [ {"range": {"timestamp": {"gt": startTime,"lte": endTime}}}, {"bool": { "must_not": [ {"match_phrase": {"account": ""}} ] }} ] } }, "aggs": { "composite_buckets": { "composite": { "size": size, "sources": [ {"interface": { "terms": {"field": "interface"} }}, {"sip": { "terms": { "field": "sip"}}}, {"account": { "terms": { "field": "account"}}}, {"trojan_type": { "terms": { "field": "trojan_type"}}} ] } } } } after_key = None es_util_instance = EsUtil() datas=[] while True: if after_key: query_body["aggs"]["composite_buckets"]["composite"]["after"] = after_key response = es_util_instance.search(index,query_body) for bucket in response["aggregations"]["composite_buckets"]["buckets"]: data = { "data_type": DATA_TYPE.get("INTERFACE"), "account": bucket['key']['account'], "count": bucket['doc_count'], "jobnum": bucket['key']['trojan_type'] , "interface": bucket['key']['interface'] , "ip":bucket['key']['sip'] } datas.append(data) after_key = bucket["key"] if not response["aggregations"]["composite_buckets"].get("after_key"): break after_key = response["aggregations"]["composite_buckets"]["after_key"] return datas ## 菜单维度 def get_menu_group_data(index,startTime,endTime): query_body={ "size": 0, "query": { "bool": { "filter": [ {"range": {"timestamp": {"gt": startTime,"lte": endTime}}}, {"bool": { "must_not": [ {"match_phrase": {"account": ""}} ] }} ] } }, "aggs": { "composite_buckets": { "composite": { "size": size, "sources": [ {"worm_family": { "terms": {"field": "worm_family"} }}, {"sip": { "terms": { "field": "sip"}}}, {"account": { "terms": { "field": "account"}}}, {"trojan_type": { "terms": { "field": "trojan_type"}}} ] } } } } after_key = None es_util_instance = EsUtil() datas=[] while True: if after_key: query_body["aggs"]["composite_buckets"]["composite"]["after"] = after_key response = es_util_instance.search(index,query_body) for bucket in response["aggregations"]["composite_buckets"]["buckets"]: data = { "data_type": DATA_TYPE.get("MENU"), "account": bucket['key']['account'], "count": bucket['doc_count'], "jobnum": bucket['key']['trojan_type'] , "ip":bucket['key']['sip'], "menu": bucket['key']['worm_family'], } datas.append(data) after_key = bucket["key"] if not response["aggregations"]["composite_buckets"].get("after_key"): break after_key = response["aggregations"]["composite_buckets"]["after_key"] return datas def datetime_to_timestamp(dt): dtstr=dt.strftime("%Y-%m-%d %H:%M:%S") return int(time.mktime(time.strptime(dtstr,"%Y-%m-%d %H:%M:%S"))*1000) def clean_data(read_index,start,end,jobid): data_ip = get_ip_group_data(read_index,start,end) data_account = get_account_group_data(read_index,start,end) data_interface = get_interface_group_data(read_index,start,end) data_menu = get_menu_group_data(read_index,start,end) if len(data_ip) == 0 and len(data_account) == 0 and len(data_interface) == 0 and len(data_menu) == 0: logger_cron.info("JOB:"+jobid+",es中未获取到数据,无需做数据合并") return logger_cron.info("JOB:"+jobid+",ip维度获取到 "+str(len(data_ip))+" 条数据") logger_cron.info("JOB:"+jobid+",账号维度获取到 "+str(len(data_account))+" 条数据") logger_cron.info("JOB:"+jobid+",接口维度获取到 "+str(len(data_interface))+" 条数据") logger_cron.info("JOB:"+jobid+",菜单维度获取到 "+str(len(data_menu))+" 条数据") #todo 读取上一次5分钟的文件,与这5分钟的文件合并 #合并完成后 写文件 group_and_write_to_file(data_ip, data_account, data_interface, data_menu, start,jobid) def group_and_write_to_file(data_ip, data_account, data_interface, data_menu, start,jobid): # 获取当前工作目录 base_path = get_clean_file_path() logger_cron.info("JOB: "+jobid+",写入文件base路径"+base_path) date_time = convert_utc_to_local_time(start) #临时文件 临时文件格式:20240720-1630_tmp.json tmp_file_name = time.strftime("%Y%m%d-%H%M_tmp.json", date_time) tmp_file_path = os.path.join(base_path,tmp_file_name) #正式文件 正式文件格式:20240720-1630.json file_name = time.strftime("%Y%m%d-%H%M.json", date_time) file_path = os.path.join(base_path,file_name) logger_cron.info("JOB:"+jobid+", tmpfilepath"+tmp_file_path) #(datatype,menu,ip,account,jobnum,interface) count records = {} for item in data_ip: menu = remove_commas(item.get('menu', '')) ip = item.get('ip', '0.0.0.0') account = remove_commas(item.get('account', '')) jobnum = item.get('jobnum', '') count = item.get('count', 0) datatype = DATA_TYPE.get("IP",1) interface = remove_commas(item.get('interface', '')) company = find_region_by_code(jobnum,jobnum_region_dict) records[",".join([str(datatype), menu, ip,account,jobnum,interface,company])]=count #日志追踪 if not os.path.exists(TRACE_PATH): write_large_file(TRACE_PATH, ",".join([str(datatype), menu, ip,account,jobnum,interface,company])) for item in data_account: menu = remove_commas(item.get('menu', '')) ip = item.get('ip', '0.0.0.0') account = remove_commas(item.get('account', '')) jobnum = item.get('jobnum', '') count = item.get('count', 0) datatype = DATA_TYPE.get("ACCOUNT",2) interface = remove_commas(item.get('interface', '')) company = find_region_by_code(jobnum,jobnum_region_dict) records[",".join([str(datatype), menu, ip,account,jobnum,interface,company])]=count for item in data_interface: menu = remove_commas(item.get('menu', '')) ip = item.get('ip', '0.0.0.0') account = remove_commas(item.get('account', '')) jobnum = item.get('jobnum', '') count = item.get('count', 0) datatype = DATA_TYPE.get("INTERFACE",3) interface = remove_commas(item.get('interface', '')) company = find_region_by_code(jobnum,jobnum_region_dict) records[",".join([str(datatype), menu, ip,account,jobnum,interface,company])]=count for item in data_menu: menu = remove_commas(item.get('menu', '')) ip = item.get('ip', '0.0.0.0') account = remove_commas(item.get('account', '')) jobnum = item.get('jobnum', '') count = item.get('count', 0) datatype = DATA_TYPE.get("MENU",4) interface = remove_commas(item.get('interface', '')) company = find_region_by_code(jobnum,jobnum_region_dict) records[",".join([str(datatype), menu, ip,account,jobnum,interface,company])]=count json_data = json.dumps(records) ########问题排查################# key=get_file_content() if key in records: logger_trace.info("baseclean:"+jobid+file_path+":"+str(records[key])) #写入文件 logger_cron.info("JOB: "+jobid+",准备写入文件") write_large_file(tmp_file_path,json_data) #重命名文件 os.rename(tmp_file_path, file_path) logger_cron.info("JOB: "+jobid+",写入文件完成") #原始数据去掉逗号 def remove_commas(record): return ''.join(c for c in record if c != ',') def group_and_sum(data, by_fields="ip,jobnum"): # 将by_fields转换为列表 by_fields_list = by_fields.split(',') aggregated_data = {} # 遍历数据,进行聚合 for record in data: # 确保所有字段都存在于记录中 if all(field in record for field in by_fields_list): # 构建聚合键 key = tuple(record[field] for field in by_fields_list) # 如果键不存在,则初始化计数器 if key not in aggregated_data: aggregated_data[key] = 0 # 累加count值 aggregated_data[key] += record['count'] # 将结果转换为期望的输出格式 output = [ dict({field: value for (field, value) in zip(by_fields_list, key)}, count=count) for key, count in aggregated_data.items() ] return output def convert_utc_to_local_time(timestamp): # 将毫秒时间戳转换为秒 seconds_since_epoch = long(timestamp) / 1000.0 # 使用 gmtime() 得到 UTC 时间结构体 time_struct_utc = time.gmtime(seconds_since_epoch) # 将 UTC 时间转换为北京时间(东八区),即加 8 小时 time_struct_beijing = time.mktime(time_struct_utc) + (8 * 60 * 60) # 将时间戳转换回 struct_time time_struct_beijing = time.localtime(time_struct_beijing) return time_struct_beijing #入口 def entry(start,end,jobid): base_index ="bsa_traffic*" es_util_instance = EsUtil() start = datetime_to_timestamp(start) end = datetime_to_timestamp(end) logger_cron.info("JOB:"+jobid+",start为"+str(start)) logger_cron.info("JOB:"+jobid+",end为"+str(end)) res=es_util_instance.get_available_index_name(start,end,base_index) logger_cron.info("JOB:"+jobid+",index为"+json.dumps(res)) if len(res)==0: return index =",".join(res) clean_data(index,start,end,jobid) # start = '2024-07-18 15:20:31' # end = '2024-07-18 15:25:31' # date_format = "%Y-%m-%d %H:%M:%S" # date_str = datetime.strptime(start, date_format) # end_str = datetime.strptime(end, date_format) # # # 将 datetime 对象转换为秒级时间戳 # # timestamp_seconds = time.mktime(dt.timetuple()) # # # 获取微秒数 # # microseconds = dt.microsecond # # # 转换为毫秒级时间戳 # # timestamp_milliseconds = int(timestamp_seconds * 1000 + microseconds / 1000.0) # entry(date_str,end_str,"xxxxxxxxxxxxxxxxxxx")