You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
hbyd_ueba/utils/file_helper.py

124 lines
3.8 KiB

3 months ago
#!/usr/bin/python
#encoding=utf-8
# author: tangwy
import re,os,json
3 months ago
import codecs
3 months ago
from db2json import DBUtils
from datetime import datetime, timedelta
from ext_logging import logger_cron,get_clean_file_path
TRACE_PATH = "/home/master/ISOP/apps/uebaMetricsAnalysis/logs/trace.cfg"
3 months ago
# Write a large payload to disk in chunks (default chunk: 20 MB).
def write_large_file(filename, data_list, chunk_size=1024*1024*20):
    """Write *data_list* to *filename* as UTF-8, in chunk_size slices.

    data_list may be a string (sliced into substrings, original behavior)
    or a list of strings (each slice is joined before writing — the old
    code raised TypeError for lists despite the parameter name).
    """
    with codecs.open(filename, 'w', encoding='utf-8') as f:
        for i in range(0, len(data_list), chunk_size):
            chunk = data_list[i:i + chunk_size]
            if isinstance(chunk, list):
                # list slice -> concatenate so f.write receives text
                chunk = ''.join(chunk)
            f.write(chunk)
# Check whether a file exceeds 500 MB.
def is_file_larger_than_500mb(file_path):
    """Return True when the file at *file_path* is larger than 500 MB."""
    size_in_mb = os.path.getsize(file_path) / (1024.0 * 1024)
    return size_in_mb > 500
3 months ago
# Read a large JSON file in chunks and parse it.
def read_large_json_file(filename, chunk_size=1024*1024*10):  # read 10 MB per iteration
    """Read *filename* (UTF-8) in chunk_size pieces and return json.loads of the whole text.

    Chunks are collected in a list and joined once — the previous
    ``json_object += chunk`` accumulation was quadratic on big files.
    """
    chunks = []
    with codecs.open(filename, 'r', encoding='utf-8') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            chunks.append(chunk)
    return json.loads(''.join(chunks))
3 months ago
# Remove a file if present (name typo "frile" kept — callers depend on it).
def delete_frile(file_path):
    """Delete *file_path* when it exists; silently do nothing otherwise."""
    if not os.path.exists(file_path):
        return
    os.remove(file_path)
# Merge several metric datasets and aggregate duplicate rows.
def merge_data(datasets):
    """Merge a list of dataset dicts and sum the 'count' of matching rows.

    Each dataset maps category ('ip' / 'account' / 'interface' / 'menu')
    to a list of row dicts carrying a numeric 'count' plus arbitrary key
    fields. Rows whose non-count field values are identical are collapsed
    into one row with the counts summed.

    Fixes vs. the previous version:
      * ``dict(zip(...) + [...])`` was Python-2-only (zip is an iterator
        on Py3); rows are now rebuilt with a plain dict + assignment.
      * the key list leaked out of the item loop, so every output row
        reused the *last* item's field names; key names are now stored
        per group.
      * an unrecognized category no longer raises KeyError — it is
        ignored (only the four known categories are aggregated anyway).
    """
    categories = ("ip", "account", "interface", "menu")
    merged_data = {c: [] for c in categories}
    for dataset in datasets:
        for category, items in dataset.items():
            if category in merged_data:  # skip unknown categories
                merged_data[category].extend(items)

    aggregated_data = {}
    for category in categories:
        # value-tuple -> [field-name tuple, running count]
        grouped = {}
        for item in merged_data[category]:
            keys = tuple(k for k in item if k != 'count')
            values = tuple(item[k] for k in keys)
            entry = grouped.get(values)
            if entry is None:
                grouped[values] = [keys, item['count']]
            else:
                entry[1] += item['count']
        rows = []
        for values, (keys, count) in grouped.items():
            row = dict(zip(keys, values))
            row['count'] = count
            rows.append(row)
        aggregated_data[category] = rows
    return aggregated_data
def merge_data_new(datasets):
    """Merge a list of flat dicts, summing values of duplicate keys.

    Values are strings holding integers; on collision the sum is stored
    back as a string. A key seen only once keeps its original value
    unchanged.

    Fix: ``d.iteritems()`` was Python-2-only — ``d.items()`` works on
    both Python 2 and 3.
    """
    result = {}
    for d in datasets:
        for key, value in d.items():
            if key in result:
                # key already present: accumulate as integers, keep string type
                result[key] = str(int(result[key]) + int(value))
            else:
                # first occurrence: take the value as-is
                result[key] = value
    return result
# Return the raw contents of the trace file without json.loads.
def get_file_content():
    """Read TRACE_PATH as UTF-8 text and return it unparsed.

    Returns an empty string when the file does not exist.
    """
    pieces = []
    if os.path.exists(TRACE_PATH):
        with codecs.open(TRACE_PATH, 'r', encoding='utf-8') as f:
            while True:
                block = f.read(1024 * 1024 * 1)  # 1 MB per read
                if not block:
                    break
                pieces.append(block)
    return ''.join(pieces)