hbyd_ueba/utils/file_helper.py

#!/usr/bin/python
#encoding=utf-8
# author: tangwy
import re,os,json
import codecs
from db2json import DBUtils
from datetime import datetime, timedelta
from ext_logging import logger_cron,get_clean_file_path

# Write a large payload to disk in slices (5M elements per slice by default)
def write_large_file(filename, data_list, chunk_size=1024*1024*5):
    """Write ``data_list`` to ``filename`` as UTF-8, one slice at a time.

    The file is opened in ``'w'`` mode, so any existing content is replaced.

    :param filename: path of the file to create or overwrite.
    :param data_list: sliceable payload; every slice is handed straight to
        ``write()``, so despite the name this presumably has to be a text
        string rather than a list -- TODO confirm against callers.
    :param chunk_size: number of elements of ``data_list`` per write call.
    """
    with codecs.open(filename, 'w', encoding='utf-8') as out:
        total = len(data_list)
        for start in range(0, total, chunk_size):
            stop = start + chunk_size
            out.write(data_list[start:stop])

# Read a large JSON file chunk by chunk and parse it
def read_large_json_file(filename, chunk_size=1024*1024*5):
    """Read a UTF-8 JSON file in chunks and return the parsed object.

    The whole document still ends up in memory before parsing; chunking only
    bounds the size of each individual ``read()`` call.

    :param filename: path of the JSON file to read.
    :param chunk_size: size passed to every ``read()`` call
        (defaults to 1024*1024*5).
    :return: the value produced by ``json.loads`` for the file content.
    :raises ValueError: if the file content is not valid JSON (an empty file
        included).
    """
    # Collect the pieces and join once instead of growing a string with
    # ``+=`` on every iteration, which can degrade to quadratic copying.
    chunks = []
    with codecs.open(filename, 'r', encoding='utf-8') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            chunks.append(chunk)

    return json.loads(''.join(chunks))

# Delete a file if it is present
def delete_frile(file_path):
    """Remove ``file_path`` when it exists; do nothing otherwise.

    :param file_path: path of the file to delete.
    """
    if not os.path.exists(file_path):
        return
    os.remove(file_path)
# Merge the content of several datasets and sum the counts of equal records
def merge_data(datasets):
    """Merge statistics datasets, summing ``count`` over identical records.

    Each dataset is a dict mapping a category name (``"ip"``, ``"account"``,
    ``"interface"`` or ``"menu"``) to an iterable of record dicts. Two
    records of the same category are identical when all of their fields
    other than ``'count'`` have the same names and values; their ``'count'``
    values are added together.

    :param datasets: iterable of dataset dicts as described above.
    :return: dict with the four category keys, each holding a list of merged
        record dicts (original fields plus the summed ``'count'``).
    :raises KeyError: if a dataset contains a category other than the four
        known ones, or a record has no ``'count'`` field.
    :raises TypeError: if a non-count field value is unhashable.
    """
    categories = ("ip", "account", "interface", "menu")

    # Per category: identity of a record -> running total of its count.
    totals = dict((category, {}) for category in categories)

    for dataset in datasets:
        for category, items in dataset.items():
            # Plain indexing on purpose: an unknown category raises KeyError.
            bucket = totals[category]
            for item in items:
                count = item['count']
                # Key on (field, value) pairs rather than on values alone, so
                # the field names travel with the key and neither field order
                # nor differing field sets can mislabel the rebuilt record.
                identity = frozenset(
                    (field, value)
                    for field, value in item.items()
                    if field != 'count'
                )
                if identity in bucket:
                    bucket[identity] += count
                else:
                    bucket[identity] = count

    # Turn every (identity, total) pair back into a record dict.
    aggregated_data = {}
    for category in categories:
        rows = []
        for identity, count in totals[category].items():
            row = dict(identity)
            row['count'] = count
            rows.append(row)
        aggregated_data[category] = rows

    return aggregated_data
'代码提交' 4 months ago			`#!/usr/bin/python`
			`#encoding=utf-8`
			`# author: tangwy`
			`import re,os,json`
			`import codecs`
			`from db2json import DBUtils`
			`from datetime import datetime, timedelta`
			`from ext_logging import logger_cron,get_clean_file_path`

			`#写入大文件5M`
			`def write_large_file(filename, data_list, chunk_size=1024*1024*5):`
			`with codecs.open(filename, 'w', encoding='utf-8') as f:`
			`for i in range(0, len(data_list), chunk_size):`
			`chunk = data_list[i:i + chunk_size]`
			`f.write(chunk)`

			`#读取大文件`
			`def read_large_json_file(filename, chunk_size=1024*1024*5): # 每次读取5MB的数据`
			`json_object = ''`
			`with codecs.open(filename, 'r', encoding='utf-8') as f:`
			`while True:`
			`chunk = f.read(chunk_size)`
			`if not chunk:`
			`break`
			`json_object += chunk`

			`data = json.loads(json_object)`
			`return data`

			`#删除文件`
			`def delete_frile(file_path):`
			`if os.path.exists(file_path):`
			`os.remove(file_path)`

			`#文件内容合并`
			`def merge_data(datasets):`
			`# 初始化一个空的字典来保存合并后的数据`
			`merged_data = {`
			`"ip": [],`
			`"account": [],`
			`"interface": [],`
			`"menu": []`
			`}`

			`# 遍历所有数据集`
			`for dataset in datasets:`
			`# 遍历数据集中的每个类别`
			`for category, items in dataset.items():`
			`# 将当前数据集的项目添加到合并数据的相应类别中`
			`merged_data[category].extend(items)`

			`# 定义一个字典来存储聚合后的数据`
			`aggregated_data = {`
			`"ip": [],`
			`"account": [],`
			`"interface": [],`
			`"menu": []`
			`}`

			`# 遍历所有类别`
			`for category in aggregated_data:`
			`# 创建一个字典来存储每个类别的聚合数据`
			`category_data = {}`

			`# 如果当前类别存在于merged_data中`
			`if category in merged_data:`
			`for item in merged_data[category]:`
			`# 确定非计数字段`
			`keys_to_use = [k for k in item if k != 'count']`
			`# 使用元组作为键，包含所有非计数字段`
			`key_tuple = tuple(item[k] for k in keys_to_use)`

			`if key_tuple not in category_data:`
			`category_data[key_tuple] = item['count']`
			`else:`
			`category_data[key_tuple] += item['count']`

			`# 将聚合后的数据转换回原始格式`
			`aggregated_data[category] = [`
			`dict(zip(keys_to_use, key_tuple) + [('count', count)])`
			`for key_tuple, count in category_data.items()`
			`]`

			`return aggregated_data`