You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
hbyd_ueba/utils/file_helper.py

124 lines
3.8 KiB

3 months ago
#!/usr/bin/python
#encoding=utf-8
# author: tangwy
import re,os,json
3 months ago
import codecs
3 months ago
from db2json import DBUtils
from datetime import datetime, timedelta
from ext_logging import logger_cron,get_clean_file_path
TRACE_PATH = "/home/master/ISOP/apps/uebaMetricsAnalysis/logs/trace.cfg"
3 months ago
# Write a large payload to disk in chunks (default chunk: 20 MB).
def write_large_file(filename, data_list, chunk_size=1024*1024*20):
    """Write *data_list* to *filename* as UTF-8, in chunk_size slices.

    data_list may be a string (sliced into substrings, original behavior)
    or a list of strings (each slice is joined before writing — the old
    code raised TypeError for lists despite the parameter name).
    """
    with codecs.open(filename, 'w', encoding='utf-8') as f:
        for i in range(0, len(data_list), chunk_size):
            chunk = data_list[i:i + chunk_size]
            if isinstance(chunk, list):
                # list slice -> concatenate so f.write receives text
                chunk = ''.join(chunk)
            f.write(chunk)
# Check whether a file exceeds 500 MB.
def is_file_larger_than_500mb(file_path):
    """Return True when the file at *file_path* is larger than 500 MB."""
    size_in_mb = os.path.getsize(file_path) / (1024.0 * 1024)
    return size_in_mb > 500
3 months ago
# Read a large JSON file in chunks and parse it.
def read_large_json_file(filename, chunk_size=1024*1024*10):  # read 10 MB per iteration
    """Read *filename* (UTF-8) in chunk_size pieces and return json.loads of the whole text.

    Chunks are collected in a list and joined once — the previous
    ``json_object += chunk`` accumulation was quadratic on big files.
    """
    chunks = []
    with codecs.open(filename, 'r', encoding='utf-8') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            chunks.append(chunk)
    return json.loads(''.join(chunks))
3 months ago
# Remove a file if present (name typo "frile" kept — callers depend on it).
def delete_frile(file_path):
    """Delete *file_path* when it exists; silently do nothing otherwise."""
    if not os.path.exists(file_path):
        return
    os.remove(file_path)
# Merge several metric datasets and aggregate duplicate rows.
def merge_data(datasets):
    """Merge a list of dataset dicts and sum the 'count' of matching rows.

    Each dataset maps category ('ip' / 'account' / 'interface' / 'menu')
    to a list of row dicts carrying a numeric 'count' plus arbitrary key
    fields. Rows whose non-count field values are identical are collapsed
    into one row with the counts summed.

    Fixes vs. the previous version:
      * ``dict(zip(...) + [...])`` was Python-2-only (zip is an iterator
        on Py3); rows are now rebuilt with a plain dict + assignment.
      * the key list leaked out of the item loop, so every output row
        reused the *last* item's field names; key names are now stored
        per group.
      * an unrecognized category no longer raises KeyError — it is
        ignored (only the four known categories are aggregated anyway).
    """
    categories = ("ip", "account", "interface", "menu")
    merged_data = {c: [] for c in categories}
    for dataset in datasets:
        for category, items in dataset.items():
            if category in merged_data:  # skip unknown categories
                merged_data[category].extend(items)

    aggregated_data = {}
    for category in categories:
        # value-tuple -> [field-name tuple, running count]
        grouped = {}
        for item in merged_data[category]:
            keys = tuple(k for k in item if k != 'count')
            values = tuple(item[k] for k in keys)
            entry = grouped.get(values)
            if entry is None:
                grouped[values] = [keys, item['count']]
            else:
                entry[1] += item['count']
        rows = []
        for values, (keys, count) in grouped.items():
            row = dict(zip(keys, values))
            row['count'] = count
            rows.append(row)
        aggregated_data[category] = rows
    return aggregated_data
def merge_data_new(datasets):
    """Merge a list of flat dicts, summing values of duplicate keys.

    Values are strings holding integers; on collision the sum is stored
    back as a string. A key seen only once keeps its original value
    unchanged.

    Fix: ``d.iteritems()`` was Python-2-only — ``d.items()`` works on
    both Python 2 and 3.
    """
    result = {}
    for d in datasets:
        for key, value in d.items():
            if key in result:
                # key already present: accumulate as integers, keep string type
                result[key] = str(int(result[key]) + int(value))
            else:
                # first occurrence: take the value as-is
                result[key] = value
    return result
# Return the raw contents of the trace file without json.loads.
def get_file_content():
    """Read TRACE_PATH as UTF-8 text and return it unparsed.

    Returns an empty string when the file does not exist.
    """
    pieces = []
    if os.path.exists(TRACE_PATH):
        with codecs.open(TRACE_PATH, 'r', encoding='utf-8') as f:
            while True:
                block = f.read(1024 * 1024 * 1)  # 1 MB per read
                if not block:
                    break
                pieces.append(block)
    return ''.join(pieces)