hbyd_ueba/utils/file_merge.py

#!/usr/bin/python
#encoding=utf-8
# author: tangwy
import re,os,json
import codecs
from db2json import DBUtils
from datetime import datetime, timedelta
from ext_logging import logger_cron,get_clean_file_path
from file_helper import read_large_json_file,write_large_file,merge_data,delete_frile
from collections import defaultdict

# Chunk files awaiting merge are named "<YYYYMMDD>-<NNNN>.json",
# e.g. "20240721-0170.json".
date_pattern = re.compile(r'\d{8}-\d{4}\.json')

def get_all_files(path):
    """Return the sorted names of the chunk files found directly in *path*.

    A name qualifies only when the *entire* filename matches
    ``date_pattern``.  A plain ``search()`` would also accept names such as
    ``x20240721-0003.json`` or ``20240721-0004.json.tmp``; callers take the
    first eight characters of each name as the date, so such names would be
    grouped under a bogus date or merged and deleted by mistake.

    :param path: directory to scan (not recursive).
    :return: list of bare filenames (no directory part), sorted so the
             result does not depend on the order ``os.listdir`` yields.
    :raises OSError: propagated from ``os.listdir`` if *path* is unreadable.
    """
    matched = []
    for filename in os.listdir(path):
        hit = date_pattern.match(filename)
        # match() anchors the start; comparing the matched text with the
        # whole name anchors the end without needing re.fullmatch.
        if hit is not None and hit.group(0) == filename:
            matched.append(filename)
    matched.sort()
    return matched

# Group the files awaiting merge by their date prefix.
def get_file_merge_array(filenames):
    """Bucket *filenames* by their first eight characters.

    :param filenames: iterable of filename strings; the leading eight
                      characters of each are used as the grouping key.
    :return: plain ``dict`` mapping each key to the list of filenames that
             share it, in the order they were encountered.
    """
    grouped = {}
    for name in filenames:
        grouped.setdefault(name[:8], []).append(name)
    return grouped

# Merge all files.
def merge_all_files(file_dict,base_path):
    """Fold each date's chunk files into that date's single JSON file.

    For every ``date_str -> files`` entry the target file is
    ``<base_path>/<YYYY>-<MM>-<DD>.json``.  If the target already exists it
    is read first, then every chunk is read, everything is passed to
    ``merge_data``, the result is serialised with ``json.dumps`` and written
    via ``write_large_file``, and finally each chunk is removed with
    ``delete_frile``.

    :param file_dict: mapping of an 8-character date string (e.g.
                      ``"20240721"``) to a list of chunk filenames.
    :param base_path: directory holding both the chunks and the target files.
    :return: None.
    """
    for date_str, files in file_dict.items():
        # e.g. "20240721" -> "2024-07-21.json"
        year, month, day = date_str[:4], date_str[4:6], date_str[6:]
        merged_path = os.path.join(base_path, "{}-{}-{}.json".format(year, month, day))
        if len(files) == 0:
            continue

        sources = []
        # Start from the existing daily file so earlier merges are retained.
        if os.path.exists(merged_path):
            sources.append(read_large_json_file(merged_path))

        # e.g. "20240721-0170.json"
        chunk_paths = [os.path.join(base_path, name) for name in files]
        for chunk_path in chunk_paths:
            logger_cron.info("FILE_MERGE: 准备读取文件"+chunk_path)
            sources.append(read_large_json_file(chunk_path))

        logger_cron.info("FILE_MERGE: 准备合并文件")
        merged = merge_data(sources)
        logger_cron.info("FILE_MERGE: 准备写入合并的文件")
        write_large_file(merged_path, json.dumps(merged))
        logger_cron.info("FILE_MERGE: 写入合并文件完成")

        # Chunks are removed only after the merged file has been written.
        for chunk_path in chunk_paths:
            logger_cron.info("FILE_MERGE: 准备删除 "+chunk_path)
            delete_frile(chunk_path)
            logger_cron.info("FILE_MERGE: 完成删除 "+chunk_path)
def entry():
    """Run one merge pass over the clean-file directory.

    Looks up the directory via ``get_clean_file_path``, collects the chunk
    files in it, groups them by date prefix, and hands the groups to
    ``merge_all_files``.
    """
    # Directory that holds the cleaned files.
    clean_dir = get_clean_file_path()
    # Find the pending chunk files and group them by date.
    grouped = get_file_merge_array(get_all_files(clean_dir))
    # Merge every group.
    logger_cron.info("FILE_MERGE: 准备执行文件合并")
    merge_all_files(grouped, clean_dir)
'代码提交' 4 months ago			`#!/usr/bin/python`
			`#encoding=utf-8`
			`# author: tangwy`
			`import re,os,json`
			`import codecs`
			`from db2json import DBUtils`
			`from datetime import datetime, timedelta`
			`from ext_logging import logger_cron,get_clean_file_path`
			`from file_helper import read_large_json_file,write_large_file,merge_data,delete_frile`
			`from collections import defaultdict`

			`date_pattern = re.compile(r'\d{8}-\d{4}\.json')`

			`def get_all_files(path):`
			`# 列出所有包含匹配模式的文件名`
			`files = []`
			`for filename in os.listdir(path):`
			`if date_pattern.search(filename):`
			`files.append(filename)`
			`return files`

			`#对待合并文件进行分组`
			`def get_file_merge_array(filenames):`
			`# 创建一个defaultdict来存储分组的文件`
			`file_dict = defaultdict(list)`
			`for filename in filenames:`
			`date = filename[:8]`
			`file_dict[date].append(filename)`

			`file_dict = dict(file_dict)`
			`return file_dict`

			`#合并所以文件`
			`def merge_all_files(file_dict,base_path):`
			`# 遍历字典中的每一个键值对`
			`for date_str, files in file_dict.items():`
			`#20240721`
			`root_file_path = "{}-{}-{}.json".format(date_str[:4], date_str[4:6], date_str[6:])`
			`full_root_file_path = os.path.join(base_path,root_file_path)`
			`if len(files)>0:`
			`file_objs=[]`
			`if os.path.exists(full_root_file_path):`
			`root_data = read_large_json_file(full_root_file_path)`
			`file_objs.append(root_data)`

			`file_full_path = []`
			`for filename in files:`
			`#20240721-0170.json`
			`full_path = os.path.join(base_path,filename)`
			`file_full_path.append(full_path)`
			`logger_cron.info("FILE_MERGE: 准备读取文件"+full_path)`
			`tmp_data =read_large_json_file(full_path)`
			`file_objs.append(tmp_data)`

			`logger_cron.info("FILE_MERGE: 准备合并文件")`
			`data = merge_data(file_objs)`
			`logger_cron.info("FILE_MERGE: 准备写入合并的文件")`
			`write_large_file(full_root_file_path,json.dumps(data))`
			`logger_cron.info("FILE_MERGE: 写入合并文件完成")`
			`#准备删除合并文件`
			`for del_file in file_full_path:`
			`logger_cron.info("FILE_MERGE: 准备删除 "+del_file)`
			`delete_frile(del_file)`
			`logger_cron.info("FILE_MERGE: 完成删除 "+del_file)`
			`def entry():`
			`#清洗目录`
			`base_path = get_clean_file_path()`
			`#匹配待清洗的文件`
			`files = get_all_files(base_path)`
			`#对待清洗的文件进行分组`
			`file_dict =get_file_merge_array(files)`
			`#合并所有文件`
			`logger_cron.info("FILE_MERGE: 准备执行文件合并")`
			`merge_all_files(file_dict,base_path)`