#!/usr/bin/python
# encoding=utf-8
# author: tangwy
import re
import os
import json
import codecs
from collections import defaultdict
from datetime import datetime, timedelta

from db2json import DBUtils
from ext_logging import logger_cron, get_clean_file_path
from file_helper import read_large_json_file, write_large_file, merge_data, delete_frile

# Matches shard files named like 20240721-0170.json
date_pattern = re.compile(r'\d{8}-\d{4}\.json')


def get_all_files(path):
    # List every file name in the directory that matches the date pattern.
    files = []
    for filename in os.listdir(path):
        if date_pattern.search(filename):
            files.append(filename)
    return files


def get_file_merge_array(filenames):
    # Group the files to be merged by their 8-digit date prefix,
    # e.g. {"20240721": ["20240721-0170.json", ...]}.
    file_dict = defaultdict(list)
    for filename in filenames:
        date = filename[:8]
        file_dict[date].append(filename)
    return dict(file_dict)


def merge_all_files(file_dict, base_path):
    # Merge all files, one date group at a time.
    for date_str, files in file_dict.items():
        # 20240721 -> 2024-07-21.json
        root_file_path = "{}-{}-{}.json".format(date_str[:4], date_str[4:6], date_str[6:])
        full_root_file_path = os.path.join(base_path, root_file_path)
        if len(files) > 0:
            file_objs = []
            # If a merged file for this date already exists, fold its data in too.
            if os.path.exists(full_root_file_path):
                root_data = read_large_json_file(full_root_file_path)
                file_objs.append(root_data)

            file_full_path = []
            for filename in files:
                # e.g. 20240721-0170.json
                full_path = os.path.join(base_path, filename)
                file_full_path.append(full_path)
                logger_cron.info("FILE_MERGE: about to read file " + full_path)
                tmp_data = read_large_json_file(full_path)
                file_objs.append(tmp_data)

            logger_cron.info("FILE_MERGE: about to merge files")
            data = merge_data(file_objs)
            logger_cron.info("FILE_MERGE: about to write merged file")
            write_large_file(full_root_file_path, json.dumps(data))
            logger_cron.info("FILE_MERGE: finished writing merged file")

            # Delete the shard files that have been merged.
            for del_file in file_full_path:
                logger_cron.info("FILE_MERGE: about to delete " + del_file)
                delete_frile(del_file)
                logger_cron.info("FILE_MERGE: finished deleting " + del_file)


def entry():
    # Directory holding the files to be cleaned.
    base_path = get_clean_file_path()
    # Find the files waiting to be cleaned.
    files = get_all_files(base_path)
    # Group them by date.
    file_dict = get_file_merge_array(files)
    # Merge all the groups.
    logger_cron.info("FILE_MERGE: about to run the file merge")
    merge_all_files(file_dict, base_path)
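

# --- Sketch: entry-point guard (an assumption, not in the original) ---
# The original script defines entry() but never calls it, so running the file
# directly does nothing. If this script is meant to be launched straight from
# cron, a __main__ guard like the one below would be needed; if entry() is
# instead invoked by an external scheduler module, this can be omitted.
if __name__ == "__main__":
    entry()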