You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
74 lines
2.7 KiB
74 lines
2.7 KiB
#!/usr/bin/python
|
|
#encoding=utf-8
|
|
# author: tangwy
|
|
import re,os,json
|
|
import codecs
|
|
from db2json import DBUtils
|
|
from datetime import datetime, timedelta
|
|
from ext_logging import logger_cron,get_clean_file_path
|
|
from file_helper import read_large_json_file,write_large_file,merge_data,delete_frile
|
|
from collections import defaultdict
|
|
|
|
date_pattern = re.compile(r'\d{8}-\d{4}\.json')
|
|
|
|
def get_all_files(path):
    """Return the names of entries in ``path`` that look like dated chunk files.

    An entry is kept when the module-level ``date_pattern`` regex is found
    anywhere in its name (``search``, not a full match).  Only bare names
    are returned (not joined with ``path``), in ``os.listdir`` order.
    """
    return [entry for entry in os.listdir(path) if date_pattern.search(entry)]
|
|
|
|
#对待合并文件进行分组
|
|
def get_file_merge_array(filenames):
    """Group file names by their first eight characters.

    For names such as ``20240721-0170.json`` the first eight characters
    are the date stamp, so every chunk of one day ends up in one list.

    Returns a plain ``dict`` mapping that eight-character prefix to the
    list of names sharing it, each list in input order.
    """
    grouped = {}
    for name in filenames:
        # The leading eight characters are the bucket key.
        grouped.setdefault(name[:8], []).append(name)
    return grouped
|
|
|
|
# Merge all files
|
|
def merge_all_files(file_dict,base_path):
    """Merge each day's chunk files into that day's file, then delete the chunks.

    ``file_dict`` maps an eight-character date key (e.g. ``20240721``) to a
    list of chunk file names located in ``base_path``.  For every key with
    at least one chunk:

    1. the daily file ``YYYY-MM-DD.json`` in ``base_path`` is read first,
       if it already exists;
    2. every chunk is read with ``read_large_json_file``;
    3. all loaded objects are passed to ``merge_data``;
    4. the result is serialised with ``json.dumps`` and handed to
       ``write_large_file`` for the daily file path;
    5. each chunk file is removed with ``delete_frile``.

    Returns ``None``.
    """
    for day, chunk_names in file_dict.items():
        # e.g. "20240721" becomes "2024-07-21.json"
        year, month, day_of_month = day[:4], day[4:6], day[6:]
        daily_path = os.path.join(
            base_path, "{}-{}-{}.json".format(year, month, day_of_month)
        )

        # Nothing to merge for this date.
        if len(chunk_names) == 0:
            continue

        loaded = []
        # An existing daily file goes first so it takes part in the merge.
        if os.path.exists(daily_path):
            loaded.append(read_large_json_file(daily_path))

        chunk_paths = []
        for chunk_name in chunk_names:
            # e.g. 20240721-0170.json
            chunk_path = os.path.join(base_path, chunk_name)
            chunk_paths.append(chunk_path)
            logger_cron.info("FILE_MERGE: 准备读取文件"+chunk_path)
            loaded.append(read_large_json_file(chunk_path))

        logger_cron.info("FILE_MERGE: 准备合并文件")
        merged = merge_data(loaded)

        logger_cron.info("FILE_MERGE: 准备写入合并的文件")
        write_large_file(daily_path, json.dumps(merged))
        logger_cron.info("FILE_MERGE: 写入合并文件完成")

        # Chunks are removed only after the merged file has been written.
        for chunk_path in chunk_paths:
            logger_cron.info("FILE_MERGE: 准备删除 "+chunk_path)
            delete_frile(chunk_path)
            logger_cron.info("FILE_MERGE: 完成删除 "+chunk_path)
|
|
def entry():
    """Run one merge pass over the clean-file directory.

    Looks up the directory via ``get_clean_file_path``, collects the chunk
    file names found there, groups them by date prefix, logs the start of
    the merge, and hands the groups to ``merge_all_files``.
    """
    # Directory holding the files to process.
    clean_dir = get_clean_file_path()
    # Find candidate chunk files and bucket them by date.
    grouped = get_file_merge_array(get_all_files(clean_dir))
    logger_cron.info("FILE_MERGE: 准备执行文件合并")
    merge_all_files(grouped, clean_dir)
|
|
|