有个导出的 JSON 数据文件,所有数据存储在同一行中,包含上亿个元素,文件大小为 GB 级别。由于文件太大,直接对其进行数据分析时遇到了一些问题;刚好每条数据中都有时间字段(createdAt),所以打算根据该时间字段按月拆分为多个小文件。
示例数据
以下为缩减修改后的示例数据
[root@imzcy json]# cat bigfile.json
[{"name":"陈八","phone":"11111111111","createdAt":{"$date":"2023-06-30T05:58:48.222Z"},"updatedAt":{"$date":"2023-06-30T05:58:48.222Z"}},{"name":"范十一","phone":"13333333333","createdAt":{"$date":"2023-07-14T22:12:15.729Z"},"updatedAt":{"$date":"2023-07-14T22:12:15.729Z"}},{"name":"赵二","phone":"12222222222","createdAt":{"$date":"2023-06-23T08:30:00.000Z"},"updatedAt":{"$date":"2023-06-23T08:30:00.000Z"}},{"name":"钱三","phone":"14444444444","createdAt":{"$date":"2023-07-02T16:45:30.500Z"},"updatedAt":{"$date":"2023-07-02T16:45:30.500Z"}},{"name":"孙四","phone":"15555555555","createdAt":{"$date":"2023-08-15T10:20:45.123Z"},"updatedAt":{"$date":"2023-08-15T10:20:45.123Z"}},{"name":"李五","phone":"16666666666","createdAt":{"$date":"2023-09-07T18:00:00.000Z"},"updatedAt":{"$date":"2023-09-07T18:00:00.000Z"}},{"name":"周六","phone":"17777777777","createdAt":{"$date":"2023-06-30T12:45:30.800Z"},"updatedAt":{"$date":"2023-06-30T12:45:30.800Z"}},{"name":"吴七","phone":"18888888888","createdAt":{"$date":"2023-08-18T09:10:20.333Z"},"updatedAt":{"$date":"2023-08-18T09:10:20.333Z"}},{"name":"郑八","phone":"19999999999","createdAt":{"$date":"2023-09-22T14:30:00.000Z"},"updatedAt":{"$date":"2023-09-22T14:30:00.000Z"}},{"name":"王九","phone":"20000000000","createdAt":{"$date":"2023-07-10T06:15:45.222Z"},"updatedAt":{"$date":"2023-07-10T06:15:45.222Z"}},{"name":"刘十","phone":"21111111111","createdAt":{"$date":"2023-10-01T00:00:00.000Z"},"updatedAt":{"$date":"2023-10-01T00:00:00.000Z"}},{"name":"张十一","phone":"22222222222","createdAt":{"$date":"2023-07-20T16:30:00.500Z"},"updatedAt":{"$date":"2023-07-20T16:30:00.500Z"}},{"name":"杨十二","phone":"23333333333","createdAt":{"$date":"2023-08-05T09:45:15.888Z"},"updatedAt":{"$date":"2023-08-05T09:45:15.888Z"}},{"name":"陈十三","phone":"24444444444","createdAt":{"$date":"2023-09-12T12:00:00.000Z"},"updatedAt":{"$date":"2023-09-12T12:00:00.000Z"}},{"name":"范十四","phone":"25555555555","createdAt":{"$date":"2023-06-30T18:20:30.100Z"},"updatedAt":{"$date":"2023-06-30T1
8:20:30.100Z"}},{"name":"赵十五","phone":"26666666666","createdAt":{"$date":"2023-08-25T15:30:45.555Z"},"updatedAt":{"$date":"2023-08-25T15:30:45.555Z"}},{"name":"钱十六","phone":"27777777777","createdAt":{"$date":"2023-09-30T08:40:00.000Z"},"updatedAt":{"$date":"2023-09-30T08:40:00.000Z"}},{"name":"孙十七","phone":"28888888888","createdAt":{"$date":"2023-07-11T11:11:11.111Z"},"updatedAt":{"$date":"2023-07-11T11:11:11.111Z"}},{"name":"李十八","phone":"29999999999","createdAt":{"$date":"2023-10-10T10:10:10.101Z"},"updatedAt":{"$date":"2023-10-10T10:10:10.101Z"}},{"name":"周十九","phone":"30000000000","createdAt":{"$date":"2023-07-28T20:20:20.202Z"},"updatedAt":{"$date":"2023-07-28T20:20:20.202Z"}},{"name":"吴二十","phone":"31111111111","createdAt":{"$date":"2023-09-15T14:30:00.000Z"},"updatedAt":{"$date":"2023-09-15T14:30:00.000Z"}}]
[root@imzcy json]#
展开 json 格式
[
{
"name": "陈八",
"phone": "11111111111",
"createdAt": {
"$date": "2023-06-30T05:58:48.222Z"
},
"updatedAt": {
"$date": "2023-06-30T05:58:48.222Z"
}
},
{
"name": "范十一",
"phone": "13333333333",
"createdAt": {
"$date": "2023-07-14T22:12:15.729Z"
},
"updatedAt": {
"$date": "2023-07-14T22:12:15.729Z"
}
}
]
python 脚本
[root@imzcy json]# cat split_file_by_month.json
#!/usr/bin/env python
'''
Author: zcy
Date: 2023-12-10
Website: https://www.imzcy.cn
'''
import json
import os
from datetime import datetime
# Path of the large single-line JSON array that will be split.
input_file = "bigfile.json"
# Directory that receives the per-month output files.
output_dir = "output/"
# Create the output directory. exist_ok=True replaces the separate
# exists() check, so there is no check-then-create race and re-runs
# are a no-op when the directory is already present.
os.makedirs(output_dir, exist_ok=True)
# Timestamp layouts accepted for createdAt.$date: with and without
# fractional seconds.
_DATE_FORMATS = ("%Y-%m-%dT%H:%M:%S.%fZ", "%Y-%m-%dT%H:%M:%SZ")


def _month_key(date_str):
    """Return the "YYYY-MM" month of a "$date" timestamp string.

    Raises:
        ValueError: if date_str matches none of _DATE_FORMATS.
    """
    for fmt in _DATE_FORMATS:
        try:
            return datetime.strptime(date_str, fmt).strftime("%Y-%m")
        except ValueError:
            continue
    raise ValueError(f"unsupported createdAt.$date value: {date_str!r}")


def split_file_by_month(input_file, output_dir):
    """Split a JSON array file into one JSON-lines file per month.

    Each element of the array is written, one object per line, to
    ``<output_dir>/smallfile_<YYYY-MM>.json``, where the month is taken
    from ``obj["createdAt"]["$date"]``.

    Args:
        input_file: Path of a UTF-8 JSON file whose top-level value is an
            array of objects carrying ``createdAt.$date``.
        output_dir: Directory for the output files; created if missing.

    Raises:
        KeyError: if an element has no ``createdAt`` / ``$date`` key.
        ValueError: if a ``$date`` value is not a supported timestamp.

    Notes:
        * Output files are opened in append mode, so running the function
          again on the same data appends duplicate lines.
        * json.load parses the whole document at once, so the machine
          needs enough memory to hold the complete input.
    """
    os.makedirs(output_dir, exist_ok=True)

    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # One open handle per month, reused for every record of that month.
    # Re-opening the file for each record costs an open/close pair per
    # element, which dominates the run time on very large inputs.
    handles = {}
    try:
        for obj in data:
            month = _month_key(obj["createdAt"]["$date"])
            output_f = handles.get(month)
            if output_f is None:
                output_file = os.path.join(output_dir, f"smallfile_{month}.json")
                output_f = open(output_file, 'a', encoding='utf-8')
                handles[month] = output_f
            json.dump(obj, output_f, ensure_ascii=False)
            output_f.write('\n')  # one object per line
    finally:
        # Always flush and close every month file, even if a record fails.
        for output_f in handles.values():
            output_f.close()
# Run the split only when this file is executed as a script, so importing
# the module to reuse split_file_by_month does not start processing.
if __name__ == "__main__":
    split_file_by_month(input_file, output_dir)
[root@imzcy json]#
执行验证效果
当前目录结构
查看当前目录下的文件,其中包含要拆分的原始数据文件 bigfile.json 和拆分脚本 split_file_by_month.json(注意:该脚本虽然以 .json 为后缀,但内容是 Python 代码,实际使用时建议命名为 split_file_by_month.py)。
[root@imzcy json]# ll
total 12
-rw-r--r-- 1 root root 4106 Dec 8 08:29 bigfile.json
-rw-r--r-- 1 root root 959 Dec 10 13:43 split_file_by_month.json
[root@imzcy json]#
执行 python 脚本
脚本执行过程中不会有任何输出,正常退出即表示拆分完成。注意:脚本以追加模式写入输出文件,重复执行前请先清空 output 目录,否则数据会重复写入。
[root@imzcy json]# python3 --version
Python 3.9.17
[root@imzcy json]#
[root@imzcy json]# python3 split_file_by_month.json
确认执行后效果
[root@imzcy json]# ll
total 12
-rw-r--r-- 1 root root 4106 Dec 8 08:29 bigfile.json
drwxr-xr-x 2 root root 156 Dec 10 13:45 output
-rw-r--r-- 1 root root 959 Dec 10 13:44 split_file_by_month.json
[root@imzcy json]#
[root@imzcy json]# ll output/
total 20
-rw-r--r-- 1 root root 591 Dec 10 13:45 smallfile_2023-06.json
-rw-r--r-- 1 root root 894 Dec 10 13:45 smallfile_2023-07.json
-rw-r--r-- 1 root root 594 Dec 10 13:45 smallfile_2023-08.json
-rw-r--r-- 1 root root 744 Dec 10 13:45 smallfile_2023-09.json
-rw-r--r-- 1 root root 297 Dec 10 13:45 smallfile_2023-10.json
[root@imzcy json]#
[root@imzcy json]# cat output/smallfile_2023-06.json
{"name": "陈八", "phone": "11111111111", "createdAt": {"$date": "2023-06-30T05:58:48.222Z"}, "updatedAt": {"$date": "2023-06-30T05:58:48.222Z"}}
{"name": "赵二", "phone": "12222222222", "createdAt": {"$date": "2023-06-23T08:30:00.000Z"}, "updatedAt": {"$date": "2023-06-23T08:30:00.000Z"}}
{"name": "周六", "phone": "17777777777", "createdAt": {"$date": "2023-06-30T12:45:30.800Z"}, "updatedAt": {"$date": "2023-06-30T12:45:30.800Z"}}
{"name": "范十四", "phone": "25555555555", "createdAt": {"$date": "2023-06-30T18:20:30.100Z"}, "updatedAt": {"$date": "2023-06-30T18:20:30.100Z"}}
[root@imzcy json]#
[root@imzcy json]# cat output/smallfile_2023-08.json
{"name": "孙四", "phone": "15555555555", "createdAt": {"$date": "2023-08-15T10:20:45.123Z"}, "updatedAt": {"$date": "2023-08-15T10:20:45.123Z"}}
{"name": "吴七", "phone": "18888888888", "createdAt": {"$date": "2023-08-18T09:10:20.333Z"}, "updatedAt": {"$date": "2023-08-18T09:10:20.333Z"}}
{"name": "杨十二", "phone": "23333333333", "createdAt": {"$date": "2023-08-05T09:45:15.888Z"}, "updatedAt": {"$date": "2023-08-05T09:45:15.888Z"}}
{"name": "赵十五", "phone": "26666666666", "createdAt": {"$date": "2023-08-25T15:30:45.555Z"}, "updatedAt": {"$date": "2023-08-25T15:30:45.555Z"}}
[root@imzcy json]#
本文采用 知识共享署名4.0 国际许可协议进行许可。
本站文章除注明转载/出处外,均为本站原创或翻译,转载前请务必署名。
如果您的问题未解决,欢迎微信扫描右侧二维码与我联系。