Is There A Way To Read Bulk `yaml` Files Into A Pandas `dataframe` More Efficiently (Faster) In Python
I would like to read several YAML files from a directory into a pandas DataFrame and concatenate them into one big DataFrame. The directory consists of 7470 files.
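For context, the straightforward single-process version looks roughly like the sketch below. This is a hedged reconstruction, not the asker's original code: the folder name `all` and the use of `yaml.safe_load` are assumptions. Both answers below start from this pattern and speed it up.
import glob

import pandas as pd
import yaml

# assumption: the 7470 files sit in a local folder called "all"
files = glob.glob('all/*.yaml')

records = []
for filename in files:
    with open(filename, 'r') as fh:
        # parse one YAML document into plain Python objects
        records.append(yaml.safe_load(fh))

# one row per file; nested fields stay as dict/list cells
df = pd.DataFrame(records)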
Solution 1:
step 1: convert each yaml file to a json file, using multiple processes
import os
import json
from datetime import datetime, timedelta
from pandas import json_normalize
import pandas as pd
import numpy as np
import yaml
import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)
# yaml file path
os.chdir(os.path.expanduser('~/Downloads/all'))
yaml_file_list = os.listdir('.')
yaml_file_list = [i for i in yaml_file_list if i.endswith('yaml')]

if not os.path.exists('output'):
    os.mkdir('output')
def yaml2json(file, cnt=1):
    file_json = f'output/{file}.json'
    if os.path.exists(file_json):
        return
    # read yaml and convert to dict
    with open(file, 'r') as fh:
        data = yaml.load(fh.read(), Loader=yaml.BaseLoader)
    # convert to json file
    data_str = json.dumps(data, ensure_ascii=False) + '\n'
    with open(file_json, 'w') as fw:
        fw.write(data_str)
    logging.info(f'[{cnt}] {file_json}')
# test the conversion on a single file first
file = yaml_file_list[0]
yaml2json(file)
# multi-process to convert every file to json
from concurrent.futures import ProcessPoolExecutor
####################
workers = 8
pool_list = yaml_file_list
pool_func = yaml2json
####################
total_count = len(pool_list)
with ProcessPoolExecutor(max_workers=workers) as executor:
    futures = [executor.submit(pool_func, param, total_count - n)
               for n, param in enumerate(pool_list)]
# 2020-12-29 14:29:19,648 - INFO - [7468] output/1163055.yaml.json
# 2020-12-29 14:32:07,597 - INFO - [6466] output/640941.yaml.json
# macbook 15' 2015
# 2.2 GHz Intel Core i7
# 16 GB 1600 MHz DDR3
# 1000 files -> 14:29:19,648 -> 14:32:07,597 -> ~3 min
# 7400 files -> ~25 min
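Most of the per-file cost is PyYAML's pure-Python parser. If your PyYAML was built against LibYAML (check `yaml.__with_libyaml__`), a possible tweak to the same step is to swap in the C-based loader. The sketch below is a hedged variant of `yaml2json` above (the name `yaml2json_fast` and the fallback logic are mine, not from the answer); it falls back to `BaseLoader` when the C loader is unavailable.
import os
import json
import logging

import yaml

# optional speed-up: use the LibYAML C parser if PyYAML was built with it
try:
    from yaml import CBaseLoader as FastLoader  # C parser, much faster
except ImportError:
    from yaml import BaseLoader as FastLoader   # pure-Python fallback

def yaml2json_fast(file, cnt=1):
    # hypothetical variant of yaml2json above, same output format
    file_json = f'output/{file}.json'
    if os.path.exists(file_json):
        return
    with open(file, 'r') as fh:
        data = yaml.load(fh.read(), Loader=FastLoader)
    with open(file_json, 'w') as fw:
        fw.write(json.dumps(data, ensure_ascii=False) + '\n')
    logging.info(f'[{cnt}] {file_json}')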
step 2: merge the json files into one file
os.chdir(os.path.expanduser('~/Downloads/all/output/'))
# merge the files with the bash command cat
# in IPython / Jupyter: !cat *.json > yaml-all-json
pycmd = lambda cmd: get_ipython().system(cmd)
cmd = 'cat *.json > yaml-all-json'
# pycmd(cmd)
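If `cat` is not available (for example on Windows), a small pure-Python merge works just as well. A sketch, assuming it is run from inside the `output/` folder and that each per-file JSON already ends with a newline, as written in step 1:
import glob

# portable alternative to `cat *.json > yaml-all-json`
with open('yaml-all-json', 'w') as fw:
    for json_file in sorted(glob.glob('*.json')):
        with open(json_file, 'r') as fh:
            fw.write(fh.read())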
step 3: read the merged json file
# read file
# 1478 lines -> 4.37s
file = 'yaml-all-json'
df = pd.read_csv(file, sep='\n', header=None)[0]
obj = df.map(json.loads)
data_list = obj.tolist()
df_data = pd.DataFrame(data_list) # or use json_normalize to parse json data
df_data
#                                                meta                                               info                                            innings
# 0  {'data_version': '0.9', 'created': '2016-12-05...  {'dates': ['2016-11-24', '2016-11-25', '2016-1...  [{'1st innings': {'team': 'South Africa', 'dec...
# 1  {'data_version': '0.9', 'created': '2016-12-21...  {'city': 'Brisbane', 'dates': ['2016-12-15', '...  [{'1st innings': {'team': 'Australia', 'delive...
# 2  {'data_version': '0.9', 'created': '2016-10-21...  {'city': 'Port Moresby', 'dates': ['2016-10-16...  [{'1st innings': {'team': 'Papua New Guinea', ...
# 3  {'data_version': '0.9', 'created': '2016-09-14...  {'city': 'Edinburgh', 'dates': ['2016-09-10'],...  [{'1st innings': {'team': 'Scotland', 'deliver...
# 4  {'data_version': '0.9', 'created': '2016-09-12...  {'city': 'Londonderry', 'dates': ['2016-09-05'...  [{'1st innings': {'team': 'Hong Kong', 'delive...
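Note: newer pandas releases reject `sep='\n'` in `read_csv`, so if that call fails on your install, reading the lines directly gives the same `data_list`. This is a plain-Python sketch, not part of the original answer:
import json
import pandas as pd

# fallback if read_csv(sep='\n') is not accepted by your pandas version
with open('yaml-all-json', 'r') as fh:
    data_list = [json.loads(line) for line in fh if line.strip()]

df_data = pd.DataFrame(data_list)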
Solution 2:
Dask is a great package if you want to avoid getting into the details of parallel computing. It's really designed for distributed computing on machines with many CPUs but I find the syntax is convenient even if you're just using it for multi-threading or multiple processes on one machine.
Here is some code that loads 100 of the yaml files into memory, first without using Dask:
import glob
import yaml
path = r'all'  # local folder
all_files = glob.glob(path + "/*.yaml")

def load_yaml_file(filename):
    with open(filename, 'r') as fh:
        d = yaml.safe_load(fh.read())
    return d

n = 100
results = []
for filename in all_files[:n]:
    d = load_yaml_file(filename)
    results.append(d)

assert len(results) == n
Then, using Dask:
import dask

n = 100
lazy_results = []
for filename in all_files[:n]:
    d = dask.delayed(load_yaml_file)(filename)
    lazy_results.append(d)

results = dask.compute(*lazy_results, scheduler='processes')
assert len(results) == n
I timed both of the above on my machine, which has a quad-core processor, and found that with Dask it took about 19 s (wall time) compared to just over 1 min without (about a 3.1x speed-up).
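Either way, `results` ends up as a sequence of parsed dictionaries, so getting back to the single big DataFrame from the question is one more step. A minimal sketch (the `json_normalize` flattening is optional, not part of the original answer):
import pandas as pd

# one row per file; nested dicts/lists stay as object-typed cells
df = pd.DataFrame(list(results))

# or flatten nested mappings into dotted column names
df_flat = pd.json_normalize(list(results))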