
Python: improving loop efficiency


I am trying to convert 12,000 JSON files (containing event web data) into a single DataFrame. The code takes far too long to run. Any ideas on how to make it more efficient?

Example of a loaded JSON file:

{'$schema': 12,                       
 'amplitude_id': None,                
 'app': '',                           
 'city': ' ',                         
 'device_carrier': None,              
 'dma': ' ',                          
 'event_time': '2018-03-12 22:00:01.646000',                                
 'group_properties': {'[Segment] Group': {'': {}}},                         
 'ip_address': ' ',                   
 'os_version': None,                  
 'paying': None,                      
 'platform': 'analytics-ruby',        
 'processed_time': '2018-03-12 22:00:06.004940',                            
 'server_received_time': '2018-03-12 22:00:02.993000',                      
 'user_creation_time': '2018-01-12 18:57:20.212000',                        
 'user_id': ' ',                      
 'user_properties': {'initial_referrer': '',                                
  'last_name': '',                    
  'organization_id': 2},              
 'uuid': ' ',                         
 'version_name': None}                
Thanks

import os
import pandas as pd

data = pd.DataFrame()

for filename in os.listdir('path'):
    # os.listdir returns bare filenames, so join them back onto the directory;
    # the with block also makes sure each file handle is closed again
    with open(os.path.join('path', filename), 'r') as file:
        file_read1 = pd.read_json(file, lines=True)
    data = data.append(file_read1, ignore_index=True)

The fastest way to convert the JSON strings into a DataFrame appears to be pd.io.json.json_normalize (pd.json_normalize in pandas >= 1.0). Depending on the number of JSONs, it is 15 to more than 500 times faster than appending to an existing DataFrame, and 13 to 170 times faster than pd.concat.

A side effect is that the nested parts of the JSON (group_properties, user_properties) are flattened as well, and the dtypes then need to be set manually.
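
Applied directly to the original problem, a minimal sketch might look like the following (assuming, as in the question, that the files sit in a directory called path, and that each file holds a single JSON object):

import os
import json
import pandas as pd

# Load every file into a list of dicts first (one JSON object per file
# assumed), then flatten everything in a single json_normalize call.
records = []
for filename in os.listdir('path'):
    with open(os.path.join('path', filename), 'r') as f:
        records.append(json.load(f))

# Nested keys such as user_properties become dotted column names,
# e.g. 'user_properties.organization_id'.
df = pd.json_normalize(records)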

Runtimes for 12,000 JSONs (excluding disk I/O):

  • append: ~177 s
  • concat: ~126 s
  • json_normalize: ~0.7 s

Full code:

import pandas as pd
import json
import time
import matplotlib.pyplot as plt  # used for the plots at the end

# Note: the benchmark relies on DataFrame.append, which was removed in
# pandas 2.0, so it needs pandas < 2.0 to run as written.

j = {'$schema': 12,                       
 'amplitude_id': None,                
 'app': '',                           
 'city': ' ',                         
 'device_carrier': None,              
 'dma': ' ',                          
 'event_time': '2018-03-12 22:00:01.646000',                                
 'group_properties': {'[Segment] Group': {'': {}}},                         
 'ip_address': ' ',                   
 'os_version': None,                  
 'paying': None,                      
 'platform': 'analytics-ruby',        
 'processed_time': '2018-03-12 22:00:06.004940',                            
 'server_received_time': '2018-03-12 22:00:02.993000',                      
 'user_creation_time': '2018-01-12 18:57:20.212000',                        
 'user_id': ' ',                      
 'user_properties': {'initial_referrer': '',                                
  'last_name': '',                    
  'organization_id': 2},              
 'uuid': ' ',                         
 'version_name': None}

json_str = json.dumps(j)

# Baseline: append each parsed line to a growing DataFrame
# (copies the whole frame on every iteration).
def df_append():
    t0 = time.time()
    df = pd.DataFrame()
    for _ in range(n_lines):
        file_read1 = pd.read_json(json_str, lines=True)
        df = df.append(file_read1, ignore_index=True)
    return df, time.time() - t0

# Collect per-line DataFrames in a list and concatenate once at the end.
def df_concat():
    t0 = time.time()
    data = []
    for _ in range(n_lines):
        file_read1 = pd.read_json(json_str, lines=True)
        data.append(file_read1)

    df = pd.concat(data)
    df.index = list(range(len(df)))
    return df, time.time() - t0

# Parse the raw strings with json.loads, flatten them in a single
# json_normalize call, then restore the dtypes from a reference frame.
def df_io_json():
    df_ref = pd.read_json(json_str, lines=True)
    t0 = time.time()
    data = []
    for _ in range(n_lines):
        data.append(json_str)

    df = pd.io.json.json_normalize(pd.DataFrame(data)[0].apply(json.loads))
    for col, dtype in df_ref.dtypes.to_dict().items():
        if col not in df.columns:
            continue
        # astype returns a new Series; it has no inplace argument
        df[col] = df[col].astype(dtype)
    return df, time.time() - t0


# Run every approach three times per input size and average the timings.
n_datapoints = (10, 10**2, 10**3, 12000, 10**4, 10**5)
times = {}
for n_lines in n_datapoints:
    times[n_lines] = [[], [], []]
    for _ in range(3):
        df1, t1 = df_append()
        df2, t2 = df_concat()
        df3, t3 = df_io_json()
        times[n_lines][0].append(t1)
        times[n_lines][1].append(t2)
        times[n_lines][2].append(t3)
        pd.testing.assert_frame_equal(df1, df2)
        pd.testing.assert_frame_equal(df1[df1.columns[0:7]], df3[df3.columns[0:7]])
        pd.testing.assert_frame_equal(df2[df2.columns[8:16]], df3[df3.columns[7:15]])
        pd.testing.assert_frame_equal(df2[df2.columns[17:]], df3[df3.columns[18:]])
    for i in range(3):
        times[n_lines][i] = sum(times[n_lines][i]) / 3
times  

x = n_datapoints

fig = plt.figure()

plt.plot(x, [t[0] for t in times.values()], 'o-', label='append')
plt.plot(x, [t[1] for t in times.values()], 'o-', label='concat')
plt.plot(x, [t[2] for t in times.values()], 'o-', label='json_normalize')

plt.xlabel('number of JSONs', fontsize=16)
plt.ylabel('time in seconds', fontsize=18)
plt.yscale('log')

plt.legend()
plt.show()


Can you give an example of what the data in the files and the JSON format look like? Building one large JSON and then loading it into a DataFrame might be faster. Your approach can also run into memory problems, because it creates a new DataFrame object 12,000 times. Please edit this into the original question; comments are not suited to showing data or formatting. Yes, do not .append to a DataFrame in a loop, that is very inefficient. Build a list of DataFrames and then call pd.concat on the resulting list, as in the sketch below.
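
For reference, a minimal sketch of that suggestion, under the same assumption that the files live in a directory called path:

import os
import pandas as pd

# Build a plain Python list of per-file DataFrames ...
frames = []
for filename in os.listdir('path'):
    with open(os.path.join('path', filename), 'r') as f:
        frames.append(pd.read_json(f, lines=True))

# ... and concatenate once, so the data is copied a single time
# instead of on every iteration of the loop.
data = pd.concat(frames, ignore_index=True)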