Python 3.x：如何遍历 pandas DataFrame 以生成嵌套的 JSON？

Python 3.x 如何遍历数据帧以形成嵌套的json?,python-3.x,pandas,Python 3.x,Pandas,我有一个具有以下结构的熊猫数据帧。它可以使用以下代码创建 import pandas as pd import numpy as np word = ['this','is','a','test','call','this','is','a','test','call','this','is ','a','test','call', np.NaN] level_3_start = [np.NaN,np.NaN,'<tyre>','<steering>',np.NaN,n

我有一个具有以下结构的 pandas DataFrame，可以用下面的代码创建：

import pandas as pd
import numpy as np

# Sample data: one row per token. Each level_N_start / level_N_end column holds
# the opening / closing tag attached to that token, or NaN when there is none.
# np.nan is used rather than the np.NaN alias, which was removed in NumPy 2.0.
# Every list must have the same length (15 entries).
word = ['this', 'is', 'a', 'test', 'call', 'this', 'is', 'a', 'test', 'call', 'this', 'is ', 'a', 'test', 'call']
level_3_start = [np.nan, np.nan, '<tyre>', '<steering>', np.nan, np.nan, np.nan, np.nan, '<leg>', np.nan, '<clutch>', np.nan, np.nan, '<break>', np.nan]
level_3_end = [np.nan, np.nan, '</tyre>', np.nan, '</steering>', np.nan, np.nan, np.nan, '</leg>', np.nan, np.nan, np.nan, '</clutch>', '</break>', np.nan]
level_2_start = [np.nan, np.nan, '<car>', np.nan, np.nan, np.nan, np.nan, np.nan, '<dog>', np.nan, '<car>', np.nan, np.nan, '<bus>', np.nan]
level_2_end = [np.nan, np.nan, np.nan, np.nan, '</car>', np.nan, np.nan, np.nan, '</dog>', np.nan, np.nan, np.nan, '</car>', '</bus>', np.nan]
level_1_start = [np.nan, np.nan, '<vehicle>', np.nan, np.nan, np.nan, np.nan, np.nan, '<animal>', np.nan, '<vehicle>', np.nan, np.nan, np.nan, np.nan]
level_1_end = [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, '</vehicle>', np.nan, '</animal>', np.nan, np.nan, np.nan, np.nan, '</vehicle>', np.nan]

# Building from a dict (insertion order == column order) makes pandas raise a
# ValueError if the lists ever differ in length, instead of silently
# truncating to the shortest one the way zip() does.
df1 = pd.DataFrame({
    'word': word,
    'level_3_start': level_3_start,
    'level_3_end': level_3_end,
    'level_2_start': level_2_start,
    'level_2_end': level_2_end,
    'level_1_start': level_1_start,
    'level_1_end': level_1_end,
})


我想遍历这个 DataFrame，生成嵌套的 JSON（字典）。在 pandas 中实现这一点的最佳方法是什么？

您保存的信息比实际需要的多：各个结束列（*_end）并不需要。

  • 用 dropna() 删除不含任何标签的行
  • 向前填充（ffill）各级开始标签，并去掉字符串中的 < 和 >
  • 对 DataFrame.to_dict() 的结果使用字典推导式，生成嵌套字典

  • 要获得最终结果,您的数据必须经过三个步骤:

    步骤1:删除处理不需要的所有列

    步骤2：清理数据，去掉标签中的尖括号（< 和 >），并按级别1、级别2、级别3的顺序排序

    步骤3:创建嵌套字典

    下面是我的做法。每一部分都加了注释，以清楚说明每一步在做什么。

    import pandas as pd
    import numpy as np
    import collections
    
    # Sample data: one row per token; each level_N_start / level_N_end column
    # holds the opening / closing tag attached to that token, or NaN.
    # np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
    word = ['this', 'is', 'a', 'test', 'call', 'this', 'is', 'a', 'test', 'call', 'this', 'is ', 'a', 'test', 'call']
    level_3_start = [np.nan, np.nan, '<tyre>', '<steering>', np.nan, np.nan, np.nan, np.nan, '<leg>', np.nan, '<clutch>', np.nan, np.nan, '<break>', np.nan]
    level_3_end = [np.nan, np.nan, '</tyre>', np.nan, '</steering>', np.nan, np.nan, np.nan, '</leg>', np.nan, np.nan, np.nan, '</clutch>', '</break>', np.nan]
    level_2_start = [np.nan, np.nan, '<car>', np.nan, np.nan, np.nan, np.nan, np.nan, '<dog>', np.nan, '<car>', np.nan, np.nan, '<bus>', np.nan]
    level_2_end = [np.nan, np.nan, np.nan, np.nan, '</car>', np.nan, np.nan, np.nan, '</dog>', np.nan, np.nan, np.nan, '</car>', '</bus>', np.nan]
    level_1_start = [np.nan, np.nan, '<vehicle>', np.nan, np.nan, np.nan, np.nan, np.nan, '<animal>', np.nan, '<vehicle>', np.nan, np.nan, np.nan, np.nan]
    level_1_end = [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, '</vehicle>', np.nan, '</animal>', np.nan, np.nan, np.nan, np.nan, '</vehicle>', np.nan]

    # A dict of equal-length lists: pandas raises ValueError on a length
    # mismatch instead of silently truncating the way zip() does.
    df1 = pd.DataFrame({
        'word': word,
        'level_3_start': level_3_start,
        'level_3_end': level_3_end,
        'level_2_start': level_2_start,
        'level_2_end': level_2_end,
        'level_1_start': level_1_start,
        'level_1_end': level_1_end,
    })

    # Step 1: keep only the *_start columns; drop() returns a new frame, so
    # df1 itself is left untouched.
    df_temp = df1.drop(columns=['word', 'level_1_end', 'level_2_end', 'level_3_end'])

    # Step 2a: strip the angle brackets from every tag. regex=True is passed
    # explicitly because the default of Series.str.replace differs between
    # pandas versions.
    for col in df_temp.columns:
        df_temp[col] = df_temp[col].str.replace(r'[<>]', '', regex=True)

    # Step 2b: propagate parent tags downwards and keep one row per level-3 tag.
    # assign() is used instead of column assignment so no step writes into a
    # frame that pandas may regard as a copy.
    df_temp = (
        df_temp
        # rows without any opening tag carry no information
        .dropna(how='all')
        # a row without its own level-1 tag belongs to the last one seen
        .assign(level_1_start=lambda d: d['level_1_start'].ffill())
        # rows that only carried a level-1 tag are no longer needed
        .dropna(subset=['level_3_start', 'level_2_start'], how='all')
        # same propagation for level 2
        .assign(level_2_start=lambda d: d['level_2_start'].ffill())
        # every remaining row must describe a level-3 tag
        .dropna(subset=['level_3_start'])
        # sort so the resulting dictionary is ordered level 1 > level 2 > level 3
        .sort_values(by=['level_1_start', 'level_2_start', 'level_3_start'])
    )

    # Step 3: build the nested dictionary. setdefault() creates the level-1 and
    # level-2 dictionaries on first use, so no explicit membership checks (and
    # no defaultdict) are needed.
    df_dict = {}
    level_cols = ['level_1_start', 'level_2_start', 'level_3_start']
    for lev_1, lev_2, lev_3 in df_temp[level_cols].itertuples(index=False, name=None):
        df_dict.setdefault(lev_1, {}).setdefault(lev_2, {})[lev_3] = True

    print(df_dict)
    

    请以文本形式粘贴数据框，不要贴图片，并附上创建 DataFrame 的代码！
    # Sample data: one row per token; empty strings mean "no tag on this token".
    df = pd.DataFrame({"word":["this","is","a","test","call","this","is","a","test","call","this","is","a","test","call"],
                  "level_3_start":["","","<tyre>","<steering>","","","","","<leg>","","<clutch>","","","<break>",""],
                  "level_3_end":["","","</tyre>","","</steering>","","","","</leg>","","","","</clutch>","</break>",""],
                  "level_2_start":["","","<car>","","","","","","<dog>","","<car>","","","<bus>",""],
                  "level_2_end":["","","","","</car>","","","","</dog>","","","","</car>","</bus>",""],
                  "level_1_start":["","","<vehicle>","","","","","","<animal>","","<vehicle>","","","",""],
                  "level_1_end":["","","","","","","</vehicle>","","</animal>","","","","","</vehicle>",""]})

    # cleanup: turn empty strings into NaN and drop rows that carry no tag at all
    df = df.replace({"":np.nan}).dropna(subset=[c for c in df.columns if c!="word"], how="all")

    # Forward-fill each *_start column and strip the angle brackets.
    # The result is assigned back to the column: an in-place fillna() on
    # df[c] is a chained operation that newer pandas versions warn about and
    # that does not modify df at all under copy-on-write.
    for c in [c for c in df.columns if "start" in c]:
        df[c] = df[c].ffill().str.replace(r"[<>]", "", regex=True)

    # One pass over the distinct (level 1, level 2, level 3) combinations, in
    # order of first appearance; setdefault() creates the intermediate
    # dictionaries on first use.
    nested = {}
    paths = df[["level_1_start", "level_2_start", "level_3_start"]].drop_duplicates()
    for lev_1, lev_2, lev_3 in paths.itertuples(index=False, name=None):
        nested.setdefault(lev_1, {}).setdefault(lev_2, {})[lev_3] = True

    print(nested)
    
    
    {'vehicle': {'car': {'tyre': True, 'steering': True, 'clutch': True},
      'bus': {'break': True}},
     'animal': {'dog': {'leg': True}}}
    
    import pandas as pd
    import numpy as np
    import collections
    
    # Sample data: one row per token; each level_N_start / level_N_end column
    # holds the opening / closing tag attached to that token, or NaN.
    # np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
    word = ['this', 'is', 'a', 'test', 'call', 'this', 'is', 'a', 'test', 'call', 'this', 'is ', 'a', 'test', 'call']
    level_3_start = [np.nan, np.nan, '<tyre>', '<steering>', np.nan, np.nan, np.nan, np.nan, '<leg>', np.nan, '<clutch>', np.nan, np.nan, '<break>', np.nan]
    level_3_end = [np.nan, np.nan, '</tyre>', np.nan, '</steering>', np.nan, np.nan, np.nan, '</leg>', np.nan, np.nan, np.nan, '</clutch>', '</break>', np.nan]
    level_2_start = [np.nan, np.nan, '<car>', np.nan, np.nan, np.nan, np.nan, np.nan, '<dog>', np.nan, '<car>', np.nan, np.nan, '<bus>', np.nan]
    level_2_end = [np.nan, np.nan, np.nan, np.nan, '</car>', np.nan, np.nan, np.nan, '</dog>', np.nan, np.nan, np.nan, '</car>', '</bus>', np.nan]
    level_1_start = [np.nan, np.nan, '<vehicle>', np.nan, np.nan, np.nan, np.nan, np.nan, '<animal>', np.nan, '<vehicle>', np.nan, np.nan, np.nan, np.nan]
    level_1_end = [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, '</vehicle>', np.nan, '</animal>', np.nan, np.nan, np.nan, np.nan, '</vehicle>', np.nan]

    # A dict of equal-length lists: pandas raises ValueError on a length
    # mismatch instead of silently truncating the way zip() does.
    df1 = pd.DataFrame({
        'word': word,
        'level_3_start': level_3_start,
        'level_3_end': level_3_end,
        'level_2_start': level_2_start,
        'level_2_end': level_2_end,
        'level_1_start': level_1_start,
        'level_1_end': level_1_end,
    })

    # Step 1: keep only the *_start columns; drop() returns a new frame, so
    # df1 itself is left untouched.
    df_temp = df1.drop(columns=['word', 'level_1_end', 'level_2_end', 'level_3_end'])

    # Step 2a: strip the angle brackets from every tag. regex=True is passed
    # explicitly because the default of Series.str.replace differs between
    # pandas versions.
    for col in df_temp.columns:
        df_temp[col] = df_temp[col].str.replace(r'[<>]', '', regex=True)

    # Step 2b: propagate parent tags downwards and keep one row per level-3 tag.
    # assign() is used instead of column assignment so no step writes into a
    # frame that pandas may regard as a copy.
    df_temp = (
        df_temp
        # rows without any opening tag carry no information
        .dropna(how='all')
        # a row without its own level-1 tag belongs to the last one seen
        .assign(level_1_start=lambda d: d['level_1_start'].ffill())
        # rows that only carried a level-1 tag are no longer needed
        .dropna(subset=['level_3_start', 'level_2_start'], how='all')
        # same propagation for level 2
        .assign(level_2_start=lambda d: d['level_2_start'].ffill())
        # every remaining row must describe a level-3 tag
        .dropna(subset=['level_3_start'])
        # sort so the resulting dictionary is ordered level 1 > level 2 > level 3
        .sort_values(by=['level_1_start', 'level_2_start', 'level_3_start'])
    )

    # Step 3: build the nested dictionary. setdefault() creates the level-1 and
    # level-2 dictionaries on first use, so no explicit membership checks (and
    # no defaultdict) are needed.
    df_dict = {}
    level_cols = ['level_1_start', 'level_2_start', 'level_3_start']
    for lev_1, lev_2, lev_3 in df_temp[level_cols].itertuples(index=False, name=None):
        df_dict.setdefault(lev_1, {}).setdefault(lev_2, {})[lev_3] = True

    print(df_dict)
    
    {'animal': 
        {'dog': {'leg': True}
        }, 
     'vehicle': 
        {'bus': {'break': True}, 
         'car': {'clutch': True, 'steering': True, 'tyre': True}
        }
    }