Python 根据数据帧中的多个列和行的某些条件构造新列

Python 根据数据帧中的多个列和行的某些条件构造新列,python,pandas,numpy,dataframe,Python,Pandas,Numpy,Dataframe,我有一个如下所示的数据帧: leid run_seq cp_id products currency amount 101 1 201 A YEN 345 102 2 201 B INR 223 101 2 202 A USD

我有一个如下所示的数据帧:

leid     run_seq     cp_id     products    currency     amount
101           1       201        A           YEN         345
102           2       201        B           INR         223
101           2       202        A           USD         845
102           3       201        C           USD         345
102           3       203        A           INR         747
现在我想创建另一个数据帧,或者重写现有的数据帧,其中包含列current和history以及现有列,如下所示:

leid     run_seq     current                                     History
101           1       {201:{A:{YEN:345}}}                          {}
102           2       {201:{B:{INR:223}}}                          {}
101           2       {202:{A:{USD:845}}}                          {201:{A:{YEN:345}}}
102           3       {201:{C:{USD:345}},203:{A:{INR:747}}}        {201:{B:{INR:223}}}
给出上下文并解释问题:run_seq可以被视为date,如果run seq=1,则为第一天,因此leid=101可能没有历史记录,因此字典为空。 当前条目是指该特定运行序列上的条目

例如:如果leid 101在run_seq=1上执行了两笔交易,并且这两笔交易在同一run_seq上对应同一leid的两个不同cp_id,那么current将是{201:{A:{YEN:345}},202:{B:{USD:INR}}}。如果同一leid和run_seq下两笔交易的cp_id相同,但购买的产品不同,则为{201:{A:{YEN:345},B:{USD:828}}};如果cp_id、run_seq和产品都相同,但货币不同,则为{201:{A:{YEN:345,USD:734}}};如果同一leid下cp_id、产品、货币和run_seq都相同,则将amount相加,即{201:{A:{YEN:345,YEN:734}}}的结果将是{201:{A:{YEN:1079}}}

在给定的run_seq上,特定leid的history是该leid在所有先前run_seq上的current字典合并后的结果。例如:如果run_seq=5,则history是该leid在run_seq=1,2,3,4上所有嵌套dict的合并

请注意,输出中的特定运行顺序上应该只有一个唯一的leid


我已经尝试了所有的方法,但是没有一个完整的代码。更重要的是,我想不出从哪里开始?

我利用了Pandas的应用功能和定制的Pandas的groupby功能

自定义的Pandas groupby函数:

我还稍微修改了您的输入,以显示一些可能的结果

代码如下所示

# Sample table adapted from the question; it is written to disk and parsed
# back with pandas to obtain the working DataFrame.

table = """
leid     run_seq     cp_id     products    currency     amount
101           1       201        A           YEN         345
102           1       201        A           IDR         900
102           2       201        B           INR         223
101           2       202        A           USD         845
102           3       201        C           USD         345
"""

import pandas as pd
import numpy as np

with open("stackoverflow.csv", "w") as f:
    f.write(table)

# sep=r"\s+" splits on runs of whitespace. It replaces the deprecated
# delim_whitespace=True flag, which newer pandas releases warn about.
df = pd.read_csv("stackoverflow.csv", sep=r"\s+")
df = df.sort_values(by=["leid", "run_seq"]).reset_index(drop=True)

# Row-wise apply (axis=1): wrap each row's values into a nested
# {cp_id: {product: {currency: amount}}} dictionary.
df["current"] = df.apply(
    lambda row: {row["cp_id"]: {row["products"]: {row["currency"]: row["amount"]}}},
    axis=1,
)


# Shallow merge of two dictionaries into a brand-new dictionary.
def Merge(dict1, dict2):
    """Return a new dict with dict1's items overlaid by dict2's items.

    Neither input is modified. The merge is shallow: when a key exists in
    both inputs, the value from dict2 replaces the one from dict1 outright.
    """
    merged = dict(dict1)
    merged.update(dict2)
    return merged

# Helper: fold one nested {cp_id: {product: {currency: amount}}} dict into another.
def _add_nested_amounts(total, addition):
    """Add every amount found in *addition* into *total*, in place.

    Missing cp_id / product / currency levels are created in *total*;
    amounts for an existing (cp_id, product, currency) path are summed.
    *addition* itself is never modified and none of its inner dicts are
    shared with *total*.
    """
    for cp_id, products in addition.items():
        total_products = total.setdefault(cp_id, {})
        for product, currencies in products.items():
            total_currencies = total_products.setdefault(product, {})
            for currency, amount in currencies.items():
                total_currencies[currency] = total_currencies.get(currency, 0) + amount


# Cumulative "history" of the nested "current" dictionaries of one group.
def cumsumdict(data):
    """Attach a ``history`` column holding the merged dicts of all earlier rows.

    Parameters:
        data: DataFrame with a ``current`` column whose values are nested
            ``{cp_id: {product: {currency: amount}}}`` dictionaries. Rows
            are processed in their existing order.

    Returns:
        The same DataFrame, with a new ``history`` column. The first row
        gets ``{}``; every other row gets the merge of the ``current``
        dictionaries of all rows before it, with amounts summed when the
        cp_id, product and currency all match.
    """
    running = {}
    history = []
    for current in data["current"]:
        # Store an independent snapshot per row. Appending the running dict
        # itself would make every row point at the same (final) object.
        history.append(
            {
                cp_id: {product: dict(currencies) for product, currencies in products.items()}
                for cp_id, products in running.items()
            }
        )
        _add_nested_amounts(running, current)

    data["history"] = history

    return data

# Compute the history separately for every leid and stitch the pieces back
# together. Each group is handed over as an explicit copy and concatenated,
# instead of going through groupby(...).apply(...): apply's treatment of the
# group keys (extra index level, presence of the "leid" column) differs
# between pandas versions, and cumsumdict adds a column to the frame it gets.
df = pd.concat([cumsumdict(group.copy()) for _, group in df.groupby("leid")])
df = df[["leid", "run_seq", "current", "history"]]
print(df)
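原回答给出的输出如下(注意:其中 'INR': 446 是 223 被重复累加的结果,最后两行的history相同是因为它们引用了同一个字典对象;按照问题的要求,这两行应分别为 {201: {'A': {'IDR': 900}}} 和 {201: {'A': {'IDR': 900}, 'B': {'INR': 223}}}):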

  leid  run_seq                     current  \
0   101        1  {201: {'A': {'YEN': 345}}}   
3   101        2  {202: {'A': {'USD': 845}}}   
1   102        1  {201: {'A': {'IDR': 900}}}   
2   102        2  {201: {'B': {'INR': 223}}}   
4   102        3  {201: {'C': {'USD': 345}}}   

                                         history  
0                                             {}  
3                     {201: {'A': {'YEN': 345}}}  
1                                             {}  
2  {201: {'A': {'IDR': 900}, 'B': {'INR': 446}}}  
4  {201: {'A': {'IDR': 900}, 'B': {'INR': 446}}}  

这是我的解决方案,但是“历史”包含了dict列表,而不仅仅是dict

import pandas as pd, numpy as np
import io

# Sample transactions used by this answer. Every data row carries one more
# field than the header line, so read_csv uses the leading number as the
# row index.
text="""leid  run_seq  cp_id products currency  amount
0      101.0      1.0  201.0        A      YEN   345.0
1      102.0      2.0  201.0        B      INR   223.0
2      101.0      2.0  202.0        A      USD   845.0
3      102.0      3.0  201.0        C      USD   345.0
4      101.0      1.0  201.0        A      YEN   100.0
5      101.0      1.0  203.0        B     EURO   200.0
6      101.0      1.0  203.0        C      AUD   300.0"""

# Parse the whitespace-separated table, then order the rows by leid and run_seq.
df = pd.read_csv(io.StringIO(text), engine="python", sep=r"\s+")
df = df.sort_values(by=["leid", "run_seq"])

# One group per (leid, run_seq) pair; sort=False asks groupby not to sort
# the group keys.
G = df.groupby(by=["leid", "run_seq"], sort=False)

def mkdict(grp):
    """Fold the rows of *grp* into a {cp_id: {product: {currency: amount}}} dict.

    *grp* is iterated with ``itertuples()`` and must expose the columns
    ``cp_id``, ``products``, ``currency`` and ``amount``. Amounts of rows
    that share the same cp_id, product and currency are added together,
    e.g. two YEN rows of 345 and 100 give ``{201: {"A": {"YEN": 445}}}``.
    """
    nested = {}
    for row in grp.itertuples():
        # Walk down (creating on demand) to the currency level for this row.
        by_currency = nested.setdefault(row.cp_id, {}).setdefault(row.products, {})
        running_total = by_currency.get(row.currency, 0)
        by_currency[row.currency] = running_total + row.amount

    return nested

# Collapse every (leid, run_seq) group into its nested "current" dictionary.
rslt = G.apply(mkdict)
rslt = rslt.reset_index()
rslt = rslt.rename(columns={0: "current"})

# For each leid: the first run gets an empty history, every later run gets
# the slice of all earlier "current" dictionaries (a sequence of dicts, not
# a single merged dict).
L = []
G1 = rslt.groupby("leid")
for key, grp in G1:
    lv = grp["current"].values
    L.append([])
    L.extend(lv[:stop] for stop in range(1, len(lv)))

rslt["history"] = L
编辑:下一次尝试


是否可以把“例如”那一段中的更多示例添加到数据帧中?它给出以下错误:KeyError: ('amount', 'occurred at index 0')。@AkashDubey 在pandas的apply中,您需要添加axis=1,即 df.apply(lambda x: ... x["amount"] ..., axis=1)。是的,我只是复制粘贴了你的代码,它抛出相同的错误。对不起,是我的错,现在可以运行了,是列名不同。非常感谢。不管怎样,这些是列表还是字典,这对我来说是最重要的。而且,代码没有按预期的方式工作。特定(leid, run_seq)对的history,是该leid在所有先前run_seq上的current字典合并后的字典。因此,对于leid=102、run_seq=4,history将是run_seq=1的current + run_seq=2的current + run_seq=3的current。leid=101的current列工作正常,但history列都搞乱了。@AkashDubey 请查看上面编辑过的代码。第一步不再直接创建字典,而是先创建元组列表,然后再把这些元组转换为dict。这解决了列表转dict的部分。但history部分仍然不正确,history完全是一团糟。
import pandas as pd, numpy as np
import io

# Sample transactions used by this attempt. Every data row carries one more
# field than the header line, so read_csv uses the leading number as the
# row index.
text="""leid  run_seq  cp_id products currency  amount
0      101.0      1.0  201.0        A      YEN   345.0
1      102.0      2.0  201.0        B      INR   223.0
2      101.0      2.0  202.0        A      USD   845.0
3      102.0      3.0  201.0        C      USD   345.0
4      101.0      1.0  201.0        A      YEN   100.0
5      101.0      1.0  203.0        B      EUR   200.0
6      101.0      1.0  203.0        C      AUD   300.0
7      101.0      3.0  204.0        D      INR   400.0
8      101.0      2.0  203.0        B      EUR   155.0
"""

# Parse the whitespace-separated table, then order the rows by leid and run_seq.
df = pd.read_csv(io.StringIO(text), engine="python", sep=r"\s+")
df = df.sort_values(by=["leid", "run_seq"])

# One group per (leid, run_seq) pair; sort=False asks groupby not to sort
# the group keys.
G = df.groupby(by=["leid", "run_seq"], sort=False)

# Flatten one group of rows into a list of plain tuples.
def mklist(grp):
    """Return ``[(cp_id, products, currency, amount), ...]`` for the rows of *grp*.

    *grp* is iterated with ``itertuples()``; one tuple is produced per row,
    in row order.
    """
    rows = []
    for record in grp.itertuples():
        rows.append((record.cp_id, record.products, record.currency, record.amount))
    return rows

# Build the nested dictionary from a list of (cp_id, product, currency, amount) tuples.
def mkdict(lt):
    """Fold *lt* into a {cp_id: {product: {currency: amount}}} dictionary.

    Each element of *lt* is unpacked as ``(cp_id, product, currency, amount)``.
    Amounts of tuples that share the same cp_id, product and currency are
    added together. An empty *lt* gives ``{}``.
    """
    nested = {}
    for cpid, prod, curr, amnt in lt:
        if cpid not in nested:
            nested[cpid] = {}
        by_product = nested[cpid]

        if prod not in by_product:
            by_product[prod] = {}
        by_currency = by_product[prod]

        # Start from 0 the first time this currency is seen.
        so_far = by_currency[curr] if curr in by_currency else 0
        by_currency[curr] = so_far + amnt

    return nested

# Turn every (leid, run_seq) group into its list of transaction tuples.
rslt = G.apply(mklist)
rslt = rslt.reset_index().rename(columns={0: "current"})

# For each leid: the first run gets an empty tuple list, every later run
# gets the concatenation of the tuple lists of all earlier runs.
L = []
G1 = rslt.groupby("leid")
for key, grp in G1:
    lv = grp["current"].values
    L.append([])
    for stop in range(1, len(lv)):
        earlier = []
        for tuples in lv[:stop]:
            earlier.extend(tuples)
        L.append(earlier)

# Only now convert the tuple lists into nested dictionaries; "current" is
# converted last because the history above is built from its tuple form.
rslt["history"] = list(map(mkdict, L))
rslt["current"] = list(map(mkdict, rslt.current.values))