Python 添加到已保存CSV的其他列
我有以下从csv生成功能的代码Python 添加到已保存CSV的其他列,python,pandas,Python,Pandas,我有以下从csv生成功能的代码 def gen_features_per_id(file_name, label): df = pd.read_csv(file_name, delimiter=',') df['dt'] = pd.to_datetime(df['datetime'], unit='s') row = [] column_names = ['group_timestamp', 'label', 'x
def gen_features_per_id(file_name, label):
df = pd.read_csv(file_name, delimiter=',')
df['dt'] = pd.to_datetime(df['datetime'], unit='s')
row = []
column_names = ['group_timestamp', 'label',
'x_mean', 'x_median', 'x_stdev', 'x_raw_min', 'x_raw_max', 'x_abs_min', 'x_abs_max',
'y_mean', 'y_median', 'y_stdev', 'y_raw_min', 'y_raw_max', 'y_abs_min', 'y_abs_max',
'z_mean', 'z_median', 'z_stdev', 'z_raw_min', 'z_raw_max', 'z_abs_min', 'z_abs_max' ]
group_df = pd.DataFrame(columns=column_names)
for group_name, g in df.groupby(pd.Grouper(freq='10s', key='dt')):
print(f'Start time {group_name} has {len(g)} records within 10 secs')
group_timestamp = group_name
label = label
x = g['x'].head(50)
x_mean = x.mean()
x_median = x.median()
x_std_dev = statistics.stdev(x)
x_raw_min = min(x)
x_raw_max = max(x)
x_abs_min = min(abs(x))
x_abs_max = max(abs(x))
# print(
# f'Mean : {x_mean}, Median : {x_median}, Stdev : {x_std_dev}, '
# f'X raw Min : {x_raw_min}, X raw Max : {x_raw_max}, '
# f'X abs Min : {x_abs_min}, X abs Max : {x_abs_max}'
# )
y = g['y'].head(50)
y_mean = y.mean()
y_median = y.median()
y_std_dev = statistics.stdev(y)
y_raw_min = min(y)
y_raw_max = max(y)
y_abs_min = min(abs(y))
y_abs_max = max(abs(y))
# print(
# f'Mean : {y_mean}, Median : {y_median}, Std dev : {y_std_dev}, '
# f'X raw Min : {y_raw_min}, X raw Max : {y_raw_max}, '
# f'X abs Min : {y_abs_min}, X abs Max : {y_abs_max}'
# )
z = g['z'].head(50)
z_mean = z.mean()
z_median = z.median()
z_std_dev = statistics.stdev(z)
z_raw_min = min(z)
z_raw_max = max(z)
z_abs_min = min(abs(z))
z_abs_max = max(abs(z))
# print(
# f'Mean : {z_mean}, Median : {z_median}, Std dev : {z_std_dev}, '
# f'X raw Min : {z_raw_min}, X raw Max : {z_raw_max}, '
# f'X abs Min : {z_abs_min}, X abs Max : {z_abs_max}'
# )
row.append(group_timestamp)
row.append(label)
row.append(x_mean)
row.append(x_median)
row.append(x_std_dev)
row.append(x_raw_min)
row.append(x_raw_max)
row.append(x_abs_min)
row.append(x_abs_max)
row.append(y_mean)
row.append(y_median)
row.append(y_std_dev)
row.append(y_raw_min)
row.append(y_raw_max)
row.append(y_abs_min)
row.append(y_abs_max)
row.append(z_mean)
row.append(z_median)
row.append(z_std_dev)
row.append(z_raw_min)
row.append(z_raw_max)
row.append(z_abs_min)
row.append(z_abs_max)
group_df = group_df.append([row], ignore_index=True)
group_df.to_csv("some.csv", index=False)
row = []
但保存的csv文件在csv头的开头添加了其他列,这些列等于提供的列数
示例CSV
datetime,x,y,z,label
1493740845,0.0004,-0.0001,0.0045,bad
1493740846,0.0003,0.0002,0.0047,bad
1493740847,0.0005,0.0001,0.0049,bad
1493740848,0.0006,0.0004,0.005,bad
1493740849,0.0006,-0.0003,0.005,bad
1493740851,0.0001,-0.0003,0.0039,bad
1493740852,-0.0006,0.0003,0.0046,bad
B1493740853,0.0001,0.0,0.0048,bad
输出:
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,group_timestamp,label,x_abs_max,x_abs_min,x_mean,x_median,x_raw_max,x_raw_min,x_stdev,y_abs_max,y_abs_min,y_mean,y_median,y_raw_max,y_raw_min,y_stdev,z_abs_max,z_abs_min,z_mean,z_median,z_raw_max,z_raw_min,z_stdev
# data ... ,,,,,,,,,,,,,,,,,,,,,,,
# data ... ,,,,,,,,,,,,,,,,,,,,,,,
如何解决这个问题
另外:如果您可以帮助我进一步简化代码。groupby中的每个循环都有问题,需要将值附加到
行
列表,然后将值附加到行
外部循环中,以便在上一步中传递到数据帧
协同构造函数:
#added for nested lists (outside loops)
rows = []
df['dt'] = pd.to_datetime(df['datetime'], unit='s')
for group_name, g in df.groupby(pd.Grouper(freq='10s', key='dt')):
#added for row per loop
row = []
print(f'Start time {group_name} has {len(g)} records within 10 secs')
group_timestamp = group_name
label = label
x = g['x'].head(50)
x_mean = x.mean()
....
row.append(z_abs_max)
rows.append(row)
#DataFrame outside loops
group_df = pd.DataFrame(rows, columns=column_names)
print (group_df)
应通过以下方式改进您的解决方案:
您能给出一行样本数据,以便我们执行code@ShubhamPeriwal示例csv输入addedHank它起作用了,我想知道我的方法有什么问题?我假设循环中的数据帧导致了issue@user158-行变量中还追加了值,因此对于eah循环,添加了新的23个值,因此在secon din循环中为46,然后46+23…@user158-hmmm,我很想念它。列具有
1,2,3
的原因是group\u df=group\u df.append([row],ignore\u index=True)
-添加了具有0,1,2
索引的系列并创建新列。因此应更改为group\u df=group\u df.append(pd.Series(行,索引=列\u名称),ignore\u index=True)
以防止it@user158-我认为group\u timestamp
应该在DatetimeIndex
中,因此在我的解决方案之后应该工作group\u df=group\u df.reset\u index()
,并且对于标签使用group\u df.insert(0,“标签”,标签)
在解决方案之后
#custom aggregate functions
def std_dev(x):
return statistics.stdev(x)
def abs_min(x):
return x.abs().min()
def abs_max(x):
return x.abs().max()
d = ['mean','median',std_dev, 'min','max', abs_min, abs_max]
cols = ['x','y','z']
#filtered first 50 rows
df[cols] = df.groupby(pd.Grouper(freq='10s', key='dt'))[cols].head(50)
#aggregate functions
group_df = df.groupby(pd.Grouper(freq='10s', key='dt'))[cols].agg(d)
group_df.columns = group_df.columns.map('_'.join)
print (group_df)