如何在 Python 中对 pandas 数据帧使用向量化或 apply 代替 iterrows

我有 2000 多个各含两列的数据帧。我想基于其中一列生成 n-gram,再用这些 n-gram 创建新的数据帧。下面是我的代码,它可以正常工作,只是耗时很长。我目前使用 iterrows 逐行遍历每个文件中每个数据帧的每一行。有没有更简单的方法,例如使用向量化或 apply 来实现这一点?
import logging
import os
from os import listdir
from os.path import isfile, join
import math
import pickle
import itertools
import multiprocessing
import psutil
import numpy as np
import pandas as pd
import time
def create_combinations(file):
    """Expand every element of a pickled DataFrame into all spacing variants.

    Reads ``./to_process/<file>`` (a gzip-compressed pickled DataFrame with
    'category' and 'element' columns), inserts spaces at camelCase
    boundaries, then for each row emits every combination of keeping or
    removing each space between words.  The result is written to
    ``./processed/`` as a gzip pickle and a CSV with the same base name.

    Parameters
    ----------
    file : str
        File name (not path) of the pickle inside ``./to_process/``.
    """
    initial_path = './to_process/'
    final_path = './processed/'
    custom = pd.read_pickle(initial_path + file, compression='gzip')
    custom = custom.drop_duplicates(subset=['category', 'element'])
    # Insert a space at camelCase boundaries, e.g. 'FooBar' -> 'Foo Bar'.
    # regex=True is explicit: pandas >= 2.0 defaults str.replace to literal.
    custom['element'] = custom['element'].str.replace(
        r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ', regex=True)
    total_rows = len(custom.index)
    logging.warning('Processing element : ' + file + 'Number of rows to combine: ' + str(total_rows))
    if total_rows > 0:
        # Accumulate plain Python lists and build the DataFrame once at the
        # end: the original re-ran pd.concat on every row, which copies the
        # whole accumulated frame each time (quadratic in total output).
        categories = []
        elements = []
        logging.warning('creating combinations')
        for _, data in custom.iterrows():
            words = data['element']
            logging.warning(words)
            # Escape literal '%' first, then turn every space into a %s slot
            # so '%'-formatting can substitute ' ' or '' per gap.
            template = words.replace('%', '%%').replace(' ', '%s')
            logging.warning('Number of words to combine: ' + str(len(words.split())))
            for gaps in itertools.product((' ', ''), repeat=words.count(' ')):
                elements.append(template % gaps)
                categories.append(data['category'])
        combined_df = pd.DataFrame(
            {'category': categories, 'element': elements},
            columns=['category', 'element'])
        combined_df.to_pickle(final_path + file, compression='gzip')
        combined_df.to_csv(final_path + os.path.splitext(file)[0] + '.csv')
        del combined_df
        del custom
        logging.warning('completed ' + file)
    else:
        logging.warning('No rows to process')
if __name__ == "__main__":
logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
partitions = 1 #number of partitions to split dataframe
cores = 2 #number of cores on your machine
path ='./to_process/'
combi_path = './processed/'
files = [f for f in listdir(path) if isfile(join(path, f))]
pickle_files=[]
for any_file in files:
if any_file.endswith('.pickle'):
if os.path.isfile(combi_path+any_file):
logging.warning(any_file +' already processed.')
else:
pickle_files.insert(len(pickle_files),any_file)
p = multiprocessing.Pool(processes = len(pickle_files))
start = time.time()
async_result = p.map_async(create_combinations, pickle_files)
p.close()
p.join()
print("Complete")
end = time.time()
print('total time (s)= ' + str(end-start))
以下是改进后的代码:
# pylint: disable=I0011
# pylint: disable=C0111
# pylint: disable=C0301
# pylint: disable=C0103
# pylint: disable=W0612
# pylint: disable=W0611
import logging
import os
from os import listdir
from os.path import isfile, join
import math
import pickle
import itertools
import multiprocessing
import time
import gc
import numpy as np
import pandas as pd
def create_combinations(file):
    """Expand every element of a pickled DataFrame into all spacing variants.

    Reads ``<initial_path>/<file>`` (a gzip-compressed pickled DataFrame
    with 'category' and 'element' columns), inserts spaces at camelCase
    boundaries, then for each row emits every combination of keeping or
    removing each space between words.  To bound memory, output is flushed
    to numbered gzip pickle + CSV chunks in ``final_path`` whenever roughly
    200,000 pairs have accumulated; on success the input pickle (and its
    companion CSV, if present) is moved to ``completed_file_path``.

    Parameters
    ----------
    file : str
        File name (not path) of the pickle inside ``initial_path``.

    Returns
    -------
    bool
        True when processing completed.
    """
    logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
    initial_path = './training/pickles/standard and documentation/custom_elements/trial/'
    final_path = './training/pickles/standard and documentation/custom_element_combination_trial/'
    completed_file_path = './training/pickles/standard and documentation/custom_elements_processed_trial/'
    custom = pd.read_pickle(initial_path + file, compression='gzip')
    custom = custom.drop_duplicates(subset=['category', 'element'])
    # Insert a space at camelCase boundaries; regex=True is explicit because
    # pandas >= 2.0 treats str.replace patterns as literal by default.
    custom['element'] = custom['element'].str.replace(
        r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ', regex=True)
    total_rows = len(custom.index)
    logging.warning('Processing element : ' + file + 'Number of rows to combine: ' + str(total_rows))

    def _flush(chunk_no, cats, eles):
        # Write one accumulated chunk as <name><chunk_no>.pickle/.csv.
        chunk = pd.DataFrame({'category': cats, 'element': eles},
                             columns=['category', 'element'])
        base = final_path + os.path.splitext(file)[0] + str(chunk_no)
        chunk.to_pickle(base + '.pickle', compression='gzip')
        chunk.to_csv(base + '.csv')
        del chunk
        gc.collect()

    cat = []
    ele = []
    logging.warning('creating combinations')
    k = 1
    for _, data in custom.iterrows():
        words = data['element']
        logging.warning(words)
        # Escape literal '%' first, then turn every space into a %s slot.
        template = words.replace('%', '%%').replace(' ', '%s')
        logging.warning('Number of words to combine: ' + str(len(words.split())))
        for gaps in itertools.product((' ', ''), repeat=words.count(' ')):
            ele.append(template % gaps)
            cat.append(data['category'])
        # NOTE: the original built `lst = zip(cat, ele)` here and called
        # len(lst), which raises TypeError on Python 3 (zip is an iterator),
        # and it deleted cat/ele after a flush without re-creating them,
        # which made the next append raise NameError.  Fixed below.
        if len(ele) > 200000:
            _flush(k, cat, ele)
            cat = []
            ele = []
            k += 1
    # Final (possibly empty) chunk.
    _flush(k, cat, ele)
    del cat, ele, custom
    gc.collect()
    logging.warning('completed ' + file)
    os.rename(initial_path + file, completed_file_path + file)
    # A companion CSV may not exist for every input; move it best-effort
    # (the original raised FileNotFoundError when it was missing).
    csv_name = os.path.splitext(file)[0] + '.csv'
    if os.path.isfile(initial_path + csv_name):
        os.rename(initial_path + csv_name, completed_file_path + csv_name)
    return True
if __name__ == "__main__":
logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
partitions = 1 #number of partitions to split dataframe
cores = 6 #number of cores on your machine
path ='./training/pickles/standard and documentation/custom_elements/trial/'
combi_path = './training/pickles/standard and documentation/custom_element_combination_trial/'
processed_file_path ='./training/pickles/standard and documentation/custom_elements_processed_trial/'
files = [f for f in listdir(path) if isfile(join(path, f))]
pickle_files=[]
for any_file in files:
if any_file.endswith('.pickle'):
if os.path.isfile(combi_path+any_file):
os.rename(path+any_file, processed_file_path+any_file)
os.rename(path+os.path.splitext(any_file)[0]+'.csv', processed_file_path+os.path.splitext(any_file)[0]+'.csv')
logging.warning(any_file +' already processed.')
else:
df = pd.read_pickle(path+any_file, compression='gzip')
rows = len(df.index)
if rows > 0:
#if rows < 500:
pickle_files.insert(len(pickle_files),any_file)
# else:
# continue
else:
os.rename(path+any_file, processed_file_path+any_file)
os.rename(path+os.path.splitext(any_file)[0]+'.csv', processed_file_path+os.path.splitext(any_file)[0]+'.csv')
del df
gc.collect()
del rows
gc.collect()
ctx = multiprocessing.get_context('spawn')
p = ctx.Pool(processes=cores, maxtasksperchild=1000)
start = time.time()
async_result = p.map_async(create_combinations, pickle_files)
p.close()
p.join()
print("Complete")
end = time.time()
print('total time (s)= ' + str(end-start))
#pylint:disable=C0111
#pylint:disable=C0301
#pylint:disable=C0103
#pylint:disable=W0612
#pylint:disable=W0611
(此处原为上述 Python 代码经机器翻译后产生的乱码副本,标识符与字符串均被翻译破坏,已无法执行;内容与上方第二版代码完全相同,请参考上方的原始代码。)