How to use vectorize or apply instead of iterrows on a pandas DataFrame in Python


I have more than 2000 dataframes, each with two columns. I want to create ngrams on one of the columns and then build a new dataframe from those ngrams. Here is my code. It works fine, it just takes a lot of time.

I am currently using iterrows to go through every row of every dataframe in every file. Is there a simpler way to do this with vectorization or apply?

import logging
import os
from os import listdir
from os.path import isfile, join
import math
import pickle
import itertools
import multiprocessing
import psutil
import numpy as np
import pandas as pd
import time

def create_combinations(file):
    initial_path ='./to_process/'
    final_path = './processed/'
    custom = pd.read_pickle(initial_path+file, compression='gzip')
    custom = custom.drop_duplicates(subset=['category', 'element'])
    # insert a space at camelCase boundaries; regex=True keeps this a regex replace on newer pandas where str.replace defaults to literal matching
    custom['element'] = custom['element'].str.replace(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ', regex=True)
    total_rows = len(custom.index)
    logging.warning('Processing element: ' + file + '. Number of rows to combine: ' + str(total_rows))
    # if total_rows > cores:
    #     partitions = math.floor(total_rows/cores)
    # logging.warning('Number of partitions : ' + str(partitions))
    if total_rows > 0:
        combined_df = pd.DataFrame(columns=['category', 'element'])
        logging.warning('creating combinations')
        for key, data in custom.iterrows():
            words = data['element']#.split()
            logging.warning(words)
            words2 = words.replace('%', '%%').replace(' ', '%s')
            logging.warning('Number of words to combine: '+ str(len(words.split())))
            k = 0
            df1 = pd.DataFrame(columns=['category', 'element'])
            for i in itertools.product((' ', ''), repeat=words.count(' ')):
                df1.loc[k, 'element'] = (words2 % i)
                df1.loc[k, 'category'] = data['category']
                k += 1
            combined_df = pd.concat([combined_df,df1], axis=0)
            del df1
        combined_df.to_pickle(final_path + file, compression='gzip')
        combined_df.to_csv(final_path + os.path.splitext(file)[0]+'.csv') 
        del combined_df
        del custom
            # partitions = 1
        logging.warning('completed ' + file)
    else:
        logging.warning('No rows to process')





if __name__ == "__main__":
    logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
    partitions = 1 #number of partitions to split dataframe
    cores = 2 #number of cores on your machine
    path ='./to_process/'
    combi_path = './processed/'
    files = [f for f in listdir(path) if isfile(join(path, f))]
    pickle_files=[]
    for any_file in files:
        if any_file.endswith('.pickle'):
            if os.path.isfile(combi_path+any_file):
                logging.warning(any_file +' already processed.')
            else:
                pickle_files.insert(len(pickle_files),any_file)
    p = multiprocessing.Pool(processes = len(pickle_files))
    start = time.time()
    async_result = p.map_async(create_combinations, pickle_files)
    p.close()
    p.join()
    print("Complete")
    end = time.time()
    print('total time (s)= ' + str(end-start))
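
For comparison, here is a minimal sketch (not part of the original script) of how the per-row iterrows loop could be replaced with apply plus DataFrame.explode, assuming pandas 0.25 or newer; the spacing_combinations helper and the toy frame are illustrative only:

import itertools
import pandas as pd

def spacing_combinations(words):
    # build a template like 'foo%sbar' and fill each %s with either ' ' or ''
    template = words.replace('%', '%%').replace(' ', '%s')
    return [template % combo
            for combo in itertools.product((' ', ''), repeat=words.count(' '))]

# toy frame standing in for one of the real pickles
custom = pd.DataFrame({'category': ['a', 'b'],
                       'element': ['foo bar', 'baz qux quux']})

# apply returns one list of variants per row; explode turns each list back into rows,
# repeating the category value, so no row-by-row DataFrame growth is needed
combined_df = (custom.assign(element=custom['element'].apply(spacing_combinations))
                     .explode('element')
                     .reset_index(drop=True))
print(combined_df)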
Below is the fuller version of the script, which writes the combined rows out in chunks and forces garbage collection:

#pylint:disable=I0011
# pylint: disable=I0011
# pylint: disable=C0111
# pylint: disable=C0301
# pylint: disable=C0103
# pylint: disable=W0612
# pylint: disable=W0611
import logging
import os
from os import listdir
from os.path import isfile, join
import math
import pickle
import itertools
import multiprocessing
import time
import gc
import numpy as np
import pandas as pd


def create_combinations(file):
    logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
    initial_path ='./training/pickles/standard and documentation/custom_elements/trial/'
    final_path = './training/pickles/standard and documentation/custom_element_combination_trial/'
    completed_file_path ='./training/pickles/standard and documentation/custom_elements_processed_trial/'
    custom = pd.read_pickle(initial_path+file, compression='gzip')
    custom = custom.drop_duplicates(subset=['category', 'element'])
    # insert a space at camelCase boundaries; regex=True keeps this a regex replace on newer pandas where str.replace defaults to literal matching
    custom['element'] = custom['element'].str.replace(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ', regex=True)
    total_rows = len(custom.index)
    logging.warning('Processing element: ' + file + '. Number of rows to combine: ' + str(total_rows))
    cat = []
    ele = []
    combined_df = pd.DataFrame(columns=['category', 'element'])
    logging.warning('creating combinations')
    k=1
    for key, data in custom.iterrows():
        words = data['element']#.split()
        logging.warning(words)
        words2 = words.replace('%', '%%').replace(' ', '%s')
        logging.warning('Number of words to combine: '+ str(len(words.split())))
        for i in itertools.product((' ', ''), repeat=words.count(' ')):
            ele.append(words2 % i)
            cat.append(data['category'])
        # zip() returns an iterator in Python 3, so materialise it before calling len()
        lst = list(zip(cat, ele))
        if len(lst) > 200000:
            # flush this chunk to disk and start fresh lists for the next one
            cat = []
            ele = []
            combined_df = pd.DataFrame.from_records(lst, columns=['category', 'element'])
            combined_df.to_pickle(final_path + os.path.splitext(file)[0] + str(k)+'.pickle', compression='gzip')
            combined_df.to_csv(final_path + os.path.splitext(file)[0] + str(k)+'.csv') 
            #del combined_df
            gc.collect()
            k+=1
    # final flush: build the last DataFrame from whatever is still in the lists
    combined_df = pd.DataFrame.from_records(list(zip(cat, ele)), columns=['category', 'element'])
    del cat
    del ele
    combined_df.to_pickle(final_path + os.path.splitext(file)[0] + str(k)+'.pickle', compression='gzip')
    combined_df.to_csv(final_path + os.path.splitext(file)[0] + str(k)+'.csv') 
    del combined_df
    gc.collect()
    del custom
    del words
    del words2
    logging.warning('completed ' + file)
    os.rename(initial_path+file, completed_file_path+file)
    os.rename(initial_path+os.path.splitext(file)[0]+'.csv', completed_file_path+os.path.splitext(file)[0]+'.csv')
    return True


if __name__ == "__main__":
    logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
    partitions = 1 #number of partitions to split dataframe
    cores = 6 #number of cores on your machine
    path ='./training/pickles/standard and documentation/custom_elements/trial/'
    combi_path = './training/pickles/standard and documentation/custom_element_combination_trial/'
    processed_file_path ='./training/pickles/standard and documentation/custom_elements_processed_trial/'
    files = [f for f in listdir(path) if isfile(join(path, f))]
    pickle_files=[]
    for any_file in files:
        if any_file.endswith('.pickle'):
            if os.path.isfile(combi_path+any_file):
                os.rename(path+any_file, processed_file_path+any_file)
                os.rename(path+os.path.splitext(any_file)[0]+'.csv', processed_file_path+os.path.splitext(any_file)[0]+'.csv')
                logging.warning(any_file +' already processed.')
            else:
                df = pd.read_pickle(path+any_file, compression='gzip')
                rows = len(df.index)
                if rows > 0:
                    #if rows < 500:
                    pickle_files.insert(len(pickle_files),any_file)
                    # else:
                    #     continue
                else:
                    os.rename(path+any_file, processed_file_path+any_file)
                    os.rename(path+os.path.splitext(any_file)[0]+'.csv', processed_file_path+os.path.splitext(any_file)[0]+'.csv')
                del df
                gc.collect()
                del rows
                gc.collect()
    ctx = multiprocessing.get_context('spawn')
    p = ctx.Pool(processes=cores, maxtasksperchild=1000)
    start = time.time()
    async_result = p.map_async(create_combinations, pickle_files)
    p.close()
    p.join()
    print("Complete")
    end = time.time()
    print('total time (s)= ' + str(end-start))
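
If memory is the concern, the same apply/explode idea can be combined with chunked output, mirroring the 200000-row flushing above. A rough sketch, with illustrative names (explode_in_chunks, out_prefix) and an arbitrary chunk size:

import itertools
import pandas as pd

def spacing_combinations(words):
    # every variant of `words` with each space either kept or removed
    template = words.replace('%', '%%').replace(' ', '%s')
    return [template % combo
            for combo in itertools.product((' ', ''), repeat=words.count(' '))]

def explode_in_chunks(custom, out_prefix, chunk_rows=1000):
    # explode a slice of rows at a time so no single output frame has to hold every combination
    for k, start in enumerate(range(0, len(custom), chunk_rows), start=1):
        chunk = custom.iloc[start:start + chunk_rows]
        combined = (chunk.assign(element=chunk['element'].apply(spacing_combinations))
                         .explode('element')
                         .reset_index(drop=True))
        combined.to_pickle(f'{out_prefix}{k}.pickle', compression='gzip')
        combined.to_csv(f'{out_prefix}{k}.csv')

Each chunk is exploded and written independently, so peak memory is bounded by the chunk size rather than by the total number of combinations.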