Python:对循环中调用的函数(有返回值且接受多个参数)使用多进程处理
我知道这个问题被问了很多次,但我找不到一个类似的案例 我有这个功能:Python 具有返回函数且接受多个参数的循环的多处理,python,python-3.x,multithreading,parallel-processing,multiprocessing,Python,Python 3.x,Multithreading,Parallel Processing,Multiprocessing,我知道这个问题被问了很多次,但我找不到一个类似的案例 我有这个功能: def load_data(list_of_files, INP_DIR, return_featues=False): data = [] # ------- I want to multithread this block------# for file_name in tqdm(list_of_files): subject , features = load_subje
def load_data(list_of_files, INP_DIR, return_featues=False):
    """Load every subject file and stack the results into one DataFrame.

    Args:
        list_of_files: Iterable of file names, each handed to ``load_subject``.
        INP_DIR: Directory prefix forwarded unchanged to ``load_subject``.
        return_featues: When true, also return the ``features`` value
            produced for the last file that was loaded.

    Returns:
        ``(data, target)`` or ``(data, target, features)``, where ``data`` is
        the row-wise concatenation of all subjects and ``target`` is its
        ``'label'`` column.
    """
    frames = []
    # This per-file loop is the part the author wants to run in parallel.
    for name in tqdm(list_of_files):
        subject, features = load_subject(INP_DIR, name)
        frames.append(subject.reset_index())
    combined = pd.concat(frames, axis=0, ignore_index=True)
    labels = combined['label']
    if not return_featues:
        return combined, labels
    return combined, labels, features
上述函数调用了 load_subject
,供您参考,其定义如下:
def load_subject(INP_DIR,file_name):
    """Read one subject file and return it together with its features.

    The path is built as ``INP_DIR + file_name`` and parsed as
    '|'-separated text with ``pd.read_csv``.
    """
    subject= pd.read_csv(INP_DIR+ file_name, sep='|')
    # NOTE(review): the processing step is elided in the post; `features`
    # is presumably produced here -- confirm against the real code.
    < do some processing ...>
    return subject, features
如您所见,train_files是一个文件名列表
当我运行上述行时,我得到以下错误:
---------------------------------------------------------------------------
RemoteTraceback Traceback (most recent call last)
RemoteTraceback:
"""
Traceback (most recent call last):
File "/anaconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
result = (True, func(*args, **kwds))
File "/anaconda3/lib/python3.6/multiprocessing/pool.py", line 44, in mapstar
return list(map(*args))
TypeError: load_subject() missing 1 required positional argument: 'file_name'
"""
The above exception was the direct cause of the following exception:
TypeError Traceback (most recent call last)
<ipython-input-24-96a3ce89ebb8> in <module>()
2 if __name__ == '__main__':
3 with Pool(processes=2) as pool:
----> 4 pool.map(load_subject, train_files) # process data_inputs iterable with pool
/anaconda3/lib/python3.6/multiprocessing/pool.py in map(self, func, iterable, chunksize)
264 in a list that is returned.
265 '''
--> 266 return self._map_async(func, iterable, mapstar, chunksize).get()
267
268 def starmap(self, func, iterable, chunksize=None):
/anaconda3/lib/python3.6/multiprocessing/pool.py in get(self, timeout)
642 return self._value
643 else:
--> 644 raise self._value
645
646 def _set(self, i, obj):
TypeError: load_subject() missing 1 required positional argument: 'file_name'
---------------------------------------------------------------------------
远程回溯回溯(最近一次呼叫最后一次)
远程回溯:
"""
回溯(最近一次呼叫最后一次):
worker中的文件“/anaconda3/lib/python3.6/multiprocessing/pool.py”,第119行
结果=(True,func(*args,**kwds))
mapstar中的文件“/anaconda3/lib/python3.6/multiprocessing/pool.py”,第44行
返回列表(映射(*args))
TypeError:load\u subject()缺少1个必需的位置参数:“文件名”
"""
上述异常是以下异常的直接原因:
TypeError回溯(最近一次调用上次)
在()
2如果uuuu name uuuuuu='\uuuuuuuu main\uuuuuuuuu':
3将池(进程=2)作为池:
---->4 pool.map(加载主题、训练文件)#过程数据#可与pool匹配的输入
/映射中的anaconda3/lib/python3.6/multiprocessing/pool.py(self、func、iterable、chunksize)
返回的列表中的264。
265 '''
-->266返回self.\u map\u async(func、iterable、mapstar、chunksize).get()
267
268 def星图(self、func、iterable、chunksize=None):
/get中的anaconda3/lib/python3.6/multiprocessing/pool.py(self,timeout)
642返回自身值
643其他:
-->644提高自我价值
645
646 def_设置(自、i、obj):
TypeError:load\u subject()缺少1个必需的位置参数:“文件名”
更新:
在得到 Tom 的回答后,我找到了另一种只传递一个参数的方法。
以下是函数,您将看到我得到的错误:
def load_data(list_of_files):
    """Load every subject file and stack the results into one DataFrame.

    Args:
        list_of_files: Iterable of file names. It must be a collection of
            names; a single string would be iterated character by character.

    Returns:
        ``(data, target)`` where ``data`` is the row-wise concatenation of
        all loaded subjects and ``target`` is its ``'label'`` column.
    """
    data = []
    # ------- I want to multithread this block------#
    for file_name in tqdm(list_of_files):
        # load_subject takes only the file name in this version; the old
        # two-argument call referenced INP_DIR, which is no longer defined.
        subject, _ = load_subject(file_name)
        data.append(subject.reset_index())
    # -------------#
    data = pd.concat(data, axis=0, ignore_index=True)
    target = data['label']
    return data, target
def load_subject(file_name):
    """Read one subject file and return it together with its features.

    ``file_name`` is passed straight to ``pd.read_csv`` and parsed as
    '|'-separated text.
    """
    subject= pd.read_csv(file_name, sep='|')
    # NOTE(review): the processing step is elided in the post; `features`
    # is presumably produced here -- confirm against the real code.
    < do some processing ...>
    return subject, features
train_files = ['p011431.psv', 'p008160.psv', 'p007253.psv', 'p018373.psv']
from multiprocessing import Pool
if __name__ == '__main__':
    # pool.map calls its function once per element of the iterable, so the
    # per-file loader is what gets mapped. Mapping load_data would hand it a
    # single file-name string, which it would iterate character by character.
    with Pool(processes=64) as pool:
        results = pool.map(load_subject, train_files)
    # Each result is a (subject, features) pair; combine the subjects the
    # same way load_data does.
    data = pd.concat(
        [subject.reset_index() for subject, _ in results],
        axis=0,
        ignore_index=True,
    )
    target = data['label']
def load_数据(文件列表):
数据=[]
#----我想多线程处理这个块------#
对于TQM中的文件名(文件列表):
主题,功能=加载主题(输入目录,文件名)
data.append(subject.reset\u index())
# -------------#
数据=pd.concat(数据,轴=0,忽略索引=True)
目标=数据['label']
返回数据、目标
def加载主题(文件名):
subject=pd.read_csv(文件名,sep='|')
<做一些处理…>
返回主题、特征
列车_文件=['p011431.psv'、'p008160.psv'、'p007253.psv'、'p018373.psv']
来自多处理导入池
如果uuuu name uuuuuu='\uuuuuuu main\uuuuuuu':
将池(进程=64)作为池:
pool.map(加载\u数据、列车\u文件)
当我运行上述行时,我得到一个新错误:
---------------------------------------------------------------------------
RemoteTraceback Traceback (most recent call last)
RemoteTraceback:
"""
Traceback (most recent call last):
File "/anaconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
result = (True, func(*args, **kwds))
File "/anaconda3/lib/python3.6/multiprocessing/pool.py", line 44, in mapstar
return list(map(*args))
File "<ipython-input-21-494105028a08>", line 407, in load_data
subject , features = load_subject(file_name)
File "<ipython-input-21-494105028a08>", line 170, in load_subject
subject= pd.read_csv(file_name, sep='|')
File "/anaconda3/lib/python3.6/site-packages/pandas/io/parsers.py", line 678, in parser_f
return _read(filepath_or_buffer, kwds)
File "/anaconda3/lib/python3.6/site-packages/pandas/io/parsers.py", line 440, in _read
parser = TextFileReader(filepath_or_buffer, **kwds)
File "/anaconda3/lib/python3.6/site-packages/pandas/io/parsers.py", line 787, in __init__
self._make_engine(self.engine)
File "/anaconda3/lib/python3.6/site-packages/pandas/io/parsers.py", line 1014, in _make_engine
self._engine = CParserWrapper(self.f, **self.options)
File "/anaconda3/lib/python3.6/site-packages/pandas/io/parsers.py", line 1708, in __init__
self._reader = parsers.TextReader(src, **kwds)
File "pandas/_libs/parsers.pyx", line 539, in pandas._libs.parsers.TextReader.__cinit__
File "pandas/_libs/parsers.pyx", line 737, in pandas._libs.parsers.TextReader._get_header
File "pandas/_libs/parsers.pyx", line 932, in pandas._libs.parsers.TextReader._tokenize_rows
File "pandas/_libs/parsers.pyx", line 2112, in pandas._libs.parsers.raise_parser_error
pandas.errors.ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.
"""
The above exception was the direct cause of the following exception:
ParserError Traceback (most recent call last)
<ipython-input-22-d6dcc5840b63> in <module>()
4
5 with Pool(processes=3) as pool:
----> 6 pool.map(load_data, files)
/anaconda3/lib/python3.6/multiprocessing/pool.py in map(self, func, iterable, chunksize)
264 in a list that is returned.
265 '''
--> 266 return self._map_async(func, iterable, mapstar, chunksize).get()
267
268 def starmap(self, func, iterable, chunksize=None):
/anaconda3/lib/python3.6/multiprocessing/pool.py in get(self, timeout)
642 return self._value
643 else:
--> 644 raise self._value
645
646 def _set(self, i, obj):
ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.
---------------------------------------------------------------------------
远程回溯回溯(最近一次呼叫最后一次)
远程回溯:
"""
回溯(最近一次呼叫最后一次):
worker中的文件“/anaconda3/lib/python3.6/multiprocessing/pool.py”,第119行
结果=(True,func(*args,**kwds))
mapstar中的文件“/anaconda3/lib/python3.6/multiprocessing/pool.py”,第44行
返回列表(映射(*args))
文件“”,第407行,在load_数据中
主题,功能=加载主题(文件名)
文件“”,第170行,加载主题
subject=pd.read_csv(文件名,sep='|')
文件“/anaconda3/lib/python3.6/site packages/pandas/io/parsers.py”,第678行,在解析器中
返回读取(文件路径或缓冲区,kwds)
文件“/anaconda3/lib/python3.6/site packages/pandas/io/parsers.py”,第440行,已读
parser=TextFileReader(文件路径或缓冲区,**kwds)
文件“/anaconda3/lib/python3.6/site packages/pandas/io/parsers.py”,第787行,在__
自制发动机(自制发动机)
文件“/anaconda3/lib/python3.6/site packages/pandas/io/parsers.py”,第1014行,在“make”引擎中
self.\u engine=CParserWrapper(self.f,**self.options)
文件“/anaconda3/lib/python3.6/site packages/pandas/io/parsers.py”,第1708行,在__
self.\u reader=parsers.TextReader(src,**kwds)
文件“pandas/_libs/parsers.pyx”,第539行,在pandas._libs.parsers.TextReader.\uu\cinit中__
文件“pandas/_libs/parsers.pyx”,第737行,在pandas._libs.parsers.TextReader._get_头中
文件“pandas/_libs/parsers.pyx”,第932行,在pandas._libs.parsers.TextReader._标记化_行中
文件“pandas/_libs/parsers.pyx”,第2112行,在pandas._libs.parsers.raise_parser_错误
pandas.errors.ParserError:标记数据时出错。C错误:在源上调用读取(nbytes)失败。请尝试engine='python'。
"""
上述异常是以下异常的直接原因:
ParserError回溯(上次最近的调用)
在()
4.
5将池(进程=3)作为池:
---->6 pool.map(加载_数据、文件)
/映射中的anaconda3/lib/python3.6/multiprocessing/pool.py(self、func、iterable、chunksize)
返回的列表中的264。
265 '''
-->266返回self.\u map\u async(func、iterable、mapstar、chunksize).get()
267
268 def星图(self、func、iterable、chunksize=None):
/get中的anaconda3/lib/python3.6/multiprocessing/pool.py(self,timeout)
642返回自身值
643其他:
-->644提高自我价值
645
646 def_设置(自、i、obj):
ParserError:标记数据时出错。C错误:在源上调用读取(nbytes)失败。试试engine='python'。
我遗漏了什么?如何才能让它正常工作?multiprocessing 的
Pool.map()
函数每次只能向被调用的函数传递一个参数。我相信 Python 3 中有更“正规”的解决办法(例如 Pool.starmap),但我在 Python 2 中一直使用下面这个技巧,而且看不出它现在有什么理由不再适用。
为 load_subject
定义一个只接受一个参数的包装函数,并为该参数定义一个专门的对象。
def包装的加载对象(p
---------------------------------------------------------------------------
RemoteTraceback Traceback (most recent call last)
RemoteTraceback:
"""
Traceback (most recent call last):
File "/anaconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
result = (True, func(*args, **kwds))
File "/anaconda3/lib/python3.6/multiprocessing/pool.py", line 44, in mapstar
return list(map(*args))
File "<ipython-input-21-494105028a08>", line 407, in load_data
subject , features = load_subject(file_name)
File "<ipython-input-21-494105028a08>", line 170, in load_subject
subject= pd.read_csv(file_name, sep='|')
File "/anaconda3/lib/python3.6/site-packages/pandas/io/parsers.py", line 678, in parser_f
return _read(filepath_or_buffer, kwds)
File "/anaconda3/lib/python3.6/site-packages/pandas/io/parsers.py", line 440, in _read
parser = TextFileReader(filepath_or_buffer, **kwds)
File "/anaconda3/lib/python3.6/site-packages/pandas/io/parsers.py", line 787, in __init__
self._make_engine(self.engine)
File "/anaconda3/lib/python3.6/site-packages/pandas/io/parsers.py", line 1014, in _make_engine
self._engine = CParserWrapper(self.f, **self.options)
File "/anaconda3/lib/python3.6/site-packages/pandas/io/parsers.py", line 1708, in __init__
self._reader = parsers.TextReader(src, **kwds)
File "pandas/_libs/parsers.pyx", line 539, in pandas._libs.parsers.TextReader.__cinit__
File "pandas/_libs/parsers.pyx", line 737, in pandas._libs.parsers.TextReader._get_header
File "pandas/_libs/parsers.pyx", line 932, in pandas._libs.parsers.TextReader._tokenize_rows
File "pandas/_libs/parsers.pyx", line 2112, in pandas._libs.parsers.raise_parser_error
pandas.errors.ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.
"""
The above exception was the direct cause of the following exception:
ParserError Traceback (most recent call last)
<ipython-input-22-d6dcc5840b63> in <module>()
4
5 with Pool(processes=3) as pool:
----> 6 pool.map(load_data, files)
/anaconda3/lib/python3.6/multiprocessing/pool.py in map(self, func, iterable, chunksize)
264 in a list that is returned.
265 '''
--> 266 return self._map_async(func, iterable, mapstar, chunksize).get()
267
268 def starmap(self, func, iterable, chunksize=None):
/anaconda3/lib/python3.6/multiprocessing/pool.py in get(self, timeout)
642 return self._value
643 else:
--> 644 raise self._value
645
646 def _set(self, i, obj):
ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.
# NOTE(review): pool.map calls load_data once per element of train_files.
# If train_files is the list of file names shown earlier, each call receives
# a single file-name string rather than a list -- confirm this is intended.
with Pool(processes=64) as pool:
    res = pool.map(load_data, train_files)