使用google cloud dataflow python导入电话号码
我正在尝试用 Python 对 phonenumbers 模块做一个相对简单的导入。我已经在一个单独的 Python 文件中(不含任何其他导入)测试过该模块,它工作得非常好。以下是我安装的软件包:
from __future__ import absolute_import
from __future__ import print_function
import argparse
import csv
import logging
import os
import phonenumbers
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
这是我的错误信息:
Traceback (most recent call last):
File "clean.py", line 114, in <module>
run()
File "clean.py", line 109, in run
| 'WriteOutputText' >> beam.io.WriteToText(known_args.output))
File "C:\Python27\lib\site-packages\apache_beam\pipeline.py", line 389, in __exit__
self.run().wait_until_finish()
File "C:\Python27\lib\site-packages\apache_beam\runners\dataflow\dataflow_runner.py", line 996, in wait_until_finish
(self.state, getattr(self._runner, 'last_error_msg', None)), self)
apache_beam.runners.dataflow.dataflow_runner.DataflowRuntimeException: Dataflow pipeline failed. State: FAILED, Error:
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/batchworker.py", line 733, in run
self._load_main_session(self.local_staging_directory)
File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/batchworker.py", line 472, in _load_main_session
pickler.load_session(session_file)
File "/usr/local/lib/python2.7/dist-packages/apache_beam/internal/pickler.py", line 247, in load_session
return dill.load_session(file_path)
File "/usr/local/lib/python2.7/dist-packages/dill/dill.py", line 363, in load_session
module = unpickler.load()
File "/usr/lib/python2.7/pickle.py", line 864, in load
dispatch[key](self)
File "/usr/lib/python2.7/pickle.py", line 1139, in load_reduce
value = func(*args)
File "/usr/local/lib/python2.7/dist-packages/dill/dill.py", line 766, in _import_module
return __import__(import_name)
ImportError: No module named phonenumbers
编辑:代码如下:
from __future__ import absolute_import
from __future__ import print_function
import argparse
import csv
import logging
import os
from collections import OrderedDict
import phonenumbers
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
class ParseCSVFn(beam.DoFn):
    """Parses one raw CSV line into an ordered Python dictionary.

    Expected column layout (0-based), inferred from the field accesses below:
        0: first name, 1: last name, 2: birth date formatted month/day/year,
        3: voter id, 4: phone number.
    Rows that do not match this layout are skipped (with a warning) rather
    than failing the whole bundle.
    """

    def process(self, elem):
        """Yields one parsed record per well-formed input line.

        Args:
            elem: a single raw CSV line (str).

        Yields:
            collections.OrderedDict with keys phoneNumber, firstName,
            lastName, birthDate (itself an OrderedDict of day/month/year),
            and voterId. Malformed rows yield nothing.
        """
        try:
            row = list(csv.reader([elem]))[0]
            month, day, year = row[2].split('/')
            # Build OrderedDicts from ordered pairs: constructing an
            # OrderedDict from a plain dict (as the original code did) gives
            # arbitrary key order on Python 2.7, defeating its purpose.
            birth_date = OrderedDict([
                ('day', day),
                ('month', month),
                ('year', year),
            ])
            record = OrderedDict([
                ('phoneNumber', row[4]),
                ('firstName', row[0]),
                ('lastName', row[1]),
                ('birthDate', birth_date),
                ('voterId', row[3]),
            ])
        except (csv.Error, IndexError, ValueError) as exc:
            # Narrowed from a bare `except: pass`, which silently swallowed
            # every error (including real bugs). Log the skipped row so bad
            # input is still visible in the worker logs.
            logging.warning('Skipping unparseable row %r: %s', elem, exc)
            return
        yield record
def run(argv=None):
    """Pipeline entry point: parses arguments, builds and runs the pipeline.

    Reads CSV contact records from GCS, parses each line with ParseCSVFn,
    and writes the resulting records back to GCS as text.

    Args:
        argv: optional list of command-line arguments; defaults to sys.argv.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        type=str,
                        dest='input',
                        default='gs://wordcount_project/demo-contacts-small*.csv',
                        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        # A Google Cloud Storage path is required for output
                        # when running on the Dataflow service.
                        default='gs://wordcount_project/cleaned.csv',
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        # DataflowRunner executes the pipeline on the Cloud Dataflow service.
        '--runner=DataflowRunner',
        # Project ID is required to run on the Dataflow service.
        '--project=--------',
        # GCS paths are required for staging local files and temp files.
        '--staging_location=gs://wordcount_project/staging',
        '--temp_location=gs://wordcount_project/temp',
        '--job_name=cleaning-jobs',
        # NOTE(review): non-stdlib dependencies (e.g. phonenumbers) are not
        # shipped to Dataflow workers automatically. Pass
        # '--requirements_file=requirements.txt' (or provide a setup.py via
        # '--setup_file') so workers can import them -- the worker-side
        # "ImportError: No module named phonenumbers" in the logs above is
        # the symptom of this being missing. Confirm before enabling.
    ])
    pipeline_options = PipelineOptions(pipeline_args)
    # Pickle the main session so module-level state is available on workers.
    pipeline_options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=pipeline_options) as p:
        (p
         | 'ReadInputText' >> beam.io.ReadFromText(known_args.input)
         | 'ParseDataFn' >> beam.ParDo(ParseCSVFn())
         | 'WriteOutputText' >> beam.io.WriteToText(known_args.output))
if __name__ == '__main__':
    # Raise root-logger verbosity so pipeline submission progress is visible.
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)
    run()
我也尝试过安装GoogleGax的特定软件包和请求,但似乎没有帮助
编辑:新编码错误:
File "new_clean.py", line 226, in <module>
run()
File "new_clean.py", line 219, in run
| 'WriteToText' >> beam.io.WriteToText(known_args.output)
File "C:\Python27\lib\site-packages\apache_beam\pipeline.py", line 389, in __exit__
self.run().wait_until_finish()
File "C:\Python27\lib\site-packages\apache_beam\pipeline.py", line 369, in run
self.to_runner_api(), self.runner, self._options).run(False)
File "C:\Python27\lib\site-packages\apache_beam\pipeline.py", line 382, in run
return self.runner.run_pipeline(self)
File "C:\Python27\lib\site-packages\apache_beam\runners\dataflow\dataflow_runner.py", line 324, in run_pipeline
self.dataflow_client.create_job(self.job), self)
File "C:\Python27\lib\site-packages\apache_beam\utils\retry.py", line 180, in wrapper
return fun(*args, **kwargs)
File "C:\Python27\lib\site-packages\apache_beam\runners\dataflow\internal\apiclient.py", line 461, in create_job
self.create_job_description(job)
File "C:\Python27\lib\site-packages\apache_beam\runners\dataflow\internal\apiclient.py", line 491, in create_job_description
job.options, file_copy=self._gcs_file_copy)
File "C:\Python27\lib\site-packages\apache_beam\runners\dataflow\internal\dependency.py", line 328, in stage_job_resources
setup_options.requirements_file, requirements_cache_path)
File "C:\Python27\lib\site-packages\apache_beam\runners\dataflow\internal\dependency.py", line 262, in _populate_requirements_cache
processes.check_call(cmd_args)
File "C:\Python27\lib\site-packages\apache_beam\utils\processes.py", line 44, in check_call
return subprocess.check_call(*args, **kwargs)
File "C:\Python27\lib\subprocess.py", line 186, in check_call
raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['C:\\Python27\\python.exe', '-m', 'pip', 'download', '--dest', 'c:\\users\\james\\appdata\\local\\temp\\dataflow-requirements-cache', '-r', 'requirements.txt', '--no-binary', ':all:']' returned non-zero exit status 1
文件“new_clean.py”,第226行,在
运行()
文件“new_clean.py”,第219行,运行中
|'WriteToText'>>beam.io.WriteToText(已知参数输出)
文件“C:\Python27\lib\site packages\apache\u beam\pipeline.py”,第389行,在\u\u出口__
self.run()
文件“C:\Python27\lib\site packages\apache\u beam\pipeline.py”,第369行,正在运行
self.to_runner_api(),self.runner,self.\u选项)。运行(False)
文件“C:\Python27\lib\site packages\apache\u beam\pipeline.py”,第382行,正在运行
返回self.runner.run_管道(self)
运行管道中的文件“C:\Python27\lib\site packages\apache\u beam\runners\dataflow\dataflow\u runner.py”,第324行
self.dataflow\u client.create\u作业(self.job),self)
文件“C:\Python27\lib\site packages\apache\u beam\utils\retry.py”,第180行,在包装器中
返回乐趣(*args,**kwargs)
文件“C:\Python27\lib\site packages\apache\u beam\runners\dataflow\internal\apiclient.py”,第461行,位于create\u作业中
自我创建工作描述(工作)
文件“C:\Python27\lib\site packages\apache\u beam\runners\dataflow\internal\apiclient.py”,第491行,位于创建作业描述中
job.options,文件\u copy=self.\u gcs\u文件\u copy)
文件“C:\Python27\lib\site packages\apache\u beam\runners\dataflow\internal\dependency.py”,第328行,在stage\u job\u参考资料中
设置\u选项.requirements\u文件,requirements\u缓存\u路径)
文件“C:\Python27\lib\site packages\apache\u beam\runners\dataflow\internal\dependency.py”,第262行,位于\u populate\u requirements\u cache中
进程。检查调用(cmd\u args)
文件“C:\Python27\lib\site packages\apache\u beam\utils\processs.py”,第44行,在check\u调用中
返回子进程。检查调用(*args,**kwargs)
文件“C:\Python27\lib\subprocess.py”,第186行,在check\u调用中
引发被调用的进程错误(retcode,cmd)
subprocess.CalledProcessError:Command'['C:\\Python27\\python.exe','-m',pip',download','-dest','C:\\users\\james\\appdata\\local\\temp\\dataflow requirements cache','-r','requirements.txt','-no binary',':all:']'返回非零退出状态1
可能是数据流没有接收到包含管道额外依赖项的文件。要安装它们,请执行以下操作:
pip freeze > requirements.txt
然后,您需要编辑requirements.txt
文件,只保留从PyPI安装并在管道中使用的软件包
运行管道时,传递以下命令行选项:
--requirements_file requirements.txt
这在中有记录
希望这会有所帮助。

(评论)Hi Pablo,我已经生成了 requirements.txt 文件,并确认其中只保留了从 PyPI 安装且在 pipeline 中实际使用的软件包,但现在遇到了问题中新贴出的那个错误。——后来它成功了:我改用 Google Cloud SDK shell 运行(之前用的是 git bash),就没有问题了。谢谢!