Python AWS胶水变换
尝试从s3 bucket读取Input.csv文件,获取不同的值(并进行一些其他转换),然后写入target.csv文件,但在尝试将数据写入s3 bucket中的target.csv时遇到问题 下面是代码:Python AWS胶水变换,python,amazon-web-services,amazon-s3,aws-glue,Python,Amazon Web Services,Amazon S3,Aws Glue,尝试从s3 bucket读取Input.csv文件,获取不同的值(并进行一些其他转换),然后写入target.csv文件,但在尝试将数据写入s3 bucket中的target.csv时遇到问题 下面是代码: import sys from awsglue.utils import getResolvedOptions from pyspark.context import SparkContext from awsglue.context import GlueContext from awsg
import sys
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
glueContext = GlueContext(SparkContext.getOrCreate())
dfnew = glueContext.create_dynamic_frame_from_options("s3", {'paths': ["s3://bucket_name/Input.csv"] }, format="csv" )
dfMod = dfnew.select_fields(["Col2","Col3"]).toDF().distinct()
dnFrame = DynamicFrame.fromDF(dfMod, glueContext, "test_nest")
datasink = glueContext.write_dynamic_frame.from_options(frame = dnFrame, connection_type = "s3",connection_options = {"path": "s3://bucket_name/Target.csv"}, format = "csv", transformation_ctx ="datasink")
Col1 Col2 Col3
1 1 -30.4
2 2 -30.5
3 3 6.70
4 4 5.89
5 4 6.89
6 4 6.70
7 4 5.89
8 4 5.89
val dfmod = dfnew.select_fields(["Col2","Col3"]).toDF().distinct().show() ^ SyntaxError: invalid syntax During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/opt/amazon/bin/runscript.py", line 92, in <module>
while "runpy.py" in new_stack.tb_frame.f_code.co_filename: AttributeError: 'NoneType' object has no attribute 'tb_frame'
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"s3:GetObject",
"s3:PutObject"
],
"Resource": [
"arn:aws:s3:::bucket_Name/Output/**/**/*"
]
}
]
}
{
"Version": "2012-10-17",
"Id": "Policy***",
"Statement": [
{
"Sid": "Stmt1***",
"Effect": "Allow",
"Principal": {
"AWS": "arn:aws:iam::account_number:root"
},
"Action": "s3:*",
"Resource": "arn:aws:s3:::bucket_name"
}
]
}
这是Input.csv中的数据:
import sys
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
glueContext = GlueContext(SparkContext.getOrCreate())
dfnew = glueContext.create_dynamic_frame_from_options("s3", {'paths': ["s3://bucket_name/Input.csv"] }, format="csv" )
dfMod = dfnew.select_fields(["Col2","Col3"]).toDF().distinct()
dnFrame = DynamicFrame.fromDF(dfMod, glueContext, "test_nest")
datasink = glueContext.write_dynamic_frame.from_options(frame = dnFrame, connection_type = "s3",connection_options = {"path": "s3://bucket_name/Target.csv"}, format = "csv", transformation_ctx ="datasink")
Col1 Col2 Col3
1 1 -30.4
2 2 -30.5
3 3 6.70
4 4 5.89
5 4 6.89
6 4 6.70
7 4 5.89
8 4 5.89
val dfmod = dfnew.select_fields(["Col2","Col3"]).toDF().distinct().show() ^ SyntaxError: invalid syntax During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/opt/amazon/bin/runscript.py", line 92, in <module>
while "runpy.py" in new_stack.tb_frame.f_code.co_filename: AttributeError: 'NoneType' object has no attribute 'tb_frame'
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"s3:GetObject",
"s3:PutObject"
],
"Resource": [
"arn:aws:s3:::bucket_Name/Output/**/**/*"
]
}
]
}
{
"Version": "2012-10-17",
"Id": "Policy***",
"Statement": [
{
"Sid": "Stmt1***",
"Effect": "Allow",
"Principal": {
"AWS": "arn:aws:iam::account_number:root"
},
"Action": "s3:*",
"Resource": "arn:aws:s3:::bucket_name"
}
]
}
错误:
import sys
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
glueContext = GlueContext(SparkContext.getOrCreate())
dfnew = glueContext.create_dynamic_frame_from_options("s3", {'paths': ["s3://bucket_name/Input.csv"] }, format="csv" )
dfMod = dfnew.select_fields(["Col2","Col3"]).toDF().distinct()
dnFrame = DynamicFrame.fromDF(dfMod, glueContext, "test_nest")
datasink = glueContext.write_dynamic_frame.from_options(frame = dnFrame, connection_type = "s3",connection_options = {"path": "s3://bucket_name/Target.csv"}, format = "csv", transformation_ctx ="datasink")
Col1 Col2 Col3
1 1 -30.4
2 2 -30.5
3 3 6.70
4 4 5.89
5 4 6.89
6 4 6.70
7 4 5.89
8 4 5.89
val dfmod = dfnew.select_fields(["Col2","Col3"]).toDF().distinct().show() ^ SyntaxError: invalid syntax During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/opt/amazon/bin/runscript.py", line 92, in <module>
while "runpy.py" in new_stack.tb_frame.f_code.co_filename: AttributeError: 'NoneType' object has no attribute 'tb_frame'
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"s3:GetObject",
"s3:PutObject"
],
"Resource": [
"arn:aws:s3:::bucket_Name/Output/**/**/*"
]
}
]
}
{
"Version": "2012-10-17",
"Id": "Policy***",
"Statement": [
{
"Sid": "Stmt1***",
"Effect": "Allow",
"Principal": {
"AWS": "arn:aws:iam::account_number:root"
},
"Action": "s3:*",
"Resource": "arn:aws:s3:::bucket_name"
}
]
}
S3存储桶策略:
import sys
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
glueContext = GlueContext(SparkContext.getOrCreate())
dfnew = glueContext.create_dynamic_frame_from_options("s3", {'paths': ["s3://bucket_name/Input.csv"] }, format="csv" )
dfMod = dfnew.select_fields(["Col2","Col3"]).toDF().distinct()
dnFrame = DynamicFrame.fromDF(dfMod, glueContext, "test_nest")
datasink = glueContext.write_dynamic_frame.from_options(frame = dnFrame, connection_type = "s3",connection_options = {"path": "s3://bucket_name/Target.csv"}, format = "csv", transformation_ctx ="datasink")
Col1 Col2 Col3
1 1 -30.4
2 2 -30.5
3 3 6.70
4 4 5.89
5 4 6.89
6 4 6.70
7 4 5.89
8 4 5.89
val dfmod = dfnew.select_fields(["Col2","Col3"]).toDF().distinct().show() ^ SyntaxError: invalid syntax During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/opt/amazon/bin/runscript.py", line 92, in <module>
while "runpy.py" in new_stack.tb_frame.f_code.co_filename: AttributeError: 'NoneType' object has no attribute 'tb_frame'
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"s3:GetObject",
"s3:PutObject"
],
"Resource": [
"arn:aws:s3:::bucket_Name/Output/**/**/*"
]
}
]
}
{
"Version": "2012-10-17",
"Id": "Policy***",
"Statement": [
{
"Sid": "Stmt1***",
"Effect": "Allow",
"Principal": {
"AWS": "arn:aws:iam::account_number:root"
},
"Action": "s3:*",
"Resource": "arn:aws:s3:::bucket_name"
}
]
}
请在线帮助语法错误
val dfMod = dfnew.select_fields(["Col2","Col3"]).toDF().distinct().show()
可以按如下方式更正,我们不需要val
或show()
它只会返回一个数据帧,在传递到write\u dynamic\u frame
之前,我们将其转换为DynamicFrame。DynamicFrame import DynamicFrame还需要在顶部从awsglue.DynamicFrame导入DynamicFrame
dfMod = dfnew.select_fields("Col2","Col3").toDF().distinct()
dnFrame = DynamicFrame.fromDF(dfMod, glueContext, "test_nest")
感谢我进行了上述编辑(将dnFrame传递给datasink参数),但现在我在datasink行遇到了以下问题:“org.apache.spark.sparkeException:作业因阶段失败而中止:阶段2.0中的任务43失败,阶段2.0中的任务43(执行者4):com.amazon.ws.emr.hadoop.fs.shade.com.amazonaws.services.s3.model.amazons3异常:(服务:amazon s3;状态代码:403;错误代码:AccessDenied”。但是我可以访问Input.csv,所以不知道为什么我在编写时会被拒绝访问。上面的代码更改看起来正确吗?或者我遗漏了什么吗?你能分享你的bucket策略和我是角色策略吗?我们必须明确访问putObjectSure,在上面添加了IAM和(AWS Glue service associated)角色策略,也是为了“阻止公共访问(存储桶设置)”所有选项都设置为“关闭”,现在仍然拒绝访问。请建议?