PySpark脚本替换所有值

PySpark脚本替换所有值,pyspark,Pyspark,我编写了一个小的PySpark脚本,它正在寻找一个名为resource\u tags\u user\u engagement的值 如果该值为空、null或包含单词,则应将其替换为默认值。但它不是简单地替换空白、null或word,而是替换所有值: import sys import pyspark.sql.functions as f from pyspark.context import SparkContext from awsglue.transforms import * from aw

我编写了一个小的PySpark脚本,它正在寻找一个名为resource\u tags\u user\u engagement的值

如果该值为空、null或包含单词,则应将其替换为默认值。但它不是简单地替换空白、null或word,而是替换所有值:

import sys
import pyspark.sql.functions as f
from pyspark.context import SparkContext
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame

# Set Glue Context
glueContext = GlueContext(SparkContext.getOrCreate())
spark = glueContext.spark_session
spark.sql("set spark.sql.parquet.enableVectorizedReader=false")

# Create Dynamic Data Frame from table in the glue database
cost_allocation = glueContext.create_dynamic_frame.from_catalog(database="company_cost_allocation", table_name="company_cost_allocation")

# Convert dynamic frame to dta frame
cost_allocation_df = cost_allocation.toDF()

# Set default engagements
cost_allocation_df = cost_allocation_df.withColumn('resource_tags_user_engagement',          
         f.when(
               (f.col('line_item_usage_account_id').isin('123456789101',  '123456789102', '123456789103',  '123456789104', '123456789105', '123456789106', '123456789107', '123456789108', '123456789109' )) &
               (f.col('resource_tags_user_engagement') == '' ) |
               (f.col('resource_tags_user_engagement').isNull()) |
               (f.col('resource_tags_user_engagement').rlike('^[a-zA-Z]')), '400000008378'
               )) \
               .withColumn('resource_tags_user_engagement',          
         f.when(
               ((f.col('line_item_usage_account_id') == f.lit('123456789110')) | 
               (f.col('line_item_usage_account_id') == f.lit('123456789111'))) & 
               (f.col('resource_tags_user_engagement') == f.lit('') ) |
               (f.col('resource_tags_user_engagement').isNull()) |
               (f.col('resource_tags_user_engagement').rlike('^[a-zA-Z]')), '807000000401'
               )) \
                .withColumn('resource_tags_user_engagement',          
         f.when(
               (f.col('line_item_usage_account_id').isin('123456789112',  '123456789113', '123456789114')) &
               (f.col('resource_tags_user_engagement') == '' ) |
               (f.col('resource_tags_user_engagement').isNull()) |
               (f.col('resource_tags_user_engagement').rlike('^[a-zA-Z]')), '807000000412'
               )) \
               .withColumn('resource_tags_user_engagement',          
         f.when(
               (f.col('line_item_usage_account_id').isin('123456789115',  '123456789116', '123456789117',  '123456789118', '123456789119', '123456789120', '123456789121', '123456789122', '123456789123')) &
               (f.col('resource_tags_user_engagement') == '' ) |
               (f.col('resource_tags_user_engagement').isNull()) |
               (f.col('resource_tags_user_engagement').rlike('^[a-zA-Z]')), '400000008692'
               )) \
                .withColumn('resource_tags_user_engagement',          
         f.when(
               (f.col('line_item_usage_account_id').isin('123456789124',  '123456789125', '123456789126')) &
               (f.col('resource_tags_user_engagement') == '' ) |
               (f.col('resource_tags_user_engagement').isNull()) |
               (f.col('resource_tags_user_engagement').rlike('^[a-zA-Z]')), '807000000412'
               )) \
                .withColumn('resource_tags_user_engagement',          
         f.when(
               (f.col('line_item_usage_account_id').isin('123456789127',  '123456789128', '123456789129', '123456789130', '123456789131')) &
               (f.col('resource_tags_user_engagement') == '' ) |
               (f.col('resource_tags_user_engagement').isNull()) |
               (f.col('resource_tags_user_engagement').rlike('^[a-zA-Z]')), '808000000298'
               )) \
                .withColumn('resource_tags_user_engagement',          
        f.when(
               (f.col('line_item_usage_account_id') == '123456789132') &
               (f.col('resource_tags_user_engagement') == '' ) |
               (f.col('resource_tags_user_engagement').isNull()) |
               (f.col('resource_tags_user_engagement').rlike('^[a-zA-Z]')), '803000006453'
               )) \
                .withColumn('resource_tags_user_engagement',          
         f.when(
               ((f.col('line_item_usage_account_id') == f.lit('123456789133')) | 
               (f.col('line_item_usage_account_id') == f.lit('123456789134'))) &
               (f.col('resource_tags_user_engagement') == f.lit('') ) |
               (f.col('resource_tags_user_engagement').isNull()) |
               (f.col('resource_tags_user_engagement').rlike('^[a-zA-Z]')), '400000008426'
               )) \
                .withColumn('resource_tags_user_engagement',
        f.when(
               ((f.col('line_item_usage_account_id') == f.lit('123456789135')) | 
               (f.col('line_item_usage_account_id') == f.lit('123456789136'))) &
               (f.col('resource_tags_user_engagement') == f.lit('') ) |
               (f.col('resource_tags_user_engagement').isNull()) |
               (f.col('resource_tags_user_engagement').rlike('^[a-zA-Z]')), '800000047650'
               ).otherwise(f.col('resource_tags_user_engagement')))


# Convert back to a DynamicFrame for further processing.
partitioned_dynamicframe = DynamicFrame.fromDF(cost_allocation_df, glueContext, "partitioned_df")

# Repartition the dynamic frame before writing to S3
cost_allocation_df = cost_allocation_df.repartition(5)

# Write to S3
output_dir = "s3://company-cur-reports/company-costs-transformed-legacy-billing"
datasink = glueContext.write_dynamic_frame.from_options(frame = partitioned_dynamicframe, connection_type = "s3", connection_options = {"path": output_dir}, format = "parquet", transformation_ctx = "datasink")

它为什么这样做?如何让脚本仅替换空值、空值或包含单词的值?

您缺少
。否则(f.col('resource\u tags\u user\u engagement'))
除了最后一个
with column
语句之外,其他所有语句都会出现。如果条件不匹配,
when
也将返回
null
,您可以将对
when
的多个调用链接在一起,而不是重复使用
with column
:。进行更改可能会修复您的代码。好的,谢谢!我会调查的。