Apache Spark 从嵌套的数据框(DataFrame)列中删除换行符

Apache spark 从嵌套的数据框列中删除新行字符,apache-spark,apache-spark-sql,Apache Spark,Apache Spark Sql,我有一个带有模式的数据框架 root |-- AppUsers: array (nullable = true) | |-- element: struct (containsNull = true) | | |-- Id: integer (nullable = true) | | |-- Email: string (nullable = true) | | |-- FirstName: string (nullable = true)

我有一个数据框(DataFrame),其模式(schema)如下:

root
 |-- AppUsers: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Id: integer (nullable = true)
 |    |    |-- Email: string (nullable = true)
 |    |    |-- FirstName: string (nullable = true)
 |    |    |-- LastName: string (nullable = true)
 |    |    |-- UserName: string (nullable = true)
 |-- BusinessLines: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Id: integer (nullable = true)
 |    |    |-- Name: string (nullable = true)
 |-- Campaigns: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Id: integer (nullable = true)
 |    |    |-- BusinessLineId: integer (nullable = true)
 |    |    |-- Name: string (nullable = true)
 |    |    |-- StartDate: date (nullable = true)
 |    |    |-- EndDate: date (nullable = true)
 |    |    |-- Imported: boolean (nullable = true)
 |    |    |-- IsClosed: string (nullable = true)
 |-- CampaignDomains: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- CampaignId: integer (nullable = true)
 |    |    |-- DomainId: integer (nullable = true)
 |-- CampaignDomainEntityComments: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- CampaignId: integer (nullable = true)
 |    |    |-- DomainId: integer (nullable = true)
 |    |    |-- EntityId: integer (nullable = true)
 |    |    |-- Comment: string (nullable = true)
 |-- CampaignEntities: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- CampaignId: integer (nullable = true)
 |    |    |-- EntityId: integer (nullable = true)
 |    |    |-- ClosedDate: date (nullable = true)
 |    |    |-- ClosedBy: string (nullable = true)
 |-- CampaignDomainEntities: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- DomainId: integer (nullable = true)
 |    |    |-- CampaignId: integer (nullable = true)
 |    |    |-- EntityId: integer (nullable = true)
 |    |    |-- Status: string (nullable = true)
 |    |    |-- ValidationDate: date (nullable = true)
 |    |    |-- ValidatedBy: string (nullable = true)
 |-- Domains: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Id: integer (nullable = true)
 |    |    |-- Code: string (nullable = true)
 |    |    |-- BusinessLineId: integer (nullable = true)
 |    |    |-- Name: string (nullable = true)
 |    |    |-- Order: integer (nullable = true)
 |    |    |-- Enabled: boolean (nullable = true)
 |-- Entities: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Id: integer (nullable = true)
 |    |    |-- Code: string (nullable = true)
 |    |    |-- BasesClient: string (nullable = true)
 |    |    |-- BusinessLineId: integer (nullable = true)
 |    |    |-- Name: string (nullable = true)
 |    |    |-- Pole: string (nullable = true)
 |    |    |-- PoleCode: string (nullable = true)
 |    |    |-- PoleLabel: string (nullable = true)
 |    |    |-- Transactions: string (nullable = true)
 |    |    |-- Enabled: boolean (nullable = true)
 |    |    |-- ELRId: string (nullable = true)
 |    |    |-- ELRDescription: string (nullable = true)
 |    |    |-- UOId: string (nullable = true)
 |    |    |-- UODescription: string (nullable = true)
 |-- Groups: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Id: integer (nullable = true)
 |    |    |-- Code: string (nullable = true)
 |    |    |-- BusinessLine: integer (nullable = true)
 |    |    |-- Name: string (nullable = true)
 |    |    |-- Enabled: boolean (nullable = true)
 |    |    |-- IsCampaign: boolean (nullable = true)
 |-- GroupEntities: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- EntityId: integer (nullable = true)
 |    |    |-- GroupId: integer (nullable = true)
 |-- Indicators: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Id: integer (nullable = true)
 |    |    |-- Code: string (nullable = true)
 |    |    |-- AccessLevel: string (nullable = true)
 |    |    |-- CanBeCopied: boolean (nullable = true)
 |    |    |-- Definition: string (nullable = true)
 |    |    |-- ModeReporting: string (nullable = true)
 |    |    |-- NameEN: string (nullable = true)
 |    |    |-- NameFR: string (nullable = true)
 |    |    |-- Order: integer (nullable = true)
 |    |    |-- Perimeter: string (nullable = true)
 |    |    |-- PeriodTypeEN: string (nullable = true)
 |    |    |-- PeriodTypeFR: string (nullable = true)
 |    |    |-- PeriodTypeId: integer (nullable = true)
 |    |    |-- SubDomainId: integer (nullable = true)
 |    |    |-- Type: string (nullable = true)
 |    |    |-- Enabled: boolean (nullable = true)
 |    |    |-- OversightIndicatorID: string (nullable = true)
 |-- IndicatorEntities: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- EntityId: integer (nullable = true)
 |    |    |-- IndicatorId: integer (nullable = true)
 |-- SubDomains: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Id: integer (nullable = true)
 |    |    |-- Code: string (nullable = true)
 |    |    |-- Comment: string (nullable = true)
 |    |    |-- Name: string (nullable = true)
 |    |    |-- Order: integer (nullable = true)
 |    |    |-- Enabled: boolean (nullable = true)
 |    |    |-- DomainId: integer (nullable = true)
 |-- SubIndicators: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Id: integer (nullable = true)
 |    |    |-- Code: string (nullable = true)
 |    |    |-- IndicatorId: integer (nullable = true)
 |    |    |-- NameEN: string (nullable = true)
 |    |    |-- NameFR: string (nullable = true)
 |    |    |-- Order: integer (nullable = true)
 |    |    |-- Type: string (nullable = true)
 |    |    |-- Unit: string (nullable = true)
 |    |    |-- ValueListNameId: integer (nullable = true)
 |    |    |-- IsMandatory: boolean (nullable = true)
 |    |    |-- IsGDPR: boolean (nullable = true)
 |    |    |-- OversightSubIndicatorID: string (nullable = true)
 |-- ValueLists: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Id: integer (nullable = true)
 |    |    |-- NameEN: string (nullable = true)
 |    |    |-- NameFR: string (nullable = true)
 |    |    |-- Value: integer (nullable = true)
 |    |    |-- ValueListNameId: integer (nullable = true)
 |-- ValueListNames: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Id: integer (nullable = true)
 |    |    |-- NameEN: string (nullable = true)
 |    |    |-- NameFR: string (nullable = true)
 |-- Comments: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Id: integer (nullable = true)
 |    |    |-- Code: string (nullable = true)
 |    |    |-- Definition: string (nullable = true)
 |-- CommentValues: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- CampaignId: integer (nullable = true)
 |    |    |-- CommentId: integer (nullable = true)
 |    |    |-- Value: string (nullable = true)
数据框的打印结果:

+--------------------------------------+-------------+---------+---------------+----------------------------+-----------------------+--------------------------+--------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------+-------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+----------------------------------+---------------------------------------------------------------+-----------------------------------------------------------+-------------------------------------+-----------------+----------------+
|AppUsers                              |BusinessLines|Campaigns|CampaignDomains|CampaignDomainEntityComments|CampaignEntities       |CampaignDomainEntities    |Domains                   |Entities                                                                                                                                                                                  |Groups                             |GroupEntities|Indicators                                                                                                                                                                                                                                                                                                                                                                   |IndicatorEntities|SubDomains                        |SubIndicators                                                  |ValueLists                                                 |ValueListNames                       |Comments         |CommentValues   |
+--------------------------------------+-------------+---------+---------------+----------------------------+-----------------------+--------------------------+--------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------+-------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+----------------------------------+---------------------------------------------------------------+-----------------------------------------------------------+-------------------------------------+-----------------+----------------+
|[[1,null,JEROEN,SOMERS,JEROEN.SOMERS]]|[[1,PRIV]]   |null     |[[1,2]]        |[[122,1,9,add comments ]]   |[[1,1,2018-08-24,null]]|[[1,11,1,Draft,null,null]]|[[1,1,1,INCIDENTS,1,true]]|[[1,0071300000,Outil central (FORCE),1,SGPB MONACO GESTION PRIVEE,PRIV,000423,PRIV Monaco,Outil central (FORCE),true,0071300000,SOCIETE GENERALE PRIVATE BANKING (MONACO),20664,PRIV/MON]]|[[1,1,null,SGPB GROUPE,true,false]]|[[1,1]]      |[[18174,D3E_I1,EndUser,false,Rappel : les instructions transposées doivent être validées par la Conformité IBFS avant d'être soumises à la validation du Management de votre entité.,Flow,IBFS 000449 - IBFS Compliance Manual - published on 01/29/2015,IBFS 000449 - Manuel de conformité IBFS - publié le 29/01/2015,1,Global,Monthly,Mensuel,1,440,Complex,true,FCC.1.1]]|[[1,1]]          |[[1,18,null,Key Points,1,true,18]]|[[1,18.1,1,Entity,Entity,111,Text,,null,false,false,FCC.1.1.1]]|[[1,Discretionary management,Discretionary management,1,1]]|[[1,Compliance Item,Compliance Item]]|[[4,Priv-1,null]]|[[13,4,112323 ]]|
+--------------------------------------+-------------+---------+---------------+----------------------------+-----------------------+--------------------------+--------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------+-------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+----------------------------------+---------------------------------------------------------------+-----------------------------------------------------------+-------------------------------------+-----------------+----------------+


Indicators 中的 Definition 字段带有换行符和一些不需要的字符,如“,”

我想从 Definition 子列中删除那些不需要的字符,并保持结构不变。我已经在平面结构上完成了这项工作,但嵌套结构让我感到困惑。

为了保持简单,我删除了大部分字段,只保留了要应用转换的字段

样本输入:

{
  "AppUsers": [
    {
      "Id": 1,
      "UserName": "abc.bcd",
    }
  ],
  "Indicators": [
    {
      "Definition": "Rappel ;;;;; , \n",
    }
  ]
}
{
  "AppUsers": [
    {
      "Id": 1,
      "UserName": "abc.bcd",
    }
  ],
  "Indicators": [
    {

      "Definition": "Rappel",
    }
  ]
}
预期输出:

{
  "AppUsers": [
    {
      "Id": 1,
      "UserName": "abc.bcd",
    }
  ],
  "Indicators": [
    {
      "Definition": "Rappel ;;;;; , \n",
    }
  ]
}
{
  "AppUsers": [
    {
      "Id": 1,
      "UserName": "abc.bcd",
    }
  ],
  "Indicators": [
    {

      "Definition": "Rappel",
    }
  ]
}
必须从 Indicators.Definition 列中删除不需要的字符
请帮助

也许您可以尝试访问您的列,并使用 regexp_replace 删除不需要的字符。以下是一个示例:

df=df.withColumn('Definition',regexp_replace(col('Indicators').getItem(4)),“/[%&\\\:”,?#\s]/g”,”)

请使用示例输入数据和预期输出更新问题。@RangaVure 我已更新了示例输入和输出。我认为您必须编写一个 UDF 来处理目标列以删除不需要的字符。您可以打印示例数据框吗?我认为没有一种简单的方法来拆开/重新包装 JSON。你需要:1. 使用 explode 展开 Indicators 字段,并用 Definition 的值创建新列;2. 对新列应用
|\\s |,| \\\\n
过滤器;3. 将过滤后的列合并回 Indicators 字段;4. 重新创建 JSON。这将带来巨大的开销。我认为最有效的方法是直接对原始文本应用过滤器。我收到以下错误消息:“由于数据类型不匹配:参数 1 需要字符串类型,但是 '`Indicators`.`Definition`' 是数组类型”