Warning: file_get_contents(/data/phpspider/zhask/data//catemap/2/cmake/2.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Apache spark 如何将多行标记xml文件转换为dataframe_Apache Spark_Pyspark_Apache Spark Xml - Fatal编程技术网

Apache spark 如何将多行标记xml文件转换为dataframe

Apache spark 如何将多行标记xml文件转换为dataframe,apache-spark,pyspark,apache-spark-xml,Apache Spark,Pyspark,Apache Spark Xml,我有一个包含多行的xml文件。我需要将此xml转换为适当的数据帧。我使用了spark xml,它只处理单行标记 xml数据如下所示 <?xml version='1.0' encoding='UTF-8' ?> <generic xmlns="http://xactware.com/generic.xsd" majorVersion="28" minorVersion="300" transactionId="0000"> <HEADER compN

我有一个包含多行的xml文件。我需要将此xml转换为适当的数据帧。我使用了spark xml,它只处理单行标记

xml数据如下所示

<?xml version='1.0' encoding='UTF-8' ?>
<generic
    xmlns="http://xactware.com/generic.xsd" majorVersion="28" minorVersion="300" transactionId="0000">
    <HEADER compName="ABGROUP" dateCreated="2018-03-09T09:38:51"/>
    <COVERSHEET>
        <ESTIMATE_INFO estimateName="2016-09-28-133907" priceList="YHTRDF" laborEff="Restoration/Service/Remodel" claimNumber="Hdchtdhtdh" policyNumber="Utfhtdhtd" typeOfLoss="Collapse" causeOfLoss="Collapse" roofDamage="0" deprMat="1" deprNonMat="1" deprRemoval="1" deprOandP="1" deprTaxes="1" estimateType="Mixed"/>
        <ADDRESSES>
            <ADDRESS type="Property" street="Pkwy" city="Lehi" state="UT" zip="0000" primary="1"/>
        </ADDRESSES>
        <CONTACTS>
            <CONTACT type="ClaimRep" name="Vytvyfv"/>
            <CONTACT type="Estimator" name="Vytvyfv"/>
        </CONTACTS>
        <DATES loss="2016-09-28T19:38:23Z" inspected="2016-09-28T19:39:27Z" completed="2018-03-09T09:38:49Z" received="2016-09-28T19:39:24Z" entered="2016-09-28T19:39:07Z" contacted="2016-09-28T19:39:26Z"/>
    </COVERSHEET>
    <COVERAGES>
        <COVERAGE coverageName="Dwelling" coverageType="0" id="1"/>
        <COVERAGE coverageName="Other Structures" coverageType="1" id="2"/>
        <COVERAGE coverageName="Contents" coverageType="2" id="3"/>
    </COVERAGES>
    <LINE_ITEM_DETAIL>
        <COV_BREAKDOWN>
            <COV_AMOUNTS desc="Dwelling"/>
            <COV_AMOUNTS desc="Other Structures"/>
            <COV_AMOUNTS desc="Contents"/>
        </COV_BREAKDOWN>
    </LINE_ITEM_DETAIL>
    <RECAP_BY_ROOM>
        <RECAP_GROUP desc="2016-09-28-133907"/>
    </RECAP_BY_ROOM>
</generic>

我建议您将整个文件作为一个行标记(rowTag,即 generic 元素)读取,然后根据您的需要进行分解(explode)

首先,元素的属性不应该包含行分隔符

<generic
    xmlns="http://xactware.com/generic.xsd" majorVersion="28" minorVersion="300" transactionId="0000">
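应该写在同一行上,即

<generic xmlns="http://xactware.com/generic.xsd" majorVersion="28" minorVersion="300" transactionId="0000">

然后将 rowTag 选项设为 generic、valueTag 选项设为 false 来读取文件,例如

df = spark.read.format('com.databricks.spark.xml').option('rowTag', 'generic').option('valueTag', False).load('path/to/file.xml')

这应该会给你如下结果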

+-------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------+---------------------------------------------------+----------------------+-------------+-------------+--------------+-------------------------------+
|COVERAGES                                                          |COVERSHEET                                                                                                                                                                                                                                                                                                                                                  |HEADER                         |LINE_ITEM_DETAIL                                   |RECAP_BY_ROOM         |_majorVersion|_minorVersion|_transactionId|_xmlns                         |
+-------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------+---------------------------------------------------+----------------------+-------------+-------------+--------------+-------------------------------+
|[[[Dwelling, 0, 1,], [Other Structures, 1, 2,], [Contents, 2, 3,]]]|[[[Lehi, 1, UT, Pkwy, Property, 0,]], [[[Vytvyfv, ClaimRep,], [Vytvyfv, Estimator,]]], [2018-03-09T09:38:49Z, 2016-09-28T19:39:26Z, 2016-09-28T19:39:07Z, 2016-09-28T19:39:27Z, 2016-09-28T19:38:23Z, 2016-09-28T19:39:24Z,], [Collapse, Hdchtdhtdh, 1, 1, 1, 1, 1, 2016-09-28-133907, Mixed, Restoration/Service/Remodel, Utfhtdhtd, YHTRDF, 0, Collapse,]]|[ABGROUP, 2018-03-09T09:38:51,]|[[[[Dwelling,], [Other Structures,], [Contents,]]]]|[[2016-09-28-133907,]]|28           |300          |0             |http://xactware.com/generic.xsd|
+-------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------+---------------------------------------------------+----------------------+-------------+-------------+--------------+-------------------------------+

root
 |-- COVERAGES: struct (nullable = true)
 |    |-- COVERAGE: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- _coverageName: string (nullable = true)
 |    |    |    |-- _coverageType: long (nullable = true)
 |    |    |    |-- _id: long (nullable = true)
 |    |    |    |-- false: string (nullable = true)
 |-- COVERSHEET: struct (nullable = true)
 |    |-- ADDRESSES: struct (nullable = true)
 |    |    |-- ADDRESS: struct (nullable = true)
 |    |    |    |-- _city: string (nullable = true)
 |    |    |    |-- _primary: long (nullable = true)
 |    |    |    |-- _state: string (nullable = true)
 |    |    |    |-- _street: string (nullable = true)
 |    |    |    |-- _type: string (nullable = true)
 |    |    |    |-- _zip: long (nullable = true)
 |    |    |    |-- false: string (nullable = true)
 |    |-- CONTACTS: struct (nullable = true)
 |    |    |-- CONTACT: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- _name: string (nullable = true)
 |    |    |    |    |-- _type: string (nullable = true)
 |    |    |    |    |-- false: string (nullable = true)
 |    |-- DATES: struct (nullable = true)
 |    |    |-- _completed: string (nullable = true)
 |    |    |-- _contacted: string (nullable = true)
 |    |    |-- _entered: string (nullable = true)
 |    |    |-- _inspected: string (nullable = true)
 |    |    |-- _loss: string (nullable = true)
 |    |    |-- _received: string (nullable = true)
 |    |    |-- false: string (nullable = true)
 |    |-- ESTIMATE_INFO: struct (nullable = true)
 |    |    |-- _causeOfLoss: string (nullable = true)
 |    |    |-- _claimNumber: string (nullable = true)
 |    |    |-- _deprMat: long (nullable = true)
 |    |    |-- _deprNonMat: long (nullable = true)
 |    |    |-- _deprOandP: long (nullable = true)
 |    |    |-- _deprRemoval: long (nullable = true)
 |    |    |-- _deprTaxes: long (nullable = true)
 |    |    |-- _estimateName: string (nullable = true)
 |    |    |-- _estimateType: string (nullable = true)
 |    |    |-- _laborEff: string (nullable = true)
 |    |    |-- _policyNumber: string (nullable = true)
 |    |    |-- _priceList: string (nullable = true)
 |    |    |-- _roofDamage: long (nullable = true)
 |    |    |-- _typeOfLoss: string (nullable = true)
 |    |    |-- false: string (nullable = true)
 |-- HEADER: struct (nullable = true)
 |    |-- _compName: string (nullable = true)
 |    |-- _dateCreated: string (nullable = true)
 |    |-- false: string (nullable = true)
 |-- LINE_ITEM_DETAIL: struct (nullable = true)
 |    |-- COV_BREAKDOWN: struct (nullable = true)
 |    |    |-- COV_AMOUNTS: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- _desc: string (nullable = true)
 |    |    |    |    |-- false: string (nullable = true)
 |-- RECAP_BY_ROOM: struct (nullable = true)
 |    |-- RECAP_GROUP: struct (nullable = true)
 |    |    |-- _desc: string (nullable = true)
 |    |    |-- false: string (nullable = true)
 |-- _majorVersion: long (nullable = true)
 |-- _minorVersion: long (nullable = true)
 |-- _transactionId: long (nullable = true)
 |-- _xmlns: string (nullable = true)
检查上面的数据帧,您可以通过执行以下操作来简化它

from pyspark.sql import functions as f
df.select(f.col('COVERAGES.COVERAGE'), f.col('COVERSHEET.ADDRESSES.ADDRESS.*'), f.col('COVERSHEET.CONTACTS.CONTACT'), f.col('COVERSHEET.DATES.*'), f.col('COVERSHEET.ESTIMATE_INFO.*'), f.col('HEADER.*'), f.col('LINE_ITEM_DETAIL.COV_BREAKDOWN.COV_AMOUNTS'), f.col('RECAP_BY_ROOM.RECAP_GROUP.*'), f.col('_majorVersion'), f.col('_minorVersion'), f.col('_transactionId'), f.col('_xmlns')).show(truncate=False)
这将为您提供如下模式的dataframe

现在,您可以根据
COVERAGE
CONTACT
COV_AMOUNTS
列将其转换为多行,因为它们是唯一可以分解为多行的列


我希望答案是有帮助的

您希望您的数据帧是什么样的?你能提供一个示例吗?我不确定它应该是什么样的..我试着用spark xml来解析它。但它只处理单行标记…我希望它位于正确的数据帧中,而不会丢失任何数据。您是否浏览了我在上一个答案中提供的链接?是的,我在其中找到了解析选项,但没有找到处理多行标记的方法。它不应该位于一行中吗?Ramesh,我按照你的回答尝试了,但无法读取。它返回给我一个空的数据帧。我已经在xml文件中手动删除了行分隔符。您是否使用了任何流程来删除?我也是手动删除的。试着一部分一部分地追踪这个bug。我的意思是您可以先尝试只读取第一部分。错误解决了。还有两件事,valueTag是做什么的?为什么我们使用“df.select”后跟列?到底发生了什么?我只是在复制粘贴它
valueTag:当元素带有属性但没有子元素时,用于存放该元素值的标记。默认值是 _VALUE。
由于它创建的都是空值,所以我将其设为 false。select 列的操作只是将结构(struct)列展平。数组列不容易展平,所以我保留了它们,除此之外所有的结构列都被展平了
+-------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------+---------------------------------------------------+----------------------+-------------+-------------+--------------+-------------------------------+
|COVERAGES                                                          |COVERSHEET                                                                                                                                                                                                                                                                                                                                                  |HEADER                         |LINE_ITEM_DETAIL                                   |RECAP_BY_ROOM         |_majorVersion|_minorVersion|_transactionId|_xmlns                         |
+-------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------+---------------------------------------------------+----------------------+-------------+-------------+--------------+-------------------------------+
|[[[Dwelling, 0, 1,], [Other Structures, 1, 2,], [Contents, 2, 3,]]]|[[[Lehi, 1, UT, Pkwy, Property, 0,]], [[[Vytvyfv, ClaimRep,], [Vytvyfv, Estimator,]]], [2018-03-09T09:38:49Z, 2016-09-28T19:39:26Z, 2016-09-28T19:39:07Z, 2016-09-28T19:39:27Z, 2016-09-28T19:38:23Z, 2016-09-28T19:39:24Z,], [Collapse, Hdchtdhtdh, 1, 1, 1, 1, 1, 2016-09-28-133907, Mixed, Restoration/Service/Remodel, Utfhtdhtd, YHTRDF, 0, Collapse,]]|[ABGROUP, 2018-03-09T09:38:51,]|[[[[Dwelling,], [Other Structures,], [Contents,]]]]|[[2016-09-28-133907,]]|28           |300          |0             |http://xactware.com/generic.xsd|
+-------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------+---------------------------------------------------+----------------------+-------------+-------------+--------------+-------------------------------+

root
 |-- COVERAGES: struct (nullable = true)
 |    |-- COVERAGE: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- _coverageName: string (nullable = true)
 |    |    |    |-- _coverageType: long (nullable = true)
 |    |    |    |-- _id: long (nullable = true)
 |    |    |    |-- false: string (nullable = true)
 |-- COVERSHEET: struct (nullable = true)
 |    |-- ADDRESSES: struct (nullable = true)
 |    |    |-- ADDRESS: struct (nullable = true)
 |    |    |    |-- _city: string (nullable = true)
 |    |    |    |-- _primary: long (nullable = true)
 |    |    |    |-- _state: string (nullable = true)
 |    |    |    |-- _street: string (nullable = true)
 |    |    |    |-- _type: string (nullable = true)
 |    |    |    |-- _zip: long (nullable = true)
 |    |    |    |-- false: string (nullable = true)
 |    |-- CONTACTS: struct (nullable = true)
 |    |    |-- CONTACT: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- _name: string (nullable = true)
 |    |    |    |    |-- _type: string (nullable = true)
 |    |    |    |    |-- false: string (nullable = true)
 |    |-- DATES: struct (nullable = true)
 |    |    |-- _completed: string (nullable = true)
 |    |    |-- _contacted: string (nullable = true)
 |    |    |-- _entered: string (nullable = true)
 |    |    |-- _inspected: string (nullable = true)
 |    |    |-- _loss: string (nullable = true)
 |    |    |-- _received: string (nullable = true)
 |    |    |-- false: string (nullable = true)
 |    |-- ESTIMATE_INFO: struct (nullable = true)
 |    |    |-- _causeOfLoss: string (nullable = true)
 |    |    |-- _claimNumber: string (nullable = true)
 |    |    |-- _deprMat: long (nullable = true)
 |    |    |-- _deprNonMat: long (nullable = true)
 |    |    |-- _deprOandP: long (nullable = true)
 |    |    |-- _deprRemoval: long (nullable = true)
 |    |    |-- _deprTaxes: long (nullable = true)
 |    |    |-- _estimateName: string (nullable = true)
 |    |    |-- _estimateType: string (nullable = true)
 |    |    |-- _laborEff: string (nullable = true)
 |    |    |-- _policyNumber: string (nullable = true)
 |    |    |-- _priceList: string (nullable = true)
 |    |    |-- _roofDamage: long (nullable = true)
 |    |    |-- _typeOfLoss: string (nullable = true)
 |    |    |-- false: string (nullable = true)
 |-- HEADER: struct (nullable = true)
 |    |-- _compName: string (nullable = true)
 |    |-- _dateCreated: string (nullable = true)
 |    |-- false: string (nullable = true)
 |-- LINE_ITEM_DETAIL: struct (nullable = true)
 |    |-- COV_BREAKDOWN: struct (nullable = true)
 |    |    |-- COV_AMOUNTS: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- _desc: string (nullable = true)
 |    |    |    |    |-- false: string (nullable = true)
 |-- RECAP_BY_ROOM: struct (nullable = true)
 |    |-- RECAP_GROUP: struct (nullable = true)
 |    |    |-- _desc: string (nullable = true)
 |    |    |-- false: string (nullable = true)
 |-- _majorVersion: long (nullable = true)
 |-- _minorVersion: long (nullable = true)
 |-- _transactionId: long (nullable = true)
 |-- _xmlns: string (nullable = true)
from pyspark.sql import functions as f
df.select(f.col('COVERAGES.COVERAGE'), f.col('COVERSHEET.ADDRESSES.ADDRESS.*'), f.col('COVERSHEET.CONTACTS.CONTACT'), f.col('COVERSHEET.DATES.*'), f.col('COVERSHEET.ESTIMATE_INFO.*'), f.col('HEADER.*'), f.col('LINE_ITEM_DETAIL.COV_BREAKDOWN.COV_AMOUNTS'), f.col('RECAP_BY_ROOM.RECAP_GROUP.*'), f.col('_majorVersion'), f.col('_minorVersion'), f.col('_transactionId'), f.col('_xmlns')).show(truncate=False)
+-----------------------------------------------------------------+-----+--------+------+-------+--------+----+-----+---------------------------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+------------+------------+--------+-----------+----------+------------+----------+-----------------+-------------+---------------------------+-------------+----------+-----------+-----------+-----+---------+-------------------+-----+-----------------------------------------------+-----------------+-----+-------------+-------------+--------------+-------------------------------+
|COVERAGE                                                         |_city|_primary|_state|_street|_type   |_zip|false|CONTACT                                      |_completed          |_contacted          |_entered            |_inspected          |_loss               |_received           |false|_causeOfLoss|_claimNumber|_deprMat|_deprNonMat|_deprOandP|_deprRemoval|_deprTaxes|_estimateName    |_estimateType|_laborEff                  |_policyNumber|_priceList|_roofDamage|_typeOfLoss|false|_compName|_dateCreated       |false|COV_AMOUNTS                                    |_desc            |false|_majorVersion|_minorVersion|_transactionId|_xmlns                         |
+-----------------------------------------------------------------+-----+--------+------+-------+--------+----+-----+---------------------------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+------------+------------+--------+-----------+----------+------------+----------+-----------------+-------------+---------------------------+-------------+----------+-----------+-----------+-----+---------+-------------------+-----+-----------------------------------------------+-----------------+-----+-------------+-------------+--------------+-------------------------------+
|[[Dwelling, 0, 1,], [Other Structures, 1, 2,], [Contents, 2, 3,]]|Lehi |1       |UT    |Pkwy   |Property|0   |null |[[Vytvyfv, ClaimRep,], [Vytvyfv, Estimator,]]|2018-03-09T09:38:49Z|2016-09-28T19:39:26Z|2016-09-28T19:39:07Z|2016-09-28T19:39:27Z|2016-09-28T19:38:23Z|2016-09-28T19:39:24Z|null |Collapse    |Hdchtdhtdh  |1       |1          |1         |1           |1         |2016-09-28-133907|Mixed        |Restoration/Service/Remodel|Utfhtdhtd    |YHTRDF    |0          |Collapse   |null |ABGROUP  |2018-03-09T09:38:51|null |[[Dwelling,], [Other Structures,], [Contents,]]|2016-09-28-133907|null |28           |300          |0             |http://xactware.com/generic.xsd|
+-----------------------------------------------------------------+-----+--------+------+-------+--------+----+-----+---------------------------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+------------+------------+--------+-----------+----------+------------+----------+-----------------+-------------+---------------------------+-------------+----------+-----------+-----------+-----+---------+-------------------+-----+-----------------------------------------------+-----------------+-----+-------------+-------------+--------------+-------------------------------+

root
 |-- COVERAGE: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _coverageName: string (nullable = true)
 |    |    |-- _coverageType: long (nullable = true)
 |    |    |-- _id: long (nullable = true)
 |    |    |-- false: string (nullable = true)
 |-- _city: string (nullable = true)
 |-- _primary: long (nullable = true)
 |-- _state: string (nullable = true)
 |-- _street: string (nullable = true)
 |-- _type: string (nullable = true)
 |-- _zip: long (nullable = true)
 |-- false: string (nullable = true)
 |-- CONTACT: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _name: string (nullable = true)
 |    |    |-- _type: string (nullable = true)
 |    |    |-- false: string (nullable = true)
 |-- _completed: string (nullable = true)
 |-- _contacted: string (nullable = true)
 |-- _entered: string (nullable = true)
 |-- _inspected: string (nullable = true)
 |-- _loss: string (nullable = true)
 |-- _received: string (nullable = true)
 |-- false: string (nullable = true)
 |-- _causeOfLoss: string (nullable = true)
 |-- _claimNumber: string (nullable = true)
 |-- _deprMat: long (nullable = true)
 |-- _deprNonMat: long (nullable = true)
 |-- _deprOandP: long (nullable = true)
 |-- _deprRemoval: long (nullable = true)
 |-- _deprTaxes: long (nullable = true)
 |-- _estimateName: string (nullable = true)
 |-- _estimateType: string (nullable = true)
 |-- _laborEff: string (nullable = true)
 |-- _policyNumber: string (nullable = true)
 |-- _priceList: string (nullable = true)
 |-- _roofDamage: long (nullable = true)
 |-- _typeOfLoss: string (nullable = true)
 |-- false: string (nullable = true)
 |-- _compName: string (nullable = true)
 |-- _dateCreated: string (nullable = true)
 |-- false: string (nullable = true)
 |-- COV_AMOUNTS: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _desc: string (nullable = true)
 |    |    |-- false: string (nullable = true)
 |-- _desc: string (nullable = true)
 |-- false: string (nullable = true)
 |-- _majorVersion: long (nullable = true)
 |-- _minorVersion: long (nullable = true)
 |-- _transactionId: long (nullable = true)
 |-- _xmlns: string (nullable = true)