Apache Spark 2.2 join fails with a huge dataset


I am currently having trouble joining (inner join) a large dataset (654 GB) with a smaller one (535 MB) using the Spark DataFrame API.

I am using the broadcast() function to broadcast the smaller dataset to the worker nodes.
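
For reference, a minimal sketch of what such a broadcast inner join looks like with the DataFrame API (bigDF, smallDF and "KEY" are illustrative names, not the actual code, which is shown further down):

    import org.apache.spark.sql.functions.broadcast

    // Hypothetical DataFrames: bigDF is the 654 GB dataset, smallDF the 535 MB one,
    // and "KEY" the join column; only the small side is marked for broadcast.
    val joined = bigDF.join(broadcast(smallDF), Seq("KEY"), "inner")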

I am unable to join the two datasets. Here is a sample of the errors I get:

19/04/26 19:39:07 INFO executor.CoarseGrainedExecutorBackend: Got assigned task 1315
19/04/26 19:39:07 INFO executor.Executor: Running task 25.1 in stage 13.0 (TID 1315)
19/04/26 19:39:07 INFO output.FileOutputCommitter: File Output Committer Algorithm version is 1
19/04/26 19:39:07 INFO datasources.SQLHadoopMapReduceCommitProtocol: Using output committer class org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
19/04/26 19:39:07 INFO datasources.FileScanRDD: Reading File path: SOMEFILEPATH, range: 3087007744-3221225472, partition values: [empty row]
19/04/26 19:39:17 INFO datasources.FileScanRDD: Reading File path: SOMEFILEPATH, range: 15971909632-16106127360, partition values: [empty row]
19/04/26 19:39:24 WARN hdfs.DFSClient: DFSOutputStream ResponseProcessor exception  for block isi_hdfs_pool:blk_4549851005_134218728
java.io.IOException: Connection reset by peer
    at sun.nio.ch.FileDispatcherImpl.read0(Native Method)
    at sun.nio.ch.SocketDispatcher.read(SocketDispatcher.java:39)
    at sun.nio.ch.IOUtil.readIntoNativeBuffer(IOUtil.java:223)
    at sun.nio.ch.IOUtil.read(IOUtil.java:197)
    at sun.nio.ch.SocketChannelImpl.read(SocketChannelImpl.java:380)
    at org.apache.hadoop.net.SocketInputStream$Reader.performIO(SocketInputStream.java:57)
    at org.apache.hadoop.net.SocketIOWithTimeout.doIO(SocketIOWithTimeout.java:142)
    at org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:161)
    at org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:131)
    at org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:118)
    at java.io.FilterInputStream.read(FilterInputStream.java:83)
    at java.io.FilterInputStream.read(FilterInputStream.java:83)
    at org.apache.hadoop.hdfs.protocolPB.PBHelper.vintPrefixed(PBHelper.java:2280)
    at org.apache.hadoop.hdfs.protocol.datatransfer.PipelineAck.readFields(PipelineAck.java:244)
    at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer$ResponseProcessor.run(DFSOutputStream.java:733)
19/04/26 19:39:27 ERROR util.Utils: Aborting task
com.univocity.parsers.common.TextWritingException: Error writing row.
Internal state when error was thrown: recordCount=458089, recordData=["SOMEDATA"]
    at com.univocity.parsers.common.AbstractWriter.throwExceptionAndClose(AbstractWriter.java:916)
    at com.univocity.parsers.common.AbstractWriter.writeRow(AbstractWriter.java:706)
    at org.apache.spark.sql.execution.datasources.csv.UnivocityGenerator.write(UnivocityGenerator.scala:82)
    at org.apache.spark.sql.execution.datasources.csv.CsvOutputWriter.write(CSVFileFormat.scala:139)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$SingleDirectoryWriteTask.execute(FileFormatWriter.scala:327)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:258)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:256)
    at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1375)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:261)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1$$anonfun$apply$mcV$sp$1.apply(FileFormatWriter.scala:191)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1$$anonfun$apply$mcV$sp$1.apply(FileFormatWriter.scala:190)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    at org.apache.spark.scheduler.Task.run(Task.scala:108)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.IllegalStateException: Error closing the output.
    at com.univocity.parsers.common.AbstractWriter.close(AbstractWriter.java:861)
    at com.univocity.parsers.common.AbstractWriter.throwExceptionAndClose(AbstractWriter.java:903)
    at com.univocity.parsers.common.AbstractWriter.writeRow(AbstractWriter.java:811)
    at com.univocity.parsers.common.AbstractWriter.writeRow(AbstractWriter.java:704)
    ... 15 more
Caused by: java.io.IOException: All datanodes DatanodeInfoWithStorage[10.241.209.34:585,null,DISK] are bad. Aborting...
    at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.setupPipelineForAppendOrRecovery(DFSOutputStream.java:1109)
    at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.processDatanodeError(DFSOutputStream.java:871)
    at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.run(DFSOutputStream.java:401)
19/04/26 19:39:27 WARN util.Utils: Suppressing exception in catch: Failed on local exception: java.io.IOException: Connection reset by peer; Host Details : local host is: "SOMENODEHOST"; destination host is: "SOMEDESTINATIONHOST":SOMEPORT; 
java.io.IOException: Failed on local exception: java.io.IOException: Connection reset by peer; Host Details : local host is: "SOMENODEHOST"; destination host is: "SOMEDESTINATIONHOST":SOMEPORT; 
    at org.apache.hadoop.net.NetUtils.wrapException(NetUtils.java:776)
    at org.apache.hadoop.ipc.Client.call(Client.java:1479)
    at org.apache.hadoop.ipc.Client.call(Client.java:1412)
    at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:229)
    at com.sun.proxy.$Proxy17.delete(Unknown Source)
    at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.delete(ClientNamenodeProtocolTranslatorPB.java:540)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:191)
    at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:102)
    at com.sun.proxy.$Proxy18.delete(Unknown Source)
    at org.apache.hadoop.hdfs.DFSClient.delete(DFSClient.java:2044)
    at org.apache.hadoop.hdfs.DistributedFileSystem$14.doCall(DistributedFileSystem.java:707)
    at org.apache.hadoop.hdfs.DistributedFileSystem$14.doCall(DistributedFileSystem.java:703)
    at org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)
    at org.apache.hadoop.hdfs.DistributedFileSystem.delete(DistributedFileSystem.java:714)
    at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.abortTask(FileOutputCommitter.java:568)
    at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.abortTask(FileOutputCommitter.java:557)
    at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.abortTask(HadoopMapReduceCommitProtocol.scala:159)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$1.apply$mcV$sp(FileFormatWriter.scala:266)
    at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1384)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:261)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1$$anonfun$apply$mcV$sp$1.apply(FileFormatWriter.scala:191)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1$$anonfun$apply$mcV$sp$1.apply(FileFormatWriter.scala:190)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    at org.apache.spark.scheduler.Task.run(Task.scala:108)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.IOException: Connection reset by peer
    at sun.nio.ch.FileDispatcherImpl.read0(Native Method)
    at sun.nio.ch.SocketDispatcher.read(SocketDispatcher.java:39)
    at sun.nio.ch.IOUtil.readIntoNativeBuffer(IOUtil.java:223)
    at sun.nio.ch.IOUtil.read(IOUtil.java:197)
    at sun.nio.ch.SocketChannelImpl.read(SocketChannelImpl.java:380)
    at org.apache.hadoop.net.SocketInputStream$Reader.performIO(SocketInputStream.java:57)
    at org.apache.hadoop.net.SocketIOWithTimeout.doIO(SocketIOWithTimeout.java:142)
    at org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:161)
    at org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:131)
    at java.io.FilterInputStream.read(FilterInputStream.java:133)
    at java.io.FilterInputStream.read(FilterInputStream.java:133)
    at org.apache.hadoop.ipc.Client$Connection$PingInputStream.read(Client.java:520)
    at java.io.BufferedInputStream.fill(BufferedInputStream.java:246)
    at java.io.BufferedInputStream.read(BufferedInputStream.java:265)
    at java.io.DataInputStream.readInt(DataInputStream.java:387)
    at org.apache.hadoop.ipc.Client$Connection.receiveRpcResponse(Client.java:1084)
    at org.apache.hadoop.ipc.Client$Connection.run(Client.java:979)
Before joining the large and small datasets, I tried joining 10,000 records of the first dataset with the entire small dataset (535 MB). I got a "Futures timed out [300 seconds]" error.

I then increased the spark.sql.broadcastTimeout variable to 3600 seconds, and it worked fine. However, when I tried the join with the entire dataset (654 GB), it produced the error you can see above (TextWritingException).
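
For reference, a minimal sketch of how that timeout can be raised from code (the same setting can also be passed to spark-submit via --conf spark.sql.broadcastTimeout=3600):

    // Raise the broadcast timeout (in seconds) from the default 300 to 3600
    // before triggering the join.
    spark.conf.set("spark.sql.broadcastTimeout", "3600")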

My questions are:

  • How can I monitor my Spark jobs more effectively? How should I proceed?

  • What do you think is causing this error, and how can I fix it?

Below you will find some information about the cluster and about the execution and configuration of the Spark job.

Some information / context: I am working in a production environment (see the cluster configuration below). I cannot upgrade my Spark version. I do not have access to the Spark UI or the YARN UI to monitor my jobs. All I can retrieve are the YARN logs.
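
When only the YARN logs are available, they can at least be pulled in aggregated form once the application id is known, for example (illustrative command; the application id is whatever YARN assigned to the job):

    # Fetch the aggregated container logs of the Spark application from YARN
    yarn logs -applicationId application_XXXXXXXXXX_XXXX > spark_app.log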

Code sample: smallDF is the 535 MB DataFrame obtained after some aggregations and transformations.

Execution plan: the full == Physical Plan == output is reproduced at the end of this post.

Here are some improvements regarding the code:

  • Add a repartition of uh based on the key column used in the join; the number of partitions should be approximately 650 GB / 500 MB ≈ 1300.
  • Apply filtering to the datasets before joining them; in your case, simply execute the where clauses before the join statement.
  • Optionally cache the small dataset.
  • Make sure the small dataset actually gets broadcast, i.e. you can try saving it and checking its size. Then adjust the value of spark.broadcast.blockSize accordingly, probably by increasing it.
  • Here is how your code would look with these changes:

        // Standard Spark SQL imports needed by the snippet below
        import org.apache.spark.sql.functions._            // col, lit, unix_timestamp, to_date, datediff, date_format, broadcast
        import org.apache.spark.sql.types.TimestampType
        import spark.implicits._                            // enables the $"KEY" syntax used in repartition

        val uh_months = readCsv(spark, input_dir_terro + "HDFS_PATH_OF_ALL_THE_CSV_FILES")
          .withColumnRenamed("OLD_KEY", "KEY")
          .where(col("code").isin(LIST OF VALUES))
          .withColumn("january", lit("1960-01-01"))
    
        val uh = uh_months
          .withColumn("UHDIN", datediff(to_date(unix_timestamp(col("UHDIN_YYYYMMDD"), "yyyyMMdd").cast(TimestampType)),
            to_date(unix_timestamp(col("january"), "yyyy-MM-dd").cast(TimestampType))))
          //      .withColumn("field_1", to_date((unix_timestamp(col("field"), "ddMMMyyyy")).cast(TimestampType)))
          .withColumn("field_1", date_format(col("field"), "dd/MM/yyyy"))
          .drop("UHDIN_YYYYMMDD")
          .drop("january")
          .drop("field")
          .repartition(1300, $"KEY") //change 1: repartition based on KEY with 1300 (650GB/500MB~1300)
    
        //change 2: always prune as much information as possible before joining!
        val smallerDF = smallDF
                          .where(smallDF.col("ID").isNotNull && col("field_6").isNotNull)
                          .select("KEY", "ID", "field_3", "field_4", "field_5")
    
         //change 3: you can optionally cache the small dataset
        smallerDF.cache()
    
        //change 4: adjust spark.broadcast.blockSize, e.g. spark.conf.set("spark.broadcast.blockSize", "16m")
    
        val uh_joined = uh.join(broadcast(smallerDF), "KEY")
          .select(
            uh.col("*"),
            smallerDF.col("ID"),
            smallerDF.col("field_3"),
            smallerDF.col("field_4"),
            smallerDF.col("field_5"))
          .withColumnRenamed("field_1", "field")
    

One last remark related to your cluster configuration: I would try to increase num-executors to at least 32, since the level of parallelism on such a large cluster should be higher.
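
For illustration, that suggestion could translate into spark-submit options along the following lines (a sketch only; the class/jar names and the executor memory/cores values are placeholders to be tuned for the actual cluster):

    # Sketch of a spark-submit invocation with a higher executor count.
    # All values below are placeholders, not the OP's actual settings.
    spark-submit \
      --master yarn \
      --deploy-mode cluster \
      --num-executors 32 \
      --executor-cores 4 \
      --executor-memory 8g \
      --conf spark.sql.broadcastTimeout=3600 \
      --class com.example.JoinJob \
      join-job.jar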

Please update your question with a sample of your current code or the execution plan (you can print it with df.explain()) and, if possible, the DAG from the Spark UI.

Thank you for your reply. I added a code sample. However, as I mentioned, I do not have access to the Spark UI. As for the execution plan, I am running it now and will edit the question as soon as I have it. Thanks a lot.

Your broadcast works well, that's good news.

How can you tell whether the broadcast is working correctly? If it is not, what would I see (or not see)?

Thanks, it did work! I did not even have to set the spark.broadcast.blockSize parameter.
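
One hedged way to answer that last question: print the physical plan of the joined DataFrame and check which join operator Spark actually chose. The plan added below does contain a *BroadcastHashJoin backed by a BroadcastExchange on the join key, rather than a SortMergeJoin, which is the sign that the broadcast was applied.

    // Sketch: inspect the physical plan of the joined DataFrame (uh_joined from the answer above).
    // A BroadcastHashJoin / BroadcastExchange node on the join key means the broadcast hint was
    // honoured; a SortMergeJoin preceded by "Exchange hashpartitioning" means it was not.
    uh_joined.explain()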
        == Physical Plan ==
    *Project [NO_NUM#252, DEV#153, DEBCRED#154, BDGRORI#155, BDGREUR#156, BEWC#157, MSG30_NL#158, SCAPMV#159, USERID#160, MMED#161, TNUM#162, NMTGP#163, BKA#164, CATEXT#165, SEQETAT#166, ACCTYPE#167, BRAND#168, FAMILY#169, SUBFAMILY#170, FORCED_DVA#172, BYBANK#173, CPTE_PROTEGE#174, HOURMV#175, RDFB#176, ... 30 more fields]
    +- *BroadcastHashJoin [NO_NUM#252], [NO_NUM#13], Inner, BuildRight
       :- *Project [NUM#152 AS NO_NUM#252, DEV#153, DEBCRED#154, BDGRORI#155, BDGREUR#156, BEWC#157, MSG30_NL#158, SCAPMV#159, USERID#160, MMED#161, TNUM#162, NMTGP#163, BKA#164, CATEXT#165, SEQETAT#166, ACCTYPE#167, BRAND#168, FAMILY#169, SUBFAMILY#170, FORCED_DVA#172, BYBANK#173, CPTE_PROTEGE#174, HOURMV#175, RDFB#176, ... 26 more fields]
       :  +- *Filter ((BEWC#157 INSET (25003,25302,25114,20113,12017,20108,25046,12018,15379,15358,11011,20114,10118,12003,25097,20106,20133,10133,10142,15402,25026,25345,28023,15376,25019,28004,21701,25001,11008,15310,15003,28020,22048,15470,25300,25514,25381,25339,15099,25301,28005,28026,25098,25018,15323,25376,15804,15414,25344,25102,15458,15313,28002,25385,22051,25214,15031,12005,15425,20145,22011,15304,25027,14020,11007,25901,15343,22049,20112,12031,20127,15339,25421,15432,28025,25340,25325,20150,28011,25368,25304,22501,25369,28022,15098,12032,15375,25002,25008,10116,10101,22502,25090,15004,20105,12030,22503,15095,22007,15809,15342,15311,25216,10103,20122,11019,20142,15097,20147,20149,25005,25205,25380,15380,10120,25015,15384,11003,10110,25016,15090,25307,15001,25390,15312,10115,25219,15806,15459,12016,15359,15395,15302,12021,11701,10111,10148,25379,15807,10102,25352,25355,12010,25095,25394,20101,25413,15385,25322,28027,11026,15533,25201,25371,10128,11028,12020,15819,10143,28028,10123,10125,11020,25029,10122,25343,15015,12033,25014,12012,25024,25375,11023,25501,25402,22001,15317,12014,16114,20501,15046,12001,12022,10104,10117,12002,25499,10145,10153,12011,15350,15300,10119,25305,15345,25374,11027,25430,28021,25202,10121,28024,25101,28001,15321,11025,25358,15333,15501,25533,15372,12008,11015,10114,10113,10112,15303,15320,28006,22002,25359,10132,15497,25353,11029,25425,15374,12019,25437,11022,15357,20148,20111,26114,25099,25354,10124,25303,11010,20120,20135,15820,15331,28029) && isnotnull(BKA#164)) && isnotnull(NUM#152))
       :     +- *FileScan csv [UHDIN_YYYYMMDD#151,NUM#152,DEV#153,DEBCRED#154,BDGRORI#155,BDGREUR#156,BEWC#157,MSG30_NL#158,SCAPMV#159,USERID#160,MMED#161,TNUM#162,NMTGP#163,BKA#164,CATEXT#165,SEQETAT#166,ACCTYPE#167,BRAND#168,FAMILY#169,SUBFAMILY#170,DVA#171,FORCED_DVA#172,BYBANK#173,CPTE_PROTEGE#174,... 26 more fields] Batched: false, Format: CSV, Location: InMemoryFileIndex[hdfs://SOMEHOST:SOMEPORT/SOMEPATH..., PartitionFilters: [], PushedFilters: [In(BEWC, [25003,25302,25114,20113,12017,20108,25046,12018,15379,15358,11011,20114,10118,12003,25..., ReadSchema: struct<UHDIN_YYYYMMDD:string,NUM:string,DEV:string,DEBCRED:string,BDGRORI:string,BDGREUR:string,B...
       +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, string, true]))
          +- *Project [NO_NUM#13, minrel#370, PSP#82, Label#105, StartDate#106]
             +- *SortMergeJoin [PSP#381], [PSP#82], Inner
                :- *Sort [PSP#381 ASC NULLS FIRST], false, 0
                :  +- Exchange hashpartitioning(PSP#381, 200)
                :     +- *Project [PSP#381, NO_NUM#13, minrel#370]
                :        +- SortMergeJoin [PSP#381, C_SNUM#14, minrel#370, NO_NUM#13], [NO_PSP#47, C_SNUM_1#387, C_NRELPR#50, NO_NUM_1#400], LeftOuter
                :           :- *Sort [PSP#381 ASC NULLS FIRST, C_SNUM#14 ASC NULLS FIRST, minrel#370 ASC NULLS FIRST, NO_NUM#13 ASC NULLS FIRST], false, 0
                :           :  +- Exchange hashpartitioning(PSP#381, C_SNUM#14, minrel#370, NO_NUM#13, 200)
                :           :     +- SortAggregate(key=[NO_PSP#12, C_SNUM#14, NO_NUM#13], functions=[min(C_NRELPR#15)])
                :           :        +- *Sort [NO_PSP#12 ASC NULLS FIRST, C_SNUM#14 ASC NULLS FIRST, NO_NUM#13 ASC NULLS FIRST], false, 0
                :           :           +- Exchange hashpartitioning(NO_PSP#12, C_SNUM#14, NO_NUM#13, 200)
                :           :              +- SortAggregate(key=[NO_PSP#12, C_SNUM#14, NO_NUM#13], functions=[partial_min(C_NRELPR#15)])
                :           :                 +- *Sort [NO_PSP#12 ASC NULLS FIRST, C_SNUM#14 ASC NULLS FIRST, NO_NUM#13 ASC NULLS FIRST], false, 0
                :           :                    +- *Project [NO_PSP#12, C_SNUM#14, NO_NUM#13, C_NRELPR#15]
                :           :                       +- *Filter (((C_NRELPR#15 IN (001,006) && C_SNUM#14 IN (030,033)) && isnotnull(NO_PSP#12)) && isnotnull(NO_NUM#13))
                :           :                          +- *FileScan csv [NO_PSP#12,NO_NUM#13,C_SNUM#14,c_nrelpr#15] Batched: false, Format: CSV, Location: InMemoryFileIndex[hdfs://SOMEHOST:SOMEPORT/SOMEPATH..., PartitionFilters: [], PushedFilters: [In(c_nrelpr, [001,006]), In(C_SNUM, [030,033]), IsNotNull(NO_PSP), IsNotNull(NO_NUM)], ReadSchema: struct<NO_PSP:string,NO_NUM:string,C_SNUM:string,c_nrelpr:string>
                :           +- *Sort [NO_PSP#47 ASC NULLS FIRST, C_SNUM_1#387 ASC NULLS FIRST, C_NRELPR#50 ASC NULLS FIRST, NO_NUM_1#400 ASC NULLS FIRST], false, 0
                :              +- Exchange hashpartitioning(NO_PSP#47, C_SNUM_1#387, C_NRELPR#50, NO_NUM_1#400, 200)
                :                 +- *Project [NO_PSP#47, NO_NUM#48 AS NO_NUM_1#400, C_SNUM#49 AS C_SNUM_1#387, c_nrelpr#50]
                :                    +- *FileScan csv [NO_PSP#47,NO_NUM#48,C_SNUM#49,c_nrelpr#50] Batched: false, Format: CSV, Location: InMemoryFileIndex[hdfs://SOMEHOST:SOMEPORT/SOMEPATH..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<NO_PSP:string,NO_NUM:string,C_SNUM:string,c_nrelpr:string>
                +- *Sort [PSP#82 ASC NULLS FIRST], false, 0
                   +- Exchange hashpartitioning(PSP#82, 200)
                      +- *Project [PSP#82, Label#105, StartDate#106]
                         +- *Filter isnotnull(PSP#82)
                            +- *FileScan csv [PSP#82,Label#105,StartDate#106] Batched: false, Format: CSV, Location: InMemoryFileIndex[hdfs://SOMEHOST:SOMEPORT/SOMEPATH..., PartitionFilters: [], PushedFilters: [IsNotNull(PSP)], ReadSchema: struct<PSP:string,Label:string,StartDate:string>
    