Apache Spark: list all files in Azure Blob storage using Databricks

Tags: apache-spark, pyspark, databricks, azure-blob-storage

I need a PySpark (Python) script that lists all files in Azure Blob storage, including subdirectories. I found a Scala script that does this and need help converting it to PySpark.

Scala code

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{Path, FileSystem}
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.sql.execution.datasources.InMemoryFileIndex
import java.net.URI

def listFiles(basep: String, globp: String): Seq[String] = {
  val conf = new Configuration(sc.hadoopConfiguration)
  val fs = FileSystem.get(new URI(basep), conf)

  def validated(path: String): Path = {
    if (path startsWith "/") new Path(path)
    else new Path("/" + path)
  }

  val fileCatalog = InMemoryFileIndex.bulkListLeafFiles(
    paths = SparkHadoopUtil.get.globPath(fs, Path.mergePaths(validated(basep), validated(globp))),
    hadoopConf = conf,
    filter = null,
    sparkSession = spark)

  fileCatalog.flatMap(_._2.map(_.path))
}

val root = "/mnt/path/table"
val globp = "[^_]*" // glob pattern, e.g. "service=webapp/date=2019-03-31/*log4j*"

val files = listFiles(root, globp)
files.toDF("path").show()

I converted the code to PySpark as shown below, but I get the following error:

'JavaMember' object has no attribute 'globPath'

configuration = sc._jvm.org.apache.hadoop.conf
fspath = sc._jvm.org.apache.hadoop.fs
hadooputil = sc._jvm.org.apache.spark.deploy.SparkHadoopUtil
inmemfileindex = sc._jvm.org.apache.spark.sql.execution.datasources.InMemoryFileIndex
javauri = sc._jvm.java.net.URI

rootURL = "/mnt/"
globp = "[^_]*"  # glob pattern, e.g. "service=webapp/date=2019-03-31/*log4j*"

conf = sc._jvm.org.apache.hadoop.conf.Configuration(sc._jsc.hadoopConfiguration())
fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(sc._jvm.java.net.URI(rootURL), conf)
g = sc._jvm.org.apache.hadoop.fs.Path.mergePaths(
    sc._jvm.org.apache.hadoop.fs.Path(rootURL),
    sc._jvm.org.apache.hadoop.fs.Path("/" + globp))

hadooputil.get.globPath(fs, g)
Any help is much appreciated.
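One likely cause of that error, judging from how py4j exposes JVM members: sc._jvm.org.apache.spark.deploy.SparkHadoopUtil resolves to a class wrapper, so hadooputil.get without parentheses is a py4j JavaMember, i.e. a handle to the get method rather than its result, and a JavaMember has no globPath attribute. A minimal sketch of that guess, reusing fs and g from the code above; SparkHadoopUtil is Spark-internal, so this is not guaranteed to work across versions:

hadooputil = sc._jvm.org.apache.spark.deploy.SparkHadoopUtil
# Hypothetical fix: call get() so py4j returns the SparkHadoopUtil instance
# instead of a JavaMember wrapper around the method itself.
paths = hadooputil.get().globPath(fs, g)
# globPath returns a Scala Seq[Path]; mkString renders it without needing a
# Scala-to-Python collection conversion.
print(paths.mkString("\n"))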

Use globStatus, which is part of hadoop.fs, e.g. files = fspath.globStatus(Path('/service=webapp/date=2019-03-31/*log4j*'))
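For reference, a minimal PySpark sketch of that suggestion, calling globStatus on the Hadoop FileSystem obtained through the JVM gateway. The mount point and glob pattern below are the placeholders from the question, not values from the original answer:

Path = sc._jvm.org.apache.hadoop.fs.Path
conf = sc._jsc.hadoopConfiguration()

root = "/mnt/path/table"
globp = "[^_]*"  # e.g. "service=webapp/date=2019-03-31/*log4j*"
pattern = Path(root + "/" + globp)

fs = pattern.getFileSystem(conf)   # FileSystem backing the mounted path
statuses = fs.globStatus(pattern)  # FileStatus entries matching the glob

files = [status.getPath().toString() for status in statuses]
for f in files:
    print(f)

Note that globStatus only matches the pattern as given and does not descend into subdirectories on its own; for a fully recursive listing of every leaf file, fs.listFiles(Path(root), True) returns a RemoteIterator that can be walked with hasNext()/next().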
@forgetso I added an answer here based on your comment:, thanks!