How to define multiple custom delimiters for an input file in Spark (Scala)?

The default record delimiter when reading a file with Spark is the newline character (\n). A custom delimiter can be defined with the "textinputformat.record.delimiter" property. However, is it possible to specify multiple delimiters for the same file?

Suppose the file has the following content:
COMMENT,A,B,C
COMMENT,D,E,
F
LIKE,I,H,G
COMMENT,J,K,
L
COMMENT,M,N,O
I want to read this file with COMMENT as the delimiter rather than the newline character.

Still, in case multiple delimiters are not allowed in Spark, I have come up with the following alternative:
val ss = SparkSession.builder().appName("SentimentAnalysis").master("local[*]").getOrCreate()
val sc = ss.sparkContext
// Use "COMMENT" as the record delimiter instead of the default newline
sc.hadoopConfiguration.set("textinputformat.record.delimiter", "COMMENT")
val rdd = sc.textFile("<filepath>")
// Split each record again on "LIKE" to emulate a second delimiter
val finalRdd = rdd.flatMap(f => f.split("LIKE"))
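As a quick sanity check of this workaround (a minimal sketch, not part of the original question), the records can be collected and printed for a small test file:

// Inspect the records produced by the workaround above.
// collect() pulls everything to the driver, so only use it on small test files.
finalRdd.collect().foreach(r => println("[" + r.trim + "]"))
// With "COMMENT" as the record delimiter and a further split on "LIKE",
// each element holds the text between two delimiter occurrences,
// e.g. ",A,B,C" (an empty element may appear before the first delimiter).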
However, I still think it would be better to have multiple custom delimiters. Is that possible in Spark, or do I have to use the alternative above?

Solved the above problem by creating a custom TextInputFormat class that splits on the two types of delimiter strings. The post pointed out by @puhlen in the comments helped a lot. Below is the code snippet I used:
import java.io.IOException

import scala.collection.mutable.MutableList

import org.apache.hadoop.fs.FSDataInputStream
import org.apache.hadoop.io.{DataOutputBuffer, LongWritable, Text}
import org.apache.hadoop.mapreduce.{InputSplit, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.{FileSplit, TextInputFormat}

// Input format that hands each split to a record reader which breaks
// records on either of the two delimiter strings.
class CustomInputFormat extends TextInputFormat {
  override def createRecordReader(inputSplit: InputSplit, taskAttemptContext: TaskAttemptContext): RecordReader[LongWritable, Text] = {
    new ParagraphRecordReader()
  }
}

class ParagraphRecordReader extends RecordReader[LongWritable, Text] {
  var end: Long = 0L
  var stillInChunk = true

  var key = new LongWritable()
  var value = new Text()

  var fsin: FSDataInputStream = null
  val buffer = new DataOutputBuffer()
  val tempBuffer1 = MutableList[Int]()
  val tempBuffer2 = MutableList[Int]()

  // The two delimiter strings on which records are split.
  val endTag1 = "COMMENT".getBytes()
  val endTag2 = "LIKE".getBytes()

  @throws(classOf[IOException])
  @throws(classOf[InterruptedException])
  override def initialize(inputSplit: InputSplit, taskAttemptContext: TaskAttemptContext) {
    val split = inputSplit.asInstanceOf[FileSplit]
    val conf = taskAttemptContext.getConfiguration()
    val path = split.getPath()
    val fs = path.getFileSystem(conf)

    fsin = fs.open(path)
    val start = split.getStart()
    end = split.getStart() + split.getLength()
    fsin.seek(start)

    // If the split does not start at the beginning of the file, skip ahead
    // to the first delimiter so records are not read twice.
    if (start != 0) {
      readUntilMatch(endTag1, endTag2, false)
    }
  }

  @throws(classOf[IOException])
  override def nextKeyValue(): Boolean = {
    if (!stillInChunk) return false

    val status = readUntilMatch(endTag1, endTag2, true)

    value = new Text()
    value.set(buffer.getData(), 0, buffer.getLength())
    key = new LongWritable(fsin.getPos())
    buffer.reset()

    if (!status) {
      stillInChunk = false
    }

    true
  }

  @throws(classOf[IOException])
  @throws(classOf[InterruptedException])
  override def getCurrentKey(): LongWritable = key

  @throws(classOf[IOException])
  @throws(classOf[InterruptedException])
  override def getCurrentValue(): Text = value

  @throws(classOf[IOException])
  @throws(classOf[InterruptedException])
  override def getProgress(): Float = 0

  @throws(classOf[IOException])
  override def close() {
    fsin.close()
  }

  // Reads bytes until either delimiter is matched. Returns true if there is
  // more data before the end of the split, false otherwise.
  @throws(classOf[IOException])
  def readUntilMatch(match1: Array[Byte], match2: Array[Byte], withinBlock: Boolean): Boolean = {
    var i = 0
    var j = 0
    while (true) {
      val b = fsin.read()
      if (b == -1) return false

      if (b == match1(i)) {
        tempBuffer1.+=(b)
        i = i + 1
        if (i >= match1.length) {
          tempBuffer1.clear()
          return fsin.getPos() < end
        }
      } else if (b == match2(j)) {
        tempBuffer2.+=(b)
        j = j + 1
        if (j >= match2.length) {
          tempBuffer2.clear()
          return fsin.getPos() < end
        }
      } else {
        // A partial delimiter match turned out to be ordinary data: flush it
        // into the record buffer and start matching from scratch.
        if (tempBuffer1.size != 0)
          tempBuffer1.foreach { x => if (withinBlock) buffer.write(x) }
        else if (tempBuffer2.size != 0)
          tempBuffer2.foreach { x => if (withinBlock) buffer.write(x) }
        tempBuffer1.clear()
        tempBuffer2.clear()
        if (withinBlock) buffer.write(b)
        i = 0
        j = 0
      }
    }
    false
  }
}
Use the class defined above as follows when reading the file from the filesystem, and the file will be read with the two delimiters as required:
val rdd = sc.newAPIHadoopFile("<filepath>", classOf[CustomInputFormat], classOf[LongWritable], classOf[Text], sc.hadoopConfiguration)

Comments on the question:
"I think this can only be done by creating a custom TextInputFormat."
"Creating a custom TextInputFormat helps to specify only one type of delimiter at a time. Can it also be used to define multiple delimiters? Could you provide an example of what you mean?"
"No, you can define the input format to use whatever you want. You will have to write the code yourself though. I did it once before and it was hard to find a good tutorial; I used this answer as a guide when I was doing it."
"Thanks @puhlen. The post you provided really helped me write the custom TextInputFormat with two delimiters. I will post the answer here. Again, thanx :)"
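For completeness, a minimal sketch (standard Spark API, not part of the original answer) of turning the (LongWritable, Text) pairs returned by newAPIHadoopFile above into plain strings:

// Extract the record text from each (offset, Text) pair and drop the empty
// records that leading delimiters can produce.
val records = rdd.map { case (_, text) => text.toString.trim }.filter(_.nonEmpty)
records.collect().foreach(println)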