Java Apache Flink可以写入基于密钥命名的文件吗?
在Apache Flink中,是否可以根据密钥写入多个文本文件?例如,我有一些这样的数据Java Apache Flink可以写入基于密钥命名的文件吗?,java,apache-flink,Java,Apache Flink,在Apache Flink中,是否可以根据密钥写入多个文本文件?例如,我有一些这样的数据 key1,foo,bar 钥匙2,巴兹,福 键3等 密钥的值在编译时是未知的;新的密钥将会出现,我想将该密钥的结果写入一个与其他密钥的结果分开的文件中 我希望看到3个文件,分别名为“key1.txt”、“key2.txt”和“key3.txt” 这是弗林克能在开箱即用的吗?这在开箱即用是不可能的。但是,您可以实现自己的输出格式,并通过result.out(…)(对于批处理API)使用它;看 对于流式API,
key1,foo,bar
key2,baz,foo
key3,etc
键(key)的取值在编译时是未知的;新的键会不断出现,我希望把每个键对应的结果写入一个单独的文件,与其他键的结果分开
我希望看到3个文件,分别名为“key1.txt”、“key2.txt”和“key3.txt”
Flink 能开箱即用地做到这一点吗?这无法开箱即用地实现。但是,您可以实现自己的输出格式,并通过result.out(…)
(对于批处理API)使用它;看
对于流式API,它应该是stream.addSink(…)
;请参见您可以尝试以下接收器的实现,该实现可与KeyedStream
一起使用:
// Key the stream by tuple field 0.
KeyedStream<Tuple2<String, String>, Tuple> keyedDataStream = dataStream.keyBy(0);
// The sink's type argument has to match the stream's element type, otherwise addSink(...)
// does not type-check. "f0" is the name of the public field that holds the key.
StreamKeyPartitionerSink<Tuple2<String, String>> sinkFunction =
    new StreamKeyPartitionerSink<>("../data/key_grouping", "f0");
keyedDataStream.addSink(sinkFunction);
KeyedStream keyedDataStream=dataStream.keyBy(0);
StreamKeyPartitionerSink sinkFunction=新的StreamKeyPartitionerSink(
“./数据/键组”,“f0”);//f0是键字段名
keyedDataStream.addSink(sinkFunction);
有关Flink中状态管理的更多信息:因为我使用它来管理每个键的状态
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.lang.reflect.Field;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.List;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
/**
* * Flink sink writes tuples to files partitioned by their keys, which also writes the records as
* batches.
*
* @param <IN> Input tuple type
*
* @author ehabqadah
*/
public class StreamKeyPartitionerSink<IN> extends RichSinkFunction<IN> {
private transient ValueState<String> outputFilePath;
private transient ValueState<List<IN>> inputTupleList;
/**
* Number of rcords to be hold before writing.
*/
private int writeBatchSize;
/**
* The output directory path
*/
private String outputDirPath;
/**
* The name of the input tuple key
*/
private String keyFieldName;
public StreamKeyPartitionerSink(String outputDirPath, String keyFieldName) {
this(outputDirPath, keyFieldName, 1);
}
/**
*
* @param outputDirPath- writeBatchSize the size of on hold batch before write
* @param writeBatchSize - output directory
*/
public StreamKeyPartitionerSink(String outputDirPath, String keyFieldName, int writeBatchSize) {
this.writeBatchSize = writeBatchSize;
this.outputDirPath = outputDirPath;
this.keyFieldName = keyFieldName;
}
@Override
public void open(Configuration config) {
// initialize state holders
`//for more info about state management check `//
ValueStateDescriptor<String> outputFilePathDesc =
new ValueStateDescriptor<String>("outputFilePathDesc",
TypeInformation.of(new TypeHint<String>() {}));
ValueStateDescriptor<List<IN>> inputTupleListDesc =
new ValueStateDescriptor<List<IN>>("inputTupleListDesc",
TypeInformation.of(new TypeHint<List<IN>>() {}));
outputFilePath = getRuntimeContext().getState(outputFilePathDesc);
inputTupleList = getRuntimeContext().getState(inputTupleListDesc);
}
@Override
public void invoke(IN value) throws Exception {
List<IN> inputTuples =
inputTupleList.value() == null ? new ArrayList<IN>() : inputTupleList.value();
inputTuples.add(value);
if (inputTuples.size() == writeBatchSize) {
writeInputList(inputTuples);
inputTuples = new ArrayList<IN>();
}
// update the state
inputTupleList.update(inputTuples);
}
/**
* Write the tuple list, each record in separate line
*
* @param tupleList
* @throws Exception
*/
public void writeInputList(List<IN> tupleList) {
String path = getOrInitFilePath(tupleList);
try (PrintWriter outStream = new PrintWriter(new BufferedWriter(new FileWriter(path, true)))) {
for (IN tupleToWrite : tupleList) {
outStream.println(tupleToWrite);
}
} catch (IOException e) {
throw new RuntimeException("Exception occured while writing file " + path, e);
}
}
private String getOrInitFilePath(List<IN> tupleList) {
IN firstInstance = tupleList.get(0);
String path = null;
try {
path = outputFilePath.value();
if (path == null) {
Field keyField = firstInstance.getClass().getField(keyFieldName);
String keyValue = keyField.get(firstInstance).toString();
path = Paths.get(outputDirPath, keyValue + ".txt").toString();
setUpOutputFilePathPath(outputDirPath, path);
// save the computed path for this key
outputFilePath.update(path);
}
} catch (IOException | NoSuchFieldException | SecurityException | IllegalArgumentException
| IllegalAccessException e) {
throw new RuntimeException(
"ExceptionsetUpOutputFilePathPath occured while fetching the value of key field " + path,
e);
}
return path;
}
private void setUpOutputFilePathPath(String outputDirPath, String path) throws IOException {
if (!Files.exists(Paths.get(outputDirPath))) {
Files.createDirectories(Paths.get(outputDirPath));
}
// create the file if it does not exist and delete its content
Files.write(Paths.get(path), "".getBytes(), StandardOpenOption.CREATE,
StandardOpenOption.TRUNCATE_EXISTING);
}
}
导入java.io.BufferedWriter;
导入java.io.FileWriter;
导入java.io.IOException;
导入java.io.PrintWriter;
导入java.lang.reflect.Field;
导入java.nio.file.Files;
导入java.nio.file.path;
导入java.nio.file.StandardOpenOption;
导入java.util.ArrayList;
导入java.util.List;
导入org.apache.flink.api.common.state.ValueState;
导入org.apache.flink.api.common.state.ValueStateDescriptor;
导入org.apache.flink.api.common.typeinfo.TypeHint;
导入org.apache.flink.api.common.typeinfo.TypeInformation;
导入org.apache.flink.configuration.configuration;
导入org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
/**
**Flink sink将元组写入按其键分区的文件,这也会将记录写入
*批次。
*
*@param输入元组类型
*
*@作者ehabqadah
*/
公共类StreamKeyPartitionsLink扩展了RichSink函数{
私有瞬态值状态输出文件路径;
私有瞬时值状态输入列表;
/**
*写入前要保留的RCORD数。
*/
私有int-writeBatchSize;
/**
*输出目录路径
*/
私有字符串outputDirPath;
/**
*输入元组键的名称
*/
私有字符串keyFieldName;
public StreamKeyPartitionsLink(字符串outputDirPath,字符串keyFieldName){
这(outputDirPath,keyFieldName,1);
}
/**
*
*@param outputDirPath-writeBatchSize写入前保留批处理的大小
*@param writeBatchSize-输出目录
*/
public StreamKeyPartitionsLink(字符串outputDirPath、字符串keyFieldName、int writeBatchSize){
this.writeBatchSize=writeBatchSize;
this.outputDirPath=outputDirPath;
this.keyFieldName=keyFieldName;
}
@凌驾
公共无效打开(配置){
//初始化状态持有者
`//有关状态管理检查的更多信息`//
ValueStateDescriptor输出文件路径描述=
新的ValueStateDescriptor(“outputFilePathDesc”,
TypeInformation.of(newtypehint(){});
ValueStateDescriptor InputUppleListDesc=
新的ValueStateDescriptor(“InputUpleListDesc”,
TypeInformation.of(newtypehint(){});
outputFilePath=getRuntimeContext().getState(outputFilePathDesc);
InputUpleList=getRuntimeContext().getState(InputUpleListDesc);
}
@凌驾
public void invoke(值中)引发异常{
列出整数=
InputUpleList.value()==null?新建ArrayList():InputUpleList.value();
输入两倍。添加(值);
if(inputUples.size()==writeBatchSize){
写入输入列表(输入个数);
InputUples=新的ArrayList();
}
//更新状态
inputUpleList.更新(inputUples);
}
/**
*编写元组列表,每条记录在单独的行中
*
*@param-tupleList
*@抛出异常
*/
public void writeInputList(列表元组列表){
字符串路径=getOrInitFilePath(tupleList);
try(PrintWriter outStream=new PrintWriter(new BufferedWriter(new FileWriter,path,true))){
for(在tupleToWrite:tupleList中){
exptream.println(tupleToWrite);
}
}捕获(IOE异常){
抛出新的RuntimeException(“写入文件时发生异常”+path,e);
}
}
私有字符串getOrInitFilePath(列表元组列表){
在firstInstance=tupleList.get(0);
字符串路径=null;
试一试{
path=outputFilePath.value();
if(路径==null){
Field keyField=firstInstance.getClass().getField(keyFieldName);
字符串keyValue=keyField.get(firstInstance.toString();
path=path.get(outputDirPath,keyValue+“.txt”).toString();
setUpOutputFilePathPath(outputDirPath,path);
//保存此键的计算路径
outputFilePath.update(路径);
}
}捕获(IOException | NoSuchFieldException | SecurityException | IllegalArgumentException)
|非法访问(例外e){
抛出新的运行时异常(
“获取键字段“+path”的值时发生异常SetupOutputFilePathPath,
e) );
}
返回路径;
}
私有void setUpOutputFilePathPath(字符串outputDirPath,字符串路径)引发IOException{
如果(!Files.exists(path.get(outputDirPath))){
Files.createDirectories(path.get(outputDirPath));
}
//如果文件不存在,请创建该文件并删除其内容
Files.write(路径).get(路径),“”.getBytes(),StandardOpenOption.CREATE,
StandardOpenOption。截断_(现有);
}
}
我在这里找到了一些相关信息,原作者也回答了这个问题。