Apache spark Spark写入文件并附加到s3-成本问题
因此，我有一个 Apache Spark 流作业，它每 20 分钟按天和小时把 Parquet 文件分区写入 S3。（相关标签：apache-spark、amazon-s3、spark-streaming）
似乎每个批在写入之前,都会在此表(/root folder)名称的所有文件夹上执行“ls”和“head” 由于我们有多天X 24小时X不同的表格,因此总体上会产生相对较高的S3成本 请注意,我们的模式正在动态更改 因此,我的问题是:
withPartition.write()
.format(format)
.mode(SaveMode.Append)
.partitionBy("day","hour")
.save(path);
这个问题似乎与以下方面有关:
我发现 Spark 的分区发现（partition discovery）机制是导致此问题的原因： 因此，我按如下方式实现，它解决了问题，而且提高了性能：
withPartition = withPartition.persist(StorageLevel.MEMORY_AND_DISK());
Dataset<DayAndHour> daysAndHours = withPartition.map(mapToDayHour(), Encoders.bean(DayAndHour.class)).distinct();
DayAndHour[] collect = (DayAndHour[])daysAndHours.collect();
Arrays.sort(collect);
logger.info("found " + collect.length +" different days and hours: "
+ Arrays.stream(collect).map(DayAndHour::toString).collect(Collectors.joining(",")) );
long time = System.currentTimeMillis();
for(DayAndHour dayAndHour : collect){
int day = dayAndHour.getDay();
int hour = dayAndHour.getHour();
logger.info("Start filter on " + dayAndHour);
Dataset<Row> filtered = withPartition.filter(filterDayAndHour(day, hour))
.drop("day", hour");
String newPath = path + "/"
+ "day" +"=" +day +"/"
+ "hour" +"=" + hour;
long specificPathCount = filtered.count();
long timeStart = System.currentTimeMillis();
logger.info("writing " + specificPathCount+ " event to " + newPath );
filtered.write()
.format(format)
.mode(SaveMode.Append)
.save(newPath);
logger.info("Finish writing partition of " + dayAndHour+ " to "+ newPath+ ". Wrote [" + specificPathCount +"] events in " + TimeUtils.tookMinuteSecondsAndMillis(timeStart, System.currentTimeMillis()));
}
logger.info("Finish writing " + path+ ". Wrote [" + cnt +"] events in " + MinuteTimeUtils.tookMinuteSecondsAndMillis(time, System.currentTimeMillis()));
withPartition.unpersist();
private static MapFunction<Row, DayAndHour> mapToDayHour() {
return new MapFunction<Row, DayAndHour>() {
@Override
public DayAndHour call(Row value) throws Exception {
int day = value.getAs("day");
int hour = value.getAs(hour");
DayAndHour dayAndHour = new DayAndHour();
dayAndHour.setDay(day);
dayAndHour.setHour(hour);
return dayAndHour;
}
};
}
private static FilterFunction<Row> filterDayAndHour(int day, int hour) {
return new FilterFunction<Row>() {
@Override
public boolean call(Row value) throws Exception {
int cDay = value.getAs("day");
int cHour = value.getAs(hour");
return day == cDay && hour == cHour;
}
};
}
withPartition = withPartition.persist(StorageLevel.MEMORY_AND_DISK());
Dataset<DayAndHour> daysAndHours = withPartition.map(mapToDayHour(), Encoders.bean(DayAndHour.class)).distinct();
DayAndHour[] collect = (DayAndHour[]) daysAndHours.collect();
Arrays.sort(collect);
logger.info("found " + collect.length + " different days and hours: "
        + Arrays.stream(collect).map(DayAndHour::toString).collect(Collectors.joining(",")));
long time = System.currentTimeMillis();
for (DayAndHour dayAndHour : collect) {
    int day = dayAndHour.getDay();
    int hour = dayAndHour.getHour();
    logger.info("Start filter on " + dayAndHour);
    Dataset<Row> filtered = withPartition.filter(filterDayAndHour(day, hour))
            .drop("day", "hour");
    String newPath = path + "/"
            + "day" + "=" + day + "/"
            + "hour" + "=" + hour;
    long specificPathCount = filtered.count();
    long timeStart = System.currentTimeMillis();
    logger.info("writing " + specificPathCount + " event to " + newPath);
    filtered.write()
            .format(format)
            .mode(SaveMode.Append)
            .save(newPath);
    logger.info("Finish writing partition of " + dayAndHour + " to " + newPath + ". Wrote [" + specificPathCount + "] events in " + TimeUtils.tookMinuteSecondsAndMillis(timeStart, System.currentTimeMillis()));
}
logger.info("Finish writing " + path + ". Wrote [" + cnt + "] events in " + TimeUtils.tookMinuteSecondsAndMillis(time, System.currentTimeMillis()));
withPartition.unpersist();

private static MapFunction<Row, DayAndHour> mapToDayHour() {
    return new MapFunction<Row, DayAndHour>() {
        @Override
        public DayAndHour call(Row value) throws Exception {
            int day = value.getAs("day");
            int hour = value.getAs("hour");
            DayAndHour dayAndHour = new DayAndHour();
            dayAndHour.setDay(day);
            dayAndHour.setHour(hour);
            return dayAndHour;
        }
    };
}

private static FilterFunction<Row> filterDayAndHour(int day, int hour) {
    return new FilterFunction<Row>() {
        @Override
        public boolean call(Row value) throws Exception {
            int cDay = value.getAs("day");
            int cHour = value.getAs("hour");
            return day == cDay && hour == cHour;
        }
    };
}
//还有一个POJO
public class DayAndHour implements Serializable , Comparable<DayAndHour>{
private int day;
private int hour;
public int getDay() {
return day;
}
public void setDay(int day) {
this.day = day;
}
public int getHour() {
return hour;
}
public void setHour(int hour) {
this.hour = hour;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
DayAndHour that = (DayAndHour) o;
if (day != that.day) return false;
return hour == that.hour;
}
@Override
public int hashCode() {
int result = day;
result = 31 * result + hour;
return result;
}
@Override
public String toString() {
return "(" +
"day=" + day +
", hour=" + hour +
')';
}
@Override
public int compareTo(DayAndHour dayAndHour) {
return Integer.compare((day * 100) + hour, (dayAndHour.day * 100) + dayAndHour.hour);
}
}
public class DayAndHour implements Serializable, Comparable<DayAndHour> {
    private int day;
    private int hour;

    public int getDay() {
        return day;
    }

    public void setDay(int day) {
        this.day = day;
    }

    public int getHour() {
        return hour;
    }

    public void setHour(int hour) {
        this.hour = hour;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;
        DayAndHour that = (DayAndHour) o;
        if (day != that.day) return false;
        return hour == that.hour;
    }

    @Override
    public int hashCode() {
        int result = day;
        result = 31 * result + hour;
        return result;
    }

    @Override
    public String toString() {
        return "(" + "day=" + day + ", hour=" + hour + ')';
    }

    @Override
    public int compareTo(DayAndHour dayAndHour) {
        return Integer.compare((day * 100) + hour, (dayAndHour.day * 100) + dayAndHour.hour);
    }
}
我发现 Spark 的分区发现（partition discovery）机制是导致此问题的原因：
因此，我按如下方式实现，它解决了问题，而且提高了性能：
withPartition = withPartition.persist(StorageLevel.MEMORY_AND_DISK());
Dataset<DayAndHour> daysAndHours = withPartition.map(mapToDayHour(), Encoders.bean(DayAndHour.class)).distinct();
DayAndHour[] collect = (DayAndHour[])daysAndHours.collect();
Arrays.sort(collect);
logger.info("found " + collect.length +" different days and hours: "
+ Arrays.stream(collect).map(DayAndHour::toString).collect(Collectors.joining(",")) );
long time = System.currentTimeMillis();
for(DayAndHour dayAndHour : collect){
int day = dayAndHour.getDay();
int hour = dayAndHour.getHour();
logger.info("Start filter on " + dayAndHour);
Dataset<Row> filtered = withPartition.filter(filterDayAndHour(day, hour))
.drop("day", hour");
String newPath = path + "/"
+ "day" +"=" +day +"/"
+ "hour" +"=" + hour;
long specificPathCount = filtered.count();
long timeStart = System.currentTimeMillis();
logger.info("writing " + specificPathCount+ " event to " + newPath );
filtered.write()
.format(format)
.mode(SaveMode.Append)
.save(newPath);
logger.info("Finish writing partition of " + dayAndHour+ " to "+ newPath+ ". Wrote [" + specificPathCount +"] events in " + TimeUtils.tookMinuteSecondsAndMillis(timeStart, System.currentTimeMillis()));
}
logger.info("Finish writing " + path+ ". Wrote [" + cnt +"] events in " + MinuteTimeUtils.tookMinuteSecondsAndMillis(time, System.currentTimeMillis()));
withPartition.unpersist();
private static MapFunction<Row, DayAndHour> mapToDayHour() {
return new MapFunction<Row, DayAndHour>() {
@Override
public DayAndHour call(Row value) throws Exception {
int day = value.getAs("day");
int hour = value.getAs(hour");
DayAndHour dayAndHour = new DayAndHour();
dayAndHour.setDay(day);
dayAndHour.setHour(hour);
return dayAndHour;
}
};
}
private static FilterFunction<Row> filterDayAndHour(int day, int hour) {
return new FilterFunction<Row>() {
@Override
public boolean call(Row value) throws Exception {
int cDay = value.getAs("day");
int cHour = value.getAs(hour");
return day == cDay && hour == cHour;
}
};
}
withPartition=withPartition.persist(StorageLevel.MEMORY_和_DISK());
Dataset daysAndHours=withPartition.map(mapToDayHour(),Encoders.bean(DayAndHour.class)).distinct();
DayAndHour[]收集=(DayAndHour[])daysAndHours.collect();
数组。排序(收集);
logger.info(“找到”+collect.length+“不同的日期和时间:”
+Arrays.stream(collect.map(DayAndHour::toString.collect(Collectors.joining(“,”));
长时间=System.currentTimeMillis();
for(DayAndHour DayAndHour:collect){
int day=dayAndHour.getDay();
int hour=dayAndHour.getHour();
logger.info(“启动过滤器打开”+天和小时);
Dataset filtered=withPartition.filter(filterDayAndHour(天,小时))
.下降(“日”,小时);
字符串newPath=path+“/”
+“日”+“=”+日+“/”
+“小时”+“=”+小时;
long specificPathCount=filtered.count();
long timeStart=System.currentTimeMillis();
info(“将“+specificPathCount+”事件写入“+newPath”);
filtered.write()
.格式(格式)
.mode(SaveMode.Append)
.save(newPath);
logger.info(“完成写入