如何在spark程序中解析多个JSON结构
我正在用Scala解析日志(Json格式)。我不知道如何继续。我可能会得到不同种类的日志进行处理 如何编写/设计代码来处理不同类型的Json结构? 我能给我的Scala程序一个模式并让它解析吗 我使用对象映射器编写了一些代码,并通读了节点,但我想要一种更具结构无关性的方法 我不知道从哪里开始。请给我指一些阅读材料或例子。我试图在Stackoverflow中搜索或搜索,结果导致了太多的例子,这让我很困惑,因为我也在学习Scala如何在spark程序中解析多个JSON结构,json,scala,apache-spark,Json,Scala,Apache Spark,我正在用Scala解析日志(Json格式)。我不知道如何继续。我可能会得到不同种类的日志进行处理 如何编写/设计代码来处理不同类型的Json结构? 我能给我的Scala程序一个模式并让它解析吗 我使用对象映射器编写了一些代码,并通读了节点,但我想要一种更具结构无关性的方法 我不知道从哪里开始。请给我指一些阅读材料或例子。我试图在Stackoverflow中搜索或搜索,结果导致了太多的例子,这让我很困惑,因为我也在学习Scala import org.apache.hadoop.fs.FileSy
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.fs.Path
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Calendar;
import org.apache.spark.sql.hive.HiveContext
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.databind.JsonMappingException;
import org.apache.spark.rdd.RDD;
// Silence Spark console logging so the script's own output stays readable.
sc.setLogLevel("OFF");
// Arguments arrive through spark.driver.args as one whitespace-separated string:
//   <env> <source> [<from-date yyyy-MM-dd> <to-date yyyy-MM-dd>]
val args = sc.getConf.get("spark.driver.args").split("\\s+")
args.foreach(println);
val RootFolderStr = "/source_folder/";
val DestFolderStr = "/dest_folder/";
// BUG FIX: pattern used 'hh' (12-hour clock, 1-12) — afternoon timestamps would
// be mis-parsed. ISO-8601 timestamps need 'HH' (0-23).
val dateformatter = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");
val formatter = new SimpleDateFormat("yyyy-MM-dd");
val theMonthFormatter = new SimpleDateFormat("yyyy-MM");
if (args.length < 2) {
  printf("usage: need at least 2 parameters in spark.driver.args");
  sys.exit(2);
}
val envStr = args(0).toLowerCase();
val srcStr = args(1).toLowerCase();
// Resolve the processing window as a single expression instead of mutating vars:
//  - 4 args: explicit [from, to] dates
//  - 2 args: default to [yesterday, today]
//  - otherwise: legacy hard-coded fallback kept for behavior compatibility
//    (NOTE(review): these stale 2018 dates look like leftover test values — confirm)
val (fromDay: Date, toDay: Date) =
  if (args.length == 4) {
    (formatter.parse(args(2)), formatter.parse(args(3)))
  } else if (args.length == 2) {
    val today = formatter.parse(formatter.format(Calendar.getInstance().getTime()));
    val previousDay = Calendar.getInstance();
    previousDay.add(Calendar.DATE, -1);
    (formatter.parse(formatter.format(previousDay.getTime())), today)
  } else {
    (formatter.parse("2018-04-29"), formatter.parse("2018-05-01"))
  }
// Derive the monthly-partition sub-folder from the start of the window and
// expand the ENV/SRC placeholders in the configured path templates.
val monthFolder = theMonthFormatter.format(fromDay);
var rootFolder = RootFolderStr.replaceAll("ENV", envStr) + monthFolder;
rootFolder = rootFolder.replaceAll("SRC", srcStr);
val destFolder = DestFolderStr.replaceAll("ENV", envStr);
// Extend the upper bound by one day so the modification-time window below
// includes files written on toDay itself; formatting through `formatter` also
// handles the roll-over across a month boundary.
val toCalendar = Calendar.getInstance();
toCalendar.setTime(toDay);
toCalendar.add(Calendar.DATE, 1);
val toDay2 = formatter.parse(formatter.format(toCalendar.getTime()));
// List candidate files, skipping 0-length files and .tmp files. .tmp files may
// still be written to by the Flume job, so their content can be incomplete when
// this Spark job starts reading them.
val pathInfos = FileSystem.get(sc.hadoopConfiguration).listStatus(new Path(rootFolder));
val allfiles = pathInfos.filter { fileStatus =>
  val name = fileStatus.getPath().getName();
  // BUG FIX: the original computed the file name (aPath) but never tested it,
  // so .tmp files passed the filter despite the comment claiming otherwise.
  if (fileStatus.getLen == 0 || name.endsWith(".tmp"))
    false
  else {
    // Modification time is more accurate than parsing a date out of the name.
    val modified = new Date(fileStatus.getModificationTime());
    // keep files whose last modification falls in (fromDay, toDay2)
    modified.after(fromDay) && modified.before(toDay2)
  }
}.map(_.getPath.toString);
// Flattened representation of one parsed log event; used as the row schema for
// the DataFrame/Parquet output produced at the end of the script.
case class event_log(
time_stp: Long,   // sequence-file key: event timestamp (compared against fromTS/toTS)
msg_sze: Int,     // raw message length; set to 0 to flag an unparseable JSON message
msg_src: String,  // source tag (2nd program argument), stored verbatim
action_path: String,  // "action_path" field of the top-level JSON
s_code: Int,      // numeric part (first 3 chars) of the "s_code" JSON field; defaults to 200
s_desc: String,   // free-text remainder of the "s_code" field
p_code: String,   // "p_code" from the nested rbody JSON, stringified int
c_id: String,     // "c_id" from the nested rbody JSON (or its "mkay" sub-object)
m_id: String,     // "m_id" from the nested rbody JSON, stringified int
c_ip: String,     // "c_ip" field of the top-level JSON — presumably client IP; confirm
c_gp: String,     // "c_gp" field of the top-level JSON
gip: String,      // "gip" field of the top-level JSON
ggip: String,     // "ggip" field of the top-level JSON
rbody: String     // nested JSON payload, kept verbatim for downstream inspection
);
/**
 * Reads Flume-written sequence files and parses each JSON message into an
 * event_log row.
 *
 * @param fileList paths of the sequence files to read
 * @param msgSrc   source tag copied verbatim into every row
 * @param fromTS   inclusive lower bound on the record key (timestamp)
 * @param toTS     exclusive upper bound on the record key
 * @return RDD of parsed rows; messages that are not well-formed JSON are kept
 *         with msg_sze == 0 as a marker instead of being dropped.
 */
def readEvents(fileList: Array[String], msgSrc: String, fromTS: Long, toTS: Long): RDD[(event_log)] = {
  val records =
    sc.sequenceFile[Long, String](fileList.mkString(","))
      .filter { case (ts, _) => ts >= fromTS && ts < toTS }
  // PERF FIX: build one ObjectMapper per partition instead of one per record —
  // ObjectMapper construction is expensive and the instance is reusable. The
  // original also created a second mapper for the nested document; one suffices.
  records.mapPartitions { iter =>
    val mapper = new ObjectMapper();
    // Null-safe text extraction: path() returns a MissingNode for absent fields
    // and textValue() returns null for non-text nodes; collapse both to "".
    def textOf(node: com.fasterxml.jackson.databind.JsonNode, field: String): String =
      Option(node.path(field).textValue()).getOrElse("");
    iter.map { case (time_stp, payload) =>
      var msg_sze = payload.length();
      var c_id = ""; var m_id = ""; var p_code = "";
      var c_ip = ""; var c_gp = ""; var gip = ""; var ggip = "";
      var rbody = ""; var action_path = "";
      var s_code: Int = 200; var s_desc = "";
      try {
        val root = mapper.readTree(payload.getBytes());
        rbody = textOf(root, "rbody");
        if (rbody.length() > 0) {
          // rbody is itself a JSON document; parse it for the nested fields.
          var root2 = mapper.readTree(rbody.getBytes());
          val pNode = root2.path("p_code");
          if (pNode.isValueNode())
            p_code = String.valueOf(pNode.intValue());
          // Some messages wrap the interesting fields in an "mkay" object;
          // descend into it when present.
          val mkayNode = root2.path("mkay");
          if (mkayNode.isObject())
            root2 = mkayNode;
          val idNode = root2.get("c_id");
          if (idNode != null && idNode.isValueNode())
            c_id = Option(idNode.textValue()).getOrElse("");
          val mNode = root2.get("m_id");
          if (mNode != null && mNode.isValueNode())
            m_id = String.valueOf(mNode.intValue());
        }
        c_ip = textOf(root, "c_ip");
        c_gp = textOf(root, "c_gp");
        gip = textOf(root, "gip");
        ggip = textOf(root, "ggip");
        action_path = textOf(root, "action_path");
        // "s_code" carries "NNN description": first 3 chars numeric, rest text.
        // BUG FIX: the original called textValue().trim() and substring(0, 3)
        // unguarded, so a missing or short s_code threw an uncaught
        // NullPointerException / StringIndexOutOfBoundsException and killed
        // the whole job; now the default (200, "") is kept instead.
        val statusText = textOf(root, "s_code").trim();
        if (statusText.length() >= 3) {
          s_code = Integer.valueOf(statusText.substring(0, 3));
          s_desc = statusText.substring(3).trim();
        }
      } catch {
        // msg_sze == 0 flags a message that is not well-formed JSON; the row is
        // kept so downstream can count/inspect failures instead of losing them.
        // BUG FIX: JsonParseException and JsonMappingException both subclass
        // IOException, so the original's IOException case (listed earlier)
        // made the JsonMappingException case unreachable; specific cases now
        // come first.
        case jex: JsonParseException => msg_sze = 0
        case rtEx: JsonMappingException => msg_sze = 0
        case ioEx: java.io.IOException => msg_sze = 0
        case nfe: NumberFormatException => msg_sze = 0 // non-numeric status prefix
      }
      event_log(time_stp, msg_sze, msgSrc, action_path, s_code, s_desc,
        p_code, c_id, m_id, c_ip, c_gp, gip, ggip, rbody)
    }
  }
}
// Nothing matched the window — bail out before paying HiveContext
// initialization cost (the original constructed the HiveContext first and
// then threw the work away on exit).
if (allfiles.length == 0)
  sys.exit(3);
val hiveContext = new HiveContext(sc)
val fromTime = fromDay.getTime();
val toTime = toDay.getTime();
val events = readEvents(allfiles, srcStr, fromTime, toTime);
// coalesce(1) writes a single parquet file per run — assumes the daily volume
// is small enough for one partition; TODO confirm and drop if data grows.
val df = hiveContext.createDataFrame(events).coalesce(1);
df.write.parquet(destFolder);
sys.exit(0);
import org.apache.hadoop.fs.FileSystem
导入org.apache.hadoop.fs.Path
导入java.text.simpleDataFormat;
导入java.util.Date;
导入java.util.Calendar;
导入org.apache.spark.sql.hive.HiveContext
导入com.fasterxml.jackson.databind.ObjectMapper;
导入com.fasterxml.jackson.core.JsonParseException;
导入com.fasterxml.jackson.databind.JsonMappingException;
导入org.apache.spark.rdd.rdd;
sc.setLogLevel(“关”);
val args=sc.getConf.get(“spark.driver.args”).split(\\s+)
参数foreach(println);
var envStr=“dev”;
var srcStr=“appm”
val RootFolderStr=“/source_folder/”;
val DestFolderStr=“/dest_folder/”;
val dateformatter=新的SimpleDataFormat(“yyyy-MM-dd'T'hh:MM:ss.SSS'Z'”;
val格式化程序=新的SimpleDataFormat(“yyyy-MM-dd”);
val theMonthFormatter=新的简化格式(“yyyy-MM”);
var fromDay:Date=formatter.parse(“2018-04-29”);
今日风险值:日期=格式化程序.parse(“2018-05-01”);
如果(参数长度<2){
printf(“用法:spark.driver.args中至少需要2个参数”);
系统出口(2);
}
envStr=args(0.toLowerCase();
srcStr=args(1.toLowerCase();
如果(args.length==4){
fromDay=formatter.parse(args(2));
今天=formatter.parse(args(3));
}
如果(args.length==2){
//默认为昨天到今天
今天=formatter.parse(formatter.format(Calendar.getInstance().getTime());
val previousDay=Calendar.getInstance();
添加(Calendar.DATE,-1);
fromDay=formatter.parse(formatter.format(previousDay.getTime());
}
//获取每月分区的子文件夹
val monthFolder=monthformatter.format(fromDay);
var rootFolder=RootFolderStr.replaceAll(“ENV”,envStr)+monthFolder;
rootFolder=rootFolder.replaceAll(“SRC”,srcStr);
val destFolder=DestFolderStr.replaceAll(“ENV”,envStr);
var toCalendar=Calendar.getInstance();
toCalendar.setTime(今天);
toCalendar.add(Calendar.DATE,1);
/需要考虑跨月份的情况
val toDay2=formatter.parse(formatter.format(toCalendar.getTime());
//过滤掉.tmp文件和0大小的文件
//.tmp文件读取不安全,可能是Flume作业正在更新这些文件,并且消息数据不完整
//当Spark作业开始读取它时。
val pathInfos=FileSystem.get(sc.hadoopConfiguration).listStatus(新路径(根文件夹));
//过滤掉当前的0长度文件、.tmp文件
val allfiles=pathInfos.filter(fileStatus=>{
if(fileStatus.getLen==0)
假的
否则{
val aPath=fileStatus.getPath().getName();
//使用修改时间更准确。
val lastTime=fileStatus.getModificationTime();
val aDate=新日期(上次);
//fromDay和Today之间的所有文件2
日期后(从当天开始)和日期前(今天2);
}
}
).map(u.getPath.toString);
案例类事件日志(
时间:长,
msg_sze:Int,
msg_src:String,
操作路径:字符串,
s_代码:Int,
s_desc:String,
p_代码:字符串,
c_id:String,
m_id:String,
叶澍鹖:字符串,
c_gp:字符串,
gip:字符串,
ggip:字符串,
巴迪:弦
);
def readEvents(文件列表:数组[String],msgSrc:String,fromTS:Long,toTS:Long):RDD[(事件日志)]={
val记录=
sc.sequenceFile[Long,String](fileList.mkString(“,”))
.filter((消息)=>{
(message.\u 1>=fromTS和message.\u 1{
val time_stp=消息。_1;
var msg_sze=message._2.length();
var c_id=“”
var m_id=“”;
var p_代码=”;
var c_ip=“”;
var c_gp=“”;
var gip=“”;
var ggip=“”;
var rbody=“”;
var action_path=“”;
var s_代码:Int=200;
var s_desc=“”;
试一试{
//解析消息
val mapper=new ObjectMapper();
val aBuff=message._2.getBytes();
val root=mapper.readTree(aBuff);
var阳极=根路径(“rbody”);
rbody=阳极。textValue();
if(rbody!=null&&rbody.length()>0){
val mapper_2=新对象映射器();
val aBuff_2=rbody.getBytes();
var root2=mapper_2.readTree(aBuff_2);
阳极=根2.路径(“p_代码”);
if(aNode!=null&&aNode.isValueNode()
p_code=String.valueOf(阳极.intValue());
阳极=根2.路径(“mkay”);
if(aNode!=null&&aNode.isObject()){
root2=阳极;
}
{
阳极=根2.get(“c_id”);
if(aNode!=null&&aNode.isValueNode()
c_id=阳极。textValue();
阳极=根2.get(“m_id”);
if(aNode!=null&&aNode.isValueNode(){
m_id=String.valueOf(阳极.intValue());
}
}
}
阳极=根路径(“c_ip”);
c_ip=阳极。textValue();
阳极=根路径(“c_gp”);
c_gp=阳极。textValue();
阳极=根路径(“gip”);
gip=阳极。textValue();
阳极=根路径(“ggip”);
ggip=阳极.textValue();
阳极=根路径(“动作路径”);
action_path=Anodel.textValue();
阳极=根路径(“s_代码”);
val状态节点值=