Session 会话化web日志,获取上一个域和下一个域
我们有一大堆网络日志数据,需要将其会话化,并为每个会话生成上一个域和下一个域。我正在通过 AWS EMR 上的交互式作业流进行测试。现在,我已经可以用下面的代码(DataFu 的 Sessionize UDF)将数据会话化。熟悉 UDF 的编译和使用花了一些功夫,但我已经做到了这一点。
标签:session,hadoop,amazon-web-services,apache-pig,elastic-map-reduce
问题原文在此处给出了输入文件的标题行和第一行(以制表符分隔),以及来自 SESSIONS 关系的一个元组(获取该关系的步骤如下所示);这些示例数据未包含在本页中。
这大致就是我现在用来将测试数据会话化的脚本:
-- Register the jar that provides the Sessionize UDF.
register s3://TestBucket/Sessionize.jar
-- Bind the UDF with the constructor argument '30m'
-- (presumably a 30-minute session timeout; confirm against the DataFu documentation).
define Sessionize datafu.pig.sessions.Sessionize('30m');
-- Load the raw log with PigStorage's default settings; every column is read as chararray.
A = load 's3://TestBucket/party2.gz' USING PigStorage() as (id: chararray, data_date: chararray, rule_code: chararray, project_uid: chararray, respondent_uid: chararray, type: chararray, tab_id: chararray, url_domain: chararray, url_path: chararray, duration: chararray, exit_cause: chararray, details: chararray);
-- Swap the first two columns so that data_date becomes the first field.
B = foreach A generate $1, $0, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11;
-- Drop the header row, i.e. the row whose id column holds the literal 'ID'.
C = filter B by id neq 'ID';
-- Group the rows by respondent and domain.
VIEWS = group C by (respondent_uid, url_domain);
-- Inside each group: order the rows by data_date, run Sessionize over them and flatten the result.
-- The declared output schema is the input schema plus one trailing, untyped session_id field.
SESSIONS = foreach VIEWS { VISITS = order C by data_date; generate FLATTEN(Sessionize(VISITS)) as (data_date: chararray, id: chararray, rule_code: chararray, project_uid: chararray, respondent_uid: chararray, type: chararray, tab_id: chararray, url_domain: chararray, url_path: chararray, duration: chararray, exit_cause: chararray, details: chararray, session_id); }
(B处的步骤是将日期移动到第一个位置。C处的步骤是过滤掉文件头)
我不确定接下来的正确方向。我可以用 foreach 迭代 SESSIONS 关系,并在 Pig 脚本中获取下一个和上一个域吗?还是说,编写自定义 UDF 并将 SESSIONS 关系传递给该 UDF 会更好?(自己写 UDF 将是一次冒险!)
如有任何建议,将不胜感激。即使有人可以推荐不做的事情,也可能同样有用,所以我不会浪费时间研究垃圾方法。我对Hadoop和pig脚本非常陌生,因此这绝对不是我的强项之一(但…)。如果有人能改进下面的解决方案,我一点也不会感到惊讶,但是,它对我的情况有效。我使用SessionizeUDF(在我的问题中提到)作为编写以下UDF的参考
import java.io.IOException;
import java.util.ArrayList;
import org.apache.pig.Accumulator;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
/**
 * Pig UDF that takes an ordered bag of tuples and returns the same tuples with two
 * extra trailing chararray fields: the domain of the preceding tuple and the domain
 * of the following tuple. The first tuple gets an empty previous domain and the last
 * tuple gets an empty next domain.
 *
 * <p>The domain is read from field index {@value #DOMAIN_FIELD_INDEX} of every input
 * tuple and must be a {@code String} (or null).
 *
 * <p>The class works both through {@link #exec(Tuple)} (whole bag in one call) and
 * through the {@code Accumulator} methods. In accumulator mode the bag of one key may
 * be delivered in several {@link #accumulate(Tuple)} calls, so the most recent tuple is
 * held back until its successor is seen (or until {@link #getValue()} is called); this
 * keeps previous/next correct across batch boundaries.
 */
public class PreviousNext extends EvalFunc<DataBag> implements Accumulator<DataBag>
{
    /** Position of the domain field inside each input tuple. */
    private static final int DOMAIN_FIELD_INDEX = 2;

    /** Tuples that are already complete (previous and next appended). */
    private DataBag outputBag;
    /** Domain of the tuple that precedes {@link #pending}; empty for the first tuple. */
    private String previous;
    /** Domain appended as "next" to the tuple currently being emitted. */
    private String next;
    /** Copy of the latest input tuple, held back until the following domain is known. */
    private Tuple pending;
    /** Domain of {@link #pending}. */
    private String pendingDomain;

    public PreviousNext()
    {
        // Call the private helper rather than the overridable cleanup().
        resetState();
    }

    /**
     * Processes a complete bag in one call.
     *
     * @param input tuple whose first field is the bag to process
     * @return a new bag holding every input tuple plus previous and next domain
     * @throws IOException if a field cannot be read from a tuple
     */
    @Override
    public DataBag exec(Tuple input) throws IOException
    {
        try
        {
            accumulate(input);
            return getValue();
        }
        finally
        {
            // Reset even when accumulate fails so stale tuples never leak into the next call.
            cleanup();
        }
    }

    /**
     * Consumes one batch of the bag. May be called repeatedly for the same key; tuples
     * are expected to arrive in the order in which previous/next should be computed.
     *
     * @param input tuple whose first field is a bag with zero or more tuples
     * @throws IOException if a field cannot be read from a tuple
     */
    @Override
    public void accumulate(Tuple input) throws IOException
    {
        if (input == null || input.size() == 0)
        {
            return;
        }
        DataBag batch = (DataBag) input.get(0);
        if (batch == null)
        {
            return;
        }
        for (Tuple current : batch)
        {
            String currentDomain = (String) current.get(DOMAIN_FIELD_INDEX);
            if (pending != null)
            {
                // The successor of the held-back tuple is now known: complete and emit it.
                next = currentDomain;
                emitPending();
                previous = pendingDomain;
            }
            // Copy the tuple: it is extended with two fields and kept across calls.
            pending = TupleFactory.getInstance().newTuple(current.getAll());
            pendingDomain = currentDomain;
        }
    }

    /** Resets all per-key state so the instance can be reused for the next key. */
    @Override
    public void cleanup()
    {
        resetState();
    }

    /**
     * Returns the result for all tuples accumulated so far. The held-back last tuple is
     * flushed with an empty next domain; calling this method again returns the same bag.
     *
     * @return bag of input tuples extended with previous and next domain
     */
    @Override
    public DataBag getValue()
    {
        if (pending != null)
        {
            next = "";
            emitPending();
            pending = null;
            pendingDomain = null;
        }
        return outputBag;
    }

    /**
     * Describes the output: a bag whose tuple schema is the input tuple schema followed
     * by {@code previous_domain} and {@code next_domain}, both chararray.
     *
     * @param input schema of the UDF arguments; the first field must be a bag of tuples
     * @return the output schema
     * @throws RuntimeException if the first field is not a bag of tuples, or if the
     *         schema cannot be inspected or cloned
     */
    @Override
    public Schema outputSchema(Schema input)
    {
        try
        {
            Schema.FieldSchema inputFieldSchema = input.getField(0);
            if (inputFieldSchema.type != DataType.BAG)
            {
                throw new RuntimeException("Expected a BAG as input");
            }
            Schema inputBagSchema = inputFieldSchema.schema;
            if (inputBagSchema.getField(0).type != DataType.TUPLE)
            {
                throw new RuntimeException(String.format("Expected input bag to contain a TUPLE, but instead found %s", DataType.findTypeName(inputBagSchema.getField(0).type)));
            }
            Schema inputTupleSchema = inputBagSchema.getField(0).schema;
            Schema outputTupleSchema = inputTupleSchema.clone();
            outputTupleSchema.add(new Schema.FieldSchema("previous_domain", DataType.CHARARRAY));
            outputTupleSchema.add(new Schema.FieldSchema("next_domain", DataType.CHARARRAY));
            return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input),outputTupleSchema,DataType.BAG));
        }
        catch (CloneNotSupportedException e)
        {
            throw new RuntimeException(e);
        }
        catch (FrontendException e)
        {
            throw new RuntimeException(e);
        }
    }

    /** Appends the current previous/next values to the held-back tuple and emits it. */
    private void emitPending()
    {
        pending.append(previous);
        pending.append(next);
        outputBag.add(pending);
    }

    /** Creates a fresh output bag and clears the previous/next/held-back state. */
    private void resetState()
    {
        this.outputBag = BagFactory.getInstance().newDefaultBag();
        this.previous = "";
        this.next = "";
        this.pending = null;
        this.pendingDomain = null;
    }
}
import java.io.IOException;
导入java.util.ArrayList;
导入org.apache.pig.acculator;
导入org.apache.pig.EvalFunc;
导入org.apache.pig.data.BagFactory;
导入org.apache.pig.data.DataBag;
导入org.apache.pig.data.DataType;
导入org.apache.pig.data.Tuple;
导入org.apache.pig.data.TupleFactory;
导入org.apache.pig.impl.logicalayer.FrontendException;
导入org.apache.pig.impl.logicalayer.schema.schema;
公共类PreviousNext扩展了EvalFunc实现累加器
{
专用数据包输出包;
私有字符串先前;
私有字符串下一步;
公共上一页(下一页)
{
清理();
}
@凌驾
公共数据包exec(元组输入)引发IOException
{
积累(输入);
数据包outputBag=getValue();
清理();
返回输出包;
}
@凌驾
公共无效累积(元组输入)引发IOException
{
ArrayList域=新的ArrayList();
databagd=(DataBag)input.get(0);
//将所有域放入ArrayList以允许
//访问特定索引
for(元组t:d)
{
domains.add((String)t.get(2));
}
//为上一次迭代的“下一个域”值添加空字符串
域。添加(“”);
int i=0;
先前的=”;
for(元组t:d)
{
next=domains.get(i+1);
Tuple t_new=TupleFactory.getInstance().newTuple(t.getAll());
t_新.追加(先前);
t_new.append(下一步);
outputBag.add(t_new);
//当前域是下一次迭代的前一个域
previous=domains.get(i);
i++;
}
}
@凌驾
公共空间清理()
{
this.outputBag=BagFactory.getInstance().newDefaultBag();
}
@凌驾
公共数据包getValue()
{
返回输出包;
}
@凌驾
公共模式输出模式(模式输入)
{
尝试
{
Schema.FieldSchema inputFieldSchema=input.getField(0);
if(inputFieldSchema.type!=DataType.BAG)
{
抛出新的RuntimeException(“需要一个包作为输入”);
}
Schema inputBagSchema=inputFieldSchema.Schema;
if(inputBagSchema.getField(0.type)!=DataType.TUPLE)
{
抛出新的RuntimeException(String.format(“预期输入包包含元组,但找到了%s”),DataType.findTypeName(inputBagSchema.getField(0.type));
}
Schema inputTupleSchema=inputBagSchema.getField(0).Schema;
Schema OutputUpleSchema=InputUpleSchema.clone();
add(newschema.FieldSchema(“previous_domain”,DataType.CHARARRAY));
add(newschema.FieldSchema(“next_domain”,DataType.CHARARRAY));
返回新模式(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(),input),OutputUpleSchema,DataType.BAG));
}
捕获(CloneNotSupportedException e)
{
抛出新的运行时异常(e);
}
捕获(前端异常e)
{
抛出新的运行时异常(e);
}
}
}
你能提供一些输入和输出数据吗?
@alexeipab 我添加了一个输入数据示例,以及一个来自 SESSIONS 关系的元组示例。输出还不存在:我需要得到带有下一个域和上一个域的 respondent_uid、session_id。我可以对 SESSIONS 关系进行分组,但我不确定如何获得下一个和上一个会话。如果我还能提供其他有用的信息,请告诉我。
只是一个更新:我现在正尝试走 UDF 路线,会随着进展和失败继续更新。
我已经有一个不太完善的 UDF 解决方案可以工作了!等我清理并做一些调整之后,会把它作为答案发布,供其他遇到类似情况的人参考。
-- Register the jar that provides the Sessionize UDF.
register s3://TestBucket/Sessionize.jar
-- Bind the UDF with the constructor argument '30m'
-- (presumably a 30-minute session timeout; confirm against the DataFu documentation).
define Sessionize datafu.pig.sessions.Sessionize('30m');
-- Load the raw log with PigStorage's default settings; every column is read as chararray.
A = load 's3://TestBucket/party2.gz' USING PigStorage() as (id: chararray, data_date: chararray, rule_code: chararray, project_uid: chararray, respondent_uid: chararray, type: chararray, tab_id: chararray, url_domain: chararray, url_path: chararray, duration: chararray, exit_cause: chararray, details: chararray);
-- Swap the first two columns so that data_date becomes the first field.
B = foreach A generate $1, $0, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11;
-- Drop the header row, i.e. the row whose id column holds the literal 'ID'.
C = filter B by id neq 'ID';
-- Group the rows by respondent and domain.
VIEWS = group C by (respondent_uid, url_domain);
-- Inside each group: order the rows by data_date, run Sessionize over them and flatten the result.
-- The declared output schema is the input schema plus one trailing, untyped session_id field.
SESSIONS = foreach VIEWS { VISITS = order C by data_date; generate FLATTEN(Sessionize(VISITS)) as (data_date: chararray, id: chararray, rule_code: chararray, project_uid: chararray, respondent_uid: chararray, type: chararray, tab_id: chararray, url_domain: chararray, url_path: chararray, duration: chararray, exit_cause: chararray, details: chararray, session_id); }
import java.io.IOException;
import java.util.ArrayList;
import org.apache.pig.Accumulator;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
/**
 * Pig UDF that takes an ordered bag of tuples and returns the same tuples with two
 * extra trailing chararray fields: the domain of the preceding tuple and the domain
 * of the following tuple. The first tuple gets an empty previous domain and the last
 * tuple gets an empty next domain.
 *
 * <p>The domain is read from field index {@value #DOMAIN_FIELD_INDEX} of every input
 * tuple and must be a {@code String} (or null).
 *
 * <p>The class works both through {@link #exec(Tuple)} (whole bag in one call) and
 * through the {@code Accumulator} methods. In accumulator mode the bag of one key may
 * be delivered in several {@link #accumulate(Tuple)} calls, so the most recent tuple is
 * held back until its successor is seen (or until {@link #getValue()} is called); this
 * keeps previous/next correct across batch boundaries.
 */
public class PreviousNext extends EvalFunc<DataBag> implements Accumulator<DataBag>
{
    /** Position of the domain field inside each input tuple. */
    private static final int DOMAIN_FIELD_INDEX = 2;

    /** Tuples that are already complete (previous and next appended). */
    private DataBag outputBag;
    /** Domain of the tuple that precedes {@link #pending}; empty for the first tuple. */
    private String previous;
    /** Domain appended as "next" to the tuple currently being emitted. */
    private String next;
    /** Copy of the latest input tuple, held back until the following domain is known. */
    private Tuple pending;
    /** Domain of {@link #pending}. */
    private String pendingDomain;

    public PreviousNext()
    {
        // Call the private helper rather than the overridable cleanup().
        resetState();
    }

    /**
     * Processes a complete bag in one call.
     *
     * @param input tuple whose first field is the bag to process
     * @return a new bag holding every input tuple plus previous and next domain
     * @throws IOException if a field cannot be read from a tuple
     */
    @Override
    public DataBag exec(Tuple input) throws IOException
    {
        try
        {
            accumulate(input);
            return getValue();
        }
        finally
        {
            // Reset even when accumulate fails so stale tuples never leak into the next call.
            cleanup();
        }
    }

    /**
     * Consumes one batch of the bag. May be called repeatedly for the same key; tuples
     * are expected to arrive in the order in which previous/next should be computed.
     *
     * @param input tuple whose first field is a bag with zero or more tuples
     * @throws IOException if a field cannot be read from a tuple
     */
    @Override
    public void accumulate(Tuple input) throws IOException
    {
        if (input == null || input.size() == 0)
        {
            return;
        }
        DataBag batch = (DataBag) input.get(0);
        if (batch == null)
        {
            return;
        }
        for (Tuple current : batch)
        {
            String currentDomain = (String) current.get(DOMAIN_FIELD_INDEX);
            if (pending != null)
            {
                // The successor of the held-back tuple is now known: complete and emit it.
                next = currentDomain;
                emitPending();
                previous = pendingDomain;
            }
            // Copy the tuple: it is extended with two fields and kept across calls.
            pending = TupleFactory.getInstance().newTuple(current.getAll());
            pendingDomain = currentDomain;
        }
    }

    /** Resets all per-key state so the instance can be reused for the next key. */
    @Override
    public void cleanup()
    {
        resetState();
    }

    /**
     * Returns the result for all tuples accumulated so far. The held-back last tuple is
     * flushed with an empty next domain; calling this method again returns the same bag.
     *
     * @return bag of input tuples extended with previous and next domain
     */
    @Override
    public DataBag getValue()
    {
        if (pending != null)
        {
            next = "";
            emitPending();
            pending = null;
            pendingDomain = null;
        }
        return outputBag;
    }

    /**
     * Describes the output: a bag whose tuple schema is the input tuple schema followed
     * by {@code previous_domain} and {@code next_domain}, both chararray.
     *
     * @param input schema of the UDF arguments; the first field must be a bag of tuples
     * @return the output schema
     * @throws RuntimeException if the first field is not a bag of tuples, or if the
     *         schema cannot be inspected or cloned
     */
    @Override
    public Schema outputSchema(Schema input)
    {
        try
        {
            Schema.FieldSchema inputFieldSchema = input.getField(0);
            if (inputFieldSchema.type != DataType.BAG)
            {
                throw new RuntimeException("Expected a BAG as input");
            }
            Schema inputBagSchema = inputFieldSchema.schema;
            if (inputBagSchema.getField(0).type != DataType.TUPLE)
            {
                throw new RuntimeException(String.format("Expected input bag to contain a TUPLE, but instead found %s", DataType.findTypeName(inputBagSchema.getField(0).type)));
            }
            Schema inputTupleSchema = inputBagSchema.getField(0).schema;
            Schema outputTupleSchema = inputTupleSchema.clone();
            outputTupleSchema.add(new Schema.FieldSchema("previous_domain", DataType.CHARARRAY));
            outputTupleSchema.add(new Schema.FieldSchema("next_domain", DataType.CHARARRAY));
            return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input),outputTupleSchema,DataType.BAG));
        }
        catch (CloneNotSupportedException e)
        {
            throw new RuntimeException(e);
        }
        catch (FrontendException e)
        {
            throw new RuntimeException(e);
        }
    }

    /** Appends the current previous/next values to the held-back tuple and emits it. */
    private void emitPending()
    {
        pending.append(previous);
        pending.append(next);
        outputBag.add(pending);
    }

    /** Creates a fresh output bag and clears the previous/next/held-back state. */
    private void resetState()
    {
        this.outputBag = BagFactory.getInstance().newDefaultBag();
        this.previous = "";
        this.next = "";
        this.pending = null;
        this.pendingDomain = null;
    }
}