Session 会话化web日志，获取上一个域和下一个域_Session_Hadoop_Amazon Web Services_Apache Pig_Elastic Map Reduce

Session 会话化web日志，获取上一个域和下一个域

session hadoop amazon-web-services apache-pig

Session 会话化web日志，获取上一个域和下一个域,session,hadoop,amazon-web-services,apache-pig,elastic-map-reduce,Session,Hadoop,Amazon Web Services,Apache Pig,Elastic Map Reduce,我们有一大堆网络日志数据。我们需要对其进行会话，并为每个会话生成上一个域和下一个域。我正在通过AWS EMR上的交互式作业流进行测试现在，我可以在这里使用以下代码将数据会话化：。要熟悉编译和使用UDF需要做一些工作，但我已经做到了这一点以下是输入文件的标题行和第一行（以制表符分隔）：这是来自会话关系的元组（获取关系的步骤如下所示）：这大致就是我现在运行的测试数据会话： register s3://TestBucket/Sessionize.jar define Sessionize d

我们有一大堆 Web 日志数据。我们需要对其进行会话化（sessionize），并为每个会话中的每条记录生成上一个域和下一个域。我正在通过 AWS EMR 上的交互式作业流进行测试。

现在，我可以使用下面的代码将数据会话化。熟悉 UDF 的编译和使用花了一些功夫，但我已经做到了这一点。

以下是输入文件的标题行和第一行（以制表符分隔）：

这是来自 SESSIONS 关系的一个元组（获取该关系的步骤如下所示）：

这大致就是我现在运行的测试数据会话：

-- Jar registered so the Sessionize UDF defined below can be resolved.
-- NOTE(review): presumably this jar bundles DataFu -- confirm.
register s3://TestBucket/Sessionize.jar

-- NOTE(review): '30m' is presumably the inactivity gap that closes a session;
-- confirm against the DataFu Sessionize documentation.
DEFINE Sessionize datafu.pig.sessions.Sessionize('30m');

-- Raw log rows; every column is read as chararray.
A = LOAD 's3://TestBucket/party2.gz' USING PigStorage() AS (
        id: chararray,
        data_date: chararray,
        rule_code: chararray,
        project_uid: chararray,
        respondent_uid: chararray,
        type: chararray,
        tab_id: chararray,
        url_domain: chararray,
        url_path: chararray,
        duration: chararray,
        exit_cause: chararray,
        details: chararray
    );

-- Move data_date into the first position (id becomes second); the remaining
-- columns keep their original order.
B = FOREACH A GENERATE
        data_date, id, rule_code, project_uid, respondent_uid, type,
        tab_id, url_domain, url_path, duration, exit_cause, details;

-- Drop the header row, whose id column holds the literal 'ID'.
C = FILTER B BY id neq 'ID';

-- One group per (respondent, domain) pair.
VIEWS = GROUP C BY (respondent_uid, url_domain);

-- Within each group, order the rows by date and let Sessionize append a
-- session_id column to every row.
SESSIONS = FOREACH VIEWS {
    VISITS = ORDER C BY data_date;
    GENERATE FLATTEN(Sessionize(VISITS)) AS (
        data_date: chararray,
        id: chararray,
        rule_code: chararray,
        project_uid: chararray,
        respondent_uid: chararray,
        type: chararray,
        tab_id: chararray,
        url_domain: chararray,
        url_path: chararray,
        duration: chararray,
        exit_cause: chararray,
        details: chararray,
        session_id
    );
}

（B处的步骤是将日期移动到第一个位置。C处的步骤是过滤掉文件头）

到了这一步，我不知道接下来该往哪个方向走。

我可以在 Pig 脚本中用 foreach 遍历我的 SESSIONS 关系，并从中获取下一个和上一个域吗？还是说，编写自定义 UDF 并将 SESSIONS 关系传递给该 UDF 会更好？（自己写 UDF 将是一次冒险！）

如有任何建议，将不胜感激。即使有人可以推荐不做的事情，也可能同样有用，所以我不会浪费时间研究垃圾方法。我对Hadoop和pig脚本非常陌生，因此这绝对不是我的强项之一（但…）。
如果有人能改进下面的解决方案，我一点也不会感到惊讶；不过它对我的情况是有效的。我以（问题中提到的）Sessionize UDF 作为参考，编写了以下 UDF：

import java.io.IOException; import java.util.ArrayList; import org.apache.pig.Accumulator; import org.apache.pig.EvalFunc; import org.apache.pig.data.BagFactory; import org.apache.pig.data.DataBag; import org.apache.pig.data.DataType; import org.apache.pig.data.Tuple; import org.apache.pig.data.TupleFactory; import org.apache.pig.impl.logicalLayer.FrontendException; import org.apache.pig.impl.logicalLayer.schema.Schema; public class PreviousNext extends EvalFunc<DataBag> implements Accumulator<DataBag> { private DataBag outputBag; private String previous; private String next; public PreviousNext() { cleanup(); } @Override public DataBag exec(Tuple input) throws IOException { accumulate(input); DataBag outputBag = getValue(); cleanup(); return outputBag; } @Override public void accumulate(Tuple input) throws IOException { ArrayList<String> domains = new ArrayList<String>(); DataBag d = (DataBag)input.get(0); //put all domains into ArrayList to allow for //accessing specific indexes for(Tuple t : d) { domains.add((String)t.get(2)); } //add empty string for "next domain" value for last iteration domains.add(""); int i = 0; previous = ""; for(Tuple t : d) { next = domains.get(i+1); Tuple t_new = TupleFactory.getInstance().newTuple(t.getAll()); t_new.append(previous); t_new.append(next); outputBag.add(t_new); //current domain is previous for next iteration previous = domains.get(i); i++; } } @Override public void cleanup() { this.outputBag = BagFactory.getInstance().newDefaultBag(); } @Override public DataBag getValue() { return outputBag; } @Override public Schema outputSchema(Schema input) { try { Schema.FieldSchema inputFieldSchema = input.getField(0); if (inputFieldSchema.type != DataType.BAG) { throw new RuntimeException("Expected a BAG as input"); } Schema inputBagSchema = inputFieldSchema.schema; if (inputBagSchema.getField(0).type != DataType.TUPLE) { throw new RuntimeException(String.format("Expected input bag to contain a TUPLE, but instead found %s", 
DataType.findTypeName(inputBagSchema.getField(0).type))); } Schema inputTupleSchema = inputBagSchema.getField(0).schema; Schema outputTupleSchema = inputTupleSchema.clone(); outputTupleSchema.add(new Schema.FieldSchema("previous_domain", DataType.CHARARRAY)); outputTupleSchema.add(new Schema.FieldSchema("next_domain", DataType.CHARARRAY)); return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input),outputTupleSchema,DataType.BAG)); } catch (CloneNotSupportedException e) { throw new RuntimeException(e); } catch (FrontendException e) { throw new RuntimeException(e); } } }

import java.io.IOException；导入java.util.ArrayList；导入org.apache.pig.acculator；导入org.apache.pig.EvalFunc；导入org.apache.pig.data.BagFactory；导入org.apache.pig.data.DataBag；导入org.apache.pig.data.DataType；导入org.apache.pig.data.Tuple；导入org.apache.pig.data.TupleFactory；导入org.apache.pig.impl.logicalayer.FrontendException；导入org.apache.pig.impl.logicalayer.schema.schema；公共类PreviousNext扩展了EvalFunc实现累加器 { 专用数据包输出包；私有字符串先前；私有字符串下一步；公共上一页（下一页） { 清理（）； } @凌驾公共数据包exec（元组输入）引发IOException { 积累（输入）；数据包outputBag=getValue（）；清理（）；返回输出包； } @凌驾公共无效累积（元组输入）引发IOException { ArrayList域=新的ArrayList（）； databagd=（DataBag）input.get（0）； //将所有域放入ArrayList以允许 //访问特定索引 for（元组t:d） { domains.add（（String）t.get（2））； } //为上一次迭代的“下一个域”值添加空字符串域。添加（“”）； int i=0；先前的=”； for（元组t:d） { next=domains.get（i+1）； Tuple t_new=TupleFactory.getInstance（）.newTuple（t.getAll（））； t_新.追加（先前）； t_new.append（下一步）； outputBag.add（t_new）； //当前域是下一次迭代的前一个域 previous=domains.get（i）； i++； } } @凌驾公共空间清理（） { this.outputBag=BagFactory.getInstance（）.newDefaultBag（）； } @凌驾公共数据包getValue（） { 返回输出包； } @凌驾公共模式输出模式（模式输入） { 尝试 { Schema.FieldSchema inputFieldSchema=input.getField（0）； if（inputFieldSchema.type！=DataType.BAG） { 抛出新的RuntimeException（“需要一个包作为输入”）； } Schema inputBagSchema=inputFieldSchema.Schema； if（inputBagSchema.getField（0.type）！=DataType.TUPLE） { 抛出新的RuntimeException（String.format（“预期输入包包含元组，但找到了%s”），DataType.findTypeName（inputBagSchema.getField（0.type））； } Schema inputTupleSchema=inputBagSchema.getField（0）.Schema； Schema OutputUpleSchema=InputUpleSchema.clone（）； add（newschema.FieldSchema（“previous_domain”，DataType.CHARARRAY））； add（newschema.FieldSchema（“next_domain”，DataType.CHARARRAY））；返回新模式（new Schema.FieldSchema（getSchemaName（this.getClass（）.getName（）.toLowerCase（），input），OutputUpleSchema，DataType.BAG））； } 捕获（CloneNotSupportedException e） { 抛出新的运行时异常（e）； } 捕获（前端异常e） { 抛出新的运行时异常（e）； } } }
你能提供一些输入和输出数据吗？@alexeipab我添加了一个输入数据示例，以及一个来自
会话的元组示例。输出还不存在，我需要有下一个域和上一个域的应答者uid ，会话id 。我可以将会话关系分组，但我不确定如何获得下一个和上一个会话。如果我能提供任何其他有用的信息，请让我知道只是一个更新。我现在正试图走UDF路线。我会随着进度和失败的发生继续更新。我有一个不太好的UDF解决方案在工作！一旦我清理了它，并做了一些调整，我将添加它作为任何其他遇到类似情况的人的答案。 register s3://TestBucket/Sessionize.jar define Sessionize datafu.pig.sessions.Sessionize('30m'); A = load 's3://TestBucket/party2.gz' USING PigStorage() as (id: chararray, data_date: chararray, rule_code: chararray, project_uid: chararray, respondent_uid: chararray, type: chararray, tab_id: chararray, url_domain: chararray, url_path: chararray, duration: chararray, exit_cause: chararray, details: chararray); B = foreach A generate $1, $0, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11; C = filter B by id neq 'ID'; VIEWS = group C by (respondent_uid, url_domain); SESSIONS = foreach VIEWS { VISITS = order C by data_date; generate FLATTEN(Sessionize(VISITS)) as (data_date: chararray, id: chararray, rule_code: chararray, project_uid: chararray, respondent_uid: chararray, type: chararray, tab_id: chararray, url_domain: chararray, url_path: chararray, duration: chararray, exit_cause: chararray, details: chararray, session_id); } import java.io.IOException; import java.util.ArrayList; import org.apache.pig.Accumulator; import org.apache.pig.EvalFunc; import org.apache.pig.data.BagFactory; import org.apache.pig.data.DataBag; import org.apache.pig.data.DataType; import org.apache.pig.data.Tuple; import org.apache.pig.data.TupleFactory; import org.apache.pig.impl.logicalLayer.FrontendException; import org.apache.pig.impl.logicalLayer.schema.Schema; public class PreviousNext extends EvalFunc<DataBag> implements Accumulator<DataBag> { private DataBag outputBag; private String previous; private String next; public PreviousNext() { cleanup(); } @Override public DataBag exec(Tuple input) throws IOException { accumulate(input); DataBag outputBag = getValue(); cleanup(); return outputBag; } @Override public void accumulate(Tuple input) throws IOException { ArrayList<String> domains = new 
ArrayList<String>(); DataBag d = (DataBag)input.get(0); //put all domains into ArrayList to allow for //accessing specific indexes for(Tuple t : d) { domains.add((String)t.get(2)); } //add empty string for "next domain" value for last iteration domains.add(""); int i = 0; previous = ""; for(Tuple t : d) { next = domains.get(i+1); Tuple t_new = TupleFactory.getInstance().newTuple(t.getAll()); t_new.append(previous); t_new.append(next); outputBag.add(t_new); //current domain is previous for next iteration previous = domains.get(i); i++; } } @Override public void cleanup() { this.outputBag = BagFactory.getInstance().newDefaultBag(); } @Override public DataBag getValue() { return outputBag; } @Override public Schema outputSchema(Schema input) { try { Schema.FieldSchema inputFieldSchema = input.getField(0); if (inputFieldSchema.type != DataType.BAG) { throw new RuntimeException("Expected a BAG as input"); } Schema inputBagSchema = inputFieldSchema.schema; if (inputBagSchema.getField(0).type != DataType.TUPLE) { throw new RuntimeException(String.format("Expected input bag to contain a TUPLE, but instead found %s", DataType.findTypeName(inputBagSchema.getField(0).type))); } Schema inputTupleSchema = inputBagSchema.getField(0).schema; Schema outputTupleSchema = inputTupleSchema.clone(); outputTupleSchema.add(new Schema.FieldSchema("previous_domain", DataType.CHARARRAY)); outputTupleSchema.add(new Schema.FieldSchema("next_domain", DataType.CHARARRAY)); return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input),outputTupleSchema,DataType.BAG)); } catch (CloneNotSupportedException e) { throw new RuntimeException(e); } catch (FrontendException e) { throw new RuntimeException(e); } } }