Java Apache Hbase MapReduce作业在读取数据存储时花费太多时间
我已经安装了ApacheHBase、Nutch和Hadoop集群。我已经抓取了几个文档,即大约3000万个。集群中有3名工作人员和1名主控人员。我已经编写了自己的Hbase mapreduce作业来读取爬网数据,并根据一些逻辑更改一些分数 为此,我合并了同一域的文档,找到了它们的有效字节,并找到了一些分数。稍后,在reducer中,我将该分数分配给该域的每个URL(通过缓存)。这部分工作花费了大量时间,即16小时。下面是代码片段Java Apache Hbase MapReduce作业在读取数据存储时花费太多时间,java,hadoop,mapreduce,hbase,nutch,Java,Hadoop,Mapreduce,Hbase,Nutch,我已经安装了ApacheHBase、Nutch和Hadoop集群。我已经抓取了几个文档,即大约3000万个。集群中有3名工作人员和1名主控人员。我已经编写了自己的Hbase mapreduce作业来读取爬网数据,并根据一些逻辑更改一些分数 为此,我合并了同一域的文档,找到了它们的有效字节,并找到了一些分数。稍后,在reducer中,我将该分数分配给该域的每个URL(通过缓存)。这部分工作花费了大量时间,即16小时。下面是代码片段 for(int index=0;indexmaxURL\u len
for(int index=0;index
若我从数据存储中删除那个文档读取语句,那个么这项工作只需2到3个小时就可以完成。这就是为什么,我认为语句WebPage=datastore.get(Orig_key)代码>导致此问题。是吧??
如果是这样,那么什么是最好的方法。缓存对象只是一个包含相同域URL的列表
DomainAnalysisJob.java
...
公共类DomainAnalysisJob实现工具{
公共静态最终记录器日志=LoggerFactory
.getLogger(DomainAnalysisJob.class);
私有静态最终集合字段=new HashSet();
私有配置配置;
受保护的静态最终Utf8 URL_ORIG_KEY=新Utf8(“文档原始id”);
受保护的静态最终Utf8文件\虚拟\标记=新Utf8(“文件\标记”);
受保护静态最终Utf8伪密钥=新Utf8(“文件id”);
受保护的静态最终Utf8域\伪\标记=新Utf8(“域\标记”);
受保护的静态最终Utf8链路_标记=新的Utf8(“链路”);
受保护的静态最终Utf8队列=新Utf8(“q”);
私有静态URLnormalizer URLnormalizer;
专用静态URL过滤器;
私有静态int maxURL_长度;
静止的{
字段。添加(网页。字段。状态);
字段。添加(网页。字段。语言信息);
字段。添加(网页。字段。乌尔都语分数);
字段。添加(网页。字段。标记);
添加(WebPage.Field.INLINKS);
}
/**
*将每个网页映射到主机密钥。
*/
公共静态类映射器扩展了静态类映射器{
@凌驾
受保护的无效设置(上下文上下文)引发IOException、InterruptedException{
conf=context.getConfiguration();
urlNormalizers=新的urlNormalizers(context.getConfiguration(),urlNormalizers.SCOPE\u默认值);
filters=新的URLFilters(context.getConfiguration());
maxURL_Length=conf.getInt(“url.characters.max.Length”,2000);
}
@凌驾
受保护的无效映射(字符串键、网页页、上下文)
抛出IOException、InterruptedException{
字符串reversedHost=null;
如果(第==null页){
返回;
}
if(key.length()>maxURL\u length){
返回;
}
字符串url=null;
试一试{
url=TableUtil.unreverseUrl(键);
url=urlNormalizers.normalize(url,urlNormalizers.SCOPE\u默认值);
url=过滤器。过滤器(url);//过滤url
}捕获(例外e){
LOG.warn(“跳过“+key+”:“+e”);
返回;
}
如果(url==null){
getCounter(“DomainAnalysis”、“FilteredURL”)。增量(1);
返回;
}
试一试{
reversedHost=TableUtil.getReversedHost(key.toString());
}
捕获(例外e){
返回;
}
page.getMarkers().put(URL_ORIG_键,新Utf8(键));
编写(新文本(反向主机),第页);
}
}
公共域分析作业(){
}
公共域分析作业(配置配置){
setConf(conf);
}
@凌驾
公共配置getConf(){
返回形态;
}
@凌驾
公共无效设置配置(配置配置配置){
this.conf=conf;
}
public void updateDomains(布尔buildLinkDb,int numTasks)引发异常{
NutchJob job=NutchJob.getInstance(getConf(),“rankDomain update”);
job.getConfiguration().setInt(“mapreduce.task.timeout”,1800000);
如果(numTasks<1){
job.setNumReduceTasks(job.getConfiguration().getInt(
“mapred.map.tasks”,job.getNumReduceTasks());
}否则{
job.setNumReduceTasks(numTasks);
}
ScoringFilters ScoringFilters=新的ScoringFilters(getConf());
HashSet字段=新的HashSet(字段);
fields.addAll(scoringFilters.getFields());
StorageUtils.initMapperJob(作业、字段、Text.class、WebPage.class、,
Mapper.class);
initReducerJob(job,DomainAnalysisReducer.class);
job.waitForCompletion(true);
}
@凌驾
公共int运行(字符串[]args)引发异常{
布尔linkDb=false;
int numTasks=-1;
对于(int i=0;i
DomainAnalysisReducer.java
。。。
...
公共类DomainAnalysisReducer扩展
戈拉减速器{
公共街
public class DomainAnalysisJob implements Tool {
public static final Logger LOG = LoggerFactory
.getLogger(DomainAnalysisJob.class);
private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
private Configuration conf;
protected static final Utf8 URL_ORIG_KEY = new Utf8("doc_orig_id");
protected static final Utf8 DOC_DUMMY_MARKER = new Utf8("doc_marker");
protected static final Utf8 DUMMY_KEY = new Utf8("doc_id");
protected static final Utf8 DOMAIN_DUMMY_MARKER = new Utf8("domain_marker");
protected static final Utf8 LINK_MARKER = new Utf8("link");
protected static final Utf8 Queue = new Utf8("q");
private static URLNormalizers urlNormalizers;
private static URLFilters filters;
private static int maxURL_Length;
static {
FIELDS.add(WebPage.Field.STATUS);
FIELDS.add(WebPage.Field.LANG_INFO);
FIELDS.add(WebPage.Field.URDU_SCORE);
FIELDS.add(WebPage.Field.MARKERS);
FIELDS.add(WebPage.Field.INLINKS);
}
/**
* Maps each WebPage to a host key.
*/
public static class Mapper extends GoraMapper<String, WebPage, Text, WebPage> {
@Override
protected void setup(Context context) throws IOException ,InterruptedException {
Configuration conf = context.getConfiguration();
urlNormalizers = new URLNormalizers(context.getConfiguration(), URLNormalizers.SCOPE_DEFAULT);
filters = new URLFilters(context.getConfiguration());
maxURL_Length = conf.getInt("url.characters.max.length", 2000);
}
@Override
protected void map(String key, WebPage page, Context context)
throws IOException, InterruptedException {
String reversedHost = null;
if (page == null) {
return;
}
if ( key.length() > maxURL_Length ) {
return;
}
String url = null;
try {
url = TableUtil.unreverseUrl(key);
url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
url = filters.filter(url); // filter the url
} catch (Exception e) {
LOG.warn("Skipping " + key + ":" + e);
return;
}
if ( url == null) {
context.getCounter("DomainAnalysis", "FilteredURL").increment(1);
return;
}
try {
reversedHost = TableUtil.getReversedHost(key.toString());
}
catch (Exception e) {
return;
}
page.getMarkers().put( URL_ORIG_KEY, new Utf8(key) );
context.write( new Text(reversedHost), page );
}
}
public DomainAnalysisJob() {
}
public DomainAnalysisJob(Configuration conf) {
setConf(conf);
}
@Override
public Configuration getConf() {
return conf;
}
@Override
public void setConf(Configuration conf) {
this.conf = conf;
}
public void updateDomains(boolean buildLinkDb, int numTasks) throws Exception {
NutchJob job = NutchJob.getInstance(getConf(), "rankDomain-update");
job.getConfiguration().setInt("mapreduce.task.timeout", 1800000);
if ( numTasks < 1) {
job.setNumReduceTasks(job.getConfiguration().getInt(
"mapred.map.tasks", job.getNumReduceTasks()));
} else {
job.setNumReduceTasks(numTasks);
}
ScoringFilters scoringFilters = new ScoringFilters(getConf());
HashSet<WebPage.Field> fields = new HashSet<WebPage.Field>(FIELDS);
fields.addAll(scoringFilters.getFields());
StorageUtils.initMapperJob(job, fields, Text.class, WebPage.class,
Mapper.class);
StorageUtils.initReducerJob(job, DomainAnalysisReducer.class);
job.waitForCompletion(true);
}
@Override
public int run(String[] args) throws Exception {
boolean linkDb = false;
int numTasks = -1;
for (int i = 0; i < args.length; i++) {
if ("-rankDomain".equals(args[i])) {
linkDb = true;
} else if ("-crawlId".equals(args[i])) {
getConf().set(Nutch.CRAWL_ID_KEY, args[++i]);
} else if ("-numTasks".equals(args[i]) ) {
numTasks = Integer.parseInt(args[++i]);
}
else {
throw new IllegalArgumentException("unrecognized arg " + args[i]
+ " usage: updatedomain -crawlId <crawlId> [-numTasks N]" );
}
}
LOG.info("Updating DomainRank:");
updateDomains(linkDb, numTasks);
return 0;
}
public static void main(String[] args) throws Exception {
final int res = ToolRunner.run(NutchConfiguration.create(),
new DomainAnalysisJob(), args);
System.exit(res);
}
}
...
...
public class DomainAnalysisReducer extends
GoraReducer<Text, WebPage, String, WebPage> {
public static final Logger LOG = DomainAnalysisJob.LOG;
public DataStore<String, WebPage> datastore;
protected static float q1_ur_threshold = 500.0f;
protected static float q1_ur_docCount = 50;
public static final Utf8 Queue = new Utf8("q"); // Markers for Q1 and Q2
public static final Utf8 Q1 = new Utf8("q1");
public static final Utf8 Q2 = new Utf8("q2");
@Override
protected void setup(Context context) throws IOException,
InterruptedException {
Configuration conf = context.getConfiguration();
try {
datastore = StorageUtils.createWebStore(conf, String.class, WebPage.class);
}
catch (ClassNotFoundException e) {
throw new IOException(e);
}
q1_ur_threshold = conf.getFloat("domain.queue.threshold.bytes", 500.0f);
q1_ur_docCount = conf.getInt("domain.queue.doc.count", 50);
LOG.info("Conf updated: Queue-bytes-threshold = " + q1_ur_threshold + " Queue-doc-threshold: " + q1_ur_docCount);
}
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
datastore.close();
}
@Override
protected void reduce(Text key, Iterable<WebPage> values, Context context)
throws IOException, InterruptedException {
ArrayList<String> Cache = new ArrayList<String>();
int doc_counter = 0;
int total_ur_bytes = 0;
for ( WebPage page : values ) {
// cache
String orig_key = page.getMarkers().get( DomainAnalysisJob.URL_ORIG_KEY ).toString();
Cache.add(orig_key);
// do not consider those doc's that are not fetched or link URLs
if ( page.getStatus() == CrawlStatus.STATUS_UNFETCHED ) {
continue;
}
doc_counter++;
int ur_score_int = 0;
int doc_ur_bytes = 0;
int doc_total_bytes = 0;
String ur_score_str = "0";
String langInfo_str = null;
// read page and find its Urdu score
langInfo_str = TableUtil.toString(page.getLangInfo());
if (langInfo_str == null) {
continue;
}
ur_score_str = TableUtil.toString(page.getUrduScore());
ur_score_int = Integer.parseInt(ur_score_str);
doc_total_bytes = Integer.parseInt( langInfo_str.split("&")[0] );
doc_ur_bytes = ( doc_total_bytes * ur_score_int) / 100; //Formula to find ur percentage
total_ur_bytes += doc_ur_bytes;
}
float avg_bytes = 0;
float log10 = 0;
if ( doc_counter > 0 && total_ur_bytes > 0) {
avg_bytes = (float) total_ur_bytes/doc_counter;
log10 = (float) Math.log10(avg_bytes);
log10 = (Math.round(log10 * 100000f)/100000f);
}
context.getCounter("DomainAnalysis", "DomainCount").increment(1);
// if average bytes and doc count, are more than threshold then mark as q1
boolean mark = false;
if ( avg_bytes >= q1_ur_threshold && doc_counter >= q1_ur_docCount ) {
mark = true;
for ( int index = 0; index < Cache.size(); index++) {
String Orig_key = Cache.get(index);
float doc_score = log10;
WebPage page = datastore.get(Orig_key);
if ( page == null ) {
continue;
}
page.setScore(doc_score);
if (mark) {
page.getMarkers().put( Queue, Q1);
}
context.write(Orig_key, page);
}
}
}