Java Apache Hbase MapReduce作业在读取数据存储时花费太多时间_Java_Hadoop_Mapreduce_Hbase_Nutch

Java Apache Hbase MapReduce作业在读取数据存储时花费太多时间

java hadoop mapreduce hbase

Java Apache Hbase MapReduce作业在读取数据存储时花费太多时间,java,hadoop,mapreduce,hbase,nutch,Java,Hadoop,Mapreduce,Hbase,Nutch,我已经安装了ApacheHBase、Nutch和Hadoop集群。我已经抓取了几个文档，即大约3000万个。集群中有3名工作人员和1名主控人员。我已经编写了自己的Hbase mapreduce作业来读取爬网数据，并根据一些逻辑更改一些分数为此，我合并了同一域的文档，找到了它们的有效字节，并找到了一些分数。稍后，在reducer中，我将该分数分配给该域的每个URL（通过缓存）。这部分工作花费了大量时间，即16小时。下面是代码片段 for（int index=0；indexmaxURL\u len

我已经安装了ApacheHBase、Nutch和Hadoop集群。我已经抓取了几个文档，即大约3000万个。集群中有3名工作人员和1名主控人员。我已经编写了自己的Hbase mapreduce作业来读取爬网数据，并根据一些逻辑更改一些分数

为此，我合并了同一域的文档，找到了它们的有效字节，并找到了一些分数。稍后，在reducer中，我将该分数分配给该域的每个URL（通过缓存）。这部分工作花费了大量时间，即16小时。下面是代码片段

for（int index=0；index


若我从数据存储中删除那个文档读取语句，那个么这项工作只需2到3个小时就可以完成。这就是为什么，我认为语句WebPage=datastore.get（Orig_key）导致此问题。是吧?？
如果是这样，那么什么是最好的方法。缓存对象只是一个包含相同域URL的列表
DomainAnalysisJob.java
...

公共类DomainAnalysisJob实现工具{
公共静态最终记录器日志=LoggerFactory
.getLogger（DomainAnalysisJob.class）；
私有静态最终集合字段=new HashSet（）；
私有配置配置；
受保护的静态最终Utf8 URL_ORIG_KEY=新Utf8（“文档原始id”）；
受保护的静态最终Utf8文件\虚拟\标记=新Utf8（“文件\标记”）；
受保护静态最终Utf8伪密钥=新Utf8（“文件id”）；
受保护的静态最终Utf8域\伪\标记=新Utf8（“域\标记”）；
受保护的静态最终Utf8链路_标记=新的Utf8（“链路”）；
受保护的静态最终Utf8队列=新Utf8（“q”）；
私有静态URLnormalizer URLnormalizer；
专用静态URL过滤器；
私有静态int maxURL_长度；
静止的{
字段。添加（网页。字段。状态）；
字段。添加（网页。字段。语言信息）；
字段。添加（网页。字段。乌尔都语分数）；
字段。添加（网页。字段。标记）；
添加（WebPage.Field.INLINKS）；
}
/**
*将每个网页映射到主机密钥。
*/
公共静态类映射器扩展了静态类映射器{
@凌驾
受保护的无效设置（上下文上下文）引发IOException、InterruptedException{
conf=context.getConfiguration（）；
urlNormalizers=新的urlNormalizers（context.getConfiguration（），urlNormalizers.SCOPE\u默认值）；
filters=新的URLFilters（context.getConfiguration（））；
maxURL_Length=conf.getInt（“url.characters.max.Length”，2000）；
}
@凌驾
受保护的无效映射（字符串键、网页页、上下文）
抛出IOException、InterruptedException{
字符串reversedHost=null；
如果（第==null页）{
返回；
}
if（key.length（）>maxURL\u length）{
返回；
}
字符串url=null；
试一试{
url=TableUtil.unreverseUrl（键）；
url=urlNormalizers.normalize（url，urlNormalizers.SCOPE\u默认值）；
url=过滤器。过滤器（url）；//过滤url
}捕获（例外e）{
LOG.warn（“跳过“+key+”：“+e”）；
返回；
}
如果（url==null）{
getCounter（“DomainAnalysis”、“FilteredURL”）。增量（1）；
返回；
}
试一试{
reversedHost=TableUtil.getReversedHost（key.toString（））；
} 
捕获（例外e）{
返回；
}
page.getMarkers（）.put（URL_ORIG_键，新Utf8（键））；
编写（新文本（反向主机），第页）；
}
}
公共域分析作业（）{
}
公共域分析作业（配置配置）{
setConf（conf）；
}
@凌驾
公共配置getConf（）{
返回形态；
}
@凌驾
公共无效设置配置（配置配置配置）{
this.conf=conf；
}
public void updateDomains（布尔buildLinkDb，int numTasks）引发异常{
NutchJob job=NutchJob.getInstance（getConf（），“rankDomain update”）；
job.getConfiguration（）.setInt（“mapreduce.task.timeout”，1800000）；
如果（numTasks<1）{
job.setNumReduceTasks（job.getConfiguration（）.getInt(
“mapred.map.tasks”，job.getNumReduceTasks（））；
}否则{
job.setNumReduceTasks（numTasks）；
}
ScoringFilters ScoringFilters=新的ScoringFilters（getConf（））；
HashSet字段=新的HashSet（字段）；
fields.addAll（scoringFilters.getFields（））；
StorageUtils.initMapperJob（作业、字段、Text.class、WebPage.class、，
Mapper.class）；
initReducerJob（job，DomainAnalysisReducer.class）；
job.waitForCompletion（true）；
}
@凌驾
公共int运行（字符串[]args）引发异常{
布尔linkDb=false；
int numTasks=-1；
对于（int i=0；i

DomainAnalysisReducer.java

。。。
...
公共类DomainAnalysisReducer扩展
戈拉减速器{
公共街
public class DomainAnalysisJob implements Tool {

  public static final Logger LOG = LoggerFactory
      .getLogger(DomainAnalysisJob.class);
  private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();

  private Configuration conf;
  protected static final Utf8 URL_ORIG_KEY = new Utf8("doc_orig_id");
  protected static final Utf8 DOC_DUMMY_MARKER = new Utf8("doc_marker");
  protected static final Utf8 DUMMY_KEY = new Utf8("doc_id");
  protected static final Utf8 DOMAIN_DUMMY_MARKER = new Utf8("domain_marker");
  protected static final Utf8 LINK_MARKER = new Utf8("link");
  protected static final Utf8 Queue = new Utf8("q");

  private static URLNormalizers urlNormalizers;
  private static URLFilters filters;
  private static int maxURL_Length;

  static {
    FIELDS.add(WebPage.Field.STATUS);
    FIELDS.add(WebPage.Field.LANG_INFO);
    FIELDS.add(WebPage.Field.URDU_SCORE);
    FIELDS.add(WebPage.Field.MARKERS);
    FIELDS.add(WebPage.Field.INLINKS);
  }

  /**
   * Maps each WebPage to a host key.
   */
  public static class Mapper extends GoraMapper<String, WebPage, Text, WebPage> {

      @Override
        protected void setup(Context context) throws IOException ,InterruptedException {
          Configuration conf = context.getConfiguration();
          urlNormalizers = new URLNormalizers(context.getConfiguration(), URLNormalizers.SCOPE_DEFAULT);
          filters = new URLFilters(context.getConfiguration());
          maxURL_Length = conf.getInt("url.characters.max.length", 2000);
        }

    @Override
    protected void map(String key, WebPage page, Context context)
        throws IOException, InterruptedException {

     String reversedHost = null;
     if (page == null) {
         return;
     }
    if ( key.length() > maxURL_Length ) {
        return;
    }
     String url = null;
     try {
         url = TableUtil.unreverseUrl(key);
         url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
         url = filters.filter(url); // filter the url
       } catch (Exception e) {
         LOG.warn("Skipping " + key + ":" + e);
         return;
       }
     if ( url == null) {
         context.getCounter("DomainAnalysis", "FilteredURL").increment(1);
         return;
     }
     try {
         reversedHost = TableUtil.getReversedHost(key.toString());
     } 
     catch (Exception e) {
        return;
    }
     page.getMarkers().put( URL_ORIG_KEY, new Utf8(key) );

     context.write( new Text(reversedHost), page );

    }
  }

  public DomainAnalysisJob() {
  }

  public DomainAnalysisJob(Configuration conf) {
    setConf(conf);
  }

  @Override
  public Configuration getConf() {
    return conf;
  }

  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;
   }

  public void updateDomains(boolean buildLinkDb, int numTasks) throws Exception {


    NutchJob job = NutchJob.getInstance(getConf(), "rankDomain-update");

    job.getConfiguration().setInt("mapreduce.task.timeout", 1800000);

    if ( numTasks < 1) {
        job.setNumReduceTasks(job.getConfiguration().getInt(
            "mapred.map.tasks", job.getNumReduceTasks()));
      } else {
        job.setNumReduceTasks(numTasks);
      }
    ScoringFilters scoringFilters = new ScoringFilters(getConf());
    HashSet<WebPage.Field> fields = new HashSet<WebPage.Field>(FIELDS);
    fields.addAll(scoringFilters.getFields());

    StorageUtils.initMapperJob(job, fields, Text.class, WebPage.class,
            Mapper.class);
    StorageUtils.initReducerJob(job, DomainAnalysisReducer.class);


    job.waitForCompletion(true);
  }

  @Override
  public int run(String[] args) throws Exception {
    boolean linkDb = false;
    int numTasks = -1;
    for (int i = 0; i < args.length; i++) {
      if ("-rankDomain".equals(args[i])) {
        linkDb = true;
      } else if ("-crawlId".equals(args[i])) {
        getConf().set(Nutch.CRAWL_ID_KEY, args[++i]);
      } else if ("-numTasks".equals(args[i]) ) {
          numTasks = Integer.parseInt(args[++i]);
      }
      else {
        throw new IllegalArgumentException("unrecognized arg " + args[i]
            + " usage: updatedomain -crawlId <crawlId> [-numTasks N]" );
      }
    }
    LOG.info("Updating DomainRank:");
    updateDomains(linkDb, numTasks);
    return 0;
  }

  public static void main(String[] args) throws Exception {
    final int res = ToolRunner.run(NutchConfiguration.create(),
        new DomainAnalysisJob(), args);
    System.exit(res);
  }
}

...
...
public class DomainAnalysisReducer extends
    GoraReducer<Text, WebPage, String, WebPage> {

    public static final Logger LOG = DomainAnalysisJob.LOG;
    public DataStore<String, WebPage> datastore;

    protected static float q1_ur_threshold = 500.0f;
    protected static float q1_ur_docCount = 50;
    public static final Utf8 Queue = new Utf8("q");     // Markers for Q1 and Q2
    public static final Utf8 Q1 = new Utf8("q1");           
    public static final Utf8 Q2 = new Utf8("q2");

      @Override
      protected void setup(Context context) throws IOException,
      InterruptedException {
        Configuration conf = context.getConfiguration();
        try {
          datastore = StorageUtils.createWebStore(conf, String.class, WebPage.class);
        }
        catch (ClassNotFoundException e) {
          throw new IOException(e);
        }
        q1_ur_threshold = conf.getFloat("domain.queue.threshold.bytes", 500.0f);
        q1_ur_docCount = conf.getInt("domain.queue.doc.count", 50);
        LOG.info("Conf updated: Queue-bytes-threshold = " + q1_ur_threshold + " Queue-doc-threshold: " + q1_ur_docCount);
      }

      @Override
      protected void cleanup(Context context) throws IOException, InterruptedException {
        datastore.close();
      }

  @Override
  protected void reduce(Text key, Iterable<WebPage> values, Context context)
      throws IOException, InterruptedException {

      ArrayList<String> Cache = new ArrayList<String>();

      int doc_counter = 0;
      int total_ur_bytes = 0;

    for ( WebPage page : values ) {

        // cache
        String orig_key = page.getMarkers().get( DomainAnalysisJob.URL_ORIG_KEY ).toString();
        Cache.add(orig_key);

        // do not consider those doc's that are not fetched or link URLs
        if ( page.getStatus() == CrawlStatus.STATUS_UNFETCHED ) {
         continue;
        }

        doc_counter++;
        int ur_score_int = 0;
        int doc_ur_bytes = 0;
        int doc_total_bytes = 0;
        String ur_score_str = "0";
        String langInfo_str = null;

        // read page and find its Urdu score
        langInfo_str = TableUtil.toString(page.getLangInfo());      
        if (langInfo_str == null) {
            continue;
        }
        ur_score_str = TableUtil.toString(page.getUrduScore());
        ur_score_int = Integer.parseInt(ur_score_str);
        doc_total_bytes = Integer.parseInt( langInfo_str.split("&")[0] );
        doc_ur_bytes = ( doc_total_bytes * ur_score_int) / 100;             //Formula to find ur percentage

        total_ur_bytes += doc_ur_bytes;     

    }
    float avg_bytes = 0;
    float log10 = 0;
    if ( doc_counter > 0 && total_ur_bytes > 0) {
        avg_bytes = (float) total_ur_bytes/doc_counter;
         log10 = (float) Math.log10(avg_bytes);
         log10 = (Math.round(log10 * 100000f)/100000f);
    }

    context.getCounter("DomainAnalysis", "DomainCount").increment(1);
    // if average bytes and doc count, are more than threshold then mark as q1
    boolean mark = false;
    if ( avg_bytes >= q1_ur_threshold && doc_counter >= q1_ur_docCount ) {
        mark = true; 

    for ( int index = 0; index < Cache.size(); index++) {

        String Orig_key = Cache.get(index);
        float doc_score = log10;

        WebPage page = datastore.get(Orig_key);
        if ( page == null ) {
            continue;
        }
        page.setScore(doc_score);

        if (mark) {
            page.getMarkers().put( Queue, Q1);
        }
        context.write(Orig_key, page);
    }
  }
}