Apache HBase MapReduce job takes too much time while reading the data store (Java, Hadoop, MapReduce, HBase, Nutch)


I have set up a cluster with Apache HBase, Nutch, and Hadoop. I have crawled a large number of documents, roughly 30 million. The cluster has 3 workers and 1 master. I have written my own HBase MapReduce job that reads the crawled data and updates some scores based on custom logic.

For this, I group the documents of the same domain, compute their effective bytes, and derive a score. Later, in the reducer, I assign that score to every URL of the domain (via a cache). This part of the job takes a huge amount of time, i.e., 16 hours. Here is the code snippet:

for ( int index = 0; index < Cache.size(); index++) {

    String Orig_key = Cache.get(index);
    float doc_score = log10;

    WebPage page = datastore.get(Orig_key);
    if ( page == null ) {
        continue;
    }
    page.setScore(doc_score);

    if (mark) {
        page.getMarkers().put( Queue, Q1);
    }
    context.write(Orig_key, page);
}
If I remove that document-read statement against the data store, the job finishes in only 2 to 3 hours. That is why I think the statement
WebPage page = datastore.get(Orig_key); is causing the problem. Am I right?
If so, what is the best approach? The Cache object is just a list holding the URLs of the same domain.
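
One way to avoid the per-URL random read would be to cache copies of the WebPage objects during the first pass over values, instead of only their keys, so that the second pass never needs to call datastore.get() at all. The sketch below is only an illustration of that idea, not a verified fix: it assumes it drops into the posted DomainAnalysisReducer (same fields, imports, and helper classes) and that the Avro-generated WebPage class provides a copying builder, WebPage.newBuilder(page).build(); a copy is required because Hadoop reuses the value object between iterations of values.

    @Override
    protected void reduce(Text key, Iterable<WebPage> values, Context context)
        throws IOException, InterruptedException {

      // First pass: compute the domain score exactly as before, but keep
      // in-memory copies of the pages instead of only their original keys.
      ArrayList<String> keys = new ArrayList<String>();
      ArrayList<WebPage> pages = new ArrayList<WebPage>();

      int doc_counter = 0;
      long total_ur_bytes = 0;

      for (WebPage page : values) {
        String origKey = page.getMarkers().get(DomainAnalysisJob.URL_ORIG_KEY).toString();
        keys.add(origKey);
        pages.add(WebPage.newBuilder(page).build());   // assumed Avro copy-builder; otherwise clone only the fields you rewrite

        // skip unfetched documents / bare link URLs, as in the original code
        if (page.getStatus() == CrawlStatus.STATUS_UNFETCHED) {
          continue;
        }
        doc_counter++;

        String langInfo_str = TableUtil.toString(page.getLangInfo());
        if (langInfo_str == null) {
          continue;
        }
        int ur_score_int = Integer.parseInt(TableUtil.toString(page.getUrduScore()));
        int doc_total_bytes = Integer.parseInt(langInfo_str.split("&")[0]);
        total_ur_bytes += (doc_total_bytes * ur_score_int) / 100;
      }

      float avg_bytes = 0f;
      float log10 = 0f;
      if (doc_counter > 0 && total_ur_bytes > 0) {
        avg_bytes = (float) total_ur_bytes / doc_counter;
        log10 = (float) Math.log10(avg_bytes);
      }
      boolean mark = (avg_bytes >= q1_ur_threshold && doc_counter >= q1_ur_docCount);

      // Second pass: work on the cached copies, so there is no random read
      // against the HBase-backed data store per URL.
      for (int i = 0; i < keys.size(); i++) {
        WebPage page = pages.get(i);
        page.setScore(log10);
        if (mark) {
          page.getMarkers().put(Queue, Q1);
        }
        context.write(keys.get(i), page);
      }
    }

The trade-off is heap usage: all pages of a single domain have to fit in the reducer's memory, so for very large domains this may not be viable and keeping the original two-pass design (or batching the reads) could still be preferable.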

DomainAnalysisJob.java ...

public class DomainAnalysisJob implements Tool {

  public static final Logger LOG = LoggerFactory
      .getLogger(DomainAnalysisJob.class);
  private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();

  private Configuration conf;
  protected static final Utf8 URL_ORIG_KEY = new Utf8("doc_orig_id");
  protected static final Utf8 DOC_DUMMY_MARKER = new Utf8("doc_marker");
  protected static final Utf8 DUMMY_KEY = new Utf8("doc_id");
  protected static final Utf8 DOMAIN_DUMMY_MARKER = new Utf8("domain_marker");
  protected static final Utf8 LINK_MARKER = new Utf8("link");
  protected static final Utf8 Queue = new Utf8("q");

  private static URLNormalizers urlNormalizers;
  private static URLFilters filters;
  private static int maxURL_Length;

  static {
    FIELDS.add(WebPage.Field.STATUS);
    FIELDS.add(WebPage.Field.LANG_INFO);
    FIELDS.add(WebPage.Field.URDU_SCORE);
    FIELDS.add(WebPage.Field.MARKERS);
    FIELDS.add(WebPage.Field.INLINKS);
  }

  /**
   * Maps each WebPage to a host key.
   */
  public static class Mapper extends GoraMapper<String, WebPage, Text, WebPage> {

      @Override
        protected void setup(Context context) throws IOException ,InterruptedException {
          Configuration conf = context.getConfiguration();
          urlNormalizers = new URLNormalizers(context.getConfiguration(), URLNormalizers.SCOPE_DEFAULT);
          filters = new URLFilters(context.getConfiguration());
          maxURL_Length = conf.getInt("url.characters.max.length", 2000);
        }

    @Override
    protected void map(String key, WebPage page, Context context)
        throws IOException, InterruptedException {

     String reversedHost = null;
     if (page == null) {
         return;
     }
    if ( key.length() > maxURL_Length ) {
        return;
    }
     String url = null;
     try {
         url = TableUtil.unreverseUrl(key);
         url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
         url = filters.filter(url); // filter the url
       } catch (Exception e) {
         LOG.warn("Skipping " + key + ":" + e);
         return;
       }
     if ( url == null) {
         context.getCounter("DomainAnalysis", "FilteredURL").increment(1);
         return;
     }
     try {
         reversedHost = TableUtil.getReversedHost(key.toString());
     } 
     catch (Exception e) {
        return;
    }
     page.getMarkers().put( URL_ORIG_KEY, new Utf8(key) );

     context.write( new Text(reversedHost), page );

    }
  }

  public DomainAnalysisJob() {
  }

  public DomainAnalysisJob(Configuration conf) {
    setConf(conf);
  }

  @Override
  public Configuration getConf() {
    return conf;
  }

  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;
   }

  public void updateDomains(boolean buildLinkDb, int numTasks) throws Exception {


    NutchJob job = NutchJob.getInstance(getConf(), "rankDomain-update");

    job.getConfiguration().setInt("mapreduce.task.timeout", 1800000);

    if ( numTasks < 1) {
        job.setNumReduceTasks(job.getConfiguration().getInt(
            "mapred.map.tasks", job.getNumReduceTasks()));
      } else {
        job.setNumReduceTasks(numTasks);
      }
    ScoringFilters scoringFilters = new ScoringFilters(getConf());
    HashSet<WebPage.Field> fields = new HashSet<WebPage.Field>(FIELDS);
    fields.addAll(scoringFilters.getFields());

    StorageUtils.initMapperJob(job, fields, Text.class, WebPage.class,
            Mapper.class);
    StorageUtils.initReducerJob(job, DomainAnalysisReducer.class);


    job.waitForCompletion(true);
  }

  @Override
  public int run(String[] args) throws Exception {
    boolean linkDb = false;
    int numTasks = -1;
    for (int i = 0; i < args.length; i++) {
      if ("-rankDomain".equals(args[i])) {
        linkDb = true;
      } else if ("-crawlId".equals(args[i])) {
        getConf().set(Nutch.CRAWL_ID_KEY, args[++i]);
      } else if ("-numTasks".equals(args[i]) ) {
          numTasks = Integer.parseInt(args[++i]);
      }
      else {
        throw new IllegalArgumentException("unrecognized arg " + args[i]
            + " usage: updatedomain -crawlId <crawlId> [-numTasks N]" );
      }
    }
    LOG.info("Updating DomainRank:");
    updateDomains(linkDb, numTasks);
    return 0;
  }

  public static void main(String[] args) throws Exception {
    final int res = ToolRunner.run(NutchConfiguration.create(),
        new DomainAnalysisJob(), args);
    System.exit(res);
  }
}
DomainAnalysisReducer.java

...
...
public class DomainAnalysisReducer extends
    GoraReducer<Text, WebPage, String, WebPage> {

    public static final Logger LOG = DomainAnalysisJob.LOG;
    public DataStore<String, WebPage> datastore;

    protected static float q1_ur_threshold = 500.0f;
    protected static float q1_ur_docCount = 50;
    public static final Utf8 Queue = new Utf8("q");     // Markers for Q1 and Q2
    public static final Utf8 Q1 = new Utf8("q1");           
    public static final Utf8 Q2 = new Utf8("q2");

      @Override
      protected void setup(Context context) throws IOException,
      InterruptedException {
        Configuration conf = context.getConfiguration();
        try {
          datastore = StorageUtils.createWebStore(conf, String.class, WebPage.class);
        }
        catch (ClassNotFoundException e) {
          throw new IOException(e);
        }
        q1_ur_threshold = conf.getFloat("domain.queue.threshold.bytes", 500.0f);
        q1_ur_docCount = conf.getInt("domain.queue.doc.count", 50);
        LOG.info("Conf updated: Queue-bytes-threshold = " + q1_ur_threshold + " Queue-doc-threshold: " + q1_ur_docCount);
      }

      @Override
      protected void cleanup(Context context) throws IOException, InterruptedException {
        datastore.close();
      }

  @Override
  protected void reduce(Text key, Iterable<WebPage> values, Context context)
      throws IOException, InterruptedException {

      ArrayList<String> Cache = new ArrayList<String>();

      int doc_counter = 0;
      int total_ur_bytes = 0;

    for ( WebPage page : values ) {

        // cache
        String orig_key = page.getMarkers().get( DomainAnalysisJob.URL_ORIG_KEY ).toString();
        Cache.add(orig_key);

        // do not consider those doc's that are not fetched or link URLs
        if ( page.getStatus() == CrawlStatus.STATUS_UNFETCHED ) {
         continue;
        }

        doc_counter++;
        int ur_score_int = 0;
        int doc_ur_bytes = 0;
        int doc_total_bytes = 0;
        String ur_score_str = "0";
        String langInfo_str = null;

        // read page and find its Urdu score
        langInfo_str = TableUtil.toString(page.getLangInfo());      
        if (langInfo_str == null) {
            continue;
        }
        ur_score_str = TableUtil.toString(page.getUrduScore());
        ur_score_int = Integer.parseInt(ur_score_str);
        doc_total_bytes = Integer.parseInt( langInfo_str.split("&")[0] );
        doc_ur_bytes = ( doc_total_bytes * ur_score_int) / 100;             //Formula to find ur percentage

        total_ur_bytes += doc_ur_bytes;     

    }
    float avg_bytes = 0;
    float log10 = 0;
    if ( doc_counter > 0 && total_ur_bytes > 0) {
        avg_bytes = (float) total_ur_bytes/doc_counter;
         log10 = (float) Math.log10(avg_bytes);
         log10 = (Math.round(log10 * 100000f)/100000f);
    }

    context.getCounter("DomainAnalysis", "DomainCount").increment(1);
    // if average bytes and doc count, are more than threshold then mark as q1
    boolean mark = false;
    if ( avg_bytes >= q1_ur_threshold && doc_counter >= q1_ur_docCount ) {
        mark = true;
    }

    for ( int index = 0; index < Cache.size(); index++) {

        String Orig_key = Cache.get(index);
        float doc_score = log10;

        WebPage page = datastore.get(Orig_key);
        if ( page == null ) {
            continue;
        }
        page.setScore(doc_score);

        if (mark) {
            page.getMarkers().put( Queue, Q1);
        }
        context.write(Orig_key, page);
    }
  }
}