Java 将 Common Crawl 关键字搜索脚本转换为 Hadoop EMR 脚本
我已经构建了一个关键字搜索脚本,它从 EC2 运行,并成功地将输出保存在 S3 上。但它是单线程的,这就是它速度慢的原因。我想使用自定义 jar 在 EMR 上运行它。谁能把这个转换成 Hadoop 脚本,这样我就可以在 EMR 上运行它?我是 Hadoop 新手。我尝试了以下仓库,但没有成功;然后我将这两个仓库的代码混合在一起,制作了下面的脚本:
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveRecord;
import org.archive.io.warc.WARCReaderFactory;
import org.jets3t.service.S3Service;
import org.jets3t.service.S3ServiceException;
import org.jets3t.service.impl.rest.httpclient.RestS3Service;
import org.jets3t.service.model.S3Object;
import com.amazonaws.AmazonClientException;
import com.amazonaws.AmazonServiceException;
import com.amazonaws.auth.AWSCredentials;
import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.model.ObjectListing;
import com.amazonaws.services.s3.model.ObjectMetadata;
import com.amazonaws.services.s3.model.S3ObjectSummary;
public class S3BucketReader3 {

    /** Keywords searched for (case-insensitive) inside each HTML page. */
    private static final String[] KEYWORDS = {
            "gambler", "rich", "name list", "2nd rich generation",
            "2nd official generation", "gambler addict", "gamble", "shanxi",
            "macau", "rich businessman", "tour", "smoking" };

    /**
     * Scans the WARC archives under {@code prefix} in {@code bucketName} and,
     * for every HTTP response record whose HTML body contains one of the
     * search keywords, writes the page out to S3.
     *
     * @param s3         client used for listing input and writing results
     * @param bucketName bucket holding the Common Crawl data
     * @param prefix     key prefix to scan (e.g. one crawl segment)
     * @param max        maximum number of listed objects to process, or -1
     *                   for no limit
     */
    static public void process(AmazonS3 s3, String bucketName, String prefix,
            int max) throws S3ServiceException, AmazonServiceException,
            AmazonClientException, InterruptedException {
        int counter = 0;
        int fileCounter = 1;
        // anonymous jets3t client: the Common Crawl bucket is public
        S3Service s3s = new RestS3Service(null);
        // use a callback class for handling WARC record data:
        IProcessWarcRecord processor = new SampleProcessWarcRecord();
        ObjectListing list = s3.listObjects(bucketName, prefix);
        do {
            List<S3ObjectSummary> summaries = list.getObjectSummaries();
            for (S3ObjectSummary summary : summaries) {
                try {
                    // get single warc.gz file name
                    String key = summary.getKey();
                    System.out.println("+ key: " + counter + " " + key);
                    // only WARC archives are of interest
                    if (!key.contains(".warc.gz")) {
                        continue;
                    }
                    S3Object f = s3s.getObject("aws-publicdatasets", key, null,
                            null, null, null, null, null);
                    ArchiveReader ar = WARCReaderFactory.get(key,
                            f.getDataInputStream(), true);
                    try {
                        fileCounter = scanArchive(s3, ar, fileCounter);
                    } finally {
                        // BUG FIX: the reader (and its S3 stream) was never
                        // closed in the original, leaking one connection per
                        // archive
                        ar.close();
                    }
                } catch (Exception ex) {
                    // best-effort: one corrupt archive must not abort the run
                    ex.printStackTrace();
                }
                counter++;
                System.out.println("Count no: " + counter);
                // original kept a second, always-equal maxCount; one counter
                // is enough
                if (max != -1 && counter >= max) {
                    return;
                }
            }
            list = s3.listNextBatchOfObjects(list);
        } while (list.isTruncated());
        // done processing all WARC records:
        processor.done();
    }

    /**
     * Iterates all records of one WARC archive and uploads every
     * keyword-matching HTML response.
     *
     * @return the updated output-file counter
     */
    private static int scanArchive(AmazonS3 s3, ArchiveReader ar,
            int fileCounter) throws IOException {
        for (ArchiveRecord r : ar) {
            // The header contains the record type, size, creation time and
            // URL; only HTTP response records carry page bodies.
            String mime = r.getHeader().getMimetype();
            if (mime == null
                    || !mime.contains("application/http; msgtype=response")) {
                continue;
            }
            // BUG FIX: the original called IOUtils.toByteArray(r, ...) and
            // then r.read(rawData) again, which read PAST the record and
            // clobbered the buffer. Reading once is correct and sufficient.
            byte[] rawData = IOUtils.toByteArray(r, r.available());
            // BUG FIX: decode with an explicit charset instead of the
            // platform default
            String content = new String(rawData,
                    java.nio.charset.StandardCharsets.UTF_8);
            int htmlStart = content.indexOf("<!DOCTYPE html");
            if (htmlStart < 0) {
                continue;
            }
            // strip the HTTP header block preceding the document (the
            // original removed it and re-prepended the doctype — same result)
            content = content.substring(htmlStart);
            if (containsKeyword(content.toLowerCase())) {
                uploadResult(s3, content, fileCounter);
                // Pretty printing to make the output more readable
                System.out.println("Files created number: " + fileCounter);
                fileCounter++;
            }
        }
        return fileCounter;
    }

    /** @return true when the lower-cased page contains any search keyword. */
    private static boolean containsKeyword(String lowerContent) {
        for (String keyword : KEYWORDS) {
            if (lowerContent.contains(keyword)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Writes one matching page directly to S3.
     *
     * BUG FIX: the original passed "mybucket/common_crawl/warcOutput" as the
     * bucket name, which is invalid (bucket names cannot contain '/'); the
     * path now lives in the object key. It also risked an NPE by calling
     * input.reset() when the stream had failed to initialize.
     */
    private static void uploadResult(AmazonS3 s3, String content,
            int fileCounter) {
        byte[] contentBytes = content
                .getBytes(java.nio.charset.StandardCharsets.UTF_8);
        ObjectMetadata metadata = new ObjectMetadata();
        metadata.setContentLength(contentBytes.length);
        String dataFileName = "dataFile_" + fileCounter + "_"
                + System.currentTimeMillis() / 1000;
        ByteArrayInputStream input = new ByteArrayInputStream(contentBytes);
        try {
            s3.putObject("mybucket", "common_crawl/warcOutput/" + dataFileName,
                    input, metadata);
        } catch (AmazonServiceException ase) {
            System.out.println("Error Message: " + ase.getMessage());
            System.out.println("HTTP Status Code: " + ase.getStatusCode());
            System.out.println("AWS Error Code: " + ase.getErrorCode());
            System.out.println("Error Type: " + ase.getErrorType());
            System.out.println("Request ID: " + ase.getRequestId());
        } catch (AmazonClientException ace) {
            System.out.println("Error Message: " + ace.getMessage());
        }
        // ByteArrayInputStream.close() is a no-op; nothing to release here
    }

    /**
     * Entry point: scans the CC-MAIN-2013-48 crawl of the public Common Crawl
     * dataset. NOTE(review): credentials are hard-coded placeholders — prefer
     * the SDK's default credential provider chain or environment variables.
     */
    static public void main(String[] args) throws S3ServiceException,
            AmazonServiceException, AmazonClientException, InterruptedException {
        String awsAccessKey = "******";
        String awsSecretKey = "******";
        AWSCredentials credentials = new BasicAWSCredentials(awsAccessKey,
                awsSecretKey);
        AmazonS3 s3 = new AmazonS3Client(credentials);
        process(s3, "aws-publicdatasets",
                "common-crawl/crawl-data/CC-MAIN-2013-48", -1);
    }
}
(此处原为上方 Java 代码的一份机器翻译副本,内容已损坏且与上文重复,故予以清理。)

评论区:
— 您是在单个 WARC 文件中搜索吗?
— 不是,@vanajayaraman,我想用 EMR 搜索所有 WARC 文件,并且只想为它创建 mapper 而不创建 reducer。
— 您是否尝试过那个具有多进程能力的 Python 实现?
— 谢谢,我会检查。