Mongodb 使用ApacheSqoop将数据从Mongo/Cassandra导出到HDFS
我有一个问题,我必须通过Hive从多个数据源读取数据,即RDBMS(MYSQL、Oracle)和NOSQL(MongoDb、Cassandra)到HDFS ApacheSqoop非常适合RDBMS,但它不适合NOSQL,至少我无法成功地使用它,(我尝试使用用于Mongo的JDBC驱动程序…它能够连接到Mongo,但无法推送到HDFS)Mongodb 使用ApacheSqoop将数据从Mongo/Cassandra导出到HDFS,mongodb,hadoop,hdfs,sqoop,Mongodb,Hadoop,Hdfs,Sqoop,我有一个问题,我必须通过Hive从多个数据源读取数据,即RDBMS(MYSQL、Oracle)和NOSQL(MongoDb、Cassandra)到HDFS ApacheSqoop非常适合RDBMS,但它不适合NOSQL,至少我无法成功地使用它,(我尝试使用用于Mongo的JDBC驱动程序…它能够连接到Mongo,但无法推送到HDFS) 如果有人做过与此相关的工作并可以分享,那将非常有帮助我使用了一个来自web的示例,能够将文件从Mongo传输到HDFS,反之亦然。我现在无法掌握确切的网页。但程序
如果有人做过与此相关的工作并可以分享,那将非常有帮助。 我使用了一个来自网络的示例,能够将文件从Mongo传输到HDFS,反之亦然。我现在找不到确切的网页,但程序如下所示。你可以从中获得灵感,继续前进。
import com.mongodb.hadoop.MongoInputFormat;
import com.mongodb.hadoop.util.MongoConfigUtil;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.bson.BSONObject;
import org.bson.types.ObjectId;
public class CopyFromMongodbToHDFS {
public static class ImportWeblogsFromMongo extends
Mapper<LongWritable, Text, Text, Text> {
public void map(Object key, BSONObject value, Context context)
throws IOException, InterruptedException {
System.out.println("Key: " + key);
System.out.println("Value: " + value);
String md5 = value.get("md5").toString();
String url = value.get("url").toString();
String date = value.get("date").toString();
String time = value.get("time").toString();
String ip = value.get("ip").toString();
String output = "\t" + url + "\t" + date + "\t" + time + "\t" + ip;
context.write(new Text(md5), new Text(output));
}
}
public static void main(String[] args) throws IOException,
InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
MongoConfigUtil.setInputURI(conf,
"mongodb://127.0.0.1:27017/test.mylogs");
System.out.println("Configuration: " + conf);
@SuppressWarnings("deprecation")
Job job = new Job(conf, "Mongo Import");
Path out = new Path("/user/cloudera/test1/logs.txt");
FileOutputFormat.setOutputPath(job, out);
job.setJarByClass(CopyFromMongodbToHDFS.class);
job.setMapperClass(ImportWeblogsFromMongo.class);
job.setOutputKeyClass(ObjectId.class);
job.setOutputValueClass(BSONObject.class);
job.setInputFormatClass(MongoInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setNumReduceTasks(0);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.bson.BSONObject;
import org.bson.types.ObjectId;
import com.mongodb.hadoop.MongoInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import com.mongodb.hadoop.util.MongoConfigUtil;
public class CopyFromMongodbToHDFS {
public static class ImportWeblogsFromMongo extends
Mapper<LongWritable, Text, Text, Text> {
public void map(Object key, BSONObject value, Context context)
throws IOException, InterruptedException {
System.out.println("Key: " + key);
System.out.println("Value: " + value);
String md5 = value.get("md5").toString();
String url = value.get("url").toString();
String date = value.get("date").toString();
String time = value.get("time").toString();
String ip = value.get("ip").toString();
String output = "\t" + url + "\t" + date + "\t" + time + "\t" + ip;
context.write(new Text(md5), new Text(output));
}
}
public static void main(String[] args) throws IOException,
InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
MongoConfigUtil.setInputURI(conf,
"mongodb://127.0.0.1:27017/test.mylogs");
System.out.println("Configuration: " + conf);
@SuppressWarnings("deprecation")
Job job = new Job(conf, "Mongo Import");
Path out = new Path("/user/cloudera/test1/logs.txt");
FileOutputFormat.setOutputPath(job, out);
job.setJarByClass(CopyFromMongodbToHDFS.class);
job.setMapperClass(ImportWeblogsFromMongo.class);
job.setOutputKeyClass(ObjectId.class);
job.setOutputValueClass(BSONObject.class);
job.setInputFormatClass(MongoInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setNumReduceTasks(0);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
对于mongoDB,创建要导出到HDFS的集合的mongodump
cd
mongodump -h <host> -d <database> -c <collection>