通过公共标签在java中高效合并2个大型csv文件
我需要通过公共的行标签或列标签合并2个大型csv文件(每个文件约有4000万个数据元素,因此大小约为500MB),按行还是按列合并可以由用户指定。例如,如果dataset1.csv包含:
patient_id x1 x2 x3
pi1 1 2 3
pi3 4 5 6
patient_id y1 y2 y3
pi0 0 0 0
pi1 11 12 13
pi2 99 98 97
pi3 14 15 16
和dataset2.csv包含:
patient_id x1 x2 x3
pi1 1 2 3
pi3 4 5 6
patient_id y1 y2 y3
pi0 0 0 0
pi1 11 12 13
pi2 99 98 97
pi3 14 15 16
用户可以指定通过行标签(患者ID)合并这两个文件,结果output.csv为:
patient_id x1 x2 x3 y1 y2 y3
pi1 1 2 3 11 12 13
pi3 4 5 6 14 15 16
因为我们只合并两个输入文件共有(即交集)的患者ID的信息。我解决这个问题的策略是创建一个HashMap,其中要合并的行或列标签(在本例中是行标签,即患者ID)作为键,该患者ID对应的数据作为值存储在ArrayList中。我为每个输入数据文件创建一个HashMap,然后根据相同的键合并这些值。我将数据表示为ArrayList<ArrayList<String>>类型的二维ArrayList,因此合并后的数据也是这种类型。然后,我只需遍历合并后的ArrayList<ArrayList<String>>对象(我把它封装为Data类型的对象),并将其打印到文件中。代码如下:
下面是依赖于下面的数据类文件的DataMerge类
import java.util.HashMap;
import java.util.ArrayList;
public class DataMerge {

    /**
     * Merges two Data objects on the labels they have in common (an inner join).
     *
     * <p>For example, if two data sets hold different measurements for the same patients,
     * identified by a patient ID, the result contains only the patient IDs present in both
     * data sets, each followed by its values from {@code d1} and then its values from
     * {@code d2}. The first line of the result is a header made of the corner label followed
     * by the feature labels of {@code d1} and then those of {@code d2}.
     *
     * <p>Merged lines appear in the order in which the shared labels occur in {@code d1};
     * a label that occurs more than once in {@code d1} yields a single merged line.
     *
     * @param d1 the first data set
     * @param d2 the second data set
     * @param labelInRow1 {@code true} if the common labels are the row labels of {@code d1},
     *        {@code false} if they are its column labels
     * @param labelInRow2 {@code true} if the common labels are the row labels of {@code d2},
     *        {@code false} if they are its column labels
     * @return a new labeled Data object holding the merged lines
     */
    public static Data mergeData(Data d1, Data d2, boolean labelInRow1,
            boolean labelInRow2) {
        HashMap<String, ArrayList<String>> d1Map = d1.mapFeatureToData(labelInRow1);
        HashMap<String, ArrayList<String>> d2Map = d2.mapFeatureToData(labelInRow2);

        // The join keys live on one axis; the feature names are the labels of the other axis.
        ArrayList<String> d1Keys = labelInRow1 ? d1.getRowLabels() : d1.getColumnLabels();
        ArrayList<String> d1Features = labelInRow1 ? d1.getColumnLabels() : d1.getRowLabels();
        ArrayList<String> d2Features = labelInRow2 ? d2.getColumnLabels() : d2.getRowLabels();

        ArrayList<String> mergedFeatures =
                new ArrayList<String>(1 + d1Features.size() + d2Features.size());
        mergedFeatures.add(pickLabelLabel(d1.getLabelLabel(), d2.getLabelLabel()));
        mergedFeatures.addAll(d1Features);
        mergedFeatures.addAll(d2Features);

        ArrayList<ArrayList<String>> mergedData = new ArrayList<ArrayList<String>>();
        mergedData.add(mergedFeatures);

        // Walk d1's labels in their original order instead of the HashMap's arbitrary order.
        // d1Map is a fresh map owned by this method, so remove() doubles as a "seen" check:
        // it returns null for a label that was already merged or that has no mapped data.
        for (String key : d1Keys) {
            ArrayList<String> left = d1Map.remove(key);
            ArrayList<String> right = d2Map.get(key);
            if (left == null || right == null) {
                continue;
            }
            ArrayList<String> curRow = new ArrayList<String>(1 + left.size() + right.size());
            curRow.add(key);
            curRow.addAll(left);
            curRow.addAll(right);
            mergedData.add(curRow);
        }
        mergedData.trimToSize();
        return new Data(mergedData, true);
    }

    /**
     * Chooses the corner label for the merged header: the first label, unless it is missing
     * or empty and the second one is not.
     */
    private static String pickLabelLabel(String first, String second) {
        // Strings must be compared by content; == / != only compare references.
        boolean firstBlank = first == null || first.isEmpty();
        boolean secondBlank = second == null || second.isEmpty();
        return (firstBlank && !secondBlank) ? second : first;
    }
}
import java.util.HashMap;
导入java.util.ArrayList;
公共类数据合并{
/**通过相似的标签合并两个数据对象。例如,如果两个数据集表示
*同一组患者的不同数据,由其唯一患者表示
*ID,mergeData将返回一个数据集,该数据集仅包含两个患者共用的患者ID
*数据集以及两个数据集中表示的数据。分别为LabelIrow1和LabelIrow2
*指示公共标签是分别位于d1和d2的单独行(true)中,还是位于单独的列中*/
公共静态数据合并数据(数据d1、数据d2、布尔标签Row1、,
布尔标号(箭头2){
ArrayList mergedData=新的ArrayList();
HashMap d1Map=d1.mapFeatureToData(LabelIrow1);
HashMap d2Map=d2.mapFeatureToData(LabelIrow2);
阵列列表特征;
阵列列表特征;
如果(标签行1){
d1Features=d1.getColumnLabels();
}否则{
d1Features=d1.getRowLabels();
}
如果(标签箭头2){
d2Features=d2.getColumnLabels();
}否则{
d2Features=d2.getRowLabels();
}
d1.trimToSize();
d2Features.trimToSize();
ArrayList mergedFeatures=新的ArrayList();
如果((d1.getLabelLabel()!=“”)和&(d1.getLabelLabel()=“”){
mergedFeatures.add(d1.getLabelLabel());
}
else if((d1.getLabelLabel()=“”)和&(d1.getLabelLabel()!=“”){
mergedFeatures.add(d2.getLabelLabel());
}否则{
mergedFeatures.add(d1.getLabelLabel());
}
mergedFeatures.addAll(d1Features);
mergedFeatures.addAll(d2Features);
mergedFeatures.trimToSize();
mergedData.add(mergedFeatures);
for(字符串键:d1Map.keySet()){
ArrayList curRow=新的ArrayList();
if(d2Map.containsKey(键)){
curRow.add(键);
curRow.addAll(d1Map.get(key));
curRow.addAll(d2Map.get(key));
curRow.trimToSize();
mergedData.add(curRow);
}
}
mergedData.trimToSize();
数据结果=新数据(mergedData,true);
返回结果;
}
}
下面是数据类型对象及其关联的HashMap生成函数,其中包含一些行和列标签提取方法
import java.util.*;
import java.io.*;
/**Represents an unlabeled or labeled data set as a series of nested ArrayLists, where each nested
* ArrayList represents a line of the input data.*/
public class Data {
    /** Column labels: the header line without its first (corner) cell. */
    private ArrayList<String> colLabels = new ArrayList<String>();
    /** Row labels: the first cell of every data line. */
    private ArrayList<String> rowLabels = new ArrayList<String>();
    /** The corner cell of a labeled data set; null when the data was loaded without labels. */
    private String labelLabel;
    /** The data cells only, one nested list per line, without row and column labels. */
    private ArrayList<ArrayList<String>> unlabeledData;

    /**
     * Reads a delimited text file into a list of lines, each line being the list of its fields.
     *
     * <p>If an I/O error occurs it is reported on standard error and the lines read so far
     * (possibly none) are returned.
     *
     * @param filePath path of the file to read
     * @param fileSep field separator; it is passed to {@link String#split(String, int)} and is
     *        therefore interpreted as a regular expression
     * @return the lines of the file, in file order
     */
    private static ArrayList<ArrayList<String>> readFile(String filePath, String fileSep) {
        ArrayList<ArrayList<String>> result = new ArrayList<ArrayList<String>>();
        // try-with-resources closes the reader on every path, including read errors.
        try (BufferedReader input = new BufferedReader(new FileReader(filePath))) {
            String line;
            while ((line = input.readLine()) != null) {
                // Limit -1 keeps trailing empty fields, so a line ending in empty cells
                // still has as many fields as the header.
                result.add(new ArrayList<String>(Arrays.asList(line.split(fileSep, -1))));
            }
        } catch (IOException e) {
            System.err.println(e);
        }
        result.trimToSize();
        return result;
    }

    /**
     * Removes the header line and the first cell of every remaining line from {@code rows},
     * storing them as column labels, corner label and row labels of this object.
     * A missing header or missing first cell is recorded as an empty label.
     */
    private void stripLabels(ArrayList<ArrayList<String>> rows) {
        if (rows.isEmpty()) {
            // Nothing to label (empty or unreadable input): keep an empty, labeled data set.
            this.labelLabel = "";
            return;
        }
        this.colLabels.addAll(rows.remove(0));
        this.labelLabel = this.colLabels.isEmpty() ? "" : this.colLabels.remove(0);
        this.colLabels.trimToSize();
        for (ArrayList<String> line : rows) {
            this.rowLabels.add(line.isEmpty() ? "" : line.remove(0));
        }
        this.rowLabels.trimToSize();
    }

    /**
     * Reads the file and returns its lines WITHOUT any row or column labels; the labels are
     * stored in this object.
     */
    private ArrayList<ArrayList<String>> extractLabelsAndData(String filePath, String fileSep) {
        ArrayList<ArrayList<String>> rows = readFile(filePath, fileSep);
        stripLabels(rows);
        return rows;
    }

    /**
     * Returns a copy of {@code data} WITHOUT any row or column labels; the labels are stored
     * in this object. The lists passed in are copied first and are not modified.
     */
    private ArrayList<ArrayList<String>> extractLabelsAndData(ArrayList<ArrayList<String>> data) {
        ArrayList<ArrayList<String>> result = new ArrayList<ArrayList<String>>(data.size());
        for (ArrayList<String> line : data) {
            result.add(new ArrayList<String>(line));
        }
        stripLabels(result);
        return result;
    }

    /** Returns the corner label of the data, or null if the data was loaded without labels. */
    public String getLabelLabel() {
        return this.labelLabel;
    }

    /**
     * Returns the column labels (the first line of the data without its corner cell) in the
     * order in which they appear. The returned list is the internal list, not a copy.
     */
    public ArrayList<String> getColumnLabels() {
        return this.colLabels;
    }

    /**
     * Returns the row labels (the first cell of every data line) in the order in which they
     * appear. The returned list is the internal list, not a copy.
     */
    public ArrayList<String> getRowLabels() {
        return this.rowLabels;
    }

    /**
     * Creates a HashMap from labels to the data they label. For example, if a data set
     * contains patient IDs and test results, this can be used to create a map whose keys are
     * the patient IDs and whose values are the lists of test results.
     *
     * <p>Values of a label that occurs more than once are concatenated. The lists in the
     * returned map may be the internal line lists of this object and must be treated as
     * read-only by the caller.
     *
     * @param isRow {@code true} to key the map by row label (each value is a data line),
     *        {@code false} to key it by column label (each value is a data column)
     * @return the label-to-values map; empty if there is no data
     * @throws IndexOutOfBoundsException if {@code isRow} is false and a data line has fewer
     *         cells than there are column labels
     */
    public HashMap<String, ArrayList<String>> mapFeatureToData(boolean isRow) {
        HashMap<String, ArrayList<String>> featureMap = new HashMap<String, ArrayList<String>>();
        if (isRow) {
            for (int i = 0; i < this.rowLabels.size(); i++) {
                String label = this.rowLabels.get(i);
                ArrayList<String> row = this.unlabeledData.get(i);
                ArrayList<String> existing = featureMap.get(label);
                if (existing == null) {
                    // Share the line instead of copying it: the data sets can be very large.
                    featureMap.put(label, row);
                } else {
                    // Duplicate label: concatenate into a NEW list so the internal line that
                    // is already in the map is never modified.
                    ArrayList<String> combined =
                            new ArrayList<String>(existing.size() + row.size());
                    combined.addAll(existing);
                    combined.addAll(row);
                    featureMap.put(label, combined);
                }
            }
        } else if (!this.unlabeledData.isEmpty()) {
            // Resolve the target list of every column once instead of once per cell.
            // Columns sharing a label share one list.
            ArrayList<ArrayList<String>> columns =
                    new ArrayList<ArrayList<String>>(this.colLabels.size());
            for (String label : this.colLabels) {
                ArrayList<String> column = featureMap.get(label);
                if (column == null) {
                    column = new ArrayList<String>();
                    featureMap.put(label, column);
                }
                columns.add(column);
            }
            for (ArrayList<String> line : this.unlabeledData) {
                for (int i = 0; i < columns.size(); i++) {
                    columns.get(i).add(line.get(i));
                }
            }
        }
        return featureMap;
    }

    /**
     * Writes the data to the file at {@code outputPath}, one line per data line, with cells
     * separated by {@code sep}. Labeled data is written with its header line and row labels;
     * data loaded without labels is written as bare lines.
     *
     * <p>The file is opened in append mode, lines are separated by {@code "\n"} and no line
     * terminator is written after the last line. The output is streamed to the file rather
     * than assembled in memory. Errors are reported on standard error.
     *
     * @param outputPath path of the file to append to
     * @param sep the cell delimiter
     */
    public void writeDataToFile(String outputPath, String sep) {
        try (PrintStream writer = new PrintStream(
                new BufferedOutputStream(new FileOutputStream(outputPath, true)))) {
            boolean labeled = this.labelLabel != null;
            boolean firstLine = true;
            if (labeled) {
                writer.print(this.labelLabel);
                for (String label : this.colLabels) {
                    writer.print(sep);
                    writer.print(label);
                }
                firstLine = false;
            }
            for (int i = 0; i < this.unlabeledData.size(); i++) {
                // The separator goes BEFORE each line, so the output never ends in "\n".
                if (!firstLine) {
                    writer.print("\n");
                }
                firstLine = false;
                boolean firstCell = true;
                if (labeled) {
                    writer.print(this.rowLabels.get(i));
                    firstCell = false;
                }
                for (String cell : this.unlabeledData.get(i)) {
                    if (!firstCell) {
                        writer.print(sep);
                    }
                    writer.print(cell);
                    firstCell = false;
                }
            }
            // PrintStream never throws on write failures; it has to be asked.
            if (writer.checkError()) {
                System.err.println("Failed to write data to " + outputPath);
            }
        } catch (IOException e) {
            System.err.println(e);
        }
    }

    /**
     * Creates a Data object from a delimited text file.
     *
     * @param filePath path of the input file
     * @param fileSep the separator used in the input file, interpreted as a regular expression
     * @param hasLabels whether the input has labels; if true it is assumed that there are
     *        BOTH row and column labels
     */
    public Data(String filePath, String fileSep, boolean hasLabels) {
        if (hasLabels) {
            this.unlabeledData = extractLabelsAndData(filePath, fileSep);
        } else {
            this.unlabeledData = readFile(filePath, fileSep);
        }
        this.unlabeledData.trimToSize();
    }

    /**
     * Creates a Data object from nested ArrayLists, one nested list per line.
     *
     * @param data the lines of the data set; copied when {@code hasLabels} is true, stored
     *        as-is (not copied) otherwise
     * @param hasLabels whether the first line and the first cell of every other line are
     *        labels
     */
    public Data(ArrayList<ArrayList<String>> data, boolean hasLabels) {
        if (hasLabels) {
            this.unlabeledData = extractLabelsAndData(data);
        } else {
            this.unlabeledData = data;
        }
        this.unlabeledData.trimToSize();
    }
}
import java.util.*;
导入java.io.*;
/**将未标记或标记的数据集表示为一系列嵌套的ArrayList,其中每个嵌套的ArrayList
*ArrayList表示一行输入数据*/
公共类数据{
私有ArrayList colLabels=新ArrayList();//行标签
私有ArrayList rowLabels=新ArrayList();//列标签
私有字符串标签;
private ArrayList unlabeledData;//没有行和列标签的数据
/**返回ArrayList的ArrayList,其中每个嵌套的ArrayList表示一行
*输入文件的名称*/
@抑制警告(“资源”)
私有静态ArrayList读取文件(字符串文件路径、字符串文件步骤){
ArrayList结果=新建ArrayList();
试一试{
BufferedReader输入=新BufferedReader(新文件读取器(文件路径));
String line=input.readLine();
while(行!=null){
String[]splitLine=line.split(fileSep);
添加(新的ArrayList(Arrays.asList(splitLine));
line=input.readLine();
}
}
捕获(例外e){
系统错误println(e);
}
result.trimToSize();;
返回结果;
}
/**返回ArrayList的ArrayList,其中每个嵌套的ArrayList表示一行输入
*数据,但没有任何行或列标签*/
私有ArrayList extractLabelsAndData(字符串文件路径、字符串文件步骤){
ArrayList tempData=新的ArrayList();
addAll(readFile(filePath,fileSep));
tempData.trimToSize();
this.colLabels.addAll(tempData.remove(0));
this.labelLabel=this.colLabels.remove(0);
this.colLabels.trimToSize();
用于(ArrayList行:tempData){
this.rowLabels.add(line.remove(0));
}
this.rowLabels.trimToSize();
返回临时数据;
}
/**返回ArrayList的ArrayList,其中每个嵌套的ArrayList表示一行输入
*数据,但没有任何行或列标签。不改变原始数据*/
专用ArrayList extractLabelsAndData(ArrayList数据){
ArrayList结果=新建ArrayList();
用于(ArrayList行:数据){
ArrayList temp=新的ArrayList();
for(字符串元素:行){
温度添加(元素);
}
温度微调尺寸();
结果。添加(温度);
}
this.colLabels.addAll(result.remove(0));
this.labelLabel=this.colLabels.remove(0);
this.colLabels.trimToSize();
用于(ArrayList行:结果){
this.rowLabels.add(line.remove(0));
}
this.rowLabels.trimToSize();
结果:trimToSize();
返回结果;
}
/**返回