Warning: file_get_contents(/data/phpspider/zhask/data//catemap/8/mysql/61.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Python 合并具有多个变量的2个数据集(不能仅使用相似的变量)_Python_Mysql_Matlab_Sas_Weka - Fatal编程技术网

Python 合并具有多个变量的2个数据集(不能仅使用相似的变量)

Python 合并具有多个变量的2个数据集(不能仅使用相似的变量),python,mysql,matlab,sas,weka,Python,Mysql,Matlab,Sas,Weka,我试图合并两个数据集(150000和50000条记录),每个数据集大约有50个变量,其中一些可能匹配。这两个数据集中的一个常见变量是“事件日期”,但我不能使用它,因为数据集中大约有300起事件发生在特定日期(按地址、城市、县、邮政编码、通知紧急医疗服务(EMS)的时间细分)。另一个数据集包含事件发生的确切时间、地址、城市、县、邮政编码和一些其他字段,但如果信息未知/未记录,这些字段可能为空 我想创建一个缓冲区来按每个字段连接数据集。例如,首先从事件日期开始(无缺失值),如果它们相同,下一步将检查

我试图合并两个数据集(150000和50000条记录),每个数据集大约有50个变量,其中一些可能匹配。这两个数据集中的一个常见变量是“事件日期”,但我不能使用它,因为数据集中大约有300起事件发生在特定日期(按地址、城市、县、邮政编码、通知紧急医疗服务(EMS)的时间细分)。另一个数据集包含事件发生的确切时间、地址、城市、县、邮政编码和一些其他字段,但如果信息未知/未记录,这些字段可能为空

我想创建一个缓冲区来按每个字段连接数据集。例如,首先从事件日期开始(无缺失值),如果它们相同,下一步将检查它们是否发生在同一个县、市等。(某些值可能为空)。最后一个比较字段是通知EMS的时间(事件发生后30-60分钟内)。如果所有字段都匹配,则最终缓冲时间为30-60分钟。这将是一个多对一的合并(50000到150000)

什么程序可以让我这样做?有某种代码吗

我添加了数据集()和()的一个片段

预期输出如下所示

使用的代码是:

% Brute-force merge of crash records (dataset2, T2) into EMS incident
% records (dataset1, T1).
%
% For every T2 row, every not-yet-matched T1 row is tested on five criteria:
%   1. incident date equals crash date (exact string compare)
%   2. street name   (skipped when missing on either side)
%   3. city name     (skipped when missing on either side)
%   4. county name   (skipped when missing on either side)
%   5. notification time lies within `trange` after the crash time
% A criterion that cannot be evaluated because data is missing stays true.
% When all five hold, seven T2 columns are copied into the T1 row.
%
% Fixes relative to the previous revision:
%   * the unit-notified time was parsed but the (unparsed) dispatch string
%     was subtracted; both times are now parsed and tested separately
%   * 'COUNTY' is stripped from the T1 county as it already was for T2, so
%     'Fairfax County' can match on both sides
%   * duplicate isletter filtering removed
%   * table echo suppressed and the per-pair debug dump of every
%     non-matching row pair removed

T1 = readtable('dataset1.csv');
T2 = readtable('dataset2.csv');
LT1 = size(T1,1);
LT2 = size(T2,1);

% Columns copied from T2 into T1 on a match; pre-create them as empty text.
copycols = {'County_Name', 'City_Name', 'Town_Name', 'CrashTime', ...
    'SecondaryLocation', 'RouteName', 'PostalCityName'};
T1 = [T1, cell2table(repmat({''}, LT1, 7), 'VariableNames', copycols)];

augmented = false(LT1,1);            % true once a T1 row has received T2 data
dtstr = 'MM/dd/yyyy HH:mm';
trange = duration([0,0,0;1,0,0]);    % accepted delay: 0 to 1 hour after crash

for tt2 = 1:LT2
    % ---- normalise the T2 row once, outside the inner loop ----
    cdate2 = T2.CrashDate{tt2};
    crasht2 = T2.CrashDateTime{tt2};
    assert(~isempty(cdate2) && ~isempty(crasht2), 'Major data missing')
    crashdt2 = datetime([cdate2, ' ', crasht2], 'InputFormat', dtstr);

    strtaddr2 = T2.RouteName{tt2};
    if ~isempty(strtaddr2)
        strtaddr2 = upper(strtaddr2);
        strtaddr2 = strrep(strtaddr2, 'ROAD', 'RD');
        strtaddr2 = strtaddr2(isletter(strtaddr2));
    end

    pcityn2 = T2.PostalCityName{tt2};
    if ~isempty(pcityn2)
        pcityn2 = upper(pcityn2);
        pcityn2 = pcityn2(isletter(pcityn2));
    end

    countyn2 = T2.County_Name{tt2};
    if ~isempty(countyn2)
        countyn2 = countyn2(isletter(countyn2));
        countyn2 = upper(countyn2);
        countyn2 = strrep(countyn2, 'COUNTY', '');
    end

    for tt1 = 1:LT1
        if augmented(tt1)
            continue                 % this T1 row already has its match
        end

        matchvec = true(5,1);        % missing data counts as a match

        % 1. incident date
        cdate1 = T1.IncidentDate{tt1};
        matchvec(1) = strcmp(cdate1, cdate2);

        % 2. street name
        strtaddr1 = upper(T1.AddressStreet{tt1});
        if ~isempty(strtaddr2) && ~isempty(strtaddr1)
            strtaddr1 = strrep(strtaddr1, 'ROAD', 'RD');
            strtaddr1 = strtaddr1(isletter(strtaddr1));
            matchvec(2) = strcmp(strtaddr1, strtaddr2);
        end

        % 3. city name
        pcityn1 = upper(T1.AddressCityIncident{tt1});
        pcityn1 = pcityn1(isletter(pcityn1));
        if ~isempty(pcityn2) && ~isempty(pcityn1)
            matchvec(3) = strcmp(pcityn1, pcityn2);
        end

        % 4. county name; strip 'COUNTY' exactly as done for the T2 side
        countyn1 = upper(T1.AddressCountyIncident{tt1});
        countyn1 = countyn1(isletter(countyn1));
        countyn1 = strrep(countyn1, 'COUNTY', '');
        if ~isempty(countyn2) && ~isempty(countyn1)
            matchvec(4) = strcmp(countyn1, countyn2);
        end

        % 5. notification time; each available timestamp must fall in range
        crashdt1u = T1.UnitNotified{tt1};
        crashdt1d = T1.Date12_DispatchNotified{tt1};
        if ~isempty(crashdt1u) || ~isempty(crashdt1d)
            tmatch = true(2,1);
            if ~isempty(crashdt1u)
                crashdt1u = datetime(crashdt1u, 'InputFormat', dtstr);
                difcrdt1u = crashdt1u - crashdt2;
                tmatch(1) = difcrdt1u >= trange(1) && difcrdt1u <= trange(2);
            end
            if ~isempty(crashdt1d)
                crashdt1d = datetime(crashdt1d, 'InputFormat', dtstr);
                difcrdt1d = crashdt1d - crashdt2;
                tmatch(2) = difcrdt1d >= trange(1) && difcrdt1d <= trange(2);
            end
            matchvec(5) = all(tmatch);
        end

        if all(matchvec)
            % append the T2 data to the matching T1 row
            T1{tt1, copycols} = table2cell(T2(tt2, copycols));
            augmented(tt1) = true;
        end
    end
end
T1
T1=readtable('dataset1.csv'))
T2=可读性('dataset2.csv'))
LT1=尺寸(T1,1);
LT2=尺寸(T2,1);
T1=[T1,cell2table(repmat({'},LT1,7),'VariableNames',{'country'u Name','City'u Name','Town'u Name','CrashTime','SecondaryLocation','RouteName','PostalCityName}]
增强=假(LT1,1);
dtstr='MM/dd/yyyy HH:MM';
交易=持续时间([0,0,0;1,0,0]);
对于tt2=1:LT2
cdate2=T2.CrashDate{tt2};
crasht2=T2.CrashDateTime{tt2};
assert(~isempty(cdate2)和~isempty(crasht2),“主数据丢失”)
crashdt2=[cdate2',,crasht2];
crashdt2=日期时间(crashdt2,'InputFormat',dtstr);
strtaddr2=T2.RouteName{tt2};
if~isempty(strtaddr2)
strtaddr2=上(strtaddr2);
strtaddr2=strrep(strtaddr2,'ROAD','RD');
strtaddr2=strtaddr2(胰岛(strtaddr2));
结束
pcityn2=T2.PostalCityName{tt2};
if~isempty(pcityn2)
pcityn2=上(pcityn2);
pcityn2=pcityn2(胰岛细胞(pcityn2));
结束
countyn2=T2.县名称{tt2};
if~isempty(countyn2)
countyn2=countyn2(胰岛细胞(countyn2));
countyn2=上限(countyn2);
countyn2=strrep(countyn2,'COUNTY','';
结束
对于tt1=1:LT1
如果增加(tt1)
持续
结束
matchvec=true(5,1);
cdate1=T1.IncidentDate{tt1};
matchvec(1)=strcmp(cdate1,cdate2);
strtaddr1=上(T1.AddressStreet{tt1});
如果~isempty(strtaddr2)&&~isempty(strtaddr1)
strtaddr1=strrep(strtaddr1,'ROAD','RD');
strtaddr1=strtaddr1(胰岛(strtaddr1));
matchvec(2)=strcmp(strtaddr1,strtaddr2);
结束
pcityn1=上限(T1.AddressCityIncident{tt1});
pcityn1=pcityn1(胰岛细胞(pcityn1));
如果~isempty(pcityn2)&&~isempty(pcityn1)
pcityn1=pcityn1(胰岛细胞(pcityn1));
matchvec(3)=strcmp(pcityn1,pcityn2);
结束
countyn1=上限(T1.AddressCountyIncident{tt1});
countyn1=countyn1(胰岛细胞(countyn1));
如果~isempty(countyn2)&&~isempty(countyn1)
countyn1=countyn1(胰岛细胞(countyn1));
matchvec(4)=strcmp(countyn1,countyn2);
结束
crashdt1u=T1.UnitNotified{tt1};
crashdt1d=T1.Date12_DispatchNotified{tt1};
如果~isempty(crashdt1u)| ~isempty(crashdt1d)
t匹配=真(2,1);
如果~isempty(crashdt1u)
crashdt1u=datetime(crashdt1u,'InputFormat',dtstr);
difcrdt1d=crashdt1d-crashdt2;

tmatch = difcrdt1d >= trange(1) && difcrdt1d <= trange(2);

编辑:已对代码做性能优化,以应对大量数据

请注意:您的原始数据有许多错误。csv 文件中实际数据的任何位置都不允许出现逗号。某些字符串(在 UnitNotified 时间字段中发现 1 处)不符合预定义的格式。代码中的 `try` 块只处理了这一种特定情况;如果所有字段都可能存在有缺陷的数据,则应对所有字段都使用 `try`。所有这些问题都应在合并之前解决。

% Merge crash records (dataset2, T2) into EMS incident records (dataset1, T1).
%
% Stage 1 (pre-processing): normalise the street / city / county text and
%   parse the timestamps of both tables once, storing the results in the
%   buffer tables T1B and T2B together with "data present" flags.
% Stage 2 (matching): for every T2 row, scan the not-yet-matched T1 rows and
%   compare date, street, city, county and notification time. A criterion
%   whose data is missing on either side is treated as matching. On a full
%   match seven T2 columns are copied into the T1 row.
%
% Fixes relative to the previous revision:
%   * the matching loop compared against a stale `cdate2` left over from the
%     pre-processing loop; it is now read per T2 row
%   * match2..match5 / tmatch1 / tmatch2 were only assigned conditionally,
%     so they were undefined on the first pair with missing data and stale
%     afterwards; each criterion is now evaluated independently per pair
%   * the street comparison no longer errors when a T2 route name has
%     several '/'-separated parts
%   * interstate detection uses strncmp instead of `switch true` on a cell
%   * the test subset sizes are clamped to the table height
%   * redundant re-normalisation of the county and unused locals removed

clear; clc; close all

T1 = readtable('dataset1.csv');
T2 = readtable('dataset2.csv');
% Work on a subset while tuning; clamp so short files do not raise an
% out-of-range index error.
T1 = T1(1:min(1000, size(T1,1)), :);
T2 = T2(1:min(900,  size(T2,1)), :);
LT1 = size(T1,1);
LT2 = size(T2,1);

% Columns copied from T2 into T1 on a match; pre-create them as empty text.
copycols = {'County_Name', 'City_Name', 'Town_Name', ...
    'CrashTime', 'SecondaryLocation', 'RouteName', 'PostalCityName'};
T1 = [T1, cell2table(repmat({''}, LT1, 7), 'VariableNames', copycols)];

augmented = false(LT1,1);             % true once a T1 row has received T2 data
dtstr = 'MM/dd/yyyy HH:mm';
trange = duration([-1,0,0;1,0,0]);    % accepted offset: -1 hour to +1 hour

% Street comparison. c1 and c2 are cell arrays of normalised street parts.
% Returns one logical per element of c2: true when that element equals some
% non-empty element of c1.
strtaddrcmpf = @(c1,c2) cellfun(@(s2) ...
    any(cellfun(@(s1) ~isempty(s1) && strcmp(s1,s2), c1)), ...
    c2);

% Progress display: a fixed-width field that is erased with backspaces.
progblank = sprintf('Progress: %7s', '');   % same width as 'Progress: %6.2f%%'
progerase = repmat(sprintf('\b'), 1, length(progblank));

% ---------------------------------------------------------------------
% Stage 1: buffer normalised values to speed up the matching stage
% ---------------------------------------------------------------------
fprintf('Pre-processing started at %s \n', datestr(datetime('now')))
T1B = cell2table([repmat({''}, LT1, 5), repmat({true}, LT1, 4)], ...
    'VariableNames', {'CrashDTU', 'CrashDTD', ...
    'StrtAdd', 'PoCityN', 'CountyN', ...
    'CrashDTFlg', 'StrtAddFlg', 'PoCityNFlg', 'CountyNFlg'});
T2B = cell2table([repmat({''}, LT2, 4), repmat({true}, LT2, 3)], ...
    'VariableNames', {'CrashDT', 'StrtAdd', 'PoCityN', 'CountyN', ...
    'StrtAddFlg', 'PoCityNFlg', 'CountyNFlg'});

fprintf('%s', progblank)
for tt2 = 1:LT2
    fprintf('%s', progerase)
    fprintf('Progress: %6.2f%%', tt2/LT2*50);

    % crash date and time are mandatory in T2
    cdate2 = T2.CrashDate{tt2};
    crasht2 = T2.CrashTime{tt2};
    assert(~isempty(cdate2) && ~isempty(crasht2), 'Major data missing')
    T2B.CrashDT{tt2} = datetime([cdate2, ' ', crasht2], 'InputFormat', dtstr);

    % street: upper-case, abbreviate, split intersections on '/'
    strtaddr2 = T2.RouteName{tt2};
    if ~isempty(strtaddr2)
        strtaddr2 = upper(strtaddr2);
        strtaddr2 = strrep(strtaddr2, 'ROAD', 'RD'); % repeat for HWY ST etc
        strtaddr2 = strsplit(strtaddr2, '/');
        if any(strncmp(strtaddr2, 'I95', 3))
            strtaddr2 = {'I95'};      % collapse any I95 variant (ramp, SB, ...)
        elseif any(strncmp(strtaddr2, 'I495', 4))
            strtaddr2 = {'I495'};
        else
            strtaddr2 = cellfun(@(s) s(isletter(s)), ...
                strtaddr2, 'UniformOutput', false);
        end
        T2B.StrtAdd{tt2} = strtaddr2;
    else
        T2B.StrtAddFlg(tt2) = false;
    end

    pcityn2 = T2.PostalCityName{tt2};
    if ~isempty(pcityn2)
        pcityn2 = upper(pcityn2);
        T2B.PoCityN{tt2} = pcityn2(isletter(pcityn2));
    else
        T2B.PoCityNFlg(tt2) = false;
    end

    countyn2 = T2.County_Name{tt2};
    if ~isempty(countyn2)
        countyn2 = upper(countyn2);
        countyn2 = countyn2(isletter(countyn2));
        T2B.CountyN{tt2} = strrep(countyn2, 'COUNTY', '');
    else
        T2B.CountyNFlg(tt2) = false;
    end
end

for tt1 = 1:LT1
    fprintf('%s', progerase)
    fprintf('Progress: %6.2f%%', tt1/LT1*50+50);

    strtaddr1 = upper(T1.AddressStreet{tt1});
    if ~isempty(strtaddr1)
        strtaddr1 = strrep(strtaddr1, 'ROAD', 'RD');
        strtaddr1 = strsplit(strtaddr1, '/');
        if any(strncmp(strtaddr1, 'I95', 3))
            strtaddr1 = {'I95'};
        elseif any(strncmp(strtaddr1, 'I495', 4))
            strtaddr1 = {'I495'};
        else
            strtaddr1 = cellfun(@(s) s(isletter(s)), ...
                strtaddr1, 'UniformOutput', false);
        end
        T1B.StrtAdd{tt1} = strtaddr1;
    else
        T1B.StrtAddFlg(tt1) = false;
    end

    pcityn1 = upper(T1.AddressCityIncident{tt1});
    if ~isempty(pcityn1)
        T1B.PoCityN{tt1} = pcityn1(isletter(pcityn1));
    else
        T1B.PoCityNFlg(tt1) = false;
    end

    countyn1 = upper(T1.AddressCountyIncident{tt1});
    if ~isempty(countyn1)
        countyn1 = countyn1(isletter(countyn1));
        T1B.CountyN{tt1} = strrep(countyn1, 'COUNTY', '');
    else
        T1B.CountyNFlg(tt1) = false;
    end

    crashdt1u = T1.UnitNotified{tt1};
    crashdt1d = T1.DispatchNotified{tt1};
    if ~isempty(crashdt1u) || ~isempty(crashdt1d)
        % A little dirty: a timestamp that does not parse with dtstr (for
        % example one lacking the time part) disables the time criterion
        % for this row instead of aborting the run.
        try
            if ~isempty(crashdt1u)
                T1B.CrashDTU{tt1} = datetime(crashdt1u, 'InputFormat', dtstr);
            end
            if ~isempty(crashdt1d)
                T1B.CrashDTD{tt1} = datetime(crashdt1d, 'InputFormat', dtstr);
            end
        catch
            T1B.CrashDTFlg(tt1) = false;
        end
    else
        T1B.CrashDTFlg(tt1) = false;
    end
end
fprintf('%s', progerase)
fprintf('Pre-processing finished at %s \n', ...
    datestr(datetime('now')))

% ---------------------------------------------------------------------
% Stage 2: matching
% ---------------------------------------------------------------------
fprintf('Matching started at %s \n', datestr(datetime('now')))

fprintf('%s', progblank)
for tt2 = 1:LT2
    fprintf('%s', progerase)
    fprintf('Progress: %6.2f%%', tt2/LT2*100);

    % extract the T2 row once for all comparisons against T1
    cdate2 = T2.CrashDate{tt2};
    crashdt2 = T2B.CrashDT{tt2};
    strtaddr2 = T2B.StrtAdd{tt2};
    pcityn2 = T2B.PoCityN{tt2};
    countyn2 = T2B.CountyN{tt2};

    for tt1 = 1:LT1
        if augmented(tt1) % match already found, skip
            continue
        end

        % Each criterion rejects the pair only when data is present on
        % both sides and differs; missing data is treated as identical.

        % incident date (always present)
        if ~strcmp(T1.IncidentDate{tt1}, cdate2)
            continue
        end

        % street name: any part of one side equals any part of the other
        if T2B.StrtAddFlg(tt2) && T1B.StrtAddFlg(tt1) && ...
                ~any(strtaddrcmpf(strtaddr2, T1B.StrtAdd{tt1}))
            continue
        end

        % postal city name
        if T2B.PoCityNFlg(tt2) && T1B.PoCityNFlg(tt1) && ...
                ~strcmp(T1B.PoCityN{tt1}, pcityn2)
            continue
        end

        % county name (already normalised during pre-processing)
        if T2B.CountyNFlg(tt2) && T1B.CountyNFlg(tt1) && ...
                ~strcmp(T1B.CountyN{tt1}, countyn2)
            continue
        end

        % notification time: every available timestamp must be in range
        if T1B.CrashDTFlg(tt1)
            crashdt1u = T1B.CrashDTU{tt1};
            crashdt1d = T1B.CrashDTD{tt1};
            tmatch1 = true;
            tmatch2 = true;
            if ~isempty(crashdt1u)
                difcrdt1u = crashdt1u - crashdt2;
                tmatch1 = difcrdt1u >= trange(1) && difcrdt1u <= trange(2);
            end
            if ~isempty(crashdt1d)
                difcrdt1d = crashdt1d - crashdt2;
                tmatch2 = difcrdt1d >= trange(1) && difcrdt1d <= trange(2);
            end
            if ~(tmatch1 && tmatch2)
                continue
            end
        end

        % append row in T2 to T1
        T1{tt1, copycols} = table2cell(T2(tt2, copycols));
        augmented(tt1) = true;
%         break % assume unique matching
    end
end

fprintf('%s', progerase)
fprintf('Matching finished at %s \nTotalling %d matches. \n', ...
    datestr(datetime('now')), sum(augmented))

我从Matlab得到这个消息

警告:变量名已被修改,以使其成为有效的 MATLAB 标识符。

因此,您可能需要根据需要更改表中的列名


这些是从csv文件导入的原始数据集

(obsolete)
示例输出:

(obsolete)
新数据集和输出:

>> T1

T1 = 

    IncidentDate                   AddressStreet                   AddressCityIncident    AddressCountyIncident    AddressState    IncidentPostalCode    DispatchNotified      UnitNotified  
    ____________    ___________________________________________    ___________________    _____________________    ____________    __________________    ________________    ________________

    '1/1/2014'      'BURKE LAKE RD/BURKE RD'                       'BURKE'                'Fairfax County'         'VA'            22015                 '1/1/2014 1:33'     '1/1/2014 1:33' 
    '1/1/2014'      'BURKE LAKE RD/BURKE RD'                       'BURKE'                'Fairfax County'         'VA'            22015                 '1/1/2014 1:33'     '1/1/2014 1:33' 
    '1/1/2014'      'I95 SB TO OLD KEENE MILL RD'                  'SPRINGFIELD'          'Fairfax County'         'VA'            22150                 '1/1/2014 2:00'     '1/1/2014 2:00' 
    '1/1/2014'      'SYDENSTRICKER RD/OLD KEENE MILL RD'           'BURKE'                'Fairfax County'         'VA'            22015                 '1/1/2014 4:54'     '1/1/2014 4:54' 
    '1/1/2014'      'RT28 SB THRU RAMP/RT28 SB RAMP TO RT50 WB'    'CHANTILLY'            'Fairfax County'         'VA'            20151                 '1/1/2014 12:28'    '1/1/2014 12:28'
    '1/1/2014'      '11700 SWARTS DR'                              'FAIRFAX'              'Fairfax County'         'VA'            22030                 '1/1/2014 13:07'    '1/1/2014 13:07'
    '1/1/2014'      '11700 SWARTS DR'                              'FAIRFAX'              'Fairfax County'         'VA'            22030                 '1/1/2014 13:07'    '1/1/2014 13:07'
    '1/1/2014'      'CENTREVILLE RD/BRADENTON DR'                  'CENTREVILLE'          'Fairfax County'         'VA'            20121                 '1/1/2014 13:41'    '1/1/2014 13:41'
    '1/1/2014'      'GEORGETOWN PIKE/CENTRILLION DR'               'MCLEAN'               'Fairfax County'         'VA'            22102                 '1/1/2014 16:45'    '1/1/2014 16:45'
    '1/1/2014'      'GEORGETOWN PIKE/CENTRILLION DR'               'MCLEAN'               'Fairfax County'         'VA'            22102                 '1/1/2014 16:42'    '1/1/2014 16:42'
    '1/1/2014'      '8526 GEORGETOWN PIKE'                         'MCLEAN'               'Fairfax County'         'VA'            22102                 '1/1/2014 16:49'    '1/1/2014 16:49'
    '1/1/2014'      'OX RD/BRADDOCK RD'                            'FAIRFAX'              'Fairfax County'         'VA'            22032                 '1/1/2014 22:32'    '1/1/2014 22:32'

>> T2

T2 = 

    CrashDate       County_Name       City_Name    Town_Name    CrashTime        SecondaryLocation              RouteName         PostalCityName
    __________    ________________    _________    _________    _________    __________________________    ___________________    ______________

    '1/1/2014'    'Fairfax County'    NaN          NaN          '6:35'       ''                            'I95'                  'LORTON'      
    '1/1/2014'    'Fairfax County'    NaN          NaN          '5:19'       ''                            'I95 RAMP'             'SPRINGFIELD' 
    '1/1/2014'    'Fairfax County'    NaN          NaN          '10:23'      ''                            'I495'                 'ANNANDALE'   
    '1/1/2014'    'Fairfax County'    NaN          NaN          '2:08'       ''                            'BUILDERS RD'          'HERNDON'     
    '1/1/2014'    'Fairfax County'    NaN          NaN          '16:42'      ''                            'GEORGETOWN PIKE'      'MCLEAN'      
    '1/1/2014'    'Fairfax County'    NaN          NaN          '20:55'      'LEESBURG PIKE'               'WILSON BLVD'          'FALLS CHURCH'
    '1/1/2014'    'Fairfax County'    NaN          NaN          '4:54'       ''                            'SYDENSTRICKER RD'     'BURKE'       
    '1/1/2014'    'Fairfax County'    NaN          NaN          '2:34'       'BEACON HILL RD'              'RICHMOND HWY'         'ALEXANDRIA'  
    '1/1/2014'    'Fairfax County'    NaN          NaN          '2:00'       ''                            'COAT RIDGE RD'        'HERNDON'     
    '1/1/2014'    'Fairfax County'    NaN          NaN          '13:17'      ''                            'OLD KEENE MILL RD'    'BURKE'       
    '1/1/2014'    'Fairfax County'    NaN          NaN          '5:19'       'MCLEAREN RD'                 'CENTREVILLE RD'       'HERNDON'     
    '1/1/2014'    'Fairfax County'    NaN          NaN          '21:48'      'VIRGINIA CENTER BLVD'        'VADEN DR'             'VIENNA'      
    '1/1/2014'    'Fairfax County'    NaN          NaN          '19:59'      'FAIRFAX COUNTY PKWY RAMP'    'LEE HWY RAMP'         'FAIRFAX'     
    '1/1/2014'    'Fairfax County'    NaN          NaN          '2:36'       ''                            'I95'                  'SPRINGFIELD' 
    '1/1/2014'    'Fairfax County'    NaN          NaN          '20:36'      'MOUNT GILEAD RD'             'BRADDOCK RD'          'CENTREVILLE' 
    '1/1/2014'    'Fairfax County'    NaN          NaN          '1:46'       ''                            'I95'                  'LORTON'      
    '1/1/2014'    'Fairfax County'    NaN          NaN          '18:45'      ''                            'I495'                 'HAMPTON'     
    '1/1/2014'    'Fairfax County'    NaN          NaN          '13:40'      'BRADENTON DR'                'CENTREVILLE RD'       'CENTREVILLE' 
    '1/1/2014'    'Fairfax County'    NaN          NaN          '17:24'      'SHREVE HILL RD'              'IDYLWOOD RD'          'DUNN LORING' 
    '1/1/2014'    'Fairfax County'    NaN          NaN          '17:46'      'SACRAMENTO DR'               'RICHMOND HWY'         'ALEXANDRIA'  
    '1/1/2014'    'Fairfax County'    NaN          NaN          '1:40'       ''                            'WINBOURNE RD'         'BURKE'       
    '1/1/2014'    'Fairfax County'    NaN          NaN          '1:33'       ''                            'BURKE LAKE RD'        'BURKE'       
    '1/1/2014'    'Fairfax County'    NaN          NaN          '15:44'      'TELEGRAPH RD'                'FRANCONIA RD'         'ALEXANDRIA'  
    '1/1/2014'    'Fairfax County'    NaN          NaN          '22:19'      'OX RD'                       'BRADDOCK RD'          'FAIRFAX'     
    '1/1/2014'    'Fairfax County'    NaN          NaN          '12:27'      ''                            'SULLY RD'             'HERNDON'     
    '1/1/2014'    'Fairfax County'    NaN          NaN          '11:25'      'MONUMENT DR'                 'LEE HWY'              'FAIRFAX'     



T1 = 

    IncidentDate                   AddressStreet                   AddressCityIncident    AddressCountyIncident    AddressState    IncidentPostalCode    DispatchNotified      UnitNotified        County_Name       City_Name    Town_Name    CrashTime    SecondaryLocation        RouteName         PostalCityName
    ____________    ___________________________________________    ___________________    _____________________    ____________    __________________    ________________    ________________    ________________    _________    _________    _________    _________________    __________________    ______________

    '1/1/2014'      'BURKE LAKE RD/BURKE RD'                       'BURKE'                'Fairfax County'         'VA'            22015                 '1/1/2014 1:33'     '1/1/2014 1:33'     'Fairfax County'    [NaN]        [NaN]        '1:33'       ''                   'BURKE LAKE RD'       'BURKE'       
    '1/1/2014'      'BURKE LAKE RD/BURKE RD'                       'BURKE'                'Fairfax County'         'VA'            22015                 '1/1/2014 1:33'     '1/1/2014 1:33'     'Fairfax County'    [NaN]        [NaN]        '1:33'       ''                   'BURKE LAKE RD'       'BURKE'       
    '1/1/2014'      'I95 SB TO OLD KEENE MILL RD'                  'SPRINGFIELD'          'Fairfax County'         'VA'            22150                 '1/1/2014 2:00'     '1/1/2014 2:00'     ''                  ''           ''           ''           ''                   ''                    ''            
    '1/1/2014'      'SYDENSTRICKER RD/OLD KEENE MILL RD'           'BURKE'                'Fairfax County'         'VA'            22015                 '1/1/2014 4:54'     '1/1/2014 4:54'     'Fairfax County'    [NaN]        [NaN]        '4:54'       ''                   'SYDENSTRICKER RD'    'BURKE'       
    '1/1/2014'      'RT28 SB THRU RAMP/RT28 SB RAMP TO RT50 WB'    'CHANTILLY'            'Fairfax County'         'VA'            20151                 '1/1/2014 12:28'    '1/1/2014 12:28'    ''                  ''           ''           ''           ''                   ''                    ''            
    '1/1/2014'      '11700 SWARTS DR'                              'FAIRFAX'              'Fairfax County'         'VA'            22030                 '1/1/2014 13:07'    '1/1/2014 13:07'    ''                  ''           ''           ''           ''                   ''                    ''            
    '1/1/2014'      '11700 SWARTS DR'                              'FAIRFAX'              'Fairfax County'         'VA'            22030                 '1/1/2014 13:07'    '1/1/2014 13:07'    ''                  ''           ''           ''           ''                   ''                    ''            
    '1/1/2014'      'CENTREVILLE RD/BRADENTON DR'                  'CENTREVILLE'          'Fairfax County'         'VA'            20121                 '1/1/2014 13:41'    '1/1/2014 13:41'    'Fairfax County'    [NaN]        [NaN]        '13:40'      'BRADENTON DR'       'CENTREVILLE RD'      'CENTREVILLE' 
    '1/1/2014'      'GEORGETOWN PIKE/CENTRILLION DR'               'MCLEAN'               'Fairfax County'         'VA'            22102                 '1/1/2014 16:45'    '1/1/2014 16:45'    'Fairfax County'    [NaN]        [NaN]        '16:42'      ''                   'GEORGETOWN PIKE'     'MCLEAN'      
    '1/1/2014'      'GEORGETOWN PIKE/CENTRILLION DR'               'MCLEAN'               'Fairfax County'         'VA'            22102                 '1/1/2014 16:42'    '1/1/2014 16:42'    'Fairfax County'    [NaN]        [NaN]        '16:42'      ''                   'GEORGETOWN PIKE'     'MCLEAN'      
    '1/1/2014'      '8526 GEORGETOWN PIKE'                         'MCLEAN'               'Fairfax County'         'VA'            22102                 '1/1/2014 16:49'    '1/1/2014 16:49'    'Fairfax County'    [NaN]        [NaN]        '16:42'      ''                   'GEORGETOWN PIKE'     'MCLEAN'      
    '1/1/2014'      'OX RD/BRADDOCK RD'                            'FAIRFAX'              'Fairfax County'         'VA'            22032                 '1/1/2014 22:32'    '1/1/2014 22:32'    'Fairfax County'    [NaN]        [NaN]        '22:19'      'OX RD'              'BRADDOCK RD'         'FAIRFAX'     

建议您从要合并的数据集中包含一些示例数据(10-20条记录),并且所需的SQL输出类型应该足够了。如果您有名称,那么Link King将是执行一些模糊匹配的好方法。有一些替代方案可以检查不同的变量并进行概率匹配。有很多选择,但这不是一个简单而直接的问题,一个查询可以回答,多个SQL连接也可以帮助您实现这一点。如您所见,第二个数据集的最后第二行与第一个数据集的第五行完全匹配。第二个数据集的最后一行与第一个数据集的第六行匹配。(缓冲时间为3分钟)。像这样,我必须将数据集1的50000条记录合并到数据集2的150000条记录。@Yvon在数据集1中,事件日期是唯一一个始终不为空的字段。在50000条记录中,地址街有10条空白,单位通知字段有13条空白。但是,在这种情况下可以使用相邻时间(调度通知)。在数据集2中,崩溃日期和崩溃时间字段始终不是空的。在150000条记录中,路线名称和邮政城市名称中只有7条缺失值。评论不用于进一步讨论;这段对话已经结束。
>> T1

T1 = 

    IncidentDate                   AddressStreet                   AddressCityIncident    AddressCountyIncident    AddressState    IncidentPostalCode    DispatchNotified      UnitNotified  
    ____________    ___________________________________________    ___________________    _____________________    ____________    __________________    ________________    ________________

    '1/1/2014'      'BURKE LAKE RD/BURKE RD'                       'BURKE'                'Fairfax County'         'VA'            22015                 '1/1/2014 1:33'     '1/1/2014 1:33' 
    '1/1/2014'      'BURKE LAKE RD/BURKE RD'                       'BURKE'                'Fairfax County'         'VA'            22015                 '1/1/2014 1:33'     '1/1/2014 1:33' 
    '1/1/2014'      'I95 SB TO OLD KEENE MILL RD'                  'SPRINGFIELD'          'Fairfax County'         'VA'            22150                 '1/1/2014 2:00'     '1/1/2014 2:00' 
    '1/1/2014'      'SYDENSTRICKER RD/OLD KEENE MILL RD'           'BURKE'                'Fairfax County'         'VA'            22015                 '1/1/2014 4:54'     '1/1/2014 4:54' 
    '1/1/2014'      'RT28 SB THRU RAMP/RT28 SB RAMP TO RT50 WB'    'CHANTILLY'            'Fairfax County'         'VA'            20151                 '1/1/2014 12:28'    '1/1/2014 12:28'
    '1/1/2014'      '11700 SWARTS DR'                              'FAIRFAX'              'Fairfax County'         'VA'            22030                 '1/1/2014 13:07'    '1/1/2014 13:07'
    '1/1/2014'      '11700 SWARTS DR'                              'FAIRFAX'              'Fairfax County'         'VA'            22030                 '1/1/2014 13:07'    '1/1/2014 13:07'
    '1/1/2014'      'CENTREVILLE RD/BRADENTON DR'                  'CENTREVILLE'          'Fairfax County'         'VA'            20121                 '1/1/2014 13:41'    '1/1/2014 13:41'
    '1/1/2014'      'GEORGETOWN PIKE/CENTRILLION DR'               'MCLEAN'               'Fairfax County'         'VA'            22102                 '1/1/2014 16:45'    '1/1/2014 16:45'
    '1/1/2014'      'GEORGETOWN PIKE/CENTRILLION DR'               'MCLEAN'               'Fairfax County'         'VA'            22102                 '1/1/2014 16:42'    '1/1/2014 16:42'
    '1/1/2014'      '8526 GEORGETOWN PIKE'                         'MCLEAN'               'Fairfax County'         'VA'            22102                 '1/1/2014 16:49'    '1/1/2014 16:49'
    '1/1/2014'      'OX RD/BRADDOCK RD'                            'FAIRFAX'              'Fairfax County'         'VA'            22032                 '1/1/2014 22:32'    '1/1/2014 22:32'

>> T2

T2 = 

    CrashDate       County_Name       City_Name    Town_Name    CrashTime        SecondaryLocation              RouteName         PostalCityName
    __________    ________________    _________    _________    _________    __________________________    ___________________    ______________

    '1/1/2014'    'Fairfax County'    NaN          NaN          '6:35'       ''                            'I95'                  'LORTON'      
    '1/1/2014'    'Fairfax County'    NaN          NaN          '5:19'       ''                            'I95 RAMP'             'SPRINGFIELD' 
    '1/1/2014'    'Fairfax County'    NaN          NaN          '10:23'      ''                            'I495'                 'ANNANDALE'   
    '1/1/2014'    'Fairfax County'    NaN          NaN          '2:08'       ''                            'BUILDERS RD'          'HERNDON'     
    '1/1/2014'    'Fairfax County'    NaN          NaN          '16:42'      ''                            'GEORGETOWN PIKE'      'MCLEAN'      
    '1/1/2014'    'Fairfax County'    NaN          NaN          '20:55'      'LEESBURG PIKE'               'WILSON BLVD'          'FALLS CHURCH'
    '1/1/2014'    'Fairfax County'    NaN          NaN          '4:54'       ''                            'SYDENSTRICKER RD'     'BURKE'       
    '1/1/2014'    'Fairfax County'    NaN          NaN          '2:34'       'BEACON HILL RD'              'RICHMOND HWY'         'ALEXANDRIA'  
    '1/1/2014'    'Fairfax County'    NaN          NaN          '2:00'       ''                            'COAT RIDGE RD'        'HERNDON'     
    '1/1/2014'    'Fairfax County'    NaN          NaN          '13:17'      ''                            'OLD KEENE MILL RD'    'BURKE'       
    '1/1/2014'    'Fairfax County'    NaN          NaN          '5:19'       'MCLEAREN RD'                 'CENTREVILLE RD'       'HERNDON'     
    '1/1/2014'    'Fairfax County'    NaN          NaN          '21:48'      'VIRGINIA CENTER BLVD'        'VADEN DR'             'VIENNA'      
    '1/1/2014'    'Fairfax County'    NaN          NaN          '19:59'      'FAIRFAX COUNTY PKWY RAMP'    'LEE HWY RAMP'         'FAIRFAX'     
    '1/1/2014'    'Fairfax County'    NaN          NaN          '2:36'       ''                            'I95'                  'SPRINGFIELD' 
    '1/1/2014'    'Fairfax County'    NaN          NaN          '20:36'      'MOUNT GILEAD RD'             'BRADDOCK RD'          'CENTREVILLE' 
    '1/1/2014'    'Fairfax County'    NaN          NaN          '1:46'       ''                            'I95'                  'LORTON'      
    '1/1/2014'    'Fairfax County'    NaN          NaN          '18:45'      ''                            'I495'                 'HAMPTON'     
    '1/1/2014'    'Fairfax County'    NaN          NaN          '13:40'      'BRADENTON DR'                'CENTREVILLE RD'       'CENTREVILLE' 
    '1/1/2014'    'Fairfax County'    NaN          NaN          '17:24'      'SHREVE HILL RD'              'IDYLWOOD RD'          'DUNN LORING' 
    '1/1/2014'    'Fairfax County'    NaN          NaN          '17:46'      'SACRAMENTO DR'               'RICHMOND HWY'         'ALEXANDRIA'  
    '1/1/2014'    'Fairfax County'    NaN          NaN          '1:40'       ''                            'WINBOURNE RD'         'BURKE'       
    '1/1/2014'    'Fairfax County'    NaN          NaN          '1:33'       ''                            'BURKE LAKE RD'        'BURKE'       
    '1/1/2014'    'Fairfax County'    NaN          NaN          '15:44'      'TELEGRAPH RD'                'FRANCONIA RD'         'ALEXANDRIA'  
    '1/1/2014'    'Fairfax County'    NaN          NaN          '22:19'      'OX RD'                       'BRADDOCK RD'          'FAIRFAX'     
    '1/1/2014'    'Fairfax County'    NaN          NaN          '12:27'      ''                            'SULLY RD'             'HERNDON'     
    '1/1/2014'    'Fairfax County'    NaN          NaN          '11:25'      'MONUMENT DR'                 'LEE HWY'              'FAIRFAX'     



T1 = 

    IncidentDate                   AddressStreet                   AddressCityIncident    AddressCountyIncident    AddressState    IncidentPostalCode    DispatchNotified      UnitNotified        County_Name       City_Name    Town_Name    CrashTime    SecondaryLocation        RouteName         PostalCityName
    ____________    ___________________________________________    ___________________    _____________________    ____________    __________________    ________________    ________________    ________________    _________    _________    _________    _________________    __________________    ______________

    '1/1/2014'      'BURKE LAKE RD/BURKE RD'                       'BURKE'                'Fairfax County'         'VA'            22015                 '1/1/2014 1:33'     '1/1/2014 1:33'     'Fairfax County'    [NaN]        [NaN]        '1:33'       ''                   'BURKE LAKE RD'       'BURKE'       
    '1/1/2014'      'BURKE LAKE RD/BURKE RD'                       'BURKE'                'Fairfax County'         'VA'            22015                 '1/1/2014 1:33'     '1/1/2014 1:33'     'Fairfax County'    [NaN]        [NaN]        '1:33'       ''                   'BURKE LAKE RD'       'BURKE'       
    '1/1/2014'      'I95 SB TO OLD KEENE MILL RD'                  'SPRINGFIELD'          'Fairfax County'         'VA'            22150                 '1/1/2014 2:00'     '1/1/2014 2:00'     ''                  ''           ''           ''           ''                   ''                    ''            
    '1/1/2014'      'SYDENSTRICKER RD/OLD KEENE MILL RD'           'BURKE'                'Fairfax County'         'VA'            22015                 '1/1/2014 4:54'     '1/1/2014 4:54'     'Fairfax County'    [NaN]        [NaN]        '4:54'       ''                   'SYDENSTRICKER RD'    'BURKE'       
    '1/1/2014'      'RT28 SB THRU RAMP/RT28 SB RAMP TO RT50 WB'    'CHANTILLY'            'Fairfax County'         'VA'            20151                 '1/1/2014 12:28'    '1/1/2014 12:28'    ''                  ''           ''           ''           ''                   ''                    ''            
    '1/1/2014'      '11700 SWARTS DR'                              'FAIRFAX'              'Fairfax County'         'VA'            22030                 '1/1/2014 13:07'    '1/1/2014 13:07'    ''                  ''           ''           ''           ''                   ''                    ''            
    '1/1/2014'      '11700 SWARTS DR'                              'FAIRFAX'              'Fairfax County'         'VA'            22030                 '1/1/2014 13:07'    '1/1/2014 13:07'    ''                  ''           ''           ''           ''                   ''                    ''            
    '1/1/2014'      'CENTREVILLE RD/BRADENTON DR'                  'CENTREVILLE'          'Fairfax County'         'VA'            20121                 '1/1/2014 13:41'    '1/1/2014 13:41'    'Fairfax County'    [NaN]        [NaN]        '13:40'      'BRADENTON DR'       'CENTREVILLE RD'      'CENTREVILLE' 
    '1/1/2014'      'GEORGETOWN PIKE/CENTRILLION DR'               'MCLEAN'               'Fairfax County'         'VA'            22102                 '1/1/2014 16:45'    '1/1/2014 16:45'    'Fairfax County'    [NaN]        [NaN]        '16:42'      ''                   'GEORGETOWN PIKE'     'MCLEAN'      
    '1/1/2014'      'GEORGETOWN PIKE/CENTRILLION DR'               'MCLEAN'               'Fairfax County'         'VA'            22102                 '1/1/2014 16:42'    '1/1/2014 16:42'    'Fairfax County'    [NaN]        [NaN]        '16:42'      ''                   'GEORGETOWN PIKE'     'MCLEAN'      
    '1/1/2014'      '8526 GEORGETOWN PIKE'                         'MCLEAN'               'Fairfax County'         'VA'            22102                 '1/1/2014 16:49'    '1/1/2014 16:49'    'Fairfax County'    [NaN]        [NaN]        '16:42'      ''                   'GEORGETOWN PIKE'     'MCLEAN'      
    '1/1/2014'      'OX RD/BRADDOCK RD'                            'FAIRFAX'              'Fairfax County'         'VA'            22032                 '1/1/2014 22:32'    '1/1/2014 22:32'    'Fairfax County'    [NaN]        [NaN]        '22:19'      'OX RD'              'BRADDOCK RD'         'FAIRFAX'