Python 合并具有多个变量的2个数据集(不能仅使用相似的变量)
我试图合并两个数据集(150000和50000条记录),每个数据集大约有50个变量,其中一些可能匹配。这两个数据集的一个共有变量是“事件日期”,但我不能仅凭它来合并,因为数据集中大约有300起事件发生在同一日期(按地址、城市、县、邮政编码、通知紧急医疗服务(EMS)的时间细分)。另一个数据集包含事件发生的确切时间、地址、城市、县、邮政编码和一些其他字段,但如果信息未知/未记录,这些字段可能为空。我想创建一个缓冲区来按每个字段连接数据集。例如,首先从事件日期开始(无缺失值),如果它们相同,下一步将检查它们是否发生在同一个县、市等(某些值可能为空)。最后一个比较字段是通知EMS的时间(事件发生后30-60分钟内)。如果所有字段都匹配,则最终缓冲时间为30-60分钟。这将是一个多对一的合并(50000到150000)。什么程序可以让我这样做?有某种代码吗?我添加了数据集()和()的一个片段。预期输出如下所示。使用的代码是:Python 合并具有多个变量的2个数据集(不能仅使用相似的变量),python,mysql,matlab,sas,weka,Python,Mysql,Matlab,Sas,Weka,我试图合并两个数据集(150000和50000条记录),每个数据集大约有50个变量,其中一些可能匹配。这两个数据集的一个共有变量是“事件日期”,但我不能仅凭它来合并,因为数据集中大约有300起事件发生在同一日期(按地址、城市、县、邮政编码、通知紧急医疗服务(EMS)的时间细分)。另一个数据集包含事件发生的确切时间、地址、城市、县、邮政编码和一些其他字段,但如果信息未知/未记录,这些字段可能为空。我想创建一个缓冲区来按每个字段连接数据集。例如,首先从事件日期开始(无缺失值),如果它们相同,下一步将检查
% Merge crash records (T2, dataset2.csv) into EMS incident records (T1,
% dataset1.csv).
%
% For every T2 row, each not-yet-matched T1 row is tested on five criteria:
%   1. incident date equals crash date (exact string comparison)
%   2. street name        (skipped when missing on either side)
%   3. postal city name   (skipped when missing on either side)
%   4. county name        (skipped when missing on either side)
%   5. EMS notification time lies within `trange` after the crash time
%      (each of the two notification fields is skipped when it is empty)
% A skipped criterion counts as a match. When all criteria hold, seven T2
% columns are copied into the T1 row and the row is flagged in `augmented`
% so it is not tested again. Several T1 rows may receive the same T2 row.
%
% NOTE(review): the column names T2.CrashDateTime and
% T1.Date12_DispatchNotified are kept from the original script; the sample
% tables shown elsewhere in this document list them as CrashTime and
% DispatchNotified -- confirm against the actual CSV headers.
T1 = readtable('dataset1.csv');
T2 = readtable('dataset2.csv');
LT1 = size(T1,1);
LT2 = size(T2,1);

% Columns that are appended to T1 (initially empty) and filled from T2.
newcols = {'County_Name', 'City_Name', 'Town_Name', 'CrashTime', ...
    'SecondaryLocation', 'RouteName', 'PostalCityName'};
T1 = [T1, cell2table(repmat({''}, LT1, numel(newcols)), 'VariableNames', newcols)];

augmented = false(LT1,1);               % true once a T1 row has received T2 data
dtstr = 'MM/dd/yyyy HH:mm';             % date/time text format used by both files
trange = duration([0,0,0;1,0,0]);       % allowed notification delay: 0 h .. 1 h

for tt2 = 1:LT2
    % --- normalise the T2 row once, outside the inner loop ---------------
    cdate2 = T2.CrashDate{tt2};
    crasht2 = T2.CrashDateTime{tt2};
    assert(~isempty(cdate2) && ~isempty(crasht2),'Major data missing')
    crashdt2 = datetime([cdate2, ' ', crasht2],'InputFormat',dtstr);

    strtaddr2 = T2.RouteName{tt2};
    if ~isempty(strtaddr2)
        strtaddr2 = upper(strtaddr2);
        strtaddr2 = strrep(strtaddr2,'ROAD','RD');
        strtaddr2 = strtaddr2(isletter(strtaddr2));   % keep letters only
    end

    pcityn2 = T2.PostalCityName{tt2};
    if ~isempty(pcityn2)
        pcityn2 = upper(pcityn2);
        pcityn2 = pcityn2(isletter(pcityn2));
    end

    countyn2 = T2.County_Name{tt2};
    if ~isempty(countyn2)
        countyn2 = countyn2(isletter(countyn2));
        countyn2 = upper(countyn2);
        countyn2 = strrep(countyn2,'COUNTY','');
    end

    for tt1 = 1:LT1
        if augmented(tt1)
            continue                    % this T1 row already has a match
        end
        matchvec = true(5,1);           % missing data is treated as a match

        % 1. incident date
        cdate1 = T1.IncidentDate{tt1};
        matchvec(1) = strcmp(cdate1, cdate2);

        % 2. street name
        strtaddr1 = upper(T1.AddressStreet{tt1});
        if ~isempty(strtaddr2) && ~isempty(strtaddr1)
            strtaddr1 = strrep(strtaddr1,'ROAD','RD');
            strtaddr1 = strtaddr1(isletter(strtaddr1));
            matchvec(2) = strcmp(strtaddr1,strtaddr2);
        end

        % 3. postal city name
        pcityn1 = upper(T1.AddressCityIncident{tt1});
        pcityn1 = pcityn1(isletter(pcityn1));
        if ~isempty(pcityn2) && ~isempty(pcityn1)
            matchvec(3) = strcmp(pcityn1,pcityn2);
        end

        % 4. county name -- strip 'COUNTY' on this side too, otherwise
        %    'FAIRFAXCOUNTY' can never equal the already-stripped 'FAIRFAX'.
        countyn1 = upper(T1.AddressCountyIncident{tt1});
        countyn1 = countyn1(isletter(countyn1));
        countyn1 = strrep(countyn1,'COUNTY','');
        if ~isempty(countyn2) && ~isempty(countyn1)
            matchvec(4) = strcmp(countyn1,countyn2);
        end

        % 5. notification time: every time stamp that is present must fall
        %    inside the allowed window after the crash.
        crashdt1u = T1.UnitNotified{tt1};
        crashdt1d = T1.Date12_DispatchNotified{tt1};
        tmatch = true(2,1);
        if ~isempty(crashdt1u)
            difcrdt1u = datetime(crashdt1u,'InputFormat',dtstr) - crashdt2;
            tmatch(1) = difcrdt1u >= trange(1) && difcrdt1u <= trange(2);
        end
        if ~isempty(crashdt1d)
            difcrdt1d = datetime(crashdt1d,'InputFormat',dtstr) - crashdt2;
            tmatch(2) = difcrdt1d >= trange(1) && difcrdt1d <= trange(2);
        end
        matchvec(5) = all(tmatch);

        if all(matchvec)
            % copy the T2 columns into the matched T1 row
            T1{tt1,newcols} = table2cell( T2(tt2,newcols) );
            augmented(tt1) = true;
        end
    end
end
T1
T1=readtable('dataset1.csv'))
T2=可读性('dataset2.csv'))
LT1=尺寸(T1,1);
LT2=尺寸(T2,1);
T1=[T1,cell2table(repmat({'},LT1,7),'VariableNames',{'country'u Name','City'u Name','Town'u Name','CrashTime','SecondaryLocation','RouteName','PostalCityName}]
增强=假(LT1,1);
dtstr='MM/dd/yyyy HH:MM';
交易=持续时间([0,0,0;1,0,0]);
对于tt2=1:LT2
cdate2=T2.CrashDate{tt2};
crasht2=T2.CrashDateTime{tt2};
assert(~isempty(cdate2)和~isempty(crasht2),“主数据丢失”)
crashdt2=[cdate2',,crasht2];
crashdt2=日期时间(crashdt2,'InputFormat',dtstr);
strtaddr2=T2.RouteName{tt2};
if~isempty(strtaddr2)
strtaddr2=上(strtaddr2);
strtaddr2=strrep(strtaddr2,'ROAD','RD');
strtaddr2=strtaddr2(胰岛(strtaddr2));
结束
pcityn2=T2.PostalCityName{tt2};
if~isempty(pcityn2)
pcityn2=上(pcityn2);
pcityn2=pcityn2(胰岛细胞(pcityn2));
结束
countyn2=T2.县名称{tt2};
if~isempty(countyn2)
countyn2=countyn2(胰岛细胞(countyn2));
countyn2=上限(countyn2);
countyn2=strrep(countyn2,'COUNTY','';
结束
对于tt1=1:LT1
如果增加(tt1)
持续
结束
matchvec=true(5,1);
cdate1=T1.IncidentDate{tt1};
matchvec(1)=strcmp(cdate1,cdate2);
strtaddr1=上(T1.AddressStreet{tt1});
如果~isempty(strtaddr2)&&~isempty(strtaddr1)
strtaddr1=strrep(strtaddr1,'ROAD','RD');
strtaddr1=strtaddr1(胰岛(strtaddr1));
matchvec(2)=strcmp(strtaddr1,strtaddr2);
结束
pcityn1=上限(T1.AddressCityIncident{tt1});
pcityn1=pcityn1(胰岛细胞(pcityn1));
如果~isempty(pcityn2)&&~isempty(pcityn1)
pcityn1=pcityn1(胰岛细胞(pcityn1));
matchvec(3)=strcmp(pcityn1,pcityn2);
结束
countyn1=上限(T1.AddressCountyIncident{tt1});
countyn1=countyn1(胰岛细胞(countyn1));
如果~isempty(countyn2)&&~isempty(countyn1)
countyn1=countyn1(胰岛细胞(countyn1));
matchvec(4)=strcmp(countyn1,countyn2);
结束
crashdt1u=T1.UnitNotified{tt1};
crashdt1d=T1.Date12_DispatchNotified{tt1};
如果~isempty(crashdt1u)| ~isempty(crashdt1d)
t匹配=真(2,1);
如果~isempty(crashdt1u)
crashdt1u=datetime(crashdt1u,'InputFormat',dtstr);
difcrdt1d=crashdt1d-crashdt2;
t匹配=difcrdt1d>=trange(1)和&difcrdt1d
编辑:代码已针对性能进行了优化,以便处理大量数据。
请注意:您的原始数据有许多错误。csv文件中,实际数据的任何位置都不允许出现逗号。某些字符串(在 UnitNotified 通知时间字段中发现了 1 处)不符合预定义的格式。代码中的 try 块只处理了这一种特定情况;如果所有字段都可能存在有缺陷的数据,则应对所有字段都使用 try。所有这些问题都应在合并之前解决。
% Performance-oriented merge of crash records (T2, dataset2.csv) into EMS
% incident records (T1, dataset1.csv).
%
% Pass 1 (pre-processing): every row of T2 and T1 is normalised once and the
%   result is cached in the helper tables T2B / T1B together with flags
%   telling whether each field is usable.
% Pass 2 (matching): for every T2 row, each not-yet-matched T1 row is tested
%   on incident date, street name, postal city, county and notification
%   time. A field that is missing on either side is treated as matching.
%   On a full match seven T2 columns are copied into the T1 row.
clear;clc;close all
T1 = readtable('dataset1.csv');
T2 = readtable('dataset2.csv');
% Work on a subset to estimate the run time; min() keeps this valid for
% tables shorter than the requested subset.
T1 = T1(1:min(1000,size(T1,1)),:);
T2 = T2(1:min(900,size(T2,1)),:);
LT1 = size(T1,1);
LT2 = size(T2,1);

% Columns appended to T1 (initially empty) and filled from T2 on a match.
newcols = {'County_Name', 'City_Name', 'Town_Name', ...
    'CrashTime', 'SecondaryLocation', 'RouteName', 'PostalCityName'};
T1 = [T1, cell2table(repmat({''}, LT1, numel(newcols)), ...
    'VariableNames', newcols)];
augmented = false(LT1,1);               % true once a T1 row has received T2 data
dtstr = 'MM/dd/yyyy HH:mm';
trange = duration([-1,0,0;1,0,0]);      % accepted offset from crash time: -1 h .. +1 h

% strtaddrcmpf(c1,c2): for each street segment in c2, true when it equals
% some segment in c1 (each must contain the other). any() keeps the result
% scalar per c2 element even when c1 holds several segments.
strtaddrcmpf = @(c1,c2) cellfun(@(s2) ...
    any(cellfun(@(s1) ...
        ~(isempty(strfind(s1,s2)) | isempty(strfind(s2,s1))), ...
        c1)), ...
    c2);

% ---------------------------------------------------------------- pass 1
fprintf('Pre-processing started at %s \n', datestr(datetime('now')))
T1B = cell2table([repmat({''}, LT1, 5), repmat({true}, LT1, 4)], ...
    'VariableNames', {'CrashDTU','CrashDTD', ...
    'StrtAdd','PoCityN', 'CountyN', ...
    'CrashDTFlg', 'StrtAddFlg', 'PoCityNFlg', 'CountyNFlg'});
T2B = cell2table([repmat({''}, LT2, 4), repmat({true}, LT2, 3)], ...
    'VariableNames', {'CrashDT', 'StrtAdd', 'PoCityN', 'CountyN', ...
    'StrtAddFlg', 'PoCityNFlg', 'CountyNFlg'});

% nback remembers how many characters the last progress message used so
% that exactly that many are erased before the next one is printed.
nback = fprintf('Progress: ');
for tt2 = 1:LT2
    fprintf('%s',repmat(sprintf('\b'),1,nback))
    nback = fprintf('Progress: %6.2f%%', tt2/LT2*50);

    cdate2 = T2.CrashDate{tt2};
    crasht2 = T2.CrashTime{tt2};
    assert(~isempty(cdate2) && ~isempty(crasht2),'Major data missing')
    T2B.CrashDT{tt2} = datetime([cdate2, ' ', crasht2],'InputFormat',dtstr);

    strtaddr2 = T2.RouteName{tt2};
    if ~isempty(strtaddr2)
        strtaddr2 = upper(strtaddr2);
        strtaddr2 = strrep(strtaddr2,'ROAD','RD'); % repeat for HWY ST etc
        strtaddr2 = strsplit(strtaddr2,'/');
        % Interstates are collapsed to a single canonical token when any
        % segment starts with the interstate name.
        if any(strncmp(strtaddr2,'I95',3))
            strtaddr2 = {'I95'};
        elseif any(strncmp(strtaddr2,'I495',4))
            strtaddr2 = {'I495'};
        else
            strtaddr2 = cellfun(@(s) s(isletter(s)), ...
                strtaddr2, 'Uniform',false);
        end
        T2B.StrtAdd{tt2} = strtaddr2;
    else
        T2B.StrtAddFlg(tt2) = false;
    end

    pcityn2 = T2.PostalCityName{tt2};
    if ~isempty(pcityn2)
        pcityn2 = upper(pcityn2);
        T2B.PoCityN{tt2} = pcityn2(isletter(pcityn2));
    else
        T2B.PoCityNFlg(tt2) = false;
    end

    countyn2 = T2.County_Name{tt2};
    if ~isempty(countyn2)
        countyn2 = upper(countyn2);
        countyn2 = countyn2(isletter(countyn2));
        T2B.CountyN{tt2} = strrep(countyn2,'COUNTY','');
    else
        T2B.CountyNFlg(tt2) = false;
    end
end

for tt1 = 1:LT1
    fprintf('%s',repmat(sprintf('\b'),1,nback))
    nback = fprintf('Progress: %6.2f%%', tt1/LT1*50+50);

    strtaddr1 = upper(T1.AddressStreet{tt1});
    if ~isempty(strtaddr1)
        strtaddr1 = strrep(strtaddr1,'ROAD','RD');
        strtaddr1 = strsplit(strtaddr1,'/');
        if any(strncmp(strtaddr1,'I95',3))
            strtaddr1 = {'I95'};
        elseif any(strncmp(strtaddr1,'I495',4))
            strtaddr1 = {'I495'};
        else
            strtaddr1 = cellfun(@(s) s(isletter(s)), ...
                strtaddr1, 'Uniform',false);
        end
        T1B.StrtAdd{tt1} = strtaddr1;
    else
        T1B.StrtAddFlg(tt1) = false;
    end

    pcityn1 = upper(T1.AddressCityIncident{tt1});
    if ~isempty(pcityn1)
        T1B.PoCityN{tt1} = pcityn1(isletter(pcityn1));
    else
        T1B.PoCityNFlg(tt1) = false;
    end

    countyn1 = upper(T1.AddressCountyIncident{tt1});
    if ~isempty(countyn1)
        countyn1 = countyn1(isletter(countyn1));
        T1B.CountyN{tt1} = strrep(countyn1,'COUNTY','');
    else
        T1B.CountyNFlg(tt1) = false;
    end

    crashdt1u = T1.UnitNotified{tt1};
    crashdt1d = T1.DispatchNotified{tt1};
    if ~isempty(crashdt1u) || ~isempty(crashdt1d)
        % a little dirty here, need both date and time: a string that does
        % not parse with dtstr disables the time check for this row
        try
            if ~isempty(crashdt1u)
                T1B.CrashDTU{tt1} = datetime(crashdt1u,'InputFormat',dtstr);
            end
            if ~isempty(crashdt1d)
                T1B.CrashDTD{tt1} = datetime(crashdt1d,'InputFormat',dtstr);
            end
        catch
            T1B.CrashDTFlg(tt1) = false;
        end
    else
        T1B.CrashDTFlg(tt1) = false;
    end
end
fprintf('%s',repmat(sprintf('\b'),1,nback))
fprintf('Pre-processing finished at %s \n', ...
    datestr(datetime('now')))

% ---------------------------------------------------------------- pass 2
fprintf('Matching started at %s \n', datestr(datetime('now')))
nback = fprintf('Progress: ');
for tt2 = 1:LT2
    fprintf('%s',repmat(sprintf('\b'),1,nback))
    nback = fprintf('Progress: %6.2f%%', tt2/LT2*100);

    % extract the T2 row for comparison; cdate2 must be re-read here,
    % otherwise it still holds the last row from pass 1
    cdate2 = T2.CrashDate{tt2};
    crashdt2 = T2B.CrashDT{tt2};
    strtaddr2 = T2B.StrtAdd{tt2};
    pcityn2 = T2B.PoCityN{tt2};
    countyn2 = T2B.CountyN{tt2};
    hasaddr2 = T2B.StrtAddFlg(tt2);
    hascity2 = T2B.PoCityNFlg(tt2);
    hascnty2 = T2B.CountyNFlg(tt2);

    for tt1 = 1:LT1
        if augmented(tt1) % match already found, skip
            continue
        end
        % Boolean comparison: treat missing data as identical, i.e. a
        % criterion is only evaluated when both sides carry the field.

        % incident date
        if ~strcmp(T1.IncidentDate{tt1}, cdate2)
            continue
        end
        % street name
        if hasaddr2 && T1B.StrtAddFlg(tt1) % put 2 first: faster
            if ~any(strtaddrcmpf(strtaddr2, T1B.StrtAdd{tt1}))
                continue
            end
        end
        % postal city name
        if hascity2 && T1B.PoCityNFlg(tt1)
            if ~strcmp(T1B.PoCityN{tt1}, pcityn2)
                continue
            end
        end
        % county name (already normalised in pass 1)
        if hascnty2 && T1B.CountyNFlg(tt1)
            if ~strcmp(T1B.CountyN{tt1}, countyn2)
                continue
            end
        end
        % notification time: every time stamp that is present must lie
        % inside the accepted window around the crash time
        if T1B.CrashDTFlg(tt1)
            crashdt1u = T1B.CrashDTU{tt1};
            crashdt1d = T1B.CrashDTD{tt1};
            tmatch1 = true;
            tmatch2 = true;
            if ~isempty(crashdt1u)
                difcrdt1u = crashdt1u-crashdt2;
                tmatch1 = difcrdt1u >= trange(1) && difcrdt1u <= trange(2);
            end
            if ~isempty(crashdt1d)
                difcrdt1d = crashdt1d-crashdt2;
                tmatch2 = difcrdt1d >= trange(1) && difcrdt1d <= trange(2);
            end
            if ~(tmatch1 && tmatch2)
                continue
            end
        end

        % append row in T2 to T1
        T1{tt1,newcols} = table2cell( T2(tt2,newcols) );
        augmented(tt1) = true;
        % break % assume unique matching
    end
end
fprintf('%s',repmat(sprintf('\b'),1,nback))
fprintf('Matching finished at %s \nTotalling %d matches. \n', ...
    datestr(datetime('now')), sum(augmented))
我从 MATLAB 得到了这条消息:
警告:变量名已被修改,以使其成为有效的 MATLAB 标识符。
因此,您可能需要根据实际情况修改代码中使用的表列名。
这些是从csv文件导入的原始数据集
(obsolete)
示例输出:
(obsolete)
新数据集和输出:
>> T1
T1 =
IncidentDate AddressStreet AddressCityIncident AddressCountyIncident AddressState IncidentPostalCode DispatchNotified UnitNotified
____________ ___________________________________________ ___________________ _____________________ ____________ __________________ ________________ ________________
'1/1/2014' 'BURKE LAKE RD/BURKE RD' 'BURKE' 'Fairfax County' 'VA' 22015 '1/1/2014 1:33' '1/1/2014 1:33'
'1/1/2014' 'BURKE LAKE RD/BURKE RD' 'BURKE' 'Fairfax County' 'VA' 22015 '1/1/2014 1:33' '1/1/2014 1:33'
'1/1/2014' 'I95 SB TO OLD KEENE MILL RD' 'SPRINGFIELD' 'Fairfax County' 'VA' 22150 '1/1/2014 2:00' '1/1/2014 2:00'
'1/1/2014' 'SYDENSTRICKER RD/OLD KEENE MILL RD' 'BURKE' 'Fairfax County' 'VA' 22015 '1/1/2014 4:54' '1/1/2014 4:54'
'1/1/2014' 'RT28 SB THRU RAMP/RT28 SB RAMP TO RT50 WB' 'CHANTILLY' 'Fairfax County' 'VA' 20151 '1/1/2014 12:28' '1/1/2014 12:28'
'1/1/2014' '11700 SWARTS DR' 'FAIRFAX' 'Fairfax County' 'VA' 22030 '1/1/2014 13:07' '1/1/2014 13:07'
'1/1/2014' '11700 SWARTS DR' 'FAIRFAX' 'Fairfax County' 'VA' 22030 '1/1/2014 13:07' '1/1/2014 13:07'
'1/1/2014' 'CENTREVILLE RD/BRADENTON DR' 'CENTREVILLE' 'Fairfax County' 'VA' 20121 '1/1/2014 13:41' '1/1/2014 13:41'
'1/1/2014' 'GEORGETOWN PIKE/CENTRILLION DR' 'MCLEAN' 'Fairfax County' 'VA' 22102 '1/1/2014 16:45' '1/1/2014 16:45'
'1/1/2014' 'GEORGETOWN PIKE/CENTRILLION DR' 'MCLEAN' 'Fairfax County' 'VA' 22102 '1/1/2014 16:42' '1/1/2014 16:42'
'1/1/2014' '8526 GEORGETOWN PIKE' 'MCLEAN' 'Fairfax County' 'VA' 22102 '1/1/2014 16:49' '1/1/2014 16:49'
'1/1/2014' 'OX RD/BRADDOCK RD' 'FAIRFAX' 'Fairfax County' 'VA' 22032 '1/1/2014 22:32' '1/1/2014 22:32'
>> T2
T2 =
CrashDate County_Name City_Name Town_Name CrashTime SecondaryLocation RouteName PostalCityName
__________ ________________ _________ _________ _________ __________________________ ___________________ ______________
'1/1/2014' 'Fairfax County' NaN NaN '6:35' '' 'I95' 'LORTON'
'1/1/2014' 'Fairfax County' NaN NaN '5:19' '' 'I95 RAMP' 'SPRINGFIELD'
'1/1/2014' 'Fairfax County' NaN NaN '10:23' '' 'I495' 'ANNANDALE'
'1/1/2014' 'Fairfax County' NaN NaN '2:08' '' 'BUILDERS RD' 'HERNDON'
'1/1/2014' 'Fairfax County' NaN NaN '16:42' '' 'GEORGETOWN PIKE' 'MCLEAN'
'1/1/2014' 'Fairfax County' NaN NaN '20:55' 'LEESBURG PIKE' 'WILSON BLVD' 'FALLS CHURCH'
'1/1/2014' 'Fairfax County' NaN NaN '4:54' '' 'SYDENSTRICKER RD' 'BURKE'
'1/1/2014' 'Fairfax County' NaN NaN '2:34' 'BEACON HILL RD' 'RICHMOND HWY' 'ALEXANDRIA'
'1/1/2014' 'Fairfax County' NaN NaN '2:00' '' 'COAT RIDGE RD' 'HERNDON'
'1/1/2014' 'Fairfax County' NaN NaN '13:17' '' 'OLD KEENE MILL RD' 'BURKE'
'1/1/2014' 'Fairfax County' NaN NaN '5:19' 'MCLEAREN RD' 'CENTREVILLE RD' 'HERNDON'
'1/1/2014' 'Fairfax County' NaN NaN '21:48' 'VIRGINIA CENTER BLVD' 'VADEN DR' 'VIENNA'
'1/1/2014' 'Fairfax County' NaN NaN '19:59' 'FAIRFAX COUNTY PKWY RAMP' 'LEE HWY RAMP' 'FAIRFAX'
'1/1/2014' 'Fairfax County' NaN NaN '2:36' '' 'I95' 'SPRINGFIELD'
'1/1/2014' 'Fairfax County' NaN NaN '20:36' 'MOUNT GILEAD RD' 'BRADDOCK RD' 'CENTREVILLE'
'1/1/2014' 'Fairfax County' NaN NaN '1:46' '' 'I95' 'LORTON'
'1/1/2014' 'Fairfax County' NaN NaN '18:45' '' 'I495' 'HAMPTON'
'1/1/2014' 'Fairfax County' NaN NaN '13:40' 'BRADENTON DR' 'CENTREVILLE RD' 'CENTREVILLE'
'1/1/2014' 'Fairfax County' NaN NaN '17:24' 'SHREVE HILL RD' 'IDYLWOOD RD' 'DUNN LORING'
'1/1/2014' 'Fairfax County' NaN NaN '17:46' 'SACRAMENTO DR' 'RICHMOND HWY' 'ALEXANDRIA'
'1/1/2014' 'Fairfax County' NaN NaN '1:40' '' 'WINBOURNE RD' 'BURKE'
'1/1/2014' 'Fairfax County' NaN NaN '1:33' '' 'BURKE LAKE RD' 'BURKE'
'1/1/2014' 'Fairfax County' NaN NaN '15:44' 'TELEGRAPH RD' 'FRANCONIA RD' 'ALEXANDRIA'
'1/1/2014' 'Fairfax County' NaN NaN '22:19' 'OX RD' 'BRADDOCK RD' 'FAIRFAX'
'1/1/2014' 'Fairfax County' NaN NaN '12:27' '' 'SULLY RD' 'HERNDON'
'1/1/2014' 'Fairfax County' NaN NaN '11:25' 'MONUMENT DR' 'LEE HWY' 'FAIRFAX'
T1 =
IncidentDate AddressStreet AddressCityIncident AddressCountyIncident AddressState IncidentPostalCode DispatchNotified UnitNotified County_Name City_Name Town_Name CrashTime SecondaryLocation RouteName PostalCityName
____________ ___________________________________________ ___________________ _____________________ ____________ __________________ ________________ ________________ ________________ _________ _________ _________ _________________ __________________ ______________
'1/1/2014' 'BURKE LAKE RD/BURKE RD' 'BURKE' 'Fairfax County' 'VA' 22015 '1/1/2014 1:33' '1/1/2014 1:33' 'Fairfax County' [NaN] [NaN] '1:33' '' 'BURKE LAKE RD' 'BURKE'
'1/1/2014' 'BURKE LAKE RD/BURKE RD' 'BURKE' 'Fairfax County' 'VA' 22015 '1/1/2014 1:33' '1/1/2014 1:33' 'Fairfax County' [NaN] [NaN] '1:33' '' 'BURKE LAKE RD' 'BURKE'
'1/1/2014' 'I95 SB TO OLD KEENE MILL RD' 'SPRINGFIELD' 'Fairfax County' 'VA' 22150 '1/1/2014 2:00' '1/1/2014 2:00' '' '' '' '' '' '' ''
'1/1/2014' 'SYDENSTRICKER RD/OLD KEENE MILL RD' 'BURKE' 'Fairfax County' 'VA' 22015 '1/1/2014 4:54' '1/1/2014 4:54' 'Fairfax County' [NaN] [NaN] '4:54' '' 'SYDENSTRICKER RD' 'BURKE'
'1/1/2014' 'RT28 SB THRU RAMP/RT28 SB RAMP TO RT50 WB' 'CHANTILLY' 'Fairfax County' 'VA' 20151 '1/1/2014 12:28' '1/1/2014 12:28' '' '' '' '' '' '' ''
'1/1/2014' '11700 SWARTS DR' 'FAIRFAX' 'Fairfax County' 'VA' 22030 '1/1/2014 13:07' '1/1/2014 13:07' '' '' '' '' '' '' ''
'1/1/2014' '11700 SWARTS DR' 'FAIRFAX' 'Fairfax County' 'VA' 22030 '1/1/2014 13:07' '1/1/2014 13:07' '' '' '' '' '' '' ''
'1/1/2014' 'CENTREVILLE RD/BRADENTON DR' 'CENTREVILLE' 'Fairfax County' 'VA' 20121 '1/1/2014 13:41' '1/1/2014 13:41' 'Fairfax County' [NaN] [NaN] '13:40' 'BRADENTON DR' 'CENTREVILLE RD' 'CENTREVILLE'
'1/1/2014' 'GEORGETOWN PIKE/CENTRILLION DR' 'MCLEAN' 'Fairfax County' 'VA' 22102 '1/1/2014 16:45' '1/1/2014 16:45' 'Fairfax County' [NaN] [NaN] '16:42' '' 'GEORGETOWN PIKE' 'MCLEAN'
'1/1/2014' 'GEORGETOWN PIKE/CENTRILLION DR' 'MCLEAN' 'Fairfax County' 'VA' 22102 '1/1/2014 16:42' '1/1/2014 16:42' 'Fairfax County' [NaN] [NaN] '16:42' '' 'GEORGETOWN PIKE' 'MCLEAN'
'1/1/2014' '8526 GEORGETOWN PIKE' 'MCLEAN' 'Fairfax County' 'VA' 22102 '1/1/2014 16:49' '1/1/2014 16:49' 'Fairfax County' [NaN] [NaN] '16:42' '' 'GEORGETOWN PIKE' 'MCLEAN'
'1/1/2014' 'OX RD/BRADDOCK RD' 'FAIRFAX' 'Fairfax County' 'VA' 22032 '1/1/2014 22:32' '1/1/2014 22:32' 'Fairfax County' [NaN] [NaN] '22:19' 'OX RD' 'BRADDOCK RD' 'FAIRFAX'
建议您从要合并的数据集中包含一些示例数据(10-20条记录),并且所需的SQL输出类型应该足够了。如果您有名称,那么Link King将是执行一些模糊匹配的好方法。有一些替代方案可以检查不同的变量并进行概率匹配。有很多选择,但这不是一个简单而直接的问题,一个查询可以回答,多个SQL连接也可以帮助您实现这一点。如您所见,第二个数据集的最后第二行与第一个数据集的第五行完全匹配。第二个数据集的最后一行与第一个数据集的第六行匹配。(缓冲时间为3分钟)。像这样,我必须将数据集1的50000条记录合并到数据集2的150000条记录。@Yvon在数据集1中,事件日期是唯一一个始终不为空的字段。在50000条记录中,地址街有10条空白,单位通知字段有13条空白。但是,在这种情况下可以使用相邻时间(调度通知)。在数据集2中,崩溃日期和崩溃时间字段始终不是空的。在150000条记录中,路线名称和邮政城市名称中只有7条缺失值。评论不用于进一步讨论;这段对话已经结束。
>> T1
T1 =
IncidentDate AddressStreet AddressCityIncident AddressCountyIncident AddressState IncidentPostalCode DispatchNotified UnitNotified
____________ ___________________________________________ ___________________ _____________________ ____________ __________________ ________________ ________________
'1/1/2014' 'BURKE LAKE RD/BURKE RD' 'BURKE' 'Fairfax County' 'VA' 22015 '1/1/2014 1:33' '1/1/2014 1:33'
'1/1/2014' 'BURKE LAKE RD/BURKE RD' 'BURKE' 'Fairfax County' 'VA' 22015 '1/1/2014 1:33' '1/1/2014 1:33'
'1/1/2014' 'I95 SB TO OLD KEENE MILL RD' 'SPRINGFIELD' 'Fairfax County' 'VA' 22150 '1/1/2014 2:00' '1/1/2014 2:00'
'1/1/2014' 'SYDENSTRICKER RD/OLD KEENE MILL RD' 'BURKE' 'Fairfax County' 'VA' 22015 '1/1/2014 4:54' '1/1/2014 4:54'
'1/1/2014' 'RT28 SB THRU RAMP/RT28 SB RAMP TO RT50 WB' 'CHANTILLY' 'Fairfax County' 'VA' 20151 '1/1/2014 12:28' '1/1/2014 12:28'
'1/1/2014' '11700 SWARTS DR' 'FAIRFAX' 'Fairfax County' 'VA' 22030 '1/1/2014 13:07' '1/1/2014 13:07'
'1/1/2014' '11700 SWARTS DR' 'FAIRFAX' 'Fairfax County' 'VA' 22030 '1/1/2014 13:07' '1/1/2014 13:07'
'1/1/2014' 'CENTREVILLE RD/BRADENTON DR' 'CENTREVILLE' 'Fairfax County' 'VA' 20121 '1/1/2014 13:41' '1/1/2014 13:41'
'1/1/2014' 'GEORGETOWN PIKE/CENTRILLION DR' 'MCLEAN' 'Fairfax County' 'VA' 22102 '1/1/2014 16:45' '1/1/2014 16:45'
'1/1/2014' 'GEORGETOWN PIKE/CENTRILLION DR' 'MCLEAN' 'Fairfax County' 'VA' 22102 '1/1/2014 16:42' '1/1/2014 16:42'
'1/1/2014' '8526 GEORGETOWN PIKE' 'MCLEAN' 'Fairfax County' 'VA' 22102 '1/1/2014 16:49' '1/1/2014 16:49'
'1/1/2014' 'OX RD/BRADDOCK RD' 'FAIRFAX' 'Fairfax County' 'VA' 22032 '1/1/2014 22:32' '1/1/2014 22:32'
>> T2
T2 =
CrashDate County_Name City_Name Town_Name CrashTime SecondaryLocation RouteName PostalCityName
__________ ________________ _________ _________ _________ __________________________ ___________________ ______________
'1/1/2014' 'Fairfax County' NaN NaN '6:35' '' 'I95' 'LORTON'
'1/1/2014' 'Fairfax County' NaN NaN '5:19' '' 'I95 RAMP' 'SPRINGFIELD'
'1/1/2014' 'Fairfax County' NaN NaN '10:23' '' 'I495' 'ANNANDALE'
'1/1/2014' 'Fairfax County' NaN NaN '2:08' '' 'BUILDERS RD' 'HERNDON'
'1/1/2014' 'Fairfax County' NaN NaN '16:42' '' 'GEORGETOWN PIKE' 'MCLEAN'
'1/1/2014' 'Fairfax County' NaN NaN '20:55' 'LEESBURG PIKE' 'WILSON BLVD' 'FALLS CHURCH'
'1/1/2014' 'Fairfax County' NaN NaN '4:54' '' 'SYDENSTRICKER RD' 'BURKE'
'1/1/2014' 'Fairfax County' NaN NaN '2:34' 'BEACON HILL RD' 'RICHMOND HWY' 'ALEXANDRIA'
'1/1/2014' 'Fairfax County' NaN NaN '2:00' '' 'COAT RIDGE RD' 'HERNDON'
'1/1/2014' 'Fairfax County' NaN NaN '13:17' '' 'OLD KEENE MILL RD' 'BURKE'
'1/1/2014' 'Fairfax County' NaN NaN '5:19' 'MCLEAREN RD' 'CENTREVILLE RD' 'HERNDON'
'1/1/2014' 'Fairfax County' NaN NaN '21:48' 'VIRGINIA CENTER BLVD' 'VADEN DR' 'VIENNA'
'1/1/2014' 'Fairfax County' NaN NaN '19:59' 'FAIRFAX COUNTY PKWY RAMP' 'LEE HWY RAMP' 'FAIRFAX'
'1/1/2014' 'Fairfax County' NaN NaN '2:36' '' 'I95' 'SPRINGFIELD'
'1/1/2014' 'Fairfax County' NaN NaN '20:36' 'MOUNT GILEAD RD' 'BRADDOCK RD' 'CENTREVILLE'
'1/1/2014' 'Fairfax County' NaN NaN '1:46' '' 'I95' 'LORTON'
'1/1/2014' 'Fairfax County' NaN NaN '18:45' '' 'I495' 'HAMPTON'
'1/1/2014' 'Fairfax County' NaN NaN '13:40' 'BRADENTON DR' 'CENTREVILLE RD' 'CENTREVILLE'
'1/1/2014' 'Fairfax County' NaN NaN '17:24' 'SHREVE HILL RD' 'IDYLWOOD RD' 'DUNN LORING'
'1/1/2014' 'Fairfax County' NaN NaN '17:46' 'SACRAMENTO DR' 'RICHMOND HWY' 'ALEXANDRIA'
'1/1/2014' 'Fairfax County' NaN NaN '1:40' '' 'WINBOURNE RD' 'BURKE'
'1/1/2014' 'Fairfax County' NaN NaN '1:33' '' 'BURKE LAKE RD' 'BURKE'
'1/1/2014' 'Fairfax County' NaN NaN '15:44' 'TELEGRAPH RD' 'FRANCONIA RD' 'ALEXANDRIA'
'1/1/2014' 'Fairfax County' NaN NaN '22:19' 'OX RD' 'BRADDOCK RD' 'FAIRFAX'
'1/1/2014' 'Fairfax County' NaN NaN '12:27' '' 'SULLY RD' 'HERNDON'
'1/1/2014' 'Fairfax County' NaN NaN '11:25' 'MONUMENT DR' 'LEE HWY' 'FAIRFAX'
T1 =
IncidentDate AddressStreet AddressCityIncident AddressCountyIncident AddressState IncidentPostalCode DispatchNotified UnitNotified County_Name City_Name Town_Name CrashTime SecondaryLocation RouteName PostalCityName
____________ ___________________________________________ ___________________ _____________________ ____________ __________________ ________________ ________________ ________________ _________ _________ _________ _________________ __________________ ______________
'1/1/2014' 'BURKE LAKE RD/BURKE RD' 'BURKE' 'Fairfax County' 'VA' 22015 '1/1/2014 1:33' '1/1/2014 1:33' 'Fairfax County' [NaN] [NaN] '1:33' '' 'BURKE LAKE RD' 'BURKE'
'1/1/2014' 'BURKE LAKE RD/BURKE RD' 'BURKE' 'Fairfax County' 'VA' 22015 '1/1/2014 1:33' '1/1/2014 1:33' 'Fairfax County' [NaN] [NaN] '1:33' '' 'BURKE LAKE RD' 'BURKE'
'1/1/2014' 'I95 SB TO OLD KEENE MILL RD' 'SPRINGFIELD' 'Fairfax County' 'VA' 22150 '1/1/2014 2:00' '1/1/2014 2:00' '' '' '' '' '' '' ''
'1/1/2014' 'SYDENSTRICKER RD/OLD KEENE MILL RD' 'BURKE' 'Fairfax County' 'VA' 22015 '1/1/2014 4:54' '1/1/2014 4:54' 'Fairfax County' [NaN] [NaN] '4:54' '' 'SYDENSTRICKER RD' 'BURKE'
'1/1/2014' 'RT28 SB THRU RAMP/RT28 SB RAMP TO RT50 WB' 'CHANTILLY' 'Fairfax County' 'VA' 20151 '1/1/2014 12:28' '1/1/2014 12:28' '' '' '' '' '' '' ''
'1/1/2014' '11700 SWARTS DR' 'FAIRFAX' 'Fairfax County' 'VA' 22030 '1/1/2014 13:07' '1/1/2014 13:07' '' '' '' '' '' '' ''
'1/1/2014' '11700 SWARTS DR' 'FAIRFAX' 'Fairfax County' 'VA' 22030 '1/1/2014 13:07' '1/1/2014 13:07' '' '' '' '' '' '' ''
'1/1/2014' 'CENTREVILLE RD/BRADENTON DR' 'CENTREVILLE' 'Fairfax County' 'VA' 20121 '1/1/2014 13:41' '1/1/2014 13:41' 'Fairfax County' [NaN] [NaN] '13:40' 'BRADENTON DR' 'CENTREVILLE RD' 'CENTREVILLE'
'1/1/2014' 'GEORGETOWN PIKE/CENTRILLION DR' 'MCLEAN' 'Fairfax County' 'VA' 22102 '1/1/2014 16:45' '1/1/2014 16:45' 'Fairfax County' [NaN] [NaN] '16:42' '' 'GEORGETOWN PIKE' 'MCLEAN'
'1/1/2014' 'GEORGETOWN PIKE/CENTRILLION DR' 'MCLEAN' 'Fairfax County' 'VA' 22102 '1/1/2014 16:42' '1/1/2014 16:42' 'Fairfax County' [NaN] [NaN] '16:42' '' 'GEORGETOWN PIKE' 'MCLEAN'
'1/1/2014' '8526 GEORGETOWN PIKE' 'MCLEAN' 'Fairfax County' 'VA' 22102 '1/1/2014 16:49' '1/1/2014 16:49' 'Fairfax County' [NaN] [NaN] '16:42' '' 'GEORGETOWN PIKE' 'MCLEAN'
'1/1/2014' 'OX RD/BRADDOCK RD' 'FAIRFAX' 'Fairfax County' 'VA' 22032 '1/1/2014 22:32' '1/1/2014 22:32' 'Fairfax County' [NaN] [NaN] '22:19' 'OX RD' 'BRADDOCK RD' 'FAIRFAX'