Warning: file_get_contents(/data/phpspider/zhask/data//catemap/8/variables/2.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
R:pmatch:';特蕾莎·德尔卡';与'不匹配;特里萨·德尔卡门';_R_Dataframe_String Matching_Bigdata - Fatal编程技术网

R：pmatch：'TERESA DEL CA' 与 'TERESA DEL#CARMEN' 不匹配

R:pmatch:';特蕾莎·德尔卡';与'不匹配;特里萨·德尔卡门';,r,dataframe,string-matching,bigdata,R,Dataframe,String Matching,Bigdata,我有两个数据帧,数据几乎相同: 考生有29260条观察结果和以下列名: 父。名,母。名,名。名,申请。编号 和该国的每个学生共有12000000个观察结果,列名如下: 父。名,母。名,名。名,申请。编号 Test.Takers$Application.Number用NA值填充,我想用Application.Number填充该字段,中的每个.Student.in.The.Country 我试图通过将中的每个.Student.In.the.Country中的父名和母名进行子集来实现这一点。然后,我将

我有两个数据帧,数据几乎相同:

Test.Takers 有 29260 条观测记录，列名如下：

Paternal.Name、Maternal.Name、First.Name、Application.Number

Every.Student.In.The.Country 共有 12000000 条观测记录，列名如下：

Paternal.Name、Maternal.Name、First.Name、Application.Number

Test.Takers$Application.Number 目前全部为 NA 值，我想用 Every.Student.In.The.Country 中对应的 Application.Number 来填充该字段。

我试图先按父姓（Paternal.Name）和母姓（Maternal.Name）对 Every.Student.In.The.Country 取子集来实现这一点，然后用以下代码填写 Test.Takers$Application.Number：

# For row i: partially match the test taker's first name (via pmatch) against
# the first names in `subset`, then copy the Application.Number found at the
# matched position into Test.Takers. `subset` and `i` are not defined in this
# snippet -- presumably `subset` is the surname-filtered slice of
# Every.Student.In.The.Country and `i` a loop index; confirm against the caller.
# NOTE(review): this line reads `Test.Taker` and `First.Names`, while the full
# example on this page uses `Test.Takers` and `First.Name` -- verify which
# names are intended.
Test.Takers$Application.Number[i] <- subset$Application.Number[pmatch(as.character(Test.Taker$First.Name[i]), subset$First.Names)]
Test.Takers$Application.Number[i]如果#符号是唯一可以添加重复项的问题,则在函数中ok=TRUE

Test.Takers$Application.Number[i] 欢迎来到 StackOverflow。请看一下这些关于如何制作最小可复现示例的技巧，以及关于这一主题的这篇文章。也许下面的提问建议也值得一读。好的，gsub 的方法对示例代码有效。我会在更大的数据帧上检查它。代码需要很长时间才能运行，但我应该能够在明天告诉您结果。谢谢！gsub 的方法有效，但只找回了 1/6 的缺失观察结果。我必须继续想办法解决这个问题……你可能还有其他特殊情况。您也可以尝试同时使用 gsub 和 dup=T（即 duplicates.ok = TRUE）。对我来说唯一明显的情况是名称中带有“#”。事实证明，这类名称只占问题的 1/6 :( 我将不得不查看其他有问题的模式。尽管如此，感谢您在这方面的帮助。用 gsub 处理“#”帮助我找到了大约 2000 人的申请编号。
# Example data: the people whose Application.Number is unknown (all NA) ...
Test.Takers <- data.frame(
    Paternal.Name = c('Last', 'Last', 'Paternal'),
    Maternal.Name = c('Maternal', 'Last', 'Last'),
    First.Name = c('First', 'Name', 'TERESA DEL CA'),
    Application.Number = NA)

# ... and the reference table that carries the known application numbers.
Every.Student.In.The.Country <- data.frame(
    Paternal.Name = c('Last', 'Last', 'Last', 'Paternal', 'Paternal', 'Paternal'),
    Maternal.Name = c('Maternal', 'Last', 'Maternal', 'Last', 'Maternal', 'Last'),
    First.Name = c('First', 'Name', 'Whatever', 'TERESA DEL#CARMEN', 'Another', 'Something Else'),
    Application.Number = c(123, 456, 789, 234, 567, 890)
)

#' Normalise first names before partial matching.
#'
#' Some records use '#' where a space belongs (e.g. 'TERESA DEL#CARMEN'),
#' which stops 'TERESA DEL CA' from being a prefix of the stored name and so
#' makes pmatch() return NA. Replacing '#' with a space restores the match.
#'
#' @param x A character vector or factor of first names.
#' @return A character vector the same length as `x`; NA stays NA.
normalize_first_name <- function(x) {
    gsub("#", " ", as.character(x), fixed = TRUE)
}

#' Fill in missing application numbers by matching names.
#'
#' For every row of `takers`, the candidates are the rows of `students` with
#' exactly the same Paternal.Name and Maternal.Name. Within those candidates
#' the first name is matched with pmatch() (an exact match wins, otherwise a
#' unique partial/prefix match). When a match is found its Application.Number
#' is copied into `takers`; otherwise the existing value is left untouched.
#'
#' @param takers Data frame with columns Paternal.Name, Maternal.Name,
#'   First.Name and Application.Number.
#' @param students Data frame with the same four columns, holding the known
#'   application numbers.
#' @return `takers` with Application.Number filled in where a match exists.
fill_application_numbers <- function(takers, students) {
    # Separator that should not occur inside a real name, so that the pasted
    # "paternal + maternal" key is unambiguous.
    key_sep <- "\037"

    student_paternal <- as.character(students$Paternal.Name)
    student_maternal <- as.character(students$Maternal.Name)
    student_first <- normalize_first_name(students$First.Name)

    # Index the reference table ONCE by surname pair instead of re-subsetting
    # the whole table inside the loop. Rows with a missing name can never be
    # matched reliably (pmatch() treats NA as the string "NA"), so drop them.
    usable <- which(
        !is.na(student_paternal) & !is.na(student_maternal) & !is.na(student_first)
    )
    rows_by_surnames <- split(
        usable,
        paste(student_paternal[usable], student_maternal[usable], sep = key_sep)
    )

    taker_paternal <- as.character(takers$Paternal.Name)
    taker_maternal <- as.character(takers$Maternal.Name)
    taker_first <- normalize_first_name(takers$First.Name)
    taker_key <- paste(taker_paternal, taker_maternal, sep = key_sep)

    # Work on a plain vector; assigning into the data frame on every iteration
    # would copy it each time.
    numbers <- takers$Application.Number

    # seq_len() keeps the loop from running when `takers` has zero rows
    # (1:nrow() would iterate over c(1, 0)).
    for (i in seq_len(nrow(takers))) {
        if (is.na(taker_paternal[i]) || is.na(taker_maternal[i]) ||
            is.na(taker_first[i])) {
            next
        }

        # NULL when nobody in `students` shares this surname pair.
        candidates <- rows_by_surnames[[taker_key[i]]]
        if (is.null(candidates)) {
            next
        }

        # Position of the matched first name within `candidates`, or NA when
        # there is no match or the partial match is ambiguous.
        hit <- pmatch(taker_first[i], student_first[candidates])
        if (!is.na(hit)) {
            numbers[i] <- students$Application.Number[candidates[hit]]
        }
    }

    takers$Application.Number <- numbers
    takers
}

Test.Takers <- fill_application_numbers(Test.Takers, Every.Student.In.The.Country)