R函数不断返回空数据帧
我正在尝试编写一个R函数,它通过循环给定的数据帧对其进行过滤。数据框中的数据由伦敦地铁两条线路之间的旅行信息组成,我想去掉最上面的百分比。下面是输入数据的str()函数的输出:R函数不断返回空数据帧,r,R,我正在尝试编写一个R函数,它通过循环给定的数据帧对其进行过滤。数据框中的数据由伦敦地铁两条线路之间的旅行信息组成,我想去掉最上面的百分比。下面是输入数据的str()函数的输出: 'data.frame': 71748 obs. of 9 variables: $ depart : Factor w/ 52 levels "Bank","Barkingside",..: 22 22 22 22 22 25 25 25 25 25 ... $ arrival : Fact
'data.frame': 71748 obs. of 9 variables:
$ depart : Factor w/ 52 levels "Bank","Barkingside",..: 22 22 22 22 22 25 25 25 25 25 ...
$ arrival : Factor w/ 48 levels "Bank","Barkingside",..: 48 43 38 5 8 1 42 48 41 43 ...
$ traveltime : num 433 1102 161 584 891 ...
$ departuretime: POSIXlt, format: "2014-03-24 18:17:20" "2014-03-24 18:17:20" "2014-03-24 18:17:20" ...
$ arrivaltime : POSIXlt, format: "2014-03-24 18:24:33" "2014-03-24 18:35:42" "2014-03-24 18:20:01" ...
$ lcid : Factor w/ 28 levels "1000001","1000002",..: 1 1 1 1 1 1 1 1 1 1 ...
$ tripno : Factor w/ 25 levels "1","10","11",..: 2 2 2 2 2 2 2 2 2 2 ...
$ destination : Factor w/ 18 levels "Debden","Ealing Broadway",..: 3 3 3 3 3 3 3 3 3 3 ...
$ line : Factor w/ 1 level "C": 1 1 1 1 1 1 1 1 1 1 ...
structure(list(depart = structure(c(22L, 22L, 22L, 22L, 22L,
25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L,
25L, 25L, 25L, 25L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L), .Label = c("Bank",
"Barkingside", "Bethnal Green", "Bond Street", "Buckhurst Hill",
"Chancery Lane", "Chigwell", "Debden", "Ealing Broadway", "East Acton",
"Epping", "Fairlop", "Gants Hill", "Grange Hill", "Greenford",
"Hainault", "Hanger Lane", "Holborn", "Holland Park", "Lancaster Gate",
"Leyton", "Leytonstone", "Liverpool Street", "Loughton", "Marble Arch",
"Mile End", "Newbury Park", "Newbury Park Loop", "North Acton",
"North Acton Junction", "Northolt", "Notting Hill Gate", "Oxford Circus",
"Perivale", "Queensway", "Redbridge", "Roding Valley", "Ruislip Gardens",
"Shepherd's Bush", "Shepherds Bush (Central Line)", "Snaresbrook",
"South Ruislip", "South Woodford", "St. Paul's", "Stratford",
"Theydon Bois", "Tottenham Court Road", "Wanstead", "West Acton",
"West Ruislip", "White City", "Woodford"), class = "factor"),
arrival = structure(c(48L, 43L, 38L, 5L, 8L, 1L, 42L, 48L,
41L, 43L, 6L, 38L, 5L, 4L, 16L, 30L, 44L, 20L, 8L, 3L, 24L,
19L, 1L, 42L, 48L, 41L, 43L, 6L, 38L, 5L), .Label = c("Bank",
"Barkingside", "Bethnal Green", "Bond Street", "Buckhurst Hill",
"Chancery Lane", "Chigwell", "Debden", "East Acton", "Fairlop",
"Gants Hill", "Grange Hill", "Greenford", "Hainault", "Hanger Lane",
"Holborn", "Holland Park", "Lancaster Gate", "Leyton", "Leytonstone",
"Liverpool Street", "Loughton", "Marble Arch", "Mile End",
"Newbury Park", "North Acton", "North Acton Junction", "Northolt",
"Notting Hill Gate", "Oxford Circus", "Perivale", "Queensway",
"Redbridge", "Roding Valley", "Ruislip Gardens", "Shepherd's Bush",
"Shepherds Bush (Central Line)", "Snaresbrook", "South Ruislip",
"South Woodford", "St. Paul's", "Stratford", "Theydon Bois",
"Tottenham Court Road", "Wanstead", "West Acton", "White City",
"Woodford"), class = "factor"), traveltime = c(433, 1102,
161, 584, 891, 829, 1473, 2273, 629, 2942, 467, 2001, 2424,
75, 351, 165, 249, 1840, 2731, 1148, 1289, 1653, 580, 1224,
2024, 380, 2693, 218, 1752, 2175), departuretime = structure(list(
sec = c(20, 20, 20, 20, 20, 40, 40, 40, 40, 40, 40, 40,
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 49, 49, 49, 49,
49, 49, 49, 49), min = c(17L, 17L, 17L, 17L, 17L, 46L,
46L, 46L, 46L, 46L, 46L, 46L, 46L, 46L, 46L, 46L, 46L,
46L, 46L, 46L, 46L, 46L, 50L, 50L, 50L, 50L, 50L, 50L,
50L, 50L), hour = c(18L, 18L, 18L, 18L, 18L, 17L, 17L,
17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L,
17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L,
17L), mday = c(24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L,
24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L,
24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L
), mon = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L), year = c(114L, 114L, 114L, 114L,
114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L,
114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L,
114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L), wday = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), yday = c(82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L,
82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L,
82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L
), isdst = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L)), .Names = c("sec", "min", "hour",
"mday", "mon", "year", "wday", "yday", "isdst"), class = c("POSIXlt",
"POSIXt"), tzone = "GMT"), arrivaltime = structure(list(sec = c(33,
42, 1, 4, 11, 29, 13, 33, 9, 42, 27, 1, 4, 55, 31, 25, 49,
20, 11, 48, 9, 13, 29, 13, 33, 9, 42, 27, 1, 4), min = c(24L,
35L, 20L, 27L, 32L, 0L, 11L, 24L, 57L, 35L, 54L, 20L, 27L,
47L, 52L, 49L, 50L, 17L, 32L, 5L, 8L, 14L, 0L, 11L, 24L,
57L, 35L, 54L, 20L, 27L), hour = c(18L, 18L, 18L, 18L, 18L,
18L, 18L, 18L, 17L, 18L, 17L, 18L, 18L, 17L, 17L, 17L, 17L,
18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 17L, 18L, 17L, 18L,
18L), mday = c(24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L,
24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L,
24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L), mon = c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L),
year = c(114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L,
114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L,
114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L,
114L, 114L, 114L, 114L), wday = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), yday = c(82L,
82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L,
82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L,
82L, 82L, 82L, 82L, 82L, 82L, 82L), isdst = c(0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L
)), .Names = c("sec", "min", "hour", "mday", "mon", "year",
"wday", "yday", "isdst"), class = c("POSIXlt", "POSIXt"), tzone = "GMT"),
lcid = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = c("1000001", "1000002", "1000003",
"1000004", "1000005", "1000006", "1000007", "1000008", "1000009",
"1000010", "1000045", "1000054", "1000070", "1000088", "1000089",
"1000090", "1000097", "1000098", "1000099", "1000100", "1000101",
"1000102", "1000103", "1000104", "1000105", "1000106", "1000107",
"1000109"), class = "factor"), tripno = structure(c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("1",
"10", "11", "12", "13", "14", "15", "16", "17", "18", "19",
"2", "20", "21", "22", "23", "24", "3", "4", "5", "6", "7",
"8", "81", "9"), class = "factor"), destination = structure(c(3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("Debden",
"Ealing Broadway", "Epping", "Grange Hill via Woodford",
"Hainault", "Hainault via Newbury Park", "Hainault via Woodford",
"Leytonstone", "Loughton", "Marble Arch", "Newbury Park",
"North Acton", "Northolt", "Ruislip Gardens", "West Ruislip",
"White City", "Woodford", "Woodford Via Hainault"), class = "factor"),
line = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "C", class = "factor")), .Names = c("depart",
"arrival", "traveltime", "departuretime", "arrivaltime", "lcid",
"tripno", "destination", "line"), row.names = c(NA, 30L), class = "data.frame")
以下是我编写的函数:
#cut off top percent of travel times for each combination of arrival and
#departure stations to remove outliers
cutOffTopPercent <- function(data, percentage=0.99){
res <- data.frame()
#loop through all combinations of depart and arrival stations
for(i in 1:length(levels(data$depart))){
for(j in 1:length(levels(data$arrival))){
#create variables for departure/arrival station to make code easier to read
departureStation <- levels(data$depart)[i]
arrivalStation <- levels(data$arrival)[j]
#create a subset containing only the current departure and arrival station
dataSubset <- data[data$depart == departureStation & data$arrival == arrivalStation,]
#get top value that's allowed
upperBorder <- getTopPercentileBottom(dataSubset, percentage)
#remove records with values higher than than allowed
dataSubset <- dataSubset[dataSubset$traveltime < upperBorder,]
#glue the subset to the end result
res <- rbind(res,dataSubset)
}
}
return(res)
}
#returns the traveltime that marks where the given percentage of traveltimes starts
getTopPercentileBottom <- function(data, percentile){
upperBorder <- quantile(data$traveltime, probs = percentile)
return(upperBorder)
}
这里是代码的矢量化版本。基本上,我使用
Map
来避免双循环和手动填充结果(使用rbind,非常简单)
cutofftopercent“手动”是什么意思?您是否尝试过使用debug
或browser
调试函数?我无法重现您的错误df Hi,我在输入数据中添加了前30条记录的dput。我以前从未真正使用过调试器,也没有任何必要,但我会尝试一下。谢谢你的快速回复!在您的数据中,每个到达和离开地点组合只有一条记录。1条记录的最高百分比恰好是这个记录,因此函数应该返回一个空的data.frame
。也许你需要把上界,我想扔掉每个站组合的最高离群值,因为它们非常高。就我所知,在我的完整数据中,我确实有多条记录用于每一个这样的组合。我试着做一个这样的子集,但对于这个,它似乎确实起了作用。我想问题在于我的数据,即使没有NA。我会朝那个方向看得更远。谢谢你的帮助!嘿,谢谢你的建议。您的代码似乎返回一个空数据帧列表,但是。。。我不知道哪里出了问题,因为我不完全确定你在这里做什么:)@HerrSubset我编辑我的代码。实际上,它返回空行。但这是你的代码。我把条件从<放宽到>
cutOffTopPercent <-
function(data,percent=0.99){
cut_off_dep_arr <-
function(dep,arr){
dataSubset <- data[data$depart == dep & data$arrival == arr,]
upperBorder <- getTopPercentileBottom(dataSubset, percent)
dataSubset[dataSubset$traveltime <= upperBorder,] ## <= not <
}
Map(cut_off_dep_arr,df$depart,df$arrival)
}
cutOffTopPercent(data=df)