R 对数据帧中的行值重新排序

R 对数据帧中的行值重新排序,r,R,我有以下数据框: df <- data.frame( var1 = c("A", "C", "C", "B", "D"), val1 = c(.89, .99, .67, .88, .92), var2 = c("B", "A", "D", "A", "B"), val2 = c(.87, .95, .55, .84, .88), var3 = c("C", "B", "B", "C", "A"), val3 = c(.66, .55, .45, .81, .77)

我有以下数据框:

df <- data.frame(
  var1 = c("A", "C", "C", "B", "D"),
  val1 = c(.89, .99, .67, .88, .92),
  var2 = c("B", "A", "D", "A", "B"),
  val2 = c(.87, .95, .55, .84, .88),
  var3 = c("C", "B", "B", "C", "A"),
  val3 = c(.66, .55, .45, .81, .77),
  var4 = c("D", "D", "A", "D", "C"),
  val4 = c(.44, .33, .43, .77, .69),
  stringsAsFactors = FALSE
)

df
#  var1 val1 var2 val2 var3 val3 var4 val4
#1    A 0.89    B 0.87    C 0.66    D 0.44
#2    C 0.99    A 0.95    B 0.55    D 0.33
#3    C 0.67    D 0.55    B 0.45    A 0.43
#4    B 0.88    A 0.84    C 0.81    D 0.77
#5    D 0.92    B 0.88    A 0.77    C 0.69
原始数据集中的行
vals
将始终按降序排列(即
val1
val2
val3
val4
),每个字母将恰好在该行中一次

我能够通过一个相当笨拙的
for
循环获得预期的输出:

df_new <- df

for (i in 1:nrow(df)){
  if (df$var1[i] %in% c("C", "D")){
    if (df$var2[i] == "A"){
      df_new$var1[i] <- df$var2[i]
      df_new$var2[i] <- df$var1[i]
      df_new$val1[i] <- df$val2[i]
      df_new$val2[i] <- df$val1[i]
    } else if (df$var3[i] == "A"){
      df_new$var1[i] <- df$var3[i]
      df_new$var2[i] <- df$var1[i]
      df_new$var3[i] <- df$var2[i]
      df_new$val1[i] <- df$val3[i]
      df_new$val2[i] <- df$val1[i]
      df_new$val3[i] <- df$val2[i]
    } else {
      df_new$var1[i] <- df$var4[i]
      df_new$var2[i] <- df$var1[i]
      df_new$var3[i] <- df$var2[i]
      df_new$var4[i] <- df$var3[i]
      df_new$val1[i] <- df$val4[i]
      df_new$val2[i] <- df$val1[i]
      df_new$val3[i] <- df$val2[i]
      df_new$val4[i] <- df$val3[i]
    }
  }
}

df_new不确定这是否算是更优雅,但是:

for(i in 1:nrow(df)){
  if(df$var1[i] == "C"){
    # Holds val1 if var1 is "C"
    oldval <- df$val1[i]
    # Which column has the new value in it?
    col <- which(df[i, ] == "A") + 1
    # Replace the values
    df[i, "var1"] <- "A"
    df[i, "val1"] <- df[i, col]
    df[i, (col - 1)] <- "C"
    df[i, col] <- oldval
  }
  # To maintain original ordering
  if(df$val3[i] > df$val2[i]){
    # Hold the vars and values
    vars <- df[i, paste0("var", 2:3)]
    vals <- df[i, paste0("val", 2:3)]
    # Replace the values
    df[i, paste0("var", 2:3)] <- rev(vars)
    df[i, paste0("val", 2:3)] <- rev(vals)
  }
}
for(1中的i:nrow(df)){
如果(df$var1[i]=“C”){
#如果var1为“C”,则保持val1

oldval这是一个矢量化的解决方案。为判断变量名道歉。我马上要添加注释

> a_vars <- apply(df, 1, function(vec) which(vec == 'A')) # these are the columns in each row which contain an 'A'
> a_cols <- a_vars + 1 # these are the corresponding value columns
> 
> bad_rows <- (1:nrow(df))[df$var1 %in% c('C', 'D')] # these are the rows which have a C or D in the first column
> shifts <- sequence(a_vars[bad_rows] - 1) # we'll need to shift certain values in each bad row; shift stores their columns
> bad_vals <- df$val1[a_cols] # these are the values in the column containing an A 
> 
> shift_vals <- df[cbind(rep(bad_rows, (a_vars[bad_rows] - 1)), shifts)] # these are the values which need to be shifted over
> 
> df$val1[bad_rows] <- df[cbind(bad_rows, a_cols[df$var1 %in% c('C', 'D')])] # shift the values from the A columns into the first column in the bad rows
> df$var1[bad_rows] <- 'A' # and make those variables 'A's
> df[cbind(rep(bad_rows, (a_vars[bad_rows] - 1)), shifts + 2)] <- shift_vals # now put the shifting values into their correct columns
> df
  var1 val1 var2 val2 var3 val3 var4 val4
1    A 0.89    B 0.87    C 0.66    D 0.44
2    A 0.95    C 0.99    B 0.55    D 0.33
3    A 0.43    C 0.67    D 0.55    B 0.45
4    B 0.88    A 0.84    C 0.81    D 0.77
5    A 0.77    D 0.92    B 0.88    C 0.69
>a_vars a_cols
>糟糕的轮班
现在,如果您愿意(但我建议您使用此结构),我们可以将数据转换回宽格式:

x <- dcast(x, group ~ newOrder, value.var = c('var', 'val'))
# do some reformatting, if you want:
setnames(x, gsub('_', '', colnames(x)))
x[, group := NULL] # deletes group column
setcolorder(x, colnames(df))
x
#    var1 val1 var2 val2 var3 val3 var4 val4
# 1:    A 0.89    B 0.87    C 0.66    D 0.44
# 2:    A 0.95    C 0.99    B 0.55    D 0.33
# 3:    A 0.43    C 0.67    D 0.55    B 0.45
# 4:    B 0.88    A 0.84    C 0.81    D 0.77
# 5:    A 0.77    D 0.92    B 0.88    C 0.69

# test, if matches your results (after conversion to data.frame)
all.equal(df_new, as.data.frame(x))
# [1] TRUE

这在data.table中相当简单-类似于minem的方法,使用更难加工的熔体和dcast公式,以及修订的混洗方法:

# data table package and setup
library(data.table)
setDT(df)

# set row index to work within rows
df[, rowid := 1:.N]

# melt data.table to allow for easier indexing
df <- melt(df, id.vars = "rowid", measure.vars = list(grep("var",colnames(df)),grep("val",colnames(df))), value.name = c("var","val"), variable.factor = FALSE)
df[, variable := as.integer(variable)]

# set index of A variables with C/D in starting position to 0 
df[rowid %in% df[(var == "C" | var == "D") & variable == 1, rowid] & var == "A", variable := 0]

#shuffle
df <- df[order(rowid, variable)]

# reset index to 1..N instead of 0..N
df[, variable := 1:.N, by = .(rowid)]

# back to table format
df <- dcast(df, rowid ~ variable, value.var = list("var", "val"), sep = "")

# reorder
setcolorder(df,order(as.numeric(gsub("\\D+","",colnames(df)))))

df
   var1 val1 var2 val2 var3 val3 var4 val4 rowid
1:    A 0.89    B 0.87    C 0.66    D 0.44     1
2:    A 0.95    C 0.99    B 0.55    D 0.33     2
3:    A 0.43    C 0.67    D 0.55    B 0.45     3
4:    B 0.88    A 0.84    C 0.81    D 0.77     4
5:    A 0.77    D 0.92    B 0.88    C 0.69     5
#数据表包和设置
库(数据表)
setDT(df)
#将行索引设置为在行内工作
df[,rowid:=1:.N]
#melt data.table允许更简单的索引

df感谢您的回复,但我不认为这真的会比我目前拥有的性能更好——理想情况下,我正在寻找某种矢量化的解决方案。
# get max index (if you have more than 4 vars)
# other approaches could be used here
i <- max(as.integer(substr(grep('var', colnames(dt), value = T), 4, 4)))
# split the data by variables
x <- lapply(1:i, function(x) {
  k <- dt[, grep(x, colnames(dt)), with = F]
  setnames(k, c('var', 'val'))
  k[, group := .I]
  })
x <- rbindlist(x)
x
#     var  val group
#  1:   A 0.89     1
#  2:   C 0.99     2
#  3:   C 0.67     3
# ---
# 18:   A 0.43     3
# 19:   D 0.77     4
# 20:   C 0.69     5
# we can now calculate order index, representing your column order
setorder(x, group, -val)
x[, orderI := 1:.N, by = group]
x

# now your logic:
# add index for groups that have C D as first:
x[, CDisFirst := any(orderI == 1 & var %in% c('C', 'D')), by = group]
# add index that A need to be first
x[, aFirst := CDisFirst & var == 'A']
# order now by groups, aFirst and val
setorder(x, group, -aFirst, -val)
x[, newOrder := 1:.N, by = group] # adds newOrder
x
#     var  val group orderI CDisFirst aFirst newOrder
#  1:   A 0.89     1      1     FALSE  FALSE        1
#  2:   B 0.87     1      2     FALSE  FALSE        2
#  3:   C 0.66     1      3     FALSE  FALSE        3
# ---
# 18:   D 0.92     5      1      TRUE  FALSE        2
# 19:   B 0.88     5      2      TRUE  FALSE        3
# 20:   C 0.69     5      4      TRUE  FALSE        4
x <- dcast(x, group ~ newOrder, value.var = c('var', 'val'))
# do some reformatting, if you want:
setnames(x, gsub('_', '', colnames(x)))
x[, group := NULL] # deletes group column
setcolorder(x, colnames(df))
x
#    var1 val1 var2 val2 var3 val3 var4 val4
# 1:    A 0.89    B 0.87    C 0.66    D 0.44
# 2:    A 0.95    C 0.99    B 0.55    D 0.33
# 3:    A 0.43    C 0.67    D 0.55    B 0.45
# 4:    B 0.88    A 0.84    C 0.81    D 0.77
# 5:    A 0.77    D 0.92    B 0.88    C 0.69

# test, if matches your results (after conversion to data.frame)
all.equal(df_new, as.data.frame(x))
# [1] TRUE
system.time(original())
# user  system elapsed 
# 28.23   22.25   51.22
system.time(minem())
# user  system elapsed 
# 0.29    0.00    0.30 
system.time(Joseph())
# user  system elapsed 
# 1.75    0.03    1.83 
# data table package and setup
library(data.table)
setDT(df)

# set row index to work within rows
df[, rowid := 1:.N]

# melt data.table to allow for easier indexing
df <- melt(df, id.vars = "rowid", measure.vars = list(grep("var",colnames(df)),grep("val",colnames(df))), value.name = c("var","val"), variable.factor = FALSE)
df[, variable := as.integer(variable)]

# set index of A variables with C/D in starting position to 0 
df[rowid %in% df[(var == "C" | var == "D") & variable == 1, rowid] & var == "A", variable := 0]

#shuffle
df <- df[order(rowid, variable)]

# reset index to 1..N instead of 0..N
df[, variable := 1:.N, by = .(rowid)]

# back to table format
df <- dcast(df, rowid ~ variable, value.var = list("var", "val"), sep = "")

# reorder
setcolorder(df,order(as.numeric(gsub("\\D+","",colnames(df)))))

df
   var1 val1 var2 val2 var3 val3 var4 val4 rowid
1:    A 0.89    B 0.87    C 0.66    D 0.44     1
2:    A 0.95    C 0.99    B 0.55    D 0.33     2
3:    A 0.43    C 0.67    D 0.55    B 0.45     3
4:    B 0.88    A 0.84    C 0.81    D 0.77     4
5:    A 0.77    D 0.92    B 0.88    C 0.69     5