按日期r排序列的子集

按日期r排序列的子集,r,columnsorting,R,Columnsorting,我有一个数据框,其中部分列的日期顺序不正确。见: data1989 <- data.frame("date_fire" = c("1987-02-01", "1987-07-03", "1988-01-01"), "Foresttype" = c("oak", "pine", "oak"), "meanSolarRad" = c(500, 550, 450),

我有一个数据框,其中部分列的日期顺序不正确。见:

data1989 <- data.frame("date_fire" = c("1987-02-01", "1987-07-03", "1988-01-01"), 
                       "Foresttype" = c("oak", "pine", "oak"),
                       "meanSolarRad" = c(500, 550, 450),
                       "meanRainfall" = c(600, 300, 450),
                       "meanTemp" = c(14, 15, 12),
                       "1988.01.01" = c(0.5, 0.589, 0.66), 
                       "1986.06.03" = c(0.56, 0.447, 0.75), 
                       "1986.10.19" = c(0.8, NA, 0.83),
                       "1988.01.19" = c(0.75, 0.65,0.75), 
                       "1986.06.19" = c(0.1, 0.55,0.811),
                       "1987.10.19" = c(0.15, 0.12, 0.780),
                       "1988.01.19" = c(0.2, 0.22,0.32), 
                       "1986.06.19" = c(0.18, 0.21,0.23),
                       "1987.10.19" = c(0.21, 0.24, 0.250),
                       check.names = FALSE,
                       stringsAsFactors = FALSE) 

> data1989
   date_fire Foresttype meanSolarRad meanRainfall meanTemp 1988.01.01 1986.06.03 1986.10.19 1988.01.19 1986.06.19 1987.10.19 1988.01.19 1986.06.19 1987.10.19
1 1987-02-01        oak          500          600       14      0.500      0.560       0.80       0.75      0.100       0.15       0.20       0.18       0.21
2 1987-07-03       pine          550          300       15      0.589      0.447         NA       0.65      0.550       0.12       0.22       0.21       0.24
3 1988-01-01        oak          450          450       12      0.660      0.750       0.83       0.75      0.811       0.78       0.32       0.23       0.25

我想通过增加日期来排序列,并保持前5列不变。请记住,在我的原始数据集中,我有30个初始列要保持不变。

我们可以将日期列名称转换为日期类,按顺序操作,然后将其用作列索引

i1 <- grep('^\\d{4}\\.\\d{2}\\.\\d{2}$', names(data1989))
data1989[c(seq_len(i1[1]-1), order(as.Date(names(data1989)[i1], "%Y.%m.%d")) + i1[1]-1)]
# date_fire Foresttype meanSolarRad meanRainfall meanTemp 1986.06.03 1986.06.19 1986.06.19.1 1986.10.19 1987.10.19
#1 1987-02-01        oak          500          600       14      0.560      0.100         0.18       0.80       0.15
#2 1987-07-03       pine          550          300       15      0.447      0.550         0.21         NA       0.12
#3 1988-01-01        oak          450          450       12      0.750      0.811         0.23       0.83       0.78
#  1987.10.19.1 1988.01.01 1988.01.19 1988.01.19.1
#1         0.21      0.500       0.75         0.20
#2         0.24      0.589       0.65         0.22
#3         0.25      0.660       0.75         0.32

我们可以将dates列名转换为Date类,按顺序操作,然后将其用作列索引

i1 <- grep('^\\d{4}\\.\\d{2}\\.\\d{2}$', names(data1989))
data1989[c(seq_len(i1[1]-1), order(as.Date(names(data1989)[i1], "%Y.%m.%d")) + i1[1]-1)]
# date_fire Foresttype meanSolarRad meanRainfall meanTemp 1986.06.03 1986.06.19 1986.06.19.1 1986.10.19 1987.10.19
#1 1987-02-01        oak          500          600       14      0.560      0.100         0.18       0.80       0.15
#2 1987-07-03       pine          550          300       15      0.447      0.550         0.21         NA       0.12
#3 1988-01-01        oak          450          450       12      0.750      0.811         0.23       0.83       0.78
#  1987.10.19.1 1988.01.01 1988.01.19 1988.01.19.1
#1         0.21      0.500       0.75         0.20
#2         0.24      0.589       0.65         0.22
#3         0.25      0.660       0.75         0.32

如果您想使用dplyr,这里有一个替代方案。注意每个colname必须是唯一的。在你身上有一些重复的

library(dplyr)

data1989 <- data.frame("date_fire" = c("1987-02-01", "1987-07-03", "1988-01-01"), 
                       "Foresttype" = c("oak", "pine", "oak"),
                       "meanSolarRad" = c(500, 550, 450),
                       "meanRainfall" = c(600, 300, 450),
                       "meanTemp" = c(14, 15, 12),
                       "1988.01.01" = c(0.5, 0.589, 0.66), 
                       "1986.06.03" = c(0.56, 0.447, 0.75), 
                       "1986.10.19" = c(0.8, NA, 0.83),
                       "1988.01.19" = c(0.75, 0.65,0.75), 
                       "1986.06.19" = c(0.1, 0.55,0.811),
                       "1987.10.19" = c(0.15, 0.12, 0.780),
                       # "1988.01.19" = c(0.2, 0.22,0.32),
                       # "1986.06.19" = c(0.18, 0.21,0.23),
                       # "1987.10.19" = c(0.21, 0.24, 0.250),
                       check.names = FALSE,
                       stringsAsFactors = FALSE) 

# Sort date column names. replace 6 with first date column 
sorted_colnames = sort(names(data1989)[6:ncol(data1989)])

# Sort columns. Replace 5 with last non-date column
data1989 %>% 
  select(1:5, sorted_colnames)

如果您想使用dplyr,这里有一个替代方案。注意每个colname必须是唯一的。在你身上有一些重复的

library(dplyr)

data1989 <- data.frame("date_fire" = c("1987-02-01", "1987-07-03", "1988-01-01"), 
                       "Foresttype" = c("oak", "pine", "oak"),
                       "meanSolarRad" = c(500, 550, 450),
                       "meanRainfall" = c(600, 300, 450),
                       "meanTemp" = c(14, 15, 12),
                       "1988.01.01" = c(0.5, 0.589, 0.66), 
                       "1986.06.03" = c(0.56, 0.447, 0.75), 
                       "1986.10.19" = c(0.8, NA, 0.83),
                       "1988.01.19" = c(0.75, 0.65,0.75), 
                       "1986.06.19" = c(0.1, 0.55,0.811),
                       "1987.10.19" = c(0.15, 0.12, 0.780),
                       # "1988.01.19" = c(0.2, 0.22,0.32),
                       # "1986.06.19" = c(0.18, 0.21,0.23),
                       # "1987.10.19" = c(0.21, 0.24, 0.250),
                       check.names = FALSE,
                       stringsAsFactors = FALSE) 

# Sort date column names. replace 6 with first date column 
sorted_colnames = sort(names(data1989)[6:ncol(data1989)])

# Sort columns. Replace 5 with last non-date column
data1989 %>% 
  select(1:5, sorted_colnames)

如前所述,尽量避免使用包含数据元素(如日期、类别值和其他指标)的列的宽格式数据。取而代之的是使用长格式的数据,其中排序更容易,包括聚合、合并、打印和建模

具体而言,考虑将数据转换成一个字段,如具有值的四分之一。然后轻松订购四分之一列:

# RESHAPE WIDE TO LONG
long_data1989 <- reshape(data1989, varying = names(data1989)[6:ncol(data1989)],
                         times = names(data1989)[6:ncol(data1989)],
                         v.names = "value", timevar = "quarter", ids = NULL,
                         new.row.names = 1:1E4, direction = "long")

# ORDER DATES AND RESET row.names
long_data1989 <- `row.names<-`(with(long_data1989, long_data1989[order(date_fire, quarter),]),
                               NULL)

long_data1989

如前所述,尽量避免使用包含数据元素(如日期、类别值和其他指标)的列的宽格式数据。取而代之的是使用长格式的数据,其中排序更容易,包括聚合、合并、打印和建模

具体而言,考虑将数据转换成一个字段,如具有值的四分之一。然后轻松订购四分之一列:

# RESHAPE WIDE TO LONG
long_data1989 <- reshape(data1989, varying = names(data1989)[6:ncol(data1989)],
                         times = names(data1989)[6:ncol(data1989)],
                         v.names = "value", timevar = "quarter", ids = NULL,
                         new.row.names = 1:1E4, direction = "long")

# ORDER DATES AND RESET row.names
long_data1989 <- `row.names<-`(with(long_data1989, long_data1989[order(date_fire, quarter),]),
                               NULL)

long_data1989

基本R解决方案类似于@Parfaits:

# Reshape dataframe wide --> long:

df_long <- 
  reshape(data1989,
          direction = "long",
          varying = which(!(is.na(as.Date(names(data1989), "%Y.%m.%d")))),
          idvar = which(is.na(as.Date(names(data1989), "%Y.%m.%d"))),
          v.names = "value",
          times = na.omit(as.Date(names(data1989), "%Y.%m.%d")),
          timevar = "date_surveyed",
          new.row.names = 1:(nrow(data1989)*length(na.omit(as.Date(names(data1989), 
                                                         "%Y.%m.%d")))))
# Order the data frame and reset the index: 

ordered_df_long <- data.frame(df_long[with(df_long, order(date_fire, date_surveyed)),],
                              row.names = NULL)

基本R解决方案类似于@Parfaits:

# Reshape dataframe wide --> long:

df_long <- 
  reshape(data1989,
          direction = "long",
          varying = which(!(is.na(as.Date(names(data1989), "%Y.%m.%d")))),
          idvar = which(is.na(as.Date(names(data1989), "%Y.%m.%d"))),
          v.names = "value",
          times = na.omit(as.Date(names(data1989), "%Y.%m.%d")),
          timevar = "date_surveyed",
          new.row.names = 1:(nrow(data1989)*length(na.omit(as.Date(names(data1989), 
                                                         "%Y.%m.%d")))))
# Order the data frame and reset the index: 

ordered_df_long <- data.frame(df_long[with(df_long, order(date_fire, date_surveyed)),],
                              row.names = NULL)

考虑日期列是长格式的,在日期列中的所有日期指示器的前列名称与值列相邻,每个DAT/值对重复前5列。不整洁的数据将迫使复杂的编码缺乏可维护性和可读性。我应该明确检查这一点。值得考虑的是,日期列采用长格式,所有日期指示符前一列的名称都位于与值列相邻的单列中,每个dat/值对的前5列重复。不整洁的数据将迫使复杂的编码缺乏可维护性和可读性。我应该明确检查这一点。谢谢冻糕谢谢阿克伦!不过,这会产生一些奇怪的列名;例如,最后一个,添加。1@OriolBaenaCrespo好的,如果默认情况下有重复的列名,那么data.frame检查只允许唯一的列名,并将使用make.unique转换为append.1、.2等,当你有duplicates@OriolBaenaCrespo我会考虑保留它独特的列名而不是duplicatesThanks Akrun!不过,这会产生一些奇怪的列名;例如,最后一个,添加。1@OriolBaenaCrespo好的,如果默认情况下有重复的列名,那么data.frame检查只允许唯一的列名,并且当您有了duplicates@OriolBaenaCrespo我会考虑保留它唯一的列名而不是重复。