R 如何根据现有列值的if条件为新列赋值？_R_Apache Spark_Sparkr

R 如何根据现有列值的if条件为新列赋值？

r apache-spark

R 如何根据现有列值的if条件为新列赋值？,r,apache-spark,sparkr,R,Apache Spark,Sparkr,我需要达到以下条件 if column Avg_sales_greaterthan_7 == 'YES' { column Avg_sales_after_outliner_rejection == column Avg_cache_out } else if column Avg_sales_greaterthan_7 == 'NO' { column Avg_sales_after_outliner_rejection == column Avg_sales_for_3mon } 使

我需要达到以下条件

if column Avg_sales_greaterthan_7 == 'YES'
{
 column Avg_sales_after_outliner_rejection == column Avg_cache_out 
}
else if column Avg_sales_greaterthan_7 == 'NO'
{
 column Avg_sales_after_outliner_rejection == column Avg_sales_for_3mon
}

使用的数据集：df_sales3

|Location_code|        Avg_cache | Avg_sales_for_3mon  | Avg_sales_greaterthan_7|Avg_cache_out|Avg_sales_after_outliner_rejection|
+-------------+------------------+---------------------+------------------------+-------------+----------------------------------+
|         1003|             752.0|                 8.17|                     YES|         5.15|                              5.15|
|         1010|            1906.0|                13.33|                     NO |        20.72|                             13.33|
|         1014|            7965.0|                86.58|                     YES|        80.32|                             80.32|
|         1031|3199.6400000000003|                34.78|                     YES|        30.88|                             30.88|
|         1040|1690.5069999999998|                18.38|                     YES|        14.21|                             14.21|
|         1047|            1000.0|                10.87|                     NO |         8.73|                             10.87|
|         1061|            1133.0|                12.32|                     NO |         8.61|                             12.32|

我使用此sparkR代码来实现此条件：

# Split on the YES/NO flag, fill the target column per branch, then recombine.
# The data set and column names follow the table shown for df_sales3
# (`Avg_sales_greaterthan_7`); the earlier spelling `Avg_sales_greater_than_7`
# on `df_sales_3` does not match any column listed there.
# NOTE: rows whose flag is neither "YES" nor "NO" fall into neither subset and
# therefore do not appear in the recombined result.
df_1 <- filter(df_sales3, df_sales3$Avg_sales_greaterthan_7 == "YES")
df_1$Avg_sales_after_outliner_rejection <- df_1$Avg_cache_out
df_2 <- filter(df_sales3, df_sales3$Avg_sales_greaterthan_7 == "NO")
df_2$Avg_sales_after_outliner_rejection <- df_2$Avg_sales_for_3mon
df_sales3 <- unionAll(df_1, df_2)

您可以使用原始SQL和 `CASE WHEN` 表达式：

# Build a small local data.frame and promote it to a Spark DataFrame.
local_df <- data.frame(
  foo = c(TRUE, FALSE, TRUE),
  x = c(1, 0, 3),
  y = c(-1, -3, -5)
)
df <- createDataFrame(sqlContext, local_df)

# Make the DataFrame visible to Spark SQL under the table name "df".
registerTempTable(df, "df")

# CASE WHEN yields x for rows where foo is true and y otherwise.
query <- "SELECT *, CASE WHEN foo THEN x ELSE y END as bar FROM df"
head(sql(sqlContext, query))

##     foo x  y bar
## 1  TRUE 1 -1   1
## 2 FALSE 0 -3  -3
## 3  TRUE 3 -5   3

应该也能正常工作，但它在 1.5 中似乎已被破坏。

您可以使用原始SQL和 `CASE WHEN` 表达式：

# Build a small local data.frame and promote it to a Spark DataFrame.
local_df <- data.frame(
  foo = c(TRUE, FALSE, TRUE),
  x = c(1, 0, 3),
  y = c(-1, -3, -5)
)
df <- createDataFrame(sqlContext, local_df)

# Make the DataFrame visible to Spark SQL under the table name "df".
registerTempTable(df, "df")

# CASE WHEN yields x for rows where foo is true and y otherwise.
query <- "SELECT *, CASE WHEN foo THEN x ELSE y END as bar FROM df"
head(sql(sqlContext, query))

##     foo x  y bar
## 1  TRUE 1 -1   1
## 2 FALSE 0 -3  -3
## 3  TRUE 3 -5   3

应该也能用，但它在 1.5 中似乎已被破坏。

我们可以试试这个：

# Base-R approach for a local data.frame: choose the source column per row.
# Column names follow the df_sales3 table (with the `Avg_sales_` / `Avg_`
# prefixes); the shortened names used before match no column shown there.
# A single vectorised ifelse() replaces the two masked assignments, which
# fail with a row-count error when the target column does not exist yet.
temp <- df_sales3$Avg_sales_greaterthan_7 == "YES"
df_sales3$Avg_sales_after_outliner_rejection <- ifelse(
  temp,
  df_sales3$Avg_cache_out,
  df_sales3$Avg_sales_for_3mon
)

我们可以试试这个：
# Base-R approach for a local data.frame: choose the source column per row.
# Column names follow the df_sales3 table (with the `Avg_sales_` / `Avg_`
# prefixes); the shortened names used before match no column shown there.
# A single vectorised ifelse() replaces the two masked assignments, which
# fail with a row-count error when the target column does not exist yet.
temp <- df_sales3$Avg_sales_greaterthan_7 == "YES"
df_sales3$Avg_sales_after_outliner_rejection <- ifelse(
  temp,
  df_sales3$Avg_cache_out,
  df_sales3$Avg_sales_for_3mon
)

使用 sqldf 可以做到这一点
library(sqldf)
# SQL CASE expression: take col5 where col4 is 'YES', otherwise col3.
sqldf("select * , case when col4 == 'YES' then col5 else col3 end new from data")

library(data.table)
# Vectorised over the whole column instead of a scalar `if` per row; this
# needs no `by = 1:nrow(data)` grouping, which is slow and breaks on a
# zero-row table (1:0 is c(1, 0)).
setDT(data)[, new := ifelse(col4 == "YES", col5, col3)]

#> data
#   col1     col2  col3 col4  col5  col6   new
#1: 1003  752.000  8.17  YES  5.15  5.15  5.15
#2: 1010 1906.000 13.33   NO 20.72 13.33 13.33
#3: 1014 7965.000 86.58  YES 80.32 80.32 80.32
#4: 1031 3199.640 34.78  YES 30.88 30.88 30.88
#5: 1040 1690.507 18.38  YES 14.21 14.21 14.21
#6: 1047 1000.000 10.87   NO  8.73 10.87 10.87
#7: 1061 1133.000 12.32   NO  8.61 12.32 12.32

使用apply

# Vectorised choice between col5 and col3. Working on the columns directly
# avoids apply() coercing the data frame to a character matrix, which sends
# the numbers through format() and can drop digits before as.numeric().
data$new <- ifelse(data$col4 == "YES", data$col5, data$col3)

#> data
#  col1     col2  col3 col4  col5  col6   new
#1 1003  752.000  8.17  YES  5.15  5.15  5.15
#2 1010 1906.000 13.33   NO 20.72 13.33 13.33
#3 1014 7965.000 86.58  YES 80.32 80.32 80.32
#4 1031 3199.640 34.78  YES 30.88 30.88 30.88
#5 1040 1690.507 18.38  YES 14.21 14.21 14.21
#6 1047 1000.000 10.87   NO  8.73 10.87 10.87
#7 1061 1133.000 12.32   NO  8.61 12.32 12.32

使用data.table
可以做到这一点
library(sqldf)
# SQL CASE expression: take col5 where col4 is 'YES', otherwise col3.
sqldf("select * , case when col4 == 'YES' then col5 else col3 end new from data")

library(data.table)
# Vectorised over the whole column instead of a scalar `if` per row; this
# needs no `by = 1:nrow(data)` grouping, which is slow and breaks on a
# zero-row table (1:0 is c(1, 0)).
setDT(data)[, new := ifelse(col4 == "YES", col5, col3)]

#> data
#   col1     col2  col3 col4  col5  col6   new
#1: 1003  752.000  8.17  YES  5.15  5.15  5.15
#2: 1010 1906.000 13.33   NO 20.72 13.33 13.33
#3: 1014 7965.000 86.58  YES 80.32 80.32 80.32
#4: 1031 3199.640 34.78  YES 30.88 30.88 30.88
#5: 1040 1690.507 18.38  YES 14.21 14.21 14.21
#6: 1047 1000.000 10.87   NO  8.73 10.87 10.87
#7: 1061 1133.000 12.32   NO  8.61 12.32 12.32

样本数据
使用sqldf
可以做到这一点
library(sqldf)
# SQL CASE expression: take col5 where col4 is 'YES', otherwise col3.
sqldf("select * , case when col4 == 'YES' then col5 else col3 end new from data")

library(data.table)
# Vectorised over the whole column instead of a scalar `if` per row; this
# needs no `by = 1:nrow(data)` grouping, which is slow and breaks on a
# zero-row table (1:0 is c(1, 0)).
setDT(data)[, new := ifelse(col4 == "YES", col5, col3)]

#> data
#   col1     col2  col3 col4  col5  col6   new
#1: 1003  752.000  8.17  YES  5.15  5.15  5.15
#2: 1010 1906.000 13.33   NO 20.72 13.33 13.33
#3: 1014 7965.000 86.58  YES 80.32 80.32 80.32
#4: 1031 3199.640 34.78  YES 30.88 30.88 30.88
#5: 1040 1690.507 18.38  YES 14.21 14.21 14.21
#6: 1047 1000.000 10.87   NO  8.73 10.87 10.87
#7: 1061 1133.000 12.32   NO  8.61 12.32 12.32

使用apply

# Vectorised choice between col5 and col3. Working on the columns directly
# avoids apply() coercing the data frame to a character matrix, which sends
# the numbers through format() and can drop digits before as.numeric().
data$new <- ifelse(data$col4 == "YES", data$col5, data$col3)

#> data
#  col1     col2  col3 col4  col5  col6   new
#1 1003  752.000  8.17  YES  5.15  5.15  5.15
#2 1010 1906.000 13.33   NO 20.72 13.33 13.33
#3 1014 7965.000 86.58  YES 80.32 80.32 80.32
#4 1031 3199.640 34.78  YES 30.88 30.88 30.88
#5 1040 1690.507 18.38  YES 14.21 14.21 14.21
#6 1047 1000.000 10.87   NO  8.73 10.87 10.87
#7 1061 1133.000 12.32   NO  8.61 12.32 12.32

使用data.table
可以做到这一点
library(sqldf)
# SQL CASE expression: take col5 where col4 is 'YES', otherwise col3.
sqldf("select * , case when col4 == 'YES' then col5 else col3 end new from data")

library(data.table)
# Vectorised over the whole column instead of a scalar `if` per row; this
# needs no `by = 1:nrow(data)` grouping, which is slow and breaks on a
# zero-row table (1:0 is c(1, 0)).
setDT(data)[, new := ifelse(col4 == "YES", col5, col3)]

#> data
#   col1     col2  col3 col4  col5  col6   new
#1: 1003  752.000  8.17  YES  5.15  5.15  5.15
#2: 1010 1906.000 13.33   NO 20.72 13.33 13.33
#3: 1014 7965.000 86.58  YES 80.32 80.32 80.32
#4: 1031 3199.640 34.78  YES 30.88 30.88 30.88
#5: 1040 1690.507 18.38  YES 14.21 14.21 14.21
#6: 1047 1000.000 10.87   NO  8.73 10.87 10.87
#7: 1061 1133.000 12.32   NO  8.61 12.32 12.32

样本数据