Question

当第一列由数字和字母字符混合组成时，为什么 dplyr 不能对它正确排序？

> library(dplyr)
> y <- read.table("file.csv", sep = ",")
> arrange(y, V1)
   V1         V2         V3         V4         V5         V6
1   1 0.97348999 0.11047091 0.95841014 0.61826620 0.43164420
2  10 0.82178167 0.21619067 0.11993356 0.06335101 0.28703842
3  11 0.35952632 0.27595845 0.24760335 0.63887200 0.47491472
4  12 0.43775624 0.08852486 0.06870304 0.63670202 0.55432641
5  13 0.83894086 0.40484966 0.96735507 0.86764578 0.02588688
6  14 0.95258399 0.65029909 0.97183605 0.87688243 0.97729517
7  15 0.62839615 0.52999000 0.05722874 0.40709867 0.56039580
8   2 0.22754619 0.16812359 0.39432991 0.68562992 0.43066861
9   3 0.33318220 0.21108688 0.60911213 0.64475379 0.98617404
10  4 0.57208511 0.58709229 0.29435093 0.78603855 0.81185551
11  5 0.35548490 0.15229426 0.42423263 0.72963238 0.04401239
12  6 0.08575802 0.33310521 0.09671737 0.90820671 0.33289880
13  7 0.05743798 0.20439928 0.56411860 0.54859270 0.81053637
14  8 0.99056584 0.29960046 0.20765701 0.45722997 0.51354034
15  9 0.35839568 0.11667019 0.56498996 0.43971051 0.23968955
16  A 0.25645249 0.07045102 0.17046681 0.75700118 0.50269449
17  B 0.57722865 0.31544398 0.33129932 0.44173772 0.11600295
18  C 0.94242373 0.55745376 0.01542128 0.01723924 0.11413310

我希望得到的结果是：

   V1         V2         V3         V4         V5         V6
1   1 0.97348999 0.11047091 0.95841014 0.61826620 0.43164420
2   2 0.22754619 0.16812359 0.39432991 0.68562992 0.43066861
3   3 0.33318220 0.21108688 0.60911213 0.64475379 0.98617404
4   4 0.57208511 0.58709229 0.29435093 0.78603855 0.81185551
5   5 0.35548490 0.15229426 0.42423263 0.72963238 0.04401239
6   6 0.08575802 0.33310521 0.09671737 0.90820671 0.33289880
7   7 0.05743798 0.20439928 0.56411860 0.54859270 0.81053637
8   8 0.99056584 0.29960046 0.20765701 0.45722997 0.51354034
9   9 0.35839568 0.11667019 0.56498996 0.43971051 0.23968955
10 10 0.82178167 0.21619067 0.11993356 0.06335101 0.28703842
11 11 0.35952632 0.27595845 0.24760335 0.63887200 0.47491472
12 12 0.43775624 0.08852486 0.06870304 0.63670202 0.55432641
13 13 0.83894086 0.40484966 0.96735507 0.86764578 0.02588688
14 14 0.95258399 0.65029909 0.97183605 0.87688243 0.97729517
15 15 0.62839615 0.52999000 0.05722874 0.40709867 0.56039580
16  A 0.25645249 0.07045102 0.17046681 0.75700118 0.50269449
17  B 0.57722865 0.31544398 0.33129932 0.44173772 0.11600295
18  C 0.94242373 0.55745376 0.01542128 0.01723924 0.11413310

Answer 1

你没有考虑字母部分该如何排序，这一点有些问题，不过可以试试这样：

library(dplyr)
arrange(y, as.numeric(V1))
# Warning in order(as.numeric(y$V1)) : NAs introduced by coercion
#    V1         V2         V3         V4         V5         V6
# 1   1 0.97348999 0.11047091 0.95841014 0.61826620 0.43164420
# 8   2 0.22754619 0.16812359 0.39432991 0.68562992 0.43066861
# 9   3 0.33318220 0.21108688 0.60911213 0.64475379 0.98617404
# 10  4 0.57208511 0.58709229 0.29435093 0.78603855 0.81185551
# 11  5 0.35548490 0.15229426 0.42423263 0.72963238 0.04401239
# 12  6 0.08575802 0.33310521 0.09671737 0.90820671 0.33289880
# 13  7 0.05743798 0.20439928 0.56411860 0.54859270 0.81053637
# 14  8 0.99056584 0.29960046 0.20765701 0.45722997 0.51354034
# 15  9 0.35839568 0.11667019 0.56498996 0.43971051 0.23968955
# 2  10 0.82178167 0.21619067 0.11993356 0.06335101 0.28703842
# 3  11 0.35952632 0.27595845 0.24760335 0.63887200 0.47491472
# 4  12 0.43775624 0.08852486 0.06870304 0.63670202 0.55432641
# 5  13 0.83894086 0.40484966 0.96735507 0.86764578 0.02588688
# 6  14 0.95258399 0.65029909 0.97183605 0.87688243 0.97729517
# 7  15 0.62839615 0.52999000 0.05722874 0.40709867 0.56039580
# 16  A 0.25645249 0.07045102 0.17046681 0.75700118 0.50269449
# 17  B 0.57722865 0.31544398 0.33129932 0.44173772 0.11600295
# 18  C 0.94242373 0.55745376 0.01542128 0.01723924 0.11413310

在 base R 中也可以用同样的方法：

y[ order(as.numeric(y$V1)), ]

编辑：OP 随后又问（尽管之前说过“我真的不在乎”;-)）如何对非数字字段进行排序。

第一个命令之所以有效，是因为非数字字段全部被转换为 NA，而 NA 恰好会排在数字之后。dplyr::arrange 和 base::order 都可以接受任意数量的参数：当第一个参数中出现并列（值相同）的情况时，由第二个参数决定先后顺序，依此类推。因此，要对这些 NA（即非数字的 V1 元素）进行排序，只需再添加一个有意义的排序键，例如……V1 本身：

arrange(y, as.numeric(V1), V1)
y[ order(as.numeric(y$V1), y$V1), ]

Answer 2

我建议在排序之前先把 V1 转换为因子，并使用 stringr 包的 str_sort 函数对因子水平（levels）进行排序：

> library(dplyr)
> library(stringr)

> y <- tibble(V1 = c("B", "A", 2, 1), V2 =c(1,2,3,4), V3=c(1,2,3,4))
> y %>%  
     dplyr::mutate(V1_fac = factor(V1, levels= str_sort(V1, numeric=TRUE))) %>%
     dplyr::arrange(V1_fac)

numeric=TRUE 选项使 V1 中的数字按数值大小排序，而不是按字符串排序。如果 V1 中存在重复的条目，由于因子水平不能重复，需要先用 unique() 去重：

y %>%  
     dplyr::mutate(V1_fac = factor(V1, levels= str_sort(unique(V1), numeric=TRUE))) %>%
     dplyr::arrange(V1_fac)

R dplyr：对包含字母和数字字符的列进行排序

2 个答案: