Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/r/79.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
R中的unicode字符转换_R_Unicode - Fatal编程技术网

R中的unicode字符转换

R中的unicode字符转换,r,unicode,R,Unicode,我有这个MTST列,当打印出来时 [1] "<U+0391>G<U+03A1><U+0399><U+039D><U+0399><U+039F> " [2] "<U+0391>G<U+03A7><U+0399><U+0391><U+039B><U+039F>S

我有这个
MTST
列,当打印出来时

 [1] "<U+0391>G<U+03A1><U+0399><U+039D><U+0399><U+039F>                                 "
 [2] "<U+0391>G<U+03A7><U+0399><U+0391><U+039B><U+039F>S                                "
 [3] "<U+0391><U+0399>G<U+0399><U+039D><U+0391>                                  "
 [4] "<U+0391><U+0399>G<U+0399><U+039F>                                   "
 [5] "<U+0391><U+0399><U+0394><U+0397><U+03A8><U+039F>S                                 "
 [6] "<U+0391><U+039A><U+03A4><U+0399><U+039F>(<U+03A0><U+03A1><U+0395><U+0392><U+0395><U+0396><U+0391>)                          "
 [7] "<U+0391><U+039B><U+0395><U+039E><U+0391><U+039D><U+0394><U+03A1><U+039F><U+03A5><U+03A0><U+039F><U+039B><U+0397>                          "
 [8] "<U+0391><U+039B><U+0399><U+0391><U+03A1><U+03A4><U+039F>S                                "
我也尝试了
dump
dput
,但没有任何改变

请注意,
MTST
最初的类型是
字符

谢谢你的帮助。谢谢

编辑:显示以下
dput(MTST)

c("<U+0391>G<U+03A1><U+0399><U+039D><U+0399><U+039F>                                 ",
"<U+0391>G<U+03A7><U+0399><U+0391><U+039B><U+039F>S                                ",
"<U+0391><U+0399>G<U+0399><U+039D><U+0391>                                  ",
"<U+0391><U+0399>G<U+0399><U+039F>                                   ",
"<U+0391><U+0399><U+0394><U+0397><U+03A8><U+039F>S                                 ",
"<U+0391><U+039A><U+03A4><U+0399><U+039F>(<U+03A0><U+03A1><U+0395><U+0392><U+0395><U+0396><U+0391>)                          ",
"<U+0391><U+039B><U+0395><U+039E><U+0391><U+039D><U+0394><U+03A1><U+039F><U+03A5><U+03A0><U+039F><U+039B><U+0397>                          ",
"<U+0391><U+039B><U+0399><U+0391><U+03A1><U+03A4><U+039F>S                                ",
"<U+0391><U+039D><U+0391><U+0392><U+03A1><U+03A5><U+03A4><U+0391>                                ",
"<U+0391><U+039D><U+0394><U+03A1><U+0391><U+0392><U+0399><U+0394><U+0391>                               ",
"<U+0391><U+039D>OG<U+0395><U+0399><U+0391>                                 ",
"<U+0391><U+03A1><U+0391><U+039E><U+039F>S                                  ",
"<U+0391><U+03A1><U+0391><U+03A7>O<U+0392><U+0391>                                 ",
"<U+0391><U+03A1>G<U+039F>S(<U+03A0><U+03A5><U+03A1>G<U+0395><U+039B><U+0391>)                          ",
"<U+0391><U+03A1>G<U+039F>S<U+03A4><U+039F><U+039B><U+0399>                               ",
"<U+0391><U+03A1><U+03A4><U+0391> (<U+03A0><U+039F><U+039B><U+0397>)                             ",
"<U+0391><U+03A1><U+03A4><U+0391> (F<U+0399><U+039B><U+039F>T<U+0395><U+0397>)                          ",
"<U+0391>S<U+03A4><U+0395><U+03A1><U+039F>S<U+039A><U+039F><U+03A0><U+0395><U+0399><U+039F>                           ",
"<U+0391>S<U+03A4><U+03A1><U+039F>S                                  ",
"<U+0391>S<U+03A4><U+03A5><U+03A0><U+0391><U+039B><U+0391><U+0399><U+0391>                              ",
"<U+0392><U+0391><U+039C><U+039F>S                                   ",
"<U+0392><U+0395><U+039B><U+039F> (<U+039A><U+039F><U+03A1><U+0399><U+039D>T<U+0399><U+0391>S)                        ",
"<U+0392><U+039F><U+039B><U+039F>S                                   ",
"<U+0392><U+03A5><U+03A4><U+0399><U+039D><U+0391>                                  ",
"G<U+039F><U+03A1><U+03A4><U+03A5>S                                  ",
"G<U+03A5>T<U+0395><U+0399><U+039F>                                  ",
"<U+0394><U+0395>SF<U+0399><U+039D><U+0391>                                 ",
"<U+0394><U+0399><U+0391><U+0392><U+039F><U+039B><U+0399><U+03A4>S<U+0399>                              ",
"<U+0394><U+039F><U+039C><U+039F><U+039A><U+039F>S                                 ",
"<U+0394><U+03A1><U+0391><U+039C><U+0391>                                   ",
"<U+0395><U+0394><U+0395>SS<U+0391>                                  ",
"<U+0395><U+039B><U+0395><U+03A5>S<U+0399><U+039D><U+0391>                                ",
"<U+0395><U+039B><U+039B><U+0397><U+039D><U+0399><U+039A><U+039F> ae<U+03C1>                            ",
"<U+0396><U+0391><U+039A><U+03A5><U+039D>T<U+039F>S                                ",
"<U+0396><U+0391><U+039A><U+03A5><U+039D>T<U+039F>S_<U+03A0><U+039F><U+039B><U+0397>                           ",
"<U+0396><U+0391><U+03A1><U+039F>S                                   ",
"<U+0397><U+03A1><U+0391><U+039A><U+039B><U+0395><U+0399><U+039F>                                ",
"T<U+0391>S<U+039F>S                                   ", "T<U+0397><U+03A1><U+0391> (S<U+0391><U+039D><U+03A4><U+039F><U+03A1><U+0399><U+039D><U+0397>",
"<U+0399><U+0395><U+03A1><U+0391><U+03A0><U+0395><U+03A4><U+03A1><U+0391>                               ",
"<U+0399><U+039A><U+0391><U+03A1><U+0399><U+0391>_<U+0391>/<U+0394>                              ",
"<U+0399>O<U+0391><U+039D><U+039D><U+0399><U+039D><U+0391>                                ",
"<U+039A><U+0391><U+0392><U+0391><U+039B><U+0391> (<U+03A0><U+039F><U+039B><U+0397>)                           ",
"<U+039A><U+0391><U+0392><U+0391><U+039B><U+0391>(<U+0391><U+039C><U+03A5>G<U+0394><U+0391><U+039B><U+0395>O<U+039D><U+0391>S)                    ",
"<U+039A><U+0391><U+039B><U+0391><U+0392><U+03A1><U+03A5><U+03A4><U+0391>                               ",
"<U+039A><U+0391><U+039B><U+0391><U+039C><U+0391><U+03A4><U+0391>                                ",
"<U+039A><U+0391><U+039B><U+0391><U+039C><U+03A0><U+0391><U+039A><U+0391>                               ",
"<U+039A><U+0391><U+03A1><U+0394><U+0399><U+03A4>S<U+0391>                                ",
"<U+039A><U+0391><U+03A1><U+03A0><U+0391>T<U+039F>S_<U+0391>/<U+0394>                            ",
"<U+039A><U+0391><U+03A1><U+03A0><U+0391>T<U+039F>S_<U+03A0><U+039F><U+039B><U+0397>                           ",
"<U+039A><U+0391><U+03A1><U+03A0><U+0395><U+039D><U+0397>S<U+0399>                               ",
"<U+039A><U+0391><U+03A1><U+03A5>S<U+03A4><U+039F>S                                ",
"<U+039A><U+0391>S<U+039F>S                                   ",
"<U+039A><U+0391>S<U+03A4><U+0395><U+039B><U+039B><U+0399>                                ",
"<U+039A><U+0391>S<U+03A4><U+039F><U+03A1><U+0399><U+0391>                                ",
"<U+039A><U+0395><U+03A1><U+039A><U+03A5><U+03A1><U+0391>                                 ",
"<U+039A><U+039F><U+0396><U+0391><U+039D><U+0397>                                  ",
"<U+039A><U+039F><U+039C><U+039F><U+03A4><U+0397><U+039D><U+0397>                                ",
"<U+039A><U+039F><U+039D><U+0399><U+03A4>S<U+0391>                                 ",
"<U+039A><U+039F><U+03A1><U+0399><U+039D>T<U+039F>S                                ",
"<U+039A><U+03A5>T<U+0397><U+03A1><U+0391>_<U+0391>/<U+0394>                              ",
"<U+039A><U+03A5><U+039C><U+0397>                                    ",
"<U+039A>OS                                     ", "<U+039A>OS_<U+03A0><U+039F><U+039B><U+0397>                                ",
"<U+039B><U+0391><U+039C><U+0399><U+0391>                                   ",
"<U+039B><U+0391><U+03A1><U+0399>S<U+0391>                                  ",
"<U+039B><U+0395><U+03A1><U+039F>S                                   ",
"<U+039B><U+0395><U+03A5><U+039A><U+0391><U+0394><U+0391> (<U+039D><U+0397>S<U+0399>)                          ",
"<U+039B><U+0395>O<U+039D><U+0399><U+0394><U+0399><U+039F>                                ",
"<U+039B><U+0397><U+039C><U+039D><U+039F>S                                  ",
"<U+039B><U+0399><U+0394>O<U+03A1><U+0399><U+039A><U+0399>                                ",
"<U+039C><U+0391><U+039A><U+0395><U+0394><U+039F><U+039D><U+0399><U+0391>                               ",
"<U+039C><U+0391><U+03A1><U+0391>TO<U+039D><U+0391>S                               ",
"<U+039C><U+0395>TO<U+039D><U+0397>                                  ",
"<U+039C><U+0395>S<U+039F><U+039B><U+039F>GG<U+0399>                               ",
"<U+039C><U+0397><U+039B><U+039F>S_<U+0391><U+039C>S                               ",
"<U+039C><U+03A5><U+039A><U+039F><U+039D><U+039F>S                                 ",
"<U+039C><U+03A5><U+03A4><U+0399><U+039B><U+0397><U+039D><U+0397>                                ",
"<U+039D><U+0391><U+039E><U+039F>S                                   ",
"<U+039D><U+0391><U+03A5><U+03A0><U+0391><U+039A><U+03A4><U+039F>S                               ",
"<U+039D><U+0391><U+03A5><U+03A0><U+039B><U+0399><U+039F>                                 ",
"<U+039D><U+0395><U+0391> F<U+0399><U+039B><U+0391><U+0394><U+0395><U+039B>F<U+0395><U+0399><U+0391>                         ",
"<U+039E><U+0391><U+039D>T<U+0397>                                   ",
"<U+039F><U+03A1><U+0395>S<U+03A4><U+0399><U+0391><U+0394><U+0391>                               ",
"<U+03A0><U+0391><U+0399><U+0391><U+039D><U+0399><U+0391>                                 ",
"<U+03A0><U+0391><U+039B><U+0391><U+0399><U+039F><U+03A7>O<U+03A1><U+0391>                              ",
"<U+03A0><U+0391><U+03A1><U+039F>S_<U+0391>/<U+0394>                               ",
"<U+03A0><U+0391><U+03A4><U+03A1><U+0391>                                   ",
"<U+03A0><U+0395><U+0399><U+03A1><U+0391><U+0399><U+0391>S                                ",
"<U+03A0><U+039F><U+039B><U+03A5>G<U+03A5><U+03A1><U+039F>S                               ",
"<U+03A0><U+039F><U+03A4><U+0399><U+0394><U+0391><U+0399><U+0391>                                ",
"<U+03A0><U+03A4><U+039F><U+039B><U+0395><U+039C><U+0391><U+0399><U+0394><U+0391>                              ",
"<U+03A0><U+03A5><U+03A1>G<U+039F>S                                  ",
"<U+03A1><U+0391>F<U+0397><U+039D><U+0391>                                  ",
"<U+03A1><U+0395>T<U+03A5><U+039C><U+039D><U+039F>                                 ",
"<U+03A1><U+039F><U+0394><U+039F>S                                   ",
"S<U+0391><U+039C><U+039F>S                                   ",
"S<U+0395><U+0394><U+0395>S                                   ",
"S<U+0395><U+03A1><U+03A1><U+0395>S                                  ",
"S<U+0397><U+03A4><U+0395><U+0399><U+0391>                                  ",
"S<U+039A><U+0399><U+0391>T<U+039F>S                                 ",
"S<U+039A><U+039F><U+03A4><U+0399><U+039D><U+0391>                                 ",
"S<U+039A><U+03A5><U+03A1><U+039F>S                                  ",
"S<U+039F><U+03A5><U+0394><U+0391>                                   ",
"S<U+039F><U+03A5>F<U+039B><U+0399>                                  ",
"S<U+03A0><U+0391><U+03A1><U+03A4><U+0397>                                  ",
"S<U+03A0><U+0391><U+03A4><U+0391>(<U+0392><U+0395><U+039D><U+0399><U+0396><U+0395><U+039B><U+039F>S)                        ",
"S<U+03A0><U+0395><U+03A4>S<U+0395>S                                 ",
"S<U+03A4><U+0395>F<U+0391><U+039D><U+0399> (<U+039A><U+039F><U+03A1><U+0399><U+039D>T<U+0399><U+0391>S)                     ",
"S<U+03A5><U+039A><U+03A5>O<U+039D><U+0391>                                 ",
"S<U+03A5><U+03A1><U+039F>S_<U+0391>/<U+0394>                               ",
"<U+03A4><U+0391><U+039D><U+0391>G<U+03A1><U+0391>                                 ",
"<U+03A4><U+0391><U+03A4><U+039F><U+0399> (<U+0394><U+0395><U+039A><U+0395><U+039B><U+0395><U+0399><U+0391>)                        ",
"<U+03A4><U+0396><U+0395><U+03A1><U+039C><U+0399><U+0391><U+0394><U+0395>S                              ",
"<U+03A4><U+03A1><U+0399><U+039A><U+0391><U+039B><U+0391> <U+0397><U+039C><U+0391>T<U+0395><U+0399><U+0391>S                        ",
"<U+03A4><U+03A1><U+0399><U+039A><U+0391><U+039B><U+0391> T<U+0395>SS<U+0391><U+039B><U+0399><U+0391>S                       ",
"<U+03A4><U+03A1><U+0399><U+03A0><U+039F><U+039B><U+0397>                                 ",
"<U+03A4><U+03A5><U+039C><U+03A0><U+0391><U+039A><U+0399>                                 ",
"<U+03A4><U+03A5><U+03A1><U+0399><U+039D>T<U+0391>                                 ",
"F<U+0391><U+03A1>S<U+0391><U+039B><U+0391>                                 ",
"F<U+039B>O<U+03A1><U+0399><U+039D><U+0391>                                 ",
"F<U+039F><U+03A5><U+03A1><U+039D><U+0397>                                  ",
"F<U+03A5><U+03A7><U+03A4><U+0399><U+0391>                                  ",
"<U+03A7><U+0391><U+039B><U+039A><U+0399><U+0394><U+0391>                                 ",
"<U+03A7><U+0391><U+039D><U+0399><U+0391>                                   ",
"<U+03A7><U+0399><U+039F>S                                    ",
"<U+03A7><U+03A1><U+03A5>S<U+039F><U+03A5><U+03A0><U+039F><U+039B><U+0397>_<U+039A><U+0391><U+0392><U+0391><U+039B><U+0391>                       ",
"O<U+03A1><U+0395><U+039F><U+0399>                                   "
)
c(“G”,
“GS”,
“G”,
“G”,
“S”,
"()                          ",
"                          ",
“S”,
"                                ",
"                               ",
“OG”,
“S”,
“O”,
“一般事务(G)”,
“GS”,
" ()                             ",
“(FT)”,
“党卫军”,
“党卫军”,
“S”,
“S”,
“(TS)”,
“S”,
"                                  ",
“GS”,
“GT”,
“SF”,
“S”,
“S”,
"                                   ",
“党卫军”,
“S”,
“ae”,
“TS”,
“TS_”,
“S”,
"                                ",
“TSS”、“T(S)”,
"                               ",
"_/                              ",
“O”,
" ()                           ",
“(GOS)”,
"                               ",
"                                ",
"                               ",
“S”,
“TS_u/”,
“TS_”,
“S”,
“党卫军”,
“党卫军”,
“S”,
“S”,
"                                 ",
"                                  ",
"                                ",
“S”,
“TS”,
“T_/”,
"                                    ",
“操作系统”、“操作系统”,
"                                   ",
“S”,
“S”,
“(S)“,
“O”,
“S”,
“O”,
"                               ",
“TOS”,
“至”,
“新加坡政府”,
“S_S”,
“S”,
"                                ",
“S”,
“S”,
"                                 ",
“FF”,
“T”,
“S”,
"                                 ",
“O”,
“S_/”,
"                                   ",
“S”,
“GS”,
"                                ",
"                              ",
“GS”,
“F”,
“T”,
“S”,
“党卫军”,
“党卫军”,
“党卫军”,
“S”,
“STS”,
“S”,
“党卫军”,
“S”,
“SF”,
“S”,
“S(S)”,
“SSS”,
“SF(TS)”,
“所以”,
“SS_uu/”,
“G”,
" ()                        ",
“S”,
“TS”,
“TSSS”,
"                                 ",
"                                 ",
“T”,
“财政司司长”,
“FO”,
“F”,
“F”,
"                                 ",
"                                   ",
“S”,
“S_”,
“O”
)

您所拥有的看起来是普通的7位ASCII字符串,其中试图把部分Unicode代码点包装成 `<U+XXXX>` 的形式(XXXX 为四位十六进制数)来进行编码。


据我所知,这不是一种公认的Unicode编码,部分原因是它无法区分转义序列与一段真正的字面文本 `<U+XXXX>`。我已经在base R中编写了一个方便、通用、但内部实现略显取巧的函数,用于此目的。它是:

# Split each string on a regex and rebuild it through a callback, so that
# every matched delimiter can be replaced by a computed value.
#
# Args:
#   input: character vector to process.
#   re:    Perl-compatible regular expression (as a string) matching the
#          delimiters to be substituted.
#   f:     function(s, d) called once per element of `input`, where `d` holds
#          the matched delimiter texts in order and `s` holds the surrounding
#          text pieces (padded with a trailing "" when the pieces and the
#          delimiters have equal length). The default wraps every delimiter
#          in "<" and ">" and glues everything back together.
#
# Returns:
#   A list with one entry per element of `input`, each being the value
#   returned by `f`. Zero-length `input` yields an empty list.
dsub <- function(input, re,
                 f = function(s, d) paste0(s, c(if (length(d) == 0L) NULL else paste0("<", d, ">"), ""), collapse = "")) {
    # Text pieces between the matches.
    splits <- strsplit(input, re, perl = TRUE)
    # Recover the matched delimiter texts: put a "." in front of every match,
    # split on a "." that is directly followed by a match (lookahead keeps
    # the match in the piece), drop the piece before the first match, then
    # strip each remaining piece down to its leading match.
    marked <- gsub(paste0("(", re, ")"), ".\\1", input, perl = TRUE)
    pieces <- strsplit(marked, paste0("\\.(?=", re, ")"), perl = TRUE)
    delims <- lapply(pieces, function(x) {
        sub(paste0("^(", re, ").*"), "\\1", x[-1], perl = TRUE)
    })
    # seq_along() rather than 1:length(): the latter iterates over c(1, 0)
    # for zero-length input and fails on the out-of-range subscript.
    lapply(seq_along(splits), function(i) {
        s <- splits[[i]]
        d <- delims[[i]]
        # strsplit() drops a trailing empty piece; add it back when the
        # piece count equals the delimiter count.
        f(c(s, if (length(s) == length(d)) "" else NULL), d)
    })
}
并使用本问题中给出的相当广泛的示例数据:

# Pattern for one "<U+XXXX>" escape; the capture group holds the four hex
# digits. Defined here because the call below needs it.
re <- "<U\\+([0-9a-fA-F]{4})>"
# Combiner passed to dsub(): turns each matched escape into the character for
# its hex code point and interleaves the results with the surrounding text.
f <- function(s, d) {
    decoded <- if (length(d) == 0L) {
        NULL
    } else {
        intToUtf8(strtoi(sub(re, "\\1", d), 16L), multiple = TRUE)
    }
    paste0(s, c(decoded, ""), collapse = "")
}
input <- c("<U+0391>G<U+03A1><U+0399><..."); ## (excerpted)
do.call(c, dsub(input, re, f))
##   [1] "ΑGΡΙΝΙΟ                                 " "ΑGΧΙΑΛΟS                                " "ΑΙGΙΝΑ                                  " "ΑΙGΙΟ                                   "
##   [5] "ΑΙΔΗΨΟS                                 " "ΑΚΤΙΟ(ΠΡΕΒΕΖΑ)                          " "ΑΛΕΞΑΝΔΡΟΥΠΟΛΗ                          " "ΑΛΙΑΡΤΟS                                "
##   [9] "ΑΝΑΒΡΥΤΑ                                " "ΑΝΔΡΑΒΙΔΑ                               " "ΑΝOGΕΙΑ                                 " "ΑΡΑΞΟS                                  "
##  [13] "ΑΡΑΧOΒΑ                                 " "ΑΡGΟS(ΠΥΡGΕΛΑ)                          " "ΑΡGΟSΤΟΛΙ                               " "ΑΡΤΑ (ΠΟΛΗ)                             "
##  [17] "ΑΡΤΑ (FΙΛΟTΕΗ)                          " "ΑSΤΕΡΟSΚΟΠΕΙΟ                           " "ΑSΤΡΟS                                  " "ΑSΤΥΠΑΛΑΙΑ                              "
##  [21] "ΒΑΜΟS                                   " "ΒΕΛΟ (ΚΟΡΙΝTΙΑS)                        " "ΒΟΛΟS                                   " "ΒΥΤΙΝΑ                                  "
##  [25] "GΟΡΤΥS                                  " "GΥTΕΙΟ                                  " "ΔΕSFΙΝΑ                                 " "ΔΙΑΒΟΛΙΤSΙ                              "
##  [29] "ΔΟΜΟΚΟS                                 " "ΔΡΑΜΑ                                   " "ΕΔΕSSΑ                                  " "ΕΛΕΥSΙΝΑ                                "
##  [33] "ΕΛΛΗΝΙΚΟ aeρ                            " "ΖΑΚΥΝTΟS                                " "ΖΑΚΥΝTΟS_ΠΟΛΗ                           " "ΖΑΡΟS                                   "
##  [37] "ΗΡΑΚΛΕΙΟ                                " "TΑSΟS                                   " "TΗΡΑ (SΑΝΤΟΡΙΝΗ"                          "ΙΕΡΑΠΕΤΡΑ                               "
##  [41] "ΙΚΑΡΙΑ_Α/Δ                              " "ΙOΑΝΝΙΝΑ                                " "ΚΑΒΑΛΑ (ΠΟΛΗ)                           " "ΚΑΒΑΛΑ(ΑΜΥGΔΑΛΕOΝΑS)                    "
##  [45] "ΚΑΛΑΒΡΥΤΑ                               " "ΚΑΛΑΜΑΤΑ                                " "ΚΑΛΑΜΠΑΚΑ                               " "ΚΑΡΔΙΤSΑ                                "
##  [49] "ΚΑΡΠΑTΟS_Α/Δ                            " "ΚΑΡΠΑTΟS_ΠΟΛΗ                           " "ΚΑΡΠΕΝΗSΙ                               " "ΚΑΡΥSΤΟS                                "
##  [53] "ΚΑSΟS                                   " "ΚΑSΤΕΛΛΙ                                " "ΚΑSΤΟΡΙΑ                                " "ΚΕΡΚΥΡΑ                                 "
##  [57] "ΚΟΖΑΝΗ                                  " "ΚΟΜΟΤΗΝΗ                                " "ΚΟΝΙΤSΑ                                 " "ΚΟΡΙΝTΟS                                "
##  [61] "ΚΥTΗΡΑ_Α/Δ                              " "ΚΥΜΗ                                    " "ΚOS                                     " "ΚOS_ΠΟΛΗ                                "
##  [65] "ΛΑΜΙΑ                                   " "ΛΑΡΙSΑ                                  " "ΛΕΡΟS                                   " "ΛΕΥΚΑΔΑ (ΝΗSΙ)                          "
##  [69] "ΛΕOΝΙΔΙΟ                                " "ΛΗΜΝΟS                                  " "ΛΙΔOΡΙΚΙ                                " "ΜΑΚΕΔΟΝΙΑ                               "
##  [73] "ΜΑΡΑTOΝΑS                               " "ΜΕTOΝΗ                                  " "ΜΕSΟΛΟGGΙ                               " "ΜΗΛΟS_ΑΜS                               "
##  [77] "ΜΥΚΟΝΟS                                 " "ΜΥΤΙΛΗΝΗ                                " "ΝΑΞΟS                                   " "ΝΑΥΠΑΚΤΟS                               "
##  [81] "ΝΑΥΠΛΙΟ                                 " "ΝΕΑ FΙΛΑΔΕΛFΕΙΑ                         " "ΞΑΝTΗ                                   " "ΟΡΕSΤΙΑΔΑ                               "
##  [85] "ΠΑΙΑΝΙΑ                                 " "ΠΑΛΑΙΟΧOΡΑ                              " "ΠΑΡΟS_Α/Δ                               " "ΠΑΤΡΑ                                   "
##  [89] "ΠΕΙΡΑΙΑS                                " "ΠΟΛΥGΥΡΟS                               " "ΠΟΤΙΔΑΙΑ                                " "ΠΤΟΛΕΜΑΙΔΑ                              "
##  [93] "ΠΥΡGΟS                                  " "ΡΑFΗΝΑ                                  " "ΡΕTΥΜΝΟ                                 " "ΡΟΔΟS                                   "
##  [97] "SΑΜΟS                                   " "SΕΔΕS                                   " "SΕΡΡΕS                                  " "SΗΤΕΙΑ                                  "
## [101] "SΚΙΑTΟS                                 " "SΚΟΤΙΝΑ                                 " "SΚΥΡΟS                                  " "SΟΥΔΑ                                   "
## [105] "SΟΥFΛΙ                                  " "SΠΑΡΤΗ                                  " "SΠΑΤΑ(ΒΕΝΙΖΕΛΟS)                        " "SΠΕΤSΕS                                 "
## [109] "SΤΕFΑΝΙ (ΚΟΡΙΝTΙΑS)                     " "SΥΚΥOΝΑ                                 " "SΥΡΟS_Α/Δ                               " "ΤΑΝΑGΡΑ                                 "
## [113] "ΤΑΤΟΙ (ΔΕΚΕΛΕΙΑ)                        " "ΤΖΕΡΜΙΑΔΕS                              " "ΤΡΙΚΑΛΑ ΗΜΑTΕΙΑS                        " "ΤΡΙΚΑΛΑ TΕSSΑΛΙΑS                       "
## [117] "ΤΡΙΠΟΛΗ                                 " "ΤΥΜΠΑΚΙ                                 " "ΤΥΡΙΝTΑ                                 " "FΑΡSΑΛΑ                                 "
## [121] "FΛOΡΙΝΑ                                 " "FΟΥΡΝΗ                                  " "FΥΧΤΙΑ                                  " "ΧΑΛΚΙΔΑ                                 "
## [125] "ΧΑΝΙΑ                                   " "ΧΙΟS                                    " "ΧΡΥSΟΥΠΟΛΗ_ΚΑΒΑΛΑ                       " "OΡΕΟΙ                                   "

这里有另一种方法,可以把字符串中被转义的字符还原为真正的Unicode字符(做法借鉴自他处,原文链接缺失)。在这里,我们精确匹配 `<U+XXXX>` 这种形式,并通过一些位操作把其中的十六进制值展开为相应长度的UTF-8字符。

# Replace "<U+XXXX>" escapes (exactly four hex digits) in a character vector
# with the characters they stand for, hand-packing each code point into its
# UTF-8 byte sequence.
#
# Args:
#   x: character vector that may contain "<U+XXXX>" escapes.
#
# Returns:
#   A character vector of the same length as `x` with the escapes replaced
#   and the encoding declared as UTF-8. Elements without escapes come back
#   unchanged. Escapes that cannot be represented as a character (U+0000 and
#   the surrogate range U+D800..U+DFFF) are left in place as text.
trueunicode <- function(x) {
    if (length(x) == 0L) {
        return(x)
    }
    bit0 <- as.raw(0)
    bit1 <- as.raw(1)
    # Encode one positive code point as UTF-8. intToBits() returns the least
    # significant bit first, so every byte is assembled low bits first and the
    # bytes are reversed at the end to put the lead byte in front.
    packuni <- function(cp) {
        bv <- intToBits(cp)
        maxbit <- max(which(bv != bit0))
        if (maxbit < 8) {
            # 7 bits or fewer: a single byte holding the code point itself.
            rawToChar(as.raw(cp))
        } else if (maxbit < 12) {
            # Two bytes: 110xxxxx 10xxxxxx.
            rawToChar(rev(packBits(c(bv[1:6], bit0, bit1,
                                     bv[7:11], bit0, bit1, bit1), "raw")))
        } else if (maxbit < 17) {
            # Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
            rawToChar(rev(packBits(c(bv[1:6], bit0, bit1,
                                     bv[7:12], bit0, bit1,
                                     bv[13:16], bit0, bit1, bit1, bit1), "raw")))
        } else {
            stop("too many bits")
        }
    }
    m <- gregexpr("<U\\+[0-9a-fA-F]{4}>", x)
    codes <- regmatches(x, m)
    chars <- lapply(codes, function(esc) {
        # Characters 4..7 of "<U+XXXX>" are the hex digits. With no match
        # `esc` is empty and an empty replacement is returned.
        codepoints <- strtoi(substring(esc, 4, 7), base = 16L)
        convertible <- codepoints > 0L &
            (codepoints < 0xD800L | codepoints > 0xDFFFL)
        out <- esc
        out[convertible] <- vapply(codepoints[convertible], packuni, character(1))
        out
    })
    regmatches(x, m) <- chars
    Encoding(x) <- "UTF-8"
    x
}

@不管你引用不引用Spacedman的话,问题都是一样的。有什么想法吗?想法:基本上你已经搞得一团糟了。一些
# Split each string on a regex and rebuild it through a callback, so that
# every matched delimiter can be replaced by a computed value.
#
# Args:
#   input: character vector to process.
#   re:    Perl-compatible regular expression (as a string) matching the
#          delimiters to be substituted.
#   f:     function(s, d) called once per element of `input`, where `d` holds
#          the matched delimiter texts in order and `s` holds the surrounding
#          text pieces (padded with a trailing "" when the pieces and the
#          delimiters have equal length). The default wraps every delimiter
#          in "<" and ">" and glues everything back together.
#
# Returns:
#   A list with one entry per element of `input`, each being the value
#   returned by `f`. Zero-length `input` yields an empty list.
dsub <- function(input, re,
                 f = function(s, d) paste0(s, c(if (length(d) == 0L) NULL else paste0("<", d, ">"), ""), collapse = "")) {
    # Text pieces between the matches.
    splits <- strsplit(input, re, perl = TRUE)
    # Recover the matched delimiter texts: put a "." in front of every match,
    # split on a "." that is directly followed by a match (lookahead keeps
    # the match in the piece), drop the piece before the first match, then
    # strip each remaining piece down to its leading match.
    marked <- gsub(paste0("(", re, ")"), ".\\1", input, perl = TRUE)
    pieces <- strsplit(marked, paste0("\\.(?=", re, ")"), perl = TRUE)
    delims <- lapply(pieces, function(x) {
        sub(paste0("^(", re, ").*"), "\\1", x[-1], perl = TRUE)
    })
    # seq_along() rather than 1:length(): the latter iterates over c(1, 0)
    # for zero-length input and fails on the out-of-range subscript.
    lapply(seq_along(splits), function(i) {
        s <- splits[[i]]
        d <- delims[[i]]
        # strsplit() drops a trailing empty piece; add it back when the
        # piece count equals the delimiter count.
        f(c(s, if (length(s) == length(d)) "" else NULL), d)
    })
}
# Small demo: escapes in the Latin-1 range, plus an empty and a
# whitespace-only string.
input <- c("Luc TR<U+00c9>HAN", "aa<U+00ca>bb<U+00cb>cc", "<U+00CC><U+00Cd>", "", "  ")
# Pattern for one "<U+XXXX>" escape; the capture group holds the four hex
# digits.
re <- "<U\\+([0-9a-fA-F]{4})>"
# Combiner passed to dsub(): turns each matched escape into the character for
# its hex code point and interleaves the results with the surrounding text.
# The digits are parsed explicitly as base 16 instead of relying on implicit
# coercion of "0x..." strings.
f <- function(s, d) {
    decoded <- if (length(d) == 0L) {
        NULL
    } else {
        intToUtf8(strtoi(sub(re, "\\1", d), 16L), multiple = TRUE)
    }
    paste0(s, c(decoded, ""), collapse = "")
}
do.call(c, dsub(input, re, f))
## [1] "Luc TRÉHAN" "aaÊbbËcc"   "ÌÍ"         ""           "  "
input <- c("<U+0391>G<U+03A1><U+0399><..."); ## (excerpted)
do.call(c, dsub(input, re, f))
##   [1] "ΑGΡΙΝΙΟ                                 " "ΑGΧΙΑΛΟS                                " "ΑΙGΙΝΑ                                  " "ΑΙGΙΟ                                   "
##   [5] "ΑΙΔΗΨΟS                                 " "ΑΚΤΙΟ(ΠΡΕΒΕΖΑ)                          " "ΑΛΕΞΑΝΔΡΟΥΠΟΛΗ                          " "ΑΛΙΑΡΤΟS                                "
##   [9] "ΑΝΑΒΡΥΤΑ                                " "ΑΝΔΡΑΒΙΔΑ                               " "ΑΝOGΕΙΑ                                 " "ΑΡΑΞΟS                                  "
##  [13] "ΑΡΑΧOΒΑ                                 " "ΑΡGΟS(ΠΥΡGΕΛΑ)                          " "ΑΡGΟSΤΟΛΙ                               " "ΑΡΤΑ (ΠΟΛΗ)                             "
##  [17] "ΑΡΤΑ (FΙΛΟTΕΗ)                          " "ΑSΤΕΡΟSΚΟΠΕΙΟ                           " "ΑSΤΡΟS                                  " "ΑSΤΥΠΑΛΑΙΑ                              "
##  [21] "ΒΑΜΟS                                   " "ΒΕΛΟ (ΚΟΡΙΝTΙΑS)                        " "ΒΟΛΟS                                   " "ΒΥΤΙΝΑ                                  "
##  [25] "GΟΡΤΥS                                  " "GΥTΕΙΟ                                  " "ΔΕSFΙΝΑ                                 " "ΔΙΑΒΟΛΙΤSΙ                              "
##  [29] "ΔΟΜΟΚΟS                                 " "ΔΡΑΜΑ                                   " "ΕΔΕSSΑ                                  " "ΕΛΕΥSΙΝΑ                                "
##  [33] "ΕΛΛΗΝΙΚΟ aeρ                            " "ΖΑΚΥΝTΟS                                " "ΖΑΚΥΝTΟS_ΠΟΛΗ                           " "ΖΑΡΟS                                   "
##  [37] "ΗΡΑΚΛΕΙΟ                                " "TΑSΟS                                   " "TΗΡΑ (SΑΝΤΟΡΙΝΗ"                          "ΙΕΡΑΠΕΤΡΑ                               "
##  [41] "ΙΚΑΡΙΑ_Α/Δ                              " "ΙOΑΝΝΙΝΑ                                " "ΚΑΒΑΛΑ (ΠΟΛΗ)                           " "ΚΑΒΑΛΑ(ΑΜΥGΔΑΛΕOΝΑS)                    "
##  [45] "ΚΑΛΑΒΡΥΤΑ                               " "ΚΑΛΑΜΑΤΑ                                " "ΚΑΛΑΜΠΑΚΑ                               " "ΚΑΡΔΙΤSΑ                                "
##  [49] "ΚΑΡΠΑTΟS_Α/Δ                            " "ΚΑΡΠΑTΟS_ΠΟΛΗ                           " "ΚΑΡΠΕΝΗSΙ                               " "ΚΑΡΥSΤΟS                                "
##  [53] "ΚΑSΟS                                   " "ΚΑSΤΕΛΛΙ                                " "ΚΑSΤΟΡΙΑ                                " "ΚΕΡΚΥΡΑ                                 "
##  [57] "ΚΟΖΑΝΗ                                  " "ΚΟΜΟΤΗΝΗ                                " "ΚΟΝΙΤSΑ                                 " "ΚΟΡΙΝTΟS                                "
##  [61] "ΚΥTΗΡΑ_Α/Δ                              " "ΚΥΜΗ                                    " "ΚOS                                     " "ΚOS_ΠΟΛΗ                                "
##  [65] "ΛΑΜΙΑ                                   " "ΛΑΡΙSΑ                                  " "ΛΕΡΟS                                   " "ΛΕΥΚΑΔΑ (ΝΗSΙ)                          "
##  [69] "ΛΕOΝΙΔΙΟ                                " "ΛΗΜΝΟS                                  " "ΛΙΔOΡΙΚΙ                                " "ΜΑΚΕΔΟΝΙΑ                               "
##  [73] "ΜΑΡΑTOΝΑS                               " "ΜΕTOΝΗ                                  " "ΜΕSΟΛΟGGΙ                               " "ΜΗΛΟS_ΑΜS                               "
##  [77] "ΜΥΚΟΝΟS                                 " "ΜΥΤΙΛΗΝΗ                                " "ΝΑΞΟS                                   " "ΝΑΥΠΑΚΤΟS                               "
##  [81] "ΝΑΥΠΛΙΟ                                 " "ΝΕΑ FΙΛΑΔΕΛFΕΙΑ                         " "ΞΑΝTΗ                                   " "ΟΡΕSΤΙΑΔΑ                               "
##  [85] "ΠΑΙΑΝΙΑ                                 " "ΠΑΛΑΙΟΧOΡΑ                              " "ΠΑΡΟS_Α/Δ                               " "ΠΑΤΡΑ                                   "
##  [89] "ΠΕΙΡΑΙΑS                                " "ΠΟΛΥGΥΡΟS                               " "ΠΟΤΙΔΑΙΑ                                " "ΠΤΟΛΕΜΑΙΔΑ                              "
##  [93] "ΠΥΡGΟS                                  " "ΡΑFΗΝΑ                                  " "ΡΕTΥΜΝΟ                                 " "ΡΟΔΟS                                   "
##  [97] "SΑΜΟS                                   " "SΕΔΕS                                   " "SΕΡΡΕS                                  " "SΗΤΕΙΑ                                  "
## [101] "SΚΙΑTΟS                                 " "SΚΟΤΙΝΑ                                 " "SΚΥΡΟS                                  " "SΟΥΔΑ                                   "
## [105] "SΟΥFΛΙ                                  " "SΠΑΡΤΗ                                  " "SΠΑΤΑ(ΒΕΝΙΖΕΛΟS)                        " "SΠΕΤSΕS                                 "
## [109] "SΤΕFΑΝΙ (ΚΟΡΙΝTΙΑS)                     " "SΥΚΥOΝΑ                                 " "SΥΡΟS_Α/Δ                               " "ΤΑΝΑGΡΑ                                 "
## [113] "ΤΑΤΟΙ (ΔΕΚΕΛΕΙΑ)                        " "ΤΖΕΡΜΙΑΔΕS                              " "ΤΡΙΚΑΛΑ ΗΜΑTΕΙΑS                        " "ΤΡΙΚΑΛΑ TΕSSΑΛΙΑS                       "
## [117] "ΤΡΙΠΟΛΗ                                 " "ΤΥΜΠΑΚΙ                                 " "ΤΥΡΙΝTΑ                                 " "FΑΡSΑΛΑ                                 "
## [121] "FΛOΡΙΝΑ                                 " "FΟΥΡΝΗ                                  " "FΥΧΤΙΑ                                  " "ΧΑΛΚΙΔΑ                                 "
## [125] "ΧΑΝΙΑ                                   " "ΧΙΟS                                    " "ΧΡΥSΟΥΠΟΛΗ_ΚΑΒΑΛΑ                       " "OΡΕΟΙ                                   "
# Replace "<U+XXXX>" escapes (exactly four hex digits) in a character vector
# with the characters they stand for, hand-packing each code point into its
# UTF-8 byte sequence.
#
# Args:
#   x: character vector that may contain "<U+XXXX>" escapes.
#
# Returns:
#   A character vector of the same length as `x` with the escapes replaced
#   and the encoding declared as UTF-8. Elements without escapes come back
#   unchanged. Escapes that cannot be represented as a character (U+0000 and
#   the surrogate range U+D800..U+DFFF) are left in place as text.
trueunicode <- function(x) {
    if (length(x) == 0L) {
        return(x)
    }
    bit0 <- as.raw(0)
    bit1 <- as.raw(1)
    # Encode one positive code point as UTF-8. intToBits() returns the least
    # significant bit first, so every byte is assembled low bits first and the
    # bytes are reversed at the end to put the lead byte in front.
    packuni <- function(cp) {
        bv <- intToBits(cp)
        maxbit <- max(which(bv != bit0))
        if (maxbit < 8) {
            # 7 bits or fewer: a single byte holding the code point itself.
            rawToChar(as.raw(cp))
        } else if (maxbit < 12) {
            # Two bytes: 110xxxxx 10xxxxxx.
            rawToChar(rev(packBits(c(bv[1:6], bit0, bit1,
                                     bv[7:11], bit0, bit1, bit1), "raw")))
        } else if (maxbit < 17) {
            # Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
            rawToChar(rev(packBits(c(bv[1:6], bit0, bit1,
                                     bv[7:12], bit0, bit1,
                                     bv[13:16], bit0, bit1, bit1, bit1), "raw")))
        } else {
            stop("too many bits")
        }
    }
    m <- gregexpr("<U\\+[0-9a-fA-F]{4}>", x)
    codes <- regmatches(x, m)
    chars <- lapply(codes, function(esc) {
        # Characters 4..7 of "<U+XXXX>" are the hex digits. With no match
        # `esc` is empty and an empty replacement is returned.
        codepoints <- strtoi(substring(esc, 4, 7), base = 16L)
        convertible <- codepoints > 0L &
            (codepoints < 0xD800L | codepoints > 0xDFFFL)
        out <- esc
        out[convertible] <- vapply(codepoints[convertible], packuni, character(1))
        out
    })
    regmatches(x, m) <- chars
    Encoding(x) <- "UTF-8"
    x
}
# Six sample strings with "<U+XXXX>" escapes, decoded with trueunicode().
input <- c(
    "<U+0391>G<U+03A1><U+0399><U+039D><U+0399><U+039F>",
    "<U+0391>G<U+03A7><U+0399><U+0391><U+039B><U+039F>S",
    "<U+0391><U+0399>G<U+0399><U+039D><U+0391>",
    "<U+0391><U+0399>G<U+0399><U+039F>",
    "<U+0391><U+0399><U+0394><U+0397><U+03A8><U+039F>S",
    "<U+0391><U+039A><U+03A4><U+0399><U+039F>(<U+03A0><U+03A1><U+0395><U+0392><U+0395><U+0396><U+0391>)"
)
trueunicode(input)
# [1] "ΑGΡΙΝΙΟ"        "ΑGΧΙΑΛΟS"       "ΑΙGΙΝΑ"         "ΑΙGΙΟ"         
# [5] "ΑΙΔΗΨΟS"        "ΑΚΤΙΟ(ΠΡΕΒΕΖΑ)"