XML 验证/解析错误:2 字节序列的第 2 个字节“?”无效

Xml 2字节序列验证/分析错误的位置2处的字节“?”无效,xml,r,encoding,utf-8,igraph,Xml,R,Encoding,Utf 8,Igraph,我必须导出一个out of R igraph来手动添加列值。当我想再次导入graphml文件时,它必须是正确的UTF-8和有效的xml。因此,在保存到UTF-8之前,我使用iconv转换数据,正如您在下面代码的for循环中看到的那样 library(igraph) edges <- read.csv2("https://www.dropbox.com/s/p8e7hcck0d4nnrp/Subgraph_nowvalid.graphml?dl=0", header=TRUE, quo

我需要从 R 中导出一个 igraph 图,以便手动添加列值。当我想再次导入该 graphml 文件时,它必须是正确的 UTF-8 编码并且是有效的 XML。因此,在保存之前,我使用 iconv 将数据转换为 UTF-8,正如您在下面代码的 for 循环中看到的那样

   library(igraph)
# Read the edge list with read.csv2() (header row present, quoting disabled).
edges <- read.csv2(
  "https://www.dropbox.com/s/p8e7hcck0d4nnrp/Subgraph_nowvalid.graphml?dl=0",
  header = TRUE,
  quote = ""
)

# Number of edge rows; each vertex table below is allocated with this many rows.
amount <- nrow(edges)
amount

# Empty source-vertex table: two text columns followed by eleven numeric ones.
sources <- data.frame(
  Vertexname = character(amount),
  Description = character(amount),
  Follower = numeric(amount),
  Friends = numeric(amount),
  Favourites = numeric(amount),
  Statuses = numeric(amount),
  ProfileAge = numeric(amount),
  Listed = numeric(amount),
  Timestamp = numeric(amount),
  OutDegree = numeric(amount),
  InDegree = numeric(amount),
  WOutDegree = numeric(amount),
  WInDegree = numeric(amount)
)

# The target-vertex table starts out with exactly the same empty layout.
targets <- sources

# Re-encode every column of `edges` as UTF-8 and strip control characters
# before the data is turned into a graph and exported.
#
# seq_len() replaces 1:ncol(edges): for a data frame without columns the old
# form iterated over c(1, 0) and tried to index a column that does not exist.
for (i in seq_len(ncol(edges))) {
  # Bytes that cannot be represented in UTF-8 are dropped (sub = "").
  # NOTE(review): iconv() returns a character vector, so every column,
  # including originally numeric ones, is character after this line.
  edges[, i] <- iconv(edges[, i], to = "UTF-8", sub = "")

  # Remove control characters from text columns.
  if (is.character(edges[, i])) {
    edges[, i] <- gsub("[[:cntrl:]]", "", edges[, i])
  }
}

# Source vertices: the edge list only supplies a name (edge column 1) and a
# timestamp (edge column 4); every other attribute is set to NA.
sources[, "Vertexname"] <- edges[, 1]
sources[, "Timestamp"] <- edges[, 4]
sources[, c(2:8, 10:13)] <- NA

# Target vertices: name from edge column 2, the seven profile attributes from
# edge columns 7-13; timestamp and degree columns are set to NA.
targets[, "Vertexname"] <- edges[, 2]
targets[, "Description"] <- edges[, 7]
targets[, "Follower"] <- edges[, 8]
targets[, "Friends"] <- edges[, 9]
targets[, "Favourites"] <- edges[, 10]
targets[, "Statuses"] <- edges[, 11]
targets[, "ProfileAge"] <- edges[, 12]
targets[, "Listed"] <- edges[, 13]
targets[, 9:13] <- NA

print("REPORT: vertices data frames filled")

# Collapse identical rows so each distinct vertex record appears once per table.
sources <- unique(sources)
targets <- unique(targets)
print("REPORT: Duplicated sources and targets removed")

# Full outer join of both tables on the vertex name. merge() suffixes the
# clashing attribute columns with .x (from sources) and .y (from targets);
# each pair is collapsed into one column and the suffixed pair is removed.
# For every attribute except Timestamp the .x value wins unless it is NA;
# for Timestamp the .y value wins unless it is NA.
# The statement order is kept as-is because within() derives the order of the
# new columns from it.
nodes <- within(
  merge(sources, targets, by = "Vertexname", all = TRUE),
  {
    Description <- ifelse(
      is.na(Description.x), paste(Description.y), Description.x
    )
    Description.x <- NULL
    Description.y <- NULL

    Follower <- ifelse(is.na(Follower.x), Follower.y, Follower.x)
    Follower.x <- NULL
    Follower.y <- NULL

    Friends <- ifelse(is.na(Friends.x), Friends.y, Friends.x)
    Friends.x <- NULL
    Friends.y <- NULL

    Favourites <- ifelse(is.na(Favourites.x), Favourites.y, Favourites.x)
    Favourites.x <- NULL
    Favourites.y <- NULL

    Statuses <- ifelse(is.na(Statuses.x), Statuses.y, Statuses.x)
    Statuses.x <- NULL
    Statuses.y <- NULL

    ProfileAge <- ifelse(is.na(ProfileAge.x), ProfileAge.y, ProfileAge.x)
    ProfileAge.x <- NULL
    ProfileAge.y <- NULL

    Listed <- ifelse(is.na(Listed.x), Listed.y, Listed.x)
    Listed.x <- NULL
    Listed.y <- NULL

    Timestamp <- ifelse(is.na(Timestamp.y), Timestamp.x, Timestamp.y)
    Timestamp.x <- NULL
    Timestamp.y <- NULL

    OutDegree <- ifelse(is.na(OutDegree.x), OutDegree.y, OutDegree.x)
    OutDegree.x <- NULL
    OutDegree.y <- NULL

    InDegree <- ifelse(is.na(InDegree.x), InDegree.y, InDegree.x)
    InDegree.x <- NULL
    InDegree.y <- NULL

    WOutDegree <- ifelse(is.na(WOutDegree.x), WOutDegree.y, WOutDegree.x)
    WOutDegree.x <- NULL
    WOutDegree.y <- NULL

    WInDegree <- ifelse(is.na(WInDegree.x), WInDegree.y, WInDegree.x)
    WInDegree.x <- NULL
    WInDegree.y <- NULL
  }
)
print("REPORT: Sources and Targets merged")

# Keep only the first row for every vertex name.
nodes <- nodes[!duplicated(nodes$Vertexname), , drop = FALSE]
print("REPORT: Duplicated vertices removed")

nrow(nodes)

# Drop edges with a missing value in either of the first two columns, and
# vertices with a missing value in the first column.
edges <- edges[!is.na(edges[, 1]) & !is.na(edges[, 2]), ]
nodes <- nodes[!is.na(nodes[, 1]), ]
print("REPORT: Invalid edges and nodes removed")

# Build the directed graph from the edge list, using `nodes` as vertex table.
g <- graph.data.frame(edges, directed = TRUE, vertices = nodes)
print("REPORT: Graph created")

# Out- and in-degree of every vertex, stored as vertex attributes.
outdegrees <- degree(g, v = V(g), mode = "out")
g <- set.vertex.attribute(g, name = "OutDegree", index = V(g), value = outdegrees)

indegrees <- degree(g, v = V(g), mode = "in")
g <- set.vertex.attribute(g, name = "InDegree", index = V(g), value = indegrees)

# Out- and in-strength of every vertex, stored under the "W..." attributes.
woutdegrees <- graph.strength(g, vids = V(g), mode = "out")
g <- set.vertex.attribute(g, name = "WOutDegree", index = V(g), value = woutdegrees)

windegrees <- graph.strength(g, vids = V(g), mode = "in")
g <- set.vertex.attribute(g, name = "WInDegree", index = V(g), value = windegrees)

print("REPORT: Degree calculated and added as vertex attribute")

# Filter

# Vertex table of the graph, including the degree attributes stored on it.
nodes <- get.data.frame(g, "vertices")

# Rank of the vertex whose degree serves as the cut-off. The rank used to be
# hard-coded as row 1335; on a graph with fewer vertices that row does not
# exist and both thresholds became NA. Clamping to the number of vertices
# keeps the behaviour for large graphs and falls back to the last-ranked
# vertex for small ones.
cutoffRank <- min(1335L, nrow(nodes))

# Out-degree of the vertex at the cut-off rank (descending order).
nodes <- nodes[order(nodes$OutDegree, decreasing = TRUE), ]
nrow(nodes)
minOutDegree <- nodes[cutoffRank, "OutDegree"]
minOutDegree

# In-degree of the vertex at the cut-off rank (descending order).
nodes <- nodes[order(nodes$InDegree, decreasing = TRUE), ]
minInDegree <- nodes[cutoffRank, "InDegree"]
minInDegree

# Vertices reaching at least one threshold, and vertices reaching both; the
# row counts are printed for inspection.
nodes2 <- subset(nodes, nodes$OutDegree >= minOutDegree | nodes$InDegree >= minInDegree)
nrow(nodes2)
nodes3 <- subset(nodes, nodes$OutDegree >= minOutDegree & nodes$InDegree >= minInDegree)
nrow(nodes3)

# Add an empty "Group" attribute to every vertex.
# NOTE(review): presumably the column that is filled in by hand after export.
g <- set.vertex.attribute(g, "Group", V(g), NA)

# The three filters below read the OutDegree/InDegree vertex attributes as
# stored on the graph; this block does not recompute them after each cut.
# The edge and vertex counts are printed after every step.

# 1. Keep vertices that reach at least one of the degree thresholds.
g <- induced.subgraph(g, V(g)$OutDegree >= minOutDegree | V(g)$InDegree >= minInDegree)
length(E(g))
length(V(g))

# 2. Keep vertices with both a positive out-degree and a positive in-degree.
g <- induced.subgraph(g, V(g)$OutDegree > 0 & V(g)$InDegree > 0)
length(E(g))
length(V(g))

# 3. Keep vertices whose out-degree exceeds a third of their in-degree.
g <- induced.subgraph(g, V(g)$OutDegree > (V(g)$InDegree / 3))
length(E(g))
length(V(g))

# Export as GraphML. Without an explicit format, write.graph() writes a plain
# edge list, which is not XML and drops the vertex attributes added above.
write.graph(g, "SomePath", format = "graphml")
print("REPORT: Subgraph Test saved")
因此,我使用 XMLValidatorBuddy 验证了该 graphml 文件。我在下拉字段中选择了 UTF-8 作为编码,但无论选择哪种编码,都会发生错误。这就是我得到的错误:

2字节序列的位置2处的字节“?”无效

根据XMLValidator,错误发生在第4278行


问题的答案对我没有帮助,因为我应该有一个UTF-8编码的graphml文件,因为在R中进行了转换。

这一行绝对不正确:

edges[,i] <- gsub("[[:ctrl:]]", "", edges[,i])
我知道它的目的是从边属性中去掉任何不允许的控制字符,这样就不会在 GraphML 编写器上遇到任何麻烦,但是 [[:ctrl:]] 应该是 [[:cntrl:]]。实际上,我的 R 版本在看到 [[:ctrl:]] 时会报错,但也许你的版本不会


此外,在将字符串转换为UTF-8后,我将避免弄乱字符串的各个字符。如果要从字符串中删除控制字符,请在转换为UTF-8之前执行此操作。由于UTF-8编码的工作原理,字符代码小于128的Unicode字符(其中包含您所担心的所有控制字符)将保持不变,UTF-8编码不会引入任何字符代码小于128的额外ASCII字符。

您能举一个再现此问题的最小示例吗?你的这个代码转储依赖于一个我们没有的文件,然后做了一百件事情,其中任何一件都可能是问题所在。唯一的链接是一个6Mb的XML文件,我不想下载它。你能制作一个很小的图形并生成一个类似的破损图形文件吗?当然,这只需要几行,最多12行。我试图缩小图表的范围,以便只剩下与可能导致问题的节点相关的边,因为第4278行包含该节点的数据。但我没有错。我不知道如何在不知道源的情况下复制错误…你删除了链接文件了吗?哦,对不起,我更改文件时忘记了删除超链接。第一行中graphml文件后面的链接现在指向与第二代码行中的read.graph相同的文件。问题同时解决了谢谢你的留言!很明显,一个R不幸没有看到的打字错误。我还切换了代码行,以便在iconv之前剥离。这很有效!上面的代码现在创建了一个graphml文件,可以立即再次导入,不会出现任何错误。我将代码重新定位到for循环中,以便它首先将所有列转换为UTF-8,然后去除字符向量的控制字符。非常感谢你的帮助!
edges[,i] <- gsub("[[:ctrl:]]", "", edges[,i])