R 解析字符串值中的层次结构
我试图从单个字符向量创建一个边列表。我要处理的列表长度超过93k个元素,但作为示例,我将提供一个小摘录 chracter字符串是ICD10代码层次结构的一部分,并且该字符串中存在父子关系。这意味着单个字符串A0101的父级为A010 它看起来是这样的: A00 A000 A001 A009 A01 A010 A0100 A0101 A02 A03 等 我的向量不包含除字符串以外的任何其他数据,但我基本上需要转换R 解析字符串值中的层次结构,r,R,我试图从单个字符向量创建一个边列表。我要处理的列表长度超过93k个元素,但作为示例,我将提供一个小摘录 chracter字符串是ICD10代码层次结构的一部分,并且该字符串中存在父子关系。这意味着单个字符串A0101的父级为A010 它看起来是这样的: A00 A000 A001 A009 A01 A010 A0100 A0101 A02 A03 等 我的向量不包含除字符串以外的任何其他数据,但我基本上需要转换 dat <- c("A00", "A000", "A001", "A009",
dat <- c("A00", "A000", "A001", "A009", "A01", "A010", "A0100", "A0101", "A02")
假设ICD10中节点名称的长度完全定义了顺序,较短的节点是父节点,这里有一种方法可以将每个节点与其直接父节点(如果可用)连接起来 虽然我认为这里的逻辑清晰可辨,但我很想知道一个更精简的解决方案会是什么样子
# Some longer fake data to prove that it works acceptably
# with 93k rows (took a few seconds). These are just
# numbers of different lengths, converted to characters, but they
# should suffice if the assumption about length = order is correct.
set.seed(42)
fake <- runif(93000, 0, 500) %>%
magrittr::raise_to_power(3) %>%
as.integer() %>%
as.character()
# Step 1 - prep
library(dplyr); library(tidyr)
fake_2 <- fake %>%
as_data_frame() %>%
mutate(row = row_number()) %>%
# Step 2 - widen by level and fill in all parent nodes
mutate(level = str_length(value)) %>%
spread(level, value) %>%
fill(everything()) %>%
# Step 3 - Get two highest non-NA nodes
gather(level, code, -row) %>%
arrange(row, level) %>%
filter(!is.na(code)) %>%
group_by(row) %>%
top_n(2, wt = level) %>%
# Step 4 - Spread once more to get pairs
mutate(pos = row_number()) %>%
ungroup() %>%
select(-level) %>%
spread(pos, code)
输出操作数据
# A tibble: 9 x 3
row `1` `2`
<int> <chr> <chr>
1 1 A00 NA
2 2 A00 A000
3 3 A00 A001
4 4 A00 A009
5 5 A01 A009
6 6 A01 A010
7 7 A010 A0100
8 8 A010 A0101
9 9 A010 A0101
93k伪数据输出
> head(fake, 10)
[1] "55174190" "50801321" "46771275" "6480673"
[5] "20447474" "879955" "4365410" "11434009"
[9] "5002257" "9200296"
> head(fake_2, 10)
# A tibble: 10 x 3
row `1` `2`
<int> <chr> <chr>
1 1 55174190 NA
2 2 50801321 NA
3 3 46771275 NA
4 4 6480673 46771275
5 5 6480673 20447474
6 6 6480673 20447474
7 7 4365410 20447474
8 8 4365410 11434009
9 9 5002257 11434009
10 10 9200296 11434009
我相当肯定有更有效的方法来实现这一点,但这段代码应该从icd.data包下载icd10cm数据。使用icd软件包中的儿童检测系统,然后广泛使用tidyverse返回边缘列表。我必须有点创造性地连接层次结构的顶部,因为它们不包括ICD10数据的章节和子章节作为单独的2位或1位代码 基本上,子章节变成2位代码,章节变成1位代码,然后有一个根节点连接顶部的所有内容
library(icd.data)
icd10 <- icd10cm2016
library(icd)
code_children <- lapply(icd10$code, children)
code_vec <- sapply(code_children, paste, collapse = ",")
code_df <- as.data.frame(code_vec, stringsAsFactors = F)
library(dplyr);library(stringr);library(tidyr)
code_df_new <- code_df %>%
mutate(parent = sapply(strsplit(code_vec,","), "[", 1)) %>%
separate(code_vec,
paste("code", 1:max(str_count(code_df$code_vec, ",")), sep ="."),
",",extra = "merge")
library(reshape2)
edgelist <- melt(code_df_new, id = "parent") %>%
filter(!is.na(value)) %>%
select(parent, child = value) %>%
arrange(parent)
edgelist <- subset(edgelist, edgelist$parent != edgelist$child)
edgelist <- subset(edgelist, nchar(edgelist$child) == nchar(edgelist$parent) + 1)
subchaps <- icd10 %>% select(three_digit, sub_chapter, chapter) %>%
mutate(two_digit = substr(three_digit, 1, 2)) %>%
select(parent = two_digit, child = three_digit) %>%
distinct()
chaps <- icd10 %>% select(three_digit, sub_chapter, chapter) %>%
mutate(
two_digit = substr(three_digit, 1, 2),
one_digit = substr(three_digit, 1, 1)) %>%
select(parent = one_digit, child = two_digit) %>%
distinct()
root <- icd10 %>% select(three_digit) %>%
mutate(parent = "root", child = substr(three_digit, 1, 1)) %>%
select(parent, child) %>%
distinct()
edgelist_final <- edgelist %>%
bind_rows(list(chaps, subchaps, root)) %>%
arrange(parent)
如果有人有任何技巧或方法来提高这段代码的效率,我洗耳恭听。眼睛?Hi Trip,将列表转换为边缘列表的逻辑是什么?起初我认为A00-A000是第一条边,A001-A009是第二条边,A01-A010是第三条边,但示例输出看起来不同。谢谢:我的想法可能有误,但我正在尝试创建一种分层数据格式,然后可以使用它来测量树上各个节点之间的距离。这些值代表ICD10诊断值,我试图测量诊断代码集的相似性。哦,这真的很有趣。我想下面的人明白了!不幸的是,对字符长度的假设是不够的。较短的代码是附加了chracter的代码的父节点,但根字符串必须匹配。。。换句话说,A0101是A010的孩子,而不是A011,否则代码运行得很好,但这可能是我没有正确解释ICD10的层次结构的错误。我对这项工作的目标很感兴趣。也许我们可以连接?Hi@JackWasey…目标是利用此方法计算icd本体中的距离。这里有一些关于根据患者的诊断包计算患者相似性的已发表论文。这很有趣,但我在这方面寻找了没有使用共病的出版物。请给我指一些好吗?
library(icd.data)
icd10 <- icd10cm2016
library(icd)
code_children <- lapply(icd10$code, children)
code_vec <- sapply(code_children, paste, collapse = ",")
code_df <- as.data.frame(code_vec, stringsAsFactors = F)
library(dplyr);library(stringr);library(tidyr)
code_df_new <- code_df %>%
mutate(parent = sapply(strsplit(code_vec,","), "[", 1)) %>%
separate(code_vec,
paste("code", 1:max(str_count(code_df$code_vec, ",")), sep ="."),
",",extra = "merge")
library(reshape2)
edgelist <- melt(code_df_new, id = "parent") %>%
filter(!is.na(value)) %>%
select(parent, child = value) %>%
arrange(parent)
edgelist <- subset(edgelist, edgelist$parent != edgelist$child)
edgelist <- subset(edgelist, nchar(edgelist$child) == nchar(edgelist$parent) + 1)
subchaps <- icd10 %>% select(three_digit, sub_chapter, chapter) %>%
mutate(two_digit = substr(three_digit, 1, 2)) %>%
select(parent = two_digit, child = three_digit) %>%
distinct()
chaps <- icd10 %>% select(three_digit, sub_chapter, chapter) %>%
mutate(
two_digit = substr(three_digit, 1, 2),
one_digit = substr(three_digit, 1, 1)) %>%
select(parent = one_digit, child = two_digit) %>%
distinct()
root <- icd10 %>% select(three_digit) %>%
mutate(parent = "root", child = substr(three_digit, 1, 1)) %>%
select(parent, child) %>%
distinct()
edgelist_final <- edgelist %>%
bind_rows(list(chaps, subchaps, root)) %>%
arrange(parent)