Python:折叠树中到叶尖(末端节点)平均距离 < x 的子树
标签:python, tree, formatting, hierarchy, transformation
我有一个Newick格式的层次结构树,例如:
(A:0.556705,(B:0.251059,C:0.251059):0.305646):0.556705;
括号表示树拓扑(分支结构/分组),数字表示分支长度(距离),例如:
我需要折叠到尖端(终端节点/尖端/叶子)的平均距离小于给定值x的分支(子树)。这样,输入和折叠输出都是Newick格式。例如,如果x是0.29,那么上面的B和C折叠成BC,我们得到如下结果:
(A:0.556705,BC:0.556705):0.556705;
对于任何Newick树,是否有一种简单的编程方法(例如用Python)来实现这种折叠?
这个快速而粗糙的代码片段似乎适用于最小的树,但我需要更多的示例数据来检查更复杂的树:
#! /usr/bin/python3
class Tree:
    """Inner node of a parsed Newick tree.

    ``children`` is a list of ``(child, dist)`` pairs.  ``child`` is either a
    leaf name (``str``) or a nested ``Tree``; ``dist`` is the length of the
    branch that leads to that child.
    """

    def __init__(self, tokens):
        """Build the node from a flat ``[dist, child, dist, child, ...]`` list.

        Every distance is converted with ``float``; children are stored as
        given.  An odd number of tokens raises ``IndexError``.
        """
        self.children = []
        while tokens:
            self.children.append((tokens[1], float(tokens[0])))
            tokens = tokens[2:]

    def __repr__(self):
        return '<{}>'.format(self.children)

    def pprint(self, indent=0):
        """Print the tree, one node per line, indented by nesting depth."""
        prefix = ' ' * indent
        for child, dist in self.children:
            if isinstance(child, Tree):
                print(prefix, dist)
                child.pprint(indent + 1)
            else:
                print(prefix, child, dist)

    def collapse(self, limit):
        """Collapse sub-trees whose mean child branch length is <= ``limit``.

        Children are collapsed first and ``self.children`` is updated in
        place.  Returns a ``(node, 0)`` pair where ``node`` is the
        concatenated leaf names (``str``) when this node collapsed, or
        ``self`` when it did not.
        """
        self.children = [
            (child.collapse(limit)[0], dist) if isinstance(child, Tree)
            else (child, dist)
            for child, dist in self.children
        ]
        if not self.children:
            # Nothing to average and nothing to merge.
            return (self, 0)
        avg = sum(dist for _, dist in self.children) / len(self.children)
        if avg > limit:
            return (self, 0)
        # Entries are (child, dist) pairs: unpack them, otherwise the
        # isinstance check looks at the tuple and never sees a Tree.
        if any(isinstance(child, Tree) for child, _ in self.children):
            print('Would like to collapse, but cannot, as at least one child is a tree.')
            return (self, 0)
        return (''.join(child for child, _ in self.children), 0)
def parse(tree):
    """Parse a Newick string into nested ``Tree`` objects.

    Characters are read up to the first ``;`` (or to the end of the string
    when the terminator is missing).  ``(`` opens a group, ``)`` closes it
    and turns everything collected since the matching ``(`` into a ``Tree``;
    ``:`` and ``,`` end the current name or number.  Whatever is left on the
    stack at the end is wrapped in one more ``Tree``, which is returned.

    Tokens are handed to ``Tree`` most-recent-first, so children appear in
    reverse input order.  An unmatched ``)`` raises ``IndexError``.
    """
    stack = []  # top of the stack is the END of the list
    buff = ''
    for c in tree:
        if c == ';':
            break
        if c == '(':
            stack.append('(')
        elif c == ')':
            if buff:
                stack.append(buff)
            buff = ''
            tokens = []
            while True:
                token = stack.pop()
                if token == '(':
                    break
                tokens.append(token)
            stack.append(Tree(tokens))
        elif c in ':,':
            if buff:
                stack.append(buff)
            buff = ''
        else:
            buff += c
    if buff:
        stack.append(buff)
    # Tree expects the most recently pushed token first.
    return Tree(stack[::-1])
# Demo: parse the example tree, show it, collapse at 0.3, show it again.
t = parse('(A:0.556705,(B:0.251059,C:0.251059):0.305646):0.556705;')
t.pprint()
# collapse() rewrites t.children in place; its return value is not needed here.
t.collapse(0.3)
print()
t.pprint()
按0.3折叠的树是:
0.556705
CB 0.305646
A 0.556705
按1.0折叠的树是:
CBA 0.556705
按0.1折叠的树是:
0.556705
0.305646
C 0.251059
B 0.251059
A 0.556705
这里是代码的另一个更清晰的版本(产生Newick格式的输出)。注意:输入树必须用括号括起来。
#! /usr/bin/python3
class Tree:
    """Inner node of a Newick tree: a branch length plus child nodes."""

    def __init__(self, weight, children):
        self.weight = weight
        # Copy the list so later changes by the caller do not leak in.
        self.children = children[:]

    def distances(self, curDistance=.0, acc=None):
        """Append the distance of every tip below this node to ``acc``.

        ``curDistance`` is the distance accumulated above this node; this
        node's own ``weight`` is added on the way down.  Returns ``acc``
        (a fresh list when none was passed in).
        """
        if acc is None:
            acc = []
        for child in self.children:
            child.distances(self.weight + curDistance, acc)
        return acc

    def collapse(self, limit):
        """Collapse sub-trees whose mean distance to their tips is <= ``limit``.

        Returns ``self`` when this node stays, or a ``Node`` that carries
        this node's weight and the concatenated tip names when it collapses.
        ``self.children`` is replaced in place by the collapsed children, so
        calling this repeatedly on the same tree is cumulative: a child that
        was collapsed earlier only reports its own branch length.
        """
        # Measure BEFORE collapsing the children: a collapsed child only
        # knows its own branch length, which would understate the distance
        # from this node to the real tips.
        tip_distances = self.distances(-self.weight)
        self.children = [child.collapse(limit) for child in self.children]
        if not tip_distances:
            # No tips below this node: nothing to average or merge.
            return self
        avg = sum(tip_distances) / len(tip_distances)
        if avg > limit:
            return self
        return Node(self.weight, ''.join(self.descendants()))

    def descendants(self):
        """Return the names of all tips below this node."""
        names = []
        for child in self.children:
            names.extend(child.descendants())
        return names

    def __repr__(self):
        inner = ','.join(str(child) for child in self.children)
        return '({}):{}'.format(inner, self.weight)


class Node:
    """Tip (leaf) of a Newick tree: a name and the length of its branch."""

    def __init__(self, weight, name):
        self.weight = weight
        self.name = name

    def distances(self, curDistance, acc):
        """Append this tip's distance (``curDistance`` + own weight) to ``acc``."""
        acc.append(curDistance + self.weight)

    def collapse(self, limit):
        """A tip cannot be collapsed any further; return it unchanged."""
        return self

    def descendants(self):
        return [self.name]

    def __repr__(self):
        return '{}:{}'.format(self.name, self.weight)
class Stack(list):
    """List used as a stack whose top is the FRONT (index 0) of the list."""

    def pop(self):
        """Remove and return the front element; IndexError when empty."""
        top = self[0]
        self[:1] = []
        return top

    def push(self, e):
        """Put ``e`` at the front of the list."""
        self[:0] = [e]
def parse(tree):
    """Parse a parenthesised Newick string into ``Tree``/``Node`` objects.

    Characters are read up to the first ``;`` (or to the end of the string
    when the terminator is missing).  On ``)`` the stack is unwound down to
    the matching ``(``: items are popped as (length, child) pairs, a name
    becomes a ``Node`` and an already built ``Tree`` gets its weight set.
    The group is pushed back as ``Tree(0, children)``.

    Every item inside parentheses must carry a ``:length``.  Because the
    stack is unwound top-first, children end up in reverse input order.
    Text after the last ``)`` is ignored.  Returns the item on top of the
    stack; an empty stack raises ``IndexError``.
    """
    buff = ''
    stack = Stack()
    for c in tree:
        if c == ';':
            break
        if c == '(':
            stack.push(c)
        elif c in ':,':
            if buff:
                stack.push(buff)
            buff = ''
        elif c == ')':
            if buff:
                stack.push(buff)
            buff = ''
            children = []
            while True:
                weight = stack.pop()
                if weight == '(':
                    break
                weight = float(weight)
                child = stack.pop()
                if isinstance(child, Tree):
                    # Sub-tree was built with weight 0; its length follows it.
                    child.weight = weight
                else:
                    child = Node(weight, child)
                children.append(child)
            stack.push(Tree(0, children))
        else:
            buff += c
    return stack.pop()
# Demo: collapse the same tree with limits 0.1 .. 0.5.
# collapse() modifies t in place, so each round starts from the previous result.
t = parse('((A:0.9,(B:0.2,C:0.3):0.3,(E:0.05,F:0.08):0.1):0.6);')
print('Input tree is {}'.format(t))
for limit in (tenth / 10 for tenth in range(1, 6)):
    print('Collapsed by {} is {}'.format(limit, t.collapse(limit)))
输出为(带有修改的输入):
这里是一个可能更健壮的解析器,它能解析所有示例:
0.251059 是 B 和 C 的距离(与折叠相关),还是 0.305646?(我从未使用过 clades。)
哎呀!谢谢你让我意识到这篇文章中有一个错误:0.251059 是 B 和 C 到它们共同的父节点(它们分开的地方)的分支长度,0.305646 是从它们的父节点到树根的距离。这有意义吗?(查看图表会有帮助。)
现在哪一个必须小于限值才能折叠?是从直接祖先到末端的距离,还是从根到末端的距离?
这取决于具体情况:例如,如果 x 为 0.29,则 B 和 C 将折叠为 BC,而没有其他分支会折叠(整棵树本身算作一个分支);如果 x 为 0.6,则整棵树将折叠为 ABC。
OK,那么请看看我的答案,看看这是否是你需要的。
不确定是第二个版本不能正常工作,还是我弄错了……我得到:“ValueError: zero length field name in format”。
@hello_there_andy 请确保您的树被括号括起来。此外,我仅解析此页面上标记为“popular”的格式的树:。因此,如果您使用其余 7 种格式中的另一种,请随意修补解析器或转换输入。
#! /usr/bin/python3
class Tree:
    """Inner node of a Newick tree: a branch length plus child nodes."""

    def __init__(self, weight, children):
        self.weight = weight
        # Copy the list so later changes by the caller do not leak in.
        self.children = children[:]

    def distances(self, curDistance=.0, acc=None):
        """Append the distance of every tip below this node to ``acc``.

        ``curDistance`` is the distance accumulated above this node; this
        node's own ``weight`` is added on the way down.  Returns ``acc``
        (a fresh list when none was passed in).
        """
        if acc is None:
            acc = []
        for child in self.children:
            child.distances(self.weight + curDistance, acc)
        return acc

    def collapse(self, limit):
        """Collapse sub-trees whose mean distance to their tips is <= ``limit``.

        Returns ``self`` when this node stays, or a ``Node`` that carries
        this node's weight and the concatenated tip names when it collapses.
        ``self.children`` is replaced in place by the collapsed children, so
        calling this repeatedly on the same tree is cumulative: a child that
        was collapsed earlier only reports its own branch length.
        """
        # Measure BEFORE collapsing the children: a collapsed child only
        # knows its own branch length, which would understate the distance
        # from this node to the real tips.
        tip_distances = self.distances(-self.weight)
        self.children = [child.collapse(limit) for child in self.children]
        if not tip_distances:
            # No tips below this node: nothing to average or merge.
            return self
        avg = sum(tip_distances) / len(tip_distances)
        if avg > limit:
            return self
        return Node(self.weight, ''.join(self.descendants()))

    def descendants(self):
        """Return the names of all tips below this node."""
        names = []
        for child in self.children:
            names.extend(child.descendants())
        return names

    def __repr__(self):
        inner = ','.join(str(child) for child in self.children)
        return '({}):{}'.format(inner, self.weight)


class Node:
    """Tip (leaf) of a Newick tree: a name and the length of its branch."""

    def __init__(self, weight, name):
        self.weight = weight
        self.name = name

    def distances(self, curDistance, acc):
        """Append this tip's distance (``curDistance`` + own weight) to ``acc``."""
        acc.append(curDistance + self.weight)

    def collapse(self, limit):
        """A tip cannot be collapsed any further; return it unchanged."""
        return self

    def descendants(self):
        return [self.name]

    def __repr__(self):
        return '{}:{}'.format(self.name, self.weight)
class Stack(list):
    """List used as a stack whose top is the FRONT (index 0) of the list."""

    def pop(self):
        """Remove and return the front element; IndexError when empty."""
        top = self[0]
        self[:1] = []
        return top

    def push(self, e):
        """Put ``e`` at the front of the list."""
        self[:0] = [e]
def parse(tree):
    """Parse a parenthesised Newick string into ``Tree``/``Node`` objects.

    Characters are read up to the first ``;`` (or to the end of the string
    when the terminator is missing).  On ``)`` the stack is unwound down to
    the matching ``(``: items are popped as (length, child) pairs, a name
    becomes a ``Node`` and an already built ``Tree`` gets its weight set.
    The group is pushed back as ``Tree(0, children)``.

    Every item inside parentheses must carry a ``:length``.  Because the
    stack is unwound top-first, children end up in reverse input order.
    Text after the last ``)`` is ignored.  Returns the item on top of the
    stack; an empty stack raises ``IndexError``.
    """
    buff = ''
    stack = Stack()
    for c in tree:
        if c == ';':
            break
        if c == '(':
            stack.push(c)
        elif c in ':,':
            if buff:
                stack.push(buff)
            buff = ''
        elif c == ')':
            if buff:
                stack.push(buff)
            buff = ''
            children = []
            while True:
                weight = stack.pop()
                if weight == '(':
                    break
                weight = float(weight)
                child = stack.pop()
                if isinstance(child, Tree):
                    # Sub-tree was built with weight 0; its length follows it.
                    child.weight = weight
                else:
                    child = Node(weight, child)
                children.append(child)
            stack.push(Tree(0, children))
        else:
            buff += c
    return stack.pop()
# Demo: collapse the same tree with limits 0.1 .. 0.5.
# collapse() modifies t in place, so each round starts from the previous result.
t = parse('((A:0.9,(B:0.2,C:0.3):0.3,(E:0.05,F:0.08):0.1):0.6);')
print('Input tree is {}'.format(t))
for limit in (tenth / 10 for tenth in range(1, 6)):
    print('Collapsed by {} is {}'.format(limit, t.collapse(limit)))
Input tree is (((F:0.08,E:0.05):0.1,(C:0.3,B:0.2):0.3,A:0.9):0.6):0
Collapsed by 0.1 is ((FE:0.1,(C:0.3,B:0.2):0.3,A:0.9):0.6):0
Collapsed by 0.2 is ((FE:0.1,(C:0.3,B:0.2):0.3,A:0.9):0.6):0
Collapsed by 0.3 is ((FE:0.1,CB:0.3,A:0.9):0.6):0
Collapsed by 0.4 is ((FE:0.1,CB:0.3,A:0.9):0.6):0
Collapsed by 0.5 is (FECBA:0.6):0
#! /usr/bin/python3
class Tree:
    """Inner Newick node: optional name, optional branch length, children."""

    def __init__(self):
        self.name = ''
        self.length = None  # None means "no length given"
        self.children = []

    def __repr__(self):
        """Render as Newick text: ``(child,child,...)name:length``."""
        inner = ','.join(map(str, self.children))
        suffix = '' if self.length is None else ':{}'.format(self.length)
        return '({}){}{}'.format(inner, self.name, suffix)
class Node:
    """Newick leaf: an optional name and an optional branch length."""

    def __init__(self, name=''):
        self.name = name
        self.length = None  # None means "no length given"

    def __repr__(self):
        """Render as Newick text: ``name`` or ``name:length``."""
        if self.length is None:
            suffix = ''
        else:
            suffix = ':{}'.format(self.length)
        return '{}{}'.format(self.name, suffix)
class Stack(list):
    """List used as a stack whose top is the END of the list."""

    def push(self, e):
        """Put ``e`` on top of the stack."""
        self[len(self):] = [e]

    def pop(self):
        """Remove and return the top element; IndexError when empty."""
        top = self[-1]
        self[-1:] = []
        return top

    def peek(self):
        """Return the top element without removing it; IndexError when empty."""
        top = self[-1]
        return top
class UnexpectedSymbolException(Exception):
    """Signals a symbol that is not allowed at its position in the input."""
def parseNameOrLength(stack, buff, pos=None):
    """Store ``buff`` on the node at the top of ``stack``.

    If a ``':'`` marker is on top, ``buff`` is a branch length: it is
    converted with ``float``, the marker is popped and the length is set on
    the node below it.  Otherwise a non-empty ``buff`` becomes the node's
    name.

    ``pos`` is optional and only used in the error message.

    Raises:
        ValueError: ``buff`` follows a ``':'`` but is not a number.
    """
    if stack.peek() == ':':
        try:
            length = float(buff)
        except ValueError:
            # The position is passed in by the caller; it is not visible
            # from this scope otherwise.
            where = '' if pos is None else ' at position {}'.format(pos)
            raise ValueError('Non-numeric length "{}"{}.'.format(buff, where))
        stack.pop()
        stack.peek().length = length
    elif buff:
        stack.peek().name = buff


def parse(tree):
    """Parse a Newick string (terminated by ``;``) and return its root ``Tree``.

    All whitespace in the input is ignored.  The stack starts with the root
    ``Tree``; ``(`` and ``,`` push a new ``Tree`` when the next symbol is
    ``(`` and a ``Node`` otherwise; ``)`` and ``,`` finish the node on top
    and append it to the node below; ``:`` pushes a marker so the following
    text is read as a branch length.

    Positions in error messages are offsets into the string as passed in.

    Raises:
        UnexpectedSymbolException: ``(`` directly after a name, or ``:``
            directly after another ``:``.
        ValueError: a branch length is not numeric.
        IndexError: the input ends before ``;`` or the brackets are
            unbalanced.
    """
    stack = Stack()
    stack.push(Tree())
    buff = ''
    # Drop whitespace once, but remember where each symbol came from.
    symbols = [(offset, ch) for offset, ch in enumerate(tree)
               if not ch.isspace()]
    for index, (pos, c) in enumerate(symbols):
        # One symbol of look-ahead decides between Tree and Node.
        la = symbols[index + 1][1] if index + 1 < len(symbols) else ''
        if c == '(':
            if buff:
                raise UnexpectedSymbolException(
                    'Unexpected symbol {} at position {}.'.format(c, pos))
            stack.push(Tree() if la == '(' else Node())
        elif c == ')':
            parseNameOrLength(stack, buff, pos)
            buff = ''
            child = stack.pop()
            stack.peek().children.append(child)
        elif c in ',;':
            parseNameOrLength(stack, buff, pos)
            buff = ''
            if c == ';':
                return stack.pop()
            child = stack.pop()
            stack.peek().children.append(child)
            stack.push(Tree() if la == '(' else Node())
        elif c == ':':
            if stack.peek() == ':':
                raise UnexpectedSymbolException(
                    'Unexpected symbol {} at position {}.'.format(c, pos))
            stack.peek().name = buff
            stack.push(':')
            buff = ''
        else:
            buff += c
    raise IndexError('Unexpected end of input: missing terminating ";".')
# Sample Newick strings: unnamed nodes, names only, lengths only, and mixes.
tests = [
    '(,,(,));',
    '(A,B,(C,D));',
    '(A,B,(C,D)E)F;',
    '(:0.1,:0.2,(:0.3,:0.4):0.5);',
    '(:0.1,:0.2,(:0.3,:0.4):0.5):0.0;',
    '(A:0.1,B:0.2,(C:0.3,D:0.4):0.5);',
    '(A:0.1,B:0.2,(C:0.3,D:0.4)E:0.5)F;',
    '((B:0.2,(C:0.3,D:0.4)E:0.5)F:0.1)A;',
]
# Parse each sample and print it back in Newick form.
for test in tests:
    print(parse(test))