Python:基于自定义词汇表的独热(one-hot)编码
我有如下Python 定制词汇表的一种热编码,python,nlp,stanford-nlp,one-hot-encoding,Python,Nlp,Stanford Nlp,One Hot Encoding,我有如下charset charset =set([ '$', '^', '#', '(', ')', '-', '.', '/', '1', '2', '3', '4', '5', '6', '7', '=', 'Br', 'C', 'Cl', 'F', 'I', 'N', 'O', 'P', 'S', '[2H]', '[Br-]', '[C@@H]', '[C@@]', '[C@H]', '[C@]', '[Cl-]', '[H]', '[I
charset
# Vocabulary of tokens: single characters plus multi-character tokens such as
# 'Br', 'Cl' and the bracketed forms.
charset = {
    '$', '^', '#', '(', ')', '-', '.', '/', '1', '2', '3', '4', '5', '6', '7', '=', 'Br',
    'C', 'Cl', 'F', 'I', 'N', 'O', 'P', 'S', '[2H]', '[Br-]', '[C@@H]', '[C@@]', '[C@H]', '[C@]',
    '[Cl-]', '[H]', '[I-]', '[N+]', '[N-]', '[N@+]', '[N@@+]', '[NH+]', '[NH2+]', '[NH3+]', '[N]',
    '[Na+]', '[O-]', '[P+]', '[S+]', '[S-]', '[S@+]', '[S@@+]', '[SH]', '[Si]', '[n+]', '[n-]',
    '[nH+]', '[nH]', '[o+]', '[se]', '\\', 'c', 'n', 'o', 's', '!', 'E',
}
# Number the tokens in sorted order. Enumerating the set directly would give an
# order that can change from one interpreter run to the next (string hashes are
# randomized), so previously encoded data could not be decoded reliably.
char_to_int = {c: i for i, c in enumerate(sorted(charset))}
# Inverse mapping, derived from char_to_int so the two can never disagree.
int_to_char = {i: c for c, i in char_to_int.items()}
基于这个 charset,我创建了 char_to_int,如下所示
# Vocabulary of tokens: single characters plus multi-character tokens such as
# 'Br', 'Cl' and the bracketed forms.
charset = {
    '$', '^', '#', '(', ')', '-', '.', '/', '1', '2', '3', '4', '5', '6', '7', '=', 'Br',
    'C', 'Cl', 'F', 'I', 'N', 'O', 'P', 'S', '[2H]', '[Br-]', '[C@@H]', '[C@@]', '[C@H]', '[C@]',
    '[Cl-]', '[H]', '[I-]', '[N+]', '[N-]', '[N@+]', '[N@@+]', '[NH+]', '[NH2+]', '[NH3+]', '[N]',
    '[Na+]', '[O-]', '[P+]', '[S+]', '[S-]', '[S@+]', '[S@@+]', '[SH]', '[Si]', '[n+]', '[n-]',
    '[nH+]', '[nH]', '[o+]', '[se]', '\\', 'c', 'n', 'o', 's', '!', 'E',
}
# Number the tokens in sorted order. Enumerating the set directly would give an
# order that can change from one interpreter run to the next (string hashes are
# randomized), so previously encoded data could not be decoded reliably.
char_to_int = {c: i for i, c in enumerate(sorted(charset))}
# Inverse mapping, derived from char_to_int so the two can never disagree.
int_to_char = {i: c for c, i in char_to_int.items()}
{'[nH]':0,[2H]':1,'2':2,'N':3,'Cl':4,'c':5,'$':6,
"O":7,"(8,6:9,s):10,[S@+]“:11,[C@@H]”:12,'C':
13,“[nH+]”:14,“/”:15,[nH+]”:16,[Br-]:17,[Si]':18,
'4': 19, '[N@+]“:20,”[se]':21,'P':22,[SH]':23,”[N+]:
24,[N]:25,“^”:26,'5':27,'7':28,'N':29,!':30,
“\”:31,[n-]:32,'S':33,[NH3+]:34,“\”:35,'I':36,
[O-]:37,'1':38,[NH2+]:39,[S@@@+]:40,'Br':41,'F':
42,[Na+]:43,'E':44,[S-]:45,':46,':47,[C@]:
48, '=': 49, '3': 50, '-': 51, '[C@H]“:52,”[Cl-]:53,”[I-]:
54,[H]:55,[P+]:56,[S+]:57,'o':58,[N@@@+]:59,
“[N-]”:60,“[N+]”:61,[o+]”:62,[C@@@63}”
和int_to_char
如下所示
# Vocabulary of tokens: single characters plus multi-character tokens such as
# 'Br', 'Cl' and the bracketed forms.
charset = {
    '$', '^', '#', '(', ')', '-', '.', '/', '1', '2', '3', '4', '5', '6', '7', '=', 'Br',
    'C', 'Cl', 'F', 'I', 'N', 'O', 'P', 'S', '[2H]', '[Br-]', '[C@@H]', '[C@@]', '[C@H]', '[C@]',
    '[Cl-]', '[H]', '[I-]', '[N+]', '[N-]', '[N@+]', '[N@@+]', '[NH+]', '[NH2+]', '[NH3+]', '[N]',
    '[Na+]', '[O-]', '[P+]', '[S+]', '[S-]', '[S@+]', '[S@@+]', '[SH]', '[Si]', '[n+]', '[n-]',
    '[nH+]', '[nH]', '[o+]', '[se]', '\\', 'c', 'n', 'o', 's', '!', 'E',
}
# Number the tokens in sorted order. Enumerating the set directly would give an
# order that can change from one interpreter run to the next (string hashes are
# randomized), so previously encoded data could not be decoded reliably.
char_to_int = {c: i for i, c in enumerate(sorted(charset))}
# Inverse mapping, derived from char_to_int so the two can never disagree.
int_to_char = {i: c for c, i in char_to_int.items()}
{0:'[nH]',1:'[2H]',2:'2',3:'N',4:'Cl',5:'c',6:'$',等等,
7:'O',8:'('9:'6',10:'s',11:'[S@+]”,12:“[C@@H]”,13:
"C",14:"[nH+]",15:"/",16:"[nH+]",17:"[Br-]",18:"[Si]",,
19: '4', 20: '[N@+]",21:"东南",22:"东南",23:"东南",24:
“[N+]”,25:“[N]”,26:“^”,27:'5',28:'7',29:'N',30:'!”,
31:“\”,32:“[n-]”,33:“S”,34:“[NH3+]”,35:“#”,36:“I”,
37:“[O-]”,38:'1',39:“[NH2+]”,40:“[S@@@+]”,41:'Br',42:
"F,43:"[Na+]",44:"E",45:"[S-]",46:",47",48:
“[C@]”,49:“=”,50:'3',51:'-”,52:'[C@H]”,53:“[Cl-]”,54:
“[I-]”,55:“[H]”,56:“[P+]”,57:“[S+]”,58:“o”,59:“[N@@@+]”,
60:“[N-]”,61:“[N+]”,62:“[o+]”,63:“[C@@@]”
我有一个字符串,我想根据 char_to_int 和 int_to_char 将其转换为独热(one-hot)编码
# Example input string that is to be split into vocabulary tokens and encoded.
string = 'N[C@H]1C[C@@H](N2Cc3nn4cccnc4c3C2)CC[C@@H]1c1cc(F)c(F)cc1F'
是否有高效的方法,可以使用自定义的 char_to_int 和 int_to_char 将该字符串转换为独热(one-hot)向量?
from itertools import chain, repeat, islice
import re
# Example input to encode.
string = 'N[C@H]1C[C@@H](N2Cc3nn4cccnc4c3C2)CC[C@@H]1c1cc(F)c(F)cc1F'
# Vocabulary of tokens: single characters plus multi-character tokens such as
# 'Br', 'Cl' and the bracketed forms.
items_list = [
    '$', '^', '#', '(', ')', '-', '.', '/', '1', '2', '3', '4', '5', '6', '7', '=', 'Br',
    'C', 'Cl', 'F', 'I', 'N', 'O', 'P', 'S', '[2H]', '[Br-]', '[C@@H]', '[C@@]', '[C@H]', '[C@]',
    '[Cl-]', '[H]', '[I-]', '[N+]', '[N-]', '[N@+]', '[N@@+]', '[NH+]', '[NH2+]', '[NH3+]', '[N]',
    '[Na+]', '[O-]', '[P+]', '[S+]', '[S-]', '[S@+]', '[S@@+]', '[SH]', '[Si]', '[n+]', '[n-]',
    '[nH+]', '[nH]', '[o+]', '[se]', '\\', 'c', 'n', 'o', 's', '!', 'E',
]
charset = set(items_list)
# Number the tokens in sorted order. Enumerating the set directly would give an
# order that can change between interpreter runs (string hashes are randomized),
# making the encoding unreproducible.
char_to_int = {c: i for i, c in enumerate(sorted(charset))}
# Regex alternation takes the first alternative that matches, so longer tokens
# must come before their prefixes; otherwise 'Cl' is consumed as 'C' and the
# 'l' is lost. sorted() is stable, so equal-length tokens keep list order.
pattern = '|'.join(
    re.escape(item) for item in sorted(items_list, key=len, reverse=True)
)


def tokenize(text):
    """Split *text* into vocabulary tokens.

    Returns the tokens in the order they occur in *text*.

    Raises:
        ValueError: if some part of *text* is not covered by the vocabulary.
            ``re.findall`` would otherwise skip that part silently and yield
            an encoding of a different string.
    """
    found = re.findall(pattern, text)
    if ''.join(found) != text:
        raise ValueError(
            'input contains characters outside the vocabulary: %r' % (text,)
        )
    return found


tokens = tokenize(string)
# Integer index of every token (one entry per token).
x = [char_to_int[k] for k in tokens]
# Actual one-hot encoding: one row per token, one column per vocabulary entry,
# with a single 1 in the column given by the token's index.
one_hot = [[int(col == idx) for col in range(len(char_to_int))] for idx in x]
这里,x 是各标记在 char_to_int 中对应的整数索引列表(还不是独热向量;把每个索引展开成长度为 len(char_to_int) 的 0/1 向量即可得到独热编码):
x=[3, 52, 38, 13, 12, 8, 3, 2, 13, 5, 50, 29, 29, 19, 5, 5, 5, 29, 5, 19, 5, 50, 13, 2, 47, 13, 13, 12, 38, 5, 38, 5, 5, 8, 42, 47, 5, 8, 42, 47, 5, 5, 38, 42]