Python:基于自定义词汇表的独热编码(one-hot encoding)

Python 定制词汇表的一种热编码,python,nlp,stanford-nlp,one-hot-encoding,Python,Nlp,Stanford Nlp,One Hot Encoding,我有如下charset charset =set([ '$', '^', '#', '(', ')', '-', '.', '/', '1', '2', '3', '4', '5', '6', '7', '=', 'Br', 'C', 'Cl', 'F', 'I', 'N', 'O', 'P', 'S', '[2H]', '[Br-]', '[C@@H]', '[C@@]', '[C@H]', '[C@]', '[Cl-]', '[H]', '[I

我有如下
charset

# Vocabulary of tokens. Several entries are multi-character tokens
# (e.g. 'Cl', 'Br', '[C@@H]'), so this is not a plain set of single characters.
charset = {
    '$', '^', '#', '(', ')', '-', '.', '/', '1', '2', '3', '4', '5', '6', '7', '=', 'Br',
    'C', 'Cl', 'F', 'I', 'N', 'O', 'P', 'S', '[2H]', '[Br-]', '[C@@H]', '[C@@]', '[C@H]', '[C@]',
    '[Cl-]', '[H]', '[I-]', '[N+]', '[N-]', '[N@+]', '[N@@+]', '[NH+]', '[NH2+]', '[NH3+]', '[N]',
    '[Na+]', '[O-]', '[P+]', '[S+]', '[S-]', '[S@+]', '[S@@+]', '[SH]', '[Si]', '[n+]', '[n-]',
    '[nH+]', '[nH]', '[o+]', '[se]', '\\', 'c', 'n', 'o', 's', '!', 'E',
}
# Number the tokens in sorted order. Enumerating the set directly would give a
# numbering that changes between interpreter runs (string hashing is
# randomized), which makes any stored encoding impossible to decode later.
char_to_int = {c: i for i, c in enumerate(sorted(charset))}
# Inverse lookup, derived from the forward map so the two can never disagree.
int_to_char = {i: c for c, i in char_to_int.items()}
基于这个 `charset`,我创建了 `char_to_int`,如下所示:

# Vocabulary of tokens. Several entries are multi-character tokens
# (e.g. 'Cl', 'Br', '[C@@H]'), so this is not a plain set of single characters.
charset = {
    '$', '^', '#', '(', ')', '-', '.', '/', '1', '2', '3', '4', '5', '6', '7', '=', 'Br',
    'C', 'Cl', 'F', 'I', 'N', 'O', 'P', 'S', '[2H]', '[Br-]', '[C@@H]', '[C@@]', '[C@H]', '[C@]',
    '[Cl-]', '[H]', '[I-]', '[N+]', '[N-]', '[N@+]', '[N@@+]', '[NH+]', '[NH2+]', '[NH3+]', '[N]',
    '[Na+]', '[O-]', '[P+]', '[S+]', '[S-]', '[S@+]', '[S@@+]', '[SH]', '[Si]', '[n+]', '[n-]',
    '[nH+]', '[nH]', '[o+]', '[se]', '\\', 'c', 'n', 'o', 's', '!', 'E',
}
# Number the tokens in sorted order. Enumerating the set directly would give a
# numbering that changes between interpreter runs (string hashing is
# randomized), which makes any stored encoding impossible to decode later.
char_to_int = {c: i for i, c in enumerate(sorted(charset))}
# Inverse lookup, derived from the forward map so the two can never disagree.
int_to_char = {i: c for c, i in char_to_int.items()}
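{'[nH]': 0, '[2H]': 1, '2': 2, 'N': 3, 'Cl': 4, 'c': 5, '$': 6, 'O': 7, '(': 8, '6': 9, 's': 10, '[S@+]': 11, '[C@@H]': 12, 'C': 13, '[nH+]': 14, '/': 15, '[NH+]': 16, '[Br-]': 17, '[Si]': 18, '4': 19, '[N@+]': 20, '[se]': 21, 'P': 22, '[SH]': 23, '[n+]': 24, '[N]': 25, '^': 26, '5': 27, '7': 28, 'n': 29, '!': 30, '\\': 31, '[n-]': 32, 'S': 33, '[NH3+]': 34, '#': 35, 'I': 36, '[O-]': 37, '1': 38, '[NH2+]': 39, '[S@@+]': 40, 'Br': 41, 'F': 42, '[Na+]': 43, 'E': 44, '[S-]': 45, '.': 46, ')': 47, '[C@]': 48, '=': 49, '3': 50, '-': 51, '[C@H]': 52, '[Cl-]': 53, '[I-]': 54, '[H]': 55, '[P+]': 56, '[S+]': 57, 'o': 58, '[N@@+]': 59, '[N-]': 60, '[N+]': 61, '[o+]': 62, '[C@@]': 63}

(注:原文此处的输出已损坏;索引 14/16('[nH+]' 与 '[NH+]')以及 24/61('[n+]' 与 '[N+]')的大小写在原文中无法分辨,上面的对应关系为推测。由于该映射来自对 set 的遍历,每次运行得到的编号本来也可能不同。)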

int_to_char
如下所示

# Vocabulary of tokens. Several entries are multi-character tokens
# (e.g. 'Cl', 'Br', '[C@@H]'), so this is not a plain set of single characters.
charset = {
    '$', '^', '#', '(', ')', '-', '.', '/', '1', '2', '3', '4', '5', '6', '7', '=', 'Br',
    'C', 'Cl', 'F', 'I', 'N', 'O', 'P', 'S', '[2H]', '[Br-]', '[C@@H]', '[C@@]', '[C@H]', '[C@]',
    '[Cl-]', '[H]', '[I-]', '[N+]', '[N-]', '[N@+]', '[N@@+]', '[NH+]', '[NH2+]', '[NH3+]', '[N]',
    '[Na+]', '[O-]', '[P+]', '[S+]', '[S-]', '[S@+]', '[S@@+]', '[SH]', '[Si]', '[n+]', '[n-]',
    '[nH+]', '[nH]', '[o+]', '[se]', '\\', 'c', 'n', 'o', 's', '!', 'E',
}
# Number the tokens in sorted order. Enumerating the set directly would give a
# numbering that changes between interpreter runs (string hashing is
# randomized), which makes any stored encoding impossible to decode later.
char_to_int = {c: i for i, c in enumerate(sorted(charset))}
# Inverse lookup, derived from the forward map so the two can never disagree.
int_to_char = {i: c for c, i in char_to_int.items()}
{0: '[nH]', 1: '[2H]', 2: '2', 3: 'N', 4: 'Cl', 5: 'c', 6: '$', 7: 'O', 8: '(', 9: '6', 10: 's', 11: '[S@+]', 12: '[C@@H]', 13: 'C', 14: '[nH+]', 15: '/', 16: '[NH+]', 17: '[Br-]', 18: '[Si]', 19: '4', 20: '[N@+]', 21: '[se]', 22: 'P', 23: '[SH]', 24: '[n+]', 25: '[N]', 26: '^', 27: '5', 28: '7', 29: 'n', 30: '!', 31: '\\', 32: '[n-]', 33: 'S', 34: '[NH3+]', 35: '#', 36: 'I', 37: '[O-]', 38: '1', 39: '[NH2+]', 40: '[S@@+]', 41: 'Br', 42: 'F', 43: '[Na+]', 44: 'E', 45: '[S-]', 46: '.', 47: ')', 48: '[C@]', 49: '=', 50: '3', 51: '-', 52: '[C@H]', 53: '[Cl-]', 54: '[I-]', 55: '[H]', 56: '[P+]', 57: '[S+]', 58: 'o', 59: '[N@@+]', 60: '[N-]', 61: '[N+]', 62: '[o+]', 63: '[C@@]'}

(注:原文此处的输出已损坏;索引 14/16('[nH+]' 与 '[NH+]')以及 24/61('[n+]' 与 '[N+]')的大小写在原文中无法分辨,上面的对应关系为推测。由于该映射来自对 set 的遍历,每次运行得到的编号本来也可能不同。)

我有一个字符串,我想根据 `char_to_int` 和 `int_to_char` 将其转换为独热编码(one-hot encoding):

# Example input to be encoded. It looks like a SMILES-style molecule string
# built from the vocabulary tokens — TODO confirm.
string = 'N[C@H]1C[C@@H](N2Cc3nn4cccnc4c3C2)CC[C@@H]1c1cc(F)c(F)cc1F'
是否有高效的方法,可以使用自定义的 `char_to_int` 和 `int_to_char`,将字符串 `string` 转换为独热向量(one-hot vector)?

from itertools import chain, repeat, islice
import re

# Input to encode; it is split into vocabulary tokens below.
string = 'N[C@H]1C[C@@H](N2Cc3nn4cccnc4c3C2)CC[C@@H]1c1cc(F)c(F)cc1F'

# Custom vocabulary. Several entries are multi-character tokens
# (e.g. 'Cl', 'Br', '[C@@H]'), so the input cannot be split character by character.
items_list = [
    '$', '^', '#', '(', ')', '-', '.', '/', '1', '2', '3', '4', '5', '6', '7', '=', 'Br',
    'C', 'Cl', 'F', 'I', 'N', 'O', 'P', 'S', '[2H]', '[Br-]', '[C@@H]', '[C@@]', '[C@H]', '[C@]',
    '[Cl-]', '[H]', '[I-]', '[N+]', '[N-]', '[N@+]', '[N@@+]', '[NH+]', '[NH2+]', '[NH3+]', '[N]',
    '[Na+]', '[O-]', '[P+]', '[S+]', '[S-]', '[S@+]', '[S@@+]', '[SH]', '[Si]', '[n+]', '[n-]',
    '[nH+]', '[nH]', '[o+]', '[se]', '\\', 'c', 'n', 'o', 's', '!', 'E',
]

charset = set(items_list)
# Number the tokens in sorted order. Enumerating the set directly would give a
# numbering that changes between interpreter runs (string hashing is
# randomized), so encodings would not be reproducible.
char_to_int = {c: i for i, c in enumerate(sorted(charset))}

# Regex alternation takes the FIRST alternative that matches, not the longest.
# Longer tokens must therefore come first; otherwise 'C' shadows 'Cl' and the
# trailing 'l' is silently dropped.
pattern = '|'.join(
    re.escape(item) for item in sorted(items_list, key=len, reverse=True)
)

tokens = re.findall(pattern, string)
# re.findall skips any text that no alternative matches. Fail loudly instead
# of producing an encoding of a different string.
if ''.join(tokens) != string:
    raise ValueError(
        f"input contains text outside the vocabulary: {string!r}"
    )

# Integer encoding: one vocabulary index per token.
x = [char_to_int[k] for k in tokens]

# One-hot encoding: one row per token, with a single 1 at the token's index.
one_hot = [[int(i == idx) for i in range(len(char_to_int))] for idx in x]
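这里,`x` 是整数编码,即每个标记(token)在 `char_to_int` 中对应的索引;它本身还不是独热编码。要得到独热编码,需要把每个索引展开为一个长度为 `len(char_to_int)` 的向量,其中只有该索引位置为 1,其余位置为 0。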

x=[3, 52, 38, 13, 12, 8, 3, 2, 13, 5, 50, 29, 29, 19, 5, 5, 5, 29, 5, 19, 5, 50, 13, 2, 47, 13, 13, 12, 38, 5, 38, 5, 5, 8, 42, 47, 5, 8, 42, 47, 5, 5, 38, 42]