Python 基于给定字典标记连接字符

Python 基于给定字典标记连接字符,python,dictionary,Python,Dictionary,我想根据给定的词典,对一串没有分隔符、连在一起的字符进行分词(tokenize),并输出找到的各个词。例如,给定 dictionary = ['yak', 'kin', 'yakkin', 'khai', 'koo'] 和 chars = 'yakkinpadthaikhaikoo',输出应如下所示: [('yakkin', (0, 6), 6), ('padthai', (6, 13), 7), ('khai', (13, 17), 4), ('koo', (17, 20), 3)]。我希望输出是一个元组列表,元组中的各项依次为:词、(起始位置, 结束位置)、词的长度。


# Sample input: the known words and the unspaced text to split.
dictionary = ['yak', 'kin', 'yakkin', 'khai', 'koo']
chars = 'yakkinpadthaikhaikoo'

# Desired tokenization of `chars`: one (word, (start, end), length) tuple per token.
[('yakkin', (0, 6), 6), ('padthai', (6, 13), 7), ('khai', (13, 17), 4), ('koo', (17, 20), 3)]


import numpy as np

def tokenize(chars, dictionary):
    """Split *chars* into tokens by greedy longest-match against *dictionary*.

    The text is scanned left to right. At each position the longest
    dictionary word starting there is taken as a token. Characters that are
    not covered by any dictionary word are grouped into a single "unknown"
    token that runs up to the next position where a dictionary word starts
    (or to the end of the text).

    Args:
        chars: The unspaced text to split.
        dictionary: Iterable of known words. Empty words are ignored.

    Returns:
        A list of ``(word, (start, end), length)`` tuples in text order,
        where ``chars[start:end] == word`` and ``length == end - start``.
    """
    # A set gives O(1) membership tests; empty words can never advance the scan.
    vocabulary = {word for word in dictionary if word}
    longest = max(map(len, vocabulary), default=0)
    n_chars = len(chars)

    def make_token(begin, end):
        return (chars[begin:end], (begin, end), end - begin)

    words = []
    start = 0
    unknown_start = 0  # where the pending run of unmatched characters began
    while start < n_chars:
        # Try the longest candidate first so "yakkin" wins over its prefix "yak".
        match_len = next(
            (size
             for size in range(min(longest, n_chars - start), 0, -1)
             if chars[start:start + size] in vocabulary),
            0,
        )
        if not match_len:
            start += 1
            continue
        if unknown_start < start:
            words.append(make_token(unknown_start, start))
        words.append(make_token(start, start + match_len))
        start += match_len
        unknown_start = start

    # Trailing characters that matched nothing still form a token.
    if unknown_start < n_chars:
        words.append(make_token(unknown_start, n_chars))
    return words

tokenize(chars, dictionary)  # reported result of the asker's first attempt: only [('yakkin', (0, 6), 6)]



def tokenize(string, dictionary):
    """Split *string* into dictionary words and the unknown runs between them.

    At each position the longest dictionary word that starts there becomes a
    token. When no word starts at the current position, the token is the
    unknown run of characters up to the next occurrence of any dictionary
    word, or up to the end of the string if there is none.

    Args:
        string: The unspaced text to split.
        dictionary: Iterable of known words. Empty words are ignored.

    Returns:
        A list of ``(word, (start, end), length)`` tuples in text order.
    """
    # Longest words first, so "yakkin" is preferred over its prefix "yak".
    # Empty words are dropped: they would match everywhere with a zero
    # offset and the loop below would never advance.
    candidates = sorted((word for word in dictionary if word),
                        key=len,
                        reverse=True)
    tokens = []
    start = 0
    while start < len(string):
        word = next((candidate
                     for candidate in candidates
                     if string.startswith(candidate, start)),
                    None)
        if word is None:
            # No dictionary word begins here, so look for the nearest later
            # occurrence of any word; "str.find" returns -1 when not found.
            later_hits = [position
                          for position in (string.find(candidate, start + 1)
                                           for candidate in candidates)
                          if position != -1]
            # With no later occurrence the unknown run extends to the end.
            end = min(later_hits, default=len(string))
            word = string[start:end]
        offset = len(word)
        tokens.append((word, (start, start + offset), offset))
        start += offset
    return tokens


def tokenize(chars, word_list):
    """Locate every occurrence of each usable word of *word_list* in *chars*.

    A word is skipped (and reported on stdout) when it is contained in a
    longer word of *word_list*, so that e.g. "yak" is not reported inside
    "yakkin". Occurrences of one word are searched without overlap.

    Note that tokens are grouped by word in *word_list* order, not sorted by
    position, and text that matches no word produces no token.

    Args:
        chars: The text to search.
        word_list: Sequence of words to look for. Empty words are ignored.

    Returns:
        A list of ``(word, (start, end), length)`` tuples.
    """
    tokens = []
    for word in word_list:
        word_len = len(word)
        if not word_len:
            # An empty word is found at every index without advancing the
            # search, which would loop forever.
            continue

        # skips words that appear in longer words
        if any(word in other_word and len(other_word) > word_len
               for other_word in word_list):
            print("skipped word:", word)
            continue

        index = chars.find(word)
        while index != -1:  # find() returns -1 if not found
            tokens.append((word, (index, index + word_len), word_len))
            # Continue the search at the end of the word just found.
            index = chars.find(word, index + word_len)

    return tokens

def tokenize(chars, word_list):
    """Locate every occurrence of each usable word of *word_list* in *chars*.

    A word is skipped (and reported on stdout) when it is contained in a
    longer word of *word_list*, so that e.g. "yak" is not reported inside
    "yakkin". Occurrences of one word are searched without overlap.

    Note that tokens are grouped by word in *word_list* order, not sorted by
    position, and text that matches no word produces no token.

    Args:
        chars: The text to search.
        word_list: Sequence of words to look for. Empty words are ignored.

    Returns:
        A list of ``(word, (start, end), length)`` tuples.
    """
    tokens = []
    for word in word_list:
        word_len = len(word)
        if not word_len:
            # An empty word is found at every index without advancing the
            # search, which would loop forever.
            continue

        # skips words that appear in longer words
        if any(word in other_word and len(other_word) > word_len
               for other_word in word_list):
            print("skipped word:", word)
            continue

        index = chars.find(word)
        while index != -1:  # find() returns -1 if not found
            tokens.append((word, (index, index + word_len), word_len))
            # Continue the search at the end of the word just found.
            index = chars.find(word, index + word_len)

    return tokens
>>> tokenize('yakkinpadthaikhaikoo', ['yak', 'kin', 'yakkin', 'khai', 'koo'])

skipped word: yak
skipped word: kin
[('yakkin', (0, 6), 6), 
 ('khai', (13, 17), 4), 
 ('koo', (17, 20), 3)]