Python 使用正则表达式作为标记器?

Python 使用正则表达式作为标记器?,python,regex,nlp,tokenize,Python,Regex,Nlp,Tokenize,我试图把我的语料库标记成句子。我试着使用spacy和nltk,但效果不好,因为我的文本有点棘手。下面是我制作的一个人工样本,涵盖了我所知道的所有边缘情况: It is relevant to point that Case No. 778 - Martin H. v. The Woods, it was mentioned that death to one cannot be generalised. However, the High Court while enhancing the s


1) It is relevant to point that Case No. 778 - Martin H. v. The Woods, it was mentioned that death to one cannot be generalised.
2) However, the High Court while enhancing the same from life to death, in our view,has not assigned adequate and acceptable reasons.
3) In our opinion, it is not a rarest of rare case where extreme penalty of death is called for instead sentence of imprisonment for life as ordered by the trial Court would be appropriate.
4)15. In the light of the above discussion, while
 maintaining the conviction of the appellant-accused for the offence under Section 302. IPC, 
award of extreme penalty of death by the High Court is set aside and we restore the sentence of
 life imprisonment as directed by the trial Court.


# First attempt: split on a space that follows "." or "?", unless the text just
# before the space looks like an abbreviation: word-char "." word-char any-char,
# a single capital + ".", a digit 1-9 + ".", or "v"/"s" + ".".
# Raw string so the regex escapes (\w, \., \?) are not treated as (invalid)
# string escapes; the duplicated (?<![1-9]\.) lookbehind was removed.
# NOTE(review): `j` is not defined in the visible source — presumably the input text.
sent = re.split(r'(?<!\w\.\w.)(?<![A-Z]\.)(?<![1-9]\.)(?<![v]\.)(?<![vs]\.)(?<=\.|\?) ', j)




import re

def split_into_sentences(text):
    """Split ``text`` into sentences using rule-based period disambiguation.

    Periods judged not to end a sentence (titles, single-letter initials,
    acronyms, company suffixes, "Section N.", short abbreviations, decimals,
    URLs, and anything inside (), [] or {}) are temporarily rewritten to the
    marker ``<prd>``. Every remaining ``.``, ``?`` and ``!`` then gets a
    ``<stop>`` marker appended, the ``<prd>`` markers are turned back into
    periods, and the text is split on ``<stop>``.

    Args:
        text: The raw text to split. Newlines are treated as spaces.

    Returns:
        A list of sentence strings with surrounding whitespace stripped.
        A trailing whitespace-only fragment is dropped, so an empty input
        yields an empty list.
    """
    # Building-block patterns. Raw strings keep regex escapes such as \s and
    # \w from being interpreted as (invalid) string escapes.
    alphabets = r"([A-Za-z])"
    prefixes = r"(Mr|St|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|Mt)[.]"
    suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
    starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
    acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
    # URL pattern (the original attribution comment was truncated).
    websites = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    digits = r"([0-9])"
    section = r"(Section \d+)([.])(?= \w)"
    item_number = r"(^|\s\w{2})([.])(?=[-+ ]?\d+)"
    abbreviations = r"(^|[\s\(\[]\w{1,2}s?)([.])(?=[\s\)\]]|$)"
    parenthesized = r"\((.*?)\)"
    bracketed = r"\[(.*?)\]"
    curly_bracketed = r"\{(.*?)\}"
    enclosed = "|".join([parenthesized, bracketed, curly_bracketed])

    def protect_periods(match):
        # Shield every period inside the matched span from sentence splitting.
        return match.group().replace(".", "<prd>")

    # Pad so patterns anchored on a surrounding space also work at the edges.
    text = " " + text + "  "
    text = text.replace("\n", " ")

    # Phase 1: replace non-terminal periods with <prd>. The order matters:
    # later rules rely on earlier ones having already consumed their periods.
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, protect_periods, text)
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = text.replace("...", "<prd><prd><prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    # An acronym directly followed by a sentence starter does end a sentence.
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(
        alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
        r"\1<prd>\2<prd>\3<prd>",
        text,
    )
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    # Likewise a company suffix followed by a sentence starter.
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)
    text = re.sub(section, r"\1<prd>", text)
    text = re.sub(item_number, r"\1<prd>", text)
    text = re.sub(abbreviations, r"\1<prd>", text)
    text = re.sub(digits + "[.]" + digits, r"\1<prd>\2", text)
    text = re.sub(enclosed, protect_periods, text)

    # Move terminal punctuation outside a closing quote so the quote stays
    # attached to the sentence it closes.
    text = text.replace(".”", "”.")
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")

    # Phase 2: mark the real sentence ends, then restore protected periods.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    # Phase 3: split on <stop>.
    sentences = text.split("<stop>")
    if sentences[-1].isspace():
        # Only the padding is left after the final stop; discard it.
        sentences = sentences[:-1]
    return [sentence.strip() for sentence in sentences]
# Emit each sentence found in `s` on its own line, prefixed with its 1-based position.
numbered = enumerate(split_into_sentences(s), start=1)
for index, token in numbered:
    print(f'{index}) {token}')












# Second attempt: split on "/" or "." followed by a space, but only when the
# three preceding characters are: anything other than an uppercase A-Z, a
# lowercase letter, then a word character. Written as a raw string so that \w
# is not treated as an (invalid) string escape.
r'(?<=[^A-Z][a-z]\w)[/.] '

# Sample inputs. Each assignment overwrites the previous one, so only the last
# value of `s` is live; reorder or comment out lines to try another sample.
s = '''Mr. or Mrs. or Dr. (not sure of their title) Smith will be here in the morning at eight.He's arriving on flight No. 48213 out of Denver.He'll take the No. 2 bus from the airport.However, he may grab a taxi instead.'''
s = '''The respondent, in his statement Ex.-73, which is accepted and found to be truthful. The passcode is either No.5, No. 5, No.-5, No.+5.'''
s = '''He went to New York. He is 10 years old.'''
s = '''15) In the light of  Ex. P the above discussion, while maintaining the conviction of the appellant-accused for the offence under Section 302 IPC, award of extreme penalty of death by the High Court is set aside and we restore the sentence of life imprisonment as directed by the trial Court. The appeal is allowed in part to the extent mentioned above.'''
# NOTE(review): `j` is not defined in the visible source — presumably the input text.
sent = re.split(r'(?<=[^A-Z][a-z]\w)[/.] ', j)