Python 序数替换_Python_Nlp_Nltk_Ordinals

Python 序数替换

python nlp

Python 序数替换,python,nlp,nltk,ordinals,Python,Nlp,Nltk,Ordinals,我目前正在寻找用适当的序号表示法（第一、第二、第三）替换第一、第二、第三……等词的方法。上周我一直在谷歌上搜索，没有从NLTK中找到任何有用的标准工具或函数那么有没有或者应该手动编写一些正则表达式呢感谢您的建议公认的答案有一个算法可以解决一半的问题：它将的“第一个”转换为1。要从那里转到“1st”，请执行以下操作： suffixes = ["th", "st", "nd", "rd", ] + ["th"] * 16 suffixed_num = str(num) + suffixes[n

我目前正在寻找把 first、second、third……等单词替换为相应数字序数表示（1st、2nd、3rd）的方法。上周我一直在谷歌上搜索，但没有从 NLTK 中找到任何有用的标准工具或函数。

那么有没有或者应该手动编写一些正则表达式呢

感谢您的建议

公认的答案有一个算法可以解决一半的问题：它将 "first" 转换为 1。要从那里转到 "1st"，请执行以下操作：

# Suffix lookup table indexed directly by the number: 4 explicit entries plus
# 16 "th" entries give 20 slots (indices 0-19).
suffixes = ["th", "st", "nd", "rd", ] + ["th"] * 16
# `num` must already be defined by the surrounding code. The index is
# num % 100, so whenever num % 100 is 20 or more the lookup falls outside the
# 20-entry table and raises IndexError.
suffixed_num = str(num) + suffixes[num % 100]

这只适用于数字0-19。

我想在我的一个项目中使用序数，经过一些原型之后，我认为这种方法虽然不小，但适用于任何正整数，是的任何整数

它的工作原理是先判断数字是否小于 20：如果小于 20，它会把 int 1 转换为字符串 1st，2 转换为 2nd，3 转换为 3rd，其余的数字则加上“th”。

对于超过20的数字，需要最后一位和第二位到最后一位的数字，我分别称之为十位和单位，并对它们进行测试，看看要向数字添加什么

顺便说一句，这是在python中实现的，因此我不确定其他语言是否能够找到字符串上的最后一位或倒数第二位，如果它们这样做的话，那么它应该很容易翻译

def o(numb):
    """Return ``numb`` followed by its English ordinal suffix.

    Examples: ``o(1) -> '1st'``, ``o(12) -> '12th'``, ``o(22) -> '22nd'``,
    ``o(-21) -> '-21st'``.

    Args:
        numb: an integer (negative values are accepted).

    Returns:
        ``str(numb)`` with 'st', 'nd', 'rd' or 'th' appended.
    """
    # Work on the magnitude so that negative numbers get the suffix of their
    # digits instead of always falling into the "< 20" branch.
    magnitude = abs(numb)
    if magnitude % 100 in (11, 12, 13):
        # 11th, 12th, 13th (and 111th, 212th, ...) are the irregular cases.
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(magnitude % 10, 'th')
    return str(numb) + suffix

def o（麻木）：
如果numb<20:#确定<20的后缀
如果numb==1：
后缀='st'
elif numb==2：
后缀='nd'
elif numb==3：
后缀='rd'
其他：
后缀='th'
其他：#确定>20的后缀
tens=str（无编号）
十个=十个[-2]
单位=str（numb）
单位=单位[-1]
如果十位数==“1”：
后缀=“th”
其他：
如果单位==“1”：
后缀='st'
elif单位==“2”：
后缀='nd'
elif单位==“3”：
后缀='rd'
其他：
后缀='th'
返回str（numb）+后缀

为了便于使用，我把函数命名为“o”。它可以通过导入所在文件来调用：我把文件命名为 ordinal，于是先 import ordinal，再调用 ordinal.o(number)。

让我知道你的想法：D

以下是一个简单的解决方案：

我发现自己也在做类似的事情，需要将带有序数词的地址（“Third St”）转换为地理编码器可以理解的格式（“3rd St”）。虽然这不是很优雅，但一个快速而粗糙的解决方案是使用 inflect.py 生成一个用于翻译的词典。

inflect.py 有一个 number_to_words() 函数，可以将一个数字（例如 2）转换为它的单词形式（例如 "two"）。此外，还有一个 ordinal() 函数，它接受任何数字（数字或单词形式）并将其转换为序数形式（例如 4 -> 4th，six -> sixth）。这两个函数本身都不能满足您的需要，但您可以将它们结合起来生成一个字典，把任何给定的序数词（在合理范围内）翻译为相应的数字序数。请看：

>>> import inflect
>>> p = inflect.engine()
>>> word_to_number_mapping = {}
>>>
>>> for i in range(1, 100):
...     word_form = p.number_to_words(i)  # 1 -> 'one'
...     ordinal_word = p.ordinal(word_form)  # 'one' -> 'first'
...     ordinal_number = p.ordinal(i)  # 1 -> '1st'
...     word_to_number_mapping[ordinal_word] = ordinal_number  # 'first': '1st'
...
>>> print word_to_number_mapping['sixth']
6th
>>> print word_to_number_mapping['eleventh']
11th
>>> print word_to_number_mapping['forty-third']
43rd

如果您愿意投入一些时间，可以研究 inflect.py 中这两个函数的内部实现，并编写自己的代码来动态完成这一转换（我没有尝试过这样做）。

这样如何：

def suf(n):
    """Return ``n`` formatted with its English ordinal suffix, e.g. ``'21st'``.

    Args:
        n: an integer.

    Returns:
        A string such as ``'1st'``, ``'12th'`` or ``'113th'``.
    """
    magnitude = abs(n)
    # 11, 12 and 13 take "th" in every hundred (111th, 212th, ...), so the
    # last two digits are tested rather than only special-casing n < 20.
    key = 0 if magnitude % 100 in (11, 12, 13) else magnitude % 10
    return "%d%s" % (n, {1: "st", 2: "nd", 3: "rd"}.get(key, "th"))


print([suf(n) for n in range(1, 32)])

['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th',
 '11th', '12th', '13th', '14th', '15th', '16th', '17th', '18th', '19th',
 '20th', '21st', '22nd', '23rd', '24th', '25th', '26th', '27th', '28th',
 '29th', '30th', '31st']

这里有一个我刚刚编写的更复杂的解决方案，它考虑了复合序数。因此，它适用于从 first 一直到 nine hundred and ninety ninth 的序数。我需要用它把字符串形式的街道名称转换为数字序数：
import re
from collections import OrderedDict

# Ordinal words for 1-9 mapped to their upper-case numeric form.
ONETHS = dict(
    first='1ST', second='2ND', third='3RD',
    fourth='4TH', fifth='5TH', sixth='6TH',
    seventh='7TH', eighth='8TH', ninth='9TH',
)

# Ordinal words for 10-19.
TEENTHS = dict(
    tenth='10TH', eleventh='11TH', twelfth='12TH', thirteenth='13TH',
    fourteenth='14TH', fifteenth='15TH', sixteenth='16TH',
    seventeenth='17TH', eighteenth='18TH', nineteenth='19TH',
)

# Ordinal words for the round tens 20-90.
TENTHS = dict(
    twentieth='20TH', thirtieth='30TH', fortieth='40TH', fiftieth='50TH',
    sixtieth='60TH', seventieth='70TH', eightieth='80TH', ninetieth='90TH',
)

# Singular on purpose: there is only one entry.
HUNDREDTH = dict(hundredth='100TH')

# Cardinal words that can prefix an ordinal ("twenty third", "one hundredth").
ONES = dict(
    one='1', two='2', three='3', four='4', five='5',
    six='6', seven='7', eight='8', nine='9',
)

TENS = dict(
    twenty='20', thirty='30', forty='40', fifty='50',
    sixty='60', seventy='70', eighty='80', ninety='90',
)

HUNDRED = dict(hundred='100')

# Every word that is itself an ordinal ("-th" style) form.
ALL_THS = {**ONETHS, **TEENTHS, **TENTHS, **HUNDREDTH}

# Insertion order is: ordinal forms, then tens, then "hundred", then ones.
ALL_ORDINALS = OrderedDict([
    *ALL_THS.items(),
    *TENS.items(),
    *HUNDRED.items(),
    *ONES.items(),
])


def split_ordinal_word(word):
    """Peel known number words off the front of ``word``.

    At each step the first key of ``ALL_ORDINALS`` (in its insertion order)
    that the remaining text starts with is taken. Splitting stops as soon as
    nothing matches, so any unmatched tail is silently dropped.

    Args:
        word: the text to split; a falsy value yields an empty list.

    Returns:
        The list of matched keys, in the order they were found.
    """
    pieces = []
    remaining = word
    while remaining:
        # First key, in table order, that prefixes what is left.
        match = next(
            (key for key in ALL_ORDINALS if remaining.startswith(key)),
            None,
        )
        if match is None:
            break
        pieces.append(match)
        remaining = remaining[len(match):]
    return pieces

def get_ordinals(s):
    """Split an address-like string into leading words, number words and trailing words.

    The string is lower-cased, hyphens are treated as separators and the
    standalone connective "and" is ignored. Each remaining word is passed to
    ``split_ordinal_word``; words that yield number words contribute to the
    middle list, the others go to ``start`` (before any number word has been
    seen) or ``end`` (after).

    Args:
        s: the input string, e.g. ``'32 one hundred and forty-third st'``.

    Returns:
        A ``(start, ordinals, end)`` tuple of three lists of strings.
    """
    ordinals, start, end = [], [], []
    # Lower-case before tokenising so both the number-word lookup and the
    # "and" filter are case-insensitive. split() with no argument collapses
    # runs of whitespace and never produces empty tokens.
    words = s.lower().replace('-', ' ').split()

    for word in words:
        if word == 'and':
            # Drop only the standalone connective; a substring replace would
            # also mangle words such as "grand" or "portland".
            continue
        found_ordinals = split_ordinal_word(word)
        if found_ordinals:
            ordinals += found_ordinals
        elif ordinals:  # Already have some ordinals
            end.append(word)
        else:
            start.append(word)
    return start, ordinals, end


def detect_ordinal_pattern(ordinals):
    """Combine recognised number words into one upper-case numeric ordinal.

    Args:
        ordinals: list of keys of ``ALL_ORDINALS`` in spoken order, e.g.
            ``['nine', 'hundred', 'twenty', 'third']``.

    Returns:
        The numeric ordinal such as ``'923RD'``, or ``''`` when the list is
        empty, longer than four words, or matches none of the patterns below.

    Raises:
        KeyError: if a one-element list holds a word missing from
            ``ALL_ORDINALS``.
    """
    ordinal_length = len(ordinals)
    ordinal_string = ''
    if ordinal_length == 1:
        ordinal_string = ALL_ORDINALS[ordinals[0]]
    elif ordinal_length == 2:
        first, second = ordinals
        if first in ONES and second in HUNDREDTH:
            # "one hundredth" -> "1" + "00TH"
            ordinal_string = ONES[first] + '00TH'
        elif first in HUNDRED and second in ONETHS:
            # "hundred third" -> "10" + "3RD"
            ordinal_string = HUNDRED[first][:-1] + ONETHS[second]
        elif first in TENS and second in ONETHS:
            # "forty third" -> "4" + "3RD"
            ordinal_string = TENS[first][0] + ONETHS[second]
    elif ordinal_length == 3:
        first, second, third = ordinals
        if first in HUNDRED and second in TENS and third in ONETHS:
            # "hundred thirty first" -> "1" + "3" + "1ST"
            ordinal_string = HUNDRED[first][0] + TENS[second][0] + ONETHS[third]
        elif first in ONES and second in HUNDRED and third in ALL_THS:
            # Zero-pad single-digit ordinals so that "nine hundred third"
            # becomes "903RD" rather than "93RD"; two-digit forms such as
            # "11TH" or "20TH" are already four characters and stay as-is.
            ordinal_string = ONES[first] + ALL_THS[third].rjust(4, '0')
    elif ordinal_length == 4:
        first, second, third, fourth = ordinals
        if first in ONES and second in HUNDRED and third in TENS and fourth in ONETHS:
            # "nine hundred twenty third" -> "9" + "2" + "3RD"
            ordinal_string = ONES[first] + TENS[third][0] + ONETHS[fourth]

    return ordinal_string

下面是一些示例用法：
# Alternative inputs to try; only the uncommented assignment below is used.
# s = '32 one   hundred and forty-third st toronto, on'
#s = '32 forty-third st toronto, on'
#s = '32 one-hundredth st toronto, on'
#s = '32 hundred and third st toronto, on'
#s = '32 hundred and thirty first st toronto, on'
# s = '32 nine hundred and twenty third st toronto, on'
#s = '32 nine hundred and ninety ninth st toronto, on'
s = '32 sixty sixth toronto, on'

# Split the address around its number words, then collapse those words into
# a single numeric ordinal. print() is called as a function so the example
# also runs on Python 3.
st, ords, en = get_ordinals(s)
print(st, detect_ordinal_pattern(ords), en)

此函数适用于每个数字n。如果n为负，则将其转换为正。如果n不是整数，则将其转换为整数
def ordinal(n):
    """Return the English ordinal string for ``n``.

    Negative input is made positive first and the result is truncated to an
    integer, so ``ordinal(-2)`` gives ``'2nd'`` and ``ordinal(3.9)`` gives
    ``'3rd'``.

    Args:
        n: a number.

    Returns:
        The integer value followed by 'st', 'nd', 'rd' or 'th'.
    """
    if n < 0:
        n = n * -1
    whole = int(n)

    if 11 <= whole % 100 <= 13:
        # 11th, 12th and 13th are irregular in every hundred.
        ending = 'th'
    else:
        ending = {1: 'st', 2: 'nd', 3: 'rd'}.get(whole % 10, 'th')

    return '{}{}'.format(whole, ending)

def序号（n）：
后缀=['th'，'st'，'nd'，'rd'，'th'，'th'，'th'，'th'，'th'，'th']
如果n<0：
n*=-1
n=int（n）
如果n%100 in（11,12,13）：
s='th'
其他：
s=后缀[n%10]
返回str（n）+s
如果使用django，您可以执行以下操作：
# Reuse the `ordinal` template filter from Django's humanize app as a plain
# Python function.
from django.contrib.humanize.templatetags.humanize import ordinal
# `number` must already be defined by the calling code.
var = ordinal(number)

（或者在django模板中使用ordinal作为模板过滤器，尽管从python代码中这样调用也可以）
如果不使用django，你可以窃取它，这非常简洁。
我向Gareth的lambda代码致敬。如此优雅。尽管如此，我对它的工作原理只有一半的了解。因此我尝试解构它，并提出了以下建议：
def ordinal(integer):
    """Print and return ``integer`` with its English ordinal suffix appended.

    The suffix is chosen from the decimal text of ``integer``: a second-to-last
    character of '1' always gives 'th' (11th, 112th, ...); otherwise the last
    character selects 'st', 'nd', 'rd' or 'th'. A leading minus sign is kept,
    so ``ordinal(-21)`` gives ``'-21st'``.

    Args:
        integer: the value to format; it is converted with ``str()``.

    Returns:
        The formatted string. The same string is also printed to stdout.
    """
    int_to_string = str(integer)
    # Slices (rather than indexing) yield '' instead of raising IndexError
    # when the text is shorter than two characters.
    last_char = int_to_string[-1:]
    second_last_char = int_to_string[-2:-1]

    if second_last_char == '1':
        suffix = 'th'
    else:
        suffix = {'1': 'st', '2': 'nd', '3': 'rd'}.get(last_char, 'th')

    result = int_to_string + suffix
    print(result)
    return result


>>> print [ordinal(n) for n in range(1,25)]
1st
2nd
3rd
4th
5th
6th
7th
8th
9th
10th
11th
12th
13th
14th
15th
16th
17th
18th
19th
20th
21st
22nd
23rd
24th
['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th',             
'11th', '12th', '13th', '14th', '15th', '16th', '17th', '18th', '19th', 
'20th', '21st', '22nd', '23rd', '24th']

Gareth 的代码改用现代的 .format() 写法（这里用 // 整除，以便在 Python 3 中也能正确工作）：
ordinal = lambda n: "{}{}".format(n, "tsnrhtdd"[(n//10%10!=1)*(n%10<4)*n%10::4])
如果您不想导入外部模块，而更喜欢一行解决方案，那么以下内容可能（稍微）比公认的答案更具可读性：
def suffix(i):
    """Return the English ordinal suffix ('st', 'nd', 'rd' or 'th') for ``i``.

    Args:
        i: an integer; the sign is ignored.

    Returns:
        The two-letter suffix only, without the number.
    """
    # Python's % is non-negative for a positive modulus, so -1 % 10 == 9;
    # use the magnitude so that -1 maps to 'st' like 1 does.
    i = abs(i)
    # The boolean factor is 0 for 11, 12 and 13 (in any hundred), which makes
    # the key 0 and selects the default "th".
    return {1: "st", 2: "nd", 3: "rd"}.get(i % 10 * (i % 100 not in [11, 12, 13]), "th")

按照和的建议，它使用dictionary.get

我使用布尔乘法来处理特殊情况（11、12、13），而不必另写 if 块。如果条件 (i%100 not in [11,12,13]) 计算为 False，则乘积为整数 0，于是得到默认的“th”情况。
另一个解决方案是 num2words 库（可通过 pip 或 GitHub 获取）。
它特别提供了不同的语言，因此本地化/国际化（又称l10n/i18n）是一个很简单的问题
使用pip install num2words
安装后，使用起来很简单：
from num2words import num2words
# english is default
num2words(4458, to="ordinal_num")
'4458th'

# examples for other languages
num2words(4458, lang="en", to="ordinal_num")
'4458th'

num2words(4458, lang="es", to="ordinal_num")
'4458º'

num2words(4458, lang="de", to="ordinal_num")
'4458.'

num2words(4458, lang="id", to="ordinal_num")
'ke-4458'

额外示例：
num2words(4458, lang="en", to="ordinal")
'four thousand, four hundred and fifty-eighth'

这可以处理任意长度的数字、11到13的例外情况以及负整数
def ith(i):
    """Return the English ordinal suffix ('st', 'nd', 'rd' or 'th') for ``i``.

    The sign of ``i`` is ignored, and 11, 12 and 13 (in any hundred) give
    'th'.
    """
    magnitude = abs(i)
    endings = ['st', 'nd', 'rd'] + ['th'] * 7
    # Shifting by one puts 1 at index 0 and makes 0 wrap round to the final
    # 'th' entry.
    ending = endings[(magnitude - 1) % 10]
    is_teen = 10 < magnitude % 100 < 14
    return 'th' if is_teen else ending

注意：使用Python 3.6进行了测试；abs（）函数在没有明确包含数学模块的情况下可用。
这是使用num2words包的可选选项
>>> from num2words import num2words
>>> num2words(42, to='ordinal_num')
    '42nd'

如果您不想引入对外部库的额外依赖（如其他回答所建议的那样），也不想让未来的代码维护者找您的麻烦，那么您可以这样做：
num2words(4458, lang="en", to="ordinal")
'four thousand, four hundred and fifty-eighth'

def ith(i):
    """Return the ordinal suffix for ``i``: 'st', 'nd', 'rd' or 'th'.

    Works on the absolute value, and treats 11-13 in any hundred as 'th'.
    """
    n = abs(i)
    # One entry per final digit, rotated by one: slot 0 is for numbers ending
    # in 1, and slot 9 is for numbers ending in 0.
    by_last_digit = ('st', 'nd', 'rd', 'th', 'th', 'th', 'th', 'th', 'th', 'th')
    ending = by_last_digit[(n - 1) % 10]
    if 10 < n % 100 < 14:
        ending = 'th'
    return ending


# test routine: print the suffix chosen for every integer in [-200, 200)
for i in range(-200, 200):
    print(i, ith(i))

>>> from num2words import num2words
>>> num2words(42, to='ordinal_num')
    '42nd'

def make_ordinal(n):
    '''
    Return the ordinal representation of an integer as a string::

        make_ordinal(0)   => '0th'
        make_ordinal(3)   => '3rd'
        make_ordinal(122) => '122nd'
        make_ordinal(213) => '213th'

    ``n`` is passed through ``int()`` first, so anything ``int()`` accepts
    may be given.
    '''
    number = int(n)
    if number % 100 in (11, 12, 13):
        # The "teens" always take 'th', whatever the final digit.
        ending = 'th'
    else:
        ending = {1: 'st', 2: 'nd', 3: 'rd'}.get(number % 10, 'th')
    return f'{number}{ending}'

import sys

def main(argv=None):
    """Print "<k><suffix> Hello" for every k from 1 to the requested count.

    Args:
        argv: argument list laid out like ``sys.argv`` (the count is
            ``argv[1]``); defaults to ``sys.argv``.

    Raises:
        IndexError: if no count argument is supplied.
        ValueError: if the count is not an integer literal.
    """
    if argv is None:
        argv = sys.argv
    a = int(argv[1])

    for i in range(1, a + 1):
        if i % 100 in (11, 12, 13):
            # 11th, 12th and 13th are irregular in every hundred.
            suffix = "th"
        else:
            suffix = {1: "st", 2: "nd", 3: "rd"}.get(i % 10, "th")
        print("%d%s Hello" % (i, suffix))


if __name__ == "__main__":
    main()