Python 检查pandas数据库中的字符串是否包含子字符串并删除_Python_String_List_For Loop_Split

Python：检查 pandas 数据框（DataFrame）中的字符串是否包含子字符串并将其删除

python string list for-loop

Python 检查pandas数据库中的字符串是否包含子字符串并删除,python,string,list,for-loop,split,Python,String,List,For Loop,Split,我正在清理熊猫数据框“受影响百分比”的一列。它包含整数范围（例如：“70-80”、“70和80”、“65到70”）我试图创建一个函数来清除所有这些，以创建整数平均值这很管用>>> def clean_split_range(row): # Initial value contains the current value for the PERCENTAGE AFFECTED column initial_perc = str(row['PERCENTAGE_AFFECTED']) chars

我正在清理 pandas 数据框（DataFrame）中的 “PERCENTAGE_AFFECTED”（受影响百分比）这一列。该列包含整数范围（例如：“70-80”、“70 and 80”、“65 to 70”）。

我试图创建一个函数来清除所有这些，以创建整数平均值

下面这个函数可以正常工作：

def _average_or_fallback(parts):
    """Return the integer mean of *parts*, or a fallback for non-numeric parts.

    *parts* is the list produced by splitting a cleaned value on a range
    delimiter. When every piece parses as an integer the floored mean is
    returned as an ``int``. Otherwise the text before the first ``+`` of the
    first piece is returned as a string, or ``0`` when that text is purely
    alphabetic.
    """
    try:
        # sum() replaces the bare reduce(), which is not a builtin on Python 3.
        # The pieces never carry a "-" sign, so floor division gives the same
        # result as the former int(total / count).
        result = sum(int(part) for part in parts) // len(parts)
    except ValueError:
        # At least one piece is not an integer (e.g. "80+" or ""):
        # keep whatever precedes the first "+" of the first piece.
        result = parts[0].split('+')[0]
    if str(result).isalpha():
        result = 0
    return result


def clean_split_range(row):
    """Reduce the PERCENTAGE_AFFECTED entry of *row* to a single value.

    The value is converted to ``str`` and the characters ``<>!,?":;()`` and
    spaces are removed. The cleaned text is then handled by the first rule
    that matches:

    * contains ``-``, ``and`` or ``to``: integer mean of the pieces
      (see ``_average_or_fallback``);
    * contains ``±``: the text before it;
    * starts with ``over``: the text after the first ``over``;
    * contains ``around``: the text after the first ``around``;
    * purely alphabetic: ``0``;
    * anything else: the cleaned text unchanged.

    Args:
        row: a mapping (for example a DataFrame row) that provides the key
            ``'PERCENTAGE_AFFECTED'``.

    Returns:
        An ``int`` for averaged ranges and alphabetic entries, otherwise a
        ``str``. The mixed return type is kept for backward compatibility.
    """
    strip_chars = '<>!,?":;() '
    raw_value = str(row['PERCENTAGE_AFFECTED'])
    initial_perc = ''.join(ch for ch in raw_value if ch not in strip_chars)

    # Range delimiters are tried in the same order as before: "-", "and", "to".
    if '-' in initial_perc:
        return _average_or_fallback(initial_perc.split('-'))
    if 'and' in initial_perc:
        return _average_or_fallback(initial_perc.split('and'))
    if 'to' in initial_perc:
        return _average_or_fallback(initial_perc.split('to'))

    if '±' in initial_perc:
        return initial_perc.split('±')[0]
    if initial_perc.startswith('over'):
        return initial_perc.split('over')[1]
    if 'around' in initial_perc:
        return initial_perc.split('around')[1]
    if initial_perc.isalpha():
        return 0

    # No recognised pattern: hand back the cleaned text as is.
    return initial_perc

def clean_split_范围（行）：
#初始值包含受影响百分比列的当前值
初始perc=str（第[‘受影响百分比’]行）
字符='！，？":;() '
#移除初始值中的字符
如有（大写字母c代表大写字母c）：
分割范围=[]
cleanWord=“”
对于首字母_perc中的字符：
如果字符中有字符：
char=“”
cleanWord+=char
拆分范围。追加（cleanWord）
初始_perc=''.join（分割范围）
#如果找到“-”，则将初始_perc拆分为两个元素
拆分范围=初始拆分百分比（'-'））
#如果找到“-”，拆分日期将包含一个包含两项的列表
如果len（分割范围）>1：
尝试：
final_perc=int（reduce（lambda x，y:x+y，list（map（int，split_范围））/（len（split_范围）））
除值错误外：
拆分范围=拆分范围[0]。拆分（+）
最终_perc=分割_范围[0]
最后：
如果str（final_perc）.isalpha（）：
最终perc=0
elif initial_perc.find（'and'）！=-1:
分割其他=初始分割（'和'）
如果len（分割其他）>1：
尝试：
final_perc=int（reduce（lambda x，y:x+y，list（map（int，split_other））/（len（split_other）））
除值错误外：
split_other=split_other[0]。split（+）
最终perc=拆分其他[0]
最后：
如果str（final_perc）.isalpha（）：
最终perc=0
elif首字母perc.find（'to'）！=-1:
分割其他=初始分割（“到”）
如果len（分割其他）>1：
尝试：
final_perc=int（reduce（lambda x，y:x+y，list（map（int，split_other））/（len（split_other）））
除值错误外：
split_other=split_other[0]。split（+）
最终perc=拆分其他[0]
最后：
如果str（final_perc）.isalpha（）：
最终perc=0
elif初始百分比查找（“±”）！=-1:
拆分其他=初始拆分（“±”）
最终perc=拆分其他[0]
elif首字母perc.startswith（'over'）：
分割其他=初始分割（'超过'）
最终perc=拆分其他[1]
elif首字母perc.find（'around'）！=-1:
split_other=初始分割（'around'））
最终perc=拆分其他[1]
elif initial_perc.isalpha（）：
最终perc=0
#如果没有找到“-”，拆分日期将只包含1项，即初始的拆分日期
其他：
最终预期=初始预期
返回最终perc

但是：我想简化这段代码，使条目中包含 “-”、“and”、“to” 这些子字符串时能够用同一段逻辑处理。为此我创建了一个用于拆分并移除的子字符串列表（split_list）：

def _mean_or_first_piece(pieces):
    """Return the integer mean of *pieces*, or a fallback for non-numeric pieces.

    When every piece parses as an integer the floored mean is returned as an
    ``int``. Otherwise the text before the first ``+`` of the first piece is
    returned as a string, or ``0`` when that text is purely alphabetic.
    """
    try:
        # sum() is used instead of reduce(), which is not a builtin on Python 3.
        mean = sum(int(piece) for piece in pieces) // len(pieces)
    except ValueError:
        # A piece such as "80+" or "" is not an integer: fall back to the
        # text before the first "+" of the first piece.
        mean = pieces[0].split('+')[0]
    if str(mean).isalpha():
        mean = 0
    return mean


def new_clean_split_range(row):
    """Reduce the PERCENTAGE_AFFECTED entry of *row* to a single value.

    Condensed variant of the range cleaner: every range delimiter lives in
    ``split_list`` and is handled by one loop instead of one branch each.

    The characters ``<>!,?":;()`` and spaces are stripped *before* any
    splitting, so that "70 and 80" becomes "70and80" and both pieces parse
    as integers. The first delimiter of ``split_list`` found in the cleaned
    text decides the result; the function returns immediately so a later
    delimiter cannot overwrite it.

    Args:
        row: a mapping (for example a DataFrame row) that provides the key
            ``'PERCENTAGE_AFFECTED'``.

    Returns:
        An ``int`` for averaged ranges and alphabetic entries, otherwise a
        ``str`` (the text around ``±`` / ``over`` / ``around``, or the
        cleaned value itself).
    """
    strip_chars = '<>!,?":;() '
    # "to" is included so that "65 to 70" is averaged like the other ranges.
    split_list = ['-', 'and', 'to']

    raw_value = str(row['PERCENTAGE_AFFECTED'])
    initial_perc = ''.join(ch for ch in raw_value if ch not in strip_chars)

    for separator in split_list:
        if separator in initial_perc:
            return _mean_or_first_piece(initial_perc.split(separator))

    if '±' in initial_perc:
        return initial_perc.split('±')[0]
    if initial_perc.startswith('over'):
        return initial_perc.split('over')[1]
    if 'around' in initial_perc:
        return initial_perc.split('around')[1]
    if initial_perc.isalpha():
        return 0

    # No recognised pattern: hand back the cleaned text as is.
    return initial_perc

def新建清洁分割范围（行）：
#初始值包含受影响百分比列的当前值
初始perc=str（第[‘受影响百分比’]行）
字符='！，？：；() '
拆分列表=['-'，'和']
#如果找到“-”，则将初始_perc拆分为两个元素
如果有的话（对于拆分列表，首字母为a）：
对于拆分中的列表：
分割范围=初始分割百分比（a）
#如果在split_列表中找到“-”，则初始_perc将包含一个包含两项的列表
如果len（分割范围）>1：
尝试：
final_perc=int（reduce（lambda x，y:x+y，list（map（int，split_范围））/（len（split_范围）））
除值错误外：
拆分范围=拆分范围[0]。拆分（+）
最终_perc=分割_范围[0]
最后：
如果str（final_perc）.isalpha（）：
最终perc=0
其他：
最终预期=初始预期
#移除初始值中的字符
如有（大写字母c代表大写字母c）：
分割范围=[]
cleanWord=“”
对于首字母_perc中的字符：
如果字符中有字符：
char=“”
cleanWord+=char
拆分范围。追加（cleanWord）
初始_perc=''.join（分割范围）
分割范围=“”
elif首字母perc.find（“±”）！=-1:
拆分其他=初始拆分（“±”）
最终perc=拆分其他[0]
elif首字母perc.startswith（'over'）：
分割其他=初始分割（'超过'）
最终perc=拆分其他[1]
elif首字母perc.find（'around'）！=-1:
split_other=初始分割（'around'））
最终perc=拆分其他[1]
elif initial_perc.isalpha（）：
最终perc=0
#如果没有找到“-”，拆分日期将只包含1项，即初始的拆分日期
其他：
最终预期=初始预期
返回最终perc

任何帮助都很好：）

我建议使用正则表达式

看看这个

import re

# x is the input string (for example "70-80"); define it before running this.
# Each match is a (first, second) tuple of numeric strings: two or three
# integer digits, optionally followed by a decimal part.
matches = re.findall(r"(\d{2,3}\.?\d*).*?(\d{2,3}\.?\d*)", x)
# Use the last match; None when the input holds no pair of numbers, instead
# of raising IndexError from pop() on an empty list.
results = matches.pop() if matches else None
# print() with parentheses works on Python 3 and prints the same on Python 2.
print(results)
# results is a tuple of strings (or None), so it is easy to post-process.

检查以下输入和输出

输入
"70.5894-80.9894"
'70 and 85'
'65 to 70'
'72 75'

输出
('70.5894', '80.9894')
('70', '85')
('65', '70')
('72', '75')

请提供 initial_perc 的所有输入以及预期输出（您已经给出了一些，但仅限符合要求的情况）。——不确定如何把数据附加给您，但它包含整数范围，例如：“70-80”、“70 and 80”、“65 to 70”。——那么，我能提出另一种解决方案吗？因为现在的代码有点复杂，而且容易出问题（glitchy）。——@DexJ 是的，我愿意接受建议。我的数据集有限，原始代码可以工作，我只是想把它精简一下。——那么我建议在您的情况下使用正则表达式解决方案。所有这些都不可能（在评论中说清），改为将答案放在这里。——那么如何避免类型错误呢？我可以用列表推导式或 for 循环，在 DataFrame 的列上迭代使用这个正则表达式方法吗？——你是说我的类型错误？我没明白你的意思。是的，对于这个正则表达式方法，可以使用 for 循环。