Python 对数据帧中的连续相同数字进行计数_Python_Python 3.x

Python 对数据帧中的连续相同数字进行计数

python python-3.x

Python 对数据帧中的连续相同数字进行计数,python,python-3.x,Python,Python 3.x,我试图在下面的数据框中计算连续的相同值： DF: 因此，我希望输出如下内容： [2000000123:（-1,7）， (4, 1), (6, 1), (0, 1), (8, 1), (9, 1), (7, 1), (10, 1), (3, 1), (2, 1), (3, 1), （2、1）] ID号后跟连续的数字计数，例如：数字-1连续重复7次 ---------------更新------------------ 使用的代码 import pandas as pd data = { '

我试图在下面的数据框中计算连续的相同值：

DF:

因此，我希望输出如下内容：

[200000000123: (-1, 7), (4, 1), (6, 1), (0, 1), (8, 1), (9, 1), (7, 1), (10, 1), (3, 1), (2, 1), (3, 1), (2, 1)]

ID号后跟连续的数字计数，例如：

数字-1连续重复7次

---------------更新------------------

使用的代码

from itertools import groupby  # was missing: groupby() raised NameError

import pandas as pd

# Sample data: two obligacion ids with 18 rows each.
data = {
    'obligacion': [200000000123] * 18 + [200000000444] * 18,
    '0': [f'mora{i}' for i in range(18, 0, -1)] +
         [f'kiwi{i}' for i in range(18, 0, -1)],
    'dias_mora': ['-1', '-1', '-1', '-1', '-1', '-1', '-1', '4', '6', '0', '8', '9', '7', '10', '3', '2', '3', '2'] +
                 ['12', '0', '4', '4', '4', '7', '10', '4', '-6', '-7', '8', '8', '17', '10', '10', '-2', '3', '2'],
}

df = pd.DataFrame.from_dict(data)  # convert dictionary to dataframe

# Map every obligacion id to the run-length encoding of its dias_mora column:
# a list of (value, number_of_consecutive_repeats) tuples in row order.
dict_count = {}
# .tolist() turns the NumPy integers into plain ints so the keys print cleanly.
for nid in df['obligacion'].unique().tolist():
    vector_mora = df.loc[df['obligacion'] == nid, 'dias_mora'].tolist()
    # itertools.groupby groups *adjacent* equal items, which is exactly a run.
    dict_count[nid] = [
        (label, sum(1 for _ in group)) for label, group in groupby(vector_mora)
    ]
dict_count

我找不到用 pandas 直接处理的巧妙方法，所以改用列表和循环

from itertools import groupby

import pandas as pd


def count_runs(values):
    """Return the run-length encoding of *values* as ``(int(value), length)`` tuples.

    Consecutive equal items form one run. Works for empty and single-item
    input, where the previous index-based loop read a loop variable that
    was never assigned (or was left over from an earlier group).
    """
    # itertools.groupby only merges adjacent equal items, i.e. one run each.
    return [(int(value), sum(1 for _ in run)) for value, run in groupby(values)]


# Sample data: two obligacion ids with 18 rows each.
data = {
    'obligacion': [200000000123] * 18 + [200000000444] * 18,
    '0': [f'mora{i}' for i in range(18, 0, -1)] +
         [f'kiwi{i}' for i in range(18, 0, -1)],
    'dias_mora': ['-1', '-1', '-1', '-1', '-1', '-1', '-1', '4', '6', '0', '8', '9', '7', '10', '3', '2', '3', '2'] +
                 ['12', '0', '4', '4', '4', '7', '10', '4', '-6', '-7', '8', '8', '17', '10', '10', '-2', '3', '2'],
}

df = pd.DataFrame.from_dict(data)  # convert dictionary to dataframe
lob = df['obligacion'].unique().tolist()  # distinct ids, in first-seen order

# obligacion id -> list of (dias_mora value, consecutive count)
ddall = {}
for o in lob:
    # rows of this id only, dias_mora column as a plain list
    ldm = df.loc[df['obligacion'] == o, 'dias_mora'].tolist()
    ddall[o] = count_runs(ldm)

print(ddall)

输出

{
  200000000123: [(-1, 7), (4, 1), (6, 1), (0, 1), (8, 1), (9, 1), (7, 1), (10, 1), (3, 1), (2, 1), (3, 1), (2, 1)], 
  200000000444: [(12, 1), (0, 1), (4, 3), (7, 1), (10, 1), (4, 1), (-6, 1), (-7, 1), (8, 2), (17, 1), (10, 2), (-2, 1), (3, 1), (2, 1)]
}

----更新----

根据 pandas 文档，应该避免在 DataFrame 上逐行迭代，因为这样非常慢。为了加快这个脚本的速度，我将关键列转换为列表，用 zip 将它们组合起来，然后在 zip 对象上进行迭代。脚本的运行速度大约是原来的两倍，输出相同。

下面是更快的脚本：

from itertools import groupby

import pandas as pd


def count_runs_by_key(keys, values):
    """Run-length encode *values* separately for each key in *keys*.

    *keys* and *values* are parallel sequences. A run ends whenever the key
    or the value changes between adjacent rows. Returns a dict mapping each
    key to a list of ``(int(value), run_length)`` tuples in row order.

    Empty input yields an empty dict. If a key shows up again after other
    keys, its new runs are appended to the existing list instead of
    replacing it.
    """
    result = {}
    # Grouping adjacent equal (key, value) pairs gives one group per run.
    for (key, value), run in groupby(zip(keys, values)):
        result.setdefault(key, []).append((int(value), sum(1 for _ in run)))
    return result


# Sample data: two obligacion ids with 18 rows each.
data = {
    'obligacion': [200000000123] * 18 + [200000000444] * 18,
    '0': [f'mora{i}' for i in range(18, 0, -1)] +
         [f'kiwi{i}' for i in range(18, 0, -1)],
    'dias_mora': ['-1', '-1', '-1', '-1', '-1', '-1', '-1', '4', '6', '0', '8', '9', '7', '10', '3', '2', '3', '2'] +
                 ['12', '0', '4', '4', '4', '7', '10', '4', '-6', '-7', '8', '8', '17', '10', '10', '-2', '3', '2'],
}

df = pd.DataFrame.from_dict(data)  # convert dictionary to dataframe

# convert key columns to lists for faster scan
lstob = df['obligacion'].to_list()
lstdm = df['dias_mora'].to_list()

# obligacion id -> list of (dias_mora value, consecutive count)
ddall = count_runs_by_key(lstob, lstdm)
print(ddall)

----更新#2----

如果要在输出中添加dias_mora，可以在计算值时收集dm条目

为此，以下是更新的代码：

from itertools import groupby

import pandas as pd


def count_runs_with_labels(keys, labels, values):
    """Run-length encode *values* per key and collect the labels of each run.

    *keys*, *labels* and *values* are parallel sequences. A run ends whenever
    the key or the value changes between adjacent rows. Returns a dict
    mapping each key to a list of ``(int(value), run_length, joined_labels)``
    tuples, where ``joined_labels`` is the run's labels joined with commas.

    Empty input yields an empty dict. If a key shows up again after other
    keys, its new runs are appended to the existing list instead of
    replacing it.
    """
    result = {}
    rows = zip(keys, labels, values)
    # Group adjacent rows sharing both key and value; the label is payload only.
    for (key, value), run in groupby(rows, key=lambda row: (row[0], row[2])):
        run_labels = [row[1] for row in run]
        result.setdefault(key, []).append(
            (int(value), len(run_labels), ','.join(run_labels))
        )
    return result


# Sample data: two obligacion ids with 18 rows each.
data = {
    'obligacion': [200000000123] * 18 + [200000000444] * 18,
    '0': [f'mora{i}' for i in range(18, 0, -1)] +
         [f'kiwi{i}' for i in range(18, 0, -1)],
    'dias_mora': ['-1', '-1', '-1', '-1', '-1', '-1', '-1', '4', '6', '0', '8', '9', '7', '10', '3', '2', '3', '2'] +
                 ['12', '0', '4', '4', '4', '7', '10', '4', '-6', '-7', '8', '8', '17', '10', '10', '-2', '3', '2'],
}

df = pd.DataFrame.from_dict(data)  # convert dictionary to dataframe

# convert key columns to lists for faster scan
lstob = df['obligacion'].to_list()
lst0 = df['0'].to_list()
lstdm = df['dias_mora'].to_list()

# obligacion id -> list of (dias_mora value, consecutive count, column-'0' labels)
ddall = count_runs_with_labels(lstob, lst0, lstdm)
print(ddall)

输出（格式化）

如果需要，可以使用字符串的 split 方法将逗号分隔的 dm 字符串转换为列表。

您可以尝试这种方法

from collections import defaultdict

import pandas as pd  # was missing: the snippet used pd without importing it

# Sample data in {column: {row_index: value}} form; note the middle column
# is keyed by the integer 0, not the string '0'.
data = {'obligacion':
    { 0: 200000000123,  1: 200000000123,
      2: 200000000123,  3: 200000000123,
      4: 200000000123,  5: 200000000123,
      6: 200000000123,  7: 200000000123,
      8: 200000000123,  9: 200000000123,
     10: 200000000456, 11: 200000000456,
     12: 200000000456, 13: 200000000456,
     14: 200000000456, 15: 200000000456,
     16: 200000000456, 17: 200000000456},
    0:
    { 0: 'mora18', 1: 'mora17',
      2: 'mora16', 3: 'mora15',
      4: 'mora14', 5: 'mora13',
      6: 'mora12', 7: 'mora11',
      8: 'mora10', 9:  'mora9',
     10: 'mora8', 11:  'mora7',
     12: 'mora6', 13: 'mora5',
     14: 'mora4', 15: 'mora3',
     16: 'mora2', 17: 'mora1'},
    'dias_mora':
    { 0: '-1',  1: '0',
      2: '-1',  3: '6',
      4: '-1',  5: '4',
      6: '-1',  7: '4',
      8:  '6',  9: '0',
     10:  '8', 11: '9',
     12:  '7', 13: '10',
     14:  '3', 15: '2',
     16:  '3', 17: '2'}}

df = pd.DataFrame.from_dict(data)  # convert dictionary to dataframe

d_new = defaultdict(list)  # obligacion -> list of (dias_mora, count)

# Cross-tabulate: one row per dias_mora value, one column per obligacion,
# each cell holding how many rows have that combination.
# NOTE: these are TOTAL occurrences per obligacion, not lengths of
# consecutive runs; the row order of the data is not taken into account.
ctab = pd.crosstab(index=df['dias_mora'], columns=df['obligacion'])

# for each obligacion, get the dias_mora key and counts
for obl, d_m in ctab.items():
    for d_m_key, count_dm in d_m.items():
        # a zero cell means this value never occurs for this obligacion
        if count_dm > 0:
            d_new[obl].append((d_m_key, count_dm))

# convert defaultdict to normal dict
d_new = dict(d_new)

print(d_new)

其输出如下所示：

{200000000123: [('-1', 4), ('0', 2), ('4', 2), ('6', 2)], 200000000456: [('10', 1), ('2', 2), ('3', 2), ('7', 1), ('8', 1), ('9', 1)]}

如果要将元组中的键转换为数字，则可以在将其推入列表时对其进行更改

# Variant of the append line: cast the dias_mora key to int before storing it in the tuple.
if count_dm > 0: d_new[obl].append((int(d_m_key),count_dm))

这将为您提供如下结果：

{200000000123: [(-1, 4), (0, 2), (4, 2), (6, 2)], 200000000456: [(10, 1), (2, 2), (3, 2), (7, 1), (8, 1), (9, 1)]}

如果第一列的值随着行的变化而变化，我的意思是，数据只是5000+行的前18行。虽然速度很慢，但结果正如预期；如果有任何方法可以省掉 for 循环，我将不胜感激。太棒了，速度快得多，我用自己的代码更新了帖子，也许你想看看我计算的方式。我无法对结果进行排序，使重复次数最多的数字位于第一个位置。是否有方法将列 0 添加到元组？以便它保存为每个数字找到的第一个数据。例如 200000000123: [(-1, 7, 'mora18'), (4, 1, 'mora17'), …]。如果有必要，我可以提出一个新问题。程序应该计算连续重复的相同数字，而不是总数，这一部分很难解释