Python:尽可能快地向下转换(downcast)数据帧的数据类型
嘿,我希望尽可能安全、快速地把数据帧各列的数据类型向下转换(downcast)为占用内存最小的类型。数据帧的一列中可以有混合数据类型的任意组合,大多数是带有np.nan或字符串'nan'的字符串列。带NaN的整数列会转换为pandas 0.24.2的可空整数数据类型“Int8”和“Int16”等。空列表和空字典似乎会导致转换和向下转换失败,因为它们被转换为浮点数(为什么??),所以我把它们排除在外。我的方法可行,但我不相信没有其他更简单的方法可以做到这一点,特别是没有更快的解决方案。我已经想了很久,我需要一个快速的方法,因为我使用的数据帧可能有1000000x150个单元那么大。我的方法:Python:尽可能快地向下转换(downcast)数据帧的数据类型,python,pandas,numpy,types,Python,Pandas,Numpy,Types,嘿,我希望尽可能安全、快速地把数据帧各列的数据类型向下转换(downcast)为占用内存最小的类型。数据帧的一列中可以有混合数据类型的任意组合,大多数是带有np.nan或字符串'nan'的字符串列。带NaN的整数列会转换为pandas 0.24.2的可空整数数据类型“Int8”和“Int16”等。空列表和空字典似乎会导致转换和向下转换失败,因为它们被转换为浮点数(为什么??),所以我把它们排除在外。我的方法可行,但我不相信没有其他更简单的方法可以做到这一点,特别是没有更快的解决方案。我已经想了很久,我需要一个快速的方法,因为我使用的数据帧可能有1000000x150个单元
def convertAndDowncast(column, downcast=True):
    """Convert a column to a compact numeric dtype when its values allow it.

    The column is first parsed with ``pd.to_numeric`` using a float
    downcast.  When ``downcast`` is truthy an integer downcast is attempted
    as well.  A plain NumPy integer result is then promoted to the pandas
    nullable integer dtype of the same width, so that missing values can be
    re-inserted later without falling back to float.

    Parameters
    ----------
    column : pandas.Series
        Column to convert.
    downcast : bool, default True
        Whether to additionally try to shrink the values to the smallest
        integer dtype.

    Returns
    -------
    pandas.Series
        The converted column, or ``column`` itself, untouched, when its
        values cannot be parsed as numbers.
    """
    try:
        numeric = pd.to_numeric(column, downcast='float')
    except (ValueError, TypeError):
        # Non-numeric content: hand the column back unchanged.  This is what
        # the deprecated ``errors='ignore'`` option used to do.
        return column

    if downcast:
        # The data is numeric at this point, so this call only shrinks it.
        numeric = pd.to_numeric(numeric, downcast='integer')

    # Same-width promotion: a column that had to stay int64 must not be
    # squeezed into Int32, which cannot hold its values.
    nullable_by_numpy_name = {
        'int8': 'Int8',
        'int16': 'Int16',
        'int32': 'Int32',
        'int64': 'Int64',
    }
    nullable_name = nullable_by_numpy_name.get(str(numeric.dtype))
    # ``to_numeric`` returns a bare ndarray for list-like input; only a
    # Series can carry an extension dtype, so leave anything else as is.
    if nullable_name is not None and isinstance(numeric, pd.Series):
        numeric = numeric.astype(nullable_name)
    return numeric
def dtypeCorrection(df, downcast=True):
    """Return a copy of ``df`` with every convertible column made numeric.

    Missing cells (real nulls and the strings ``'nan'``, ``'NaN'``, ``'NaT'``
    and ``'None'``) are temporarily replaced by ``0`` so they do not block
    numeric parsing, each eligible column is passed through
    ``convertAndDowncast``, and the missing cells are masked out again at
    the end.  Columns holding ``list`` or ``dict`` cells are skipped and
    keep object dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert.  It is not modified.  Any other object is returned
        unchanged.
    downcast : bool, default True
        Forwarded to ``convertAndDowncast``.

    Returns
    -------
    pandas.DataFrame
        New frame with converted dtypes and missing cells set to NaN/NA.
    """
    if not isinstance(df, pd.DataFrame):
        return df

    # Work on an explicit object-dtype copy: ``df.values`` can be a view of
    # the caller's data, and the 0-fill below must never leak into it.
    cells = df.to_numpy(dtype=object, copy=True)

    missing = df.isnull().to_numpy()
    for token in ('nan', 'NaN', 'NaT', 'None'):
        missing = missing | (cells == token)
    cells[missing] = 0

    result = pd.DataFrame(cells, index=df.index, columns=df.columns, dtype=object)
    source_dtypes = df.dtypes
    for position, name in enumerate(result.columns):
        # Only object columns can hold list/dict cells.  Those columns are
        # left alone because numeric parsing cannot handle such cells.
        if source_dtypes.iloc[position] == object and any(
            type(cell) in (list, dict) for cell in cells[:, position]
        ):
            continue
        result[name] = convertAndDowncast(result[name], downcast=downcast)

    # Put the missing values back where the placeholder zeros were written.
    return result.mask(missing, np.nan)
# Two-row demo frame: row 0 holds real values, row 1 holds the matching
# missing-value variants (np.nan or the strings 'NaN' / 'nan').
df = pd.DataFrame.from_dict(
    {
        0: {
            'integerColumn': 1,
            'strColumn': 'test0',
            'floatColumn': 0.1,
            'strIntegerColumn': '0',
            'strFloatColumn': '0.1',
            'strObjectColumn': '[1,2,3]',
            'objectColumn': [1, 2, 3],
            'strIntegerColumn2': '1',
            'strFloatColumn2': '0.2',
            'testColumn': {},
            'testColumn2': [],
            'testColumn3': [1, 2, 3],
        },
        1: {
            'integerColumn': np.nan,
            'strColumn': 'test1',
            'floatColumn': np.nan,
            'strIntegerColumn': 'NaN',
            'strFloatColumn': 'nan',
            'strObjectColumn': 'NaN',
            'objectColumn': np.nan,
            'strIntegerColumn2': np.nan,
            'strFloatColumn2': np.nan,
            'testColumn': {},
            'testColumn2': [],
            'testColumn3': [1, 2, 3],
        },
    },
    orient='index',
)
# Run the conversion on the demo frame with integer downcasting enabled.
dtypeCorrection(df, downcast=True)
testColumn3 object
integerColumn Int8
strObjectColumn object
strFloatColumn2 float32
strIntegerColumn Int8
strIntegerColumn2 Int8
testColumn2 object
testColumn object
strFloatColumn float32
objectColumn object
floatColumn float32
strColumn object
测试:
def convertAndDowncast(column, downcast=True):
    """Convert a column to a compact numeric dtype when its values allow it.

    The column is first parsed with ``pd.to_numeric`` using a float
    downcast.  When ``downcast`` is truthy an integer downcast is attempted
    as well.  A plain NumPy integer result is then promoted to the pandas
    nullable integer dtype of the same width, so that missing values can be
    re-inserted later without falling back to float.

    Parameters
    ----------
    column : pandas.Series
        Column to convert.
    downcast : bool, default True
        Whether to additionally try to shrink the values to the smallest
        integer dtype.

    Returns
    -------
    pandas.Series
        The converted column, or ``column`` itself, untouched, when its
        values cannot be parsed as numbers.
    """
    try:
        numeric = pd.to_numeric(column, downcast='float')
    except (ValueError, TypeError):
        # Non-numeric content: hand the column back unchanged.  This is what
        # the deprecated ``errors='ignore'`` option used to do.
        return column

    if downcast:
        # The data is numeric at this point, so this call only shrinks it.
        numeric = pd.to_numeric(numeric, downcast='integer')

    # Same-width promotion: a column that had to stay int64 must not be
    # squeezed into Int32, which cannot hold its values.
    nullable_by_numpy_name = {
        'int8': 'Int8',
        'int16': 'Int16',
        'int32': 'Int32',
        'int64': 'Int64',
    }
    nullable_name = nullable_by_numpy_name.get(str(numeric.dtype))
    # ``to_numeric`` returns a bare ndarray for list-like input; only a
    # Series can carry an extension dtype, so leave anything else as is.
    if nullable_name is not None and isinstance(numeric, pd.Series):
        numeric = numeric.astype(nullable_name)
    return numeric
def dtypeCorrection(df, downcast=True):
    """Return a copy of ``df`` with every convertible column made numeric.

    Missing cells (real nulls and the strings ``'nan'``, ``'NaN'``, ``'NaT'``
    and ``'None'``) are temporarily replaced by ``0`` so they do not block
    numeric parsing, each eligible column is passed through
    ``convertAndDowncast``, and the missing cells are masked out again at
    the end.  Columns holding ``list`` or ``dict`` cells are skipped and
    keep object dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert.  It is not modified.  Any other object is returned
        unchanged.
    downcast : bool, default True
        Forwarded to ``convertAndDowncast``.

    Returns
    -------
    pandas.DataFrame
        New frame with converted dtypes and missing cells set to NaN/NA.
    """
    if not isinstance(df, pd.DataFrame):
        return df

    # Work on an explicit object-dtype copy: ``df.values`` can be a view of
    # the caller's data, and the 0-fill below must never leak into it.
    cells = df.to_numpy(dtype=object, copy=True)

    missing = df.isnull().to_numpy()
    for token in ('nan', 'NaN', 'NaT', 'None'):
        missing = missing | (cells == token)
    cells[missing] = 0

    result = pd.DataFrame(cells, index=df.index, columns=df.columns, dtype=object)
    source_dtypes = df.dtypes
    for position, name in enumerate(result.columns):
        # Only object columns can hold list/dict cells.  Those columns are
        # left alone because numeric parsing cannot handle such cells.
        if source_dtypes.iloc[position] == object and any(
            type(cell) in (list, dict) for cell in cells[:, position]
        ):
            continue
        result[name] = convertAndDowncast(result[name], downcast=downcast)

    # Put the missing values back where the placeholder zeros were written.
    return result.mask(missing, np.nan)
# Two-row demo frame: row 0 holds real values, row 1 holds the matching
# missing-value variants (np.nan or the strings 'NaN' / 'nan').
df = pd.DataFrame.from_dict(
    {
        0: {
            'integerColumn': 1,
            'strColumn': 'test0',
            'floatColumn': 0.1,
            'strIntegerColumn': '0',
            'strFloatColumn': '0.1',
            'strObjectColumn': '[1,2,3]',
            'objectColumn': [1, 2, 3],
            'strIntegerColumn2': '1',
            'strFloatColumn2': '0.2',
            'testColumn': {},
            'testColumn2': [],
            'testColumn3': [1, 2, 3],
        },
        1: {
            'integerColumn': np.nan,
            'strColumn': 'test1',
            'floatColumn': np.nan,
            'strIntegerColumn': 'NaN',
            'strFloatColumn': 'nan',
            'strObjectColumn': 'NaN',
            'objectColumn': np.nan,
            'strIntegerColumn2': np.nan,
            'strFloatColumn2': np.nan,
            'testColumn': {},
            'testColumn2': [],
            'testColumn3': [1, 2, 3],
        },
    },
    orient='index',
)
# Run the conversion on the demo frame with integer downcasting enabled.
dtypeCorrection(df, downcast=True)
testColumn3 object
integerColumn Int8
strObjectColumn object
strFloatColumn2 float32
strIntegerColumn Int8
strIntegerColumn2 Int8
testColumn2 object
testColumn object
strFloatColumn float32
objectColumn object
floatColumn float32
strColumn object
数据类型的输出:
def convertAndDowncast(column, downcast=True):
    """Convert a column to a compact numeric dtype when its values allow it.

    The column is first parsed with ``pd.to_numeric`` using a float
    downcast.  When ``downcast`` is truthy an integer downcast is attempted
    as well.  A plain NumPy integer result is then promoted to the pandas
    nullable integer dtype of the same width, so that missing values can be
    re-inserted later without falling back to float.

    Parameters
    ----------
    column : pandas.Series
        Column to convert.
    downcast : bool, default True
        Whether to additionally try to shrink the values to the smallest
        integer dtype.

    Returns
    -------
    pandas.Series
        The converted column, or ``column`` itself, untouched, when its
        values cannot be parsed as numbers.
    """
    try:
        numeric = pd.to_numeric(column, downcast='float')
    except (ValueError, TypeError):
        # Non-numeric content: hand the column back unchanged.  This is what
        # the deprecated ``errors='ignore'`` option used to do.
        return column

    if downcast:
        # The data is numeric at this point, so this call only shrinks it.
        numeric = pd.to_numeric(numeric, downcast='integer')

    # Same-width promotion: a column that had to stay int64 must not be
    # squeezed into Int32, which cannot hold its values.
    nullable_by_numpy_name = {
        'int8': 'Int8',
        'int16': 'Int16',
        'int32': 'Int32',
        'int64': 'Int64',
    }
    nullable_name = nullable_by_numpy_name.get(str(numeric.dtype))
    # ``to_numeric`` returns a bare ndarray for list-like input; only a
    # Series can carry an extension dtype, so leave anything else as is.
    if nullable_name is not None and isinstance(numeric, pd.Series):
        numeric = numeric.astype(nullable_name)
    return numeric
def dtypeCorrection(df, downcast=True):
    """Return a copy of ``df`` with every convertible column made numeric.

    Missing cells (real nulls and the strings ``'nan'``, ``'NaN'``, ``'NaT'``
    and ``'None'``) are temporarily replaced by ``0`` so they do not block
    numeric parsing, each eligible column is passed through
    ``convertAndDowncast``, and the missing cells are masked out again at
    the end.  Columns holding ``list`` or ``dict`` cells are skipped and
    keep object dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert.  It is not modified.  Any other object is returned
        unchanged.
    downcast : bool, default True
        Forwarded to ``convertAndDowncast``.

    Returns
    -------
    pandas.DataFrame
        New frame with converted dtypes and missing cells set to NaN/NA.
    """
    if not isinstance(df, pd.DataFrame):
        return df

    # Work on an explicit object-dtype copy: ``df.values`` can be a view of
    # the caller's data, and the 0-fill below must never leak into it.
    cells = df.to_numpy(dtype=object, copy=True)

    missing = df.isnull().to_numpy()
    for token in ('nan', 'NaN', 'NaT', 'None'):
        missing = missing | (cells == token)
    cells[missing] = 0

    result = pd.DataFrame(cells, index=df.index, columns=df.columns, dtype=object)
    source_dtypes = df.dtypes
    for position, name in enumerate(result.columns):
        # Only object columns can hold list/dict cells.  Those columns are
        # left alone because numeric parsing cannot handle such cells.
        if source_dtypes.iloc[position] == object and any(
            type(cell) in (list, dict) for cell in cells[:, position]
        ):
            continue
        result[name] = convertAndDowncast(result[name], downcast=downcast)

    # Put the missing values back where the placeholder zeros were written.
    return result.mask(missing, np.nan)
# Two-row demo frame: row 0 holds real values, row 1 holds the matching
# missing-value variants (np.nan or the strings 'NaN' / 'nan').
df = pd.DataFrame.from_dict(
    {
        0: {
            'integerColumn': 1,
            'strColumn': 'test0',
            'floatColumn': 0.1,
            'strIntegerColumn': '0',
            'strFloatColumn': '0.1',
            'strObjectColumn': '[1,2,3]',
            'objectColumn': [1, 2, 3],
            'strIntegerColumn2': '1',
            'strFloatColumn2': '0.2',
            'testColumn': {},
            'testColumn2': [],
            'testColumn3': [1, 2, 3],
        },
        1: {
            'integerColumn': np.nan,
            'strColumn': 'test1',
            'floatColumn': np.nan,
            'strIntegerColumn': 'NaN',
            'strFloatColumn': 'nan',
            'strObjectColumn': 'NaN',
            'objectColumn': np.nan,
            'strIntegerColumn2': np.nan,
            'strFloatColumn2': np.nan,
            'testColumn': {},
            'testColumn2': [],
            'testColumn3': [1, 2, 3],
        },
    },
    orient='index',
)
# Run the conversion on the demo frame with integer downcasting enabled.
dtypeCorrection(df, downcast=True)
testColumn3 object
integerColumn Int8
strObjectColumn object
strFloatColumn2 float32
strIntegerColumn Int8
strIntegerColumn2 Int8
testColumn2 object
testColumn object
strFloatColumn float32
objectColumn object
floatColumn float32
strColumn object