Python 从文本文件中提取数据并将其转换为df_Python_Regex_Python 3.x_Pandas_Dataframe

Python 从文本文件中提取数据并将其转换为df

python regex python-3.x pandas dataframe

Python 从文本文件中提取数据并将其转换为df,python,regex,python-3.x,pandas,dataframe,Python,Regex,Python 3.x,Pandas,Dataframe,有一个包含如下值的txt文件。如何将其转换为数据帧尝试了几种删除空白并将其排列在数据框中的方法。整个数据可以str格式存储，包括日期 cn No: 9991 PUEN: S55D Date : 05/01/2017 Development Name: Status: Active Development Location: Address: 3 ADAM PARK #3-3 Contact No.: Name Agent: Managing No.: 56

有一个包含如下值的txt文件。如何将其转换为数据帧

我尝试了几种去除空白并将数据整理到数据框中的方法。所有数据（包括日期）都可以以 `str` 格式存储。

cn No:    9991
PUEN:    S55D
Date :    05/01/2017
Development Name:   
Status: Active
Development Location:   
Address: 3 ADAM PARK #3-3 
Contact No.: 
Name Agent: 
Managing  No.: 5648123
cn No:    4671
PUEN:    T11F
Date :    16/07/2019
Development Name:   MEGA
Status: Active
Development Location:   
Address: 39 WOODLANDS CLOSE,  #01-64, 
Contact No.: 6258 6944
Name  Agent: 
Managing  No.:

尝试将文本文件转换为数据帧

# Capture everything printed by basic_info() into outs.txt by temporarily
# pointing sys.stdout at the file, then read the captured text back.
# The `with` blocks guarantee both file handles are closed, and the
# try/finally restores sys.stdout even if a worker raises.
with open('outs.txt', 'w') as f:
    sys.stdout = f
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as e:
            # Each HOSTS entry is indexed as x[0], x[1] and handed to
            # get_certificate; results are printed in submission order.
            for hostinfo in e.map(lambda x: get_certificate(x[0], x[1]), HOSTS):
                basic_info(hostinfo)
    finally:
        sys.stdout = orig_stdout

# Re-open the file for reading; `a` and `data` both hold the full text.
with open("outs.txt", "r") as f:
    a = f.read()
data = a
# Turn the "key: value" lines held in `data` into a DataFrame with one row
# per record. A new record starts at every "cn No" line. All values are
# kept as plain strings (dates included); a field that never appears in a
# record is left as NaN, and a field with nothing after the colon becomes ''.
columns = ['cn No', 'PUEN', 'Date', 'Development Name', 'Status',
           'Development Location', 'Address', 'Contact No.', 'Name Agent',
           'Managing No.']
records = []
current = {}
for line in data.splitlines():
    # Split on the FIRST colon only, so values may themselves contain ':'.
    key, sep, value = line.partition(':')
    if not sep:
        # Blank line or a line without a "key:" prefix - nothing to store.
        continue
    # Collapse runs of whitespace so "Managing  No." and "Date " match the
    # canonical column names above.
    key = ' '.join(key.split())
    if key == 'cn No' and current:
        records.append(current)
        current = {}
    current[key] = value.strip()
if current:
    records.append(current)

a = pd.DataFrame(records, columns=columns)

所以我们有一个文本文件，上面有这样的内容

import pandas as pd

# Dictionary to store the header and values
my_dict = dict()

# Open the file
with open("./temp.txt", 'r') as file_object:

    # Read the content
    content = file_object.readlines()

    # For every row
    for row in content:

        # Get the header and data; split on the first colon only so that
        # values containing ':' do not break the unpacking
        header, sep, data = row.partition(":")

        # Skip blank lines and lines without a "header:" prefix
        if not sep:
            continue

        # Strip before the membership test so the key we check is the
        # same key we store
        header = header.strip()

        # Check if the header is not in dict keys (first value wins)
        if header not in my_dict:

            # We add the data with corresponding key
            my_dict[header] = data.strip()


# Returns a dataframe with the values (one row per header)
pd.DataFrame.from_dict(my_dict, orient='index')

如果有几百行呢？

import pandas as pd

# Dictionary to store the header and values
my_dict = dict()

# Open the file
with open("./temp.txt", 'r') as file_object:

    # Read the content
    content = file_object.readlines()

    # For every row
    for row in content:

        # Get the header and data; partition on the first colon so a
        # value that itself contains ':' stays intact
        header, sep, data = row.partition(":")

        # Ignore rows that carry no "header:" prefix (e.g. blank lines)
        if not sep:
            continue

        # Normalise the header first so lookup and storage use one key
        header = header.strip()

        # Check if the header is not in dict keys (first value wins)
        if header not in my_dict:

            # We add the data with corresponding key
            my_dict[header] = data.strip()


# Returns a dataframe with the values (one row per header)
pd.DataFrame.from_dict(my_dict, orient='index')