使用Python和Pandas重命名基于数据帧内容的文件_Python_Pandas

使用Python和Pandas重命名基于数据帧内容的文件

python pandas

使用Python和Pandas重命名基于数据帧内容的文件,python,pandas,Python,Pandas,我试图读取xlsx文件，将列中的所有参考号与文件夹中的文件进行比较，如果它们对应，则将它们重命名为与参考号关联的电子邮件 Excel文件包含以下字段： Reference EmailAddress 1123 bob.smith@yahoo.com 1233 john.drako@gmail.com 1334 samuel.manuel@yahoo.com ... ..... 我的文件夹申请人仅包含名为参

我试图读取

xlsx

文件，将列中的所有参考号与文件夹中的文件进行比较，如果它们对应，则将它们重命名为与参考号关联的电子邮件

Excel文件包含以下字段：

 Reference     EmailAddress
   1123        bob.smith@yahoo.com
   1233        john.drako@gmail.com
   1334        samuel.manuel@yahoo.com
   ...         .....

我的文件夹

applicantCVs

仅包含名为参考列的文档文件：

如何将

applicantCVs

文件夹的内容与excel文件中的引用字段进行比较，如果匹配，则将所有文件重命名为相应的电子邮件地址

以下是我到目前为止所做的尝试：

import os
import pandas as pd

# Load the spreadsheet and pull out the two columns of interest.
dfOne = pd.read_excel('Book2.xlsx', na_values=['NA'], usecols="A:D")
references = dfOne['Reference']
emailAddress = dfOne['EmailAddress']

# E-mail cells whose text form is not 'nan' (i.e. the non-empty ones).
cleanedEmailList = [address for address in emailAddress if not str(address) == 'nan']
print(cleanedEmailList)

# Collect the extension-less name of every file found under applicantCVs.
filesArray = []
for _root, _dirs, names in os.walk("applicantCVs"):
    for filename in names:
        print(filename)  # original file name including its type, e.g. 1233.doc
        stem, _extension = os.path.splitext(filename)
        filesArray.append(stem)

# Text form of every reference in the spreadsheet.
excelArray = [str(entry) for entry in references]

# Report each reference for which a file with the same name exists.
for ref in excelArray:
    if ref not in filesArray:
        continue
    print(ref, "corresponds to the file names")

我将引用名称与文件夹内容进行比较，如果相同，则将其打印出来：

 # Print every spreadsheet reference that also occurs among the file names.
 for i in excelArray:
        if i in filesArray:
            print(i, "corresponds to the file names")

我曾尝试使用

os.rename(filename, cleanedEmailList)

重命名它，但它不起作用，因为

cleanedEmailList

是一个电子邮件数组

如何匹配和重命名文件

更新：

from os.path import dirname
import pandas as pd
from pathlib import Path
import os

def _reference_key(value):
    """Return a reference cell as the text used in file names.

    Excel numbers arrive as floats whenever the column also holds blanks,
    so 1123.0 must become "1123" (not "1123.0") to equal a file stem.
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


dfOne = pd.read_excel('Book2.xlsx', na_values=['NA'], usecols="A:D")

emailAddress = dfOne['EmailAddress']

# Drop incomplete rows *before* converting to text: astype(str) would turn a
# missing reference into the literal string "nan", which dropna cannot see.
complete_rows = dfOne.dropna(subset=["Reference", "EmailAddress"])

# Look-up table: reference number (as text) -> e-mail address.
references = {
    _reference_key(ref): email
    for ref, email in zip(complete_rows["Reference"], complete_rows["EmailAddress"])
}
print(references)

# Snapshot the directory listing first so that renaming does not disturb the
# iteration, and ignore anything that is not a regular file.
files = [entry for entry in Path("applicantCVs").glob("*") if entry.is_file()]

for file in files:
    new_name = references.get(file.stem)
    if new_name is None:
        # No spreadsheet row for this file; leave it untouched.
        continue
    file.rename(file.with_name(f"{new_name}{file.suffix}"))

import os

#Sample Data#
#dfOne = pd.DataFrame({'Reference': [1123, 1233, 1334, 4444, 5555],'EmailAddress': ["bob.smith@yahoo.com", "john.drako@gmail.com", "samuel.manuel@yahoo.com", np.nan, "samuel.manuel@yahoo.com"]})
dfOne = pd.read_excel('Book2.xlsx', na_values=['NA'], usecols = "A:D")
# Only the two columns that are actually used decide whether a row is usable;
# a blank in some other column must not discard the row.
dfOne.dropna(subset=["Reference", "EmailAddress"], inplace=True)


def _reference_key(value):
    """Return a reference cell as the text used in file names.

    A numeric column that contained blanks is read as float, so 1123.0 has
    to become "1123" (not "1123.0") to equal a file stem.
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


# Exact-match look-up table (reference text -> e-mail). A substring or regex
# search would let the file "12" match the reference "1123". setdefault keeps
# the first row when a reference is listed more than once.
email_by_reference = {}
for ref, address in zip(dfOne["Reference"], dfOne["EmailAddress"]):
    email_by_reference.setdefault(_reference_key(ref), address)

for root, dirs, files in os.walk("applicantsCVs"):
    for file in files:
        file_name, ext = os.path.splitext(file)
        email = email_by_reference.get(file_name)
        if email is None:
            # No spreadsheet row for this file; leave it untouched.
            continue
        os.rename(os.path.join(root, file), os.path.join(root, email + ext))

根据样本数据：

Reference     EmailAddress
   1123        bob.smith@yahoo.com
   1233        john.drako@gmail.com
   nan         jane.smith@example.com
   1334        samuel.manuel@yahoo.com

首先，将引用集作为键，新名称作为值，组装一个

dict

：

# Keep only rows where both the reference and the e-mail are present, then
# turn the Reference -> EmailAddress column pair into a plain dict.
complete_rows = df.dropna(subset=["Reference", "EmailAddress"])
email_by_reference = complete_rows.set_index("Reference")["EmailAddress"]
references = dict(email_by_reference)

请注意，此处的参考是

str

s。如果它们在原始数据中不是字符串，可以使用

astype(str)

然后使用

pathlib.Path

查找数据目录中的所有文件：

# Lazily iterate over every entry directly inside the data directory.
files = Path("../data/renames").glob("*")

重命名可以非常简单：

for file in files:
    # Use the mapped name when the stem is a known reference; otherwise the
    # stem maps to itself.
    current_stem = file.stem
    new_name = references.get(current_stem, current_stem)
    renamed = file.with_name(f"{new_name}{file.suffix}")
    file.rename(renamed)

references.get

要求输入新的文件名，如果找不到，则使用原始的stem

把“email associate”（我想是你的新名字吧？）添加到字典里，在字典里键是你的参考号，怎么样？这可能看起来像：

# Reference (as text) -> e-mail address, for every reference that has a file.
cor_dict = {}

# excelArray holds the references as text, so the column is compared in text
# form as well; comparing the raw numeric column with a string never matches.
reference_text = dfOne['Reference'].astype(str)

for i in excelArray:
    if i in filesArray:
        # .loc with a boolean mask selects the matching rows (.at only accepts
        # a single label); rows without an e-mail address are ignored.
        matches = dfOne.loc[reference_text == i, 'EmailAddress'].dropna()
        if not matches.empty:
            cor_dict[i] = matches.iloc[0]


# Folder that contains the documents (placeholder - replace with the real path).
path = 'path to file...'

for reference, email in cor_dict.items():
    filename = str(reference) + '.doc'
    new_filename = str(email).replace('@', '_') + '_.doc'

    filepath = os.path.join(path, filename)
    new_filepath = os.path.join(path, new_filename)

    # Rename using the full paths; the bare names would be resolved against
    # the current working directory instead of `path`.
    os.rename(filepath, new_filepath)

您可以使用

df.apply()

在数据帧中直接执行此操作：

编辑：

现在可以使用模块使用任何扩展（

.doc

，

.docx

）这是一种使用简单迭代的方法

Ex:

from os.path import dirname
import pandas as pd
from pathlib import Path
import os

def _reference_key(value):
    """Return a reference cell as the text used in file names.

    Excel numbers arrive as floats whenever the column also holds blanks,
    so 1123.0 must become "1123" (not "1123.0") to equal a file stem.
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


dfOne = pd.read_excel('Book2.xlsx', na_values=['NA'], usecols="A:D")

emailAddress = dfOne['EmailAddress']

# Drop incomplete rows *before* converting to text: astype(str) would turn a
# missing reference into the literal string "nan", which dropna cannot see.
complete_rows = dfOne.dropna(subset=["Reference", "EmailAddress"])

# Look-up table: reference number (as text) -> e-mail address.
references = {
    _reference_key(ref): email
    for ref, email in zip(complete_rows["Reference"], complete_rows["EmailAddress"])
}
print(references)

# Snapshot the directory listing first so that renaming does not disturb the
# iteration, and ignore anything that is not a regular file.
files = [entry for entry in Path("applicantCVs").glob("*") if entry.is_file()]

for file in files:
    new_name = references.get(file.stem)
    if new_name is None:
        # No spreadsheet row for this file; leave it untouched.
        continue
    file.rename(file.with_name(f"{new_name}{file.suffix}"))

import os

#Sample Data#
#dfOne = pd.DataFrame({'Reference': [1123, 1233, 1334, 4444, 5555],'EmailAddress': ["bob.smith@yahoo.com", "john.drako@gmail.com", "samuel.manuel@yahoo.com", np.nan, "samuel.manuel@yahoo.com"]})
dfOne = pd.read_excel('Book2.xlsx', na_values=['NA'], usecols = "A:D")
# Only the two columns that are actually used decide whether a row is usable;
# a blank in some other column must not discard the row.
dfOne.dropna(subset=["Reference", "EmailAddress"], inplace=True)


def _reference_key(value):
    """Return a reference cell as the text used in file names.

    A numeric column that contained blanks is read as float, so 1123.0 has
    to become "1123" (not "1123.0") to equal a file stem.
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


# Exact-match look-up table (reference text -> e-mail). A substring or regex
# search would let the file "12" match the reference "1123". setdefault keeps
# the first row when a reference is listed more than once.
email_by_reference = {}
for ref, address in zip(dfOne["Reference"], dfOne["EmailAddress"]):
    email_by_reference.setdefault(_reference_key(ref), address)

for root, dirs, files in os.walk("applicantsCVs"):
    for file in files:
        file_name, ext = os.path.splitext(file)
        email = email_by_reference.get(file_name)
        if email is None:
            # No spreadsheet row for this file; leave it untouched.
            continue
        os.rename(os.path.join(root, file), os.path.join(root, email + ext))

或者如果您只有

.docx

文件要重命名

import os

def _reference_key(value):
    """Return a reference cell as the text used in file names.

    A numeric column that contained blanks is read as float, so 1123.0 has
    to become "1123" (not "1123.0") to equal a file stem.
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


dfOne = pd.read_excel('Book2.xlsx', na_values=['NA'], usecols = "A:D")

# Remove incomplete rows first: once converted to text a missing reference
# becomes the string "nan" and would no longer be dropped.
dfOne.dropna(subset=["Reference", "EmailAddress"], inplace=True)
dfOne["Reference"] = dfOne["Reference"].map(_reference_key)

ext = ".docx"
for root, dirs, files in os.walk("applicantsCVs"):
    # Stems of the .docx files in this directory. Set membership is an exact
    # match, unlike an un-grouped r"\ba|b|c\b" pattern whose anchors apply
    # only to the first and last alternative.
    stems = {
        os.path.splitext(name)[0]
        for name in files
        if os.path.splitext(name)[1] == ext
    }
    present = dfOne[dfOne["Reference"].isin(stems)]
    # Select the two columns by name so the unpacking order cannot be wrong:
    # the old name is the reference, the new name is the e-mail address.
    for ref, email in zip(present["Reference"], present["EmailAddress"]):
        os.rename(os.path.join(root, ref + ext), os.path.join(root, email + ext))

让我们考虑Excel表中的示例数据如下：

Reference   EmailAddress
1123    bob.smith@yahoo.com
1233    john.drako@gmail.com
1334    samuel.manuel@yahoo.com
nan     python@gmail.com

解决此问题涉及以下步骤

第一步从excel工作表中正确导入数据

“my.xlsx”

。这里我使用的是样本数据

import pandas as pd
import os
# Read the sheet and discard every row that has a missing value in any column.
df = pd.read_excel('my.xlsx').dropna()
# Preview the first rows to check the data has the expected layout
# (the preview is only displayed in an interactive session / notebook).
df.head()

在这里，您将看到引用中的数据类型是float类型

步骤2 将引用列中的数据类型更改为整数，然后更改为字符串

# astype returns a new object and takes no `inplace` argument (passing one
# raises TypeError on current pandas), so the result is simply re-assigned.
# Going through int first turns 1123.0 into 1123 before it becomes text.
df['Reference'] = df['Reference'].astype(int)
# Convert every column to text so the values can be used in file names.
df = df.astype(str)
df.head()

现在数据的格式符合要求

步骤3 重命名所需文件夹中的文件。压缩要在for循环中使用的“Reference”和“EmailAddress”列表

# Absolute path to the folder holding the documents; this example assumes a
# folder named "applicant cv" under /home.
path_to_files = '/home/applicant cv/'

# Walk the two columns in lockstep: each reference names the existing file,
# the e-mail address on the same row is the new name.
for ref, email in zip(df['Reference'], df['EmailAddress']):
    old_path = os.path.join(path_to_files, ref + '.doc')
    new_path = os.path.join(path_to_files, email + '.doc')
    try:
        os.rename(old_path, new_path)
    except FileNotFoundError:
        # Only a missing source file is expected here; any other failure
        # (permissions, invalid name, ...) should surface instead of being
        # silently reported as "not in the list".
        print ("File name doesn't exist in the list, I am leaving it as it is")

步骤1:从excel工作表导入数据

“Book1.xlsx”

步骤2:选择

“.docx”

文件所在的路径并存储其名称。仅获取要比较的文件名的相关部分

from os import listdir, rename
from os.path import isfile, join

# Folder that holds the .docx files (placeholder path).
mypath = r'path of docx files\doc files'

# Names of the regular files directly inside mypath; sub-folders are skipped.
onlyfiles = []
for entry in listdir(mypath):
    if isfile(join(mypath, entry)):
        onlyfiles.append(entry)
#print(onlyfiles)

# Name of the first file, cut at its first dot.
currentfilename = onlyfiles[0].partition(".")[0]

步骤3:运行循环以检查名称是否与引用匹配。只需使用

os 模块中的 rename(src, dest) 函数
def _reference_key(value):
    """Return a reference cell as the text used in file names.

    A numeric column that contained blanks is read as float, so 1123.0 has
    to become "1123" (not "1123.0") to equal a file stem.
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


# Reference (as text) -> e-mail address, built from every complete row
# instead of a hard-coded number of rows.
complete_rows = df.dropna(subset=['Reference', 'EmailAddress'])
email_by_reference = dict(
    zip(complete_rows['Reference'].map(_reference_key), complete_rows['EmailAddress'])
)

# Check every listed file, not just the first one.
for name in onlyfiles:
    stem, dot, extension = name.rpartition(".")
    if not dot or extension != "docx":
        continue
    corrosponding_email = email_by_reference.get(stem)
    if corrosponding_email is None:
        # No matching reference: nothing to rename (previously this case
        # ended in a NameError because the variable was never assigned).
        continue
    rename(join(mypath, name), join(mypath, corrosponding_email + ".docx"))

使用示例签出代码：
您想将文件重命名为什么？您想将Word文档的内容匹配，还是只想将excel文件的引用
与Word文档的名称匹配？@MaartenFabré我想将文件重命名为CSV中的电子邮件地址列您可以使用pd.dropna（subset='EmailAddress'））要过滤空地址，我很难将您的更改集成到我的脚本中。很抱歉，响应太晚。我遇到的问题是，无论何时运行此代码，我都不会将文件重命名为相应的电子邮件地址，但文件仍然保持不变。它适用于我使用的测试文件。您可以从顶部开始诊断问题，并检查哪个部件出现故障。references
中的键是否正确且str
s，files
是否真的是一个包含所有文件的generator
等等。我发现问题在于new_name
只包含参考号。str
s和文件都是正确的。问题是 `file.rename(file.with_name(f"{new_name}{file.suffix}"))` 传入的 new_name 是参考号而不是电子邮件。电子邮件地址本身已经是正常的字符串，需要转换为字符串的是参考号
import pandas as pd
df = pd.read_excel (r'path of your file here\Book1.xlsx')
print (df)

mypath = r'path of docx files\doc files'
from os import listdir,rename
from os.path import isfile, join
# Names of the regular files directly inside mypath; sub-folders are skipped.
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
#print(onlyfiles)


def _reference_key(value):
    """Return a reference cell as the text used in file names.

    A numeric column that contained blanks is read as float, so 1123.0 has
    to become "1123" (not "1123.0") to equal a file stem.
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


# Reference (as text) -> e-mail address, built from every complete row
# instead of a hard-coded number of rows.
complete_rows = df.dropna(subset=['Reference', 'EmailAddress'])
email_by_reference = dict(
    zip(complete_rows['Reference'].map(_reference_key), complete_rows['EmailAddress'])
)

# Check every listed file, not just the first one; an empty folder simply
# results in no renames instead of an IndexError.
for name in onlyfiles:
    stem, dot, extension = name.rpartition(".")
    if not dot or extension != "docx":
        continue
    corrosponding_email = email_by_reference.get(stem)
    if corrosponding_email is None:
        # No matching reference: leave the file as it is.
        continue
    rename(join(mypath, name), join(mypath, corrosponding_email + ".docx"))