使用Python和Pandas重命名基于数据帧内容的文件

使用Python和Pandas重命名基于数据帧内容的文件,python,pandas,Python,Pandas,我试图读取xlsx文件,将列中的所有参考号与文件夹中的文件进行比较,如果它们对应,则将它们重命名为与参考号关联的电子邮件 Excel文件包含以下字段: Reference EmailAddress 1123 bob.smith@yahoo.com 1233 john.drako@gmail.com 1334 samuel.manuel@yahoo.com ... ..... 我的文件夹申请人仅包含名为参

我试图读取
xlsx
文件,将列中的所有参考号与文件夹中的文件进行比较,如果它们对应,则将它们重命名为与参考号关联的电子邮件

Excel文件包含以下字段:

 Reference     EmailAddress
   1123        bob.smith@yahoo.com
   1233        john.drako@gmail.com
   1334        samuel.manuel@yahoo.com
   ...         .....
我的文件夹
申请人
仅包含名为参考列的文档文件:

如何将
applicationSCVS
文件夹的内容与excel文件中的引用字段进行比较,如果匹配,则将所有文件重命名为相应的电子邮件地址

以下是我到目前为止所做的尝试:

import os
import pandas as pd

# Read columns A-D of the workbook; cells spelled 'NA' count as missing.
dfOne = pd.read_excel('Book2.xlsx', na_values=['NA'], usecols="A:D")
references = dfOne['Reference']
emailAddress = dfOne['EmailAddress']

# E-mail cells whose text form is not 'nan'.
cleanedEmailList = [address for address in emailAddress if str(address) != 'nan']
print(cleanedEmailList)

# Extension-less name of every file found under applicantCVs.
filesArray = []
for root, dirs, files in os.walk("applicantCVs"):
    for filename in files:
        print(filename)  # Original file name with type, e.g. 1233.doc
        stem, _extension = os.path.splitext(filename)
        filesArray.append(stem)

# Every reference rendered as text so it can be compared with the stems.
excelArray = [str(entry) for entry in references]

# Report each reference that has a file of the same name.
for candidate in excelArray:
    if candidate in filesArray:
        print(candidate, "corresponds to the file names")
我将引用名称与文件夹内容进行比较,如果相同,则将其打印出来:

 # Print every spreadsheet reference that also appears among the file stems.
 for i in excelArray:
        if i in filesArray:
            print(i, "corresponds to the file names")
我曾尝试使用
os.rename(filename,cleanedEmailList)
重命名它,但它不起作用,因为
cleanedEmailList
是一个电子邮件数组

如何匹配和重命名文件

更新:

from os.path import dirname
import pandas as pd
from pathlib import Path
import os

def _reference_key(value):
    """Return a reference the way it appears in a file name (1123.0 -> "1123")."""
    # A numeric column that contains NaN is read as float, so whole numbers
    # would otherwise be rendered as "1123.0" and never match a file stem.
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


# Read columns A-D of the workbook; cells spelled 'NA' count as missing.
dfOne = pd.read_excel('Book2.xlsx', na_values=['NA'], usecols="A:D")

# Select the complete rows BEFORE converting to text: once NaN has been cast
# to the string "nan", dropna() no longer recognises it as missing.
complete_rows = dfOne.dropna(subset=["Reference", "EmailAddress"])

emailAddress = dfOne['EmailAddress']
# The column is called "Reference"; `dfOne.references` is not a column.
reference = dfOne['Reference'] = dfOne['Reference'].map(_reference_key)

# Lookup table: reference (text) -> e-mail address.
references = dict(
    zip(complete_rows["Reference"].map(_reference_key), complete_rows["EmailAddress"])
)
print(references)
files = Path("applicantCVs").glob("*")

# Take a snapshot of the listing so renaming cannot disturb the iteration.
for file in list(files):
    new_name = references.get(file.stem)
    # Leave directories and files without a matching reference untouched.
    if new_name is None or not file.is_file():
        continue
    file.rename(file.with_name(f"{new_name}{file.suffix}"))
import os

#Sample Data#
#dfOne = pd.DataFrame({'Reference': [1123, 1233, 1334, 4444, 5555],'EmailAddress': ["bob.smith@yahoo.com", "john.drako@gmail.com", "samuel.manuel@yahoo.com", np.nan, "samuel.manuel@yahoo.com"]})
# Read columns A-D of the workbook; cells spelled 'NA' count as missing.
dfOne = pd.read_excel('Book2.xlsx', na_values=['NA'], usecols = "A:D")
dfOne.dropna(inplace=True)  #Drop rows with NaN

# Build an exact-match lookup once: reference (text) -> e-mail address.
# A substring test such as str.contains() would let a file named "12" match
# reference 1123, and it also treats the file name as a regular expression.
email_by_reference = {}
for ref, address in zip(dfOne['Reference'], dfOne['EmailAddress']):
    # A column that held NaN is read as float, so 1123 may arrive as 1123.0.
    if isinstance(ref, float) and ref.is_integer():
        ref = int(ref)
    # setdefault keeps the first row when a reference is listed twice.
    email_by_reference.setdefault(str(ref), address)

# NOTE(review): the question's own script walks "applicantCVs" (no "s").
# os.walk() yields nothing for a missing directory, so confirm the name.
for root, dirs, files in os.walk("applicantsCVs"):
    for file in files:
        file_name, ext = os.path.splitext(file)
        email = email_by_reference.get(file_name)
        if email is not None:
            os.rename(os.path.join(root, file), os.path.join(root, email+ext))
根据样本数据:

Reference     EmailAddress
   1123        bob.smith@yahoo.com
   1233        john.drako@gmail.com
   nan         jane.smith#example.com
   1334        samuel.manuel@yahoo.com
首先,将引用集作为键,新名称作为值,组装一个
dict

# Rows that have both fields, then a reference -> e-mail address mapping.
complete = df.dropna(subset=["Reference", "EmailAddress"])
email_by_ref = complete.set_index("Reference")["EmailAddress"]
references = dict(email_by_ref)
请注意,此处的参考是
str
s。如果它们不在原始数据库中,可以使用
astype(str)

然后使用
pathlib.Path
查找数据目录中的所有文件:

# Lazily list every entry directly inside the data directory.
data_dir = Path("../data/renames")
files = data_dir.glob("*")
重命名可以非常简单:

for file in files:
    # Look up the new stem; fall back to the current one when there is no entry.
    stem = references.get(file.stem, file.stem)
    target = file.with_name(f"{stem}{file.suffix}")
    file.rename(target)
references.get
要求输入新的文件名,如果找不到,则使用原始的stem

把“email associate”(我想是你的新名字吧?)添加到字典里,在字典里键是你的参考号,怎么样? 这可能看起来像:

# Maps each reference that has a file to its e-mail address.
cor_dict = {}

# excelArray holds the references as text, so compare against text as well;
# comparing a str with a numeric column never matches.
reference_text = dfOne['Reference'].astype(str)

for i in excelArray:
    if i in filesArray:
        # .at only accepts a single label; use a boolean mask with .loc and
        # take the first matching row.
        matches = dfOne.loc[reference_text == i, 'EmailAddress']
        if not matches.empty:
            cor_dict[i] = matches.iloc[0]


path = 'path to file...'
for ref, email in cor_dict.items():
    filename = str(ref)+'.doc'
    new_filename =  str(email).replace('@','_') + '_.doc'

    filepath = os.path.join(path, filename)
    new_filepath = os.path.join(path,new_filename)

    # Rename via the joined paths; the bare names would be resolved against
    # the current working directory instead of `path`.
    os.rename(filepath, new_filepath)

您可以使用
df.apply()
在数据帧中直接执行此操作:

编辑
现在可以使用模块使用任何扩展(
.doc
.docx
)这是一种使用简单迭代的方法

Ex:

from os.path import dirname
import pandas as pd
from pathlib import Path
import os

def _reference_key(value):
    """Return a reference the way it appears in a file name (1123.0 -> "1123")."""
    # A numeric column that contains NaN is read as float, so whole numbers
    # would otherwise be rendered as "1123.0" and never match a file stem.
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


# Read columns A-D of the workbook; cells spelled 'NA' count as missing.
dfOne = pd.read_excel('Book2.xlsx', na_values=['NA'], usecols="A:D")

# Select the complete rows BEFORE converting to text: once NaN has been cast
# to the string "nan", dropna() no longer recognises it as missing.
complete_rows = dfOne.dropna(subset=["Reference", "EmailAddress"])

emailAddress = dfOne['EmailAddress']
# The column is called "Reference"; `dfOne.references` is not a column.
reference = dfOne['Reference'] = dfOne['Reference'].map(_reference_key)

# Lookup table: reference (text) -> e-mail address.
references = dict(
    zip(complete_rows["Reference"].map(_reference_key), complete_rows["EmailAddress"])
)
print(references)
files = Path("applicantCVs").glob("*")

# Take a snapshot of the listing so renaming cannot disturb the iteration.
for file in list(files):
    new_name = references.get(file.stem)
    # Leave directories and files without a matching reference untouched.
    if new_name is None or not file.is_file():
        continue
    file.rename(file.with_name(f"{new_name}{file.suffix}"))
import os

#Sample Data#
#dfOne = pd.DataFrame({'Reference': [1123, 1233, 1334, 4444, 5555],'EmailAddress': ["bob.smith@yahoo.com", "john.drako@gmail.com", "samuel.manuel@yahoo.com", np.nan, "samuel.manuel@yahoo.com"]})
# Read columns A-D of the workbook; cells spelled 'NA' count as missing.
dfOne = pd.read_excel('Book2.xlsx', na_values=['NA'], usecols = "A:D")
dfOne.dropna(inplace=True)  #Drop rows with NaN

# Build an exact-match lookup once: reference (text) -> e-mail address.
# A substring test such as str.contains() would let a file named "12" match
# reference 1123, and it also treats the file name as a regular expression.
email_by_reference = {}
for ref, address in zip(dfOne['Reference'], dfOne['EmailAddress']):
    # A column that held NaN is read as float, so 1123 may arrive as 1123.0.
    if isinstance(ref, float) and ref.is_integer():
        ref = int(ref)
    # setdefault keeps the first row when a reference is listed twice.
    email_by_reference.setdefault(str(ref), address)

# NOTE(review): the question's own script walks "applicantCVs" (no "s").
# os.walk() yields nothing for a missing directory, so confirm the name.
for root, dirs, files in os.walk("applicantsCVs"):
    for file in files:
        file_name, ext = os.path.splitext(file)
        email = email_by_reference.get(file_name)
        if email is not None:
            os.rename(os.path.join(root, file), os.path.join(root, email+ext))

或者如果您只有
.docx
文件要重命名

import os

# Read columns A-D of the workbook; cells spelled 'NA' count as missing.
dfOne = pd.read_excel('Book2.xlsx', na_values=['NA'], usecols = "A:D")

# Drop incomplete rows first: once NaN has been cast to the string "nan",
# dropna() no longer treats it as missing.
dfOne.dropna(inplace=True)  #Drop rows with NaN
# Render references as they appear in file names (1123.0 -> "1123").
dfOne["Reference"] = [
    str(int(ref)) if isinstance(ref, float) and ref.is_integer() else str(ref)
    for ref in dfOne["Reference"]
]
ext = ".docx"
for root, dirs, files in os.walk("applicantsCVs"):
    # Stems of the .docx files in this directory. Exact set membership
    # replaces the former regex, whose \b anchors bound only to the first and
    # last alternatives and which matched everything for an empty directory.
    stems = {os.path.splitext(i)[0] for i in files if os.path.splitext(i)[1] == ext}
    # Read the two columns by name so reference and e-mail cannot be swapped.
    for ref, email in zip(dfOne["Reference"], dfOne["EmailAddress"]):
        if ref in stems:
            os.rename(os.path.join(root, ref+ext), os.path.join(root, email+ext))
            # The source file is gone now; a repeated reference must not retry.
            stems.discard(ref)

让我们考虑Excel表中的示例数据如下:

Reference   EmailAddress
1123    bob.smith@yahoo.com
1233    john.drako@gmail.com
1334    samuel.manuel@yahoo.com
nan     python@gmail.com
解决此问题涉及以下步骤

第一步 从excel工作表中正确导入数据
“my.xlsx”
。这里我使用的是样本数据

import pandas as pd
import os
# Load the Excel sheet, then discard every row that has a missing cell.
sheet = pd.read_excel('my.xlsx')
df = sheet.dropna()
# Preview the first rows to check that the data is in the desired format.
df.head()
在这里,您将看到引用中的数据类型是float类型

步骤2 将引用列中的数据类型更改为整数,然后更改为字符串

# astype() returns a new object and has no `inplace` parameter, so the
# result is assigned back explicitly.
# Float references (e.g. 1123.0) become integers first, so that their text
# form is "1123" rather than "1123.0".
df['Reference'] = df['Reference'].astype(int)
# Convert every column to text so the values can be concatenated into paths.
df = df.astype(str)
df.head()
现在数据的格式符合要求

步骤3 重命名所需文件夹中的文件。使用 zip 将“Reference”和“EmailAddress”两个列表配对,以便在for循环中一起遍历

#absolute path to folder. I consider you have the folder "application cv" in the home directory
path_to_files='/home/applicant cv/'
for ref, email in zip(df['Reference'], df['EmailAddress']):
    source = os.path.join(path_to_files, f"{ref}.doc")
    target = os.path.join(path_to_files, f"{email}.doc")
    try:
        os.rename(source, target)
    except FileNotFoundError:
        # No file is named after this reference; nothing to rename.
        print ("File name doesn't exist in the list, I am leaving it as it is")
    except OSError as err:
        # Report other failures (permissions, existing target, ...) instead of
        # mislabelling them as a missing file.
        print(f"Could not rename {source!r}: {err}")

步骤1:从excel工作表导入数据
“Book1.xlsx”

步骤2:选择
“.docx”
文件所在的路径并存储其名称。 仅获取要比较的文件名的相关部分

from os import listdir,rename
from os.path import isfile, join

# Folder that holds the .docx files (placeholder path).
mypath = r'path of docx files\doc files'
# Names of the regular files directly inside mypath; sub-folders are skipped.
onlyfiles = [entry for entry in listdir(mypath) if isfile(join(mypath, entry))]
#print(onlyfiles)
# Text before the first dot of the first listed file.
currentfilename, _dot, _rest = onlyfiles[0].partition(".")

步骤3:运行循环以检查名称是否与引用匹配。只需使用
os中的
rename(src,dest)
函数

# Walk every row rather than a hard-coded first three, and rename only when
# a reference actually matches (previously `rename` ran unconditionally after
# the loop and failed with NameError when nothing had matched).
for reference, corrosponding_email in zip(df['Reference'], df['EmailAddress']):
    #print(currentfilename, reference)
    if str(currentfilename)==str(reference):
        #print(mypath+"\\"+corrosponding_email)
        rename(mypath+"\\"+str(currentfilename)+".docx",mypath+"\\"+corrosponding_email+".docx")
        # The file has been renamed; stop at the first matching row.
        break

请查看带示例的完整代码:

您想将文件重命名为什么?您想将Word文档的内容匹配,还是只想将excel文件的
引用
与Word文档的名称匹配?@MaartenFabré 我想将文件重命名为CSV中电子邮件地址列里的值。您可以使用 df.dropna(subset=['EmailAddress']) 来过滤空地址。我很难将您的更改集成到我的脚本中。很抱歉,回复晚了。我遇到的问题是,无论何时运行此代码,文件都不会被重命名为相应的电子邮件地址,而是保持不变。它适用于我使用的测试文件。您可以从顶部开始诊断问题,并检查哪个部分出现故障。
references
中的键是否正确且
str
s,
files
是否真的是一个包含所有文件的
generator
等等。我发现问题在于
new_name
只包含参考号。
str
s和
文件都是正确的。问题是 `file.rename(file.with_name(f"{new_name}{file.suffix}"))` 传递
new_name
,作为引用号而不是电子邮件。电子邮件地址将是一个已经正常的字符串,它是您需要转换为字符串的引用
import pandas as pd
from os import listdir,rename
from os.path import isfile, join, splitext

# Load the sheet that maps each reference to an e-mail address.
df = pd.read_excel (r'path of your file here\Book1.xlsx')
print (df)
mypath = r'path of docx files\doc files'
# Names of the regular files directly inside mypath; sub-folders are skipped.
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
#print(onlyfiles)

# Build the lookup once: reference (text) -> e-mail address.
email_by_reference = {}
for reference, email in zip(df['Reference'], df['EmailAddress']):
    if pd.isna(reference) or pd.isna(email):
        continue  # incomplete row, nothing to rename to
    # A column that held NaN is read as float, so 1123 may arrive as 1123.0.
    if isinstance(reference, float) and reference.is_integer():
        reference = int(reference)
    # setdefault keeps the first row when a reference is listed twice.
    email_by_reference.setdefault(str(reference), email)

# Process every file in the folder, not just the first one listed.
for name in onlyfiles:
    currentfilename, extension = splitext(name)
    if extension != ".docx":
        continue
    corrosponding_email = email_by_reference.get(currentfilename)
    if corrosponding_email is None:
        continue  # no reference is named like this file; leave it alone
    #print(join(mypath, corrosponding_email))
    rename(join(mypath, name), join(mypath, corrosponding_email+".docx"))