Python 如何在组合对象之前处理索引器异常
我希望实现的目标:Python 如何在组合对象之前处理索引器异常,python,python-3.x,web-scraping,Python,Python 3.x,Web Scraping,我希望实现的目标: import ast import sys # Create empty lists [Global] jobs = [] names = [] dates = [] summaries = [] locations = [] # Function - Ingest parsed HTML data | Filter out required values def getJobs(parsedHTML): # Loop - Get job title f
import ast
import sys
# Create empty lists [Global]
jobs = []
names = []
dates = []
summaries = []
locations = []
# Function - Ingest parsed HTML data | Filter out required values
def getJobs(parsedHTML):
# Loop - Get job title
for div in parsedHTML.find_all(name='h2', attrs={'class':'title'}):
for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}):
val = str(a.getText().strip())
if val is None:
locations.append({"job-title": "null"})
else:
dictItem = {"job-title": f"{val}"}
jobs.append(dictItem)
# Loop - Get job poster's name
for div in parsedHTML.find_all(name='div', attrs={'class':'sjcl'}):
for span in div.find_all(name='span', attrs={'class':'company'}):
val = str(span.getText().strip())
if val is None:
locations.append({"company-name": "null"})
else:
dictItem = {"company-name": f"{val}"}
names.append(dictItem)
# Loop - Get the date the job post was created
for div in parsedHTML.find_all(name='div', attrs={'class':'result-link-bar'}):
for span in div.find_all(name='span', attrs={'class':'date date-a11y'}):
val = str(span.getText().strip())
if val is None:
locations.append({"date-created": "null"})
else:
dictItem = {"date-created": f"{val}"}
dates.append(dictItem)
# Loop - Get short job description
for divParent in parsedHTML.find_all(name='div', attrs={'class':'result'}):
for divChild in divParent.find_all(name='div', attrs={'class':'summary'}):
val = str(divChild.getText().strip())
if val is None:
locations.append({"short-description": "null"})
else:
dictItem = {"short-description": f"{val}"}
summaries.append(dictItem)
# Loop - Get job location
for div in parsedHTML.find_all(name='div', attrs={'class':'sjcl'}):
for span in div.find_all(name='span', attrs={'class':'location'}):
val = str(span.getText().strip())
if val is None:
locations.append({"location": "null"})
else:
dictItem = {"location": f"{val}"}
locations.append(dictItem)
# Function - Generate test data
def testData(parsedHTML, typeProc):
# typeProc == True | Export data to text files
if typeProc:
#getJobs(parsedHTML)
with open("jobs.txt", "w") as file:
for line in jobs:
file.write(str(line))
file.write("\n")
file.close()
with open("names.txt", "w") as file:
for line in names:
file.write(str(line))
file.write("\n")
file.close()
with open("dates.txt", "w") as file:
for line in dates:
file.write(str(line))
file.write("\n")
file.close()
with open("summaries.txt", "w") as file:
for line in summaries:
file.write(str(line))
file.write("\n")
file.close()
with open("locations.txt", "w") as file:
for line in locations:
file.write(str(line))
file.write("\n")
file.close()
# typeProc == False | Import data from txt files, convert to dictionary and append to list
elif typeProc == False:
with open("jobs.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
jobs.append(content[i])
file.close()
with open("names.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
names.append(content[i])
file.close()
with open("dates.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
dates.append(content[i])
file.close()
with open("summaries.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
summaries.append(content[i])
file.close()
with open("locations.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
locations.append(content[i])
file.close()
# Else | If this else is hit, something is greatly fvcked
else:
print("Function: testData | Error: if statement else output")
sys.exit(1)
# Function - Remove items from all lists
def wipeLists():
jobs.clear()
names.clear()
dates.clear()
summaries.clear()
locations.clear()
# Function - JSON Blob Generator
def genJSON(parsedHTML):
# Testing with cached local IRL data
#testData(parsedHTML, False)
getJobs(parsedHTML)
jsonBlob = []
# Merge dictionaries | Combining dictionaries into single object + Append to jsonBlob list
for i in range(len(jobs)):
sumObj = {**jobs[i], **names[i], **dates[i], **summaries[i], **locations[i]}
#sumObj = {**jobs[i], **names[i], **dates[i], **summaries[i]}
jsonBlob.append(sumObj)
return jsonBlob
# Loop - Get job location
for div in parsedHTML.find_all(name='div', attrs={'class':'sjcl'}):
for divChild in div.find_all(name='div', attrs={'class':'recJobLoc'}):
dictItem = {"location": f"{divChild['data-rc-loc']}"}
locations.append(dictItem)
下面添加的代码过滤经过解析的HTML页面以查找特定值。然后以字典的形式将每个特定值添加到其自己的特定列表中。一旦所有的值都添加到列表中,其中的字典就会合并成一个JSON blob,然后我就可以导出了
注意-这是quick PoC的一部分,所以它写得又快又脏。原谅我
我的问题:
import ast
import sys
# Create empty lists [Global]
jobs = []
names = []
dates = []
summaries = []
locations = []
# Function - Ingest parsed HTML data | Filter out required values
def getJobs(parsedHTML):
# Loop - Get job title
for div in parsedHTML.find_all(name='h2', attrs={'class':'title'}):
for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}):
val = str(a.getText().strip())
if val is None:
locations.append({"job-title": "null"})
else:
dictItem = {"job-title": f"{val}"}
jobs.append(dictItem)
# Loop - Get job poster's name
for div in parsedHTML.find_all(name='div', attrs={'class':'sjcl'}):
for span in div.find_all(name='span', attrs={'class':'company'}):
val = str(span.getText().strip())
if val is None:
locations.append({"company-name": "null"})
else:
dictItem = {"company-name": f"{val}"}
names.append(dictItem)
# Loop - Get the date the job post was created
for div in parsedHTML.find_all(name='div', attrs={'class':'result-link-bar'}):
for span in div.find_all(name='span', attrs={'class':'date date-a11y'}):
val = str(span.getText().strip())
if val is None:
locations.append({"date-created": "null"})
else:
dictItem = {"date-created": f"{val}"}
dates.append(dictItem)
# Loop - Get short job description
for divParent in parsedHTML.find_all(name='div', attrs={'class':'result'}):
for divChild in divParent.find_all(name='div', attrs={'class':'summary'}):
val = str(divChild.getText().strip())
if val is None:
locations.append({"short-description": "null"})
else:
dictItem = {"short-description": f"{val}"}
summaries.append(dictItem)
# Loop - Get job location
for div in parsedHTML.find_all(name='div', attrs={'class':'sjcl'}):
for span in div.find_all(name='span', attrs={'class':'location'}):
val = str(span.getText().strip())
if val is None:
locations.append({"location": "null"})
else:
dictItem = {"location": f"{val}"}
locations.append(dictItem)
# Function - Generate test data
def testData(parsedHTML, typeProc):
# typeProc == True | Export data to text files
if typeProc:
#getJobs(parsedHTML)
with open("jobs.txt", "w") as file:
for line in jobs:
file.write(str(line))
file.write("\n")
file.close()
with open("names.txt", "w") as file:
for line in names:
file.write(str(line))
file.write("\n")
file.close()
with open("dates.txt", "w") as file:
for line in dates:
file.write(str(line))
file.write("\n")
file.close()
with open("summaries.txt", "w") as file:
for line in summaries:
file.write(str(line))
file.write("\n")
file.close()
with open("locations.txt", "w") as file:
for line in locations:
file.write(str(line))
file.write("\n")
file.close()
# typeProc == False | Import data from txt files, convert to dictionary and append to list
elif typeProc == False:
with open("jobs.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
jobs.append(content[i])
file.close()
with open("names.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
names.append(content[i])
file.close()
with open("dates.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
dates.append(content[i])
file.close()
with open("summaries.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
summaries.append(content[i])
file.close()
with open("locations.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
locations.append(content[i])
file.close()
# Else | If this else is hit, something is greatly fvcked
else:
print("Function: testData | Error: if statement else output")
sys.exit(1)
# Function - Remove items from all lists
def wipeLists():
jobs.clear()
names.clear()
dates.clear()
summaries.clear()
locations.clear()
# Function - JSON Blob Generator
def genJSON(parsedHTML):
# Testing with cached local IRL data
#testData(parsedHTML, False)
getJobs(parsedHTML)
jsonBlob = []
# Merge dictionaries | Combining dictionaries into single object + Append to jsonBlob list
for i in range(len(jobs)):
sumObj = {**jobs[i], **names[i], **dates[i], **summaries[i], **locations[i]}
#sumObj = {**jobs[i], **names[i], **dates[i], **summaries[i]}
jsonBlob.append(sumObj)
return jsonBlob
# Loop - Get job location
for div in parsedHTML.find_all(name='div', attrs={'class':'sjcl'}):
for divChild in div.find_all(name='div', attrs={'class':'recJobLoc'}):
dictItem = {"location": f"{divChild['data-rc-loc']}"}
locations.append(dictItem)
将以下列表和字典组合在一起时,我在导出blob时不会遇到任何问题:
- 工作
- 名字
- 日期
- 摘要
import ast
import sys
# Create empty lists [Global]
jobs = []
names = []
dates = []
summaries = []
locations = []
# Function - Ingest parsed HTML data | Filter out required values
def getJobs(parsedHTML):
# Loop - Get job title
for div in parsedHTML.find_all(name='h2', attrs={'class':'title'}):
for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}):
val = str(a.getText().strip())
if val is None:
locations.append({"job-title": "null"})
else:
dictItem = {"job-title": f"{val}"}
jobs.append(dictItem)
# Loop - Get job poster's name
for div in parsedHTML.find_all(name='div', attrs={'class':'sjcl'}):
for span in div.find_all(name='span', attrs={'class':'company'}):
val = str(span.getText().strip())
if val is None:
locations.append({"company-name": "null"})
else:
dictItem = {"company-name": f"{val}"}
names.append(dictItem)
# Loop - Get the date the job post was created
for div in parsedHTML.find_all(name='div', attrs={'class':'result-link-bar'}):
for span in div.find_all(name='span', attrs={'class':'date date-a11y'}):
val = str(span.getText().strip())
if val is None:
locations.append({"date-created": "null"})
else:
dictItem = {"date-created": f"{val}"}
dates.append(dictItem)
# Loop - Get short job description
for divParent in parsedHTML.find_all(name='div', attrs={'class':'result'}):
for divChild in divParent.find_all(name='div', attrs={'class':'summary'}):
val = str(divChild.getText().strip())
if val is None:
locations.append({"short-description": "null"})
else:
dictItem = {"short-description": f"{val}"}
summaries.append(dictItem)
# Loop - Get job location
for div in parsedHTML.find_all(name='div', attrs={'class':'sjcl'}):
for span in div.find_all(name='span', attrs={'class':'location'}):
val = str(span.getText().strip())
if val is None:
locations.append({"location": "null"})
else:
dictItem = {"location": f"{val}"}
locations.append(dictItem)
# Function - Generate test data
def testData(parsedHTML, typeProc):
# typeProc == True | Export data to text files
if typeProc:
#getJobs(parsedHTML)
with open("jobs.txt", "w") as file:
for line in jobs:
file.write(str(line))
file.write("\n")
file.close()
with open("names.txt", "w") as file:
for line in names:
file.write(str(line))
file.write("\n")
file.close()
with open("dates.txt", "w") as file:
for line in dates:
file.write(str(line))
file.write("\n")
file.close()
with open("summaries.txt", "w") as file:
for line in summaries:
file.write(str(line))
file.write("\n")
file.close()
with open("locations.txt", "w") as file:
for line in locations:
file.write(str(line))
file.write("\n")
file.close()
# typeProc == False | Import data from txt files, convert to dictionary and append to list
elif typeProc == False:
with open("jobs.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
jobs.append(content[i])
file.close()
with open("names.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
names.append(content[i])
file.close()
with open("dates.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
dates.append(content[i])
file.close()
with open("summaries.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
summaries.append(content[i])
file.close()
with open("locations.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
locations.append(content[i])
file.close()
# Else | If this else is hit, something is greatly fvcked
else:
print("Function: testData | Error: if statement else output")
sys.exit(1)
# Function - Remove items from all lists
def wipeLists():
jobs.clear()
names.clear()
dates.clear()
summaries.clear()
locations.clear()
# Function - JSON Blob Generator
def genJSON(parsedHTML):
# Testing with cached local IRL data
#testData(parsedHTML, False)
getJobs(parsedHTML)
jsonBlob = []
# Merge dictionaries | Combining dictionaries into single object + Append to jsonBlob list
for i in range(len(jobs)):
sumObj = {**jobs[i], **names[i], **dates[i], **summaries[i], **locations[i]}
#sumObj = {**jobs[i], **names[i], **dates[i], **summaries[i]}
jsonBlob.append(sumObj)
return jsonBlob
# Loop - Get job location
for div in parsedHTML.find_all(name='div', attrs={'class':'sjcl'}):
for divChild in div.find_all(name='div', attrs={'class':'recJobLoc'}):
dictItem = {"location": f"{divChild['data-rc-loc']}"}
locations.append(dictItem)
我发现,有时找不到该值,其原因不受我控制,即该值在创建职位时就没有被用户填写。本例中的问题是位置(locations)列表的 len 为 14,而其他列表的 len 为 15,这导致在使用 for 循环组合列表时出现 IndexError 异常
我的问题:
import ast
import sys
# Create empty lists [Global]
jobs = []
names = []
dates = []
summaries = []
locations = []
# Function - Ingest parsed HTML data | Filter out required values
def getJobs(parsedHTML):
# Loop - Get job title
for div in parsedHTML.find_all(name='h2', attrs={'class':'title'}):
for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}):
val = str(a.getText().strip())
if val is None:
locations.append({"job-title": "null"})
else:
dictItem = {"job-title": f"{val}"}
jobs.append(dictItem)
# Loop - Get job poster's name
for div in parsedHTML.find_all(name='div', attrs={'class':'sjcl'}):
for span in div.find_all(name='span', attrs={'class':'company'}):
val = str(span.getText().strip())
if val is None:
locations.append({"company-name": "null"})
else:
dictItem = {"company-name": f"{val}"}
names.append(dictItem)
# Loop - Get the date the job post was created
for div in parsedHTML.find_all(name='div', attrs={'class':'result-link-bar'}):
for span in div.find_all(name='span', attrs={'class':'date date-a11y'}):
val = str(span.getText().strip())
if val is None:
locations.append({"date-created": "null"})
else:
dictItem = {"date-created": f"{val}"}
dates.append(dictItem)
# Loop - Get short job description
for divParent in parsedHTML.find_all(name='div', attrs={'class':'result'}):
for divChild in divParent.find_all(name='div', attrs={'class':'summary'}):
val = str(divChild.getText().strip())
if val is None:
locations.append({"short-description": "null"})
else:
dictItem = {"short-description": f"{val}"}
summaries.append(dictItem)
# Loop - Get job location
for div in parsedHTML.find_all(name='div', attrs={'class':'sjcl'}):
for span in div.find_all(name='span', attrs={'class':'location'}):
val = str(span.getText().strip())
if val is None:
locations.append({"location": "null"})
else:
dictItem = {"location": f"{val}"}
locations.append(dictItem)
# Function - Generate test data
def testData(parsedHTML, typeProc):
# typeProc == True | Export data to text files
if typeProc:
#getJobs(parsedHTML)
with open("jobs.txt", "w") as file:
for line in jobs:
file.write(str(line))
file.write("\n")
file.close()
with open("names.txt", "w") as file:
for line in names:
file.write(str(line))
file.write("\n")
file.close()
with open("dates.txt", "w") as file:
for line in dates:
file.write(str(line))
file.write("\n")
file.close()
with open("summaries.txt", "w") as file:
for line in summaries:
file.write(str(line))
file.write("\n")
file.close()
with open("locations.txt", "w") as file:
for line in locations:
file.write(str(line))
file.write("\n")
file.close()
# typeProc == False | Import data from txt files, convert to dictionary and append to list
elif typeProc == False:
with open("jobs.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
jobs.append(content[i])
file.close()
with open("names.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
names.append(content[i])
file.close()
with open("dates.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
dates.append(content[i])
file.close()
with open("summaries.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
summaries.append(content[i])
file.close()
with open("locations.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
locations.append(content[i])
file.close()
# Else | If this else is hit, something is greatly fvcked
else:
print("Function: testData | Error: if statement else output")
sys.exit(1)
# Function - Remove items from all lists
def wipeLists():
jobs.clear()
names.clear()
dates.clear()
summaries.clear()
locations.clear()
# Function - JSON Blob Generator
def genJSON(parsedHTML):
# Testing with cached local IRL data
#testData(parsedHTML, False)
getJobs(parsedHTML)
jsonBlob = []
# Merge dictionaries | Combining dictionaries into single object + Append to jsonBlob list
for i in range(len(jobs)):
sumObj = {**jobs[i], **names[i], **dates[i], **summaries[i], **locations[i]}
#sumObj = {**jobs[i], **names[i], **dates[i], **summaries[i]}
jsonBlob.append(sumObj)
return jsonBlob
# Loop - Get job location
for div in parsedHTML.find_all(name='div', attrs={'class':'sjcl'}):
for divChild in div.find_all(name='div', attrs={'class':'recJobLoc'}):
dictItem = {"location": f"{divChild['data-rc-loc']}"}
locations.append(dictItem)
如上面的代码所示,我试图在未找到抓取值时分配一个占位符值 "null" 来处理这个问题,但由于某种原因,该占位符没有生效,仍然遇到 IndexError 异常。任何帮助都将不胜感激,提前谢谢你
我的代码:
import ast
import sys
# Create empty lists [Global]
jobs = []
names = []
dates = []
summaries = []
locations = []
# Function - Ingest parsed HTML data | Filter out required values
def getJobs(parsedHTML):
# Loop - Get job title
for div in parsedHTML.find_all(name='h2', attrs={'class':'title'}):
for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}):
val = str(a.getText().strip())
if val is None:
locations.append({"job-title": "null"})
else:
dictItem = {"job-title": f"{val}"}
jobs.append(dictItem)
# Loop - Get job poster's name
for div in parsedHTML.find_all(name='div', attrs={'class':'sjcl'}):
for span in div.find_all(name='span', attrs={'class':'company'}):
val = str(span.getText().strip())
if val is None:
locations.append({"company-name": "null"})
else:
dictItem = {"company-name": f"{val}"}
names.append(dictItem)
# Loop - Get the date the job post was created
for div in parsedHTML.find_all(name='div', attrs={'class':'result-link-bar'}):
for span in div.find_all(name='span', attrs={'class':'date date-a11y'}):
val = str(span.getText().strip())
if val is None:
locations.append({"date-created": "null"})
else:
dictItem = {"date-created": f"{val}"}
dates.append(dictItem)
# Loop - Get short job description
for divParent in parsedHTML.find_all(name='div', attrs={'class':'result'}):
for divChild in divParent.find_all(name='div', attrs={'class':'summary'}):
val = str(divChild.getText().strip())
if val is None:
locations.append({"short-description": "null"})
else:
dictItem = {"short-description": f"{val}"}
summaries.append(dictItem)
# Loop - Get job location
for div in parsedHTML.find_all(name='div', attrs={'class':'sjcl'}):
for span in div.find_all(name='span', attrs={'class':'location'}):
val = str(span.getText().strip())
if val is None:
locations.append({"location": "null"})
else:
dictItem = {"location": f"{val}"}
locations.append(dictItem)
# Function - Generate test data
def testData(parsedHTML, typeProc):
# typeProc == True | Export data to text files
if typeProc:
#getJobs(parsedHTML)
with open("jobs.txt", "w") as file:
for line in jobs:
file.write(str(line))
file.write("\n")
file.close()
with open("names.txt", "w") as file:
for line in names:
file.write(str(line))
file.write("\n")
file.close()
with open("dates.txt", "w") as file:
for line in dates:
file.write(str(line))
file.write("\n")
file.close()
with open("summaries.txt", "w") as file:
for line in summaries:
file.write(str(line))
file.write("\n")
file.close()
with open("locations.txt", "w") as file:
for line in locations:
file.write(str(line))
file.write("\n")
file.close()
# typeProc == False | Import data from txt files, convert to dictionary and append to list
elif typeProc == False:
with open("jobs.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
jobs.append(content[i])
file.close()
with open("names.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
names.append(content[i])
file.close()
with open("dates.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
dates.append(content[i])
file.close()
with open("summaries.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
summaries.append(content[i])
file.close()
with open("locations.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
locations.append(content[i])
file.close()
# Else | If this else is hit, something is greatly fvcked
else:
print("Function: testData | Error: if statement else output")
sys.exit(1)
# Function - Remove items from all lists
def wipeLists():
jobs.clear()
names.clear()
dates.clear()
summaries.clear()
locations.clear()
# Function - JSON Blob Generator
def genJSON(parsedHTML):
# Testing with cached local IRL data
#testData(parsedHTML, False)
getJobs(parsedHTML)
jsonBlob = []
# Merge dictionaries | Combining dictionaries into single object + Append to jsonBlob list
for i in range(len(jobs)):
sumObj = {**jobs[i], **names[i], **dates[i], **summaries[i], **locations[i]}
#sumObj = {**jobs[i], **names[i], **dates[i], **summaries[i]}
jsonBlob.append(sumObj)
return jsonBlob
# Loop - Get job location
for div in parsedHTML.find_all(name='div', attrs={'class':'sjcl'}):
for divChild in div.find_all(name='div', attrs={'class':'recJobLoc'}):
dictItem = {"location": f"{divChild['data-rc-loc']}"}
locations.append(dictItem)
感谢@pavel就如何处理这一问题所作的说明。我发现我要查找的值在创建时实际上是一个必填字段,出于某种原因,我在筛选解析数据时没有获得正确的值量 我再次查看了页面的源代码,发现有另一个字段具有我要查找的确切值。因此,现在不是获取父div中span元素的文本,而是获取父div元素的自定义data-*属性值。我在测试时没有遇到任何错误 更新代码:
import ast
import sys
# Create empty lists [Global]
jobs = []
names = []
dates = []
summaries = []
locations = []
# Function - Ingest parsed HTML data | Filter out required values
def getJobs(parsedHTML):
# Loop - Get job title
for div in parsedHTML.find_all(name='h2', attrs={'class':'title'}):
for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}):
val = str(a.getText().strip())
if val is None:
locations.append({"job-title": "null"})
else:
dictItem = {"job-title": f"{val}"}
jobs.append(dictItem)
# Loop - Get job poster's name
for div in parsedHTML.find_all(name='div', attrs={'class':'sjcl'}):
for span in div.find_all(name='span', attrs={'class':'company'}):
val = str(span.getText().strip())
if val is None:
locations.append({"company-name": "null"})
else:
dictItem = {"company-name": f"{val}"}
names.append(dictItem)
# Loop - Get the date the job post was created
for div in parsedHTML.find_all(name='div', attrs={'class':'result-link-bar'}):
for span in div.find_all(name='span', attrs={'class':'date date-a11y'}):
val = str(span.getText().strip())
if val is None:
locations.append({"date-created": "null"})
else:
dictItem = {"date-created": f"{val}"}
dates.append(dictItem)
# Loop - Get short job description
for divParent in parsedHTML.find_all(name='div', attrs={'class':'result'}):
for divChild in divParent.find_all(name='div', attrs={'class':'summary'}):
val = str(divChild.getText().strip())
if val is None:
locations.append({"short-description": "null"})
else:
dictItem = {"short-description": f"{val}"}
summaries.append(dictItem)
# Loop - Get job location
for div in parsedHTML.find_all(name='div', attrs={'class':'sjcl'}):
for span in div.find_all(name='span', attrs={'class':'location'}):
val = str(span.getText().strip())
if val is None:
locations.append({"location": "null"})
else:
dictItem = {"location": f"{val}"}
locations.append(dictItem)
# Function - Generate test data
def testData(parsedHTML, typeProc):
# typeProc == True | Export data to text files
if typeProc:
#getJobs(parsedHTML)
with open("jobs.txt", "w") as file:
for line in jobs:
file.write(str(line))
file.write("\n")
file.close()
with open("names.txt", "w") as file:
for line in names:
file.write(str(line))
file.write("\n")
file.close()
with open("dates.txt", "w") as file:
for line in dates:
file.write(str(line))
file.write("\n")
file.close()
with open("summaries.txt", "w") as file:
for line in summaries:
file.write(str(line))
file.write("\n")
file.close()
with open("locations.txt", "w") as file:
for line in locations:
file.write(str(line))
file.write("\n")
file.close()
# typeProc == False | Import data from txt files, convert to dictionary and append to list
elif typeProc == False:
with open("jobs.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
jobs.append(content[i])
file.close()
with open("names.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
names.append(content[i])
file.close()
with open("dates.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
dates.append(content[i])
file.close()
with open("summaries.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
summaries.append(content[i])
file.close()
with open("locations.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
locations.append(content[i])
file.close()
# Else | If this else is hit, something is greatly fvcked
else:
print("Function: testData | Error: if statement else output")
sys.exit(1)
# Function - Remove items from all lists
def wipeLists():
jobs.clear()
names.clear()
dates.clear()
summaries.clear()
locations.clear()
# Function - JSON Blob Generator
def genJSON(parsedHTML):
# Testing with cached local IRL data
#testData(parsedHTML, False)
getJobs(parsedHTML)
jsonBlob = []
# Merge dictionaries | Combining dictionaries into single object + Append to jsonBlob list
for i in range(len(jobs)):
sumObj = {**jobs[i], **names[i], **dates[i], **summaries[i], **locations[i]}
#sumObj = {**jobs[i], **names[i], **dates[i], **summaries[i]}
jsonBlob.append(sumObj)
return jsonBlob
# Loop - Get job location
for div in parsedHTML.find_all(name='div', attrs={'class':'sjcl'}):
for divChild in div.find_all(name='div', attrs={'class':'recJobLoc'}):
dictItem = {"location": f"{divChild['data-rc-loc']}"}
locations.append(dictItem)
感谢每一位试图帮助我们的人。这已经解决。如果我是你,我担心的不是处理错误,而是解决问题。所以有时候位置不见了?是哪条记录缺失了?是不是第一条记录缺失了?那么所有的记录都会有不正确的位置。事实上,IndexError 正在帮助您避免造成混乱。如果您试图解析的记录中没有"位置"项,"null" 占位符将无济于事。@pavel - 问题已解决,谢谢您的评论。我还添加了我自己的答案。