Python中的黄页刮刀不工作
我正试图从黄页上搜集数据。我用过很多次这个刮刀,但最近已经停了 得到了这个错误Python中的黄页刮刀不工作,python,web-scraping,yellow-pages,Python,Web Scraping,Yellow Pages,我正试图从黄页上搜集数据。我用过很多次这个刮刀,但最近已经停了 得到了这个错误 “非类型”对象没有找到属性“组”0结果 有人能帮我解决这个问题吗 我在这件事上哪里做错了 import requests import requests_random_user_agent import urllib.parse from bs4 import BeautifulSoup import re from math import ceil import csv import os import sys i
“非类型”对象没有找到属性“组”0结果
有人能帮我解决这个问题吗
我在这件事上哪里做错了
import requests
import requests_random_user_agent
import urllib.parse
from bs4 import BeautifulSoup
import re
from math import ceil
import csv
import os
import sys
import subprocess
from os import system, name
import time
from tqdm import tqdm
class Scraper:
def __init__(self,keyword,location):
self.keyword=keyword
self.location=location
self.params = urllib.parse.urlencode({"search_terms": self.keyword, "geo_location_terms": self.location})
def get_info(self, link):
try:
r = requests.get(link)
html = BeautifulSoup(r.content, "html.parser")
except:
return False
try:
name = html.find('h1').text
except:
name = None
try:
phone = html.find(class_='phone').text
except:
phone = None
try:
website = html.find('a',class_='primary-btn website-link')["href"]
if len(website.split("?")) > 1:
website = website.split("?")[0]
except:
website = None
try:
email = html.find('a', class_='email-business')["href"].split(":")[1]
except:
email=None
try:
address = html.find('h2',class_='address').text
except:
address=None
return {"name": name, "email": email, "phone": phone, "address": address, "website":website}
def get_num_pages(self):
try:
url = f"https://www.yellowpages.com/search?{self.params}"
response = requests.get(url)
html = BeautifulSoup(response.content, "html.parser")
pagination = html.find(class_="pagination")
if not pagination:
pagination = 1
links = html.select("a[class='business-name']")
num_results = 0
for l in links:
try:
l["data-analytics"]
num_results += 1
except:
continue
return num_results, pagination
num_results = int(re.search('We found(.*?)results',pagination.text).group(1))
return num_results, int(ceil(int(num_results) / 30))
except Exception as e:
print(e)
return False, False
def get_links(self, page):
try:
url = f"https://www.yellowpages.com/search?{self.params}&page={page}"
response = requests.request("GET", url, timeout=10)
html = BeautifulSoup(response.content, "html.parser")
links = html.select("a[class='business-name']")
links_filtered=[]
for l in links:
try:
l["data-analytics"]
links_filtered.append(l)
except:
continue
links_list = []
for link in links_filtered:
links_list.append(f"https://www.yellowpages.com{link['href']}")
return links_list
except Exception as e:
print(e)
return []
def open_file(filename):
try:
if sys.platform == "win32":
os.startfile(filename)
else:
opener = "open" if sys.platform == "darwin" else "xdg-open"
subprocess.call([opener, filename])
except:
return False
def create_csv(elements):
row_list = [["Name", "Address", "Phone", "Email", "Website"]]
for e in elements:
name = e["name"]
address = e["address"]
phone = e["phone"]
email = e["email"]
website = e["website"]
row_list.append([name, address, phone, email, website])
with open('output.csv', 'w', newline='', encoding='utf8') as file:
writer = csv.writer(file)
writer.writerows(row_list)
def clear():
# for windows
if name == 'nt':
_ = system('cls')
# for mac and linux(here, os.name is 'posix')
else:
_ = system('clear')
def main():
clear()
try:
while True:
keyword = input("Keyword: ")
if keyword != "":
break
while True:
city = input("City: ")
if city != "":
break
clear()
scraper = Scraper(keyword, city)
results, num_pages = scraper.get_num_pages()
if not results:
print("0 results found")
return False
print(f"{results} results found {keyword} - {city}")
data = []
pages = tqdm(range(1, num_pages + 1))
for page in pages:
clear()
try:
pages.set_description(f"Scraping page {page}/{num_pages}...")
links = scraper.get_links(page)
if not (len(links) > 0):
continue
links = tqdm(links)
for link in links:
try:
links.set_description(f"Scraping {link}")
info = scraper.get_info(link)
# print(info)
data.append(info)
create_csv(data)
except:
continue
except:
continue
print("Opening file...")
open_file("output.csv")
print("Task finished")
except:
return False
if __name__ == "__main__":
main()
它在线路上失败了
num_results = int(re.search('We found(.*?)results',pagination.text).group(1))
打开浏览器,对搜索结果页面进行一次非常简单的检查,就会发现页面上没有文本“We found x results”。因此,re.search
返回None
,即使有许多结果
调整脚本,使其在没有num_pages
的情况下工作,并且仅通过底部的页面链接或通过增加URL中的page=
参数进行分页,直到不再列出更多的结果/页面
仅供参考,下一次,投入一些最小的调试工作,而不是发布整个脚本