Python 可点击文本抓取
我正在努力抓取这个网站——https://www.realcommercial.com.au/sold/?autoSuggest=false&page=1 。我已成功地抓取了所有需要的数据,但电话号码是可单击文本。下面是我的代码:
import requests
import pandas as pd
from bs4 import BeautifulSoup
urls = []
for i in range(1,4):
pages = "https://www.realcommercial.com.au/sold/?autoSuggest=false&page={0}".format(i)
urls.append(pages)
Data = []
for info in urls:
page = requests.get(info)
soup = BeautifulSoup(page.content, 'html.parser')
links = soup.find_all('a', attrs ={'class' :'Address_link_1aaSW'})
hrefs = [x['href'] for x in links]
s = "https://www.realcommercial.com.au"
href = [s + x for x in hrefs ]
for m in href:
entry=[]
pages = requests.get(m)
soup_2 =BeautifulSoup(pages.content, 'html.parser')
Add_st = soup_2.find_all('h1', attrs={'class' :'Address_container_3HZgj'})
Address_Street = [Address_Street.text.strip() for Address_Street in Add_st]
Prop = soup_2.find_all('div', attrs={'class' :'PrimaryDetails_propertyTypes_1SLzV'})
Prop_Type = [Prop_Type.text.strip() for Prop_Type in Prop]
Dtnarea = soup_2.find_all('div', attrs ={'class' :'Attribute_attribute_3lq_3'})
Date = [Date.text.strip() for Date in Dtnarea]
Land_Area = [x for x in Date if x.startswith('Land Area')]
Floor_Area = [y for y in Date if y.startswith('Floor Area')]
Sold_date = [z for z in Date if z.startswith('Sold on')]
Agency = soup_2.find_all('a', attrs={'class' :'AgencyPanel_agencyNameLink_nCd-h'})
Agency_Name = [Agency_Name.text.strip() for Agency_Name in Agency]
Agent = soup_2.find_all('h4', attrs={'class' :'AgentDetails_name_23QWU'})
Agent_Name = [Agent_Name.text.strip() for Agent_Name in Agent]
phone = [broth['href'] for broth in soup_2.select("#wrapper > div > div.DetailContainer_detailPage_2O71T > div:nth-child(7) > div.AgencyPanel_wrapper_mVRp2 > div > ul > li > div.AgentDetails_details_28P5A > div.AgentDetails_actions_2CKN0 > a")]
Phone_Number = [Phone_Number.text.strip() for Phone_Number in Phone]
对于电话号码,我得到以下输出:-
0 [#, #, #]
1 [#, #]
2 [#, #]
3 [#]
4 [#]
5 [#, #]
6 [#, #]
7 [#, #]
8 [#, #]
9 [#, #]
10 [#, #]
11 [#, #]
12 [#, #]
13 [#, #]
14 [#, #]
15 [#, #]
16 [#, #, #, #]
17 [#, #]
18 [#, #]
如何单击文本并获取隐藏的电话号码
谢谢 我修改了你的代码
您可以先单击电话链接以显示selenium的电话号码
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
urls = []
for i in range(1,4):
pages = "https://www.realcommercial.com.au/sold/?autoSuggest=false&page={0}".format(i)
urls.append(pages)
Data = []
options = webdriver.ChromeOptions()
# hidden browser
options.add_argument('headless')
options.add_argument('window-size=1920x1080')
options.add_argument("disable-gpu")
browser = webdriver.Chrome(chrome_options=options)
for info in urls:
page = requests.get(info)
soup = BeautifulSoup(page.content, 'html.parser')
links = soup.find_all('a', attrs ={'class' :'Address_link_1aaSW'})
hrefs = [x['href'] for x in links]
s = "https://www.realcommercial.com.au"
href = [s + x for x in hrefs ]
print(href)
for m in href:
browser.get(m)
entry=[]
pages = requests.get(m)
soup_2 =BeautifulSoup(pages.content, 'html.parser')
# Find all phone number link elements
phone_links = browser.find_elements_by_css_selector('#wrapper > div > div.DetailContainer_detailPage_2O71T > div:nth-child(7) > div.AgencyPanel_wrapper_mVRp2 > div > ul > li > div.AgentDetails_details_28P5A > div.AgentDetails_actions_2CKN0 > a')
# Using JavaScript to click it -> execute_script("arguments[0].click();", link)
# You can also use selenium click() method -> "link.click()" if you want to simulate user behavior
# I prefer to use JS because it performs clicking better.
for link in phone_links:
browser.execute_script("arguments[0].click();", link)
# Retrieve phone number text elements from <span>
phones = browser.find_elements_by_css_selector('#wrapper > div > div.DetailContainer_detailPage_2O71T > div:nth-child(7) > div.AgencyPanel_wrapper_mVRp2 > div > ul > li > div.AgentDetails_details_28P5A > div.AgentDetails_actions_2CKN0 > a > span')
# Get the phone number text
for phone in phones:
print(phone.text)
我修改了你的代码
您可以先单击电话链接以显示selenium的电话号码
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
urls = []
for i in range(1,4):
pages = "https://www.realcommercial.com.au/sold/?autoSuggest=false&page={0}".format(i)
urls.append(pages)
Data = []
options = webdriver.ChromeOptions()
# hidden browser
options.add_argument('headless')
options.add_argument('window-size=1920x1080')
options.add_argument("disable-gpu")
browser = webdriver.Chrome(chrome_options=options)
for info in urls:
page = requests.get(info)
soup = BeautifulSoup(page.content, 'html.parser')
links = soup.find_all('a', attrs ={'class' :'Address_link_1aaSW'})
hrefs = [x['href'] for x in links]
s = "https://www.realcommercial.com.au"
href = [s + x for x in hrefs ]
print(href)
for m in href:
browser.get(m)
entry=[]
pages = requests.get(m)
soup_2 =BeautifulSoup(pages.content, 'html.parser')
# Find all phone number link elements
phone_links = browser.find_elements_by_css_selector('#wrapper > div > div.DetailContainer_detailPage_2O71T > div:nth-child(7) > div.AgencyPanel_wrapper_mVRp2 > div > ul > li > div.AgentDetails_details_28P5A > div.AgentDetails_actions_2CKN0 > a')
# Using JavaScript to click it -> execute_script("arguments[0].click();", link)
# You can also use selenium click() method -> "link.click()" if you want to simulate user behavior
# I prefer to use JS because it performs clicking better.
for link in phone_links:
browser.execute_script("arguments[0].click();", link)
# Retrieve phone number text elements from <span>
phones = browser.find_elements_by_css_selector('#wrapper > div > div.DetailContainer_detailPage_2O71T > div:nth-child(7) > div.AgencyPanel_wrapper_mVRp2 > div > ul > li > div.AgentDetails_details_28P5A > div.AgentDetails_actions_2CKN0 > a > span')
# Get the phone number text
for phone in phones:
print(phone.text)
电话号码来自使用列表 ID 作为参数的 API 调用。下面是你可以尝试的方案:属性信息实际上位于每个页面的 script 标记中,因此我提取这些列表和 id,并使用 id 作为键存储在字典中。然后,我使用 id 参数发出 API 请求,并从响应中提取电话号码、代理机构和单个代理人姓名。响应中还有更多信息(您需要的所有信息在此过程中都可以获得)。
非常感谢您的快速响应。但是,我在尝试运行时收到以下错误:WebDriverException: Message: 'chromedriver' executable needs to be in PATH。请问是否可以只用 BeautifulSoup 来完成?—— 可以这样指定驱动路径:browser = webdriver.Chrome(chrome_options=options, executable_path="your chromedriver path")。如果不用 Selenium,我还没有找到解决方案;我仍然收到以下错误:WebDriverException: Message: Service C:/Program Files (x86)/Google/Chrome/Application/chrome.exe unexpectedly exited. Status code was: 0
import requests,re,json
api_base = 'https://api.realcommercial.com.au/listing-ui/listings/'
p = re.compile(r'pageData = (.*?);', re.DOTALL)
results = {}
ids = []
with requests.Session() as s:
for page in range(4):
r = s.get(f'https://www.realcommercial.com.au/sold/?autoSuggest=false&page={page}')
soup = bs(r.content, 'lxml')
data = json.loads(p.findall(r.text)[0])
for item in data['exactMatchListings']:
item_id = item['id']
if item_id not in ids:
ids+=item_id
results[item['id']] = item
item_json = s.get(f'{api_base}{item_id}').json()['listing']['agencies'][0]
agency = item_json['name']
try:
named_agent = item_json['salespeople'][0]['name']
except:
named_agent = 'N/A'
tel = item_json['phone']['dial']
print(item_id, agency, named_agent, tel)