Python 可点击文本抓取
我正在努力抓取这个网站——https://www.realcommercial.com.au/sold/?autoSuggest=false&page=1 。我已成功地抓取了所有需要的数据,但电话号码是可单击文本。下面是我的代码:
import requests
import pandas as pd
from bs4 import BeautifulSoup
urls = []
for i in range(1,4):
pages = "https://www.realcommercial.com.au/sold/?autoSuggest=false&page={0}".format(i)
urls.append(pages)
Data = []
for info in urls:
page = requests.get(info)
soup = BeautifulSoup(page.content, 'html.parser')
links = soup.find_all('a', attrs ={'class' :'Address_link_1aaSW'})
hrefs = [x['href'] for x in links]
s = "https://www.realcommercial.com.au"
href = [s + x for x in hrefs ]
for m in href:
entry=[]
pages = requests.get(m)
soup_2 =BeautifulSoup(pages.content, 'html.parser')
Add_st = soup_2.find_all('h1', attrs={'class' :'Address_container_3HZgj'})
Address_Street = [Address_Street.text.strip() for Address_Street in Add_st]
Prop = soup_2.find_all('div', attrs={'class' :'PrimaryDetails_propertyTypes_1SLzV'})
Prop_Type = [Prop_Type.text.strip() for Prop_Type in Prop]
Dtnarea = soup_2.find_all('div', attrs ={'class' :'Attribute_attribute_3lq_3'})
Date = [Date.text.strip() for Date in Dtnarea]
Land_Area = [x for x in Date if x.startswith('Land Area')]
Floor_Area = [y for y in Date if y.startswith('Floor Area')]
Sold_date = [z for z in Date if z.startswith('Sold on')]
Agency = soup_2.find_all('a', attrs={'class' :'AgencyPanel_agencyNameLink_nCd-h'})
Agency_Name = [Agency_Name.text.strip() for Agency_Name in Agency]
Agent = soup_2.find_all('h4', attrs={'class' :'AgentDetails_name_23QWU'})
Agent_Name = [Agent_Name.text.strip() for Agent_Name in Agent]
phone = [broth['href'] for broth in soup_2.select("#wrapper > div > div.DetailContainer_detailPage_2O71T > div:nth-child(7) > div.AgencyPanel_wrapper_mVRp2 > div > ul > li > div.AgentDetails_details_28P5A > div.AgentDetails_actions_2CKN0 > a")]
Phone_Number = [Phone_Number.text.strip() for Phone_Number in Phone]
对于电话号码,我得到以下输出:-
0 [#, #, #]
1 [#, #]
2 [#, #]
3 [#]
4 [#]
5 [#, #]
6 [#, #]
7 [#, #]
8 [#, #]
9 [#, #]
10 [#, #]
11 [#, #]
12 [#, #]
13 [#, #]
14 [#, #]
15 [#, #]
16 [#, #, #, #]
17 [#, #]
18 [#, #]
如何单击文本并获取隐藏的电话号码
谢谢 我修改了你的代码
您可以先单击电话链接以显示selenium的电话号码
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
urls = []
for i in range(1,4):
pages = "https://www.realcommercial.com.au/sold/?autoSuggest=false&page={0}".format(i)
urls.append(pages)
Data = []
options = webdriver.ChromeOptions()
# hidden browser
options.add_argument('headless')
options.add_argument('window-size=1920x1080')
options.add_argument("disable-gpu")
browser = webdriver.Chrome(chrome_options=options)
for info in urls:
page = requests.get(info)
soup = BeautifulSoup(page.content, 'html.parser')
links = soup.find_all('a', attrs ={'class' :'Address_link_1aaSW'})
hrefs = [x['href'] for x in links]
s = "https://www.realcommercial.com.au"
href = [s + x for x in hrefs ]
print(href)
for m in href:
browser.get(m)
entry=[]
pages = requests.get(m)
soup_2 =BeautifulSoup(pages.content, 'html.parser')
# Find all phone number link elements
phone_links = browser.find_elements_by_css_selector('#wrapper > div > div.DetailContainer_detailPage_2O71T > div:nth-child(7) > div.AgencyPanel_wrapper_mVRp2 > div > ul > li > div.AgentDetails_details_28P5A > div.AgentDetails_actions_2CKN0 > a')
# Using JavaScript to click it -> execute_script("arguments[0].click();", link)
# You can also use selenium click() method -> "link.click()" if you want to simulate user behavior
# I prefer to use JS because it performs clicking better.
for link in phone_links:
browser.execute_script("arguments[0].click();", link)
# Retrieve phone number text elements from <span>
phones = browser.find_elements_by_css_selector('#wrapper > div > div.DetailContainer_detailPage_2O71T > div:nth-child(7) > div.AgencyPanel_wrapper_mVRp2 > div > ul > li > div.AgentDetails_details_28P5A > div.AgentDetails_actions_2CKN0 > a > span')
# Get the phone number text
for phone in phones:
print(phone.text)
我修改了你的代码
您可以先单击电话链接以显示selenium的电话号码
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
urls = []
for i in range(1,4):
pages = "https://www.realcommercial.com.au/sold/?autoSuggest=false&page={0}".format(i)
urls.append(pages)
Data = []
options = webdriver.ChromeOptions()
# hidden browser
options.add_argument('headless')
options.add_argument('window-size=1920x1080')
options.add_argument("disable-gpu")
browser = webdriver.Chrome(chrome_options=options)
for info in urls:
page = requests.get(info)
soup = BeautifulSoup(page.content, 'html.parser')
links = soup.find_all('a', attrs ={'class' :'Address_link_1aaSW'})
hrefs = [x['href'] for x in links]
s = "https://www.realcommercial.com.au"
href = [s + x for x in hrefs ]
print(href)
for m in href:
browser.get(m)
entry=[]
pages = requests.get(m)
soup_2 =BeautifulSoup(pages.content, 'html.parser')
# Find all phone number link elements
phone_links = browser.find_elements_by_css_selector('#wrapper > div > div.DetailContainer_detailPage_2O71T > div:nth-child(7) > div.AgencyPanel_wrapper_mVRp2 > div > ul > li > div.AgentDetails_details_28P5A > div.AgentDetails_actions_2CKN0 > a')
# Using JavaScript to click it -> execute_script("arguments[0].click();", link)
# You can also use selenium click() method -> "link.click()" if you want to simulate user behavior
# I prefer to use JS because it performs clicking better.
for link in phone_links:
browser.execute_script("arguments[0].click();", link)
# Retrieve phone number text elements from <span>
phones = browser.find_elements_by_css_selector('#wrapper > div > div.DetailContainer_detailPage_2O71T > div:nth-child(7) > div.AgencyPanel_wrapper_mVRp2 > div > ul > li > div.AgentDetails_details_28P5A > div.AgentDetails_actions_2CKN0 > a > span')
# Get the phone number text
for phone in phones:
print(phone.text)
电话号码来自使用列表 ID 作为参数的 API 调用。下面是你可以尝试的方案:属性信息实际上位于每个页面的 script 标记中,因此我提取这些列表和 id,并使用 id 作为键存储在字典中。然后,我使用 id 参数发出 API 请求,并从响应中提取电话号码、代理机构和单个代理人姓名。响应中还有更多信息(您需要的所有信息在此过程中都可以获得)。
非常感谢您的快速响应。但是,我在尝试运行时收到以下错误:WebDriverException: Message: 'chromedriver' executable needs to be in PATH。请问是否可以只用 BeautifulSoup 来完成?—— 可以这样指定驱动路径:browser = webdriver.Chrome(chrome_options=options, executable_path="your chromedriver path")。如果不用 Selenium,我还没有找到解决方案;我仍然收到以下错误:WebDriverException: Message: Service C:/Program Files (x86)/Google/Chrome/Application/chrome.exe unexpectedly exited. Status code was: 0
import requests,re,json
api_base = 'https://api.realcommercial.com.au/listing-ui/listings/'
p = re.compile(r'pageData = (.*?);', re.DOTALL)
results = {}
ids = []
with requests.Session() as s:
for page in range(4):
r = s.get(f'https://www.realcommercial.com.au/sold/?autoSuggest=false&page={page}')
soup = bs(r.content, 'lxml')
data = json.loads(p.findall(r.text)[0])
for item in data['exactMatchListings']:
item_id = item['id']
if item_id not in ids:
ids+=item_id
results[item['id']] = item
item_json = s.get(f'{api_base}{item_id}').json()['listing']['agencies'][0]
agency = item_json['name']
try:
named_agent = item_json['salespeople'][0]['name']
except:
named_agent = 'N/A'
tel = item_json['phone']['dial']
print(item_id, agency, named_agent, tel)