Python 如何在Bs4中从selenium infinite scroll获取所有数据

Python 如何在 Bs4 中从 selenium infinite scroll 获取所有数据（标签：python、selenium、web-scraping、beautifulsoup、instagram）

在我的脚本中,我想从instagram搜索中获取数据,在我的代码中,selenium根据我应用的条件滚动搜索结果。但一旦加载完成,我尝试在bs4对象中获取滚动数据,它只返回第一个结果(在60-70之间)。它不会获取滚动后加载的数据

import time
import re
import json
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# Configure headless-friendly Chrome options.
chop = Options()
chop.add_argument("--disable-gpu")
chop.add_argument("--no-sandbox")
chop.add_argument("--disable-dev-shm-usage")
# NOTE: the path literal was previously split across two lines (a syntax
# error) and a second bare webdriver.Chrome() overwrote this configured
# driver, discarding the Options. Create exactly one driver.
driver = webdriver.Chrome(
    executable_path="c:/Users/Haseeb-Ahmad/3D Objects/chromedriver.exe",
    options=chop,
)
driver.implicitly_wait(10)

# Log in first so the login pop-up does not block the tag page.
driver.get('https://www.instagram.com/accounts/login/')
emailInput = driver.find_elements_by_css_selector('form input')[0]
passwordInput = driver.find_elements_by_css_selector('form input')[1]
emailInput.send_keys('#username')
passwordInput.send_keys('#password')
passwordInput.send_keys(Keys.ENTER)  # login so i can avoid the login-popup
time.sleep(5)

try:
    SCROLL_PAUSE_TIME = 1
    driver.get("https://www.instagram.com/explore/tags/pakistan/")
    # Scroll to the bottom a fixed number of times, pausing so that the
    # infinite-scroll content can load before the next scroll.
    for _ in range(11):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(SCROLL_PAUSE_TIME)
    # Parse the fully-loaded page and pull the JSON blob Instagram embeds
    # in the window._sharedData script tag.
    bsObj = BeautifulSoup(driver.page_source, features='html.parser')
    scripts = bsObj.find_all('script', type="text/javascript", text=re.compile('window._sharedData'))
    # Strip the "window._sharedData = " prefix and the trailing ";".
    stringified_json = scripts[0].get_text().replace('window._sharedData = ', '')[:-1]
    # BUG FIX: the original chained subscripts were missing closing
    # brackets ("['graphql'['hashtag'[...]") and raised a TypeError.
    data = json.loads(stringified_json)['entry_data']['TagPage'][0]['graphql']['hashtag']['edge_hashtag_to_media']['edges']
    print(len(data))  # to check how many posts were returned
finally:
    # Always release the browser, even if scraping fails.
    driver.quit()

我在Twitter上也遇到过同样的问题,我发现bs4并不是实现这一点的最有效的库

我用 Selenium 来做，效果很好

这是我的代码：

# Accumulators filled as a side effect of scroll().
account_names = []
account_tags = []
account_link = []


def scroll():
    """Scroll to the bottom of the results page until its height stops
    growing, harvesting account names, handles, and profile links on
    each pass into the module-level lists above.

    BUG FIX: in the original paste the function body was dedented to
    module level, so ``scroll()`` was effectively empty and the loop ran
    outside it; the needless ``global`` declarations are also dropped.
    """
    SCROLL_PAUSE_TIME = 1

    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)

        account_name = driver.find_elements_by_xpath('//*[@id="react-root"]/div/div/div/main/div/div/div/div/div/div[2]/div/div/section/div/div/div/div/div/div/div/div[2]/div[1]/div[1]/a/div/div[1]/div[1]/span/span')
        for act_name in account_name:
            account_names.append(act_name.text)

        account_handle = driver.find_elements_by_xpath('//*[@id="react-root"]/div/div/div/main/div/div/div/div/div/div[2]/div/div/section/div/div/div/div/div/div/div/div[2]/div[1]/div[1]/a/div/div[2]/div/span')
        for act_handle in account_handle:
            account_tags.append(act_handle.text)

        # Profile links are easier to pull from the rendered HTML.
        soup = BeautifulSoup(driver.page_source, 'lxml')
        account_links = soup.find_all('a', href=True, class_='css-4rbku5 css-18t94o4 css-1dbjc4n r-1loqt21 r-1wbh5a2 r-dnmrzs r-1ny4l3l')
        for acct_links in account_links:
            account_link.append(acct_links['href'])

        # Calculate new scroll height and compare with last scroll height;
        # an unchanged height means no more content loaded — stop.
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


scroll()
希望这有帮助