Python 如何从使用js填充的网站中获取数据?

Python 如何从使用js填充的网站中获取数据?,python,selenium,selenium-webdriver,web-scraping,Python,Selenium,Selenium Webdriver,Web Scraping,我试图从sharechat.com中获取帖子数据(如,共享,图像等),但问题是我无法使用Selenium找到帖子的图像URL,因为我怀疑它使用Javascript填充 我尝试过使用Selenium来查找最外层的HTML(显示的HTML),我得到了所有其他帖子信息,如喜欢的数量、共享、评论等,但我无法获得商店图片,因为我找不到它的URL 我这样做是为了社交网络的情感分析和推荐趋势的研究,所以我希望能抓取帖子数据以及标签和喜欢、分享的数量等。我只是无法抓取标签和图片的URL 是您需要运行的gecko

我试图从sharechat.com中获取帖子数据(如,共享,图像等),但问题是我无法使用Selenium找到帖子的图像URL,因为我怀疑它使用Javascript填充

我尝试过使用Selenium来查找最外层的HTML(显示的HTML),我得到了所有其他帖子信息,如喜欢的数量、共享、评论等,但我无法获得商店图片,因为我找不到它的URL

我这样做是为了社交网络的情感分析和推荐趋势的研究,所以我希望能抓取帖子数据以及标签和喜欢、分享的数量等。我只是无法抓取标签和图片的URL

是您需要运行的geckodriver文件
这是:


这是重构后的代码。在末尾添加了获取标签和图片的逻辑

import sys
import csv
import os
import time
import urllib
import datetime
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
# Scrape the ShareChat trending feed with Selenium and dump, for every feed
# card found on the page, its text fields and image/post links to a text file.
serviceurl = 'https://sharechat.com/trending/Hindi'

files = "dataset_link_1.txt"

url = serviceurl

# Seconds to pause after each scroll so lazily loaded posts can be appended.
SCROLL_PAUSE_TIME = 0.5

# XPath matching every feed card currently present in the DOM.
FEED_CARD_XPATH = "//section[@class='post-batch']//div[contains(@class,'feedCard')]"

# One entry per value written for a card, in output order:
# (label written to the file, XPath relative to the card, attribute to read).
# An attribute of None means "use the element's visible text".
CARD_FIELDS = (
    ("Total No of views", "//div[contains(@class,'lhcaption')]/div[1]", None),
    ("Title", "//span[contains(@class,'darkText')]", None),
    ("Writer's Bio", "//div[contains(@class,'Pstart')]//div[contains(@class,'darkTextSecondary')]", None),
    ("Writer Name", "//strong", None),
    ("Number of comments", "//button[@aria-label='Click to comment']//span", None),
    ("Whatsapp Share", "//button[@aria-label='Click to share']//span", None),
    ("Tags", "//div[contains(@class,'primaryDark')]", None),
    ("Owner Image link", "//img", 'src'),
    ("post image link", "//a[@class='D(b)']", 'href'),
)

# The file is opened as UTF-8 text and values are written as str; encoding the
# text to bytes first would write "b'...'" reprs through "%s" on Python 3.
with open(files, 'w', encoding='utf-8') as enter:
    driver = webdriver.Firefox(executable_path=r'D:\CHIT CHAT\Scrapper\geckodriver')
    try:
        driver.maximize_window()  # For maximizing window
        driver.get(url)
        driver.implicitly_wait(3)  # element lookups retry for up to 3 seconds

        # Poll until the document has finished loading; sleep between polls
        # instead of spinning on the WebDriver connection.
        while driver.execute_script("return document.readyState") != 'complete':
            time.sleep(0.1)

        # NOTE(review): every pass re-reads all cards on the page, so cards seen
        # in an earlier pass are written again -- confirm whether that is wanted.
        for i in range(1, 20):
            # Get scroll height
            last_height = driver.execute_script("return document.body.scrollHeight")

            while True:
                # Scroll down to bottom
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                # Wait to load page
                time.sleep(SCROLL_PAUSE_TIME)
                # Stop once scrolling no longer increases the page height
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height

            # get the number of feeds
            feedCards = driver.find_elements_by_xpath(FEED_CARD_XPATH)
            for card_position in range(1, len(feedCards) + 1):
                # XPath positions are 1-based; select the n-th card and look
                # each field up beneath it.
                card_xpath = "(" + FEED_CARD_XPATH + ")[" + str(card_position) + "]"
                for label, relative_xpath, attribute in CARD_FIELDS:
                    element = driver.find_element_by_xpath(card_xpath + relative_xpath)
                    if attribute is None:
                        value = element.text
                    else:
                        value = element.get_attribute(attribute)
                    print(value)
                    enter.write("%s:\n%s\n" % (label, value))
    finally:
        # Always shut the browser down, even if a lookup above raised.
        driver.quit()
如果您想把文件下载到其他文件夹,请使用下面的代码:

# Create a Firefox profile and apply the download-related preferences below.
profile = webdriver.FirefoxProfile()

# (preference name, value) pairs, applied in this order.
# NOTE(review): folderList=2 presumably selects the custom directory given in
# browser.download.dir -- confirm against the Firefox preference reference.
download_preferences = (
    ("browser.download.folderList", 2),
    ("browser.download.manager.showWhenStarting", False),
    # Placeholder: replace with the real target directory.
    ("browser.download.dir", 'Here goes your folder where you want to download'),
    ("browser.helperApps.neverAsk.saveToDisk", "application/x-gzip"),
)
for pref_name, pref_value in download_preferences:
    profile.set_preference(pref_name, pref_value)
下载文件后,只需使用以下命令将文件重命名为所需名称

# Rename the downloaded file to the desired name. `download_file_name` and
# `desired_name` are not defined in this snippet; the caller must supply them.
os.rename(download_file_name,desired_name) # either argument may include a directory path

这是重构后的代码。在末尾添加了获取标签和图片的逻辑

import sys
import csv
import os
import time
import urllib
import datetime
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
# Scrape the ShareChat trending feed with Selenium and dump, for every feed
# card found on the page, its text fields and image/post links to a text file.
serviceurl = 'https://sharechat.com/trending/Hindi'

files = "dataset_link_1.txt"

url = serviceurl

# Seconds to pause after each scroll so lazily loaded posts can be appended.
SCROLL_PAUSE_TIME = 0.5

# XPath matching every feed card currently present in the DOM.
FEED_CARD_XPATH = "//section[@class='post-batch']//div[contains(@class,'feedCard')]"

# One entry per value written for a card, in output order:
# (label written to the file, XPath relative to the card, attribute to read).
# An attribute of None means "use the element's visible text".
CARD_FIELDS = (
    ("Total No of views", "//div[contains(@class,'lhcaption')]/div[1]", None),
    ("Title", "//span[contains(@class,'darkText')]", None),
    ("Writer's Bio", "//div[contains(@class,'Pstart')]//div[contains(@class,'darkTextSecondary')]", None),
    ("Writer Name", "//strong", None),
    ("Number of comments", "//button[@aria-label='Click to comment']//span", None),
    ("Whatsapp Share", "//button[@aria-label='Click to share']//span", None),
    ("Tags", "//div[contains(@class,'primaryDark')]", None),
    ("Owner Image link", "//img", 'src'),
    ("post image link", "//a[@class='D(b)']", 'href'),
)

# The file is opened as UTF-8 text and values are written as str; encoding the
# text to bytes first would write "b'...'" reprs through "%s" on Python 3.
with open(files, 'w', encoding='utf-8') as enter:
    driver = webdriver.Firefox(executable_path=r'D:\CHIT CHAT\Scrapper\geckodriver')
    try:
        driver.maximize_window()  # For maximizing window
        driver.get(url)
        driver.implicitly_wait(3)  # element lookups retry for up to 3 seconds

        # Poll until the document has finished loading; sleep between polls
        # instead of spinning on the WebDriver connection.
        while driver.execute_script("return document.readyState") != 'complete':
            time.sleep(0.1)

        # NOTE(review): every pass re-reads all cards on the page, so cards seen
        # in an earlier pass are written again -- confirm whether that is wanted.
        for i in range(1, 20):
            # Get scroll height
            last_height = driver.execute_script("return document.body.scrollHeight")

            while True:
                # Scroll down to bottom
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                # Wait to load page
                time.sleep(SCROLL_PAUSE_TIME)
                # Stop once scrolling no longer increases the page height
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height

            # get the number of feeds
            feedCards = driver.find_elements_by_xpath(FEED_CARD_XPATH)
            for card_position in range(1, len(feedCards) + 1):
                # XPath positions are 1-based; select the n-th card and look
                # each field up beneath it.
                card_xpath = "(" + FEED_CARD_XPATH + ")[" + str(card_position) + "]"
                for label, relative_xpath, attribute in CARD_FIELDS:
                    element = driver.find_element_by_xpath(card_xpath + relative_xpath)
                    if attribute is None:
                        value = element.text
                    else:
                        value = element.get_attribute(attribute)
                    print(value)
                    enter.write("%s:\n%s\n" % (label, value))
    finally:
        # Always shut the browser down, even if a lookup above raised.
        driver.quit()
如果您想把文件下载到其他文件夹,请使用下面的代码:

# Create a Firefox profile and apply the download-related preferences below.
profile = webdriver.FirefoxProfile()

# (preference name, value) pairs, applied in this order.
# NOTE(review): folderList=2 presumably selects the custom directory given in
# browser.download.dir -- confirm against the Firefox preference reference.
download_preferences = (
    ("browser.download.folderList", 2),
    ("browser.download.manager.showWhenStarting", False),
    # Placeholder: replace with the real target directory.
    ("browser.download.dir", 'Here goes your folder where you want to download'),
    ("browser.helperApps.neverAsk.saveToDisk", "application/x-gzip"),
)
for pref_name, pref_value in download_preferences:
    profile.set_preference(pref_name, pref_value)
下载文件后,只需使用以下命令将文件重命名为所需名称

# Rename the downloaded file to the desired name. `download_file_name` and
# `desired_name` are not defined in this snippet; the caller must supply them.
os.rename(download_file_name,desired_name) # either argument may include a directory path

我修改了 web 驱动程序的路径和 range 变量。如果创建文件夹 C:\Py,下面我的代码将输出一个名为 PageSource_StackOverflowQ2.txt 的文本文件,其中包含图像的 src 路径

我在 HTML 中遇到了很多关于二进制字符的问题,所以可能有更好的方法来实现这一点,但希望这能帮助您实现您的目标

如果图像路径在一行中包含这9个字符,我的代码将停止 (“title=”)

导入系统 导入csv 导入操作系统 导入时间 导入URL库 导入日期时间 从selenium导入webdriver 从selenium.webdriver.common.keys导入密钥 从selenium.webdriver.common.action\u链导入ActionChains serviceurl=https://sharechat.com/trending/Hindi' #files=“dataset\u link\u 1.txt” enter=open('C:\\Py\\dataset\u link\u 1.txt','w+')) #如果不存在os.path.exists(文件): #文件(文件“w”).close() #输入=打开(文件,'w'); url=serviceurl #driver=webdriver.Firefox(可执行文件\u path='D:\CHIT CHAT\scraster\geckodriver'); driver=webdriver.Firefox(可执行文件[u path=r'C:\\Py\\geckodriver.exe'); driver.maximize_window()#用于最大化窗口 获取(url); driver.implicitly_wait(3)#隐式等待10秒 当驱动程序运行时,执行_脚本(“return document.readyState”)!='完成': 通过; #对于范围(1,20)内的i: 对于范围(1,2)内的i: 滚动\暂停\时间=0.5 #获取滚动高度 last\u height=driver.execute\u脚本(“return document.body.scrollHeight”) 尽管如此: #向下滚动至底部 执行脚本(“window.scrollTo(0,document.body.scrollHeight);”) #等待加载页面 时间。睡眠(滚动\u暂停\u时间) #计算新的滚动高度并与上一个滚动高度进行比较 new\u height=driver.execute\u脚本(“returndocument.body.scrollHeight”) 如果新高度==上次高度: 打破 最后高度=新高度 var=driver。通过xpath(“/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/a/div[3]/div[1]”%(i)).text.encode('utf-8') 打印(var)#手表数量 输入.write(“视图总数:\n%s\n”%(var)); var=driver。通过xpath(“/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/a/div[1]/div[1]/span”%(i)).text.encode('utf-8') 打印(var)#标题 enter.write(“标题:\n%s\n”%(var)); var=driver。通过xpath(“/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/div/div[1]/a/div[2]/div/div[2]”%(i)).text.encode('utf-8') 打印(var)#所有者简历 输入.write(“作者简历:\n%s\n”%(var)); var=driver。通过xpath(“/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/div/div[1]/a/div[2]/div/div[1]/strong”%(i)).text.encode('utf-8') 打印(var)#业主个人简历 输入.write(“编写器名称:\n%s\n”%(var)); var=driver。通过xpath(“/html/body/div/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/div[2]/div/button[2]/div/span“%(i)).text.encode('utf-8') 打印(var)#注释 输入.write(“总评论:\n%s\n”%(var)); 
var=driver。通过xpath(“/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/div[2]/div/button[1]/div/span”%(i)).text.encode('utf-8') 打印(var)#whatsapp 输入.write(“Whatsapp共享:\n%s\n”%(var)); PageSource1=[driver.page\u source] PageSource1=PageSource1[0]。编码(“utf-8”) 文件=打开('C:\\Py\\PageSource\u StackOverflowQ.txt','ab') file.write(PageSource1) file.close() FindPageCount=[] 文件=打开('C:\\Py\\PageSource\u StackOverflowQ1.txt','w') 将open('C:\\Py\\PageSource\u StackOverflowQ.txt',“rb”)作为输出文件,将open('C:\\Py\\PageSource\u StackOverflowQ1.txt',“a”)作为f1: 对于输出文件中的行: uline=line.decode('ascii',errors='ignore') f1.write(uline) outfile.close() f1.close() 数据=打开('C:\\Py\\PageSource\u StackOverflowQ1.txt','r')。读取行() 将open('C:\\Py\\PageSource\u StackOverflowQ1.txt')作为f,将open('C:\\Py\\PageSource\u StackOverflowQ2.txt',“w”)作为f1: 数据=f.readlines() 对于范围内的i(len(数据)): 行=数据[i] 如果(“img src”在第行):
q=line.split(“>I更改web驱动程序路径和范围变量。如果创建文件夹C:\Py,下面的我的代码将输出一个名为PageSource_StackOverflowQ2.txt的文本文件,其中包含图像src路径

我在 HTML 中遇到了很多关于二进制字符的问题,所以可能有更好的方法来实现这一点,但希望这能帮助您实现您的目标

如果