Python 如何在循环中返回网页抓取结果并保存到Excel文件?

Python 如何在循环中返回网页抓取结果并保存到Excel文件?,python,python-3.x,web-scraping,Python,Python 3.x,Web Scraping,如何将网页抓取结果保存到excel文件 我试着找个方法已经有一段时间了。或者有什么想法让我能够选择我想要获取的其他页面 这是我的代码: from urllib.request import urlopen as req from openpyxl import Workbook from bs4 import BeautifulSoup as soup from selenium import webdriver from selenium.webdriver.common.keys impor

如何将网页抓取结果保存到excel文件

我试着找个方法已经有一段时间了。或者有什么想法让我能够选择我想要获取的其他页面

这是我的代码:

from urllib.request import urlopen as req
from openpyxl import Workbook
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from urllib.request import urlopen
import time

import requests 


def checkproduct(url):
   opt = webdriver.ChromeOptions()
   opt.add_argument('headless') 

   driver = webdriver.Chrome()
   #driver = webdriver.Chrome(options=opt)
   driver.get(url)

   time.sleep(1)
   driver.execute_script("window.scrollTo(0, 400);")
   time.sleep(1)
   driver.execute_script("window.scrollTo(0, 1200);")
   time.sleep(1)
   driver.execute_script("window.scrollTo(0, 3000);")
   time.sleep(1)

   page_html = driver.page_source
   data = soup(page_html,'html.parser')

   allproduct = data.findAll('div',{'class':'c16H9d'})
   product_title = allproduct[0].text
   product_url = 'https:'+ allproduct[0].a['href']

   list_title = []
   list_url = []
   list_price = []
   list_image = []

   for pd in allproduct:
    pd_title = pd.text
    pd_url = 'https:' + pd.a['href']
    list_title.append(pd_title)
    list_url.append('https:' + pd.a['href'])


   allprice = data.findAll('span',{'class':'c13VH6'})
   for pc in allprice:
    pc_price = pc.text
    pc_price = pc_price.replace('฿','')
    pc_price = pc_price.replace(',','') 
    list_price.append(float(pc_price))


   allimages = data.findAll('img',{'class':'c1ZEkM'})
   for productimages in allimages:
    productimages_url = productimages['src']
    list_image.append(productimages_url)


   print(list_title)
   print(list_url)
   print(pc_price)
   print(list_image)


   driver.close()


   return(list_title,list_price,list_url,list_image)


base_url = "https://www.lazada.co.th/shop-smart-tv?pages="

n = 2
for i in range(1, n+1):
  response = base_url + "%d" %i
  url = response
  print (url)
  checkproduct(url)
  print ('_________________________')



laptop = checkproduct(url)
excelfile = Workbook()
row = excelfile.active
header = ['Product','Price','URL','Images']
row.append(header)


for i,j,k,l in zip(laptop[0],laptop[1],laptop[2],laptop[3]):
  row.append([i,j,k,l])


  #row['A45'] = 'ถูกสุด'
  #row['A46'] = 'แพงสุด'

  #min_price = min(laptop[1])
  #find = laptop[1].index(min_price)

  #row['B45'] = laptop[0][find]
  #row['C45'] = laptop[1][find]
  #row['D45'] = laptop[2][find]

  #max_price = max(laptop[1])
  #find = laptop[1].index(max_price)

  #row['B46'] = laptop[0][find]
  #row['C46'] = laptop[1][find]
  #row['D46'] = laptop[2][find]
excelfile.save('Lazada_Product2.xlsx')
print('Done')

在这个循环中,它只将一组数据提取到Excel文件中,我需要做什么才能提取更多数据?或者提取多个页面

问题是您没有从函数中正确返回结果。你的凹痕看起来破了

尝试此简化且干净的代码,该代码工作正常,可将数百个条目保存到Excel工作表中:

from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import time
import pandas as pd

def checkproduct(url):

    driver = webdriver.Chrome()
    driver.get(url)

    driver.execute_script("window.scrollTo(0, 3000);")
    time.sleep(10) 

    page_html = driver.page_source
    data = soup(page_html, 'html.parser')

    allproduct = data.findAll('div', {'class':'c16H9d'})
    product_title = allproduct[0].text
    product_url = 'https:'+ allproduct[0].a['href']

    list_title = []
    list_url = []
    list_price = []
    list_image = []

    for pd in allproduct:
        pd_title = pd.text
        pd_url = 'https:' + pd.a['href']
        list_title.append(pd_title)
        list_url.append('https:' + pd.a['href'])

    allprice = data.findAll('span',{'class':'c13VH6'})

    for pc in allprice:
        pc_price = pc.text
        pc_price = pc_price.replace('฿','')
        pc_price = pc_price.replace(',','') 
        list_price.append(float(pc_price))

    allimages = data.findAll('img',{'class':'c1ZEkM'})

    for productimages in allimages:
        productimages_url = productimages['src']
        list_image.append(productimages_url)

    driver.close()

    return([list_title, list_price, list_url, list_image])

base_url = "https://www.lazada.co.th/shop-smart-tv?pages="

n = 3
rows = []

for i in range(1, n+1):
    response = base_url + f"{i}"
    url = response
    print (url)
    # you need to save the returned values from your function!
    results = checkproduct(url)
    rows.append(pd.DataFrame(results).T)

df = pd.concat(rows).reset_index(drop=True)
df.columns = ['Product','Price','URL','Images']
df.to_excel("Lazada_Product.xlsx")

请注意,我使用Pandas数据框来方便数据操作和保存。

您能更具体地说明这个问题吗?你刚刚放弃了你的整个计划。如果你的问题解决了,请将答案标记为已接受,以便其他人可以看到你的问题已得到回答。谢谢,帕特祖伊奇,你有我很多。但是在试着逃跑之后。我发现它收到的信息就像是同一页。你有什么解决办法吗?不客气。你确定吗?该脚本将所有页面中的所有刮取数据保存到一个Excel工作表中。当然,此脚本不能在“保存1个页面”循环中的Excel文件中刮取到其他页面。你有什么解决办法吗?我刚刚又查了一遍,无法确认。该脚本每页只抓取40个样本。如果我设置
n=1
,我会得到40个样本。如果我设置
n=2
80个样本,以此类推。再次提醒:请注意,所有数据仅保存在一张Excel表格中。它不是每个网页一张纸。但同样的:你从页面中获取所有数据。