Python 3.x：有人能建议如何以更快的方式抓取 https://www3.wipo.int/branddb/en/# 这个网站吗？
（标签：python-3.x, selenium-webdriver, web-scraping, data-mining, xlsxwriter）我想以更快的方式抓取这个网站，并且需要使用过滤器。我用 Selenium 编写了一段代码来抓取该网站，它抓取数据并保存到 Excel 文件中，但抓取全部数据需要 55 个小时。我使用的代码是：
import time

import bs4
import requests
import xlsxwriter
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# --- Browser start-up, filters and search -----------------------------------
# Opens the WIPO Global Brand Database, applies the source/status filters,
# runs the search, and switches the result grid to its largest page size so
# fewer page turns are needed later.
#
# NOTE(review): the absolute XPath / CSS locators below were recorded from the
# live page and are extremely brittle; any page redesign will break them.


def _wait_click(by, locator, timeout=30):
    """Wait until the element is clickable, then click it.

    Replaces the deprecated find_element_by_* calls (removed in Selenium 4)
    and the fixed time.sleep() pauses of the original script.
    """
    WebDriverWait(browser, timeout).until(
        EC.element_to_be_clickable((by, locator))
    ).click()


def _wait_hover(by, locator, timeout=30):
    """Wait for the element, then hover over it to open its dropdown menu."""
    element = WebDriverWait(browser, timeout).until(
        EC.presence_of_element_located((by, locator))
    )
    ActionChains(browser).move_to_element(element).perform()


browser = webdriver.Chrome("chromedriver.exe")  # assumes driver next to script
browser.get('https://www3.wipo.int/branddb/en/#')

# Open the filter panel and toggle the two source checkboxes.
_wait_click(By.XPATH, "//*[@id=\"ui-id-10\"]")
_wait_click(By.XPATH, "/html/body/div[4]/div[2]/form/div[1]/div/div[2]/div/div[1]/div[5]/div[1]/div/div[6]/div/a[1]")
_wait_click(By.XPATH, "/html/body/div[4]/div[2]/form/div[1]/div/div[2]/div/div[1]/div[5]/div[1]/div/div[6]/div/a[2]")

# Hover to open the dropdown, then pick its first entry.
_wait_hover(By.XPATH, "/html/body/div[4]/div[2]/form/div[1]/div/div[2]/div/div[1]/div[1]/div[2]/div[5]/ul/li/a")
_wait_click(By.XPATH, "/html/body/div[4]/div[2]/form/div[1]/div/div[2]/div/div[1]/div[1]/div[2]/div[5]/ul/li/ul/li[1]/a")

# Apply the remaining filter and launch the search.
_wait_click(By.XPATH, "/html/body/div[4]/div[2]/form/div[1]/div/div[2]/div/div[1]/div[7]/div[1]/div/div[6]/div/a[12]")
_wait_click(By.XPATH, "/html/body/div[4]/div[2]/form/div[1]/div/div[2]/div/div[1]/div[7]/a[1]")

# Open the rows-per-page menu and pick the largest page size (4th entry).
_wait_hover(By.CSS_SELECTOR, "#results > div.results_navigation.top_results_navigation.displayButtons > div.results_pager.ui-widget-content > div.rowCountContainer.lightBackground > span > div.rowCountSelectContainer > ul > li > a")
_wait_click(By.CSS_SELECTOR, "#results > div.results_navigation.top_results_navigation.displayButtons > div.results_pager.ui-widget-content > div.rowCountContainer.lightBackground > span > div.rowCountSelectContainer > ul > li > ul > li:nth-child(4) > a")
# --- Page-by-page scraping loop ---------------------------------------------
# Walks up to 1000 result pages. Each page's HTML is parsed with
# BeautifulSoup and written to its own workbook ("<page>.xlsx"); trademark
# images are downloaded to "<img_num>.jpg" and embedded in column 12.

# Result-grid cell "aria-describedby" value -> worksheet column index.
# BRAND (col 1, taken from the cell's title attribute) and IMG (col 12,
# image download) need special handling and are not in this table.
_COLUMN_FOR = {
    "gridForsearch_pane_SOURCE": 2,
    "gridForsearch_pane_STATUS": 3,
    "gridForsearch_pane_score": 4,
    "gridForsearch_pane_OO": 5,
    "gridForsearch_pane_HOL": 6,
    "gridForsearch_pane_HOLC": 7,
    "gridForsearch_pane_ID": 8,
    "gridForsearch_pane_AD": 9,
    "gridForsearch_pane_LOGO": 10,
    "gridForsearch_pane_NC": 11,
}

# BUGFIX: keep the image counter global. The original reset it to 0 on every
# page, so later pages overwrote the .jpg files saved by earlier ones.
img_num = 0
for i in range(1, 1001):
    workbook = xlsxwriter.Workbook(str(i) + ".xlsx")
    worksheet = workbook.add_worksheet()
    # BUGFIX: reset the row cursor per workbook. The original carried it over
    # between files, leaving an ever-growing band of blank rows at the top of
    # each new workbook.
    row = 0
    num = 1
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)  # let the lazily rendered result grid finish loading
    soup = bs4.BeautifulSoup(browser.page_source, 'lxml')
    for tr in soup.findAll('tr', {'role': 'row'}):
        if not tr.has_attr('id'):
            continue  # header/filler rows carry no id attribute
        worksheet.write(row, 0, str(num))
        for cell in tr:
            # Skip text nodes and cells the grid marks as hidden.
            if not hasattr(cell, 'has_attr') or cell.has_attr('aria-hidden'):
                continue
            described = cell.get('aria-describedby')
            if described == "gridForsearch_pane_BRAND":
                if cell.has_attr('title'):
                    worksheet.write(row, 1, cell['title'])
            elif described == "gridForsearch_pane_IMG":
                if cell.img is None:
                    worksheet.write(row, 12, '')
                else:
                    # src starts with ".." -- rebase it onto the site root.
                    img_url = "https://www3.wipo.int/branddb" + cell.img['src'][2:]
                    img_path = str(img_num) + '.jpg'
                    res = requests.get(img_url, timeout=30)
                    # BUGFIX: context manager -- the original left the file
                    # handle open if a download chunk raised mid-write.
                    with open(img_path, 'wb') as img_file:
                        for chunk in res.iter_content(100000):
                            img_file.write(chunk)
                    worksheet.insert_image(row, 12, img_path, {
                        'x_scale': 1,
                        'y_scale': 0.5,
                        'positioning': 1,
                    })
                    img_num += 1
            elif described in _COLUMN_FOR:
                worksheet.write(row, _COLUMN_FOR[described], cell.getText())
        num += 1
        row += 1
    workbook.close()
    # Advance to the next result page and let it load.
    browser.find_element(By.CSS_SELECTOR, "#results > div.results_navigation.bottom_results_navigation.displayButtons > div.results_pager.ui-widget-content > div.arrow_container > a:nth-child(4)").click()
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    print(i)
browser.quit()
我建议：不要依赖 beautifulsoup4 重新解析整页，也不要用固定的 time.sleep()，
而是使用 Selenium 的显式等待（explicit wait）或隐式等待（implicit wait）；
或者改用 JavaScript 框架 Puppeteer——它被认为比 Selenium 快得多。