Python / Selenium question (LexisNexis scraper): What is the script doing, and how does that differ from what you want? Apologies — I meant that instead of clicking Next, scraping, then clicking Next again, it clicks "Next" only once. It then keeps running and prints "No more pages left" without ever moving to a new page. I would post a link, but you have to be logged in to LexisNexis, so the link would only take you to the home page rather than the search results. — Ah, that makes sense. I'll try that!
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
#from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
#import requests
#import re
import csv
import numpy as np
#import pandas as pd
###############################################################################
#CLICKING AND SEARCH
###############################################################################
# Drive Firefox through the university library proxy into NexisUni and submit
# an advanced search: News content, headline term "Law", publication
# "The Associated Press", dated after 01/01/1980.
browser = webdriver.Firefox(executable_path='/usr/local/bin/geckodriver')
browser.implicitly_wait(5)
#Goes to library website and finds database
browser.get('https://sfx.carli.illinois.edu/sfxuiu?url_ver=Z39.88-2004&url_ctx_fmt=infofi/fmt:kev:mtx:ctx&ctx_enc=info:ofi/enc:UTF-8&ctx_ver=Z39.88-2004&rfr_id=info:sid/sfxit.com:azlist&sfx.ignore_date_threshold=1&rft.object_id=63750000000001351&svc.fulltext=yes')
browser.find_element_by_link_text('LEXIS NEXIS DATABASES').click()
alert = browser.switch_to.alert
alert.accept()
# The link opens a new window; close the original and switch to the survivor.
browser.close()
browser.switch_to.window(browser.window_handles[0])
# Login to NexisUni through university library ONLY WHEN NOT ON CAMPUS.
# NOTE(security): credentials are hard-coded placeholders — load real values
# from the environment or a config file, never commit them.
browser.find_element_by_id('j_username').send_keys('USERNAME')
browser.find_element_by_id('j_password').send_keys('PASS')
browser.find_element_by_name('_eventId_proceed').click()
# Click on advanced search on the NexisUni homepage.
# BUG FIX: EC.presence_of_element_located takes a single (By.<strategy>,
# selector) TUPLE. By.XPATH is a plain string, so the original
# By.XPATH('...') calls raised TypeError, and two of them also had
# unbalanced parentheses (SyntaxError).
WebDriverWait(browser, 10).until(EC.presence_of_element_located(
    (By.XPATH, '/html/body/main/div[13]/div[2]/div[1]/header/div[3]/ul/li[1]/button')))
advancedSearch = browser.find_element_by_xpath('/html/body/main/div[13]/div[2]/div[1]/header/div[3]/ul/li[1]/button')
advancedSearch.click()
#Selecting Specific Content Type
WebDriverWait(browser, 10).until(EC.presence_of_element_located(
    (By.XPATH, '/html/body/main/div[13]/div[2]/div[2]/div/div[1]/header/h2/ul/li/div/button')))
Select_Content = browser.find_element_by_xpath('/html/body/main/div[13]/div[2]/div[2]/div/div[1]/header/h2/ul/li/div/button')
Select_Content.click()
#Choose News
WebDriverWait(browser, 10).until(EC.presence_of_element_located(
    (By.XPATH, '/html/body/main/div[13]/div[2]/div[2]/div/div[1]/header/h2/ul/li/div/aside/div[2]/ul[2]/li[2]/button')))
Choose_News = browser.find_element_by_xpath('/html/body/main/div[13]/div[2]/div[2]/div/div[1]/header/h2/ul/li/div/aside/div[2]/ul[2]/li[2]/button')
Choose_News.click()
#Type in Search Term
browser.find_element_by_xpath('//*[@id="headline"]').send_keys('Law')
#Type in Publication
WebDriverWait(browser, 10).until(EC.presence_of_element_located(
    (By.XPATH, '//*[@id="publication"]')))
Pub = browser.find_element_by_xpath('//*[@id="publication"]')
Pub.send_keys('The Associated Press')
#input date range
select = Select(browser.find_element_by_id('date'))
select.select_by_visible_text('Date is after')
browser.find_element_by_id('dateFrom').send_keys('01/01/1980')
#click on Search
WebDriverWait(browser, 10).until(EC.presence_of_element_located(
    (By.XPATH, '/html/body/main/div[13]/div[2]/div[2]/div/div[1]/footer/span/button[1]')))
Search = browser.find_element_by_xpath('/html/body/main/div[13]/div[2]/div[2]/div/div[1]/footer/span/button[1]')
Search.click()
###############################################################################
#SCRAPING
###############################################################################
# Parse the first page of search results and append one CSV row per article:
# (headline, date, source).
scd = browser.page_source
soup = BeautifulSoup(scd, "lxml")
HEADLINES = soup.findAll('a', attrs={"data-action": "title"})
headlines = [H.text.strip() for H in HEADLINES]
DETAILS = soup.findAll('div', attrs={"class": "dataInfo translate"})
details = [D.text.strip() for D in DETAILS]
# NOTE(review): the whitespace delimiters below mirror the exact NexisUni
# result-list markup; re-verify them if the site layout changes.
Dates1 = [i.split('\t\t\t\t\t\n\n', 2)[1] for i in details]
Dates = [i.split('\n', 1)[0] for i in Dates1]
Source1 = [i.split('\t\t\t\t\t\n\n', 1)[1] for i in details]
Source = [i.split('\n', 1)[1] for i in Source1]
News = zip(headlines, Dates, Source)
result = "/Users/danashaat/Desktop/data.csv"
# BUG FIX: the original rebound `result` (the path string) to the open file
# handle, and omitted newline='', which the csv module requires to avoid
# spurious blank lines on some platforms.
with open(result, 'a', newline='') as csv_file:
    newswriter = csv.writer(csv_file)
    newswriter.writerows(News)
#Next Page: click through every remaining result page, scraping each one.
# BUG FIXES vs. the original loop (which clicked once, then stopped):
#  * EC.presence_of_element_located needs a (By.XPATH, selector) tuple —
#    By.XPATH('...') raised TypeError.
#  * find_element_by_xpath returns a single WebElement, so len(Next) failed;
#    find_elements_by_xpath returns a (possibly empty) list instead, which
#    also avoids the TimeoutException the wait raised on the last page.
#  * Nothing was scraped after clicking, so only page 1 was ever saved.
NEXT_XPATH = '/html/body/main/main/div[2]/div/div[2]/div[2]/form/div[2]/nav/ol/li[7]/a'
while True:
    # Empty list (after the 5s implicit wait) means there is no "next" link.
    next_links = browser.find_elements_by_xpath(NEXT_XPATH)
    if not next_links:
        print("No more pages left")
        break
    next_links[0].click()
    # Wait for the result list to render before parsing the new page.
    # NOTE(review): if the previous page's titles are still attached this can
    # return early; a staleness_of wait on the clicked link is more robust.
    WebDriverWait(browser, 10).until(EC.presence_of_element_located(
        (By.XPATH, '//a[@data-action="title"]')))
    # Scrape this page exactly like the first one and append to the same CSV.
    page_soup = BeautifulSoup(browser.page_source, "lxml")
    page_headlines = [h.text.strip()
                      for h in page_soup.findAll('a', attrs={"data-action": "title"})]
    page_details = [d.text.strip()
                    for d in page_soup.findAll('div', attrs={"class": "dataInfo translate"})]
    page_dates = [i.split('\t\t\t\t\t\n\n', 2)[1].split('\n', 1)[0] for i in page_details]
    page_sources = [i.split('\t\t\t\t\t\n\n', 1)[1].split('\n', 1)[1] for i in page_details]
    with open("/Users/danashaat/Desktop/data.csv", 'a', newline='') as csv_file:
        csv.writer(csv_file).writerows(zip(page_headlines, page_dates, page_sources))