Python：我正试图抓取一个 aspx 网站，但无法翻到第 2 页之后
（标签：python, web-scraping, urllib, python-3.5）

我正在尝试抓取一个 aspx 站点。出于测试目的，请使用 33133 作为 zipcode、100 作为半径。最初，我通过在搜索结果页上迭代来收集个人资料链接；我成功地在第一页获得了前 20 个链接，但无法翻到第 1 页之后——返回的页面源码提示“很抱歉，您查找的页面或文件找不到”。请参阅下面我的代码：
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys, re
import urllib.request, urllib.parse, time, csv
from bs4 import BeautifulSoup
from lxml import html
from sys import argv
profile_links = []
def result_checker(self):
No_results = self.xpath('//td[@colspan="3"]//p//text()')
if "No results" in str(No_results):
print (str(No_results).replace("['","").replace(".']","")+" for other zipcodes")
time.sleep(10)
sys.exit()
else:
pass
def Get_data(zipcode, radius):
headers = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding':'gzip, deflate',
'Accept-Language':'en-US,en;q=0.8,pt;q=0.6',
'Connection':'keep-alive',
'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
'Host':'www.tcms.com',
'Origin':'https://www.aae.org',
'Referer':'https://www.aae.org/patients/find.aspx'}
class MyOpener(urllib.request.FancyURLopener):
version = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17'
myopener = MyOpener()
url = 'https://www.aae.org/patients/find.aspx'
f = myopener.open(url)
soup = BeautifulSoup(f,'lxml')
viewstate = soup.select("#__VIEWSTATE")[0]['value']
eventvalidation = soup.select("#__EVENTVALIDATION")[0]['value']
EktronClientManager = soup.select("#EktronClientManager")[0]['value']
formData = (
('__EVENTVALIDATION', eventvalidation),
('__VIEWSTATE', viewstate),
('ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch','Search'),
('ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$ddlRadius', radius),
('ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$txtZipCode',zipcode),
('EktronClientManager',EktronClientManager),
('ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$btnFind','SEARCH'))
encodedFields = urllib.parse.urlencode(formData)
f1 = myopener.open(url, encodedFields)
source = f1.read()
target = open('sample.txt','w')
target.write(str(source))
target.close()
source1 = html.fromstring(source)
result_checker(source1)
links = source1.xpath("//table[@class='Results']//tr//a//@href")
for each in links:
if "MemberID" and "AddressID" in each:
print (each)
profile_links.append("https://www.aae.org/patients/"+str(each))
else:
pass
j = 2
soup2 = BeautifulSoup(source,'lxml')
viewstate = soup2.select("#__VIEWSTATE")[0]['value']
eventvalidation = soup2.select("#__EVENTVALIDATION")[0]['value']
while j < 5:
pages = 'Page$'+str(j)
print (pages,'\n---------------')
formData1 = (('__EVENTTARGET','ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$grdResults'),
('__EVENTARGUMENT',pages),
('__VIEWSTATE',viewstate),
('__EVENTVALIDATION',eventvalidation),
('ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch','Search'))
encodedFields1 = urllib.parse.urlencode(formData1)
f2 = myopener.open(url, encodedFields1)
source2 = f2.read()
target = open('sample.txt','w')
target.write(str(source2))
target.close()
source3 = html.fromstring(source2)
links2 = source3.xpath("//table[@class='Results']//tr//a//@href")
for each1 in links2:
if "MemberID" and "AddressID" in each1:
print (each1)
profile_links.append("https://www.aae.org/patients/"+str(each1))
else:
pass
soup3 = BeautifulSoup(source2,'lxml')
viewstate = soup3.select("#__VIEWSTATE")[0]['value']
eventvalidation = soup3.select("#__EVENTVALIDATION")[0]['value']
j+=1
if __name__ == "__main__":
#Get_data('38132', 5)
Get_data('33133', 100)
#/usr/bin/env python
#-*-编码:utf-8-*-
导入系统,re
导入urllib.request、urllib.parse、时间、csv
从bs4导入BeautifulSoup
从lxml导入html
从系统导入argv
配置文件链接=[]
def结果检查程序(自身):
No_results=self.xpath('//td[@colspan=“3”]//p//text()
如果str中“无结果”(无结果):
打印(str(无结果)。替换(“[”,”)。替换(“.]”,“)+”用于其他Zipcode)
时间。睡眠(10)
sys.exit()
其他:
通过
def Get_数据(zipcode,radius):
headers={'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
“接受编码”:“gzip,deflate”,
‘接受语言’:‘en-US,en;q=0.8,pt;q=0.6’,
“连接”:“保持活动状态”,
“内容类型”:“application/x-www-form-urlencoded;charset=UTF-8”,
‘主持人’:‘www.tcms.com’,
“来源”:https://www.aae.org',
“Referer”:https://www.aae.org/patients/find.aspx'}
类MyOpener(urllib.request.FancyURLopener):
版本='Mozilla/5.0(Windows NT 6.1)AppleWebKit/537.17(KHTML,类似Gecko)Chrome/24.0.1312.57 Safari/537.17'
myopener=myopener()
url='1〕https://www.aae.org/patients/find.aspx'
f=myopener.open(url)
汤=美汤(f,'lxml')
viewstate=soup。选择(“#uu viewstate”)[0]['value']
eventvalidation=soup。选择(“#u eventvalidation”)[0]['value']
EktronClientManager=soup。选择(“#EktronClientManager”)[0]['value']
formData=(
(“事件验证”,事件验证),
(“视图状态”,视图状态),
('ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch','Search'),
('ctl00$ctl00$cphContentTypes$cphPageContent$AAEFINDEDO$DDLDRADIUS',radius),
('ctl00$ctl00$cphContentTypes$cphPageContent$aaefindedo$txtZipCode',zipcode),
('EktronClientManager',EktronClientManager),
('ctl00$ctl00$cphContentTypes$cphPageContent$aaefinddo$btnFind','SEARCH'))
encodedFields=urllib.parse.urlencode(formData)
f1=myopener.open(url,encodedFields)
source=f1.read()
target=open('sample.txt','w')
target.write(str(源))
target.close()
source1=html.fromstring(源)
结果检查程序(源1)
links=source1.xpath(//table[@class='Results']///tr//a//@href)
对于每个链接:
如果“MemberID”和“AddressID”分别为:
打印(每个)
配置文件链接。附加(“https://www.aae.org/patients/“+str(每个))
其他:
通过
j=2
soup2=beautifulsou(源,“lxml”)
viewstate=soup2。选择(“#uu viewstate”)[0]['value']
eventvalidation=soup2。选择(“#u eventvalidation”)[0]['value']
当j<5时:
pages='Page$'+str(j)
打印(第页,'\n-----------------')
formData1=(“事件目标”,“ctl00$ctl00$cphContentTypes$cphPageContent$AAEFINDEDO$grdResults”),
(“事件参数”,第页),
(“视图状态”,视图状态),
(“事件验证”,事件验证),
('ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch','Search'))
encodedFields1=urllib.parse.urlencode(formData1)
f2=myopener.open(url,编码字段1)
source2=f2.read()
target=open('sample.txt','w')
target.write(str(source2))
target.close()
source3=html.fromstring(source2)
links2=source3.xpath(//table[@class='Results']///tr//a//@href)
对于链接2中的每个1:
如果each1中的“MemberID”和“AddressID”:
打印(each1)
配置文件链接。附加(“https://www.aae.org/patients/“+str(each1))
其他:
通过
soup3=BeautifulSoup(源代码2,'lxml')
viewstate=soup3。选择(“#uu viewstate”)[0]['value']
eventvalidation=soup3。选择(“#uu eventvalidation”)[0]['value']
j+=1
如果名称=“\uuuuu main\uuuuuuuu”:
#获取_数据('38132',5)
获取数据('33133',100)
是的，Greg Sadetsky，你对 cookie 的看法完全正确：需要先创建一个会话，然后让所有带数据参数的 POST 请求都通过这个会话发出。
在 Requests 库的帮助下，我创建了一个会话（Session），它可以在多个请求之间保存并复用 cookies。
import requests
from bs4 import BeautifulSoup
from requests import Request, Session
from lxml import html
def Get_data(zipcode, radius):
All_links = []
url = 'https://www.aae.org/patients/find.aspx'
s = requests.Session()
r = s.get(url)
#print (r.text.encode('utf-8'))
soup = BeautifulSoup(r.content,'lxml')
viewstate = soup.select("#__VIEWSTATE")[0]['value']
eventvalidation = soup.select("#__EVENTVALIDATION")[0]['value']
EktronClientManager = soup.select("#EktronClientManager")[0]['value']
params = {'EktronClientManager':EktronClientManager,
'__VIEWSTATE':viewstate,
'__EVENTVALIDATION':eventvalidation,
'ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch':'Search',
'ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$ddlRadius':radius,
'ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$txtZipCode':zipcode,
'ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$btnFind':'SEARCH'}
r2 = s.post(url,data=params)
source = html.fromstring(r2.content)
links = source.xpath("//table[@class='Results']//tr//a//@href")
for each in links:
if "MemberID" and "AddressID" in each:
print (each)
All_links.append("https://www.aae.org/patients/"+str(each))
#print (r2.content)
soup1 = BeautifulSoup(r2.content,'lxml')
viewstate = soup1.select("#__VIEWSTATE")[0]['value']
eventvalidation = soup1.select("#__EVENTVALIDATION")[0]['value']
EktronClientManager = soup1.select("#EktronClientManager")[0]['value']
j = 2
while j < 7:
page = 'Page$'+str(j)
print (page)
params1 = {'__EVENTTARGET':'ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$grdResults',
'__EVENTARGUMENT':page,
'EktronClientManager':EktronClientManager,
'__VIEWSTATE':viewstate,
'__EVENTVALIDATION':eventvalidation,
'ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch':'Search'}
r3 = s.post(url,data=params1)
source1 = html.fromstring(r3.content)
links1 = source1.xpath("//table[@class='Results']//tr//a//@href")
for each1 in links1:
if "MemberID" and "AddressID" in each1:
print (each1)
All_links.append("https://www.aae.org/patients/"+str(each1))
soup2 = BeautifulSoup(r3.content,'lxml')
viewstate = soup2.select("#__VIEWSTATE")[0]['value']
eventvalidation = soup2.select("#__EVENTVALIDATION")[0]['value']
EktronClientManager = soup2.select("#EktronClientManager")[0]['value']
j+=1
Get_data(33133, 100)
导入请求
从bs4导入BeautifulSoup
从请求导入请求,会话
从lxml导入html
def Get_数据(zipcode,radius):
所有链接=[]
url='1〕https://www.aae.org/patients/find.aspx'
s=请求。会话()
r=s.get(url)
#打印(r.text.encode('utf-8'))
汤=美汤(r.含量,'lxml')
viewstate=soup。选择(“#uu viewstate”)[0]['value']
eventvalidation=soup。选择(“#u eventvalidation”)[0]['value']
EktronClientManager=soup。选择(“#EktronClientManager”)[0]['value']
参数={'EktronClientManager':EktronClientManager,
“_VIEWSTATE”:VIEWSTATE,
“\uu EVENTVALIDATION”:EVENTVALIDATION,
'ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch':'Search',
“ctl00$ctl00$cphContentTypes$cphPageContent$AAEFINDEDO$DDLDRADIUS”:半径,
“ctl00$ctl00$cphContentTypes$cphPageContent$aaefindenddo$txtZipCode”:zipcode,
“ctl00$ctl00$cphContentTypes$cphPageContent$aaefinddo$btnFind':'SEARCH'}
r2=s.post(url,数据=params)
source=html.fromstring(r2.content)
links=source.xpath(“//table[@class='Results']///tr//a//@href”)
对于每个链接:
如果“会员”