Python 刮黄页
我正试图从people.yellowpages.com中获取数据,我只需要电子邮件、电话和地址。我最近一直在编写这段代码,它为与业务相关的组织工作。但当涉及到搜索个人数据时,它不起作用。任何人都可以帮我找出我做错了什么 注意:我需要从people.yellowpages.com中刮取个人数据。当我试图运行这个程序时,它经历了一个循环,然后出错Python 刮黄页,python,web-scraping,beautifulsoup,yellow-pages,Python,Web Scraping,Beautifulsoup,Yellow Pages,我正试图从people.yellowpages.com中获取数据,我只需要电子邮件、电话和地址。我最近一直在编写这段代码,它为与业务相关的组织工作。但当涉及到搜索个人数据时,它不起作用。任何人都可以帮我找出我做错了什么 注意:我需要从people.yellowpages.com中刮取个人数据。当我试图运行这个程序时,它经历了一个循环,然后出错 import requests from lxml import html import unicodecsv as csv import argpars
import requests
from lxml import html
import unicodecsv as csv
import argparse
import time
def parse_listing(keyword):
"""
Function to process
: param keyword: search query
: param place : place name
"""
url = "https://people.yellowpages.com/whitepages/?last_name={}".format(keyword)
print("retrieving ",url)
headers = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'en-GB,en;q=0.9,en-US;q=0.8,ml;q=0.7',
'Cache-Control':'max-age=0',
'Connection':'keep-alive',
'Host':'www.yellowpages.com',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
}
# Adding retries
print("helllo")
for retry in range(10):
try:
response = requests.get(url,verify=False, headers = headers )
print("parsing page")
print(response)
sleep(10)
if response.status_code==200:
parser = html.fromstring(response.text)
#making links absolute
base_url = "https://people.yellowpages.com/whitepages/?last_name={}".format(keyword)
parser.make_links_absolute(base_url)
print(base_url)
XPATH_LISTINGS = "//div[@class='main-content']//div[@class='phone-result']"
listings = parser.xpath(XPATH_LISTINGS)
scraped_results = []
print("wait")
for results in listings:
XPATH_fullname = ".//a[@class='fullname']//text()"
XPATH_phone = ".//div[@itemprop='phone']//text()"
XPATH_address = ".//div[@class='info']//div//p[@itemprop='address']"
#XPATH_AGE = "//*[@id="center"]/div[1]/div/div[1]/div[3]/p"
raw_fullname = results.xpath(XPATH_fullname)
raw_phone = results.xpath(XPATH_phone)
#raw_AGE = results.xpath(XPATH_AGE)
raw_address = results.xpath(XPATH_address)
print("worked")
fullname = ''.join(raw_fullname).strip() if raw_fullname else None
phone = ''.join(raw_phone).strip() if raw_phone else None
address = ''.join(raw_address).strip() if raw_address else None
#age = ''.join(raw_AGE).strip() if raw_zip_code else None
business_details = {
'name':fullname,
'telephone':phone,
'address':address,
#'age':AGE,
'listing_url':response.url
}
scraped_results.append(business_details)
return scraped_results
print(business_details)
elif response.status_code==404:
print("Could not find a location matching",keyword)
#no need to retry for non existing page
break
else:
print("Failed to process page")
return []
except:
print("Failed to process page")
return []
if __name__=="__main__":
argparser = argparse.ArgumentParser()
argparser.add_argument('keyword',help = 'keyword')
#argparser.add_argument('place',help = 'Place Name')
args = argparser.parse_args()
keyword = args.keyword
#place = args.place
scraped_data = parse_listing(keyword,)
if scraped_data:
print("Writing scraped data to %s-%s-scraped-data.csv"%(keyword))
with open('%s-%s-scraped-data.csv'%(keyword,),'wb') as csvfile:
fieldnames = ['NAME','telephone','ADDRESS','listing_url']
writer = csv.DictWriter(csvfile,fieldnames = fieldnames,quoting=csv.QUOTE_ALL)
writer.writeheader()
for data in scraped_data:
writer.writerow(data)
永远不要这样做:
except:
您必须始终指定某些例外情况。
让我们尝试手动运行请求。获取:
(Pdb) requests.get(url,verify=False, headers = headers )
/usr/lib/python3.7/site-packages/urllib3/connectionpool.py:847: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
InsecureRequestWarning)
....
*** requests.exceptions.TooManyRedirects: Exceeded 30 redirects.
查看错误:requests.exceptions.TooManyRedirects:超过30个重定向
让我们尝试在不允许重定向的情况下获取:
(Pdb) response = requests.get(url,verify=False, headers = headers, allow_redirects=False)
/usr/lib/python3.7/site-packages/urllib3/connectionpool.py:847: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
InsecureRequestWarning)
(Pdb) response
<Response [301]>
(Pdb) response.headers
{'Date': 'Mon, 18 Nov 2019 09:09:35 GMT', 'Content-Type': 'text/html', 'Content-Length': '178', 'Connection': 'keep-alive', 'Location': 'https://people.yellowpages.com/whitepages/?last_name=john', 'Set-Cookie': 'TS0145ce01=01d0bb65df96e04f8ea20dfc3b81c2fbe967f216df827b11fbedaa89ee06a10f05ae6a0759; Path=/'}
(Pdb) url
'https://people.yellowpages.com/whitepages/?last_name=john'
(Pdb) response.headers["Location"]
'https://people.yellowpages.com/whitepages/?last_name=john'
在标题中
'Host':'www.yellowpages.com',