Python Web scrape html加油站
鉴于这个链接,我想刮每个加油站及其信息Python Web scrape html加油站,python,web-scraping,beautifulsoup,Python,Web Scraping,Beautifulsoup,鉴于这个链接,我想刮每个加油站及其信息 import requests from bs4 import BeautifulSoup page=requests.get("http://www.nj.gov/treasury/administration/statewide-support/motor-fuel-locations.shtml") soup=BeautifulSoup(page.content,'html.parser') for x in soup.find_all('p'):
import requests
from bs4 import BeautifulSoup

# Fetch the NJ motor-fuel locations page and dump every <p> element.
page = requests.get("http://www.nj.gov/treasury/administration/statewide-support/motor-fuel-locations.shtml")
soup = BeautifulSoup(page.content, 'html.parser')
for x in soup.find_all('p'):
    # The original used the Python 2 print statement ("print x"),
    # which is a SyntaxError on Python 3; use the print() function.
    print(x)
提取正确数据的下一步是什么?更新
这是基于@Dan-Dev回答的最终代码。它有点像临时拼凑的方案(hack)……请原谅,我没有时间写更简洁的代码。
import re
import requests
from bs4 import BeautifulSoup
from pprint import pprint
# Compiled once at import time rather than on every call.
_PHONE_RE = re.compile(
    r"(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}"  # 555 123 4567 / 555-123-4567
    r"|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}"    # (555) 123-4567
    r"|\d{3}[-\.\s]??\d{4})"               # 123-4567
)


def is_phone_number(txt):
    """Return a match object if *txt* starts with a US phone-number pattern, else None."""
    return _PHONE_RE.match(txt)
def is_gas_type(txt):
    """True when *txt* names a fuel product (matches 'lead'/'unleaded' or 'diesel')."""
    lowered = txt.lower()
    return 'lead' in lowered or 'diesel' in lowered
def is_lat_lon(txt):
    """True when *txt* looks like a coordinate line (mentions 'lat' or 'lon')."""
    lowered = txt.lower()
    return 'lat' in lowered or 'lon' in lowered
def is_hour_of_operation(txt):
    """True when *txt* looks like an hours-of-operation line."""
    lowered = txt.lower()
    for marker in ('24 hrs', ' am ', ' pm ', 'm-f'):
        if marker in lowered:
            return True
    return False
def str_colon_list_to_str_float_dict(rlist):
    """["a:1.0", "b:2.0"] => {"a":1.0, "b":2.0}"""
    # The original called dict.iteritems(), which was removed in Python 3
    # (AttributeError at runtime).  Build the result in a single
    # comprehension instead of a map + intermediate dict.
    return {key: float(value)
            for key, value in (item.split(':') for item in rlist)}
page = requests.get("http://www.nj.gov/treasury/administration/statewide-support/motor-fuel-locations.shtml")
soup = BeautifulSoup(page.content, 'html.parser')
# The station listing is the 6th <table> on the page.
table = soup.find_all('table')[5]

gas_stations = []
for x in table.find_all('td', {'valign': 'top', 'colspan': None, 'width': None}):
    gas_station = []
    # Split the cell text on line breaks, then collapse internal whitespace.
    for y in x.text.splitlines():
        line = ' '.join(y.split())
        gas_station.append(line)
    # Skip cells whose lines are all empty strings.
    if not ('' in set(gas_station) and len(set(gas_station)) == 1):
        gas_stations.append(gas_station)

gas_stations_dict = {}
for gas_station in gas_stations:
    gas_station_dict = {}
    address_list = []
    lat_long_list = []
    for i, g in enumerate(gas_station):
        # NOTE: the original called g.encode("utf-8") here (a Python 2
        # leftover); on Python 3 that yields bytes and breaks the
        # str-based classifier helpers below, so the text is kept as str.
        if i == 0:
            # First line of each cell is the station name.
            gas_station_dict['Name'] = g
        elif is_phone_number(g):
            gas_station_dict['Phone Number'] = g
        elif is_lat_lon(g):
            lat_long_list.append(g)
        elif is_gas_type(g):
            gas_station_dict['Gas Type'] = g
        elif is_hour_of_operation(g):
            gas_station_dict['Hours of Operation'] = g
        else:
            # Anything unclassified is assumed to be part of the address.
            address_list.append(g)
    gas_station_dict['Coordinates'] = str_colon_list_to_str_float_dict(lat_long_list)
    gas_station_dict['Address'] = ' '.join(address_list)
    gas_stations_dict[gas_station_dict['Name']] = gas_station_dict
pprint(gas_stations_dict)
结果:
我以前的回答是:
我试着使用选择器小工具(SelectorGadget),就像我在评论中提到的那样,但我在HTML中没有找到任何一致的模式来抓取所有的加油站名称。我抓取过很多网站,很多政府网站都有同样的问题。我不知道这是无心之失还是故意防止抓取……无论如何,这里有一些可以打印出部分信息的代码:
import requests
from bs4 import BeautifulSoup, NavigableString

# Print the text of every <p>, descending one level into child tags.
page = requests.get("http://www.nj.gov/treasury/administration/statewide-support/motor-fuel-locations.shtml")
soup = BeautifulSoup(page.content, 'html.parser')
for x in soup.find_all('p'):
    for y in x:
        if isinstance(y, NavigableString):
            # Python 3: print() function, and no .encode("utf-8") --
            # printing bytes would show b'...' reprs instead of text.
            print(y)
        else:
            for z in y:
                if isinstance(z, NavigableString):
                    print(z)
从这一点上,您可以根据需要的信息对其进行修改。只要看一眼,似乎每个车站组的最后一行就是经度
最后,完成后,我会仔细查看它,以确保您拥有所需的所有信息。例如,当您输入find_all p标记时,Folsom DOT不会被提取。更新
这是基于@Dan-Dev回答的最终代码。它有点像临时拼凑的方案(hack)……请原谅,我没有时间写更简洁的代码。
import re
import requests
from bs4 import BeautifulSoup
from pprint import pprint
# Compiled once at import time rather than on every call.
_PHONE_RE = re.compile(
    r"(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}"  # 555 123 4567 / 555-123-4567
    r"|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}"    # (555) 123-4567
    r"|\d{3}[-\.\s]??\d{4})"               # 123-4567
)


def is_phone_number(txt):
    """Return a match object if *txt* starts with a US phone-number pattern, else None."""
    return _PHONE_RE.match(txt)
def is_gas_type(txt):
    """True when *txt* names a fuel product (matches 'lead'/'unleaded' or 'diesel')."""
    lowered = txt.lower()
    return 'lead' in lowered or 'diesel' in lowered
def is_lat_lon(txt):
    """True when *txt* looks like a coordinate line (mentions 'lat' or 'lon')."""
    lowered = txt.lower()
    return 'lat' in lowered or 'lon' in lowered
def is_hour_of_operation(txt):
    """True when *txt* looks like an hours-of-operation line."""
    lowered = txt.lower()
    for marker in ('24 hrs', ' am ', ' pm ', 'm-f'):
        if marker in lowered:
            return True
    return False
def str_colon_list_to_str_float_dict(rlist):
    """["a:1.0", "b:2.0"] => {"a":1.0, "b":2.0}"""
    # The original called dict.iteritems(), which was removed in Python 3
    # (AttributeError at runtime).  Build the result in a single
    # comprehension instead of a map + intermediate dict.
    return {key: float(value)
            for key, value in (item.split(':') for item in rlist)}
page = requests.get("http://www.nj.gov/treasury/administration/statewide-support/motor-fuel-locations.shtml")
soup = BeautifulSoup(page.content, 'html.parser')
# The station listing is the 6th <table> on the page.
table = soup.find_all('table')[5]

gas_stations = []
for x in table.find_all('td', {'valign': 'top', 'colspan': None, 'width': None}):
    gas_station = []
    # Split the cell text on line breaks, then collapse internal whitespace.
    for y in x.text.splitlines():
        line = ' '.join(y.split())
        gas_station.append(line)
    # Skip cells whose lines are all empty strings.
    if not ('' in set(gas_station) and len(set(gas_station)) == 1):
        gas_stations.append(gas_station)

gas_stations_dict = {}
for gas_station in gas_stations:
    gas_station_dict = {}
    address_list = []
    lat_long_list = []
    for i, g in enumerate(gas_station):
        # NOTE: the original called g.encode("utf-8") here (a Python 2
        # leftover); on Python 3 that yields bytes and breaks the
        # str-based classifier helpers below, so the text is kept as str.
        if i == 0:
            # First line of each cell is the station name.
            gas_station_dict['Name'] = g
        elif is_phone_number(g):
            gas_station_dict['Phone Number'] = g
        elif is_lat_lon(g):
            lat_long_list.append(g)
        elif is_gas_type(g):
            gas_station_dict['Gas Type'] = g
        elif is_hour_of_operation(g):
            gas_station_dict['Hours of Operation'] = g
        else:
            # Anything unclassified is assumed to be part of the address.
            address_list.append(g)
    gas_station_dict['Coordinates'] = str_colon_list_to_str_float_dict(lat_long_list)
    gas_station_dict['Address'] = ' '.join(address_list)
    gas_stations_dict[gas_station_dict['Name']] = gas_station_dict
pprint(gas_stations_dict)
结果:
我以前的回答是:
我试着使用选择器小工具(SelectorGadget),就像我在评论中提到的那样,但我在HTML中没有找到任何一致的模式来抓取所有的加油站名称。我抓取过很多网站,很多政府网站都有同样的问题。我不知道这是无心之失还是故意防止抓取……无论如何,这里有一些可以打印出部分信息的代码:
import requests
from bs4 import BeautifulSoup, NavigableString

# Print the text of every <p>, descending one level into child tags.
page = requests.get("http://www.nj.gov/treasury/administration/statewide-support/motor-fuel-locations.shtml")
soup = BeautifulSoup(page.content, 'html.parser')
for x in soup.find_all('p'):
    for y in x:
        if isinstance(y, NavigableString):
            # Python 3: print() function, and no .encode("utf-8") --
            # printing bytes would show b'...' reprs instead of text.
            print(y)
        else:
            for z in y:
                if isinstance(z, NavigableString):
                    print(z)
从这一点上,您可以根据需要的信息对其进行修改。只要看一眼,似乎每个车站组的最后一行就是经度
最后,完成后,我会仔细查看它,以确保您拥有所需的所有信息。例如,当您输入find_all p标记时,Folsom DOT不会被拉取。您可以从给定的HTML中选择的内容很少。因此,您可以取消选择您不想要的内容。如果需要第6个表,则不需要具有colspan属性或任何宽度属性的td元素。但td元素必须具有valign top
import requests
from bs4 import BeautifulSoup

# Fetch the page, take the 6th table, and print each station cell's text.
# Cells with a colspan or an explicit width are layout artifacts, so only
# plain valign="top" cells are selected; a marker line separates stations.
resp = requests.get("http://www.nj.gov/treasury/administration/statewide-support/motor-fuel-locations.shtml")
doc = BeautifulSoup(resp.content, 'html.parser')
station_table = doc.find_all('table')[5]
for cell in station_table.find_all('td', {'valign': 'top', 'colspan': None, 'width': None}):
    print(cell.text)
    print('#############')
根据评论更新
import requests
from bs4 import BeautifulSoup
import pprint
import re

page = requests.get("http://www.nj.gov/treasury/administration/statewide-support/motor-fuel-locations.shtml")
soup = BeautifulSoup(page.content, 'html.parser')
# The station listing is the 6th <table> on the page.
table = soup.find_all('table')[5]
tmps = [i.text for i in table.find_all('td', {'valign': 'top', 'colspan': None, 'width': None})]

my_dict = dict()
for tmp in tmps:
    if len(tmp.strip()) != 0:
        # Key: first line of the cell (the station name); value: the whole
        # cell with whitespace runs collapsed.  Raw strings for the regex
        # patterns -- '\s' in a plain literal is an invalid escape
        # sequence in modern Python.
        name = re.sub(r'\s+', ' ', tmp.split('\n', 1)[0]).strip()
        my_dict[name] = re.sub(r'\s+', ' ', tmp).strip()

pp = pprint.PrettyPrinter(indent=4)
pp.pprint(my_dict)
产出:
{ 'Bayside Facility': 'Bayside Facility 4294 Rt. 47 Leesburg Open 24 Hrs '
'Unleaded / Diesel 856-785-0040 X-5429 Latitude: '
'39.23339997 Longitude: -74.96568202',
'Bedminster DOT': 'Bedminster DOT 455 Rt. 202/206 South Pluckemin Open 24 '
'Hrs Unleaded / Diesel 908-234-2130 Latitude: '
'40.65123677 Longitude: -74.64499021',
'Berlin DOT': 'Berlin DOT 36 Walker Ave. Berlin Open 24 Hrs Unleaded / '
'Diesel 856-767-7717 Latitude: 39.80369329 Longitude: '
'-74.93442722',
'Bloomsbury DOT': 'Bloomsbury DOT 1000 Rt. 173 Bloomsbury Open 24 Hrs '
'Unleaded / Diesel 908-479-4851 Latitude: 40.66078600 '
'Longitude: -75.06664165',
'Bordentown DOT': 'Bordentown DOT Dunns Mill Rd. -off Rt. 130 Bordentown '
'Unleaded -Open 24 Hrs Diesel – 7:30 am -3:45 pm M-F '
'609-298-2980 Latitude: 40.13178135 Longitude: '
'-74.71658907',
...
但它错过了两个加油站:
<td valign="top" width="235"><p>Elizabeth DOT<br />
<td align="top" width="264">Summit DOT<br />
由于它们具有宽度属性,因此对于HTML,也无法选择这些属性。对于给定的HTML,您几乎没有什么可选择的。因此,您可以取消选择您不想要的内容。如果需要第6个表,则不需要具有colspan属性或任何宽度属性的td元素。但td元素必须具有valign top
import requests
from bs4 import BeautifulSoup

# Fetch the page, take the 6th table, and print each station cell's text.
# Cells with a colspan or an explicit width are layout artifacts, so only
# plain valign="top" cells are selected; a marker line separates stations.
resp = requests.get("http://www.nj.gov/treasury/administration/statewide-support/motor-fuel-locations.shtml")
doc = BeautifulSoup(resp.content, 'html.parser')
station_table = doc.find_all('table')[5]
for cell in station_table.find_all('td', {'valign': 'top', 'colspan': None, 'width': None}):
    print(cell.text)
    print('#############')
根据评论更新
import requests
from bs4 import BeautifulSoup
import pprint
import re

page = requests.get("http://www.nj.gov/treasury/administration/statewide-support/motor-fuel-locations.shtml")
soup = BeautifulSoup(page.content, 'html.parser')
# The station listing is the 6th <table> on the page.
table = soup.find_all('table')[5]
tmps = [i.text for i in table.find_all('td', {'valign': 'top', 'colspan': None, 'width': None})]

my_dict = dict()
for tmp in tmps:
    if len(tmp.strip()) != 0:
        # Key: first line of the cell (the station name); value: the whole
        # cell with whitespace runs collapsed.  Raw strings for the regex
        # patterns -- '\s' in a plain literal is an invalid escape
        # sequence in modern Python.
        name = re.sub(r'\s+', ' ', tmp.split('\n', 1)[0]).strip()
        my_dict[name] = re.sub(r'\s+', ' ', tmp).strip()

pp = pprint.PrettyPrinter(indent=4)
pp.pprint(my_dict)
产出:
{ 'Bayside Facility': 'Bayside Facility 4294 Rt. 47 Leesburg Open 24 Hrs '
'Unleaded / Diesel 856-785-0040 X-5429 Latitude: '
'39.23339997 Longitude: -74.96568202',
'Bedminster DOT': 'Bedminster DOT 455 Rt. 202/206 South Pluckemin Open 24 '
'Hrs Unleaded / Diesel 908-234-2130 Latitude: '
'40.65123677 Longitude: -74.64499021',
'Berlin DOT': 'Berlin DOT 36 Walker Ave. Berlin Open 24 Hrs Unleaded / '
'Diesel 856-767-7717 Latitude: 39.80369329 Longitude: '
'-74.93442722',
'Bloomsbury DOT': 'Bloomsbury DOT 1000 Rt. 173 Bloomsbury Open 24 Hrs '
'Unleaded / Diesel 908-479-4851 Latitude: 40.66078600 '
'Longitude: -75.06664165',
'Bordentown DOT': 'Bordentown DOT Dunns Mill Rd. -off Rt. 130 Bordentown '
'Unleaded -Open 24 Hrs Diesel – 7:30 am -3:45 pm M-F '
'609-298-2980 Latitude: 40.13178135 Longitude: '
'-74.71658907',
...
但它错过了两个加油站:
<td valign="top" width="235"><p>Elizabeth DOT<br />
<td align="top" width="264">Summit DOT<br />
由于它们具有宽度属性,因此对于HTML,也无法选择这些属性。使用浏览器扩展,它将指出您需要的HTML元素。它会给你找到的确切标签。我建议使用这个:我需要用python来完成…@ziggy我们不是这么说的。他说,要确定在python代码中需要提取哪些标记,请使用一个工具来帮助您查看这些标记是什么。使用一个浏览器扩展,它将指出您需要的html元素。它会给你找到的确切标签。我建议使用这个:我需要用python来完成…@ziggy我们不是这么说的。他说,为了确定需要在python代码中提取哪些标记,请使用一个工具来帮助您查看这些标记是什么。当Ranyea idk再次出现时,这没有为我返回任何信息…它是否为您返回信息?奇怪。。。至少有一个错误。什么操作系统、IDE等?windows 10、pyscripter、无错误、Python2.7I将代码放入文件中,并在命令行上运行。我有一个Mac和Python 2.7。你能试着从命令行运行它吗?对于Windows,我的记忆有点模糊,但是如果您还没有将Python解释器添加到路径中,那么您必须将其添加到路径中。接下来,将代码保存在一个文件中,并在命令行中键入python filename.py。当Ranyea idk再次出现时,这没有为我返回任何信息…它是否为您返回信息?奇怪。。。至少有一个错误。什么操作系统、IDE等?windows 10、pyscripter、无错误、Python2.7I将代码放入文件中,并在命令行上运行。我有一个Mac和Python 2.7。你能试着从命令行运行它吗?对于Windows,我的记忆有点模糊,但是如果您还没有将Python解释器添加到路径中,那么您必须将其添加到路径中。接下来,将代码保存在一个文件中,并在命令行中键入python filename.py。哇,这很好。好的,最后一件事,如果我想将每个加油站的名称存储在一个字典中,其中加油站的名称是键,其余的值我将如何合并?哇,这很好,好的,最后一件事,如果我想将每个加油站存储在一个字典中,其中加油站的名称为键,其余的值为键,我将如何合并这些值?