Python 如何使用bs4获取多个嵌套的div值并以json格式输出?
标签:python、html、json、web-scraping、bs4
我想要抓取的 HTML 数据如下所示,它由许多嵌套的 div 标签组成:
<!-- Sample markup: one #hour3 container with one .hour3 block per day;
     each day holds a time row and a temperature row, and the first cell
     of every row is its label. -->
<div id="hour3">
  <div id="day0" class="hour3">
    <div class="row first">
      <div class="label">Time</div>
      <div style="font-size: 12px;">14:00</div>
      <div style="font-size: 12px;">17:00</div>
    </div>
    <div class="row wd">
      <div class="label h3_wd">Temperature</div>
      <div>27.5℃ </div>
      <div>27.8℃ </div>
    </div>
  </div>
  <div id="day1" class="hour3">
    <div class="row first">
      <div class="label">Time</div>
      <div style="font-size: 12px;">8:00</div>
      <div style="font-size: 12px;">11:00</div>
    </div>
    <div class="row wd">
      <div class="label h3_wd">Temperature</div>
      <div>27.5℃ </div>
      <div>27.8℃ </div>
    </div>
  </div>
</div>
在网站上,它显示为一张表格:
day0 day1
Time 14:00 17:00 08:00 11:00
Temperature 27.5℃ 27.8℃ 27.5℃ 27.8℃
我目前的代码是:
# Asker's attempt. Python 2 only: it relies on urllib2 and the print statement.
# NOTE(review): leading indentation was lost in this copy of the snippet; the
# three `for` lines are presumably nested inside one another - confirm against
# the original post before running.
import time, re
import urllib2
from bs4 import BeautifulSoup
# start_time is recorded here but never read in the visible snippet.
start_time = time.time()
url = 'some url'
# Download the page and parse it with the 'html.parser' builder.
html = urllib2.urlopen(url).read()
soup = BeautifulSoup(html,'html.parser')
# Calling a Tag (datas(...)) is bs4 shorthand for datas.find_all(...).
for datas in soup.findAll('div', attrs = {'id':'hour3'}):
for dates in datas('div',{'class':'row first'}):
for temp in datas('div',{'class':'row wd'}):
# The key is hard-coded to 'day0', and .text is taken from the whole row, so
# each value holds the row's entire text (label plus every cell) instead of
# one entry per column.
result = {
'day0':[
{
'date' : dates.text.strip(),
'temperature' : temp.text.strip()
}
]
}
print result
得到的输出是:
{'day0': [{'date': u'Description 1\n \n\n 14:00\n \n\n 17:00\n \n\n 08:00\n \n\n 11:00\n, 'temperature': 27.5\u2103 \n \n\n 27.8\u2103 \n \n\n 27.5\u2103 \n \n\n 27.8\u2103 \n \n\n}]}
如何才能得到所需格式的数据?
# Parse the sample markup and build
#   {'day0': [{'time': ..., 'temperature': ...}, ...], 'day1': [...], ...}
# with one entry per column of each day's time/temperature rows.
from bs4 import BeautifulSoup

html_doc='''<div id="hour3">
<div id="day0" class="hour3">
<div class="row first">
<div class="label">Time</div>
<div style="font-size: 12px;">14:00</div>
<div style="font-size: 12px;">17:00</div>
</div>
<div class="row wd">
<div class="label h3_wd">Temperature</div>
<div>27.5 </div>
<div>27.8 </div>
</div>
<div id="day1" class="hour3">
<div class="row first">
<div class="label">Time</div>
<div style="font-size: 12px;">8:00</div>
<div style="font-size: 12px;">11:00</div>
</div>
<div class="row wd">
<div class="label h3_wd">Temperature</div>
<div>27.5 </div>
<div>27.8 </div>
</div>'''

soup = BeautifulSoup(html_doc, 'html.parser')
result = {}
for container in soup.find_all('div', attrs={'id': 'hour3'}):
    time_rows = container.find_all('div', {'class': 'row first'})
    temp_rows = container.find_all('div', {'class': 'row wd'})
    # Pair the i-th time row with the i-th temperature row. Looping over the
    # two lists independently would combine every time row with every
    # temperature row and mix values from different days.
    for time_row, temp_row in zip(time_rows, temp_rows):
        # split()[1:] drops the one-word row label ("Time" / "Temperature").
        times = time_row.text.split()[1:]
        temps = temp_row.text.split()[1:]
        # len(result) numbers the days 0, 1, 2, ... in document order.
        result['day' + str(len(result))] = [
            {'time': slot, 'temperature': reading}
            # zip stops at the shorter row, so uneven rows cannot raise
            # IndexError.
            for slot, reading in zip(times, temps)
        ]
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(result)
时间
14:00
17:00
温度
27.5
27.8
时间
8:00
11:00
温度
27.5
27.8
'''
从bs4导入BeautifulSoup
soup=BeautifulSoup(html_doc,'html.parser')
结果={}
天数=0
对于soup.findAll('div',attrs={id':'hour3'})中的数据:
对于数据中的日期('div',{'class':'row first'}):
对于数据中的临时('div',{'class':'row wd'}):
lst_of_time=dates.text.split()[1:]
lst_of_temp=temp.text.split()[1:]
计数=0
结果['day'+str(days_conut)]=[]
对于温度的第一个温度中的i:
结果['day'+str(days_conut)]。追加({'time':lst_of_time[count],'temperature':lst_of_temp[count]})
计数=计数+1
天数=天数+1
打印结果
# Parse the sample markup and build
#   {'day0': [{'time': ..., 'temperature': ...}, ...], 'day1': [...], ...}
# with one entry per column of each day's time/temperature rows.
from bs4 import BeautifulSoup

html_doc='''<div id="hour3">
<div id="day0" class="hour3">
<div class="row first">
<div class="label">Time</div>
<div style="font-size: 12px;">14:00</div>
<div style="font-size: 12px;">17:00</div>
</div>
<div class="row wd">
<div class="label h3_wd">Temperature</div>
<div>27.5 </div>
<div>27.8 </div>
</div>
<div id="day1" class="hour3">
<div class="row first">
<div class="label">Time</div>
<div style="font-size: 12px;">8:00</div>
<div style="font-size: 12px;">11:00</div>
</div>
<div class="row wd">
<div class="label h3_wd">Temperature</div>
<div>27.5 </div>
<div>27.8 </div>
</div>'''

soup = BeautifulSoup(html_doc, 'html.parser')
result = {}
for container in soup.find_all('div', attrs={'id': 'hour3'}):
    time_rows = container.find_all('div', {'class': 'row first'})
    temp_rows = container.find_all('div', {'class': 'row wd'})
    # Pair the i-th time row with the i-th temperature row. Looping over the
    # two lists independently would combine every time row with every
    # temperature row and mix values from different days.
    for time_row, temp_row in zip(time_rows, temp_rows):
        # split()[1:] drops the one-word row label ("Time" / "Temperature").
        times = time_row.text.split()[1:]
        temps = temp_row.text.split()[1:]
        # len(result) numbers the days 0, 1, 2, ... in document order.
        result['day' + str(len(result))] = [
            {'time': slot, 'temperature': reading}
            # zip stops at the shorter row, so uneven rows cannot raise
            # IndexError.
            for slot, reading in zip(times, temps)
        ]
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(result)
...
# Build {day_id: [{'Time': ..., 'Temperature': ...}, ...]} keyed by each day
# block's id attribute.
# NOTE(review): `html` and BeautifulSoup presumably come from the elided code
# above (the page source and the bs4 import) - confirm.
soup = BeautifulSoup(html, 'html.parser')
result = {}
for day in soup.find_all('div', attrs={'class': 'hour3'}):
    time_row = day.find('div', {'class': 'row first'})
    temp_row = day.find('div', {'class': 'row wd'})
    if time_row is None or temp_row is None:
        # find() returns None when a row is absent; such a day block has
        # nothing to pair up, so skip it instead of raising AttributeError.
        continue
    times = time_row.find_all('div')
    temps = temp_row.find_all('div')
    # [1:] skips the header cell ("Time" / "Temperature") of each row.
    result[day.get('id')] = [
        {
            # strip=True removes the whitespace around each cell's text.
            'Time': t.get_text(strip=True),
            'Temperature': temp.get_text(strip=True),
        }
        for t, temp in zip(times[1:], temps[1:])
    ]
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(result)