如何使用Python提取以下HTML片段
我有以下Python代码:如何使用Python提取以下HTML片段,python,html,regex,Python,Html,Regex,我有以下Python代码: def getAddress(text): text = re.sub('\t', '', text) text = re.sub('\n', '', text) blocks = re.findall('<div class="result-box" itemscope itemtype="http://schema.org/LocalBusiness">([a-zA-Z0-9 ",;:\.#&_=()\'<>
def _first_field(pattern, block):
    """Return the first capture of *pattern* in *block* with tags stripped, or ''."""
    match = re.search(pattern, block)
    if match is None:
        return ''
    # Drop any markup that ended up inside the captured value.
    return re.sub(r'<[^<]+?>', '', match.group(1))


def getAddress(text):
    """Extract one address row per "result-box" listing found in *text*.

    Each row has the form ``"name";"street";locality;postal_code;region\\n``.

    The listing pattern uses a non-greedy quantifier so that a match stops
    at the first "Follow company" marker; a greedy quantifier runs to the
    last marker and merges several listings into a single match.

    Fields are looked up fresh for every listing, so a listing that lacks
    a field yields an empty string instead of inheriting the value of the
    previous listing.

    Args:
        text: HTML source of the result page.

    Returns:
        A list of address rows, one per listing, in document order.
        (The original built each row and discarded it; returning the rows
        is an addition for callers that want them.)
    """
    # Tabs and newlines are removed up front so listings can be matched
    # as a single run of characters.
    text = re.sub(r'[\t\n]', '', text)
    blocks = re.findall(
        r'<div class="result-box" itemscope itemtype="http://schema.org/LocalBusiness">'
        r'([a-zA-Z0-9 ",;:.#&_=()\'<>/\t\n\-]*?)'
        r'</span>Follow company</span>',
        text)

    addresses = []
    for i, block in enumerate(blocks):
        name = _first_field(r'class="url">(.*?)</a>', block)
        strasse = _first_field(
            r'<span itemprop="streetAddress">([a-zA-Z0-9 ,;:.&#]*)</span>', block)
        locality = _first_field(
            r'<span itemprop="addressLocality">([a-zA-Z0-9 ,;:&]*)</span>', block)
        plz = _first_field(r'<span itemprop="postalCode">([0-9]*)</span>', block)
        region = _first_field(
            r'<span itemprop="addressRegion">([a-zA-Z]*)</span>', block)

        # Progress counter; the parenthesised form behaves the same under
        # Python 2 and Python 3 for a single argument.
        print(i)

        if plz == '':
            # getZipCode is defined elsewhere in the project.
            plz = getZipCode(strasse, locality, region)

        address = ('"' + name + '"' + ';' + '"' + strasse + '";' + locality
                   + ';' + str(plz) + ';' + region + '\n')
        addresses.append(address)
        # NOTE: the original had saveToCSV(address) disabled at this point;
        # callers can persist the returned rows instead.
    return addresses
我想提取下面这个HTML片段。这个片段在页面中重复出现了好几次。我希望函数为每个片段返回一个条目,但它却返回了一个同时包含两个片段的条目。我需要修改什么?
<!-- Sample input: one listing ("result-box" div) carrying schema.org
     LocalBusiness / PostalAddress microdata. This listing has
     addressLocality and addressRegion spans but no streetAddress or
     postalCode span. The fragment is cut from a larger list: the trailing
     </li> <li> ... belongs to surrounding markup that is not shown. -->
<div class="result-box" itemscope itemtype="http://schema.org/LocalBusiness">
<div class="clear">
<h2 itemprop="name"><a href="http://www.manta.com/c/mxlk5yt/belgium-jewelers-corp" class="url">Belgium Jewelers Corp</a></h2> </div>
<div itemprop="address" itemscope itemtype="http://schema.org/PostalAddress"> <span itemprop="addressLocality">Lawrenceville</span> <span itemprop="addressRegion">NJ</span>
</div> <a href="#" class="followCompany" data-emid="mxlk5yt" data-companyname="Belgium Jewelers Corp" data-location="ListingFollowButton" data-location-page="Megabrowse">
<span class="followMsg"><span class="followIcon mrs"></span>Follow company</span>
<span class="followingMsg"><span class="followIcon mrs"></span>Following</span>
<span class="unfollowMsg"><span class="followIcon mrs"></span>Unfollow company</span>
</a> <p class="type">Jewelry Stores</p> </div>
</li> <li> <div class="icons">
<ul> </ul>
</div>
您应该看看针对Python的HTMLParser。众所周知,正则表达式不适合解析HTML。您应该看看用于Python的HTMLParser。众所周知,正则表达式不适合解析HTML。请放下那把锤子;HTML不是正则表达式形状的钉子。用正则表达式解析HTML很快就会变得复杂,而且非常脆弱,HTML稍有变化就很容易失效。请使用合适的HTML解析器。BeautifulSoup将使您的任务变得轻而易举:
from bs4 import BeautifulSoup

# Name the parser explicitly: without it Beautiful Soup picks whichever
# parser happens to be installed (and warns), so results can differ
# between machines.
soup = BeautifulSoup(text, 'html.parser')

# One iteration per listing: each "result-box" div is searched on its own,
# so fields of neighbouring listings cannot be mixed up.
for block in soup.find_all('div', class_="result-box", itemtype="http://schema.org/LocalBusiness"):
    # find() returns None when the element is absent, so every field is
    # checked before its text is read.
    link = block.find('a', class_='url')
    if link is not None:
        print(link.string)
    street = block.find('span', itemprop="streetAddress")
    if street is not None:
        print(street.string)
    locality = block.find('span', itemprop="addressLocality")
    if locality is not None:
        print(locality.string)
    # .. etc. ..
请放下那把锤子;HTML不是正则表达式形状的钉子。用正则表达式解析HTML很快就会变得复杂,而且非常脆弱,HTML稍有变化就很容易失效。请使用合适的HTML解析器。BeautifulSoup将使您的任务变得轻而易举:
from bs4 import BeautifulSoup

# Name the parser explicitly: without it Beautiful Soup picks whichever
# parser happens to be installed (and warns), so results can differ
# between machines.
soup = BeautifulSoup(text, 'html.parser')

# One iteration per listing: each "result-box" div is searched on its own,
# so fields of neighbouring listings cannot be mixed up.
for block in soup.find_all('div', class_="result-box", itemtype="http://schema.org/LocalBusiness"):
    # find() returns None when the element is absent, so every field is
    # checked before its text is read.
    link = block.find('a', class_='url')
    if link is not None:
        print(link.string)
    street = block.find('span', itemprop="streetAddress")
    if street is not None:
        print(street.string)
    locality = block.find('span', itemprop="addressLocality")
    if locality is not None:
        print(locality.string)
    # .. etc. ..
为什么不使用HTML解析器来提取这些信息呢?正则表达式不是适合这里的工具。为什么不使用HTML解析器来提取这些信息呢?正则表达式不是适合这里的工具。非常感谢您的帮助。我现在改用Beautiful Soup了。它让任务变得容易得多,并且大大减少了我的代码量。非常感谢您的帮助。我现在改用Beautiful Soup了。它让任务变得容易得多,并且大大减少了我的代码量。