Python 3.x 还有一个Python编码问题
标签:python-3.x, beautifulsoup, urllib
我尝试了以下代码:
def process_request(url):
    """Fetch *url* and return the raw response body.

    The request is sent with a browser-like ``User-Agent`` header.

    Args:
        url: Address to fetch; passed unchanged to ``Request``.

    Returns:
        Whatever ``read()`` yields for the opened response (the undecoded
        body).
    """
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # Close the response deterministically instead of leaving the handle
    # open until the garbage collector gets to it.
    with urlopen(req) as response:
        return response.read()
def get_links():
    """Collect the URLs of the pagination links found on the first page.

    Downloads ``c.first_url``, selects every ``.pagelist-bar a`` element and
    prefixes each ``href`` value with ``c.base_url``.

    Returns:
        A list of the resulting URLs, in the order ``select`` returned the
        anchors.
    """
    first_page = process_request(c.first_url)
    parsed = BeautifulSoup(first_page, 'html.parser')
    anchors = parsed.select(".pagelist-bar a")
    print(anchors)

    collected = []
    for anchor in anchors:
        print(anchor)
        if not anchor.has_attr('href'):
            # Anchors without a target cannot be followed.
            continue
        page_url = c.base_url + anchor['href']
        logging.info("Page with List of persons: %s", page_url)
        collected.append(page_url)
    return collected
def person_urls():
    """Fetch every listing page and print the ``.ficha`` elements it contains.

    Iterates over the URLs returned by ``get_links()``, downloads each one
    with ``process_request`` and parses it with ``html.parser``.

    Returns:
        None. The selected elements are only printed.
    """
    for page_url in get_links():
        # print() does not interpolate %-style placeholders, so the URL has
        # to be formatted into the message explicitly.
        print("link: %s" % page_url)
        listing = process_request(page_url)
        soup = BeautifulSoup(listing, 'html.parser')
        fichas = soup.select(".ficha")
        print(fichas)
在此url中:
无论我使用什么策略,这一行:
<a href="/es/colaboracion/buscados/index.html?buscar=si&category=abcd&notshown=">
中的 `&not` 总会被当作 HTML 实体解析,结果 `&notshown=` 变成了 `¬shown=`。
有人能帮我吗?

回答:也许你应该试着在调用 BeautifulSoup 时,把 `html.parser` 替换为 `html`:
soup = BeautifulSoup(html, 'html')
links = soup.select(".pagelist-bar a")
# Show the href of each pagination link, one per line.
for link in links:
    print(link.get('href'))
输出:
/es/colaboracion/buscados/index.html?pagina=1&buscar=si&category=&notshown=
/es/colaboracion/buscados/index.html?pagina=2&buscar=si&category=&notshown=
/es/colaboracion/buscados/index.html?pagina=3&buscar=si&category=&notshown=
/es/colaboracion/buscados/index.html?pagina=4&buscar=si&category=&notshown=
soup = BeautifulSoup(html, 'html')
links = soup.select(".pagelist-bar a")
# Show the href of each pagination link, one per line.
for link in links:
    print(link.get('href'))
/es/colaboracion/buscados/index.html?pagina=1&buscar=si&category=&notshown=
/es/colaboracion/buscados/index.html?pagina=2&buscar=si&category=&notshown=
/es/colaboracion/buscados/index.html?pagina=3&buscar=si&category=&notshown=
/es/colaboracion/buscados/index.html?pagina=4&buscar=si&category=&notshown=