Python 3.x Python 3.x:'；ascii'；编解码器可以'；t编码字符'\xfc&x27；位置18：序号不在范围内（128）_Python 3.x_Utf 8_Beautifulsoup

Python 3.x Python 3.x:'；ascii'；编解码器可以'；t编码字符'\xfc&x27；位置18：序号不在范围内（128）

python-3.x utf-8

Python 3.x Python 3.x:'；ascii'；编解码器可以'；t编码字符'\xfc&x27；位置18：序号不在范围内（128）,python-3.x,utf-8,beautifulsoup,Python 3.x,Utf 8,Beautifulsoup,我一直在检查现有的问题。它们都不适合我我写了一些代码从一个网站的多个页面中获取信息当我运行代码时，它返回以下错误： “ascii”编解码器无法对位置18中的字符“\xfc”进行编码：序号不在范围内（128）当我在有限数量的链接上测试代码时，它可以工作。问题可能在于此链接： 'https://www.crowdcube.com/investment/brüpond-brewery-10622' 因为有一个在这种情况下，我可以删除该链接，这样就可以了。然而，我想知道如何处理这个问题的一

我一直在检查现有的问题。它们都不适合我

我写了一些代码从一个网站的多个页面中获取信息

当我运行代码时，它返回以下错误： “ascii”编解码器无法对位置18中的字符“\xfc”进行编码：序号不在范围内（128）

当我在有限数量的链接上测试代码时，它可以工作。
问题可能在于此链接：

'https://www.crowdcube.com/investment/brüpond-brewery-10622'

因为有一个

在这种情况下，我可以删除该链接，这样就可以了。然而，我想知道如何处理这个问题的一般

这是代码

from bs4 import BeautifulSoup
import urllib 
from time import sleep 
import re



def make_soup(url):
    html = urllib.request.urlopen(url)
    return BeautifulSoup(html, "lxml")

def get_links(section_url):
    get_link = make_soup(section_url)
    links_page = [a.attrs.get('href') for a in get_link.select('a[href]')]
    links_page = list(set(links_page))


    links = [l for l in links_page if 'https://www.crowdcube.com/investment/' in l] 

    return links

def get_data(url):
miss='.'
tree= make_soup(url)
try:
    #title
    title = tree.find_all('h2')[0].get_text()

    #description
    description=tree.find_all('div',{'class':'fullwidth'})
    description= description[1].find('p').get_text()
    description=re.sub(r'[^\w.]', ' ', description)   

   #location
    location=tree.find_all('div',{'class':'pitch-profile'})
    location=location[0].find('li').get_text()
    l=0
    loc=list(location)
    while l < len(loc):
       if loc[l]==',':
           loc[l]='-'
       l+=1   
    del(loc[0:10])
    location="".join(loc)
   #raised capital
    raised=tree.find_all('div',{'class':'cc-pitch__raised'})
    raised= raised[0].find('b').get_text()

    rais=list(raised)

    r=0
    while r < len(rais):
        if rais[r]==',':
           rais[r]='.'
        r+=1   

    currency=rais[0]

    del(rais[0])

    raised="".join(rais)

    #target
    target=tree.find_all('div',{'class':'cc-pitch__stats clearfix'})
    target= target[0].find('dd').get_text()
    targ=list(target)

    t=0
    while t < len(targ):
        if targ[t]==',':
           targ[t]='.'
        t+=1   


    del(targ[0])

    target="".join(targ)

    #category

    category=tree.find_all('li',{'class':'sectors'})
    category=category[0].find('span').get_text()

    category=category.strip()
    category=category.replace(" ", "")
    cat=list(category)    
    c=0
    while c < len(cat):
        if cat[c]==',':
           cat[c]='-'
        c+=1   
    category="".join(cat)

    backers=tree.find_all('div',{'class':'cc-pitch__stats clearfix'})

    for tag in backers:
        ddTags = tag.find_all("dd")


    backers= ddTags[3].get_text()


    backers

    return {"url": url.encode("utf-8"),
            "title": title.encode("utf-8"),
            "backers":backers.encode("utf-8"),
            "description":description.encode("utf-8"),
            "location":location.encode("utf-8"),
            "raised": raised.encode("utf-8"),
            "currency":currency.encode("utf-8"),
            "target": target.encode("utf-8"),
            "category": category.encode("utf-8")}
except(IndexError,RuntimeError, TypeError, NameError, UnicodeEncodeError):
    return {"url": url,
            "title": miss,
            "backers":miss,
            "description":miss,
            "location":miss,
            "raised": miss,
            "currency":miss,
            "target": miss,
            "category": miss}


if __name__ == '__main__':
    start_url = ("https://www.crowdcube.com/investments?sort_by=0&q=&hof=1&i1=0&i2=0&i3=0&i4=0&sort_by=7")

links = get_links(start_url)

data = [] # a list to store our dictionaries
for link in links:
    crowdcube = get_data(link)
    data.append(crowdcube)
    sleep(1)

从bs4导入美化组
导入URL库
从时间上导入睡眠
进口稀土
def制作汤（url）：
html=urllib.request.urlopen（url）
返回美化组（html，“lxml”）
def get_链接（部分url）：
获取链接=制作汤（部分url）
links_page=[a.attrs.get（'href'）用于in-get_链接。select（'a[href]'）]
链接页面=列表（设置（链接页面））
links=[l代表链接页面中的l，如果'https://www.crowdcube.com/investment/'在l中]
返回链接
def get_数据（url）：
小姐='。'
树=制作汤（url）
尝试：
#头衔
title=tree.find_all（'h2'）[0]。get_text（）
#描述
description=tree.find_all（'div'，{'class'：'fullwidth'}）
description=description[1]。查找（'p'）。获取文本（）
description=re.sub（r'[^\w.]'，''，description）
#位置
location=tree.find_all（'div'，{'class'：'pitch-profile'}）
位置=位置[0]。查找（'li'）。获取文本（）
l=0
loc=列表（位置）
当l


有什么建议吗？
提前感谢
urllib无法处理像Url中的'u'这样的UMLAUT：
'https://www.crowdcube.com/investment/brüpond-brewery-10622'

使用。requests lib与umlauts没有问题
例如，将“制作汤”功能更改为：

您有两个问题，但没有包含相关代码-可能在make\u soup功能中。请将问题限制为一个问题，并包含完整且可验证的代码：@AlastairMcCormack我已修复我的问题。希望现在是OK哪一行抛出错误？@AlastairMcCormack行973，在putrequest self中。\u输出（request.encode（'ascii'））该行不在代码中！请包括整个堆栈跟踪。另外，请正确缩进代码。
import requests

def make_soup(url):
    html = requests.get(url).text
    return BeautifulSoup(html, "lxml")