使用python urllib如何避免非HTML内容_Python_Content Type_Urllib_Limits

使用python urllib如何避免非HTML内容

python

使用python urllib如何避免非HTML内容,python,content-type,urllib,limits,Python,Content Type,Urllib,Limits,我正在使用urllib（注意不是urllib2）并从用户提供的URL获取页面标题。不幸的是，有时url不是HTML，而是远程站点上的某个大文件或某个运行时间很长的进程我已经检查了python文档，但是urllib是有限的，从源代码看，我似乎可以更改它，但我不能在服务器上这样做。这里提到了info（），但没有关于如何实现它的示例我使用的是FancyURLopener，我想它在urllib2中不可用，我不知道urllib2是否可以解决这个问题有没有办法定义套接字超时更重要的是，我如何将请求限

我正在使用urllib（注意不是urllib2）并从用户提供的URL获取页面标题。不幸的是，有时url不是HTML，而是远程站点上的某个大文件或某个运行时间很长的进程

我已经检查了python文档，但是urllib是有限的，从源代码看，我似乎可以更改它，但我不能在服务器上这样做。这里提到了info（），但没有关于如何实现它的示例

我使用的是FancyURLopener，我想它在urllib2中不可用，我不知道urllib2是否可以解决这个问题

有没有办法设置套接字超时？

更重要的是，我如何将请求限制为HTML/XHTML内容类型，而完全忽略任何其他内容，即我希望确保不下载整个内容

我仍在浏览urllib源代码并检查urllib2，但我不是这些工具的专家。

文档中声明，info（）方法返回与URL关联的元信息。您可以用它来获取响应头（headers），查看内容类型（Content-Type）是否为text/html，如果不是您想要的，则放弃该请求。

>>> import urllib
>>> d = urllib.urlopen('http://www.google.com/')
>>> try:
...     if d.info()['content-type'].startswith('text/html'):
...             print 'its html'
...     else:
...             print 'its not html'
... except KeyError:
...     print 'its not html'
... 
its html

我快速拼凑了一些代码，让您可以在urllib中指定HEAD请求：）

导入urllib
导入套接字
从urllib导入展开、toBytes、quote、splittype、splithost、splituser、unquote、AddInfo URL
类MyURLOpener（urllib.FancyURLopener）：
def open_http（self，url，data=None，method=None）：
“”“使用HTTP协议。”“”
导入httplib
用户\ u passwd=无
proxy_passwd=None
如果isinstance（url，str）：
主机，选择器=拆分主机（url）
如果主机：
user\u passwd，host=splituser（主机）
主机=unquote（主机）
realhost=host
其他：
主机，选择器=url
#检查代理是否包含授权信息
proxy_passwd，host=splituser（主机）
#现在我们继续我们想要获得的url
urltype，rest=splittype（选择器）
url=rest
用户\ u passwd=无
如果urltype.lower（）！='http'：
realhost=None
其他：
realhost，rest=splithost（rest）
如果是realhost：
用户\u passwd，realhost=splituser（realhost）
如果用户\u passwd：
选择器=“%s://%s%s”%（urltype、realhost、rest）
如果代理服务器绕过（realhost）：
主机=realhost
#打印“通过http:代理”，主机，选择器
如果不是主机：引发IOError，（'http error'，'未提供主机'）
如果代理\u passwd：
导入base64
proxy\u auth=base64.b64encode（proxy\u passwd.strip（））
其他：
proxy_auth=None
如果用户\u passwd：
导入base64
auth=base64.b64encode（user_passwd）.strip（）
其他：
auth=None
h=httplib.HTTP（主机）
如果方法不是无：
h、 putrequest（方法、选择器）
其他：
h、 putrequest（'GET'，选择器）
如果数据不是无：
#h、 putrequest（'POST'，选择器）
h、 putheader（'Content-Type'，'application/x-www-form-urlencoded'）
h、 putheader（'Content-Length'，'%d'%len（数据））
如果代理授权：h.putheader（'proxy-Authorization'，'Basic%s'%proxy\u auth）
if auth:h.putheader（'Authorization'，'Basic%s'%auth）
如果realhost:h.putheader（'Host'，realhost）
对于self.addHeader中的参数：h.putheader（*args）
h、 端头（数据）
errcode，errmsg，headers=h.getreply（）
fp=h.getfile（）
如果errcode==-1：
如果fp:fp.close（）
#HTTP状态行出现问题
引发IOError，（'http协议错误'，0，
“状态行不正确”，无）
#根据RFC 2616，“2xx”代码表示客户的
#请求已成功接收、理解并接受。
如果（事实上是200。我正在为您制作。这里是：）请注意，为了找到内容类型
响应头（Content-Type header），整个页面仍将被下载。为了解决这个问题，您可以发送HEAD请求，而不是GET请求，但是我还没有找到用urllib做到这一点的方法。
@JohnDoe：你的解决方案甚至比你声称的更好。您可以使用它来检查标题，而无需下载整个文件。例如，如果您将其指向http://python.org/ftp/python/3.2.2/Python-3.2.2.tar.bz2
，您可以查看d.info（）
（几乎立即）而无需下载所有11MBs.Nice。在这种情况下，这当然是有用的。不过，我还是拼凑出了一种使用urllib方法的方法。另一种（不太黑的）方式，可以找到或。尽管出于所有目的和意图，unutbu的方法从它的声音来看应该可以很好地工作。我可以证实unutbu所说的。信息必须隐式使用HEAD。我也试过了。约翰·多伊，非常感谢你。
import urllib
import socket
from urllib import unwrap, toBytes, quote, splittype, splithost, splituser, unquote, addinfourl

class MyURLOpener(urllib.FancyURLopener):
    """FancyURLopener whose ``open`` accepts an explicit HTTP request method.

    Passing ``method='HEAD'`` to :meth:`open` lets a caller inspect the
    response headers (for example ``Content-Type``) without asking the
    server for the body.

    Only ``open_http`` is overridden here; the other ``open_<scheme>``
    handlers are inherited unchanged and do not take a ``method`` argument.

    NOTE(review): redirects and other HTTP errors are delegated to the
    base class via ``self.http_error``; any follow-up request it makes
    presumably re-enters ``open`` without a method and is therefore sent
    as a plain GET -- confirm against the urllib source in use.
    """

    def open_http(self, url, data=None, method=None):
        """Use HTTP protocol.

        url    -- either a string (what is left of the URL after the
                  scheme has been split off) or, when a proxy is in use,
                  the ``(proxyhost, fullurl)`` tuple built by ``open``.
        data   -- optional request body.  Supplying data adds the
                  form-encoding headers but does NOT switch the request
                  to POST; pass ``method='POST'`` explicitly.
        method -- HTTP method to send; ``None`` means 'GET'.

        Returns an ``addinfourl`` object for 2xx responses, otherwise
        whatever ``self.http_error`` returns.  Raises IOError when no
        host can be determined or the status line cannot be parsed.
        """
        import httplib
        user_passwd = None
        proxy_passwd = None
        if isinstance(url, str):
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            host, selector = url
            # check whether the proxy contains authorization information
            proxy_passwd, host = splituser(host)
            # now we proceed with the url we want to obtain
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                # proxy_bypass is not among the names imported from urllib
                # at the top of the file, so reach it through the module;
                # the bare name raised NameError on this path.
                if urllib.proxy_bypass(realhost):
                    host = realhost

        if not host: raise IOError, ('http error', 'no host given')

        if proxy_passwd:
            import base64
            proxy_auth = base64.b64encode(proxy_passwd).strip()
        else:
            proxy_auth = None

        if user_passwd:
            import base64
            auth = base64.b64encode(user_passwd).strip()
        else:
            auth = None
        h = httplib.HTTP(host)

        # The caller-supplied method wins; default to GET.
        if method is not None:
            h.putrequest(method, selector)
        else:
            h.putrequest('GET', selector)

        if data is not None:
            # The request line has already been sent above, so the presence
            # of data only contributes the body-describing headers.
            h.putheader('Content-Type', 'application/x-www-form-urlencoded')
            h.putheader('Content-Length', '%d' % len(data))

        if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
        if auth: h.putheader('Authorization', 'Basic %s' % auth)
        if realhost: h.putheader('Host', realhost)
        for args in self.addheaders: h.putheader(*args)
        h.endheaders(data)
        errcode, errmsg, headers = h.getreply()
        fp = h.getfile()
        if errcode == -1:
            if fp: fp.close()
            # something went wrong with the HTTP status line
            raise IOError, ('http protocol error', 0,
                            'got a bad status line', None)
        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if (200 <= errcode < 300):
            return addinfourl(fp, headers, "http:" + url, errcode)
        else:
            if data is None:
                return self.http_error(url, fp, errcode, errmsg, headers)
            else:
                return self.http_error(url, fp, errcode, errmsg, headers, data)

    def open(self, fullurl, data=None, method=None):
        """Use URLopener().open(file) instead of open(file, 'r').

        fullurl -- the URL to open.
        data    -- optional request body, forwarded to the scheme handler.
        method  -- optional HTTP method (e.g. 'HEAD').  It is forwarded to
                   the ``open_<scheme>`` handler only when given, so the
                   inherited handlers that do not accept it keep working
                   for ordinary requests.  Requesting a method for a
                   scheme whose handler lacks a ``method`` parameter
                   raises TypeError.

        Returns whatever the selected handler returns.  A ``socket.error``
        raised by the handler is re-raised as IOError with the original
        traceback.
        """
        # Local import, mirroring the function-scope imports used elsewhere
        # in this class; sys is needed for the traceback re-raise below.
        import sys
        fullurl = unwrap(toBytes(fullurl))
        # percent encode url, fixing lame server errors for e.g, like space
        # within url paths.
        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
            host, selector = splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)

        # Pass only the arguments that were actually supplied.  Always
        # calling handler(url, data, method) broke every inherited handler
        # (open_https, open_file, open_ftp, ...) with a TypeError, even for
        # plain requests that never asked for a custom method.
        handler_args = [url]
        if data is not None or method is not None:
            handler_args.append(data)
        if method is not None:
            handler_args.append(method)
        try:
            return getattr(self, name)(*handler_args)
        except socket.error, msg:
            raise IOError, ('socket error', msg), sys.exc_info()[2]


# Example usage: build the opener defined above and issue a request with an
# explicitly chosen HTTP method.
opener = MyURLOpener()

# NOTE: including any data no longer implicitly makes the method POST,
#       so you must now specify the method to POST if you include data
# NOTE: this overrides only open_http, and not open_https, but you can
#       use a similar technique, and override open_https as well

# Send 'HEAD' instead of the default 'GET'; for a 2xx response `d` is the
# addinfourl object returned by open_http, so its headers can be inspected.
d = opener.open('http://www.google.com/', method='HEAD')