如何使用Python获取域中的所有链接？_Python_Web Scraping_Beautifulsoup_Lxml

如何使用Python获取域中的所有链接？

python web-scraping

如何使用Python获取域中的所有链接？,python,web-scraping,beautifulsoup,lxml,Python,Web Scraping,Beautifulsoup,Lxml,我想使用Python获取给定“根”URL（在列表中）的域中的所有链接。假设给定一个URL，它应该返回与根URL相同域的页面上的所有链接，然后在每个链接上递归访问它们并提取相同域的所有链接，依此类推。我所说的同一个域名的意思是，如果给定的唯一链接，我想回来是。。。任何外部的东西，例如，应该被丢弃。如何使用Python实现这一点编辑：我尝试使用lxml。我认为这并不完全有效，我也不确定如何考虑到已经处理过的页面的链接（导致无限循环） ~根据你问题的标签，我猜你用的是漂亮的汤。首先，您显然需要下载

我想使用Python获取给定“根”URL（在列表中）的域中的所有链接。假设给定一个URL，它应该返回与根URL相同域的页面上的所有链接，然后在每个链接上递归访问它们并提取相同域的所有链接，依此类推。我所说的同一个域名的意思是，如果给定的唯一链接，我想回来是。。。任何外部的东西，例如，应该被丢弃。如何使用Python实现这一点

编辑：我尝试使用lxml。我认为这并不完全有效，我也不确定如何考虑到已经处理过的页面的链接（导致无限循环）

根据你问题的标签，我猜你用的是BeautifulSoup。首先，您显然需要下载该网页，例如使用urllib.request。拿到页面内容的字符串后，把它传给BeautifulSoup。在此之后，假设soup是您的BeautifulSoup对象，您可以用soup.find_all('a')找到所有链接。之后，您只需检查每个链接的href：

最简单的版本是只检查href中是否包含“http://www.example.com”，但这不会捕获相对链接。我猜某些复杂的正则表达式可以做到（查找所有以“www.example.com”、“/”或“?”（php）开头的内容），或者您可以查找所有包含www但不包含www.example.com的内容并将其丢弃，等等。正确的策略可能取决于您正在抓取的网站以及它的编码风格。

您可以使用正则表达式过滤掉此类链接

例如：


上面的代码片段包含从德国汉莎航空公司（Lufthansa）网站抓取URL所需的模块。这里唯一的附加功能是您可以指定要递归抓取的深度
 以下是我所做的工作，只提供了完整的URL。快，但有点脏
import requests
import re

domain = "stackoverflow.com"
# Matches absolute http:// URLs on `domain`. Raw strings keep the regex
# escapes intact, and re.escape() makes the dots in the domain match a literal
# "." instead of "any character".
http_re = re.compile(r"(http://" + re.escape(domain) + r"[/\w .-]*/?)")

# URLs that have already been seen by visit(); prevents endless loops.
visited = set()
def visit(url):
    """
    Fetch *url* and keep following every link that matches ``http_re``.

    Each URL is recorded in the module-level ``visited`` set as soon as it
    is discovered, so no page is requested twice. An explicit stack is used
    instead of recursion, so a long chain of linked pages cannot exceed the
    interpreter's recursion limit.

    param: url -- absolute URL to start crawling from
    """
    visited.add(url)
    pending = [url]
    while pending:
        current = pending.pop()
        extracted_body = requests.get(current).text
        for match in http_re.findall(extracted_body):
            if match not in visited:
                # Mark on discovery so the same link is never queued twice.
                visited.add(match)
                pending.append(match)

# Start crawling at the domain's front page, then show every URL reached.
start_url = u"http://" + domain
visit(start_url)
print(visited)

@namita的代码中有一些bug。我修改了它，现在它工作得很好
import sys
import requests
import hashlib
from bs4 import BeautifulSoup
from datetime import datetime


def get_soup(link):
    """
    Download *link* and return its body parsed as a BeautifulSoup object.

    The request is sent with auth=('user', 'pass') and the response content
    is parsed with the "lxml" parser.
    param: link
    """
    response = requests.get(link, auth=('user', 'pass'))
    return BeautifulSoup(response.content, "lxml")

def get_status_code(link):
    """
    Return the HTTP status code for *link*, or -1 if the request fails.

    param: link -- URL that is requested with GET
    return: the response's status code, or -1 when requests raises any
            RequestException (connection error, timeout, invalid URL, ...)
    """
    try:
        return requests.get(link).status_code
    except requests.exceptions.RequestException:
        # ConnectionError is a subclass of RequestException; catching the
        # base class keeps one bad link from aborting the caller's loop.
        return -1

def find_internal_urls(main_url, depth=0, max_depth=2):
    """
    Collect status information for the links found on the page *main_url*.

    Absolute http(s) links are taken as they are; other hrefs that contain
    a "/" are resolved against *main_url*. Hrefs without a "/" and links
    that do not resolve to http(s) are skipped. Note that absolute links
    pointing to other hosts are NOT filtered out.

    param: main_url -- URL of the page whose <a href> tags are inspected
    param: depth -- depth of *main_url* itself; entries get depth + 1
    param: max_depth -- when depth is greater than this, nothing is fetched
    return: list of dicts with the keys "url", "status_code", "timestamp"
            and "depth"; an empty list when depth > max_depth
    """
    from urllib.parse import urljoin

    # Check the limit first so no request is made for a page whose links
    # would be thrown away anyway.
    if depth > max_depth:
        return []

    schemes = ("http://", "https://")
    all_urls_info = []
    soup = get_soup(main_url)
    for a_tag in soup.findAll("a", href=True):
        href = a_tag["href"]
        is_absolute = href.startswith(schemes)
        if not is_absolute and "/" not in href:
            continue
        # urljoin resolves root-relative and path-relative hrefs against
        # the page URL instead of gluing strings together.
        url = href if is_absolute else urljoin(main_url, href)
        if not url.startswith(schemes):
            # e.g. javascript: or tel: links that happen to contain a "/"
            continue

        all_urls_info.append({
            "url": url,
            "status_code": get_status_code(url),
            "timestamp": datetime.now(),
            "depth": depth + 1,
        })
    return all_urls_info


if __name__ == "__main__":
    # Placeholder start page -- replace it with your own domain.
    url = "http://example.com/"
    depth = 1
    all_page_urls = find_internal_urls(url, 0, 2)
    if depth > 1:
        # Crawl one level deeper. Iterate over a snapshot because the new
        # results are appended to the same list, and pass on each entry's
        # recorded depth so that max_depth is still honoured.
        for status_dict in list(all_page_urls):
            all_page_urls.extend(
                find_internal_urls(status_dict['url'], status_dict['depth'], 2)
            )


从问题的标签来看，你似乎已经知道该用什么了。也许你可以展示一下你的尝试，否则我认为这个问题太笼统了。有一些网络抓取框架，比如说，可能会对你有所帮助。这个问题很难解决，因为页面中的一些链接没有协议前缀，而是给出本地路径。“.”也是有效的URL。你想跟随哪些链接、不跟随哪些链接？我只想跟随以根URL为前缀的任何内容。但是，某些相对链接的前缀不是根URL，但如果我在它们前面加上根URL，它们将是有效的。我也想要这些。这是有道理的，但这是如何递归的呢？它似乎只找到第一“层”的链接。你添加深度，它就会搜索到那么多的深度。但是find_internal_urls实际上是在哪里调用自身、从而在链接上递归的？我提供了模块，你可以根据需要在任何地方使用该功能！我认为这行不通。您正在find_internal_urls中修改all_page_urls。因此，在main方法中，您正在更改循环中迭代的内容，而Python认为这是不允许的做法。
import sys
import requests
import hashlib
from bs4 import BeautifulSoup
from datetime import datetime

def get_soup(link):
    """
    Return the BeautifulSoup object for input link

    The request is sent with auth=('user', 'pass'). The parser is named
    explicitly so the result does not depend on which optional parser
    libraries happen to be installed; "html.parser" needs no extra package.
    param: link
    """
    request_object = requests.get(link, auth=('user', 'pass'))
    soup = BeautifulSoup(request_object.content, "html.parser")
    return soup

def get_status_code(link):
    """
    Return the HTTP status code for *link*, or -1 if the request fails.

    param: link -- URL that is requested with GET
    return: the response's status code, or -1 when requests raises any
            RequestException (connection error, timeout, invalid URL, ...)
    """
    try:
        return requests.get(link).status_code
    except requests.exceptions.RequestException:
        # ConnectionError is a subclass of RequestException; catching the
        # base class keeps one bad link from aborting the caller's loop.
        return -1

def find_internal_urls(lufthansa_url, depth=0, max_depth=2):
    """
    Collect status information for the links found on *lufthansa_url*.

    Absolute http(s) links are taken as they are; other hrefs that contain
    a "/" are resolved against *lufthansa_url*. Hrefs without a "/" and
    links that do not resolve to http(s) are skipped. Note that absolute
    links pointing to other hosts are NOT filtered out.

    param: lufthansa_url -- URL of the page whose <a href> tags are read
    param: depth -- depth of the page itself; entries get depth + 1
    param: max_depth -- when depth is greater than this, nothing is fetched
    return: list of dicts with the keys "url", "status_code", "timestamp"
            and "depth"; an empty list when depth > max_depth
    """
    from urllib.parse import urljoin

    # Check the limit first so no request is made for a page whose links
    # would be thrown away anyway.
    if depth > max_depth:
        return []

    schemes = ("http://", "https://")
    all_urls_info = []
    soup = get_soup(lufthansa_url)
    for a_tag in soup.findAll("a", href=True):
        href = a_tag["href"]
        is_absolute = href.startswith(schemes)
        if not is_absolute and "/" not in href:
            continue
        # Resolve against the page URL rather than a hard-coded host.
        url = href if is_absolute else urljoin(lufthansa_url, href)
        if not url.startswith(schemes):
            # e.g. javascript: or tel: links that happen to contain a "/"
            continue

        # A new dict per link: reusing one dict would make every list
        # entry point at the same object holding only the last link.
        all_urls_info.append({
            "url": url,
            "status_code": get_status_code(url),
            "timestamp": datetime.now(),
            "depth": depth + 1,
        })
    return all_urls_info
if __name__ == "__main__":
    depth = 2 # suppose
    # Inspect the start page first.
    all_page_urls = find_internal_urls("someurl", 2, 2)
    # With more than one level requested, every link found above is
    # inspected as well; the results of those calls are not stored.
    second_level = all_page_urls if depth > 1 else ()
    for page_info in second_level:
        find_internal_urls(page_info['url'])

import requests
import re

domain = "stackoverflow.com"
# Matches absolute http:// URLs on `domain`. Raw strings keep the regex
# escapes intact, and re.escape() makes the dots in the domain match a literal
# "." instead of "any character".
http_re = re.compile(r"(http://" + re.escape(domain) + r"[/\w .-]*/?)")

# URLs that have already been seen by visit(); prevents endless loops.
visited = set()
def visit(url):
    """
    Fetch *url* and keep following every link that matches ``http_re``.

    Each URL is recorded in the module-level ``visited`` set as soon as it
    is discovered, so no page is requested twice. An explicit stack is used
    instead of recursion, so a long chain of linked pages cannot exceed the
    interpreter's recursion limit.

    param: url -- absolute URL to start crawling from
    """
    visited.add(url)
    pending = [url]
    while pending:
        current = pending.pop()
        extracted_body = requests.get(current).text
        for match in http_re.findall(extracted_body):
            if match not in visited:
                # Mark on discovery so the same link is never queued twice.
                visited.add(match)
                pending.append(match)

# Start crawling at the domain's front page, then show every URL reached.
start_url = u"http://" + domain
visit(start_url)
print(visited)

import sys
import requests
import hashlib
from bs4 import BeautifulSoup
from datetime import datetime


def get_soup(link):
    """
    Download *link* and return its body parsed as a BeautifulSoup object.

    The request is sent with auth=('user', 'pass') and the response content
    is parsed with the "lxml" parser.
    param: link
    """
    response = requests.get(link, auth=('user', 'pass'))
    return BeautifulSoup(response.content, "lxml")

def get_status_code(link):
    """
    Return the HTTP status code for *link*, or -1 if the request fails.

    param: link -- URL that is requested with GET
    return: the response's status code, or -1 when requests raises any
            RequestException (connection error, timeout, invalid URL, ...)
    """
    try:
        return requests.get(link).status_code
    except requests.exceptions.RequestException:
        # ConnectionError is a subclass of RequestException; catching the
        # base class keeps one bad link from aborting the caller's loop.
        return -1

def find_internal_urls(main_url, depth=0, max_depth=2):
    """
    Collect status information for the links found on the page *main_url*.

    Absolute http(s) links are taken as they are; other hrefs that contain
    a "/" are resolved against *main_url*. Hrefs without a "/" and links
    that do not resolve to http(s) are skipped. Note that absolute links
    pointing to other hosts are NOT filtered out.

    param: main_url -- URL of the page whose <a href> tags are inspected
    param: depth -- depth of *main_url* itself; entries get depth + 1
    param: max_depth -- when depth is greater than this, nothing is fetched
    return: list of dicts with the keys "url", "status_code", "timestamp"
            and "depth"; an empty list when depth > max_depth
    """
    from urllib.parse import urljoin

    # Check the limit first so no request is made for a page whose links
    # would be thrown away anyway.
    if depth > max_depth:
        return []

    schemes = ("http://", "https://")
    all_urls_info = []
    soup = get_soup(main_url)
    for a_tag in soup.findAll("a", href=True):
        href = a_tag["href"]
        is_absolute = href.startswith(schemes)
        if not is_absolute and "/" not in href:
            continue
        # urljoin resolves root-relative and path-relative hrefs against
        # the page URL instead of gluing strings together.
        url = href if is_absolute else urljoin(main_url, href)
        if not url.startswith(schemes):
            # e.g. javascript: or tel: links that happen to contain a "/"
            continue

        all_urls_info.append({
            "url": url,
            "status_code": get_status_code(url),
            "timestamp": datetime.now(),
            "depth": depth + 1,
        })
    return all_urls_info


if __name__ == "__main__":
    # Placeholder start page -- replace it with your own domain.
    url = "http://example.com/"
    depth = 1
    all_page_urls = find_internal_urls(url, 0, 2)
    if depth > 1:
        # Crawl one level deeper. Iterate over a snapshot because the new
        # results are appended to the same list, and pass on each entry's
        # recorded depth so that max_depth is still honoured.
        for status_dict in list(all_page_urls):
            all_page_urls.extend(
                find_internal_urls(status_dict['url'], status_dict['depth'], 2)
            )