Python: repeating the process of following links on a website (BeautifulSoup)

I'm writing Python code that uses BeautifulSoup to get all the 'a' tags in a URL, take the link at position 3, then follow that link, and repeat this process about 18 times. I've included my code below, which repeats the process twice. I can't come up with a way to repeat the same process 18 times in one loop. Any help would be appreciated.

import urllib

from BeautifulSoup import *

# First page: collect every href, then take the link at position 3
htm1 = urllib.urlopen('https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Fikret.html').read()
soup = BeautifulSoup(htm1)
tags = soup('a')
list1 = list()
for tag in tags:
    x = tag.get('href', None)
    list1.append(x)

M = list1[2]

# Second page: repeat the same steps on the link we just followed
htm2 = urllib.urlopen(M).read()
soup = BeautifulSoup(htm2)
tags1 = soup('a')
list2 = list()
for tag1 in tags1:
    x2 = tag1.get('href', None)
    list2.append(x2)

y = list2[2]
print y
OK, I just wrote the code below and it is working, but I get the same 4 links in the results. Something seems to be wrong with the loop (note: I tried looping 4 times).

import re
import urllib
from BeautifulSoup import *

list1 = list()
url = 'https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Fikret.html'
for i in range(4):  # repeat 4 times
    htm2 = urllib.urlopen(url).read()
    soup1 = BeautifulSoup(htm2)
    tags1 = soup1('a')
    for tag1 in tags1:
        x2 = tag1.get('href', None)
        list1.append(x2)
    y = list1[2]
    if len(x2) < 3:  # no 3rd link
        break  # exit the loop
    else:
        url = y
    print y
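The repeated links most likely come from the fact that list1 is never cleared between iterations, so list1[2] always returns the third link of the first page; the len(x2) < 3 test also checks the length of the last href string rather than the number of links found. A minimal sketch of a fix, keeping the same Python 2 / BeautifulSoup 3 style, would rebuild the list on every pass:

import urllib
from BeautifulSoup import *

url = 'https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Fikret.html'
for i in range(4):  # repeat 4 times
    html = urllib.urlopen(url).read()
    soup = BeautifulSoup(html)
    links = [tag.get('href', None) for tag in soup('a')]  # fresh list each pass
    if len(links) < 3:  # no 3rd link on this page
        break
    url = links[2]  # follow the 3rd link
    print url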
You should use recursion.

If you want to set a limit on how many levels deep you go, you can pass the level you are at as a parameter, e.g.:

import urllib
from BeautifulSoup import *

def GetLinks(initialPage, level):
    level += 1
    htm1 = urllib.urlopen(initialPage).read()
    soup = BeautifulSoup(htm1)
    tags = soup('a')
    list1 = list()
    for tag in tags:
        x = tag.get('href', None)
        list1.append(x)
        if level < 18:
            list1.append(GetLinks(x, level))  # recurse into each link
    return list1
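Note that as written this recurses into every link on the page. If the intent is to follow only the link at position 3 on each page, a depth-limited sketch might look like the following (the FollowThirdLink name is an assumption for illustration, and the same Python 2 / BeautifulSoup 3 imports are assumed):

def FollowThirdLink(url, level=0):
    print url
    if level >= 18:  # stop after 18 hops
        return
    html = urllib.urlopen(url).read()
    soup = BeautifulSoup(html)
    links = [tag.get('href', None) for tag in soup('a')]
    if len(links) < 3:  # no 3rd link to follow
        return
    FollowThirdLink(links[2], level + 1)  # recurse only into the 3rd link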
"I can't come up with a way to repeat the same process 18 times in one loop"

To repeat something 18 times in Python, you can use a for _ in range(18) loop:

#!/usr/bin/env python2
from urllib2 import urlopen
from urlparse import urljoin
from bs4 import BeautifulSoup # $ pip install beautifulsoup4

url = 'http://example.com'
for _ in range(18):  # repeat 18 times
    soup = BeautifulSoup(urlopen(url))
    a = soup.find_all('a', href=True)  # all <a href> links
    if len(a) < 3:  # no 3rd link
        break  # exit the loop
    url = urljoin(url, a[2]['href'])  # 3rd link, note: ignore <base href>
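The asker mentions in the comments that they are on Python 2.7 with the older BeautifulSoup 3, where bs4's find_all is named findAll; a rough equivalent under those assumptions would be:

#!/usr/bin/env python2
import urllib
from urlparse import urljoin
from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3

url = 'http://example.com'
for _ in range(18):  # repeat 18 times
    soup = BeautifulSoup(urllib.urlopen(url).read())
    a = soup.findAll('a', href=True)  # all <a href> links
    if len(a) < 3:  # no 3rd link
        break  # exit the loop
    url = urljoin(url, a[2]['href'])  # follow the 3rd link
print url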

This gives you exactly the output the assignment asks for. I put the work inside a for loop so that it repeats, and used a count with a break so it stops at the desired link.

from bs4 import BeautifulSoup
import urllib.request, urllib.parse, urllib.error
import ssl

ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = "your first link here"
for i in range(18):  # to repeat 18 times
    html = urllib.request.urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    count = 0
    for tag in tags:
        count = count + 1
        if count > 3:  # make it stop at position 3
            break
        url = tag.get('href', None)
print(url)
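Note how the inner loop works: url is overwritten with each of the first three hrefs, and the break fires once count exceeds 3, so when the inner loop ends url holds the link at position 3, which the next pass of the outer loop then opens.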

I find that using a while loop makes the code cleaner and lets you change the inputs to the code.

    from urllib.request import urlopen
    from bs4 import BeautifulSoup
    import ssl

    # Ignore SSL certificate errors
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

    url = input('Enter url: ')

    count = input('Enter count: ')
    count = int(count)

    pos = input('Enter position:')
    pos = int(pos)

    while count > 0:
        # Re-opens the link
        html = urlopen(url, context=ctx).read()
        soup = BeautifulSoup(html, "html.parser")

        # Extract the 'href=' values
        lst = list()
        tags = soup("a")
        for tag in tags:
            href = tag.get("href", None)
            lst.append(href)

        # Take the url at the chosen position
        url = lst[pos]
        print('Retrieving:', url)

        # Makes sure the loop isn't infinite
        count = count - 1

If you don't know how to do for _ in range(18), then forget about the links for now and practice something simpler that uses the loop.

Casey, thanks a lot for the help. One question: which lines in the def you provided say that I need to re-run the process on the URL at position 3?

The function calls itself, hence the recursion. If you look at the recursive call near the end of the function, you'll see the GetLinks function calls GetLinks; this repeats until the page runs out of links, or until the stack overflows / memory runs out. If you use the second function I posted, it calls itself 18 times (or whatever number you specify) before exiting. Assume you initially call it with GetLinks(link, 0).

Recursion is probably overkill here.

Hi J.F. Sebastian, your answer looks great and is exactly what I want. However, I'm using Python 2.7 and bs3, so the code you wrote didn't run on my machine. I wrote similar code with a for loop (added above) and it runs fine, but I get the same 4 links in the results. Clearly something is wrong with my loop.

Hi, welcome to SO, and thanks for the answer. For the future, please don't post code-only answers; always add some explanation for your solution. It makes it much easier for people to follow your thinking. Could you add a short description of the code?

After BeautifulSoup(data) is done, put all the anchor tags into a list. Loop over that list extracting the href values and you get a list (y1) with all the href values of a single page; then pick the value at the desired position, append it to another list y2, and clear y1 with y1 = [] so that on the next cycle the href value at the same desired position in y1 is different. You are basically collecting the links at the desired position from every page into list y2, and at the end you change url to the next link to open. (This describes the y1/y2 code shown further below.)
import urllib
from BeautifulSoup import *

URL = raw_input("Enter the URL:")  # could validate input here
link_line = int(raw_input("Enter the line of the desired link:")) - 1  # could validate input here
count = int(raw_input("Enter the loop repeat times:"))  # could validate input here

while count >= 0:
    html = urllib.urlopen(URL).read()
    soup = BeautifulSoup(html)
    tags = soup('a')
    print URL
    URL = tags[link_line].get("href", None)
    count = count - 1
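One small caveat with this version: while count >= 0 runs count + 1 times, so it follows one more link than the number entered; use while count > 0 for exactly count hops.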
import urllib
from BeautifulSoup import *

url = raw_input('http://example')
for i in range(18):
    html = urllib.urlopen(url).read()
    soup = BeautifulSoup(html)

    tags = soup('a')
    s = []

    for tag in tags:
        x = tag.get('href', None)
        s.append(x)
    print s[3]
    url = s[3]
print "answer:", s[3]
import urllib
from BeautifulSoup import *

url = 'http://python-data.dr-chuck.net/known_by_Eesa.html'
counts = raw_input('Enter number of pages to jump: ')
counts = int(counts)
pos = raw_input('Enter position: ')
pos = int(pos)
y1 = list()
y2 = list()
count = 0
while True:
    data = urllib.urlopen(url).read()
    soup = BeautifulSoup(data)
    tags = soup('a')
    for tag in tags:
        value = tag.get('href', None)
        value = str(value)
        y1.append(value)
    t = y1[pos - 1]
    y2.append(t)
    y1 = []  # clear the list so the next cycle sees fresh hrefs
    count = count + 1
    if count == counts:
        break
    else:
        url = t
        continue
print y2
from bs4 import BeautifulSoup
import urllib.request
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter - ')

def functiontofollowlink(url):
    html = urllib.request.urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    count = 0
    # Retrieve all of the anchor tags
    tags = soup('a')
    for tag in tags:
        # print(tag.get('href', None))
        count = count + 1
        if count == 18:
            url = tag.get('href', None)
    return url

numberoftimes = int(input('Enter number of times to repeat:'))
# First give the starting link; each pass then decreases numberoftimes by 1.
# Since we already know which position to check, count == 18 is hard-coded
# in the function; otherwise you could take the position as input:
# positionoflink = input('Enter position of the link:')

while numberoftimes >= 0:
    numberoftimes = numberoftimes - 1
    url = functiontofollowlink(url)

print(url)
from bs4 import BeautifulSoup
import urllib.request, urllib.parse, urllib.error
import ssl

ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
url = input('Enter :')
for i in range(7): #to repeat 7 times
    html = urllib.request.urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    count = 0
    for tag in tags:
        count = count +1
        if count>18: #to stop after printing 18th name
            break
        url = tag.get('href', None)
        if count == 18: #to print 18th name
            print(url)