使用python3爬网wikipedia子类别时出错_Python_Beautifulsoup_Wikipedia_Nameerror_File Not Found

使用python3爬网wikipedia子类别时出错

python

使用python3爬网wikipedia子类别时出错,python,beautifulsoup,wikipedia,nameerror,file-not-found,Python,Beautifulsoup,Wikipedia,Nameerror,File Not Found,社区成员你好我收到错误name错误：未定义名称“f”。代码如下。请帮忙。感谢您的任何帮助。3天以来我一直被这个问题困扰着。代码将提取Python 3中wikipedia类别的所有子类别名称我尝试了相对路径和绝对路径代码如下： import httplib2 from bs4 import BeautifulSoup import subprocess import time, wget import os, os.path #declarations catRoot = "http://

社区成员你好

我收到错误

name错误：未定义名称“f”

。代码如下。请帮忙。感谢您的任何帮助。3天以来我一直被这个问题困扰着。代码将提取Python 3中wikipedia类别的所有子类别名称

我尝试了相对路径和绝对路径

代码如下：

import httplib2
from bs4 import BeautifulSoup
import subprocess
import time, wget
import os, os.path

#declarations
catRoot = "http://en.wikipedia.org/wiki/Category:"
MAX_DEPTH = 100
done = []
ignore = []
path = 'trivial'
#Removes all newline characters and replaces with spaces
def removeNewLines(in_text):
return in_text.replace('\n', ' ')

# Downloads a link into the destination
def download(link, dest):
# print link
if not os.path.exists(dest) or os.path.getsize(dest) == 0:
    subprocess.getoutput('wget "' + link + '" -O "' + dest+ '"')    
    print ("Downloading")
def ensureDir(f):
    if not os.path.exists(f):
        os.mkdir(f)

# Cleans a text by removing tags
def clean(in_text):
    s_list = list(in_text)
    i,j = 0,0
    while i < len(s_list):
#iterate until a left-angle bracket is found
        if s_list[i] == '<':
            if s_list[i+1] == 'b' and s_list[i+2] == 'r' and s_list[i+3] == '>':
                i=i+1
                print ("hello")
                continue               
            while s_list[i] != '>':
#pop everything from the the left-angle bracket until the right-angle bracket
                s_list.pop(i)
#pops the right-angle bracket, too
            s_list.pop(i)

        elif s_list[i] == '\n':
            s_list.pop(i)
        else:
            i=i+1        
#convert the list back into text
join_char=''
return (join_char.join(s_list))#.replace("<br>","\n")

def getBullets(content):
    mainSoup = BeautifulSoup(contents, "html.parser")

# Gets empty bullets
def getAllBullets(content):
    mainSoup = BeautifulSoup(str(content), "html.parser")
    subcategories = mainSoup.findAll('div',attrs={"class" : "CategoryTreeItem"})
    empty = []
    full = []
    for x in subcategories:
        subSoup = BeautifulSoup(str(x))
        link = str(subSoup.findAll('a')[0])
        if (str(x)).count("CategoryTreeEmptyBullet") > 0:
            empty.append(clean(link).replace(" ","_"))
        elif (str(x)).count("CategoryTreeBullet") > 0:
            full.append(clean(link).replace(" ","_"))

    return((empty,full))

def printTree(catName, count):
    catName = catName.replace("\\'","'")
    if count == MAX_DEPTH : return
    download(catRoot+catName, path)
    filepath = "categories/Category:"+catName+".html" 

    print(filepath) 
    content = open('filepath', 'w+')

    content.readlines()
    (emptyBullets,fullBullets) = getAllBullets(content)
    f.close()
    for x in emptyBullets:
        for i in range(count): 
          print ("  "),
    download(catRoot+x, "categories/Category:"+x+".html")
    print (x)
    for x in fullBullets:
      for i in range(count): 
          print ("  "),
      print (x)
      if x in done:
         print ("Done... "+x)
         continue
      done.append(x)
      try: printTree(x, count + 1)        
      except: 
          print ("ERROR: " + x)
name = "Cricket"
printTree(name, 0)

导入httplib2
从bs4导入BeautifulSoup
导入子流程
导入时间，wget
导入操作系统，操作系统路径
#声明
catRoot=”http://en.wikipedia.org/wiki/Category:"
最大深度=100
完成=[]
忽略=[]
路径='平凡'
#删除所有换行符并替换为空格
def removeNewLines（以文本形式）：
以文本形式返回。替换（'\n'，''）
#将链接下载到目标
def下载（链接，目的地）：
#打印链接
如果不存在os.path.exists（目标）或os.path.getsize（目标）==0：
subprocess.getoutput（'wget'+link+'-O'+dest+''））
打印（“下载”）
def ensureDir（f）：
如果操作系统路径不存在（f）：
os.mkdir（f）
#通过删除标记来清除文本
def清洁（文本）：
列表=列表（在文本中）
i、 j=0,0
而我”，“\n”）
def getBullets（内容）：
mainSoup=BeautifulSoup（内容为“html.parser”）
#得到空子弹
def getAllBullets（内容）：
mainSoup=BeautifulSoup（str（content），“html.parser”）
subcategories=mainSoup.findAll（'div'，attrs={“class”：“CategoryTreeItem”}）
空=[]
完整=[]
对于子类别中的x：
subsup=BeautifulSoup（str（x））
link=str（subsup.findAll（'a'）[0]）
如果（str（x））.count（“CategoryTreeEmptyBullet”）>0：
empty.append（clean（link）.replace（“，”））
elif（str（x））.count（“CategoryTreebill”）>0:
full.append（清除（链接）.replace（“，”））
返回（（空，满））
def打印树（catName，计数）：
catName=catName。替换（“\\'”、“'”）
如果计数=最大深度：返回
下载（catRoot+catName，路径）
filepath=“categories/Category:”+catName+“.html”
打印（文件路径）
content=open（'filepath'，'w+'））
content.readlines（）
（emptyBullets，fullBullets）=getAllBullets（内容）
f、 关闭（）
对于空按钮中的x：
对于范围内的i（计数）：
打印（“”），
下载（catRoot+x，“类别/类别：“+x+”.html”）
打印（x）
对于完整项目符号中的x：
对于范围内的i（计数）：
打印（“”），
打印（x）
如果x已完成：
打印（“完成…”+x）
持续
完成。追加（x）
try:printree（x，count+1）
除：
打印（“错误：+x）
name=“板球”
printTree（名称，0）

遇到的错误如下所示

我认为

f.close（）

应该是

content.close（）

在这种情况下，通常使用上下文管理器，但如下所示：

with open(filepath, 'w+') as content:
    (emptyBullets,fullBullets) = getAllBullets(content)

然后Python将为您关闭该文件，即使在出现异常的情况下也是如此

（我还将

'filepath'

更改为

filepath

，我认为这就是本文的目的。）

错误正确地指出，您没有定义要使用的

。你是说

content.close（）

吗？如果我错了，请纠正我，但是你将文件定义为content.close（）而不是

f.close（）

在

content.close之后，它抛出赋值之前引用的局部变量“x”
。我想这是不对的。谢谢弗洛里安。合并您的建议后，它会在帐户中添加一个新错误，即FileNotFoundError:[Errno 2]没有这样的文件或目录：“categories/Category:Cricket.html”
我无法找到您。请详细说明，明白了。但是我在采纳你的建议时使用了w+
。但是，在那之后，我得到了下面的错误命令。错误是关于：以open（filepath，'w+'）作为内容：
FileNotFoundError:[Errno 2]没有这样的文件或目录：'categories/Category/Cricket.html'
总之，Florian感谢您的帮助。这意味着很多。目录类别是否存在？