Python 3.x 在python 3.4中使用bs4查找页面中的所有文本_Python 3.x_Beautifulsoup

Python 3.x 在python 3.4中使用bs4查找页面中的所有文本

python-3.x

Python 3.x 在python 3.4中使用bs4查找页面中的所有文本,python-3.x,beautifulsoup,Python 3.x,Beautifulsoup,我试图用bs4抓取一个URL，但我无法获取页面上所有可见的单词。我也尝试了查找所有（text=true），但没有成功网址： import argparse 导入操作系统进口稀土导入请求从集合导入订单从bs4导入BeautifulSoup title=“由Sagar编写” 印刷品（标题） parser=argparse.ArgumentParser（description=title，formatter\u class=argparse.RawTextHelpFormatter） ope

我试图用bs4抓取一个URL，但我无法获取页面上所有可见的单词。我也尝试了

`find_all(text=True)`

，但没有成功

网址：

import argparse
导入操作系统
进口稀土
导入请求
从集合导入订单
从bs4导入BeautifulSoup
title=“由Sagar编写”
印刷品（标题）
parser=argparse.ArgumentParser（description=title，formatter\u class=argparse.RawTextHelpFormatter）
operationGroup=解析器。添加互斥组（必需=True）
operationGroup.add_参数（'-l'，action=“store”，dest=“webList”，
help=“指定一个文本文件，其中包含要刮取的URL列表（以换行符分隔）。”
optionGroup=解析器。添加参数组（'参数和选项'）
optionGroup.add_参数（'-o'，action=“store”，dest=“outputFile”，
help=“输出文件名。（默认值：wordlist.txt）”
optionGroup.add_参数（'-min'，action=“store”，dest=“minLength”，type=int，
help=“设置每个单词的最小字符数（默认值：3）。”
optionGroup.add_参数（'-max'，action=“store”，dest=“maxLength”，type=int，
help=“设置每个单词的最大字符数（默认值：30）。”
args=parser.parse_args（）
def可见（元素）：
如果['style'，'script'，'document'，'head'，'title']中的element.parent.name：
返回错误
elif re.match（“”，str（元素））：
返回错误
返回真值
def webUrl（完整URL）：
#URL验证
validull=re.compile(
r'^（？：http）s？：//|#http://或https://
r'^（？：http）s？：//www
r’（？：（？：[A-Z0-9]（？：[A-Z0-9-]{0,61}[A-Z0-9]）？\）+（？：[A-Z]{2,6}\.？|[A-Z0-9-]{2,6}\）|"域。。。
r'localhost |'#localhost。。。
r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'）或ip
r'（？：\d+）#可选端口
r'（？：/？|[/？]\S+$”，re.IGNORECASE）
如果validUrl.匹配（完整URL）：
尝试：
u=请求。获取（完整URL）
html=u.content.decode（'utf8'）
soup=BeautifulSoup（html）
#tokens=soup.get_text（）
text=soup.get_text（）
可见文本=过滤器（可见，文本）
如果args.minLength或args.maxLength：
对于文本中的令牌：
如果不是（len（token.translate（None，charBlacklist））maxl）：
追加（str（token））
其他：
对于文本中的令牌：
追加（str（token））
打印（“抓取URL-{0}”。格式（完整URL））
例外情况除外，如e：
print（'连接到或分析{0}时出错。格式（fullUrl））
打印（'错误：%s“%e”）
其他：
打印（'无效的URL-{0}。格式必须是http://www.smeegesec.com.'.Format（fullUrl））
def webList（webListFile）：
如果os.path.isfile（webListFile）：
将打开（webListFile）作为f：
webList=f.readlines（）
对于webList中的url：
webUrl（url.rstrip（'\n'））
f、 关闭（）
其他：
打印（'打开文件时出错'）
def输出（）：
尝试：
如果不是args.outputFile：
args.outputFile='wordlist.txt'
outputFile=open（args.outputFile，“w”）
wordListFinal=OrderedDict.fromkeys（wordList.keys）（）
对于wordListFinal中的word：
打印（word）
outputFile.write（word）
outputFile.write（'\n'）
outputFile.close（）
打印（“\n{0}个唯一的单词已被刮除。”。格式（len（单词列表）））
打印（'输出文件成功写入：{0}'。格式（outputFile.name））
例外情况除外，如e：
打印（'创建输出文件时出错：{0}'。格式（outputFile.name））
打印（e）
如果名称=“\uuuuu main\uuuuuuuu”：
wordList=list（）
黑名单=“”
如果args.minLength或args.maxLength：
如果args.minLength为3，则minl=args.minLength
如果args.maxLength为30，则maxl=args.maxLength
如果minl>maxl：
print（'参数minLength不能大于maxLength。设置默认为min=3 max=30'）
minl=3
最大值=30
黑名单=“”
如果args.webList：
网络列表（args.webList）
输出（）

最好包括预期输出和实际输出的示例，或者您看到的任何错误的详细信息。有关更多详细信息，请参阅。使用Ctrl-K在代码中复制缩进，谢谢在问题中没有位置（投票并接受），这样

之间的文本就会显示出来。

import argparse
import os
import re
import requests
from collections import OrderedDict
from bs4 import BeautifulSoup

# Banner text: printed once at start-up and reused as the argparse description.
title = "Written by Sagar"
print(title)

# RawTextHelpFormatter keeps the help strings' own line breaks instead of
# re-wrapping them.
parser = argparse.ArgumentParser(description=title,
                                 formatter_class=argparse.RawTextHelpFormatter)

# Input source. The group is marked required, so -l must always be supplied.
operationGroup = parser.add_mutually_exclusive_group(required=True)
operationGroup.add_argument('-l', action="store", dest="webList",
                            help="Specify a text file with a list of URLs to scrape (separated by newline).")

# Optional settings. argparse leaves these as None when omitted; the defaults
# quoted in the help text are applied later by the code that reads `args`.
optionGroup = parser.add_argument_group('parameters and options')
optionGroup.add_argument('-o', action="store", dest="outputFile",
                         help="Output filename. (Default: wordlist.txt)")
optionGroup.add_argument('-min', action="store", dest="minLength", type=int,
                         help="Set the minimum number of characters for each word (Default: 3).")
optionGroup.add_argument('-max', action="store", dest="maxLength", type=int,
                         help="Set the maximum number of characters for each word (Default: 30).")

# Parsed at module level: the functions below read `args` as a global.
args = parser.parse_args()


# Parent tag names whose text is treated as not visible.
_HIDDEN_PARENTS = ('style', 'script', '[document]', 'head', 'title')

# DOTALL lets '.' cross newlines so comments spanning several lines are
# recognised as well as single-line ones.
_HTML_COMMENT = re.compile(r'<!--.*-->', re.DOTALL)


def visible(element):
    """Return True when *element* looks like text a reader would see.

    An element is rejected when its parent's name is one of
    ``_HIDDEN_PARENTS`` or when its string form starts with an HTML comment
    (``<!-- ... -->``, possibly spanning several lines).

    Args:
        element: a node whose ``parent`` attribute (if any) has a ``name``.
            Objects without a parent, including plain strings, are accepted
            and only checked against the comment pattern.

    Returns:
        bool: False for hidden or comment text, True otherwise.
    """
    # Root nodes and plain strings have no usable parent; treat that as
    # "not inside a hidden tag" instead of raising AttributeError.
    parent = getattr(element, 'parent', None)
    if parent is not None and getattr(parent, 'name', None) in _HIDDEN_PARENTS:
        return False

    # NOTE(review): bs4 Comment nodes may stringify without the <!-- -->
    # delimiters, in which case this pattern never fires for them; consider
    # isinstance(element, bs4.Comment) - confirm against the bs4 version in use.
    if _HTML_COMMENT.match(str(element)):
        return False

    return True


# http(s)://<domain | localhost | IPv4>[:port][/path-or-query]
# The scheme is a single required prefix; writing it as its own top-level
# alternative would make every string starting with "http://" match.
_VALID_URL = re.compile(
    r'^https?://'  # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # or ip
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)


def webUrl(fullUrl):
    """Fetch one page and append its visible words to the global ``wordList``.

    The URL is validated first; an invalid URL is reported on stdout and
    nothing is fetched. Otherwise the page is downloaded, its text nodes are
    filtered through ``visible()``, split on whitespace into words, and each
    word is appended to ``wordList``. When ``args.minLength`` or
    ``args.maxLength`` was given, only words within the global ``minl`` /
    ``maxl`` bounds are kept (the minimum is checked after removing the
    characters in ``charBlacklist``).

    Args:
        fullUrl: absolute http:// or https:// URL of the page to scrape.

    Returns:
        None. Results go to ``wordList``; progress and errors are printed.
        Any exception raised while fetching or parsing is caught and printed.
    """
    if not _VALID_URL.match(fullUrl):
        print('INVALID URL - {0}. Format must be http(s)://www.smeegesec.com.'.format(fullUrl))
        return

    try:
        response = requests.get(fullUrl)
        # Pass the raw bytes so the parser can determine the page encoding
        # itself instead of assuming UTF-8; name the parser explicitly so the
        # result does not depend on which optional parsers are installed.
        soup = BeautifulSoup(response.content, 'html.parser')

        # get_text() returns one big string, so iterating it yields single
        # characters. find_all(text=True) yields the individual text nodes,
        # which still know their parent tag and can be filtered by visible().
        visible_texts = filter(visible, soup.find_all(text=True))
        tokens = [token for text in visible_texts for token in text.split()]

        if args.minLength or args.maxLength:
            # Python 3 str.translate takes a single table; build one that
            # deletes the blacklisted characters.
            strip_blacklist = str.maketrans('', '', charBlacklist)
            for token in tokens:
                if len(token.translate(strip_blacklist)) >= minl and len(token) <= maxl:
                    wordList.append(token)
        else:
            wordList.extend(tokens)

        print("Scraping URL - {0}".format(fullUrl))
    except Exception as e:
        # Best effort: one bad page must not abort the whole URL list.
        print('There was an error connecting to or parsing {0}'.format(fullUrl))
        print('Error: %s' % e)


def webList(webListFile):
    """Scrape every URL listed in *webListFile*, one URL per line.

    Surrounding whitespace (including a trailing ``\\r`` from CRLF files) is
    stripped from each line and blank lines are skipped, so they are not
    reported as invalid URLs. Each remaining entry is passed to ``webUrl()``.

    Args:
        webListFile: path to a text file containing the URLs.

    Returns:
        None. Prints 'Error opening file' when the path is not a regular file.
    """
    if not os.path.isfile(webListFile):
        print('Error opening file')
        return

    # Read everything up front so the file is closed before any network work.
    with open(webListFile) as f:
        urls = [line.strip() for line in f]

    for url in urls:
        if url:
            webUrl(url)


def output():
    """Write the de-duplicated contents of ``wordList`` to the output file.

    Words are written one per line in first-seen order and echoed to stdout.
    The destination is ``args.outputFile``; when that is unset it is set to
    'wordlist.txt' first. A summary with the number of unique words is
    printed afterwards.

    Returns:
        None. Any exception raised while writing is caught and printed.
    """
    if not args.outputFile:
        args.outputFile = 'wordlist.txt'

    try:
        # OrderedDict.fromkeys drops duplicates while keeping scrape order.
        wordListFinal = list(OrderedDict.fromkeys(wordList))

        # Scraped text can contain any character, so do not depend on the
        # platform's default encoding; `with` closes the file on errors too.
        with open(args.outputFile, 'w', encoding='utf-8') as outputFile:
            for word in wordListFinal:
                print(word)
                outputFile.write(word)
                outputFile.write('\n')

        # Count the de-duplicated list: the message promises unique words.
        print('\n{0} unique words have been scraped.'.format(len(wordListFinal)))
        print('Output file successfully written: {0}'.format(args.outputFile))
    except Exception as e:
        # Report via args.outputFile: the file object does not exist when
        # open() itself is what failed.
        print('Error creating output file: {0}'.format(args.outputFile))
        print(e)


if __name__ == "__main__":
    # Module-level state shared with webUrl() and output().
    wordList = []
    charBlacklist = ""

    # Length bounds are only set up when at least one was requested on the
    # command line; a missing (or zero) bound falls back to 3 / 30.
    if args.minLength or args.maxLength:
        minl = args.minLength or 3
        maxl = args.maxLength or 30
        if minl > maxl:
            print('Argument minLength cannot be greater than maxLength. Setting defaults to min=3 max=30.')
            minl, maxl = 3, 30

    # Scrape every URL in the list file, then write out the collected words.
    if args.webList:
        webList(args.webList)

    output()