剥离html标记_Html_Python 3.x - Fatal编程技术网

剥离html标记

html python-3.x

剥离html标记,html,python-3.x,Html,Python 3.x,我知道这看起来像是一个已经被问过和回答过的问题，但我问这个问题的原因是因为我对这些问题的答案有疑问。因此，我用python编写了一个脚本，基本上读取一个包含表的文本文件，并以html文件的形式返回结果。我的老板现在希望我实现这个脚本，这样我基本上可以在html文件上使用它。因为我已经编写了在文本文件上运行的代码，所以我想从html文件中删除标记，并将文本存储到文本文件中，以便在原始脚本中读取。到目前为止，我得到的是：#来自numPy import loadtxt import sys from

我知道这看起来像是一个已经被问过并回答过的问题，但我之所以再问，是因为我对那些问题的答案仍有疑问。我用 Python 编写了一个脚本，它读取一个包含表格的文本文件，并把结果输出为 html 文件。我的老板现在希望我改造这个脚本，使它也能直接处理 html 文件。由于我已经写好了处理文本文件的代码，所以我想先去掉 html 文件中的标记，把剩下的文本保存到文本文件中，再交给原来的脚本读取。到目前为止，我写出的代码是：# from numPy import loadtxt

import sys
from urllib.request import urlopen
from html.parser import HTMLParser

class HTMLTextExtractor(HTMLParser):
    """Collect the text content of an HTML document, discarding all tags.

    Feed markup with ``feed()``, call ``close()`` once all input has been
    supplied, then read the accumulated text with ``get_text()``.
    """

    def __init__(self):
        # Reference conversion is switched off so that character and entity
        # references are delivered to handle_charref() / handle_entityref()
        # below instead of being decoded by the parser itself.
        super().__init__(convert_charrefs=False)
        self.result = []

    def handle_data(self, d):
        """Record a run of plain text found between tags."""
        self.result.append(d)

    def handle_charref(self, number):
        """Record the character for a numeric reference such as ``&#65;``.

        ``number`` is the reference body without ``&#`` and ``;``; a leading
        ``x``/``X`` marks a hexadecimal value. A value that is not a valid
        code point is kept in the output as the literal reference text.
        """
        is_hex = number[0] in ('x', 'X')
        try:
            codepoint = int(number[1:], 16) if is_hex else int(number)
            self.result.append(chr(codepoint))
        except (ValueError, OverflowError):
            # chr() rejects values outside the Unicode range.
            self.result.append('&#%s;' % number)

    def handle_entityref(self, name):
        """Record the character for a named reference such as ``&amp;``.

        An unknown name is kept in the output as ``&name;`` rather than
        raising ``KeyError``.
        """
        # Python 3 home of the table formerly found in ``htmlentitydefs``.
        from html.entities import name2codepoint

        codepoint = name2codepoint.get(name)
        if codepoint is None:
            self.result.append('&%s;' % name)
        else:
            self.result.append(chr(codepoint))

    def get_text(self):
        """Return everything collected so far as a single string."""
        return ''.join(self.result)

def html_to_text(html):
    """Return the text of *html* with every tag removed.

    ``html`` must be the markup itself as a string (for example the result
    of ``some_file.read()``), not a file object or a file name.
    """
    s = HTMLTextExtractor()
    s.feed(html)
    # close() makes the parser process whatever it is still buffering (for
    # instance a trailing, unterminated "&amp"), which would otherwise be
    # missing from the extracted text.
    s.close()
    return s.get_text()

# import BeautifulSoup

# Counters, all starting at zero.
numCorrect = numWrong = 0
amount9000 = amount540 = amount541 = 0

# Flags telling which section of the report is currently being read.
if9000 = if540 = if541 = ifSuccess = False

# sometimes the email would be repeated again and would mess my code up,
# so I used this to help
newNextLine = True

# Per-section progress markers, all starting at zero.
tester9000 = tester540 = tester541 = testerSuccess = 0

# Scratch strings, all starting empty.
tempStr = tempStr2 = ""
temp9000Str = temp9000Str2 = ""
temp541Str = temp541Str2 = ""

# url = "file:///C:/Python34/CID-Sync-0619.html"    # reading an html file



# Strip the markup from the HTML report and save the remaining plain text.
# The parser needs the file's *contents* as a string, so the file is read
# completely first; both files are closed as soon as they have been used.
with open("CID-Sync-0619.html", "r") as html_file:
    myString = html_file.read()
with open("ExtractFile.txt", "w") as Extractfile:
    Extractfile.write(html_to_text(myString))

# Start the HTML report: document prologue, opening table tag and header row.
# The file is left open because the data rows are written to it further down.
Resultfile = open("EndResult.html", "w")
Resultfile.writelines((
    "<!DOCTYPE html>\n",
    "<html>\n",
    "<body>\n",
    '<table border="5" style="width:1200px">',
    "<tr>",  # first row
    # cells within the row
    "  <td>e-mail</td>",
    "  <td>status</td> ",
    "  <td>CID</td>",
    "  <td>VHM ID</td>",
    "  <td>VHM Name</td>",
    "  <td>Error Message</td>",
    "  <td>SF Account ID</td>",
    "  <td>API Call Time</td>",
    "  <td>Verification Time</td>",
    "</tr>",  # end of first row
))





filename = "testing.txt"

# Read every line of the input up front; the file is closed as soon as the
# list has been built.
# NOTE(review): this reads "testing.txt", not the "ExtractFile.txt" written
# earlier in the script - confirm which file is meant to be processed.
with open(filename, "r") as file_object:
    myList = file_object.readlines()
print("List made")

# Walk the report one line at a time and write table rows to Resultfile.
#
# A heading line ("Verified Success", "Error Code:540", ...) switches the
# current section by setting exactly one of the if* flags. The matching
# tester* marker is set to 1 on the heading line and to 1000 once an e-mail
# row has been started in that section.
#
# NOTE(review): only the success and 540 sections have row-writing code in
# this part of the script; nothing here writes rows for 541 or 9000.
#
# skipRest used to be assigned only inside the 540 branch, so the
# `skipRest == False` test could raise NameError; it is initialised here.
# NOTE(review): it is never reset to False once set - confirm whether it
# should be cleared for each new line.
skipRest = False

for line in myList:
    if "Verified Success" in line:
        testerSuccess = 1
        ifSuccess = True
        if541 = False
        if540 = False
        if9000 = False
        print ("success")
    if "Error Code:540" in line:
        tester540 = 1
        if540 = True
        if541 = False
        ifSuccess = False
        if9000 = False
        print ("Error code 540")
    if "Error Code:541" in line:
        tester541 = 1
        if541 = True
        if9000 = False
        if540 = False
        ifSuccess = False
        print("Error code 541")
    if "Error Code: 9000" in line:  # this heading has a space after the colon
        tester9000 = 1
        if9000 = True
        if540 = False
        if541 = False
        ifSuccess = False
        print("Error code 9000")

    if ifSuccess:
        if "@" in line:
            # A line containing an e-mail address opens a new row (status 0).
            numCorrect = numCorrect + 1
            tempList = line.split()
            Resultfile.write("<tr>")  # row
            Resultfile.write("  <td> %s </td>" % tempList[0])
            Resultfile.write("  <td> 0 </td>")
            testerSuccess = 1000
        elif testerSuccess != 1:
            # Any other line after a row was opened supplies the remaining
            # cells and closes the row.
            temp = line.split()
            if len(temp) > 3:
                Resultfile.write("  <td> %s </td>" % temp[0])
                Resultfile.write("<td> </td>")
                Resultfile.write("<td> </td>")
                Resultfile.write("<td> </td>")
                Resultfile.write("  <td> %s </td>" % temp[1])
                Resultfile.write("  <td> %s </td>" % temp[2])
                Resultfile.write("  <td> %s</td>" % temp[3])
                Resultfile.write("</tr>")  # end of row

    if if540:
        if "@" in line:
            stopNextForLoop = False
            numWrong = numWrong + 1
            amount540 = amount540 + 1
            tempList2 = line.split("\t")
            if "@" not in tempList2[0]:
                # The "@" is not in the first tab-separated field, so the
                # counts just added are taken back and the fields are
                # written out as extra cells.
                numWrong = numWrong - 1
                amount540 = amount540 - 1
                skipRest = True  # check if this is True!!!!!!
                for items in tempList2:
                    Resultfile.write("<td> %s" % items)
                    stopNextForLoop = True
            tempStr = tempList2[0]
            if len(tempList2) > 1:
                if tempList2[1].lower() in tempStr2.lower() and not stopNextForLoop:
                    for items in tempList2:
                        Resultfile.write("<td> %s" % items)
                        skipRest = True
            else:
                tempStr2 = tempStr

            if not skipRest:
                Resultfile.write("<tr>")  # row
                Resultfile.write("  <td> %s </td>" % tempList2[0])
                Resultfile.write("  <td> 540 </td>")
                Resultfile.write("  <td> </td>")
                if len(tempList2) > 4:
                    Resultfile.write("  <td> %s </td>" % tempList2[1])
                    Resultfile.write("  <td> %s </td>" % tempList2[2])
                    Resultfile.write("  <td> %s </td>" % tempList2[3])
                    Resultfile.write("  <td> </td>")
                    Resultfile.write("  <td> %s</td>" % tempList2[4])
                    Resultfile.write("<td> </td>")
                    Resultfile.write("</tr>")  # end of row

                tester540 = 1000
        elif tester540 != 1:
            temp = line.split("\t")
            if len(temp) > 3:
                Resultfile.write("  <td> %s </td>" % temp[0])
print("after")

导入系统从urllib.request导入urlopen 从html.parser导入HTMLParser 类HtmlTextRactor（HtmlPasser）：定义初始化（自）： HTMLPasser.\uuuuu初始化\uuuuuuuuu（自） self.result=[] def句柄_数据（自身，d）： self.result.append（d） def handle_charref（自身，编号）：如果（u'x'，u'x'）中的数字[0]为整数，则代码点=整数（数字[1:]，16）否则为整数（数字） self.result.append（unichr（代码点）） def handle_entityref（自身，名称）： codepoint=htmlentitydefs.name2codepoint[name] self.result.append（unichr（代码点）） def get_文本（自身）：返回u“”。加入（self.result） def html_至_文本（html）： s=HtmlTextractor（） s、提要（html）返回s.get_text（） #进口美联 numCorrect=0 numWrong=0 数量9000=0 数量540=0 数量541=0 if9000=假 if540=假 if541=假如果成功=错误 newNextLine=True#有时电子邮件会反复出现 #会弄乱我的代码，所以我用这个来帮助测试器9000=0 测试者540=0 tester541=0 testerSuccess=0 tempStr=“” tempStr2=“” temp9000Str=“” temp9000Str2=“” temp541Str=“” temp541Str2=“” #url=”file:///C:/Python34/CID-Sync-0619.html“#读取html文件 Extractfile=open（“Extractfile.txt”、“w”）对象=打开（“CID-Sync-0619.html”、“r”） myString=object.read（）

# print(sub(…

html 参数不是 html 文件本身，而是 html 文件所包含的字符串（文本）。我把 html 文件的内容以字符串形式传入之后，整个过程运行得非常顺利。

请有人回答我的问题或为我的问题投票，这样我就可以在其他已有答案的页面上通过添加评论来提问。只有 1 点声望确实阻碍了我在 StackOverflow 上的学习过程。

如果你遇到了这个 AttributeError，你应该看看这个 `MLStripper` 对象是什么，以及为什么有代码试图访问它并不存在的属性。

我找到原因了：传入的 html 必须是字符串，而不是 html 文件。

File "<frozen importlib._bootstrap>", line 1153, in exec
  File "<frozen importlib._bootstrap>", line 1129, in _exec
  File "<frozen importlib._bootstrap>", line 1471, in exec_module
  File "<frozen importlib._bootstrap>", line 321, in _call_with_frames_removed
  File "C:\Python34\GetErrors.py", line 70, in <module>
    Extractfile.write(strip_tags(myString))
  File "C:\Python34\GetErrors.py", line 21, in strip_tags
    self.result.append(unichr(codepoint))
  File "C:\Python34\lib\html\parser.py", line 165, in feed
    self.goahead(0)
  File "C:\Python34\lib\html\parser.py", line 198, in goahead
    if self.convert_charrefs and not self.cdata_elem:
AttributeError: 'MLStripper' object has no attribute 'convert_charrefs'