剥离html标记
我知道这个问题看起来已经有人问过并得到了回答,但我之所以再问一次,是因为我在使用那些答案时遇到了问题。我用Python编写了一个脚本,它读取一个包含表格的文本文件,并以html文件的形式输出结果。我的老板现在希望我改造这个脚本,使它可以直接用于html文件。因为我已经写好了处理文本文件的代码,所以我想先去掉html文件中的标记,把文本保存到一个文本文件中,再交给原来的脚本读取。到目前为止,我写出的代码是:# from numpy import loadtxt 剥离html标记,html,python-3.x,Html,Python 3.x,我知道这个问题看起来已经有人问过并得到了回答,但我之所以再问一次,是因为我在使用那些答案时遇到了问题。我用Python编写了一个脚本,它读取一个包含表格的文本文件,并以html文件的形式输出结果。我的老板现在希望我改造这个脚本,使它可以直接用于html文件。因为我已经写好了处理文本文件的代码,所以我想先去掉html文件中的标记,把文本保存到一个文本文件中,再交给原来的脚本读取。到目前为止,我写出的代码是:# from numpy import loadtxt import sys from
import sys
from urllib.request import urlopen
from html.parser import HTMLParser
class HTMLTextExtractor(HTMLParser):
    """Collect the text content of an HTML document, discarding the markup.

    Feed HTML with ``feed()`` (and finish with ``close()``), then call
    ``get_text()`` to obtain the concatenated text.  Character and entity
    references are decoded to the characters they stand for.
    """

    def __init__(self):
        super().__init__()
        # Text fragments in document order; joined by get_text().
        self.result = []

    def handle_data(self, d):
        """Record a run of plain text found between tags."""
        self.result.append(d)

    def handle_charref(self, number):
        """Decode a numeric reference such as ``&#65;`` or ``&#x41;``.

        ``number`` is the reference without the leading ``&#`` and the
        trailing ``;`` (for example ``"65"`` or ``"x41"``).
        """
        # Imported locally so the class does not depend on a module-level
        # name that may be shadowed (e.g. by a parameter called ``html``).
        from html import unescape

        # html.unescape replaces the Python 2 ``unichr`` call and also copes
        # with out-of-range code points instead of raising.
        self.result.append(unescape('&#%s;' % number))

    def handle_entityref(self, name):
        """Decode a named reference such as ``&amp;``.

        ``name`` is the entity name without ``&`` and ``;``.  Names that are
        not known entities are kept in the output as literal ``&name;`` text
        rather than raising ``KeyError``.
        """
        from html import unescape

        self.result.append(unescape('&%s;' % name))

    def get_text(self):
        """Return all text collected so far as a single string."""
        return ''.join(self.result)
def html_to_text(html):
    """Return the text content of ``html`` with all markup removed.

    ``html`` must be a string holding HTML source (the contents of a file,
    not a file object or a path).
    """
    s = HTMLTextExtractor()
    s.feed(html)
    # close() makes the parser process any input it is still buffering
    # (e.g. an unterminated trailing "&"), so no text at the end is lost.
    s.close()
    return s.get_text()
# import BeautifulSoup

# Counters; the main loop below increments numCorrect, numWrong and amount540.
numCorrect = numWrong = 0
amount9000 = amount540 = amount541 = 0

# Which status marker was seen most recently; the main loop sets exactly one
# of these to True whenever it meets a marker line.
if9000 = if540 = if541 = ifSuccess = False

newNextLine = True  # sometimes the email would be repeated again and
                    # would mess my code up, so I used this to help

# Per-status progress markers: the loop sets one to 1 when the matching
# marker line is seen (and testerSuccess / tester540 to 1000 later on).
tester9000 = tester540 = tester541 = testerSuccess = 0

# Scratch strings used while comparing consecutive lines.
tempStr = tempStr2 = ""
temp9000Str = temp9000Str2 = ""
temp541Str = temp541Str2 = ""
# url = "file:///C:/Python34/CID-Sync-0619.html" # reading an html file
# Read the HTML report into a string; the tag stripper works on the HTML
# source text, not on a file object.
with open("CID-Sync-0619.html", "r") as html_file:
    myString = html_file.read()
# print(sub('<[^<]+?>', '', myString))
# Strip the markup with html_to_text (defined above) and store the plain text.
# The with-blocks close both files, so the written text is flushed to disk
# before anything else tries to read ExtractFile.txt.
with open("ExtractFile.txt", "w") as Extractfile:
    Extractfile.write(html_to_text(myString))
# Start the HTML report.  Resultfile is left open on purpose: the main loop
# further down keeps appending table rows to it.
Resultfile = open("EndResult.html", "w")
Resultfile.writelines((
    # document preamble
    "<!DOCTYPE html>\n",
    "<html>\n",
    "<body>\n",
    '<table border="5" style="width:1200px">',
    # first row: one cell per column heading
    "<tr>",
    " <td>e-mail</td>",
    " <td>status</td> ",
    " <td>CID</td>",
    " <td>VHM ID</td>",
    " <td>VHM Name</td>",
    " <td>Error Message</td>",
    " <td>SF Account ID</td>",
    " <td>API Call Time</td>",
    " <td>Verification Time</td>",
    "</tr>",  # end of first row
))
# Load every line of the input text file into memory for the main loop.
# NOTE(review): this reads "testing.txt", not the "ExtractFile.txt" written
# above -- confirm which file is meant to feed the loop.
filename = "testing.txt"
with open(filename, "r") as file_object:  # closed as soon as the lines are read
    myList = file_object.readlines()
print("List made")
# Main pass over the input lines: each line is checked for a status marker
# ("Verified Success", "Error Code:540", "Error Code:541", "Error Code: 9000")
# and table cells / rows are written to Resultfile.
# NOTE(review): this copy of the script has lost all leading indentation, so
# the nesting of the if/else/for bodies cannot be read off the text.  Restore
# it from the original file before running; the comments below describe only
# what the individual statements do.
for line in myList:
# "Verified Success" marker: enter the success state, clear the error states.
if("Verified Success" in line):
testerSuccess = 1
ifSuccess = True
if541 = False
if540 = False
if9000 = False
print ("success")
# "Error Code:540" marker: enter the 540 state, clear the other states.
if("Error Code:540" in line):
tester540 = 1
if540 = True
if541 = False
ifSuccess = False
if9000 = False
print ("Error code 540")
# "Error Code:541" marker: enter the 541 state, clear the other states.
if("Error Code:541" in line):
tester541 = 1
if541 = True
if9000 = False
if540 = False
ifSuccess = False
print("Error code 541")
# "Error Code: 9000" marker (note the space after the colon, unlike 540/541).
if("Error Code: 9000" in line):
tester9000 = 1
if9000 = True
if540 = False
if541 = False
ifSuccess = False
print("Error code 9000")
# NOTE(review): if541 and if9000 are set above but never tested in the visible
# part of the loop; only the success and 540 states produce table output here.
# Success state: a line containing "@" is counted in numCorrect and starts a
# row whose first cell is the first whitespace-separated token, with status 0.
if(ifSuccess):
if("@" in line):
numCorrect = numCorrect + 1
tempList = line.split()
Resultfile.write("<tr>") # row
Resultfile.write(" <td> %s </td>" %tempList[0])
Resultfile.write(" <td> 0 </td>")
testerSuccess = 1000
else:
# testerSuccess is 1 only right after the marker line was seen (it becomes
# 1000 once an "@" line has been handled).
if(testerSuccess != 1 ):
temp = line.split()
# print (temp)
# print("the length is %d" %len(temp))
# Lines with more than three tokens supply the remaining cells of the row:
# token 0, three empty cells, then tokens 1-3, followed by the row end.
if(len(temp)>3):
Resultfile.write(" <td> %s </td>"%temp[0])
Resultfile.write("<td> </td>")
Resultfile.write("<td> </td>")
Resultfile.write("<td> </td>")
Resultfile.write(" <td> %s </td>" %temp[1])
Resultfile.write(" <td> %s </td>"% temp[2])
Resultfile.write(" <td> %s</td>" % temp[3])
Resultfile.write("</tr>") # end of row
# 540 state: lines are split on tabs instead of arbitrary whitespace.
if(if540):
if("@" in line):
stopNextForLoop = False
numWrong = numWrong + 1
amount540 = amount540 + 1
tempList2 = line.split("\t")
# If the "@" is not in the first tab-separated field, undo the counts just
# made and dump every field as a cell.
if("@" not in tempList2[0]):
numWrong = numWrong - 1
amount540 = amount540 - 1
# NOTE(review): skipRest is only ever assigned True in the visible code (here
# and below) and is never reset to False; the later "if(skipRest == False)"
# would raise NameError if neither assignment has run -- confirm where it is
# meant to be initialised/reset.
skipRest = True # check if this is True!!!!!!
# NOTE(review): these cells are written as "<td> %s" with no closing </td>.
for items in tempList2:
Resultfile.write("<td> %s" %items)
stopNextForLoop = True
tempStr = tempList2[0]
# print ("1: " +tempStr)
# print("2: "+tempStr2)
# Compare the second field with the remembered first field of an earlier line
# (tempStr2), case-insensitively; on a match dump the fields as cells.
if(len(tempList2)>1):
# print("in the first")
if(tempList2[1].lower() in tempStr2.lower() and stopNextForLoop == False):
for items in tempList2:
Resultfile.write("<td> %s" %items)
skipRest = True
# if(tempStr.lower() in tempStr2.lower()):
# numWrong = numWrong - 1
# amount540 = amount540 - 1
# print("in here")
else:
tempStr2 = tempStr
# Regular 540 row: first field, status 540, an empty cell, then (when there
# are more than four fields) fields 1-4 with empty cells in between.
if(skipRest == False):
# print (tempList2)
Resultfile.write("<tr>") # row
Resultfile.write(" <td> %s </td>" %tempList2[0])
Resultfile.write(" <td> 540 </td>")
Resultfile.write(" <td> </td>")
if(len(tempList2)>4):
Resultfile.write(" <td> %s </td>"%tempList2[1])
Resultfile.write(" <td> %s </td>" %tempList2[2])
Resultfile.write(" <td> %s </td>"% tempList2[3])
Resultfile.write(" <td> </td>")
Resultfile.write(" <td> %s</td>" % tempList2[4])
Resultfile.write("<td> </td>")
# Resultfile.write(" <td> %s </td>" % temp[5])
Resultfile.write("</tr>") # end of row
tester540 = 1000
else:
# tester540 is 1 only right after the marker line was seen (it becomes 1000
# once an "@" line has been handled).
if(tester540 != 1 ):
temp = line.split("\t")
# print (temp)
if(len(temp)>3):
Resultfile.write(" <td> %s </td>"%temp[0])
# NOTE(review): the visible code never writes the closing </table>, </body>,
# </html> tags and never closes Resultfile -- confirm this happens elsewhere.
print("after")
import sys
from urllib.request import urlopen
from html.parser import HTMLParser
class HTMLTextExtractor(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self.result = [ ]
def handle_data(self, d):
self.result.append(d)
def handle_charref(self, number):
codepoint = int(number[1:], 16) if number[0] in (u'x', u'X') else int(number)
self.result.append(unichr(codepoint))
def handle_entityref(self, name):
codepoint = htmlentitydefs.name2codepoint[name]
self.result.append(unichr(codepoint))
def get_text(self):
return u''.join(self.result)
def html_to_text(html):
s = HTMLTextExtractor()
s.feed(html)
return s.get_text()
# import BeautifulSoup
numCorrect = 0
numWrong = 0
amount9000 = 0
amount540 = 0
amount541 = 0
if9000 = False
if540 = False
if541 = False
ifSuccess = False
newNextLine = True # sometimes the email would be repeated again and
# would mess my code up, so I used this to help
tester9000 = 0
tester540 = 0
tester541 = 0
testerSuccess = 0
tempStr = ""
tempStr2 = ""
temp9000Str = ""
temp9000Str2 = ""
temp541Str = ""
temp541Str2 = ""
# url = "file:///C:/Python34/CID-Sync-0619.html" # reading an html file
Extractfile = open("ExtractFile.txt" , "w")
object = open("CID-Sync-0619.html", "r")
myString = object.read()
# print(sub('<[^<]+?>', '', myString))
html参数不应是html文件本身,而应是html文件所包含的字符串(文本)。我把html文件的内容作为字符串传入之后,整个过程就非常顺利了。请哪位回答一下我的问题,或者给我的问题投个赞成票,这样我就能在其他已有答案的页面上通过添加评论来提问。声望只有1确实妨碍了我在StackOverflow上的学习。如果你遇到了这个AttributeError,你应该看看这个MLStripper对象是什么,以及为什么有代码试图访问一个不存在的属性。我找到原因了:html必须是字符串,而不是html文件。
File "<frozen importlib._bootstrap>", line 1153, in exec
File "<frozen importlib._bootstrap>", line 1129, in _exec
File "<frozen importlib._bootstrap>", line 1471, in exec_module
File "<frozen importlib._bootstrap>", line 321, in _call_with_frames_removed
File "C:\Python34\GetErrors.py", line 70, in <module>
Extractfile.write(strip_tags(myString))
File "C:\Python34\GetErrors.py", line 21, in strip_tags
self.result.append(unichr(codepoint))
File "C:\Python34\lib\html\parser.py", line 165, in feed
self.goahead(0)
File "C:\Python34\lib\html\parser.py", line 198, in goahead
if self.convert_charrefs and not self.cdata_elem:
AttributeError: 'MLStripper' object has no attribute 'convert_charrefs'