用Python从主站点抓取URL,并将clean处理后的标记源码与URL一起保存到文件中
下面是我的代码(URL扫描器):
import urllib2
import urllib
from lxml.html import fromstring
from lxml.html.clean import Cleaner
from formatter import NullFormatter
import cookielib
import urllib,time
import urlparse
import datetime
import new
from htmllib import HTMLParser
from lxml.html import fromstring
from lxml.html.clean import Cleaner
import urllib2
import sys,popen2,os
import urlparse
def tagclean(url, Data=None):
    """Fetch *url* and return the text of its <body> with markup stripped.

    lxml's Cleaner drops scripts, styles, links and a fixed set of
    structural tags; the remaining text is returned ASCII-encoded
    (non-ASCII characters silently dropped).  *Data* is accepted for
    compatibility but never used.
    """
    page_source = urllib2.urlopen(url).read()
    document = fromstring(page_source)
    strip_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                  'div', 'span', 'img', 'area', 'map']
    cleaner = Cleaner(meta=False, safe_attrs_only=False,
                      page_structure=False, scripts=True,
                      style=True, links=True, remove_tags=strip_tags)
    body_node = document.xpath('/html/body')[0]
    cleaned = cleaner.clean_html(body_node)
    return cleaned.text_content().encode('ascii', 'ignore')
def writetofile(text, filename):
    """Append str(text) to *filename*, creating the file if it does not exist.

    Fixes: the original built the path via a pointless ""+filename+""
    concatenation and opened the file without a context manager, leaking
    the handle if write() raised.
    """
    with open(filename, "a") as out:
        out.write(str(text))
if __name__ == "__main__":
    # NOTE: Python 2 script (raw_input; urllib2/urlparse elsewhere in this file).
    url = raw_input("Enter url:")
    # Derive a filesystem-safe output name: strip the scheme, dots -> underscores.
    # The original indexed split("http://")[1] unconditionally and raised
    # IndexError when the scheme was missing; guard that case.
    if "http://" in url:
        name_part = url.split("http://")[1]
    else:
        name_part = url
    spliturl = name_part.replace(".", "_")
    metin = str(tagclean(url))
    # BUG FIX: the original called writetofile(text, ...) with the undefined
    # name `text` (NameError); the cleaned page text is in `metin`.
    writetofile(metin, spliturl + ".txt")
我想把 tagclean 和这个结合起来……我在这里做一个大胆的猜测,因为我并不完全清楚你想实现什么:你是不是想把下面这一行替换掉?
def scanurl(url):
print "saving: ",url,datetime.datetime.now().strftime("%H:%M:%S")
tmp=urllib.urlretrieve(url)
print "saving finished",datetime.datetime.now().strftime("%H:%M:%S")
parser= HTMLParser(NullFormatter( ))
parser.feed( open(tmp[0]).read( ) )
urls=[]
for a in parser.anchorlist:
urls.append(urlparse.urljoin( url, a ))
return urls
与
如果不是,那么你确实需要更详细地阐述你的问题。请在提问前多下点功夫:你遇到了什么问题?你尝试了什么?为什么不起作用?你想解决哪个具体问题?能否给出你所处理数据的示例?——我只是想把标记清理和URL解析结合起来:用Python从主站点抓取URL,并把经过 tagclean 清理后的源码连同URL一起保存到文件中。
parser.feed( open(tmp[0]).read( ) )
parser.feed( tagclean(url) )