Python Wiki路径搜索
出于个人的突发奇想，我编写了一些代码来搜索任意两篇维基百科文章之间最短的链接序列。事实证明，这是一种非常暴力的搜索方式：如果目标深度超过一两层链接，就需要很长时间才能找到，但它是有效的！我最终会记录并利用链接路径等信息，但我想先让搜索本身尽可能高效。有没有更快的方法来做这件事，或者有什么好办法避免一些主要的弯路？
import urllib2
from bs4 import BeautifulSoup
Start = 'http://en.wikipedia.org/wiki/Alan_Reid_%28politician%29'
End = 'http://en.wikipedia.org/wiki/Ayr'
#Using BeautifulSoup, this grabs the page
def soup_request(target):
request = urllib2.Request(target)
request.add_header("User-Agent", "Mozilla/5.0")
page = urllib2.urlopen(target)
soup = BeautifulSoup(page)
return soup
#This will grab all Wiki links off a given page
def get_links(Start):
soup = soup_request(Start)
Wiki_links = []
#Finds all links
for url in soup.findAll('a'):
result = url.get('href')
try:
if str(result)[:5] == '/wiki':
Wiki_links.append(result)
except:
pass
for q in range(len(Wiki_links)):
Wiki_links[q] = 'http://en.wikipedia.org'+str(Wiki_links[q])
print "Got new links from",Start
return Wiki_links
#This will check all the given links to see if the title matches the goal webpage
def check_links(Links,End):
goalsoup = soup_request(End)
goaltitle = goalsoup.html.title
Found = False
count = 0
for q in Links:
if Found:
break
length = len(Links)
#Runs through all the given links and checks their titles for correct one
if q is not None:
count += 1
soup = soup_request(q)
print "Checked",count,"links out of",length
try:
title = soup.html.head.title
if title == goaltitle:
Found = True
print "Found it!"
break
except:
print 'doh'
pass
return Found
#Top function to do all the stuff in the right order, applying a maximum depth of how deep into the links
def wiki_crawl(Start, End, depth):
Old_Links = [Start]
count = depth
while count > 0:
New_Links = []
for q in range(len(Old_Links)):
New_Links.extend(get_links(Old_Links[q]))
Found = check_links(New_Links,End)
if Found:
print "All done."
break
Old_Links = New_Links
count -= 1
print "_______________________________________________________________ROUND DONE"
if not Found:
print "Did not find the page, you must go deeper!"
wiki_crawl(Start, End, 2)
下面是一些从维基百科获取信息的函数。唯一的问题是，它有时会把网页信息中的一个空格删掉。
def take_out_parenthesis(st):
string = list(st)
for a in string:
if a == '(':
del string[st.find(a)]
if a == ')':
del string[st.find(a) - 1]
return ''.join(string)
def take_out_tags(string):
st = list(string)
odd = ['<', '>']
times = 0
for a in string:
if a in odd:
times += 1
times /= 2
for b in range(times):
start = string.find('<') - 1
end = string.find('>')
bet = end - start + 1
for a in range(bet):
del st[start]
string = ''.join(st)
return string
def take_out_brackets(string):
st = list(string)
odd = ['[', ']']
times = 0
for a in string:
if a in odd:
times += 1
times /= 2
for b in range(times):
start = string.find('[') - 1
end = string.find(']')
bet = end - start + 1
for a in range(bet):
del st[start]
string = ''.join(st)
return string
def take_from_web_page(text):
n = 0
url = text.replace(" ", "_")
search = "http://en.wikipedia.org/wiki/%s" % url
page = urllib2.urlopen(search).read()
start = page.find('<p><b>') + 6
end = page.find('</a>.', start) + 5
new_page = page[start:end]
for a in new_page:
if a == '<':
if new_page[n - 1] != ' ':
lst = list(new_page)
lst.insert(n, ' ')
new_page = ''.join(lst)
n += 1
n += 1
return take_out_parenthesis(take_out_brackets(take_out_tags(new_page)))
（以下原有内容是上文 take_out_parenthesis、take_out_tags、take_out_brackets、take_from_web_page 四个函数经机器翻译后产生的损坏副本——标识符、引号和运算符均被翻译破坏，且在网页抽取时于函数中途被截断。其完整、可运行的英文原文即为上文的同名代码，此处不再重复。）