Python 获取Wikipedia文章中不在括号内的第一个链接_Python_Parsing_Dom

Python 获取Wikipedia文章中不在括号内的第一个链接

python parsing dom

Python 获取Wikipedia文章中不在括号内的第一个链接,python,parsing,dom,Python,Parsing,Dom,所以我感兴趣的是，如果你随机浏览一篇维基百科文章，反复点击第一个链接，而不是括号内的链接，在95%的情况下，你最终会看到关于维基百科的文章我想用Python编写一个脚本，为我获取链接，最后打印一个访问过哪些文章的列表（linkA->linkB->linkC）等等我设法获得了网页的HTMLDOM，并设法去掉了一些不必要的链接和顶部的描述栏，从而消除了网页的歧义。到目前为止，我的结论是： DOM以某些页面右侧的表开始，例如在中。我们想忽略这些链接有效的链接元素都有一个元素作为它们的祖先（如果

我感兴趣的是这样一个现象：如果你随机打开一篇维基百科文章，并反复点击正文中第一个不在括号内的链接，那么在95%的情况下，你最终会到达关于哲学（Philosophy）的文章。

我想用Python编写一个脚本，为我获取链接，最后打印一个访问过哪些文章的列表（

linkA->linkB->linkC

）等等

我已经设法获得了网页的HTML DOM，并设法去掉了一些不必要的链接，以及页面顶部指向消歧义页面的描述栏。到目前为止，我的结论是：

在某些页面中，显示在页面右侧的表格在DOM中最先出现（原文此处的示例链接已丢失）。我们想忽略这些表格中的链接。

有效的链接元素都有一个 `<p>` 元素作为它们的祖先（如果链接嵌套在 `<b>` 之类的标记中，则 `<p>` 通常是它的父节点或祖父节点）。指向消歧义页面的顶部栏似乎不包含任何 `<p>` 元素。


无效链接包含一些后跟冒号的特殊单词，例如Wikipedia:


到目前为止，还不错。但是括号把我难住了。例如，在关于 Human 的文章中，第一个不在括号内的链接是“/wiki/Species”，但脚本找到的却是位于括号内的“/wiki/Taxonomy”。
我不知道如何通过编程来实现这一点，因为我必须在父节点/子节点的某些组合中查找文本，这些组合可能并不总是相同的。有什么想法吗
我的代码可以在下面看到，但这是我很快编出来的，并不是很引以为豪的东西。不过，它有注释，所以你可以看到我的想法（我希望：）
“维基百科乐趣”
导入urllib2
从xml.dom.minidom导入解析字符串
导入时间
def validWikiArticleLinkString（href）：
“”接受一个字符串，如果它包含子字符串，则返回True
“/wiki/”开头，不包含任何
“特殊”维基页面。
"""
返回（href.find（“/wiki/”）==0
和href.find（“（消除歧义）”）=-1
和href.find（“文件：”）=-1
和href.find（“维基百科：”）=-1
和href.find（“门户：”）=-1
和href.find（“特殊：”）=-1
和href.find（“帮助：”）=-1
和href.find（“模板会话：”）=-1
和href.find（“模板：”）=-1
和href.find（“Talk:”）=-1
和href.find（“类别：”）=-1
和href.find（“Bibcode”）=-1
和href.find（“主页”）=-1）
如果名称=“\uuuuu main\uuuuuuuu”：
已访问=[]#已访问链接的列表。用于避免陷入循环
opener=urllib2.build\u opener（）
opener.addheaders=[（'User-agent'，'Mozilla/5.0'）]#需要api的头
currentPage=“Human”#开始的页面
尽管如此：
infle=opener.open（'http://en.wikipedia.org/w/index.php?title=%s&printable=yes“%currentPage”）
html=infle.read（）#检索我们所在的wiki页面的内容
htmlDOM=parseString（html）#获取解析后的html的DOM
aTags=htmlDOM.getElementsByTagName（“a”）#查找所有标记
对于aTags中的标记：
如果tag.attributes.keys（）中的“href”：#查看标记中是否有href属性
href=tag.attributes[“href”].value#获取href属性的值
如果validWikiArticleLinkString（href）：#如果我们有一种我们正在寻找的链接类型
#现在是棘手的部分。我们只想在主要内容区域寻找链接，
#我们希望第一个链接不在括号中。
#假设链接是有效的。
无效=错误
#出现在站点右侧的表首先出现在DOM中，因此我们需要确保
#我们不是在看一个标签里面的某个地方。
pn=tag.parentNode
虽然pn不是无：
如果str（pn）.find（“表at”）>=0：
无效=真
打破
其他：
pn=pn.parentNode
如果无效：#转到下一个链接
继续
#接下来我们看一下文章上面的描述性文本，如果有的话；例如
#这篇文章是关于。。。。或其他用途，请参阅。。。（消除歧义）。
#这些类型的链接将导致循环，因此我们将它们归类为无效链接。
#我们注意到该文本似乎不在块内，因此
#我们忽略了不在任何中的标记。
pnode=tag.parentNode
虽然pnode不是None：
如果str（pnode）.find（“p at”）>=0：
打破
pnode=pnode.parentNode
#如果我们已经到达根节点，它没有parentNode，我们将
#链接无效。
如果pnode为None：
无效=真
如果无效：
持续
######这就是我被卡住的地方：
#现在我们需要看看链接是否在括号内。下面是一些垃圾
#对于tag.parentNode.childNodes中的元素：
#虽然elem.firstChild不是无：
#elem=elem.firstChid
#打印元素nodeValue
print href#这将是下一个链接
newLink=href[6:]#除了/wiki/part
打破
#如果我们以前访问过此链接，请断开
"""Wikipedia fun"""
import urllib2
from xml.dom.minidom import parseString
import time

def validWikiArticleLinkString(href):
    """Decide whether href looks like a link to an ordinary wiki article.

    The link must begin with '/wiki/' and must not contain any of the
    substrings that identify "special" pages (namespaces such as
    'File:' or 'Help:', disambiguation pages, the main page, ...).

    Returns True for an acceptable article link, False otherwise.
    """
    # Substrings whose presence anywhere in the href disqualifies the link.
    excludedMarkers = (
        "(disambiguation)",
        "File:",
        "Wikipedia:",
        "Portal:",
        "Special:",
        "Help:",
        "Template_talk:",
        "Template:",
        "Talk:",
        "Category:",
        "Bibcode",
        "Main_Page",
    )

    if not href.startswith("/wiki/"):
        return False

    for marker in excludedMarkers:
        if marker in href:
            return False
    return True


def closestAncestor(node, tagName):
    """Return the nearest ancestor of node whose nodeName equals tagName.

    The node itself is not considered. Returns None when no ancestor
    matches.

    The name is compared exactly; testing for a substring of the node's
    string representation (e.g. "p at") would also match elements such
    as <sup> or <map>.
    """
    ancestor = node.parentNode
    while ancestor is not None:
        if ancestor.nodeName == tagName:
            return ancestor
        ancestor = ancestor.parentNode
    return None


def textBeforeNode(root, target):
    """Return the text found under root that precedes target in document order.

    The subtree below root is walked depth-first; the data of every text
    node met before target is collected, and the walk stops as soon as
    target is reached. If target is not below root, all text under root
    is returned.
    """
    pieces = []

    def walk(node):
        # Returns True once target has been reached so callers stop walking.
        if node is target:
            return True
        if node.nodeType == node.TEXT_NODE:
            pieces.append(node.data)
        for child in node.childNodes:
            if walk(child):
                return True
        return False

    walk(root)
    return "".join(pieces)


def isInsideParentheses(tag, container):
    """Return True if tag is enclosed by an unclosed "(" within container.

    Only the text of container that comes before tag is inspected: every
    "(" opens a level and every ")" closes one. The tag is inside
    parentheses when at least one level is still open at its position.
    """
    depth = 0
    for char in textBeforeNode(container, tag):
        if char == "(":
            depth += 1
        elif char == ")" and depth > 0:
            # A stray ")" with nothing open is ignored instead of going negative.
            depth -= 1
    return depth > 0


def findFirstArticleLink(htmlDOM):
    """Return the href of the first acceptable article link in htmlDOM.

    A link is acceptable when
      * validWikiArticleLinkString accepts its href,
      * it is not inside a <table> (tables shown to the right of the page
        come first in the DOM),
      * it is inside a <p> (the descriptive notes above the article are
        not inside any <p>), and
      * it is not inside parentheses within that <p>.

    Returns None when the document contains no such link.
    """
    for tag in htmlDOM.getElementsByTagName("a"):
        if not tag.hasAttribute("href"):
            continue
        href = tag.getAttribute("href")
        if not validWikiArticleLinkString(href):
            continue
        if closestAncestor(tag, "table") is not None:
            continue
        paragraph = closestAncestor(tag, "p")
        if paragraph is None:
            continue
        if isInsideParentheses(tag, paragraph):
            continue
        return href
    return None


if __name__ == "__main__":
    visited = []    # pages fetched so far; used to avoid getting into loops

    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')] # need headers for the api

    currentPage = "Human"  # the page to start with

    while True:
        infile = opener.open('http://en.wikipedia.org/w/index.php?title=%s&printable=yes' % currentPage)
        try:
            html = infile.read()    # retrieve the contents of the wiki page we are at
        finally:
            infile.close()          # always release the connection

        href = findFirstArticleLink(parseString(html))
        if href is None:
            # Without this check newLink would be undefined (first page) or
            # left over from the previous page.
            print("No valid link found on " + currentPage + ".")
            break

        print(href)     # this will be the next link

        # Keep only the page title: drop the leading "/wiki/" and any
        # "#fragment", which would otherwise cut "&printable=yes" off the
        # next request URL.
        newLink = href[6:].split("#", 1)[0]

        # Mark the current page as visited before the check so that a page
        # whose first link points back to itself is detected immediately.
        visited.append(currentPage)

        # if we have been to this link before, break the loop
        if newLink in visited:
            print("Stuck in loop.")
            break
        # or if we have reached Philosophy
        elif newLink == "Philosophy":
            print("Ended up in Philosophy.")
            break

        currentPage = newLink   # make the page we found the new page to fetch
        time.sleep(5)           # sleep some to see results as debug