Python: extracting links and titles from multiple pages

I am trying to build my own RSS feed with download links.

The RSS feed only provides a link for the whole season, though.

I follow that season link and want to extract the specific download link (uploaded/ul) for the episode itself.

This is what I have got so far. Is there a way to get it working?

import feedparser, urllib2, re
from BeautifulSoup import BeautifulSoup

episodenliste = ['Game.of.Thrones','Arrow']

episode_link = []
episode_title = []
d = feedparser.parse('http://serienjunkies.org/xml/feeds/episoden.xml')
for post in d.entries:
    if ('DEUTSCH' in post.title) and any (word in post.title for word in episodenliste) and ('720p' in post.title):
        post.title = post.title.replace('[DEUTSCH] ','')
        post.title = re.sub(r'(.*S\d+E\d+)(.*)',r'\1' ,post.title)
        episode_link.append(post.link)
        episode_title.append(post.title)
        print post.title + ": " + post.link + "\n"

for search_title in episode_title:
    for get_dlLink in episode_link:
        page_ = urllib2.Request(get_dlLink)
        page = urllib2.urlopen(page_).read()
        soup = BeautifulSoup(page)
        print search_title
        title = soup.find('strong', text=search_title)
        if title is not None:
            print title
            # link = title.parent
            # links = link.find_all('a')
            # print links
            # for link2 in links:
            #     url = link2['href']
            #     print url
            #     pattern = 'http:\/\/download\.serienjunkies\.org.*%s_.*\.html' % ul
            #     if re.match(pattern, url):
            #         print url
As far as I can tell it works up to the point where I search for the title on the page.

It visits the pages parsed from the RSS feed, but it cannot find the title there.

My idea was: first find the title, then extract the 'children'/links from it, roughly as in the sketch below.
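
In code, the idea would be something like this minimal sketch (BeautifulSoup 4 syntax; the one-line page and the example.com URL are made-up placeholders):

from bs4 import BeautifulSoup

# Made-up, minimal stand-in for a season page:
page = '<p><strong>Arrow.S02E14</strong> <a href="http://example.com/dl.html">Download</a></p>'
soup = BeautifulSoup(page)
title_tag = soup.find('strong', text='Arrow.S02E14')  # exact text match
if title_tag is not None:
    for a_tag in title_tag.parent.find_all('a'):      # links inside the same <p>
        print a_tag['href']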

Thanks for your help in advance.


An example entry from the feed:

[DEUTSCH]Arrow.S02E14.Gegen.die.Zeit.GERMAN.acquired.720p.HDTV.x264-ZZGtv
[DEUTSCH]Arrow.S02E14.Gegen.die.Zeit.GERMAN.acquired.720p.HDTV.x264-ZZGtv
Fri, 18 Jul 2014 00:00:00 +0200
http://serienjunkies.org/arrow/arrow-staffel-2-hdtvweb-dl-sd720p1080p/

Sorry, I don't know. The HTML of the season page looks like this:

<p><strong>Arrow.S02E14.Gegen.die.Zeit.German.DD51.Dubbed.DL.720p.iTunesHD.AVC-TVS</strong><br><div id="download_mirrors" class="download_main"><strong>Download:</strong> <a href="http://download.serienjunkies.org/f-3e8ea978a2cf7bda/ul_tvs-arrow-dd51-ded-dl-7p-ithd-avc-214.html" target="_blank" style="font-size:14px;font-weight:bold;">uploaded.net</a> <span style="font-size:10px">(best speed) </span><br><strong style="margin-left:14px">Mirrors:</strong> <img src="http://serienjunkies.org/media/img/stream/application_cascade.png" style="cursor:pointer;" title="Mirrors zeigen" onclick="toggle(&quot;Arrow.S02E14.Gegen.die.Zeit.German.DD51.Dubbed.DL.720p.iTunesHD.AVC-TVS&quot;);"><div id="Arrow.S02E14.Gegen.die.Zeit.German.DD51.Dubbed.DL.720p.iTunesHD.AVC-TVS" style="display: none;">
<strong style="margin-left:20px">Mirror:</strong> <a href="http://download.serienjunkies.org/f-55bc328624d93658/fm_tvs-arrow-dd51-ded-dl-7p-ithd-avc-214.html" target="_blank">filemonkey.in</a><br>
<strong style="margin-left:20px">Mirror:</strong> <a href="http://download.serienjunkies.org/f-25023a87144345f9/so_tvs-arrow-dd51-ded-dl-7p-ithd-avc-214.html" target="_blank">share-online.biz</a><br>
</div><div><strong style="margin-left:18px">Usenet:</strong> <a href="http://www.firstload.com/affiliate/log.php?log=50393&amp;fn=Arrow.S02E14.Gegen.die.Zeit.German.DD51.Dubbed.DL.720p.iTunesHD.AVC-TVS" target="_blank">Highspeed Mirror</a></div></div></p>
Arrow.S02E14.Gegen.die.Zeit.German.DD51.Dubbed.DL.720p.iTunesHD.AVC-TVS
Download: (best speed)
Mirrors: Mirror:
Mirror:
Usenet:


With JavaScript disabled, the HTML looks completely different:

Arrow.S02E14.Gegen.die.Zeit.German.DD51.Dubbed.DL.720p.iTunesHD.AVC-TVS
Download: | filemonkey.in
Download: | share-online.biz
Download: | uploaded.to
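
Such markup can be picked apart with BeautifulSoup 4 along these lines (a sketch only; the inline HTML and the example.org URLs are simplified stand-ins for the real page):

from bs4 import BeautifulSoup

# Simplified stand-in for the JavaScript-free markup shown above:
html = (
    '<p><strong>Arrow.S02E14.Title</strong><br>'
    '<a href="http://download.example.org/f-1/fm_x.html">Download:</a> | filemonkey.in<br>'
    '<a href="http://download.example.org/f-2/ul_x.html">Download:</a> | uploaded.to</p>'
)
soup = BeautifulSoup(html)
title_text = soup.find(text='Arrow.S02E14.Title')  # the text node inside <strong>
for a_tag in title_text.parent.parent('a'):        # two parents up is the <p>
    # the hoster name is the text node directly after each link
    print a_tag.next_sibling.strip('| '), a_tag['href']

This prints each hoster name next to its download URL, which is exactly the pairing needed for the feed.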

Since the title from the RSS feed without the [DEUTSCH] prefix is the first text in the paragraph on the series page, it can be used as the basis for searching and extracting entries. Two elements up is the <p> tag that contains all the data for the episode: the links, each followed by the name of the file hoster.

import feedparser
import requests
from bs4 import BeautifulSoup

FEED_URL = 'http://serienjunkies.org/xml/feeds/episoden.xml'


def is_interesting_entry(entry, title_prefix, series_names):
    return (
        entry.title.startswith(title_prefix)
        and any(name in entry.title for name in series_names)
    )


def process_entry(entry, title_prefix):
    if not entry.title.startswith(title_prefix):
        raise ValueError(
            'expected prefix {0!r} not found in {1!r}'.format(
                title_prefix, entry.title
            )
        )
    return (entry.title[len(title_prefix):], entry.link)


def process_feed(feed_url, title_prefix, series_names):
    return (
        process_entry(entry, title_prefix)
        for entry in feedparser.parse(feed_url).entries
        if is_interesting_entry(entry, title_prefix, series_names)
    )


def get_series_soup(url, cache=dict()):
    # The mutable default argument is intentional: it serves as a simple
    # cache shared across calls, so every season page is downloaded and
    # parsed only once.
    if url in cache:
        return cache[url]
    else:
        result = BeautifulSoup(requests.get(url).text)
        cache[url] = result
        return result


def get_download_urls(soup, title):
    # The title is the first text in the episode's <p> element, so two
    # `parent` steps lead from the found text node up to that <p>.  The
    # hoster name is the text node directly after each link.
    title_text = soup.find(text=title)
    if not title_text:
        return dict()
    else:
        return dict(
            (a_tag.next_sibling.strip('| '), a_tag['href'])
            for a_tag in title_text.parent.parent('a')
        )


def main():
    series_names = ['Game.of.Thrones', 'Arrow']
    for title, url in process_feed(FEED_URL, '[DEUTSCH] ', series_names):
        print
        print title
        hoster2url = get_download_urls(get_series_soup(url), title)
        if hoster2url:
            for hoster, download_url in sorted(hoster2url.iteritems()):
                print '{0:>20s}: {1}'.format(hoster, download_url)
        else:
            print '  --- No downloads ---'


if __name__ == '__main__':
    main()

Can you post the relevant part of the feed here? The site has country-specific access rules. You may want to break the program up into several functions; that makes it easier to test the individual steps in isolation and to ask more specific questions here. From a quick look, removing the [DEUTSCH] prefix and searching the linked page for that text (without regular expressions) should be enough. Two steps up (the parent attribute) is the <p> tag that contains the episode's download links. The HTML looks completely different when it is downloaded without a browser, or with a browser but with JavaScript disabled.
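
To try that single step in isolation, a quick sketch (the snippet is a trimmed, made-up stand-in for the real page):

from bs4 import BeautifulSoup

snippet = ('<p><strong>Some.Title.S01E01</strong><br>'
           '<a href="http://example.com/a">Download:</a> | somehoster</p>')
soup = BeautifulSoup(snippet)
text_node = soup.find(text='Some.Title.S01E01')  # plain text search, no regex needed
print text_node.parent         # the <strong> element around the title
print text_node.parent.parent  # two steps up: the <p> with the download links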