Python scrapy,如何在HTML标记元素中分隔文本

Python scrapy,如何在HTML标记元素中分隔文本,python,screen-scraping,scrapy,web-crawler,Python,Screen Scraping,Scrapy,Web Crawler,包含我的数据的代码: <div id="content"><!-- InstanceBeginEditable name="EditRegion3" --> <div id="content_div"> <div class="title" id="content_title_div"><img src="img/banner_outlets.jpg" width="920" height="157" al

包含我的数据的代码:

        <div id="content"><!-- InstanceBeginEditable name="EditRegion3" -->
      <div id="content_div">
    <div class="title" id="content_title_div"><img src="img/banner_outlets.jpg" width="920" height="157" alt="Outlets" /></div>
    <div id="menu_list">
<table border="0" cellpadding="5" cellspacing="5" width="100%">
    <tbody>
        <tr>
            <td valign="top">
                <p>
                    <span class="foodTitle">Century Square</span><br />
                    2 Tampines Central 5<br />
                    #01-44-47 Century Square<br />
                    Singapore 529509</p>
                <p>
                    <br />
                    <strong>Opening Hours:</strong><br />
                    7am to 12am (Sun-Thu &amp;&nbsp;PH)<br />
                    24 Hours (Fri &amp; Sat&nbsp;&amp;</p>
                <p>
                    Eve of PH)<br />
                    Telephone: 6789 0457</p>
            </td>
            <td valign="top">
                <img alt="Century Square" src="/assets/images/outlets/century_sq.jpg" style="width: 260px; height: 140px" /></td>
            <td valign="top">
                <span class="foodTitle">Liat Towers</span><br />
                541 Liat towers #01-01<br />
                Orchard Road<br />
                Singapore 238888<br />
                <br />
                <strong>Opening Hours: </strong><br />
                24 hours (Daily)<br />
                <br />
                Telephone: 6737 8036</td>
            <td valign="top">
                <img alt="Liat Towers" src="/assets/images/outlets/century_liat.jpg" style="width: 260px; height: 140px" /></td>
        </tr>
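我会先选择所有包含 `<span class="foodTitle">` 的 `td` 单元格:

//div[@id="menu_list"]//td[@valign="top"][.//span[@class="foodTitle"]]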

然后,对于每个 `td` 单元格,获取其中的所有文本节点:

.//text()
你会得到这样的结果:

['\n                ',
 '\n                    ',
 'Century Square',
 '\n                    2 Tampines Central 5',
 '\n                    #01-44-47 Century Square',
 '\n                    Singapore 529509',
 '\n                ',
 '\n                    ',
 'Opening Hours:',
 u'\n                    7am to 12am (Sun-Thu &\xa0PH)',
 u'\n                    24 Hours (Fri & Sat\xa0&',
 '\n                ',
 '\n                    Eve of PH)',
 '\n                    Telephone: 6789 0457',
 '\n            ']

其中一些文本节点只包含空白字符,因此先把它们去掉;然后在循环中逐行处理,通过查找 "Opening Hours" 和 "Telephone" 这两个关键字,把各行分别归入地址、营业时间和电话:

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
import re
from todo.items import wendyItem

class wendySpider(BaseSpider):
    """Spider that extracts outlet details from the wendys.com.sg outlets page.

    Every table cell that contains a ``<span class="foodTitle">`` is turned
    into one ``wendyItem`` with the fields ``name``, ``address``, ``postal``,
    ``hours`` and ``contact``.
    """

    name = "wendyspider"
    allowed_domains = ["wendys.com.sg"]
    start_urls = ["http://www.wendys.com.sg/outlets.php"]

    def parse(self, response):
        """Build one item per outlet cell found in ``response``.

        The non-blank text nodes of a cell are read in order. The first one
        is the outlet name. The following lines are address lines until a
        line containing "Opening Hours" is seen, then opening-hours lines
        until a line containing "Telephone" is seen, and contact lines from
        there on.

        Cells without any non-blank text are skipped, and a cell without
        address lines gets an empty ``postal`` instead of raising
        ``IndexError``.

        Returns:
            list: the populated ``wendyItem`` objects, in the order the
            cells were selected.
        """
        hxs = HtmlXPathSelector(response)
        cells = hxs.select('//div[@id="menu_list"]//td[@valign="top"][.//span[@class="foodTitle"]]')
        items = []
        for cell in cells:
            # Text nodes made only of whitespace (indentation between tags)
            # carry no data, so strip every node once and drop the empty ones.
            stripped = (text.strip() for text in cell.select('.//text()').extract())
            lines = [text for text in stripped if text]
            if not lines:
                # Nothing usable in this cell; taking the name below would
                # raise IndexError and abort the whole page.
                continue

            item = wendyItem()

            # first non-blank line is the place name
            item['name'] = lines[0]

            address_lines = []
            hours_lines = []
            telephone_lines = []

            # Section flags: once a marker line has been seen, that line and
            # all following lines belong to the corresponding section.
            opening_hours = False
            telephone = False

            for line in lines[1:]:
                if 'Opening Hours' in line:
                    opening_hours = True
                elif 'Telephone' in line:
                    telephone = True

                # "Telephone" takes precedence because it comes last in a cell.
                if telephone:
                    telephone_lines.append(line)
                elif opening_hours:
                    hours_lines.append(line)
                else:
                    address_lines.append(line)

            # last address line is the postal code + town name
            item['address'] = "\n".join(address_lines[:-1])
            item['postal'] = address_lines[-1] if address_lines else ""

            # omit the "Opening Hours" marker line (first element in list)
            item['hours'] = "\n".join(hours_lines[1:])

            item['contact'] = "\n".join(telephone_lines)

            items.append(item)

        return items

天哪,非常感谢你,保罗!你真的帮了我很多忙,很棒的帖子。我没有足够的声望去投票,但是读到这篇文章的各位,请为它投票吧。
谢谢 @headabouttoexplode……但是你有足够的理由接受这个答案,不是吗?;-)
嗨 @paultrmbrth,你能检查一下我的问题吗?
谢谢你的支持 @paultrmbrth,你能帮我吗?
['\n                ',
 '\n                    ',
 'Century Square',
 '\n                    2 Tampines Central 5',
 '\n                    #01-44-47 Century Square',
 '\n                    Singapore 529509',
 '\n                ',
 '\n                    ',
 'Opening Hours:',
 u'\n                    7am to 12am (Sun-Thu &\xa0PH)',
 u'\n                    24 Hours (Fri & Sat\xa0&',
 '\n                ',
 '\n                    Eve of PH)',
 '\n                    Telephone: 6789 0457',
 '\n            ']
['\n                ',
 'Liat Towers',
 '\n                541 Liat towers #01-01',
 '\n                Orchard Road',
 '\n                Singapore 238888',
 'Opening Hours: ',
 '\n                24 hours (Daily)',
 '\n                Telephone: 6737 8036']
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
import re
from todo.items import wendyItem

class wendySpider(BaseSpider):
    """Spider that extracts outlet details from the wendys.com.sg outlets page.

    Every table cell that contains a ``<span class="foodTitle">`` is turned
    into one ``wendyItem`` with the fields ``name``, ``address``, ``postal``,
    ``hours`` and ``contact``.
    """

    name = "wendyspider"
    allowed_domains = ["wendys.com.sg"]
    start_urls = ["http://www.wendys.com.sg/outlets.php"]

    def parse(self, response):
        """Build one item per outlet cell found in ``response``.

        The non-blank text nodes of a cell are read in order. The first one
        is the outlet name. The following lines are address lines until a
        line containing "Opening Hours" is seen, then opening-hours lines
        until a line containing "Telephone" is seen, and contact lines from
        there on.

        Cells without any non-blank text are skipped, and a cell without
        address lines gets an empty ``postal`` instead of raising
        ``IndexError``.

        Returns:
            list: the populated ``wendyItem`` objects, in the order the
            cells were selected.
        """
        hxs = HtmlXPathSelector(response)
        cells = hxs.select('//div[@id="menu_list"]//td[@valign="top"][.//span[@class="foodTitle"]]')
        items = []
        for cell in cells:
            # Text nodes made only of whitespace (indentation between tags)
            # carry no data, so strip every node once and drop the empty ones.
            stripped = (text.strip() for text in cell.select('.//text()').extract())
            lines = [text for text in stripped if text]
            if not lines:
                # Nothing usable in this cell; taking the name below would
                # raise IndexError and abort the whole page.
                continue

            item = wendyItem()

            # first non-blank line is the place name
            item['name'] = lines[0]

            address_lines = []
            hours_lines = []
            telephone_lines = []

            # Section flags: once a marker line has been seen, that line and
            # all following lines belong to the corresponding section.
            opening_hours = False
            telephone = False

            for line in lines[1:]:
                if 'Opening Hours' in line:
                    opening_hours = True
                elif 'Telephone' in line:
                    telephone = True

                # "Telephone" takes precedence because it comes last in a cell.
                if telephone:
                    telephone_lines.append(line)
                elif opening_hours:
                    hours_lines.append(line)
                else:
                    address_lines.append(line)

            # last address line is the postal code + town name
            item['address'] = "\n".join(address_lines[:-1])
            item['postal'] = address_lines[-1] if address_lines else ""

            # omit the "Opening Hours" marker line (first element in list)
            item['hours'] = "\n".join(hours_lines[1:])

            item['contact'] = "\n".join(telephone_lines)

            items.append(item)

        return items