Python Scrapy - working with the 'less than' literal symbol

Tags: python, html, xpath, scrapy, html-parsing

Update:

Here is a sample row of the html, copied directly with "Copy outerHTML" in Chrome. In this post I added spaces around td and /td so the actual html displays without being rendered:

<td class="elem">3Fb1&lt;+1Lo&lt;+3Sb1</td>

The data in the response is:

3Fb1

but it should be:

3Fb1<+1Lo<+3Sb1

When the < comes at the end of the text (as in the first example) I get everything, but when it is in the middle of the text (the second example) it truncates everything after it.

I would go back and fix these manually, but I am extracting millions of data points, so that is not practical.

Any help for this newbie would be greatly appreciated.

Sample page being scraped:

Code:


The problem you are encountering is malformed HTML caused by a reserved character (the less-than symbol, <).

I downloaded your page with wget and checked it in a text editor - it does not use &lt; for <; the bare character sits in the source text.
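
To reproduce the truncation in isolation, here is a minimal sketch (the snippet is constructed for this post; the printed result is the behaviour reported in this thread):

from scrapy import Selector

# A row as it arrives on the wire: a bare, unescaped '<' inside the text node.
snippet = '<table><tr><td class="elem">3Fb1<+1Lo<+3Sb1</td></tr></table>'

sel = Selector(text=snippet)  # default: lxml's lenient HTML parser
print(sel.xpath('//td[@class="elem"]//text()').extract())
# Reported behaviour: ['3Fb1'] - the parser takes the bare '<' as the
# start of a (bogus) tag and drops everything after it.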
This is not a scrapy problem but an lxml problem. You can still use scrapy in this case, but with a different parser:

>>> from scrapy import Selector
>>> sel = Selector(text=response.text, type="xml")
>>> sel.xpath('//table[@class="elm"][1]//td[@class="elem"]//text()')  # should return it correctly

You have to use sel to extract the information from that page instead of response. (Selector's text argument expects a decoded string, hence response.text rather than the raw response.body bytes.)


This is already a known issue.

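A minimal sketch of wiring that into a spider (the class and field names are illustrative; the XPaths assume the table layout from the question's code below):

import scrapy
from scrapy import Selector

class ElementsSpider(scrapy.Spider):
    # Hypothetical spider name, for illustration only.
    name = 'elements'
    start_urls = ['http://www.usfigureskating.org/leaderboard/results/2018/25073/SEGM001.html']

    def parse(self, response):
        # Build one XML selector over the decoded body and query it
        # instead of response from here on.
        sel = Selector(text=response.text, type='xml')
        rows = '//table[@class="elm"][1]/tbody/tr[not(contains(@class, "thead"))]'
        for row in sel.xpath(rows):
            yield {'executed_element': row.xpath('td[@class="elem"]//text()').extract_first()}

Note that the XML parser is strict, so this only helps if the rest of the page is close enough to well-formed XML; the alternatives further down take different routes for that reason.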

Comments:

It would be better to show the url to this page and your code, so we can test it.

Typical page: Part of my code (as much as fits in a comment): skater_number = 1, the summary-table loop, and the element-table loop - the same parse() method is reproduced in full below.

Put the code into the question, it will be more readable. You can also add the url to the question. You should add the url and code when you create the question.
def parse(self, response):
    event = response.xpath('//title//text()').extract()
    category_segment = response.xpath('//h2[@class="catseg"]//text()').extract()
    skater_number = 1
    # One row per skater in the summary ("sum") table.
    for row in response.xpath('//table[@class="sum"]/tbody/tr[not(contains(@class,"thead"))]'):
        skater_name = row.xpath('td[2]//text()').extract_first()
        skater_place = row.xpath('td[1]//text()').extract_first()
        skater_deductions = row.xpath('td[7]//text()').extract_first()
        # Capture elements detail: the skater's placement doubles as the
        # index of their elements ("elm") table.
        skater_table = skater_place
        elements_id = 1
        element_table = '//table[@class="elm"][' + str(skater_table) + ']/tbody/tr[not(contains(@class,"thead"))]'
        # Renamed from 'row' to avoid shadowing the outer loop variable.
        for elem_row in response.xpath(element_table):
            elements = {}
            elements['Event'] = event
            elements['Category_Segment'] = category_segment
            elements['skater_name'] = skater_name
            elements['elements_id'] = elements_id
            elements['element_number'] = elem_row.xpath('td[@class="num"]//text()').extract()
            # This is the field that gets truncated at the bare '<'.
            elements['executed_element'] = elem_row.xpath('td[2]//text()').extract()
            elements['element_info'] = elem_row.xpath('td[3]//text()').extract()
            elements['base_value'] = elem_row.xpath('td[4]//text()').extract()
            elements['bonus'] = elem_row.xpath('td[5]//text()').extract()
            elements['GOE'] = elem_row.xpath('td[6]//text()').extract()
            goe_table = './/td[@class="jud"]'
            judge_pointer = 8
            judge_number = 1
            elements_id += 1
            # One yield per judge cell; the "jud" cells are only counted,
            # the scores themselves are read by column position.
            for _ in elem_row.xpath(goe_table):
                elements['Judge Number'] = judge_number
                elements['Judge_GOE_Score'] = elem_row.xpath('td[' + str(judge_pointer) + ']//text()').extract()
                yield elements
                judge_pointer += 1
                judge_number += 1
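
Another answer takes a repair-first approach: parse the body with BeautifulSoup using the html5lib builder, which fixes the stray < the way a browser would, then wrap the repaired markup back into a TextResponse so the existing XPath code can stay unchanged: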
from bs4 import BeautifulSoup
from scrapy.http import TextResponse

# parse response body with BeautifulSoup; html5lib repairs the bare '<'
soup = BeautifulSoup(response.body, "html5lib")
# overwrite response with the repaired markup
# (a decoded-string body needs an explicit encoding)
response = TextResponse(url=response.url, body=str(soup), encoding='utf-8')

# from here on use your code unchanged
event = response.xpath('//title//text()').extract()
...
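
A third approach skips re-parsing altogether and escapes the offending byte sequences in the raw body before building a Selector. Replacing <<+ before <+ matters: in the other order, the <+ substitution would rewrite the tail of <<+ and leave a bare < behind: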
body = response.body.replace(b'<<+', b'&lt;&lt;+').replace(b'<+', b'&lt;+')
selector = scrapy.Selector(text=body.decode('utf-8'))
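
The same idea as a complete, standalone script (it runs without a Scrapy project; the commented-out FEED settings show how output could be saved to CSV or JSON):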
#!/usr/bin/env python3

import scrapy

class MySpider(scrapy.Spider):

    name = 'myspider'

    start_urls = ['http://www.usfigureskating.org/leaderboard/results/2018/25073/SEGM001.html']

    def parse(self, response):
        print('url:', response.url)

        body = response.body.replace(b'<<+', b'&lt;&lt;+').replace(b'<+', b'&lt;+')

        selector = scrapy.Selector(text=body.decode('utf-8'))

        i = 1
        for x in selector.css('.elem::text').extract():
            if 'Elements' in x:
                print('---', i, '---')
                i += 1
            else:
                print(x)

# --- it runs without project and saves in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',

    # save in CSV or JSON
    #'FEED_FORMAT': 'csv',     # 'json
    #'FEED_URI': 'output.csv', # 'output.json
})
c.crawl(MySpider)
c.start()
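
Sample output: with the replacements in place, the literal < and << marks survive extraction: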
Executed
--- 1 ---
2Ab1+2T
ChSt1
2Ab1
2Lz+1Lo+2Lo
2Lz
FSSp4
2F
CCoSp4
Executed
--- 2 ---
2Ab1
ChSt1
2Ab1+1Lo+2F
CCoSp2V
2Lz+2Lo
2Lo
2Lz
LSp4
Executed
--- 3 ---
CCoSp4
ChSt1
2Ab1+2Lo
2Lz+1Lo+2Lo
2Ab1
2Lz
2Fe
FSSp4
Executed
--- 4 ---
2Ab1+1Lo+2Lo
2Ab1
LSp4
ChSt1
2Lz
2F
2Lz+2T
CCoSp4
Executed
--- 5 ---
2Ab1
LSp2
ChSt1
2Ab1+1Lo+1Lo
2Lz+2Lo
2Lz
2F
CCoSp3
Executed
--- 6 ---
2Lz
1A
SSp3
ChSt1
2Lz+1Lo+2Lo
CCoSp3
2F+2Lo
2F
Executed
--- 7 ---
2F
2Ab1
CCoSp4
2Lz
2Ab1<+2T
ChSt1
2Lz+1Lo+2F
LSp4
Executed
--- 8 ---
1A
LSp4
ChSt1
2Lz
2Lz+2T
2Lo+2T+1Lo
2F
CCoSp4
Executed
--- 9 ---
2A<<
CCoSp4
ChSt1
2F+1Lo+2Lo
2Lze+2Lo
2Lze
2F
SSp4
Executed
--- 10 ---
2Lz
2Ab1
SSp3
ChSt1
2A<<+REP
2Lz+2Lo
2F
CCoSp4
Executed
--- 11 ---
FSSp4
2Ab1<+2Lo
ChSt1
2A<<
FCCoSp3
2F+2Lo<+1Lo<<
2Lz
2F
Executed
--- 12 ---
2A<<+1Lo+2Lo<
2Lze
SSp3
ChSt1
2A<<
2F
2F+2Lo<
CCoSp3