Warning: file_get_contents(/data/phpspider/zhask/data//catemap/7/sqlite/3.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Python:如何抓取ajax加载的Facebook帖子(页面滚动)_Python_Ajax_Facebook - Fatal编程技术网

Python:如何抓取ajax加载的Facebook帖子(页面滚动)

Python:如何抓取ajax加载的Facebook帖子(页面滚动),python,ajax,facebook,Python,Ajax,Facebook,我试图在搜索中抓取由ajax(页面滚动)加载的Facebook帖子。 我使用抓取和请求 使用Grab的授权正在工作,然后我尝试模拟由浏览器生成的滚动请求,并通过post发送它以加载下一组结果。 html响应中的错误是:“抱歉,无法加载您的请求” 下面是代码(因为facebook请求量很大,所以代码很大) 谁能帮忙?另外,我不想使用Selenium,因为它会生成真正的浏览器来启动。您可以尝试使用phantomjs(),而不是Selenium,它将充当虚拟浏览器Facebook不喜欢被刮掉。他们有各

我试图抓取Facebook搜索结果中由ajax(页面滚动)加载的帖子。我使用的是 Grab 和 Requests 这两个库。

使用Grab进行的登录授权可以正常工作,然后我尝试模拟浏览器在页面滚动时发出的请求,并通过POST发送它以加载下一组结果。HTML响应中的错误信息是:“抱歉,无法加载您的请求”。

下面是代码(由于Facebook的请求内容很多,所以代码很长)


谁能帮忙?另外,我不想使用Selenium,因为它需要启动一个真正的浏览器。

您可以尝试使用PhantomJS代替Selenium,它可以充当一个虚拟浏览器。

Facebook不喜欢被抓取。他们有各种各样的措施来防止抓取并增加其难度,这些措施一天之内可能变更多次。今天能用的爬虫不保证明天还能用。要访问Facebook帖子,请通过库(即SDK)使用其API。

@KlausD. - 我知道,我们已经使用python facebook sdk完成其他任务。但自从Facebook关闭了该功能(原文此处的链接已丢失)之后,就无法通过API抓取搜索到的帖子了。

@userxxx 谢谢,但这是JavaScript,我们需要Python版本。

@VicNicethemer - 它是JS,但您可以使用Python将其自动化。我也是出于同样的目的使用它。
import json

import requests
# from bs4 import BeautifulSoup
class FacebookScraper(object):
    """Replay the Facebook search "scroll" AJAX request to load more result pages.

    ``self.data`` is a request template captured from a browser session; it is
    serialised with ``json.dumps`` and posted as the ``data`` form field by
    ``scrape_posts``.

    NOTE(review): ``Grab`` is used by ``scrape_posts`` but is not imported in
    the visible part of this file; it must be in scope (presumably
    ``from grab import Grab``) for the request to run.
    """

    def __init__(self):
        """Build the request template stored in ``self.data``."""
        # ``encoded_query`` is JSON text. Its ``intent_data`` value is itself
        # JSON stored as a string, and ``annotated_string`` inside that is JSON
        # again -- hence the escalating backslash escapes. The literals inside
        # this text must be JSON (``null``/``true``/``false``), not Python
        # (``None``/``True``/``False``), otherwise the text cannot be parsed.
        encoded_query = (
            "{\"bqf\":\"keywords_top(\\u0025D0\\u0025BB\\u0025D0\\u0025B0\\u0025D0\\u0025B2\\u0025D0\\u0025B0\\u0025D0\\u0025BB\\u0025D1\\u00258C+\\u0025D0\\u0025BF\\u0025D0\\u0025BE\\u0025D1\\u002582\\u0025D0\\u0025BE\\u0025D0\\u0025BA)\","
            "\"vertical\":\"content\","
            "\"post_search_vertical\":null,"
            "\"intent_data\":\"{\\\"intent\\\":\\\"posts\\\",\\\"entity_id\\\":null,\\\"sub_intents\\\":{\\\"user\\\":true},"
            "\\\"user_confidence\\\":0.2384902536869,\\\"typeahead_user_confidence\\\":0.69785696268082,\\\"quel_topics\\\":[],"
            "\\\"multi_label_intents\\\":["
            "{\\\"value\\\":true,\\\"confidence\\\":0.0113326292485},"
            "{\\\"value\\\":true,\\\"confidence\\\":6.7988303271704e-6},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.0008767499239184},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.025693353265524},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.010204718448222},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.0020656401757151},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.01333892159164},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.0087564773857594},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.0016323747113347},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.0084763253107667},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.015831520780921},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.004807879216969},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.058732055127621},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.033020552247763},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.0066938307136297},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.0071515664458275},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.57516884803772},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.21925939619541},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.0026327867526561},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.0046313949860632},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.0015605170046911}],"
            "\\\"annotated_string\\\":\\\"{\\\\\\\"entities\\\\\\\":[],\\\\\\\"segments\\\\\\\":[{\\\\\\\"type\\\\\\\":\\\\\\\"\\\\\\\\u003Cusername>\\\\\\\",\\\\\\\"tokens\\\\\\\":\\\\\\\"\\\\\\\\u043b\\\\\\\\u0430\\\\\\\\u0432\\\\\\\\u0430\\\\\\\\u043b\\\\\\\\u044c \\\\\\\\u043f\\\\\\\\u043e\\\\\\\\u0442\\\\\\\\u043e\\\\\\\\u043a\\\\\\\"}]}\\\"}\","
            "\"filters\":[],\"has_chrono_sort\":false,\"query_analysis\":\"\",\"subrequest_disabled\":false}"
        )

        # Key order is kept as in the captured request because json.dumps
        # emits keys in insertion order.
        self.data = {
            "view": "list",
            "encoded_query": encoded_query,
            "encoded_title": "WyJcdTAwMjVEMFx1MDAyNUJCXHUwMDI1RDBcdTAwMjVCMFx1MDAyNUQwXHUwMDI1QjJcdTAwMjVEMFx1MDAyNUIwXHUwMDI1RDBcdTAwMjVCQlx1MDAyNUQxXHUwMDI1OEMrXHUwMDI1RDBcdTAwMjVCRlx1MDAyNUQwXHUwMDI1QkVcdTAwMjVEMVx1MDAyNTgyXHUwMDI1RDBcdTAwMjVCRVx1MDAyNUQwXHUwMDI1QkEiXQ",
            "ref": "unknown",
            "logger_source": "www_main",
            "typeahead_sid": "",
            "tl_log": False,
            "impression_id": "ed9cde39",
            "filter_ids": {
                "1597273353820386:1674330276114693:0": "1597273353820386:1674330276114693:0"
            },
            "experience_type": "grammar",
            "exclude_ids": None,
            "browse_location": "",
            "trending_source": None,
            "reaction_surface": None,
            "reaction_session_id": None,
            "ref_path": "/search/top/",
            "is_trending": False,
            "topic_id": None,
            "place_id": None,
            "story_id": None,
            "callsite": "browse_ui:init_result_set",
            "has_top_pagelet": True,
            "display_params": {
                "mrss": True
            },
            "cursor": "Abp0rWq8oGjiUWYmIeN9rZCuAXTubQ9beY7hyGlZpau3ekEF9BSelQe85TthKUvpoNEU65kQorg_Fya3xI47JPWGoJ-lNMS4JiOobTZ0Q3tP0HD1Z5JZiCLbT10PwrRNibnv-TKdTBhwtfMvqR816Hd9vIHPRxCBmT5lPfrVZC7f0ohVLeSCKFEYmP-47IiDsWW1YynB8Yqr_54b7iIQWB4uixrp5Zm5AHrCilxpqGtp9ye5Y2nKCyK8UtMkzzQ11CTfidmPmDMPQgU3rFroTvpUc96QGvfX1pNjNw3sn-CFyn5TLq_0mX_jOVP8BzfRP7qhnmcmjs3Rf2_l22Q9C-gQGqSwDUqE63XszHUYqEW5e-KtU_3Hcpb3OT0MJyZ7EQI",
            "page_number": 2,
            "em": False,
            "mr": False,
            "tr": None
        }

    def scrape(self):
        """Run ``scrape_posts`` and print every post it returns."""
        posts = self.scrape_posts()
        for post in posts:
            print ("Фейсбучный пост ", post)

    def scrape_posts(self, max_pages=4):
        """Request result pages 2 .. ``max_pages - 1`` and print each raw response.

        Args:
            max_pages: Exclusive upper bound for the page number; with the
                default of 4, pages 2 and 3 are requested.

        Returns:
            The ``posts`` list. NOTE(review): nothing in this method appends to
            it, so it is always empty -- parsing posts out of the response is
            still to be implemented.

        Side effects:
            Updates ``self.data['page_number']`` and prints every response body.
        """
        posts = []
        pageno = 2

        # Per-page values captured from a browser session. They are currently
        # unused: the commented-out lines below show where they would be
        # applied (index ``pageno - 2``).
        cursor = [
            'AbocHlVuS3n0hW0t-IUaCGN9y1jZzVg_cbHqG4IPXcnKhfacxLL52L4S1fMLklQPLgIBtRBA6z_hRaeCS0xzr1BCZbBe8fIdpVYMegK7Ye2TxzWSNRWUEJC0IMYWCjEkZQvI-Ix_Kx_E0Kx1oap8jGJAN9housnxkkAq8_ElwDTK9TbaHr0TB1MNNwR4lQU2PKYXOWajSG_0_AZ2DOCsn-DIMHL4jbhnhrwps_2kIi0gaC6y3aqbp9q8iILcwbXsjUzsTQce3tr3U9GQdVFClJQf2guaIwNa7KSmTEM99-SeNAMC07e8-VTCAd5Uc01qSa4yW_Asfl27xsXKbv9Aa5JBMt7fB8fvo1OBpOrU43Zg-lb1C7wAKoeqmo6bp2VfHVw',
            'Abo8hIfmoANZ0PCtYHB4ooCJ59SHjd5JoFmxoE2nu99WKD5kwB8riOnI6OiqTTO3gnI2Gehgu8BONQytNPKWWgBPnOTV7OF1Rlwmp2bEO5tQfDEw3Vcjiy_X3QJe2bj74E-DV2qom6G75bL4tRa-zPi4JcGYOGWvBTFTabTHJR8fWIAK1NctX4_lztV-aaGAZanQvrcDKFP9gz7w5luvOo4Lv6eN4xfGlJwoEmyj9blI-EG1Ijc3lhKXrHse0hPSJlKaWVo4w1lkijdeJE1QyzW-WRtaGbm6FTLDrFaOeOnLp8g6LhS3nJ2X2yx6-9967VQbNtWUQ2dCs3QqomitMMuXkDb-B3ASJmtRc4krWmpi0-hN46Si2r_ntU4cSt5idmg'
        ]
        dyn = [
            '7AmajEzUGByA5Q9UoGya4A5EWq2W8GAdy8Z9LFwxBxCbzES2N6xybxu13wHgf8jyR88y8aGjzEgDKuEjKeCwxxW3Ouum2SUpGqewIUsz8nxm3a229yoOm8yUgx62q78vDw',
            '7AmajEzUGByA5Q9UoGya4A5EWq2W8GAdy8Z9LFwxBxvyUWdwIhEoyUnwgUaQ3O4UJi28y2GAUW49XDG4XzFE8ouwYDDBwJK6qCzEbe78O5UlwOwwyoCcBy8K48hwCxO7VU'
        ]
        req = [
            'q',
            'y'
        ]

        # The endpoint does not change between pages, so build it once.
        url = 'https://www.facebook.com/search/top/?init=quick&q=%D0%BB%D0%B0%D0%B2%D0%B0%D0%BB%D1%8C%20%D0%BF%D0%BE%D1%82%D0%BE%D0%BA&tas=0.715880302462869'

        self.data['page_number'] = pageno

        while pageno < max_pages:
            #self.data['cursor'] = cursor[pageno-2]
            # Form fields copied from a captured browser request; presumably
            # session-specific -- verify against a fresh capture.
            payload = {
                'data': json.dumps(self.data),
                '__dpr': 1,
                '__user': 100012747062538,
                '__a': 1,
                '__dyn': '7AmajEzUGByA5Q9UoGya4A5EWq2WiWF3oyfirWo8popyUWdwIhEoyUnwgUat0Hx24UJi28y2GAUW49XDG4XzFE8ouwh9VobrxCFEW2PxOcxu5pUaE88C9z9oybx24oqyUsx-u6vU',
                #'__dyn':dyn[pageno-2],
                '__req': 8,
                '__be': -1,
                '__pc': 'PHASED:DEFAULT',
                '__rev': 2467498
            }

            # Reuse the cookies file for the logged-in session and send the
            # payload as a POST body.
            #grab.cookies.load_from_file('cookies.html')
            g = Grab(url=url, cookiefile='cookies.html', post=payload)
            g.go(url=url)
            r = g.response.unicode_body()
            # An alternative transport via requests.post(url=..., data=payload)
            # with an 'X-Requested-With': 'XMLHttpRequest' header was tried
            # here previously and left disabled.

            print ("---------------------- ↓r ", pageno , "↓ ------------------")
            print(r)
            print ("---------------------- ↑r ", pageno , "↑ ------------------")

            # Next page
            pageno += 1
            self.data['page_number'] = pageno

        return posts

def grab_demo(request):
    """Log in to Facebook with Grab, save the session cookies, then run the scraper.

    The login response body is printed, the session cookies are dumped to
    'cookies.html' (the same file name FacebookScraper.scrape_posts passes as
    ``cookiefile``), and FacebookScraper.scrape() is invoked.

    Args:
        request: Not used by this function; kept so existing callers still work.
    """
    # Log in to Facebook.
    grab = Grab(log_file='out.html')
    grab.go('https://www.facebook.com/')

    # Placeholder credentials -- replace before running.
    grab.set_input("email", "your_email_here")
    grab.set_input("pass", "your_password_here")
    grab.submit()
    s1 = grab.response.unicode_body()
    # Persist the session so the scraper's requests can reuse it.
    grab.dump_cookies('cookies.html')

    print ("---------------------- Login done ------------------")
    print(s1)
    # Login done

    scraper = FacebookScraper()
    scraper.scrape()