Python JSONDecodeError: Extra data: line 1 column 8 (char 7)


I followed a tutorial to scrape some information from a Facebook profile, and I keep getting this error: JSONDecodeError: Extra data: line 1 column 8 (char 7)

Does anyone know where the problem is?
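
From what I can tell, json.loads raises exactly this message when the text holds one complete JSON value followed by anything else. A minimal example of my own (not from the tutorial) that reproduces the same line and column:

import json

# The decoder finishes the first object after 7 characters and then finds extra data:
json.loads('{"a":1}{"b":2}')
# json.decoder.JSONDecodeError: Extra data: line 1 column 8 (char 7)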

Here is my Python script:

import json
import logging
import time
from collections import OrderedDict

import requests
from bs4 import BeautifulSoup

def get_bs(session, url):
    #Makes a GET request using the given Session object and returns a BeautifulSoup object.
    r = None
    while True:
        r = session.get(url)
        time.sleep(3)
        if r.ok:
            break
    return BeautifulSoup(r.text, 'lxml')
#To login
def make_login(session, base_url, credentials):
    #Returns a Session object logged in with credentials.
    
    login_form_url = '/login/device-based/regular/login/?refsrc=https%3A'\
        '%2F%2Fmobile.facebook.com%2Flogin%2Fdevice-based%2Fedit-user%2F&lwv=100'
 
    params = {'email':credentials['email'], 'pass':credentials['pass']}
 
    while True:
        time.sleep(3)
        logged_request = session.post(base_url+login_form_url, data=params)
        
        if logged_request.ok:
            logging.info('[*] Logged in.')
            break

#Crawling FB
def crawl_profile(session, base_url, profile_url, post_limit):
    #Goes to the profile URL, crawls it and extracts post URLs.
    
    profile_bs = get_bs(session, profile_url)
    n_scraped_posts = 0
    scraped_posts = list()
    posts_id = None

    while n_scraped_posts < post_limit:
        try:
            posts_id = 'recent'
            posts = profile_bs.find('div', id=posts_id).div.div.contents
        except Exception:
            posts_id = 'structured_composer_async_container'
            posts = profile_bs.find('div', id=posts_id).div.div.contents

        posts_urls = [a['href'] for a in profile_bs.find_all('a', text='Full Story')] 

        for post_url in posts_urls:
            # print(post_url)
            try:
                post_data = scrape_post(session, base_url, post_url)
                scraped_posts.append(post_data)
            except Exception as e:
                logging.info('Error: {}'.format(e))
            n_scraped_posts += 1
            if posts_completed(scraped_posts, post_limit):
                break
        
        show_more_posts_url = None
        if not posts_completed(scraped_posts, post_limit):
            show_more_posts_url = profile_bs.find('div', id=posts_id).next_sibling.a['href']
            profile_bs = get_bs(session, base_url+show_more_posts_url)
            time.sleep(3)
        else:
            break
            
    return scraped_posts
#Scraping FB
def scrape_post(session, base_url, post_url):
    #Goes to post URL and extracts post data.
   
    post_data = OrderedDict()

    post_bs = get_bs(session, base_url+post_url)
    time.sleep(5)

    # Here we populate the OrderedDict object
    post_data['url'] = post_url
    #Find Post main element
    try:
        post_text_element = post_bs.find('div', id='u_0_0').div
        string_groups = [p.strings for p in post_text_element.find_all('p')]
        strings = [repr(string) for group in string_groups for string in group]
        post_data['text'] = strings
    except Exception:
        post_data['text'] = []
    #Extract post media URL
    try:
        post_data['media_url'] = post_bs.find('div', id='u_0_0').find('a')['href']
    except Exception:
        post_data['media_url'] = ''
    
    #Extract remaining data
    try:
        post_data['comments'] = extract_comments(session, base_url, post_bs, post_url)
    except Exception:
        post_data['comments'] = []
    
    return dict(post_data)

#Functions for loading the profile URLs and credentials for FB
def json_to_obj(filename):
    #Extracts data from a JSON file and returns it as a Python object
    
    obj = None
    with open(filename) as json_file:
        obj = json.loads(json_file.read())
    return obj

def save_data(data):
    #Converts data to JSON.
   
    with open('profile_posts_data.json', 'w') as json_file:
        json.dump(data, json_file, indent=4)

if __name__ == "__main__":
 
    logging.basicConfig(level=logging.INFO)
    base_url = 'https://mobile.facebook.com'
    session = requests.session()
 
    # Extracts the login credentials and all of the profile URLs to scrape
    credentials = json_to_obj(r"C:\Users\E7450\Desktop\GIS702\FBScrapping\credentials.json")
    profiles_urls = json_to_obj(r"C:\Users\E7450\Desktop\GIS702\FBScrapping\profiles_urls.json")
 
    make_login(session, base_url, credentials)
 
    posts_data = None
    for profile_url in profiles_urls:
        posts_data = crawl_profile(session, base_url, profile_url, 25)
    logging.info('[!] Scraping finished. Total: {}'.format(len(posts_data)))
    logging.info('[!] Saving.')
    save_data(posts_data)
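
Since json.loads inside json_to_obj is the only place the script decodes JSON, I suspect one of the two input files rather than the Facebook response itself. Below is a small check I intend to run; check_json_file is my own helper, and the layout described is only what the script seems to expect (credentials.json as a single object with 'email' and 'pass' keys, profiles_urls.json as a single JSON array of URL strings):

import json

def check_json_file(filename):
    #Reports whether the file holds exactly one JSON value and, if not, where decoding stops.
    with open(filename) as f:
        text = f.read()
    try:
        json.loads(text)
        print('{} contains a single valid JSON value.'.format(filename))
    except json.JSONDecodeError as e:
        print('{}: {} (look for a second JSON value or stray text after the first one)'.format(filename, e))

check_json_file('credentials.json')
check_json_file('profiles_urls.json')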