Python: searching an Instagram HTML page with BeautifulSoup; I'm having trouble with bs4.


I'm trying to automatically find some URLs in an Instagram HTML page, but (knowing I'm a Python noob) I can't find a way to automatically search the HTML source for the URL that comes after "display_url": http... in the example.

I want my script to search for the multiple URLs that appear as "display_url" and then download them. They have to be picked up as many times as they appear in the source code.


Using bs4, I tried the following:

# fetch the page and parse it
f = urllib.request.urlopen(fileURL)
htmlSource = f.read()
soup = bs(htmlSource, 'html.parser')
# grab the og:image meta tag and download the image it points to
metaTag = soup.find_all('meta', {'property': 'og:image'})
imgURL = metaTag[0]['content']
urllib.request.urlretrieve(imgURL, 'fileName.jpg')
But I can't get soup.find_all(... to work on / search for this. Is there a way to find this part of the page with bs4?

Thank you very much for your help.

Here is a small sample of the page source I'm dealing with right now:


window._sharedData = {
  "config": {
    "csrf_token": "",
    "viewer": {
      "viewerId": ""
    },
    "supports_es6": true,
    "country_code": "FR",
    "language_code": "fr",
    "locale": "fr_FR",
    "entry_data": {
      "PostPage": [{
        "graphql": {
          "shortcode_media": {
            "__typename": "GraphSidecar",
            "dimensions": {
              "height": 1080,
              "width": 1080
            },
            "gating_info": null,
            "media_preview": null,

You can find the appropriate script tag and regex the information out of it. I assume the first script tag containing window._sharedData = is the appropriate one. You can modify this as required.

from bs4 import BeautifulSoup as bs
import re

html = '''
<html>
 <head></head>
 <body class=""> 
  <span id="react-root">
   <svg width="50" height="50" viewbox="0 0 50 50" style="position:absolute;top:50%;left:50%;margin:-25px 0 0 -25px;fill:#c7c7c7"> 
    <path d="

        <!-- deleted part for privacy -->

         " /> 
   </svg></span> 
  <script type="text/javascript">
    window._sharedData = {
      "config": {
        "csrf_token": "",
        "viewer": {

        <!-- deleted part for privacy -->

        "viewerId": ""
      },
      "supports_es6": true,
      "country_code": "FR",
      "language_code": "fr",
      "locale": "fr_FR",
      "entry_data": {
        "PostPage": [{
          "graphql": {
            "shortcode_media": {
              "__typename": "GraphSidecar",

     <!-- deleted part for privacy -->

              "dimensions": {
                "height": 1080,
                "width": 1080
              },
              "gating_info": null,
              "media_preview": null,

<!-- here is the important part that has to be extracted, as many times as it appears in the source code -->

              "display_url": "https://scontent-cdt1-1.cdninstagram.com/vp/",
              "display_resources": [{
                "src": "https://scontent-cdt1-1.cdninstagram.com/vp/",
                "config_width": 640,
                "config_height": 640
              }, {
                "src": "https://scontent-cdt1-1.cdninstagram.com/vp/",
                "config_width": 750,
                "config_height": 750
              }, {
                "src": "https://scontent-cdt1-1.cdninstagram.com/vp/",
                "config_width": 1080,
                "config_height": 1080
              }],
              "is_video": false,</script>
 </body>
</html>
'''

soup = bs(html, 'lxml')
scripts = soup.select('script[type="text/javascript"]')
for script in scripts:
    if ' window._sharedData =' in script.text:
        data = script.text
        break
r = re.compile(r'"display_url":\s*"(.*?)"')
print(r.findall(data))
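
Since everything after the assignment is a JSON object, another option worth noting (not from the original answer; a sketch in which page_source stands in for the fetched HTML) is to parse it with the json module and read display_url directly instead of regexing the text:

import json
import re

from bs4 import BeautifulSoup as bs

# page_source: the raw HTML of a full post page (the truncated sample
# above is not valid JSON, so fetch a real page first).
soup = bs(page_source, 'html.parser')
script = soup.find('script', text=re.compile(r'window\._sharedData\s*='))
if script is not None:
    # drop the "window._sharedData = " prefix and the trailing ";"
    raw = script.text.split('=', 1)[1].strip().rstrip(';')
    shared = json.loads(raw)
    # on live pages entry_data sits at the top level of _sharedData
    media = shared['entry_data']['PostPage'][0]['graphql']['shortcode_media']
    print(media['display_url'])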

The project is coming along well; it now looks like this:

import urllib.request
from bs4 import BeautifulSoup

thepage = urllib.request.urlopen("your url")
soup = BeautifulSoup(thepage, "html.parser")
print(soup.title.text)
# the 4th script tag on these pages holds window._sharedData
txt = soup.select('script[type="text/javascript"]')[3]
texte = txt.get_text()
f1 = open("tet.txt", 'w')
f1.write(texte)
f1.close()
with open('tet.txt', 'r') as f:
    data = ''.join(f.readlines())
# print from the first "display_url" up to the following "display_resources" key
print(data[data.index('"display_url":"'):data.index('","display_resources":') + 1])
But now two new questions come up:

  • How can I make the URL-finding part of the program (the last two lines above) repeat for as long as the span '"display_url":"' ... '","display_resources":' keeps appearing in the tet.txt file?
  • A while loop could probably do it, but how do I make it repeat the process? (see the sketch after this list)
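
For the repetition question above, one way to avoid a hand-rolled while loop is re.findall, which returns every occurrence in one call. A minimal sketch, assuming tet.txt holds the script text saved by the snippet:

import re

# assumes tet.txt contains the window._sharedData script text saved above
with open('tet.txt', 'r') as f:
    data = f.read()

# non-greedy capture of everything between "display_url":" and the next quote
urls = re.findall(r'"display_url":\s*"(.*?)"', data)
for url in urls:
    print(url)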
Problem solved. Here is the code that downloads multiple images from an Instagram URL, using Pythonista 3 on iOS:

from sys import argv
import urllib
import urllib.request
from bs4 import BeautifulSoup
import re
import photos
import clipboard


thepage = "your url"
#p.1 - fetch the page and save the script tag holding window._sharedData
thepage = urllib.request.urlopen(thepage)
soup = BeautifulSoup(thepage, "html.parser")
print(soup.title.text)
txt = soup.select('script[type="text/javascript"]')[3]
texte = txt.get_text()
fille = open("tet.txt", 'w')
fille.write(texte)
fille.close()
#p.2 - copy every "display_url"..."display_resources" span into url.html
g = open('tet.txt', 'r')
data = ''.join(g.readlines())
le1 = 0
le2 = 0
hturl = open('url.html', 'w')
still_looking = True
while still_looking:
    still_looking = False
    dat = data.find('play_url', le1)
    det = data.find('play_resources', le2)
    if dat >= le1:
        le1 = dat + 1
        still_looking = True
    if det >= le2:
        hturl.write(data[dat:det])
        le2 = det + 1
        still_looking = True
hturl.close()
#p.3 - pull the https://scontent- links out of url.html
hturl2 = open('url.html', 'r')
dete = ''.join(hturl2.readlines())
le11 = 0
le22 = 0
urls = []
still_looking2 = True
while still_looking2:
    still_looking2 = False
    dat2 = dete.find('https://scontent-', le11)
    det2 = dete.find('","dis', le22)
    if dat2 >= le11:
        urls.append(dat2)
        le11 = dat2 + 1
        still_looking2 = True
    if det2 >= le22:
        urls.append(dete[dat2:det2])
        le22 = det2 + 1
        still_looking2 = True
hturl2.close()
#p.4 - download the images; urls alternates integer offsets and URL strings,
# so starting at index 3 and stepping by 2 visits only the URL strings
# (the first string entry is skipped)
imgs = len(urls)
nbindr = 3
images = 1
while nbindr < imgs:
    urllib.request.urlretrieve(urls[nbindr], 'photo.jpg')
    photos.create_image_asset('photo.jpg')
    print('Image ' + str(images) + ' downloaded')
    nbindr = nbindr + 2
    images += 1
print("OK")
It's a bit hacky, but it works and it's plenty fast.
Thanks for your help.

  • Do you have a couple of example URLs to work with?
  • Yes and no. Yes, any post page containing multiple images (carousel style), but those pages are generated by Instagram with our own user token, so I can't pass one to you as-is... :( So I tried to include a part of the source code instead... ^^
  • Grab the appropriate script tag and regex it. If you use a regex as the text argument of .find(), you can make the code shorter, e.g.: data = soup.find('script', text=r).text
  • You're very welcome! Sometimes it's better to use .find() / .find_all(), since they accept regular expressions and functions.
  • Hi. When I tried @t.h.adam's idea in my program, I got: File "main.py", line 27, in soup = bs(soup1, 'lxml') ... TypeError: 'NoneType' object is not callable.
  • Is the data loaded dynamically? Print soup and verify.
  • I can't print it, because: in soup = bs(html, 'lxml') ... bs4.FeatureNotFound: Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
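
Pulling the comment suggestions together: a small sketch of the proposed .find() call (using the html sample from the answer above) that sticks to the built-in html.parser, since the FeatureNotFound traceback shows lxml isn't installed under Pythonista:

import re

from bs4 import BeautifulSoup as bs

# html.parser ships with Python, so no lxml install is required
soup = bs(html, 'html.parser')

# a compiled regex as the text argument returns the matching <script> tag,
# or None when nothing matches (e.g. if the data is loaded dynamically)
script = soup.find('script', text=re.compile(r'window\._sharedData'))
if script is None:
    print('script tag not found')
else:
    print(re.findall(r'"display_url":\s*"(.*?)"', script.text))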