Python: searching an Instagram HTML page with BeautifulSoup; I'm having trouble with bs4.


I'm trying to automatically find some URLs in an Instagram HTML page, but (knowing I'm a Python noob) I can't find a way to automatically search the HTML source for the URL that comes after "display_url": http... in the example.

I want my script to search for the multiple URLs that appear as "display_url" and then download them. They have to be picked up as many times as they appear in the source code.


Using bs4, I tried the following:

# fetch the page and parse it
f = urllib.request.urlopen(fileURL)
htmlSource = f.read()
soup = bs(htmlSource, 'html.parser')
# grab the og:image meta tag and download the image it points to
metaTag = soup.find_all('meta', {'property': 'og:image'})
imgURL = metaTag[0]['content']
urllib.request.urlretrieve(imgURL, 'fileName.jpg')
But I can't get soup.find_all(... to work on / search for this. Is there a way to find this part of the page with bs4?

Thank you very much for your help.

Here is a small sample of the page source I'm dealing with right now:


window._sharedData = {
  "config": {
    "csrf_token": "",
    "viewer": {
      "viewerId": ""
    },
    "supports_es6": true,
    "country_code": "FR",
    "language_code": "fr",
    "locale": "fr_FR",
    "entry_data": {
      "PostPage": [{
        "graphql": {
          "shortcode_media": {
            "__typename": "GraphSidecar",
            "dimensions": {
              "height": 1080,
              "width": 1080
            },
            "gating_info": null,
            "media_preview": null,

You can find the appropriate script tag and regex the information out of it. I assume the first script tag containing window._sharedData = is the appropriate one. You can modify this as required.

from bs4 import BeautifulSoup as bs
import re

html = '''
<html>
 <head></head>
 <body class=""> 
  <span id="react-root">
   <svg width="50" height="50" viewbox="0 0 50 50" style="position:absolute;top:50%;left:50%;margin:-25px 0 0 -25px;fill:#c7c7c7"> 
    <path d="

        <!-- deleted part for privacy -->

         " /> 
   </svg></span> 
  <script type="text/javascript">
    window._sharedData = {
      "config": {
        "csrf_token": "",
        "viewer": {

        <!-- deleted part for privacy -->

        "viewerId": ""
      },
      "supports_es6": true,
      "country_code": "FR",
      "language_code": "fr",
      "locale": "fr_FR",
      "entry_data": {
        "PostPage": [{
          "graphql": {
            "shortcode_media": {
              "__typename": "GraphSidecar",

     <!-- deleted part for privacy -->

              "dimensions": {
                "height": 1080,
                "width": 1080
              },
              "gating_info": null,
              "media_preview": null,

<!-- here is the important part that has to be extracted, as many times as it appears in the source code -->

              "display_url": "https://scontent-cdt1-1.cdninstagram.com/vp/",
              "display_resources": [{
                "src": "https://scontent-cdt1-1.cdninstagram.com/vp/",
                "config_width": 640,
                "config_height": 640
              }, {
                "src": "https://scontent-cdt1-1.cdninstagram.com/vp/",
                "config_width": 750,
                "config_height": 750
              }, {
                "src": "https://scontent-cdt1-1.cdninstagram.com/vp/",
                "config_width": 1080,
                "config_height": 1080
              }],
              "is_video": false,</script>
 </body>
</html>
'''

soup = bs(html, 'lxml')
scripts = soup.select('script[type="text/javascript"]')
for script in scripts:
    if ' window._sharedData =' in script.text:
        data = script.text
        break
r = re.compile(r'"display_url":\s*"(.*?)"')
print(r.findall(data))
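
Since everything after the assignment is a JSON object, another option worth noting (not from the original answer; a sketch in which page_source stands in for the fetched HTML) is to parse it with the json module and read display_url directly instead of regexing the text:

import json
import re

from bs4 import BeautifulSoup as bs

# page_source: the raw HTML of a full post page (the truncated sample
# above is not valid JSON, so fetch a real page first).
soup = bs(page_source, 'html.parser')
script = soup.find('script', text=re.compile(r'window\._sharedData\s*='))
if script is not None:
    # drop the "window._sharedData = " prefix and the trailing ";"
    raw = script.text.split('=', 1)[1].strip().rstrip(';')
    shared = json.loads(raw)
    # on live pages entry_data sits at the top level of _sharedData
    media = shared['entry_data']['PostPage'][0]['graphql']['shortcode_media']
    print(media['display_url'])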

The project is coming along well; it now looks like this:

import urllib.request
from bs4 import BeautifulSoup

thepage = urllib.request.urlopen("your url")
soup = BeautifulSoup(thepage, "html.parser")
print(soup.title.text)
# the 4th script tag on these pages holds window._sharedData
txt = soup.select('script[type="text/javascript"]')[3]
texte = txt.get_text()
f1 = open("tet.txt", 'w')
f1.write(texte)
f1.close()
with open('tet.txt', 'r') as f:
    data = ''.join(f.readlines())
# print from the first "display_url" up to the following "display_resources" key
print(data[data.index('"display_url":"'):data.index('","display_resources":') + 1])
But now two new questions come up:

  • How can I make the URL-finding part of the program (the last two lines above) repeat for as long as the span '"display_url":"' ... '","display_resources":' keeps appearing in the tet.txt file?
  • A while loop could probably do it, but how do I make it repeat the process? (see the sketch after this list)
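
For the repetition question above, one way to avoid a hand-rolled while loop is re.findall, which returns every occurrence in one call. A minimal sketch, assuming tet.txt holds the script text saved by the snippet:

import re

# assumes tet.txt contains the window._sharedData script text saved above
with open('tet.txt', 'r') as f:
    data = f.read()

# non-greedy capture of everything between "display_url":" and the next quote
urls = re.findall(r'"display_url":\s*"(.*?)"', data)
for url in urls:
    print(url)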
Problem solved. Here is the code that downloads multiple images from an Instagram URL, using Pythonista 3 on iOS:

from sys import argv
import urllib
import urllib.request
from bs4 import BeautifulSoup
import re
import photos
import clipboard


thepage = "your url"
#p.1 - fetch the page and save the script tag holding window._sharedData
thepage = urllib.request.urlopen(thepage)
soup = BeautifulSoup(thepage, "html.parser")
print(soup.title.text)
txt = soup.select('script[type="text/javascript"]')[3]
texte = txt.get_text()
fille = open("tet.txt", 'w')
fille.write(texte)
fille.close()
#p.2 - copy every "display_url"..."display_resources" span into url.html
g = open('tet.txt', 'r')
data = ''.join(g.readlines())
le1 = 0
le2 = 0
hturl = open('url.html', 'w')
still_looking = True
while still_looking:
    still_looking = False
    dat = data.find('play_url', le1)
    det = data.find('play_resources', le2)
    if dat >= le1:
        le1 = dat + 1
        still_looking = True
    if det >= le2:
        hturl.write(data[dat:det])
        le2 = det + 1
        still_looking = True
hturl.close()
#p.3 - pull the https://scontent- links out of url.html
hturl2 = open('url.html', 'r')
dete = ''.join(hturl2.readlines())
le11 = 0
le22 = 0
urls = []
still_looking2 = True
while still_looking2:
    still_looking2 = False
    dat2 = dete.find('https://scontent-', le11)
    det2 = dete.find('","dis', le22)
    if dat2 >= le11:
        urls.append(dat2)
        le11 = dat2 + 1
        still_looking2 = True
    if det2 >= le22:
        urls.append(dete[dat2:det2])
        le22 = det2 + 1
        still_looking2 = True
hturl2.close()
#p.4 - download the images; urls alternates integer offsets and URL strings,
# so starting at index 3 and stepping by 2 visits only the URL strings
# (the first string entry is skipped)
imgs = len(urls)
nbindr = 3
images = 1
while nbindr < imgs:
    urllib.request.urlretrieve(urls[nbindr], 'photo.jpg')
    photos.create_image_asset('photo.jpg')
    print('Image ' + str(images) + ' downloaded')
    nbindr = nbindr + 2
    images += 1
print("OK")
It's a bit hacky, but it works and it's plenty fast.
Thanks for your help.

  • Do you have a couple of example URLs to work with?
  • Yes and no. Yes, any post page containing multiple images (carousel style), but those pages are generated by Instagram with our own user token, so I can't pass one to you as-is... :( So I tried to include a part of the source code instead... ^^
  • Grab the appropriate script tag and regex it. If you use a regex as the text argument of .find(), you can make the code shorter, e.g.: data = soup.find('script', text=r).text
  • You're very welcome! Sometimes it's better to use .find() / .find_all(), since they accept regular expressions and functions.
  • Hi. When I tried @t.h.adam's idea in my program, I got: File "main.py", line 27, in soup = bs(soup1, 'lxml') ... TypeError: 'NoneType' object is not callable.
  • Is the data loaded dynamically? Print soup and verify.
  • I can't print it, because: in soup = bs(html, 'lxml') ... bs4.FeatureNotFound: Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
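
Pulling the comment suggestions together: a small sketch of the proposed .find() call (using the html sample from the answer above) that sticks to the built-in html.parser, since the FeatureNotFound traceback shows lxml isn't installed under Pythonista:

import re

from bs4 import BeautifulSoup as bs

# html.parser ships with Python, so no lxml install is required
soup = bs(html, 'html.parser')

# a compiled regex as the text argument returns the matching <script> tag,
# or None when nothing matches (e.g. if the data is loaded dynamically)
script = soup.find('script', text=re.compile(r'window\._sharedData'))
if script is None:
    print('script tag not found')
else:
    print(re.findall(r'"display_url":\s*"(.*?)"', script.text))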