Python 在instagram html页面中查找BeautifulSoup 我发现bs4有问题。
我试图在html instagram页面中自动查找一些url,但(知道我是python noob)我找不到在html源代码中自动搜索示例中
“display_url”:http…
之后的url的方法
我想让我的脚本搜索乘以显示为“display_url”的url,然后下载它们。
它们在源代码中出现的次数必须相同
使用bs4,我尝试了以下方法:
# Download an Instagram post's og:image preview.
# NOTE(review): `bs` is presumably `from bs4 import BeautifulSoup as bs` and
# `fileURL` the post URL -- both are defined outside this snippet; confirm.
# Fix: use a context manager so the HTTP response is closed even on error
# (the original never closed `f`).
with urllib.request.urlopen(fileURL) as f:
    htmlSource = f.read()
soup = bs(htmlSource, 'html.parser')
# og:image only ever exposes the FIRST image of a multi-image (carousel) post,
# which is why this approach cannot fetch every "display_url".
metaTag = soup.find_all('meta', {'property': 'og:image'})
imgURL = metaTag[0]['content']
urllib.request.urlretrieve(imgURL, 'fileName.jpg')
但是我无法让 soup.find_all(…) 正常工作/搜索到它。
有没有办法找到bs4页面的这一部分
非常感谢你的帮助
下面是我现在的小(python)代码示例:
窗口。\u共享数据={
“配置”:{
“csrf_令牌”:“,
“观众”:{
“viewerId”:”
},
“supports_es6”:true,
“国家/地区代码”:“FR”,
“语言代码”:“fr”,
“区域设置”:“fr_fr”,
“输入数据”:{
“延期”:[{
“graphql”:{
“快捷码\媒体”:{
“_typename”:“GraphSidecar”,
“尺寸”:{
“高度”:1080,
“宽度”:1080
},
“门控信息”:空,
“媒体预览”:空,
您可以找到合适的脚本标记,并用正则表达式提取信息。我假设第一个包含 window._sharedData= 的脚本标记
是合适的。您可以根据需要进行修改
from bs4 import BeautifulSoup as bs
import re

# Sample Instagram page source; the goal is to pull every "display_url" value
# out of the window._sharedData JSON embedded in the <script> tag.
html = '''
<html>
<head></head>
<body class="">
<span id="react-root">
<svg width="50" height="50" viewbox="0 0 50 50" style="position:absolute;top:50%;left:50%;margin:-25px 0 0 -25px;fill:#c7c7c7">
<path d="
<!––deleted part for privacy -->
" />
</svg></span>
<script type="text/javascript">
window._sharedData = {
"config": {
"csrf_token": "",
"viewer": {
<!––deleted part for privacy -->
"viewerId": ""
},
"supports_es6": true,
"country_code": "FR",
"language_code": "fr",
"locale": "fr_FR",
"entry_data": {
"PostPage": [{
"graphql": {
"shortcode_media": {
"__typename": "GraphSidecar",
<!––deleted part for privacy -->
"dimensions": {
"height": 1080,
"width": 1080
},
"gating_info": null,
"media_preview": null,
<--There's the important part that have to be extracted as many times it appear in the source code-->
"display_url": "https://scontent-cdt1-1.cdninstagram.com/vp/",
"display_resources": [{
"src": "https://scontent-cdt1-1.cdninstagram.com/vp/",
"config_width": 640,
"config_height": 640
}, {
"src": "https://scontent-cdt1-1.cdninstagram.com/vp/",
"config_width": 750,
"config_height": 750
}, {
"src": "https://scontent-cdt1-1.cdninstagram.com/vp/",
"config_width": 1080,
"config_height": 1080
}],
"is_video": false,</script>
</body>
</html>
'''

# Fix: 'html.parser' ships with Python; the original used 'lxml', which raises
# bs4.FeatureNotFound when the lxml package is not installed (the exact error
# reported later in this thread).
soup = bs(html, 'html.parser')
scripts = soup.select('script[type="text/javascript"]')
# Fix: initialise `data` so a page without the marker prints [] instead of
# crashing with NameError.
data = ''
for script in scripts:
    # Fix: no leading space in the needle -- the script body here starts
    # directly with 'window._sharedData =', so ' window._sharedData ='
    # never matched.
    if 'window._sharedData =' in script.text:
        data = script.text
        break
# Fix: '\s*' tolerates optional whitespace after the colon and '[^"]+'
# captures only the URL; the original greedy '(.*)",' kept the leading
# space and opening quote in every match.
r = re.compile(r'"display_url":\s*"([^"]+)"')
print(r.findall(data))
这个项目进展顺利,它变成了这样:
# NOTE(review): `html` is presumably the post URL and BeautifulSoup/urllib are
# imported outside this fragment -- confirm against the full script.
thepage = urllib.request.urlopen(html)
soup = BeautifulSoup(thepage, "html.parser")
print(soup.title.text)
# Hard-coded index 3: assumes the shared-data script is always the fourth
# text/javascript tag on the page.
txt = soup.select('script[type="text/javascript"]')[3]
texte = txt.get_text()
# Fix: 'with' guarantees the handles are closed even if a write/read fails
# (the original leaked f1 on error).
with open("tet.txt", 'w') as f1:
    f1.write(texte)
with open('tet.txt', 'r') as f:
    data = ''.join(f.readlines())
# Prints only the FIRST display_url span; str.index raises ValueError if the
# marker is absent.
print(data[data.index('"display_url":"'):data.index('","display_resources":') + 1])
但现在出现了一些新情况:
- 只要tet.txt文件中出现('“display_url”:“to-->”,“display_resources”:”),如何使程序的查找url部分(第10行,第11行)重复
- 可以使用while循环,但如何使其重复该过程
问题已解决
以下是在iOS上使用Pythonista 3从instagram url下载多幅图像的代码:
# Download every carousel image of an Instagram post (Pythonista 3 / iOS).
from sys import argv
import urllib
import urllib.request
from bs4 import BeautifulSoup
import re
import photos      # Pythonista-only: saves images to the iOS camera roll
import clipboard   # Pythonista-only clipboard API

thepage = "your url"

# p.1 -- fetch the page and dump the shared-data <script> body to tet.txt.
# BUG FIX: the original called urlopen(html) but `html` was never defined;
# the URL lives in `thepage`.
thepage = urllib.request.urlopen(thepage)
soup = BeautifulSoup(thepage, "html.parser")
print(soup.title.text)
# Assumes the window._sharedData script is always the 4th text/javascript tag.
txt = soup.select('script[type="text/javascript"]')[3]
texte = txt.get_text()
with open("tet.txt", 'w') as fille:
    fille.write(texte)

# p.2 -- copy each '...play_url ... play_resources' span into url.html.
# 'with' blocks replace the manual open/close pairs so handles always close.
with open('tet.txt', 'r') as g:
    data = ''.join(g.readlines())
le1 = 0
le2 = 0
still_looking = True
with open('url.html', 'w') as hturl:
    while still_looking:
        still_looking = False
        dat = data.find('play_url', le1)
        det = data.find('play_resources', le2)
        if dat >= le1:
            le1 = dat + 1
            still_looking = True
        if det >= le2:
            hturl.write(data[dat:det])
            le2 = det + 1
            still_looking = True

# p.3 -- collect the display URLs. NOTE: this list deliberately interleaves
# integer offsets and URL strings; p.4 compensates by starting at index 3 and
# stepping by 2. Fragile, but preserved as the author's working behaviour.
with open('url.html', 'r') as hturl2:
    dete = ''.join(hturl2.readlines())
le11 = 0
le22 = 0
urls = []
still_looking2 = True
while still_looking2:
    still_looking2 = False
    dat2 = dete.find('https://scontent-', le11)
    det2 = dete.find('","dis', le22)
    if dat2 >= le11:
        urls.append(dat2)
        le11 = dat2 + 1
        still_looking2 = True
    if det2 >= le22:
        urls.append(dete[dat2:det2])
        le22 = det2 + 1
        still_looking2 = True

# p.4 -- download every other entry (the URL strings) to the camera roll.
imgs = len(urls)
nbind = imgs
nbindr = 3
images = 1
while nbindr < imgs:
    urllib.request.urlretrieve(urls[nbindr], 'photo.jpg')
    photos.create_image_asset('photo.jpg')
    print('Image ' + str(images) + ' downloaded')
    nbindr = nbindr + 2
    images += 1
print("OK")
从系统导入argv
导入URL库
导入urllib.request
从bs4导入BeautifulSoup
导入 re(正则表达式模块)
导入照片
导入剪贴板
thepage=“您的url”
#p、 一,
页面=urllib.request.urlopen(html)
soup=BeautifulSoup(页面“html.parser”)
打印(soup.title.text)
txt=soup.select('script[type=“text/javascript”]')[3]
texte=txt.get_text()
fille=open(“tet.txt”,“w”)
fille.write(文本)
fille.close()
#p、 二,
g=打开('tet.txt','r')
数据=“”.join(g.readlines())
le1=0
le2=0
hturl=open('url.html','w')
看起来还是真的吗
当你还在看的时候:
仍在寻找=错误
dat=data.find('play_url',le1)
det=data.find('play_resources',le2)
如果dat>=le1:
#URL.append(dat)
le1=dat+1
看起来还是真的吗
如果det>=le2:
write(数据[dat:det])
le2=det+1
看起来还是真的吗
hturl.close()
#p、 三,
hturl2=open('url.html','r')
dete=''.join(hturl2.readlines())
le11=0
le22=0
URL=[]
仍然看起来像2=真的
当你还在看的时候2:
仍然看2=错误
dat2=dete.find('https://scontent-",le11)
det2=dete.find(“,”dis',le22)
如果dat2>=le11:
url.append(dat2)
le11=dat2+1
仍然看起来像2=真的
如果det2>=le22:
append(dete[dat2:det2])
le22=det2+1
仍然看起来像2=真的
hturl2.close()
#p、 四,
imgs=len(URL)
nbind=imgs
nbindr=3
图像=1
当 nbindr < imgs 时:(下载循环,同上文代码)
这有点挑剔,但它的工作和速度太快。
感谢您的帮助。您有几个示例URL可供使用吗?是和否。是的,任何包含多张图片(旋转木马样式)的页面帖子页面,但这些页面是由InGram使用我们唯一的用户令牌生成的,因此我无法将其按原样传递给您…:(是的,因此我尝试添加源代码的一部分…^^获取适当的脚本标记和正则表达式。如果您在.find()
中使用正则表达式作为文本
参数,您可以使代码更短,例如:数据=汤。find('script',text=r)。text
非常受欢迎!有时最好使用.find()
/.find_all()
因为它们接受正则表达式和函数。嗨。当我在我的软件中尝试@t.h.adam的想法时,结果是:文件“main.py”,第27行,在soup=bs(soup1,'lxml')文件/home/runner/.site packages/bs4/_init_uuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuTypeError:“非类型”对象不可调用。数据是否动态加载?打印soup和Verify无法打印它,因为:in-soup=bs(html,'lxml')文件/var/containers/Bundle/Application/93F4D70C-FD37-45C8-80BD-F48B0E2BCCB3/Pythonista3.app/Frameworks/Py3Kit.framework/pylib/site packages/bs4/u init_uuuuuuuuuuuuuuuuuuuuuuuuuuuuuu“,第158行,在”\uuuu init\uuuu%“,”.join(features))bs4.FeatureNotFound:找不到具有您请求的功能的树生成器:lxml。是否需要安装解析器库?
# Download every carousel image of an Instagram post (Pythonista 3 / iOS).
from sys import argv
import urllib
import urllib.request
from bs4 import BeautifulSoup
import re
import photos      # Pythonista-only: saves images to the iOS camera roll
import clipboard   # Pythonista-only clipboard API

thepage = "your url"

# p.1 -- fetch the page and dump the shared-data <script> body to tet.txt.
# BUG FIX: the original called urlopen(html) but `html` was never defined;
# the URL lives in `thepage`.
thepage = urllib.request.urlopen(thepage)
soup = BeautifulSoup(thepage, "html.parser")
print(soup.title.text)
# Assumes the window._sharedData script is always the 4th text/javascript tag.
txt = soup.select('script[type="text/javascript"]')[3]
texte = txt.get_text()
with open("tet.txt", 'w') as fille:
    fille.write(texte)

# p.2 -- copy each '...play_url ... play_resources' span into url.html.
# 'with' blocks replace the manual open/close pairs so handles always close.
with open('tet.txt', 'r') as g:
    data = ''.join(g.readlines())
le1 = 0
le2 = 0
still_looking = True
with open('url.html', 'w') as hturl:
    while still_looking:
        still_looking = False
        dat = data.find('play_url', le1)
        det = data.find('play_resources', le2)
        if dat >= le1:
            le1 = dat + 1
            still_looking = True
        if det >= le2:
            hturl.write(data[dat:det])
            le2 = det + 1
            still_looking = True

# p.3 -- collect the display URLs. NOTE: this list deliberately interleaves
# integer offsets and URL strings; p.4 compensates by starting at index 3 and
# stepping by 2. Fragile, but preserved as the author's working behaviour.
with open('url.html', 'r') as hturl2:
    dete = ''.join(hturl2.readlines())
le11 = 0
le22 = 0
urls = []
still_looking2 = True
while still_looking2:
    still_looking2 = False
    dat2 = dete.find('https://scontent-', le11)
    det2 = dete.find('","dis', le22)
    if dat2 >= le11:
        urls.append(dat2)
        le11 = dat2 + 1
        still_looking2 = True
    if det2 >= le22:
        urls.append(dete[dat2:det2])
        le22 = det2 + 1
        still_looking2 = True

# p.4 -- download every other entry (the URL strings) to the camera roll.
imgs = len(urls)
nbind = imgs
nbindr = 3
images = 1
while nbindr < imgs:
    urllib.request.urlretrieve(urls[nbindr], 'photo.jpg')
    photos.create_image_asset('photo.jpg')
    print('Image ' + str(images) + ' downloaded')
    nbindr = nbindr + 2
    images += 1
print("OK")