Web scraping 如何从网页中提取特定文本
我对寻找星座的模式感兴趣。我正在使用“天空地图”android应用程序进行视觉检查,现在我想构建一个应用程序来查找类似的星座结构。其中的一个子问题是找到特定天体的坐标 示例:如何获取给定时间、日期和位置的“月亮”坐标 以以下方式在其网页上提供此信息Web scraping 如何从网页中提取特定文本,web-scraping,Web Scraping,我对寻找星座的模式感兴趣。我正在使用“天空地图”android应用程序进行视觉检查,现在我想构建一个应用程序来查找类似的星座结构。其中的一个子问题是找到特定天体的坐标 示例:如何获取给定时间、日期和位置的“月亮”坐标 以以下方式在其网页上提供此信息 Object: Moon [info|live][less] Right Asc: 04h 15m 12.5s **Decl: 17° 05' 46.3"** (J2000) [HMS|Dec] Magnitude: -10.54 Altitu
Object: Moon [info|live][less]
Right Asc: 04h 15m 12.5s **Decl: 17° 05' 46.3"** (J2000) [HMS|Dec]
Magnitude: -10.54 Altitude: 56° Solar Elongation: 100.4° Constellation: Ari
Sun distance: 147.77 Million Km Earth distance: 0.38 Million Km
Rise: 10:48 Transit: 18:40 Set: 01:35 **Europe/London**
对于月球,我们可以通过网页找到坐标。是否有相应的 API?或者,如何通过从网页中提取坐标信息来实现?我不确定这是否有用,但这里有一个 Python 实现。您需要自行确定可接受的位置参数,但日期、小时和分钟参数都已包含在请求中:
import requests

# Query theskylive.com's planetarium endpoint for the Moon at a given
# date/time and observer location, then print the Moon's record.
url = 'https://theskylive.com/planetariumdata'
params = {
    'obj': 'moon',
    'h': '10',
    'm': '30',
    'date': '2019-02-28',
    # Pipe-separated observer description, copied from the site's own request.
    # NOTE(review): the accepted values for this field are undocumented here.
    'localdata': '51.48|0|Greenwich, United Kingdom|Europe/London',
}
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'}

# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url, params=params, headers=headers, timeout=10)
# Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
response.raise_for_status()
data = response.json()
print(data['objects']['moon'])
输出:
{'status': True, 'utc_time': 1551349800, 'ar': 18.114288194444, 'dec': -21.301003146701, 'mag': '-9.11', 'distsun': 148031243.76562, 'distearth': 399053.81054688, 'constell': 'Oph', 'sot': 292.2907375, 'lastdate': '2019-Feb-28 00:00', 'hlong': '158.9866', 'hlongRad': 2.7748396365512, 'hlat': '0.0060', 'hlatRad': 0.00010471975511966, 'hx': -0.92639216172362, 'hy': 0.34779595586615, 'hz': 8.4403227939488e-05, 'elongation': '67.7', 'altitude': 7.7566655880485, 'id': 'moon', 'name': 'Moon', 'category': 'planets', 'circumstances': {'visibility': 'partial', 'raise': 3.0419974882059, 'set': 11.875359660362, 'transit': 7.4771821984014, 'raise_ut': 1551322951.191, 'set_ut': 1551354751.2948, 'transit_ut': 1551338917.8559, 'transit_height': 17.341269275926, 'azimuth_rise': 110.98610232928, 'azimuth_set': 248.66063774998, 'LSTr': 13.552197907652, 'LSTs': 22.409745024956, 'GSTr': 13.552197907652, 'GSTs': 22.409745024956, 'raise_local': 3.0419974882059, 'set_local': 11.875359660362, 'transit_local': 7.4771821984014}, 'timezone': 'Europe/London', 'age': 23}
{
"utc_seconds": 1551816600,
"utc_timestamp": "201903052010",
"objects": {
"moon": {
"distsun": 1.479847408587E8,
"altitude": -32.421642244539,
"dec": -12.501182812768,
"constell": "Cap",
"timezone": "Europe/London",
"hlat": "-0.0075",
"hlong": "163.9072",
"elongation": "9.6",
"lastdate": "2019-Mar-05 00:00",
"hx": -0.95427043393163,
"hy": 0.26061067578779,
"mag": "-4.82",
"hlongRad": 2.8607203077248,
"hz": -1.6343451194632E-4,
"utc_time": 1551816600,
"distearth": 405722.20937018,
"sot": 350.29647638889,
"id": "moon",
"circumstances": {
"transit_local": 11.428494722983,
"raise_ut": 1.5517668981849E9,
"set": 16.623858118962,
"raise_local": 6.3606069281934,
"visibility": "partial",
"azimuth_set": 256.90380469917,
"LSTs": 3.4997935653561,
"LSTr": 17.208442522882,
"set_local": 16.623858118962,
"azimuth_rise": 104.50312047906,
"GSTs": 3.4997935653561,
"GSTr": 17.208442522882,
"transit_ut": 1.551785142581E9,
"transit": 11.428494722983,
"raise": 6.3606069281934,
"set_ut": 1.5518038458892E9,
"transit_height": 24.710020581601
},
"ar": 22.578738425926,
"name": "Moon",
"category": "planets",
"hlatRad": -1.3089969389957E-4,
"age": 27,
"status": true
},
"mars": {
"distsun": 2.2963710671492E8,
"altitude": 27.808183248664,
"circumstances": {
"transit_local": 15.80120694427,
"raise_ut": 1.5517741680418E9,
"set": 23.222402283833,
"raise_local": 8.3800116047075,
"visibility": "partial",
"azimuth_set": 286.34760861411,
"LSTs": 10.11640394619,
"LSTr": 19.233376146402,
"set_local": 23.222402283833,
"azimuth_rise": 73.652391385888,
"GSTs": 10.11640394619,
"GSTr": 19.233376146402,
"transit_ut": 1.551800884345E9,
"transit": 15.80120694427,
"raise": 8.3800116047075,
"set_ut": 1.5518276006482E9,
"transit_height": 54.867608614112
},
"dec": 16.347608614112,
"constell": "Ari",
"timezone": "Europe/London",
"hlat": "0.8142",
"hlong": "75.6345",
"elongation": "58.1",
"lastdate": "2019-Mar-05 00:00",
"hx": 0.36958631955143,
"ar": 2.6748900462963,
"hy": 1.4897081109635,
"mag": "1.23",
"hlongRad": 1.3200710530997,
"hz": 0.022145899657793,
"utc_time": 1551816600,
"distearth": 2.704192732295E8,
"name": "Mars",
"sot": 58.1002,
"id": "mars",
"category": "planets",
"hlatRad": 0.014210470769738,
"status": true
},
"sun": {
"distsun": 0,
"altitude": -22.992657046501,
"circumstances": {
"transit_local": 12.176106019167,
"raise_ut": 1.551767861711E9,
"set": 17.739026911053,
"raise_local": 6.6282530456618,
"visibility": "partial",
"azimuth_set": 263.93596334029,
"LSTs": 4.618015588543,
"LSTr": 17.476821431166,
"set_local": 17.739026911053,
"azimuth_rise": 96.242086753282,
"GSTs": 4.618015588543,
"GSTr": 17.476821431166,
"transit_ut": 1.5517878339817E9,
"transit": 12.176106019167,
"raise": 6.6282530456618,
"set_ut": 1.5518078604969E9,
"transit_height": 32.366908597329
},
"dec": -6.0242450863769,
"constell": "Aqr",
"timezone": "Europe/London",
"hlat": "n.a.",
"hlong": "n.a.",
"elongation": 0,
"lastdate": "2019-Mar-05 00:00",
"hx": 0,
"ar": 23.060617283951,
"hy": 0,
"mag": "-26.76",
"hlongRad": null,
"hz": 0,
"utc_time": 1551816600,
"distearth": 1.4838474994878E8,
"name": "Sun",
"sot": 0,
"id": "sun",
"category": "planets",
"hlatRad": null,
"status": true
}
},
"target": "sun"
}
我不确定这是否有用,但这里有一个 Python 实现。您需要自行确定可接受的位置参数,但日期、小时和分钟参数都已包含在请求中:
import requests

# Query theskylive.com's planetarium endpoint for the Moon at a given
# date/time and observer location, then print the Moon's record.
url = 'https://theskylive.com/planetariumdata'
params = {
    'obj': 'moon',
    'h': '10',
    'm': '30',
    'date': '2019-02-28',
    # Pipe-separated observer description, copied from the site's own request.
    # NOTE(review): the accepted values for this field are undocumented here.
    'localdata': '51.48|0|Greenwich, United Kingdom|Europe/London',
}
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'}

# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url, params=params, headers=headers, timeout=10)
# Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
response.raise_for_status()
data = response.json()
print(data['objects']['moon'])
输出:
{'status': True, 'utc_time': 1551349800, 'ar': 18.114288194444, 'dec': -21.301003146701, 'mag': '-9.11', 'distsun': 148031243.76562, 'distearth': 399053.81054688, 'constell': 'Oph', 'sot': 292.2907375, 'lastdate': '2019-Feb-28 00:00', 'hlong': '158.9866', 'hlongRad': 2.7748396365512, 'hlat': '0.0060', 'hlatRad': 0.00010471975511966, 'hx': -0.92639216172362, 'hy': 0.34779595586615, 'hz': 8.4403227939488e-05, 'elongation': '67.7', 'altitude': 7.7566655880485, 'id': 'moon', 'name': 'Moon', 'category': 'planets', 'circumstances': {'visibility': 'partial', 'raise': 3.0419974882059, 'set': 11.875359660362, 'transit': 7.4771821984014, 'raise_ut': 1551322951.191, 'set_ut': 1551354751.2948, 'transit_ut': 1551338917.8559, 'transit_height': 17.341269275926, 'azimuth_rise': 110.98610232928, 'azimuth_set': 248.66063774998, 'LSTr': 13.552197907652, 'LSTs': 22.409745024956, 'GSTr': 13.552197907652, 'GSTs': 22.409745024956, 'raise_local': 3.0419974882059, 'set_local': 11.875359660362, 'transit_local': 7.4771821984014}, 'timezone': 'Europe/London', 'age': 23}
{
"utc_seconds": 1551816600,
"utc_timestamp": "201903052010",
"objects": {
"moon": {
"distsun": 1.479847408587E8,
"altitude": -32.421642244539,
"dec": -12.501182812768,
"constell": "Cap",
"timezone": "Europe/London",
"hlat": "-0.0075",
"hlong": "163.9072",
"elongation": "9.6",
"lastdate": "2019-Mar-05 00:00",
"hx": -0.95427043393163,
"hy": 0.26061067578779,
"mag": "-4.82",
"hlongRad": 2.8607203077248,
"hz": -1.6343451194632E-4,
"utc_time": 1551816600,
"distearth": 405722.20937018,
"sot": 350.29647638889,
"id": "moon",
"circumstances": {
"transit_local": 11.428494722983,
"raise_ut": 1.5517668981849E9,
"set": 16.623858118962,
"raise_local": 6.3606069281934,
"visibility": "partial",
"azimuth_set": 256.90380469917,
"LSTs": 3.4997935653561,
"LSTr": 17.208442522882,
"set_local": 16.623858118962,
"azimuth_rise": 104.50312047906,
"GSTs": 3.4997935653561,
"GSTr": 17.208442522882,
"transit_ut": 1.551785142581E9,
"transit": 11.428494722983,
"raise": 6.3606069281934,
"set_ut": 1.5518038458892E9,
"transit_height": 24.710020581601
},
"ar": 22.578738425926,
"name": "Moon",
"category": "planets",
"hlatRad": -1.3089969389957E-4,
"age": 27,
"status": true
},
"mars": {
"distsun": 2.2963710671492E8,
"altitude": 27.808183248664,
"circumstances": {
"transit_local": 15.80120694427,
"raise_ut": 1.5517741680418E9,
"set": 23.222402283833,
"raise_local": 8.3800116047075,
"visibility": "partial",
"azimuth_set": 286.34760861411,
"LSTs": 10.11640394619,
"LSTr": 19.233376146402,
"set_local": 23.222402283833,
"azimuth_rise": 73.652391385888,
"GSTs": 10.11640394619,
"GSTr": 19.233376146402,
"transit_ut": 1.551800884345E9,
"transit": 15.80120694427,
"raise": 8.3800116047075,
"set_ut": 1.5518276006482E9,
"transit_height": 54.867608614112
},
"dec": 16.347608614112,
"constell": "Ari",
"timezone": "Europe/London",
"hlat": "0.8142",
"hlong": "75.6345",
"elongation": "58.1",
"lastdate": "2019-Mar-05 00:00",
"hx": 0.36958631955143,
"ar": 2.6748900462963,
"hy": 1.4897081109635,
"mag": "1.23",
"hlongRad": 1.3200710530997,
"hz": 0.022145899657793,
"utc_time": 1551816600,
"distearth": 2.704192732295E8,
"name": "Mars",
"sot": 58.1002,
"id": "mars",
"category": "planets",
"hlatRad": 0.014210470769738,
"status": true
},
"sun": {
"distsun": 0,
"altitude": -22.992657046501,
"circumstances": {
"transit_local": 12.176106019167,
"raise_ut": 1.551767861711E9,
"set": 17.739026911053,
"raise_local": 6.6282530456618,
"visibility": "partial",
"azimuth_set": 263.93596334029,
"LSTs": 4.618015588543,
"LSTr": 17.476821431166,
"set_local": 17.739026911053,
"azimuth_rise": 96.242086753282,
"GSTs": 4.618015588543,
"GSTr": 17.476821431166,
"transit_ut": 1.5517878339817E9,
"transit": 12.176106019167,
"raise": 6.6282530456618,
"set_ut": 1.5518078604969E9,
"transit_height": 32.366908597329
},
"dec": -6.0242450863769,
"constell": "Aqr",
"timezone": "Europe/London",
"hlat": "n.a.",
"hlong": "n.a.",
"elongation": 0,
"lastdate": "2019-Mar-05 00:00",
"hx": 0,
"ar": 23.060617283951,
"hy": 0,
"mag": "-26.76",
"hlongRad": null,
"hz": 0,
"utc_time": 1551816600,
"distearth": 1.4838474994878E8,
"name": "Sun",
"sot": 0,
"id": "sun",
"category": "planets",
"hlatRad": null,
"status": true
}
},
"target": "sun"
}
下面是用 Python 实现此功能的代码。将这段代码整合到应用程序中的方式有很多。为了便于说明,我将结果转换为 pandas DataFrame,以便您查看结果。我还添加了一些处理代理设置的代码;如果您不需要代理,可以省略这部分,直接使用 requests 包获取页面文本。希望能有帮助
import html
import urllib.request

import pandas as pd

username = 'userID'  # ex. ID
password = "password!"  # password
dataURL = "https://theskylive.com/moon-info"
proxies = {
    'https': 'https://{}:{}@proxy:port'.format(username, password)}

# Opening tags of the table cells that carry ephemeris values.
CELL_TAGS = ('<td class="desktop">', '<td>')
CELL_END = '</td>'
# Query-string fragment that precedes a YYYY-MM-DD date inside the table.
DATE_MARKER = 'moon&date='
cols = ['Right Ascension', 'Declination', 'Magnitude', 'Apparent Diameter', 'Constellation']


def _next_cell(text, start, end):
    """Return the index just past the earliest cell-opening tag, or -1.

    Both tag variants in CELL_TAGS are searched within text[start:end] and
    the one that occurs first wins.
    """
    best_pos = -1
    content_start = -1
    for tag in CELL_TAGS:
        pos = text.find(tag, start, end)
        if pos >= 0 and (best_pos < 0 or pos < best_pos):
            best_pos = pos
            content_start = pos + len(tag)
    return content_start


def parse_moon_ephemeris(text):
    """Extract the ephemeris rows from the moon-info page HTML.

    Only the span between the 'The Moon Ephemeris' and
    'Distance of The Moon from Earth' markers is scanned.

    Returns:
        (start_date, rows): start_date is the 10 characters following the
        first 'moon&date=' inside the table span; rows is a list of
        5-element lists ordered like ``cols``. Returns (None, []) when the
        markers or the date link are missing. A trailing row with fewer
        than five cells is discarded.
    """
    table_start = text.find('The Moon Ephemeris')
    table_end = text.find('Distance of The Moon from Earth')
    if table_start < 0 or table_end < 0:
        return None, []

    marker = text.find(DATE_MARKER, table_start, table_end)
    if marker < 0:
        return None, []
    date_at = marker + len(DATE_MARKER)
    start_date = text[date_at:date_at + 10]

    rows = []
    row = []
    cursor = table_start
    while True:
        content_start = _next_cell(text, cursor, table_end)
        if content_start < 0:
            break
        content_end = text.find(CELL_END, content_start, table_end)
        if content_end < 0:
            break

        # Decode entities first so '&deg;' and a literal degree sign are
        # handled the same way by the replacements below.
        value = html.unescape(text[content_start:content_end])
        column = len(row)
        if column == 1:
            # Declination: spell out degree / arc-minute / arc-second marks.
            value = value.replace('\u00b0', ' deg')
            value = value.replace('\u2019', ' min')
            value = value.replace('\u201d', ' sec')
        elif column == 3:
            # Apparent diameter is given in arc-seconds.
            value = value.replace('\u201d', ' arcsec')
        row.append(value)
        cursor = content_end

        if len(row) == len(cols):
            rows.append(row)
            row = []
    return start_date, rows


def build_frame(start_date, rows):
    """Build a DataFrame of the parsed rows indexed by consecutive days.

    NOTE(review): assumes the first 'moon&date=' link carries the date of
    the first table row and that rows are one day apart - confirm against
    the live page.
    """
    if not rows:
        return pd.DataFrame(columns=cols)
    idx = pd.date_range(start=start_date, periods=len(rows), freq='D')
    return pd.DataFrame(rows, index=idx, columns=cols)


def main():
    """Download the page through the configured proxy and print the table."""
    proxy = urllib.request.ProxyHandler(proxies)
    opener = urllib.request.build_opener(proxy)
    urllib.request.install_opener(opener)
    with urllib.request.urlopen(dataURL) as url:
        # Decode the bytes properly; str(bytes) would yield the b'...' repr.
        charset = url.headers.get_content_charset() or 'utf-8'
        text = url.read().decode(charset, errors='replace')
    start_date, rows = parse_moon_ephemeris(text)
    Data = build_frame(start_date, rows)
    print(Data)


if __name__ == '__main__':
    main()
导入urllib
作为pd进口熊猫
将numpy作为np导入
用户名='userID'#例如ID
password=“password!”#密码
数据URL=”https://theskylive.com/moon-info"
代理={
'https':'https://{}:{}@proxy:port'。格式(用户名、密码)}
proxy=urllib.request.ProxyHandler(代理)
opener=urllib.request.build\u opener(代理)
urllib.request.install_opener(opener)
使用urllib.request.urlopen(dataURL)作为url:
text=str(url.read())
tableStart=text.find(‘月球星历’)
tableEnd=text.find(‘月球到地球的距离’)
tableProgress=tableStart
findSTR='moon&date='
loc=text.find(findSTR、tableStart、tableEnd)
startDate=文本[loc+len(findSTR):loc+len(findSTR)+10]
表=[]
tableRow=[]
计数器=0
计数器2=0
差异=[20,4]
当loc>0时:
loc1=text.find(“”,tableProgress,tableEnd)
loc2=text.find(“”,tableProgress,tableEnd)
如果是10:
extractedText=文本[locStart:loc]
如果计数器==1:
extractedText=extractedText.replace(“°;”,“°”)
extractedText=extractedText.replace(“&rsquo;”,“min”)
extractedText=extractedText.replace(“&rdquo;”,“sec”)
elif计数器==3:
extractedText=extractedText.replace(“&rdquo;”,“arcsec”)
tableRow=tableRow+[extractedText]
tableProgress=loc
计数器=计数器+1
如果计数器==5:
计数器2=计数器2+1
计数器=0
表=表+[tableRow]
tableRow=[]
idx=pd.日期\范围(开始时间=2019-02-26',期间=len(表),频率=D')
cols=[‘赤经’、‘赤纬’、‘星等’、‘视直径’、‘星座’]
Data=pd.DataFrame(表,索引=idx,列=cols)
打印(数据)
下面是用 Python 实现此功能的代码。将这段代码整合到应用程序中的方式有很多。为了便于说明,我将结果转换为 pandas DataFrame,以便您查看结果。我还添加了一些处理代理设置的代码;如果您不需要代理,可以省略这部分,直接使用 requests 包获取页面文本。
希望能有帮助
import html
import urllib.request

import pandas as pd

username = 'userID'  # ex. ID
password = "password!"  # password
dataURL = "https://theskylive.com/moon-info"
proxies = {
    'https': 'https://{}:{}@proxy:port'.format(username, password)}

# Opening tags of the table cells that carry ephemeris values.
CELL_TAGS = ('<td class="desktop">', '<td>')
CELL_END = '</td>'
# Query-string fragment that precedes a YYYY-MM-DD date inside the table.
DATE_MARKER = 'moon&date='
cols = ['Right Ascension', 'Declination', 'Magnitude', 'Apparent Diameter', 'Constellation']


def _next_cell(text, start, end):
    """Return the index just past the earliest cell-opening tag, or -1.

    Both tag variants in CELL_TAGS are searched within text[start:end] and
    the one that occurs first wins.
    """
    best_pos = -1
    content_start = -1
    for tag in CELL_TAGS:
        pos = text.find(tag, start, end)
        if pos >= 0 and (best_pos < 0 or pos < best_pos):
            best_pos = pos
            content_start = pos + len(tag)
    return content_start


def parse_moon_ephemeris(text):
    """Extract the ephemeris rows from the moon-info page HTML.

    Only the span between the 'The Moon Ephemeris' and
    'Distance of The Moon from Earth' markers is scanned.

    Returns:
        (start_date, rows): start_date is the 10 characters following the
        first 'moon&date=' inside the table span; rows is a list of
        5-element lists ordered like ``cols``. Returns (None, []) when the
        markers or the date link are missing. A trailing row with fewer
        than five cells is discarded.
    """
    table_start = text.find('The Moon Ephemeris')
    table_end = text.find('Distance of The Moon from Earth')
    if table_start < 0 or table_end < 0:
        return None, []

    marker = text.find(DATE_MARKER, table_start, table_end)
    if marker < 0:
        return None, []
    date_at = marker + len(DATE_MARKER)
    start_date = text[date_at:date_at + 10]

    rows = []
    row = []
    cursor = table_start
    while True:
        content_start = _next_cell(text, cursor, table_end)
        if content_start < 0:
            break
        content_end = text.find(CELL_END, content_start, table_end)
        if content_end < 0:
            break

        # Decode entities first so '&deg;' and a literal degree sign are
        # handled the same way by the replacements below.
        value = html.unescape(text[content_start:content_end])
        column = len(row)
        if column == 1:
            # Declination: spell out degree / arc-minute / arc-second marks.
            value = value.replace('\u00b0', ' deg')
            value = value.replace('\u2019', ' min')
            value = value.replace('\u201d', ' sec')
        elif column == 3:
            # Apparent diameter is given in arc-seconds.
            value = value.replace('\u201d', ' arcsec')
        row.append(value)
        cursor = content_end

        if len(row) == len(cols):
            rows.append(row)
            row = []
    return start_date, rows


def build_frame(start_date, rows):
    """Build a DataFrame of the parsed rows indexed by consecutive days.

    NOTE(review): assumes the first 'moon&date=' link carries the date of
    the first table row and that rows are one day apart - confirm against
    the live page.
    """
    if not rows:
        return pd.DataFrame(columns=cols)
    idx = pd.date_range(start=start_date, periods=len(rows), freq='D')
    return pd.DataFrame(rows, index=idx, columns=cols)


def main():
    """Download the page through the configured proxy and print the table."""
    proxy = urllib.request.ProxyHandler(proxies)
    opener = urllib.request.build_opener(proxy)
    urllib.request.install_opener(opener)
    with urllib.request.urlopen(dataURL) as url:
        # Decode the bytes properly; str(bytes) would yield the b'...' repr.
        charset = url.headers.get_content_charset() or 'utf-8'
        text = url.read().decode(charset, errors='replace')
    start_date, rows = parse_moon_ephemeris(text)
    Data = build_frame(start_date, rows)
    print(Data)


if __name__ == '__main__':
    main()
导入urllib
作为pd进口熊猫
将numpy作为np导入
用户名='userID'#例如ID
password=“password!”#密码
数据URL=”https://theskylive.com/moon-info"
代理={
'https':'https://{}:{}@proxy:port'。格式(用户名、密码)}
proxy=urllib.request.ProxyHandler(代理)
opener=urllib.request.build\u opener(代理)
urllib.request.install_opener(opener)
使用urllib.request.urlopen(dataURL)作为url:
text=str(url.read())
tableStart=text.find(‘月球星历’)
tableEnd=text.find(‘月球到地球的距离’)
tableProgress=tableStart
findSTR='moon&date='
loc=text.find(findSTR、tableStart、tableEnd)
startDate=文本[loc+len(findSTR):loc+len(findSTR)+10]
表=[]
tableRow=[]
计数器=0
计数器2=0
差异=[20,4]
当loc>0时:
loc1=text.find(“”,tableProgress,tableEnd)
loc2=text.find(“”,tableProgress,tableEnd)
如果是10:
extractedText=文本[locStart:loc]
如果计数器==1:
extractedText=extractedText.replace(“°;”,“°”)
extractedText=extractedText.replace(“&rsquo;”,“min”)
extractedText=extractedText.replace(“&rdquo;”,“sec”)
elif计数器==3:
extractedText=extractedText.replace(“&rdquo;”,“arcsec”)
tableRow=tableRow+[extractedText]
tableProgress=loc
计数器=计数器+1
如果计数器==5:
计数器2=计数器2+1
计数器=0
表=表+[tableRow]
tableRow=[]
idx=pd.日期\范围(开始时间=2019-02-26',期间=len(表),频率=D')
cols=[‘赤经’、‘赤纬’、‘星等’、‘视直径’、‘星座’]
Data=pd.DataFrame(表,索引=idx,列=cols)
打印(数据)
我不是安卓专家,但这是你可以做的
build.gradle
plugins {
    id 'java'
}
group 'test.test'
version '1.0-SNAPSHOT'
sourceCompatibility = 1.8
repositories {
    mavenCentral()
}
dependencies {
    testCompile group: 'junit', name: 'junit', version: '4.12'
    implementation 'com.squareup.okhttp3:okhttp:3.13.1'
    compile group: 'org.json', name: 'json', version: '20180813'
}
天文馆.java
import-okhttp3.OkHttpCli