Web scraping 如何从网页中提取特定文本

Web scraping 如何从网页中提取特定文本,web-scraping,Web Scraping,我对寻找星座的模式感兴趣。我正在使用“天空地图”android应用程序进行视觉检查,现在我想构建一个应用程序来查找类似的星座结构。其中的一个子问题是找到特定天体的坐标 示例:如何获取给定时间、日期和位置的“月亮”坐标 以以下方式在其网页上提供此信息 Object: Moon [info|live][less] Right Asc: 04h 15m 12.5s **Decl: 17° 05' 46.3"** (J2000) [HMS|Dec] Magnitude: -10.54 Altitu

我对寻找星座的模式感兴趣。我正在使用“天空地图”android应用程序进行视觉检查,现在我想构建一个应用程序来查找类似的星座结构。其中的一个子问题是找到特定天体的坐标

示例:如何获取给定时间、日期和位置的“月亮”坐标

以以下方式在其网页上提供此信息

 Object: Moon [info|live][less]
 Right Asc: 04h 15m 12.5s **Decl: 17° 05' 46.3"** (J2000) [HMS|Dec]
 Magnitude: -10.54 Altitude: 56° Solar Elongation: 100.4° Constellation: Ari 
 Sun distance: 147.77 Million Km Earth distance: 0.38 Million Km
 Rise: 10:48 Transit: 18:40 Set: 01:35 **Europe/London**

对于月球,我们可以使用网页找到坐标,是否有一些API?或者我们如何通过从网页中提取坐标信息来实现它。

我不确定这是否有用,但这里有一个 Python 实现。您需要自己弄清楚位置(localdata)参数可接受的取值,不过日期、小时和分钟参数都已经包含在内:

import requests

# Endpoint queried for the planetarium data; the reply is parsed as JSON below.
url = 'https://theskylive.com/planetariumdata'

# Query-string parameters: target object, hour, minute and date.
# NOTE(review): 'localdata' looks like "latitude|longitude|place|timezone";
# the accepted values are not documented here -- confirm against the site.
params = {
    'obj': 'moon',
    'h': '10',
    'm': '30',
    'date': '2019-02-28',
    'localdata': '51.48|0|Greenwich, United Kingdom|Europe/London',
}

# Browser-like User-Agent sent with the request.
# NOTE(review): presumably needed because the site rejects the default
# client User-Agent -- unverified.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'}

# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() surfaces HTTP errors here instead of as a confusing
# JSON decoding failure on an error page.
reply = requests.get(url, params=params, headers=headers, timeout=30)
reply.raise_for_status()
response = reply.json()

print(response['objects']['moon'])
输出:

{'status': True, 'utc_time': 1551349800, 'ar': 18.114288194444, 'dec': -21.301003146701, 'mag': '-9.11', 'distsun': 148031243.76562, 'distearth': 399053.81054688, 'constell': 'Oph', 'sot': 292.2907375, 'lastdate': '2019-Feb-28 00:00', 'hlong': '158.9866', 'hlongRad': 2.7748396365512, 'hlat': '0.0060', 'hlatRad': 0.00010471975511966, 'hx': -0.92639216172362, 'hy': 0.34779595586615, 'hz': 8.4403227939488e-05, 'elongation': '67.7', 'altitude': 7.7566655880485, 'id': 'moon', 'name': 'Moon', 'category': 'planets', 'circumstances': {'visibility': 'partial', 'raise': 3.0419974882059, 'set': 11.875359660362, 'transit': 7.4771821984014, 'raise_ut': 1551322951.191, 'set_ut': 1551354751.2948, 'transit_ut': 1551338917.8559, 'transit_height': 17.341269275926, 'azimuth_rise': 110.98610232928, 'azimuth_set': 248.66063774998, 'LSTr': 13.552197907652, 'LSTs': 22.409745024956, 'GSTr': 13.552197907652, 'GSTs': 22.409745024956, 'raise_local': 3.0419974882059, 'set_local': 11.875359660362, 'transit_local': 7.4771821984014}, 'timezone': 'Europe/London', 'age': 23}
{
  "utc_seconds": 1551816600,
  "utc_timestamp": "201903052010",
  "objects": {
    "moon": {
      "distsun": 1.479847408587E8,
      "altitude": -32.421642244539,
      "dec": -12.501182812768,
      "constell": "Cap",
      "timezone": "Europe/London",
      "hlat": "-0.0075",
      "hlong": "163.9072",
      "elongation": "9.6",
      "lastdate": "2019-Mar-05 00:00",
      "hx": -0.95427043393163,
      "hy": 0.26061067578779,
      "mag": "-4.82",
      "hlongRad": 2.8607203077248,
      "hz": -1.6343451194632E-4,
      "utc_time": 1551816600,
      "distearth": 405722.20937018,
      "sot": 350.29647638889,
      "id": "moon",
      "circumstances": {
        "transit_local": 11.428494722983,
        "raise_ut": 1.5517668981849E9,
        "set": 16.623858118962,
        "raise_local": 6.3606069281934,
        "visibility": "partial",
        "azimuth_set": 256.90380469917,
        "LSTs": 3.4997935653561,
        "LSTr": 17.208442522882,
        "set_local": 16.623858118962,
        "azimuth_rise": 104.50312047906,
        "GSTs": 3.4997935653561,
        "GSTr": 17.208442522882,
        "transit_ut": 1.551785142581E9,
        "transit": 11.428494722983,
        "raise": 6.3606069281934,
        "set_ut": 1.5518038458892E9,
        "transit_height": 24.710020581601
      },
      "ar": 22.578738425926,
      "name": "Moon",
      "category": "planets",
      "hlatRad": -1.3089969389957E-4,
      "age": 27,
      "status": true
    },
    "mars": {
      "distsun": 2.2963710671492E8,
      "altitude": 27.808183248664,
      "circumstances": {
        "transit_local": 15.80120694427,
        "raise_ut": 1.5517741680418E9,
        "set": 23.222402283833,
        "raise_local": 8.3800116047075,
        "visibility": "partial",
        "azimuth_set": 286.34760861411,
        "LSTs": 10.11640394619,
        "LSTr": 19.233376146402,
        "set_local": 23.222402283833,
        "azimuth_rise": 73.652391385888,
        "GSTs": 10.11640394619,
        "GSTr": 19.233376146402,
        "transit_ut": 1.551800884345E9,
        "transit": 15.80120694427,
        "raise": 8.3800116047075,
        "set_ut": 1.5518276006482E9,
        "transit_height": 54.867608614112
      },
      "dec": 16.347608614112,
      "constell": "Ari",
      "timezone": "Europe/London",
      "hlat": "0.8142",
      "hlong": "75.6345",
      "elongation": "58.1",
      "lastdate": "2019-Mar-05 00:00",
      "hx": 0.36958631955143,
      "ar": 2.6748900462963,
      "hy": 1.4897081109635,
      "mag": "1.23",
      "hlongRad": 1.3200710530997,
      "hz": 0.022145899657793,
      "utc_time": 1551816600,
      "distearth": 2.704192732295E8,
      "name": "Mars",
      "sot": 58.1002,
      "id": "mars",
      "category": "planets",
      "hlatRad": 0.014210470769738,
      "status": true
    },
    "sun": {
      "distsun": 0,
      "altitude": -22.992657046501,
      "circumstances": {
        "transit_local": 12.176106019167,
        "raise_ut": 1.551767861711E9,
        "set": 17.739026911053,
        "raise_local": 6.6282530456618,
        "visibility": "partial",
        "azimuth_set": 263.93596334029,
        "LSTs": 4.618015588543,
        "LSTr": 17.476821431166,
        "set_local": 17.739026911053,
        "azimuth_rise": 96.242086753282,
        "GSTs": 4.618015588543,
        "GSTr": 17.476821431166,
        "transit_ut": 1.5517878339817E9,
        "transit": 12.176106019167,
        "raise": 6.6282530456618,
        "set_ut": 1.5518078604969E9,
        "transit_height": 32.366908597329
      },
      "dec": -6.0242450863769,
      "constell": "Aqr",
      "timezone": "Europe/London",
      "hlat": "n.a.",
      "hlong": "n.a.",
      "elongation": 0,
      "lastdate": "2019-Mar-05 00:00",
      "hx": 0,
      "ar": 23.060617283951,
      "hy": 0,
      "mag": "-26.76",
      "hlongRad": null,
      "hz": 0,
      "utc_time": 1551816600,
      "distearth": 1.4838474994878E8,
      "name": "Sun",
      "sot": 0,
      "id": "sun",
      "category": "planets",
      "hlatRad": null,
      "status": true
    }
  },
  "target": "sun"
}

我不确定这是否有用,但这里有一个 Python 实现。您需要自己弄清楚位置(localdata)参数可接受的取值,不过日期、小时和分钟参数都已经包含在内:

import requests

# Endpoint queried for the planetarium data; the reply is parsed as JSON below.
url = 'https://theskylive.com/planetariumdata'

# Query-string parameters: target object, hour, minute and date.
# NOTE(review): 'localdata' looks like "latitude|longitude|place|timezone";
# the accepted values are not documented here -- confirm against the site.
params = {
    'obj': 'moon',
    'h': '10',
    'm': '30',
    'date': '2019-02-28',
    'localdata': '51.48|0|Greenwich, United Kingdom|Europe/London',
}

# Browser-like User-Agent sent with the request.
# NOTE(review): presumably needed because the site rejects the default
# client User-Agent -- unverified.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'}

# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() surfaces HTTP errors here instead of as a confusing
# JSON decoding failure on an error page.
reply = requests.get(url, params=params, headers=headers, timeout=30)
reply.raise_for_status()
response = reply.json()

print(response['objects']['moon'])
输出:

{'status': True, 'utc_time': 1551349800, 'ar': 18.114288194444, 'dec': -21.301003146701, 'mag': '-9.11', 'distsun': 148031243.76562, 'distearth': 399053.81054688, 'constell': 'Oph', 'sot': 292.2907375, 'lastdate': '2019-Feb-28 00:00', 'hlong': '158.9866', 'hlongRad': 2.7748396365512, 'hlat': '0.0060', 'hlatRad': 0.00010471975511966, 'hx': -0.92639216172362, 'hy': 0.34779595586615, 'hz': 8.4403227939488e-05, 'elongation': '67.7', 'altitude': 7.7566655880485, 'id': 'moon', 'name': 'Moon', 'category': 'planets', 'circumstances': {'visibility': 'partial', 'raise': 3.0419974882059, 'set': 11.875359660362, 'transit': 7.4771821984014, 'raise_ut': 1551322951.191, 'set_ut': 1551354751.2948, 'transit_ut': 1551338917.8559, 'transit_height': 17.341269275926, 'azimuth_rise': 110.98610232928, 'azimuth_set': 248.66063774998, 'LSTr': 13.552197907652, 'LSTs': 22.409745024956, 'GSTr': 13.552197907652, 'GSTs': 22.409745024956, 'raise_local': 3.0419974882059, 'set_local': 11.875359660362, 'transit_local': 7.4771821984014}, 'timezone': 'Europe/London', 'age': 23}
{
  "utc_seconds": 1551816600,
  "utc_timestamp": "201903052010",
  "objects": {
    "moon": {
      "distsun": 1.479847408587E8,
      "altitude": -32.421642244539,
      "dec": -12.501182812768,
      "constell": "Cap",
      "timezone": "Europe/London",
      "hlat": "-0.0075",
      "hlong": "163.9072",
      "elongation": "9.6",
      "lastdate": "2019-Mar-05 00:00",
      "hx": -0.95427043393163,
      "hy": 0.26061067578779,
      "mag": "-4.82",
      "hlongRad": 2.8607203077248,
      "hz": -1.6343451194632E-4,
      "utc_time": 1551816600,
      "distearth": 405722.20937018,
      "sot": 350.29647638889,
      "id": "moon",
      "circumstances": {
        "transit_local": 11.428494722983,
        "raise_ut": 1.5517668981849E9,
        "set": 16.623858118962,
        "raise_local": 6.3606069281934,
        "visibility": "partial",
        "azimuth_set": 256.90380469917,
        "LSTs": 3.4997935653561,
        "LSTr": 17.208442522882,
        "set_local": 16.623858118962,
        "azimuth_rise": 104.50312047906,
        "GSTs": 3.4997935653561,
        "GSTr": 17.208442522882,
        "transit_ut": 1.551785142581E9,
        "transit": 11.428494722983,
        "raise": 6.3606069281934,
        "set_ut": 1.5518038458892E9,
        "transit_height": 24.710020581601
      },
      "ar": 22.578738425926,
      "name": "Moon",
      "category": "planets",
      "hlatRad": -1.3089969389957E-4,
      "age": 27,
      "status": true
    },
    "mars": {
      "distsun": 2.2963710671492E8,
      "altitude": 27.808183248664,
      "circumstances": {
        "transit_local": 15.80120694427,
        "raise_ut": 1.5517741680418E9,
        "set": 23.222402283833,
        "raise_local": 8.3800116047075,
        "visibility": "partial",
        "azimuth_set": 286.34760861411,
        "LSTs": 10.11640394619,
        "LSTr": 19.233376146402,
        "set_local": 23.222402283833,
        "azimuth_rise": 73.652391385888,
        "GSTs": 10.11640394619,
        "GSTr": 19.233376146402,
        "transit_ut": 1.551800884345E9,
        "transit": 15.80120694427,
        "raise": 8.3800116047075,
        "set_ut": 1.5518276006482E9,
        "transit_height": 54.867608614112
      },
      "dec": 16.347608614112,
      "constell": "Ari",
      "timezone": "Europe/London",
      "hlat": "0.8142",
      "hlong": "75.6345",
      "elongation": "58.1",
      "lastdate": "2019-Mar-05 00:00",
      "hx": 0.36958631955143,
      "ar": 2.6748900462963,
      "hy": 1.4897081109635,
      "mag": "1.23",
      "hlongRad": 1.3200710530997,
      "hz": 0.022145899657793,
      "utc_time": 1551816600,
      "distearth": 2.704192732295E8,
      "name": "Mars",
      "sot": 58.1002,
      "id": "mars",
      "category": "planets",
      "hlatRad": 0.014210470769738,
      "status": true
    },
    "sun": {
      "distsun": 0,
      "altitude": -22.992657046501,
      "circumstances": {
        "transit_local": 12.176106019167,
        "raise_ut": 1.551767861711E9,
        "set": 17.739026911053,
        "raise_local": 6.6282530456618,
        "visibility": "partial",
        "azimuth_set": 263.93596334029,
        "LSTs": 4.618015588543,
        "LSTr": 17.476821431166,
        "set_local": 17.739026911053,
        "azimuth_rise": 96.242086753282,
        "GSTs": 4.618015588543,
        "GSTr": 17.476821431166,
        "transit_ut": 1.5517878339817E9,
        "transit": 12.176106019167,
        "raise": 6.6282530456618,
        "set_ut": 1.5518078604969E9,
        "transit_height": 32.366908597329
      },
      "dec": -6.0242450863769,
      "constell": "Aqr",
      "timezone": "Europe/London",
      "hlat": "n.a.",
      "hlong": "n.a.",
      "elongation": 0,
      "lastdate": "2019-Mar-05 00:00",
      "hx": 0,
      "ar": 23.060617283951,
      "hy": 0,
      "mag": "-26.76",
      "hlongRad": null,
      "hz": 0,
      "utc_time": 1551816600,
      "distearth": 1.4838474994878E8,
      "name": "Sun",
      "sot": 0,
      "id": "sun",
      "category": "planets",
      "hlatRad": null,
      "status": true
    }
  },
  "target": "sun"
}

下面是在 Python 中实现这一点的代码。将这段代码集成到应用程序中有很多种方式。为了便于说明,我把结果放进了一个 pandas DataFrame,方便您查看。我还添加了一些处理代理设置的代码;如果您不需要代理,可以省略这部分,直接用 requests 包获取该 URL 的文本即可。

希望能有帮助

import urllib.request  # the submodule must be imported explicitly; bare `import urllib` is not enough

import numpy as np  # kept from the original snippet; no longer needed by the parsing code
import pandas as pd

username = 'userID'  # ex. ID
password = "password!"  # password

dataURL = "https://theskylive.com/moon-info"

# Opening tags of the table cells the parser looks for, and the closing tag.
CELL_TAGS = ('<td class="desktop">', '<td>')
CELL_END = '</td>'

# Column labels; their count also defines how many cells make up one row.
cols = ['Right Ascension', 'Declination', 'Magnitude', 'Apparent Diameter', 'Constellation']


def fetch_page(url, proxy_url=None, timeout=30):
    """Download *url* and return its body as decoded text.

    Args:
        url: Address of the page to fetch.
        proxy_url: Optional HTTPS proxy URL; when None no proxy handler is
            added.
        timeout: Seconds to wait before giving up on the connection.

    Returns:
        The response body decoded with the charset announced in the response
        headers, falling back to UTF-8.
    """
    handlers = []
    if proxy_url is not None:
        handlers.append(urllib.request.ProxyHandler({'https': proxy_url}))
    opener = urllib.request.build_opener(*handlers)
    with opener.open(url, timeout=timeout) as reply:
        charset = reply.headers.get_content_charset() or 'utf-8'
        # Decode the bytes: str(bytes) would return the "b'...'" repr with
        # escaped newlines and non-ASCII characters instead of the page text.
        return reply.read().decode(charset, errors='replace')


def extract_rows(text, start, end):
    """Collect the table cells found in ``text[start:end]`` into rows.

    Cells are located by scanning for the nearest of CELL_TAGS and the
    following CELL_END. Every ``len(cols)`` consecutive cells form one row;
    a trailing incomplete row is discarded.

    Args:
        text: Page source to scan.
        start: Index at which scanning begins.
        end: Index at which scanning stops.

    Returns:
        A list of rows, each a list of ``len(cols)`` cell strings.
    """
    rows = []
    row = []
    cursor = start
    while True:
        # Nearest opening tag of either kind; find() returns -1 when absent,
        # and index 0 is a legitimate hit.
        hits = [(text.find(tag, cursor, end), tag) for tag in CELL_TAGS]
        hits = [(where, tag) for where, tag in hits if where >= 0]
        if not hits:
            break
        cell_open, tag = min(hits)

        cell_close = text.find(CELL_END, cell_open, end)
        if cell_close < 0:
            break
        cell = text[cell_open + len(tag):cell_close]

        column = len(row)
        if column == 1:
            # Second column: spell out the degree / minute / second entities.
            cell = cell.replace('&deg;', ' deg')
            cell = cell.replace('&rsquo;', ' min')
            cell = cell.replace('&rdquo;', ' sec')
        elif column == 3:
            # Fourth column: the double-quote entity stands for arcseconds.
            cell = cell.replace('&rdquo;', ' arcsec')

        row.append(cell)
        cursor = cell_close
        if len(row) == len(cols):
            rows.append(row)
            row = []
    return rows


def build_frame(rows, start_date):
    """Return a DataFrame of *rows* indexed by consecutive days from *start_date*."""
    idx = pd.date_range(start=start_date, periods=len(rows), freq='D')
    return pd.DataFrame(rows, index=idx, columns=cols)


if __name__ == '__main__':
    # Proxy settings; drop proxy_url (pass None) when no proxy is required.
    proxy_url = 'https://{}:{}@proxy:port'.format(username, password)
    text = fetch_page(dataURL, proxy_url)

    # The table is searched for between these two page headings.
    tableStart = text.find('The Moon Ephemeris')
    tableEnd = text.find('Distance of The Moon from Earth')
    if tableStart < 0 or tableEnd < 0:
        raise ValueError('ephemeris table markers not found in the page')

    # The 10 characters after the first 'moon&date=' are used as the date of
    # the first row, replacing the previously hard-coded '2019-02-26'.
    # NOTE(review): assumes the link carries a YYYY-MM-DD date -- confirm.
    findSTR = 'moon&date='
    loc = text.find(findSTR, tableStart, tableEnd)
    if loc < 0:
        raise ValueError('start date link not found in the ephemeris table')
    startDate = text[loc + len(findSTR):loc + len(findSTR) + 10]

    table = extract_rows(text, tableStart, tableEnd)
    Data = build_frame(table, startDate)
    print(Data)
导入urllib
作为pd进口熊猫
将numpy作为np导入
用户名='userID'#例如ID
password=“password!”#密码
数据URL=”https://theskylive.com/moon-info"
代理={
'https':'https://{}:{}@proxy:port'。格式(用户名、密码)}
proxy=urllib.request.ProxyHandler(代理)
opener=urllib.request.build\u opener(代理)
urllib.request.install_opener(opener)
使用urllib.request.urlopen(dataURL)作为url:
text=str(url.read())
tableStart=text.find(‘月球星历’)
tableEnd=text.find(‘月球到地球的距离’)
tableProgress=tableStart
findSTR='moon&date='
loc=text.find(findSTR、tableStart、tableEnd)
startDate=文本[loc+len(findSTR):loc+len(findSTR)+10]
表=[]
tableRow=[]
计数器=0
计数器2=0
差异=[20,4]
当loc>0时:
loc1=text.find(“”,tableProgress,tableEnd)
loc2=text.find(“”,tableProgress,tableEnd)
如果是10:
extractedText=文本[locStart:loc]
如果计数器==1:
extractedText=extractedText.replace(“°;”,“°”)
extractedText=extractedText.replace(“&rsquo;”,“min”)
extractedText=extractedText.replace(“&rdquo;”,“sec”)
elif计数器==3:
extractedText=extractedText.replace(“&rdquo;”,“arcsec”)
tableRow=tableRow+[extractedText]
tableProgress=loc
计数器=计数器+1
如果计数器==5:
计数器2=计数器2+1
计数器=0
表=表+[tableRow]
tableRow=[]
idx=pd.日期\范围(开始时间=2019-02-26',期间=len(表),频率=D')
cols=[‘赤经’、‘赤纬’、‘星等’、‘视直径’、‘星座’]
Data=pd.DataFrame(表,索引=idx,列=cols)
打印(数据)

下面是在 Python 中实现这一点的代码。将这段代码集成到应用程序中有很多种方式。为了便于说明,我把结果放进了一个 pandas DataFrame,方便您查看。我还添加了一些处理代理设置的代码;如果您不需要代理,可以省略这部分,直接用 requests 包获取该 URL 的文本即可。

希望能有帮助

import urllib.request  # the submodule must be imported explicitly; bare `import urllib` is not enough

import numpy as np  # kept from the original snippet; no longer needed by the parsing code
import pandas as pd

username = 'userID'  # ex. ID
password = "password!"  # password

dataURL = "https://theskylive.com/moon-info"

# Opening tags of the table cells the parser looks for, and the closing tag.
CELL_TAGS = ('<td class="desktop">', '<td>')
CELL_END = '</td>'

# Column labels; their count also defines how many cells make up one row.
cols = ['Right Ascension', 'Declination', 'Magnitude', 'Apparent Diameter', 'Constellation']


def fetch_page(url, proxy_url=None, timeout=30):
    """Download *url* and return its body as decoded text.

    Args:
        url: Address of the page to fetch.
        proxy_url: Optional HTTPS proxy URL; when None no proxy handler is
            added.
        timeout: Seconds to wait before giving up on the connection.

    Returns:
        The response body decoded with the charset announced in the response
        headers, falling back to UTF-8.
    """
    handlers = []
    if proxy_url is not None:
        handlers.append(urllib.request.ProxyHandler({'https': proxy_url}))
    opener = urllib.request.build_opener(*handlers)
    with opener.open(url, timeout=timeout) as reply:
        charset = reply.headers.get_content_charset() or 'utf-8'
        # Decode the bytes: str(bytes) would return the "b'...'" repr with
        # escaped newlines and non-ASCII characters instead of the page text.
        return reply.read().decode(charset, errors='replace')


def extract_rows(text, start, end):
    """Collect the table cells found in ``text[start:end]`` into rows.

    Cells are located by scanning for the nearest of CELL_TAGS and the
    following CELL_END. Every ``len(cols)`` consecutive cells form one row;
    a trailing incomplete row is discarded.

    Args:
        text: Page source to scan.
        start: Index at which scanning begins.
        end: Index at which scanning stops.

    Returns:
        A list of rows, each a list of ``len(cols)`` cell strings.
    """
    rows = []
    row = []
    cursor = start
    while True:
        # Nearest opening tag of either kind; find() returns -1 when absent,
        # and index 0 is a legitimate hit.
        hits = [(text.find(tag, cursor, end), tag) for tag in CELL_TAGS]
        hits = [(where, tag) for where, tag in hits if where >= 0]
        if not hits:
            break
        cell_open, tag = min(hits)

        cell_close = text.find(CELL_END, cell_open, end)
        if cell_close < 0:
            break
        cell = text[cell_open + len(tag):cell_close]

        column = len(row)
        if column == 1:
            # Second column: spell out the degree / minute / second entities.
            cell = cell.replace('&deg;', ' deg')
            cell = cell.replace('&rsquo;', ' min')
            cell = cell.replace('&rdquo;', ' sec')
        elif column == 3:
            # Fourth column: the double-quote entity stands for arcseconds.
            cell = cell.replace('&rdquo;', ' arcsec')

        row.append(cell)
        cursor = cell_close
        if len(row) == len(cols):
            rows.append(row)
            row = []
    return rows


def build_frame(rows, start_date):
    """Return a DataFrame of *rows* indexed by consecutive days from *start_date*."""
    idx = pd.date_range(start=start_date, periods=len(rows), freq='D')
    return pd.DataFrame(rows, index=idx, columns=cols)


if __name__ == '__main__':
    # Proxy settings; drop proxy_url (pass None) when no proxy is required.
    proxy_url = 'https://{}:{}@proxy:port'.format(username, password)
    text = fetch_page(dataURL, proxy_url)

    # The table is searched for between these two page headings.
    tableStart = text.find('The Moon Ephemeris')
    tableEnd = text.find('Distance of The Moon from Earth')
    if tableStart < 0 or tableEnd < 0:
        raise ValueError('ephemeris table markers not found in the page')

    # The 10 characters after the first 'moon&date=' are used as the date of
    # the first row, replacing the previously hard-coded '2019-02-26'.
    # NOTE(review): assumes the link carries a YYYY-MM-DD date -- confirm.
    findSTR = 'moon&date='
    loc = text.find(findSTR, tableStart, tableEnd)
    if loc < 0:
        raise ValueError('start date link not found in the ephemeris table')
    startDate = text[loc + len(findSTR):loc + len(findSTR) + 10]

    table = extract_rows(text, tableStart, tableEnd)
    Data = build_frame(table, startDate)
    print(Data)
导入urllib
作为pd进口熊猫
将numpy作为np导入
用户名='userID'#例如ID
password=“password!”#密码
数据URL=”https://theskylive.com/moon-info"
代理={
'https':'https://{}:{}@proxy:port'。格式(用户名、密码)}
proxy=urllib.request.ProxyHandler(代理)
opener=urllib.request.build\u opener(代理)
urllib.request.install_opener(opener)
使用urllib.request.urlopen(dataURL)作为url:
text=str(url.read())
tableStart=text.find(‘月球星历’)
tableEnd=text.find(‘月球到地球的距离’)
tableProgress=tableStart
findSTR='moon&date='
loc=text.find(findSTR、tableStart、tableEnd)
startDate=文本[loc+len(findSTR):loc+len(findSTR)+10]
表=[]
tableRow=[]
计数器=0
计数器2=0
差异=[20,4]
当loc>0时:
loc1=text.find(“”,tableProgress,tableEnd)
loc2=text.find(“”,tableProgress,tableEnd)
如果是10:
extractedText=文本[locStart:loc]
如果计数器==1:
extractedText=extractedText.replace(“°;”,“°”)
extractedText=extractedText.replace(“&rsquo;”,“min”)
extractedText=extractedText.replace(“&rdquo;”,“sec”)
elif计数器==3:
extractedText=extractedText.replace(“&rdquo;”,“arcsec”)
tableRow=tableRow+[extractedText]
tableProgress=loc
计数器=计数器+1
如果计数器==5:
计数器2=计数器2+1
计数器=0
表=表+[tableRow]
tableRow=[]
idx=pd.日期\范围(开始时间=2019-02-26',期间=len(表),频率=D')
cols=[‘赤经’、‘赤纬’、‘星等’、‘视直径’、‘星座’]
Data=pd.DataFrame(表,索引=idx,列=cols)
打印(数据)

我不是安卓专家,但这是你可以做的

build.gradle

// Gradle build script restored from a machine-translated copy: DSL keywords
// and quote characters are back in their original (English / ASCII) form.
plugins {
    id 'java'
}

// NOTE(review): the group id was reconstructed from a translated string -- confirm.
group 'test.test'
version '1.0-SNAPSHOT'

sourceCompatibility = 1.8

repositories {
    mavenCentral()
}

dependencies {
    testCompile group: 'junit', name: 'junit', version: '4.12'
    // HTTP client used to call the web endpoint.
    implementation 'com.squareup.okhttp3:okhttp:3.13.1'
    // JSON parsing of the response.
    compile group: 'org.json', name: 'json', version: '20180813'
}
Planetarium.java

import-okhttp3.OkHttpCli