如何将 .HTM 文件作为数据框读入 R?

如何将in.HTM作为数据帧读入R?,r,data-science,data-manipulation,data-cleaning,R,Data Science,Data Manipulation,Data Cleaning,我对我的一些交易进行回溯测试,我有非常大的.HTM本地文件。它们大约是250mb一块,我很难将它们读入R来对数据集进行一些数据分析。我曾尝试将本地文件中的数据复制并粘贴到excel中,然后读取到R中,但该文件太大,我无法在网页或excel崩溃之前将所有数据成功复制到excel中 我正试图进行以下工作: 1.将.htm文件成功读入R 2.通过下表解析页面顶部的文本和页面中心的图像 3.将具有列和行的数据放入数据框中进行数据分析 有人有什么精明的想法来阅读这些数据吗?我尝试过使用包RCurl、rli

我正在对自己的一些交易进行回测,手头有一些非常大的
.HTM
本地文件。每个文件大约 250 MB,我很难将它们读入 R 来对数据集进行数据分析。我曾尝试把本地文件中的数据复制并粘贴到 Excel 中,再读入 R,但文件太大,在浏览器页面或 Excel 崩溃之前,我无法把所有数据都成功复制到 Excel 中

我正试图进行以下工作:

1.将
.htm
文件成功读入R

2.通过下表解析页面顶部的文本和页面中心的图像

3.将具有列和行的数据放入数据框中进行数据分析

有人有什么好办法来读取这些数据吗?我尝试过使用包
RCurl
rlist
XML
以及 readLines,但都没有成功

如有任何帮助/反馈,我将不胜感激

类似于:

# Packages attached for this script.
# - library() is used instead of require() so a missing package stops the
#   script immediately instead of returning FALSE and failing later.
# - plyr is attached before dplyr so that plyr does not mask dplyr functions
#   of the same name.
# - The former `require(plyer)` was a misspelling of plyr and has been folded
#   into the single plyr line below.
library(plyr)
library(dplyr)
library(data.table)
library(jsonlite)
library(httr)
library(rvest)
library(XML)


# Columns (and their order) selected by getOptionChainAsx() for its result.
COLORDER <- c(
  "symbol", "code", "type", "expiry", "strike", "premium",
  "bid", "ask", "volume", "open.interest", "retrieved"
)


# AUSTRALIAN OPTIONS --------------------------------------------------------------------------------------------------

# ASX is the Australian Securities Exchange.

# sprintf() template for the option-price page; "%s" receives the underlying code.
URLASX <- "http://www.asx.com.au/asx/markets/optionPrices.do?by=underlyingCode&underlyingCode=%s&expiryDate=&optionType=B"

#' Download the ASX option chain for one underlying and return it as a data frame.
#'
#' @param symbol Underlying code as a single, non-empty character string
#'   (for example "BHP").
#' @return A data frame restricted to the columns named in `COLORDER`, with rows
#'   ordered by type, strike and expiry.
#' @details Stops with an error when `symbol` is not a single non-empty string,
#'   or when the downloaded page contains fewer than two "table.options" tables.
getOptionChainAsx <- function(symbol) {
  if (!is.character(symbol) || length(symbol) != 1L || is.na(symbol) || !nzchar(symbol)) {
    stop("`symbol` must be a single non-empty character string", call. = FALSE)
  }

  # Percent-encode the symbol so characters such as "&" or spaces cannot alter
  # the query string it is inserted into.
  url <- sprintf(URLASX, utils::URLencode(symbol, reserved = TRUE))

  html <- read_html(url)

  tables <- html %>% html_nodes("table.options") %>% html_table(header = TRUE)

  # The second table is the option chain (the first gives data on the
  # underlying stock), so fail clearly if it is not there.
  if (length(tables) < 2L) {
    stop(
      sprintf(
        "Expected at least 2 'table.options' tables for symbol '%s', found %d",
        symbol, length(tables)
      ),
      call. = FALSE
    )
  }

  chain <- tables[[2]] %>%
    plyr::rename(c("Bid" = "bid", "Offer" = "ask", "Openinterest" = "open.interest", "Volume" = "volume", "Expirydate" = "expiry",
                   "P/C" = "type", "Margin Price" = "premium", "Exercise" = "strike", "Code" = "code")) %>%
    transform(
      symbol        = symbol,
      retrieved     = Sys.time(),
      # Values that cannot be parsed become NA; the coercion warning is
      # deliberately suppressed. Thousands separators are stripped first.
      open.interest = suppressWarnings(as.integer(gsub(",", "", open.interest))),
      premium       = suppressWarnings(as.numeric(premium)),
      bid           = suppressWarnings(as.numeric(bid)),
      ask           = suppressWarnings(as.numeric(ask)),
      volume        = suppressWarnings(as.integer(gsub(",", "", volume))),
      expiry        = as.Date(expiry, format = "%d/%m/%Y")
    ) %>%
    dplyr::arrange(type, strike, expiry)

  chain[, COLORDER]
}

# Example: fetch the option chain for the underlying code "BHP".
# This calls read_html() on the ASX URL, so it needs network access.
getOptionChainAsx("BHP")

# NOTE(review): this copy of the script had been run through machine
# translation (function names, identifiers and string literals translated, the
# function header lost), so it could not be parsed. It is reconstructed here
# from the intact English version of the same script.

# Packages attached for this script. plyr goes before dplyr so that plyr does
# not mask dplyr functions of the same name; the misspelled `plyer` is dropped.
library(plyr)
library(dplyr)
library(data.table)
library(jsonlite)
library(httr)
library(rvest)
library(XML)

# Columns (and their order) selected by getOptionChainAsx() for its result.
COLORDER <- c(
  "symbol", "code", "type", "expiry", "strike", "premium",
  "bid", "ask", "volume", "open.interest", "retrieved"
)

# AUSTRALIAN OPTIONS --------------------------------------------------------------------------------------------------

# ASX is the Australian Securities Exchange.

# sprintf() template for the option-price page; "%s" receives the underlying code.
URLASX <- "http://www.asx.com.au/asx/markets/optionPrices.do?by=underlyingCode&underlyingCode=%s&expiryDate=&optionType=B"

#' Download the ASX option chain for one underlying and return it as a data frame.
#'
#' @param symbol Underlying code as a single, non-empty character string
#'   (for example "BHP").
#' @return A data frame restricted to the columns named in `COLORDER`, with rows
#'   ordered by type, strike and expiry.
getOptionChainAsx <- function(symbol) {
  if (!is.character(symbol) || length(symbol) != 1L || is.na(symbol) || !nzchar(symbol)) {
    stop("`symbol` must be a single non-empty character string", call. = FALSE)
  }

  # Percent-encode the symbol so it cannot alter the query string.
  url <- sprintf(URLASX, utils::URLencode(symbol, reserved = TRUE))

  html <- read_html(url)

  tables <- html %>% html_nodes("table.options") %>% html_table(header = TRUE)

  # The second table is the option chain (the first gives data on the
  # underlying stock), so fail clearly if it is not there.
  if (length(tables) < 2L) {
    stop(
      sprintf(
        "Expected at least 2 'table.options' tables for symbol '%s', found %d",
        symbol, length(tables)
      ),
      call. = FALSE
    )
  }

  chain <- tables[[2]] %>%
    plyr::rename(c("Bid" = "bid", "Offer" = "ask", "Openinterest" = "open.interest", "Volume" = "volume", "Expirydate" = "expiry",
                   "P/C" = "type", "Margin Price" = "premium", "Exercise" = "strike", "Code" = "code")) %>%
    transform(
      symbol        = symbol,
      retrieved     = Sys.time(),
      # Values that cannot be parsed become NA; the coercion warning is
      # deliberately suppressed. Thousands separators are stripped first.
      open.interest = suppressWarnings(as.integer(gsub(",", "", open.interest))),
      premium       = suppressWarnings(as.numeric(premium)),
      bid           = suppressWarnings(as.numeric(bid)),
      ask           = suppressWarnings(as.numeric(ask)),
      volume        = suppressWarnings(as.integer(gsub(",", "", volume))),
      expiry        = as.Date(expiry, format = "%d/%m/%Y")
    ) %>%
    dplyr::arrange(type, strike, expiry)

  chain[, COLORDER]
}

# Example: fetch the option chain for the underlying code "BHP" (needs network access).
getOptionChainAsx("BHP")
类似于:

# Packages attached for this script.
# - library() is used instead of require() so a missing package stops the
#   script immediately instead of returning FALSE and failing later.
# - plyr is attached before dplyr so that plyr does not mask dplyr functions
#   of the same name.
# - The former `require(plyer)` was a misspelling of plyr and has been folded
#   into the single plyr line below.
library(plyr)
library(dplyr)
library(data.table)
library(jsonlite)
library(httr)
library(rvest)
library(XML)


# Columns (and their order) selected by getOptionChainAsx() for its result.
COLORDER <- c(
  "symbol", "code", "type", "expiry", "strike", "premium",
  "bid", "ask", "volume", "open.interest", "retrieved"
)


# AUSTRALIAN OPTIONS --------------------------------------------------------------------------------------------------

# ASX is the Australian Securities Exchange.

# sprintf() template for the option-price page; "%s" receives the underlying code.
URLASX <- "http://www.asx.com.au/asx/markets/optionPrices.do?by=underlyingCode&underlyingCode=%s&expiryDate=&optionType=B"

#' Download the ASX option chain for one underlying and return it as a data frame.
#'
#' @param symbol Underlying code as a single, non-empty character string
#'   (for example "BHP").
#' @return A data frame restricted to the columns named in `COLORDER`, with rows
#'   ordered by type, strike and expiry.
#' @details Stops with an error when `symbol` is not a single non-empty string,
#'   or when the downloaded page contains fewer than two "table.options" tables.
getOptionChainAsx <- function(symbol) {
  if (!is.character(symbol) || length(symbol) != 1L || is.na(symbol) || !nzchar(symbol)) {
    stop("`symbol` must be a single non-empty character string", call. = FALSE)
  }

  # Percent-encode the symbol so characters such as "&" or spaces cannot alter
  # the query string it is inserted into.
  url <- sprintf(URLASX, utils::URLencode(symbol, reserved = TRUE))

  html <- read_html(url)

  tables <- html %>% html_nodes("table.options") %>% html_table(header = TRUE)

  # The second table is the option chain (the first gives data on the
  # underlying stock), so fail clearly if it is not there.
  if (length(tables) < 2L) {
    stop(
      sprintf(
        "Expected at least 2 'table.options' tables for symbol '%s', found %d",
        symbol, length(tables)
      ),
      call. = FALSE
    )
  }

  chain <- tables[[2]] %>%
    plyr::rename(c("Bid" = "bid", "Offer" = "ask", "Openinterest" = "open.interest", "Volume" = "volume", "Expirydate" = "expiry",
                   "P/C" = "type", "Margin Price" = "premium", "Exercise" = "strike", "Code" = "code")) %>%
    transform(
      symbol        = symbol,
      retrieved     = Sys.time(),
      # Values that cannot be parsed become NA; the coercion warning is
      # deliberately suppressed. Thousands separators are stripped first.
      open.interest = suppressWarnings(as.integer(gsub(",", "", open.interest))),
      premium       = suppressWarnings(as.numeric(premium)),
      bid           = suppressWarnings(as.numeric(bid)),
      ask           = suppressWarnings(as.numeric(ask)),
      volume        = suppressWarnings(as.integer(gsub(",", "", volume))),
      expiry        = as.Date(expiry, format = "%d/%m/%Y")
    ) %>%
    dplyr::arrange(type, strike, expiry)

  chain[, COLORDER]
}

# Example: fetch the option chain for the underlying code "BHP".
# This calls read_html() on the ASX URL, so it needs network access.
getOptionChainAsx("BHP")

# NOTE(review): this copy of the script had been run through machine
# translation (function names, identifiers and string literals translated, the
# function header lost), so it could not be parsed. It is reconstructed here
# from the intact English version of the same script.

# Packages attached for this script. plyr goes before dplyr so that plyr does
# not mask dplyr functions of the same name; the misspelled `plyer` is dropped.
library(plyr)
library(dplyr)
library(data.table)
library(jsonlite)
library(httr)
library(rvest)
library(XML)

# Columns (and their order) selected by getOptionChainAsx() for its result.
COLORDER <- c(
  "symbol", "code", "type", "expiry", "strike", "premium",
  "bid", "ask", "volume", "open.interest", "retrieved"
)

# AUSTRALIAN OPTIONS --------------------------------------------------------------------------------------------------

# ASX is the Australian Securities Exchange.

# sprintf() template for the option-price page; "%s" receives the underlying code.
URLASX <- "http://www.asx.com.au/asx/markets/optionPrices.do?by=underlyingCode&underlyingCode=%s&expiryDate=&optionType=B"

#' Download the ASX option chain for one underlying and return it as a data frame.
#'
#' @param symbol Underlying code as a single, non-empty character string
#'   (for example "BHP").
#' @return A data frame restricted to the columns named in `COLORDER`, with rows
#'   ordered by type, strike and expiry.
getOptionChainAsx <- function(symbol) {
  if (!is.character(symbol) || length(symbol) != 1L || is.na(symbol) || !nzchar(symbol)) {
    stop("`symbol` must be a single non-empty character string", call. = FALSE)
  }

  # Percent-encode the symbol so it cannot alter the query string.
  url <- sprintf(URLASX, utils::URLencode(symbol, reserved = TRUE))

  html <- read_html(url)

  tables <- html %>% html_nodes("table.options") %>% html_table(header = TRUE)

  # The second table is the option chain (the first gives data on the
  # underlying stock), so fail clearly if it is not there.
  if (length(tables) < 2L) {
    stop(
      sprintf(
        "Expected at least 2 'table.options' tables for symbol '%s', found %d",
        symbol, length(tables)
      ),
      call. = FALSE
    )
  }

  chain <- tables[[2]] %>%
    plyr::rename(c("Bid" = "bid", "Offer" = "ask", "Openinterest" = "open.interest", "Volume" = "volume", "Expirydate" = "expiry",
                   "P/C" = "type", "Margin Price" = "premium", "Exercise" = "strike", "Code" = "code")) %>%
    transform(
      symbol        = symbol,
      retrieved     = Sys.time(),
      # Values that cannot be parsed become NA; the coercion warning is
      # deliberately suppressed. Thousands separators are stripped first.
      open.interest = suppressWarnings(as.integer(gsub(",", "", open.interest))),
      premium       = suppressWarnings(as.numeric(premium)),
      bid           = suppressWarnings(as.numeric(bid)),
      ask           = suppressWarnings(as.numeric(ask)),
      volume        = suppressWarnings(as.integer(gsub(",", "", volume))),
      expiry        = as.Date(expiry, format = "%d/%m/%Y")
    ) %>%
    dplyr::arrange(type, strike, expiry)

  chain[, COLORDER]
}

# Example: fetch the option chain for the underlying code "BHP" (needs network access).
getOptionChainAsx("BHP")

正如您提到的,您的
.HTM
文件是本地文件,因此要读取本地文件,下面是代码:

 # Read every line of the local file and join the lines into a single
 # newline-separated string.
 rawHTML <- paste0(readLines("path/toYour/file.html"), collapse = "\n")

rawHTML <- paste(readLines("path/toYour/file.html"), collapse="\n")

正如您所提到的,您的
.HTM
文件是本地文件,因此要读取本地文件,下面是代码:

 # Read every line of the local file and join the lines into a single
 # newline-separated string.
 rawHTML <- paste0(readLines("path/toYour/file.html"), collapse = "\n")
rawHTML <- paste(readLines("path/toYour/file.html"), collapse="\n")