C# 解析剪贴板功能GetData(DataFormats.Html)输出的标准类
标签:c#, clipboard
抱歉,标题可能不够清楚。我想从剪贴板中提取文本,该文本是从网页(浏览器中)复制的;在我的例子中,它是一个包含一些数据的表格。因此,我用以下代码提取了数据(结果是一个字符串):
// Read the HTML representation of whatever is currently on the clipboard.
IDataObject iData = Clipboard.GetDataObject();
// GetDataObject() returns null when the clipboard holds no data, so guard
// before asking which formats are available.
if (iData != null && iData.GetDataPresent(DataFormats.Html))
{
    // The string is the raw CF_HTML payload: a plain-text header
    // (Version, StartHTML, EndHTML, StartFragment, EndFragment, ...)
    // followed by the HTML markup itself.
    string s = (string)iData.GetData(DataFormats.Html);
}
我从中得到(s包含的内容)如下:
Version:0.9
StartHTML:0000000397
EndHTML:0000004086
StartFragment:0000000433
EndFragment:0000004050
SourceURL:Bla Bla Bla
<html>
<body>
<!--StartFragment--><table class="listing tickets">Bla Bla Bla</table><!--EndFragment-->
</body>
</html>
Version:0.9
StartHTML:0000000397
EndHTML:0000004086
StartFragment:0000000433
EndFragment:0000004050
SourceURL:Bla Bla Bla
Bla Bla Bla
所以,再一次。是否有任何标准类可以解析这些数据,或者我应该自己创建一个吗?好的,所以答案似乎是否定的!这让我有些惊讶 无论如何。我制作了自己的助手类,也许可以帮助你。这只是众多可能的解决方案之一。对于我的应用程序,如果没有找到任何内容,则返回null效果很好,也许您需要一个异常。还请记住,我将此作为一个辅助项目,因此没有对代码进行广泛的测试,因此,我不保证这是可行的
/// <summary>
/// Parsed view of a clipboard string in the "HTML Clipboard Format" (CF_HTML),
/// i.e. the text returned by <c>GetData(DataFormats.Html)</c>: a plain-text
/// header followed by the HTML markup.
/// </summary>
/// <remarks>
/// The offsets in the header are UTF-8 byte counts measured from the start of
/// the clipboard data, not character indices, so <see cref="Html"/> and
/// <see cref="Fragment"/> slice the UTF-8 encoding of <see cref="Input"/>.
/// </remarks>
public class ClipboardHtmlOutput
{
    // Built once and reused. Lazy quantifiers make every key bind to its first
    // occurrence (the header) rather than to a later occurrence inside the
    // HTML body. SourceURL is optional and may hold any value up to the end of
    // its line; it is only looked for before the first '<', i.e. in the header.
    private static readonly Regex HeaderPattern = new Regex(
        @"Version:(?<version>[0-9]+(?:\.[0-9]*)?).+?StartHTML:(?<startH>\d+).+?EndHTML:(?<endH>\d+).+?StartFragment:(?<startF>\d+).+?EndFragment:(?<endF>\d+)(?:[^<]*?SourceURL:(?<source>[^\r\n]*))?",
        RegexOptions.Singleline);

    /// <summary>Value of the <c>Version</c> header field.</summary>
    public Double Version { get; private set; }

    /// <summary>Value of the <c>SourceURL</c> header field; empty when the field is absent.</summary>
    public String Source { get; private set; }

    /// <summary>The complete string that was passed to <see cref="ParseString"/>.</summary>
    public String Input { get; private set; }

    /// <summary>The markup between the <c>StartHTML</c> and <c>EndHTML</c> offsets.</summary>
    public String Html { get { return SliceUtf8(startHTML, endHTML); } }

    /// <summary>The markup between the <c>StartFragment</c> and <c>EndFragment</c> offsets.</summary>
    public String Fragment { get { return SliceUtf8(startFragment, endFragment); } }

    private int startHTML;
    private int endHTML;
    private int startFragment;
    private int endFragment;

    /// <summary>
    /// Parses a CF_HTML clipboard string.
    /// </summary>
    /// <param name="s">The raw clipboard text, header included.</param>
    /// <returns>
    /// The parsed data, or <c>null</c> when <paramref name="s"/> is <c>null</c>,
    /// the header is not recognized, or one of its numbers cannot be parsed.
    /// </returns>
    public static ClipboardHtmlOutput ParseString(string s)
    {
        if (s == null)
        {
            return null;
        }

        Match match = HeaderPattern.Match(s);
        if (!match.Success)
        {
            return null;
        }

        double version;
        int startH;
        int endH;
        int startF;
        int endF;

        // TryParse instead of a blanket catch: the only expected failure is a
        // number too large for the target type.
        if (!Double.TryParse(match.Groups["version"].Value, System.Globalization.NumberStyles.Float, CultureInfo.InvariantCulture, out version)
            || !TryParseOffset(match.Groups["startH"].Value, out startH)
            || !TryParseOffset(match.Groups["endH"].Value, out endH)
            || !TryParseOffset(match.Groups["startF"].Value, out startF)
            || !TryParseOffset(match.Groups["endF"].Value, out endF))
        {
            return null;
        }

        ClipboardHtmlOutput html = new ClipboardHtmlOutput();
        html.Input = s;
        html.Version = version;
        html.Source = match.Groups["source"].Value;
        html.startHTML = startH;
        html.endHTML = endH;
        html.startFragment = startF;
        html.endFragment = endF;
        return html;
    }

    // Parses an unsigned decimal header offset, independent of the current culture.
    private static bool TryParseOffset(string text, out int offset)
    {
        return int.TryParse(text, System.Globalization.NumberStyles.None, CultureInfo.InvariantCulture, out offset);
    }

    // Returns the text between two UTF-8 byte offsets of Input. Offsets are
    // clamped to the available data so malformed headers never throw.
    private String SliceUtf8(int startOffset, int endOffset)
    {
        if (Input == null)
        {
            return null;
        }

        byte[] bytes = System.Text.Encoding.UTF8.GetBytes(Input);
        int start = Math.Max(0, Math.Min(startOffset, bytes.Length));
        int end = Math.Max(start, Math.Min(endOffset, bytes.Length));
        return System.Text.Encoding.UTF8.GetString(bytes, start, end - start);
    }
}
以下方法由 Microsoft 提供,包含在示例“XAML 到 HTML 转换演示”(XAML to HTML Conversion Demo)的 HtmlParser 类中(原文中的下载链接在此处缺失)。有关“HTML 剪贴板格式”的更多信息,请参见:https://msdn.microsoft.com/en-us/library/aa767917(v=vs.85).aspx
/// <summary>
/// Extracts the Html string from clipboard data by parsing the header information in htmlDataString.
/// </summary>
/// <param name="htmlDataString">
/// String representing Html clipboard data. This includes the Html header.
/// </param>
/// <returns>
/// String containing only the Html data part of htmlDataString, without the header,
/// or a string starting with "ERROR:" when the header is not recognized.
/// </returns>
internal static string ExtractHtmlFromClipboardData(string htmlDataString)
{
    int startHtmlIndex = htmlDataString.IndexOf("StartHTML:", StringComparison.Ordinal);
    if (startHtmlIndex < 0)
    {
        return "ERROR: Unrecognized html header";
    }
    // TODO: We assume that offsets are written with exactly 10 digits ("0123456789".Length),
    // which could be a wrong assumption. More flexible parsing is needed here.
    startHtmlIndex = Int32.Parse(htmlDataString.Substring(startHtmlIndex + "StartHTML:".Length, "0123456789".Length));
    if (startHtmlIndex < 0 || startHtmlIndex > htmlDataString.Length)
    {
        return "ERROR: Unrecognized html header";
    }

    int endHtmlIndex = htmlDataString.IndexOf("EndHTML:", StringComparison.Ordinal);
    if (endHtmlIndex < 0)
    {
        return "ERROR: Unrecognized html header";
    }
    // TODO: We assume that offsets are written with exactly 10 digits ("0123456789".Length),
    // which could be a wrong assumption. More flexible parsing is needed here.
    endHtmlIndex = Int32.Parse(htmlDataString.Substring(endHtmlIndex + "EndHTML:".Length, "0123456789".Length));
    if (endHtmlIndex > htmlDataString.Length)
    {
        // Clamp an end offset that points past the available data.
        endHtmlIndex = htmlDataString.Length;
    }

    return htmlDataString.Substring(startHtmlIndex, endHtmlIndex - startHtmlIndex);
}
2015年2月25日补充
实现之后,我发现必须注意 UTF-8 编码:头部中的偏移量是字节数而不是字符数(参见下面方法末尾基于 UTF-8 字节的处理)。
/// <summary>
/// Extracts the selected Html fragment string from clipboard data by parsing the header
/// information in htmlDataString.
/// </summary>
/// <param name="htmlDataString">
/// String representing Html clipboard data. This includes the Html header.
/// </param>
/// <returns>
/// String containing only the Html selection part of htmlDataString, without the header,
/// or a string starting with "ERROR:" when the header is not recognized.
/// </returns>
internal static string ExtractHtmlFragmentFromClipboardData(string htmlDataString)
{
    // HTML Clipboard Format
    // (https://msdn.microsoft.com/en-us/library/aa767917(v=vs.85).aspx)
    // The fragment contains valid HTML representing the area the user has selected. This
    // includes the information required for basic pasting of an HTML fragment, as follows:
    //  - Selected text.
    //  - Opening tags and attributes of any element that has an end tag within the selected text.
    //  - End tags that match the included opening tags.
    // The fragment should be preceded and followed by the HTML comments <!--StartFragment--> and
    // <!--EndFragment--> (no space allowed between the !-- and the text) to indicate where the
    // fragment starts and ends. So the start and end of the fragment are indicated by these
    // comments as well as by the StartFragment and EndFragment byte counts.

    // CF_HTML is entirely text and uses UTF-8, and the header offsets are byte counts,
    // so all range checks and the final slice work on the UTF-8 bytes.
    byte[] bytes = Encoding.UTF8.GetBytes(htmlDataString);

    // Byte count from the beginning of the clipboard to the start of the fragment.
    int startFragmentIndex = htmlDataString.IndexOf("StartFragment:", StringComparison.Ordinal);
    if (startFragmentIndex < 0)
    {
        return "ERROR: Unrecognized html header";
    }
    // TODO: We assume that offsets are written with exactly 10 digits,
    // which could be a wrong assumption. More flexible parsing is needed here.
    startFragmentIndex = Int32.Parse(htmlDataString.Substring(startFragmentIndex + "StartFragment:".Length, 10));
    if (startFragmentIndex < 0 || startFragmentIndex > bytes.Length)
    {
        return "ERROR: Unrecognized html header";
    }

    // Byte count from the beginning of the clipboard to the end of the fragment.
    int endFragmentIndex = htmlDataString.IndexOf("EndFragment:", StringComparison.Ordinal);
    if (endFragmentIndex < 0)
    {
        return "ERROR: Unrecognized html header";
    }
    // TODO: We assume that offsets are written with exactly 10 digits,
    // which could be a wrong assumption. More flexible parsing is needed here.
    endFragmentIndex = Int32.Parse(htmlDataString.Substring(endFragmentIndex + "EndFragment:".Length, 10));
    if (endFragmentIndex > bytes.Length)
    {
        // Clamp an end offset that points past the available data.
        endFragmentIndex = bytes.Length;
    }

    return Encoding.UTF8.GetString(bytes, startFragmentIndex, endFragmentIndex - startFragmentIndex);
}
// Example: parse the clipboard's CF_HTML payload and load the selected fragment as XML.
IDataObject iData = Clipboard.GetDataObject();
// GetDataObject() returns null when the clipboard holds no data.
if (iData != null && iData.GetDataPresent(DataFormats.Html))
{
    ClipboardHtmlOutput cho = ClipboardHtmlOutput.ParseString((string)iData.GetData(DataFormats.Html));
    // ParseString returns null when the header is not recognized.
    if (cho != null)
    {
        XmlDocument xml = new XmlDocument();
        // NOTE(review): LoadXml only accepts well-formed XML; an HTML fragment with
        // unclosed tags or HTML-only entities will make it throw XmlException.
        xml.LoadXml(cho.Fragment);
    }
}
/// <summary>
/// Extracts the Html string from clipboard data by parsing the header information in htmlDataString.
/// </summary>
/// <remarks>
/// The StartHTML / EndHTML header values are UTF-8 byte counts measured from the start of
/// the clipboard data, so the slice is taken from the UTF-8 encoding of the input rather
/// than by character index.
/// </remarks>
/// <param name="htmlDataString">
/// String representing Html clipboard data. This includes the Html header.
/// </param>
/// <returns>
/// String containing only the Html data part of htmlDataString, without the header, or
/// "ERROR: Unrecognized html header" when the input is null, a required header field is
/// missing or not numeric, or the offsets do not describe a valid range.
/// </returns>
internal static string ExtractHtmlFromClipboardData(string htmlDataString)
{
    const string unrecognizedHeader = "ERROR: Unrecognized html header";

    if (htmlDataString == null)
    {
        return unrecognizedHeader;
    }

    int startHtmlIndex;
    if (!TryReadHtmlHeaderOffset(htmlDataString, "StartHTML:", out startHtmlIndex))
    {
        return unrecognizedHeader;
    }

    int endHtmlIndex;
    if (!TryReadHtmlHeaderOffset(htmlDataString, "EndHTML:", out endHtmlIndex))
    {
        return unrecognizedHeader;
    }

    byte[] bytes = System.Text.Encoding.UTF8.GetBytes(htmlDataString);

    // A negative start (the format uses -1 for "no context") or a start past the
    // data cannot be sliced.
    if (startHtmlIndex < 0 || startHtmlIndex > bytes.Length)
    {
        return unrecognizedHeader;
    }

    // Clamp an end offset that points past the available data.
    if (endHtmlIndex > bytes.Length)
    {
        endHtmlIndex = bytes.Length;
    }

    // An end before the start would otherwise throw from GetString.
    if (endHtmlIndex < startHtmlIndex)
    {
        return unrecognizedHeader;
    }

    return System.Text.Encoding.UTF8.GetString(bytes, startHtmlIndex, endHtmlIndex - startHtmlIndex);
}

// Reads the integer that follows the first occurrence of `key`, accepting any number of
// digits (optionally space-padded or signed) instead of assuming exactly ten characters.
private static bool TryReadHtmlHeaderOffset(string data, string key, out int offset)
{
    offset = 0;

    int keyIndex = data.IndexOf(key, StringComparison.Ordinal);
    if (keyIndex < 0)
    {
        return false;
    }

    int valueStart = keyIndex + key.Length;
    while (valueStart < data.Length && data[valueStart] == ' ')
    {
        valueStart++;
    }

    int valueEnd = valueStart;
    if (valueEnd < data.Length && data[valueEnd] == '-')
    {
        valueEnd++;
    }
    while (valueEnd < data.Length && data[valueEnd] >= '0' && data[valueEnd] <= '9')
    {
        valueEnd++;
    }

    return int.TryParse(
        data.Substring(valueStart, valueEnd - valueStart),
        System.Globalization.NumberStyles.AllowLeadingSign,
        System.Globalization.CultureInfo.InvariantCulture,
        out offset);
}
/// <summary>
/// Extracts the selected Html fragment string from clipboard data by parsing the header
/// information in htmlDataString.
/// </summary>
/// <remarks>
/// HTML Clipboard Format
/// (https://msdn.microsoft.com/en-us/library/aa767917(v=vs.85).aspx):
/// the fragment is the valid HTML representing the area the user selected, wrapped in the
/// comments &lt;!--StartFragment--&gt; and &lt;!--EndFragment--&gt;. Its position is also given by
/// the StartFragment and EndFragment header values, which are UTF-8 byte counts from the
/// beginning of the clipboard data. All range checks are therefore made against the
/// UTF-8 byte length, not the character length.
/// </remarks>
/// <param name="htmlDataString">
/// String representing Html clipboard data. This includes the Html header.
/// </param>
/// <returns>
/// String containing only the Html selection part of htmlDataString, without the header, or
/// "ERROR: Unrecognized html header" when the input is null, a required header field is
/// missing or not numeric, or the offsets do not describe a valid range.
/// </returns>
internal static string ExtractHtmlFragmentFromClipboardData(string htmlDataString)
{
    const string unrecognizedHeader = "ERROR: Unrecognized html header";

    if (htmlDataString == null)
    {
        return unrecognizedHeader;
    }

    // Byte count from the beginning of the clipboard to the start of the fragment.
    int startFragmentIndex;
    if (!TryReadFragmentHeaderOffset(htmlDataString, "StartFragment:", out startFragmentIndex))
    {
        return unrecognizedHeader;
    }

    // Byte count from the beginning of the clipboard to the end of the fragment.
    int endFragmentIndex;
    if (!TryReadFragmentHeaderOffset(htmlDataString, "EndFragment:", out endFragmentIndex))
    {
        return unrecognizedHeader;
    }

    // CF_HTML is entirely text format and uses the transformation format UTF-8.
    byte[] bytes = Encoding.UTF8.GetBytes(htmlDataString);

    if (startFragmentIndex < 0 || startFragmentIndex > bytes.Length)
    {
        return unrecognizedHeader;
    }

    // Clamp an end offset that points past the available data.
    if (endFragmentIndex > bytes.Length)
    {
        endFragmentIndex = bytes.Length;
    }

    // An end before the start would otherwise throw from GetString.
    if (endFragmentIndex < startFragmentIndex)
    {
        return unrecognizedHeader;
    }

    return Encoding.UTF8.GetString(bytes, startFragmentIndex, endFragmentIndex - startFragmentIndex);
}

// Reads the integer that follows the first occurrence of `key`, accepting any number of
// digits (optionally space-padded or signed) instead of assuming exactly ten characters.
private static bool TryReadFragmentHeaderOffset(string data, string key, out int offset)
{
    offset = 0;

    int keyIndex = data.IndexOf(key, StringComparison.Ordinal);
    if (keyIndex < 0)
    {
        return false;
    }

    int valueStart = keyIndex + key.Length;
    while (valueStart < data.Length && data[valueStart] == ' ')
    {
        valueStart++;
    }

    int valueEnd = valueStart;
    if (valueEnd < data.Length && data[valueEnd] == '-')
    {
        valueEnd++;
    }
    while (valueEnd < data.Length && data[valueEnd] >= '0' && data[valueEnd] <= '9')
    {
        valueEnd++;
    }

    return int.TryParse(
        data.Substring(valueStart, valueEnd - valueStart),
        System.Globalization.NumberStyles.AllowLeadingSign,
        System.Globalization.CultureInfo.InvariantCulture,
        out offset);
}